Compare commits

...

2 Commits

Author        SHA1        Message                   Date
Arthur Idema  3ea1faed78  Removed unnecessary code  2025-09-04 17:31:31 +02:00
Arthur Idema  affc0f05e4  Updated the tokenizer     2025-09-04 17:09:05 +02:00
3 changed files with 13 additions and 8 deletions

.gitignore

@@ -5,6 +5,10 @@ cranfield.qrels
 cranfieldoutput
 /duckdb-fts-main/
 /trec_eval/
+/spreadsheets/
+*.png
+plot*
+*lock*
 *.db
 *.ciff
 output*.txt
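
The new ignore rules are plain globs. A minimal sketch of which names they catch (the filenames below are hypothetical, and Python's fnmatch is only a rough stand-in for git's ignore matching, which also has directory rules and negation):

```python
# Hypothetical filenames; fnmatch approximates gitignore glob matching
# for simple patterns like these.
from fnmatch import fnmatch

patterns = ["*.png", "plot*", "*lock*"]
for name in ["plot_scores.py", "recall.png", "uv.lock", "bench.py"]:
    hits = [p for p in patterns if fnmatch(name, p)]
    print(f"{name}: {'ignored by ' + ', '.join(hits) if hits else 'tracked'}")
```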


@@ -159,7 +159,6 @@ def create_tokenizer_duckdb(con):
         );
     """)
-

 def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
     con.sql(f"""
         CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT);
@@ -190,7 +189,7 @@ def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
 def create_stopwords_table(con, fts_schema="fts_main_documents", stopwords='none'):
     """
     Create the stopwords table.
     If stopwords is 'english', it will create a table with English stopwords.
     If stopwords is 'none', it will create an empty table.
     """
     con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords;")
@@ -252,15 +251,17 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main"
     Assumes the table fts_main_documents.dict already exists.
     Adds a fieldid and termid column for compatibility with fielded search macros.
     """
-    # Cleanup input text removing special characters
+    # Cleanup input text using the same regex as DuckDB's tokenizer
     con.sql(f"""
         CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS
         SELECT
-            did,
-            regexp_replace(content, '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content
+            {input_id},
+            regexp_replace(lower(strip_accents(CAST({input_val} AS VARCHAR))),
+                '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content,
         FROM {input_schema}.{input_table}
     """)

+    # Use the ciff tokenizer to find bigrams and unigrams
     con.sql(f"""
         CREATE OR REPLACE TABLE {fts_schema}.terms AS (
             SELECT
@@ -269,8 +270,8 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main"
                 t.docid
             FROM (
                 SELECT
-                    row_number() OVER (ORDER BY (SELECT NULL)) AS docid,
-                    unnest({fts_schema}.tokenize({input_val})) AS term
+                    row_number() OVER () AS docid,
+                    unnest({fts_schema}.tokenize(content)) AS term
                 FROM {fts_schema}.cleaned_docs
             ) AS t
             JOIN {fts_schema}.dict d ON t.term = d.term
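
For reference, a minimal self-contained sketch of what the updated cleaning step produces before tokenization (the table name and toy data are invented here; the real code runs inside create_terms_table with schema-qualified names):

```python
import duckdb

con = duckdb.connect()
con.sql("CREATE TABLE documents (did INTEGER, content TEXT)")
con.sql("INSERT INTO documents VALUES (1, 'Café #42: flow-field MODELS!')")

# Same idea as the new cleaned_docs step: lower-case, strip accents, then
# collapse runs of digits/punctuation to a single space with regexp_replace.
con.sql(r"""
    CREATE TABLE cleaned_docs AS
    SELECT
        did,
        regexp_replace(lower(strip_accents(CAST(content AS VARCHAR))),
                       '[0-9!@#$%^&*()_+={}\[\]:;<>,.?~\\/\|''"`-]+', ' ', 'g') AS content
    FROM documents
""")
print(con.sql("SELECT content FROM cleaned_docs").fetchone()[0])
# digits, punctuation, and accents are gone; only whitespace-separated words remain
```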


@@ -59,7 +59,7 @@ def create_lm(con, stemmer):
         SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
     ),
     postings_cost AS (
-        SELECT COUNT(DISTINCT docid) AS cost FROM qterms
+        SELECT COUNT(*) AS cost FROM term_tf
     )
     SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
     );
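
A hedged sketch of what the postings_cost change measures (the term_tf table below is a toy stand-in for the CTE of the same name in the real query, and qterms is not reproduced here): the old expression counted distinct candidate documents, while the new one counts every (term, document) posting that gets scored, which grows when a document matches several query terms.

```python
import duckdb

con = duckdb.connect()
con.sql("CREATE TABLE term_tf (termid INTEGER, docid INTEGER, tf INTEGER)")
# Doc 10 matches two query terms, doc 11 matches one: three postings, two docs.
con.sql("INSERT INTO term_tf VALUES (1, 10, 3), (2, 10, 1), (1, 11, 2)")

old_cost = con.sql("SELECT COUNT(DISTINCT docid) FROM term_tf").fetchone()[0]
new_cost = con.sql("SELECT COUNT(*) FROM term_tf").fetchone()[0]
print(old_cost, new_cost)  # 2 3
```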