diff --git a/.gitignore b/.gitignore index fb43b6c..5c4f9a5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,10 @@ cranfield.qrels cranfieldoutput /duckdb-fts-main/ /trec_eval/ +/spreadsheets/ +*.png +plot* +*lock* *.db *.ciff output*.txt diff --git a/phrase_index.py b/phrase_index.py index cb1a909..d3d52a5 100644 --- a/phrase_index.py +++ b/phrase_index.py @@ -159,7 +159,6 @@ def create_tokenizer_duckdb(con): ); """) - def create_tokenizer_ciff(con, fts_schema="fts_main_documents"): con.sql(f""" CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT); @@ -190,7 +189,7 @@ def create_tokenizer_ciff(con, fts_schema="fts_main_documents"): def create_stopwords_table(con, fts_schema="fts_main_documents", stopwords='none'): """ Create the stopwords table. - If stopwords is 'english', it will create a table with English stopwords. + If stopwords is 'english', it will create a table with English stopwords. If stopwords is 'none', it will create an empty table. """ con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords;") @@ -252,15 +251,17 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main" Assumes the table fts_main_documents.dict already exists. Adds a fieldid and termid column for compatibility with fielded search macros. """ - # Cleanup input text removing special characters + # Cleanup input text using the same regex as DuckDB's tokenizer con.sql(f""" CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS SELECT - did, - regexp_replace(content, '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content + {input_id}, + regexp_replace(lower(strip_accents(CAST({input_val} AS VARCHAR))), + '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content, FROM {input_schema}.{input_table} """) + # Use the ciff tokenizer to find bigrams and unigrams con.sql(f""" CREATE OR REPLACE TABLE {fts_schema}.terms AS ( SELECT @@ -270,7 +271,7 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main" FROM ( SELECT row_number() OVER (ORDER BY (SELECT NULL)) AS docid, - unnest({fts_schema}.tokenize({input_val})) AS term + unnest({fts_schema}.tokenize(content)) AS term FROM {fts_schema}.cleaned_docs ) AS t JOIN {fts_schema}.dict d ON t.term = d.term diff --git a/ze_index.py b/ze_index.py index 5d69ebc..dddd126 100644 --- a/ze_index.py +++ b/ze_index.py @@ -59,7 +59,7 @@ def create_lm(con, stemmer): SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name ), postings_cost AS ( - SELECT COUNT(DISTINCT docid) AS cost FROM qterms + SELECT COUNT(*) AS cost FROM term_tf ) SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores );