2025-10-27 08:44:21 +00:00
3 changed files with 8 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,10 +5,6 @@ cranfield.qrels
 cranfieldoutput
 /duckdb-fts-main/
 /trec_eval/
 /spreadsheets/
 *.png
 plot*
 *lock*
 *.db
 *.ciff
 output*.txt
--- a/phrase_index.py
+++ b/phrase_index.py
@ -159,6 +159,7 @@ def create_tokenizer_duckdb(con):
        );
    """)
 def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
    con.sql(f"""
        CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT);
@ -251,17 +252,15 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main"
    Assumes the table fts_main_documents.dict already exists.
    Adds a fieldid and termid column for compatibility with fielded search macros.
    """
-    # Cleanup input text using the same regex as DuckDB's tokenizer
+    # Clean up the input text by replacing each run of digits and special characters with a single space
    con.sql(f"""
        CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS
        SELECT
-            {input_id},
+            did,
-            regexp_replace(lower(strip_accents(CAST({input_val} AS VARCHAR))),
+            regexp_replace(content, '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content
                '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content,
        FROM {input_schema}.{input_table}
    """)
    # Use the ciff tokenizer to find bigrams and unigrams
    con.sql(f"""
        CREATE OR REPLACE TABLE {fts_schema}.terms AS (
            SELECT
@ -270,8 +269,8 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main"
                t.docid
            FROM (
                SELECT
-                    row_number() OVER () AS docid,
+                    row_number() OVER (ORDER BY (SELECT NULL)) AS docid,
-                    unnest({fts_schema}.tokenize(content)) AS term
+                    unnest({fts_schema}.tokenize({input_val})) AS term
                FROM {fts_schema}.cleaned_docs
            ) AS t
            JOIN {fts_schema}.dict d ON t.term = d.term
--- a/ze_index.py
+++ b/ze_index.py
@ -59,7 +59,7 @@ def create_lm(con, stemmer):
           SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
        ),
        postings_cost AS (
-           SELECT COUNT(*) AS cost FROM term_tf
+           SELECT COUNT(DISTINCT docid) AS cost FROM qterms
        )
        SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
        );