Mirror of https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
Synced 2025-10-27 00:34:21 +00:00
Compare commits
No commits in common. "3ea1faed788b39b7e287e14c485b0c6cfdae8c6a" and "37b17fcbd47426e0bf02795aa87e3c29c6b49a32" have entirely different histories.
3ea1faed78 ... 37b17fcbd4
.gitignore (vendored): 4 lines changed
@@ -5,10 +5,6 @@ cranfield.qrels
 cranfieldoutput
 /duckdb-fts-main/
 /trec_eval/
-/spreadsheets/
-*.png
-plot*
-*lock*
 *.db
 *.ciff
 output*.txt

@@ -159,6 +159,7 @@ def create_tokenizer_duckdb(con):
 );
 """)
 
+
 def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
 con.sql(f"""
 CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT);

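For context on the dict table created above, the sketch below (not from the repository; the toy documents table and whitespace string_split tokenization are stand-ins) shows how a (termid, term, df) dictionary can be built in DuckDB from Python.

import duckdb

# Sketch only: toy data and whitespace tokenization stand in for the
# project's real input and tokenizer.
con = duckdb.connect()
con.sql("CREATE TABLE documents (did INTEGER, content TEXT)")
con.sql("""
    INSERT INTO documents VALUES
        (1, 'information retrieval with duckdb'),
        (2, 'phrase indexing for retrieval')
""")

# Explode documents into terms, aggregate document frequencies (df),
# and assign sequential term ids, mirroring the dict schema above.
con.sql("""
    CREATE TABLE dict AS
    SELECT row_number() OVER (ORDER BY term) AS termid,
           term,
           COUNT(DISTINCT did) AS df
    FROM (
        SELECT did, unnest(string_split(content, ' ')) AS term
        FROM documents
    ) AS exploded
    GROUP BY term
""")
con.sql("SELECT * FROM dict ORDER BY termid").show()
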
@@ -189,7 +190,7 @@ def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
 def create_stopwords_table(con, fts_schema="fts_main_documents", stopwords='none'):
 """
 Create the stopwords table.
 If stopwords is 'english', it will create a table with English stopwords.
 If stopwords is 'none', it will create an empty table.
 """
 con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords;")

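The docstring above describes the intended behaviour of create_stopwords_table. A minimal illustrative re-implementation of that behaviour (the column name sw and the tiny word list are stand-ins, not the project's actual stopword list) could look like this:

import duckdb

def create_stopwords_table_sketch(con, fts_schema="fts_main_documents", stopwords="none"):
    # Illustrative only: 'english' fills the table with a small word list,
    # 'none' leaves it empty.
    con.sql(f"CREATE SCHEMA IF NOT EXISTS {fts_schema}")
    con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords")
    con.sql(f"CREATE TABLE {fts_schema}.stopwords (sw TEXT)")
    if stopwords == "english":
        words = ["a", "an", "and", "the", "of", "in", "is", "to"]
        con.executemany(f"INSERT INTO {fts_schema}.stopwords VALUES (?)",
                        [(w,) for w in words])

con = duckdb.connect()
create_stopwords_table_sketch(con, stopwords="english")
print(con.sql("SELECT count(*) FROM fts_main_documents.stopwords").fetchone()[0])  # 8
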
@@ -251,17 +252,15 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main"
 Assumes the table fts_main_documents.dict already exists.
 Adds a fieldid and termid column for compatibility with fielded search macros.
 """
-# Cleanup input text using the same regex as DuckDB's tokenizer
+# Cleanup input text removing special characters
 con.sql(f"""
 CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS
 SELECT
-{input_id},
-regexp_replace(lower(strip_accents(CAST({input_val} AS VARCHAR))),
-'[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content,
+did,
+regexp_replace(content, '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content
 FROM {input_schema}.{input_table}
 """)
 
-# Use the ciff tokenizer to find bigrams and unigrams
 con.sql(f"""
 CREATE OR REPLACE TABLE {fts_schema}.terms AS (
 SELECT

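To see what the cleanup step does, here is a standalone check using DuckDB's regexp_replace, lower, and strip_accents; the character class below is a simplified stand-in for the longer one in the diff, and the input string is invented.

import duckdb

# Lowercase, strip accents, then collapse runs of digits/punctuation to a space.
con = duckdb.connect()
con.sql(r"""
    SELECT regexp_replace(
               lower(strip_accents('Café-123: Phrase Indexing!')),
               '[0-9!@#$%^&*()_+={}\[\]:;<>,.?~\\/|''"`-]+',
               ' ',
               'g'
           ) AS cleaned
""").show()
# cleaned -> 'cafe  phrase indexing '  (digits and punctuation replaced by spaces)
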
@@ -270,8 +269,8 @@ def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main"
 t.docid
 FROM (
 SELECT
-row_number() OVER () AS docid,
-unnest({fts_schema}.tokenize(content)) AS term
+row_number() OVER (ORDER BY (SELECT NULL)) AS docid,
+unnest({fts_schema}.tokenize({input_val})) AS term
 FROM {fts_schema}.cleaned_docs
 ) AS t
 JOIN {fts_schema}.dict d ON t.term = d.term

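The terms query above pairs a row_number()-generated docid with an unnested token list. A minimal sketch of that pattern, with string_split standing in for the FTS tokenize macro and invented sample rows:

import duckdb

# row_number() OVER (ORDER BY (SELECT NULL)) numbers the input rows without
# requesting any particular sort order; the docid is then repeated for every
# term unnested from its document.
con = duckdb.connect()
con.sql("CREATE TABLE cleaned_docs (content TEXT)")
con.sql("""
    INSERT INTO cleaned_docs VALUES
        ('phrase indexing in duckdb'),
        ('language models for retrieval')
""")
con.sql("""
    SELECT
        row_number() OVER (ORDER BY (SELECT NULL)) AS docid,
        unnest(string_split(content, ' ')) AS term
    FROM cleaned_docs
""").show()
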
@@ -59,7 +59,7 @@ def create_lm(con, stemmer):
 SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
 ),
 postings_cost AS (
-SELECT COUNT(*) AS cost FROM term_tf
+SELECT COUNT(DISTINCT docid) AS cost FROM qterms
 )
 SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
 );

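The postings_cost change above swaps COUNT(*) over term_tf for COUNT(DISTINCT docid) over qterms. A toy illustration of the difference (the table and values are invented; in the real query term_tf and qterms are CTEs inside create_lm):

import duckdb

# COUNT(*) counts every posting row, while COUNT(DISTINCT docid) counts each
# matching document once.
con = duckdb.connect()
con.sql("CREATE TABLE qterms (termid INTEGER, docid INTEGER)")
con.sql("INSERT INTO qterms VALUES (1, 10), (2, 10), (1, 11), (3, 12)")
print(con.sql("SELECT COUNT(*) FROM qterms").fetchone()[0])               # 4 postings
print(con.sql("SELECT COUNT(DISTINCT docid) FROM qterms").fetchone()[0])  # 3 documents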