import pathlib
import sys

import duckdb
import ir_datasets

from phrases_extractor import extract_phrases_pmi_duckdb
from ze_index import normalize


def insert_dataset(con, ir_dataset, logging=True):
    """
    Insert documents from an ir_dataset. Works with several datasets.
    Add document attributes if needed.
    """
    con.sql('CREATE TABLE documents (did TEXT, content TEXT)')
    insert = 'INSERT INTO documents(did, content) VALUES '
    sql = insert
    part = 0
    total = 0
    count = ir_dataset.docs_count()
    if logging:
        print(f"Inserting {count} docs...", file=sys.stderr)
    for doc in ir_dataset.docs_iter():
        doc_text = ""
        if hasattr(doc, 'title'):
            doc_text = doc.title
        if hasattr(doc, 'body'):
            doc_text += " " + doc.body
        if hasattr(doc, 'text'):
            doc_text += " " + doc.text
        # Values are concatenated directly into the statement, so normalize()
        # is relied upon to remove quotes that would otherwise break the SQL.
        sql += "('" + doc.doc_id + "','" + normalize(doc_text) + "'),"
        part += 1
        if part > 9999:  # flush in batches of 10,000 documents
            total += part
            if logging:
                print(str(total) + " docs", file=sys.stderr)
            con.sql(sql.rstrip(','))  # drop the trailing comma before executing
            part = 0
            sql = insert
    if part > 0:  # only flush the final batch when it is non-empty
        con.sql(sql.rstrip(','))


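# The batched statement built above has the shape (sketch with made-up ids):
#
#   INSERT INTO documents(did, content) VALUES ('d1','first doc'),('d2','second doc')
#
# flushed every 10,000 documents to keep statement sizes bounded.

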
def create_lm(con, stemmer):
    con.sql(f"""
    CREATE OR REPLACE MACRO fts_main_documents.match_lm(query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS TABLE (
        WITH tokens AS (
            SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
        ),
        fieldids AS (
            SELECT fieldid
            FROM fts_main_documents.fields
            WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
        ),
        qtermids AS (
            SELECT termid, df
            FROM fts_main_documents.dict AS dict, tokens
            WHERE (dict.term = tokens.t)
        ),
        qterms AS (
            SELECT termid, docid
            FROM fts_main_documents.terms AS terms
            WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
        ),
        term_tf AS (
            SELECT termid, docid, count_star() AS tf
            FROM qterms
            GROUP BY docid, termid
        ),
        cdocs AS (
            SELECT docid
            FROM qterms
            GROUP BY docid
            HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
        ),
        subscores AS (
            SELECT docs.docid, docs.len AS doc_len, term_tf.termid, term_tf.tf, qtermids.df,
                   LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1 - lambda) * df * docs.len)) AS subscore
            FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
            WHERE ((term_tf.docid = cdocs.docid)
                AND (term_tf.docid = docs.docid)
                AND (term_tf.termid = qtermids.termid))
        ),
        scores AS (
            SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score
            FROM subscores, fts_main_documents.docs AS docs
            WHERE subscores.docid = docs.docid
            GROUP BY docs.name
        ),
        postings_cost AS (
            SELECT COUNT(*) AS cost FROM term_tf
        )
        SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
    );
    """)


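# The macro scores with Jelinek-Mercer smoothed language modeling in its
# rank-equivalent sum form: each matching term contributes
# ln(1 + (lambda * tf * sumdf) / ((1 - lambda) * df * dl)), and ln(dl) is added
# as a document-length prior. A usage sketch (hypothetical query; table macros
# are called in FROM position):
#
#   con.sql("SELECT docname, score FROM fts_main_documents.match_lm('sea water') "
#           "ORDER BY score DESC LIMIT 10").show()

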
def create_bm25(con, stemmer):
    con.sql(f"""
    CREATE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, conjunctive := 0, k := 1.2, fields := NULL) AS (
        WITH tokens AS (
            SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
        ),
        fieldids AS (
            SELECT fieldid
            FROM fts_main_documents.fields
            WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
        ),
        qtermids AS (
            SELECT termid, df
            FROM fts_main_documents.dict AS dict, tokens
            WHERE (dict.term = tokens.t)
        ),
        qterms AS (
            SELECT termid, docid
            FROM fts_main_documents.terms AS terms
            WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
        ),
        term_tf AS (
            SELECT termid, docid, count_star() AS tf
            FROM qterms
            GROUP BY docid, termid
        ),
        cdocs AS (
            SELECT docid
            FROM qterms
            GROUP BY docid
            HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
        ),
        subscores AS (
            SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df,
                   (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
            FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
            WHERE ((term_tf.docid = cdocs.docid)
                AND (term_tf.docid = docs.docid)
                AND (term_tf.termid = qtermids.termid))
        ),
        scores AS (
            SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid
        )
        SELECT score FROM scores, fts_main_documents.docs AS docs
        WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
    """)


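# match_bm25 mirrors the scalar interface of DuckDB's FTS extension: it returns
# the BM25 score of one named document for a query (NULL when the document does
# not match). Usage sketch, assuming create_bm25 has been run and the documents
# table still exists:
#
#   con.sql("SELECT did, fts_main_documents.match_bm25(did, 'sea water') AS score "
#           "FROM documents ORDER BY score DESC NULLS LAST LIMIT 10").show()

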
def create_docs_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did"):
    """
    Create the documents table.
    input_id should be the column name in input_table that uniquely identifies each document (e.g., 'did').
    """
    con.sql(f"""
        CREATE SCHEMA IF NOT EXISTS {fts_schema};
        CREATE TABLE {fts_schema}.docs AS (
            SELECT
                row_number() OVER () AS docid,
                {input_id} AS name
            FROM
                {input_schema}.{input_table}
        );
    """)


def create_tokenizer_duckdb(con):
    con.sql("""
        CREATE MACRO fts_main_documents.tokenize(s) AS (
            string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
        );
    """)


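# The tokenizer lowercases, strips accents, and replaces runs of digits and
# punctuation with spaces before splitting on whitespace. Roughly:
#
#   tokenize('Hello, World 42!')  ->  ['hello', 'world']
#
# (empty strings produced at the edges are filtered out downstream with
# t.term != '').

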
def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
    con.sql(f"""
        CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT);
        CREATE OR REPLACE MACRO {fts_schema}.tokenize(query_string) AS (
            WITH RECURSIVE sequence AS (
                SELECT range AS nr
                FROM RANGE((SELECT MAX(LEN(term)) + 1 FROM {fts_schema}.dict))
            ),
            simpledict AS (
                SELECT '' AS term
                UNION
                SELECT term FROM {fts_schema}.dict
            ),
            subterms(term, subquery) AS (
                SELECT '', lower(strip_accents(CAST(query_string AS VARCHAR)))
                UNION
                SELECT MAX(dict.term), SUBSTRING(subquery,
                    CASE WHEN MAX(nr) < 1 THEN 2 ELSE MAX(nr) + 1 END,
                    LEN(subquery)) AS subquery
                FROM subterms, sequence, simpledict AS dict
                WHERE SUBSTRING(subquery, 1, nr) = dict.term
                GROUP BY subquery
            )
            SELECT LIST(term) FROM subterms WHERE NOT term = ''
        )
    """)


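# The recursive macro repeatedly consumes the longest dictionary term that
# prefixes the remaining query (all matching terms are prefixes of one another,
# so the lexicographic MAX is the longest match) and skips one character when
# nothing matches. This is what lets multi-word dictionary entries come out as
# single tokens. Sketch, with 'new', 'york', and 'new york' in the dict:
#
#   SELECT fts_main_documents.tokenize('new york')  -- -> ['new york']

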
def create_stopwords_table(con, fts_schema="fts_main_documents", stopwords='none'):
    """
    Create the stopwords table.
    If stopwords is 'english', it will create a table with English stopwords.
    If stopwords is 'none', it will create an empty table.
    """
    con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords;")
    if stopwords == 'english':
        con.sql(f"""
        CREATE TABLE {fts_schema}.stopwords (sw VARCHAR);
INSERT INTO {fts_schema}.stopwords VALUES ('a'), ('a''s'), ('able'), ('about'), ('above'), ('according'), ('accordingly'), ('across'), ('actually'), ('after'), ('afterwards'), ('again'), ('against'), ('ain''t'), ('all'), ('allow'), ('allows'), ('almost'), ('alone'), ('along'), ('already'), ('also'), ('although'), ('always'), ('am'), ('among'), ('amongst'), ('an'), ('and'), ('another'), ('any'), ('anybody'), ('anyhow'), ('anyone'), ('anything'), ('anyway'), ('anyways'), ('anywhere'), ('apart'), ('appear'), ('appreciate'), ('appropriate'), ('are'), ('aren''t'), ('around'), ('as'), ('aside'), ('ask'), ('asking'), ('associated'), ('at'), ('available'), ('away'), ('awfully'), ('b'), ('be'), ('became'), ('because'), ('become'), ('becomes'), ('becoming'), ('been'), ('before'), ('beforehand'), ('behind'), ('being'), ('believe'), ('below'), ('beside'), ('besides'), ('best'), ('better'), ('between'), ('beyond'), ('both'), ('brief'), ('but'), ('by'), ('c'), ('c''mon'), ('c''s'), ('came'), ('can'), ('can''t'), ('cannot'), ('cant'), ('cause'), ('causes'), ('certain'), ('certainly'), ('changes'), ('clearly'), ('co'), ('com'), ('come'), ('comes'), ('concerning'), ('consequently'), ('consider'), ('considering'), ('contain'), ('containing'), ('contains'), ('corresponding'), ('could'), ('couldn''t'), ('course'), ('currently'), ('d'), ('definitely'), ('described'), ('despite'), ('did'), ('didn''t'), ('different'), ('do'), ('does'), ('doesn''t'), ('doing'), ('don''t'), ('done'), ('down'), ('downwards'), ('during'), ('e'), ('each'), ('edu'), ('eg'), ('eight'), ('either'), ('else'), ('elsewhere'), ('enough'), ('entirely'), ('especially'), ('et'), ('etc'), ('even'), ('ever'), ('every'), ('everybody'), ('everyone'), ('everything'), ('everywhere'), ('ex'), ('exactly'), ('example'), ('except'), ('f'), ('far'), ('few'), ('fifth'), ('first'), ('five'), ('followed'), ('following'), ('follows'), ('for'), ('former'), ('formerly'), ('forth'), ('four'), ('from'), ('further'), ('furthermore'), ('g'), ('get'), ('gets'), ('getting'), ('given'), ('gives'), ('go'), ('goes'), ('going'), ('gone'), ('got'), ('gotten'), ('greetings'), ('h'), ('had'), ('hadn''t'), ('happens'), ('hardly'), ('has'), ('hasn''t'), ('have'), ('haven''t'), ('having'), ('he'), ('he''s'), ('hello'), ('help'), ('hence'), ('her'), ('here'), ('here''s'), ('hereafter'), ('hereby'), ('herein'), ('hereupon'), ('hers'), ('herself'), ('hi'), ('him'), ('himself'), ('his'), ('hither'), ('hopefully'), ('how'), ('howbeit'), ('however'), ('i'), ('i''d'), ('i''ll'), ('i''m'), ('i''ve'), ('ie'), ('if'), ('ignored'), ('immediate'), ('in'), ('inasmuch'), ('inc'), ('indeed'), ('indicate'), ('indicated'), ('indicates'), ('inner'), ('insofar'), ('instead'), ('into'), ('inward'), ('is'), ('isn''t'), ('it'), ('it''d'), ('it''ll'), ('it''s'), ('its'), ('itself'), ('j'), ('just'), ('k'), ('keep'), ('keeps'), ('kept'), ('know'), ('knows'), ('known'), ('l'), ('last'), ('lately'), ('later'), ('latter'), ('latterly'), ('least'), ('less'), ('lest'), ('let'), ('let''s'), ('like'), ('liked'), ('likely'), ('little'), ('look'), ('looking'), ('looks'), ('ltd'), ('m'), ('mainly'), ('many'), ('may'), ('maybe'), ('me'), ('mean'), ('meanwhile'), ('merely'), ('might'), ('more'), ('moreover'), ('most'), ('mostly'), ('much'), ('must'), ('my'), ('myself'), ('n'), ('name'), ('namely'), ('nd'), ('near'), ('nearly'), ('necessary'), ('need'), ('needs'), ('neither'), ('never'), ('nevertheless'), ('new'), ('next'), ('nine'), ('no'), ('nobody'), ('non'), ('none'), ('noone'), ('nor'), ('normally'), 
('not'), ('nothing'), ('novel'), ('now'), ('nowhere'), ('o'), ('obviously'), ('of'), ('off'), ('often'), ('oh'), ('ok'), ('okay'), ('old'), ('on'), ('once'), ('one'), ('ones'), ('only'), ('onto'), ('or'), ('other'), ('others'), ('otherwise'), ('ought'), ('our'), ('ours'), ('ourselves'), ('out'), ('outside'), ('over'), ('overall'), ('own');
INSERT INTO {fts_schema}.stopwords VALUES ('p'), ('particular'), ('particularly'), ('per'), ('perhaps'), ('placed'), ('please'), ('plus'), ('possible'), ('presumably'), ('probably'), ('provides'), ('q'), ('que'), ('quite'), ('qv'), ('r'), ('rather'), ('rd'), ('re'), ('really'), ('reasonably'), ('regarding'), ('regardless'), ('regards'), ('relatively'), ('respectively'), ('right'), ('s'), ('said'), ('same'), ('saw'), ('say'), ('saying'), ('says'), ('second'), ('secondly'), ('see'), ('seeing'), ('seem'), ('seemed'), ('seeming'), ('seems'), ('seen'), ('self'), ('selves'), ('sensible'), ('sent'), ('serious'), ('seriously'), ('seven'), ('several'), ('shall'), ('she'), ('should'), ('shouldn''t'), ('since'), ('six'), ('so'), ('some'), ('somebody'), ('somehow'), ('someone'), ('something'), ('sometime'), ('sometimes'), ('somewhat'), ('somewhere'), ('soon'), ('sorry'), ('specified'), ('specify'), ('specifying'), ('still'), ('sub'), ('such'), ('sup'), ('sure'), ('t'), ('t''s'), ('take'), ('taken'), ('tell'), ('tends'), ('th'), ('than'), ('thank'), ('thanks'), ('thanx'), ('that'), ('that''s'), ('thats'), ('the'), ('their'), ('theirs'), ('them'), ('themselves'), ('then'), ('thence'), ('there'), ('there''s'), ('thereafter'), ('thereby'), ('therefore'), ('therein'), ('theres'), ('thereupon'), ('these'), ('they'), ('they''d'), ('they''ll'), ('they''re'), ('they''ve'), ('think'), ('third'), ('this'), ('thorough'), ('thoroughly'), ('those'), ('though'), ('three'), ('through'), ('throughout'), ('thru'), ('thus'), ('to'), ('together'), ('too'), ('took'), ('toward'), ('towards'), ('tried'), ('tries'), ('truly'), ('try'), ('trying'), ('twice'), ('two'), ('u'), ('un'), ('under'), ('unfortunately'), ('unless'), ('unlikely'), ('until'), ('unto'), ('up'), ('upon'), ('us'), ('use'), ('used'), ('useful'), ('uses'), ('using'), ('usually'), ('uucp'), ('v'), ('value'), ('various'), ('very'), ('via'), ('viz'), ('vs'), ('w'), ('want'), ('wants'), ('was'), ('wasn''t'), ('way'), ('we'), ('we''d'), ('we''ll'), ('we''re'), ('we''ve'), ('welcome'), ('well'), ('went'), ('were'), ('weren''t'), ('what'), ('what''s'), ('whatever'), ('when'), ('whence'), ('whenever'), ('where'), ('where''s'), ('whereafter'), ('whereas'), ('whereby'), ('wherein'), ('whereupon'), ('wherever'), ('whether'), ('which'), ('while'), ('whither'), ('who'), ('who''s'), ('whoever'), ('whole'), ('whom'), ('whose'), ('why'), ('will'), ('willing'), ('wish'), ('with'), ('within'), ('without'), ('won''t'), ('wonder'), ('would'), ('would'), ('wouldn''t'), ('x'), ('y'), ('yes'), ('yet'), ('you'), ('you''d'), ('you''ll'), ('you''re'), ('you''ve'), ('your'), ('yours'), ('yourself'), ('yourselves'), ('z'), ('zero');
        """)
    else:
        con.sql(f"CREATE TABLE {fts_schema}.stopwords (sw VARCHAR);")


def create_duckdb_dict_table(con, fts_schema="fts_main_documents", stopwords='none'):
    """
    Create the dict table using DuckDB's built-in dictionary functionality.
    """
    con.sql(f"DROP TABLE IF EXISTS {fts_schema}.dict;")
    create_stopwords_table(con, fts_schema, stopwords)

    con.sql(f"""
        CREATE TABLE {fts_schema}.dict AS
        WITH distinct_terms AS (
            SELECT DISTINCT term
            FROM {fts_schema}.terms
        )
        SELECT
            row_number() OVER () AS termid,
            term
        FROM
            distinct_terms
        {"WHERE term NOT IN (SELECT sw FROM " + fts_schema + ".stopwords)" if stopwords == 'english' else ''}
        ORDER BY term;
    """)


def build_dict_table(con, mode='duckdb', fts_schema="fts_main_documents", stopwords='none', gpt4_token_file=None, ngram_range=(1,2), min_freq=10, min_pmi=5.0):
    """
    Build the dictionary table using the specified mode.
    mode: 'phrases' or 'duckdb' (any other mode raises a ValueError).
    """
    if mode == 'phrases':
        create_stopwords_table(con, fts_schema=fts_schema, stopwords=stopwords)
        extract_phrases_pmi_duckdb(con, fts_schema=fts_schema, n=2, min_freq=min_freq, min_pmi=min_pmi)
        print("Extracted phrases:", con.execute(f"SELECT * FROM {fts_schema}.phrases LIMIT 10").fetchall())

        print("\nAdded phrases to dictionary:", con.execute(f"SELECT * FROM {fts_schema}.dict LIMIT 10").fetchall())

        print("\nAdded tokens to dictionary:", con.execute(f"SELECT * FROM {fts_schema}.dict WHERE term NOT LIKE '% %' LIMIT 10").fetchall())
        con.execute(f"DROP TABLE IF EXISTS {fts_schema}.tokens")
        con.execute(f"DROP TABLE IF EXISTS {fts_schema}.phrases")
    elif mode == 'duckdb':
        create_terms_table_duckdb(con, fts_schema=fts_schema, input_schema="main", input_table="documents", input_id="did", input_val="content")
        create_duckdb_dict_table(con, fts_schema=fts_schema, stopwords=stopwords)
    else:
        raise ValueError(f"Unknown dict table build mode: {mode}")


def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content"):
    """
    Create the terms table with unique terms per docid.
    Assumes the table fts_main_documents.dict already exists.
    Adds a fieldid and termid column for compatibility with fielded search macros.
    """
    # Cleanup input text using the same regex as DuckDB's tokenizer
    con.sql(f"""
        CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS
        SELECT
            {input_id},
            regexp_replace(lower(strip_accents(CAST({input_val} AS VARCHAR))),
                '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content
        FROM {input_schema}.{input_table}
    """)

    # Use the ciff tokenizer to find bigrams and unigrams.
    # row_number() OVER () is assumed to enumerate rows in the same order as in
    # create_docs_table, so the generated docids line up across both tables.
    con.sql(f"""
        CREATE OR REPLACE TABLE {fts_schema}.terms AS (
            SELECT
                0 AS fieldid,
                d.termid,
                t.docid
            FROM (
                SELECT
                    row_number() OVER () AS docid,
                    unnest({fts_schema}.tokenize(content)) AS term
                FROM {fts_schema}.cleaned_docs
            ) AS t
            JOIN {fts_schema}.dict d ON t.term = d.term
            WHERE t.term != ''
        );
    """)


def create_terms_table_duckdb(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content"):
    """
    Step 1: Create the initial terms table (term, docid).
    """
    # Note: rows with empty content are filtered out here, so the generated
    # docids only line up with create_docs_table when no document is empty.
    con.sql(f"""
        CREATE OR REPLACE TABLE {fts_schema}.terms AS (
            SELECT
                row_number() OVER () AS docid,
                unnest({fts_schema}.tokenize({input_val})) AS term
            FROM {input_schema}.{input_table}
            WHERE {input_val} != ''
        );
    """)


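# In 'duckdb' mode the index is built in three steps: Step 1 above tokenizes
# every document into (docid, term) rows, Step 2 (create_duckdb_dict_table)
# derives the dict from the distinct terms, and Step 3 below rejoins terms
# with dict to replace each term by its termid.

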
def assign_termids_to_terms(con, fts_schema="fts_main_documents"):
    """
    Step 3: Recreate the terms table, joining with dict to assign termid.
    """
    con.sql(f"""
        CREATE OR REPLACE TABLE {fts_schema}.terms AS (
            SELECT
                0 AS fieldid,
                d.termid,
                t.docid,
                t.term,
                row_number() OVER (PARTITION BY t.docid) AS pos
            FROM {fts_schema}.terms t
            JOIN {fts_schema}.dict d ON t.term = d.term
            WHERE t.term != ''
        );
    """)


def update_docs_table(con, fts_schema="fts_main_documents"):
    """
    Recompute the 'len' column of the docs table: the number of indexed terms
    per document, as required by the scoring macros.
    """
    # Remove old 'len' column if it exists, then add and populate a fresh one
    con.sql(f"ALTER TABLE {fts_schema}.docs DROP COLUMN IF EXISTS len;")
    con.sql(f"ALTER TABLE {fts_schema}.docs ADD COLUMN len INT;")
    con.sql(f"""
        UPDATE {fts_schema}.docs d
        SET len = (
            SELECT COUNT(termid)
            FROM {fts_schema}.terms t
            WHERE t.docid = d.docid
        );
    """)


def update_dict_table(con, fts_schema="fts_main_documents"):
    """
    Update the dictionary table with document frequency (df).
    Assumes the table fts_main_documents.dict already exists.
    """
    con.sql(f"ALTER TABLE {fts_schema}.dict ADD COLUMN IF NOT EXISTS df BIGINT;")
    con.sql(f"""
        UPDATE {fts_schema}.dict d
        SET df = (
            SELECT count(DISTINCT docid)
            FROM {fts_schema}.terms t
            WHERE t.termid = d.termid
        );
    """)


def limit_dict_table(con, max_terms=10000, fts_schema="fts_main_documents"):
    """
    Keep only the max_terms most frequent dictionary terms (ties broken
    alphabetically) and reassign consecutive termids.
    """
    # Create a temporary table with limited terms and reassigned termid
    con.sql(f"""
        CREATE OR REPLACE TEMP TABLE temp_limited_dict AS
        SELECT
            ROW_NUMBER() OVER (ORDER BY df DESC, term ASC) AS termid,
            term,
            df
        FROM {fts_schema}.dict
        ORDER BY df DESC, term ASC
        LIMIT {max_terms};
    """)

    # Drop original dict table
    con.sql(f"DROP TABLE IF EXISTS {fts_schema}.dict;")

    # Recreate dict table from temp table
    con.sql(f"""
        CREATE TABLE {fts_schema}.dict AS
        SELECT * FROM temp_limited_dict;
    """)

    # Drop temp table
    con.sql("DROP TABLE IF EXISTS temp_limited_dict;")


def create_stats_table(con, fts_schema="fts_main_documents", index_type="standard", stemmer="none"):
    """
    Create the stats table.
    This table contains statistics about the FTS index.
    Columns: num_docs, avgdl, sumdf, index_type, stemmer
    """
    con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stats;")
    con.sql(f"""
        CREATE TABLE {fts_schema}.stats AS (
            SELECT
                COUNT(docs.docid) AS num_docs,
                SUM(docs.len) / COUNT(docs.len) AS avgdl,
                (SELECT SUM(df) FROM {fts_schema}.dict) AS sumdf,
                '{index_type}' AS index_type,
                '{stemmer}' AS stemmer
            FROM {fts_schema}.docs AS docs
        );
    """)


def create_fields_table(con, fts_schema="fts_main_documents"):
    con.sql(f'''
        CREATE TABLE IF NOT EXISTS {fts_schema}.fields (
            fieldid INTEGER,
            field TEXT
        );
    ''')
    # Insert a default field if table is empty
    con.sql(f'''
        INSERT INTO {fts_schema}.fields (fieldid, field)
        SELECT 0, 'content'
        WHERE NOT EXISTS (SELECT 1 FROM {fts_schema}.fields);
    ''')


def index_documents(db_name, ir_dataset, stemmer='none', stopwords='none',
                    logging=True, keepcontent=False, limit=10000, mode='duckdb', min_freq=10, min_pmi=5.0):
    """
    Insert and index documents.
    Note: keepcontent is accepted but currently unused.
    """
    if pathlib.Path(db_name).is_file():
        raise ValueError(f"File {db_name} already exists.")
    con = duckdb.connect(db_name)
    insert_dataset(con, ir_dataset, logging)
    if logging:
        print("Indexing...", file=sys.stderr)

    docs = con.sql("SELECT * FROM documents LIMIT 10").df()
    print("Docs:\n", docs)

    create_docs_table(con, input_schema="main", input_table="documents", input_id="did")

    fts_docs = con.sql("SELECT * FROM fts_main_documents.docs LIMIT 10").df()
    print("fts_main_documents.docs:\n", fts_docs)

    con.sql("CREATE SCHEMA IF NOT EXISTS fts_main_documents;")
    con.sql("CREATE TABLE IF NOT EXISTS fts_main_documents.dict (term TEXT);")

    create_tokenizer_duckdb(con)

    # Create the dict table
    build_dict_table(con, mode=mode, fts_schema="fts_main_documents", stopwords=stopwords, ngram_range=(1,2), min_freq=min_freq, min_pmi=min_pmi)

    create_tokenizer_ciff(con)

    dict_df = con.sql("SELECT * FROM fts_main_documents.dict LIMIT 10").df()
    print("fts_main_documents.dict:\n", dict_df)

    # Rebuild the terms table against the final dictionary
    if mode == 'phrases':
        con.sql("DROP TABLE IF EXISTS fts_main_documents.terms;")
        create_terms_table(con, input_schema="main", input_table="documents", input_id="did", input_val="content")
    else:
        assign_termids_to_terms(con, fts_schema="fts_main_documents")

    terms = con.sql("SELECT * FROM fts_main_documents.terms LIMIT 10").df()
    print("fts_main_documents.terms:\n", terms)

    update_docs_table(con, fts_schema="fts_main_documents")

    docs = con.sql("SELECT * FROM fts_main_documents.docs LIMIT 10").df()
    print("fts_main_documents.docs:\n", docs)

    update_dict_table(con, fts_schema="fts_main_documents")
    print("Updated fts_main_documents.dict with document frequencies.")

    # Limit the dictionary to the `limit` most frequent terms
    if limit > 0:
        limit_dict_table(con, max_terms=limit, fts_schema="fts_main_documents")
        create_terms_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content")
        update_dict_table(con, fts_schema="fts_main_documents")
        print(f"Limited fts_main_documents.dict to the {limit} most frequent terms.")

    update_docs_table(con, fts_schema="fts_main_documents")

    dict_df = con.sql("SELECT * FROM fts_main_documents.dict LIMIT 10").df()
    print("fts_main_documents.dict:\n", dict_df)

    # Remove unused words from dictionary
    con.sql('''
        DELETE FROM fts_main_documents.dict
        WHERE df = 0;
    ''')

    create_stats_table(con, fts_schema="fts_main_documents", index_type="standard", stemmer=stemmer)

    stats = con.sql("SELECT * FROM fts_main_documents.stats").df()
    print("fts_main_documents.stats:\n", stats)

    create_fields_table(con, fts_schema="fts_main_documents")
    create_lm(con, stemmer)
    con.close()


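# Usage sketch (hypothetical file, dataset, and script names):
#
#   import ir_datasets
#   index_documents('cranfield.db', ir_datasets.load('cranfield'),
#                   stopwords='english', mode='phrases', min_freq=10, min_pmi=5.0)
#
# or from the command line (see below):
#
#   python this_script.py --db cranfield.db --dataset cranfield --mode phrases

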
if __name__ == "__main__":
    import argparse
    import ze_eval
    import os

    parser = argparse.ArgumentParser(description="Manual index builder for IR datasets.")
    parser.add_argument('--db', type=str, default='testje_docs.db', help='Database file name')
    parser.add_argument('--dataset', type=str, default='cranfield', help='ir_datasets name (e.g., cranfield, msmarco-passage)')
    parser.add_argument('--stemmer', type=str, default='none', help='Stemmer to use (none, porter, etc.)')
    parser.add_argument('--stopwords', type=str, default='english', help='Stopwords to use (english, none)')
    parser.add_argument('--mode', type=str, default='duckdb', help='Indexing mode (duckdb, phrases)')
    parser.add_argument('--keepcontent', action='store_true', help='Keep document content')
    parser.add_argument('--limit', type=int, default=10000, help='Limit the number of terms in the dictionary')
    parser.add_argument('--min-freq', type=int, default=10, help='Minimum frequency for phrases (only for mode "phrases")')
    parser.add_argument('--min-pmi', type=float, default=5.0, help='Minimum PMI for phrases (only for mode "phrases")')
    args = parser.parse_args()

    if args.dataset == 'custom':
        dataset = ze_eval.ir_dataset_test()
    else:
        dataset = ir_datasets.load(args.dataset)
    db_name = args.db
    if os.path.exists(db_name):
        print(f"Removing {db_name}")
        os.remove(db_name)

    print("Creating index...")
    index_documents(
        db_name,
        dataset,
        stemmer=args.stemmer,
        stopwords=args.stopwords,
        keepcontent=args.keepcontent,
        mode=args.mode,
        limit=args.limit,
        min_freq=args.min_freq,
        min_pmi=args.min_pmi
    )
    print("")