Zoekeend-Phrase-Indexing/ze_reindex_group.py
2025-08-19 17:23:02 +02:00

113 lines
4.3 KiB
Python

import duckdb
import pathlib
import sys
def copy_file(name_in, name_out):
path1 = pathlib.Path(name_in)
if not(path1.is_file()):
raise ValueError(f"File {name_in} does not exist.")
path2 = pathlib.Path(name_out)
if path2.is_file():
raise ValueError(f"File {name_out} already exists.")
path2.write_bytes(path1.read_bytes())
def get_stats_stemmer(con):
sql = "SELECT stemmer FROM fts_main_documents.stats"
return con.sql(sql).fetchall()[0][0]
def replace_bm25(con, stemmer):
""" The standard DuckDB BM25 implementation does not work with the grouped index.
This version also works with the standard DuckDB index.
"""
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, len, term_tf.termid, tf, df,
(log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE (term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid)
),
scores AS (
SELECT docid, sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE (scores.docid = docs.docid) AND (docs."name" = docname)
)
""")
def reindex_group(name_in, name_out, stemmer='porter'):
copy_file(name_in, name_out)
con = duckdb.connect(name_out)
oldstemmer = get_stats_stemmer(con)
if oldstemmer != 'none':
print(f"Warning: stemmer {oldstemmer} was already used on this database")
con.sql(f"""
-- newdict gives stems unique ids
CREATE TABLE fts_main_documents.newdict AS
SELECT termid, term, stem(term, '{stemmer}') AS stem, DENSE_RANK() OVER (ORDER BY stem) AS newid, df
FROM fts_main_documents.dict;
DROP TABLE fts_main_documents.dict;
-- newterms uses those new ids
CREATE TABLE fts_main_documents.newterms AS
SELECT terms.docid, terms.fieldid, newdict.newid AS termid
FROM fts_main_documents.terms AS terms, fts_main_documents.newdict AS newdict
WHERE terms.termid = newdict.termid;
DROP TABLE fts_main_documents.terms;
ALTER TABLE fts_main_documents.newterms RENAME TO terms;
-- now remove old ids from dict table and compute new dfs.
CREATE TABLE fts_main_documents.dict AS
SELECT D.newid AS termid, D.term, COUNT(DISTINCT T.docid) AS df
FROM fts_main_documents.newdict D, fts_main_documents.terms T
WHERE T.termid = D.newid
GROUP BY D.newid, D.term;
DROP TABLE fts_main_documents.newdict;
-- update stats
UPDATE fts_main_documents.stats SET index_type = 'grouped({stemmer})';
""")
replace_bm25(con, oldstemmer)
con.close()
if __name__ == "__main__":
reindex_group('robustZE.db', 'robustZEgrouped.db')