Zoekeend-Phrase-Indexing/ze_vacuum.py
2025-08-19 17:23:02 +02:00

50 lines
1.3 KiB
Python

import duckdb
import pathlib
def copy_file_force(name_in, name_out):
    """Copy the file ``name_in`` to ``name_out``, overwriting any existing file.

    The contents are streamed in chunks rather than read into memory in one
    piece, so large database files can be copied without a matching amount
    of RAM.

    Args:
        name_in: Path of the existing file to copy.
        name_out: Path of the destination file; replaced if it exists.

    Raises:
        ValueError: If ``name_in`` is not an existing regular file.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import shutil

    source = pathlib.Path(name_in)
    if not source.is_file():
        raise ValueError(f"File {name_in} does not exist.")
    target = pathlib.Path(name_out)
    try:
        shutil.copyfile(source, target)
    except shutil.SameFileError:
        # Source and destination are the same file: the content is already
        # in place, so there is nothing to do.
        pass
def rm_file(name):
    """Delete the file at path ``name``.

    Raises whatever ``pathlib.Path.unlink`` raises, e.g. ``FileNotFoundError``
    when the file does not exist.
    """
    pathlib.Path(name).unlink()
def cluster_index(con):
    """Rewrite the index tables in ``fts_main_documents`` in sorted order.

    Each of ``terms``, ``dict`` and ``docs`` is rebuilt by creating a sorted
    copy, dropping the original and renaming the copy back:

    * ``terms`` is ordered by ``termid, docid``
    * ``dict`` is ordered by ``term``
    * ``docs`` is ordered by ``docid``

    Args:
        con: Open connection on which the SQL script is run via ``con.sql``.
            The script issues ``USE fts_main_documents`` on this connection.
    """
    reorder_script = """
        USE fts_main_documents;
        CREATE TABLE terms_new AS SELECT * FROM terms ORDER BY termid, docid;
        DROP TABLE terms;
        ALTER TABLE terms_new RENAME TO terms;
        CREATE TABLE dict_new AS SELECT * FROM dict ORDER BY term;
        DROP TABLE dict;
        ALTER TABLE dict_new RENAME TO dict;
        CREATE TABLE docs_new AS SELECT * FROM docs ORDER BY docid;
        DROP TABLE docs;
        ALTER TABLE docs_new RENAME TO docs;
    """
    con.sql(reorder_script)
def reclaim_disk_space(name, cluster=True):
    """Rebuild the database file ``name`` so that unused space is released.

    DuckDB does not reclaim disk space automatically, so the database is
    copied to ``<name>.tmp``, optionally clustered there, and then copied
    back into a freshly created file at ``name`` with ``COPY FROM DATABASE``.

    Args:
        name: Path of the database file (``str`` or path-like).
        cluster: When true, run ``cluster_index`` on the temporary copy
            before copying the data back.

    Raises:
        ValueError: If ``name`` is not an existing file (raised by
            ``copy_file_force``).

    Note:
        If clustering or the copy fails, the temporary file is deliberately
        left on disk: at that point it may hold the only complete copy of
        the data.
    """
    # f-string formatting (rather than ``name + '.tmp'``) also accepts
    # pathlib.Path arguments, which the file helpers already support.
    tmpname = f"{name}.tmp"
    copy_file_force(name, tmpname)

    # File names are embedded in SQL string literals below; double any
    # single quote so that such paths do not break the statements.
    tmp_literal = tmpname.replace("'", "''")
    db_literal = str(name).replace("'", "''")

    con = duckdb.connect(tmpname)
    try:
        if cluster:
            cluster_index(con)
        # Remove the original so ATTACH creates a brand-new, compact file.
        rm_file(name)
        con.sql(f"""
            ATTACH '{tmp_literal}' AS tmpdb;
            ATTACH '{db_literal}' AS db;
            COPY FROM DATABASE tmpdb TO db;
        """)
    finally:
        # Always release the connection, even when a step above raised.
        con.close()
    rm_file(tmpname)