mirror of
https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
synced 2025-10-26 16:24:21 +00:00
50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
import pathlib
import shutil

import duckdb
|
|
|
|
|
|
def copy_file_force(name_in, name_out):
    """Copy the file ``name_in`` to ``name_out``, overwriting the target.

    The contents are streamed by ``shutil.copyfile`` instead of being read
    into memory in one piece, so large files can be copied without holding
    a full copy of them in RAM.

    Args:
        name_in: Path of the existing file to copy.
        name_out: Path to write; an existing file is overwritten.

    Raises:
        ValueError: If ``name_in`` does not exist or is not a regular file.
    """
    source = pathlib.Path(name_in)
    if not source.is_file():
        raise ValueError(f"File {name_in} does not exist.")
    try:
        shutil.copyfile(source, pathlib.Path(name_out))
    except shutil.SameFileError:
        # Source and target are the same file: the content is already in
        # place, so there is nothing to copy.
        pass
|
|
|
|
|
|
def rm_file(name):
    """Delete the file at path ``name``.

    Any error raised by ``pathlib.Path.unlink`` (for example
    ``FileNotFoundError`` when the file is missing) propagates to the caller.
    """
    pathlib.Path(name).unlink()
|
|
|
|
|
|
def cluster_index(con):
    """Rewrite the tables of schema ``fts_main_documents`` in sorted order.

    Each of ``terms``, ``dict`` and ``docs`` is copied into a new table
    ordered by its key column(s), the old table is dropped, and the sorted
    copy is renamed to the original name.

    The batch starts with ``USE fts_main_documents``, so that schema is
    still the connection's current schema when this function returns.

    Args:
        con: Open connection exposing a ``sql`` method that accepts a
            multi-statement SQL string.
    """
    # One batch, executed in order: every table is rebuilt before the
    # next one is touched.
    recluster_sql = """
        USE fts_main_documents;
        CREATE TABLE terms_new AS SELECT * FROM terms ORDER BY termid, docid;
        DROP TABLE terms;
        ALTER TABLE terms_new RENAME TO terms;
        CREATE TABLE dict_new AS SELECT * FROM dict ORDER BY term;
        DROP TABLE dict;
        ALTER TABLE dict_new RENAME TO dict;
        CREATE TABLE docs_new AS SELECT * FROM docs ORDER BY docid;
        DROP TABLE docs;
        ALTER TABLE docs_new RENAME TO docs;
    """
    con.sql(recluster_sql)
|
|
|
|
|
|
def reclaim_disk_space(name, cluster=True):
    """Rebuild the database file ``name`` so that unused space is released.

    DuckDB does not reclaim disk space automatically, so the database is
    copied instead: the file is duplicated to ``<name>.tmp``, the original
    is deleted, and the contents of the temporary copy are copied into a
    freshly created database at ``name``.

    If a step fails after the original file has been removed, the
    temporary copy is left on disk, because at that point it is the only
    remaining copy of the data.

    Args:
        name: Path of the database file (``str`` or path-like object).
        cluster: When true, ``cluster_index`` is run on the temporary copy
            before the data is copied back.
    """
    name = str(name)  # also accept pathlib.Path for the concatenation below
    tmpname = name + '.tmp'
    # Double single quotes so the paths stay valid SQL string literals.
    quoted_tmp = tmpname.replace("'", "''")
    quoted_name = name.replace("'", "''")

    copy_file_force(name, tmpname)
    con = duckdb.connect(tmpname)
    try:
        if cluster:
            cluster_index(con)
        # The original is removed only once the (optionally clustered)
        # temporary copy exists; ATTACH then creates a new file at `name`.
        rm_file(name)
        # NOTE(review): tmpname is already this connection's database;
        # confirm the targeted DuckDB version accepts attaching the same
        # file again under the alias tmpdb.
        con.sql(f"""
            ATTACH '{quoted_tmp}' AS tmpdb;
            ATTACH '{quoted_name}' AS db;
            COPY FROM DATABASE tmpdb TO db;
        """)
    finally:
        # Always release the connection, including on failure.
        con.close()
    rm_file(tmpname)
|
|
|