#!/usr/bin/env python
"""
Zoekeend experimental information retrieval system using DuckDB
Copyright (C) 2024 Djoerd Hiemstra
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact: hiemstra@cs.ru.nl
"""
import argparse
import pathlib
import sys
import duckdb
import ir_datasets
import ze_eval
ze_datasets = {
"rb04": "disks45/nocr/trec-robust-2004",
"msm2": "msmarco-passage",
"msm2dev": "msmarco-passage/trec-dl-2019/judged",
"msm2tst": "msmarco-passage/trec-dl-2020/judged",
"cran": "cranfield",
}
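# The subcommands below repeatedly resolve these aliases with
# "if x in ze_datasets: x = ze_datasets[x]"; a hypothetical helper capturing
# that step (for illustration only, not wired into the CLI):
def resolve_dataset(name):
    """Resolve a zoekeend alias, e.g. "rb04" -> "disks45/nocr/trec-robust-2004"."""
    return ze_datasets.get(name, name)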
def fatal(message):
"""Print error message and exit."""
print(message, file=sys.stderr)
sys.exit(1)
# TODO: def zoekeend_index_bydict(args):
#   index_bydict test.db dataset --in dictionary --out dictionary
#   --max_size 99999 --algorithm bytepair --dryrun
#   The --out dictionary can serve as the --in dictionary of a future index.
# TODO: add to ze_search: report query cross entropy and Cost-in-Postings.
def zoekeend_index(args):
"""
Create the index file for an Information Retrieval dataset.
This index uses the standard DuckDB FTS extension. Based on:
Hannes Mühleisen, Thaer Samar, Jimmy Lin, and Arjen de Vries, Old dogs
are great at new tricks: Column stores for IR prototyping. In SIGIR 2014.
"""
    import ze_index  # deferred import: dependencies are only needed when used
if args.dataset in ze_datasets:
args.dataset = ze_datasets[args.dataset]
try:
if args.dataset == "custom":
ir_dataset = ze_eval.ir_dataset_test()
else:
ir_dataset = ir_datasets.load(args.dataset)
ze_index.index_documents(
args.dbname,
ir_dataset,
stemmer=args.wordstemmer,
stopwords=args.stopwords,
keepcontent=args.keep_content,
)
except ValueError as e:
fatal(e)
except KeyError as e:
fatal("Unknown dataset: " + str(e))
def zoekeend_search(args):
"""
    Run queries and create a run file in TREC format.
The language model (lm) is based on: Djoerd Hiemstra, A probabilistic
justification for using tf.idf term weighting in information retrieval,
International Journal on Digital Libraries 3(2), 2000.
"""
import ze_search
if not pathlib.Path(args.dbname).is_file():
fatal(f"Error: file {args.dbname} does not exist")
if args.out and pathlib.Path(args.out).is_file():
fatal(f"Error: file {args.out} exists")
if args.queries in ze_datasets:
query_tag = ze_datasets[args.queries]
else:
query_tag = args.queries
try:
ze_search.search_run(
args.dbname,
query_tag,
matcher=args.match,
run_tag=args.run,
k=args.bm25k,
b=args.bm25b,
limit=args.top,
fileout=args.out,
startq=args.start,
endq=args.end,
)
except FileNotFoundError:
fatal(f"Error: queryset '{args.queries}' does not exist.")
except ValueError as e:
fatal(e)
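# A sketch of the lm matcher's weighting (Hiemstra 2000), where document d
# scores the sum over matching query terms t of
#   log(1 + (lmbda * tf(t,d) * total_df) / ((1 - lmbda) * df(t) * len(d))).
# The variable names and estimator form are illustrative; the real SQL lives
# in ze_search.
def lm_score_sketch(term_stats, doclen, total_df, lmbda=0.3):
    """term_stats: per query term, a (tf_in_document, collection_df) pair."""
    import math
    score = 0.0
    for tf, df in term_stats:
        if tf > 0:  # only terms that occur in the document contribute
            score += math.log(
                1 + (lmbda * tf * total_df) / ((1 - lmbda) * df * doclen)
            )
    return score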
def zoekeend_eval(args):
"""Evaluate run using trec_eval"""
if args.queries in ze_datasets:
query_tag = ze_datasets[args.queries]
else:
query_tag = args.queries
try:
ze_eval.trec_eval(
args.run, query_tag, args.complete_rel, args.ndcg, args.query_eval
)
except (KeyError, AttributeError):
fatal(f"Error: query/qrel set '{args.queries}' does not exist.")
except ValueError as e:
fatal(e)
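# trec_eval consumes runs in the standard six-column TREC format,
#   qid Q0 docid rank score run_tag
# which is what zoekeend search emits. A line-formatting sketch (the field
# values are made up for illustration):
def trec_run_line_sketch(qid, docid, rank, score, tag="zoekeend"):
    return f"{qid} Q0 {docid} {rank} {score:.4f} {tag}"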
def zoekeend_vacuum(args):
"""Vacuum index to reclaim disk space."""
import ze_vacuum
try:
ze_vacuum.reclaim_disk_space(args.dbname, args.cluster)
    except (ValueError, FileNotFoundError) as e:
        fatal("Error in vacuum: " + str(e))
def zoekeend_index_import(args):
"""
Import a CIFF (Common Index File Format) index.
Based on: Djoerd Hiemstra, Gijs Hendriksen, Chris Kamphuis, and
Arjen de Vries, Challenges of index exchange for search engine
interoperability, OSSYM 2023. (see also: zoekeend index_export)
"""
import ze_index_import
if pathlib.Path(args.dbname).is_file():
fatal(f"Error: file {args.dbname} exists")
if not pathlib.Path(args.ciff_file).is_file():
fatal(f"Error: file {args.ciff_file} does not exist")
try:
ze_index_import.ciff_import(
args.dbname,
args.ciff_file,
tokenizer=args.tokenizer,
stemmer=args.wordstemmer,
)
except ValueError as e:
fatal("Error in CIFF import: " + str(e))
def zoekeend_index_export(args):
"""
Export a CIFF (Common Index File Format) index.
Based on: Jimmy Lin, Joel Mackenzie, Chris Kamphuis, Craig Macdonald,
Antonio Mallia, Michał Siedlaczek, Andrew Trotman, and Arjen de Vries.
Supporting interoperability between open-source search engines with the
common index file format, SIGIR 2020; (see also: zoekeend index_import)
"""
import ze_index_export
if not pathlib.Path(args.dbname).is_file():
fatal(f"Error: file {args.dbname} does not exist")
if pathlib.Path(args.ciff_file).is_file():
fatal(f"Error: file {args.ciff_file} exists")
try:
ze_index_export.ciff_export(
args.dbname,
args.ciff_file,
description=args.description,
batch_size=args.batch_size,
)
except ValueError as e:
fatal("Error in CIFF export: " + str(e))
def zoekeend_reindex_prior(args):
"""
Recreate the index by including prior (static rank) scores.
Based on: Wessel Kraaij, Thijs Westerveld and Djoerd Hiemstra,
The Importance of Prior Probabilities for Entry Page Search,
SIGIR 2002.
"""
import ze_reindex_prior
if not pathlib.Path(args.dbname_in).is_file():
fatal(f"Error: file {args.dbname_in} does not exist")
if pathlib.Path(args.dbname_out).is_file():
fatal(f"Error: file {args.dbname_out} exists")
try:
ze_reindex_prior.reindex_prior(
args.dbname_in,
args.dbname_out,
csv_file=args.file,
default=args.default,
init=args.init,
)
except Exception as e:
fatal("Error in reindex prior: " + str(e))
def zoekeend_reindex_fitted(args):
"""
    Recreate the index by fitting document lengths (len) or prior
scores (prior) using linear regression. The length / prior scores
are removed from the new index.
"""
import ze_reindex_fitted
if not pathlib.Path(args.dbname_in).is_file():
fatal(f"Error: file {args.dbname_in} does not exist")
if pathlib.Path(args.dbname_out).is_file():
fatal(f"Error: file {args.dbname_out} exists")
if args.qrls in ze_datasets:
args.qrls = ze_datasets[args.qrls]
try:
ze_reindex_fitted.reindex_fitted_column(
args.dbname_in,
args.dbname_out,
column=args.column,
total=args.bins,
print_sample=args.print,
threshold=args.threshold,
qrels=args.qrls,
)
except ValueError as e:
fatal("Error in reindex fitted: " + str(e))
def zoekeend_reindex_const(args):
"""
    Recreate the index by rescaling term frequencies such that all
documents get an artificial length of CONST, using a normalization
weight beta inspired by BM25 document length normalization.
"""
import ze_reindex_const
if not pathlib.Path(args.dbname_in).is_file():
fatal(f"Error: file {args.dbname_in} does not exist")
if pathlib.Path(args.dbname_out).is_file():
fatal(f"Error: file {args.dbname_out} exists")
try:
ze_reindex_const.reindex_const(
args.dbname_in,
args.dbname_out,
const_len=args.const,
b=args.beta,
keep_terms=args.keepterms,
)
except ValueError as e:
fatal("Error in reindex const: " + str(e))
global_parser = argparse.ArgumentParser(prog="zoekeend")
global_parser.add_argument(
"-v",
"--version",
action="version",
version="zoekeend v0.0.1 (using duckdb v" + duckdb.__version__ + ")",
)
subparsers = global_parser.add_subparsers(metavar="subexperiment ...")
index_parser = subparsers.add_parser(
"index",
help="create the index file for an IR dataset",
description=zoekeend_index.__doc__,
)
index_parser.set_defaults(func=zoekeend_index)
index_parser.add_argument(
"dbname",
help="file name of index",
)
index_parser.add_argument(
"dataset",
help="ir_dataset, see: https://ir-datasets.com",
)
index_parser.add_argument(
"-w",
"--wordstemmer",
help="word stemmer (default: none)",
default="none",
choices=["none", "porter", "dutch"],
)
index_parser.add_argument(
"-s",
"--stopwords",
help="stop words (default: none)",
default="none",
choices=["none", "english"],
)
index_parser.add_argument(
"-k",
"--keep_content",
help="keep the document content column",
action="store_true",
)
reindex_prior_parser = subparsers.add_parser(
"reindex_prior",
help="recreate the index including prior scores",
description=zoekeend_reindex_prior.__doc__,
)
reindex_prior_parser.set_defaults(func=zoekeend_reindex_prior)
reindex_prior_parser.add_argument(
"dbname_in",
help="file name of old index",
)
reindex_prior_parser.add_argument(
"dbname_out",
help="file name of new index with priors",
)
reindex_prior_parser.add_argument(
"-i",
"--init",
help="initialize with standard prior ('len' or 'uniform')",
choices=["len", "uniform"],
)
reindex_prior_parser.add_argument(
"-f",
"--file",
help="file with comma-separated (did,prior) pairs",
)
reindex_prior_parser.add_argument(
"-d",
"--default",
help="default prior for documents missing in the file",
type=float,
)
reindex_fitted_parser = subparsers.add_parser(
"reindex_fitted",
help="recreate the index by fitting prior scores",
description=zoekeend_reindex_fitted.__doc__,
)
reindex_fitted_parser.set_defaults(func=zoekeend_reindex_fitted)
reindex_fitted_parser.add_argument(
"dbname_in",
help="file name of old index",
)
reindex_fitted_parser.add_argument(
"dbname_out",
help="file name of new fitted index",
)
reindex_fitted_parser.add_argument(
"-c",
"--column",
help="column to be used for fitting (default: prior)",
default="prior",
choices=["len", "prior"],
)
reindex_fitted_parser.add_argument(
"-b",
"--bins",
help="number of bins",
type=int,
)
reindex_fitted_parser.add_argument(
"-p",
"--print",
help="print sample used for fitting",
action="store_true",
)
reindex_fitted_parser.add_argument(
"-q",
"--qrls",
help="training queries/qrels",
)
reindex_fitted_parser.add_argument(
"-t",
"--threshold",
help="prior values <= threshold are ignored (default: 0)",
default=0,
type=int,
)
reindex_const_parser = subparsers.add_parser(
"reindex_const",
help="recreate the index by rescaling term frequencies",
description=zoekeend_reindex_const.__doc__,
)
reindex_const_parser.set_defaults(func=zoekeend_reindex_const)
reindex_const_parser.add_argument(
"dbname_in",
help="file name of old index",
)
reindex_const_parser.add_argument(
"dbname_out",
help="file name of new fitted index",
)
reindex_const_parser.add_argument(
"-c",
"--const",
help="constant document length (default: 400)",
type=int,
default=400,
)
reindex_const_parser.add_argument(
"-b",
"--beta",
help="length normalization parameter (default: 1.0)",
type=float,
default=1.0,
)
reindex_const_parser.add_argument(
"-k",
"--keepterms",
action="store_true",
help="keep all terms, even if new tf is small",
)
search_parser = subparsers.add_parser(
"search",
help="execute queries and create run output",
description=zoekeend_search.__doc__,
)
search_parser.set_defaults(func=zoekeend_search)
search_parser.add_argument(
"dbname",
help="file name of index",
)
search_parser.add_argument(
"queries",
help="ir_dataset queries id or tab-separated query file",
)
search_parser.add_argument(
"-r",
"--run",
help="run tag",
)
search_parser.add_argument(
"-t",
"--top",
type=int,
default=1000,
help="amount of top results (default: 1000)",
)
search_parser.add_argument(
    "-o", "--out", help="output run file (default: stdout)"
)
search_parser.add_argument(
"-m",
"--match",
help="match function: languge models (default) or bm25",
default="lm",
choices=["lm", "bm25"],
)
search_parser.add_argument(
"-l", "--lmbda", help="lm lambda parameter (default: 0.3)", type=float, default=0.3
)
search_parser.add_argument(
"-k", "--bm25k", help="bm25 k parameter (default: 0.9)", type=float, default=0.9
)
search_parser.add_argument(
"-b", "--bm25b", help="bm25 b parameter (default: 0.4)", type=float, default=0.4
)
search_parser.add_argument(
"-s",
"--start",
help="start identifier of query",
type=int,
)
search_parser.add_argument(
"-e",
"--end",
help="end identifier of query",
type=int,
)
vacuum_parser = subparsers.add_parser(
"vacuum",
help="vacuum index to reclaim disk space",
description=zoekeend_vacuum.__doc__,
)
vacuum_parser.set_defaults(func=zoekeend_vacuum)
vacuum_parser.add_argument(
"dbname",
help="file name of index",
)
vacuum_parser.add_argument("-c", "--cluster", action="store_true", help="cluster index")
eval_parser = subparsers.add_parser(
"eval", help="evaluate run using trec_eval", description=zoekeend_eval.__doc__
)
eval_parser.set_defaults(func=zoekeend_eval)
eval_parser.add_argument(
"run",
help="trec run file",
)
eval_parser.add_argument(
"queries",
help="ir_dataset queries id or trec qrel file",
)
eval_parser.add_argument(
"-c",
"--complete_rel",
action="store_true",
help="queries with missing results contribute a value of 0",
)
eval_parser.add_argument(
"-n",
"--ndcg",
action="store_true",
help="add normalized discounted cummaltive gain (ndcg)",
)
eval_parser.add_argument(
"-q",
"--query_eval",
action="store_true",
help="give evaluation for each query/topic",
)
index_import_parser = subparsers.add_parser(
    "index_import", help="import CIFF index", description=zoekeend_index_import.__doc__
)
index_import_parser.set_defaults(func=zoekeend_index_import)
index_import_parser.add_argument(
"dbname",
help="file name of index",
)
index_import_parser.add_argument(
"ciff_file",
help="ciff file",
)
index_import_parser.add_argument(
"-t",
"--tokenizer",
help="tokenizer (default: ciff)",
default="ciff",
choices=["ciff", "duckdb"],
)
index_import_parser.add_argument(
"-w",
"--wordstemmer",
help="word stemmer (default: none)",
default="none",
choices=["none", "porter", "dutch"],
)
index_export_parser = subparsers.add_parser(
    "index_export", help="export CIFF index", description=zoekeend_index_export.__doc__
)
index_export_parser.set_defaults(func=zoekeend_index_export)
index_export_parser.add_argument(
"dbname",
help="file name of index",
)
index_export_parser.add_argument(
"ciff_file",
help="ciff file",
)
index_export_parser.add_argument(
"-d",
"--description",
help="CIFF description (default: Exported from DuckDB)",
default="Exported from DuckDB",
)
index_export_parser.add_argument(
"-b",
"--batch-size",
help="batch size (default: 1024)",
default=1024,
type=int,
)
parsed_args = global_parser.parse_args()
if hasattr(parsed_args, "func"):
parsed_args.func(parsed_args)
else:
global_parser.print_usage()
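# A hypothetical end-to-end session (file names invented for illustration;
# "cran" is one of the dataset aliases defined above):
#   ./zoekeend index cran.db cran
#   ./zoekeend search cran.db cran --out cran.run
#   ./zoekeend eval cran.run cran --ndcg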