#!/usr/bin/env python """ Zoekeend experimental information retrieval system using DuckDB Copyright (C) 2024 Djoerd Hiemstra This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: hiemstra@cs.ru.nl """ import argparse import pathlib import sys import duckdb import ir_datasets import ze_eval ze_datasets = { "rb04": "disks45/nocr/trec-robust-2004", "msm2": "msmarco-passage", "msm2dev": "msmarco-passage/trec-dl-2019/judged", "msm2tst": "msmarco-passage/trec-dl-2020/judged", "cran": "cranfield", } def fatal(message): """Print error message and exit.""" print(message, file=sys.stderr) sys.exit(1) # TODO: def zoekeend_index_bydict(args): # index_bydict test.db dataset --in dictionary --out dictionary # --max_size 99999 --algorithm bytepair --dryrun # out dictionary is dictionary for future index, if called again. # TODO: add to ze_search: report query cross entropy and Cost-in-Postings. def zoekeend_index(args): """ Create the index file for an Information Retrieval dataset. This index uses the standard DuckDB FTS extension. Based on: Hannes Mühleisen, Thaer Samar, Jimmy Lin, and Arjen de Vries, Old dogs are great at new tricks: Column stores for IR prototyping. In SIGIR 2014. """ import ze_index # defer imports, so no dependencies needed, unless used if args.dataset in ze_datasets: args.dataset = ze_datasets[args.dataset] try: if args.dataset == "custom": ir_dataset = ze_eval.ir_dataset_test() else: ir_dataset = ir_datasets.load(args.dataset) ze_index.index_documents( args.dbname, ir_dataset, stemmer=args.wordstemmer, stopwords=args.stopwords, keepcontent=args.keep_content, ) except ValueError as e: fatal(e) except KeyError as e: fatal("Unknown dataset: " + str(e)) def zoekeend_search(args): """ Run queries and create a run file in TREC output. The language model (lm) is based on: Djoerd Hiemstra, A probabilistic justification for using tf.idf term weighting in information retrieval, International Journal on Digital Libraries 3(2), 2000. """ import ze_search if not pathlib.Path(args.dbname).is_file(): fatal(f"Error: file {args.dbname} does not exist") if args.out and pathlib.Path(args.out).is_file(): fatal(f"Error: file {args.out} exists") if args.queries in ze_datasets: query_tag = ze_datasets[args.queries] else: query_tag = args.queries try: ze_search.search_run( args.dbname, query_tag, matcher=args.match, run_tag=args.run, k=args.bm25k, b=args.bm25b, limit=args.top, fileout=args.out, startq=args.start, endq=args.end, ) except FileNotFoundError: fatal(f"Error: queryset '{args.queries}' does not exist.") except ValueError as e: fatal(e) def zoekeend_eval(args): """Evaluate run using trec_eval""" import ze_eval if args.queries in ze_datasets: query_tag = ze_datasets[args.queries] else: query_tag = args.queries try: ze_eval.trec_eval( args.run, query_tag, args.complete_rel, args.ndcg, args.query_eval ) except (KeyError, AttributeError): fatal(f"Error: query/qrel set '{args.queries}' does not exist.") except ValueError as e: fatal(e) def zoekeend_vacuum(args): """Vacuum index to reclaim disk space.""" import ze_vacuum try: ze_vacuum.reclaim_disk_space(args.dbname, args.cluster) except (ValueError, FileNotFoundError): fatal(f"File not found: {args.dbname}") def zoekeend_index_import(args): """ Import a CIFF (Common Index File Format) index. Based on: Djoerd Hiemstra, Gijs Hendriksen, Chris Kamphuis, and Arjen de Vries, Challenges of index exchange for search engine interoperability, OSSYM 2023. (see also: zoekeend index_export) """ import ze_index_import if pathlib.Path(args.dbname).is_file(): fatal(f"Error: file {args.dbname} exists") if not pathlib.Path(args.ciff_file).is_file(): fatal(f"Error: file {args.ciff_file} does not exist") try: ze_index_import.ciff_import( args.dbname, args.ciff_file, tokenizer=args.tokenizer, stemmer=args.wordstemmer, ) except ValueError as e: fatal("Error in CIFF import: " + str(e)) def zoekeend_index_export(args): """ Export a CIFF (Common Index File Format) index. Based on: Jimmy Lin, Joel Mackenzie, Chris Kamphuis, Craig Macdonald, Antonio Mallia, Michał Siedlaczek, Andrew Trotman, and Arjen de Vries. Supporting interoperability between open-source search engines with the common index file format, SIGIR 2020; (see also: zoekeend index_import) """ import ze_index_export if not pathlib.Path(args.dbname).is_file(): fatal(f"Error: file {args.dbname} does not exist") if pathlib.Path(args.ciff_file).is_file(): fatal(f"Error: file {args.ciff_file} exists") try: ze_index_export.ciff_export( args.dbname, args.ciff_file, description=args.description, batch_size=args.batch_size, ) except ValueError as e: fatal("Error in CIFF export: " + str(e)) def zoekeend_reindex_prior(args): """ Recreate the index by including prior (static rank) scores. Based on: Wessel Kraaij, Thijs Westerveld and Djoerd Hiemstra, The Importance of Prior Probabilities for Entry Page Search, SIGIR 2002. """ import ze_reindex_prior if not pathlib.Path(args.dbname_in).is_file(): fatal(f"Error: file {args.dbname_in} does not exist") if pathlib.Path(args.dbname_out).is_file(): fatal(f"Error: file {args.dbname_out} exists") try: ze_reindex_prior.reindex_prior( args.dbname_in, args.dbname_out, csv_file=args.file, default=args.default, init=args.init, ) except Exception as e: fatal("Error in reindex prior: " + str(e)) def zoekeend_reindex_fitted(args): """ Recreate the index using by fitting document lengths (len) or prior scores (prior) using linear regression. The length / prior scores are removed from the new index. """ import ze_reindex_fitted if not pathlib.Path(args.dbname_in).is_file(): fatal(f"Error: file {args.dbname_in} does not exist") if pathlib.Path(args.dbname_out).is_file(): fatal(f"Error: file {args.dbname_out} exists") if args.qrls in ze_datasets: args.qrls = ze_datasets[args.qrls] try: ze_reindex_fitted.reindex_fitted_column( args.dbname_in, args.dbname_out, column=args.column, total=args.bins, print_sample=args.print, threshold=args.threshold, qrels=args.qrls, ) except ValueError as e: fatal("Error in reindex fitted: " + str(e)) def zoekeend_reindex_const(args): """ Recreate the index using by rescaling term frequencies such that all documents get an artificial length of CONST, using a normalization weight beta inspired by BM25 document length normalization. """ import ze_reindex_const if not pathlib.Path(args.dbname_in).is_file(): fatal(f"Error: file {args.dbname_in} does not exist") if pathlib.Path(args.dbname_out).is_file(): fatal(f"Error: file {args.dbname_out} exists") try: ze_reindex_const.reindex_const( args.dbname_in, args.dbname_out, const_len=args.const, b=args.beta, keep_terms=args.keepterms, ) except ValueError as e: fatal("Error in reindex const: " + str(e)) global_parser = argparse.ArgumentParser(prog="zoekeend") global_parser.add_argument( "-v", "--version", action="version", version="zoekeend v0.0.1 (using duckdb v" + duckdb.__version__ + ")", ) subparsers = global_parser.add_subparsers(metavar="subexperiment ...") index_parser = subparsers.add_parser( "index", help="create the index file for an IR dataset", description=zoekeend_index.__doc__, ) index_parser.set_defaults(func=zoekeend_index) index_parser.add_argument( "dbname", help="file name of index", ) index_parser.add_argument( "dataset", help="ir_dataset, see: https://ir-datasets.com", ) index_parser.add_argument( "-w", "--wordstemmer", help="word stemmer (default: none)", default="none", choices=["none", "porter", "dutch"], ) index_parser.add_argument( "-s", "--stopwords", help="stop words (default: none)", default="none", choices=["none", "english"], ) index_parser.add_argument( "-k", "--keep_content", help="keep the document content column", action="store_true", ) reindex_prior_parser = subparsers.add_parser( "reindex_prior", help="recreate the index including prior scores", description=zoekeend_reindex_prior.__doc__, ) reindex_prior_parser.set_defaults(func=zoekeend_reindex_prior) reindex_prior_parser.add_argument( "dbname_in", help="file name of old index", ) reindex_prior_parser.add_argument( "dbname_out", help="file name of new index with priors", ) reindex_prior_parser.add_argument( "-i", "--init", help="initialize with standard prior ('len' or 'uniform')", choices=["len", "uniform"], ) reindex_prior_parser.add_argument( "-f", "--file", help="file with comma-separated (did,prior) pairs", ) reindex_prior_parser.add_argument( "-d", "--default", help="default prior for documents missing in the file", type=float, ) reindex_fitted_parser = subparsers.add_parser( "reindex_fitted", help="recreate the index by fitting prior scores", description=zoekeend_reindex_fitted.__doc__, ) reindex_fitted_parser.set_defaults(func=zoekeend_reindex_fitted) reindex_fitted_parser.add_argument( "dbname_in", help="file name of old index", ) reindex_fitted_parser.add_argument( "dbname_out", help="file name of new fitted index", ) reindex_fitted_parser.add_argument( "-c", "--column", help="column to be used for fitting (default: prior)", default="prior", choices=["len", "prior"], ) reindex_fitted_parser.add_argument( "-b", "--bins", help="number of bins", type=int, ) reindex_fitted_parser.add_argument( "-p", "--print", help="print sample used for fitting", action="store_true", ) reindex_fitted_parser.add_argument( "-q", "--qrls", help="training queries/qrels", ) reindex_fitted_parser.add_argument( "-t", "--threshold", help="prior values <= threshold are ignored (default: 0)", default=0, type=int, ) reindex_const_parser = subparsers.add_parser( "reindex_const", help="recreate the index by rescaling term frequencies", description=zoekeend_reindex_const.__doc__, ) reindex_const_parser.set_defaults(func=zoekeend_reindex_const) reindex_const_parser.add_argument( "dbname_in", help="file name of old index", ) reindex_const_parser.add_argument( "dbname_out", help="file name of new fitted index", ) reindex_const_parser.add_argument( "-c", "--const", help="constant document length (default: 400)", type=int, default=400, ) reindex_const_parser.add_argument( "-b", "--beta", help="length normalization parameter (default: 1.0)", type=float, default=1.0, ) reindex_const_parser.add_argument( "-k", "--keepterms", action="store_true", help="keep all terms, even if new tf is small", ) search_parser = subparsers.add_parser( "search", help="execute queries and create run output", description=zoekeend_search.__doc__, ) search_parser.set_defaults(func=zoekeend_search) search_parser.add_argument( "dbname", help="file name of index", ) search_parser.add_argument( "queries", help="ir_dataset queries id or tab-separated query file", ) search_parser.add_argument( "-r", "--run", help="run tag", ) search_parser.add_argument( "-t", "--top", type=int, default=1000, help="amount of top results (default: 1000)", ) search_parser.add_argument( "-o", "--out", help="the run file to be outputted (default: stdout)" ) search_parser.add_argument( "-m", "--match", help="match function: languge models (default) or bm25", default="lm", choices=["lm", "bm25"], ) search_parser.add_argument( "-l", "--lmbda", help="lm lambda parameter (default: 0.3)", type=float, default=0.3 ) search_parser.add_argument( "-k", "--bm25k", help="bm25 k parameter (default: 0.9)", type=float, default=0.9 ) search_parser.add_argument( "-b", "--bm25b", help="bm25 b parameter (default: 0.4)", type=float, default=0.4 ) search_parser.add_argument( "-s", "--start", help="start identifier of query", type=int, ) search_parser.add_argument( "-e", "--end", help="end identifier of query", type=int, ) vacuum_parser = subparsers.add_parser( "vacuum", help="vacuum index to reclaim disk space", description=zoekeend_vacuum.__doc__, ) vacuum_parser.set_defaults(func=zoekeend_vacuum) vacuum_parser.add_argument( "dbname", help="file name of index", ) vacuum_parser.add_argument("-c", "--cluster", action="store_true", help="cluster index") eval_parser = subparsers.add_parser( "eval", help="evaluate run using trec_eval", description=zoekeend_eval.__doc__ ) eval_parser.set_defaults(func=zoekeend_eval) eval_parser.add_argument( "run", help="trec run file", ) eval_parser.add_argument( "queries", help="ir_dataset queries id or trec qrel file", ) eval_parser.add_argument( "-c", "--complete_rel", action="store_true", help="queries with missing results contribute a value of 0", ) eval_parser.add_argument( "-n", "--ndcg", action="store_true", help="add normalized discounted cummaltive gain (ndcg)", ) eval_parser.add_argument( "-q", "--query_eval", action="store_true", help="give evaluation for each query/topic", ) index_import_parser = subparsers.add_parser( "index_import", help="import ciff index", description=zoekeend_index_import.__doc__ ) index_import_parser.set_defaults(func=zoekeend_index_import) index_import_parser.add_argument( "dbname", help="file name of index", ) index_import_parser.add_argument( "ciff_file", help="ciff file", ) index_import_parser.add_argument( "-t", "--tokenizer", help="tokenizer (default: ciff)", default="ciff", choices=["ciff", "duckdb"], ) index_import_parser.add_argument( "-w", "--wordstemmer", help="word stemmer (default: none)", default="none", choices=["none", "porter", "dutch"], ) index_export_parser = subparsers.add_parser( "index_export", help="export ciff index", description=zoekeend_index_import.__doc__ ) index_export_parser.set_defaults(func=zoekeend_index_export) index_export_parser.add_argument( "dbname", help="file name of index", ) index_export_parser.add_argument( "ciff_file", help="ciff file", ) index_export_parser.add_argument( "-d", "--description", help="CIFF description (default: Exported from DuckDB)", default="Exported from DuckDB", ) index_export_parser.add_argument( "-b", "--batch-size", help="batch size (default: 1024)", default=1024, type=int, ) parsed_args = global_parser.parse_args() if hasattr(parsed_args, "func"): parsed_args.func(parsed_args) else: global_parser.print_usage()