mirror of https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
synced 2025-10-26 16:24:21 +00:00

Improved code
commit e7a025deb1

20 .gitignore vendored Normal file
@@ -0,0 +1,20 @@
cranfield.db
venv
__pycache__
cranfield.qrels
cranfieldoutput
/duckdb-fts-main/
/trec_eval/
*.db
*.ciff
output*.txt
results*.txt
*.txt
/results/
/resultszoekeend/
/oldresults/
*.ciff.gz
INSTALL
custom.qrels
custom_index
database.db.wal
18 README.md Normal file
@@ -0,0 +1,18 @@
## How to use

Run `python3 phrase_index.py` with any of the parameters listed below:

```
-h, --help              show this help message and exit
--db DB                 Database file name
--dataset DATASET       ir_datasets name (e.g., cranfield, msmarco-passage)
--stopwords STOPWORDS   Stopwords to use (english, none)
--mode MODE             Indexing mode (duckdb, phrases)
--min-freq MIN_FREQ     Minimum frequency for phrases (only for mode "phrases")
--min-pmi MIN_PMI       Minimum PMI for phrases (only for mode "phrases")
```
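
For example, a typical invocation that builds a phrase index for the Cranfield collection could look like this (the flag values shown are illustrative, not recommended defaults):

```
python3 phrase_index.py --db cranfield.db --dataset cranfield --stopwords english --mode phrases --min-freq 4 --min-pmi 8.0
```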

## Helper scripts
- `./auto_phrase.sh` and `./auto_zoekeend.sh` can be used to automatically index, search and evaluate a run, and to store the results in a results directory (see the workflow sketch below). `auto_phrase` uses `phrase_index.py`, while `auto_zoekeend` uses `ze_index.py`.

- `./batch_phrase.sh` can be used to produce results for multiple parameter combinations in one go.

- `./display_results.sh` can be used to display the evaluation metrics of all previous results (MAP, CiP, dictionary size, terms size, number of phrases, AVGDL and SUMDF).
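
As a rough sketch of the intended workflow (note that `batch_phrase.sh` and `auto_zoekeend.sh` change to the parent directory themselves, and `display_results.sh` reads `../results/`, so run each script from the directory it expects):

```
./auto_phrase.sh        # index, search and evaluate one configuration
./batch_phrase.sh       # sweep the parameter grid defined at the top of the script
./display_results.sh    # print a tab-separated summary of all stored runs
```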
64 helper_scripts/auto_phrase.sh Executable file
@@ -0,0 +1,64 @@
#!/bin/bash
set -e

# Settings
DB="database.db"
OUT="results.txt"
DATASET="cranfield"
QUERY="cran"
STOPWORDS="english"
MODE="duckdb"
CODE="phrase_index.py"
EXTRACTOR="phrases_extractor.py"
LIMIT=-1
MIN_FREQ=9
MIN_PMI=4.0

# Remove old files if they exist
[ -f "$DB" ] && rm "$DB"
[ -f "$OUT" ] && rm "$OUT"
[ -f eval.txt ] && rm eval.txt

# Timestamped results directory
RUN_ID=$(date +"%Y%m%d_%H%M%S")
RESULTS_DIR="results/$RUN_ID"


# Step 1: Build the index
python "$CODE" --db "$DB" --dataset "$DATASET" --stopwords "$STOPWORDS" --mode "$MODE" --limit "$LIMIT" --min-freq "$MIN_FREQ" --min-pmi "$MIN_PMI"

# Step 2: Search
./zoekeend search "$DB" "$QUERY" -o "$OUT"

# Step 3: Evaluate
./zoekeend eval "$OUT" "$QUERY" | tee eval.txt

# Save all outputs and settings
mkdir -p "$RESULTS_DIR"
mv "$DB" "$RESULTS_DIR/"
mv "$OUT" "$RESULTS_DIR/"
mv eval.txt "$RESULTS_DIR/"
cp "$CODE" "$RESULTS_DIR/"
cp "$EXTRACTOR" "$RESULTS_DIR/"

# Save settings
cat > "$RESULTS_DIR/settings.txt" <<EOF
DB: $DB
OUT: $OUT
DATASET: $DATASET
QUERY: $QUERY
STOPWORDS: $STOPWORDS
MODE: $MODE
LIMIT: $LIMIT
MIN_FREQ: $MIN_FREQ
MIN_PMI: $MIN_PMI
RUN_ID: $RUN_ID
EOF

# Remove temporary files
rm -f "$DB" "$OUT" eval.txt

echo ""
echo "Done. Results stored in $RESULTS_DIR"
echo "duckdb -ui $RESULTS_DIR/$DB"
ls "$RESULTS_DIR"
52 helper_scripts/auto_zoekeend.sh Executable file
@@ -0,0 +1,52 @@
#!/bin/bash
set -e

# Settings
DB="database.db"
OUT="results.txt"
DATASET="cranfield"
QUERY="cran"
STOPWORDS="english"

# Remove old files if they exist
[ -f "$DB" ] && rm "$DB"
[ -f "$OUT" ] && rm "$OUT"
[ -f eval.txt ] && rm eval.txt

# Timestamped results directory
RUN_ID=$(date +"%Y%m%d_%H%M%S")
RESULTS_DIR="resultszoekeend/$RUN_ID"


cd ..

# Step 1: Build the index
python ./zoekeend index "$DB" "$DATASET" -s "$STOPWORDS"

# Step 2: Search
./zoekeend search "$DB" "$QUERY" -o "$OUT"

# Step 3: Evaluate
./zoekeend eval "$OUT" "$QUERY" | tee eval.txt

# Save all outputs and settings
mkdir -p "$RESULTS_DIR"
mv "$DB" "$RESULTS_DIR/"
mv "$OUT" "$RESULTS_DIR/"
mv eval.txt "$RESULTS_DIR/"

# Save settings
cat > "$RESULTS_DIR/settings.txt" <<EOF
DB: $DB
OUT: $OUT
DATASET: $DATASET
QUERY: $QUERY
STOPWORDS: $STOPWORDS
RUN_ID: $RUN_ID
EOF

# Remove temporary files
rm -f "$DB" "$OUT" eval.txt

echo "Done. Results stored in $RESULTS_DIR"
ls -lh "$RESULTS_DIR"
79 helper_scripts/batch_phrase.sh Executable file
@@ -0,0 +1,79 @@
#!/bin/bash
set -e

DB_BASE="database"
OUT_BASE="results"
DATASET="cranfield"
QUERY="cran"
INDEXER="phrase_index.py"

STOPWORDS_LIST=("english" "none")
MODE_LIST=("duckdb" "phrases")
LIMIT_LIST=(-1)
MIN_FREQ_LIST=(4 5 6 7 9 10 11)
MIN_PMI_LIST=(5 6 7 8 9 10 11 12 13 14)

cd ..

for STOPWORDS in "${STOPWORDS_LIST[@]}"; do
  for MODE in "${MODE_LIST[@]}"; do
    for LIMIT in "${LIMIT_LIST[@]}"; do
      for MIN_FREQ in "${MIN_FREQ_LIST[@]}"; do
        for MIN_PMI in "${MIN_PMI_LIST[@]}"; do
          # For duckdb mode, only run once per LIMIT/STOPWORDS (ignore min_freq/min_pmi except first)
          if [[ "$MODE" == "duckdb" && ( "$MIN_FREQ" != "${MIN_FREQ_LIST[0]}" || "$MIN_PMI" != "${MIN_PMI_LIST[0]}" ) ]]; then
            continue
          fi
          DB="${DB_BASE}_${MODE}_${STOPWORDS}_${LIMIT}_${MIN_FREQ}_${MIN_PMI}.db"
          OUT="${OUT_BASE}_${MODE}_${STOPWORDS}_${LIMIT}_${MIN_FREQ}_${MIN_PMI}.txt"

          # Remove old files if they exist
          [ -f "$DB" ] && rm "$DB"
          [ -f "$OUT" ] && rm "$OUT"
          [ -f eval.txt ] && rm eval.txt

          # Timestamped results directory
          RUN_ID=$(date +"%Y%m%d_%H%M%S")_${MODE}_${STOPWORDS}_${LIMIT}_${MIN_FREQ}_${MIN_PMI}
          RESULTS_DIR="results/$RUN_ID"
          mkdir -p "$RESULTS_DIR"

          # Step 1: Build the index
          python "$INDEXER" --db "$DB" --dataset "$DATASET" --stopwords "$STOPWORDS" --mode "$MODE" --limit "$LIMIT" --min-freq "$MIN_FREQ" --min-pmi "$MIN_PMI"

          # Step 2: Search
          ./zoekeend search "$DB" "$QUERY" -o "$OUT"

          # Step 3: Evaluate
          ./zoekeend eval "$OUT" "$QUERY" > eval.txt

          # Save all outputs and settings
          mkdir -p "$RESULTS_DIR"
          mv "$DB" "$RESULTS_DIR/"
          mv "$OUT" "$RESULTS_DIR/"
          mv eval.txt "$RESULTS_DIR/"

          # Save settings
          cat > "$RESULTS_DIR/settings.txt" <<EOF
DB: $DB
OUT: $OUT
DATASET: $DATASET
QUERY: $QUERY
STOPWORDS: $STOPWORDS
MODE: $MODE
LIMIT: $LIMIT
MIN_FREQ: $MIN_FREQ
MIN_PMI: $MIN_PMI
RUN_ID: $RUN_ID
EOF

          # Remove temporary files
          rm -f "$DB" "$OUT" eval.txt

          echo "Done. Results stored in $RESULTS_DIR"
          ls -lh "$RESULTS_DIR"
          echo "--------------------------------------"
        done
      done
    done
  done
done
26 helper_scripts/display_results.sh Executable file
@@ -0,0 +1,26 @@
#!/bin/bash

echo -e "RUN_ID\tMODE\tSTOPWORDS\tMIN_FREQ\tMIN_PMI\tMAP\tPOSTINGS_COST\tDICT_SIZE\tTERMS_SIZE\tNGRAMS\tAVGDL\tSUMDF"

for dir in ../results/*; do
  [ -d "$dir" ] || continue
  SETTINGS="$dir/settings.txt"
  EVAL="$dir/eval.txt"
  DB=$(grep '^DB:' "$SETTINGS" | awk '{print $2}')
  DB="$dir/$(basename "$DB")"
  if [[ -f "$SETTINGS" && -f "$EVAL" && -f "$DB" ]]; then
    RUN_ID=$(grep '^RUN_ID:' "$SETTINGS" | awk '{print $2}')
    MODE=$(grep '^MODE:' "$SETTINGS" | awk '{print $2}')
    STOPWORDS=$(grep '^STOPWORDS:' "$SETTINGS" | awk '{print $2}')
    MIN_FREQ=$(grep '^MIN_FREQ:' "$SETTINGS" | awk '{print $2}')
    MIN_PMI=$(grep '^MIN_PMI:' "$SETTINGS" | awk '{print $2}')
    MAP=$(grep -E '^map[[:space:]]+all' "$EVAL" | awk '{print $3}')
    POSTINGS_COST=$(grep '^Average cost in postings:' "$EVAL" | awk '{print $5}')
    DICT_SIZE=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict;")
    TERMS_SIZE=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.terms;")
    NGRAMS=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict WHERE term LIKE '% %';")
    AVGDL=$(duckdb "$DB" -csv -noheader "SELECT avgdl FROM fts_main_documents.stats;")
    SUMDF=$(duckdb "$DB" -csv -noheader "SELECT sumdf FROM fts_main_documents.stats;")
    echo -e "${RUN_ID}\t${MODE}\t${STOPWORDS}\t${MIN_FREQ}\t${MIN_PMI}\t${MAP}\t${POSTINGS_COST}\t${DICT_SIZE}\t${TERMS_SIZE}\t${NGRAMS}\t${AVGDL}\t${SUMDF}"
  fi
done
272 helper_scripts/to_csv.py Normal file
@@ -0,0 +1,272 @@
# A super simple helper script
import pandas as pd

# Raw data as provided
data = """20250813_160649_duckdb_english_-1_0_0 duckdb english 0 0 0.2881 704.8000 6639 128030 0 91.45 80317
|
||||
20250813_160726_phrases_english_-1_0_0 phrases english 0 0 0.1572 476.8444 21928 102342 16988 73.10142857142857 77971
|
||||
20250813_160938_phrases_english_-1_0_1 phrases english 0 1 0.1629 493.3867 21685 102526 16746 73.23285714285714 77793
|
||||
20250813_161143_phrases_english_-1_0_2 phrases english 0 2 0.1688 509.5022 21052 103046 16109 73.60428571428571 77506
|
||||
20250813_161351_phrases_english_-1_0_4 phrases english 0 4 0.1778 559.0311 18662 106284 13680 75.91714285714286 77921
|
||||
20250813_161553_phrases_english_-1_0_8 phrases english 0 8 0.2568 790.6044 11156 126009 5981 90.00642857142857 82155
|
||||
20250813_161718_phrases_english_-1_0_16 phrases english 0 16 0.2906 816.3067 6593 138733 129 99.095 86796
|
||||
20250813_161835_phrases_english_-1_0_24 phrases english 0 24 0.2906 816.3200 6639 138873 0 99.195 86916
|
||||
20250813_161949_phrases_english_-1_0_48 phrases english 0 48 0.2906 816.3200 6639 138873 0 99.195 86916
|
||||
20250813_162104_phrases_english_-1_1_0 phrases english 1 0 0.1572 476.8444 21928 102342 16988 73.10142857142857 77971
|
||||
20250813_162314_phrases_english_-1_1_1 phrases english 1 1 0.1629 493.3867 21685 102526 16746 73.23285714285714 77793
|
||||
20250813_162521_phrases_english_-1_1_2 phrases english 1 2 0.1688 509.5022 21052 103046 16109 73.60428571428571 77506
|
||||
20250813_162652_phrases_english_-1_1_4 phrases english 1 4 0.1778 559.0311 18662 106284 13680 75.91714285714286 77921
|
||||
20250813_162851_phrases_english_-1_1_8 phrases english 1 8 0.2568 790.6044 11156 126009 5981 90.00642857142857 82155
|
||||
20250813_163027_phrases_english_-1_1_16 phrases english 1 16 0.2906 816.3067 6593 138733 129 99.095 86796
|
||||
20250813_163147_phrases_english_-1_1_24 phrases english 1 24 0.2906 816.3200 6639 138873 0 99.195 86916
|
||||
20250813_163259_phrases_english_-1_1_48 phrases english 1 48 0.2906 816.3200 6639 138873 0 99.195 86916
|
||||
20250813_163405_phrases_english_-1_2_0 phrases english 2 0 0.1763 379.8711 10740 105739 6586 75.52785714285714 79664
|
||||
20250813_163530_phrases_english_-1_2_1 phrases english 2 1 0.1772 396.0311 10666 105816 6512 75.58285714285714 79489
|
||||
20250813_163654_phrases_english_-1_2_2 phrases english 2 2 0.1794 411.7600 10427 106096 6273 75.78285714285714 79150
|
||||
20250813_163822_phrases_english_-1_2_4 phrases english 2 4 0.1938 459.2800 9433 108408 5271 77.43428571428572 79099
|
||||
20250813_163949_phrases_english_-1_2_8 phrases english 2 8 0.2627 709.4400 6083 124639 1908 89.02785714285714 81508
|
||||
20250813_164108_phrases_english_-1_2_16 phrases english 2 16 0.2932 739.6222 4347 133593 12 95.42357142857144 83902
|
||||
20250813_164216_phrases_english_-1_2_24 phrases english 2 24 0.2932 739.6222 4349 133616 0 95.44 83913
|
||||
20250813_164322_phrases_english_-1_2_48 phrases english 2 48 0.2932 739.6222 4349 133616 0 95.44 83913
|
||||
20250813_164425_phrases_english_-1_4_0 phrases english 4 0 0.1926 412.1156 5392 108052 2439 77.18 77560
|
||||
20250813_164542_phrases_english_-1_4_1 phrases english 4 1 0.1954 422.1467 5376 108073 2423 77.195 77458
|
||||
20250813_164656_phrases_english_-1_4_2 phrases english 4 2 0.1955 431.9733 5305 108193 2352 77.28071428571428 77205
|
||||
20250813_164814_phrases_english_-1_4_4 phrases english 4 4 0.2077 469.7067 4969 109698 2015 78.35571428571428 77107
|
||||
20250813_164929_phrases_english_-1_4_8 phrases english 4 8 0.2703 705.4267 3584 122735 630 87.66785714285714 78610
|
||||
20250813_165045_phrases_english_-1_4_16 phrases english 4 16 0.2938 732.3067 2980 129255 0 92.325 80420
|
||||
20250813_165151_phrases_english_-1_4_24 phrases english 4 24 0.2938 732.3067 2980 129255 0 92.325 80420
|
||||
20250813_165256_phrases_english_-1_4_48 phrases english 4 48 0.2938 732.3067 2980 129255 0 92.325 80420
|
||||
20250813_165402_phrases_english_-1_8_0 phrases english 8 0 0.2316 481.0444 2889 109018 915 77.87 74606
|
||||
20250813_165514_phrases_english_-1_8_1 phrases english 8 1 0.2315 485.4489 2887 109018 913 77.87 74543
|
||||
20250813_165623_phrases_english_-1_8_2 phrases english 8 2 0.2312 489.1467 2869 109078 895 77.91285714285715 74428
|
||||
20250813_165734_phrases_english_-1_8_4 phrases english 8 4 0.2402 513.2978 2760 109979 786 78.55642857142857 74316
|
||||
20250813_165845_phrases_english_-1_8_8 phrases english 8 8 0.2772 710.0622 2195 120281 221 85.915 75361
|
||||
20250813_165956_phrases_english_-1_8_16 phrases english 8 16 0.2933 733.9689 1978 125124 0 89.37428571428572 76721
|
||||
20250813_170104_phrases_english_-1_8_24 phrases english 8 24 0.2933 733.9689 1978 125124 0 89.37428571428572 76721
|
||||
20250813_170215_phrases_english_-1_8_48 phrases english 8 48 0.2933 733.9689 1978 125124 0 89.37428571428572 76721
|
||||
20250813_170321_phrases_english_-1_16_0 phrases english 16 0 0.2509 538.9644 1636 105876 341 75.62571428571428 69335
|
||||
20250813_170435_phrases_english_-1_16_1 phrases english 16 1 0.2519 543.3867 1635 105876 340 75.62571428571428 69280
|
||||
20250813_170543_phrases_english_-1_16_2 phrases english 16 2 0.2522 545.7333 1632 105914 337 75.65285714285714 69255
|
||||
20250813_170652_phrases_english_-1_16_4 phrases english 16 4 0.2599 560.0044 1601 106384 306 75.98857142857143 69176
|
||||
20250813_170758_phrases_english_-1_16_8 phrases english 16 8 0.2889 721.7467 1376 114387 81 81.705 69972
|
||||
20250813_170906_phrases_english_-1_16_16 phrases english 16 16 0.2978 742.4178 1296 117990 0 84.27857142857142 71021
|
||||
20250813_171011_phrases_english_-1_16_24 phrases english 16 24 0.2978 742.4178 1296 117990 0 84.27857142857142 71021
|
||||
20250813_171115_phrases_english_-1_16_48 phrases english 16 48 0.2978 742.4178 1296 117990 0 84.27857142857142 71021
|
||||
20250813_171220_phrases_english_-1_24_0 phrases english 24 0 0.2563 578.9422 1164 102808 188 73.43428571428572 65298
|
||||
20250813_171330_phrases_english_-1_24_1 phrases english 24 1 0.2563 578.9422 1164 102808 188 73.43428571428572 65298
|
||||
20250813_171441_phrases_english_-1_24_2 phrases english 24 2 0.2568 579.9022 1163 102829 187 73.44928571428571 65298
|
||||
20250813_171553_phrases_english_-1_24_4 phrases english 24 4 0.2617 592.4044 1149 103170 173 73.69285714285714 65267
|
||||
20250813_171703_phrases_english_-1_24_8 phrases english 24 8 0.2851 731.9244 1017 109871 41 78.47928571428571 65953
|
||||
20250813_171818_phrases_english_-1_24_16 phrases english 24 16 0.2901 749.5022 977 112836 0 80.59714285714286 66831
|
||||
20250813_171930_phrases_english_-1_24_24 phrases english 24 24 0.2901 749.5022 977 112836 0 80.59714285714286 66831
|
||||
20250813_172041_phrases_english_-1_24_48 phrases english 24 48 0.2901 749.5022 977 112836 0 80.59714285714286 66831
|
||||
20250813_172151_duckdb_none_-1_0_0 duckdb none 0 0 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
|
||||
20250813_172225_phrases_none_-1_0_0 phrases none 0 0 0.0942 606.3111 39156 138456 36189 98.89714285714285 116154
|
||||
20250813_172407_phrases_none_-1_0_1 phrases none 0 1 0.1055 865.5467 36615 146196 33656 104.42571428571429 118449
|
||||
20250813_172553_phrases_none_-1_0_2 phrases none 0 2 0.1207 1160.9289 32862 161095 29874 115.06785714285714 113754
|
||||
20250813_172741_phrases_none_-1_0_4 phrases none 0 4 0.1507 1305.4889 25669 184408 22135 131.72 107278
|
||||
20250813_172918_phrases_none_-1_0_8 phrases none 0 8 0.2366 1365.3956 12779 223412 7754 159.58 113556
|
||||
20250813_173023_phrases_none_-1_0_16 phrases none 0 16 0.2712 1366.4711 6994 238997 142 170.71214285714285 119929
|
||||
20250813_173123_phrases_none_-1_0_24 phrases none 0 24 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
|
||||
20250813_173214_phrases_none_-1_0_48 phrases none 0 48 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
|
||||
20250813_173306_phrases_none_-1_1_0 phrases none 1 0 0.0942 606.3111 39156 138456 36189 98.89714285714285 116154
|
||||
20250813_173448_phrases_none_-1_1_1 phrases none 1 1 0.1055 865.5467 36615 146196 33656 104.42571428571429 118449
|
||||
20250813_173636_phrases_none_-1_1_2 phrases none 1 2 0.1207 1160.9289 32862 161095 29874 115.06785714285714 113754
|
||||
20250813_173824_phrases_none_-1_1_4 phrases none 1 4 0.1507 1305.4889 25669 184408 22135 131.72 107278
|
||||
20250813_174000_phrases_none_-1_1_8 phrases none 1 8 0.2366 1365.3956 12779 223412 7754 159.58 113556
|
||||
20250813_174119_phrases_none_-1_1_16 phrases none 1 16 0.2712 1366.4711 6994 238997 142 170.71214285714285 119929
|
||||
20250813_174220_phrases_none_-1_1_24 phrases none 1 24 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
|
||||
20250813_174314_phrases_none_-1_1_48 phrases none 1 48 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
|
||||
20250813_174406_phrases_none_-1_2_0 phrases none 2 0 0.1154 775.2578 23364 162542 19103 116.10142857142857 131171
|
||||
20250813_174516_phrases_none_-1_2_1 phrases none 2 1 0.1229 925.6178 21484 168916 17216 120.65428571428572 131354
|
||||
20250813_174625_phrases_none_-1_2_2 phrases none 2 2 0.1401 1175.8044 18673 181497 14424 129.6407142857143 124656
|
||||
20250813_174734_phrases_none_-1_2_4 phrases none 2 4 0.1745 1309.2400 13265 203089 8938 145.06357142857144 117056
|
||||
20250813_174842_phrases_none_-1_2_8 phrases none 2 8 0.2438 1367.9511 6821 236944 2307 169.24571428571429 121637
|
||||
20250813_174938_phrases_none_-1_2_16 phrases none 2 16 0.2715 1368.6533 4719 247372 13 176.6942857142857 124752
|
||||
20250813_175030_phrases_none_-1_2_24 phrases none 2 24 0.2715 1368.6533 4720 247398 0 176.71285714285713 124765
|
||||
20250813_175123_phrases_none_-1_2_48 phrases none 2 48 0.2715 1368.6533 4720 247398 0 176.71285714285713 124765
|
||||
20250813_175212_phrases_none_-1_4_0 phrases none 4 0 0.1461 917.3200 12382 188395 9140 134.56785714285715 138261
|
||||
20250813_175308_phrases_none_-1_4_1 phrases none 4 1 0.1444 1016.7067 11297 192252 8053 137.32285714285715 137005
|
||||
20250813_175355_phrases_none_-1_4_2 phrases none 4 2 0.1640 1212.6044 9670 203157 6432 145.11214285714286 129190
|
||||
20250813_175450_phrases_none_-1_4_4 phrases none 4 4 0.1844 1328.3778 6782 224561 3533 160.4007142857143 122890
|
||||
20250813_175540_phrases_none_-1_4_8 phrases none 4 8 0.2532 1373.9778 4029 253029 745 180.735 126124
|
||||
20250813_175635_phrases_none_-1_4_16 phrases none 4 16 0.2731 1374.4044 3313 260575 0 186.125 128467
|
||||
20250813_175724_phrases_none_-1_4_24 phrases none 4 24 0.2731 1374.4044 3313 260575 0 186.125 128467
|
||||
20250813_175813_phrases_none_-1_4_48 phrases none 4 48 0.2731 1374.4044 3313 260575 0 186.125 128467
|
||||
20250813_175900_phrases_none_-1_8_0 phrases none 8 0 0.1722 1081.2489 6529 220276 4267 157.34 141465
|
||||
20250813_175950_phrases_none_-1_8_1 phrases none 8 1 0.1759 1147.3956 6030 222849 3768 159.17785714285714 139509
|
||||
20250813_180037_phrases_none_-1_8_2 phrases none 8 2 0.1925 1262.1600 5229 234827 2968 167.73357142857142 133023
|
||||
20250813_180131_phrases_none_-1_8_4 phrases none 8 4 0.2184 1346.5333 3785 254655 1523 181.89642857142857 127665
|
||||
20250813_180222_phrases_none_-1_8_8 phrases none 8 8 0.2630 1379.0667 2537 278374 269 198.83857142857144 129892
|
||||
20250813_180315_phrases_none_-1_8_16 phrases none 8 16 0.2752 1379.3156 2273 284030 0 202.87857142857143 131682
|
||||
20250813_180404_phrases_none_-1_8_24 phrases none 8 24 0.2752 1379.3156 2273 284030 0 202.87857142857143 131682
|
||||
20250813_180453_phrases_none_-1_8_48 phrases none 8 48 0.2752 1379.3156 2273 284030 0 202.87857142857143 131682
|
||||
20250813_180542_phrases_none_-1_16_0 phrases none 16 0 0.2053 1267.8489 3405 269846 1865 192.74714285714285 142548
|
||||
20250813_180632_phrases_none_-1_16_1 phrases none 16 1 0.2059 1279.7244 3196 271773 1656 194.12357142857144 140763
|
||||
20250813_180723_phrases_none_-1_16_2 phrases none 16 2 0.2214 1325.5867 2835 282258 1296 201.61285714285714 134192
|
||||
20250813_180816_phrases_none_-1_16_4 phrases none 16 4 0.2379 1375.3067 2176 300256 637 214.46857142857144 129654
|
||||
20250813_180910_phrases_none_-1_16_8 phrases none 16 8 0.2662 1391.0533 1641 319121 101 227.94357142857143 130933
|
||||
20250813_181005_phrases_none_-1_16_16 phrases none 16 16 0.2722 1391.0400 1541 323278 0 230.91285714285715 132277
|
||||
20250813_181056_phrases_none_-1_16_24 phrases none 16 24 0.2722 1391.0400 1541 323278 0 230.91285714285715 132277
|
||||
20250813_181150_phrases_none_-1_16_48 phrases none 16 48 0.2722 1391.0400 1541 323278 0 230.91285714285715 132277
|
||||
20250813_181242_phrases_none_-1_24_0 phrases none 24 0 0.2074 1341.2933 2290 303650 1109 216.89285714285714 140095
|
||||
20250813_181336_phrases_none_-1_24_1 phrases none 24 1 0.2134 1339.4400 2184 304857 1003 217.755 138507
|
||||
20250813_181430_phrases_none_-1_24_2 phrases none 24 2 0.2330 1354.8667 1960 315279 780 225.1992857142857 131976
|
||||
20250813_181523_phrases_none_-1_24_4 phrases none 24 4 0.2451 1386.2356 1568 330903 387 236.3592857142857 127722
|
||||
20250813_181620_phrases_none_-1_24_8 phrases none 24 8 0.2702 1393.9689 1235 347214 54 248.01 128647
|
||||
20250813_181717_phrases_none_-1_24_16 phrases none 24 16 0.2721 1393.9244 1182 350595 0 250.425 129773
|
||||
20250813_181811_phrases_none_-1_24_24 phrases none 24 24 0.2721 1393.9244 1182 350595 0 250.425 129773
|
||||
20250813_181906_phrases_none_-1_24_48 phrases none 24 48 0.2721 1393.9244 1182 350595 0 250.425 129773
|
||||
20250813_210100_duckdb_english_-1_4_5 duckdb english 4 5 0.2881 704.8000 6639 128030 0 91.45 80317
|
||||
20250813_210142_phrases_english_-1_4_5 phrases english 4 5 0.2083 521.3733 4654 111616 1700 79.72571428571429 77189
|
||||
20250813_210257_phrases_english_-1_4_6 phrases english 4 6 0.2195 595.7778 4250 114239 1297 81.59928571428571 77437
|
||||
20250813_210406_phrases_english_-1_4_7 phrases english 4 7 0.2465 654.8444 3904 117509 950 83.935 77710
|
||||
20250813_210516_phrases_english_-1_4_8 phrases english 4 8 0.2703 705.4267 3584 122735 630 87.66785714285714 78610
|
||||
20250813_210624_phrases_english_-1_4_9 phrases english 4 9 0.2796 722.9733 3346 125792 390 89.85142857142857 79288
|
||||
20250813_210735_phrases_english_-1_4_10 phrases english 4 10 0.2870 729.9289 3205 127524 245 91.08857142857143 79810
|
||||
20250813_210846_phrases_english_-1_4_11 phrases english 4 11 0.2910 731.6133 3106 128280 141 91.62857142857143 80077
|
||||
20250813_210955_phrases_english_-1_4_12 phrases english 4 12 0.2940 732.1911 3044 128816 74 92.01142857142857 80251
|
||||
20250813_211102_phrases_english_-1_4_13 phrases english 4 13 0.2948 732.2889 3006 129079 34 92.19928571428571 80351
|
||||
20250813_211209_phrases_english_-1_4_14 phrases english 4 14 0.2938 732.2978 2989 129185 14 92.275 80400
|
||||
20250813_211315_phrases_english_-1_5_5 phrases english 5 5 0.2173 517.2000 3835 111400 1235 79.57142857142857 76183
|
||||
20250813_211424_phrases_english_-1_5_6 phrases english 5 6 0.2309 590.4667 3536 113742 936 81.24428571428571 76390
|
||||
20250813_211534_phrases_english_-1_5_7 phrases english 5 7 0.2538 648.1467 3280 116753 680 83.395 76603
|
||||
20250813_211644_phrases_english_-1_5_8 phrases english 5 8 0.2713 698.4800 3035 121771 435 86.97928571428571 77476
|
||||
20250813_211755_phrases_english_-1_5_9 phrases english 5 9 0.2818 715.6089 2870 124598 270 88.99857142857142 78102
|
||||
20250813_211908_phrases_english_-1_5_10 phrases english 5 10 0.2888 722.5244 2768 126202 166 90.14428571428572 78608
|
||||
20250813_212014_phrases_english_-1_5_11 phrases english 5 11 0.2923 724.1733 2702 126846 97 90.60428571428571 78836
|
||||
20250813_212121_phrases_english_-1_5_12 phrases english 5 12 0.2926 724.7289 2656 127310 49 90.93571428571428 79002
|
||||
20250813_212229_phrases_english_-1_5_13 phrases english 5 13 0.2940 724.7778 2632 127516 23 91.08285714285714 79084
|
||||
20250813_212336_phrases_english_-1_5_14 phrases english 5 14 0.2936 724.7822 2621 127595 10 91.13928571428572 79117
|
||||
20250813_212441_phrases_english_-1_6_5 phrases english 6 5 0.2286 532.4889 3326 111572 976 79.69428571428571 75634
|
||||
20250813_212550_phrases_english_-1_6_6 phrases english 6 6 0.2452 601.1067 3090 113712 740 81.22285714285714 75810
|
||||
20250813_212701_phrases_english_-1_6_7 phrases english 6 7 0.2606 656.0533 2883 116571 533 83.265 76015
|
||||
20250813_212811_phrases_english_-1_6_8 phrases english 6 8 0.2758 703.9644 2686 121408 336 86.72 76833
|
||||
20250813_212922_phrases_english_-1_6_9 phrases english 6 9 0.2832 720.5422 2556 124095 206 88.63928571428572 77428
|
||||
20250813_213032_phrases_english_-1_6_10 phrases english 6 10 0.2871 727.2844 2468 125638 116 89.74142857142857 77916
|
||||
20250813_213140_phrases_english_-1_6_11 phrases english 6 11 0.2896 728.7956 2417 126211 64 90.15071428571429 78124
|
||||
20250813_213257_phrases_english_-1_6_12 phrases english 6 12 0.2913 729.3200 2382 126624 27 90.44571428571429 78278
|
||||
20250813_213410_phrases_english_-1_6_13 phrases english 6 13 0.2927 729.3644 2366 126799 9 90.57071428571429 78352
|
||||
20250813_213517_phrases_english_-1_6_14 phrases english 6 14 0.2928 729.3644 2362 126842 4 90.60142857142857 78373
|
||||
20250813_213626_phrases_english_-1_7_5 phrases english 7 5 0.2358 547.9867 2934 111748 784 79.82 75148
|
||||
20250813_213741_phrases_english_-1_7_6 phrases english 7 6 0.2455 606.9378 2748 113674 598 81.19571428571429 75290
|
||||
20250813_213858_phrases_english_-1_7_7 phrases english 7 7 0.2617 660.2489 2577 116381 427 83.12928571428571 75452
|
||||
20250813_214015_phrases_english_-1_7_8 phrases english 7 8 0.2790 707.7867 2410 121097 260 86.49785714285714 76252
|
||||
20250813_214134_phrases_english_-1_7_9 phrases english 7 9 0.2862 724.0533 2302 123686 152 88.34714285714286 76823
|
||||
20250813_214245_phrases_english_-1_7_10 phrases english 7 10 0.2887 730.5244 2234 125113 83 89.36642857142857 77280
|
||||
20250813_214357_phrases_english_-1_7_11 phrases english 7 11 0.2924 731.9333 2197 125607 45 89.71928571428572 77469
|
||||
20250813_214509_phrases_english_-1_7_12 phrases english 7 12 0.2934 732.4000 2171 125965 18 89.975 77607
|
||||
20250813_214616_phrases_english_-1_7_13 phrases english 7 13 0.2946 732.4356 2159 126118 5 90.08428571428571 77672
|
||||
20250813_214724_phrases_english_-1_7_14 phrases english 7 14 0.2946 732.4356 2157 126149 2 90.10642857142857 77690
|
||||
20250813_214831_phrases_english_-1_9_5 phrases english 9 5 0.2402 565.6178 2392 111075 566 79.33928571428571 73618
|
||||
20250813_214943_phrases_english_-1_9_6 phrases english 9 6 0.2510 619.9644 2263 112722 437 80.51571428571428 73737
|
||||
20250813_215055_phrases_english_-1_9_7 phrases english 9 7 0.2631 667.1956 2134 115185 308 82.275 73839
|
||||
20250813_215207_phrases_english_-1_9_8 phrases english 9 8 0.2781 713.2356 2007 119663 181 85.47357142857143 74601
|
||||
20250813_215319_phrases_english_-1_9_9 phrases english 9 9 0.2834 729.0667 1930 122062 104 87.18714285714286 75124
|
||||
20250813_215427_phrases_english_-1_9_10 phrases english 9 10 0.2879 734.9022 1885 123339 58 88.09928571428571 75539
|
||||
20250813_215537_phrases_english_-1_9_11 phrases english 9 11 0.2909 736.1733 1858 123770 31 88.40714285714286 75702
|
||||
20250813_215648_phrases_english_-1_9_12 phrases english 9 12 0.2917 736.6178 1840 124073 12 88.62357142857142 75818
|
||||
20250813_215751_phrases_english_-1_9_13 phrases english 9 13 0.2938 736.6311 1832 124199 3 88.71357142857143 75872
|
||||
20250813_215854_phrases_english_-1_9_14 phrases english 9 14 0.2938 736.6311 1830 124230 0 88.73571428571428 75890
|
||||
20250813_215956_phrases_english_-1_10_5 phrases english 10 5 0.2405 564.1200 2220 110076 501 78.62571428571428 72645
|
||||
20250813_220107_phrases_english_-1_10_6 phrases english 10 6 0.2524 617.2489 2108 111644 389 79.74571428571429 72756
|
||||
20250813_220217_phrases_english_-1_10_7 phrases english 10 7 0.2623 663.9422 1992 114024 273 81.44571428571429 72840
|
||||
20250813_220321_phrases_english_-1_10_8 phrases english 10 8 0.2778 709.8178 1878 118417 159 84.58357142857143 73574
|
||||
20250813_220426_phrases_english_-1_10_9 phrases english 10 9 0.2844 725.6089 1810 120744 91 86.24571428571429 74083
|
||||
20250813_220532_phrases_english_-1_10_10 phrases english 10 10 0.2881 731.2444 1770 121970 51 87.12142857142857 74476
|
||||
20250813_220637_phrases_english_-1_10_11 phrases english 10 11 0.2904 732.5156 1744 122397 25 87.42642857142857 74639
|
||||
20250813_220743_phrases_english_-1_10_12 phrases english 10 12 0.2914 732.9689 1726 122699 7 87.64214285714286 74751
|
||||
20250813_220845_phrases_english_-1_10_13 phrases english 10 13 0.2934 732.9778 1721 122801 1 87.715 74797
|
||||
20250813_220946_phrases_english_-1_10_14 phrases english 10 14 0.2934 732.9778 1720 122814 0 87.72428571428571 74805
|
||||
20250813_221046_phrases_english_-1_11_5 phrases english 11 5 0.2486 569.0800 2065 109381 435 78.12928571428571 71865
|
||||
20250813_221153_phrases_english_-1_11_6 phrases english 11 6 0.2586 619.0044 1971 110817 341 79.155 71977
|
||||
20250813_221259_phrases_english_-1_11_7 phrases english 11 7 0.2684 664.9822 1868 113100 238 80.78571428571429 72052
|
||||
20250813_221406_phrases_english_-1_11_8 phrases english 11 8 0.2800 708.9333 1769 117377 139 83.84071428571428 72755
|
||||
20250813_221510_phrases_english_-1_11_9 phrases english 11 9 0.2862 724.6178 1709 119653 79 85.46642857142857 73262
|
||||
20250813_221616_phrases_english_-1_11_10 phrases english 11 10 0.2900 730.1911 1675 120836 45 86.31142857142858 73640
|
||||
20250813_221721_phrases_english_-1_11_11 phrases english 11 11 0.2926 731.4533 1651 121253 21 86.60928571428572 73802
|
||||
20250813_221825_phrases_english_-1_11_12 phrases english 11 12 0.2935 731.9200 1636 121524 6 86.80285714285715 73896
|
||||
20250813_221929_phrases_english_-1_11_13 phrases english 11 13 0.2954 731.9289 1632 121616 1 86.86857142857143 73940
|
||||
20250813_222030_phrases_english_-1_11_14 phrases english 11 14 0.2954 731.9289 1631 121629 0 86.87785714285714 73948
|
||||
20250813_222129_duckdb_none_-1_4_5 duckdb none 4 5 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
|
||||
20250813_222206_phrases_none_-1_4_5 phrases none 4 5 0.1932 1350.9111 5821 232789 2560 166.27785714285713 122981
|
||||
20250813_222300_phrases_none_-1_4_6 phrases none 4 6 0.2036 1365.5067 5027 239895 1753 171.35357142857143 123552
|
||||
20250813_222354_phrases_none_-1_4_7 phrases none 4 7 0.2349 1372.5200 4456 246070 1174 175.7642857142857 124560
|
||||
20250813_222448_phrases_none_-1_4_8 phrases none 4 8 0.2532 1373.9778 4029 253029 745 180.735 126124
|
||||
20250813_222544_phrases_none_-1_4_9 phrases none 4 9 0.2624 1374.2622 3740 256538 454 183.24142857142857 126990
|
||||
20250813_222640_phrases_none_-1_4_10 phrases none 4 10 0.2687 1374.3556 3562 258648 271 184.74857142857144 127703
|
||||
20250813_222733_phrases_none_-1_4_11 phrases none 4 11 0.2729 1374.4400 3452 259500 155 185.35714285714286 128044
|
||||
20250813_222827_phrases_none_-1_4_12 phrases none 4 12 0.2745 1374.4267 3381 260125 78 185.80357142857142 128286
|
||||
20250813_222922_phrases_none_-1_4_13 phrases none 4 13 0.2760 1374.4044 3339 260403 34 186.00214285714284 128399
|
||||
20250813_223015_phrases_none_-1_4_14 phrases none 4 14 0.2732 1374.4044 3322 260507 14 186.07642857142858 128447
|
||||
20250813_223106_phrases_none_-1_5_5 phrases none 5 5 0.1988 1353.4622 4792 241122 1886 172.23 124557
|
||||
20250813_223204_phrases_none_-1_5_6 phrases none 5 6 0.2122 1367.3422 4194 247778 1278 176.9842857142857 125103
|
||||
20250813_223257_phrases_none_-1_5_7 phrases none 5 7 0.2399 1373.6933 3767 253550 847 181.10714285714286 126007
|
||||
20250813_223351_phrases_none_-1_5_8 phrases none 5 8 0.2577 1375.0089 3440 260253 519 185.895 127535
|
||||
20250813_223446_phrases_none_-1_5_9 phrases none 5 9 0.2663 1375.2622 3235 263519 314 188.22785714285715 128348
|
||||
20250813_223542_phrases_none_-1_5_10 phrases none 5 10 0.2717 1375.3111 3107 265465 183 189.61785714285713 129025
|
||||
20250813_223634_phrases_none_-1_5_11 phrases none 5 11 0.2756 1375.4000 3034 266187 106 190.13357142857143 129317
|
||||
20250813_223727_phrases_none_-1_5_12 phrases none 5 12 0.2763 1375.3911 2983 266726 53 190.51857142857142 129538
|
||||
20250813_223820_phrases_none_-1_5_13 phrases none 5 13 0.2780 1375.3733 2955 266950 23 190.67857142857142 129634
|
||||
20250813_223914_phrases_none_-1_5_14 phrases none 5 14 0.2763 1375.3733 2944 267029 10 190.735 129667
|
||||
20250813_224006_phrases_none_-1_6_5 phrases none 6 5 0.2083 1357.2311 4179 248276 1526 177.34 125864
|
||||
20250813_224059_phrases_none_-1_6_6 phrases none 6 6 0.2227 1370.1111 3683 254618 1024 181.87 126320
|
||||
20250813_224154_phrases_none_-1_6_7 phrases none 6 7 0.2446 1375.9467 3327 260186 666 185.84714285714287 127204
|
||||
20250813_224249_phrases_none_-1_6_8 phrases none 6 8 0.2607 1377.0578 3066 266658 405 190.47 128655
|
||||
20250813_224344_phrases_none_-1_6_9 phrases none 6 9 0.2626 1377.2933 2905 269749 244 192.67785714285714 129428
|
||||
20250813_224440_phrases_none_-1_6_10 phrases none 6 10 0.2659 1377.3022 2796 271612 132 194.00857142857143 130067
|
||||
20250813_224533_phrases_none_-1_6_11 phrases none 6 11 0.2705 1377.3867 2738 272274 72 194.48142857142858 130349
|
||||
20250813_224626_phrases_none_-1_6_12 phrases none 6 12 0.2723 1377.3867 2698 272764 30 194.83142857142857 130558
|
||||
20250813_224720_phrases_none_-1_6_13 phrases none 6 13 0.2740 1377.3556 2679 272950 9 194.96428571428572 130646
|
||||
20250813_224811_phrases_none_-1_6_14 phrases none 6 14 0.2739 1377.3556 2675 272993 4 194.995 130667
|
||||
20250813_224901_phrases_none_-1_7_5 phrases none 7 5 0.2179 1360.0756 3692 254793 1245 181.995 126840
|
||||
20250813_224954_phrases_none_-1_7_6 phrases none 7 6 0.2272 1372.5289 3282 260774 830 186.26714285714286 127217
|
||||
20250813_225050_phrases_none_-1_7_7 phrases none 7 7 0.2467 1377.2667 2990 266094 537 190.06714285714287 128021
|
||||
20250813_225145_phrases_none_-1_7_8 phrases none 7 8 0.2626 1378.2800 2769 272417 316 194.58357142857142 129446
|
||||
20250813_225241_phrases_none_-1_7_9 phrases none 7 9 0.2655 1378.4978 2636 275372 183 196.6942857142857 130197
|
||||
20250813_225336_phrases_none_-1_7_10 phrases none 7 10 0.2689 1378.5244 2550 277111 96 197.93642857142856 130797
|
||||
20250813_225427_phrases_none_-1_7_11 phrases none 7 11 0.2731 1378.6000 2508 277686 52 198.34714285714287 131056
|
||||
20250813_225519_phrases_none_-1_7_12 phrases none 7 12 0.2745 1378.6000 2478 278119 21 198.65642857142856 131243
|
||||
20250813_225611_phrases_none_-1_7_13 phrases none 7 13 0.2757 1378.5733 2463 278283 5 198.77357142857142 131322
|
||||
20250813_225702_phrases_none_-1_7_14 phrases none 7 14 0.2756 1378.5733 2461 278314 2 198.7957142857143 131340
|
||||
20250813_225753_phrases_none_-1_9_5 phrases none 9 5 0.2237 1365.1600 3018 267606 906 191.14714285714285 128162
|
||||
20250813_225847_phrases_none_-1_9_6 phrases none 9 6 0.2326 1376.7600 2729 273019 613 195.01357142857142 128423
|
||||
20250813_225941_phrases_none_-1_9_7 phrases none 9 7 0.2467 1379.9867 2504 277959 388 198.54214285714286 129108
|
||||
20250813_230036_phrases_none_-1_9_8 phrases none 9 8 0.2596 1380.8756 2336 284007 220 202.86214285714286 130474
|
||||
20250813_230132_phrases_none_-1_9_9 phrases none 9 9 0.2657 1381.0489 2242 286741 126 204.815 131156
|
||||
20250813_230227_phrases_none_-1_9_10 phrases none 9 10 0.2692 1381.0711 2181 288318 64 205.94142857142856 131709
|
||||
20250813_230320_phrases_none_-1_9_11 phrases none 9 11 0.2725 1381.1467 2150 288817 33 206.29785714285714 131931
|
||||
20250813_230413_phrases_none_-1_9_12 phrases none 9 12 0.2729 1381.1467 2130 289182 12 206.55857142857144 132086
|
||||
20250813_230504_phrases_none_-1_9_13 phrases none 9 13 0.2743 1381.1244 2122 289303 3 206.645 132139
|
||||
20250813_230555_phrases_none_-1_9_14 phrases none 9 14 0.2743 1381.1244 2120 289334 0 206.66714285714286 132157
|
||||
20250813_230646_phrases_none_-1_10_5 phrases none 10 5 0.2247 1367.4844 2799 273691 802 195.49357142857144 128332
|
||||
20250813_230736_phrases_none_-1_10_6 phrases none 10 6 0.2334 1377.9467 2546 278907 546 199.21928571428572 128545
|
||||
20250813_230830_phrases_none_-1_10_7 phrases none 10 7 0.2472 1380.7244 2343 283700 343 202.64285714285714 129172
|
||||
20250813_230922_phrases_none_-1_10_8 phrases none 10 8 0.2593 1381.5600 2196 289586 196 206.84714285714287 130480
|
||||
20250813_231016_phrases_none_-1_10_9 phrases none 10 9 0.2654 1381.7111 2112 292248 112 208.74857142857144 131145
|
||||
20250813_231109_phrases_none_-1_10_10 phrases none 10 10 0.2686 1381.7378 2056 293784 56 209.84571428571428 131675
|
||||
20250813_231203_phrases_none_-1_10_11 phrases none 10 11 0.2725 1381.8178 2027 294274 27 210.19571428571427 131892
|
||||
20250813_231257_phrases_none_-1_10_12 phrases none 10 12 0.2728 1381.8133 2007 294631 7 210.4507142857143 132041
|
||||
20250813_231346_phrases_none_-1_10_13 phrases none 10 13 0.2746 1381.7956 2002 294733 1 210.52357142857142 132087
|
||||
20250813_231437_phrases_none_-1_10_14 phrases none 10 14 0.2746 1381.7956 2001 294746 0 210.53285714285715 132095
|
||||
20250813_231528_phrases_none_-1_11_5 phrases none 11 5 0.2291 1371.9422 2616 279880 716 199.9142857142857 128487
|
||||
20250813_231620_phrases_none_-1_11_6 phrases none 11 6 0.2365 1381.1200 2395 284879 492 203.485 128668
|
||||
20250813_231714_phrases_none_-1_11_7 phrases none 11 7 0.2480 1383.4489 2210 289537 307 206.81214285714285 129265
|
||||
20250813_231809_phrases_none_-1_11_8 phrases none 11 8 0.2597 1384.1422 2079 295296 176 210.9257142857143 130535
|
||||
20250814_104809_phrases_none_-1_11_9 phrases none 11 9 0.2656 1384.2533 2003 297899 100 212.785 131201
|
||||
20250814_104904_phrases_none_-1_11_10 phrases none 11 10 0.2684 1384.2756 1953 299393 50 213.85214285714287 131713
|
||||
20250814_105006_phrases_none_-1_11_11 phrases none 11 11 0.2712 1384.3244 1926 299865 23 214.18928571428572 131927
|
||||
20250814_105103_phrases_none_-1_11_12 phrases none 11 12 0.2713 1384.3200 1909 300191 6 214.42214285714286 132058
|
||||
20250814_105158_phrases_none_-1_11_13 phrases none 11 13 0.2732 1384.2978 1905 300283 1 214.48785714285714 132102
|
||||
20250814_105250_phrases_none_-1_11_14 phrases none 11 14 0.2732 1384.2978 1904 300296 0 214.49714285714285 132110"""

# Split into rows and then by whitespace
rows = [line.split() for line in data.splitlines()]

# Create DataFrame
df = pd.DataFrame(rows)

# Save to CSV
csv_path = "output.csv"
df.to_csv(csv_path, index=False, header=False)

print(csv_path)
257 output.csv Normal file
@@ -0,0 +1,257 @@
Run,Mode,Stopword,Min Freq,Min PMI,MAP,CiP,Dict Size,Terms Size,Num phrases,AVGDL,SUMDF
|
||||
20250813_160649_duckdb_english_-1_0_0,duckdb,english,0,0,0.2881,704.8000,6639,128030,0,91.45,80317
|
||||
20250813_160726_phrases_english_-1_0_0,phrases,english,0,0,0.1572,476.8444,21928,102342,16988,73.10142857142857,77971
|
||||
20250813_160938_phrases_english_-1_0_1,phrases,english,0,1,0.1629,493.3867,21685,102526,16746,73.23285714285714,77793
|
||||
20250813_161143_phrases_english_-1_0_2,phrases,english,0,2,0.1688,509.5022,21052,103046,16109,73.60428571428571,77506
|
||||
20250813_161351_phrases_english_-1_0_4,phrases,english,0,4,0.1778,559.0311,18662,106284,13680,75.91714285714286,77921
|
||||
20250813_161553_phrases_english_-1_0_8,phrases,english,0,8,0.2568,790.6044,11156,126009,5981,90.00642857142857,82155
|
||||
20250813_161718_phrases_english_-1_0_16,phrases,english,0,16,0.2906,816.3067,6593,138733,129,99.095,86796
|
||||
20250813_161835_phrases_english_-1_0_24,phrases,english,0,24,0.2906,816.3200,6639,138873,0,99.195,86916
|
||||
20250813_161949_phrases_english_-1_0_48,phrases,english,0,48,0.2906,816.3200,6639,138873,0,99.195,86916
|
||||
20250813_162104_phrases_english_-1_1_0,phrases,english,1,0,0.1572,476.8444,21928,102342,16988,73.10142857142857,77971
|
||||
20250813_162314_phrases_english_-1_1_1,phrases,english,1,1,0.1629,493.3867,21685,102526,16746,73.23285714285714,77793
|
||||
20250813_162521_phrases_english_-1_1_2,phrases,english,1,2,0.1688,509.5022,21052,103046,16109,73.60428571428571,77506
|
||||
20250813_162652_phrases_english_-1_1_4,phrases,english,1,4,0.1778,559.0311,18662,106284,13680,75.91714285714286,77921
|
||||
20250813_162851_phrases_english_-1_1_8,phrases,english,1,8,0.2568,790.6044,11156,126009,5981,90.00642857142857,82155
|
||||
20250813_163027_phrases_english_-1_1_16,phrases,english,1,16,0.2906,816.3067,6593,138733,129,99.095,86796
|
||||
20250813_163147_phrases_english_-1_1_24,phrases,english,1,24,0.2906,816.3200,6639,138873,0,99.195,86916
|
||||
20250813_163259_phrases_english_-1_1_48,phrases,english,1,48,0.2906,816.3200,6639,138873,0,99.195,86916
|
||||
20250813_163405_phrases_english_-1_2_0,phrases,english,2,0,0.1763,379.8711,10740,105739,6586,75.52785714285714,79664
|
||||
20250813_163530_phrases_english_-1_2_1,phrases,english,2,1,0.1772,396.0311,10666,105816,6512,75.58285714285714,79489
|
||||
20250813_163654_phrases_english_-1_2_2,phrases,english,2,2,0.1794,411.7600,10427,106096,6273,75.78285714285714,79150
|
||||
20250813_163822_phrases_english_-1_2_4,phrases,english,2,4,0.1938,459.2800,9433,108408,5271,77.43428571428572,79099
|
||||
20250813_163949_phrases_english_-1_2_8,phrases,english,2,8,0.2627,709.4400,6083,124639,1908,89.02785714285714,81508
|
||||
20250813_164108_phrases_english_-1_2_16,phrases,english,2,16,0.2932,739.6222,4347,133593,12,95.42357142857144,83902
|
||||
20250813_164216_phrases_english_-1_2_24,phrases,english,2,24,0.2932,739.6222,4349,133616,0,95.44,83913
|
||||
20250813_164322_phrases_english_-1_2_48,phrases,english,2,48,0.2932,739.6222,4349,133616,0,95.44,83913
|
||||
20250813_164425_phrases_english_-1_4_0,phrases,english,4,0,0.1926,412.1156,5392,108052,2439,77.18,77560
|
||||
20250813_164542_phrases_english_-1_4_1,phrases,english,4,1,0.1954,422.1467,5376,108073,2423,77.195,77458
|
||||
20250813_164656_phrases_english_-1_4_2,phrases,english,4,2,0.1955,431.9733,5305,108193,2352,77.28071428571428,77205
|
||||
20250813_164814_phrases_english_-1_4_4,phrases,english,4,4,0.2077,469.7067,4969,109698,2015,78.35571428571428,77107
|
||||
20250813_164929_phrases_english_-1_4_8,phrases,english,4,8,0.2703,705.4267,3584,122735,630,87.66785714285714,78610
|
||||
20250813_165045_phrases_english_-1_4_16,phrases,english,4,16,0.2938,732.3067,2980,129255,0,92.325,80420
|
||||
20250813_165151_phrases_english_-1_4_24,phrases,english,4,24,0.2938,732.3067,2980,129255,0,92.325,80420
|
||||
20250813_165256_phrases_english_-1_4_48,phrases,english,4,48,0.2938,732.3067,2980,129255,0,92.325,80420
|
||||
20250813_165402_phrases_english_-1_8_0,phrases,english,8,0,0.2316,481.0444,2889,109018,915,77.87,74606
|
||||
20250813_165514_phrases_english_-1_8_1,phrases,english,8,1,0.2315,485.4489,2887,109018,913,77.87,74543
|
||||
20250813_165623_phrases_english_-1_8_2,phrases,english,8,2,0.2312,489.1467,2869,109078,895,77.91285714285715,74428
|
||||
20250813_165734_phrases_english_-1_8_4,phrases,english,8,4,0.2402,513.2978,2760,109979,786,78.55642857142857,74316
|
||||
20250813_165845_phrases_english_-1_8_8,phrases,english,8,8,0.2772,710.0622,2195,120281,221,85.915,75361
|
||||
20250813_165956_phrases_english_-1_8_16,phrases,english,8,16,0.2933,733.9689,1978,125124,0,89.37428571428572,76721
|
||||
20250813_170104_phrases_english_-1_8_24,phrases,english,8,24,0.2933,733.9689,1978,125124,0,89.37428571428572,76721
|
||||
20250813_170215_phrases_english_-1_8_48,phrases,english,8,48,0.2933,733.9689,1978,125124,0,89.37428571428572,76721
|
||||
20250813_170321_phrases_english_-1_16_0,phrases,english,16,0,0.2509,538.9644,1636,105876,341,75.62571428571428,69335
|
||||
20250813_170435_phrases_english_-1_16_1,phrases,english,16,1,0.2519,543.3867,1635,105876,340,75.62571428571428,69280
|
||||
20250813_170543_phrases_english_-1_16_2,phrases,english,16,2,0.2522,545.7333,1632,105914,337,75.65285714285714,69255
|
||||
20250813_170652_phrases_english_-1_16_4,phrases,english,16,4,0.2599,560.0044,1601,106384,306,75.98857142857143,69176
|
||||
20250813_170758_phrases_english_-1_16_8,phrases,english,16,8,0.2889,721.7467,1376,114387,81,81.705,69972
|
||||
20250813_170906_phrases_english_-1_16_16,phrases,english,16,16,0.2978,742.4178,1296,117990,0,84.27857142857142,71021
|
||||
20250813_171011_phrases_english_-1_16_24,phrases,english,16,24,0.2978,742.4178,1296,117990,0,84.27857142857142,71021
|
||||
20250813_171115_phrases_english_-1_16_48,phrases,english,16,48,0.2978,742.4178,1296,117990,0,84.27857142857142,71021
|
||||
20250813_171220_phrases_english_-1_24_0,phrases,english,24,0,0.2563,578.9422,1164,102808,188,73.43428571428572,65298
|
||||
20250813_171330_phrases_english_-1_24_1,phrases,english,24,1,0.2563,578.9422,1164,102808,188,73.43428571428572,65298
|
||||
20250813_171441_phrases_english_-1_24_2,phrases,english,24,2,0.2568,579.9022,1163,102829,187,73.44928571428571,65298
|
||||
20250813_171553_phrases_english_-1_24_4,phrases,english,24,4,0.2617,592.4044,1149,103170,173,73.69285714285714,65267
|
||||
20250813_171703_phrases_english_-1_24_8,phrases,english,24,8,0.2851,731.9244,1017,109871,41,78.47928571428571,65953
|
||||
20250813_171818_phrases_english_-1_24_16,phrases,english,24,16,0.2901,749.5022,977,112836,0,80.59714285714286,66831
|
||||
20250813_171930_phrases_english_-1_24_24,phrases,english,24,24,0.2901,749.5022,977,112836,0,80.59714285714286,66831
|
||||
20250813_172041_phrases_english_-1_24_48,phrases,english,24,48,0.2901,749.5022,977,112836,0,80.59714285714286,66831
|
||||
20250813_172151_duckdb_none_-1_0_0,duckdb,none,0,0,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
|
||||
20250813_172225_phrases_none_-1_0_0,phrases,none,0,0,0.0942,606.3111,39156,138456,36189,98.89714285714285,116154
|
||||
20250813_172407_phrases_none_-1_0_1,phrases,none,0,1,0.1055,865.5467,36615,146196,33656,104.42571428571429,118449
|
||||
20250813_172553_phrases_none_-1_0_2,phrases,none,0,2,0.1207,1160.9289,32862,161095,29874,115.06785714285714,113754
|
||||
20250813_172741_phrases_none_-1_0_4,phrases,none,0,4,0.1507,1305.4889,25669,184408,22135,131.72,107278
|
||||
20250813_172918_phrases_none_-1_0_8,phrases,none,0,8,0.2366,1365.3956,12779,223412,7754,159.58,113556
|
||||
20250813_173023_phrases_none_-1_0_16,phrases,none,0,16,0.2712,1366.4711,6994,238997,142,170.71214285714285,119929
|
||||
20250813_173123_phrases_none_-1_0_24,phrases,none,0,24,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
|
||||
20250813_173214_phrases_none_-1_0_48,phrases,none,0,48,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
|
||||
20250813_173306_phrases_none_-1_1_0,phrases,none,1,0,0.0942,606.3111,39156,138456,36189,98.89714285714285,116154
|
||||
20250813_173448_phrases_none_-1_1_1,phrases,none,1,1,0.1055,865.5467,36615,146196,33656,104.42571428571429,118449
|
||||
20250813_173636_phrases_none_-1_1_2,phrases,none,1,2,0.1207,1160.9289,32862,161095,29874,115.06785714285714,113754
|
||||
20250813_173824_phrases_none_-1_1_4,phrases,none,1,4,0.1507,1305.4889,25669,184408,22135,131.72,107278
|
||||
20250813_174000_phrases_none_-1_1_8,phrases,none,1,8,0.2366,1365.3956,12779,223412,7754,159.58,113556
|
||||
20250813_174119_phrases_none_-1_1_16,phrases,none,1,16,0.2712,1366.4711,6994,238997,142,170.71214285714285,119929
|
||||
20250813_174220_phrases_none_-1_1_24,phrases,none,1,24,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
|
||||
20250813_174314_phrases_none_-1_1_48,phrases,none,1,48,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
|
||||
20250813_174406_phrases_none_-1_2_0,phrases,none,2,0,0.1154,775.2578,23364,162542,19103,116.10142857142857,131171
|
||||
20250813_174516_phrases_none_-1_2_1,phrases,none,2,1,0.1229,925.6178,21484,168916,17216,120.65428571428572,131354
|
||||
20250813_174625_phrases_none_-1_2_2,phrases,none,2,2,0.1401,1175.8044,18673,181497,14424,129.6407142857143,124656
|
||||
20250813_174734_phrases_none_-1_2_4,phrases,none,2,4,0.1745,1309.2400,13265,203089,8938,145.06357142857144,117056
|
||||
20250813_174842_phrases_none_-1_2_8,phrases,none,2,8,0.2438,1367.9511,6821,236944,2307,169.24571428571429,121637
|
||||
20250813_174938_phrases_none_-1_2_16,phrases,none,2,16,0.2715,1368.6533,4719,247372,13,176.6942857142857,124752
|
||||
20250813_175030_phrases_none_-1_2_24,phrases,none,2,24,0.2715,1368.6533,4720,247398,0,176.71285714285713,124765
|
||||
20250813_175123_phrases_none_-1_2_48,phrases,none,2,48,0.2715,1368.6533,4720,247398,0,176.71285714285713,124765
|
||||
20250813_175212_phrases_none_-1_4_0,phrases,none,4,0,0.1461,917.3200,12382,188395,9140,134.56785714285715,138261
|
||||
20250813_175308_phrases_none_-1_4_1,phrases,none,4,1,0.1444,1016.7067,11297,192252,8053,137.32285714285715,137005
|
||||
20250813_175355_phrases_none_-1_4_2,phrases,none,4,2,0.1640,1212.6044,9670,203157,6432,145.11214285714286,129190
|
||||
20250813_175450_phrases_none_-1_4_4,phrases,none,4,4,0.1844,1328.3778,6782,224561,3533,160.4007142857143,122890
|
||||
20250813_175540_phrases_none_-1_4_8,phrases,none,4,8,0.2532,1373.9778,4029,253029,745,180.735,126124
|
||||
20250813_175635_phrases_none_-1_4_16,phrases,none,4,16,0.2731,1374.4044,3313,260575,0,186.125,128467
|
||||
20250813_175724_phrases_none_-1_4_24,phrases,none,4,24,0.2731,1374.4044,3313,260575,0,186.125,128467
|
||||
20250813_175813_phrases_none_-1_4_48,phrases,none,4,48,0.2731,1374.4044,3313,260575,0,186.125,128467
|
||||
20250813_175900_phrases_none_-1_8_0,phrases,none,8,0,0.1722,1081.2489,6529,220276,4267,157.34,141465
|
||||
20250813_175950_phrases_none_-1_8_1,phrases,none,8,1,0.1759,1147.3956,6030,222849,3768,159.17785714285714,139509
|
||||
20250813_180037_phrases_none_-1_8_2,phrases,none,8,2,0.1925,1262.1600,5229,234827,2968,167.73357142857142,133023
|
||||
20250813_180131_phrases_none_-1_8_4,phrases,none,8,4,0.2184,1346.5333,3785,254655,1523,181.89642857142857,127665
|
||||
20250813_180222_phrases_none_-1_8_8,phrases,none,8,8,0.2630,1379.0667,2537,278374,269,198.83857142857144,129892
|
||||
20250813_180315_phrases_none_-1_8_16,phrases,none,8,16,0.2752,1379.3156,2273,284030,0,202.87857142857143,131682
|
||||
20250813_180404_phrases_none_-1_8_24,phrases,none,8,24,0.2752,1379.3156,2273,284030,0,202.87857142857143,131682
|
||||
20250813_180453_phrases_none_-1_8_48,phrases,none,8,48,0.2752,1379.3156,2273,284030,0,202.87857142857143,131682
|
||||
20250813_180542_phrases_none_-1_16_0,phrases,none,16,0,0.2053,1267.8489,3405,269846,1865,192.74714285714285,142548
|
||||
20250813_180632_phrases_none_-1_16_1,phrases,none,16,1,0.2059,1279.7244,3196,271773,1656,194.12357142857144,140763
|
||||
20250813_180723_phrases_none_-1_16_2,phrases,none,16,2,0.2214,1325.5867,2835,282258,1296,201.61285714285714,134192
|
||||
20250813_180816_phrases_none_-1_16_4,phrases,none,16,4,0.2379,1375.3067,2176,300256,637,214.46857142857144,129654
|
||||
20250813_180910_phrases_none_-1_16_8,phrases,none,16,8,0.2662,1391.0533,1641,319121,101,227.94357142857143,130933
|
||||
20250813_181005_phrases_none_-1_16_16,phrases,none,16,16,0.2722,1391.0400,1541,323278,0,230.91285714285715,132277
|
||||
20250813_181056_phrases_none_-1_16_24,phrases,none,16,24,0.2722,1391.0400,1541,323278,0,230.91285714285715,132277
|
||||
20250813_181150_phrases_none_-1_16_48,phrases,none,16,48,0.2722,1391.0400,1541,323278,0,230.91285714285715,132277
|
||||
20250813_181242_phrases_none_-1_24_0,phrases,none,24,0,0.2074,1341.2933,2290,303650,1109,216.89285714285714,140095
|
||||
20250813_181336_phrases_none_-1_24_1,phrases,none,24,1,0.2134,1339.4400,2184,304857,1003,217.755,138507
|
||||
20250813_181430_phrases_none_-1_24_2,phrases,none,24,2,0.2330,1354.8667,1960,315279,780,225.1992857142857,131976
|
||||
20250813_181523_phrases_none_-1_24_4,phrases,none,24,4,0.2451,1386.2356,1568,330903,387,236.3592857142857,127722
|
||||
20250813_181620_phrases_none_-1_24_8,phrases,none,24,8,0.2702,1393.9689,1235,347214,54,248.01,128647
|
||||
20250813_181717_phrases_none_-1_24_16,phrases,none,24,16,0.2721,1393.9244,1182,350595,0,250.425,129773
|
||||
20250813_181811_phrases_none_-1_24_24,phrases,none,24,24,0.2721,1393.9244,1182,350595,0,250.425,129773
|
||||
20250813_181906_phrases_none_-1_24_48,phrases,none,24,48,0.2721,1393.9244,1182,350595,0,250.425,129773
|
||||
20250813_210100_duckdb_english_-1_4_5,duckdb,english,4,5,0.2881,704.8000,6639,128030,0,91.45,80317
|
||||
20250813_210142_phrases_english_-1_4_5,phrases,english,4,5,0.2083,521.3733,4654,111616,1700,79.72571428571429,77189
|
||||
20250813_210257_phrases_english_-1_4_6,phrases,english,4,6,0.2195,595.7778,4250,114239,1297,81.59928571428571,77437
|
||||
20250813_210406_phrases_english_-1_4_7,phrases,english,4,7,0.2465,654.8444,3904,117509,950,83.935,77710
|
||||
20250813_210516_phrases_english_-1_4_8,phrases,english,4,8,0.2703,705.4267,3584,122735,630,87.66785714285714,78610
|
||||
20250813_210624_phrases_english_-1_4_9,phrases,english,4,9,0.2796,722.9733,3346,125792,390,89.85142857142857,79288
|
||||
20250813_210735_phrases_english_-1_4_10,phrases,english,4,10,0.2870,729.9289,3205,127524,245,91.08857142857143,79810
|
||||
20250813_210846_phrases_english_-1_4_11,phrases,english,4,11,0.2910,731.6133,3106,128280,141,91.62857142857143,80077
|
||||
20250813_210955_phrases_english_-1_4_12,phrases,english,4,12,0.2940,732.1911,3044,128816,74,92.01142857142857,80251
|
||||
20250813_211102_phrases_english_-1_4_13,phrases,english,4,13,0.2948,732.2889,3006,129079,34,92.19928571428571,80351
|
||||
20250813_211209_phrases_english_-1_4_14,phrases,english,4,14,0.2938,732.2978,2989,129185,14,92.275,80400
|
||||
20250813_211315_phrases_english_-1_5_5,phrases,english,5,5,0.2173,517.2000,3835,111400,1235,79.57142857142857,76183
|
||||
20250813_211424_phrases_english_-1_5_6,phrases,english,5,6,0.2309,590.4667,3536,113742,936,81.24428571428571,76390
|
||||
20250813_211534_phrases_english_-1_5_7,phrases,english,5,7,0.2538,648.1467,3280,116753,680,83.395,76603
|
||||
20250813_211644_phrases_english_-1_5_8,phrases,english,5,8,0.2713,698.4800,3035,121771,435,86.97928571428571,77476
|
||||
20250813_211755_phrases_english_-1_5_9,phrases,english,5,9,0.2818,715.6089,2870,124598,270,88.99857142857142,78102
|
||||
20250813_211908_phrases_english_-1_5_10,phrases,english,5,10,0.2888,722.5244,2768,126202,166,90.14428571428572,78608
|
||||
20250813_212014_phrases_english_-1_5_11,phrases,english,5,11,0.2923,724.1733,2702,126846,97,90.60428571428571,78836
|
||||
20250813_212121_phrases_english_-1_5_12,phrases,english,5,12,0.2926,724.7289,2656,127310,49,90.93571428571428,79002
|
||||
20250813_212229_phrases_english_-1_5_13,phrases,english,5,13,0.2940,724.7778,2632,127516,23,91.08285714285714,79084
|
||||
20250813_212336_phrases_english_-1_5_14,phrases,english,5,14,0.2936,724.7822,2621,127595,10,91.13928571428572,79117
|
||||
20250813_212441_phrases_english_-1_6_5,phrases,english,6,5,0.2286,532.4889,3326,111572,976,79.69428571428571,75634
|
||||
20250813_212550_phrases_english_-1_6_6,phrases,english,6,6,0.2452,601.1067,3090,113712,740,81.22285714285714,75810
|
||||
20250813_212701_phrases_english_-1_6_7,phrases,english,6,7,0.2606,656.0533,2883,116571,533,83.265,76015
|
||||
20250813_212811_phrases_english_-1_6_8,phrases,english,6,8,0.2758,703.9644,2686,121408,336,86.72,76833
|
||||
20250813_212922_phrases_english_-1_6_9,phrases,english,6,9,0.2832,720.5422,2556,124095,206,88.63928571428572,77428
|
||||
20250813_213032_phrases_english_-1_6_10,phrases,english,6,10,0.2871,727.2844,2468,125638,116,89.74142857142857,77916
|
||||
20250813_213140_phrases_english_-1_6_11,phrases,english,6,11,0.2896,728.7956,2417,126211,64,90.15071428571429,78124
|
||||
20250813_213257_phrases_english_-1_6_12,phrases,english,6,12,0.2913,729.3200,2382,126624,27,90.44571428571429,78278
|
||||
20250813_213410_phrases_english_-1_6_13,phrases,english,6,13,0.2927,729.3644,2366,126799,9,90.57071428571429,78352
|
||||
20250813_213517_phrases_english_-1_6_14,phrases,english,6,14,0.2928,729.3644,2362,126842,4,90.60142857142857,78373
|
||||
20250813_213626_phrases_english_-1_7_5,phrases,english,7,5,0.2358,547.9867,2934,111748,784,79.82,75148
|
||||
20250813_213741_phrases_english_-1_7_6,phrases,english,7,6,0.2455,606.9378,2748,113674,598,81.19571428571429,75290
|
||||
20250813_213858_phrases_english_-1_7_7,phrases,english,7,7,0.2617,660.2489,2577,116381,427,83.12928571428571,75452
|
||||
20250813_214015_phrases_english_-1_7_8,phrases,english,7,8,0.2790,707.7867,2410,121097,260,86.49785714285714,76252
|
||||
20250813_214134_phrases_english_-1_7_9,phrases,english,7,9,0.2862,724.0533,2302,123686,152,88.34714285714286,76823
|
||||
20250813_214245_phrases_english_-1_7_10,phrases,english,7,10,0.2887,730.5244,2234,125113,83,89.36642857142857,77280
|
||||
20250813_214357_phrases_english_-1_7_11,phrases,english,7,11,0.2924,731.9333,2197,125607,45,89.71928571428572,77469
|
||||
20250813_214509_phrases_english_-1_7_12,phrases,english,7,12,0.2934,732.4000,2171,125965,18,89.975,77607
|
||||
20250813_214616_phrases_english_-1_7_13,phrases,english,7,13,0.2946,732.4356,2159,126118,5,90.08428571428571,77672
|
||||
20250813_214724_phrases_english_-1_7_14,phrases,english,7,14,0.2946,732.4356,2157,126149,2,90.10642857142857,77690
|
||||
20250813_214831_phrases_english_-1_9_5,phrases,english,9,5,0.2402,565.6178,2392,111075,566,79.33928571428571,73618
|
||||
20250813_214943_phrases_english_-1_9_6,phrases,english,9,6,0.2510,619.9644,2263,112722,437,80.51571428571428,73737
|
||||
20250813_215055_phrases_english_-1_9_7,phrases,english,9,7,0.2631,667.1956,2134,115185,308,82.275,73839
|
||||
20250813_215207_phrases_english_-1_9_8,phrases,english,9,8,0.2781,713.2356,2007,119663,181,85.47357142857143,74601
|
||||
20250813_215319_phrases_english_-1_9_9,phrases,english,9,9,0.2834,729.0667,1930,122062,104,87.18714285714286,75124
|
||||
20250813_215427_phrases_english_-1_9_10,phrases,english,9,10,0.2879,734.9022,1885,123339,58,88.09928571428571,75539
|
||||
20250813_215537_phrases_english_-1_9_11,phrases,english,9,11,0.2909,736.1733,1858,123770,31,88.40714285714286,75702
|
||||
20250813_215648_phrases_english_-1_9_12,phrases,english,9,12,0.2917,736.6178,1840,124073,12,88.62357142857142,75818
|
||||
20250813_215751_phrases_english_-1_9_13,phrases,english,9,13,0.2938,736.6311,1832,124199,3,88.71357142857143,75872
|
||||
20250813_215854_phrases_english_-1_9_14,phrases,english,9,14,0.2938,736.6311,1830,124230,0,88.73571428571428,75890
|
||||
20250813_215956_phrases_english_-1_10_5,phrases,english,10,5,0.2405,564.1200,2220,110076,501,78.62571428571428,72645
|
||||
20250813_220107_phrases_english_-1_10_6,phrases,english,10,6,0.2524,617.2489,2108,111644,389,79.74571428571429,72756
|
||||
20250813_220217_phrases_english_-1_10_7,phrases,english,10,7,0.2623,663.9422,1992,114024,273,81.44571428571429,72840
|
||||
20250813_220321_phrases_english_-1_10_8,phrases,english,10,8,0.2778,709.8178,1878,118417,159,84.58357142857143,73574
|
||||
20250813_220426_phrases_english_-1_10_9,phrases,english,10,9,0.2844,725.6089,1810,120744,91,86.24571428571429,74083
|
||||
20250813_220532_phrases_english_-1_10_10,phrases,english,10,10,0.2881,731.2444,1770,121970,51,87.12142857142857,74476
|
||||
20250813_220637_phrases_english_-1_10_11,phrases,english,10,11,0.2904,732.5156,1744,122397,25,87.42642857142857,74639
|
||||
20250813_220743_phrases_english_-1_10_12,phrases,english,10,12,0.2914,732.9689,1726,122699,7,87.64214285714286,74751
|
||||
20250813_220845_phrases_english_-1_10_13,phrases,english,10,13,0.2934,732.9778,1721,122801,1,87.715,74797
|
||||
20250813_220946_phrases_english_-1_10_14,phrases,english,10,14,0.2934,732.9778,1720,122814,0,87.72428571428571,74805
|
||||
20250813_221046_phrases_english_-1_11_5,phrases,english,11,5,0.2486,569.0800,2065,109381,435,78.12928571428571,71865
|
||||
20250813_221153_phrases_english_-1_11_6,phrases,english,11,6,0.2586,619.0044,1971,110817,341,79.155,71977
|
||||
20250813_221259_phrases_english_-1_11_7,phrases,english,11,7,0.2684,664.9822,1868,113100,238,80.78571428571429,72052
|
||||
20250813_221406_phrases_english_-1_11_8,phrases,english,11,8,0.2800,708.9333,1769,117377,139,83.84071428571428,72755
|
||||
20250813_221510_phrases_english_-1_11_9,phrases,english,11,9,0.2862,724.6178,1709,119653,79,85.46642857142857,73262
|
||||
20250813_221616_phrases_english_-1_11_10,phrases,english,11,10,0.2900,730.1911,1675,120836,45,86.31142857142858,73640
|
||||
20250813_221721_phrases_english_-1_11_11,phrases,english,11,11,0.2926,731.4533,1651,121253,21,86.60928571428572,73802
|
||||
20250813_221825_phrases_english_-1_11_12,phrases,english,11,12,0.2935,731.9200,1636,121524,6,86.80285714285715,73896
|
||||
20250813_221929_phrases_english_-1_11_13,phrases,english,11,13,0.2954,731.9289,1632,121616,1,86.86857142857143,73940
|
||||
20250813_222030_phrases_english_-1_11_14,phrases,english,11,14,0.2954,731.9289,1631,121629,0,86.87785714285714,73948
|
||||
20250813_222129_duckdb_none_-1_4_5,duckdb,none,4,5,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
|
||||
20250813_222206_phrases_none_-1_4_5,phrases,none,4,5,0.1932,1350.9111,5821,232789,2560,166.27785714285713,122981
|
||||
20250813_222300_phrases_none_-1_4_6,phrases,none,4,6,0.2036,1365.5067,5027,239895,1753,171.35357142857143,123552
|
||||
20250813_222354_phrases_none_-1_4_7,phrases,none,4,7,0.2349,1372.5200,4456,246070,1174,175.7642857142857,124560
|
||||
20250813_222448_phrases_none_-1_4_8,phrases,none,4,8,0.2532,1373.9778,4029,253029,745,180.735,126124
|
||||
20250813_222544_phrases_none_-1_4_9,phrases,none,4,9,0.2624,1374.2622,3740,256538,454,183.24142857142857,126990
|
||||
20250813_222640_phrases_none_-1_4_10,phrases,none,4,10,0.2687,1374.3556,3562,258648,271,184.74857142857144,127703
|
||||
20250813_222733_phrases_none_-1_4_11,phrases,none,4,11,0.2729,1374.4400,3452,259500,155,185.35714285714286,128044
|
||||
20250813_222827_phrases_none_-1_4_12,phrases,none,4,12,0.2745,1374.4267,3381,260125,78,185.80357142857142,128286
|
||||
20250813_222922_phrases_none_-1_4_13,phrases,none,4,13,0.2760,1374.4044,3339,260403,34,186.00214285714284,128399
|
||||
20250813_223015_phrases_none_-1_4_14,phrases,none,4,14,0.2732,1374.4044,3322,260507,14,186.07642857142858,128447
|
||||
20250813_223106_phrases_none_-1_5_5,phrases,none,5,5,0.1988,1353.4622,4792,241122,1886,172.23,124557
|
||||
20250813_223204_phrases_none_-1_5_6,phrases,none,5,6,0.2122,1367.3422,4194,247778,1278,176.9842857142857,125103
|
||||
20250813_223257_phrases_none_-1_5_7,phrases,none,5,7,0.2399,1373.6933,3767,253550,847,181.10714285714286,126007
|
||||
20250813_223351_phrases_none_-1_5_8,phrases,none,5,8,0.2577,1375.0089,3440,260253,519,185.895,127535
|
||||
20250813_223446_phrases_none_-1_5_9,phrases,none,5,9,0.2663,1375.2622,3235,263519,314,188.22785714285715,128348
|
||||
20250813_223542_phrases_none_-1_5_10,phrases,none,5,10,0.2717,1375.3111,3107,265465,183,189.61785714285713,129025
|
||||
20250813_223634_phrases_none_-1_5_11,phrases,none,5,11,0.2756,1375.4000,3034,266187,106,190.13357142857143,129317
|
||||
20250813_223727_phrases_none_-1_5_12,phrases,none,5,12,0.2763,1375.3911,2983,266726,53,190.51857142857142,129538
|
||||
20250813_223820_phrases_none_-1_5_13,phrases,none,5,13,0.2780,1375.3733,2955,266950,23,190.67857142857142,129634
|
||||
20250813_223914_phrases_none_-1_5_14,phrases,none,5,14,0.2763,1375.3733,2944,267029,10,190.735,129667
|
||||
20250813_224006_phrases_none_-1_6_5,phrases,none,6,5,0.2083,1357.2311,4179,248276,1526,177.34,125864
|
||||
20250813_224059_phrases_none_-1_6_6,phrases,none,6,6,0.2227,1370.1111,3683,254618,1024,181.87,126320
|
||||
20250813_224154_phrases_none_-1_6_7,phrases,none,6,7,0.2446,1375.9467,3327,260186,666,185.84714285714287,127204
|
||||
20250813_224249_phrases_none_-1_6_8,phrases,none,6,8,0.2607,1377.0578,3066,266658,405,190.47,128655
|
||||
20250813_224344_phrases_none_-1_6_9,phrases,none,6,9,0.2626,1377.2933,2905,269749,244,192.67785714285714,129428
|
||||
20250813_224440_phrases_none_-1_6_10,phrases,none,6,10,0.2659,1377.3022,2796,271612,132,194.00857142857143,130067
|
||||
20250813_224533_phrases_none_-1_6_11,phrases,none,6,11,0.2705,1377.3867,2738,272274,72,194.48142857142858,130349
|
||||
20250813_224626_phrases_none_-1_6_12,phrases,none,6,12,0.2723,1377.3867,2698,272764,30,194.83142857142857,130558
|
||||
20250813_224720_phrases_none_-1_6_13,phrases,none,6,13,0.2740,1377.3556,2679,272950,9,194.96428571428572,130646
|
||||
20250813_224811_phrases_none_-1_6_14,phrases,none,6,14,0.2739,1377.3556,2675,272993,4,194.995,130667
|
||||
20250813_224901_phrases_none_-1_7_5,phrases,none,7,5,0.2179,1360.0756,3692,254793,1245,181.995,126840
|
||||
20250813_224954_phrases_none_-1_7_6,phrases,none,7,6,0.2272,1372.5289,3282,260774,830,186.26714285714286,127217
|
||||
20250813_225050_phrases_none_-1_7_7,phrases,none,7,7,0.2467,1377.2667,2990,266094,537,190.06714285714287,128021
|
||||
20250813_225145_phrases_none_-1_7_8,phrases,none,7,8,0.2626,1378.2800,2769,272417,316,194.58357142857142,129446
|
||||
20250813_225241_phrases_none_-1_7_9,phrases,none,7,9,0.2655,1378.4978,2636,275372,183,196.6942857142857,130197
|
||||
20250813_225336_phrases_none_-1_7_10,phrases,none,7,10,0.2689,1378.5244,2550,277111,96,197.93642857142856,130797
|
||||
20250813_225427_phrases_none_-1_7_11,phrases,none,7,11,0.2731,1378.6000,2508,277686,52,198.34714285714287,131056
|
||||
20250813_225519_phrases_none_-1_7_12,phrases,none,7,12,0.2745,1378.6000,2478,278119,21,198.65642857142856,131243
|
||||
20250813_225611_phrases_none_-1_7_13,phrases,none,7,13,0.2757,1378.5733,2463,278283,5,198.77357142857142,131322
|
||||
20250813_225702_phrases_none_-1_7_14,phrases,none,7,14,0.2756,1378.5733,2461,278314,2,198.7957142857143,131340
|
||||
20250813_225753_phrases_none_-1_9_5,phrases,none,9,5,0.2237,1365.1600,3018,267606,906,191.14714285714285,128162
|
||||
20250813_225847_phrases_none_-1_9_6,phrases,none,9,6,0.2326,1376.7600,2729,273019,613,195.01357142857142,128423
|
||||
20250813_225941_phrases_none_-1_9_7,phrases,none,9,7,0.2467,1379.9867,2504,277959,388,198.54214285714286,129108
|
||||
20250813_230036_phrases_none_-1_9_8,phrases,none,9,8,0.2596,1380.8756,2336,284007,220,202.86214285714286,130474
|
||||
20250813_230132_phrases_none_-1_9_9,phrases,none,9,9,0.2657,1381.0489,2242,286741,126,204.815,131156
|
||||
20250813_230227_phrases_none_-1_9_10,phrases,none,9,10,0.2692,1381.0711,2181,288318,64,205.94142857142856,131709
|
||||
20250813_230320_phrases_none_-1_9_11,phrases,none,9,11,0.2725,1381.1467,2150,288817,33,206.29785714285714,131931
|
||||
20250813_230413_phrases_none_-1_9_12,phrases,none,9,12,0.2729,1381.1467,2130,289182,12,206.55857142857144,132086
|
||||
20250813_230504_phrases_none_-1_9_13,phrases,none,9,13,0.2743,1381.1244,2122,289303,3,206.645,132139
|
||||
20250813_230555_phrases_none_-1_9_14,phrases,none,9,14,0.2743,1381.1244,2120,289334,0,206.66714285714286,132157
|
||||
20250813_230646_phrases_none_-1_10_5,phrases,none,10,5,0.2247,1367.4844,2799,273691,802,195.49357142857144,128332
|
||||
20250813_230736_phrases_none_-1_10_6,phrases,none,10,6,0.2334,1377.9467,2546,278907,546,199.21928571428572,128545
|
||||
20250813_230830_phrases_none_-1_10_7,phrases,none,10,7,0.2472,1380.7244,2343,283700,343,202.64285714285714,129172
|
||||
20250813_230922_phrases_none_-1_10_8,phrases,none,10,8,0.2593,1381.5600,2196,289586,196,206.84714285714287,130480
|
||||
20250813_231016_phrases_none_-1_10_9,phrases,none,10,9,0.2654,1381.7111,2112,292248,112,208.74857142857144,131145
|
||||
20250813_231109_phrases_none_-1_10_10,phrases,none,10,10,0.2686,1381.7378,2056,293784,56,209.84571428571428,131675
|
||||
20250813_231203_phrases_none_-1_10_11,phrases,none,10,11,0.2725,1381.8178,2027,294274,27,210.19571428571427,131892
|
||||
20250813_231257_phrases_none_-1_10_12,phrases,none,10,12,0.2728,1381.8133,2007,294631,7,210.4507142857143,132041
|
||||
20250813_231346_phrases_none_-1_10_13,phrases,none,10,13,0.2746,1381.7956,2002,294733,1,210.52357142857142,132087
|
||||
20250813_231437_phrases_none_-1_10_14,phrases,none,10,14,0.2746,1381.7956,2001,294746,0,210.53285714285715,132095
|
||||
20250813_231528_phrases_none_-1_11_5,phrases,none,11,5,0.2291,1371.9422,2616,279880,716,199.9142857142857,128487
|
||||
20250813_231620_phrases_none_-1_11_6,phrases,none,11,6,0.2365,1381.1200,2395,284879,492,203.485,128668
|
||||
20250813_231714_phrases_none_-1_11_7,phrases,none,11,7,0.2480,1383.4489,2210,289537,307,206.81214285714285,129265
|
||||
20250813_231809_phrases_none_-1_11_8,phrases,none,11,8,0.2597,1384.1422,2079,295296,176,210.9257142857143,130535
|
||||
20250814_104809_phrases_none_-1_11_9,phrases,none,11,9,0.2656,1384.2533,2003,297899,100,212.785,131201
|
||||
20250814_104904_phrases_none_-1_11_10,phrases,none,11,10,0.2684,1384.2756,1953,299393,50,213.85214285714287,131713
|
||||
20250814_105006_phrases_none_-1_11_11,phrases,none,11,11,0.2712,1384.3244,1926,299865,23,214.18928571428572,131927
|
||||
20250814_105103_phrases_none_-1_11_12,phrases,none,11,12,0.2713,1384.3200,1909,300191,6,214.42214285714286,132058
|
||||
20250814_105158_phrases_none_-1_11_13,phrases,none,11,13,0.2732,1384.2978,1905,300283,1,214.48785714285714,132102
|
||||
20250814_105250_phrases_none_-1_11_14,phrases,none,11,14,0.2732,1384.2978,1904,300296,0,214.49714285714285,132110
|
||||
|
527
phrase_index.py
Normal file
@@ -0,0 +1,527 @@
|
||||
import pathlib
|
||||
import sys
|
||||
import duckdb
|
||||
import ir_datasets
|
||||
import collections
|
||||
import pandas as pd
|
||||
|
||||
from phrases_extractor import extract_phrases_pmi_duckdb
|
||||
from ze_index import normalize
|
||||
|
||||
def insert_dataset(con, ir_dataset, logging=True):
|
||||
"""
|
||||
Insert documents from an ir_dataset. Works with several datasets.
|
||||
Add document attributes if needed.
|
||||
"""
|
||||
con.sql('CREATE TABLE documents (did TEXT, content TEXT)')
|
||||
insert = 'INSERT INTO documents(did, content) VALUES '
|
||||
sql = insert
|
||||
part = 0
|
||||
total = 0
|
||||
count = ir_dataset.docs_count()
|
||||
if logging:
|
||||
print(f"Inserting {count} docs...", file=sys.stderr)
|
||||
for doc in ir_dataset.docs_iter():
|
||||
doc_text = ""
|
||||
if hasattr(doc, 'title'):
|
||||
doc_text = doc.title
|
||||
if hasattr(doc, 'body'):
|
||||
doc_text += " " + doc.body
|
||||
if hasattr(doc, 'text'):
|
||||
doc_text += " " + doc.text
|
||||
sql += "('" + doc.doc_id + "','" + normalize(doc_text) + "'),"
|
||||
part += 1
|
||||
if part > 9999:
|
||||
total += part
|
||||
if logging:
|
||||
print(str(total) + " docs", file=sys.stderr)
|
||||
con.sql(sql)
|
||||
part = 0
|
||||
sql = insert
|
||||
con.sql(sql)
|
||||
|
||||
def create_lm(con, stemmer):
|
||||
con.sql(f"""
|
||||
CREATE OR REPLACE MACRO fts_main_documents.match_lm(query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS TABLE (
|
||||
WITH tokens AS (
|
||||
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
|
||||
),
|
||||
fieldids AS (
|
||||
SELECT fieldid
|
||||
FROM fts_main_documents.fields
|
||||
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
|
||||
),
|
||||
qtermids AS (
|
||||
SELECT termid, df
|
||||
FROM fts_main_documents.dict AS dict, tokens
|
||||
WHERE (dict.term = tokens.t)
|
||||
),
|
||||
qterms AS (
|
||||
SELECT termid, docid
|
||||
FROM fts_main_documents.terms AS terms
|
||||
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
|
||||
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
|
||||
),
|
||||
term_tf AS (
|
||||
SELECT termid, docid, count_star() AS tf
|
||||
FROM qterms
|
||||
GROUP BY docid, termid
|
||||
),
|
||||
cdocs AS (
|
||||
SELECT docid
|
||||
FROM qterms
|
||||
GROUP BY docid
|
||||
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
|
||||
),
|
||||
subscores AS (
|
||||
SELECT docs.docid, docs.len AS doc_len, term_tf.termid, term_tf.tf, qtermids.df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * docs.len)) AS subscore
|
||||
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
|
||||
WHERE ((term_tf.docid = cdocs.docid)
|
||||
AND (term_tf.docid = docs.docid)
|
||||
AND (term_tf.termid = qtermids.termid))
|
||||
),
|
||||
scores AS (
|
||||
SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
|
||||
),
|
||||
postings_cost AS (
|
||||
SELECT COUNT(DISTINCT docid) AS cost FROM qterms
|
||||
)
|
||||
SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
|
||||
);
|
||||
""")
|
||||
|
||||
def create_bm25(con, stemmer):
|
||||
con.sql(f"""
|
||||
CREATE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, conjunctive := 0, k := 1.2, fields := NULL) AS (
|
||||
WITH tokens AS (
|
||||
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
|
||||
),
|
||||
fieldids AS (
|
||||
SELECT fieldid
|
||||
FROM fts_main_documents.fields
|
||||
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
|
||||
),
|
||||
qtermids AS (
|
||||
SELECT termid, df
|
||||
FROM fts_main_documents.dict AS dict, tokens
|
||||
WHERE (dict.term = tokens.t)
|
||||
),
|
||||
qterms AS (
|
||||
SELECT termid, docid
|
||||
FROM fts_main_documents.terms AS terms
|
||||
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
|
||||
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
|
||||
),
|
||||
term_tf AS (
|
||||
SELECT termid, docid, count_star() AS tf
|
||||
FROM qterms
|
||||
GROUP BY docid, termid
|
||||
),
|
||||
cdocs AS (
|
||||
SELECT docid
|
||||
FROM qterms
|
||||
GROUP BY docid
|
||||
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
|
||||
),
|
||||
subscores AS (
|
||||
SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df, (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
|
||||
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
|
||||
WHERE ((term_tf.docid = cdocs.docid)
|
||||
AND (term_tf.docid = docs.docid)
|
||||
AND (term_tf.termid = qtermids.termid))
|
||||
),
|
||||
scores AS (
|
||||
SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid
|
||||
)
|
||||
SELECT score FROM scores, fts_main_documents.docs AS docs
|
||||
WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
|
||||
""")
|
||||
|
||||
def create_docs_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did"):
|
||||
"""
|
||||
Create the documents table.
|
||||
input_id should be the column name in input_table that uniquely identifies each document (e.g., 'did').
|
||||
"""
|
||||
con.sql(f"""
|
||||
CREATE SCHEMA IF NOT EXISTS {fts_schema};
|
||||
CREATE TABLE {fts_schema}.docs AS (
|
||||
SELECT
|
||||
row_number() OVER () AS docid,
|
||||
{input_id} AS name
|
||||
FROM
|
||||
{input_schema}.{input_table}
|
||||
);
|
||||
""")
|
||||
|
||||
def create_tokenizer_duckdb(con):
|
||||
con.sql("""
|
||||
CREATE MACRO fts_main_documents.tokenize(s) AS (
|
||||
string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
|
||||
);
|
||||
""")
|
||||
|
||||
|
||||
def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
|
||||
con.sql(f"""
|
||||
CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT);
|
||||
CREATE OR REPLACE MACRO {fts_schema}.tokenize(query_string) AS (
|
||||
WITH RECURSIVE sequence AS (
|
||||
SELECT range AS nr
|
||||
FROM RANGE((SELECT MAX(LEN(term)) + 1 FROM {fts_schema}.dict))
|
||||
),
|
||||
simpledict AS (
|
||||
SELECT '' AS term
|
||||
UNION
|
||||
SELECT term FROM {fts_schema}.dict
|
||||
),
|
||||
subterms(term, subquery) AS (
|
||||
SELECT '', lower(strip_accents(CAST(query_string AS VARCHAR)))
|
||||
UNION
|
||||
SELECT MAX(dict.term), SUBSTRING(subquery,
|
||||
CASE WHEN MAX(nr) < 1 THEN 2 ELSE MAX(nr) + 1 END,
|
||||
LEN(subquery)) AS subquery
|
||||
FROM subterms, sequence, simpledict as dict
|
||||
WHERE SUBSTRING(subquery, 1, nr) = dict.term
|
||||
GROUP BY subquery
|
||||
)
|
||||
SELECT LIST(term) FROM subterms WHERE NOT term = ''
|
||||
)
|
||||
""")
|
||||
|
||||
def create_stopwords_table(con, fts_schema="fts_main_documents", stopwords='none'):
|
||||
"""
|
||||
Create the stopwords table.
|
||||
If stopwords is 'english', it will create a table with English stopwords.
|
||||
If stopwords is 'none', it will create an empty table.
|
||||
"""
|
||||
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords;")
|
||||
if stopwords == 'english':
|
||||
con.sql(f"""
|
||||
CREATE TABLE {fts_schema}.stopwords (sw VARCHAR);
|
||||
INSERT INTO {fts_schema}.stopwords VALUES ('a'), ('a''s'), ('able'), ('about'), ('above'), ('according'), ('accordingly'), ('across'), ('actually'), ('after'), ('afterwards'), ('again'), ('against'), ('ain''t'), ('all'), ('allow'), ('allows'), ('almost'), ('alone'), ('along'), ('already'), ('also'), ('although'), ('always'), ('am'), ('among'), ('amongst'), ('an'), ('and'), ('another'), ('any'), ('anybody'), ('anyhow'), ('anyone'), ('anything'), ('anyway'), ('anyways'), ('anywhere'), ('apart'), ('appear'), ('appreciate'), ('appropriate'), ('are'), ('aren''t'), ('around'), ('as'), ('aside'), ('ask'), ('asking'), ('associated'), ('at'), ('available'), ('away'), ('awfully'), ('b'), ('be'), ('became'), ('because'), ('become'), ('becomes'), ('becoming'), ('been'), ('before'), ('beforehand'), ('behind'), ('being'), ('believe'), ('below'), ('beside'), ('besides'), ('best'), ('better'), ('between'), ('beyond'), ('both'), ('brief'), ('but'), ('by'), ('c'), ('c''mon'), ('c''s'), ('came'), ('can'), ('can''t'), ('cannot'), ('cant'), ('cause'), ('causes'), ('certain'), ('certainly'), ('changes'), ('clearly'), ('co'), ('com'), ('come'), ('comes'), ('concerning'), ('consequently'), ('consider'), ('considering'), ('contain'), ('containing'), ('contains'), ('corresponding'), ('could'), ('couldn''t'), ('course'), ('currently'), ('d'), ('definitely'), ('described'), ('despite'), ('did'), ('didn''t'), ('different'), ('do'), ('does'), ('doesn''t'), ('doing'), ('don''t'), ('done'), ('down'), ('downwards'), ('during'), ('e'), ('each'), ('edu'), ('eg'), ('eight'), ('either'), ('else'), ('elsewhere'), ('enough'), ('entirely'), ('especially'), ('et'), ('etc'), ('even'), ('ever'), ('every'), ('everybody'), ('everyone'), ('everything'), ('everywhere'), ('ex'), ('exactly'), ('example'), ('except'), ('f'), ('far'), ('few'), ('fifth'), ('first'), ('five'), ('followed'), ('following'), ('follows'), ('for'), ('former'), ('formerly'), ('forth'), ('four'), ('from'), ('further'), ('furthermore'), ('g'), ('get'), ('gets'), ('getting'), ('given'), ('gives'), ('go'), ('goes'), ('going'), ('gone'), ('got'), ('gotten'), ('greetings'), ('h'), ('had'), ('hadn''t'), ('happens'), ('hardly'), ('has'), ('hasn''t'), ('have'), ('haven''t'), ('having'), ('he'), ('he''s'), ('hello'), ('help'), ('hence'), ('her'), ('here'), ('here''s'), ('hereafter'), ('hereby'), ('herein'), ('hereupon'), ('hers'), ('herself'), ('hi'), ('him'), ('himself'), ('his'), ('hither'), ('hopefully'), ('how'), ('howbeit'), ('however'), ('i'), ('i''d'), ('i''ll'), ('i''m'), ('i''ve'), ('ie'), ('if'), ('ignored'), ('immediate'), ('in'), ('inasmuch'), ('inc'), ('indeed'), ('indicate'), ('indicated'), ('indicates'), ('inner'), ('insofar'), ('instead'), ('into'), ('inward'), ('is'), ('isn''t'), ('it'), ('it''d'), ('it''ll'), ('it''s'), ('its'), ('itself'), ('j'), ('just'), ('k'), ('keep'), ('keeps'), ('kept'), ('know'), ('knows'), ('known'), ('l'), ('last'), ('lately'), ('later'), ('latter'), ('latterly'), ('least'), ('less'), ('lest'), ('let'), ('let''s'), ('like'), ('liked'), ('likely'), ('little'), ('look'), ('looking'), ('looks'), ('ltd'), ('m'), ('mainly'), ('many'), ('may'), ('maybe'), ('me'), ('mean'), ('meanwhile'), ('merely'), ('might'), ('more'), ('moreover'), ('most'), ('mostly'), ('much'), ('must'), ('my'), ('myself'), ('n'), ('name'), ('namely'), ('nd'), ('near'), ('nearly'), ('necessary'), ('need'), ('needs'), ('neither'), ('never'), ('nevertheless'), ('new'), ('next'), ('nine'), ('no'), ('nobody'), ('non'), ('none'), ('noone'), ('nor'), ('normally'), 
('not'), ('nothing'), ('novel'), ('now'), ('nowhere'), ('o'), ('obviously'), ('of'), ('off'), ('often'), ('oh'), ('ok'), ('okay'), ('old'), ('on'), ('once'), ('one'), ('ones'), ('only'), ('onto'), ('or'), ('other'), ('others'), ('otherwise'), ('ought'), ('our'), ('ours'), ('ourselves'), ('out'), ('outside'), ('over'), ('overall'), ('own');
|
||||
INSERT INTO {fts_schema}.stopwords VALUES ('p'), ('particular'), ('particularly'), ('per'), ('perhaps'), ('placed'), ('please'), ('plus'), ('possible'), ('presumably'), ('probably'), ('provides'), ('q'), ('que'), ('quite'), ('qv'), ('r'), ('rather'), ('rd'), ('re'), ('really'), ('reasonably'), ('regarding'), ('regardless'), ('regards'), ('relatively'), ('respectively'), ('right'), ('s'), ('said'), ('same'), ('saw'), ('say'), ('saying'), ('says'), ('second'), ('secondly'), ('see'), ('seeing'), ('seem'), ('seemed'), ('seeming'), ('seems'), ('seen'), ('self'), ('selves'), ('sensible'), ('sent'), ('serious'), ('seriously'), ('seven'), ('several'), ('shall'), ('she'), ('should'), ('shouldn''t'), ('since'), ('six'), ('so'), ('some'), ('somebody'), ('somehow'), ('someone'), ('something'), ('sometime'), ('sometimes'), ('somewhat'), ('somewhere'), ('soon'), ('sorry'), ('specified'), ('specify'), ('specifying'), ('still'), ('sub'), ('such'), ('sup'), ('sure'), ('t'), ('t''s'), ('take'), ('taken'), ('tell'), ('tends'), ('th'), ('than'), ('thank'), ('thanks'), ('thanx'), ('that'), ('that''s'), ('thats'), ('the'), ('their'), ('theirs'), ('them'), ('themselves'), ('then'), ('thence'), ('there'), ('there''s'), ('thereafter'), ('thereby'), ('therefore'), ('therein'), ('theres'), ('thereupon'), ('these'), ('they'), ('they''d'), ('they''ll'), ('they''re'), ('they''ve'), ('think'), ('third'), ('this'), ('thorough'), ('thoroughly'), ('those'), ('though'), ('three'), ('through'), ('throughout'), ('thru'), ('thus'), ('to'), ('together'), ('too'), ('took'), ('toward'), ('towards'), ('tried'), ('tries'), ('truly'), ('try'), ('trying'), ('twice'), ('two'), ('u'), ('un'), ('under'), ('unfortunately'), ('unless'), ('unlikely'), ('until'), ('unto'), ('up'), ('upon'), ('us'), ('use'), ('used'), ('useful'), ('uses'), ('using'), ('usually'), ('uucp'), ('v'), ('value'), ('various'), ('very'), ('via'), ('viz'), ('vs'), ('w'), ('want'), ('wants'), ('was'), ('wasn''t'), ('way'), ('we'), ('we''d'), ('we''ll'), ('we''re'), ('we''ve'), ('welcome'), ('well'), ('went'), ('were'), ('weren''t'), ('what'), ('what''s'), ('whatever'), ('when'), ('whence'), ('whenever'), ('where'), ('where''s'), ('whereafter'), ('whereas'), ('whereby'), ('wherein'), ('whereupon'), ('wherever'), ('whether'), ('which'), ('while'), ('whither'), ('who'), ('who''s'), ('whoever'), ('whole'), ('whom'), ('whose'), ('why'), ('will'), ('willing'), ('wish'), ('with'), ('within'), ('without'), ('won''t'), ('wonder'), ('would'), ('would'), ('wouldn''t'), ('x'), ('y'), ('yes'), ('yet'), ('you'), ('you''d'), ('you''ll'), ('you''re'), ('you''ve'), ('your'), ('yours'), ('yourself'), ('yourselves'), ('z'), ('zero');
|
||||
""")
|
||||
else:
|
||||
con.sql(f"CREATE TABLE {fts_schema}.stopwords (sw VARCHAR);")
|
||||
|
||||
def create_duckdb_dict_table(con, fts_schema="fts_main_documents", stopwords='none'):
|
||||
"""
|
||||
Create the dict table using DuckDB's built-in dictionary functionality.
|
||||
"""
|
||||
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.dict;")
|
||||
create_stopwords_table(con, fts_schema, stopwords)
|
||||
|
||||
con.sql(f"""
|
||||
CREATE TABLE {fts_schema}.dict AS
|
||||
WITH distinct_terms AS (
|
||||
SELECT DISTINCT term
|
||||
FROM {fts_schema}.terms
|
||||
)
|
||||
SELECT
|
||||
row_number() OVER () AS termid,
|
||||
term
|
||||
FROM
|
||||
distinct_terms
|
||||
{"WHERE term NOT IN (SELECT sw FROM " + fts_schema + ".stopwords)" if stopwords == 'english' else ''}
|
||||
ORDER BY term;
|
||||
""")
|
||||
|
||||
def build_dict_table(con, mode='duckdb', fts_schema="fts_main_documents", stopwords='none', gpt4_token_file=None, ngram_range=(1,2), min_freq=10, min_pmi=5.0):
|
||||
"""
|
||||
Build the dictionary table using the specified mode.
|
||||
mode: 'phrases' or 'duckdb' (other values raise ValueError)
|
||||
"""
|
||||
if mode == 'phrases':
|
||||
create_stopwords_table(con, fts_schema=fts_schema, stopwords=stopwords)
|
||||
extract_phrases_pmi_duckdb(con, fts_schema="fts_main_documents", n=2, min_freq=min_freq, min_pmi=min_pmi)
|
||||
print("Extracted phrases:", con.execute("SELECT * FROM fts_main_documents.phrases LIMIT 10").fetchall())
|
||||
|
||||
print("\nAdded phrases to dictionary:", con.execute(f"SELECT * FROM {fts_schema}.dict LIMIT 10").fetchall())
|
||||
|
||||
print("\nAdded tokens to dictionary:", con.execute(f"SELECT * FROM {fts_schema}.dict WHERE term NOT LIKE '% %' LIMIT 10").fetchall())
|
||||
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.tokens")
|
||||
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.phrases")
|
||||
elif mode == 'duckdb':
|
||||
create_terms_table_duckdb(con, fts_schema=fts_schema, input_schema="main", input_table="documents", input_id="did", input_val="content")
|
||||
create_duckdb_dict_table(con, fts_schema=fts_schema, stopwords=stopwords)
|
||||
else:
|
||||
raise ValueError(f"Unknown dict table build mode: {mode}")
|
||||
|
||||
def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content"):
|
||||
"""
|
||||
Create the terms table with unique terms per docid.
|
||||
Assumes the table fts_main_documents.dict already exists.
|
||||
Adds a fieldid and termid column for compatibility with fielded search macros.
|
||||
"""
|
||||
# Cleanup input text removing special characters
|
||||
con.sql(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS
|
||||
SELECT
|
||||
did,
|
||||
regexp_replace(content, '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content
|
||||
FROM {input_schema}.{input_table}
|
||||
""")
|
||||
|
||||
con.sql(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.terms AS (
|
||||
SELECT
|
||||
0 AS fieldid,
|
||||
d.termid,
|
||||
t.docid
|
||||
FROM (
|
||||
SELECT
|
||||
row_number() OVER (ORDER BY (SELECT NULL)) AS docid,
|
||||
unnest({fts_schema}.tokenize({input_val})) AS term
|
||||
FROM {fts_schema}.cleaned_docs
|
||||
) AS t
|
||||
JOIN {fts_schema}.dict d ON t.term = d.term
|
||||
WHERE t.term != ''
|
||||
);
|
||||
""")
|
||||
|
||||
|
||||
def create_terms_table_duckdb(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content"):
|
||||
"""
|
||||
Step 1: Create the initial terms table (term, docid).
|
||||
"""
|
||||
con.sql(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.terms AS (
|
||||
SELECT
|
||||
row_number() OVER () AS docid,
|
||||
unnest({fts_schema}.tokenize({input_val})) AS term
|
||||
FROM {input_schema}.{input_table}
|
||||
WHERE {input_val} != ''
|
||||
);
|
||||
""")
|
||||
|
||||
def assign_termids_to_terms(con, fts_schema="fts_main_documents"):
|
||||
"""
|
||||
Step 3: Recreate the terms table, joining with dict to assign termid.
|
||||
"""
|
||||
con.sql(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.terms AS (
|
||||
SELECT
|
||||
0 AS fieldid,
|
||||
d.termid,
|
||||
t.docid,
|
||||
t.term,
|
||||
row_number() OVER (PARTITION BY t.docid) AS pos
|
||||
FROM {fts_schema}.terms t
|
||||
JOIN {fts_schema}.dict d ON t.term = d.term
|
||||
WHERE t.term != ''
|
||||
);
|
||||
""")
|
||||
|
||||
def update_docs_table(con, fts_schema="fts_main_documents"):
|
||||
"""
|
||||
Recompute document lengths: drop any existing 'len' column on the docs table and repopulate it with the number of indexed terms per document.
|
||||
"""
|
||||
# Remove old 'len' column if it exists, then add and populate a fresh one
|
||||
con.sql(f"ALTER TABLE {fts_schema}.docs DROP COLUMN IF EXISTS len;")
|
||||
con.sql(f"ALTER TABLE {fts_schema}.docs ADD COLUMN len INT;")
|
||||
con.sql(f"""
|
||||
UPDATE {fts_schema}.docs d
|
||||
SET len = (
|
||||
SELECT COUNT(termid)
|
||||
FROM {fts_schema}.terms t
|
||||
WHERE t.docid = d.docid
|
||||
);
|
||||
""")
|
||||
|
||||
def update_dict_table(con, fts_schema="fts_main_documents"):
|
||||
"""
|
||||
Update the dictionary table with document frequency (df).
|
||||
Assumes the table fts_main_documents.dict already exists.
|
||||
"""
|
||||
con.sql(f"ALTER TABLE {fts_schema}.dict ADD COLUMN IF NOT EXISTS df BIGINT;")
|
||||
con.sql(f"""
|
||||
UPDATE {fts_schema}.dict d
|
||||
SET df = (
|
||||
SELECT count(DISTINCT docid)
|
||||
FROM {fts_schema}.terms t
|
||||
WHERE t.termid = d.termid
|
||||
);
|
||||
""")
|
||||
|
||||
def limit_dict_table(con, max_terms=10000, fts_schema="fts_main_documents"):
|
||||
# Create a temporary table with limited terms and reassigned termid
|
||||
con.sql(f"""
|
||||
CREATE OR REPLACE TEMP TABLE temp_limited_dict AS
|
||||
SELECT
|
||||
ROW_NUMBER() OVER (ORDER BY df DESC, term ASC) AS termid,
|
||||
term,
|
||||
df
|
||||
FROM {fts_schema}.dict
|
||||
ORDER BY df DESC, term ASC
|
||||
LIMIT {max_terms};
|
||||
""")
|
||||
|
||||
# Drop original dict table
|
||||
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.dict;")
|
||||
|
||||
# Recreate dict table from temp table
|
||||
con.sql(f"""
|
||||
CREATE TABLE {fts_schema}.dict AS
|
||||
SELECT * FROM temp_limited_dict;
|
||||
""")
|
||||
|
||||
# Drop temp table
|
||||
con.sql("DROP TABLE IF EXISTS temp_limited_dict;")
|
||||
|
||||
|
||||
|
||||
def create_stats_table(con, fts_schema="fts_main_documents", index_type="standard", stemmer="none"):
|
||||
"""
|
||||
Create the stats table.
|
||||
This table contains statistics about the FTS index.
|
||||
Columns: num_docs, avgdl, sumdf, index_type, stemmer
|
||||
"""
|
||||
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stats;")
|
||||
con.sql(f"""
|
||||
CREATE TABLE {fts_schema}.stats AS (
|
||||
SELECT
|
||||
COUNT(docs.docid) AS num_docs,
|
||||
SUM(docs.len) / COUNT(docs.len) AS avgdl,
|
||||
(SELECT SUM(df) FROM fts_main_documents.dict) AS sumdf,
|
||||
'{index_type}' AS index_type,
|
||||
'{stemmer}' AS stemmer
|
||||
FROM {fts_schema}.docs AS docs
|
||||
);
|
||||
""")
|
||||
|
||||
def create_fields_table(con, fts_schema="fts_main_documents"):
|
||||
con.sql(f'''
|
||||
CREATE TABLE IF NOT EXISTS {fts_schema}.fields (
|
||||
fieldid INTEGER,
|
||||
field TEXT
|
||||
);
|
||||
''')
|
||||
# Insert a default field if table is empty
|
||||
con.sql(f'''
|
||||
INSERT INTO {fts_schema}.fields (fieldid, field)
|
||||
SELECT 0, 'content'
|
||||
WHERE NOT EXISTS (SELECT 1 FROM {fts_schema}.fields);
|
||||
''')
|
||||
|
||||
def index_documents(db_name, ir_dataset, stemmer='none', stopwords='none',
|
||||
logging=True, keepcontent=False, limit=10000, mode='duckdb', min_freq=10, min_pmi=5.0):
|
||||
"""
|
||||
Insert and index documents.
|
||||
"""
|
||||
if pathlib.Path(db_name).is_file():
|
||||
raise ValueError(f"File {db_name} already exists.")
|
||||
con = duckdb.connect(db_name)
|
||||
insert_dataset(con, ir_dataset, logging)
|
||||
if logging:
|
||||
print("Indexing...", file=sys.stderr)
|
||||
|
||||
docs = con.sql("SELECT * FROM documents LIMIT 10").df()
|
||||
print("Docs:\n", docs)
|
||||
|
||||
create_docs_table(con, input_schema="main", input_table="documents", input_id="did")
|
||||
|
||||
fts_docs = con.sql("SELECT * FROM fts_main_documents.docs LIMIT 10").df()
|
||||
print("fts_main_documents.docs:\n", fts_docs)
|
||||
|
||||
con.sql("CREATE SCHEMA IF NOT EXISTS fts_main_documents;")
|
||||
con.sql("CREATE TABLE IF NOT EXISTS fts_main_documents.dict (term TEXT);")
|
||||
|
||||
create_tokenizer_duckdb(con)
|
||||
|
||||
# Create the dict table
|
||||
build_dict_table(con, mode=mode, fts_schema="fts_main_documents", stopwords=stopwords, ngram_range=(1,2), min_freq=min_freq, min_pmi=min_pmi)
|
||||
|
||||
create_tokenizer_ciff(con)
|
||||
|
||||
dict = con.sql("SELECT * FROM fts_main_documents.dict LIMIT 10").df()
|
||||
print("fts_main_documents.dict:\n", dict)
|
||||
|
||||
# Clean up the terms table
|
||||
if mode == 'phrases':
|
||||
con.sql("DROP TABLE IF EXISTS fts_main_documents.terms;")
|
||||
create_terms_table(con, input_schema="main", input_table="documents", input_id="did", input_val="content")
|
||||
else:
|
||||
assign_termids_to_terms(con, fts_schema="fts_main_documents")
|
||||
|
||||
terms = con.sql("SELECT * FROM fts_main_documents.terms LIMIT 10").df()
|
||||
print("fts_main_documents.terms:\n", terms)
|
||||
|
||||
update_docs_table(con, fts_schema="fts_main_documents")
|
||||
|
||||
docs = con.sql("SELECT * FROM fts_main_documents.docs LIMIT 10").df()
|
||||
print("fts_main_documents.docs:\n", docs)
|
||||
|
||||
update_dict_table(con, fts_schema="fts_main_documents")
|
||||
print("Updated fts_main_documents.dict with document frequencies.")
|
||||
|
||||
|
||||
# Limit the dictionary to the `max_terms` most frequent terms
|
||||
if limit > 0:
|
||||
limit_dict_table(con, max_terms=limit, fts_schema="fts_main_documents")
|
||||
create_terms_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content")
|
||||
update_dict_table(con, fts_schema="fts_main_documents")
|
||||
print("Limited fts_main_documents.dict to 10000 most frequent terms.")
|
||||
|
||||
update_docs_table(con, fts_schema="fts_main_documents")
|
||||
|
||||
dict = con.sql("SELECT * FROM fts_main_documents.dict LIMIT 10").df()
|
||||
print("fts_main_documents.dict:\n", dict)
|
||||
|
||||
# Remove unused words from dictionary
|
||||
con.sql('''
|
||||
DELETE FROM fts_main_documents.dict
|
||||
WHERE df == 0;
|
||||
''')
|
||||
|
||||
create_stats_table(con, fts_schema="fts_main_documents", index_type="standard", stemmer=stemmer)
|
||||
|
||||
stats = con.sql("SELECT * FROM fts_main_documents.stats").df()
|
||||
print("fts_main_documents.stats:\n", stats)
|
||||
|
||||
create_fields_table(con, fts_schema="fts_main_documents")
|
||||
create_lm(con, stemmer)
|
||||
con.close()
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import ze_eval
|
||||
import os
|
||||
|
||||
parser = argparse.ArgumentParser(description="Manual index builder for IR datasets.")
|
||||
parser.add_argument('--db', type=str, default='testje_docs.db', help='Database file name')
|
||||
parser.add_argument('--dataset', type=str, default='cranfield', help='ir_datasets name (e.g., cranfield, msmarco-passage)')
|
||||
parser.add_argument('--stemmer', type=str, default='none', help='Stemmer to use (none, porter, etc.)')
|
||||
parser.add_argument('--stopwords', type=str, default='english', help='Stopwords to use (english, none)')
|
||||
parser.add_argument('--mode', type=str, default='duckdb', help='Indexing mode (duckdb, phrases)')
|
||||
parser.add_argument('--keepcontent', action='store_true', help='Keep document content')
|
||||
parser.add_argument('--limit', type=int, default=10000, help='Limit the number of terms in the dictionary')
|
||||
parser.add_argument('--min-freq', type=int, default=10, help='Minimum frequency for phrases (only for mode "phrases")')
|
||||
parser.add_argument('--min-pmi', type=float, default=5.0, help='Minimum PMI for phrases (only for mode "phrases")')
|
||||
args = parser.parse_args()
|
||||
|
||||
dataset = None
|
||||
if (args.dataset == 'custom'):
|
||||
dataset = ze_eval.ir_dataset_test()
|
||||
else:
|
||||
dataset = ir_datasets.load(args.dataset)
|
||||
db_name = args.db
|
||||
if os.path.exists(db_name):
|
||||
print(f"Removing {db_name}")
|
||||
os.remove(db_name)
|
||||
|
||||
print("Creating index...")
|
||||
index_documents(
|
||||
db_name,
|
||||
dataset,
|
||||
stemmer=args.stemmer,
|
||||
stopwords=args.stopwords,
|
||||
keepcontent=args.keepcontent,
|
||||
mode=args.mode,
|
||||
limit=args.limit,
|
||||
min_freq=args.min_freq,
|
||||
min_pmi=args.min_pmi
|
||||
)
|
||||
print("")
|
||||
137
phrases_extractor.py
Normal file
@@ -0,0 +1,137 @@
|
||||
import duckdb
|
||||
import math
|
||||
from collections import Counter
|
||||
|
||||
def create_tokenizer_duckdb(con):
|
||||
con.sql("""
|
||||
CREATE TEMPORARY MACRO tokenize(s) AS (
|
||||
string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
|
||||
);
|
||||
""")
|
||||
|
||||
def extract_phrases(documents, n=2, min_freq=2, db_path='phrases.db'):
|
||||
con = duckdb.connect(database=db_path)
|
||||
create_tokenizer_duckdb(con)
|
||||
|
||||
# Load documents into DuckDB table
|
||||
con.execute("CREATE TEMP TABLE docs AS SELECT * FROM (VALUES " +
|
||||
",".join(["(?, ?)"] * len(documents)) +
|
||||
") AS t(doc_id, text)", [item for pair in documents for item in pair])
|
||||
|
||||
# Tokenize and flatten tokens in DuckDB
|
||||
tokens_df = con.sql("""
|
||||
SELECT doc_id, unnest(tokenize(text)) AS token
|
||||
FROM docs
|
||||
""").df()
|
||||
|
||||
# Generate n-grams in Python
|
||||
token_counter = Counter()
|
||||
ngram_counter = Counter()
|
||||
|
||||
grouped = tokens_df.groupby('doc_id')['token'].apply(list)
|
||||
|
||||
total_tokens = 0
|
||||
for token_list in grouped:
|
||||
total_tokens += len(token_list)
|
||||
token_counter.update(token_list)
|
||||
ngrams = zip(*[token_list[i:] for i in range(n)])
|
||||
ngram_counter.update(ngrams)
|
||||
|
||||
# Extract frequent phrases
|
||||
phrases = [" ".join(ngram) for ngram, freq in ngram_counter.items() if freq >= min_freq]
|
||||
return phrases
|
||||
|
||||
def extract_phrases_pmi_duckdb(con, fts_schema, n=2, min_freq=2, min_pmi=3.0):
|
||||
# 1. Create a tokenized table
|
||||
con.execute(f"""CREATE OR REPLACE TABLE {fts_schema}.tokens AS
|
||||
SELECT
|
||||
did AS doc_id,
|
||||
unnest({fts_schema}.tokenize(content)) AS token
|
||||
FROM
|
||||
documents;
|
||||
|
||||
""")
|
||||
|
||||
print("Tokenized documents:\n", con.execute(f"SELECT * FROM {fts_schema}.tokens LIMIT 10").fetchall())
|
||||
|
||||
# 2. Add position index for each token in its document
|
||||
con.execute(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.tokens_pos AS
|
||||
SELECT doc_id, token,
|
||||
ROW_NUMBER() OVER (PARTITION BY doc_id ORDER BY rowid) AS pos
|
||||
FROM {fts_schema}.tokens
|
||||
""")
|
||||
|
||||
# 3. Compute total token count
|
||||
total_tokens = con.execute(f"SELECT COUNT(*)::DOUBLE FROM {fts_schema}.tokens_pos").fetchone()[0]
|
||||
|
||||
# 4. Compute token frequencies
|
||||
con.execute(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.token_freq AS
|
||||
SELECT token,
|
||||
COUNT(*) AS freq,
|
||||
COUNT(DISTINCT doc_id) AS doc_freq
|
||||
FROM {fts_schema}.tokens_pos
|
||||
GROUP BY token
|
||||
""")
|
||||
print("Token frequency:\n", con.execute(f"SELECT * FROM {fts_schema}.token_freq LIMIT 10").fetchall())
|
||||
|
||||
# 5. Compute bigrams (or n-grams)
|
||||
con.execute(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.ngrams AS
|
||||
SELECT t1.token AS w1, t2.token AS w2,
|
||||
t1.doc_id AS doc_id
|
||||
FROM {fts_schema}.tokens_pos t1
|
||||
JOIN {fts_schema}.tokens_pos t2
|
||||
ON t1.doc_id = t2.doc_id AND t2.pos = t1.pos + 1
|
||||
""")
|
||||
|
||||
# 6. Compute n-gram frequencies
|
||||
con.execute(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.ngram_freq AS
|
||||
SELECT w1, w2, COUNT(*) AS freq,
|
||||
COUNT(DISTINCT doc_id) AS doc_freq
|
||||
FROM {fts_schema}.ngrams
|
||||
GROUP BY w1, w2
|
||||
HAVING COUNT(*) >= {min_freq}
|
||||
""")
|
||||
|
||||
print("N-gram frequency:\n", con.execute(f"SELECT * FROM {fts_schema}.ngram_freq LIMIT 10").fetchall())
|
||||
print(f"Number of n-grams: {con.execute(f'SELECT COUNT(*) FROM {fts_schema}.ngram_freq').fetchone()[0]}")
|
||||
# 7. Compute PMI for bigrams
|
||||
con.execute(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.phrases AS
|
||||
SELECT w1 || ' ' || w2 AS phrase,
|
||||
LOG(n.freq * {total_tokens} / (f1.freq * f2.freq)) / LOG(2) AS pmi,
|
||||
n.doc_freq AS df
|
||||
FROM {fts_schema}.ngram_freq n
|
||||
JOIN {fts_schema}.token_freq f1 ON n.w1 = f1.token
|
||||
JOIN {fts_schema}.token_freq f2 ON n.w2 = f2.token
|
||||
WHERE LOG(n.freq * {total_tokens} / (f1.freq * f2.freq)) / LOG(2) >= {min_pmi}
|
||||
ORDER BY pmi DESC
|
||||
""")
|
||||
|
||||
print("Extracted phrases:\n", con.execute(f"SELECT phrase, pmi, df FROM {fts_schema}.phrases LIMIT 10").fetchall())
|
||||
print("Extracted tokens:\n", con.execute(f"SELECT token FROM {fts_schema}.token_freq LIMIT 10").fetchall())
|
||||
# 8. Combine phrases and words
|
||||
con.execute(f"""
|
||||
CREATE OR REPLACE TABLE {fts_schema}.dict AS
|
||||
SELECT ROW_NUMBER() OVER () AS termid, phrase as term, df
|
||||
FROM {fts_schema}.phrases
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM UNNEST(string_split(phrase, ' ')) AS word
|
||||
WHERE word.unnest IN (SELECT sw FROM {fts_schema}.stopwords)
|
||||
)
|
||||
UNION ALL
|
||||
SELECT ROW_NUMBER() OVER () + (SELECT COUNT(*) FROM {fts_schema}.phrases) AS termid, token AS term, doc_freq AS df
|
||||
FROM {fts_schema}.token_freq
|
||||
WHERE token NOT IN (SELECT sw FROM {fts_schema}.stopwords)
|
||||
AND freq >= {min_freq}
|
||||
""")
|
||||
|
||||
print("Phrases:\n", con.execute(f"SELECT term, df FROM {fts_schema}.dict LIMIT 10").fetchall())
|
||||
|
||||
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.tokens_pos")
|
||||
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.token_freq")
|
||||
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.ngrams")
|
||||
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.ngram_freq")
|
||||
BIN
testje_docs
Normal file
Binary file not shown.
132
ze_eval.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import pathlib
|
||||
import os
|
||||
|
||||
import ir_datasets
|
||||
|
||||
|
||||
class ir_dataset_test:
|
||||
class Doc:
|
||||
def __init__(self, doc_id, text):
|
||||
self.doc_id = doc_id
|
||||
self.text = text
|
||||
class Query:
|
||||
def __init__(self, query_id, text):
|
||||
self.query_id = query_id
|
||||
self.text = text
|
||||
class Qrel:
|
||||
def __init__(self, query_id, doc_id, relevance):
|
||||
self.query_id = query_id
|
||||
self.doc_id = doc_id
|
||||
self.relevance = relevance
|
||||
|
||||
# Custom documents
|
||||
doc1 = Doc('d1', 'Custom document one about information retrieval.')
|
||||
doc2 = Doc('d2', 'Custom document two about machine learning.')
|
||||
doc3 = Doc('d3', 'Custom document three about artificial intelligence.')
|
||||
doc4 = Doc('d4', 'Custom-document FOUR about INFORMATION-RETRIEVAL and its applications.')
|
||||
doc5 = Doc('d5', 'Another custom document, artificial intelligence with punctuation! And special characters like @#$%.')
|
||||
doc6 = Doc('d6', 'Machine-learning is artificial amazing; it combines AI, data-science, and more.')
|
||||
doc7 = Doc('d7', 'Information retrieval is the backbone of search engines and academic research.')
|
||||
doc8 = Doc('d8', 'Machine learning has become a core part of artificial intelligence.')
|
||||
doc9 = Doc('d9', 'Artificial intelligence artificial kip saté and machine learning are fields with significant overlap.')
|
||||
doc10 = Doc('d10', 'Machine learning is a subfield of artificial intelligence focused on data.')
|
||||
doc11 = Doc('d11', 'The process of information retrieval includes indexing and ranking documents.')
|
||||
doc12 = Doc('d12', 'Many AI systems rely on both machine learning and information retrieval.')
|
||||
doc13 = Doc('d13', 'Artificial intelligence kip saté is widely used in natural language processing and robotics.')
|
||||
doc14 = Doc('d14', 'Information retrieval systems are essential for finding relevant documents.')
|
||||
doc15 = Doc('d15', 'Machine learning algorithms adapt based on data patterns.')
|
||||
doc16 = Doc('d16', 'Artificial intelligence kip saté applications range from games to healthcare.')
|
||||
doc17 = Doc('d17', 'Information retrieval helps systems return relevant search results.')
|
||||
doc18 = Doc('d18', 'Machine learning and artificial intelligence are driving modern technology.')
|
||||
doc19 = Doc('d19', 'Artificial intelligence is often combined with information retrieval to build smart assistants.')
|
||||
doc20 = Doc('d20', 'The in the over at on Advanced machine learning techniques artificial intelligence are part of the artificial intelligence stack.')
|
||||
|
||||
docs = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10,
|
||||
doc11, doc12, doc13, doc14, doc15, doc16, doc17, doc18, doc19, doc20]
|
||||
|
||||
# Custom queries
|
||||
query1 = Query('1', 'information retrieval')
|
||||
query2 = Query('2', 'machine learning')
|
||||
query3 = Query('3', 'artificial intelligence')
|
||||
queries = [query1, query2, query3]
|
||||
|
||||
# Custom relevance judgments
|
||||
qrel1 = Qrel('1', 'd1', 2)
|
||||
qrel2 = Qrel('2', 'd2', 1)
|
||||
qrel3 = Qrel('3', 'd3', 1)
|
||||
qrels = [qrel1, qrel2, qrel3]
|
||||
|
||||
def docs_count(self):
|
||||
return len(self.docs)
|
||||
|
||||
def docs_iter(self):
|
||||
return self.docs
|
||||
|
||||
def queries_iter(self):
|
||||
return self.queries
|
||||
|
||||
def qrels_iter(self):
|
||||
return self.qrels
|
||||
|
||||
|
||||
def file_exists(name_in):
|
||||
return pathlib.Path(name_in).is_file()
|
||||
|
||||
|
||||
def get_qrels(experiment):
|
||||
if experiment == "custom":
|
||||
from ze_eval import ir_dataset_test
|
||||
qrel_file = "custom.qrels"
|
||||
if not pathlib.Path(qrel_file).is_file():
|
||||
with open(qrel_file, 'w') as file:
|
||||
for q in ir_dataset_test().qrels_iter():
|
||||
line = q.query_id + ' Q0 ' + q.doc_id + " " + str(q.relevance)
|
||||
file.write(line + '\n')
|
||||
return qrel_file
|
||||
if pathlib.Path(experiment).is_file(): # provide a qrels file directly...
|
||||
return experiment
|
||||
ir_dataset = ir_datasets.load(experiment) # ... or an ir_dataset
|
||||
ir_dataset_qrels = ir_dataset.qrels_iter()
|
||||
qrel_file = experiment + '.qrels'
|
||||
qrel_file = qrel_file.replace('/', '_')
|
||||
if not pathlib.Path(qrel_file).is_file():
|
||||
with open(qrel_file, 'w') as file:
|
||||
for q in ir_dataset_qrels:
|
||||
line = q.query_id + ' Q0 ' + q.doc_id + " " + str(q.relevance)
|
||||
file.write(line + '\n')
|
||||
return qrel_file
|
||||
|
||||
def trec_eval(run_name, experiment, complete_rel=False,
|
||||
ndcg=False, query_eval=False):
|
||||
qrel_file = get_qrels(experiment)
|
||||
switches = '-m official'
|
||||
if ndcg:
|
||||
switches += ' -m ndcg_cut'
|
||||
if complete_rel:
|
||||
switches += ' -c'
|
||||
if query_eval:
|
||||
switches += ' -q'
|
||||
command = f"trec_eval {switches} {qrel_file} {run_name}"
|
||||
print(command)
|
||||
os.system(command)
|
||||
# After running trec_eval, compute and print average postings cost if available in run file
|
||||
try:
|
||||
with open(run_name, 'r') as f:
|
||||
postings_costs = {}
|
||||
for line in f:
|
||||
parts = line.strip().split()
|
||||
if len(parts) >= 7:
|
||||
query_id = parts[0]
|
||||
try:
|
||||
cost = float(parts[6])
|
||||
if query_id not in postings_costs:
|
||||
postings_costs[query_id] = cost
|
||||
except Exception:
|
||||
continue
|
||||
if postings_costs:
|
||||
avg_cost = sum(postings_costs.values()) / len(postings_costs)
|
||||
print(f"Average cost in postings: {avg_cost:.4f}")
|
||||
print(f"Total postings cost: {sum(postings_costs.values()):.4f}")
|
||||
except Exception:
|
||||
pass
|
||||
141
ze_index.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Zoekeend indexer.
|
||||
Author: Djoerd Hiemstra
|
||||
"""
|
||||
|
||||
import pathlib
|
||||
import sys
|
||||
|
||||
import duckdb
import ir_datasets


def normalize(text):
    """ Escape quotes for SQL """
    return text.replace("'", "''")


def create_lm(con, stemmer):
    con.sql(f"""
    CREATE OR REPLACE MACRO fts_main_documents.match_lm(query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS TABLE (
        WITH tokens AS (
            SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
        ),
        fieldids AS (
            SELECT fieldid
            FROM fts_main_documents.fields
            WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
        ),
        qtermids AS (
            SELECT termid, df
            FROM fts_main_documents.dict AS dict, tokens
            WHERE (dict.term = tokens.t)
        ),
        qterms AS (
            SELECT termid, docid
            FROM fts_main_documents.terms AS terms
            WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
        ),
        term_tf AS (
            SELECT termid, docid, count_star() AS tf
            FROM qterms
            GROUP BY docid, termid
        ),
        cdocs AS (
            SELECT docid
            FROM qterms
            GROUP BY docid
            HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
        ),
        subscores AS (
            SELECT docs.docid, docs.len AS doc_len, term_tf.termid, term_tf.tf, qtermids.df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * docs.len)) AS subscore
            FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
            WHERE ((term_tf.docid = cdocs.docid)
                AND (term_tf.docid = docs.docid)
                AND (term_tf.termid = qtermids.termid))
        ),
        scores AS (
            SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
        ),
        postings_cost AS (
            SELECT COUNT(DISTINCT docid) AS cost FROM qterms
        )
        SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
    );
    """)


def insert_dataset(con, ir_dataset, logging=True):
    """
    Insert documents from an ir_dataset. Works with several datasets.
    Add document attributes if needed.
    """
    con.sql('CREATE TABLE documents (did TEXT, content TEXT)')
    insert = 'INSERT INTO documents(did, content) VALUES '
    sql = insert
    part = 0
    total = 0
    count = ir_dataset.docs_count()
    if logging:
        print(f"Inserting {count} docs...", file=sys.stderr)
    for doc in ir_dataset.docs_iter():
        doc_text = ""
        if hasattr(doc, 'title'):
            doc_text = doc.title
        if hasattr(doc, 'body'):
            doc_text += " " + doc.body
        if hasattr(doc, 'text'):
            doc_text += " " + doc.text
        sql += "('" + doc.doc_id + "','" + normalize(doc_text) + "'),"
        part += 1
        if part > 9999:
            total += part
            if logging:
                print(str(total) + " docs", file=sys.stderr)
            con.sql(sql)
            part = 0
            sql = insert
    if part > 0:  # only flush the last batch if it actually contains values
        con.sql(sql)


def index_documents(db_name, ir_dataset, stemmer='none', stopwords='none',
                    logging=True, keepcontent=False):
    """
    Insert and index documents.
    """
    if pathlib.Path(db_name).is_file():
        raise ValueError(f"File {db_name} already exists.")
    con = duckdb.connect(db_name)
    insert_dataset(con, ir_dataset, logging)
    if logging:
        print("Indexing...", file=sys.stderr)
    con.sql(f"""
        PRAGMA create_fts_index('documents', 'did', 'content', stemmer='{stemmer}',
            stopwords='{stopwords}')
    """)
    con.sql(f"""
        ALTER TABLE fts_main_documents.stats ADD sumdf BIGINT;
        UPDATE fts_main_documents.stats SET sumdf =
            (SELECT SUM(df) FROM fts_main_documents.dict);
        ALTER TABLE fts_main_documents.stats ADD index_type TEXT;
        UPDATE fts_main_documents.stats SET index_type = 'standard';
        ALTER TABLE fts_main_documents.stats ADD stemmer TEXT;
        UPDATE fts_main_documents.stats SET stemmer = '{stemmer}';
    """)
    create_lm(con, stemmer)
    if not keepcontent:
        con.sql("ALTER TABLE documents DROP COLUMN content")
    con.close()


if __name__ == "__main__":
    import ze_eval
    dataset = ze_eval.ir_dataset_test()
    dataset = ir_datasets.load("cranfield")
    import os
    if os.path.exists('testje_docs.db'):
        os.remove('testje_docs.db')
    index_documents('testje_docs.db', dataset, stemmer='none', stopwords='none',
                    keepcontent=False)
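The `match_lm` macro created above is a table macro, so it can be queried like a table. A minimal usage sketch, not part of the repository (the database file name and query text are invented, and it assumes this file is the `ze_index` module that the `zoekeend` script imports):
```
import duckdb
import ir_datasets

import ze_index  # the indexer shown above

# Build a small Cranfield index and run one language-model query against it.
ze_index.index_documents('sketch.db', ir_datasets.load('cranfield'),
                         stemmer='none', stopwords='none')
con = duckdb.connect('sketch.db', read_only=True)
hits = con.execute("""
    SELECT docname, score, postings_cost
    FROM fts_main_documents.match_lm('experimental investigation of aeroelastic models')
    ORDER BY score DESC
    LIMIT 10
""").fetchall()
for docname, score, cost in hits:
    print(docname, score, cost)
con.close()
```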
115
ze_index_export.py
Normal file
115
ze_index_export.py
Normal file
@ -0,0 +1,115 @@
"""
Zoekeend CIFF exporter

Author: Gijs Hendriksen
"""

from typing import Iterable, Type, TypeVar

import duckdb

from ciff_toolkit.write import CiffWriter
from ciff_toolkit.ciff_pb2 import Header, PostingsList, DocRecord
from google.protobuf.message import Message

from tqdm import tqdm


M = TypeVar('M', bound=Message)


def _create_message_from_row(row: tuple | dict, message_type: Type[M]) -> M:
    if isinstance(row, tuple):
        mapping = zip(message_type.DESCRIPTOR.fields, row)
    else:
        mapping = [(field, row[field.name]) for field in message_type.DESCRIPTOR.fields]

    msg = message_type()
    for field, value in mapping:
        if field.label == field.LABEL_REPEATED:
            for x in value:
                getattr(msg, field.name).append(_create_message_from_row(x, field.message_type._concrete_class))
        else:
            setattr(msg, field.name, value)
    return msg


def create_protobuf_messages_from_result(result: duckdb.DuckDBPyRelation, message_type: Type[M], batch_size: int = 1024) -> Iterable[M]:
    try:
        import protarrow
        for batch in result.fetch_arrow_reader(batch_size):
            yield from protarrow.record_batch_to_messages(batch, message_type)
    except ImportError:
        while batch := result.fetchmany(batch_size):
            for row in batch:
                yield _create_message_from_row(row, message_type)


def create_ciff_header(conn: duckdb.DuckDBPyConnection, description: str) -> Header:
    header_info = conn.execute("""
        SELECT
            1 AS version,
            (SELECT COUNT(*) FROM fts_main_documents.dict) AS num_postings_lists,
            num_docs,
            (SELECT COUNT(*) FROM fts_main_documents.dict) AS total_postings_lists,
            num_docs AS total_docs,
            (SELECT SUM(len) FROM fts_main_documents.docs)::BIGINT AS total_terms_in_collection,
            avgdl AS average_doclength,
            ? AS description,
        FROM fts_main_documents.stats
    """, [description])

    header, = create_protobuf_messages_from_result(header_info, Header)
    return header


def create_ciff_postings_lists(conn: duckdb.DuckDBPyConnection, batch_size: int = 1024) -> Iterable[PostingsList]:
    postings_info = conn.sql("""
        WITH postings AS (
            SELECT termid, docid, COUNT(*) AS tf
            FROM fts_main_documents.terms
            GROUP BY ALL
        ),
        gapped_postings AS (
            SELECT *, docid - lag(docid, 1, 0) OVER (PARTITION BY termid ORDER BY docid) AS gap
            FROM postings
        ),
        grouped_postings AS (
            SELECT termid, list(row(gap, tf)::STRUCT(docid BIGINT, tf BIGINT) ORDER BY docid) AS postings, SUM(tf)::BIGINT AS cf
            FROM gapped_postings
            GROUP BY termid
        )
        SELECT term, df, cf, postings
        FROM grouped_postings
        JOIN fts_main_documents.dict USING (termid)
        ORDER BY term;
    """)

    yield from create_protobuf_messages_from_result(postings_info, PostingsList, batch_size=batch_size)


def create_ciff_doc_records(conn: duckdb.DuckDBPyConnection, batch_size: int = 1024) -> Iterable[DocRecord]:
    docs_info = conn.sql("""
        SELECT
            docid,
            name AS collection_docid,
            len AS doclength,
        FROM fts_main_documents.docs
        ORDER BY collection_docid
    """)

    yield from create_protobuf_messages_from_result(docs_info, DocRecord, batch_size=batch_size)


def ciff_export(db_name: str, file_name: str, description: str, batch_size: int = 1024):
    with duckdb.connect(db_name) as conn, CiffWriter(file_name) as writer:
        header = create_ciff_header(conn, description)
        print(header)
        writer.write_header(header)
        writer.write_postings_lists(tqdm(create_ciff_postings_lists(conn, batch_size=batch_size), total=header.num_postings_lists,
                                         desc='Writing posting lists', unit='pl'))
        writer.write_documents(tqdm(create_ciff_doc_records(conn, batch_size=batch_size), total=header.num_docs,
                                    desc='Writing documents', unit='d'))


if __name__ == '__main__':
    ciff_export('index.db', 'index-copy.ciff.gz', 'OWS.eu index', batch_size=2**12)
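The `gapped_postings` CTE above stores docid gaps rather than absolute docids, as CIFF postings expect. An illustration of that transformation on a toy postings list, not repository code:
```
# Illustration only: the docid gap encoding that gapped_postings computes
# with lag(docid, 1, 0), applied to a toy (docid, tf) list.
postings = [(3, 1), (7, 2), (12, 1)]                     # docids ascending
previous = [0] + [d for d, _ in postings[:-1]]
gaps = [(d - p, tf) for (d, tf), p in zip(postings, previous)]
assert gaps == [(3, 1), (4, 2), (5, 1)]                  # what lands in the CIFF file
```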
286
ze_index_import.py
Normal file
286
ze_index_import.py
Normal file
@ -0,0 +1,286 @@
"""
CIFF importer

Author: Arjen P. de Vries

Adapted from: https://github.com/arjenpdevries/CIFF2DuckDB
"""

import duckdb
import pyarrow as pa

from ciff_toolkit.read import CiffReader
from ciff_toolkit.ciff_pb2 import DocRecord, Header, PostingsList
from google.protobuf.json_format import MessageToJson, MessageToDict
from typing import Iterator, TypeVar, Iterable

pbopt = {"including_default_value_fields": True,
         "preserving_proto_field_name": True}


def iter_posting_batches(reader: Iterable[PostingsList]):
    """
    Generator for reading batches of postings
    Note: Term identifiers handed out here, while reading term-posting
    pairs from the CIFF file
    """
    batch = []
    for tid, p in enumerate(reader.read_postings_lists()):
        pp = MessageToDict(p, **pbopt)
        pp['termid'] = tid
        # Gap decompression...
        pp['postings'] = [prev := {"docid": 0}] and \
            [prev := {"docid": posting['docid'] + prev['docid'], "tf": posting['tf']} for posting in pp['postings']]
        batch.append(pp)
        if len(batch) == 4096:
            yield pa.RecordBatch.from_pylist(batch)
            batch = []
    yield pa.RecordBatch.from_pylist(batch)


def iter_docs_batches(reader: Iterable[DocRecord]):
    """ Generator for reading batches of docs """
    batch = []
    for doc in reader.read_documents():
        batch.append(MessageToDict(doc, **pbopt))
        if len(batch) == 8192:
            yield pa.RecordBatch.from_pylist(batch)
            batch = []
    yield pa.RecordBatch.from_pylist(batch)


def ciff_arrow(con, file_name, stemmer):
    """ Use CIFFReader to create RecordBatches for table (using Arrow) """
    # Schema: manually defined
    # (alternative: protarrow could create the datastructure from the proto definition)
    postings_schema = pa.schema([
        ("term", pa.string()),
        ("termid", pa.int64()),
        ("df", pa.int64()),
        ("cf", pa.int64()),
        ("postings", pa.list_(pa.struct([
            ("docid", pa.int32()),
            ("tf", pa.int32())
        ])))
    ])

    docs_schema = pa.schema([
        ("docid", pa.int32()),
        ("collection_docid", pa.string()),
        ("doclength", pa.int32())
    ])

    with CiffReader(file_name) as reader:
        # Header info: TBD
        h = reader.read_header()
        header = MessageToJson(h, **pbopt)
        con.execute(f"""
            CREATE TABLE stats(num_docs BIGINT, avgdl DOUBLE, sumdf BIGINT, index_type TEXT, stemmer TEXT);
            INSERT INTO stats(num_docs, avgdl, index_type, stemmer) VALUES
                ({h.num_docs}, {h.average_doclength}, 'standard', '{stemmer}');
        """)

        # RecordBatches for postings to an Arrow Datastructure
        postings_rb = iter_posting_batches(reader)
        postings_rbr = pa.ipc.RecordBatchReader.from_batches(postings_schema, postings_rb)

        # Create a DuckDB table from the Arrow data
        con.execute("CREATE TABLE ciff_postings AS SELECT * FROM postings_rbr;")

        # RecordBatches for docs to an Arrow Datastructure
        docs_rb = iter_docs_batches(reader)
        docs_rbr = pa.ipc.RecordBatchReader.from_batches(docs_schema, docs_rb)

        # Create a DuckDB table from the Arrow data
        # Dropping cf here because DuckDB FTS does not use it
        con.execute("""
            CREATE TABLE docs AS SELECT docid::BIGINT AS docid, collection_docid AS name, doclength::BIGINT AS len FROM docs_rbr;
        """)


def create_tokenizer(con, tokenizer):
    if tokenizer == 'ciff':
        create_tokenizer_ciff(con)
    elif tokenizer == 'duckdb':
        create_tokenizer_duckdb(con)
    else:
        raise ValueError(f"Unknown tokenizer: {tokenizer}")


def create_tokenizer_duckdb(con):
    con.sql("""
        CREATE MACRO fts_main_documents.tokenize(s) AS (
            string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
        );
    """)


def create_tokenizer_ciff(con):
    con.sql("""
        CREATE MACRO fts_main_documents.tokenize(query_string) AS (
            WITH RECURSIVE sequence AS (
                SELECT range AS nr
                FROM RANGE((SELECT MAX(LEN(term)) + 1 FROM fts_main_documents.dict))
            ),
            simpledict AS (
                SELECT '' AS term
                UNION
                SELECT term FROM fts_main_documents.dict
            ),
            subterms(term, subquery) AS (
                SELECT '', lower(strip_accents(CAST(query_string AS VARCHAR)))
                UNION
                SELECT MAX(dict.term), SUBSTRING(subquery,
                    -- MAX(dict.term) selects the longest term, for a
                    -- start position using alphabetic sorting
                    CASE WHEN MAX(nr) < 1 THEN 2 ELSE MAX(nr) + 1 END,
                    LEN(subquery)) AS subquery
                FROM subterms, sequence, simpledict as dict
                WHERE SUBSTRING(subquery, 1, nr) = dict.term
                GROUP BY subquery
            )
            SELECT LIST(term) FROM subterms WHERE NOT term = ''
        )
    """)


def create_lm(con, stemmer):
    con.sql(f"""
        CREATE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict, tokens
                WHERE (dict.term = tokens.t)
            ),
            qterms AS (
                SELECT termid, docid
                FROM fts_main_documents.terms AS terms
                WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                    AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
            ),
            term_tf AS (
                SELECT termid, docid, count_star() AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * len)) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE ((term_tf.docid = cdocs.docid)
                    AND (term_tf.docid = docs.docid)
                    AND (term_tf.termid = qtermids.termid))
            ),
            scores AS (
                SELECT docid, LN(MAX(len)) + sum(subscore) AS score FROM subscores GROUP BY docid
            )
            SELECT score FROM scores, fts_main_documents.docs AS docs
            WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
    """)


def create_bm25(con, stemmer):
    con.sql(f"""
        CREATE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, conjunctive := 0, k := 1.2, fields := NULL) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict, tokens
                WHERE (dict.term = tokens.t)
            ),
            qterms AS (
                SELECT termid, docid
                FROM fts_main_documents.terms AS terms
                WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                    AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
            ),
            term_tf AS (
                SELECT termid, docid, count_star() AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df, (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE ((term_tf.docid = cdocs.docid)
                    AND (term_tf.docid = docs.docid)
                    AND (term_tf.termid = qtermids.termid))
            ),
            scores AS (
                SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid
            )
            SELECT score FROM scores, fts_main_documents.docs AS docs
            WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
    """)


def ciff_import(db_name, file_name, tokenizer='ciff', stemmer='none'):
    con = duckdb.connect(db_name)
    con.execute("""
        CREATE SCHEMA fts_main_documents;
        USE fts_main_documents;
    """)
    ciff_arrow(con, file_name, stemmer)
    con.execute("""
        CREATE TABLE dict AS SELECT termid, term, df FROM ciff_postings;
        CREATE TABLE fts_main_documents.fields(fieldid BIGINT, field VARCHAR);
        CREATE TABLE terms(docid BIGINT, fieldid BIGINT, termid BIGINT);
        WITH postings AS (
            SELECT termid, unnest(postings, recursive := true)
            FROM ciff_postings
        )
        INSERT INTO terms(docid, fieldid, termid)
            SELECT docid, 0, termid
            FROM postings, range(tf)
            ORDER BY termid;
        DROP TABLE ciff_postings;
        CREATE TABLE main.documents AS SELECT DISTINCT name AS did FROM fts_main_documents.docs;
        -- new stats
        UPDATE fts_main_documents.stats SET sumdf = (SELECT SUM(df) FROM fts_main_documents.dict);
    """)
    create_tokenizer(con, tokenizer)
    create_lm(con, stemmer)
    create_bm25(con, stemmer)
    con.close()


if __name__ == "__main__":
    DB_NAME = "ciff-geesedb.db"
    FILE_NAME = "geesedb.ciff.gz"
    ciff_import(DB_NAME, FILE_NAME, tokenizer='ciff', stemmer='none')

    # Only for testing:
    # Query the index using the DuckDB tables

    connect = duckdb.connect(DB_NAME)
    connect.execute("USE fts_main_documents;")
    results = connect.execute("SELECT termid FROM dict WHERE term LIKE '%radboud%' OR term LIKE '%university%'").arrow()
    print(results)
    results = connect.execute("SELECT * FROM terms WHERE termid IN (select termid FROM dict WHERE term LIKE '%radboud%' OR term LIKE '%university%')").arrow()
    print(results)
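The walrus-operator one-liner in `iter_posting_batches` packs the gap decoding into a single expression. The same logic as an explicit loop, for illustration only:
```
# Illustration only: gap decompression as done by the one-liner in
# iter_posting_batches, written as an explicit loop.
def decode_gaps(postings):
    absolute, docid = [], 0
    for posting in postings:
        docid += posting['docid']          # the stored docid is a gap
        absolute.append({'docid': docid, 'tf': posting['tf']})
    return absolute

assert decode_gaps([{'docid': 3, 'tf': 1}, {'docid': 4, 'tf': 2}]) == \
    [{'docid': 3, 'tf': 1}, {'docid': 7, 'tf': 2}]
```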
198
ze_reindex_const.py
Normal file
198
ze_reindex_const.py
Normal file
@ -0,0 +1,198 @@
import duckdb
import pathlib
import sys


def copy_file(name_in, name_out):
    path1 = pathlib.Path(name_in)
    if not(path1.is_file()):
        raise ValueError(f"File {name_in} does not exist.")
    path2 = pathlib.Path(name_out)
    if path2.is_file():
        raise ValueError(f"File {name_out} already exists.")
    path2.write_bytes(path1.read_bytes())


def get_stats_stemmer(con):
    sql = "SELECT stemmer FROM fts_main_documents.stats"
    return con.sql(sql).fetchall()[0][0]


def replace_bm25_const(con, stemmer):
    """ New version of BM25; assuming that const_len=avgdl, the document
        length normalization part disappears and the ranking function
        becomes BM1 from Robertson and Walker's SIGIR 1994 paper.
    """
    con.sql(f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict, tokens
                WHERE (dict.term = tokens.t)
            ),
            qterms AS (
                SELECT termid, docid
                FROM fts_main_documents.terms AS terms
                WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                    AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
            ),
            term_tf AS (
                SELECT termid, docid, count_star() AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, term_tf.termid, tf, df,
                    (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + k))) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE (term_tf.docid = cdocs.docid)
                    AND (term_tf.docid = docs.docid)
                    AND (term_tf.termid = qtermids.termid)
            ),
            scores AS (
                SELECT docid, sum(subscore) AS score
                FROM subscores
                GROUP BY docid
            )
            SELECT score
            FROM scores, fts_main_documents.docs AS docs
            WHERE (scores.docid = docs.docid) AND (docs."name" = docname)
        )
    """)


def get_sql_selects(con):
    try:
        con.sql('SELECT prior FROM fts_main_documents.docs')
    except duckdb.duckdb.BinderException:
        pass
    else:  # there is a prior column (from reindex_prior)
        return ("docs.prior,", "LN(ANY_VALUE(prior)) +")
    try:
        con.sql('SELECT slope FROM fts_main_documents.stats')
    except duckdb.duckdb.BinderException:
        pass
    else:  # there is a slope column (from reindex_fitted)
        return ("", "(LN(docid)*(SELECT ANY_VALUE(slope) FROM fts_main_documents.stats)) +")
    return ("", "")


def replace_lm_const(con, stemmer, const_len):
    """ This is a language model matcher where len is replaced by a constant.
        It uses the prior column or fitted score, if present in the old index.
    """
    (subscores_select, scores_select) = get_sql_selects(con)  # adapt to previous index
    con.sql(f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict,
                    tokens
                WHERE dict.term = tokens.t
            ),
            qterms AS (
                SELECT termid,
                    docid
                FROM fts_main_documents.terms AS terms
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
                    AND termid IN (SELECT qtermids.termid FROM qtermids)
            ),
            term_tf AS (
                SELECT termid, docid, COUNT(*) AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
            ),
            subscores AS (
                SELECT {subscores_select} docs.docid, term_tf.termid, term_tf.tf, qtermids.df,
                    LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * (SELECT ANY_VALUE(const_len) FROM fts_main_documents.stats))) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE term_tf.docid = cdocs.docid
                    AND term_tf.docid = docs.docid
                    AND term_tf.termid = qtermids.termid
            ),
            scores AS (
                SELECT docid, {scores_select} sum(subscore) AS score
                FROM subscores
                GROUP BY docid
            )
            SELECT score
            FROM scores, fts_main_documents.docs AS docs
            WHERE scores.docid = docs.docid
                AND docs.name = docname
        )
    """)


def reindex_const(name_in, name_out, const_len=400, b=1, keep_terms=False, maxp=1.0):
    copy_file(name_in, name_out)
    con = duckdb.connect(name_out)
    max_tf = int(const_len * maxp)
    if keep_terms:
        new_tf = 'CASE WHEN tf > 0.5 THEN tf - 0.5 ELSE 0.1 END'
    else:
        new_tf = 'tf - 0.5'
    con.sql(f"""
        CREATE TABLE fts_main_documents.terms_new (
            docid BIGINT, fieldid BIGINT, termid BIGINT);
        WITH sequence AS (
            SELECT range AS nr FROM RANGE({max_tf})
        ),
        tf_new AS (
            SELECT T.docid, T.fieldid, termid,
                -- BM25-like length normalization:
                COUNT(*) / (1 - {b} + {b} * (ANY_VALUE(D.len) / {const_len})) AS tf,
                -- proper rounding, but do not remove terms:
                {new_tf} AS new_tf
            FROM fts_main_documents.terms T, fts_main_documents.docs D
            WHERE T.docid = D.docid
            GROUP BY T.docid, T.fieldid, T.termid
        )
        INSERT INTO fts_main_documents.terms_new
            SELECT docid, fieldid, termid
            FROM tf_new, sequence WHERE sequence.nr < tf_new.new_tf;
        DROP TABLE fts_main_documents.terms;
        ALTER TABLE fts_main_documents.terms_new RENAME TO terms;
        UPDATE fts_main_documents.stats
            SET index_type = 'const(len={const_len},b={b})';
        ALTER TABLE fts_main_documents.stats ADD const_len BIGINT;
        UPDATE fts_main_documents.stats SET const_len = {const_len};
        -- really remove len column
        ALTER TABLE fts_main_documents.docs DROP COLUMN len;
    """)
    stemmer = get_stats_stemmer(con)
    replace_bm25_const(con, stemmer)
    replace_lm_const(con, stemmer, const_len)
    con.close()


if __name__ == "__main__":
    reindex_const('robustZE.db', 'robustZEfitted01.db', const_len=500, maxp=0.1)
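The docstring of `replace_bm25_const` notes that with a constant document length equal to the average, the BM25 length normalization factor drops out, leaving a BM1-style weight. A small numeric sketch of that equivalence, not repository code:
```
# Sketch only: with length == avgdl the factor (1 - b + b*len/avgdl) is 1,
# so the BM25 term weight reduces to the BM1-style weight used above.
from math import log

def bm25_weight(tf, df, num_docs, length, avgdl, k=1.2, b=0.75):
    idf = log((num_docs - df + 0.5) / (df + 0.5) + 1)
    return idf * (tf * (k + 1)) / (tf + k * (1 - b + b * length / avgdl))

def bm1_weight(tf, df, num_docs, k=1.2):
    idf = log((num_docs - df + 0.5) / (df + 0.5) + 1)
    return idf * (tf * (k + 1)) / (tf + k)

assert abs(bm25_weight(3, 10, 1000, length=400, avgdl=400)
           - bm1_weight(3, 10, 1000)) < 1e-12
```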
383
ze_reindex_fitted.py
Normal file
383
ze_reindex_fitted.py
Normal file
@ -0,0 +1,383 @@
import pathlib
import sys

import duckdb
import ir_datasets


def copy_file(name_in, name_out):
    """ Simple file copy """
    path1 = pathlib.Path(name_in)
    if not path1.is_file():
        raise ValueError(f"File {name_in} does not exist.")
    path2 = pathlib.Path(name_out)
    if path2.is_file():
        raise ValueError(f"File {name_out} already exists.")
    path2.write_bytes(path1.read_bytes())


def get_stats_stemmer(con):
    """ What stemmer was used on this index? """
    sql = "SELECT stemmer FROM fts_main_documents.stats"
    return con.sql(sql).fetchall()[0][0]


def sample_by_values(con, column, threshold):
    """ Takes one sample per unique value of len/prior. """
    con.sql(f"""
        CREATE VIEW sample AS
        WITH histogram as (
            SELECT "{column}", COUNT(*) AS count
            FROM fts_main_documents.docs
            WHERE "{column}" > {threshold}
            GROUP BY "{column}"
        )
        SELECT LN(SUM(H2.count)) AS x, LN(H1."{column}") AS y
        FROM histogram H1, histogram H2
        WHERE H1."{column}" <= H2."{column}"
        GROUP BY H1."{column}"
    """)


def sample_by_fixed_points(con, column, threshold, total):
    """ Takes {total} samples and averages len/prior for each. """
    con.sql(f"""
        CREATE VIEW sample AS
        WITH groups AS (
            SELECT (CASE WHEN range = 2 THEN 0 ELSE range END) *
                LN(num_docs + 1) / ({total} + 2) AS group_start,
                (range + 1) * LN(num_docs + 1) / ({total} + 2) AS group_end
            FROM RANGE({total} + 2), fts_main_documents.stats
            WHERE range > 1
        )
        SELECT (group_start + group_end) / 2 AS X, LN(AVG({column})) AS Y
        FROM groups, fts_main_documents.docs AS docs
        WHERE LN(docid + 1) >= group_start AND LN(docid + 1) < group_end
            AND "{column}" > {threshold}
        GROUP BY group_start, group_end
    """)


def sample_by_fixed_points_qrels(con, total):
    """
    Takes {total} samples and estimates the probability of relevance
    from the provided qrels
    """
    con.sql(f"""
        CREATE VIEW sample AS
        WITH groups AS (
            SELECT (CASE WHEN range = 2 THEN 0 ELSE range END) *
                LN(num_docs + 1) / ({total} + 2) AS group_start,
                (range + 1) * LN(num_docs + 1) / ({total} + 2) AS group_end
            FROM RANGE({total} + 2), fts_main_documents.stats
            WHERE range > 1
        )
        SELECT (group_start + group_end) / 2 AS X,
            LN(COUNT(*)/(EXP(group_end) - EXP(group_start))) AS Y
        FROM groups, fts_main_documents.docs AS docs, qrels
        WHERE LN(docid + 1) >= group_start AND LN(docid + 1) < group_end
            AND docs.name = qrels.did
            AND qrels.rel > 0
        GROUP BY group_start, group_end
    """)


def print_sample_tsv(con, total=None):
    """ Prints sample for drawing nice graphs. """
    result = con.sql("SELECT x, y FROM sample ORDER BY x").fetchall()
    if total and len(result) != total:
        print(f"Warning: less than {total} datapoints.", file=sys.stderr)
    for (x, y) in result:
        print(str(x) + "\t" + str(y))


def train_linear_regression(con):
    """ Approximate sample by using linear regression. """
    con.sql("""
        WITH sums AS (
            SELECT COUNT(*) AS N, SUM(x) AS Sx, SUM(y) AS Sy,
                SUM(x*x) AS Sxx, SUM(x*y) AS Sxy
            FROM sample
        ),
        model AS (
            SELECT (Sy*Sxx - Sx*Sxy) / (N*Sxx - Sx*Sx) AS intercept,
                (N*Sxy - Sx*Sy) / (N*Sxx - Sx*Sx) AS slope
            FROM sums
        )
        UPDATE fts_main_documents.stats AS stats
        SET intercept = model.intercept, slope =
            CASE WHEN model.slope < 0 THEN model.slope ELSE 0 END
        FROM model
    """)


def get_qrels_from_file(qrel_file):
    inserts = []
    with open(qrel_file, "r", encoding="ascii") as file:
        for line in file:
            (query_id, q0, doc_id, relevance) = line.split()
            if int(relevance) != 0:  # compare as a number; the field is read as a string
                inserts.append([query_id, doc_id, relevance])
    return inserts


def get_qrels_from_ir_datasets(qrels_tag):
    inserts = []
    for q in ir_datasets.load(qrels_tag).qrels_iter():
        if q.relevance != 0:
            inserts.append([q.query_id, q.doc_id, q.relevance])
    return inserts


def insert_qrels(con, qrels_tag):
    con.sql("CREATE OR REPLACE TABLE main.qrels(qid TEXT, did TEXT, rel INT)")
    try:
        inserts = get_qrels_from_ir_datasets(qrels_tag)
    except KeyError:
        inserts = get_qrels_from_file(qrels_tag)
    con.sql("BEGIN TRANSACTION")
    con.executemany("INSERT INTO qrels VALUES (?, ?, ?)", inserts)
    con.sql("COMMIT")


def replace_bm25_fitted_doclen(con, stemmer):
    con.sql(f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict,
                    tokens
                WHERE dict.term = tokens.t
            ),
            qterms AS (
                SELECT termid,
                    docid
                FROM fts_main_documents.terms AS terms
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
                    AND termid IN (SELECT qtermids.termid FROM qtermids)
            ),
            term_tf AS (
                SELECT termid, docid, COUNT(*) AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, EXP(LN(docs.docid)*stats.slope + stats.intercept) AS newlen, term_tf.termid, tf, df, (log((((stats.num_docs - df) + 0.5) / (df + 0.5))) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (newlen / stats.avgdl))))))) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids,
                    fts_main_documents.stats AS stats  -- trailing comma before WHERE removed
                WHERE term_tf.docid = cdocs.docid
                    AND term_tf.docid = docs.docid
                    AND term_tf.termid = qtermids.termid
            ),
            scores AS (
                SELECT docid, sum(subscore) AS score
                FROM subscores
                GROUP BY docid
            )
            SELECT score
            FROM scores, fts_main_documents.docs AS docs
            WHERE scores.docid = docs.docid
                AND docs.name = docname
        )"""
    )


def replace_lm_fitted_doclen(con, stemmer):
    con.sql(f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict,
                    tokens
                WHERE dict.term = tokens.t
            ),
            qterms AS (
                SELECT termid,
                    docid
                FROM fts_main_documents.terms AS terms
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
                    AND termid IN (SELECT qtermids.termid FROM qtermids)
            ),
            term_tf AS (
                SELECT termid, docid, COUNT(*) AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, EXP(LN(docs.docid)*stats.slope + stats.intercept) AS newlen,
                    term_tf.termid, tf, df,
                    LN(1 + (lambda * tf * (SELECT sumdf FROM fts_main_documents.stats)) / ((1-lambda) * df * newlen)) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids,
                    fts_main_documents.stats AS stats
                WHERE term_tf.docid = cdocs.docid
                    AND term_tf.docid = docs.docid
                    AND term_tf.termid = qtermids.termid
            ),
            scores AS (
                SELECT docid, LN(ANY_VALUE(newlen)) + sum(subscore) AS score
                FROM subscores
                GROUP BY docid
            )
            SELECT score
            FROM scores, fts_main_documents.docs AS docs
            WHERE scores.docid = docs.docid
                AND docs.name = docname
        )"""
    )


def replace_lm_fitted_prior(con, stemmer='none'):
    """
    Only use fitted prior, but keep on using the old document lengths.
    """
    sql = f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
            WITH tokens AS (
                SELECT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
            ),
            qtermids AS (
                SELECT termid, df, COUNT(*) AS qtf
                FROM fts_main_documents.dict AS dict,
                    tokens
                WHERE dict.term = tokens.t
                GROUP BY termid, df
            ),
            qterms AS (
                SELECT termid,
                    docid
                FROM fts_main_documents.terms AS terms
                WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
                    AND termid IN (SELECT qtermids.termid FROM qtermids)
            ),
            term_tf AS (
                SELECT termid, docid, COUNT(*) AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df,
                    qtermids.qtf * LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * len)) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE term_tf.docid = cdocs.docid
                    AND term_tf.docid = docs.docid
                    AND term_tf.termid = qtermids.termid
            ),
            scores AS (
                SELECT docid, (LN(docid)*(SELECT ANY_VALUE(slope) FROM fts_main_documents.stats)) + sum(subscore) AS score
                FROM subscores
                GROUP BY docid
            )
            SELECT score
            FROM scores, fts_main_documents.docs AS docs
            WHERE scores.docid = docs.docid
                AND docs.name = docname
        )
    """
    con.sql(sql)


def renumber_doc_ids(con, column):
    con.sql(f"""
        -- renumber document ids by decreasing len/prior column
        CREATE TABLE fts_main_documents.docs_new AS
            SELECT ROW_NUMBER() over (ORDER BY "{column}" DESC, name ASC) newid, docs.*
            FROM fts_main_documents.docs AS docs;
        -- update postings
        CREATE TABLE fts_main_documents.terms_new AS
            SELECT D.newid as docid, T.fieldid, T.termid
            FROM fts_main_documents.terms T, fts_main_documents.docs_new D
            WHERE T.docid = D.docid
            ORDER BY T.termid;
        -- replace old by new data
        ALTER TABLE fts_main_documents.docs_new DROP COLUMN docid;
        ALTER TABLE fts_main_documents.docs_new RENAME COLUMN newid TO docid;
        DROP TABLE fts_main_documents.docs;
        DROP TABLE fts_main_documents.terms;
        ALTER TABLE fts_main_documents.docs_new RENAME TO docs;
        ALTER TABLE fts_main_documents.terms_new RENAME TO terms;
        UPDATE fts_main_documents.stats SET index_type = 'fitted';
    """)


def reindex_fitted_column(name_in, name_out, column='prior', total=None,
                          print_sample=False, threshold=0, qrels=None):
    if column not in ['len', 'prior']:
        raise ValueError(f'Column "{column}" not allowed: use len or prior.')
    copy_file(name_in, name_out)
    con = duckdb.connect(name_out)
    renumber_doc_ids(con, column)
    try:
        con.sql("""
            ALTER TABLE fts_main_documents.stats ADD intercept DOUBLE;
            ALTER TABLE fts_main_documents.stats ADD slope DOUBLE;
        """)
    except duckdb.duckdb.CatalogException as e:
        print("Warning: " + str(e), file=sys.stderr)
    if qrels:
        insert_qrels(con, qrels)
        if total:
            sample_by_fixed_points_qrels(con, total)
        else:
            raise ValueError("Not implemented.")
    else:
        if total:
            sample_by_fixed_points(con, column, threshold, total)
        else:
            sample_by_values(con, column, threshold)
    if print_sample:
        print_sample_tsv(con, total)
    train_linear_regression(con)
    con.sql(f"""
        DROP VIEW sample;
        ALTER TABLE fts_main_documents.docs DROP COLUMN "{column}";
    """)
    stemmer = get_stats_stemmer(con)
    if column == 'len':
        replace_lm_fitted_doclen(con, stemmer=stemmer)
        replace_bm25_fitted_doclen(con, stemmer=stemmer)
    else:
        replace_lm_fitted_prior(con, stemmer=stemmer)
    con.close()


if __name__ == "__main__":
    reindex_fitted_column('robustZE.db', 'robustZE_fitted20.db', column='len', total=None, print_sample=True, threshold=20, qrels=None)
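`train_linear_regression` computes an ordinary least-squares fit in closed form inside DuckDB. The same closed-form solution on a toy sample, for illustration only:
```
# Illustration only: the closed-form least-squares fit used by
# train_linear_regression, computed in plain Python on a toy sample.
def fit_line(points):
    n = len(points)
    sx = sum(x for x, _ in points)
    sy = sum(y for _, y in points)
    sxx = sum(x * x for x, _ in points)
    sxy = sum(x * y for x, y in points)
    slope = (n * sxy - sx * sy) / (n * sxx - sx * sx)
    intercept = (sy * sxx - sx * sxy) / (n * sxx - sx * sx)
    return slope, intercept

slope, intercept = fit_line([(0.0, 6.0), (1.0, 5.0), (2.0, 4.0)])
assert abs(slope + 1.0) < 1e-9 and abs(intercept - 6.0) < 1e-9
```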
112
ze_reindex_group.py
Normal file
112
ze_reindex_group.py
Normal file
@ -0,0 +1,112 @@
import duckdb
import pathlib
import sys


def copy_file(name_in, name_out):
    path1 = pathlib.Path(name_in)
    if not(path1.is_file()):
        raise ValueError(f"File {name_in} does not exist.")
    path2 = pathlib.Path(name_out)
    if path2.is_file():
        raise ValueError(f"File {name_out} already exists.")
    path2.write_bytes(path1.read_bytes())


def get_stats_stemmer(con):
    sql = "SELECT stemmer FROM fts_main_documents.stats"
    return con.sql(sql).fetchall()[0][0]


def replace_bm25(con, stemmer):
    """ The standard DuckDB BM25 implementation does not work with the grouped index.
        This version also works with the standard DuckDB index.
    """
    con.sql(f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict, tokens
                WHERE (dict.term = tokens.t)
            ),
            qterms AS (
                SELECT termid, docid
                FROM fts_main_documents.terms AS terms
                WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                    AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
            ),
            term_tf AS (
                SELECT termid, docid, count_star() AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, len, term_tf.termid, tf, df,
                    (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE (term_tf.docid = cdocs.docid)
                    AND (term_tf.docid = docs.docid)
                    AND (term_tf.termid = qtermids.termid)
            ),
            scores AS (
                SELECT docid, sum(subscore) AS score
                FROM subscores
                GROUP BY docid
            )
            SELECT score
            FROM scores, fts_main_documents.docs AS docs
            WHERE (scores.docid = docs.docid) AND (docs."name" = docname)
        )
    """)


def reindex_group(name_in, name_out, stemmer='porter'):
    copy_file(name_in, name_out)
    con = duckdb.connect(name_out)
    oldstemmer = get_stats_stemmer(con)
    if oldstemmer != 'none':
        print(f"Warning: stemmer {oldstemmer} was already used on this database")
    con.sql(f"""
        -- newdict gives stems unique ids
        CREATE TABLE fts_main_documents.newdict AS
            SELECT termid, term, stem(term, '{stemmer}') AS stem, DENSE_RANK() OVER (ORDER BY stem) AS newid, df
            FROM fts_main_documents.dict;
        DROP TABLE fts_main_documents.dict;
        -- newterms uses those new ids
        CREATE TABLE fts_main_documents.newterms AS
            SELECT terms.docid, terms.fieldid, newdict.newid AS termid
            FROM fts_main_documents.terms AS terms, fts_main_documents.newdict AS newdict
            WHERE terms.termid = newdict.termid;
        DROP TABLE fts_main_documents.terms;
        ALTER TABLE fts_main_documents.newterms RENAME TO terms;
        -- now remove old ids from dict table and compute new dfs.
        CREATE TABLE fts_main_documents.dict AS
            SELECT D.newid AS termid, D.term, COUNT(DISTINCT T.docid) AS df
            FROM fts_main_documents.newdict D, fts_main_documents.terms T
            WHERE T.termid = D.newid
            GROUP BY D.newid, D.term;
        DROP TABLE fts_main_documents.newdict;
        -- update stats
        UPDATE fts_main_documents.stats SET index_type = 'grouped({stemmer})';
    """)
    replace_bm25(con, oldstemmer)
    con.close()


if __name__ == "__main__":
    reindex_group('robustZE.db', 'robustZEgrouped.db')
114
ze_reindex_prior.py
Normal file
114
ze_reindex_prior.py
Normal file
@ -0,0 +1,114 @@
import pathlib
import sys

import duckdb


def copy_file(name_in, name_out):
    path1 = pathlib.Path(name_in)
    if not path1.is_file():
        raise ValueError(f"File {name_in} does not exist.")
    path2 = pathlib.Path(name_out)
    if path2.is_file():
        raise ValueError(f"File {name_out} already exists.")
    path2.write_bytes(path1.read_bytes())


def get_stats_stemmer(con):
    sql = "SELECT stemmer FROM fts_main_documents.stats"
    return con.sql(sql).fetchall()[0][0]


def replace_lm_prior(con, stemmer):
    con.sql(f"""
        CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
            WITH tokens AS (
                SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
            ),
            fieldids AS (
                SELECT fieldid
                FROM fts_main_documents.fields
                WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
            ),
            qtermids AS (
                SELECT termid, df
                FROM fts_main_documents.dict AS dict, tokens
                WHERE (dict.term = tokens.t)
            ),
            qterms AS (
                SELECT termid, docid
                FROM fts_main_documents.terms AS terms
                WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
                    AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
            ),
            term_tf AS (
                SELECT termid, docid, count_star() AS tf
                FROM qterms
                GROUP BY docid, termid
            ),
            cdocs AS (
                SELECT docid
                FROM qterms
                GROUP BY docid
                HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
            ),
            subscores AS (
                SELECT docs.docid, prior, len, term_tf.termid, tf, df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * len)) AS subscore
                FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
                WHERE ((term_tf.docid = cdocs.docid)
                    AND (term_tf.docid = docs.docid)
                    AND (term_tf.termid = qtermids.termid))
            ),
            scores AS (
                SELECT docid, LN(ANY_VALUE(prior)) + sum(subscore) AS score FROM subscores GROUP BY docid
            )
            SELECT score FROM scores, fts_main_documents.docs AS docs
            WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
    """)


def insert_priors(con, csv_file, default):
    con.sql(f"""
        UPDATE fts_main_documents.docs AS docs
        SET prior = priors.prior
        FROM read_csv('{csv_file}') AS priors  -- file name quoted for read_csv
        WHERE docs.name = priors.did
    """)
    if not default is None:
        con.sql(f"""
            UPDATE fts_main_documents.docs
            SET prior = {default}
            WHERE prior IS NULL
        """)
    else:
        count = con.sql("""
            SELECT COUNT(*)
            FROM fts_main_documents.docs
            WHERE prior IS NULL
        """).fetchall()[0][0]
        if count > 0:
            print(f"Warning: {count} rows missing from file. Use --default", file=sys.stderr)


def reindex_prior(name_in, name_out, csv_file=None, default=None, init=None):
    copy_file(name_in, name_out)
    con = duckdb.connect(name_out)
    con.sql("ALTER TABLE fts_main_documents.docs ADD prior DOUBLE")
    if (csv_file and init):
        print(f"Warning: init={init} ignored.", file=sys.stderr)
    if csv_file:
        insert_priors(con, csv_file, default)
    elif init:
        if init == 'len':
            con.sql("UPDATE fts_main_documents.docs SET prior = len")
        elif init == 'uniform':
            con.sql("UPDATE fts_main_documents.docs SET prior = 1")
        else:
            raise ValueError(f'Unknown value for init: {init}')
    stemmer = get_stats_stemmer(con)
    replace_lm_prior(con, stemmer=stemmer)
    con.close()


if __name__ == "__main__":
    reindex_prior('cran.db', 'cran_prior.db', csv_file='test_priors.csv')
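`insert_priors` matches CSV rows to documents via `priors.did` and `priors.prior`, so the file needs those two columns. A hypothetical example, with an invented file name and invented prior values:
```
# Hypothetical sketch only: a priors file with 'did' and 'prior' columns,
# as expected by insert_priors, followed by a reindex_prior call.
with open('test_priors.csv', 'w') as f:
    f.write("did,prior\n")
    f.write("184,0.9\n")
    f.write("29,0.1\n")

reindex_prior('cran.db', 'cran_prior.db', csv_file='test_priors.csv', default=0.5)
```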
99
ze_search.py
Normal file
99
ze_search.py
Normal file
@ -0,0 +1,99 @@
"""
Zoekeend searcher.
Author: Djoerd Hiemstra
"""

import sys

import duckdb
import ir_datasets


def duckdb_search_lm(con, query, limit):
    sql = """
        SELECT docname, score, postings_cost
        FROM fts_main_documents.match_lm($1)
        ORDER BY score DESC
        LIMIT $2
    """
    return con.execute(sql, [query, limit]).fetchall()


# def duckdb_search_lm(con, query, limit, l):
#     print(f"Searching for: {query} with limit {limit} and l={l}")
#     sql = """
#         SELECT docname, score, postings_cost
#         FROM fts_main_documents.match_lm(docname, $1)
#         ORDER BY score DESC
#         LIMIT $2
#     """
#     return con.execute(sql, [query, limit]).fetchall()


def duckdb_search_bm25(con, query, limit, b, k):
    sql = """
        SELECT did, score
        FROM (
            SELECT did, fts_main_documents.match_bm25(did, $1, b=$2, k=$3) AS score
            FROM documents) sq
        WHERE score IS NOT NULL
        ORDER BY score DESC
        LIMIT $4
    """
    return con.execute(sql, [query, b, k, limit]).fetchall()


class Query:
    def __init__(self, query_id, text):
        self.query_id = query_id
        self.text = text


def get_queries_from_file(query_file):
    with open(query_file, "r") as file:
        for line in file:
            (query_id, text) = line.split('\t')
            yield Query(query_id, text)


def get_queries(query_tag):
    if query_tag == "custom":
        from ze_eval import ir_dataset_test
        return ir_dataset_test().queries_iter()
    try:
        return ir_datasets.load(query_tag).queries_iter()
    except KeyError:
        pass
    return get_queries_from_file(query_tag)


def search_run(db_name, query_tag, matcher='lm', run_tag=None,
               b=0.75, k=1.2, limit=1000, fileout=None,
               startq=None, endq=None):
    con = duckdb.connect(db_name, read_only=True)
    if fileout:
        file = open(fileout, "w")
    else:
        file = sys.stdout
    if not run_tag:
        run_tag = matcher
    queries = get_queries(query_tag)
    for query in queries:
        qid = query.query_id
        if (startq and int(qid) < startq) or (endq and int(qid) > endq):
            continue
        if hasattr(query, 'title'):
            q_string = query.title
        else:
            q_string = query.text
        if matcher == 'lm':
            hits = duckdb_search_lm(con, q_string, limit)
        elif matcher == 'bm25':
            hits = duckdb_search_bm25(con, q_string, limit, b, k)
        else:
            raise ValueError(f"Unknown match function: {matcher}")
        for rank, hit in enumerate(hits):
            # bm25 rows have no postings_cost column; report 0 in that case
            docno, score = hit[0], hit[1]
            postings_cost = hit[2] if len(hit) > 2 else 0
            file.write(f'{qid} Q0 {docno} {rank} {score} {run_tag} {postings_cost}\n')
    con.close()
    if fileout:  # do not close sys.stdout
        file.close()


if __name__ == "__main__":
    search_run('cran.db', 'cranfield.tsv')
49
ze_vacuum.py
Normal file
49
ze_vacuum.py
Normal file
@ -0,0 +1,49 @@
import duckdb
import pathlib


def copy_file_force(name_in, name_out):
    path1 = pathlib.Path(name_in)
    if not(path1.is_file()):
        raise ValueError(f"File {name_in} does not exist.")
    path2 = pathlib.Path(name_out)
    path2.write_bytes(path1.read_bytes())


def rm_file(name):
    path = pathlib.Path(name)
    path.unlink()


def cluster_index(con):
    con.sql("""
        USE fts_main_documents;
        CREATE TABLE terms_new AS SELECT * FROM terms ORDER BY termid, docid;
        DROP TABLE terms;
        ALTER TABLE terms_new RENAME TO terms;
        CREATE TABLE dict_new AS SELECT * FROM dict ORDER BY term;
        DROP TABLE dict;
        ALTER TABLE dict_new RENAME TO dict;
        CREATE TABLE docs_new AS SELECT * FROM docs ORDER BY docid;
        DROP TABLE docs;
        ALTER TABLE docs_new RENAME TO docs;
    """)


def reclaim_disk_space(name, cluster=True):
    # Unfortunately, DuckDB does not reclaim disk space automatically;
    # therefore, we do a copy
    tmpname = name + '.tmp'
    copy_file_force(name, tmpname)
    con = duckdb.connect(tmpname)
    if cluster:
        cluster_index(con)
    rm_file(name)
    con.sql(f"""
        ATTACH '{tmpname}' AS tmpdb;
        ATTACH '{name}' AS db;
        COPY FROM DATABASE tmpdb TO db;
    """)
    con.close()
    rm_file(tmpname)
600
zoekeend
Executable file
600
zoekeend
Executable file
@ -0,0 +1,600 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
Zoekeend experimental information retrieval system using DuckDB
|
||||
Copyright (C) 2024 Djoerd Hiemstra
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published
|
||||
by the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Contact: hiemstra@cs.ru.nl
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pathlib
|
||||
import sys
|
||||
|
||||
import duckdb
|
||||
import ir_datasets
|
||||
|
||||
import ze_eval
|
||||
|
||||
|
||||
ze_datasets = {
|
||||
"rb04": "disks45/nocr/trec-robust-2004",
|
||||
"msm2": "msmarco-passage",
|
||||
"msm2dev": "msmarco-passage/trec-dl-2019/judged",
|
||||
"msm2tst": "msmarco-passage/trec-dl-2020/judged",
|
||||
"cran": "cranfield",
|
||||
}
|
||||
|
||||
|
||||
def fatal(message):
|
||||
"""Print error message and exit."""
|
||||
print(message, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# TODO: def zoekeend_index_bydict(args):
|
||||
# index_bydict test.db dataset --in dictionary --out dictionary
|
||||
# --max_size 99999 --algorithm bytepair --dryrun
|
||||
# out dictionary is dictionary for future index, if called again.
|
||||
|
||||
# TODO: add to ze_search: report query cross entropy and Cost-in-Postings.
|
||||
|
||||
|
||||
def zoekeend_index(args):
|
||||
"""
|
||||
Create the index file for an Information Retrieval dataset.
|
||||
This index uses the standard DuckDB FTS extension. Based on:
|
||||
Hannes Mühleisen, Thaer Samar, Jimmy Lin, and Arjen de Vries, Old dogs
|
||||
are great at new tricks: Column stores for IR prototyping. In SIGIR 2014.
|
||||
"""
|
||||
import ze_index # defer imports, so no dependencies needed, unless used
|
||||
|
||||
if args.dataset in ze_datasets:
|
||||
args.dataset = ze_datasets[args.dataset]
|
||||
try:
|
||||
if args.dataset == "custom":
|
||||
ir_dataset = ze_eval.ir_dataset_test()
|
||||
else:
|
||||
ir_dataset = ir_datasets.load(args.dataset)
|
||||
ze_index.index_documents(
|
||||
args.dbname,
|
||||
ir_dataset,
|
||||
stemmer=args.wordstemmer,
|
||||
stopwords=args.stopwords,
|
||||
keepcontent=args.keep_content,
|
||||
)
|
||||
except ValueError as e:
|
||||
fatal(e)
|
||||
except KeyError as e:
|
||||
fatal("Unknown dataset: " + str(e))
|
||||
|
||||
|
||||
def zoekeend_search(args):
|
||||
"""
|
||||
Run queries and create a run file in TREC output.
|
||||
The language model (lm) is based on: Djoerd Hiemstra, A probabilistic
|
||||
justification for using tf.idf term weighting in information retrieval,
|
||||
International Journal on Digital Libraries 3(2), 2000.
|
||||
"""
|
||||
import ze_search
|
||||
|
||||
if not pathlib.Path(args.dbname).is_file():
|
||||
fatal(f"Error: file {args.dbname} does not exist")
|
||||
if args.out and pathlib.Path(args.out).is_file():
|
||||
fatal(f"Error: file {args.out} exists")
|
||||
if args.queries in ze_datasets:
|
||||
query_tag = ze_datasets[args.queries]
|
||||
else:
|
||||
query_tag = args.queries
|
||||
try:
|
||||
ze_search.search_run(
|
||||
args.dbname,
|
||||
query_tag,
|
||||
matcher=args.match,
|
||||
run_tag=args.run,
|
||||
k=args.bm25k,
|
||||
b=args.bm25b,
|
||||
limit=args.top,
|
||||
fileout=args.out,
|
||||
startq=args.start,
|
||||
endq=args.end,
|
||||
)
|
||||
except FileNotFoundError:
|
||||
fatal(f"Error: queryset '{args.queries}' does not exist.")
|
||||
except ValueError as e:
|
||||
fatal(e)
|
||||
|
||||
|
||||
def zoekeend_eval(args):
    """Evaluate run using trec_eval"""
    import ze_eval

    if args.queries in ze_datasets:
        query_tag = ze_datasets[args.queries]
    else:
        query_tag = args.queries
    try:
        ze_eval.trec_eval(
            args.run, query_tag, args.complete_rel, args.ndcg, args.query_eval
        )
    except (KeyError, AttributeError):
        fatal(f"Error: query/qrel set '{args.queries}' does not exist.")
    except ValueError as e:
        fatal(e)


def zoekeend_vacuum(args):
    """Vacuum index to reclaim disk space."""
    import ze_vacuum

    try:
        ze_vacuum.reclaim_disk_space(args.dbname, args.cluster)
    except (ValueError, FileNotFoundError):
        fatal(f"File not found: {args.dbname}")


def zoekeend_index_import(args):
    """
    Import a CIFF (Common Index File Format) index.
    Based on: Djoerd Hiemstra, Gijs Hendriksen, Chris Kamphuis, and
    Arjen de Vries, Challenges of index exchange for search engine
    interoperability, OSSYM 2023. (see also: zoekeend index_export)
    """
    import ze_index_import

    if pathlib.Path(args.dbname).is_file():
        fatal(f"Error: file {args.dbname} exists")
    if not pathlib.Path(args.ciff_file).is_file():
        fatal(f"Error: file {args.ciff_file} does not exist")
    try:
        ze_index_import.ciff_import(
            args.dbname,
            args.ciff_file,
            tokenizer=args.tokenizer,
            stemmer=args.wordstemmer,
        )
    except ValueError as e:
        fatal("Error in CIFF import: " + str(e))


def zoekeend_index_export(args):
    """
    Export a CIFF (Common Index File Format) index.
    Based on: Jimmy Lin, Joel Mackenzie, Chris Kamphuis, Craig Macdonald,
    Antonio Mallia, Michał Siedlaczek, Andrew Trotman, and Arjen de Vries.
    Supporting interoperability between open-source search engines with the
    common index file format, SIGIR 2020; (see also: zoekeend index_import)
    """
    import ze_index_export

    if not pathlib.Path(args.dbname).is_file():
        fatal(f"Error: file {args.dbname} does not exist")
    if pathlib.Path(args.ciff_file).is_file():
        fatal(f"Error: file {args.ciff_file} exists")
    try:
        ze_index_export.ciff_export(
            args.dbname,
            args.ciff_file,
            description=args.description,
            batch_size=args.batch_size,
        )
    except ValueError as e:
        fatal("Error in CIFF export: " + str(e))


def zoekeend_reindex_prior(args):
    """
    Recreate the index by including prior (static rank) scores.
    Based on: Wessel Kraaij, Thijs Westerveld and Djoerd Hiemstra,
    The Importance of Prior Probabilities for Entry Page Search,
    SIGIR 2002.
    """
    import ze_reindex_prior

    if not pathlib.Path(args.dbname_in).is_file():
        fatal(f"Error: file {args.dbname_in} does not exist")
    if pathlib.Path(args.dbname_out).is_file():
        fatal(f"Error: file {args.dbname_out} exists")
    try:
        ze_reindex_prior.reindex_prior(
            args.dbname_in,
            args.dbname_out,
            csv_file=args.file,
            default=args.default,
            init=args.init,
        )
    except Exception as e:
        fatal("Error in reindex prior: " + str(e))


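# Note (illustrative only; the actual logic lives in ze_reindex_prior and
# ze_search): with a document prior P(d), for instance proportional to document
# length or to a static rank, the language-model score is typically extended
# with an additive log-prior term, roughly
#
#   score(d, q) = log P(d) + sum over t in q of log(1 + ...)
#
# which is the idea behind the entry-page priors of Kraaij et al. (SIGIR 2002).

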
def zoekeend_reindex_fitted(args):
    """
    Recreate the index by fitting document lengths (len) or prior
    scores (prior) using linear regression. The length / prior scores
    are removed from the new index.
    """
    import ze_reindex_fitted

    if not pathlib.Path(args.dbname_in).is_file():
        fatal(f"Error: file {args.dbname_in} does not exist")
    if pathlib.Path(args.dbname_out).is_file():
        fatal(f"Error: file {args.dbname_out} exists")
    if args.qrls in ze_datasets:
        args.qrls = ze_datasets[args.qrls]
    try:
        ze_reindex_fitted.reindex_fitted_column(
            args.dbname_in,
            args.dbname_out,
            column=args.column,
            total=args.bins,
            print_sample=args.print,
            threshold=args.threshold,
            qrels=args.qrls,
        )
    except ValueError as e:
        fatal("Error in reindex fitted: " + str(e))


def zoekeend_reindex_const(args):
    """
    Recreate the index by rescaling term frequencies such that all
    documents get an artificial length of CONST, using a normalization
    weight beta inspired by BM25 document length normalization.
    """
    import ze_reindex_const

    if not pathlib.Path(args.dbname_in).is_file():
        fatal(f"Error: file {args.dbname_in} does not exist")
    if pathlib.Path(args.dbname_out).is_file():
        fatal(f"Error: file {args.dbname_out} exists")
    try:
        ze_reindex_const.reindex_const(
            args.dbname_in,
            args.dbname_out,
            const_len=args.const,
            b=args.beta,
            keep_terms=args.keepterms,
        )
    except ValueError as e:
        fatal("Error in reindex const: " + str(e))


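# Note (an illustrative sketch only; the exact formula is implemented in
# ze_reindex_const): a BM25-style length normalization with weight beta could
# rescale term frequencies roughly as
#
#   new_tf = tf * const_len / ((1 - beta) * const_len + beta * len(d))
#
# so that beta = 1.0 gives every document an effective length of const_len,
# while beta = 0.0 leaves the term frequencies unchanged.

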
global_parser = argparse.ArgumentParser(prog="zoekeend")
global_parser.add_argument(
    "-v",
    "--version",
    action="version",
    version="zoekeend v0.0.1 (using duckdb v" + duckdb.__version__ + ")",
)
subparsers = global_parser.add_subparsers(metavar="subexperiment ...")


index_parser = subparsers.add_parser(
    "index",
    help="create the index file for an IR dataset",
    description=zoekeend_index.__doc__,
)
index_parser.set_defaults(func=zoekeend_index)
index_parser.add_argument(
    "dbname",
    help="file name of index",
)
index_parser.add_argument(
    "dataset",
    help="ir_dataset, see: https://ir-datasets.com",
)
index_parser.add_argument(
    "-w",
    "--wordstemmer",
    help="word stemmer (default: none)",
    default="none",
    choices=["none", "porter", "dutch"],
)
index_parser.add_argument(
    "-s",
    "--stopwords",
    help="stop words (default: none)",
    default="none",
    choices=["none", "english"],
)
index_parser.add_argument(
    "-k",
    "--keep_content",
    help="keep the document content column",
    action="store_true",
)


reindex_prior_parser = subparsers.add_parser(
    "reindex_prior",
    help="recreate the index including prior scores",
    description=zoekeend_reindex_prior.__doc__,
)
reindex_prior_parser.set_defaults(func=zoekeend_reindex_prior)
reindex_prior_parser.add_argument(
    "dbname_in",
    help="file name of old index",
)
reindex_prior_parser.add_argument(
    "dbname_out",
    help="file name of new index with priors",
)
reindex_prior_parser.add_argument(
    "-i",
    "--init",
    help="initialize with standard prior ('len' or 'uniform')",
    choices=["len", "uniform"],
)
reindex_prior_parser.add_argument(
    "-f",
    "--file",
    help="file with comma-separated (did,prior) pairs",
)
reindex_prior_parser.add_argument(
    "-d",
    "--default",
    help="default prior for documents missing in the file",
    type=float,
)


reindex_fitted_parser = subparsers.add_parser(
    "reindex_fitted",
    help="recreate the index by fitting prior scores",
    description=zoekeend_reindex_fitted.__doc__,
)
reindex_fitted_parser.set_defaults(func=zoekeend_reindex_fitted)
reindex_fitted_parser.add_argument(
    "dbname_in",
    help="file name of old index",
)
reindex_fitted_parser.add_argument(
    "dbname_out",
    help="file name of new fitted index",
)
reindex_fitted_parser.add_argument(
    "-c",
    "--column",
    help="column to be used for fitting (default: prior)",
    default="prior",
    choices=["len", "prior"],
)
reindex_fitted_parser.add_argument(
    "-b",
    "--bins",
    help="number of bins",
    type=int,
)
reindex_fitted_parser.add_argument(
    "-p",
    "--print",
    help="print sample used for fitting",
    action="store_true",
)
reindex_fitted_parser.add_argument(
    "-q",
    "--qrls",
    help="training queries/qrels",
)
reindex_fitted_parser.add_argument(
    "-t",
    "--threshold",
    help="prior values <= threshold are ignored (default: 0)",
    default=0,
    type=int,
)


reindex_const_parser = subparsers.add_parser(
    "reindex_const",
    help="recreate the index by rescaling term frequencies",
    description=zoekeend_reindex_const.__doc__,
)
reindex_const_parser.set_defaults(func=zoekeend_reindex_const)
reindex_const_parser.add_argument(
    "dbname_in",
    help="file name of old index",
)
reindex_const_parser.add_argument(
    "dbname_out",
    help="file name of new rescaled index",
)
reindex_const_parser.add_argument(
    "-c",
    "--const",
    help="constant document length (default: 400)",
    type=int,
    default=400,
)
reindex_const_parser.add_argument(
    "-b",
    "--beta",
    help="length normalization parameter (default: 1.0)",
    type=float,
    default=1.0,
)
reindex_const_parser.add_argument(
    "-k",
    "--keepterms",
    action="store_true",
    help="keep all terms, even if new tf is small",
)


search_parser = subparsers.add_parser(
    "search",
    help="execute queries and create run output",
    description=zoekeend_search.__doc__,
)
search_parser.set_defaults(func=zoekeend_search)
search_parser.add_argument(
    "dbname",
    help="file name of index",
)
search_parser.add_argument(
    "queries",
    help="ir_dataset queries id or tab-separated query file",
)
search_parser.add_argument(
    "-r",
    "--run",
    help="run tag",
)
search_parser.add_argument(
    "-t",
    "--top",
    type=int,
    default=1000,
    help="number of top results (default: 1000)",
)
search_parser.add_argument(
    "-o", "--out", help="run file to write (default: stdout)"
)
search_parser.add_argument(
    "-m",
    "--match",
    help="match function: language model (default) or bm25",
    default="lm",
    choices=["lm", "bm25"],
)
search_parser.add_argument(
    "-l", "--lmbda", help="lm lambda parameter (default: 0.3)", type=float, default=0.3
)
search_parser.add_argument(
    "-k", "--bm25k", help="bm25 k parameter (default: 0.9)", type=float, default=0.9
)
search_parser.add_argument(
    "-b", "--bm25b", help="bm25 b parameter (default: 0.4)", type=float, default=0.4
)
search_parser.add_argument(
    "-s",
    "--start",
    help="start identifier of query",
    type=int,
)
search_parser.add_argument(
    "-e",
    "--end",
    help="end identifier of query",
    type=int,
)


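# Example invocations (illustrative; assume the index file and query set exist
# and the script is run as ./zoekeend):
#
#   ./zoekeend search cranfield.db cran -o run.txt             # language model
#   ./zoekeend search cranfield.db cran -m bm25 -k 0.9 -b 0.4  # bm25

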
vacuum_parser = subparsers.add_parser(
    "vacuum",
    help="vacuum index to reclaim disk space",
    description=zoekeend_vacuum.__doc__,
)
vacuum_parser.set_defaults(func=zoekeend_vacuum)
vacuum_parser.add_argument(
    "dbname",
    help="file name of index",
)
vacuum_parser.add_argument("-c", "--cluster", action="store_true", help="cluster index")


eval_parser = subparsers.add_parser(
    "eval", help="evaluate run using trec_eval", description=zoekeend_eval.__doc__
)
eval_parser.set_defaults(func=zoekeend_eval)
eval_parser.add_argument(
    "run",
    help="trec run file",
)
eval_parser.add_argument(
    "queries",
    help="ir_dataset queries id or trec qrel file",
)
eval_parser.add_argument(
    "-c",
    "--complete_rel",
    action="store_true",
    help="queries with missing results contribute a value of 0",
)
eval_parser.add_argument(
    "-n",
    "--ndcg",
    action="store_true",
    help="add normalized discounted cumulative gain (ndcg)",
)
eval_parser.add_argument(
    "-q",
    "--query_eval",
    action="store_true",
    help="give evaluation for each query/topic",
)


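# Example (illustrative; assumes run.txt was produced by the search subcommand):
#
#   ./zoekeend eval run.txt cran -n -q
#
# reports trec_eval metrics, adds ndcg, and prints per-query results.

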
index_import_parser = subparsers.add_parser(
    "index_import", help="import ciff index", description=zoekeend_index_import.__doc__
)
index_import_parser.set_defaults(func=zoekeend_index_import)
index_import_parser.add_argument(
    "dbname",
    help="file name of index",
)
index_import_parser.add_argument(
    "ciff_file",
    help="ciff file",
)
index_import_parser.add_argument(
    "-t",
    "--tokenizer",
    help="tokenizer (default: ciff)",
    default="ciff",
    choices=["ciff", "duckdb"],
)
index_import_parser.add_argument(
    "-w",
    "--wordstemmer",
    help="word stemmer (default: none)",
    default="none",
    choices=["none", "porter", "dutch"],
)


index_export_parser = subparsers.add_parser(
    "index_export", help="export ciff index", description=zoekeend_index_export.__doc__
)
index_export_parser.set_defaults(func=zoekeend_index_export)
index_export_parser.add_argument(
    "dbname",
    help="file name of index",
)
index_export_parser.add_argument(
    "ciff_file",
    help="ciff file",
)
index_export_parser.add_argument(
    "-d",
    "--description",
    help="CIFF description (default: Exported from DuckDB)",
    default="Exported from DuckDB",
)
index_export_parser.add_argument(
    "-b",
    "--batch-size",
    help="batch size (default: 1024)",
    default=1024,
    type=int,
)


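# Example (illustrative): a CIFF export followed by a re-import into a new index:
#
#   ./zoekeend index_export cranfield.db cranfield.ciff -d "Cranfield export"
#   ./zoekeend index_import reimported.db cranfield.ciff -t ciff

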
parsed_args = global_parser.parse_args()
if hasattr(parsed_args, "func"):
    parsed_args.func(parsed_args)
else:
    global_parser.print_usage()