Improved code

Arthur Idema 2025-08-19 17:23:02 +02:00
commit e7a025deb1
22 changed files with 3681 additions and 0 deletions

20
.gitignore vendored Normal file

@ -0,0 +1,20 @@
cranfield.db
venv
__pycache__
cranfield.qrels
cranfieldoutput
/duckdb-fts-main/
/trec_eval/
*.db
*.ciff
output*.txt
results*.txt
*.txt
/results/
/resultszoekeend/
/oldresults/
*.ciff.gz
INSTALL
custom.qrels
custom_index
database.db.wal

18
README.md Normal file

@ -0,0 +1,18 @@
## How to use
Run `python3 phrase_index.py` with any of the parameters listed below:
```
-h, --help show this help message and exit
--db DB Database file name
--dataset DATASET ir_datasets name (e.g., cranfield, msmarco-passage)
--stopwords STOPWORDS Stopwords to use (english, none)
--mode MODE Indexing mode (duckdb, phrases)
--min-freq MIN_FREQ Minimum frequency for phrases (only for mode "phrases")
--min-pmi MIN_PMI Minimum PMI for phrases (only for mode "phrases")
```
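For example, a typical phrase-index run on Cranfield (parameter values here are illustrative, not defaults) could look like:
```
python3 phrase_index.py --db cranfield.db --dataset cranfield --stopwords english --mode phrases --min-freq 5 --min-pmi 8.0
```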
## Helper scripts
- `./auto_phrase.sh` and `./auto_zoekeend.sh` automatically index, search, and evaluate, and store the results in a results directory. `auto_phrase` uses `phrase_index.py`, while `auto_zoekeend` uses `ze_index.py`.
- `./batch_phrase.sh` creates results for multiple combinations of these variables in one go.
- `./display_results.sh` displays the evaluation metrics of all previous results (MAP, CiP, dictionary size, terms size, number of phrases, AVGDL, and SUMDF); a typical session is sketched below.
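A typical session (sketched under the assumption that `zoekeend` and the indexers are reachable from the working directory; `auto_zoekeend.sh` and `batch_phrase.sh` change into the parent directory themselves) might be:
```
./auto_phrase.sh                     # index, search, and evaluate one configuration
./batch_phrase.sh                    # sweep all parameter combinations
./display_results.sh > summary.tsv  # tabulate the metrics of every stored run
```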

64
helper_scripts/auto_phrase.sh Executable file

@ -0,0 +1,64 @@
#!/bin/bash
set -e
# Settings
DB="database.db"
OUT="results.txt"
DATASET="cranfield"
QUERY="cran"
STOPWORDS="english"
MODE="duckdb"
CODE="phrase_index.py"
EXTRACTOR="phrases_extractor.py"
LIMIT=-1
MIN_FREQ=9
MIN_PMI=4.0
# Remove old files if they exist
rm -f "$DB" "$OUT" eval.txt
# Timestamped results directory
RUN_ID=$(date +"%Y%m%d_%H%M%S")
RESULTS_DIR="results/$RUN_ID"
# Step 1: Build the index
python "$CODE" --db "$DB" --dataset "$DATASET" --stopwords "$STOPWORDS" --mode "$MODE" --limit "$LIMIT" --min-freq "$MIN_FREQ" --min-pmi "$MIN_PMI"
# Step 2: Search
./zoekeend search "$DB" "$QUERY" -o "$OUT"
# Step 3: Evaluate
./zoekeend eval "$OUT" "$QUERY" | tee eval.txt
# Save all outputs and settings
mkdir -p "$RESULTS_DIR"
mv "$DB" "$RESULTS_DIR/"
mv "$OUT" "$RESULTS_DIR/"
mv eval.txt "$RESULTS_DIR/"
cp "$CODE" "$RESULTS_DIR/"
cp "$EXTRACTOR" "$RESULTS_DIR/"
# Save settings
cat > "$RESULTS_DIR/settings.txt" <<EOF
DB: $DB
OUT: $OUT
DATASET: $DATASET
QUERY: $QUERY
STOPWORDS: $STOPWORDS
MODE: $MODE
LIMIT: $LIMIT
MIN_FREQ: $MIN_FREQ
MIN_PMI: $MIN_PMI
RUN_ID: $RUN_ID
EOF
# Remove any leftover temporary files (the main outputs were moved above)
rm -f "$DB" "$OUT" eval.txt
echo ""
echo "Done. Results stored in $RESULTS_DIR"
echo "duckdb -ui $RESULTS_DIR/$DB"
ls "$RESULTS_DIR"

52
helper_scripts/auto_zoekeend.sh Executable file

@ -0,0 +1,52 @@
#!/bin/bash
set -e
# Settings
DB="database.db"
OUT="results.txt"
DATASET="cranfield"
QUERY="cran"
STOPWORDS="english"
# Timestamped results directory
RUN_ID=$(date +"%Y%m%d_%H%M%S")
RESULTS_DIR="resultszoekeend/$RUN_ID"
cd ..
# Remove old files if they exist (after cd, so the cleanup hits the directory the run uses)
rm -f "$DB" "$OUT" eval.txt
# Step 1: Build the index
python ./zoekeend index "$DB" "$DATASET" -s "$STOPWORDS"
# Step 2: Search
./zoekeend search "$DB" "$QUERY" -o "$OUT"
# Step 3: Evaluate
./zoekeend eval "$OUT" "$QUERY" | tee eval.txt
# Save all outputs and settings
mkdir -p "$RESULTS_DIR"
mv "$DB" "$RESULTS_DIR/"
mv "$OUT" "$RESULTS_DIR/"
mv eval.txt "$RESULTS_DIR/"
# Save settings
cat > "$RESULTS_DIR/settings.txt" <<EOF
DB: $DB
OUT: $OUT
DATASET: $DATASET
QUERY: $QUERY
STOPWORDS: $STOPWORDS
RUN_ID: $RUN_ID
EOF
# Remove any leftover temporary files (the main outputs were moved above)
rm -f "$DB" "$OUT" eval.txt
echo "Done. Results stored in $RESULTS_DIR"
ls -lh "$RESULTS_DIR"

79
helper_scripts/batch_phrase.sh Executable file

@ -0,0 +1,79 @@
#!/bin/bash
set -e
DB_BASE="database"
OUT_BASE="results"
DATASET="cranfield"
QUERY="cran"
INDEXER="phrase_index.py"
STOPWORDS_LIST=("english" "none")
MODE_LIST=("duckdb" "phrases")
LIMIT_LIST=(-1)
MIN_FREQ_LIST=(4 5 6 7 9 10 11)
MIN_PMI_LIST=(5 6 7 8 9 10 11 12 13 14)
cd ..
for STOPWORDS in "${STOPWORDS_LIST[@]}"; do
  for MODE in "${MODE_LIST[@]}"; do
    for LIMIT in "${LIMIT_LIST[@]}"; do
      for MIN_FREQ in "${MIN_FREQ_LIST[@]}"; do
        for MIN_PMI in "${MIN_PMI_LIST[@]}"; do
          # duckdb mode ignores min_freq/min_pmi, so run it only once per
          # LIMIT/STOPWORDS combination (the first min_freq/min_pmi pair)
          if [[ "$MODE" == "duckdb" && ( "$MIN_FREQ" != "${MIN_FREQ_LIST[0]}" || "$MIN_PMI" != "${MIN_PMI_LIST[0]}" ) ]]; then
            continue
          fi
          DB="${DB_BASE}_${MODE}_${STOPWORDS}_${LIMIT}_${MIN_FREQ}_${MIN_PMI}.db"
          OUT="${OUT_BASE}_${MODE}_${STOPWORDS}_${LIMIT}_${MIN_FREQ}_${MIN_PMI}.txt"
          # Remove old files if they exist
          rm -f "$DB" "$OUT" eval.txt
          # Timestamped results directory
          RUN_ID=$(date +"%Y%m%d_%H%M%S")_${MODE}_${STOPWORDS}_${LIMIT}_${MIN_FREQ}_${MIN_PMI}
          RESULTS_DIR="results/$RUN_ID"
          mkdir -p "$RESULTS_DIR"
          # Step 1: Build the index
          python "$INDEXER" --db "$DB" --dataset "$DATASET" --stopwords "$STOPWORDS" --mode "$MODE" --limit "$LIMIT" --min-freq "$MIN_FREQ" --min-pmi "$MIN_PMI"
          # Step 2: Search
          ./zoekeend search "$DB" "$QUERY" -o "$OUT"
          # Step 3: Evaluate
          ./zoekeend eval "$OUT" "$QUERY" > eval.txt
          # Save all outputs
          mv "$DB" "$RESULTS_DIR/"
          mv "$OUT" "$RESULTS_DIR/"
          mv eval.txt "$RESULTS_DIR/"
          # Save settings (heredoc body and terminator stay unindented)
          cat > "$RESULTS_DIR/settings.txt" <<EOF
DB: $DB
OUT: $OUT
DATASET: $DATASET
QUERY: $QUERY
STOPWORDS: $STOPWORDS
MODE: $MODE
LIMIT: $LIMIT
MIN_FREQ: $MIN_FREQ
MIN_PMI: $MIN_PMI
RUN_ID: $RUN_ID
EOF
          # Remove any leftover temporary files
          rm -f "$DB" "$OUT" eval.txt
          echo "Done. Results stored in $RESULTS_DIR"
          ls -lh "$RESULTS_DIR"
          echo "--------------------------------------"
        done
      done
    done
  done
done

26
helper_scripts/display_results.sh Executable file

@ -0,0 +1,26 @@
#!/bin/bash
echo -e "RUN_ID\tMODE\tSTOPWORDS\tMIN_FREQ\tMIN_PMI\tMAP\tPOSTINGS_COST\tDICT_SIZE\tTERMS_SIZE\tNGRAMS\tAVGDL\tSUMDF"
for dir in ../results/*; do
  [ -d "$dir" ] || continue
  SETTINGS="$dir/settings.txt"
  EVAL="$dir/eval.txt"
  # Skip runs with missing settings or evaluation output before grepping them
  [ -f "$SETTINGS" ] && [ -f "$EVAL" ] || continue
  DB=$(grep '^DB:' "$SETTINGS" | awk '{print $2}')
  DB="$dir/$(basename "$DB")"
  if [[ -f "$DB" ]]; then
    RUN_ID=$(grep '^RUN_ID:' "$SETTINGS" | awk '{print $2}')
    MODE=$(grep '^MODE:' "$SETTINGS" | awk '{print $2}')
    STOPWORDS=$(grep '^STOPWORDS:' "$SETTINGS" | awk '{print $2}')
    MIN_FREQ=$(grep '^MIN_FREQ:' "$SETTINGS" | awk '{print $2}')
    MIN_PMI=$(grep '^MIN_PMI:' "$SETTINGS" | awk '{print $2}')
    MAP=$(grep -E '^map[[:space:]]+all' "$EVAL" | awk '{print $3}')
    POSTINGS_COST=$(grep '^Average cost in postings:' "$EVAL" | awk '{print $5}')
    DICT_SIZE=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict;")
    TERMS_SIZE=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.terms;")
    NGRAMS=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict WHERE term LIKE '% %';")
    AVGDL=$(duckdb "$DB" -csv -noheader "SELECT avgdl FROM fts_main_documents.stats;")
    SUMDF=$(duckdb "$DB" -csv -noheader "SELECT sumdf FROM fts_main_documents.stats;")
    echo -e "${RUN_ID}\t${MODE}\t${STOPWORDS}\t${MIN_FREQ}\t${MIN_PMI}\t${MAP}\t${POSTINGS_COST}\t${DICT_SIZE}\t${TERMS_SIZE}\t${NGRAMS}\t${AVGDL}\t${SUMDF}"
  fi
done
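The per-database columns above come straight from DuckDB's FTS schema. A quick manual check of a stored run (a sketch, assuming the `duckdb` CLI is installed and the index uses the standard `fts_main_documents` schema; `<RUN_ID>` and `<DB_FILE>` are placeholders) is:
```
duckdb results/<RUN_ID>/<DB_FILE> "SELECT term, df FROM fts_main_documents.dict WHERE term LIKE '% %' ORDER BY df DESC LIMIT 10;"
```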

272
helper_scripts/to_csv.py Normal file

@ -0,0 +1,272 @@
# A small helper script that converts the tab-separated batch results below into a CSV file
import pandas as pd
# Raw data as provided
data = """20250813_160649_duckdb_english_-1_0_0 duckdb english 0 0 0.2881 704.8000 6639 128030 0 91.45 80317
20250813_160726_phrases_english_-1_0_0 phrases english 0 0 0.1572 476.8444 21928 102342 16988 73.10142857142857 77971
20250813_160938_phrases_english_-1_0_1 phrases english 0 1 0.1629 493.3867 21685 102526 16746 73.23285714285714 77793
20250813_161143_phrases_english_-1_0_2 phrases english 0 2 0.1688 509.5022 21052 103046 16109 73.60428571428571 77506
20250813_161351_phrases_english_-1_0_4 phrases english 0 4 0.1778 559.0311 18662 106284 13680 75.91714285714286 77921
20250813_161553_phrases_english_-1_0_8 phrases english 0 8 0.2568 790.6044 11156 126009 5981 90.00642857142857 82155
20250813_161718_phrases_english_-1_0_16 phrases english 0 16 0.2906 816.3067 6593 138733 129 99.095 86796
20250813_161835_phrases_english_-1_0_24 phrases english 0 24 0.2906 816.3200 6639 138873 0 99.195 86916
20250813_161949_phrases_english_-1_0_48 phrases english 0 48 0.2906 816.3200 6639 138873 0 99.195 86916
20250813_162104_phrases_english_-1_1_0 phrases english 1 0 0.1572 476.8444 21928 102342 16988 73.10142857142857 77971
20250813_162314_phrases_english_-1_1_1 phrases english 1 1 0.1629 493.3867 21685 102526 16746 73.23285714285714 77793
20250813_162521_phrases_english_-1_1_2 phrases english 1 2 0.1688 509.5022 21052 103046 16109 73.60428571428571 77506
20250813_162652_phrases_english_-1_1_4 phrases english 1 4 0.1778 559.0311 18662 106284 13680 75.91714285714286 77921
20250813_162851_phrases_english_-1_1_8 phrases english 1 8 0.2568 790.6044 11156 126009 5981 90.00642857142857 82155
20250813_163027_phrases_english_-1_1_16 phrases english 1 16 0.2906 816.3067 6593 138733 129 99.095 86796
20250813_163147_phrases_english_-1_1_24 phrases english 1 24 0.2906 816.3200 6639 138873 0 99.195 86916
20250813_163259_phrases_english_-1_1_48 phrases english 1 48 0.2906 816.3200 6639 138873 0 99.195 86916
20250813_163405_phrases_english_-1_2_0 phrases english 2 0 0.1763 379.8711 10740 105739 6586 75.52785714285714 79664
20250813_163530_phrases_english_-1_2_1 phrases english 2 1 0.1772 396.0311 10666 105816 6512 75.58285714285714 79489
20250813_163654_phrases_english_-1_2_2 phrases english 2 2 0.1794 411.7600 10427 106096 6273 75.78285714285714 79150
20250813_163822_phrases_english_-1_2_4 phrases english 2 4 0.1938 459.2800 9433 108408 5271 77.43428571428572 79099
20250813_163949_phrases_english_-1_2_8 phrases english 2 8 0.2627 709.4400 6083 124639 1908 89.02785714285714 81508
20250813_164108_phrases_english_-1_2_16 phrases english 2 16 0.2932 739.6222 4347 133593 12 95.42357142857144 83902
20250813_164216_phrases_english_-1_2_24 phrases english 2 24 0.2932 739.6222 4349 133616 0 95.44 83913
20250813_164322_phrases_english_-1_2_48 phrases english 2 48 0.2932 739.6222 4349 133616 0 95.44 83913
20250813_164425_phrases_english_-1_4_0 phrases english 4 0 0.1926 412.1156 5392 108052 2439 77.18 77560
20250813_164542_phrases_english_-1_4_1 phrases english 4 1 0.1954 422.1467 5376 108073 2423 77.195 77458
20250813_164656_phrases_english_-1_4_2 phrases english 4 2 0.1955 431.9733 5305 108193 2352 77.28071428571428 77205
20250813_164814_phrases_english_-1_4_4 phrases english 4 4 0.2077 469.7067 4969 109698 2015 78.35571428571428 77107
20250813_164929_phrases_english_-1_4_8 phrases english 4 8 0.2703 705.4267 3584 122735 630 87.66785714285714 78610
20250813_165045_phrases_english_-1_4_16 phrases english 4 16 0.2938 732.3067 2980 129255 0 92.325 80420
20250813_165151_phrases_english_-1_4_24 phrases english 4 24 0.2938 732.3067 2980 129255 0 92.325 80420
20250813_165256_phrases_english_-1_4_48 phrases english 4 48 0.2938 732.3067 2980 129255 0 92.325 80420
20250813_165402_phrases_english_-1_8_0 phrases english 8 0 0.2316 481.0444 2889 109018 915 77.87 74606
20250813_165514_phrases_english_-1_8_1 phrases english 8 1 0.2315 485.4489 2887 109018 913 77.87 74543
20250813_165623_phrases_english_-1_8_2 phrases english 8 2 0.2312 489.1467 2869 109078 895 77.91285714285715 74428
20250813_165734_phrases_english_-1_8_4 phrases english 8 4 0.2402 513.2978 2760 109979 786 78.55642857142857 74316
20250813_165845_phrases_english_-1_8_8 phrases english 8 8 0.2772 710.0622 2195 120281 221 85.915 75361
20250813_165956_phrases_english_-1_8_16 phrases english 8 16 0.2933 733.9689 1978 125124 0 89.37428571428572 76721
20250813_170104_phrases_english_-1_8_24 phrases english 8 24 0.2933 733.9689 1978 125124 0 89.37428571428572 76721
20250813_170215_phrases_english_-1_8_48 phrases english 8 48 0.2933 733.9689 1978 125124 0 89.37428571428572 76721
20250813_170321_phrases_english_-1_16_0 phrases english 16 0 0.2509 538.9644 1636 105876 341 75.62571428571428 69335
20250813_170435_phrases_english_-1_16_1 phrases english 16 1 0.2519 543.3867 1635 105876 340 75.62571428571428 69280
20250813_170543_phrases_english_-1_16_2 phrases english 16 2 0.2522 545.7333 1632 105914 337 75.65285714285714 69255
20250813_170652_phrases_english_-1_16_4 phrases english 16 4 0.2599 560.0044 1601 106384 306 75.98857142857143 69176
20250813_170758_phrases_english_-1_16_8 phrases english 16 8 0.2889 721.7467 1376 114387 81 81.705 69972
20250813_170906_phrases_english_-1_16_16 phrases english 16 16 0.2978 742.4178 1296 117990 0 84.27857142857142 71021
20250813_171011_phrases_english_-1_16_24 phrases english 16 24 0.2978 742.4178 1296 117990 0 84.27857142857142 71021
20250813_171115_phrases_english_-1_16_48 phrases english 16 48 0.2978 742.4178 1296 117990 0 84.27857142857142 71021
20250813_171220_phrases_english_-1_24_0 phrases english 24 0 0.2563 578.9422 1164 102808 188 73.43428571428572 65298
20250813_171330_phrases_english_-1_24_1 phrases english 24 1 0.2563 578.9422 1164 102808 188 73.43428571428572 65298
20250813_171441_phrases_english_-1_24_2 phrases english 24 2 0.2568 579.9022 1163 102829 187 73.44928571428571 65298
20250813_171553_phrases_english_-1_24_4 phrases english 24 4 0.2617 592.4044 1149 103170 173 73.69285714285714 65267
20250813_171703_phrases_english_-1_24_8 phrases english 24 8 0.2851 731.9244 1017 109871 41 78.47928571428571 65953
20250813_171818_phrases_english_-1_24_16 phrases english 24 16 0.2901 749.5022 977 112836 0 80.59714285714286 66831
20250813_171930_phrases_english_-1_24_24 phrases english 24 24 0.2901 749.5022 977 112836 0 80.59714285714286 66831
20250813_172041_phrases_english_-1_24_48 phrases english 24 48 0.2901 749.5022 977 112836 0 80.59714285714286 66831
20250813_172151_duckdb_none_-1_0_0 duckdb none 0 0 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
20250813_172225_phrases_none_-1_0_0 phrases none 0 0 0.0942 606.3111 39156 138456 36189 98.89714285714285 116154
20250813_172407_phrases_none_-1_0_1 phrases none 0 1 0.1055 865.5467 36615 146196 33656 104.42571428571429 118449
20250813_172553_phrases_none_-1_0_2 phrases none 0 2 0.1207 1160.9289 32862 161095 29874 115.06785714285714 113754
20250813_172741_phrases_none_-1_0_4 phrases none 0 4 0.1507 1305.4889 25669 184408 22135 131.72 107278
20250813_172918_phrases_none_-1_0_8 phrases none 0 8 0.2366 1365.3956 12779 223412 7754 159.58 113556
20250813_173023_phrases_none_-1_0_16 phrases none 0 16 0.2712 1366.4711 6994 238997 142 170.71214285714285 119929
20250813_173123_phrases_none_-1_0_24 phrases none 0 24 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
20250813_173214_phrases_none_-1_0_48 phrases none 0 48 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
20250813_173306_phrases_none_-1_1_0 phrases none 1 0 0.0942 606.3111 39156 138456 36189 98.89714285714285 116154
20250813_173448_phrases_none_-1_1_1 phrases none 1 1 0.1055 865.5467 36615 146196 33656 104.42571428571429 118449
20250813_173636_phrases_none_-1_1_2 phrases none 1 2 0.1207 1160.9289 32862 161095 29874 115.06785714285714 113754
20250813_173824_phrases_none_-1_1_4 phrases none 1 4 0.1507 1305.4889 25669 184408 22135 131.72 107278
20250813_174000_phrases_none_-1_1_8 phrases none 1 8 0.2366 1365.3956 12779 223412 7754 159.58 113556
20250813_174119_phrases_none_-1_1_16 phrases none 1 16 0.2712 1366.4711 6994 238997 142 170.71214285714285 119929
20250813_174220_phrases_none_-1_1_24 phrases none 1 24 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
20250813_174314_phrases_none_-1_1_48 phrases none 1 48 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
20250813_174406_phrases_none_-1_2_0 phrases none 2 0 0.1154 775.2578 23364 162542 19103 116.10142857142857 131171
20250813_174516_phrases_none_-1_2_1 phrases none 2 1 0.1229 925.6178 21484 168916 17216 120.65428571428572 131354
20250813_174625_phrases_none_-1_2_2 phrases none 2 2 0.1401 1175.8044 18673 181497 14424 129.6407142857143 124656
20250813_174734_phrases_none_-1_2_4 phrases none 2 4 0.1745 1309.2400 13265 203089 8938 145.06357142857144 117056
20250813_174842_phrases_none_-1_2_8 phrases none 2 8 0.2438 1367.9511 6821 236944 2307 169.24571428571429 121637
20250813_174938_phrases_none_-1_2_16 phrases none 2 16 0.2715 1368.6533 4719 247372 13 176.6942857142857 124752
20250813_175030_phrases_none_-1_2_24 phrases none 2 24 0.2715 1368.6533 4720 247398 0 176.71285714285713 124765
20250813_175123_phrases_none_-1_2_48 phrases none 2 48 0.2715 1368.6533 4720 247398 0 176.71285714285713 124765
20250813_175212_phrases_none_-1_4_0 phrases none 4 0 0.1461 917.3200 12382 188395 9140 134.56785714285715 138261
20250813_175308_phrases_none_-1_4_1 phrases none 4 1 0.1444 1016.7067 11297 192252 8053 137.32285714285715 137005
20250813_175355_phrases_none_-1_4_2 phrases none 4 2 0.1640 1212.6044 9670 203157 6432 145.11214285714286 129190
20250813_175450_phrases_none_-1_4_4 phrases none 4 4 0.1844 1328.3778 6782 224561 3533 160.4007142857143 122890
20250813_175540_phrases_none_-1_4_8 phrases none 4 8 0.2532 1373.9778 4029 253029 745 180.735 126124
20250813_175635_phrases_none_-1_4_16 phrases none 4 16 0.2731 1374.4044 3313 260575 0 186.125 128467
20250813_175724_phrases_none_-1_4_24 phrases none 4 24 0.2731 1374.4044 3313 260575 0 186.125 128467
20250813_175813_phrases_none_-1_4_48 phrases none 4 48 0.2731 1374.4044 3313 260575 0 186.125 128467
20250813_175900_phrases_none_-1_8_0 phrases none 8 0 0.1722 1081.2489 6529 220276 4267 157.34 141465
20250813_175950_phrases_none_-1_8_1 phrases none 8 1 0.1759 1147.3956 6030 222849 3768 159.17785714285714 139509
20250813_180037_phrases_none_-1_8_2 phrases none 8 2 0.1925 1262.1600 5229 234827 2968 167.73357142857142 133023
20250813_180131_phrases_none_-1_8_4 phrases none 8 4 0.2184 1346.5333 3785 254655 1523 181.89642857142857 127665
20250813_180222_phrases_none_-1_8_8 phrases none 8 8 0.2630 1379.0667 2537 278374 269 198.83857142857144 129892
20250813_180315_phrases_none_-1_8_16 phrases none 8 16 0.2752 1379.3156 2273 284030 0 202.87857142857143 131682
20250813_180404_phrases_none_-1_8_24 phrases none 8 24 0.2752 1379.3156 2273 284030 0 202.87857142857143 131682
20250813_180453_phrases_none_-1_8_48 phrases none 8 48 0.2752 1379.3156 2273 284030 0 202.87857142857143 131682
20250813_180542_phrases_none_-1_16_0 phrases none 16 0 0.2053 1267.8489 3405 269846 1865 192.74714285714285 142548
20250813_180632_phrases_none_-1_16_1 phrases none 16 1 0.2059 1279.7244 3196 271773 1656 194.12357142857144 140763
20250813_180723_phrases_none_-1_16_2 phrases none 16 2 0.2214 1325.5867 2835 282258 1296 201.61285714285714 134192
20250813_180816_phrases_none_-1_16_4 phrases none 16 4 0.2379 1375.3067 2176 300256 637 214.46857142857144 129654
20250813_180910_phrases_none_-1_16_8 phrases none 16 8 0.2662 1391.0533 1641 319121 101 227.94357142857143 130933
20250813_181005_phrases_none_-1_16_16 phrases none 16 16 0.2722 1391.0400 1541 323278 0 230.91285714285715 132277
20250813_181056_phrases_none_-1_16_24 phrases none 16 24 0.2722 1391.0400 1541 323278 0 230.91285714285715 132277
20250813_181150_phrases_none_-1_16_48 phrases none 16 48 0.2722 1391.0400 1541 323278 0 230.91285714285715 132277
20250813_181242_phrases_none_-1_24_0 phrases none 24 0 0.2074 1341.2933 2290 303650 1109 216.89285714285714 140095
20250813_181336_phrases_none_-1_24_1 phrases none 24 1 0.2134 1339.4400 2184 304857 1003 217.755 138507
20250813_181430_phrases_none_-1_24_2 phrases none 24 2 0.2330 1354.8667 1960 315279 780 225.1992857142857 131976
20250813_181523_phrases_none_-1_24_4 phrases none 24 4 0.2451 1386.2356 1568 330903 387 236.3592857142857 127722
20250813_181620_phrases_none_-1_24_8 phrases none 24 8 0.2702 1393.9689 1235 347214 54 248.01 128647
20250813_181717_phrases_none_-1_24_16 phrases none 24 16 0.2721 1393.9244 1182 350595 0 250.425 129773
20250813_181811_phrases_none_-1_24_24 phrases none 24 24 0.2721 1393.9244 1182 350595 0 250.425 129773
20250813_181906_phrases_none_-1_24_48 phrases none 24 48 0.2721 1393.9244 1182 350595 0 250.425 129773
20250813_210100_duckdb_english_-1_4_5 duckdb english 4 5 0.2881 704.8000 6639 128030 0 91.45 80317
20250813_210142_phrases_english_-1_4_5 phrases english 4 5 0.2083 521.3733 4654 111616 1700 79.72571428571429 77189
20250813_210257_phrases_english_-1_4_6 phrases english 4 6 0.2195 595.7778 4250 114239 1297 81.59928571428571 77437
20250813_210406_phrases_english_-1_4_7 phrases english 4 7 0.2465 654.8444 3904 117509 950 83.935 77710
20250813_210516_phrases_english_-1_4_8 phrases english 4 8 0.2703 705.4267 3584 122735 630 87.66785714285714 78610
20250813_210624_phrases_english_-1_4_9 phrases english 4 9 0.2796 722.9733 3346 125792 390 89.85142857142857 79288
20250813_210735_phrases_english_-1_4_10 phrases english 4 10 0.2870 729.9289 3205 127524 245 91.08857142857143 79810
20250813_210846_phrases_english_-1_4_11 phrases english 4 11 0.2910 731.6133 3106 128280 141 91.62857142857143 80077
20250813_210955_phrases_english_-1_4_12 phrases english 4 12 0.2940 732.1911 3044 128816 74 92.01142857142857 80251
20250813_211102_phrases_english_-1_4_13 phrases english 4 13 0.2948 732.2889 3006 129079 34 92.19928571428571 80351
20250813_211209_phrases_english_-1_4_14 phrases english 4 14 0.2938 732.2978 2989 129185 14 92.275 80400
20250813_211315_phrases_english_-1_5_5 phrases english 5 5 0.2173 517.2000 3835 111400 1235 79.57142857142857 76183
20250813_211424_phrases_english_-1_5_6 phrases english 5 6 0.2309 590.4667 3536 113742 936 81.24428571428571 76390
20250813_211534_phrases_english_-1_5_7 phrases english 5 7 0.2538 648.1467 3280 116753 680 83.395 76603
20250813_211644_phrases_english_-1_5_8 phrases english 5 8 0.2713 698.4800 3035 121771 435 86.97928571428571 77476
20250813_211755_phrases_english_-1_5_9 phrases english 5 9 0.2818 715.6089 2870 124598 270 88.99857142857142 78102
20250813_211908_phrases_english_-1_5_10 phrases english 5 10 0.2888 722.5244 2768 126202 166 90.14428571428572 78608
20250813_212014_phrases_english_-1_5_11 phrases english 5 11 0.2923 724.1733 2702 126846 97 90.60428571428571 78836
20250813_212121_phrases_english_-1_5_12 phrases english 5 12 0.2926 724.7289 2656 127310 49 90.93571428571428 79002
20250813_212229_phrases_english_-1_5_13 phrases english 5 13 0.2940 724.7778 2632 127516 23 91.08285714285714 79084
20250813_212336_phrases_english_-1_5_14 phrases english 5 14 0.2936 724.7822 2621 127595 10 91.13928571428572 79117
20250813_212441_phrases_english_-1_6_5 phrases english 6 5 0.2286 532.4889 3326 111572 976 79.69428571428571 75634
20250813_212550_phrases_english_-1_6_6 phrases english 6 6 0.2452 601.1067 3090 113712 740 81.22285714285714 75810
20250813_212701_phrases_english_-1_6_7 phrases english 6 7 0.2606 656.0533 2883 116571 533 83.265 76015
20250813_212811_phrases_english_-1_6_8 phrases english 6 8 0.2758 703.9644 2686 121408 336 86.72 76833
20250813_212922_phrases_english_-1_6_9 phrases english 6 9 0.2832 720.5422 2556 124095 206 88.63928571428572 77428
20250813_213032_phrases_english_-1_6_10 phrases english 6 10 0.2871 727.2844 2468 125638 116 89.74142857142857 77916
20250813_213140_phrases_english_-1_6_11 phrases english 6 11 0.2896 728.7956 2417 126211 64 90.15071428571429 78124
20250813_213257_phrases_english_-1_6_12 phrases english 6 12 0.2913 729.3200 2382 126624 27 90.44571428571429 78278
20250813_213410_phrases_english_-1_6_13 phrases english 6 13 0.2927 729.3644 2366 126799 9 90.57071428571429 78352
20250813_213517_phrases_english_-1_6_14 phrases english 6 14 0.2928 729.3644 2362 126842 4 90.60142857142857 78373
20250813_213626_phrases_english_-1_7_5 phrases english 7 5 0.2358 547.9867 2934 111748 784 79.82 75148
20250813_213741_phrases_english_-1_7_6 phrases english 7 6 0.2455 606.9378 2748 113674 598 81.19571428571429 75290
20250813_213858_phrases_english_-1_7_7 phrases english 7 7 0.2617 660.2489 2577 116381 427 83.12928571428571 75452
20250813_214015_phrases_english_-1_7_8 phrases english 7 8 0.2790 707.7867 2410 121097 260 86.49785714285714 76252
20250813_214134_phrases_english_-1_7_9 phrases english 7 9 0.2862 724.0533 2302 123686 152 88.34714285714286 76823
20250813_214245_phrases_english_-1_7_10 phrases english 7 10 0.2887 730.5244 2234 125113 83 89.36642857142857 77280
20250813_214357_phrases_english_-1_7_11 phrases english 7 11 0.2924 731.9333 2197 125607 45 89.71928571428572 77469
20250813_214509_phrases_english_-1_7_12 phrases english 7 12 0.2934 732.4000 2171 125965 18 89.975 77607
20250813_214616_phrases_english_-1_7_13 phrases english 7 13 0.2946 732.4356 2159 126118 5 90.08428571428571 77672
20250813_214724_phrases_english_-1_7_14 phrases english 7 14 0.2946 732.4356 2157 126149 2 90.10642857142857 77690
20250813_214831_phrases_english_-1_9_5 phrases english 9 5 0.2402 565.6178 2392 111075 566 79.33928571428571 73618
20250813_214943_phrases_english_-1_9_6 phrases english 9 6 0.2510 619.9644 2263 112722 437 80.51571428571428 73737
20250813_215055_phrases_english_-1_9_7 phrases english 9 7 0.2631 667.1956 2134 115185 308 82.275 73839
20250813_215207_phrases_english_-1_9_8 phrases english 9 8 0.2781 713.2356 2007 119663 181 85.47357142857143 74601
20250813_215319_phrases_english_-1_9_9 phrases english 9 9 0.2834 729.0667 1930 122062 104 87.18714285714286 75124
20250813_215427_phrases_english_-1_9_10 phrases english 9 10 0.2879 734.9022 1885 123339 58 88.09928571428571 75539
20250813_215537_phrases_english_-1_9_11 phrases english 9 11 0.2909 736.1733 1858 123770 31 88.40714285714286 75702
20250813_215648_phrases_english_-1_9_12 phrases english 9 12 0.2917 736.6178 1840 124073 12 88.62357142857142 75818
20250813_215751_phrases_english_-1_9_13 phrases english 9 13 0.2938 736.6311 1832 124199 3 88.71357142857143 75872
20250813_215854_phrases_english_-1_9_14 phrases english 9 14 0.2938 736.6311 1830 124230 0 88.73571428571428 75890
20250813_215956_phrases_english_-1_10_5 phrases english 10 5 0.2405 564.1200 2220 110076 501 78.62571428571428 72645
20250813_220107_phrases_english_-1_10_6 phrases english 10 6 0.2524 617.2489 2108 111644 389 79.74571428571429 72756
20250813_220217_phrases_english_-1_10_7 phrases english 10 7 0.2623 663.9422 1992 114024 273 81.44571428571429 72840
20250813_220321_phrases_english_-1_10_8 phrases english 10 8 0.2778 709.8178 1878 118417 159 84.58357142857143 73574
20250813_220426_phrases_english_-1_10_9 phrases english 10 9 0.2844 725.6089 1810 120744 91 86.24571428571429 74083
20250813_220532_phrases_english_-1_10_10 phrases english 10 10 0.2881 731.2444 1770 121970 51 87.12142857142857 74476
20250813_220637_phrases_english_-1_10_11 phrases english 10 11 0.2904 732.5156 1744 122397 25 87.42642857142857 74639
20250813_220743_phrases_english_-1_10_12 phrases english 10 12 0.2914 732.9689 1726 122699 7 87.64214285714286 74751
20250813_220845_phrases_english_-1_10_13 phrases english 10 13 0.2934 732.9778 1721 122801 1 87.715 74797
20250813_220946_phrases_english_-1_10_14 phrases english 10 14 0.2934 732.9778 1720 122814 0 87.72428571428571 74805
20250813_221046_phrases_english_-1_11_5 phrases english 11 5 0.2486 569.0800 2065 109381 435 78.12928571428571 71865
20250813_221153_phrases_english_-1_11_6 phrases english 11 6 0.2586 619.0044 1971 110817 341 79.155 71977
20250813_221259_phrases_english_-1_11_7 phrases english 11 7 0.2684 664.9822 1868 113100 238 80.78571428571429 72052
20250813_221406_phrases_english_-1_11_8 phrases english 11 8 0.2800 708.9333 1769 117377 139 83.84071428571428 72755
20250813_221510_phrases_english_-1_11_9 phrases english 11 9 0.2862 724.6178 1709 119653 79 85.46642857142857 73262
20250813_221616_phrases_english_-1_11_10 phrases english 11 10 0.2900 730.1911 1675 120836 45 86.31142857142858 73640
20250813_221721_phrases_english_-1_11_11 phrases english 11 11 0.2926 731.4533 1651 121253 21 86.60928571428572 73802
20250813_221825_phrases_english_-1_11_12 phrases english 11 12 0.2935 731.9200 1636 121524 6 86.80285714285715 73896
20250813_221929_phrases_english_-1_11_13 phrases english 11 13 0.2954 731.9289 1632 121616 1 86.86857142857143 73940
20250813_222030_phrases_english_-1_11_14 phrases english 11 14 0.2954 731.9289 1631 121629 0 86.87785714285714 73948
20250813_222129_duckdb_none_-1_4_5 duckdb none 4 5 0.2713 1366.4711 7044 239151 0 170.82214285714286 120060
20250813_222206_phrases_none_-1_4_5 phrases none 4 5 0.1932 1350.9111 5821 232789 2560 166.27785714285713 122981
20250813_222300_phrases_none_-1_4_6 phrases none 4 6 0.2036 1365.5067 5027 239895 1753 171.35357142857143 123552
20250813_222354_phrases_none_-1_4_7 phrases none 4 7 0.2349 1372.5200 4456 246070 1174 175.7642857142857 124560
20250813_222448_phrases_none_-1_4_8 phrases none 4 8 0.2532 1373.9778 4029 253029 745 180.735 126124
20250813_222544_phrases_none_-1_4_9 phrases none 4 9 0.2624 1374.2622 3740 256538 454 183.24142857142857 126990
20250813_222640_phrases_none_-1_4_10 phrases none 4 10 0.2687 1374.3556 3562 258648 271 184.74857142857144 127703
20250813_222733_phrases_none_-1_4_11 phrases none 4 11 0.2729 1374.4400 3452 259500 155 185.35714285714286 128044
20250813_222827_phrases_none_-1_4_12 phrases none 4 12 0.2745 1374.4267 3381 260125 78 185.80357142857142 128286
20250813_222922_phrases_none_-1_4_13 phrases none 4 13 0.2760 1374.4044 3339 260403 34 186.00214285714284 128399
20250813_223015_phrases_none_-1_4_14 phrases none 4 14 0.2732 1374.4044 3322 260507 14 186.07642857142858 128447
20250813_223106_phrases_none_-1_5_5 phrases none 5 5 0.1988 1353.4622 4792 241122 1886 172.23 124557
20250813_223204_phrases_none_-1_5_6 phrases none 5 6 0.2122 1367.3422 4194 247778 1278 176.9842857142857 125103
20250813_223257_phrases_none_-1_5_7 phrases none 5 7 0.2399 1373.6933 3767 253550 847 181.10714285714286 126007
20250813_223351_phrases_none_-1_5_8 phrases none 5 8 0.2577 1375.0089 3440 260253 519 185.895 127535
20250813_223446_phrases_none_-1_5_9 phrases none 5 9 0.2663 1375.2622 3235 263519 314 188.22785714285715 128348
20250813_223542_phrases_none_-1_5_10 phrases none 5 10 0.2717 1375.3111 3107 265465 183 189.61785714285713 129025
20250813_223634_phrases_none_-1_5_11 phrases none 5 11 0.2756 1375.4000 3034 266187 106 190.13357142857143 129317
20250813_223727_phrases_none_-1_5_12 phrases none 5 12 0.2763 1375.3911 2983 266726 53 190.51857142857142 129538
20250813_223820_phrases_none_-1_5_13 phrases none 5 13 0.2780 1375.3733 2955 266950 23 190.67857142857142 129634
20250813_223914_phrases_none_-1_5_14 phrases none 5 14 0.2763 1375.3733 2944 267029 10 190.735 129667
20250813_224006_phrases_none_-1_6_5 phrases none 6 5 0.2083 1357.2311 4179 248276 1526 177.34 125864
20250813_224059_phrases_none_-1_6_6 phrases none 6 6 0.2227 1370.1111 3683 254618 1024 181.87 126320
20250813_224154_phrases_none_-1_6_7 phrases none 6 7 0.2446 1375.9467 3327 260186 666 185.84714285714287 127204
20250813_224249_phrases_none_-1_6_8 phrases none 6 8 0.2607 1377.0578 3066 266658 405 190.47 128655
20250813_224344_phrases_none_-1_6_9 phrases none 6 9 0.2626 1377.2933 2905 269749 244 192.67785714285714 129428
20250813_224440_phrases_none_-1_6_10 phrases none 6 10 0.2659 1377.3022 2796 271612 132 194.00857142857143 130067
20250813_224533_phrases_none_-1_6_11 phrases none 6 11 0.2705 1377.3867 2738 272274 72 194.48142857142858 130349
20250813_224626_phrases_none_-1_6_12 phrases none 6 12 0.2723 1377.3867 2698 272764 30 194.83142857142857 130558
20250813_224720_phrases_none_-1_6_13 phrases none 6 13 0.2740 1377.3556 2679 272950 9 194.96428571428572 130646
20250813_224811_phrases_none_-1_6_14 phrases none 6 14 0.2739 1377.3556 2675 272993 4 194.995 130667
20250813_224901_phrases_none_-1_7_5 phrases none 7 5 0.2179 1360.0756 3692 254793 1245 181.995 126840
20250813_224954_phrases_none_-1_7_6 phrases none 7 6 0.2272 1372.5289 3282 260774 830 186.26714285714286 127217
20250813_225050_phrases_none_-1_7_7 phrases none 7 7 0.2467 1377.2667 2990 266094 537 190.06714285714287 128021
20250813_225145_phrases_none_-1_7_8 phrases none 7 8 0.2626 1378.2800 2769 272417 316 194.58357142857142 129446
20250813_225241_phrases_none_-1_7_9 phrases none 7 9 0.2655 1378.4978 2636 275372 183 196.6942857142857 130197
20250813_225336_phrases_none_-1_7_10 phrases none 7 10 0.2689 1378.5244 2550 277111 96 197.93642857142856 130797
20250813_225427_phrases_none_-1_7_11 phrases none 7 11 0.2731 1378.6000 2508 277686 52 198.34714285714287 131056
20250813_225519_phrases_none_-1_7_12 phrases none 7 12 0.2745 1378.6000 2478 278119 21 198.65642857142856 131243
20250813_225611_phrases_none_-1_7_13 phrases none 7 13 0.2757 1378.5733 2463 278283 5 198.77357142857142 131322
20250813_225702_phrases_none_-1_7_14 phrases none 7 14 0.2756 1378.5733 2461 278314 2 198.7957142857143 131340
20250813_225753_phrases_none_-1_9_5 phrases none 9 5 0.2237 1365.1600 3018 267606 906 191.14714285714285 128162
20250813_225847_phrases_none_-1_9_6 phrases none 9 6 0.2326 1376.7600 2729 273019 613 195.01357142857142 128423
20250813_225941_phrases_none_-1_9_7 phrases none 9 7 0.2467 1379.9867 2504 277959 388 198.54214285714286 129108
20250813_230036_phrases_none_-1_9_8 phrases none 9 8 0.2596 1380.8756 2336 284007 220 202.86214285714286 130474
20250813_230132_phrases_none_-1_9_9 phrases none 9 9 0.2657 1381.0489 2242 286741 126 204.815 131156
20250813_230227_phrases_none_-1_9_10 phrases none 9 10 0.2692 1381.0711 2181 288318 64 205.94142857142856 131709
20250813_230320_phrases_none_-1_9_11 phrases none 9 11 0.2725 1381.1467 2150 288817 33 206.29785714285714 131931
20250813_230413_phrases_none_-1_9_12 phrases none 9 12 0.2729 1381.1467 2130 289182 12 206.55857142857144 132086
20250813_230504_phrases_none_-1_9_13 phrases none 9 13 0.2743 1381.1244 2122 289303 3 206.645 132139
20250813_230555_phrases_none_-1_9_14 phrases none 9 14 0.2743 1381.1244 2120 289334 0 206.66714285714286 132157
20250813_230646_phrases_none_-1_10_5 phrases none 10 5 0.2247 1367.4844 2799 273691 802 195.49357142857144 128332
20250813_230736_phrases_none_-1_10_6 phrases none 10 6 0.2334 1377.9467 2546 278907 546 199.21928571428572 128545
20250813_230830_phrases_none_-1_10_7 phrases none 10 7 0.2472 1380.7244 2343 283700 343 202.64285714285714 129172
20250813_230922_phrases_none_-1_10_8 phrases none 10 8 0.2593 1381.5600 2196 289586 196 206.84714285714287 130480
20250813_231016_phrases_none_-1_10_9 phrases none 10 9 0.2654 1381.7111 2112 292248 112 208.74857142857144 131145
20250813_231109_phrases_none_-1_10_10 phrases none 10 10 0.2686 1381.7378 2056 293784 56 209.84571428571428 131675
20250813_231203_phrases_none_-1_10_11 phrases none 10 11 0.2725 1381.8178 2027 294274 27 210.19571428571427 131892
20250813_231257_phrases_none_-1_10_12 phrases none 10 12 0.2728 1381.8133 2007 294631 7 210.4507142857143 132041
20250813_231346_phrases_none_-1_10_13 phrases none 10 13 0.2746 1381.7956 2002 294733 1 210.52357142857142 132087
20250813_231437_phrases_none_-1_10_14 phrases none 10 14 0.2746 1381.7956 2001 294746 0 210.53285714285715 132095
20250813_231528_phrases_none_-1_11_5 phrases none 11 5 0.2291 1371.9422 2616 279880 716 199.9142857142857 128487
20250813_231620_phrases_none_-1_11_6 phrases none 11 6 0.2365 1381.1200 2395 284879 492 203.485 128668
20250813_231714_phrases_none_-1_11_7 phrases none 11 7 0.2480 1383.4489 2210 289537 307 206.81214285714285 129265
20250813_231809_phrases_none_-1_11_8 phrases none 11 8 0.2597 1384.1422 2079 295296 176 210.9257142857143 130535
20250814_104809_phrases_none_-1_11_9 phrases none 11 9 0.2656 1384.2533 2003 297899 100 212.785 131201
20250814_104904_phrases_none_-1_11_10 phrases none 11 10 0.2684 1384.2756 1953 299393 50 213.85214285714287 131713
20250814_105006_phrases_none_-1_11_11 phrases none 11 11 0.2712 1384.3244 1926 299865 23 214.18928571428572 131927
20250814_105103_phrases_none_-1_11_12 phrases none 11 12 0.2713 1384.3200 1909 300191 6 214.42214285714286 132058
20250814_105158_phrases_none_-1_11_13 phrases none 11 13 0.2732 1384.2978 1905 300283 1 214.48785714285714 132102
20250814_105250_phrases_none_-1_11_14 phrases none 11 14 0.2732 1384.2978 1904 300296 0 214.49714285714285 132110"""
# Split into rows and then by whitespace
rows = [line.split() for line in data.splitlines()]
# Create DataFrame with the same column names used in output.csv
columns = ["Run", "Mode", "Stopword", "Min Freq", "Min PMI", "MAP", "CiP",
           "Dict Size", "Terms Size", "Num phrases", "AVGDL", "SUMDF"]
df = pd.DataFrame(rows, columns=columns)
# Save to CSV, including the header row
csv_path = "output.csv"
df.to_csv(csv_path, index=False)
print(csv_path)

257
output.csv Normal file

@ -0,0 +1,257 @@
Run,Mode,Stopword,Min Freq,Min PMI,MAP,CiP,Dict Size,Terms Size,Num phrases,AVGDL,SUMDF
20250813_160649_duckdb_english_-1_0_0,duckdb,english,0,0,0.2881,704.8000,6639,128030,0,91.45,80317
20250813_160726_phrases_english_-1_0_0,phrases,english,0,0,0.1572,476.8444,21928,102342,16988,73.10142857142857,77971
20250813_160938_phrases_english_-1_0_1,phrases,english,0,1,0.1629,493.3867,21685,102526,16746,73.23285714285714,77793
20250813_161143_phrases_english_-1_0_2,phrases,english,0,2,0.1688,509.5022,21052,103046,16109,73.60428571428571,77506
20250813_161351_phrases_english_-1_0_4,phrases,english,0,4,0.1778,559.0311,18662,106284,13680,75.91714285714286,77921
20250813_161553_phrases_english_-1_0_8,phrases,english,0,8,0.2568,790.6044,11156,126009,5981,90.00642857142857,82155
20250813_161718_phrases_english_-1_0_16,phrases,english,0,16,0.2906,816.3067,6593,138733,129,99.095,86796
20250813_161835_phrases_english_-1_0_24,phrases,english,0,24,0.2906,816.3200,6639,138873,0,99.195,86916
20250813_161949_phrases_english_-1_0_48,phrases,english,0,48,0.2906,816.3200,6639,138873,0,99.195,86916
20250813_162104_phrases_english_-1_1_0,phrases,english,1,0,0.1572,476.8444,21928,102342,16988,73.10142857142857,77971
20250813_162314_phrases_english_-1_1_1,phrases,english,1,1,0.1629,493.3867,21685,102526,16746,73.23285714285714,77793
20250813_162521_phrases_english_-1_1_2,phrases,english,1,2,0.1688,509.5022,21052,103046,16109,73.60428571428571,77506
20250813_162652_phrases_english_-1_1_4,phrases,english,1,4,0.1778,559.0311,18662,106284,13680,75.91714285714286,77921
20250813_162851_phrases_english_-1_1_8,phrases,english,1,8,0.2568,790.6044,11156,126009,5981,90.00642857142857,82155
20250813_163027_phrases_english_-1_1_16,phrases,english,1,16,0.2906,816.3067,6593,138733,129,99.095,86796
20250813_163147_phrases_english_-1_1_24,phrases,english,1,24,0.2906,816.3200,6639,138873,0,99.195,86916
20250813_163259_phrases_english_-1_1_48,phrases,english,1,48,0.2906,816.3200,6639,138873,0,99.195,86916
20250813_163405_phrases_english_-1_2_0,phrases,english,2,0,0.1763,379.8711,10740,105739,6586,75.52785714285714,79664
20250813_163530_phrases_english_-1_2_1,phrases,english,2,1,0.1772,396.0311,10666,105816,6512,75.58285714285714,79489
20250813_163654_phrases_english_-1_2_2,phrases,english,2,2,0.1794,411.7600,10427,106096,6273,75.78285714285714,79150
20250813_163822_phrases_english_-1_2_4,phrases,english,2,4,0.1938,459.2800,9433,108408,5271,77.43428571428572,79099
20250813_163949_phrases_english_-1_2_8,phrases,english,2,8,0.2627,709.4400,6083,124639,1908,89.02785714285714,81508
20250813_164108_phrases_english_-1_2_16,phrases,english,2,16,0.2932,739.6222,4347,133593,12,95.42357142857144,83902
20250813_164216_phrases_english_-1_2_24,phrases,english,2,24,0.2932,739.6222,4349,133616,0,95.44,83913
20250813_164322_phrases_english_-1_2_48,phrases,english,2,48,0.2932,739.6222,4349,133616,0,95.44,83913
20250813_164425_phrases_english_-1_4_0,phrases,english,4,0,0.1926,412.1156,5392,108052,2439,77.18,77560
20250813_164542_phrases_english_-1_4_1,phrases,english,4,1,0.1954,422.1467,5376,108073,2423,77.195,77458
20250813_164656_phrases_english_-1_4_2,phrases,english,4,2,0.1955,431.9733,5305,108193,2352,77.28071428571428,77205
20250813_164814_phrases_english_-1_4_4,phrases,english,4,4,0.2077,469.7067,4969,109698,2015,78.35571428571428,77107
20250813_164929_phrases_english_-1_4_8,phrases,english,4,8,0.2703,705.4267,3584,122735,630,87.66785714285714,78610
20250813_165045_phrases_english_-1_4_16,phrases,english,4,16,0.2938,732.3067,2980,129255,0,92.325,80420
20250813_165151_phrases_english_-1_4_24,phrases,english,4,24,0.2938,732.3067,2980,129255,0,92.325,80420
20250813_165256_phrases_english_-1_4_48,phrases,english,4,48,0.2938,732.3067,2980,129255,0,92.325,80420
20250813_165402_phrases_english_-1_8_0,phrases,english,8,0,0.2316,481.0444,2889,109018,915,77.87,74606
20250813_165514_phrases_english_-1_8_1,phrases,english,8,1,0.2315,485.4489,2887,109018,913,77.87,74543
20250813_165623_phrases_english_-1_8_2,phrases,english,8,2,0.2312,489.1467,2869,109078,895,77.91285714285715,74428
20250813_165734_phrases_english_-1_8_4,phrases,english,8,4,0.2402,513.2978,2760,109979,786,78.55642857142857,74316
20250813_165845_phrases_english_-1_8_8,phrases,english,8,8,0.2772,710.0622,2195,120281,221,85.915,75361
20250813_165956_phrases_english_-1_8_16,phrases,english,8,16,0.2933,733.9689,1978,125124,0,89.37428571428572,76721
20250813_170104_phrases_english_-1_8_24,phrases,english,8,24,0.2933,733.9689,1978,125124,0,89.37428571428572,76721
20250813_170215_phrases_english_-1_8_48,phrases,english,8,48,0.2933,733.9689,1978,125124,0,89.37428571428572,76721
20250813_170321_phrases_english_-1_16_0,phrases,english,16,0,0.2509,538.9644,1636,105876,341,75.62571428571428,69335
20250813_170435_phrases_english_-1_16_1,phrases,english,16,1,0.2519,543.3867,1635,105876,340,75.62571428571428,69280
20250813_170543_phrases_english_-1_16_2,phrases,english,16,2,0.2522,545.7333,1632,105914,337,75.65285714285714,69255
20250813_170652_phrases_english_-1_16_4,phrases,english,16,4,0.2599,560.0044,1601,106384,306,75.98857142857143,69176
20250813_170758_phrases_english_-1_16_8,phrases,english,16,8,0.2889,721.7467,1376,114387,81,81.705,69972
20250813_170906_phrases_english_-1_16_16,phrases,english,16,16,0.2978,742.4178,1296,117990,0,84.27857142857142,71021
20250813_171011_phrases_english_-1_16_24,phrases,english,16,24,0.2978,742.4178,1296,117990,0,84.27857142857142,71021
20250813_171115_phrases_english_-1_16_48,phrases,english,16,48,0.2978,742.4178,1296,117990,0,84.27857142857142,71021
20250813_171220_phrases_english_-1_24_0,phrases,english,24,0,0.2563,578.9422,1164,102808,188,73.43428571428572,65298
20250813_171330_phrases_english_-1_24_1,phrases,english,24,1,0.2563,578.9422,1164,102808,188,73.43428571428572,65298
20250813_171441_phrases_english_-1_24_2,phrases,english,24,2,0.2568,579.9022,1163,102829,187,73.44928571428571,65298
20250813_171553_phrases_english_-1_24_4,phrases,english,24,4,0.2617,592.4044,1149,103170,173,73.69285714285714,65267
20250813_171703_phrases_english_-1_24_8,phrases,english,24,8,0.2851,731.9244,1017,109871,41,78.47928571428571,65953
20250813_171818_phrases_english_-1_24_16,phrases,english,24,16,0.2901,749.5022,977,112836,0,80.59714285714286,66831
20250813_171930_phrases_english_-1_24_24,phrases,english,24,24,0.2901,749.5022,977,112836,0,80.59714285714286,66831
20250813_172041_phrases_english_-1_24_48,phrases,english,24,48,0.2901,749.5022,977,112836,0,80.59714285714286,66831
20250813_172151_duckdb_none_-1_0_0,duckdb,none,0,0,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
20250813_172225_phrases_none_-1_0_0,phrases,none,0,0,0.0942,606.3111,39156,138456,36189,98.89714285714285,116154
20250813_172407_phrases_none_-1_0_1,phrases,none,0,1,0.1055,865.5467,36615,146196,33656,104.42571428571429,118449
20250813_172553_phrases_none_-1_0_2,phrases,none,0,2,0.1207,1160.9289,32862,161095,29874,115.06785714285714,113754
20250813_172741_phrases_none_-1_0_4,phrases,none,0,4,0.1507,1305.4889,25669,184408,22135,131.72,107278
20250813_172918_phrases_none_-1_0_8,phrases,none,0,8,0.2366,1365.3956,12779,223412,7754,159.58,113556
20250813_173023_phrases_none_-1_0_16,phrases,none,0,16,0.2712,1366.4711,6994,238997,142,170.71214285714285,119929
20250813_173123_phrases_none_-1_0_24,phrases,none,0,24,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
20250813_173214_phrases_none_-1_0_48,phrases,none,0,48,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
20250813_173306_phrases_none_-1_1_0,phrases,none,1,0,0.0942,606.3111,39156,138456,36189,98.89714285714285,116154
20250813_173448_phrases_none_-1_1_1,phrases,none,1,1,0.1055,865.5467,36615,146196,33656,104.42571428571429,118449
20250813_173636_phrases_none_-1_1_2,phrases,none,1,2,0.1207,1160.9289,32862,161095,29874,115.06785714285714,113754
20250813_173824_phrases_none_-1_1_4,phrases,none,1,4,0.1507,1305.4889,25669,184408,22135,131.72,107278
20250813_174000_phrases_none_-1_1_8,phrases,none,1,8,0.2366,1365.3956,12779,223412,7754,159.58,113556
20250813_174119_phrases_none_-1_1_16,phrases,none,1,16,0.2712,1366.4711,6994,238997,142,170.71214285714285,119929
20250813_174220_phrases_none_-1_1_24,phrases,none,1,24,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
20250813_174314_phrases_none_-1_1_48,phrases,none,1,48,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
20250813_174406_phrases_none_-1_2_0,phrases,none,2,0,0.1154,775.2578,23364,162542,19103,116.10142857142857,131171
20250813_174516_phrases_none_-1_2_1,phrases,none,2,1,0.1229,925.6178,21484,168916,17216,120.65428571428572,131354
20250813_174625_phrases_none_-1_2_2,phrases,none,2,2,0.1401,1175.8044,18673,181497,14424,129.6407142857143,124656
20250813_174734_phrases_none_-1_2_4,phrases,none,2,4,0.1745,1309.2400,13265,203089,8938,145.06357142857144,117056
20250813_174842_phrases_none_-1_2_8,phrases,none,2,8,0.2438,1367.9511,6821,236944,2307,169.24571428571429,121637
20250813_174938_phrases_none_-1_2_16,phrases,none,2,16,0.2715,1368.6533,4719,247372,13,176.6942857142857,124752
20250813_175030_phrases_none_-1_2_24,phrases,none,2,24,0.2715,1368.6533,4720,247398,0,176.71285714285713,124765
20250813_175123_phrases_none_-1_2_48,phrases,none,2,48,0.2715,1368.6533,4720,247398,0,176.71285714285713,124765
20250813_175212_phrases_none_-1_4_0,phrases,none,4,0,0.1461,917.3200,12382,188395,9140,134.56785714285715,138261
20250813_175308_phrases_none_-1_4_1,phrases,none,4,1,0.1444,1016.7067,11297,192252,8053,137.32285714285715,137005
20250813_175355_phrases_none_-1_4_2,phrases,none,4,2,0.1640,1212.6044,9670,203157,6432,145.11214285714286,129190
20250813_175450_phrases_none_-1_4_4,phrases,none,4,4,0.1844,1328.3778,6782,224561,3533,160.4007142857143,122890
20250813_175540_phrases_none_-1_4_8,phrases,none,4,8,0.2532,1373.9778,4029,253029,745,180.735,126124
20250813_175635_phrases_none_-1_4_16,phrases,none,4,16,0.2731,1374.4044,3313,260575,0,186.125,128467
20250813_175724_phrases_none_-1_4_24,phrases,none,4,24,0.2731,1374.4044,3313,260575,0,186.125,128467
20250813_175813_phrases_none_-1_4_48,phrases,none,4,48,0.2731,1374.4044,3313,260575,0,186.125,128467
20250813_175900_phrases_none_-1_8_0,phrases,none,8,0,0.1722,1081.2489,6529,220276,4267,157.34,141465
20250813_175950_phrases_none_-1_8_1,phrases,none,8,1,0.1759,1147.3956,6030,222849,3768,159.17785714285714,139509
20250813_180037_phrases_none_-1_8_2,phrases,none,8,2,0.1925,1262.1600,5229,234827,2968,167.73357142857142,133023
20250813_180131_phrases_none_-1_8_4,phrases,none,8,4,0.2184,1346.5333,3785,254655,1523,181.89642857142857,127665
20250813_180222_phrases_none_-1_8_8,phrases,none,8,8,0.2630,1379.0667,2537,278374,269,198.83857142857144,129892
20250813_180315_phrases_none_-1_8_16,phrases,none,8,16,0.2752,1379.3156,2273,284030,0,202.87857142857143,131682
20250813_180404_phrases_none_-1_8_24,phrases,none,8,24,0.2752,1379.3156,2273,284030,0,202.87857142857143,131682
20250813_180453_phrases_none_-1_8_48,phrases,none,8,48,0.2752,1379.3156,2273,284030,0,202.87857142857143,131682
20250813_180542_phrases_none_-1_16_0,phrases,none,16,0,0.2053,1267.8489,3405,269846,1865,192.74714285714285,142548
20250813_180632_phrases_none_-1_16_1,phrases,none,16,1,0.2059,1279.7244,3196,271773,1656,194.12357142857144,140763
20250813_180723_phrases_none_-1_16_2,phrases,none,16,2,0.2214,1325.5867,2835,282258,1296,201.61285714285714,134192
20250813_180816_phrases_none_-1_16_4,phrases,none,16,4,0.2379,1375.3067,2176,300256,637,214.46857142857144,129654
20250813_180910_phrases_none_-1_16_8,phrases,none,16,8,0.2662,1391.0533,1641,319121,101,227.94357142857143,130933
20250813_181005_phrases_none_-1_16_16,phrases,none,16,16,0.2722,1391.0400,1541,323278,0,230.91285714285715,132277
20250813_181056_phrases_none_-1_16_24,phrases,none,16,24,0.2722,1391.0400,1541,323278,0,230.91285714285715,132277
20250813_181150_phrases_none_-1_16_48,phrases,none,16,48,0.2722,1391.0400,1541,323278,0,230.91285714285715,132277
20250813_181242_phrases_none_-1_24_0,phrases,none,24,0,0.2074,1341.2933,2290,303650,1109,216.89285714285714,140095
20250813_181336_phrases_none_-1_24_1,phrases,none,24,1,0.2134,1339.4400,2184,304857,1003,217.755,138507
20250813_181430_phrases_none_-1_24_2,phrases,none,24,2,0.2330,1354.8667,1960,315279,780,225.1992857142857,131976
20250813_181523_phrases_none_-1_24_4,phrases,none,24,4,0.2451,1386.2356,1568,330903,387,236.3592857142857,127722
20250813_181620_phrases_none_-1_24_8,phrases,none,24,8,0.2702,1393.9689,1235,347214,54,248.01,128647
20250813_181717_phrases_none_-1_24_16,phrases,none,24,16,0.2721,1393.9244,1182,350595,0,250.425,129773
20250813_181811_phrases_none_-1_24_24,phrases,none,24,24,0.2721,1393.9244,1182,350595,0,250.425,129773
20250813_181906_phrases_none_-1_24_48,phrases,none,24,48,0.2721,1393.9244,1182,350595,0,250.425,129773
20250813_210100_duckdb_english_-1_4_5,duckdb,english,4,5,0.2881,704.8000,6639,128030,0,91.45,80317
20250813_210142_phrases_english_-1_4_5,phrases,english,4,5,0.2083,521.3733,4654,111616,1700,79.72571428571429,77189
20250813_210257_phrases_english_-1_4_6,phrases,english,4,6,0.2195,595.7778,4250,114239,1297,81.59928571428571,77437
20250813_210406_phrases_english_-1_4_7,phrases,english,4,7,0.2465,654.8444,3904,117509,950,83.935,77710
20250813_210516_phrases_english_-1_4_8,phrases,english,4,8,0.2703,705.4267,3584,122735,630,87.66785714285714,78610
20250813_210624_phrases_english_-1_4_9,phrases,english,4,9,0.2796,722.9733,3346,125792,390,89.85142857142857,79288
20250813_210735_phrases_english_-1_4_10,phrases,english,4,10,0.2870,729.9289,3205,127524,245,91.08857142857143,79810
20250813_210846_phrases_english_-1_4_11,phrases,english,4,11,0.2910,731.6133,3106,128280,141,91.62857142857143,80077
20250813_210955_phrases_english_-1_4_12,phrases,english,4,12,0.2940,732.1911,3044,128816,74,92.01142857142857,80251
20250813_211102_phrases_english_-1_4_13,phrases,english,4,13,0.2948,732.2889,3006,129079,34,92.19928571428571,80351
20250813_211209_phrases_english_-1_4_14,phrases,english,4,14,0.2938,732.2978,2989,129185,14,92.275,80400
20250813_211315_phrases_english_-1_5_5,phrases,english,5,5,0.2173,517.2000,3835,111400,1235,79.57142857142857,76183
20250813_211424_phrases_english_-1_5_6,phrases,english,5,6,0.2309,590.4667,3536,113742,936,81.24428571428571,76390
20250813_211534_phrases_english_-1_5_7,phrases,english,5,7,0.2538,648.1467,3280,116753,680,83.395,76603
20250813_211644_phrases_english_-1_5_8,phrases,english,5,8,0.2713,698.4800,3035,121771,435,86.97928571428571,77476
20250813_211755_phrases_english_-1_5_9,phrases,english,5,9,0.2818,715.6089,2870,124598,270,88.99857142857142,78102
20250813_211908_phrases_english_-1_5_10,phrases,english,5,10,0.2888,722.5244,2768,126202,166,90.14428571428572,78608
20250813_212014_phrases_english_-1_5_11,phrases,english,5,11,0.2923,724.1733,2702,126846,97,90.60428571428571,78836
20250813_212121_phrases_english_-1_5_12,phrases,english,5,12,0.2926,724.7289,2656,127310,49,90.93571428571428,79002
20250813_212229_phrases_english_-1_5_13,phrases,english,5,13,0.2940,724.7778,2632,127516,23,91.08285714285714,79084
20250813_212336_phrases_english_-1_5_14,phrases,english,5,14,0.2936,724.7822,2621,127595,10,91.13928571428572,79117
20250813_212441_phrases_english_-1_6_5,phrases,english,6,5,0.2286,532.4889,3326,111572,976,79.69428571428571,75634
20250813_212550_phrases_english_-1_6_6,phrases,english,6,6,0.2452,601.1067,3090,113712,740,81.22285714285714,75810
20250813_212701_phrases_english_-1_6_7,phrases,english,6,7,0.2606,656.0533,2883,116571,533,83.265,76015
20250813_212811_phrases_english_-1_6_8,phrases,english,6,8,0.2758,703.9644,2686,121408,336,86.72,76833
20250813_212922_phrases_english_-1_6_9,phrases,english,6,9,0.2832,720.5422,2556,124095,206,88.63928571428572,77428
20250813_213032_phrases_english_-1_6_10,phrases,english,6,10,0.2871,727.2844,2468,125638,116,89.74142857142857,77916
20250813_213140_phrases_english_-1_6_11,phrases,english,6,11,0.2896,728.7956,2417,126211,64,90.15071428571429,78124
20250813_213257_phrases_english_-1_6_12,phrases,english,6,12,0.2913,729.3200,2382,126624,27,90.44571428571429,78278
20250813_213410_phrases_english_-1_6_13,phrases,english,6,13,0.2927,729.3644,2366,126799,9,90.57071428571429,78352
20250813_213517_phrases_english_-1_6_14,phrases,english,6,14,0.2928,729.3644,2362,126842,4,90.60142857142857,78373
20250813_213626_phrases_english_-1_7_5,phrases,english,7,5,0.2358,547.9867,2934,111748,784,79.82,75148
20250813_213741_phrases_english_-1_7_6,phrases,english,7,6,0.2455,606.9378,2748,113674,598,81.19571428571429,75290
20250813_213858_phrases_english_-1_7_7,phrases,english,7,7,0.2617,660.2489,2577,116381,427,83.12928571428571,75452
20250813_214015_phrases_english_-1_7_8,phrases,english,7,8,0.2790,707.7867,2410,121097,260,86.49785714285714,76252
20250813_214134_phrases_english_-1_7_9,phrases,english,7,9,0.2862,724.0533,2302,123686,152,88.34714285714286,76823
20250813_214245_phrases_english_-1_7_10,phrases,english,7,10,0.2887,730.5244,2234,125113,83,89.36642857142857,77280
20250813_214357_phrases_english_-1_7_11,phrases,english,7,11,0.2924,731.9333,2197,125607,45,89.71928571428572,77469
20250813_214509_phrases_english_-1_7_12,phrases,english,7,12,0.2934,732.4000,2171,125965,18,89.975,77607
20250813_214616_phrases_english_-1_7_13,phrases,english,7,13,0.2946,732.4356,2159,126118,5,90.08428571428571,77672
20250813_214724_phrases_english_-1_7_14,phrases,english,7,14,0.2946,732.4356,2157,126149,2,90.10642857142857,77690
20250813_214831_phrases_english_-1_9_5,phrases,english,9,5,0.2402,565.6178,2392,111075,566,79.33928571428571,73618
20250813_214943_phrases_english_-1_9_6,phrases,english,9,6,0.2510,619.9644,2263,112722,437,80.51571428571428,73737
20250813_215055_phrases_english_-1_9_7,phrases,english,9,7,0.2631,667.1956,2134,115185,308,82.275,73839
20250813_215207_phrases_english_-1_9_8,phrases,english,9,8,0.2781,713.2356,2007,119663,181,85.47357142857143,74601
20250813_215319_phrases_english_-1_9_9,phrases,english,9,9,0.2834,729.0667,1930,122062,104,87.18714285714286,75124
20250813_215427_phrases_english_-1_9_10,phrases,english,9,10,0.2879,734.9022,1885,123339,58,88.09928571428571,75539
20250813_215537_phrases_english_-1_9_11,phrases,english,9,11,0.2909,736.1733,1858,123770,31,88.40714285714286,75702
20250813_215648_phrases_english_-1_9_12,phrases,english,9,12,0.2917,736.6178,1840,124073,12,88.62357142857142,75818
20250813_215751_phrases_english_-1_9_13,phrases,english,9,13,0.2938,736.6311,1832,124199,3,88.71357142857143,75872
20250813_215854_phrases_english_-1_9_14,phrases,english,9,14,0.2938,736.6311,1830,124230,0,88.73571428571428,75890
20250813_215956_phrases_english_-1_10_5,phrases,english,10,5,0.2405,564.1200,2220,110076,501,78.62571428571428,72645
20250813_220107_phrases_english_-1_10_6,phrases,english,10,6,0.2524,617.2489,2108,111644,389,79.74571428571429,72756
20250813_220217_phrases_english_-1_10_7,phrases,english,10,7,0.2623,663.9422,1992,114024,273,81.44571428571429,72840
20250813_220321_phrases_english_-1_10_8,phrases,english,10,8,0.2778,709.8178,1878,118417,159,84.58357142857143,73574
20250813_220426_phrases_english_-1_10_9,phrases,english,10,9,0.2844,725.6089,1810,120744,91,86.24571428571429,74083
20250813_220532_phrases_english_-1_10_10,phrases,english,10,10,0.2881,731.2444,1770,121970,51,87.12142857142857,74476
20250813_220637_phrases_english_-1_10_11,phrases,english,10,11,0.2904,732.5156,1744,122397,25,87.42642857142857,74639
20250813_220743_phrases_english_-1_10_12,phrases,english,10,12,0.2914,732.9689,1726,122699,7,87.64214285714286,74751
20250813_220845_phrases_english_-1_10_13,phrases,english,10,13,0.2934,732.9778,1721,122801,1,87.715,74797
20250813_220946_phrases_english_-1_10_14,phrases,english,10,14,0.2934,732.9778,1720,122814,0,87.72428571428571,74805
20250813_221046_phrases_english_-1_11_5,phrases,english,11,5,0.2486,569.0800,2065,109381,435,78.12928571428571,71865
20250813_221153_phrases_english_-1_11_6,phrases,english,11,6,0.2586,619.0044,1971,110817,341,79.155,71977
20250813_221259_phrases_english_-1_11_7,phrases,english,11,7,0.2684,664.9822,1868,113100,238,80.78571428571429,72052
20250813_221406_phrases_english_-1_11_8,phrases,english,11,8,0.2800,708.9333,1769,117377,139,83.84071428571428,72755
20250813_221510_phrases_english_-1_11_9,phrases,english,11,9,0.2862,724.6178,1709,119653,79,85.46642857142857,73262
20250813_221616_phrases_english_-1_11_10,phrases,english,11,10,0.2900,730.1911,1675,120836,45,86.31142857142858,73640
20250813_221721_phrases_english_-1_11_11,phrases,english,11,11,0.2926,731.4533,1651,121253,21,86.60928571428572,73802
20250813_221825_phrases_english_-1_11_12,phrases,english,11,12,0.2935,731.9200,1636,121524,6,86.80285714285715,73896
20250813_221929_phrases_english_-1_11_13,phrases,english,11,13,0.2954,731.9289,1632,121616,1,86.86857142857143,73940
20250813_222030_phrases_english_-1_11_14,phrases,english,11,14,0.2954,731.9289,1631,121629,0,86.87785714285714,73948
20250813_222129_duckdb_none_-1_4_5,duckdb,none,4,5,0.2713,1366.4711,7044,239151,0,170.82214285714286,120060
20250813_222206_phrases_none_-1_4_5,phrases,none,4,5,0.1932,1350.9111,5821,232789,2560,166.27785714285713,122981
20250813_222300_phrases_none_-1_4_6,phrases,none,4,6,0.2036,1365.5067,5027,239895,1753,171.35357142857143,123552
20250813_222354_phrases_none_-1_4_7,phrases,none,4,7,0.2349,1372.5200,4456,246070,1174,175.7642857142857,124560
20250813_222448_phrases_none_-1_4_8,phrases,none,4,8,0.2532,1373.9778,4029,253029,745,180.735,126124
20250813_222544_phrases_none_-1_4_9,phrases,none,4,9,0.2624,1374.2622,3740,256538,454,183.24142857142857,126990
20250813_222640_phrases_none_-1_4_10,phrases,none,4,10,0.2687,1374.3556,3562,258648,271,184.74857142857144,127703
20250813_222733_phrases_none_-1_4_11,phrases,none,4,11,0.2729,1374.4400,3452,259500,155,185.35714285714286,128044
20250813_222827_phrases_none_-1_4_12,phrases,none,4,12,0.2745,1374.4267,3381,260125,78,185.80357142857142,128286
20250813_222922_phrases_none_-1_4_13,phrases,none,4,13,0.2760,1374.4044,3339,260403,34,186.00214285714284,128399
20250813_223015_phrases_none_-1_4_14,phrases,none,4,14,0.2732,1374.4044,3322,260507,14,186.07642857142858,128447
20250813_223106_phrases_none_-1_5_5,phrases,none,5,5,0.1988,1353.4622,4792,241122,1886,172.23,124557
20250813_223204_phrases_none_-1_5_6,phrases,none,5,6,0.2122,1367.3422,4194,247778,1278,176.9842857142857,125103
20250813_223257_phrases_none_-1_5_7,phrases,none,5,7,0.2399,1373.6933,3767,253550,847,181.10714285714286,126007
20250813_223351_phrases_none_-1_5_8,phrases,none,5,8,0.2577,1375.0089,3440,260253,519,185.895,127535
20250813_223446_phrases_none_-1_5_9,phrases,none,5,9,0.2663,1375.2622,3235,263519,314,188.22785714285715,128348
20250813_223542_phrases_none_-1_5_10,phrases,none,5,10,0.2717,1375.3111,3107,265465,183,189.61785714285713,129025
20250813_223634_phrases_none_-1_5_11,phrases,none,5,11,0.2756,1375.4000,3034,266187,106,190.13357142857143,129317
20250813_223727_phrases_none_-1_5_12,phrases,none,5,12,0.2763,1375.3911,2983,266726,53,190.51857142857142,129538
20250813_223820_phrases_none_-1_5_13,phrases,none,5,13,0.2780,1375.3733,2955,266950,23,190.67857142857142,129634
20250813_223914_phrases_none_-1_5_14,phrases,none,5,14,0.2763,1375.3733,2944,267029,10,190.735,129667
20250813_224006_phrases_none_-1_6_5,phrases,none,6,5,0.2083,1357.2311,4179,248276,1526,177.34,125864
20250813_224059_phrases_none_-1_6_6,phrases,none,6,6,0.2227,1370.1111,3683,254618,1024,181.87,126320
20250813_224154_phrases_none_-1_6_7,phrases,none,6,7,0.2446,1375.9467,3327,260186,666,185.84714285714287,127204
20250813_224249_phrases_none_-1_6_8,phrases,none,6,8,0.2607,1377.0578,3066,266658,405,190.47,128655
20250813_224344_phrases_none_-1_6_9,phrases,none,6,9,0.2626,1377.2933,2905,269749,244,192.67785714285714,129428
20250813_224440_phrases_none_-1_6_10,phrases,none,6,10,0.2659,1377.3022,2796,271612,132,194.00857142857143,130067
20250813_224533_phrases_none_-1_6_11,phrases,none,6,11,0.2705,1377.3867,2738,272274,72,194.48142857142858,130349
20250813_224626_phrases_none_-1_6_12,phrases,none,6,12,0.2723,1377.3867,2698,272764,30,194.83142857142857,130558
20250813_224720_phrases_none_-1_6_13,phrases,none,6,13,0.2740,1377.3556,2679,272950,9,194.96428571428572,130646
20250813_224811_phrases_none_-1_6_14,phrases,none,6,14,0.2739,1377.3556,2675,272993,4,194.995,130667
20250813_224901_phrases_none_-1_7_5,phrases,none,7,5,0.2179,1360.0756,3692,254793,1245,181.995,126840
20250813_224954_phrases_none_-1_7_6,phrases,none,7,6,0.2272,1372.5289,3282,260774,830,186.26714285714286,127217
20250813_225050_phrases_none_-1_7_7,phrases,none,7,7,0.2467,1377.2667,2990,266094,537,190.06714285714287,128021
20250813_225145_phrases_none_-1_7_8,phrases,none,7,8,0.2626,1378.2800,2769,272417,316,194.58357142857142,129446
20250813_225241_phrases_none_-1_7_9,phrases,none,7,9,0.2655,1378.4978,2636,275372,183,196.6942857142857,130197
20250813_225336_phrases_none_-1_7_10,phrases,none,7,10,0.2689,1378.5244,2550,277111,96,197.93642857142856,130797
20250813_225427_phrases_none_-1_7_11,phrases,none,7,11,0.2731,1378.6000,2508,277686,52,198.34714285714287,131056
20250813_225519_phrases_none_-1_7_12,phrases,none,7,12,0.2745,1378.6000,2478,278119,21,198.65642857142856,131243
20250813_225611_phrases_none_-1_7_13,phrases,none,7,13,0.2757,1378.5733,2463,278283,5,198.77357142857142,131322
20250813_225702_phrases_none_-1_7_14,phrases,none,7,14,0.2756,1378.5733,2461,278314,2,198.7957142857143,131340
20250813_225753_phrases_none_-1_9_5,phrases,none,9,5,0.2237,1365.1600,3018,267606,906,191.14714285714285,128162
20250813_225847_phrases_none_-1_9_6,phrases,none,9,6,0.2326,1376.7600,2729,273019,613,195.01357142857142,128423
20250813_225941_phrases_none_-1_9_7,phrases,none,9,7,0.2467,1379.9867,2504,277959,388,198.54214285714286,129108
20250813_230036_phrases_none_-1_9_8,phrases,none,9,8,0.2596,1380.8756,2336,284007,220,202.86214285714286,130474
20250813_230132_phrases_none_-1_9_9,phrases,none,9,9,0.2657,1381.0489,2242,286741,126,204.815,131156
20250813_230227_phrases_none_-1_9_10,phrases,none,9,10,0.2692,1381.0711,2181,288318,64,205.94142857142856,131709
20250813_230320_phrases_none_-1_9_11,phrases,none,9,11,0.2725,1381.1467,2150,288817,33,206.29785714285714,131931
20250813_230413_phrases_none_-1_9_12,phrases,none,9,12,0.2729,1381.1467,2130,289182,12,206.55857142857144,132086
20250813_230504_phrases_none_-1_9_13,phrases,none,9,13,0.2743,1381.1244,2122,289303,3,206.645,132139
20250813_230555_phrases_none_-1_9_14,phrases,none,9,14,0.2743,1381.1244,2120,289334,0,206.66714285714286,132157
20250813_230646_phrases_none_-1_10_5,phrases,none,10,5,0.2247,1367.4844,2799,273691,802,195.49357142857144,128332
20250813_230736_phrases_none_-1_10_6,phrases,none,10,6,0.2334,1377.9467,2546,278907,546,199.21928571428572,128545
20250813_230830_phrases_none_-1_10_7,phrases,none,10,7,0.2472,1380.7244,2343,283700,343,202.64285714285714,129172
20250813_230922_phrases_none_-1_10_8,phrases,none,10,8,0.2593,1381.5600,2196,289586,196,206.84714285714287,130480
20250813_231016_phrases_none_-1_10_9,phrases,none,10,9,0.2654,1381.7111,2112,292248,112,208.74857142857144,131145
20250813_231109_phrases_none_-1_10_10,phrases,none,10,10,0.2686,1381.7378,2056,293784,56,209.84571428571428,131675
20250813_231203_phrases_none_-1_10_11,phrases,none,10,11,0.2725,1381.8178,2027,294274,27,210.19571428571427,131892
20250813_231257_phrases_none_-1_10_12,phrases,none,10,12,0.2728,1381.8133,2007,294631,7,210.4507142857143,132041
20250813_231346_phrases_none_-1_10_13,phrases,none,10,13,0.2746,1381.7956,2002,294733,1,210.52357142857142,132087
20250813_231437_phrases_none_-1_10_14,phrases,none,10,14,0.2746,1381.7956,2001,294746,0,210.53285714285715,132095
20250813_231528_phrases_none_-1_11_5,phrases,none,11,5,0.2291,1371.9422,2616,279880,716,199.9142857142857,128487
20250813_231620_phrases_none_-1_11_6,phrases,none,11,6,0.2365,1381.1200,2395,284879,492,203.485,128668
20250813_231714_phrases_none_-1_11_7,phrases,none,11,7,0.2480,1383.4489,2210,289537,307,206.81214285714285,129265
20250813_231809_phrases_none_-1_11_8,phrases,none,11,8,0.2597,1384.1422,2079,295296,176,210.9257142857143,130535
20250814_104809_phrases_none_-1_11_9,phrases,none,11,9,0.2656,1384.2533,2003,297899,100,212.785,131201
20250814_104904_phrases_none_-1_11_10,phrases,none,11,10,0.2684,1384.2756,1953,299393,50,213.85214285714287,131713
20250814_105006_phrases_none_-1_11_11,phrases,none,11,11,0.2712,1384.3244,1926,299865,23,214.18928571428572,131927
20250814_105103_phrases_none_-1_11_12,phrases,none,11,12,0.2713,1384.3200,1909,300191,6,214.42214285714286,132058
20250814_105158_phrases_none_-1_11_13,phrases,none,11,13,0.2732,1384.2978,1905,300283,1,214.48785714285714,132102
20250814_105250_phrases_none_-1_11_14,phrases,none,11,14,0.2732,1384.2978,1904,300296,0,214.49714285714285,132110

527
phrase_index.py Normal file
View File

@ -0,0 +1,527 @@
import pathlib
import sys
import duckdb
import ir_datasets
import collections
import pandas as pd
from phrases_extractor import extract_phrases_pmi_duckdb
from ze_index import normalize
def insert_dataset(con, ir_dataset, logging=True):
"""
Insert documents from an ir_datasets dataset. Works with several datasets;
extend the attribute handling below if a dataset exposes other fields.
"""
con.sql('CREATE TABLE documents (did TEXT, content TEXT)')
insert = 'INSERT INTO documents(did, content) VALUES '
sql = insert
part = 0
total = 0
count = ir_dataset.docs_count()
if logging:
print(f"Inserting {count} docs...", file=sys.stderr)
for doc in ir_dataset.docs_iter():
doc_text = ""
if hasattr(doc, 'title'):
doc_text = doc.title
if hasattr(doc, 'body'):
doc_text += " " + doc.body
if hasattr(doc, 'text'):
doc_text += " " + doc.text
sql += "('" + doc.doc_id + "','" + normalize(doc_text) + "'),"
part += 1
if part > 9999:
total += part
if logging:
print(str(total) + " docs", file=sys.stderr)
con.sql(sql)
part = 0
sql = insert
if part > 0: con.sql(sql)
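# Note on the batching above: rows are buffered into a single multi-row INSERT
# of up to 10,000 VALUES tuples before being flushed, keeping statement sizes
# bounded. `normalize` (from ze_index) is assumed to strip quote characters so
# that the concatenated SQL stays well-formed.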
def create_lm(con, stemmer):
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_lm(query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS TABLE (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, docs.len AS doc_len, term_tf.termid, term_tf.tf, qtermids.df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * docs.len)) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE ((term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid))
),
scores AS (
SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
),
postings_cost AS (
SELECT COUNT(DISTINCT docid) AS cost FROM qterms
)
SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
);
""")
def create_bm25(con, stemmer):
con.sql(f"""
CREATE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, conjunctive := 0, k := 1.2, fields := NULL) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df, (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE ((term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid))
),
scores AS (
SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid
)
SELECT score FROM scores, fts_main_documents.docs AS docs
WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
""")
def create_docs_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did"):
"""
Create the documents table.
input_id should be the column name in input_table that uniquely identifies each document (e.g., 'did').
"""
con.sql(f"""
CREATE SCHEMA IF NOT EXISTS {fts_schema};
CREATE TABLE {fts_schema}.docs AS (
SELECT
row_number() OVER () AS docid,
{input_id} AS name
FROM
{input_schema}.{input_table}
);
""")
def create_tokenizer_duckdb(con):
con.sql("""
CREATE MACRO fts_main_documents.tokenize(s) AS (
string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
);
""")
def create_tokenizer_ciff(con, fts_schema="fts_main_documents"):
con.sql(f"""
CREATE TABLE IF NOT EXISTS {fts_schema}.dict (termid BIGINT, term TEXT, df BIGINT);
CREATE OR REPLACE MACRO {fts_schema}.tokenize(query_string) AS (
WITH RECURSIVE sequence AS (
SELECT range AS nr
FROM RANGE((SELECT MAX(LEN(term)) + 1 FROM {fts_schema}.dict))
),
simpledict AS (
SELECT '' AS term
UNION
SELECT term FROM {fts_schema}.dict
),
subterms(term, subquery) AS (
SELECT '', lower(strip_accents(CAST(query_string AS VARCHAR)))
UNION
SELECT MAX(dict.term), SUBSTRING(subquery,
CASE WHEN MAX(nr) < 1 THEN 2 ELSE MAX(nr) + 1 END,
LEN(subquery)) AS subquery
FROM subterms, sequence, simpledict as dict
WHERE SUBSTRING(subquery, 1, nr) = dict.term
GROUP BY subquery
)
SELECT LIST(term) FROM subterms WHERE NOT term = ''
)
""")
def create_stopwords_table(con, fts_schema="fts_main_documents", stopwords='none'):
"""
Create the stopwords table.
If stopwords is 'english', it will create a table with English stopwords.
If stopwords is 'none', it will create an empty table.
"""
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stopwords;")
if stopwords == 'english':
con.sql(f"""
CREATE TABLE {fts_schema}.stopwords (sw VARCHAR);
INSERT INTO {fts_schema}.stopwords VALUES ('a'), ('a''s'), ('able'), ('about'), ('above'), ('according'), ('accordingly'), ('across'), ('actually'), ('after'), ('afterwards'), ('again'), ('against'), ('ain''t'), ('all'), ('allow'), ('allows'), ('almost'), ('alone'), ('along'), ('already'), ('also'), ('although'), ('always'), ('am'), ('among'), ('amongst'), ('an'), ('and'), ('another'), ('any'), ('anybody'), ('anyhow'), ('anyone'), ('anything'), ('anyway'), ('anyways'), ('anywhere'), ('apart'), ('appear'), ('appreciate'), ('appropriate'), ('are'), ('aren''t'), ('around'), ('as'), ('aside'), ('ask'), ('asking'), ('associated'), ('at'), ('available'), ('away'), ('awfully'), ('b'), ('be'), ('became'), ('because'), ('become'), ('becomes'), ('becoming'), ('been'), ('before'), ('beforehand'), ('behind'), ('being'), ('believe'), ('below'), ('beside'), ('besides'), ('best'), ('better'), ('between'), ('beyond'), ('both'), ('brief'), ('but'), ('by'), ('c'), ('c''mon'), ('c''s'), ('came'), ('can'), ('can''t'), ('cannot'), ('cant'), ('cause'), ('causes'), ('certain'), ('certainly'), ('changes'), ('clearly'), ('co'), ('com'), ('come'), ('comes'), ('concerning'), ('consequently'), ('consider'), ('considering'), ('contain'), ('containing'), ('contains'), ('corresponding'), ('could'), ('couldn''t'), ('course'), ('currently'), ('d'), ('definitely'), ('described'), ('despite'), ('did'), ('didn''t'), ('different'), ('do'), ('does'), ('doesn''t'), ('doing'), ('don''t'), ('done'), ('down'), ('downwards'), ('during'), ('e'), ('each'), ('edu'), ('eg'), ('eight'), ('either'), ('else'), ('elsewhere'), ('enough'), ('entirely'), ('especially'), ('et'), ('etc'), ('even'), ('ever'), ('every'), ('everybody'), ('everyone'), ('everything'), ('everywhere'), ('ex'), ('exactly'), ('example'), ('except'), ('f'), ('far'), ('few'), ('fifth'), ('first'), ('five'), ('followed'), ('following'), ('follows'), ('for'), ('former'), ('formerly'), ('forth'), ('four'), ('from'), ('further'), ('furthermore'), ('g'), ('get'), ('gets'), ('getting'), ('given'), ('gives'), ('go'), ('goes'), ('going'), ('gone'), ('got'), ('gotten'), ('greetings'), ('h'), ('had'), ('hadn''t'), ('happens'), ('hardly'), ('has'), ('hasn''t'), ('have'), ('haven''t'), ('having'), ('he'), ('he''s'), ('hello'), ('help'), ('hence'), ('her'), ('here'), ('here''s'), ('hereafter'), ('hereby'), ('herein'), ('hereupon'), ('hers'), ('herself'), ('hi'), ('him'), ('himself'), ('his'), ('hither'), ('hopefully'), ('how'), ('howbeit'), ('however'), ('i'), ('i''d'), ('i''ll'), ('i''m'), ('i''ve'), ('ie'), ('if'), ('ignored'), ('immediate'), ('in'), ('inasmuch'), ('inc'), ('indeed'), ('indicate'), ('indicated'), ('indicates'), ('inner'), ('insofar'), ('instead'), ('into'), ('inward'), ('is'), ('isn''t'), ('it'), ('it''d'), ('it''ll'), ('it''s'), ('its'), ('itself'), ('j'), ('just'), ('k'), ('keep'), ('keeps'), ('kept'), ('know'), ('knows'), ('known'), ('l'), ('last'), ('lately'), ('later'), ('latter'), ('latterly'), ('least'), ('less'), ('lest'), ('let'), ('let''s'), ('like'), ('liked'), ('likely'), ('little'), ('look'), ('looking'), ('looks'), ('ltd'), ('m'), ('mainly'), ('many'), ('may'), ('maybe'), ('me'), ('mean'), ('meanwhile'), ('merely'), ('might'), ('more'), ('moreover'), ('most'), ('mostly'), ('much'), ('must'), ('my'), ('myself'), ('n'), ('name'), ('namely'), ('nd'), ('near'), ('nearly'), ('necessary'), ('need'), ('needs'), ('neither'), ('never'), ('nevertheless'), ('new'), ('next'), ('nine'), ('no'), ('nobody'), ('non'), ('none'), ('noone'), ('nor'), ('normally'), 
('not'), ('nothing'), ('novel'), ('now'), ('nowhere'), ('o'), ('obviously'), ('of'), ('off'), ('often'), ('oh'), ('ok'), ('okay'), ('old'), ('on'), ('once'), ('one'), ('ones'), ('only'), ('onto'), ('or'), ('other'), ('others'), ('otherwise'), ('ought'), ('our'), ('ours'), ('ourselves'), ('out'), ('outside'), ('over'), ('overall'), ('own');
INSERT INTO {fts_schema}.stopwords VALUES ('p'), ('particular'), ('particularly'), ('per'), ('perhaps'), ('placed'), ('please'), ('plus'), ('possible'), ('presumably'), ('probably'), ('provides'), ('q'), ('que'), ('quite'), ('qv'), ('r'), ('rather'), ('rd'), ('re'), ('really'), ('reasonably'), ('regarding'), ('regardless'), ('regards'), ('relatively'), ('respectively'), ('right'), ('s'), ('said'), ('same'), ('saw'), ('say'), ('saying'), ('says'), ('second'), ('secondly'), ('see'), ('seeing'), ('seem'), ('seemed'), ('seeming'), ('seems'), ('seen'), ('self'), ('selves'), ('sensible'), ('sent'), ('serious'), ('seriously'), ('seven'), ('several'), ('shall'), ('she'), ('should'), ('shouldn''t'), ('since'), ('six'), ('so'), ('some'), ('somebody'), ('somehow'), ('someone'), ('something'), ('sometime'), ('sometimes'), ('somewhat'), ('somewhere'), ('soon'), ('sorry'), ('specified'), ('specify'), ('specifying'), ('still'), ('sub'), ('such'), ('sup'), ('sure'), ('t'), ('t''s'), ('take'), ('taken'), ('tell'), ('tends'), ('th'), ('than'), ('thank'), ('thanks'), ('thanx'), ('that'), ('that''s'), ('thats'), ('the'), ('their'), ('theirs'), ('them'), ('themselves'), ('then'), ('thence'), ('there'), ('there''s'), ('thereafter'), ('thereby'), ('therefore'), ('therein'), ('theres'), ('thereupon'), ('these'), ('they'), ('they''d'), ('they''ll'), ('they''re'), ('they''ve'), ('think'), ('third'), ('this'), ('thorough'), ('thoroughly'), ('those'), ('though'), ('three'), ('through'), ('throughout'), ('thru'), ('thus'), ('to'), ('together'), ('too'), ('took'), ('toward'), ('towards'), ('tried'), ('tries'), ('truly'), ('try'), ('trying'), ('twice'), ('two'), ('u'), ('un'), ('under'), ('unfortunately'), ('unless'), ('unlikely'), ('until'), ('unto'), ('up'), ('upon'), ('us'), ('use'), ('used'), ('useful'), ('uses'), ('using'), ('usually'), ('uucp'), ('v'), ('value'), ('various'), ('very'), ('via'), ('viz'), ('vs'), ('w'), ('want'), ('wants'), ('was'), ('wasn''t'), ('way'), ('we'), ('we''d'), ('we''ll'), ('we''re'), ('we''ve'), ('welcome'), ('well'), ('went'), ('were'), ('weren''t'), ('what'), ('what''s'), ('whatever'), ('when'), ('whence'), ('whenever'), ('where'), ('where''s'), ('whereafter'), ('whereas'), ('whereby'), ('wherein'), ('whereupon'), ('wherever'), ('whether'), ('which'), ('while'), ('whither'), ('who'), ('who''s'), ('whoever'), ('whole'), ('whom'), ('whose'), ('why'), ('will'), ('willing'), ('wish'), ('with'), ('within'), ('without'), ('won''t'), ('wonder'), ('would'), ('would'), ('wouldn''t'), ('x'), ('y'), ('yes'), ('yet'), ('you'), ('you''d'), ('you''ll'), ('you''re'), ('you''ve'), ('your'), ('yours'), ('yourself'), ('yourselves'), ('z'), ('zero');
""")
else:
con.sql(f"CREATE TABLE {fts_schema}.stopwords (sw VARCHAR);")
def create_duckdb_dict_table(con, fts_schema="fts_main_documents", stopwords='none'):
"""
Create the dict table using DuckDB's built-in dictionary functionality.
"""
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.dict;")
create_stopwords_table(con, fts_schema, stopwords)
con.sql(f"""
CREATE TABLE {fts_schema}.dict AS
WITH distinct_terms AS (
SELECT DISTINCT term
FROM {fts_schema}.terms
)
SELECT
row_number() OVER () AS termid,
term
FROM
distinct_terms
{"WHERE term NOT IN (SELECT sw FROM " + fts_schema + ".stopwords)" if stopwords == 'english' else ''}
ORDER BY term;
""")
def build_dict_table(con, mode='duckdb', fts_schema="fts_main_documents", stopwords='none', gpt4_token_file=None, ngram_range=(1,2), min_freq=10, min_pmi=5.0):
"""
Build the dictionary table using the specified mode.
mode: 'phrases' or 'duckdb' (any other value raises ValueError)
"""
if mode == 'phrases':
create_stopwords_table(con, fts_schema=fts_schema, stopwords=stopwords)
extract_phrases_pmi_duckdb(con, fts_schema="fts_main_documents", n=2, min_freq=min_freq, min_pmi=min_pmi)
print("Extracted phrases:", con.execute("SELECT * FROM fts_main_documents.phrases LIMIT 10").fetchall())
print("\nAdded phrases to dictionary:", con.execute(f"SELECT * FROM {fts_schema}.dict LIMIT 10").fetchall())
print("\nAdded tokens to dictionary:", con.execute(f"SELECT * FROM {fts_schema}.dict WHERE term NOT LIKE '% %' LIMIT 10").fetchall())
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.tokens")
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.phrases")
elif mode == 'duckdb':
create_terms_table_duckdb(con, fts_schema=fts_schema, input_schema="main", input_table="documents", input_id="did", input_val="content")
create_duckdb_dict_table(con, fts_schema=fts_schema, stopwords=stopwords)
else:
raise ValueError(f"Unknown dict table build mode: {mode}")
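# Hypothetical call matching the defaults in the signature above, building a
# phrase-aware dictionary with English stopwords removed:
#   build_dict_table(con, mode='phrases', stopwords='english',
#                    min_freq=10, min_pmi=5.0)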
def create_terms_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content"):
"""
Create the terms table with unique terms per docid.
Assumes the table fts_main_documents.dict already exists.
Adds a fieldid and termid column for compatibility with fielded search macros.
"""
# Clean up the input text by replacing digits and special characters with spaces
con.sql(f"""
CREATE OR REPLACE TABLE {fts_schema}.cleaned_docs AS
SELECT
did,
regexp_replace(content, '[0-9!@#$%^&*()_+={{}}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g') AS content
FROM {input_schema}.{input_table}
""")
con.sql(f"""
CREATE OR REPLACE TABLE {fts_schema}.terms AS (
SELECT
0 AS fieldid,
d.termid,
t.docid
FROM (
SELECT
row_number() OVER (ORDER BY (SELECT NULL)) AS docid,
unnest({fts_schema}.tokenize({input_val})) AS term
FROM {fts_schema}.cleaned_docs
) AS t
JOIN {fts_schema}.dict d ON t.term = d.term
WHERE t.term != ''
);
""")
def create_terms_table_duckdb(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content"):
"""
Step 1: Create the initial terms table (term, docid).
"""
con.sql(f"""
CREATE OR REPLACE TABLE {fts_schema}.terms AS (
SELECT
row_number() OVER () AS docid,
unnest({fts_schema}.tokenize({input_val})) AS term
FROM {input_schema}.{input_table}
WHERE {input_val} != ''
);
""")
def assign_termids_to_terms(con, fts_schema="fts_main_documents"):
"""
Step 3: Recreate the terms table, joining with dict to assign termid.
"""
con.sql(f"""
CREATE OR REPLACE TABLE {fts_schema}.terms AS (
SELECT
0 AS fieldid,
d.termid,
t.docid,
t.term,
row_number() OVER (PARTITION BY t.docid) AS pos
FROM {fts_schema}.terms t
JOIN {fts_schema}.dict d ON t.term = d.term
WHERE t.term != ''
);
""")
def update_docs_table(con, fts_schema="fts_main_documents"):
"""
Update the docs table with the length (len) of each document,
computed as the number of indexed terms per docid.
"""
# Remove old 'len' column if it exists, then add and populate a fresh one
con.sql(f"ALTER TABLE {fts_schema}.docs DROP COLUMN IF EXISTS len;")
con.sql(f"ALTER TABLE {fts_schema}.docs ADD COLUMN len INT;")
con.sql(f"""
UPDATE {fts_schema}.docs d
SET len = (
SELECT COUNT(termid)
FROM {fts_schema}.terms t
WHERE t.docid = d.docid
);
""")
def update_dict_table(con, fts_schema="fts_main_documents"):
"""
Update the dictionary table with document frequency (df).
Assumes the table fts_main_documents.dict already exists.
"""
con.sql(f"ALTER TABLE {fts_schema}.dict ADD COLUMN IF NOT EXISTS df BIGINT;")
con.sql(f"""
UPDATE {fts_schema}.dict d
SET df = (
SELECT count(DISTINCT docid)
FROM {fts_schema}.terms t
WHERE t.termid = d.termid
);
""")
def limit_dict_table(con, max_terms=10000, fts_schema="fts_main_documents"):
# Create a temporary table with limited terms and reassigned termid
con.sql(f"""
CREATE OR REPLACE TEMP TABLE temp_limited_dict AS
SELECT
ROW_NUMBER() OVER (ORDER BY df DESC, term ASC) AS termid,
term,
df
FROM {fts_schema}.dict
ORDER BY df DESC, term ASC
LIMIT {max_terms};
""")
# Drop original dict table
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.dict;")
# Recreate dict table from temp table
con.sql(f"""
CREATE TABLE {fts_schema}.dict AS
SELECT * FROM temp_limited_dict;
""")
# Drop temp table
con.sql("DROP TABLE IF EXISTS temp_limited_dict;")
def create_stats_table(con, fts_schema="fts_main_documents", index_type="standard", stemmer="none"):
"""
Create the stats table.
This table contains statistics about the FTS index.
Columns: num_docs, avgdl, sumdf, index_type, stemmer
"""
con.sql(f"DROP TABLE IF EXISTS {fts_schema}.stats;")
con.sql(f"""
CREATE TABLE {fts_schema}.stats AS (
SELECT
COUNT(docs.docid) AS num_docs,
SUM(docs.len) / COUNT(docs.len) AS avgdl,
(SELECT SUM(df) FROM {fts_schema}.dict) AS sumdf,
'{index_type}' AS index_type,
'{stemmer}' AS stemmer
FROM {fts_schema}.docs AS docs
);
""")
def create_fields_table(con, fts_schema="fts_main_documents"):
con.sql(f'''
CREATE TABLE IF NOT EXISTS {fts_schema}.fields (
fieldid INTEGER,
field TEXT
);
''')
# Insert a default field if table is empty
con.sql(f'''
INSERT INTO {fts_schema}.fields (fieldid, field)
SELECT 0, 'content'
WHERE NOT EXISTS (SELECT 1 FROM {fts_schema}.fields);
''')
def index_documents(db_name, ir_dataset, stemmer='none', stopwords='none',
logging=True, keepcontent=False, limit=10000, mode='duckdb', min_freq=10, min_pmi=5.0):
"""
Insert and index documents.
"""
if pathlib.Path(db_name).is_file():
raise ValueError(f"File {db_name} already exists.")
con = duckdb.connect(db_name)
insert_dataset(con, ir_dataset, logging)
if logging:
print("Indexing...", file=sys.stderr)
docs = con.sql("SELECT * FROM documents LIMIT 10").df()
print("Docs:\n", docs)
create_docs_table(con, input_schema="main", input_table="documents", input_id="did")
fts_docs = con.sql("SELECT * FROM fts_main_documents.docs LIMIT 10").df()
print("fts_main_documents.docs:\n", fts_docs)
con.sql("CREATE SCHEMA IF NOT EXISTS fts_main_documents;")
con.sql("CREATE TABLE IF NOT EXISTS fts_main_documents.dict (term TEXT);")
create_tokenizer_duckdb(con)
# Create the dict table
build_dict_table(con, mode=mode, fts_schema="fts_main_documents", stopwords=stopwords, ngram_range=(1,2), min_freq=min_freq, min_pmi=min_pmi)
create_tokenizer_ciff(con)
    dict_df = con.sql("SELECT * FROM fts_main_documents.dict LIMIT 10").df()
    print("fts_main_documents.dict:\n", dict_df)
# Clean up the terms table
if mode == 'phrases':
con.sql("DROP TABLE IF EXISTS fts_main_documents.terms;")
create_terms_table(con, input_schema="main", input_table="documents", input_id="did", input_val="content")
else:
assign_termids_to_terms(con, fts_schema="fts_main_documents")
terms = con.sql("SELECT * FROM fts_main_documents.terms LIMIT 10").df()
print("fts_main_documents.terms:\n", terms)
update_docs_table(con, fts_schema="fts_main_documents")
docs = con.sql("SELECT * FROM fts_main_documents.docs LIMIT 10").df()
print("fts_main_documents.docs:\n", docs)
update_dict_table(con, fts_schema="fts_main_documents")
print("Updated fts_main_documents.dict with document frequencies.")
# Limit the dictionary to the `max_terms` most frequent terms
if limit > 0:
limit_dict_table(con, max_terms=limit, fts_schema="fts_main_documents")
create_terms_table(con, fts_schema="fts_main_documents", input_schema="main", input_table="documents", input_id="did", input_val="content")
update_dict_table(con, fts_schema="fts_main_documents")
print("Limited fts_main_documents.dict to 10000 most frequent terms.")
update_docs_table(con, fts_schema="fts_main_documents")
    dict_df = con.sql("SELECT * FROM fts_main_documents.dict LIMIT 10").df()
    print("fts_main_documents.dict:\n", dict_df)
# Remove unused words from dictionary
    con.sql('''
        DELETE FROM fts_main_documents.dict
        WHERE df = 0;
    ''')
create_stats_table(con, fts_schema="fts_main_documents", index_type="standard", stemmer=stemmer)
stats = con.sql("SELECT * FROM fts_main_documents.stats").df()
print("fts_main_documents.stats:\n", stats)
create_fields_table(con, fts_schema="fts_main_documents")
    create_lm(con, stemmer)
    if not keepcontent:
        con.sql("ALTER TABLE documents DROP COLUMN content")
    con.close()
if __name__ == "__main__":
import argparse
import ze_eval
import os
parser = argparse.ArgumentParser(description="Manual index builder for IR datasets.")
parser.add_argument('--db', type=str, default='testje_docs.db', help='Database file name')
parser.add_argument('--dataset', type=str, default='cranfield', help='ir_datasets name (e.g., cranfield, msmarco-passage)')
parser.add_argument('--stemmer', type=str, default='none', help='Stemmer to use (none, porter, etc.)')
parser.add_argument('--stopwords', type=str, default='english', help='Stopwords to use (english, none)')
parser.add_argument('--mode', type=str, default='duckdb', help='Indexing mode (duckdb, ngrams, phrases, gpt4)')
parser.add_argument('--keepcontent', action='store_true', help='Keep document content')
parser.add_argument('--limit', type=int, default=10000, help='Limit the number of terms in the dictionary')
parser.add_argument('--min-freq', type=int, default=10, help='Minimum frequency for phrases (only for mode "phrases")')
parser.add_argument('--min-pmi', type=float, default=5.0, help='Minimum PMI for phrases (only for mode "phrases")')
args = parser.parse_args()
    if args.dataset == 'custom':
        dataset = ze_eval.ir_dataset_test()
    else:
        dataset = ir_datasets.load(args.dataset)
db_name = args.db
if os.path.exists(db_name):
print(f"Removing {db_name}")
os.remove(db_name)
print("Creating index...")
index_documents(
db_name,
dataset,
stemmer=args.stemmer,
stopwords=args.stopwords,
keepcontent=args.keepcontent,
mode=args.mode,
limit=args.limit,
min_freq=args.min_freq,
min_pmi=args.min_pmi
)
print("")

137
phrases_extractor.py Normal file

@@ -0,0 +1,137 @@
import duckdb
from collections import Counter
def create_tokenizer_duckdb(con):
con.sql("""
CREATE TEMPORARY MACRO tokenize(s) AS (
string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
);
""")
def extract_phrases(documents, n=2, min_freq=2, db_path='phrases.db'):
con = duckdb.connect(database=db_path)
create_tokenizer_duckdb(con)
# Load documents into DuckDB table
con.execute("CREATE TEMP TABLE docs AS SELECT * FROM (VALUES " +
",".join(["(?, ?)"] * len(documents)) +
") AS t(doc_id, text)", [item for pair in documents for item in pair])
# Tokenize and flatten tokens in DuckDB
tokens_df = con.sql("""
SELECT doc_id, unnest(tokenize(text)) AS token
FROM docs
""").df()
# Generate n-grams in Python
token_counter = Counter()
ngram_counter = Counter()
grouped = tokens_df.groupby('doc_id')['token'].apply(list)
total_tokens = 0
for token_list in grouped:
total_tokens += len(token_list)
token_counter.update(token_list)
ngrams = zip(*[token_list[i:] for i in range(n)])
ngram_counter.update(ngrams)
# Extract frequent phrases
phrases = [" ".join(ngram) for ngram, freq in ngram_counter.items() if freq >= min_freq]
return phrases
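# Minimal usage sketch (hypothetical toy data; note this opens a scratch
# database file 'phrases.db' via the db_path default):
#
#   docs = [('d1', 'machine learning for information retrieval'),
#           ('d2', 'machine learning with duckdb')]
#   extract_phrases(docs, n=2, min_freq=2)  # -> ['machine learning']
#
# Only 'machine learning' occurs at least min_freq=2 times across the
# collection, so it is the only bigram returned.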
def extract_phrases_pmi_duckdb(con, fts_schema, n=2, min_freq=2, min_pmi=3.0):
# 1. Create a tokenized table
con.execute(f"""CREATE OR REPLACE TABLE {fts_schema}.tokens AS
SELECT
did AS doc_id,
unnest({fts_schema}.tokenize(content)) AS token
FROM
documents;
""")
print("Tokenized documents:\n", con.execute(f"SELECT * FROM {fts_schema}.tokens LIMIT 10").fetchall())
# 2. Add position index for each token in its document
con.execute(f"""
CREATE OR REPLACE TABLE {fts_schema}.tokens_pos AS
SELECT doc_id, token,
ROW_NUMBER() OVER (PARTITION BY doc_id ORDER BY rowid) AS pos
FROM {fts_schema}.tokens
""")
# 3. Compute total token count
total_tokens = con.execute(f"SELECT COUNT(*)::DOUBLE FROM {fts_schema}.tokens_pos").fetchone()[0]
# 4. Compute token frequencies
con.execute(f"""
CREATE OR REPLACE TABLE {fts_schema}.token_freq AS
SELECT token,
COUNT(*) AS freq,
COUNT(DISTINCT doc_id) AS doc_freq
FROM {fts_schema}.tokens_pos
GROUP BY token
""")
print("Token frequency:\n", con.execute(f"SELECT * FROM {fts_schema}.token_freq LIMIT 10").fetchall())
# 5. Compute bigrams (or n-grams)
con.execute(f"""
CREATE OR REPLACE TABLE {fts_schema}.ngrams AS
SELECT t1.token AS w1, t2.token AS w2,
t1.doc_id AS doc_id
FROM {fts_schema}.tokens_pos t1
JOIN {fts_schema}.tokens_pos t2
ON t1.doc_id = t2.doc_id AND t2.pos = t1.pos + 1
""")
# 6. Compute n-gram frequencies
con.execute(f"""
CREATE OR REPLACE TABLE {fts_schema}.ngram_freq AS
SELECT w1, w2, COUNT(*) AS freq,
COUNT(DISTINCT doc_id) AS doc_freq
FROM {fts_schema}.ngrams
GROUP BY w1, w2
HAVING COUNT(*) >= {min_freq}
""")
print("N-gram frequency:\n", con.execute(f"SELECT * FROM {fts_schema}.ngram_freq LIMIT 10").fetchall())
print(f"Number of n-grams: {con.execute(f'SELECT COUNT(*) FROM {fts_schema}.ngram_freq').fetchone()[0]}")
# 7. Compute PMI for bigrams
con.execute(f"""
CREATE OR REPLACE TABLE {fts_schema}.phrases AS
SELECT w1 || ' ' || w2 AS phrase,
LOG(n.freq * {total_tokens} / (f1.freq * f2.freq)) / LOG(2) AS pmi,
n.doc_freq AS df
FROM {fts_schema}.ngram_freq n
JOIN {fts_schema}.token_freq f1 ON n.w1 = f1.token
JOIN {fts_schema}.token_freq f2 ON n.w2 = f2.token
WHERE LOG(n.freq * {total_tokens} / (f1.freq * f2.freq)) / LOG(2) >= {min_pmi}
ORDER BY pmi DESC
""")
print("Extracted phrases:\n", con.execute(f"SELECT phrase, pmi, df FROM {fts_schema}.phrases LIMIT 10").fetchall())
print("Extracted tokens:\n", con.execute(f"SELECT token FROM {fts_schema}.token_freq LIMIT 10").fetchall())
# 8. Combine phrases and words
con.execute(f"""
CREATE OR REPLACE TABLE {fts_schema}.dict AS
SELECT ROW_NUMBER() OVER () AS termid, phrase as term, df
FROM {fts_schema}.phrases
WHERE NOT EXISTS (
SELECT 1 FROM UNNEST(string_split(phrase, ' ')) AS word
WHERE word.unnest IN (SELECT sw FROM {fts_schema}.stopwords)
)
UNION ALL
SELECT ROW_NUMBER() OVER () + (SELECT COUNT(*) FROM {fts_schema}.phrases) AS termid, token AS term, doc_freq AS df
FROM {fts_schema}.token_freq
WHERE token NOT IN (SELECT sw FROM {fts_schema}.stopwords)
AND freq >= {min_freq}
""")
print("Phrases:\n", con.execute(f"SELECT term, df FROM {fts_schema}.dict LIMIT 10").fetchall())
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.tokens_pos")
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.token_freq")
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.ngrams")
con.execute(f"DROP TABLE IF EXISTS {fts_schema}.ngram_freq")

BIN
testje_docs Normal file

Binary file not shown.

132
ze_eval.py Normal file

@@ -0,0 +1,132 @@
import pathlib
import os
import ir_datasets
class ir_dataset_test:
class Doc:
def __init__(self, doc_id, text):
self.doc_id = doc_id
self.text = text
class Query:
def __init__(self, query_id, text):
self.query_id = query_id
self.text = text
class Qrel:
def __init__(self, query_id, doc_id, relevance):
self.query_id = query_id
self.doc_id = doc_id
self.relevance = relevance
    # Custom documents
doc1 = Doc('d1', 'Custom document one about information retrieval.')
doc2 = Doc('d2', 'Custom document two about machine learning.')
doc3 = Doc('d3', 'Custom document three about artificial intelligence.')
doc4 = Doc('d4', 'Custom-document FOUR about INFORMATION-RETRIEVAL and its applications.')
doc5 = Doc('d5', 'Another custom document, artificial intelligence with punctuation! And special characters like @#$%.')
doc6 = Doc('d6', 'Machine-learning is artificial amazing; it combines AI, data-science, and more.')
doc7 = Doc('d7', 'Information retrieval is the backbone of search engines and academic research.')
doc8 = Doc('d8', 'Machine learning has become a core part of artificial intelligence.')
doc9 = Doc('d9', 'Artificial intelligence artificial kip saté and machine learning are fields with significant overlap.')
doc10 = Doc('d10', 'Machine learning is a subfield of artificial intelligence focused on data.')
doc11 = Doc('d11', 'The process of information retrieval includes indexing and ranking documents.')
doc12 = Doc('d12', 'Many AI systems rely on both machine learning and information retrieval.')
doc13 = Doc('d13', 'Artificial intelligence kip saté is widely used in natural language processing and robotics.')
doc14 = Doc('d14', 'Information retrieval systems are essential for finding relevant documents.')
doc15 = Doc('d15', 'Machine learning algorithms adapt based on data patterns.')
doc16 = Doc('d16', 'Artificial intelligence kip saté applications range from games to healthcare.')
doc17 = Doc('d17', 'Information retrieval helps systems return relevant search results.')
doc18 = Doc('d18', 'Machine learning and artificial intelligence are driving modern technology.')
doc19 = Doc('d19', 'Artificial intelligence is often combined with information retrieval to build smart assistants.')
doc20 = Doc('d20', 'The in the over at on Advanced machine learning techniques artificial intelligence are part of the artificial intelligence stack.')
docs = [doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10,
doc11, doc12, doc13, doc14, doc15, doc16, doc17, doc18, doc19, doc20]
# Custom queries
query1 = Query('1', 'information retrieval')
query2 = Query('2', 'machine learning')
query3 = Query('3', 'artificial intelligence')
queries = [query1, query2, query3]
# Custom relevance judgments
qrel1 = Qrel('1', 'd1', 2)
qrel2 = Qrel('2', 'd2', 1)
qrel3 = Qrel('3', 'd3', 1)
qrels = [qrel1, qrel2, qrel3]
def docs_count(self):
return len(self.docs)
def docs_iter(self):
return self.docs
def queries_iter(self):
return self.queries
def qrels_iter(self):
return self.qrels
def file_exists(name_in):
return pathlib.Path(name_in).is_file()
def get_qrels(experiment):
if experiment == "custom":
qrel_file = "custom.qrels"
if not pathlib.Path(qrel_file).is_file():
with open(qrel_file, 'w') as file:
for q in ir_dataset_test().qrels_iter():
line = q.query_id + ' Q0 ' + q.doc_id + " " + str(q.relevance)
file.write(line + '\n')
return qrel_file
if pathlib.Path(experiment).is_file(): # provide a qrels file directly...
return experiment
ir_dataset = ir_datasets.load(experiment) # ... or an ir_dataset
ir_dataset_qrels = ir_dataset.qrels_iter()
qrel_file = experiment + '.qrels'
qrel_file = qrel_file.replace('/', '_')
if not pathlib.Path(qrel_file).is_file():
with open(qrel_file, 'w') as file:
for q in ir_dataset_qrels:
line = q.query_id + ' Q0 ' + q.doc_id + " " + str(q.relevance)
file.write(line + '\n')
return qrel_file
def trec_eval(run_name, experiment, complete_rel=False,
ndcg=False, query_eval=False):
qrel_file = get_qrels(experiment)
switches = '-m official'
if ndcg:
switches += ' -m ndcg_cut'
if complete_rel:
switches += ' -c'
if query_eval:
switches += ' -q'
command = f"trec_eval {switches} {qrel_file} {run_name}"
print(command)
os.system(command)
# After running trec_eval, compute and print average postings cost if available in run file
try:
with open(run_name, 'r') as f:
postings_costs = {}
for line in f:
parts = line.strip().split()
if len(parts) >= 7:
query_id = parts[0]
try:
cost = float(parts[6])
if query_id not in postings_costs:
postings_costs[query_id] = cost
except Exception:
continue
if postings_costs:
avg_cost = sum(postings_costs.values()) / len(postings_costs)
print(f"Average cost in postings: {avg_cost:.4f}")
print(f"Total postings cost: {sum(postings_costs.values()):.4f}")
except Exception:
pass

141
ze_index.py Normal file

@@ -0,0 +1,141 @@
"""
Zoekeend indexer.
Author: Djoerd Hiemstra
"""
import pathlib
import sys
import duckdb
import ir_datasets
def normalize(text):
""" Escape quotes for SQL """
return text.replace("'", "''")
def create_lm(con, stemmer):
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_lm(query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS TABLE (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, docs.len AS doc_len, term_tf.termid, term_tf.tf, qtermids.df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * docs.len)) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE ((term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid))
),
scores AS (
SELECT docs.name AS docname, LN(MAX(doc_len)) + sum(subscore) AS score FROM subscores, fts_main_documents.docs AS docs WHERE subscores.docid = docs.docid GROUP BY docs.name
),
postings_cost AS (
SELECT COUNT(DISTINCT docid) AS cost FROM qterms
)
SELECT docname, score, (SELECT cost FROM postings_cost) AS postings_cost FROM scores
);
""")
def insert_dataset(con, ir_dataset, logging=True):
"""
Insert documents from an ir_dataset. Works with several datasets.
Add document attributes if needed.
"""
con.sql('CREATE TABLE documents (did TEXT, content TEXT)')
insert = 'INSERT INTO documents(did, content) VALUES '
sql = insert
part = 0
total = 0
count = ir_dataset.docs_count()
if logging:
print(f"Inserting {count} docs...", file=sys.stderr)
for doc in ir_dataset.docs_iter():
doc_text = ""
if hasattr(doc, 'title'):
doc_text = doc.title
if hasattr(doc, 'body'):
doc_text += " " + doc.body
if hasattr(doc, 'text'):
doc_text += " " + doc.text
sql += "('" + doc.doc_id + "','" + normalize(doc_text) + "'),"
part += 1
if part > 9999:
total += part
if logging:
print(str(total) + " docs", file=sys.stderr)
con.sql(sql)
part = 0
sql = insert
    if part > 0:  # flush the remaining docs; skip if the last batch was exact
        con.sql(sql)
def index_documents(db_name, ir_dataset, stemmer='none', stopwords='none',
logging=True, keepcontent=False):
"""
Insert and index documents.
"""
if pathlib.Path(db_name).is_file():
raise ValueError(f"File {db_name} already exists.")
con = duckdb.connect(db_name)
insert_dataset(con, ir_dataset, logging)
if logging:
print("Indexing...", file=sys.stderr)
con.sql(f"""
PRAGMA create_fts_index('documents', 'did', 'content', stemmer='{stemmer}',
stopwords='{stopwords}')
""")
con.sql(f"""
ALTER TABLE fts_main_documents.stats ADD sumdf BIGINT;
UPDATE fts_main_documents.stats SET sumdf =
(SELECT SUM(df) FROM fts_main_documents.dict);
ALTER TABLE fts_main_documents.stats ADD index_type TEXT;
UPDATE fts_main_documents.stats SET index_type = 'standard';
ALTER TABLE fts_main_documents.stats ADD stemmer TEXT;
UPDATE fts_main_documents.stats SET stemmer = '{stemmer}';
""")
create_lm(con, stemmer)
if not keepcontent:
con.sql("ALTER TABLE documents DROP COLUMN content")
con.close()
if __name__ == "__main__":
    import ze_eval
    import os
    # For a quick test with the toy dataset instead of cranfield, use:
    # dataset = ze_eval.ir_dataset_test()
    dataset = ir_datasets.load("cranfield")
if os.path.exists('testje_docs.db'):
os.remove('testje_docs.db')
index_documents('testje_docs.db', dataset, stemmer='none', stopwords='none',
keepcontent=False)

115
ze_index_export.py Normal file

@@ -0,0 +1,115 @@
"""
Zoekeend CIFF exporter
Author: Gijs Hendriksen
"""
from typing import Iterable, Type, TypeVar
import duckdb
from ciff_toolkit.write import CiffWriter
from ciff_toolkit.ciff_pb2 import Header, PostingsList, DocRecord
from google.protobuf.message import Message
from tqdm import tqdm
M = TypeVar('M', bound=Message)
def _create_message_from_row(row: tuple | dict, message_type: Type[M]) -> M:
if isinstance(row, tuple):
mapping = zip(message_type.DESCRIPTOR.fields, row)
else:
mapping = [(field, row[field.name]) for field in message_type.DESCRIPTOR.fields]
msg = message_type()
for field, value in mapping:
if field.label == field.LABEL_REPEATED:
for x in value:
getattr(msg, field.name).append(_create_message_from_row(x, field.message_type._concrete_class))
else:
setattr(msg, field.name, value)
return msg
def create_protobuf_messages_from_result(result: duckdb.DuckDBPyRelation, message_type: Type[M], batch_size: int = 1024) -> Iterable[M]:
try:
import protarrow
for batch in result.fetch_arrow_reader(batch_size):
yield from protarrow.record_batch_to_messages(batch, message_type)
except ImportError:
while batch := result.fetchmany(batch_size):
for row in batch:
yield _create_message_from_row(row, message_type)
def create_ciff_header(conn: duckdb.DuckDBPyConnection, description: str) -> Header:
header_info = conn.execute("""
SELECT
1 AS version,
(SELECT COUNT(*) FROM fts_main_documents.dict) AS num_postings_lists,
num_docs,
(SELECT COUNT(*) FROM fts_main_documents.dict) AS total_postings_lists,
num_docs AS total_docs,
(SELECT SUM(len) FROM fts_main_documents.docs)::BIGINT AS total_terms_in_collection,
avgdl AS average_doclength,
? AS description,
FROM fts_main_documents.stats
""", [description])
header, = create_protobuf_messages_from_result(header_info, Header)
return header
def create_ciff_postings_lists(conn: duckdb.DuckDBPyConnection, batch_size: int = 1024) -> Iterable[PostingsList]:
postings_info = conn.sql("""
WITH postings AS (
SELECT termid, docid, COUNT(*) AS tf
FROM fts_main_documents.terms
GROUP BY ALL
),
gapped_postings AS (
SELECT *, docid - lag(docid, 1, 0) OVER (PARTITION BY termid ORDER BY docid) AS gap
FROM postings
),
grouped_postings AS (
SELECT termid, list(row(gap, tf)::STRUCT(docid BIGINT, tf BIGINT) ORDER BY docid) AS postings, SUM(tf)::BIGINT AS cf
FROM gapped_postings
GROUP BY termid
)
SELECT term, df, cf, postings
FROM grouped_postings
JOIN fts_main_documents.dict USING (termid)
ORDER BY term;
""")
yield from create_protobuf_messages_from_result(postings_info, PostingsList, batch_size=batch_size)
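# Note that the postings lists are delta-encoded: the 'gap' column stores
# docid differences within each termid partition, as the CIFF format expects;
# the importer in ze_index_import.py undoes this during gap decompression.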
def create_ciff_doc_records(conn: duckdb.DuckDBPyConnection, batch_size: int = 1024) -> Iterable[DocRecord]:
docs_info = conn.sql("""
SELECT
docid,
name AS collection_docid,
len AS doclength,
FROM fts_main_documents.docs
ORDER BY collection_docid
""")
yield from create_protobuf_messages_from_result(docs_info, DocRecord, batch_size=batch_size)
def ciff_export(db_name: str, file_name: str, description: str, batch_size: int = 1024):
with duckdb.connect(db_name) as conn, CiffWriter(file_name) as writer:
header = create_ciff_header(conn, description)
print(header)
writer.write_header(header)
writer.write_postings_lists(tqdm(create_ciff_postings_lists(conn, batch_size=batch_size), total=header.num_postings_lists,
desc='Writing posting lists', unit='pl'))
writer.write_documents(tqdm(create_ciff_doc_records(conn, batch_size=batch_size), total=header.num_docs,
desc='Writing documents', unit='d'))
if __name__ == '__main__':
ciff_export('index.db', 'index-copy.ciff.gz', 'OWS.eu index', batch_size=2**12)

286
ze_index_import.py Normal file

@@ -0,0 +1,286 @@
"""
CIFF importer
Author: Arjen P. de Vries
Adapted from: https://github.com/arjenpdevries/CIFF2DuckDB
"""
import duckdb
import pyarrow as pa
from ciff_toolkit.read import CiffReader
from ciff_toolkit.ciff_pb2 import DocRecord, Header, PostingsList
from google.protobuf.json_format import MessageToJson, MessageToDict
from typing import Iterable
pbopt = {"including_default_value_fields": True,
"preserving_proto_field_name": True}
def iter_posting_batches(reader: Iterable[PostingsList]):
"""
Generator for reading batches of postings
Note: Term identifiers handed out here, while reading term-posting
pairs from the CIFF file
"""
batch = []
for tid, p in enumerate(reader.read_postings_lists()):
pp = MessageToDict(p, **pbopt)
pp['termid']=tid
        # Gap decompression: CIFF stores docid deltas, so restore absolute docids
        decoded, prev = [], 0
        for posting in pp['postings']:
            prev += posting['docid']
            decoded.append({"docid": prev, "tf": posting['tf']})
        pp['postings'] = decoded
batch.append(pp)
if len(batch) == 4096:
yield pa.RecordBatch.from_pylist(batch)
batch = []
yield pa.RecordBatch.from_pylist(batch)
def iter_docs_batches(reader: Iterable[DocRecord]):
""" Generator for reading batches of docs """
batch = []
for doc in reader.read_documents():
batch.append(MessageToDict(doc, **pbopt))
if len(batch) == 8192:
yield pa.RecordBatch.from_pylist(batch)
batch = []
yield pa.RecordBatch.from_pylist(batch)
def ciff_arrow(con, file_name, stemmer):
""" Use CIFFReader to create RecordBatches for table (using Arrow) """
# Schema: manually defined
# (alternative: protarrow could create the datastructure from the proto definition)
postings_schema = pa.schema([
("term", pa.string()),
("termid", pa.int64()),
("df", pa.int64()),
("cf", pa.int64()),
("postings", pa.list_(pa.struct([
("docid", pa.int32()),
("tf", pa.int32())
])))
])
docs_schema = pa.schema([
("docid", pa.int32()),
("collection_docid", pa.string()),
("doclength", pa.int32())
])
with CiffReader(file_name) as reader:
# Header info: TBD
h = reader.read_header()
header = MessageToJson(h, **pbopt)
con.execute(f"""
CREATE TABLE stats(num_docs BIGINT, avgdl DOUBLE, sumdf BIGINT, index_type TEXT, stemmer TEXT);
INSERT INTO stats(num_docs, avgdl, index_type, stemmer) VALUES
({h.num_docs}, {h.average_doclength}, 'standard', '{stemmer}');
""")
# RecordBatches for postings to an Arrow Datastructure
postings_rb = iter_posting_batches(reader)
postings_rbr = pa.ipc.RecordBatchReader.from_batches(postings_schema, postings_rb)
# Create a DuckDB table from the Arrow data
con.execute("CREATE TABLE ciff_postings AS SELECT * FROM postings_rbr;")
# RecordBatches for docs to an Arrow Datastructure
docs_rb = iter_docs_batches(reader)
docs_rbr = pa.ipc.RecordBatchReader.from_batches(docs_schema, docs_rb)
# Create a DuckDB table from the Arrow data
# Dropping cf here because DuckDB FTS does not use it
con.execute("""
CREATE TABLE docs AS SELECT docid::BIGINT AS docid, collection_docid AS name, doclength::BIGINT AS len FROM docs_rbr;
""")
def create_tokenizer(con, tokenizer):
if tokenizer == 'ciff':
create_tokenizer_ciff(con)
elif tokenizer == 'duckdb':
create_tokenizer_duckdb(con)
else:
raise ValueError(f"Unknown tokenizer: {tokenizer}")
def create_tokenizer_duckdb(con):
con.sql("""
CREATE MACRO fts_main_documents.tokenize(s) AS (
string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
);
""")
def create_tokenizer_ciff(con):
con.sql("""
CREATE MACRO fts_main_documents.tokenize(query_string) AS (
WITH RECURSIVE sequence AS (
SELECT range AS nr
FROM RANGE((SELECT MAX(LEN(term)) + 1 FROM fts_main_documents.dict))
),
simpledict AS (
SELECT '' AS term
UNION
SELECT term FROM fts_main_documents.dict
),
subterms(term, subquery) AS (
SELECT '', lower(strip_accents(CAST(query_string AS VARCHAR)))
UNION
SELECT MAX(dict.term), SUBSTRING(subquery,
-- MAX(dict.term) selects the longest term, for a
-- start position using alphabetic sorting
CASE WHEN MAX(nr) < 1 THEN 2 ELSE MAX(nr) + 1 END,
LEN(subquery)) AS subquery
FROM subterms, sequence, simpledict as dict
WHERE SUBSTRING(subquery, 1, nr) = dict.term
GROUP BY subquery
)
SELECT LIST(term) FROM subterms WHERE NOT term = ''
)
""")
def create_lm(con, stemmer):
con.sql(f"""
CREATE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * len)) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE ((term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid))
),
scores AS (
SELECT docid, LN(MAX(len)) + sum(subscore) AS score FROM subscores GROUP BY docid
)
SELECT score FROM scores, fts_main_documents.docs AS docs
WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
""")
def create_bm25(con, stemmer):
con.sql(f"""
CREATE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, conjunctive := 0, k := 1.2, fields := NULL) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df, (log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE ((term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid))
),
scores AS (
SELECT docid, sum(subscore) AS score FROM subscores GROUP BY docid
)
SELECT score FROM scores, fts_main_documents.docs AS docs
WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
""")
def ciff_import(db_name, file_name, tokenizer='ciff', stemmer='none'):
con = duckdb.connect(db_name)
con.execute("""
CREATE SCHEMA fts_main_documents;
USE fts_main_documents;
""")
ciff_arrow(con, file_name, stemmer)
con.execute("""
CREATE TABLE dict AS SELECT termid, term, df FROM ciff_postings;
CREATE TABLE fts_main_documents.fields(fieldid BIGINT, field VARCHAR);
CREATE TABLE terms(docid BIGINT, fieldid BIGINT, termid BIGINT);
WITH postings AS (
SELECT termid, unnest(postings, recursive := true)
FROM ciff_postings
)
INSERT INTO terms(docid, fieldid, termid)
SELECT docid, 0, termid
FROM postings, range(tf)
ORDER BY termid;
DROP TABLE ciff_postings;
CREATE TABLE main.documents AS SELECT DISTINCT name AS did FROM fts_main_documents.docs;
-- new stats
UPDATE fts_main_documents.stats SET sumdf = (SELECT SUM(df) FROM fts_main_documents.dict);
""")
create_tokenizer(con, tokenizer)
create_lm(con, stemmer)
create_bm25(con, stemmer)
con.close()
if __name__ == "__main__":
DB_NAME = "ciff-geesedb.db"
FILE_NAME = "geesedb.ciff.gz"
ciff_import(DB_NAME, FILE_NAME, tokenizer='ciff', stemmer='none')
# Only for testing:
# Query the index using the DuckDB tables
connect = duckdb.connect(DB_NAME)
connect.execute("USE fts_main_documents;")
results = connect.execute("SELECT termid FROM dict WHERE term LIKE '%radboud%' OR term LIKE '%university%'").arrow()
print(results)
results = connect.execute("SELECT * FROM terms WHERE termid IN (select termid FROM dict WHERE term LIKE '%radboud%' OR term LIKE '%university%')").arrow()
print(results)

198
ze_reindex_const.py Normal file

@@ -0,0 +1,198 @@
import duckdb
import pathlib
import sys
def copy_file(name_in, name_out):
path1 = pathlib.Path(name_in)
if not(path1.is_file()):
raise ValueError(f"File {name_in} does not exist.")
path2 = pathlib.Path(name_out)
if path2.is_file():
raise ValueError(f"File {name_out} already exists.")
path2.write_bytes(path1.read_bytes())
def get_stats_stemmer(con):
sql = "SELECT stemmer FROM fts_main_documents.stats"
return con.sql(sql).fetchall()[0][0]
def replace_bm25_const(con, stemmer):
""" New version of BM25; assuming that const_len=avgdl, the document
length normalization part disappears and the ranking function
becomes BM1 from Robertson and Walker's SIGIR 1994 paper.
"""
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, term_tf.termid, tf, df,
(log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + k))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE (term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid)
),
scores AS (
SELECT docid, sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE (scores.docid = docs.docid) AND (docs."name" = docname)
)
""")
def get_sql_selects(con):
try:
con.sql('SELECT prior FROM fts_main_documents.docs')
except duckdb.duckdb.BinderException:
pass
else: # there is a prior column (from reindex_prior)
return ("docs.prior,", "LN(ANY_VALUE(prior)) +")
try:
con.sql('SELECT slope FROM fts_main_documents.stats')
except duckdb.duckdb.BinderException:
pass
else: # there is a slope column (from reindex_fitted)
return ("", "(LN(docid)*(SELECT ANY_VALUE(slope) FROM fts_main_documents.stats)) +")
return ("", "")
def replace_lm_const(con, stemmer, const_len):
""" This is a language model matcher where len is replaced by a constant.
It uses the prior column or fitted score, if present in the old index.
"""
(subscores_select, scores_select) = get_sql_selects(con) # adapt to previous index
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict,
tokens
WHERE dict.term = tokens.t
),
qterms AS (
SELECT termid,
docid
FROM fts_main_documents.terms AS terms
WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
AND termid IN (SELECT qtermids.termid FROM qtermids)
),
term_tf AS (
SELECT termid, docid, COUNT(*) AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
),
subscores AS (
SELECT {subscores_select} docs.docid, term_tf.termid, term_tf.tf, qtermids.df,
LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * (SELECT ANY_VALUE(const_len) FROM fts_main_documents.stats))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE term_tf.docid = cdocs.docid
AND term_tf.docid = docs.docid
AND term_tf.termid = qtermids.termid
),
scores AS (
SELECT docid, {scores_select} sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE scores.docid = docs.docid
AND docs.name = docname
)
""")
def reindex_const(name_in, name_out, const_len=400, b=1, keep_terms=False, maxp=1.0):
copy_file(name_in, name_out)
con = duckdb.connect(name_out)
max_tf = int(const_len * maxp)
if keep_terms:
new_tf = 'CASE WHEN tf > 0.5 THEN tf - 0.5 ELSE 0.1 END'
else:
new_tf = 'tf - 0.5'
con.sql(f"""
CREATE TABLE fts_main_documents.terms_new (
docid BIGINT, fieldid BIGINT, termid BIGINT);
WITH sequence AS (
SELECT range AS nr FROM RANGE({max_tf})
),
tf_new AS (
SELECT T.docid, T.fieldid, termid,
-- BM25-like length normalization:
COUNT(*) / (1 - {b} + {b} * (ANY_VALUE(D.len) / {const_len})) AS tf,
-- proper rounding, but do not remove terms:
{new_tf} AS new_tf
FROM fts_main_documents.terms T, fts_main_documents.docs D
WHERE T.docid = D.docid
GROUP BY T.docid, T.fieldid, T.termid
)
INSERT INTO fts_main_documents.terms_new
SELECT docid, fieldid, termid
FROM tf_new, sequence WHERE sequence.nr < tf_new.new_tf;
DROP TABLE fts_main_documents.terms;
ALTER TABLE fts_main_documents.terms_new RENAME TO terms;
UPDATE fts_main_documents.stats
SET index_type = 'const(len={const_len},b={b})';
ALTER TABLE fts_main_documents.stats ADD const_len BIGINT;
UPDATE fts_main_documents.stats SET const_len = {const_len};
-- really remove len column
ALTER TABLE fts_main_documents.docs DROP COLUMN len;
""")
stemmer = get_stats_stemmer(con)
replace_bm25_const(con, stemmer)
replace_lm_const(con, stemmer, const_len)
con.close()
if __name__ == "__main__":
reindex_const('robustZE.db', 'robustZEfitted01.db', const_len=500, maxp=0.1)

383
ze_reindex_fitted.py Normal file

@@ -0,0 +1,383 @@
import pathlib
import sys
import duckdb
import ir_datasets
def copy_file(name_in, name_out):
""" Simple file copy """
path1 = pathlib.Path(name_in)
if not path1.is_file():
raise ValueError(f"File {name_in} does not exist.")
path2 = pathlib.Path(name_out)
if path2.is_file():
raise ValueError(f"File {name_out} already exists.")
path2.write_bytes(path1.read_bytes())
def get_stats_stemmer(con):
""" What stemmer was used on this index? """
sql = "SELECT stemmer FROM fts_main_documents.stats"
return con.sql(sql).fetchall()[0][0]
def sample_by_values(con, column, threshold):
""" Takes one sample per unique value of len/prior. """
con.sql(f"""
CREATE VIEW sample AS
WITH histogram as (
SELECT "{column}", COUNT(*) AS count
FROM fts_main_documents.docs
WHERE "{column}" > {threshold}
GROUP BY "{column}"
)
SELECT LN(SUM(H2.count)) AS x, LN(H1."{column}") AS y
FROM histogram H1, histogram H2
WHERE H1."{column}" <= H2."{column}"
GROUP BY H1."{column}"
""")
def sample_by_fixed_points(con, column, threshold, total):
""" Takes {total} samples and averages len/prior for each. """
con.sql(f"""
CREATE VIEW sample AS
WITH groups AS (
SELECT (CASE WHEN range = 2 THEN 0 ELSE range END) *
LN(num_docs + 1) / ({total} + 2) AS group_start,
(range + 1) * LN(num_docs + 1) / ({total} + 2) AS group_end
FROM RANGE({total} + 2), fts_main_documents.stats
WHERE range > 1
)
SELECT (group_start + group_end) / 2 AS X, LN(AVG({column})) AS Y
FROM groups, fts_main_documents.docs AS docs
WHERE LN(docid + 1) >= group_start AND LN(docid + 1) < group_end
AND "{column}" > {threshold}
GROUP BY group_start, group_end
""")
def sample_by_fixed_points_qrels(con, total):
"""
Takes {total} samples and estimates the probability of relevance
from the provided qrels
"""
con.sql(f"""
CREATE VIEW sample AS
WITH groups AS (
SELECT (CASE WHEN range = 2 THEN 0 ELSE range END) *
LN(num_docs + 1) / ({total} + 2) AS group_start,
(range + 1) * LN(num_docs + 1) / ({total} + 2) AS group_end
FROM RANGE({total} + 2), fts_main_documents.stats
WHERE range > 1
)
SELECT (group_start + group_end) / 2 AS X,
LN(COUNT(*)/(EXP(group_end) - EXP(group_start))) AS Y
FROM groups, fts_main_documents.docs AS docs, qrels
WHERE LN(docid + 1) >= group_start AND LN(docid + 1) < group_end
AND docs.name = qrels.did
AND qrels.rel > 0
GROUP BY group_start, group_end
""")
def print_sample_tsv(con, total=None):
""" Prints sample for drawing nice graphs. """
result = con.sql("SELECT x, y FROM sample ORDER BY x").fetchall()
if total and len(result) != total:
print(f"Warning: less than {total} datapoints.", file=sys.stderr)
for (x, y) in result:
print(str(x) + "\t" + str(y))
def train_linear_regression(con):
""" Approximate sample by using linear regression. """
con.sql("""
WITH sums AS (
SELECT COUNT(*) AS N, SUM(x) AS Sx, SUM(y) AS Sy,
SUM(x*x) AS Sxx, SUM(x*y) AS Sxy
FROM sample
),
model AS (
SELECT (Sy*Sxx - Sx*Sxy) / (N*Sxx - Sx*Sx) AS intercept,
(N*Sxy - Sx*Sy) / (N*Sxx - Sx*Sx) AS slope
FROM sums
)
UPDATE fts_main_documents.stats AS stats
SET intercept = model.intercept, slope =
CASE WHEN model.slope < 0 THEN model.slope ELSE 0 END
FROM model
""")
def get_qrels_from_file(qrel_file):
inserts = []
with open(qrel_file, "r", encoding="ascii") as file:
for line in file:
            (query_id, _q0, doc_id, relevance) = line.split()
            if int(relevance) != 0:
inserts.append([query_id, doc_id, relevance])
return inserts
def get_qrels_from_ir_datasets(qrels_tag):
inserts = []
for q in ir_datasets.load(qrels_tag).qrels_iter():
if q.relevance != 0:
inserts.append([q.query_id, q.doc_id, q.relevance])
return inserts
def insert_qrels(con, qrels_tag):
con.sql("CREATE OR REPLACE TABLE main.qrels(qid TEXT, did TEXT, rel INT)")
try:
inserts = get_qrels_from_ir_datasets(qrels_tag)
except KeyError:
inserts = get_qrels_from_file(qrels_tag)
con.sql("BEGIN TRANSACTION")
con.executemany("INSERT INTO qrels VALUES (?, ?, ?)", inserts)
con.sql("COMMIT")
def replace_bm25_fitted_doclen(con, stemmer):
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict,
tokens
WHERE dict.term = tokens.t
),
qterms AS (
SELECT termid,
docid
FROM fts_main_documents.terms AS terms
WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
AND termid IN (SELECT qtermids.termid FROM qtermids)
),
term_tf AS (
SELECT termid, docid, COUNT(*) AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
),
subscores AS (
SELECT docs.docid, EXP(LN(docs.docid)*stats.slope + stats.intercept) AS newlen, term_tf.termid, tf, df, (log((((stats.num_docs - df) + 0.5) / (df + 0.5))) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (newlen / stats.avgdl))))))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids,
                 fts_main_documents.stats AS stats
WHERE term_tf.docid = cdocs.docid
AND term_tf.docid = docs.docid
AND term_tf.termid = qtermids.termid
),
scores AS (
SELECT docid, sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE scores.docid = docs.docid
AND docs.name = docname
)"""
)
def replace_lm_fitted_doclen(con, stemmer):
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict,
tokens
WHERE dict.term = tokens.t
),
qterms AS (
SELECT termid,
docid
FROM fts_main_documents.terms AS terms
WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
AND termid IN (SELECT qtermids.termid FROM qtermids)
),
term_tf AS (
SELECT termid, docid, COUNT(*) AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
),
subscores AS (
SELECT docs.docid, EXP(LN(docs.docid)*stats.slope + stats.intercept) AS newlen,
term_tf.termid, tf, df,
LN(1 + (lambda * tf * (SELECT sumdf FROM fts_main_documents.stats)) / ((1-lambda) * df * newlen)) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids,
fts_main_documents.stats AS stats
WHERE term_tf.docid = cdocs.docid
AND term_tf.docid = docs.docid
AND term_tf.termid = qtermids.termid
),
scores AS (
SELECT docid, LN(ANY_VALUE(newlen)) + sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE scores.docid = docs.docid
AND docs.name = docname
)"""
)
def replace_lm_fitted_prior(con, stemmer='none'):
"""
Only use fitted prior, but keep on using the old document lengths.
"""
sql = f"""
CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
WITH tokens AS (
SELECT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN fields IS NULL THEN 1 ELSE field IN (SELECT * FROM (SELECT UNNEST(string_split(fields, ','))) AS fsq) END
),
qtermids AS (
SELECT termid, df, COUNT(*) AS qtf
FROM fts_main_documents.dict AS dict,
tokens
WHERE dict.term = tokens.t
GROUP BY termid, df
),
qterms AS (
SELECT termid,
docid
FROM fts_main_documents.terms AS terms
WHERE CASE WHEN fields IS NULL THEN 1 ELSE fieldid IN (SELECT * FROM fieldids) END
AND termid IN (SELECT qtermids.termid FROM qtermids)
),
term_tf AS (
SELECT termid, docid, COUNT(*) AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN conjunctive THEN COUNT(DISTINCT termid) = (SELECT COUNT(*) FROM tokens) ELSE 1 END
),
subscores AS (
SELECT docs.docid, docs.len, term_tf.termid, term_tf.tf, qtermids.df,
qtermids.qtf * LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * len)) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE term_tf.docid = cdocs.docid
AND term_tf.docid = docs.docid
AND term_tf.termid = qtermids.termid
),
scores AS (
SELECT docid, (LN(docid)*(SELECT ANY_VALUE(slope) FROM fts_main_documents.stats)) + sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE scores.docid = docs.docid
AND docs.name = docname
)
"""
con.sql(sql)
def renumber_doc_ids(con, column):
con.sql(f"""
-- renumber document ids by decreasing len/prior column
CREATE TABLE fts_main_documents.docs_new AS
SELECT ROW_NUMBER() over (ORDER BY "{column}" DESC, name ASC) newid, docs.*
FROM fts_main_documents.docs AS docs;
-- update postings
CREATE TABLE fts_main_documents.terms_new AS
SELECT D.newid as docid, T.fieldid, T.termid
FROM fts_main_documents.terms T, fts_main_documents.docs_new D
WHERE T.docid = D.docid
ORDER BY T.termid;
-- replace old by new data
ALTER TABLE fts_main_documents.docs_new DROP COLUMN docid;
ALTER TABLE fts_main_documents.docs_new RENAME COLUMN newid TO docid;
DROP TABLE fts_main_documents.docs;
DROP TABLE fts_main_documents.terms;
ALTER TABLE fts_main_documents.docs_new RENAME TO docs;
ALTER TABLE fts_main_documents.terms_new RENAME TO terms;
UPDATE fts_main_documents.stats SET index_type = 'fitted';
""")
def reindex_fitted_column(name_in, name_out, column='prior', total=None,
print_sample=False, threshold=0, qrels=None):
if column not in ['len', 'prior']:
raise ValueError(f'Column "{column}" not allowed: use len or prior.')
copy_file(name_in, name_out)
con = duckdb.connect(name_out)
renumber_doc_ids(con, column)
try:
con.sql("""
ALTER TABLE fts_main_documents.stats ADD intercept DOUBLE;
ALTER TABLE fts_main_documents.stats ADD slope DOUBLE;
""")
except duckdb.duckdb.CatalogException as e:
print ("Warning: " + str(e), file=sys.stderr)
if qrels:
insert_qrels(con, qrels)
if total:
sample_by_fixed_points_qrels(con, total)
else:
raise ValueError("Not implemented.")
else:
if total:
sample_by_fixed_points(con, column, threshold, total)
else:
sample_by_values(con, column, threshold)
if print_sample:
print_sample_tsv(con, total)
train_linear_regression(con)
con.sql(f"""
DROP VIEW sample;
ALTER TABLE fts_main_documents.docs DROP COLUMN "{column}";
""")
stemmer = get_stats_stemmer(con)
if column == 'len':
replace_lm_fitted_doclen(con, stemmer=stemmer)
replace_bm25_fitted_doclen(con, stemmer=stemmer)
else:
replace_lm_fitted_prior(con, stemmer=stemmer)
con.close()
if __name__ == "__main__":
reindex_fitted_column('robustZE.db', 'robustZE_fitted20.db', column='len', total=None, print_sample=True, threshold=20, qrels=None)

112
ze_reindex_group.py Normal file

@@ -0,0 +1,112 @@
import duckdb
import pathlib
import sys
def copy_file(name_in, name_out):
path1 = pathlib.Path(name_in)
if not(path1.is_file()):
raise ValueError(f"File {name_in} does not exist.")
path2 = pathlib.Path(name_out)
if path2.is_file():
raise ValueError(f"File {name_out} already exists.")
path2.write_bytes(path1.read_bytes())
def get_stats_stemmer(con):
sql = "SELECT stemmer FROM fts_main_documents.stats"
return con.sql(sql).fetchall()[0][0]
def replace_bm25(con, stemmer):
""" The standard DuckDB BM25 implementation does not work with the grouped index.
This version also works with the standard DuckDB index.
"""
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_bm25(docname, query_string, b := 0.75, k := 1.2, conjunctive := 0, fields := NULL) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, len, term_tf.termid, tf, df,
(log((((((SELECT num_docs FROM fts_main_documents.stats) - df) + 0.5) / (df + 0.5)) + 1)) * ((tf * (k + 1)) / (tf + (k * ((1 - b) + (b * (len / (SELECT avgdl FROM fts_main_documents.stats)))))))) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE (term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid)
),
scores AS (
SELECT docid, sum(subscore) AS score
FROM subscores
GROUP BY docid
)
SELECT score
FROM scores, fts_main_documents.docs AS docs
WHERE (scores.docid = docs.docid) AND (docs."name" = docname)
)
""")
def reindex_group(name_in, name_out, stemmer='porter'):
copy_file(name_in, name_out)
con = duckdb.connect(name_out)
oldstemmer = get_stats_stemmer(con)
if oldstemmer != 'none':
print(f"Warning: stemmer {oldstemmer} was already used on this database")
con.sql(f"""
-- newdict gives stems unique ids
CREATE TABLE fts_main_documents.newdict AS
SELECT termid, term, stem(term, '{stemmer}') AS stem, DENSE_RANK() OVER (ORDER BY stem) AS newid, df
FROM fts_main_documents.dict;
DROP TABLE fts_main_documents.dict;
-- newterms uses those new ids
CREATE TABLE fts_main_documents.newterms AS
SELECT terms.docid, terms.fieldid, newdict.newid AS termid
FROM fts_main_documents.terms AS terms, fts_main_documents.newdict AS newdict
WHERE terms.termid = newdict.termid;
DROP TABLE fts_main_documents.terms;
ALTER TABLE fts_main_documents.newterms RENAME TO terms;
-- now remove old ids from dict table and compute new dfs.
CREATE TABLE fts_main_documents.dict AS
SELECT D.newid AS termid, D.term, COUNT(DISTINCT T.docid) AS df
FROM fts_main_documents.newdict D, fts_main_documents.terms T
WHERE T.termid = D.newid
GROUP BY D.newid, D.term;
DROP TABLE fts_main_documents.newdict;
-- update stats
UPDATE fts_main_documents.stats SET index_type = 'grouped({stemmer})';
""")
replace_bm25(con, oldstemmer)
con.close()
if __name__ == "__main__":
reindex_group('robustZE.db', 'robustZEgrouped.db')

114
ze_reindex_prior.py Normal file

@@ -0,0 +1,114 @@
import pathlib
import sys
import duckdb
def copy_file(name_in, name_out):
path1 = pathlib.Path(name_in)
if not path1.is_file():
raise ValueError(f"File {name_in} does not exist.")
path2 = pathlib.Path(name_out)
if path2.is_file():
raise ValueError(f"File {name_out} already exists.")
path2.write_bytes(path1.read_bytes())
def get_stats_stemmer(con):
sql = "SELECT stemmer FROM fts_main_documents.stats"
return con.sql(sql).fetchall()[0][0]
def replace_lm_prior(con, stemmer):
con.sql(f"""
CREATE OR REPLACE MACRO fts_main_documents.match_lm(docname, query_string, fields := NULL, lambda := 0.3, conjunctive := 0) AS (
WITH tokens AS (
SELECT DISTINCT stem(unnest(fts_main_documents.tokenize(query_string)), '{stemmer}') AS t
),
fieldids AS (
SELECT fieldid
FROM fts_main_documents.fields
WHERE CASE WHEN ((fields IS NULL)) THEN (1) ELSE (field = ANY(SELECT * FROM (SELECT unnest(string_split(fields, ','))) AS fsq)) END
),
qtermids AS (
SELECT termid, df
FROM fts_main_documents.dict AS dict, tokens
WHERE (dict.term = tokens.t)
),
qterms AS (
SELECT termid, docid
FROM fts_main_documents.terms AS terms
WHERE (CASE WHEN ((fields IS NULL)) THEN (1) ELSE (fieldid = ANY(SELECT * FROM fieldids)) END
AND (termid = ANY(SELECT qtermids.termid FROM qtermids)))
),
term_tf AS (
SELECT termid, docid, count_star() AS tf
FROM qterms
GROUP BY docid, termid
),
cdocs AS (
SELECT docid
FROM qterms
GROUP BY docid
HAVING CASE WHEN (conjunctive) THEN ((count(DISTINCT termid) = (SELECT count_star() FROM tokens))) ELSE 1 END
),
subscores AS (
SELECT docs.docid, prior, len, term_tf.termid, tf, df, LN(1 + (lambda * tf * (SELECT ANY_VALUE(sumdf) FROM fts_main_documents.stats)) / ((1-lambda) * df * len)) AS subscore
FROM term_tf, cdocs, fts_main_documents.docs AS docs, qtermids
WHERE ((term_tf.docid = cdocs.docid)
AND (term_tf.docid = docs.docid)
AND (term_tf.termid = qtermids.termid))
),
scores AS (
SELECT docid, LN(ANY_VALUE(prior)) + sum(subscore) AS score FROM subscores GROUP BY docid
)
SELECT score FROM scores, fts_main_documents.docs AS docs
WHERE ((scores.docid = docs.docid) AND (docs."name" = docname)))
""")
def insert_priors(con, csv_file, default):
con.sql(f"""
UPDATE fts_main_documents.docs AS docs
SET prior = priors.prior
        FROM read_csv('{csv_file}') AS priors
WHERE docs.name = priors.did
""")
    if default is not None:
con.sql(f"""
UPDATE fts_main_documents.docs
SET prior = {default}
WHERE prior IS NULL
""")
else:
count = con.sql("""
SELECT COUNT(*)
FROM fts_main_documents.docs
WHERE prior IS NULL
""").fetchall()[0][0]
if count > 0:
print(f"Warning: {count} rows missing from file. Use --default", file=sys.stderr)
def reindex_prior(name_in, name_out, csv_file=None, default=None, init=None):
copy_file(name_in, name_out)
con = duckdb.connect(name_out)
con.sql("ALTER TABLE fts_main_documents.docs ADD prior DOUBLE")
if (csv_file and init):
print(f"Warning: init={init} ignored.", file=sys.stderr)
if csv_file:
insert_priors(con, csv_file, default)
elif init:
if init == 'len':
con.sql("UPDATE fts_main_documents.docs SET prior = len")
elif init == 'uniform':
con.sql("UPDATE fts_main_documents.docs SET prior = 1")
else:
raise ValueError(f'Unknown value for init: {init}')
stemmer = get_stats_stemmer(con)
replace_lm_prior(con, stemmer=stemmer)
con.close()
if __name__ == "__main__":
reindex_prior('cran.db', 'cran_prior.db', csv_file='test_priors.csv')

99
ze_search.py Normal file

@@ -0,0 +1,99 @@
"""
Zoekeend searcher.
Author: Djoerd Hiemstra
"""
import sys
import duckdb
import ir_datasets
def duckdb_search_lm(con, query, limit):
sql = """
SELECT docname, score, postings_cost
FROM fts_main_documents.match_lm($1)
ORDER BY score DESC
LIMIT $2
"""
return con.execute(sql, [query, limit]).fetchall()
def duckdb_search_bm25(con, query, limit, b, k):
sql = """
SELECT did, score
FROM (
SELECT did, fts_main_documents.match_bm25(did, $1, b=$2, k=$3) AS score
FROM documents) sq
WHERE score IS NOT NULL
ORDER BY score DESC
LIMIT $4
"""
return con.execute(sql, [query, b, k, limit]).fetchall()
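# Hypothetical direct usage against an existing index (parameter values
# mirror the CLI defaults defined in the zoekeend script below):
#
#   con = duckdb.connect('cran.db', read_only=True)
#   hits = duckdb_search_lm(con, 'boundary layer flow', 10)
#   hits = duckdb_search_bm25(con, 'boundary layer flow', 10, b=0.4, k=0.9)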
class Query:
def __init__(self, query_id, text):
self.query_id = query_id
self.text = text
def get_queries_from_file(query_file):
with open(query_file, "r") as file:
for line in file:
            (query_id, text) = line.rstrip('\n').split('\t', 1)
yield Query(query_id, text)
def get_queries(query_tag):
if query_tag == "custom":
from ze_eval import ir_dataset_test
return ir_dataset_test().queries_iter()
try:
return ir_datasets.load(query_tag).queries_iter()
except KeyError:
pass
return get_queries_from_file(query_tag)
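# A query file passed to get_queries_from_file holds one query per line as a
# tab-separated (query_id, text) pair; hypothetical lines in the style of the
# Cranfield queries:
#
#   1<TAB>what similarity laws must be obeyed when constructing aeroelastic models
#   2<TAB>what are the structural and aeroelastic problems associated with flight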
def search_run(db_name, query_tag, matcher='lm', run_tag=None,
b=0.75, k=1.2, limit=1000, fileout=None,
startq=None, endq=None):
con = duckdb.connect(db_name, read_only=True)
if fileout:
file = open(fileout, "w")
else:
file = sys.stdout
if not run_tag:
run_tag = matcher
queries = get_queries(query_tag)
for query in queries:
qid = query.query_id
if (startq and int(qid) < startq) or (endq and int(qid) > endq):
continue
if hasattr(query, 'title'):
q_string = query.title
else:
q_string = query.text
if matcher == 'lm':
hits = duckdb_search_lm(con, q_string, limit)
elif matcher == 'bm25':
hits = duckdb_search_bm25(con, q_string, limit, b, k)
else:
raise ValueError(f"Unknown match function: {matcher}")
for rank, (docno, score, postings_cost) in enumerate(hits):
file.write(f'{qid} Q0 {docno} {rank} {score} {run_tag} {postings_cost}\n')
    con.close()
    if fileout:
        file.close()
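# Each line written above follows the six-column TREC run format plus a
# trailing postings-cost column; a hypothetical line of output:
#
#   1 Q0 184 0 11.7305 lm 532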
if __name__ == "__main__":
search_run('cran.db', 'cranfield.tsv')

49
ze_vacuum.py Normal file
View File

@ -0,0 +1,49 @@
import duckdb
import pathlib
def copy_file_force(name_in, name_out):
path1 = pathlib.Path(name_in)
    if not path1.is_file():
raise ValueError(f"File {name_in} does not exist.")
path2 = pathlib.Path(name_out)
path2.write_bytes(path1.read_bytes())
def rm_file(name):
path = pathlib.Path(name)
path.unlink()
def cluster_index(con):
con.sql("""
USE fts_main_documents;
CREATE TABLE terms_new AS SELECT * FROM terms ORDER BY termid, docid;
DROP TABLE terms;
ALTER TABLE terms_new RENAME TO terms;
CREATE TABLE dict_new AS SELECT * FROM dict ORDER BY term;
DROP TABLE dict;
ALTER TABLE dict_new RENAME TO dict;
CREATE TABLE docs_new AS SELECT * FROM docs ORDER BY docid;
DROP TABLE docs;
ALTER TABLE docs_new RENAME TO docs;
""")
def reclaim_disk_space(name, cluster=True):
    # Unfortunately, DuckDB does not reclaim disk space automatically,
    # so we rewrite the database by copying it to a fresh file.
tmpname = name + '.tmp'
copy_file_force(name, tmpname)
con = duckdb.connect(tmpname)
if cluster:
cluster_index(con)
rm_file(name)
con.sql(f"""
ATTACH '{tmpname}' AS tmpdb;
ATTACH '{name}' AS db;
COPY FROM DATABASE tmpdb TO db;
""")
con.close()
rm_file(tmpname)
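# Hypothetical usage: rewrite cran.db in place, first clustering the postings
# so the terms table is stored sorted by (termid, docid):
#
#   reclaim_disk_space('cran.db', cluster=True)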

600
zoekeend Executable file
View File

@ -0,0 +1,600 @@
#!/usr/bin/env python
"""
Zoekeend experimental information retrieval system using DuckDB
Copyright (C) 2024 Djoerd Hiemstra
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact: hiemstra@cs.ru.nl
"""
import argparse
import pathlib
import sys
import duckdb
import ir_datasets
import ze_eval
ze_datasets = {
"rb04": "disks45/nocr/trec-robust-2004",
"msm2": "msmarco-passage",
"msm2dev": "msmarco-passage/trec-dl-2019/judged",
"msm2tst": "msmarco-passage/trec-dl-2020/judged",
"cran": "cranfield",
}
def fatal(message):
"""Print error message and exit."""
print(message, file=sys.stderr)
sys.exit(1)
# TODO: def zoekeend_index_bydict(args):
# index_bydict test.db dataset --in dictionary --out dictionary
# --max_size 99999 --algorithm bytepair --dryrun
# out dictionary is dictionary for future index, if called again.
# TODO: add to ze_search: report query cross entropy and Cost-in-Postings.
def zoekeend_index(args):
"""
Create the index file for an Information Retrieval dataset.
This index uses the standard DuckDB FTS extension. Based on:
Hannes Mühleisen, Thaer Samar, Jimmy Lin, and Arjen de Vries, Old dogs
are great at new tricks: Column stores for IR prototyping. In SIGIR 2014.
"""
    import ze_index  # deferred import, so dependencies are only needed when used
if args.dataset in ze_datasets:
args.dataset = ze_datasets[args.dataset]
try:
if args.dataset == "custom":
ir_dataset = ze_eval.ir_dataset_test()
else:
ir_dataset = ir_datasets.load(args.dataset)
ze_index.index_documents(
args.dbname,
ir_dataset,
stemmer=args.wordstemmer,
stopwords=args.stopwords,
keepcontent=args.keep_content,
)
except ValueError as e:
fatal(e)
except KeyError as e:
fatal("Unknown dataset: " + str(e))
def zoekeend_search(args):
"""
Run queries and create a run file in TREC output.
The language model (lm) is based on: Djoerd Hiemstra, A probabilistic
justification for using tf.idf term weighting in information retrieval,
International Journal on Digital Libraries 3(2), 2000.
"""
import ze_search
if not pathlib.Path(args.dbname).is_file():
fatal(f"Error: file {args.dbname} does not exist")
if args.out and pathlib.Path(args.out).is_file():
fatal(f"Error: file {args.out} exists")
if args.queries in ze_datasets:
query_tag = ze_datasets[args.queries]
else:
query_tag = args.queries
try:
ze_search.search_run(
args.dbname,
query_tag,
matcher=args.match,
run_tag=args.run,
k=args.bm25k,
b=args.bm25b,
limit=args.top,
fileout=args.out,
startq=args.start,
endq=args.end,
)
except FileNotFoundError:
fatal(f"Error: queryset '{args.queries}' does not exist.")
except ValueError as e:
fatal(e)
def zoekeend_eval(args):
"""Evaluate run using trec_eval"""
import ze_eval
if args.queries in ze_datasets:
query_tag = ze_datasets[args.queries]
else:
query_tag = args.queries
try:
ze_eval.trec_eval(
args.run, query_tag, args.complete_rel, args.ndcg, args.query_eval
)
except (KeyError, AttributeError):
fatal(f"Error: query/qrel set '{args.queries}' does not exist.")
except ValueError as e:
fatal(e)
def zoekeend_vacuum(args):
"""Vacuum index to reclaim disk space."""
import ze_vacuum
try:
ze_vacuum.reclaim_disk_space(args.dbname, args.cluster)
except (ValueError, FileNotFoundError):
fatal(f"File not found: {args.dbname}")
def zoekeend_index_import(args):
"""
Import a CIFF (Common Index File Format) index.
Based on: Djoerd Hiemstra, Gijs Hendriksen, Chris Kamphuis, and
Arjen de Vries, Challenges of index exchange for search engine
interoperability, OSSYM 2023. (see also: zoekeend index_export)
"""
import ze_index_import
if pathlib.Path(args.dbname).is_file():
fatal(f"Error: file {args.dbname} exists")
if not pathlib.Path(args.ciff_file).is_file():
fatal(f"Error: file {args.ciff_file} does not exist")
try:
ze_index_import.ciff_import(
args.dbname,
args.ciff_file,
tokenizer=args.tokenizer,
stemmer=args.wordstemmer,
)
except ValueError as e:
fatal("Error in CIFF import: " + str(e))
def zoekeend_index_export(args):
"""
Export a CIFF (Common Index File Format) index.
Based on: Jimmy Lin, Joel Mackenzie, Chris Kamphuis, Craig Macdonald,
Antonio Mallia, Michał Siedlaczek, Andrew Trotman, and Arjen de Vries.
Supporting interoperability between open-source search engines with the
common index file format, SIGIR 2020; (see also: zoekeend index_import)
"""
import ze_index_export
if not pathlib.Path(args.dbname).is_file():
fatal(f"Error: file {args.dbname} does not exist")
if pathlib.Path(args.ciff_file).is_file():
fatal(f"Error: file {args.ciff_file} exists")
try:
ze_index_export.ciff_export(
args.dbname,
args.ciff_file,
description=args.description,
batch_size=args.batch_size,
)
except ValueError as e:
fatal("Error in CIFF export: " + str(e))
def zoekeend_reindex_prior(args):
"""
Recreate the index by including prior (static rank) scores.
Based on: Wessel Kraaij, Thijs Westerveld and Djoerd Hiemstra,
The Importance of Prior Probabilities for Entry Page Search,
SIGIR 2002.
"""
import ze_reindex_prior
if not pathlib.Path(args.dbname_in).is_file():
fatal(f"Error: file {args.dbname_in} does not exist")
if pathlib.Path(args.dbname_out).is_file():
fatal(f"Error: file {args.dbname_out} exists")
try:
ze_reindex_prior.reindex_prior(
args.dbname_in,
args.dbname_out,
csv_file=args.file,
default=args.default,
init=args.init,
)
except Exception as e:
fatal("Error in reindex prior: " + str(e))
def zoekeend_reindex_fitted(args):
"""
    Recreate the index by fitting document lengths (len) or prior
scores (prior) using linear regression. The length / prior scores
are removed from the new index.
"""
import ze_reindex_fitted
if not pathlib.Path(args.dbname_in).is_file():
fatal(f"Error: file {args.dbname_in} does not exist")
if pathlib.Path(args.dbname_out).is_file():
fatal(f"Error: file {args.dbname_out} exists")
if args.qrls in ze_datasets:
args.qrls = ze_datasets[args.qrls]
try:
ze_reindex_fitted.reindex_fitted_column(
args.dbname_in,
args.dbname_out,
column=args.column,
total=args.bins,
print_sample=args.print,
threshold=args.threshold,
qrels=args.qrls,
)
except ValueError as e:
fatal("Error in reindex fitted: " + str(e))
def zoekeend_reindex_const(args):
"""
    Recreate the index by rescaling term frequencies such that all
documents get an artificial length of CONST, using a normalization
weight beta inspired by BM25 document length normalization.
"""
import ze_reindex_const
if not pathlib.Path(args.dbname_in).is_file():
fatal(f"Error: file {args.dbname_in} does not exist")
if pathlib.Path(args.dbname_out).is_file():
fatal(f"Error: file {args.dbname_out} exists")
try:
ze_reindex_const.reindex_const(
args.dbname_in,
args.dbname_out,
const_len=args.const,
b=args.beta,
keep_terms=args.keepterms,
)
except ValueError as e:
fatal("Error in reindex const: " + str(e))
global_parser = argparse.ArgumentParser(prog="zoekeend")
global_parser.add_argument(
"-v",
"--version",
action="version",
version="zoekeend v0.0.1 (using duckdb v" + duckdb.__version__ + ")",
)
subparsers = global_parser.add_subparsers(metavar="subexperiment ...")
index_parser = subparsers.add_parser(
"index",
help="create the index file for an IR dataset",
description=zoekeend_index.__doc__,
)
index_parser.set_defaults(func=zoekeend_index)
index_parser.add_argument(
"dbname",
help="file name of index",
)
index_parser.add_argument(
"dataset",
help="ir_dataset, see: https://ir-datasets.com",
)
index_parser.add_argument(
"-w",
"--wordstemmer",
help="word stemmer (default: none)",
default="none",
choices=["none", "porter", "dutch"],
)
index_parser.add_argument(
"-s",
"--stopwords",
help="stop words (default: none)",
default="none",
choices=["none", "english"],
)
index_parser.add_argument(
"-k",
"--keep_content",
help="keep the document content column",
action="store_true",
)
reindex_prior_parser = subparsers.add_parser(
"reindex_prior",
help="recreate the index including prior scores",
description=zoekeend_reindex_prior.__doc__,
)
reindex_prior_parser.set_defaults(func=zoekeend_reindex_prior)
reindex_prior_parser.add_argument(
"dbname_in",
help="file name of old index",
)
reindex_prior_parser.add_argument(
"dbname_out",
help="file name of new index with priors",
)
reindex_prior_parser.add_argument(
"-i",
"--init",
help="initialize with standard prior ('len' or 'uniform')",
choices=["len", "uniform"],
)
reindex_prior_parser.add_argument(
"-f",
"--file",
help="file with comma-separated (did,prior) pairs",
)
reindex_prior_parser.add_argument(
"-d",
"--default",
help="default prior for documents missing in the file",
type=float,
)
reindex_fitted_parser = subparsers.add_parser(
"reindex_fitted",
help="recreate the index by fitting prior scores",
description=zoekeend_reindex_fitted.__doc__,
)
reindex_fitted_parser.set_defaults(func=zoekeend_reindex_fitted)
reindex_fitted_parser.add_argument(
"dbname_in",
help="file name of old index",
)
reindex_fitted_parser.add_argument(
"dbname_out",
help="file name of new fitted index",
)
reindex_fitted_parser.add_argument(
"-c",
"--column",
help="column to be used for fitting (default: prior)",
default="prior",
choices=["len", "prior"],
)
reindex_fitted_parser.add_argument(
"-b",
"--bins",
help="number of bins",
type=int,
)
reindex_fitted_parser.add_argument(
"-p",
"--print",
help="print sample used for fitting",
action="store_true",
)
reindex_fitted_parser.add_argument(
"-q",
"--qrls",
help="training queries/qrels",
)
reindex_fitted_parser.add_argument(
"-t",
"--threshold",
help="prior values <= threshold are ignored (default: 0)",
default=0,
type=int,
)
reindex_const_parser = subparsers.add_parser(
"reindex_const",
help="recreate the index by rescaling term frequencies",
description=zoekeend_reindex_const.__doc__,
)
reindex_const_parser.set_defaults(func=zoekeend_reindex_const)
reindex_const_parser.add_argument(
"dbname_in",
help="file name of old index",
)
reindex_const_parser.add_argument(
"dbname_out",
help="file name of new fitted index",
)
reindex_const_parser.add_argument(
"-c",
"--const",
help="constant document length (default: 400)",
type=int,
default=400,
)
reindex_const_parser.add_argument(
"-b",
"--beta",
help="length normalization parameter (default: 1.0)",
type=float,
default=1.0,
)
reindex_const_parser.add_argument(
"-k",
"--keepterms",
action="store_true",
help="keep all terms, even if new tf is small",
)
search_parser = subparsers.add_parser(
"search",
help="execute queries and create run output",
description=zoekeend_search.__doc__,
)
search_parser.set_defaults(func=zoekeend_search)
search_parser.add_argument(
"dbname",
help="file name of index",
)
search_parser.add_argument(
"queries",
help="ir_dataset queries id or tab-separated query file",
)
search_parser.add_argument(
"-r",
"--run",
help="run tag",
)
search_parser.add_argument(
"-t",
"--top",
type=int,
default=1000,
help="amount of top results (default: 1000)",
)
search_parser.add_argument(
"-o", "--out", help="the run file to be outputted (default: stdout)"
)
search_parser.add_argument(
"-m",
"--match",
help="match function: languge models (default) or bm25",
default="lm",
choices=["lm", "bm25"],
)
search_parser.add_argument(
"-l", "--lmbda", help="lm lambda parameter (default: 0.3)", type=float, default=0.3
)
search_parser.add_argument(
"-k", "--bm25k", help="bm25 k parameter (default: 0.9)", type=float, default=0.9
)
search_parser.add_argument(
"-b", "--bm25b", help="bm25 b parameter (default: 0.4)", type=float, default=0.4
)
search_parser.add_argument(
"-s",
"--start",
help="start identifier of query",
type=int,
)
search_parser.add_argument(
"-e",
"--end",
help="end identifier of query",
type=int,
)
vacuum_parser = subparsers.add_parser(
"vacuum",
help="vacuum index to reclaim disk space",
description=zoekeend_vacuum.__doc__,
)
vacuum_parser.set_defaults(func=zoekeend_vacuum)
vacuum_parser.add_argument(
"dbname",
help="file name of index",
)
vacuum_parser.add_argument("-c", "--cluster", action="store_true", help="cluster index")
eval_parser = subparsers.add_parser(
"eval", help="evaluate run using trec_eval", description=zoekeend_eval.__doc__
)
eval_parser.set_defaults(func=zoekeend_eval)
eval_parser.add_argument(
"run",
help="trec run file",
)
eval_parser.add_argument(
"queries",
help="ir_dataset queries id or trec qrel file",
)
eval_parser.add_argument(
"-c",
"--complete_rel",
action="store_true",
help="queries with missing results contribute a value of 0",
)
eval_parser.add_argument(
"-n",
"--ndcg",
action="store_true",
help="add normalized discounted cummaltive gain (ndcg)",
)
eval_parser.add_argument(
"-q",
"--query_eval",
action="store_true",
help="give evaluation for each query/topic",
)
index_import_parser = subparsers.add_parser(
"index_import", help="import ciff index", description=zoekeend_index_import.__doc__
)
index_import_parser.set_defaults(func=zoekeend_index_import)
index_import_parser.add_argument(
"dbname",
help="file name of index",
)
index_import_parser.add_argument(
"ciff_file",
help="ciff file",
)
index_import_parser.add_argument(
"-t",
"--tokenizer",
help="tokenizer (default: ciff)",
default="ciff",
choices=["ciff", "duckdb"],
)
index_import_parser.add_argument(
"-w",
"--wordstemmer",
help="word stemmer (default: none)",
default="none",
choices=["none", "porter", "dutch"],
)
index_export_parser = subparsers.add_parser(
"index_export", help="export ciff index", description=zoekeend_index_import.__doc__
)
index_export_parser.set_defaults(func=zoekeend_index_export)
index_export_parser.add_argument(
"dbname",
help="file name of index",
)
index_export_parser.add_argument(
"ciff_file",
help="ciff file",
)
index_export_parser.add_argument(
"-d",
"--description",
help="CIFF description (default: Exported from DuckDB)",
default="Exported from DuckDB",
)
index_export_parser.add_argument(
"-b",
"--batch-size",
help="batch size (default: 1024)",
default=1024,
type=int,
)
parsed_args = global_parser.parse_args()
if hasattr(parsed_args, "func"):
parsed_args.func(parsed_args)
else:
global_parser.print_usage()