mirror of
https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
synced 2025-10-27 00:34:21 +00:00
31 lines
1.8 KiB
Bash
Executable File
31 lines
1.8 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
echo -e "RUN_ID\tMODE\tSTOPWORDS\tMIN_FREQ\tMIN_PMI\tMAP\tPOSTINGS_COST\tDICT_SIZE\tTERMS_SIZE\tNGRAMS\tAVGDL\tSUMDF"
|
|
|
|
for dir in ../results_new_postings/*; do
|
|
[ -d "$dir" ] || continue
|
|
SETTINGS="$dir/settings.txt"
|
|
DB=$(grep '^DB:' "$SETTINGS" | awk '{print $2}')
|
|
DB="$dir/$(basename "$DB")"
|
|
if [[ -f "$SETTINGS" && -f "$DB" ]]; then
|
|
RUN_ID=$(grep '^RUN_ID:' "$SETTINGS" | awk '{print $2}')
|
|
MODE=$(grep '^MODE:' "$SETTINGS" | awk '{print $2}')
|
|
STOPWORDS=$(grep '^STOPWORDS:' "$SETTINGS" | awk '{print $2}')
|
|
MIN_FREQ=$(grep '^MIN_FREQ:' "$SETTINGS" | awk '{print $2}')
|
|
MIN_PMI=$(grep '^MIN_PMI:' "$SETTINGS" | awk '{print $2}')
|
|
DICT_SIZE=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict;")
|
|
TERMS_SIZE=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.terms;")
|
|
NGRAMS=$(duckdb "$DB" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict WHERE term LIKE '% %';")
|
|
AVGDL=$(duckdb "$DB" -csv -noheader "SELECT avgdl FROM fts_main_documents.stats;")
|
|
SUMDF=$(duckdb "$DB" -csv -noheader "SELECT sumdf FROM fts_main_documents.stats;")
|
|
# Get eval from cranfield_queries_half1 where name matches DB
|
|
DB_BASENAME=$(basename "$DB" .db)
|
|
EVAL_HALF1="../results_new_postings/cranfield_queries_with_2_bigrams/${DB_BASENAME}_eval.txt"
|
|
if [[ -f "$EVAL_HALF1" ]]; then
|
|
MAP=$(grep -E '^map[[:space:]]+all' "$EVAL_HALF1" | awk '{print $3}')
|
|
POSTINGS_COST=$(grep '^Average cost in postings:' "$EVAL_HALF1" | awk '{print $5}')
|
|
echo -e "${RUN_ID}\t${MODE}\t${STOPWORDS}\t${MIN_FREQ}\t${MIN_PMI}\t${MAP}\t${POSTINGS_COST}\t${DICT_SIZE}\t${TERMS_SIZE}\t${NGRAMS}\t${AVGDL}\t${SUMDF}"
|
|
fi
|
|
fi
|
|
done
|