#!/bin/bash
#
# Summarise experiment runs as a tab-separated table on stdout.
#
# Usage: <script> [RESULTS_DIR [EVAL_SUBDIR]]
#   RESULTS_DIR  directory holding one sub-directory per run
#                (default: ../results_new_postings)
#   EVAL_SUBDIR  sub-directory of RESULTS_DIR that holds the
#                <db-name>_eval.txt files (default: cranfield_queries_half2)
#
# For every run directory that has a settings.txt, the DuckDB file named by
# its "DB:" entry, and a matching eval file, one row is printed with the run
# settings, the evaluation figures and a few index statistics queried from
# the database.
#
# NOTE(review): an earlier comment in this script referred to
# "cranfield_queries_half1" while the path it built used
# "cranfield_queries_half2". The path is kept as the default; confirm which
# half is intended.

# Deliberately no `set -e`: a failing duckdb query should leave an empty cell
# in the row instead of aborting the whole report.
set -u

#######################################
# Print the value (second whitespace-separated field) of the first line in a
# settings file that starts with "<key>:". Prints nothing if there is none.
# Arguments: $1 - settings file, $2 - key without the trailing colon
# Outputs:   the value on stdout
#######################################
setting_value() {
  local file=$1 key=$2
  awk -v prefix="${key}:" 'index($0, prefix) == 1 { print $2; exit }' "$file"
}

#######################################
# Run one SQL statement against a DuckDB file and print the bare CSV result.
# Arguments: $1 - database file, $2 - SQL statement
# Outputs:   query result on stdout (no header row)
#######################################
db_scalar() {
  local db=$1 sql=$2
  duckdb "$db" -csv -noheader "$sql"
}

#######################################
# Print the report row for a single run directory, or nothing when the run
# is incomplete (no settings file, no database file, or no eval file).
# Arguments: $1 - run directory, $2 - directory holding the eval files
# Outputs:   zero or one tab-separated row on stdout
# Returns:   0 always; incomplete runs are skipped silently
#######################################
report_run() {
  local run_dir=$1 eval_dir=$2
  local settings="$run_dir/settings.txt"

  # Check the settings file before reading it, so directories that are not
  # runs (e.g. the eval directory itself) do not produce error noise.
  [[ -f "$settings" ]] || return 0

  # The "DB:" entry may carry a path from another machine; only its file
  # name is used, resolved relative to the run directory.
  local db_ref db
  db_ref=$(setting_value "$settings" DB)
  [[ -n "$db_ref" ]] || return 0
  db="$run_dir/$(basename -- "$db_ref")"
  [[ -f "$db" ]] || return 0

  # Look for the eval file before touching the database: without it no row
  # is printed, so the queries would be wasted work.
  local eval_file
  eval_file="$eval_dir/$(basename -- "$db" .db)_eval.txt"
  [[ -f "$eval_file" ]] || return 0

  local run_id mode stopwords min_freq min_pmi
  run_id=$(setting_value "$settings" RUN_ID)
  mode=$(setting_value "$settings" MODE)
  stopwords=$(setting_value "$settings" STOPWORDS)
  min_freq=$(setting_value "$settings" MIN_FREQ)
  min_pmi=$(setting_value "$settings" MIN_PMI)

  local map postings_cost
  # Third field of the line that starts with "map" followed by "all".
  map=$(awk '/^map[ \t]+all/ { print $3; exit }' "$eval_file")
  # Fifth field of the "Average cost in postings:" line.
  postings_cost=$(awk 'index($0, "Average cost in postings:") == 1 { print $5; exit }' "$eval_file")

  local dict_size terms_size ngrams avgdl sumdf
  dict_size=$(db_scalar "$db" "SELECT COUNT(*) FROM fts_main_documents.dict;")
  terms_size=$(db_scalar "$db" "SELECT COUNT(*) FROM fts_main_documents.terms;")
  # Dictionary entries whose term contains a space are counted as n-grams.
  ngrams=$(db_scalar "$db" "SELECT COUNT(*) FROM fts_main_documents.dict WHERE term LIKE '% %';")
  avgdl=$(db_scalar "$db" "SELECT avgdl FROM fts_main_documents.stats;")
  sumdf=$(db_scalar "$db" "SELECT sumdf FROM fts_main_documents.stats;")

  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$run_id" "$mode" "$stopwords" "$min_freq" "$min_pmi" \
    "$map" "$postings_cost" \
    "$dict_size" "$terms_size" "$ngrams" "$avgdl" "$sumdf"
}

#######################################
# Print the header row followed by one row per complete run.
# Arguments: $1 - results directory (optional)
#            $2 - eval sub-directory name inside $1 (optional)
# Outputs:   TSV table on stdout
#######################################
main() {
  local results_dir=${1:-../results_new_postings}
  local eval_dir="$results_dir/${2:-cranfield_queries_half2}"
  local run_dir

  printf 'RUN_ID\tMODE\tSTOPWORDS\tMIN_FREQ\tMIN_PMI\tMAP\tPOSTINGS_COST\tDICT_SIZE\tTERMS_SIZE\tNGRAMS\tAVGDL\tSUMDF\n'

  for run_dir in "$results_dir"/*; do
    # Also covers the unmatched-glob case, where the literal pattern remains.
    [[ -d "$run_dir" ]] || continue
    report_run "$run_dir" "$eval_dir"
  done
}

main "$@"