mirror of
https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
synced 2026-02-09 01:22:23 +00:00
45 lines
2.0 KiB
Bash
Executable File
45 lines
2.0 KiB
Bash
Executable File
#!/bin/bash
#
# Print a tab-separated summary of the indexing runs stored under RESULTS_DIR:
# a header line first, then one row per run directory.
#
# Paths are derived from this script's own location, so the script can be
# started from any working directory.

set -euo pipefail

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Parent directory of SCRIPT_DIR.
BASE_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
# Directory whose sub-directories are the individual runs.
RESULTS_DIR="${BASE_DIR}/attempt_fixed_baseline/results_new_postings_no_min_pmi"
# Directory holding the "<db basename>_eval.txt" evaluation files.
EVAL_DIR_HALF1="${RESULTS_DIR}/cranfield_queries_half1"

# Fail early with a clear message instead of printing a header and no rows.
if [[ ! -d "${RESULTS_DIR}" ]]; then
  echo "Results directory not found: ${RESULTS_DIR}" >&2
  exit 1
fi

# Column header; the per-run rows use the same column order.
printf 'RUN_ID\tMODE\tSTOPWORDS\tMIN_FREQ\tMIN_PMI\tMAP\tPOSTINGS_COST\tDICT_SIZE\tTERMS_SIZE\tNGRAMS\tAVGDL\tSUMDF\n'

#######################################
# Print the second whitespace-separated field of the first line in a file
# that begins with "KEY:".
# Prints nothing and still exits 0 when no line matches, so an absent key
# becomes an empty column instead of a non-zero pipeline status that would
# abort the script under `set -euo pipefail`.
# Arguments: $1 - path to the settings file
#            $2 - key name, without the trailing colon
# Outputs:   the value on stdout (empty when the key is absent)
#######################################
settings_value() {
  awk -v prefix="${2}:" 'index($0, prefix) == 1 { print $2; exit }' "$1"
}

#######################################
# Emit one tab-separated row for a single run directory, in the column order
# RUN_ID, MODE, STOPWORDS, MIN_FREQ, MIN_PMI, MAP, POSTINGS_COST, DICT_SIZE,
# TERMS_SIZE, NGRAMS, AVGDL, SUMDF.
# Returns 0 without printing anything when the argument is not a directory,
# or when settings.txt, the database file named by its "DB:" line, or the
# matching "<db basename>_eval.txt" file is missing.
# Globals:   EVAL_DIR_HALF1 (read)
# Arguments: $1 - run directory
# Outputs:   one row on stdout
#######################################
print_run_row() {
  local run_dir=$1
  local settings db_name db_path db_basename eval_half1
  local run_id mode stopwords min_freq min_pmi
  local map postings_cost dict_size terms_size ngrams avgdl sumdf

  [[ -d "${run_dir}" ]] || return 0

  settings="${run_dir}/settings.txt"
  [[ -f "${settings}" ]] || return 0

  # Only the file name of the DB entry is used; the database is looked up
  # inside the run directory itself.
  db_name=$(settings_value "${settings}" DB)
  db_path="${run_dir}/$(basename "${db_name}")"
  [[ -f "${db_path}" ]] || return 0

  # Check for the evaluation file before touching the database so that runs
  # which are skipped anyway do not cost five duckdb invocations.
  db_basename=$(basename "${db_path}" .db)
  eval_half1="${EVAL_DIR_HALF1}/${db_basename}_eval.txt"
  [[ -f "${eval_half1}" ]] || return 0

  run_id=$(settings_value "${settings}" RUN_ID)
  mode=$(settings_value "${settings}" MODE)
  stopwords=$(settings_value "${settings}" STOPWORDS)
  min_freq=$(settings_value "${settings}" MIN_FREQ)
  min_pmi=$(settings_value "${settings}" MIN_PMI)

  # A single awk per metric: no match yields an empty value with exit status 0.
  map=$(awk '/^map[ \t]+all/ { print $3; exit }' "${eval_half1}")
  postings_cost=$(awk '/^Average cost in postings:/ { print $5; exit }' "${eval_half1}")

  # A failing query still aborts the script (set -e), as it did before.
  dict_size=$(duckdb "${db_path}" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict;")
  terms_size=$(duckdb "${db_path}" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.terms;")
  # Dictionary entries containing a space are counted as n-grams.
  ngrams=$(duckdb "${db_path}" -csv -noheader "SELECT COUNT(*) FROM fts_main_documents.dict WHERE term LIKE '% %';")
  avgdl=$(duckdb "${db_path}" -csv -noheader "SELECT avgdl FROM fts_main_documents.stats;")
  sumdf=$(duckdb "${db_path}" -csv -noheader "SELECT sumdf FROM fts_main_documents.stats;")

  # printf '%s' keeps the values verbatim; echo -e would reinterpret any
  # backslash sequences they contain.
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${run_id}" "${mode}" "${stopwords}" "${min_freq}" "${min_pmi}" \
    "${map}" "${postings_cost}" "${dict_size}" "${terms_size}" \
    "${ngrams}" "${avgdl}" "${sumdf}"
}

for dir in "${RESULTS_DIR}"/*; do
  print_run_row "${dir}"
done