Added descriptions to files

2026-02-08 17:12:22 +00:00 · 2026-01-12 15:29:00 +01:00 · 2026-01-12 15:29:00 +01:00 · d1d7eb517b
commit d1d7eb517b
parent 872a13a394
12 changed files with 20 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -11,7 +11,12 @@ plot*
 *lock*
 *.db
 *.ciff
-*.csv
+combined*.csv
+comparison*.csv
+no*.csv
+output*.csv
+p*.csv
+results*.csv
 *.sync*
 *.log
 /trec_eval/
--- a/README.md
+++ b/README.md
@ -15,4 +15,9 @@ Run `python3 phrase_index.py` with any of the parameters listed below:

 - `./batch_phrase.sh` can be used to create the results using multiple different variables in one go.

- And display_results.sh can be used to display the evaluation metrics of all previous results. (So MAP, CiP, dictionary size, terms size, number of phrases, AVGDL and SUMDF)
+- `display_results.sh` can be used to display the evaluation metrics of all previous results (MAP, CiP, dictionary size, terms size, number of phrases, AVGDL and SUMDF).
+
+### Statistical Analysis and Comparison
+- **[compare_phrases_vs_duckdb.py](compare_phrases_vs_duckdb.py)** - Performs a two-tailed pairwise sign test comparing MAP (Mean Average Precision) results between phrase-based and baseline approaches. Uses min_pmi=24 as the baseline. Requires scipy for statistical testing.
+
+- **[compare_postings_cost_vs_duckdb.py](compare_postings_cost_vs_duckdb.py)** - Similar to the above, but compares the Cost in Postings (CiP) metric instead of MAP. Evaluates the computational efficiency of different indexing approaches.
--- a/batch_search_eval.sh
+++ b/batch_search_eval.sh
@ -1,4 +1,5 @@
 #!/bin/bash
+# This script can be used to run search and evaluation over existing databases in a results directory
 # Usage: ./batch_search_eval.sh <results_dir> <queries_dir> <qrels_file>

 if [ "$#" -ne 3 ]; then
--- a/compare_phrases_vs_duckdb.py
+++ b/compare_phrases_vs_duckdb.py
@ -1,5 +1,6 @@
 import pandas as pd
 from pathlib import Path
+# This script performs a two-tailed pairwise sign test comparing MAP results against a baseline with min_pmi=24

 try:
 	from scipy.stats import binomtest
--- a/compare_postings_cost_vs_duckdb.py
+++ b/compare_postings_cost_vs_duckdb.py
@ -1,5 +1,6 @@
 import pandas as pd
 from pathlib import Path
+# This script performs a two-tailed pairwise sign test comparing Cost in Postings against a baseline with min_pmi=24

 try:
 	from scipy.stats import binomtest
--- a/helper_scripts/auto_phrase.sh
+++ b/helper_scripts/auto_phrase.sh
@ -1,6 +1,6 @@
 #!/bin/bash
 set -e
-
+# This script can be used to run an automated CIFF indexing, searching and evaluation process (with bigrams)
 # Settings
 DB="database.db"
 OUT="results.txt"
--- a/helper_scripts/auto_zoekeend.sh
+++ b/helper_scripts/auto_zoekeend.sh
@ -1,4 +1,5 @@
 #!/bin/bash
+# This script can be used to run an automated zoekeend indexing, searching and evaluation process (no bigrams)
 set -e

 # Settings
--- a/helper_scripts/batch_phrase.sh
+++ b/helper_scripts/batch_phrase.sh
@ -1,4 +1,6 @@
 #!/bin/bash
+# This script can be used to run a batch of phrase indexing experiments with varying parameters,
+# such as the minimum frequency and minimum PMI thresholds, whether to use stopwords, etc.
 set -e

 DB_BASE="database"
--- a/helper_scripts/display_results.sh
+++ b/helper_scripts/display_results.sh
@ -1,5 +1,5 @@
 #!/bin/bash
-
+# This script can be used to display results (CiP, MAP, etc.) from a batch of experiments, given a folder
 set -euo pipefail

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
--- a/result_tables/statistics_queries_1-112.csv
+++ b/result_tables/statistics_queries_1-112.csv
--- a/result_tables/statistics_queries_1-225.csv
+++ b/result_tables/statistics_queries_1-225.csv
--- a/result_tables/statistics_queries_113-225.csv
+++ b/result_tables/statistics_queries_113-225.csv