diff --git a/.gitignore b/.gitignore index 17016e9..50a5908 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,12 @@ plot* *lock* *.db *.ciff -*.csv +combined*.csv +comparison*.csv +no*.csv +output*.csv +p*.csv +results*.csv *.sync* *.log /trec_eval/ diff --git a/README.md b/README.md index 4e30364..0a1c549 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,9 @@ Run `python3 phrase_index.py` with any of the parameters listed below: - `./batch_phrase.sh` can be used to create the results using multiple different variables in one go. -- And display_results.sh can be used to display the evaluation metrics of all previous results. (So MAP, CiP, dictionary size, terms size, number of phrases, AVGDL and SUMDF) \ No newline at end of file +- And `display_results.sh` can be used to display the evaluation metrics of all previous results (MAP, CiP, dictionary size, terms size, number of phrases, AVGDL and SUMDF). + +### Statistical Analysis and Comparison +- **[compare_phrases_vs_duckdb.py](compare_phrases_vs_duckdb.py)** - Performs a two-tailed pairwise sign test comparing MAP (Mean Average Precision) results between phrase-based and baseline approaches. Uses `min_pmi=24` as the baseline. Requires scipy for statistical testing. + +- **[compare_postings_cost_vs_duckdb.py](compare_postings_cost_vs_duckdb.py)** - Similar to the above, but compares the Cost in Postings (CiP) metric instead of MAP. Evaluates the computational efficiency of different indexing approaches. 
diff --git a/batch_search_eval.sh b/batch_search_eval.sh index 495372a..c15312c 100755 --- a/batch_search_eval.sh +++ b/batch_search_eval.sh @@ -1,4 +1,5 @@ #!/bin/bash +# This script can be used to run search and evaluation over existing databases in a results directory # Usage: ./batch_search_eval.sh if [ "$#" -ne 3 ]; then diff --git a/compare_phrases_vs_duckdb.py b/compare_phrases_vs_duckdb.py index 13c4fd6..0c6ced1 100644 --- a/compare_phrases_vs_duckdb.py +++ b/compare_phrases_vs_duckdb.py @@ -1,5 +1,6 @@ import pandas as pd from pathlib import Path +# This script runs a two-tailed pairwise sign test comparing MAP results against a baseline with min_pmi=24 try: from scipy.stats import binomtest diff --git a/compare_postings_cost_vs_duckdb.py b/compare_postings_cost_vs_duckdb.py index 8c07229..2ea6c73 100644 --- a/compare_postings_cost_vs_duckdb.py +++ b/compare_postings_cost_vs_duckdb.py @@ -1,5 +1,6 @@ import pandas as pd from pathlib import Path +# This script runs a two-tailed pairwise sign test comparing Cost in Postings against a baseline with min_pmi=24 try: from scipy.stats import binomtest diff --git a/helper_scripts/auto_phrase.sh b/helper_scripts/auto_phrase.sh index 73eb405..7dda733 100755 --- a/helper_scripts/auto_phrase.sh +++ b/helper_scripts/auto_phrase.sh @@ -1,6 +1,6 @@ #!/bin/bash set -e - +# This script can be used to run an automated CIFF indexing, searching and evaluation process (with bigrams) # Settings DB="database.db" OUT="results.txt" diff --git a/helper_scripts/auto_zoekeend.sh b/helper_scripts/auto_zoekeend.sh index cd68d40..91f4e40 100755 --- a/helper_scripts/auto_zoekeend.sh +++ b/helper_scripts/auto_zoekeend.sh @@ -1,4 +1,5 @@ #!/bin/bash +# This script can be used to run an automated zoekeend indexing, searching and evaluation process (no bigrams) set -e # Settings diff --git a/helper_scripts/batch_phrase.sh b/helper_scripts/batch_phrase.sh index 6a85bc3..65b43ab 100755 --- a/helper_scripts/batch_phrase.sh +++ 
b/helper_scripts/batch_phrase.sh @@ -1,4 +1,6 @@ #!/bin/bash +# This script can be used to run a batch of phrase indexing experiments with varying parameters, +# such as the minimum frequency and minimum PMI thresholds, whether to use stopwords, etc. set -e DB_BASE="database" diff --git a/helper_scripts/display_results.sh b/helper_scripts/display_results.sh index 5c200e2..cf28cbd 100755 --- a/helper_scripts/display_results.sh +++ b/helper_scripts/display_results.sh @@ -1,5 +1,5 @@ #!/bin/bash - +# This script can be used to display results (CiP, MAP, etc.) from a batch of experiments, given a folder set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/statistics_queries_1-112.csv b/result_tables/statistics_queries_1-112.csv similarity index 100% rename from statistics_queries_1-112.csv rename to result_tables/statistics_queries_1-112.csv diff --git a/statistics_queries_1-225.csv b/result_tables/statistics_queries_1-225.csv similarity index 100% rename from statistics_queries_1-225.csv rename to result_tables/statistics_queries_1-225.csv diff --git a/statistics_queries_113-225.csv b/result_tables/statistics_queries_113-225.csv similarity index 100% rename from statistics_queries_113-225.csv rename to result_tables/statistics_queries_113-225.csv