# Zoekeend-Phrase-Indexing/parse_eval_to_csv.py
# Exported 2026-01-08 12:29:53 +01:00 (86 lines, 2.8 KiB, Python)
import csv
import re
from pathlib import Path
# Root folder holding per-query result subfolders (or flat *_eval.txt files).
base_folder = Path("attempt_fixed_baseline/no_min_pmi-1-112") # adjust if needed
# Destination CSV for the combined per-query metrics.
output_file = "attempt_fixed_baseline/spreadsheets/results_per_query_no_min_pmi_1-112v2.csv"
# Compiled patterns for the two metric-line formats found in eval output:
# trec_eval style  -> "<metric> all <value>"
re_all = re.compile(r"^(\S+)\s+all\s+([0-9.\-]+)\s*$")
# colon style      -> "<Metric Name>: <value>"
re_colon = re.compile(r"^([A-Za-z0-9 \-]+):\s*([0-9.\-]+)\s*$")


def normalize_name(s: str) -> str:
    """Trim *s*, collapse internal whitespace to underscores, and lower-case it."""
    collapsed = re.sub(r"\s+", "_", s.strip())
    return collapsed.lower()
def parse_file(filepath: Path):
    """Read one eval output file and return ``{metric_name: value_string}``.

    Recognizes both "metric all value" lines (via ``re_all``) and
    "Name: value" lines (via ``re_colon``, with the name normalized).
    Blank and unrecognized lines are skipped; values stay as strings.
    """
    metrics = {}
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            matched = re_all.match(text)
            if matched:
                metrics[matched.group(1)] = matched.group(2)
                continue
            matched = re_colon.match(text)
            if matched:
                metrics[normalize_name(matched.group(1))] = matched.group(2)
    return metrics
def parse_filename(filename: str):
    """Extract run metadata from an eval filename.

    Expected stem layout (underscore-separated), e.g.
    ``database_duckdb_english_-1_0_0_eval`` ->
    mode='duckdb', stopwords='english', min_freq='0', min_pmi='0'.
    If the name has too few parts, every field is set to "unknown".
    """
    fields = Path(filename).stem.split("_")
    try:
        mode, stopwords = fields[1], fields[2]
        min_freq, min_pmi = fields[4], fields[5]
    except IndexError:
        # Malformed name: report placeholders rather than crashing the scan.
        mode = stopwords = min_freq = min_pmi = "unknown"
    return {
        "mode": mode,
        "stopwords": stopwords,
        "min_freq": min_freq,
        "min_pmi": min_pmi,
    }
# ---------------------------------------------------------------------------
# Main driver: walk the result tree, parse every eval file, emit one CSV.
# ---------------------------------------------------------------------------

# Only eval outputs named like "database_*_eval.txt" are considered; compiled
# once here instead of re-matching an uncompiled pattern inside the loops.
_eval_name_re = re.compile(r"database_.*_eval\.txt")


def _collect_rows(folder: Path, query_label: str, rows: list, all_keys: set) -> None:
    """Parse each matching eval file directly inside *folder*; append one row per file."""
    for file in folder.glob("*.txt"):
        if not _eval_name_re.match(file.name):
            continue
        combined = {"query": query_label, **parse_filename(file.name), **parse_file(file)}
        rows.append(combined)
        all_keys.update(combined.keys())


rows = []
all_keys = set()
# Only include numeric folders, sorted numerically. If there are none, fall
# back to flat files directly inside base_folder (e.g., aggregated results
# for queries 1-112).
numeric_folders = sorted(
    (f for f in base_folder.iterdir() if f.is_dir() and f.name.isdigit()),
    key=lambda f: int(f.name),
)
if numeric_folders:
    for query_folder in numeric_folders:
        _collect_rows(query_folder, query_folder.name, rows, all_keys)
else:
    _collect_rows(base_folder, "all", rows, all_keys)

# Write the combined CSV. Create the destination directory first — otherwise
# open(..., "w") raises FileNotFoundError on a fresh checkout.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    # Sorted union of all keys seen -> stable column order; missing metrics
    # for a given row are left empty by DictWriter's default restval.
    writer = csv.DictWriter(csvfile, fieldnames=sorted(all_keys))
    writer.writeheader()
    writer.writerows(rows)
print(f"✅ Combined {len(rows)} result files from {len(numeric_folders)} folders into {output_file}")