mirror of
https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
synced 2026-02-08 17:12:22 +00:00
86 lines
2.8 KiB
Python
86 lines
2.8 KiB
Python
import csv
|
|
import re
|
|
from pathlib import Path
|
|
|
|
base_folder = Path("attempt_fixed_baseline/no_min_pmi-1-112") # adjust if needed
|
|
output_file = "attempt_fixed_baseline/spreadsheets/results_per_query_no_min_pmi_1-112v2.csv"
|
|
|
|
# regex for metrics
|
|
re_all = re.compile(r"^(\S+)\s+all\s+([0-9.\-]+)\s*$")
|
|
re_colon = re.compile(r"^([A-Za-z0-9 \-]+):\s*([0-9.\-]+)\s*$")
|
|
|
|
def normalize_name(s: str) -> str:
|
|
return re.sub(r"\s+", "_", s.strip().lower())
|
|
|
|
def parse_file(filepath: Path):
|
|
metrics = {}
|
|
with open(filepath, "r", encoding="utf-8") as f:
|
|
for line in f:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
if m := re_all.match(line):
|
|
metrics[m.group(1)] = m.group(2)
|
|
elif m2 := re_colon.match(line):
|
|
metrics[normalize_name(m2.group(1))] = m2.group(2)
|
|
return metrics
|
|
|
|
def parse_filename(filename: str):
|
|
base = Path(filename).stem
|
|
parts = base.split("_")
|
|
# Expected: ['database', 'duckdb', 'english', '-1', '0', '0', 'eval']
|
|
try:
|
|
return {
|
|
"mode": parts[1],
|
|
"stopwords": parts[2],
|
|
"min_freq": parts[4],
|
|
"min_pmi": parts[5]
|
|
}
|
|
except IndexError:
|
|
return {
|
|
"mode": "unknown",
|
|
"stopwords": "unknown",
|
|
"min_freq": "unknown",
|
|
"min_pmi": "unknown"
|
|
}
|
|
|
|
rows = []
|
|
all_keys = set()
|
|
|
|
# Only include numeric folders, sort numerically. If there are none, fall back to
|
|
# flat files directly inside base_folder (e.g., aggregated results for queries 1-112).
|
|
numeric_folders = [f for f in base_folder.iterdir() if f.is_dir() and f.name.isdigit()]
|
|
numeric_folders.sort(key=lambda f: int(f.name))
|
|
|
|
if numeric_folders:
|
|
for query_folder in numeric_folders:
|
|
query_num = query_folder.name
|
|
for file in query_folder.glob("*.txt"):
|
|
if not re.match(r"database_.*_eval\.txt", file.name):
|
|
continue
|
|
|
|
metrics = parse_file(file)
|
|
metadata = parse_filename(file.name)
|
|
combined = {"query": query_num, **metadata, **metrics}
|
|
rows.append(combined)
|
|
all_keys.update(combined.keys())
|
|
else:
|
|
for file in base_folder.glob("*.txt"):
|
|
if not re.match(r"database_.*_eval\.txt", file.name):
|
|
continue
|
|
|
|
metrics = parse_file(file)
|
|
metadata = parse_filename(file.name)
|
|
combined = {"query": "all", **metadata, **metrics}
|
|
rows.append(combined)
|
|
all_keys.update(combined.keys())
|
|
|
|
# Write output CSV
|
|
all_keys = sorted(all_keys)
|
|
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
|
|
writer = csv.DictWriter(csvfile, fieldnames=all_keys)
|
|
writer.writeheader()
|
|
writer.writerows(rows)
|
|
|
|
print(f"✅ Combined {len(rows)} result files from {len(numeric_folders)} folders into {output_file}")
|