diff --git a/tests/search_eval/compare_runs.py b/tests/search_eval/compare_runs.py new file mode 100644 index 0000000..db1b435 --- /dev/null +++ b/tests/search_eval/compare_runs.py @@ -0,0 +1,97 @@ +"""Phase 2A E-4 비교기 — baseline vs 후보 run CSV 들의 per-query 판정. + + python tests/search_eval/compare_runs.py \ + --baseline baselines/.csv \ + --cand qwen06=<...>.csv --cand qwen4=<...>.csv --cand qwen4m=<...>.csv \ + [--epsilon 0.01] [--bootstrap 2000] + +판정 출력(plan r3 E-4): 전체 graded NDCG 평균 delta · per-query win/loss/tie(|d|<ε=tie) +· 부트스트랩 95% CI · 카테고리별 평균 · 상위 개선/퇴행 쿼리. failure_expected/error 행 제외. +""" + +from __future__ import annotations + +import argparse +import csv +import random +import statistics +from pathlib import Path + + +def load(path: str) -> dict[str, dict]: + out = {} + with Path(path).open(encoding="utf-8") as f: + for row in csv.DictReader(f): + if row.get("failure_expected", "").lower() in ("true", "1"): + continue + if row.get("error"): + continue + try: + row["_g"] = float(row["graded_ndcg_at_10"]) + except (TypeError, ValueError): + continue + out[row["id"]] = row + return out + + +def bootstrap_ci(deltas: list[float], n: int, seed: int = 42) -> tuple[float, float]: + rng = random.Random(seed) + means = sorted( + statistics.mean(rng.choices(deltas, k=len(deltas))) for _ in range(n) + ) + return means[int(0.025 * n)], means[int(0.975 * n)] + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--baseline", required=True) + p.add_argument("--cand", action="append", required=True, metavar="name=csv") + p.add_argument("--epsilon", type=float, default=0.01) + p.add_argument("--bootstrap", type=int, default=2000) + a = p.parse_args() + + base = load(a.baseline) + print(f"baseline: {a.baseline} — scored {len(base)}, " + f"graded NDCG mean {statistics.mean(r['_g'] for r in base.values()):.4f}") + + for spec in a.cand: + name, path = spec.split("=", 1) + cand = load(path) + ids = sorted(set(base) & set(cand)) + if len(ids) != len(base): + print(f" ⚠ {name}: 교집합 {len(ids)} != baseline {len(base)} — 누락 쿼리 확인") + deltas = [cand[i]["_g"] - base[i]["_g"] for i in ids] + mean_b = statistics.mean(base[i]["_g"] for i in ids) + mean_c = statistics.mean(cand[i]["_g"] for i in ids) + win = sum(1 for d in deltas if d > a.epsilon) + loss = sum(1 for d in deltas if d < -a.epsilon) + tie = len(deltas) - win - loss + lo, hi = bootstrap_ci(deltas, a.bootstrap) + decided = win + loss + win_rate = (win / decided * 100) if decided else 0.0 + + print(f"\n== {name} ==") + print(f" graded NDCG: {mean_b:.4f} → {mean_c:.4f} (delta {mean_c-mean_b:+.4f}, " + f"bootstrap95% [{lo:+.4f}, {hi:+.4f}])") + print(f" per-query: win {win} / loss {loss} / tie {tie} (ε={a.epsilon}) — " + f"win-rate(결정전) {win_rate:.0f}%") + + cats: dict[str, list[float]] = {} + for i in ids: + cats.setdefault(base[i].get("category", "?"), []).append( + cand[i]["_g"] - base[i]["_g"] + ) + for c in sorted(cats): + ds = cats[c] + cb = statistics.mean(base[i]["_g"] for i in ids if base[i].get("category") == c) + cc = statistics.mean(cand[i]["_g"] for i in ids if base[i].get("category") == c) + print(f" {c:<18} {cb:.3f} → {cc:.3f} ({statistics.mean(ds):+.3f}, n={len(ds)})") + + ranked = sorted(ids, key=lambda i: cand[i]["_g"] - base[i]["_g"]) + worst = [(i, round(cand[i]['_g']-base[i]['_g'],3)) for i in ranked[:3]] + best = [(i, round(cand[i]['_g']-base[i]['_g'],3)) for i in ranked[-3:][::-1]] + print(f" 개선 top3 {best} / 퇴행 top3 {worst}") + + +if __name__ == "__main__": + main()