test(eval): Phase 2A E-4 비교기 — per-query win/loss/tie(ε)·부트스트랩 CI·카테고리 분해
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
"""Phase 2A E-4 비교기 — baseline vs 후보 run CSV 들의 per-query 판정.

Usage:
    python tests/search_eval/compare_runs.py \
        --baseline baselines/<exact re-measurement>.csv \
        --cand qwen06=<...>.csv --cand qwen4=<...>.csv --cand qwen4m=<...>.csv \
        [--epsilon 0.01] [--bootstrap 2000]

Verdict output (plan r3 E-4): overall mean graded-NDCG delta · per-query
win/loss/tie (|d| ≤ ε counts as a tie) · bootstrap 95% CI · per-category means
· top improved/regressed queries. Rows flagged failure_expected or carrying an
error are excluded.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import random
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load(path: str) -> dict[str, dict]:
    """Read one run CSV and return its scored rows keyed by query id.

    A row is dropped when any of the following holds:

    * its ``failure_expected`` cell is ``true`` or ``1`` (case-insensitive,
      surrounding whitespace ignored),
    * its ``error`` cell is non-empty,
    * its ``graded_ndcg_at_10`` cell is missing, not a number, or not finite
      (``nan``/``inf`` would otherwise poison every mean computed from it).

    Each kept row gets an extra ``"_g"`` key holding the score as a float.
    If the same ``id`` occurs more than once, the last occurrence wins.

    Args:
        path: Location of the CSV file; it must have ``id`` and
            ``graded_ndcg_at_10`` columns.

    Returns:
        Mapping of query id to its CSV row (a dict of column name to cell).

    Raises:
        KeyError: If the ``id`` or ``graded_ndcg_at_10`` column is absent.
    """
    # Function-scope import: the module's import block does not provide math.
    import math

    scored: dict[str, dict] = {}
    # utf-8-sig transparently drops a leading BOM (plain UTF-8 still decodes),
    # and newline="" is what the csv module requires for quoted line breaks.
    with Path(path).open(encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills cells missing from a short row with None, so
            # normalise to "" before calling string methods.
            flag = (row.get("failure_expected") or "").strip().lower()
            if flag in ("true", "1"):
                continue
            if row.get("error"):
                continue
            try:
                score = float(row["graded_ndcg_at_10"])
            except (TypeError, ValueError):
                continue
            if not math.isfinite(score):
                continue
            row["_g"] = score
            scored[row["id"]] = row
    return scored
|
||||
|
||||
|
||||
def bootstrap_ci(deltas: list[float], n: int, seed: int = 42) -> tuple[float, float]:
    """Return a percentile-bootstrap 95% interval for the mean of *deltas*.

    The sample is redrawn with replacement *n* times (each redraw has the
    same size as *deltas*); the interval bounds are the resample means at
    sorted positions ``int(0.025 * n)`` and ``int(0.975 * n)``.

    Args:
        deltas: Per-query score differences.
        n: Number of bootstrap resamples.
        seed: Seed of the private random generator, so repeated calls with
            the same arguments give the same interval.

    Returns:
        ``(lower, upper)`` bounds. Both are ``nan`` when there is nothing to
        resample (empty *deltas*) or no resamples were requested (``n <= 0``).
    """
    if not deltas or n <= 0:
        # No interval can be estimated; report that explicitly rather than
        # failing with an IndexError from choices() or the list lookup below.
        nan = float("nan")
        return nan, nan

    rng = random.Random(seed)
    size = len(deltas)
    means = sorted(
        statistics.mean(rng.choices(deltas, k=size)) for _ in range(n)
    )
    # int(0.975 * n) is always < n for n >= 1, so both indexes are in range.
    return means[int(0.025 * n)], means[int(0.975 * n)]
|
||||
|
||||
|
||||
def _report_candidate(
    name: str,
    base: dict[str, dict],
    cand: dict[str, dict],
    epsilon: float,
    n_boot: int,
) -> None:
    """Print the comparison report of one candidate run against the baseline.

    Only queries scored in both runs are compared. The report holds the mean
    graded NDCG of each side with a bootstrap interval of the delta, the
    per-query win/loss/tie counts, a per-category breakdown and the three
    largest improvements and regressions.
    """
    ids = sorted(set(base) & set(cand))
    if len(ids) != len(base):
        print(f" ⚠ {name}: 교집합 {len(ids)} != baseline {len(base)} — 누락 쿼리 확인")
    if not ids:
        # Nothing to compare: every mean below would raise StatisticsError.
        print(f"\n== {name} ==")
        print(" no scored queries shared with baseline — skipped")
        return

    # Insertion order follows the sorted ids, so `deltas` is deterministic.
    delta = {i: cand[i]["_g"] - base[i]["_g"] for i in ids}
    deltas = list(delta.values())
    mean_b = statistics.mean(base[i]["_g"] for i in ids)
    mean_c = statistics.mean(cand[i]["_g"] for i in ids)
    win = sum(1 for d in deltas if d > epsilon)
    loss = sum(1 for d in deltas if d < -epsilon)
    tie = len(deltas) - win - loss
    lo, hi = bootstrap_ci(deltas, n_boot)
    decided = win + loss
    # Win rate is taken over decided (non-tie) queries only.
    win_rate = (win / decided * 100) if decided else 0.0

    print(f"\n== {name} ==")
    print(f" graded NDCG: {mean_b:.4f} → {mean_c:.4f} (delta {mean_c-mean_b:+.4f}, "
          f"bootstrap95% [{lo:+.4f}, {hi:+.4f}])")
    print(f" per-query: win {win} / loss {loss} / tie {tie} (ε={epsilon}) — "
          f"win-rate(결정전) {win_rate:.0f}%")

    # Bucket the ids once per category. A missing, None or empty category is
    # reported as "?", using the same key for grouping and for the means (the
    # two previously disagreed and crashed on CSVs without that column).
    by_cat: dict[str, list[str]] = {}
    for i in ids:
        by_cat.setdefault(base[i].get("category") or "?", []).append(i)
    for c in sorted(by_cat):
        members = by_cat[c]
        cb = statistics.mean(base[i]["_g"] for i in members)
        cc = statistics.mean(cand[i]["_g"] for i in members)
        cd = statistics.mean(delta[i] for i in members)
        print(f" {c:<18} {cb:.3f} → {cc:.3f} ({cd:+.3f}, n={len(members)})")

    # Ascending by delta: the head holds regressions, the tail improvements.
    ranked = sorted(ids, key=delta.__getitem__)
    worst = [(i, round(delta[i], 3)) for i in ranked[:3]]
    best = [(i, round(delta[i], 3)) for i in ranked[-3:][::-1]]
    print(f" 개선 top3 {best} / 퇴행 top3 {worst}")


def main() -> None:
    """Parse the command line and print one report per candidate run.

    Options: ``--baseline CSV`` (required), ``--cand name=csv`` (required,
    repeatable), ``--epsilon`` (tie threshold, default 0.01) and
    ``--bootstrap`` (number of resamples, default 2000).

    Exits through ``argparse`` with a usage error when a ``--cand`` value has
    no ``=`` separator or when the baseline contains no scored rows.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--baseline", required=True)
    p.add_argument("--cand", action="append", required=True, metavar="name=csv")
    p.add_argument("--epsilon", type=float, default=0.01)
    p.add_argument("--bootstrap", type=int, default=2000)
    a = p.parse_args()

    # Validate every spec before reading any file so a typo fails fast with a
    # usage message instead of a tuple-unpacking traceback mid-run.
    specs: list[tuple[str, str]] = []
    for spec in a.cand:
        name, sep, path = spec.partition("=")
        if not sep:
            p.error(f"--cand expects name=csv, got {spec!r}")
        specs.append((name, path))

    base = load(a.baseline)
    if not base:
        p.error(f"baseline has no scored rows: {a.baseline}")
    print(f"baseline: {a.baseline} — scored {len(base)}, "
          f"graded NDCG mean {statistics.mean(r['_g'] for r in base.values()):.4f}")

    for name, path in specs:
        _report_candidate(name, base, load(path), a.epsilon, a.bootstrap)
|
||||
|
||||
|
||||
# Script entry point: run the comparison only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user