Files
hyungi_document_server/tests/search_eval/scripts/capture_passage_rag.py
T
hyungi 9c039139ef feat(search): passage-RAG capture runner + raw JSONL (diagnose c3)
PR-DocSrv-Hier-PassageRAG-Diagnose-1 c3. 22Q x {prehier,hier_sim_clean} /ask?debug=true
exact_knn capture (44 rec). ai_answer/evidence/target_doc_present/target_span_used/
objective signals(hallucination/grounding/completeness/refused) 박제.
관찰: hier 일부 타깃 retrieval 실패(exam_005/006,cl_007=doc-search NO-GO 일관) + 일부 gain
(cl_001/002). empty-answer 케이스(cl_005/cl_007 prehier, cl_006/exam_004 skipped) 존재.
JWT 15min 만료로 1차 부분실패 → cache-warm 재실행 완주.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 06:53:11 +00:00

78 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""PR-DocSrv-Hier-PassageRAG-Diagnose-1 c3 — /ask capture for prehier vs hier_sim_clean.
각 Q × variant 호출 → JSONL 박제 (judge 입력 raw). EVAL-ONLY."""
import json, sys, time, os
import httpx, yaml
# Base URL of the server; main() issues GET requests to f"{BASE}/api/search/ask".
BASE = "http://100.110.63.63:8000"
# Values sent as the `corpus_variant` query parameter; every query is run once per entry.
VARIANTS = ["prehier", "hier_sim_clean"]
# YAML file whose top-level `queries` list supplies the queries (relative path, opened as-is).
SUBSET = "tests/search_eval/queries_passage_rag.yaml"
def trim_citations(cits):
    """Project each citation onto the fields kept in the capture record.

    Args:
        cits: Iterable of citation mappings (anything with ``.get``), or a
            falsy value such as ``None`` or ``[]`` when there are none.

    Returns:
        A new list with one dict per input citation, holding exactly the keys
        ``n``, ``doc_id``, ``chunk_id``, ``section_title``, ``span_text`` and
        ``rerank_score``. Keys absent from a citation map to ``None``; any
        other keys are dropped.
    """
    kept_fields = ("n", "doc_id", "chunk_id", "section_title", "span_text", "rerank_score")
    return [{k: c.get(k) for k in kept_fields} for c in cits or []]
def _build_record(q, variant, data, latency_ms):
    """Flatten one successful /ask response into a single capture record.

    Args:
        q: Query mapping from the YAML file; ``id``, ``category`` and ``query``
            are required, ``targets_g2``, ``targets_g3`` and ``notes`` optional.
        variant: The ``corpus_variant`` value the request was made with.
        data: Decoded JSON body of the response (accessed via ``.get``).
        latency_ms: Wall-clock request time in milliseconds.

    Returns:
        A dict with the query metadata, the answer, trimmed citations, the
        derived target-hit flags, and the response fields copied verbatim.
    """
    g2 = set(q.get("targets_g2") or [])
    raw_cits = data.get("citations") or []
    cits = trim_citations(raw_cits)
    # When the response does not list used citations, fall back to every
    # citation that carries a truthy number `n`.
    used = data.get("used_citations") or [c.get("n") for c in raw_cits if c.get("n")]
    used_set = set(used)
    cited_docs = {c["doc_id"] for c in cits if c.get("doc_id")}
    # True when a citation from a target doc is among the used citations.
    target_span_used = any(
        (c.get("doc_id") in g2) and (c.get("n") in used_set) for c in cits
    )
    ans = data.get("ai_answer") or ""
    return {
        "q_id": q["id"], "category": q["category"], "query": q["query"],
        "variant": variant, "targets_g2": sorted(g2), "targets_g3": q.get("targets_g3"),
        "answer_hint": q.get("notes"),
        "ai_answer": ans, "answer_len_chars": len(ans),
        "evidence": cits, "used_citations": sorted(used_set),
        "cited_docs": sorted(cited_docs),
        "target_doc_present": bool(cited_docs & g2),
        "target_span_used_in_citation": target_span_used,
        "synthesis_status": data.get("synthesis_status"),
        "completeness": data.get("completeness"),
        "covered_aspects": data.get("covered_aspects"),
        "missing_aspects": data.get("missing_aspects"),
        "confidence": data.get("confidence"),
        "refused": data.get("refused"),
        "backend_used": data.get("backend_used"),
        "synthesis_ms": data.get("synthesis_ms"),
        "latency_ms": round(latency_ms, 1),
        "debug": data.get("debug"),
    }


def main():
    """Call /api/search/ask for every query × variant and write the results as JSONL.

    The bearer token is read from the ``DOCSRV_TOKEN`` environment variable.
    The output path is ``sys.argv[1]`` when given, otherwise
    ``reports/passage_rag_capture.jsonl``. Queries come from the ``queries``
    list of the YAML file at ``SUBSET``. A failed request is printed and stored
    as a minimal error record (``q_id``, ``variant``, ``error``), and the run
    continues with the next request.

    Raises:
        KeyError: If ``DOCSRV_TOKEN`` is unset or a query lacks a required key.
    """
    token = os.environ["DOCSRV_TOKEN"]
    out_path = sys.argv[1] if len(sys.argv) > 1 else "reports/passage_rag_capture.jsonl"
    # Create the output directory before any request is made, so a missing
    # folder cannot discard a completed capture at write time.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8 and a context manager: the query file may contain
    # non-ASCII text and the handle must be closed.
    with open(SUBSET, encoding="utf-8") as fh:
        queries = yaml.safe_load(fh)["queries"]
    headers = {"Authorization": f"Bearer {token}"}
    recs = []
    with httpx.Client(timeout=90.0) as cli:
        for q in queries:
            for variant in VARIANTS:
                params = {"q": q["query"], "debug": "true",
                          "corpus_variant": variant, "exact_knn": "true"}
                t0 = time.perf_counter()
                try:
                    r = cli.get(f"{BASE}/api/search/ask", headers=headers, params=params)
                    # Latency is taken before status/JSON handling.
                    lat = (time.perf_counter() - t0) * 1000
                    r.raise_for_status()
                    data = r.json()
                except Exception as exc:
                    # Deliberate best-effort: one failed request must not abort the run.
                    print(f"{q['id']}/{variant}: {type(exc).__name__}: {repr(exc)[:120]}", flush=True)
                    recs.append({"q_id": q["id"], "variant": variant, "error": f"{type(exc).__name__}"})
                    continue
                rec = _build_record(q, variant, data, lat)
                recs.append(rec)
                print(f"{q['id']}/{variant}: ans_len={rec['answer_len_chars']} cited={rec['cited_docs']} "
                      f"tgt_present={rec['target_doc_present']} tgt_used={rec['target_span_used_in_citation']} "
                      f"status={rec['synthesis_status']}", flush=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be UTF-8 rather than the locale default.
    with open(out_path, "w", encoding="utf-8") as f:
        for rec in recs:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"\nwrote {len(recs)} records → {out_path}")
# Run the capture only when executed as a script, not when imported.
if __name__ == "__main__":
    main()