feat(search): passage-RAG capture runner + raw JSONL (diagnose c3)

PR-DocSrv-Hier-PassageRAG-Diagnose-1 c3. 22Q x {prehier,hier_sim_clean} /ask?debug=true
exact_knn capture (44 rec). ai_answer/evidence/target_doc_present/target_span_used/
objective signals(hallucination/grounding/completeness/refused) 박제.
관찰: hier 일부 타깃 retrieval 실패(exam_005/006,cl_007=doc-search NO-GO 일관) + 일부 gain
(cl_001/002). empty-answer 케이스(cl_005/cl_007 prehier, cl_006/exam_004 skipped) 존재.
JWT 15min 만료로 1차 부분실패 → cache-warm 재실행 완주.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-05-25 06:53:11 +00:00
parent 698510bc0e
commit 9c039139ef
2 changed files with 121 additions and 0 deletions
File diff suppressed because one or more lines are too long
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""PR-DocSrv-Hier-PassageRAG-Diagnose-1 c3 — /ask capture for prehier vs hier_sim_clean.
각 Q × variant 호출 → JSONL 박제 (judge 입력 raw). EVAL-ONLY."""
import json, sys, time, os
import httpx, yaml
# Base URL of the service under test; main() requests f"{BASE}/api/search/ask".
BASE = "http://100.110.63.63:8000"
# Values sent, one request each, as the "corpus_variant" query parameter.
VARIANTS = ["prehier", "hier_sim_clean"]
# YAML file (path relative to the working directory) whose top-level
# "queries" list drives the capture loop in main().
SUBSET = "tests/search_eval/queries_passage_rag.yaml"
def trim_citations(cits):
    """Project each citation onto the fixed set of fields kept in the capture.

    Args:
        cits: Iterable of citation mappings (objects exposing ``.get``), or
            ``None`` / an empty sequence when there are no citations.

    Returns:
        A new list with one dict per input citation, in the same order. Each
        dict holds exactly the keys ``n``, ``doc_id``, ``chunk_id``,
        ``section_title``, ``span_text`` and ``rerank_score``; a key missing
        from the source citation maps to ``None``. Any other keys are dropped.
    """
    kept_keys = ("n", "doc_id", "chunk_id", "section_title", "span_text", "rerank_score")
    return [{key: cit.get(key) for key in kept_keys} for cit in cits or []]
def _build_record(q, variant, g2, data, latency_ms):
    """Assemble the JSONL record for one successful /ask response.

    Args:
        q: Query entry from the YAML file; ``id``, ``category`` and ``query``
            are required, ``targets_g3`` and ``notes`` are optional.
        variant: The ``corpus_variant`` value that was requested.
        g2: Set built from the query's ``targets_g2`` entries.
        data: Decoded JSON body of the response.
        latency_ms: Wall-clock time of the request in milliseconds.

    Returns:
        A dict ready to be serialised as one JSONL line.
    """
    cits = trim_citations(data.get("citations"))
    # Prefer the response's own list of used citation numbers; when it is
    # missing or empty, fall back to every citation that has a truthy "n".
    # NOTE(review): an explicitly empty used_citations therefore counts every
    # citation as used — confirm that is intended for empty/refused answers.
    used = data.get("used_citations") or [
        c.get("n") for c in (data.get("citations") or []) if c.get("n")
    ]
    cited_docs = {c["doc_id"] for c in cits if c.get("doc_id")}
    target_doc_present = bool(cited_docs & g2)
    # cited span from a target doc that actually appears in the answer (used citation)
    used_set = set(used or [])
    target_span_used = any(
        (c.get("doc_id") in g2) and (c.get("n") in used_set) for c in cits
    )
    ans = data.get("ai_answer") or ""
    return {
        "q_id": q["id"], "category": q["category"], "query": q["query"],
        "variant": variant, "targets_g2": sorted(g2), "targets_g3": q.get("targets_g3"),
        "answer_hint": q.get("notes"),
        "ai_answer": ans, "answer_len_chars": len(ans),
        "evidence": cits, "used_citations": sorted(used_set),
        "cited_docs": sorted(cited_docs),
        "target_doc_present": target_doc_present,
        "target_span_used_in_citation": target_span_used,
        "synthesis_status": data.get("synthesis_status"),
        "completeness": data.get("completeness"),
        "covered_aspects": data.get("covered_aspects"),
        "missing_aspects": data.get("missing_aspects"),
        "confidence": data.get("confidence"),
        "refused": data.get("refused"),
        "backend_used": data.get("backend_used"),
        "synthesis_ms": data.get("synthesis_ms"),
        "latency_ms": round(latency_ms, 1),
        "debug": data.get("debug"),
    }


def main():
    """Query /api/search/ask for every query × variant and write a JSONL capture.

    Reads the bearer token from the ``DOCSRV_TOKEN`` environment variable and
    the output path from ``sys.argv[1]`` (default
    ``reports/passage_rag_capture.jsonl``). Each request yields one record: a
    full record on success, or ``{"q_id", "variant", "error"}`` when the
    request, the status check or the JSON decoding raises.

    Records are streamed to ``<out_path>.partial`` and flushed one by one, so
    an interrupted run keeps what it already captured; the file is moved onto
    ``out_path`` only after the last request, so an earlier complete capture
    is never replaced by a partial one.

    Raises:
        KeyError: if ``DOCSRV_TOKEN`` is unset or a query lacks a required key.
        OSError: if the query file or the output location cannot be opened.
    """
    token = os.environ["DOCSRV_TOKEN"]
    out_path = sys.argv[1] if len(sys.argv) > 1 else "reports/passage_rag_capture.jsonl"
    # Explicit UTF-8: the query file and the answers contain non-ASCII text,
    # and the locale default encoding is not guaranteed to handle it.
    with open(SUBSET, encoding="utf-8") as src:
        queries = yaml.safe_load(src)["queries"]
    headers = {"Authorization": f"Bearer {token}"}
    partial_path = f"{out_path}.partial"
    written = 0
    # Opening the sink before the first request makes an unwritable output
    # location fail immediately instead of after the whole capture has run.
    with open(partial_path, "w", encoding="utf-8") as sink, \
            httpx.Client(timeout=90.0) as cli:
        for q in queries:
            g2 = set(q.get("targets_g2") or [])
            for variant in VARIANTS:
                params = {"q": q["query"], "debug": "true",
                          "corpus_variant": variant, "exact_knn": "true"}
                t0 = time.perf_counter()
                try:
                    r = cli.get(f"{BASE}/api/search/ask", headers=headers, params=params)
                    lat = (time.perf_counter() - t0) * 1000
                    r.raise_for_status()
                    data = r.json()
                except Exception as exc:
                    # Best-effort capture: log the failure, record it, move on.
                    print(f"{q['id']}/{variant}: {type(exc).__name__}: {repr(exc)[:120]}", flush=True)
                    rec = {"q_id": q["id"], "variant": variant, "error": f"{type(exc).__name__}"}
                else:
                    rec = _build_record(q, variant, g2, data, lat)
                    print(f"{q['id']}/{variant}: ans_len={rec['answer_len_chars']} cited={rec['cited_docs']} "
                          f"tgt_present={rec['target_doc_present']} tgt_used={rec['target_span_used_in_citation']} "
                          f"status={rec['synthesis_status']}", flush=True)
                sink.write(json.dumps(rec, ensure_ascii=False) + "\n")
                sink.flush()
                written += 1
    # Publish the finished capture in one step.
    os.replace(partial_path, out_path)
    print(f"\nwrote {written} records → {out_path}")
if __name__ == "__main__":
    # Run the capture only when executed as a script, never on import.
    main()