9c039139ef
PR-DocSrv-Hier-PassageRAG-Diagnose-1 c3. 22Q x {prehier,hier_sim_clean} /ask?debug=true
exact_knn capture (44 rec). ai_answer/evidence/target_doc_present/target_span_used/
objective signals(hallucination/grounding/completeness/refused) 박제.
관찰: hier 일부 타깃 retrieval 실패(exam_005/006,cl_007=doc-search NO-GO 일관) + 일부 gain
(cl_001/002). empty-answer 케이스(cl_005/cl_007 prehier, cl_006/exam_004 skipped) 존재.
JWT 15min 만료로 1차 부분실패 → cache-warm 재실행 완주.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
78 lines
3.9 KiB
Python
78 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
||
"""PR-DocSrv-Hier-PassageRAG-Diagnose-1 c3 — /ask capture for prehier vs hier_sim_clean.

Calls /ask once per query × variant and persists each response as a JSONL record (raw input for the judge). EVAL-ONLY."""
|
||
import json, sys, time, os
|
||
import httpx, yaml
|
||
|
||
# Base URL of the document server the capture runs against. The original
# hard-coded host stays the default; set DOCSRV_BASE_URL to target another
# deployment. A trailing slash is stripped so f"{BASE}/api/..." never yields "//".
BASE = os.environ.get("DOCSRV_BASE_URL", "http://100.110.63.63:8000").rstrip("/")

# Corpus variants requested for every query (sent as the `corpus_variant` parameter).
VARIANTS = ["prehier", "hier_sim_clean"]

# YAML file containing the evaluation queries (read through its top-level `queries` key).
SUBSET = "tests/search_eval/queries_passage_rag.yaml"
|
||
|
||
def trim_citations(
    cits,
    keys=("n", "doc_id", "chunk_id", "section_title", "span_text", "rerank_score"),
):
    """Project each citation onto a fixed set of fields.

    Args:
        cits: Iterable of citation mappings (objects exposing ``.get``), or a
            falsy value such as ``None`` / ``[]`` when there are no citations.
        keys: Field names to keep, in output order. Defaults to the fields
            this capture script stores as evidence.

    Returns:
        A new list with one dict per citation containing exactly ``keys``;
        a field missing from a citation is recorded as ``None``.
    """
    return [{key: cit.get(key) for key in keys} for cit in cits or []]
|
||
|
||
def _build_record(q, variant, data, latency_ms):
    """Assemble one capture record from a successful /ask response.

    Args:
        q: Query entry from the YAML file; ``id``, ``category`` and ``query``
            are required, ``targets_g2`` / ``targets_g3`` / ``notes`` optional.
        variant: Corpus variant the request was issued with.
        data: Decoded JSON body of the response.
        latency_ms: Wall-clock request latency in milliseconds.

    Returns:
        A JSON-serialisable dict holding the answer, trimmed evidence,
        target-hit flags and the response's own signal fields.
    """
    g2 = set(q.get("targets_g2") or [])
    cits = trim_citations(data.get("citations"))
    # Fall back to every numbered citation when the response does not report
    # which ones were used.
    # NOTE(review): an explicitly empty `used_citations` list also triggers
    # this fallback, so all numbered citations then count as "used" — confirm
    # that is intended for empty/refused answers.
    used = data.get("used_citations") or [
        c.get("n") for c in (data.get("citations") or []) if c.get("n")
    ]
    used_set = set(used or [])
    cited_docs = {c["doc_id"] for c in cits if c.get("doc_id")}
    target_doc_present = bool(cited_docs & g2)
    # True when a citation from a target doc is among the used citations.
    target_span_used = any(
        (c.get("doc_id") in g2) and (c.get("n") in used_set) for c in cits
    )
    ans = data.get("ai_answer") or ""
    return {
        "q_id": q["id"], "category": q["category"], "query": q["query"],
        "variant": variant, "targets_g2": sorted(g2), "targets_g3": q.get("targets_g3"),
        "answer_hint": q.get("notes"),
        "ai_answer": ans, "answer_len_chars": len(ans),
        "evidence": cits, "used_citations": sorted(used_set),
        "cited_docs": sorted(cited_docs),
        "target_doc_present": target_doc_present,
        "target_span_used_in_citation": target_span_used,
        "synthesis_status": data.get("synthesis_status"),
        "completeness": data.get("completeness"),
        "covered_aspects": data.get("covered_aspects"),
        "missing_aspects": data.get("missing_aspects"),
        "confidence": data.get("confidence"),
        "refused": data.get("refused"),
        "backend_used": data.get("backend_used"),
        "synthesis_ms": data.get("synthesis_ms"),
        "latency_ms": round(latency_ms, 1),
        "debug": data.get("debug"),
    }


def main():
    """Call /api/search/ask for every query × variant and dump the results as JSONL.

    Reads the bearer token from the ``DOCSRV_TOKEN`` environment variable and
    the output path from ``sys.argv[1]`` (default
    ``reports/passage_rag_capture.jsonl``). A failed request is logged and
    stored as an error record; the run continues with the next variant.

    Raises:
        KeyError: if ``DOCSRV_TOKEN`` is not set.
    """
    token = os.environ["DOCSRV_TOKEN"]
    out_path = sys.argv[1] if len(sys.argv) > 1 else "reports/passage_rag_capture.jsonl"

    # Create the output directory up front: discovering it is missing only
    # after all requests have completed would discard every captured response.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII query text loads the same on every locale,
    # and a context manager so the handle is closed.
    with open(SUBSET, encoding="utf-8") as fh:
        queries = yaml.safe_load(fh)["queries"]

    auth_headers = {"Authorization": f"Bearer {token}"}
    recs = []
    with httpx.Client(timeout=90.0) as cli:
        for q in queries:
            for variant in VARIANTS:
                params = {"q": q["query"], "debug": "true",
                          "corpus_variant": variant, "exact_knn": "true"}
                t0 = time.perf_counter()
                try:
                    r = cli.get(f"{BASE}/api/search/ask",
                                headers=auth_headers, params=params)
                    lat = (time.perf_counter() - t0) * 1000
                    r.raise_for_status()
                    data = r.json()
                except Exception as exc:
                    # Deliberately broad: any transport, HTTP-status or decode
                    # failure becomes an error record instead of aborting the run.
                    print(f" ✗ {q['id']}/{variant}: {type(exc).__name__}: {repr(exc)[:120]}", flush=True)
                    recs.append({
                        "q_id": q["id"], "variant": variant,
                        "error": f"{type(exc).__name__}",
                        # The class name alone cannot tell e.g. a 401 from a 500.
                        "error_detail": repr(exc)[:200],
                    })
                    continue

                rec = _build_record(q, variant, data, lat)
                recs.append(rec)
                print(f" ✓ {q['id']}/{variant}: ans_len={rec['answer_len_chars']} cited={rec['cited_docs']} "
                      f"tgt_present={rec['target_doc_present']} tgt_used={rec['target_span_used_in_citation']} status={rec['synthesis_status']}", flush=True)

    # ensure_ascii=False emits raw non-ASCII text, so the file must be UTF-8
    # regardless of the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in recs)
    print(f"\nwrote {len(recs)} records → {out_path}")
|
||
|
||
# Run the capture only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|