#!/usr/bin/env python3
"""c4: conditional subset + objective signals + anonymized 3-file split.

Reads the passage-RAG capture (one JSON record per line), keeps only the
questions for which both variants have ``target_doc_present`` set and both
produced a completed, non-empty answer, prints aggregate signals for that
subset, and writes two files: the anonymized A/B pairs (JSONL) and the key
that maps each pair back to its question id and variants (JSON).
"""
import json
import os
import random

# All relative paths below are resolved against this directory.
REPO_DIR = "~/Documents/code/hyungi_Document_Server"
CAPTURE_PATH = "reports/passage_rag_capture_2026-05-25.jsonl"
PAIRS_PATH = "reports/passage_rag_judge_pairs_2026-05-25.jsonl"
KEY_PATH = "reports/passage_rag_judge_key_2026-05-25.json"

# The two variants being compared, in reporting order.
VARIANTS = ("prehier", "hier_sim_clean")


def nonempty(r: dict) -> bool:
    """Return True when *r* has a positive answer length and completed synthesis."""
    return (r.get("answer_len_chars") or 0) > 0 and r.get("synthesis_status") == "completed"


def halluc(r: dict) -> int:
    """Return the number of entries in ``debug.hallucination_flags`` (0 if absent)."""
    return len((r.get("debug") or {}).get("hallucination_flags") or [])


def _grounding(r: dict) -> dict:
    """Return ``debug.defense_layers.grounding`` of *r*, or ``{}`` if any level is missing/None."""
    return ((r.get("debug") or {}).get("defense_layers") or {}).get("grounding") or {}


def grounding_weak(r: dict) -> int:
    """Return the number of entries in the grounding layer's ``weak`` list."""
    return len(_grounding(r).get("weak") or [])


def grounding_strong(r: dict) -> int:
    """Return the number of entries in the grounding layer's ``strong`` list."""
    return len(_grounding(r).get("strong") or [])


def spans(r: dict) -> list:
    """Return the truthy ``span_text`` values of the record's ``evidence`` entries."""
    return [e.get("span_text") for e in (r.get("evidence") or []) if e.get("span_text")]


def load_records(path: str) -> list:
    """Load a JSONL file into a list of dicts.

    The file is read as UTF-8 and closed on return. Blank lines are skipped so
    a stray empty line does not abort the run.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def group_by_question(recs: list) -> dict:
    """Index records as ``{q_id: {variant: record}}``.

    A later record with the same ``(q_id, variant)`` replaces an earlier one.
    """
    by = {}
    for r in recs:
        by.setdefault(r["q_id"], {})[r["variant"]] = r
    return by


def select_conditional(by: dict) -> tuple:
    """Split question ids into the conditional subset and the excluded ones.

    A question is kept only if both variants exist, both have a truthy
    ``target_doc_present`` and both pass :func:`nonempty`.

    Returns:
        ``(cond, excluded)`` where ``cond`` is a list of kept q_ids (in the
        iteration order of *by*) and ``excluded`` is a list of
        ``(q_id, reason)`` tuples.
    """
    cond, excluded = [], []
    for qid, vs in by.items():
        p, h = vs.get("prehier"), vs.get("hier_sim_clean")
        if not p or not h:
            excluded.append((qid, "missing variant"))
            continue
        if not (p["target_doc_present"] and h["target_doc_present"]):
            excluded.append(
                (qid, f"tgt_present p={p['target_doc_present']} h={h['target_doc_present']}")
            )
            continue
        if not (nonempty(p) and nonempty(h)):
            excluded.append(
                (
                    qid,
                    f"empty/skip p={p.get('answer_len_chars')}/{p.get('synthesis_status')} "
                    f"h={h.get('answer_len_chars')}/{h.get('synthesis_status')}",
                )
            )
            continue
        cond.append(qid)
    return cond, excluded


def format_signals(variant: str, rs: list) -> str:
    """Return the one-line summary of objective signals for *variant* over *rs*.

    ``avg_answer_len`` is the floor of the mean answer length; it is reported
    as 0 for an empty *rs* instead of dividing by zero.
    """
    avg_len = sum(r["answer_len_chars"] for r in rs) // len(rs) if rs else 0
    levels = [r.get("completeness") for r in rs]
    return (
        f" {variant}: halluc_flags={sum(halluc(r) for r in rs)} "
        f"grounding_weak={sum(grounding_weak(r) for r in rs)} "
        f"grounding_strong={sum(grounding_strong(r) for r in rs)} "
        f"avg_answer_len={avg_len} "
        f"completeness={levels.count('full')}full/"
        f"{levels.count('partial')}part/"
        f"{levels.count('insufficient')}insuf "
        f"refused={sum(1 for r in rs if r.get('refused'))}"
    )


def build_pairs(by: dict, cond: list, seed: int = 42) -> tuple:
    """Build the anonymized A/B pairs and their de-anonymization key.

    Questions are processed in sorted q_id order and one random draw is taken
    per question, so for a given *seed* and subset the A/B assignment is
    reproducible.

    Returns:
        ``(pairs, key)``: ``pairs`` is a list of dicts holding the question,
        both answers and their evidence spans under the labels A and B;
        ``key`` maps each ``pair_id`` to its ``q_id`` and the variant behind
        each label.
    """
    rng = random.Random(seed)
    pairs, key = [], {}
    for i, qid in enumerate(sorted(cond)):
        p, h = by[qid]["prehier"], by[qid]["hier_sim_clean"]
        swap = rng.random() < 0.5
        a, b = (h, p) if swap else (p, h)
        pid = f"pair_{i+1:02d}"
        pairs.append({
            "pair_id": pid,
            # The question text is always taken from the prehier record.
            "question": p["query"],
            "answer_A": a["ai_answer"],
            "evidence_A": spans(a),
            "answer_B": b["ai_answer"],
            "evidence_B": spans(b),
        })
        key[pid] = {"q_id": qid, "A": a["variant"], "B": b["variant"]}
    return pairs, key


def main() -> None:
    """Run the full pipeline: load, filter, report, and write the pair/key files."""
    os.chdir(os.path.expanduser(REPO_DIR))
    by = group_by_question(load_records(CAPTURE_PATH))

    # conditional subset: both variants have target_doc_present AND both produced an answer
    cond, excluded = select_conditional(by)
    print(f"=== CONDITIONAL SUBSET (둘 다 tgt_present + non-empty) N={len(cond)} ===")
    print(" ", sorted(cond))
    print(f"=== EXCLUDED {len(excluded)} ===")
    for qid, why in sorted(excluded):
        print(f" {qid}: {why}")

    print(f"\n=== OBJECTIVE SIGNALS on conditional subset (N={len(cond)}) ===")
    for v in VARIANTS:
        print(format_signals(v, [by[q][v] for q in cond]))

    # anonymized split (conditional only); the key file is kept separate from the pairs
    pairs, key = build_pairs(by, cond)
    # Explicit UTF-8: ensure_ascii=False emits non-ASCII text verbatim.
    with open(PAIRS_PATH, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(pr, ensure_ascii=False) + "\n" for pr in pairs)
    with open(KEY_PATH, "w", encoding="utf-8") as f:
        json.dump(key, f, ensure_ascii=False, indent=2)
    print(f"\nwrote {len(pairs)} anonymized pairs → passage_rag_judge_pairs_2026-05-25.jsonl")
    print("wrote key → passage_rag_judge_key_2026-05-25.json (judge 미제공)")


if __name__ == "__main__":
    main()