def _detect_synthesis_failure(sr: SynthesisResult) -> str | None:
    """Return a re-gate label when synthesis produced no usable answer, else None.

    Checks run in this order (Phase 3.5 fix3):

    1. ``sr.refused`` is truthy:
       - status ``"completed"`` -> ``"synthesis_self_refuse"`` (the LLM itself
         declined to answer);
       - any other status -> ``"synthesis_failed(<status>)"`` (a refusal
         carried along with a mechanical failure).
    2. status is ``"timeout"``, ``"parse_failed"`` or ``"llm_error"``
       -> ``"synthesis_failed(<status>)"``.
    3. the answer is ``None``, empty, or whitespace only
       -> ``"synthesis_failed(<status>)"``.
    4. otherwise ``None`` is returned.

    Args:
        sr: Synthesis result; only ``refused``, ``status`` and ``answer`` are read.

    Returns:
        The failure label, or ``None`` when none of the checks fire.
    """
    # A refusal on an otherwise completed run gets its own label.
    if sr.refused and sr.status == "completed":
        return "synthesis_self_refuse"

    # ``or`` short-circuits, so the answer text is inspected only when the
    # result is neither refused nor in one of the mechanical-failure statuses.
    unusable = (
        sr.refused
        or sr.status in ("timeout", "parse_failed", "llm_error")
        or not (sr.answer or "").strip()
    )
    return f"synthesis_failed({sr.status})" if unusable else None
+ tier0_label = _detect_synthesis_failure(sr) + if tier0_label: + completeness = "insufficient" + sr.answer = None + sr.refused = True + sr.confidence = None + defense_log["re_gate"] = tier0_label + elif len(g_strong) >= 2: # Tier 1: grounding strong 2+ → refuse completeness = "insufficient" sr.answer = None @@ -733,7 +767,11 @@ async def ask( citations = _build_citations(evidence, sr.used_citations) no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr) if completeness == "insufficient" and not no_reason: - no_reason = "답변 검증에서 복수 오류 감지" + # Tier 0 경로: synthesis self-refuse 는 LLM 이 준 사유가 가장 정확. + if sr.refused and sr.refuse_reason: + no_reason = sr.refuse_reason + else: + no_reason = "답변 검증에서 복수 오류 감지" logger.info( "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s " diff --git a/tests/test_synthesis_failure_regate.py b/tests/test_synthesis_failure_regate.py new file mode 100644 index 0000000..f06577f --- /dev/null +++ b/tests/test_synthesis_failure_regate.py @@ -0,0 +1,123 @@ +"""Phase 3.5 fix3: re-gate Tier 0 — synthesis 자체 실패 처리. + +`_detect_synthesis_failure()` 단위 테스트. + +기존 버그: + synthesis LLM self-refuse (`sr.refused=True, status="completed"`) 또는 + timeout/parse_failed/llm_error 시 grounding/verifier flag 0건 → re-gate else clean + 분기로 빠져 `completeness="full"` 초기값이 남아 `full + refused=True` 모순. + baseline v1-400char 에서 24/223 (10.8%) 해당. 
# Tier 0 verdicts exercised by this module (Phase 3.5 fix3):
#   - LLM self-refuse (status "completed" + refused) -> "synthesis_self_refuse"
#   - mechanical failure (timeout / parse_failed / llm_error)
#       -> "synthesis_failed({status})"
#   - blank answer -> "synthesis_failed({status})"
#   - valid answer -> None (continues on the existing tier 1-7 path)

from __future__ import annotations

import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))

from api.search import _detect_synthesis_failure
from services.search.synthesis_service import SynthesisResult


def _sr(
    status: str = "completed",
    answer: str | None = "ok",
    refused: bool = False,
    refuse_reason: str | None = None,
) -> SynthesisResult:
    """Build a SynthesisResult; fields not exposed as parameters get fixed placeholders."""
    field_values = {
        "status": status,
        "answer": answer,
        "used_citations": [],
        "confidence": "low",
        "refused": refused,
        "refuse_reason": refuse_reason,
        "elapsed_ms": 100.0,
        "cache_hit": False,
    }
    return SynthesisResult(**field_values)  # type: ignore[arg-type]


# --- self-refuse cases ---------------------------------------------------


def test_llm_self_refuse_completed():
    """status "completed" with refused=True is labelled synthesis_self_refuse."""
    result = _sr(status="completed", answer=None, refused=True, refuse_reason="범위 밖")
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_self_refuse"


def test_llm_self_refuse_with_answer_still_refused():
    """refused=True takes the Tier 0 path even when an answer is present."""
    result = _sr(status="completed", answer="왜 답변함", refused=True)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_self_refuse"


# --- mechanical failure cases --------------------------------------------


def test_timeout():
    """A timeout status without refusal yields synthesis_failed(timeout)."""
    result = _sr(status="timeout", answer=None, refused=False)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(timeout)"


def test_parse_failed():
    """A parse_failed status without refusal yields synthesis_failed(parse_failed)."""
    result = _sr(status="parse_failed", answer=None, refused=False)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(parse_failed)"


def test_llm_error():
    """An llm_error status without refusal yields synthesis_failed(llm_error)."""
    result = _sr(status="llm_error", answer=None, refused=False)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(llm_error)"


def test_refused_with_mechanical_fail_propagates_status():
    """refused=True with a non-completed status uses the synthesis_failed({status}) form."""
    result = _sr(status="timeout", answer=None, refused=True)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(timeout)"


# --- empty answer cases --------------------------------------------------


def test_empty_answer_completed():
    """status "completed" with an empty answer yields synthesis_failed(completed)."""
    result = _sr(status="completed", answer="", refused=False)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(completed)"


def test_whitespace_only_answer():
    """An answer made only of spaces, tabs and newlines counts as empty."""
    result = _sr(status="completed", answer=" \n\t ", refused=False)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(completed)"


def test_none_answer_completed():
    """answer=None with status "completed" counts as a failure."""
    result = _sr(status="completed", answer=None, refused=False)
    label = _detect_synthesis_failure(result)
    assert label == "synthesis_failed(completed)"


# --- valid answer cases (None returned) ----------------------------------


def test_valid_answer_returns_none():
    """status "completed", a non-blank answer and refused=False pass Tier 0 (None)."""
    result = _sr(status="completed", answer="교육 시간은 매년 6시간 이상이다 [1].", refused=False)
    label = _detect_synthesis_failure(result)
    assert label is None


def test_skipped_status_with_answer_passes():
    """status "skipped" with an answer and refused=False does not trigger Tier 0.

    Per the original author's note, skipped results are expected to be handled
    by an earlier refusal gate and not to reach this check; if one did, it
    would be expected to carry refused=True. That premise is not verifiable
    from this test module alone.
    """
    result = _sr(status="skipped", answer="abc", refused=False)
    label = _detect_synthesis_failure(result)
    # Answer present and not refused, so no Tier 0 label is produced.
    assert label is None