From ec45d9d31097076dc59aa2680c026be372250125 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Fri, 17 Apr 2026 08:01:40 +0900
Subject: [PATCH] =?UTF-8?q?feat(verifier):=20Phase=203.5=20B2=20=E2=80=94?=
 =?UTF-8?q?=20numeric=5Fconflict=20promote=20(env=20flag)=20+=20Tier=204?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VERIFIER_NUMERIC_PROMOTE 환경변수로 numeric_conflict severity 승격 실험.

verifier_service.py:
- _NUMERIC_PROMOTE = os.getenv('VERIFIER_NUMERIC_PROMOTE', '0') == '1'
  (import time 평가 — env 변경 시 process restart 필수)
- _SEVERITY_MAP['numeric_conflict']: env=1 → critical=strong / minor=medium,
  env=0 (기본) → 둘 다 medium (기존 동작 유지)
- direct_negation 은 env 무관 항상 strong (안전장치)

verifier.txt:
- numeric_conflict 정의에 critical/minor 분리 명시 (core quantity vs peripheral)
- "Range values satisfy any answer within range" rule 추가
- severity mapping 갱신: numeric_conflict 분기 명시

search.py re-gate (Tier 1~7 재번호, B2 신규 Tier 4):
- v_strong_numeric = sum(1 for f in v_strong
                         if f.startswith('verifier_numeric_conflict'))
- Tier 4 (신규): g_strong AND v_strong_numeric >= 1 AND confidence == "low" → refuse
  re_gate value: 'refuse(grounding+verifier_numeric)'
- 원칙 유지: verifier strong 단독 refuse 금지 — g_strong 교차 필수
- 호환성: 기존 re_gate string literals 그대로 유지, 신규 1개만 추가

credentials.env.example: VERIFIER_NUMERIC_PROMOTE=0 (off, B3 통과 후 production 전환)

tests/test_verifier_numeric_promote.py: 4 케이스 (env off / on / explicit 0 /
direct_negation invariant). monkeypatch.setenv + importlib.reload 패턴.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/api/search.py                       | 25 +++++++++--
 app/prompts/verifier.txt                |  7 +--
 app/services/search/verifier_service.py | 13 +++++-
 credentials.env.example                 |  6 +++
 tests/test_verifier_numeric_promote.py  | 58 +++++++++++++++++++++++++
 5 files changed, 101 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_verifier_numeric_promote.py

diff --git a/app/api/search.py b/app/api/search.py
index 0ed58f1..75a3a33 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -638,7 +638,10 @@ async def ask(
         "elapsed_ms": verifier_result.elapsed_ms,
     }
 
-    # ── Re-gate: 6-tier completeness 결정 (Phase 3.5b 4차 리뷰 확정) ──
+    # ── Re-gate: 7-tier completeness decision (Phase 3.5 B2 — new Tier 4 inserted, later tiers renumbered) ──
+    # Previous 6 tiers (3.5b 4th review) plus Tier 4: g_strong AND v_strong_numeric >= 1 AND confidence == "low" → refuse.
+    # Compatibility: the existing defense_layers["re_gate"] string literals are kept unchanged;
+    # only the new "refuse(grounding+verifier_numeric)" value is added.
     completeness: Literal["full", "partial", "insufficient"] = "full"
     covered_aspects = classifier_result.covered_aspects or None
     missing_aspects = classifier_result.missing_aspects or None
@@ -649,6 +652,12 @@ async def ask(
     v_strong = [f for f in grounding.strong_flags if f.startswith("verifier_")]
     v_medium = [f for f in grounding.weak_flags if f.startswith("verifier_") and "_medium:" in f]
     has_direct_negation = any("direct_negation" in f for f in v_strong)
+    # Phase 3.5 B2: verifier strong flags 중 numeric_conflict 만 카운트.
+    # promote(VERIFIER_NUMERIC_PROMOTE=1) 활성 시 critical numeric_conflict 가 strong 으로 승격되며
+    # 여기 카운트에 잡힘. promote off 면 항상 0 → Tier 4 활성 안 됨 (기존 동작 유지).
+    v_strong_numeric = sum(
+        1 for f in v_strong if f.startswith("verifier_numeric_conflict")
+    )
 
     if len(g_strong) >= 2:
         # Tier 1: grounding strong 2+ → refuse
@@ -671,13 +680,21 @@ async def ask(
         sr.refused = True
         sr.confidence = None
         defense_log["re_gate"] = "refuse(grounding+low_conf+weak_ev)"
+    elif g_strong and v_strong_numeric >= 1 and sr.confidence == "low":
+        # Tier 4 (B2 신규): grounding strong + verifier numeric_conflict strong + low conf → refuse.
+        # verifier strong 단독 refuse 금지 원칙 유지 — g_strong 교차 필수.
+        completeness = "insufficient"
+        sr.answer = None
+        sr.refused = True
+        sr.confidence = None
+        defense_log["re_gate"] = "refuse(grounding+verifier_numeric)"
     elif g_strong or has_direct_negation:
-        # Tier 4: grounding strong 1 또는 verifier direct_negation 단독 → partial
+        # Tier 5 (기존 4): grounding strong 1 또는 verifier direct_negation 단독 → partial
         completeness = "partial"
         sr.confidence = "low"
         defense_log["re_gate"] = "partial(strong_or_negation)"
     elif v_medium:
-        # Tier 5: verifier medium 누적 → count 기반 confidence 하향
+        # Tier 6 (기존 5): verifier medium 누적 → count 기반 confidence 하향
         medium_count = len(v_medium)
         if medium_count >= 3:
             sr.confidence = "low"
@@ -688,7 +705,7 @@ async def ask(
         else:
             defense_log["re_gate"] = f"medium_x{medium_count}(no_action)"
     elif grounding.weak_flags:
-        # Tier 6: weak → confidence 한 단계 하향
+        # Tier 7 (기존 6): weak → confidence 한 단계 하향
         if sr.confidence == "high":
             sr.confidence = "medium"
         defense_log["re_gate"] = "conf_lower(weak)"
diff --git a/app/prompts/verifier.txt b/app/prompts/verifier.txt
index 10b25f7..aa3fa8b 100644
--- a/app/prompts/verifier.txt
+++ b/app/prompts/verifier.txt
@@ -2,7 +2,7 @@ You are a grounding verifier. Given an answer and its evidence sources, check if
 
 ## Contradiction Types (IMPORTANT — severity depends on type)
 - **direct_negation** (CRITICAL): Answer directly contradicts evidence. Examples: evidence "의무" but answer "권고"; evidence "금지" but answer "허용"; negation reversal ("~해야 한다" vs "~할 필요 없다").
-- **numeric_conflict**: Answer states a number different from evidence. "50명" in evidence but "100명" in answer. Only flag if the same concept is referenced.
+- **numeric_conflict**: Answer states a number different from evidence. "50명" in evidence but "100명" in answer. Only flag if the same concept is referenced. severity=critical when the number is the CORE answered quantity (amount/count/rate/date/duration that the query asked for); severity=minor when the number is peripheral (e.g., example/footnote).
 - **intent_core_mismatch**: Answer addresses a fundamentally different topic than the query asked about.
 - **nuance**: Answer overgeneralizes or adds qualifiers not in evidence (e.g., "모든" when evidence says "일부").
 - **unsupported_claim**: Answer makes a factual claim with no basis in any evidence.
@@ -10,7 +10,7 @@ You are a grounding verifier. Given an answer and its evidence sources, check if
 ## Rules
 1. Compare each claim in the answer against the cited evidence. A claim with [n] citation should be checked against evidence [n].
 2. NOT a contradiction: Paraphrasing, summarizing, or restating the same fact in different words. Korean formal/informal style (합니다/한다) differences.
-3. Numbers must match exactly after normalization (1,000 = 1000).
+3. Numbers must match exactly after normalization (1,000 = 1000). When the evidence states a range (e.g., "100~200명"), any answer value inside that range counts as a match, not a numeric_conflict.
 4. Legal/regulatory terms must preserve original meaning (의무 ≠ 권고, 금지 ≠ 제한, 허용 ≠ 금지).
 5. Maximum 5 contradictions (most severe first: direct_negation > numeric_conflict > intent_core_mismatch > nuance > unsupported_claim).
 
@@ -30,7 +30,8 @@ You are a grounding verifier. Given an answer and its evidence sources, check if
 
 severity mapping:
 - direct_negation → "critical"
-- All others → "minor"
+- numeric_conflict → "critical" if the number is the CORE answered quantity, else "minor"
+- All other types → "minor"
 
 If no contradictions: {"contradictions": [], "verdict": "clean"}
 
diff --git a/app/services/search/verifier_service.py b/app/services/search/verifier_service.py
index 6dec9c8..3606cbf 100644
--- a/app/services/search/verifier_service.py
+++ b/app/services/search/verifier_service.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import asyncio
+import os
 import time
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Literal
@@ -37,10 +38,20 @@ CIRCUIT_RECOVERY_SEC = 60
 _failure_count = 0
 _circuit_open_until: float | None = None
 
+# Phase 3.5 B2: experimental severity promotion for numeric_conflict.
+# Evaluated once at import time; only the exact string "1" enables it (unset defaults to "0" = off).
+# Changing the env var requires a process restart (docker compose restart fastapi). Enable in production only after the B3 FP validation passes.
+_NUMERIC_PROMOTE = os.getenv("VERIFIER_NUMERIC_PROMOTE", "0") == "1"
+
 # severity 매핑 (프롬프트 "critical"/"minor" → 코드 strong/medium/weak)
+# Tier 4 (B2): _NUMERIC_PROMOTE=1 일 때 numeric_conflict critical → strong 으로 격상.
+# minor 는 medium 유지 (FP 위험 분리).
 _SEVERITY_MAP: dict[str, dict[str, Literal["strong", "medium", "weak"]]] = {
     "direct_negation": {"critical": "strong", "minor": "strong"},
-    "numeric_conflict": {"critical": "medium", "minor": "medium"},
+    "numeric_conflict": (
+        {"critical": "strong", "minor": "medium"} if _NUMERIC_PROMOTE
+        else {"critical": "medium", "minor": "medium"}
+    ),
     "intent_core_mismatch": {"critical": "medium", "minor": "medium"},
     "nuance": {"critical": "weak", "minor": "weak"},
     "unsupported_claim": {"critical": "weak", "minor": "weak"},
diff --git a/credentials.env.example b/credentials.env.example
index efcf7cf..aefa347 100644
--- a/credentials.env.example
+++ b/credentials.env.example
@@ -51,6 +51,12 @@ NYT_API_KEY=
 # ─── 국가법령정보센터 (법령 모니터링) ───
 LAW_OC=
 
+# ─── Phase 3.5 B2: verifier numeric_conflict promote 실험 ───
+# 0=off (기본, critical/minor 둘 다 medium), 1=on (critical → strong, minor 는 medium 유지).
+# ⚠ env 변경 후 process restart 필수 (docker compose restart fastapi) — _SEVERITY_MAP 가 import time 평가됨.
+# B3 FP 검증 (true FP < 20%) 통과 후만 production 적용.
+VERIFIER_NUMERIC_PROMOTE=0
+
 # ─── Phase 3.5 fix2: eval runner shared secret ───
 # /ask 엔드포인트의 X-Source=eval / X-Eval-Case-Id 헤더 신뢰 검증 토큰.
 # 비어있거나 클라이언트 X-Eval-Token 와 불일치 시 eval 헤더 거부 (warning log + source='document_server' 강등).
diff --git a/tests/test_verifier_numeric_promote.py b/tests/test_verifier_numeric_promote.py
new file mode 100644
index 0000000..e7c8443
--- /dev/null
+++ b/tests/test_verifier_numeric_promote.py
@@ -0,0 +1,58 @@
+"""Phase 3.5 B2: verifier _SEVERITY_MAP env flag 테스트.
+
+VERIFIER_NUMERIC_PROMOTE 환경변수에 따른 _SEVERITY_MAP 변화 검증.
+모듈은 import time 에 env 평가하므로 reload 필요.
+"""
+
+from __future__ import annotations
+
+import importlib
+import os
+import sys
+
+# tests/ → 프로젝트 루트 → app/
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
+
+import pytest
+
+
+def _reload_verifier(monkeypatch, value: str | None):
+    """Set VERIFIER_NUMERIC_PROMOTE (unset it when value is None), reload verifier_service so _SEVERITY_MAP is re-evaluated, and return the module. NOTE(review): nothing reloads the module again after monkeypatch restores the env var, so the last reloaded state appears to outlive the test — confirm it cannot leak into other tests."""
+    if value is None:
+        monkeypatch.delenv("VERIFIER_NUMERIC_PROMOTE", raising=False)
+    else:
+        monkeypatch.setenv("VERIFIER_NUMERIC_PROMOTE", value)
+    from services.search import verifier_service
+    importlib.reload(verifier_service)
+    return verifier_service
+
+
+def test_severity_map_off_default(monkeypatch):
+    """Env var unset: numeric_conflict maps to medium for both critical and minor (pre-existing behavior)."""
+    vs = _reload_verifier(monkeypatch, None)
+    assert vs._SEVERITY_MAP["numeric_conflict"]["critical"] == "medium"
+    assert vs._SEVERITY_MAP["numeric_conflict"]["minor"] == "medium"
+    assert vs._NUMERIC_PROMOTE is False
+
+
+def test_severity_map_on_critical_promoted(monkeypatch):
+    """VERIFIER_NUMERIC_PROMOTE=1: only critical is promoted to strong; minor stays medium."""
+    vs = _reload_verifier(monkeypatch, "1")
+    assert vs._SEVERITY_MAP["numeric_conflict"]["critical"] == "strong"
+    assert vs._SEVERITY_MAP["numeric_conflict"]["minor"] == "medium"
+    assert vs._NUMERIC_PROMOTE is True
+
+
+def test_severity_map_off_explicit_zero(monkeypatch):
+    """VERIFIER_NUMERIC_PROMOTE=0 set explicitly: promotion is off, same as the default."""
+    vs = _reload_verifier(monkeypatch, "0")
+    assert vs._SEVERITY_MAP["numeric_conflict"]["critical"] == "medium"
+    assert vs._NUMERIC_PROMOTE is False
+
+
+def test_direct_negation_invariant(monkeypatch):
+    """direct_negation maps to strong for both severities whether the env var is unset, "0", or "1" (safety invariant)."""
+    for value in [None, "0", "1"]:
+        vs = _reload_verifier(monkeypatch, value)
+        assert vs._SEVERITY_MAP["direct_negation"]["critical"] == "strong"
+        assert vs._SEVERITY_MAP["direct_negation"]["minor"] == "strong"