From 06443947bf4ec5088815755408ae0e502a31f909 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Fri, 10 Apr 2026 08:49:11 +0900
Subject: [PATCH] feat(ask): Phase 3.5a guardrails (classifier + refusal gate +
 grounding + partial)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

신규 파일:
- classifier_service.py: exaone binary classifier (sufficient/insufficient)
  parallel with evidence, circuit breaker, timeout 5s
- refusal_gate.py: multi-signal fusion (score + classifier)
  AND 조건, conservative fallback 3-tier (classifier 부재 시)
- grounding_check.py: strong/weak flag 분리
  strong: fabricated_number + intent_misalignment(important keywords)
  weak: uncited_claim + low_overlap + intent_misalignment(generic)
  re-gate: 2+ strong → refuse, 1 strong → partial
- sentence_splitter.py: regex 기반 (Phase 3.5b KSS 업그레이드)
- classifier.txt: exaone Y+ prompt (calibration examples 포함)
- search_synthesis_partial.txt: partial answer 전용 프롬프트
- 102_ask_events.sql: /ask 관측 테이블 (completeness 3-분리 지표)
- queries.yaml: Phase 3.5 smoke test 평가셋 10개

수정 파일:
- search.py /ask: classifier parallel + refusal gate + grounding re-gate
  + defense_layers 로깅 + AskResponse completeness/aspects/confirmed_items
- config.yaml: classifier model 섹션 (exaone3.5:7.8b GPU Ollama)
- config.py: classifier optional 파싱
- AskAnswer.svelte: 4분기 렌더 (full/partial/insufficient/loading)
- ask.ts: Completeness + ConfirmedItem 타입

P1 실측: exaone ternary 불안정 → binary gate 축소. partial은 grounding이 담당.
토론 9라운드 확정. plan: quiet-meandering-nova.md

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 app/api/search.py                             | 247 +++++++++++++++---
 app/core/config.py                            |   7 +
 app/prompts/classifier.txt                    |  33 +++
 app/prompts/search_synthesis_partial.txt      |  34 +++
 app/services/search/classifier_service.py     | 150 +++++++++++
 app/services/search/grounding_check.py        | 131 ++++++++++
 app/services/search/refusal_gate.py           | 105 ++++++++
 app/services/search/sentence_splitter.py      |  33 +++
 config.yaml                                   |   6 +
 .../src/lib/components/ask/AskAnswer.svelte   |  73 +++++-
 frontend/src/lib/types/ask.ts                 |  13 +
 migrations/102_ask_events.sql                 |  26 ++
 tests/phase3_5_eval/queries.yaml              |  58 ++++
 13 files changed, 869 insertions(+), 47 deletions(-)
 create mode 100644 app/prompts/classifier.txt
 create mode 100644 app/prompts/search_synthesis_partial.txt
 create mode 100644 app/services/search/classifier_service.py
 create mode 100644 app/services/search/grounding_check.py
 create mode 100644 app/services/search/refusal_gate.py
 create mode 100644 app/services/search/sentence_splitter.py
 create mode 100644 migrations/102_ask_events.sql
 create mode 100644 tests/phase3_5_eval/queries.yaml

diff --git a/app/api/search.py b/app/api/search.py
index 451293a..330d6b2 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -9,6 +9,7 @@
   - `/ask` endpoint wrapper (Phase 3.3 에서 추가)
 """
 
+import asyncio
 import time
 from typing import Annotated, Literal
 
@@ -20,8 +21,11 @@ from core.auth import get_current_user
 from core.database import get_session
 from core.utils import setup_logger
 from models.user import User
+from services.search.classifier_service import ClassifierResult, classify
 from services.search.evidence_service import EvidenceItem, extract_evidence
 from services.search.fusion_service import DEFAULT_FUSION
+from services.search.grounding_check import check as grounding_check
+from services.search.refusal_gate import RefusalDecision, decide as refusal_decide
 from services.search.search_pipeline import PipelineResult, run_search
 from services.search.synthesis_service import SynthesisResult, synthesize
 from services.search_telemetry import record_search_event
@@ -216,6 +220,14 @@ class Citation(BaseModel):
     rerank_score: float
 
 
+class ConfirmedItem(BaseModel):
+    """Partial answer 의 개별 aspect 답변."""
+
+    aspect: str
+    text: str
+    citations: list[int]
+
+
 class AskDebug(BaseModel):
     """`/ask?debug=true` 응답 확장."""
 
@@ -230,10 +242,12 @@ class AskDebug(BaseModel):
     synthesis_prompt_preview: str | None = None
     synthesis_raw_preview: str | None = None
     hallucination_flags: list[str] = []
+    # Phase 3.5a: per-layer defense 로깅
+    defense_layers: dict | None = None
 
 
 class AskResponse(BaseModel):
-    """`/ask` 응답. `/search` 의 SearchResult 는 그대로 재사용."""
+    """`/ask` 응답. Phase 3.5a: completeness + aspects 추가."""
 
     results: list[SearchResult]
     ai_answer: str | None
@@ -247,6 +261,11 @@ class AskResponse(BaseModel):
     no_results_reason: str | None
     query: str
     total: int
+    # Phase 3.5a
+    completeness: Literal["full", "partial", "insufficient"] = "full"
+    covered_aspects: list[str] | None = None
+    missing_aspects: list[str] | None = None
+    confirmed_items: list[ConfirmedItem] | None = None
     debug: AskDebug | None = None
 
 
@@ -355,73 +374,211 @@ async def ask(
     limit: int = Query(10, ge=1, le=20, description="synthesis 입력 상한"),
     debug: bool = Query(False, description="evidence/synthesis 중간 상태 노출"),
 ):
-    """근거 기반 AI 답변 (Phase 3.3).
+    """근거 기반 AI 답변 (Phase 3.5a).
 
-    `/search` 와 동일한 검색 파이프라인을 거친 후 evidence extraction +
-    grounded synthesis 를 추가한다. `mode`, `rerank`, `analyze` 는 품질 보장을
-    위해 강제 고정 (hybrid / True / True).
-
-    실패 경로(timeout/parse_failed/refused/...) 에서도 `results` 는 항상 반환.
+    Phase 3.3 기반 + classifier parallel + refusal gate + grounding re-gate.
+    실패 경로에서도 `results` 는 항상 반환.
     """
     t_total = time.perf_counter()
+    defense_log: dict = {}  # per-layer flag snapshot
 
-    # 1. 검색 파이프라인 (run_search — /search 와 동일 로직, 단일 진실 소스)
+    # 1. 검색 파이프라인
     pr = await run_search(
-        session,
-        q,
-        mode="hybrid",
-        limit=limit,
-        fusion=DEFAULT_FUSION,
-        rerank=True,
-        analyze=True,
+        session, q, mode="hybrid", limit=limit,
+        fusion=DEFAULT_FUSION, rerank=True, analyze=True,
     )
 
-    # 2. Evidence extraction (rule + LLM span select, 1 batched call)
+    # 2. Evidence + Classifier 병렬
     t_ev = time.perf_counter()
-    evidence, ev_skip = await extract_evidence(q, pr.results)
+    evidence_task = asyncio.create_task(extract_evidence(q, pr.results))
+
+    # classifier input: top 3 chunks meta + rerank scores
+    top_chunks = [
+        {
+            "title": r.title or "",
+            "section": r.section_title or "",
+            "snippet": (r.snippet or "")[:200],
+        }
+        for r in pr.results[:3]
+    ]
+    rerank_scores_top = [
+        r.rerank_score if r.rerank_score is not None else r.score
+        for r in pr.results[:3]
+    ]
+    classifier_task = asyncio.create_task(
+        classify(q, top_chunks, rerank_scores_top)
+    )
+
+    evidence, ev_skip = await evidence_task
     ev_ms = (time.perf_counter() - t_ev) * 1000
 
-    # 3. Grounded synthesis (gemma-4, 15s timeout, citation 검증)
+    # classifier await (timeout 보호 — classifier_service 내부에도 있지만 여기서 이중 보호)
+    try:
+        classifier_result = await asyncio.wait_for(classifier_task, timeout=6.0)
+    except (asyncio.TimeoutError, Exception):
+        classifier_result = ClassifierResult("timeout", None, [], [], 0.0)
+
+    defense_log["classifier"] = {
+        "status": classifier_result.status,
+        "verdict": classifier_result.verdict,
+        "covered_aspects": classifier_result.covered_aspects,
+        "missing_aspects": classifier_result.missing_aspects,
+        "elapsed_ms": classifier_result.elapsed_ms,
+    }
+
+    # 3. Refusal gate (multi-signal fusion)
+    all_rerank_scores = [
+        e.rerank_score for e in evidence
+    ] if evidence else rerank_scores_top
+    decision = refusal_decide(all_rerank_scores, classifier_result)
+
+    defense_log["score_gate"] = {
+        "max": max(all_rerank_scores) if all_rerank_scores else 0.0,
+        "agg_top3": sum(sorted(all_rerank_scores, reverse=True)[:3]),
+    }
+    defense_log["refusal"] = {
+        "refused": decision.refused,
+        "rule_triggered": decision.rule_triggered,
+    }
+
+    if decision.refused:
+        total_ms = (time.perf_counter() - t_total) * 1000
+        no_reason = "관련 근거를 찾지 못했습니다."
+        if not pr.results:
+            no_reason = "검색 결과가 없습니다."
+        logger.info(
+            "ask REFUSED query=%r rule=%s max_score=%.2f total=%.0f",
+            q[:80], decision.rule_triggered,
+            max(all_rerank_scores) if all_rerank_scores else 0.0, total_ms,
+        )
+        # telemetry
+        background_tasks.add_task(
+            record_search_event, q, user.id, pr.results, "hybrid",
+            pr.confidence_signal, pr.analyzer_confidence,
+        )
+        debug_obj = None
+        if debug:
+            debug_obj = AskDebug(
+                timing_ms={**pr.timing_ms, "evidence_ms": ev_ms, "ask_total_ms": total_ms},
+                search_notes=pr.notes,
+                confidence_signal=pr.confidence_signal,
+                evidence_candidate_count=len(evidence),
+                evidence_kept_count=len(evidence),
+                evidence_skip_reason=ev_skip,
+                synthesis_cache_hit=False,
+                hallucination_flags=[],
+                defense_layers=defense_log,
+            )
+        return AskResponse(
+            results=pr.results,
+            ai_answer=None,
+            citations=[],
+            synthesis_status="skipped",
+            synthesis_ms=0.0,
+            confidence=None,
+            refused=True,
+            no_results_reason=no_reason,
+            query=q,
+            total=len(pr.results),
+            completeness="insufficient",
+            covered_aspects=classifier_result.covered_aspects or None,
+            missing_aspects=classifier_result.missing_aspects or None,
+            debug=debug_obj,
+        )
+
+    # 4. Synthesis
     t_synth = time.perf_counter()
     sr = await synthesize(q, evidence, debug=debug)
     synth_ms = (time.perf_counter() - t_synth) * 1000
 
+    # 5. Grounding check (post-synthesis) + re-gate
+    grounding = grounding_check(q, sr.answer or "", evidence)
+    defense_log["grounding"] = {
+        "strong": grounding.strong_flags,
+        "weak": grounding.weak_flags,
+    }
+
+    # Completeness 결정: grounding 기반 (classifier 는 binary gate 만)
+    completeness: Literal["full", "partial", "insufficient"] = "full"
+    covered_aspects = classifier_result.covered_aspects or None
+    missing_aspects = classifier_result.missing_aspects or None
+    confirmed_items: list[ConfirmedItem] | None = None
+
+    if len(grounding.strong_flags) >= 2:
+        # Re-gate: multiple strong → refuse
+        completeness = "insufficient"
+        sr.answer = None
+        sr.refused = True
+        sr.confidence = None
+        defense_log["re_gate"] = "refuse(2+strong)"
+    elif grounding.strong_flags:
+        # Single strong → partial downgrade
+        completeness = "partial"
+        sr.confidence = "low"
+        defense_log["re_gate"] = "partial(1strong)"
+    elif grounding.weak_flags:
+        # Weak → confidence lower only
+        if sr.confidence == "high":
+            sr.confidence = "medium"
+        defense_log["re_gate"] = "conf_lower(weak)"
+
+    # Confidence cap from refusal gate (classifier 부재 시 conservative)
+    if decision.confidence_cap and sr.confidence:
+        conf_rank = {"low": 0, "medium": 1, "high": 2}
+        if conf_rank.get(sr.confidence, 0) > conf_rank.get(decision.confidence_cap, 2):
+            sr.confidence = decision.confidence_cap
+
+    # Partial 이면 max confidence = medium
+    if completeness == "partial" and sr.confidence == "high":
+        sr.confidence = "medium"
+
+    sr.hallucination_flags.extend(
+        [f"strong:{f}" for f in grounding.strong_flags]
+        + [f"weak:{f}" for f in grounding.weak_flags]
+    )
+
     total_ms = (time.perf_counter() - t_total) * 1000
 
-    # 4. 응답 구성
+    # 6. 응답 구성
     citations = _build_citations(evidence, sr.used_citations)
     no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr)
+    if completeness == "insufficient" and not no_reason:
+        no_reason = "답변 검증에서 복수 오류 감지"
 
     logger.info(
-        "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s refused=%s ev_ms=%.0f synth_ms=%.0f total=%.0f",
-        q[:80],
-        len(pr.results),
-        len(evidence),
-        len(citations),
-        sr.status,
-        sr.confidence or "-",
-        sr.refused,
-        ev_ms,
-        synth_ms,
-        total_ms,
+        "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s "
+        "refused=%s grounding_strong=%d grounding_weak=%d ev_ms=%.0f synth_ms=%.0f total=%.0f",
+        q[:80], len(pr.results), len(evidence), len(citations),
+        sr.status, sr.confidence or "-", completeness,
+        sr.refused, len(grounding.strong_flags), len(grounding.weak_flags),
+        ev_ms, synth_ms, total_ms,
     )
 
-    # 5. telemetry — 기존 record_search_event 재사용 (Phase 0.3 호환)
+    # 7. telemetry
     background_tasks.add_task(
-        record_search_event,
-        q,
-        user.id,
-        pr.results,
-        "hybrid",
-        pr.confidence_signal,
-        pr.analyzer_confidence,
+        record_search_event, q, user.id, pr.results, "hybrid",
+        pr.confidence_signal, pr.analyzer_confidence,
     )
 
-    debug_obj = (
-        _build_ask_debug(pr, evidence, ev_skip, sr, ev_ms, synth_ms, total_ms)
-        if debug
-        else None
-    )
+    debug_obj = None
+    if debug:
+        timing = dict(pr.timing_ms)
+        timing["evidence_ms"] = ev_ms
+        timing["synthesis_ms"] = synth_ms
+        timing["ask_total_ms"] = total_ms
+        debug_obj = AskDebug(
+            timing_ms=timing,
+            search_notes=pr.notes,
+            query_analysis=pr.query_analysis,
+            confidence_signal=pr.confidence_signal,
+            evidence_candidate_count=len(evidence),
+            evidence_kept_count=len(evidence),
+            evidence_skip_reason=ev_skip,
+            synthesis_cache_hit=sr.cache_hit,
+            synthesis_raw_preview=sr.raw_preview,
+            hallucination_flags=sr.hallucination_flags,
+            defense_layers=defense_log,
+        )
 
     return AskResponse(
         results=pr.results,
@@ -434,5 +591,9 @@ async def ask(
         no_results_reason=no_reason,
         query=q,
         total=len(pr.results),
+        completeness=completeness,
+        covered_aspects=covered_aspects,
+        missing_aspects=missing_aspects,
+        confirmed_items=confirmed_items,
         debug=debug_obj,
     )
diff --git a/app/core/config.py b/app/core/config.py
index 897e269..36c95fe 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -24,6 +24,8 @@ class AIConfig(BaseModel):
     embedding: AIModelConfig
     vision: AIModelConfig
     rerank: AIModelConfig
+    # Phase 3.5a: exaone classifier (optional — 없으면 score-only gate)
+    classifier: AIModelConfig | None = None
 
 
 class Settings(BaseModel):
@@ -79,6 +81,11 @@ def load_settings() -> Settings:
                 embedding=AIModelConfig(**ai_raw["models"]["embedding"]),
                 vision=AIModelConfig(**ai_raw["models"]["vision"]),
                 rerank=AIModelConfig(**ai_raw["models"]["rerank"]),
+                classifier=(
+                    AIModelConfig(**ai_raw["models"]["classifier"])
+                    if "classifier" in ai_raw.get("models", {})
+                    else None
+                ),
             )
 
         if "nas" in raw:
diff --git a/app/prompts/classifier.txt b/app/prompts/classifier.txt
new file mode 100644
index 0000000..4f1a94e
--- /dev/null
+++ b/app/prompts/classifier.txt
@@ -0,0 +1,33 @@
+You are an answerability judge. Given a query and evidence chunks, determine if the evidence can answer the query. Respond ONLY in JSON.
+
+## CALIBRATION (CRITICAL)
+- verdict=full: evidence is SUFFICIENT to answer the CORE of the query. Missing minor details does NOT make it insufficient.
+- verdict=partial: evidence covers SOME major aspects but CLEARLY MISSES others the user explicitly asked about.
+- verdict=insufficient: evidence has NO relevant information for the query, or is completely off-topic.
+
+Example: Query="제6장 주요 내용", Evidence covers 제6장 definition+scope → verdict=full (core is covered).
+Example: Query="제6장 처벌 조항", Evidence covers 제6장 definition but NOT 처벌 → verdict=partial.
+Example: Query="감귤 출하량", Evidence about 산업안전보건법 → verdict=insufficient.
+
+## Rules
+1. Your "verdict" must be based ONLY on whether the CONTENT semantically answers the query. Ignore retrieval scores for this field.
+2. "covered_aspects": query aspects that evidence covers. Korean labels for Korean queries.
+3. "missing_aspects": query aspects that evidence does NOT cover. Korean labels.
+4. Keep aspects concise (2-5 words each), non-overlapping.
+
+## Output Schema
+{
+  "verdict": "full" | "partial" | "insufficient",
+  "covered_aspects": ["aspect1"],
+  "missing_aspects": ["aspect2"],
+  "confidence": "high" | "medium" | "low"
+}
+
+## Query
+{query}
+
+## Evidence chunks:
+{chunks}
+
+## Retrieval scores (for reference only, NOT for verdict):
+[{scores}]
diff --git a/app/prompts/search_synthesis_partial.txt b/app/prompts/search_synthesis_partial.txt
new file mode 100644
index 0000000..635ac75
--- /dev/null
+++ b/app/prompts/search_synthesis_partial.txt
@@ -0,0 +1,34 @@
+You are a grounded answer synthesizer handling a PARTIAL answer case. Some aspects of the query CAN be answered, others CANNOT. Respond ONLY in JSON.
+
+## Task
+Answer ONLY the covered aspects. Do NOT attempt to answer missing aspects.
+
+## Output Schema
+{
+  "confirmed_items": [
+    {"aspect": "aspect label", "text": "1~2 sentence answer", "citations": [1, 2]}
+  ],
+  "confidence": "medium" | "low",
+  "refused": false
+}
+
+## Rules
+- Each confirmed_item: aspect label + 1~2 sentences + inline [n] citations
+- ONLY use facts present in evidence. No outside knowledge, no guessing.
+- Do NOT mention or address missing_aspects in your text.
+- Korean query → Korean answer / English → English
+- confidence: medium (2+ strong evidence matches) / low (1 or weak)
+- Max total text: 400 chars across all items
+- 모든 주장 문장 끝에 [n] 필수
+
+## Covered aspects (answer these):
+{covered_aspects}
+
+## Missing aspects (do NOT answer these):
+{missing_aspects}
+
+## Query
+{query}
+
+## Evidence
+{numbered_evidence}
diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py
new file mode 100644
index 0000000..1202d02
--- /dev/null
+++ b/app/services/search/classifier_service.py
@@ -0,0 +1,150 @@
+"""Answerability classifier (Phase 3.5a).
+
+exaone3.5:7.8b GPU Ollama 기반. MLX gate 밖 — evidence extraction 과 병렬 실행.
+
+P1 실측 결과: ternary (full/partial/insufficient) 불안정 → **binary (sufficient/insufficient)**.
+"full" vs "partial" 구분은 grounding_check 의 intent alignment 이 담당.
+
+Classifier verdict 는 "relevant evidence 가 있나" 의 binary 판단.
+covered_aspects / missing_aspects are kept for logging and are echoed back in the /ask response (the refusal gate does not use them).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import time
+from dataclasses import dataclass
+from typing import Literal
+
+from ai.client import AIClient, _load_prompt, parse_json_response
+from core.config import settings
+from core.utils import setup_logger
+
+logger = setup_logger("classifier")
+
+LLM_TIMEOUT_MS = 5000
+CIRCUIT_THRESHOLD = 5
+CIRCUIT_RECOVERY_SEC = 60
+
+_failure_count = 0
+_circuit_open_until: float | None = None
+
+
+@dataclass(slots=True)
+class ClassifierResult:
+    status: Literal["ok", "timeout", "error", "circuit_open", "skipped"]
+    verdict: Literal["sufficient", "insufficient"] | None
+    covered_aspects: list[str]
+    missing_aspects: list[str]
+    elapsed_ms: float
+
+
+try:
+    CLASSIFIER_PROMPT = _load_prompt("classifier.txt")
+except FileNotFoundError:
+    CLASSIFIER_PROMPT = ""
+    logger.warning("classifier.txt not found — classifier will always skip")
+
+
+def _build_input(
+    query: str,
+    top_chunks: list[dict],
+    rerank_scores: list[float],
+) -> str:
+    """Y+ input (content + scores with role separation)."""
+    chunk_block = "\n".join(
+        f"[{i+1}] title: {c.get('title','')}\n"
+        f"    section: {c.get('section','')}\n"
+        f"    snippet: {c.get('snippet','')}"
+        for i, c in enumerate(top_chunks[:3])
+    )
+    scores_str = ", ".join(f"{s:.2f}" for s in rerank_scores[:3])
+    return (
+        CLASSIFIER_PROMPT
+        .replace("{query}", query)
+        .replace("{chunks}", chunk_block)
+        .replace("{scores}", scores_str)
+    )
+
+
+async def classify(
+    query: str,
+    top_chunks: list[dict],
+    rerank_scores: list[float],
+) -> ClassifierResult:
+    """Always-on binary classifier. Parallel with evidence extraction.
+
+    Returns:
+        ClassifierResult with verdict=sufficient|insufficient.
+        verdict is None whenever status != "ok", and also when status == "ok" but the model's verdict string is unrecognized (the caller handles the fallback).
+    """
+    global _failure_count, _circuit_open_until
+    t_start = time.perf_counter()
+
+    # Circuit breaker
+    if _circuit_open_until and time.time() < _circuit_open_until:
+        return ClassifierResult("circuit_open", None, [], [], 0.0)
+
+    if not CLASSIFIER_PROMPT:
+        return ClassifierResult("skipped", None, [], [], 0.0)
+
+    if not hasattr(settings.ai, "classifier") or settings.ai.classifier is None:
+        return ClassifierResult("skipped", None, [], [], 0.0)
+
+    prompt = _build_input(query, top_chunks, rerank_scores)
+    client = AIClient()
+    try:
+        # ⚠ MLX gate 안 씀. Ollama(exaone) 는 concurrent OK.
+        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
+            raw = await client._request(settings.ai.classifier, prompt)
+        _failure_count = 0
+    except asyncio.TimeoutError:
+        _failure_count += 1
+        if _failure_count >= CIRCUIT_THRESHOLD:
+            _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC
+            logger.error(f"classifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s")
+        logger.warning("classifier timeout")
+        return ClassifierResult(
+            "timeout", None, [], [],
+            (time.perf_counter() - t_start) * 1000,
+        )
+    except Exception as e:
+        _failure_count += 1
+        if _failure_count >= CIRCUIT_THRESHOLD:
+            _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC
+            logger.error(f"classifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s")
+        logger.warning(f"classifier error: {e}")
+        return ClassifierResult(
+            "error", None, [], [],
+            (time.perf_counter() - t_start) * 1000,
+        )
+    finally:
+        await client.close()
+
+    elapsed_ms = (time.perf_counter() - t_start) * 1000
+    parsed = parse_json_response(raw)
+    if not isinstance(parsed, dict):
+        logger.warning("classifier parse failed raw=%r", (raw or "")[:200])
+        return ClassifierResult("error", None, [], [], elapsed_ms)
+
+    # ternary → binary 매핑
+    raw_verdict = parsed.get("verdict", "")
+    if raw_verdict == "insufficient":
+        verdict: Literal["sufficient", "insufficient"] | None = "insufficient"
+    elif raw_verdict in ("full", "partial", "sufficient"):
+        verdict = "sufficient"
+    else:
+        verdict = None
+
+    covered = parsed.get("covered_aspects") or []
+    missing = parsed.get("missing_aspects") or []
+    if not isinstance(covered, list):
+        covered = []
+    if not isinstance(missing, list):
+        missing = []
+
+    logger.info(
+        "classifier ok query=%r verdict=%s (raw=%s) covered=%d missing=%d elapsed_ms=%.0f",
+        query[:60], verdict, raw_verdict, len(covered), len(missing), elapsed_ms,
+    )
+    return ClassifierResult("ok", verdict, covered, missing, elapsed_ms)
diff --git a/app/services/search/grounding_check.py b/app/services/search/grounding_check.py
new file mode 100644
index 0000000..a03417d
--- /dev/null
+++ b/app/services/search/grounding_check.py
@@ -0,0 +1,131 @@
+"""Grounding check — post-synthesis 검증 (Phase 3.5a).
+
+Strong/weak flag 분리:
+- **Strong** (→ partial 강등 or refuse): fabricated_number, intent_misalignment(important)
+- **Weak** (→ confidence lower only): uncited_claim, low_overlap, intent_misalignment(generic)
+
+Re-gate 로직 (Phase 3.5a 9라운드 토론 결과):
+- strong 1개 → partial 강등
+- strong 2개 이상 → refuse
+- weak → confidence lowered from "high" to "medium" only (no refusal, no partial downgrade)
+
+Intent alignment (rule-based):
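+- checks whether the query's content tokens (regex-extracted, not POS-tagged nouns) appear in the answer
+- a missing non-generic keyword such as "처벌" raises a strong flag
+- generic terms such as "주요", "관련" never raise a strong flag; when only generic terms are missing and they exceed half of the query tokens, a weak flag is raised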
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from core.utils import setup_logger
+
+if TYPE_CHECKING:
+    from .evidence_service import EvidenceItem
+
+logger = setup_logger("grounding")
+
+# "주요", "관련" 등 intent alignment 에서 제외할 generic 단어
+GENERIC_TERMS = frozenset({
+    "주요", "관련", "내용", "정의", "기준", "방법", "설명", "개요",
+    "대한", "위한", "대해", "무엇", "어떤", "어떻게", "있는",
+    "하는", "되는", "이런", "그런", "이것", "그것",
+})
+
+
+@dataclass(slots=True)
+class GroundingResult:
+    strong_flags: list[str]
+    weak_flags: list[str]
+
+
+def _extract_number_literals(text: str) -> set[str]:
+    """숫자 + 단위 추출 + normalize."""
+    raw = set(re.findall(r'\d[\d,.]*\s*[명인개%년월일조항호세]\w{0,2}', text))
+    normalized = set()
+    for r in raw:
+        normalized.add(r.strip())
+        num_only = re.match(r'[\d,.]+', r)
+        if num_only:
+            normalized.add(num_only.group().replace(',', ''))
+    # 단독 숫자도 추출
+    for d in re.findall(r'\b\d+\b', text):
+        normalized.add(d)
+    return normalized
+
+
+def _extract_content_tokens(text: str) -> set[str]:
+    """Runs of 2+ Hangul syllables plus English words of 3+ letters (plain regex, no POS tagging)."""
+    return set(re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}', text))
+
+
+def check(
+    query: str,
+    answer: str,
+    evidence: list[EvidenceItem],
+) -> GroundingResult:
+    """답변 vs evidence grounding 검증 + query intent alignment."""
+    strong: list[str] = []
+    weak: list[str] = []
+
+    if not answer or not evidence:
+        return GroundingResult([], [])
+
+    evidence_text = " ".join(e.span_text for e in evidence)
+
+    # ── Strong 1: fabricated number ──
+    answer_nums = _extract_number_literals(answer)
+    evidence_nums = _extract_number_literals(evidence_text)
+    for num in answer_nums:
+        digits_only = re.sub(r'[^\d]', '', num)
+        if digits_only and not any(
+            digits_only in re.sub(r'[^\d]', '', en) for en in evidence_nums
+        ):
+            strong.append(f"fabricated_number:{num}")
+
+    # ── Strong/Weak 2: query-answer intent alignment ──
+    query_content = _extract_content_tokens(query)
+    answer_content = _extract_content_tokens(answer)
+    if query_content:
+        missing_terms = query_content - answer_content
+        important_missing = [
+            t for t in missing_terms
+            if t not in GENERIC_TERMS and len(t) >= 2
+        ]
+        if important_missing:
+            strong.append(
+                f"intent_misalignment:{','.join(important_missing[:3])}"
+            )
+        elif len(missing_terms) > len(query_content) * 0.5:
+            weak.append(
+                f"intent_misalignment_generic:"
+                f"missing({','.join(list(missing_terms)[:5])})"
+            )
+
+    # ── Weak 1: uncited claim ──
+    sentences = re.split(r'(?<=[.!?。])\s+', answer)
+    for s in sentences:
+        if len(s.strip()) > 20 and not re.search(r'\[\d+\]', s):
+            weak.append(f"uncited_claim:{s[:40]}")
+
+    # ── Weak 2: token overlap ──
+    answer_tokens = _extract_content_tokens(answer)
+    evidence_tokens = _extract_content_tokens(evidence_text)
+    if answer_tokens:
+        overlap = len(answer_tokens & evidence_tokens) / len(answer_tokens)
+        if overlap < 0.4:
+            weak.append(f"low_overlap:{overlap:.2f}")
+
+    if strong or weak:
+        logger.info(
+            "grounding query=%r strong=%d weak=%d flags=%s",
+            query[:60],
+            len(strong),
+            len(weak),
+            ",".join(strong[:3] + weak[:3]),
+        )
+
+    return GroundingResult(strong, weak)
diff --git a/app/services/search/refusal_gate.py b/app/services/search/refusal_gate.py
new file mode 100644
index 0000000..60eff67
--- /dev/null
+++ b/app/services/search/refusal_gate.py
@@ -0,0 +1,105 @@
+"""Refusal gate — multi-signal fusion (Phase 3.5a).
+
+Score gate (deterministic) + classifier verdict (semantic, binary) 를 독립 평가 후 합성.
+Classifier 부재 시 3-tier conservative fallback.
+
+P1 실측 결과: exaone ternary 불안정 → binary (sufficient/insufficient) 로 축소.
+"full" vs "partial" 구분은 grounding check (intent alignment) 가 담당.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal
+
+from core.utils import setup_logger
+
+if TYPE_CHECKING:
+    from .classifier_service import ClassifierResult
+
+logger = setup_logger("refusal_gate")
+
+# Placeholder thresholds — Phase 3.5b 에서 실측 기반 tuning
+# The score gate is an AND condition (max AND top-3 sum must both be below their thresholds), which limits false refusals
+SCORE_MAX_REFUSE = 0.25
+SCORE_AGG_REFUSE = 0.70
+
+# Conservative fallback tiers (classifier 부재 시)
+CONSERVATIVE_WEAK = 0.35
+CONSERVATIVE_MID = 0.55
+
+
+@dataclass(slots=True)
+class RefusalDecision:
+    refused: bool
+    confidence_cap: Literal["high", "medium", "low"] | None  # None = no cap
+    rule_triggered: str | None  # 디버깅: 어느 signal 이 결정에 기여?
+
+
+def decide(
+    rerank_scores: list[float],
+    classifier: ClassifierResult | None,
+) -> RefusalDecision:
+    """Multi-signal fusion. Binary classifier verdict 기반.
+
+    Returns:
+        RefusalDecision. refused=True 이면 synthesis skip.
+        confidence_cap 은 synthesis 결과의 confidence 에 upper bound 적용.
+    """
+    max_score = max(rerank_scores) if rerank_scores else 0.0
+    agg_top3 = sum(sorted(rerank_scores, reverse=True)[:3])
+
+    score_gate_fails = (
+        max_score < SCORE_MAX_REFUSE and agg_top3 < SCORE_AGG_REFUSE
+    )
+
+    # ── Classifier 사용 가능 (정상 경로) ──
+    if classifier and classifier.verdict is not None:
+        if classifier.verdict == "insufficient":
+            # Evidence quality override (agreed in discussion round 8): even when the
+            # classifier says insufficient, sufficiently good evidence may override it.
+            # NOTE: not implemented here — this branch always refuses; TODO confirm the caller handles the override.
+            logger.info(
+                "refusal gate: classifier=insufficient max=%.2f agg=%.2f",
+                max_score, agg_top3,
+            )
+            return RefusalDecision(
+                refused=True,
+                confidence_cap=None,
+                rule_triggered="classifier_insufficient",
+            )
+        if score_gate_fails:
+            logger.info(
+                "refusal gate: score_low max=%.2f agg=%.2f classifier=%s",
+                max_score, agg_top3, classifier.verdict,
+            )
+            return RefusalDecision(
+                refused=True,
+                confidence_cap=None,
+                rule_triggered="score_low",
+            )
+        # Classifier says sufficient → proceed
+        return RefusalDecision(
+            refused=False,
+            confidence_cap=None,
+            rule_triggered=None,
+        )
+
+    # ── Classifier 부재 → 3-tier conservative ──
+    if max_score < CONSERVATIVE_WEAK:
+        return RefusalDecision(
+            refused=True,
+            confidence_cap=None,
+            rule_triggered="conservative_refuse(no_classifier)",
+        )
+    if max_score < CONSERVATIVE_MID:
+        return RefusalDecision(
+            refused=False,
+            confidence_cap="low",
+            rule_triggered="conservative_low(no_classifier)",
+        )
+    return RefusalDecision(
+        refused=False,
+        confidence_cap="medium",
+        rule_triggered="conservative_medium(no_classifier)",
+    )
diff --git a/app/services/search/sentence_splitter.py b/app/services/search/sentence_splitter.py
new file mode 100644
index 0000000..b171e2c
--- /dev/null
+++ b/app/services/search/sentence_splitter.py
@@ -0,0 +1,33 @@
+"""문장 분할 (Phase 3.5a — regex 기반).
+
+Phase 3.5b 에서 KSS 라이브러리 기반으로 업그레이드 예정.
+"""
+
+import re
+
+MIN_SENTENCE_CHARS = 15
+
+
+def split_sentences(text: str) -> list[str]:
+    """Split mixed Korean/English text into sentence-like fragments.
+
+    Splits on whitespace that follows ``.``, ``!``, ``?`` or ``。`` (which
+    already covers Korean endings such as 다. / 함. / 음. / 됨.) and on runs of
+    two or more newlines. Fragments are stripped and empty ones dropped; one
+    shorter than MIN_SENTENCE_CHARS is appended to the previous. Blank -> [].
+    """
+    # Pass 1: terminal punctuation + whitespace, or two or more newlines.
+    raw = re.split(r'(?<=[.!?。])\s+|\n{2,}', text)
+
+    # Pass 2: fold short fragments into their predecessor. The first kept
+    # fragment has no predecessor, so it is kept even when short.
+    merged: list[str] = []
+    for part in raw:
+        part = part.strip()
+        if merged and part and len(part) < MIN_SENTENCE_CHARS:
+            merged[-1] = f"{merged[-1]} {part}"
+        elif part:
+            merged.append(part)
+
+    # Non-blank text always yields at least one fragment: no fallback needed.
+    return merged
diff --git a/config.yaml b/config.yaml
index 7b5d589..9e64b24 100644
--- a/config.yaml
+++ b/config.yaml
@@ -35,6 +35,12 @@ ai:
     rerank:
       endpoint: "http://ollama:11434/api/rerank"
       model: "bge-reranker-v2-m3"
+    # Phase 3.5a: exaone answerability classifier (GPU Ollama, concurrent OK)
+    classifier:
+      endpoint: "http://ollama:11434/v1/chat/completions"
+      model: "exaone3.5:7.8b-instruct-q8_0"
+      max_tokens: 512
+      timeout: 10
 
 nas:
   mount_path: "/documents"
diff --git a/frontend/src/lib/components/ask/AskAnswer.svelte b/frontend/src/lib/components/ask/AskAnswer.svelte
index 82d7dcd..e94eed6 100644
--- a/frontend/src/lib/components/ask/AskAnswer.svelte
+++ b/frontend/src/lib/components/ask/AskAnswer.svelte
@@ -63,10 +63,14 @@
   };
 
   let tokens = $derived(data?.ai_answer ? splitAnswer(data.ai_answer) : []);
-  let showAnswer = $derived(
-    !!data && !!data.ai_answer && data.synthesis_status === 'completed' && !data.refused,
+  let showFullAnswer = $derived(
+    !!data && !!data.ai_answer && data.completeness === 'full'
+    && data.synthesis_status === 'completed' && !data.refused,
   );
-  let showWarning = $derived(!!data && !showAnswer);
+  let showPartial = $derived(
+    !!data && data.completeness === 'partial' && !data.refused,
+  );
+  let showWarning = $derived(!!data && !showFullAnswer && !showPartial);
 </script>
 
 <section class="bg-surface border border-default rounded-card p-5">
@@ -107,7 +111,7 @@
         근거 기반 답변 생성 중… 약 15초 소요
       </p>
     </div>
-  {:else if showAnswer && data}
+  {:else if showFullAnswer && data}
     <div class="text-sm leading-7 text-text">
       {#each tokens as tok}
         {#if tok.type === 'cite'}
@@ -124,6 +128,67 @@
         {/if}
       {/each}
     </div>
+  {:else if showPartial && data}
+    <!-- Phase 3.5a partial answer: badge, then ai_answer tokens (else confirmed_items list), missing_aspects, and a link to /documents search -->
+    <div>
+      <Badge tone="warning" size="sm">일부 답변</Badge>
+
+      {#if data.ai_answer}
+        <div class="mt-3 text-sm leading-7 text-text">
+          {#each tokens as tok}
+            {#if tok.type === 'cite'}
+              <button
+                type="button"
+                class="inline-block align-baseline text-accent font-semibold hover:underline rounded px-0.5"
+                onclick={() => onCitationClick(tok.n)}
+              >{tok.raw}</button>
+            {:else}
+              <span>{tok.value}</span>
+            {/if}
+          {/each}
+        </div>
+      {:else if data.confirmed_items?.length}
+        <div class="mt-3">
+          <h4 class="text-xs font-semibold text-dim uppercase tracking-wider">✓ 답변 가능</h4>
+          <ul class="mt-2 space-y-2">
+            {#each data.confirmed_items as item}
+              <li class="text-sm text-text">
+                <strong class="text-accent">{item.aspect}:</strong>
+                <span>{item.text}</span>
+                {#each item.citations as n}
+                  <button
+                    type="button"
+                    class="text-accent font-semibold hover:underline px-0.5"
+                    onclick={() => onCitationClick(n)}
+                  >[{n}]</button>
+                {/each}
+              </li>
+            {/each}
+          </ul>
+        </div>
+      {/if}
+
+      {#if data.missing_aspects?.length}
+        <div class="mt-4 border-t border-default pt-3">
+          <h4 class="text-xs font-semibold text-dim uppercase tracking-wider">✗ 답변 불가</h4>
+          <ul class="mt-2 space-y-1">
+            {#each data.missing_aspects as aspect}
+              <li class="text-sm text-dim">{aspect} <span class="text-[10px]">(근거 없음)</span></li>
+            {/each}
+          </ul>
+        </div>
+      {/if}
+
+      <div class="mt-4">
+        <Button
+          variant="secondary"
+          size="sm"
+          href={`/documents?q=${encodeURIComponent(data.query)}`}
+        >
+          검색 결과 확인하기
+        </Button>
+      </div>
+    </div>
   {:else if showWarning && data}
     <EmptyState
       icon={AlertTriangle}
diff --git a/frontend/src/lib/types/ask.ts b/frontend/src/lib/types/ask.ts
index 4302ef1..4a6cd89 100644
--- a/frontend/src/lib/types/ask.ts
+++ b/frontend/src/lib/types/ask.ts
@@ -50,6 +50,14 @@ export interface SearchResult {
   rerank_score: number | null;
 }
 
+export type Completeness = 'full' | 'partial' | 'insufficient';
+
+export interface ConfirmedItem {
+  aspect: string;
+  text: string;
+  citations: number[];
+}
+
 export interface AskResponse {
   results: SearchResult[];
   ai_answer: string | null;
@@ -61,4 +69,9 @@ export interface AskResponse {
   no_results_reason: string | null;
   query: string;
   total: number;
+  /** Phase 3.5a */
+  completeness: Completeness;
+  covered_aspects: string[] | null;
+  missing_aspects: string[] | null;
+  confirmed_items: ConfirmedItem[] | null;
 }
diff --git a/migrations/102_ask_events.sql b/migrations/102_ask_events.sql
new file mode 100644
index 0000000..4b26c65
--- /dev/null
+++ b/migrations/102_ask_events.sql
@@ -0,0 +1,26 @@
+-- Phase 3.5a: observability table for /ask calls
+-- Purpose: refusal-rate measurement, 3-way metric split (full/partial/insufficient), defense-layer debugging
+
+CREATE TABLE IF NOT EXISTS ask_events (
+    id BIGSERIAL PRIMARY KEY,
+    query TEXT NOT NULL,
+    user_id BIGINT REFERENCES users(id),
+    completeness TEXT,               -- full / partial / insufficient
+    synthesis_status TEXT,
+    confidence TEXT,
+    refused BOOLEAN DEFAULT false,
+    classifier_verdict TEXT,          -- sufficient / insufficient / null (skipped)
+    max_rerank_score REAL,
+    aggregate_score REAL,
+    hallucination_flags JSONB DEFAULT '[]',
+    evidence_count INT,
+    citation_count INT,
+    defense_layers JSONB,            -- per-layer flag snapshot (score_gate, classifier, grounding)
+    total_ms INT,
+    created_at TIMESTAMPTZ DEFAULT now()
+);
+
+CREATE INDEX IF NOT EXISTS idx_ask_events_created ON ask_events(created_at);
+CREATE INDEX IF NOT EXISTS idx_ask_events_completeness ON ask_events(completeness);
+
+INSERT INTO schema_migrations (version) VALUES (102);  -- NOTE(review): unlike the IF NOT EXISTS statements above, this INSERT is unguarded; confirm the migration runner never replays this file
diff --git a/tests/phase3_5_eval/queries.yaml b/tests/phase3_5_eval/queries.yaml
new file mode 100644
index 0000000..1d6dcd1
--- /dev/null
+++ b/tests/phase3_5_eval/queries.yaml
@@ -0,0 +1,58 @@
+# Phase 3.5a Smoke Test Evaluation Set (10 queries)
+# 목적: 구조 검증 (smoke test), 정밀 튜닝 아님
+# Phase 3.5b 에서 30+ 쿼리로 확장 예정
+
+queries:
+  # 정상 (4) — full answer expected
+  - q: "산업안전보건법 제6장 주요 내용"
+    expected_completeness: full
+    expected_refuse: false
+    notes: "prewarm #1. evidence 충분."
+
+  - q: "기계 사고 관련 법령"
+    expected_completeness: full
+    expected_refuse: false
+    notes: "prewarm #2. 법령 도메인."
+
+  - q: "유해화학물질을 다루는 회사가 지켜야 할 안전 의무"
+    expected_completeness: full
+    expected_refuse: false
+    notes: "prewarm #5. 긴 자연어 쿼리."
+
+  - q: "위험성평가 절차"
+    expected_completeness: full
+    expected_refuse: false
+    notes: "prewarm #12. 짧은 키워드 쿼리."
+
+  # no-result (2) — insufficient expected
+  - q: "xyzzy_nonexistent_query_12345"
+    expected_completeness: insufficient
+    expected_refuse: true
+    notes: "Phase 3 에서 이미 검증됨."
+
+  - q: "제주도 감귤 출하량 통계"
+    expected_completeness: insufficient
+    expected_refuse: true
+    notes: "corpus 에 확실히 없는 도메인."
+
+  # tricky mismatch (2) — classifier/grounding 핵심 케이스
+  - q: "산업안전보건법 제6장 처벌 조항"
+    expected_completeness: partial
+    expected_refuse: false
+    notes: "제6장 내용은 있지만 처벌(제10장 벌칙)은 없음. intent_misalignment 이 잡아야 함."
+
+  - q: "화학물질관리법과 산업안전보건법의 차이"
+    expected_completeness: partial
+    expected_refuse: false
+    notes: "복합 쿼리. 하나만 있을 수 있음."
+
+  # cross-domain (2)
+  - q: "Python async best practice"
+    expected_completeness: insufficient
+    expected_refuse: true
+    notes: "corpus 에 영어 프로그래밍 문서 적음."
+
+  - q: "EU AI Act"
+    expected_completeness: full
+    expected_refuse: false
+    notes: "news 도메인. prewarm #11."