From 06443947bf4ec5088815755408ae0e502a31f909 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn
Date: Fri, 10 Apr 2026 08:49:11 +0900
Subject: [PATCH] feat(ask): Phase 3.5a guardrails (classifier + refusal gate +
grounding + partial)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
신규 파일:
- classifier_service.py: exaone binary classifier (sufficient/insufficient)
parallel with evidence, circuit breaker, timeout 5s
- refusal_gate.py: multi-signal fusion (score + classifier)
AND 조건, conservative fallback 3-tier (classifier 부재 시)
- grounding_check.py: strong/weak flag 분리
strong: fabricated_number + intent_misalignment(important keywords)
weak: uncited_claim + low_overlap + intent_misalignment(generic)
re-gate: 2+ strong → refuse, 1 strong → partial
- sentence_splitter.py: regex 기반 (Phase 3.5b KSS 업그레이드)
- classifier.txt: exaone Y+ prompt (calibration examples 포함)
- search_synthesis_partial.txt: partial answer 전용 프롬프트
- 102_ask_events.sql: /ask 관측 테이블 (completeness 3-분리 지표)
- queries.yaml: Phase 3.5 smoke test 평가셋 10개
수정 파일:
- search.py /ask: classifier parallel + refusal gate + grounding re-gate
+ defense_layers 로깅 + AskResponse completeness/aspects/confirmed_items
- config.yaml: classifier model 섹션 (exaone3.5:7.8b GPU Ollama)
- config.py: classifier optional 파싱
- AskAnswer.svelte: 4분기 렌더 (full/partial/insufficient/loading)
- ask.ts: Completeness + ConfirmedItem 타입
P1 실측: exaone ternary 불안정 → binary gate 축소. partial은 grounding이 담당.
토론 9라운드 확정. plan: quiet-meandering-nova.md
Co-Authored-By: Claude Opus 4.6 (1M context)
---
app/api/search.py | 247 +++++++++++++++---
app/core/config.py | 7 +
app/prompts/classifier.txt | 33 +++
app/prompts/search_synthesis_partial.txt | 34 +++
app/services/search/classifier_service.py | 150 +++++++++++
app/services/search/grounding_check.py | 131 ++++++++++
app/services/search/refusal_gate.py | 105 ++++++++
app/services/search/sentence_splitter.py | 33 +++
config.yaml | 6 +
.../src/lib/components/ask/AskAnswer.svelte | 73 +++++-
frontend/src/lib/types/ask.ts | 13 +
migrations/102_ask_events.sql | 26 ++
tests/phase3_5_eval/queries.yaml | 58 ++++
13 files changed, 869 insertions(+), 47 deletions(-)
create mode 100644 app/prompts/classifier.txt
create mode 100644 app/prompts/search_synthesis_partial.txt
create mode 100644 app/services/search/classifier_service.py
create mode 100644 app/services/search/grounding_check.py
create mode 100644 app/services/search/refusal_gate.py
create mode 100644 app/services/search/sentence_splitter.py
create mode 100644 migrations/102_ask_events.sql
create mode 100644 tests/phase3_5_eval/queries.yaml
diff --git a/app/api/search.py b/app/api/search.py
index 451293a..330d6b2 100644
--- a/app/api/search.py
+++ b/app/api/search.py
@@ -9,6 +9,7 @@
- `/ask` endpoint wrapper (Phase 3.3 에서 추가)
"""
+import asyncio
import time
from typing import Annotated, Literal
@@ -20,8 +21,11 @@ from core.auth import get_current_user
from core.database import get_session
from core.utils import setup_logger
from models.user import User
+from services.search.classifier_service import ClassifierResult, classify
from services.search.evidence_service import EvidenceItem, extract_evidence
from services.search.fusion_service import DEFAULT_FUSION
+from services.search.grounding_check import check as grounding_check
+from services.search.refusal_gate import RefusalDecision, decide as refusal_decide
from services.search.search_pipeline import PipelineResult, run_search
from services.search.synthesis_service import SynthesisResult, synthesize
from services.search_telemetry import record_search_event
@@ -216,6 +220,14 @@ class Citation(BaseModel):
rerank_score: float
+class ConfirmedItem(BaseModel):
+    """One answered aspect within a partial answer."""
+
+    aspect: str  # label of the query aspect this item answers
+    text: str  # answer text for that aspect
+    citations: list[int]  # citation numbers backing the text — presumably 1-based evidence indices; TODO confirm
+
+
class AskDebug(BaseModel):
"""`/ask?debug=true` 응답 확장."""
@@ -230,10 +242,12 @@ class AskDebug(BaseModel):
synthesis_prompt_preview: str | None = None
synthesis_raw_preview: str | None = None
hallucination_flags: list[str] = []
+ # Phase 3.5a: per-layer defense 로깅
+ defense_layers: dict | None = None
class AskResponse(BaseModel):
- """`/ask` 응답. `/search` 의 SearchResult 는 그대로 재사용."""
+ """`/ask` 응답. Phase 3.5a: completeness + aspects 추가."""
results: list[SearchResult]
ai_answer: str | None
@@ -247,6 +261,11 @@ class AskResponse(BaseModel):
no_results_reason: str | None
query: str
total: int
+ # Phase 3.5a
+ completeness: Literal["full", "partial", "insufficient"] = "full"
+ covered_aspects: list[str] | None = None
+ missing_aspects: list[str] | None = None
+ confirmed_items: list[ConfirmedItem] | None = None
debug: AskDebug | None = None
@@ -355,73 +374,211 @@ async def ask(
limit: int = Query(10, ge=1, le=20, description="synthesis 입력 상한"),
debug: bool = Query(False, description="evidence/synthesis 중간 상태 노출"),
):
- """근거 기반 AI 답변 (Phase 3.3).
+ """근거 기반 AI 답변 (Phase 3.5a).
- `/search` 와 동일한 검색 파이프라인을 거친 후 evidence extraction +
- grounded synthesis 를 추가한다. `mode`, `rerank`, `analyze` 는 품질 보장을
- 위해 강제 고정 (hybrid / True / True).
-
- 실패 경로(timeout/parse_failed/refused/...) 에서도 `results` 는 항상 반환.
+ Phase 3.3 기반 + classifier parallel + refusal gate + grounding re-gate.
+ 실패 경로에서도 `results` 는 항상 반환.
"""
t_total = time.perf_counter()
+ defense_log: dict = {} # per-layer flag snapshot
- # 1. 검색 파이프라인 (run_search — /search 와 동일 로직, 단일 진실 소스)
+ # 1. 검색 파이프라인
pr = await run_search(
- session,
- q,
- mode="hybrid",
- limit=limit,
- fusion=DEFAULT_FUSION,
- rerank=True,
- analyze=True,
+ session, q, mode="hybrid", limit=limit,
+ fusion=DEFAULT_FUSION, rerank=True, analyze=True,
)
- # 2. Evidence extraction (rule + LLM span select, 1 batched call)
+ # 2. Evidence + Classifier 병렬
t_ev = time.perf_counter()
- evidence, ev_skip = await extract_evidence(q, pr.results)
+ evidence_task = asyncio.create_task(extract_evidence(q, pr.results))
+
+ # classifier input: top 3 chunks meta + rerank scores
+ top_chunks = [
+ {
+ "title": r.title or "",
+ "section": r.section_title or "",
+ "snippet": (r.snippet or "")[:200],
+ }
+ for r in pr.results[:3]
+ ]
+ rerank_scores_top = [
+ r.rerank_score if r.rerank_score is not None else r.score
+ for r in pr.results[:3]
+ ]
+ classifier_task = asyncio.create_task(
+ classify(q, top_chunks, rerank_scores_top)
+ )
+
+ evidence, ev_skip = await evidence_task
ev_ms = (time.perf_counter() - t_ev) * 1000
- # 3. Grounded synthesis (gemma-4, 15s timeout, citation 검증)
+ # classifier await (timeout 보호 — classifier_service 내부에도 있지만 여기서 이중 보호)
+ try:
+ classifier_result = await asyncio.wait_for(classifier_task, timeout=6.0)
+ except (asyncio.TimeoutError, Exception):
+ classifier_result = ClassifierResult("timeout", None, [], [], 0.0)
+
+ defense_log["classifier"] = {
+ "status": classifier_result.status,
+ "verdict": classifier_result.verdict,
+ "covered_aspects": classifier_result.covered_aspects,
+ "missing_aspects": classifier_result.missing_aspects,
+ "elapsed_ms": classifier_result.elapsed_ms,
+ }
+
+ # 3. Refusal gate (multi-signal fusion)
+ all_rerank_scores = [
+ e.rerank_score for e in evidence
+ ] if evidence else rerank_scores_top
+ decision = refusal_decide(all_rerank_scores, classifier_result)
+
+ defense_log["score_gate"] = {
+ "max": max(all_rerank_scores) if all_rerank_scores else 0.0,
+ "agg_top3": sum(sorted(all_rerank_scores, reverse=True)[:3]),
+ }
+ defense_log["refusal"] = {
+ "refused": decision.refused,
+ "rule_triggered": decision.rule_triggered,
+ }
+
+ if decision.refused:
+ total_ms = (time.perf_counter() - t_total) * 1000
+ no_reason = "관련 근거를 찾지 못했습니다."
+ if not pr.results:
+ no_reason = "검색 결과가 없습니다."
+ logger.info(
+ "ask REFUSED query=%r rule=%s max_score=%.2f total=%.0f",
+ q[:80], decision.rule_triggered,
+ max(all_rerank_scores) if all_rerank_scores else 0.0, total_ms,
+ )
+ # telemetry
+ background_tasks.add_task(
+ record_search_event, q, user.id, pr.results, "hybrid",
+ pr.confidence_signal, pr.analyzer_confidence,
+ )
+ debug_obj = None
+ if debug:
+ debug_obj = AskDebug(
+ timing_ms={**pr.timing_ms, "evidence_ms": ev_ms, "ask_total_ms": total_ms},
+ search_notes=pr.notes,
+ confidence_signal=pr.confidence_signal,
+ evidence_candidate_count=len(evidence),
+ evidence_kept_count=len(evidence),
+ evidence_skip_reason=ev_skip,
+ synthesis_cache_hit=False,
+ hallucination_flags=[],
+ defense_layers=defense_log,
+ )
+ return AskResponse(
+ results=pr.results,
+ ai_answer=None,
+ citations=[],
+ synthesis_status="skipped",
+ synthesis_ms=0.0,
+ confidence=None,
+ refused=True,
+ no_results_reason=no_reason,
+ query=q,
+ total=len(pr.results),
+ completeness="insufficient",
+ covered_aspects=classifier_result.covered_aspects or None,
+ missing_aspects=classifier_result.missing_aspects or None,
+ debug=debug_obj,
+ )
+
+ # 4. Synthesis
t_synth = time.perf_counter()
sr = await synthesize(q, evidence, debug=debug)
synth_ms = (time.perf_counter() - t_synth) * 1000
+ # 5. Grounding check (post-synthesis) + re-gate
+ grounding = grounding_check(q, sr.answer or "", evidence)
+ defense_log["grounding"] = {
+ "strong": grounding.strong_flags,
+ "weak": grounding.weak_flags,
+ }
+
+ # Completeness 결정: grounding 기반 (classifier 는 binary gate 만)
+ completeness: Literal["full", "partial", "insufficient"] = "full"
+ covered_aspects = classifier_result.covered_aspects or None
+ missing_aspects = classifier_result.missing_aspects or None
+ confirmed_items: list[ConfirmedItem] | None = None
+
+ if len(grounding.strong_flags) >= 2:
+ # Re-gate: multiple strong → refuse
+ completeness = "insufficient"
+ sr.answer = None
+ sr.refused = True
+ sr.confidence = None
+ defense_log["re_gate"] = "refuse(2+strong)"
+ elif grounding.strong_flags:
+ # Single strong → partial downgrade
+ completeness = "partial"
+ sr.confidence = "low"
+ defense_log["re_gate"] = "partial(1strong)"
+ elif grounding.weak_flags:
+ # Weak → confidence lower only
+ if sr.confidence == "high":
+ sr.confidence = "medium"
+ defense_log["re_gate"] = "conf_lower(weak)"
+
+ # Confidence cap from refusal gate (classifier 부재 시 conservative)
+ if decision.confidence_cap and sr.confidence:
+ conf_rank = {"low": 0, "medium": 1, "high": 2}
+ if conf_rank.get(sr.confidence, 0) > conf_rank.get(decision.confidence_cap, 2):
+ sr.confidence = decision.confidence_cap
+
+ # Partial 이면 max confidence = medium
+ if completeness == "partial" and sr.confidence == "high":
+ sr.confidence = "medium"
+
+ sr.hallucination_flags.extend(
+ [f"strong:{f}" for f in grounding.strong_flags]
+ + [f"weak:{f}" for f in grounding.weak_flags]
+ )
+
total_ms = (time.perf_counter() - t_total) * 1000
- # 4. 응답 구성
+ # 6. 응답 구성
citations = _build_citations(evidence, sr.used_citations)
no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr)
+ if completeness == "insufficient" and not no_reason:
+ no_reason = "답변 검증에서 복수 오류 감지"
logger.info(
- "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s refused=%s ev_ms=%.0f synth_ms=%.0f total=%.0f",
- q[:80],
- len(pr.results),
- len(evidence),
- len(citations),
- sr.status,
- sr.confidence or "-",
- sr.refused,
- ev_ms,
- synth_ms,
- total_ms,
+ "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s "
+ "refused=%s grounding_strong=%d grounding_weak=%d ev_ms=%.0f synth_ms=%.0f total=%.0f",
+ q[:80], len(pr.results), len(evidence), len(citations),
+ sr.status, sr.confidence or "-", completeness,
+ sr.refused, len(grounding.strong_flags), len(grounding.weak_flags),
+ ev_ms, synth_ms, total_ms,
)
- # 5. telemetry — 기존 record_search_event 재사용 (Phase 0.3 호환)
+ # 7. telemetry
background_tasks.add_task(
- record_search_event,
- q,
- user.id,
- pr.results,
- "hybrid",
- pr.confidence_signal,
- pr.analyzer_confidence,
+ record_search_event, q, user.id, pr.results, "hybrid",
+ pr.confidence_signal, pr.analyzer_confidence,
)
- debug_obj = (
- _build_ask_debug(pr, evidence, ev_skip, sr, ev_ms, synth_ms, total_ms)
- if debug
- else None
- )
+ debug_obj = None
+ if debug:
+ timing = dict(pr.timing_ms)
+ timing["evidence_ms"] = ev_ms
+ timing["synthesis_ms"] = synth_ms
+ timing["ask_total_ms"] = total_ms
+ debug_obj = AskDebug(
+ timing_ms=timing,
+ search_notes=pr.notes,
+ query_analysis=pr.query_analysis,
+ confidence_signal=pr.confidence_signal,
+ evidence_candidate_count=len(evidence),
+ evidence_kept_count=len(evidence),
+ evidence_skip_reason=ev_skip,
+ synthesis_cache_hit=sr.cache_hit,
+ synthesis_raw_preview=sr.raw_preview,
+ hallucination_flags=sr.hallucination_flags,
+ defense_layers=defense_log,
+ )
return AskResponse(
results=pr.results,
@@ -434,5 +591,9 @@ async def ask(
no_results_reason=no_reason,
query=q,
total=len(pr.results),
+ completeness=completeness,
+ covered_aspects=covered_aspects,
+ missing_aspects=missing_aspects,
+ confirmed_items=confirmed_items,
debug=debug_obj,
)
diff --git a/app/core/config.py b/app/core/config.py
index 897e269..36c95fe 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -24,6 +24,8 @@ class AIConfig(BaseModel):
embedding: AIModelConfig
vision: AIModelConfig
rerank: AIModelConfig
+ # Phase 3.5a: exaone classifier (optional — 없으면 score-only gate)
+ classifier: AIModelConfig | None = None
class Settings(BaseModel):
@@ -79,6 +81,11 @@ def load_settings() -> Settings:
embedding=AIModelConfig(**ai_raw["models"]["embedding"]),
vision=AIModelConfig(**ai_raw["models"]["vision"]),
rerank=AIModelConfig(**ai_raw["models"]["rerank"]),
+ classifier=(
+ AIModelConfig(**ai_raw["models"]["classifier"])
+ if "classifier" in ai_raw.get("models", {})
+ else None
+ ),
)
if "nas" in raw:
diff --git a/app/prompts/classifier.txt b/app/prompts/classifier.txt
new file mode 100644
index 0000000..4f1a94e
--- /dev/null
+++ b/app/prompts/classifier.txt
@@ -0,0 +1,33 @@
+You are an answerability judge. Given a query and evidence chunks, determine if the evidence can answer the query. Respond ONLY in JSON.
+
+## CALIBRATION (CRITICAL)
+- verdict=full: evidence is SUFFICIENT to answer the CORE of the query. Missing minor details does NOT make it insufficient.
+- verdict=partial: evidence covers SOME major aspects but CLEARLY MISSES others the user explicitly asked about.
+- verdict=insufficient: evidence has NO relevant information for the query, or is completely off-topic.
+
+Example: Query="제6장 주요 내용", Evidence covers 제6장 definition+scope → verdict=full (core is covered).
+Example: Query="제6장 처벌 조항", Evidence covers 제6장 definition but NOT 처벌 → verdict=partial.
+Example: Query="감귤 출하량", Evidence about 산업안전보건법 → verdict=insufficient.
+
+## Rules
+1. Your "verdict" must be based ONLY on whether the CONTENT semantically answers the query. Ignore retrieval scores for this field.
+2. "covered_aspects": query aspects that evidence covers. Korean labels for Korean queries.
+3. "missing_aspects": query aspects that evidence does NOT cover. Korean labels.
+4. Keep aspects concise (2-5 words each), non-overlapping.
+
+## Output Schema
+{
+ "verdict": "full" | "partial" | "insufficient",
+ "covered_aspects": ["aspect1"],
+ "missing_aspects": ["aspect2"],
+ "confidence": "high" | "medium" | "low"
+}
+
+## Query
+{query}
+
+## Evidence chunks:
+{chunks}
+
+## Retrieval scores (for reference only, NOT for verdict):
+[{scores}]
diff --git a/app/prompts/search_synthesis_partial.txt b/app/prompts/search_synthesis_partial.txt
new file mode 100644
index 0000000..635ac75
--- /dev/null
+++ b/app/prompts/search_synthesis_partial.txt
@@ -0,0 +1,34 @@
+You are a grounded answer synthesizer handling a PARTIAL answer case. Some aspects of the query CAN be answered, others CANNOT. Respond ONLY in JSON.
+
+## Task
+Answer ONLY the covered aspects. Do NOT attempt to answer missing aspects.
+
+## Output Schema
+{
+ "confirmed_items": [
+ {"aspect": "aspect label", "text": "1~2 sentence answer", "citations": [1, 2]}
+ ],
+ "confidence": "medium" | "low",
+ "refused": false
+}
+
+## Rules
+- Each confirmed_item: aspect label + 1~2 sentences + inline [n] citations
+- ONLY use facts present in evidence. No outside knowledge, no guessing.
+- Do NOT mention or address missing_aspects in your text.
+- Korean query → Korean answer / English → English
+- confidence: medium (2+ strong evidence matches) / low (1 or weak)
+- Max total text: 400 chars across all items
+- 모든 주장 문장 끝에 [n] 필수
+
+## Covered aspects (answer these):
+{covered_aspects}
+
+## Missing aspects (do NOT answer these):
+{missing_aspects}
+
+## Query
+{query}
+
+## Evidence
+{numbered_evidence}
diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py
new file mode 100644
index 0000000..1202d02
--- /dev/null
+++ b/app/services/search/classifier_service.py
@@ -0,0 +1,150 @@
+"""Answerability classifier (Phase 3.5a).
+
+exaone3.5:7.8b GPU Ollama 기반. MLX gate 밖 — evidence extraction 과 병렬 실행.
+
+P1 실측 결과: ternary (full/partial/insufficient) 불안정 → **binary (sufficient/insufficient)**.
+"full" vs "partial" 구분은 grounding_check 의 intent alignment 이 담당.
+
+Classifier verdict 는 "relevant evidence 가 있나" 의 binary 판단.
+covered_aspects / missing_aspects are not used by the refusal gate; they are logged and passed through to the /ask response.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import time
+from dataclasses import dataclass
+from typing import Literal
+
+from ai.client import AIClient, _load_prompt, parse_json_response
+from core.config import settings
+from core.utils import setup_logger
+
+logger = setup_logger("classifier")
+
+LLM_TIMEOUT_MS = 5000
+CIRCUIT_THRESHOLD = 5
+CIRCUIT_RECOVERY_SEC = 60
+
+_failure_count = 0
+_circuit_open_until: float | None = None
+
+
+@dataclass(slots=True)  # outcome of one classify() call; callers build it positionally
+class ClassifierResult:
+    status: Literal["ok", "timeout", "error", "circuit_open", "skipped"]  # "ok" only when the reply parsed as a JSON object
+    verdict: Literal["sufficient", "insufficient"] | None  # None when status != "ok" or the raw verdict was unrecognised
+    covered_aspects: list[str]  # aspects the model reported as covered (empty on any failure path)
+    missing_aspects: list[str]  # aspects the model reported as missing (empty on any failure path)
+    elapsed_ms: float  # wall-clock milliseconds of the call; 0.0 for circuit_open / skipped
+
+
+try:
+ CLASSIFIER_PROMPT = _load_prompt("classifier.txt")
+except FileNotFoundError:
+ CLASSIFIER_PROMPT = ""
+ logger.warning("classifier.txt not found — classifier will always skip")
+
+
+def _build_input(
+    query: str,
+    top_chunks: list[dict],
+    rerank_scores: list[float],
+) -> str:
+    """Render the classifier prompt from the query, the top-3 chunks and the top-3 scores."""
+    import re  # function-scope import: this module has no file-level `import re`
+    # Single-pass substitution: chained str.replace would re-expand a literal "{chunks}" or "{scores}" found inside the query.
+    fills = {
+        "query": query,
+        "chunks": "\n".join(
+            f"[{i}] title: {c.get('title', '')}\n"
+            f" section: {c.get('section', '')}\n"
+            f" snippet: {c.get('snippet', '')}"
+            for i, c in enumerate(top_chunks[:3], start=1)
+        ),
+        "scores": ", ".join(f"{s:.2f}" for s in rerank_scores[:3]),
+    }
+    return re.sub(r"\{(query|chunks|scores)\}", lambda m: fills[m.group(1)], CLASSIFIER_PROMPT)
+
+
+async def classify(
+    query: str,
+    top_chunks: list[dict],
+    rerank_scores: list[float],
+) -> ClassifierResult:
+    """Run the binary answerability classifier (intended to run in parallel with evidence extraction).
+
+    Args:
+        query: user query text.
+        top_chunks: chunk dicts with "title" / "section" / "snippet" keys (first 3 are used).
+        rerank_scores: retrieval scores shown to the model for reference (first 3 are used).
+
+    Returns:
+        ClassifierResult. verdict is "sufficient" / "insufficient" only when status is "ok"
+        and the model's verdict was recognised; otherwise None and the caller must fall back.
+        Timeouts and errors of the LLM call are reported through status, not raised.
+    """
+    global _failure_count, _circuit_open_until
+    t_start = time.perf_counter()
+
+    # Circuit breaker: while open, skip the LLM call entirely.
+    if _circuit_open_until and time.time() < _circuit_open_until:
+        return ClassifierResult("circuit_open", None, [], [], 0.0)
+
+    # Missing prompt file or missing config section -> classifier is disabled.
+    if not CLASSIFIER_PROMPT:
+        return ClassifierResult("skipped", None, [], [], 0.0)
+    if not hasattr(settings.ai, "classifier") or settings.ai.classifier is None:
+        return ClassifierResult("skipped", None, [], [], 0.0)
+
+    prompt = _build_input(query, top_chunks, rerank_scores)
+    client = AIClient()
+    try:
+        # Not routed through the MLX gate; per the original note, Ollama (exaone) handles concurrent requests.
+        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
+            raw = await client._request(settings.ai.classifier, prompt)
+        _failure_count = 0
+    except Exception as e:
+        # Timeouts and other failures share the same circuit-breaker accounting.
+        timed_out = isinstance(e, asyncio.TimeoutError)
+        _failure_count += 1
+        if _failure_count >= CIRCUIT_THRESHOLD:
+            _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC
+            logger.error(f"classifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s")
+        logger.warning("classifier timeout" if timed_out else f"classifier error: {e}")
+        return ClassifierResult(
+            "timeout" if timed_out else "error", None, [], [],
+            (time.perf_counter() - t_start) * 1000,
+        )
+    finally:
+        await client.close()
+
+    elapsed_ms = (time.perf_counter() - t_start) * 1000
+    parsed = parse_json_response(raw)
+    if not isinstance(parsed, dict):
+        logger.warning("classifier parse failed raw=%r", (raw or "")[:200])
+        return ClassifierResult("error", None, [], [], elapsed_ms)
+
+    # Ternary -> binary mapping: "full" / "partial" / "sufficient" all count as sufficient.
+    raw_verdict = parsed.get("verdict", "")
+    if raw_verdict == "insufficient":
+        verdict: Literal["sufficient", "insufficient"] | None = "insufficient"
+    elif raw_verdict in ("full", "partial", "sufficient"):
+        verdict = "sufficient"
+    else:
+        verdict = None
+
+    # LLM output is untrusted: keep only non-empty strings so the list[str] fields
+    # (and the API response built from them) never receive numbers, dicts or nulls.
+    aspects: list[list[str]] = []
+    for key in ("covered_aspects", "missing_aspects"):
+        value = parsed.get(key)
+        items = value if isinstance(value, list) else []
+        aspects.append([a for a in items if isinstance(a, str) and a.strip()])
+    covered, missing = aspects
+
+    logger.info(
+        "classifier ok query=%r verdict=%s (raw=%s) covered=%d missing=%d elapsed_ms=%.0f",
+        query[:60], verdict, raw_verdict, len(covered), len(missing), elapsed_ms,
+    )
+    return ClassifierResult("ok", verdict, covered, missing, elapsed_ms)
diff --git a/app/services/search/grounding_check.py b/app/services/search/grounding_check.py
new file mode 100644
index 0000000..a03417d
--- /dev/null
+++ b/app/services/search/grounding_check.py
@@ -0,0 +1,131 @@
+"""Grounding check — post-synthesis 검증 (Phase 3.5a).
+
+Strong/weak flag 분리:
+- **Strong** (→ partial 강등 or refuse): fabricated_number, intent_misalignment(important)
+- **Weak** (→ confidence lower only): uncited_claim, low_overlap, intent_misalignment(generic)
+
+Re-gate 로직 (Phase 3.5a 9라운드 토론 결과):
+- strong 1개 → partial 강등
+- strong 2개 이상 → refuse
+- weak → confidence lowered only (high → medium); no partial/refuse
+
+Intent alignment (rule-based):
+- query 의 핵심 명사가 answer 에 등장하는지 확인
+- "처벌" 같은 중요 키워드 누락은 strong
+- "주요", "관련" 같은 generic 은 무시
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from core.utils import setup_logger
+
+if TYPE_CHECKING:
+ from .evidence_service import EvidenceItem
+
+logger = setup_logger("grounding")
+
+# "주요", "관련" 등 intent alignment 에서 제외할 generic 단어
+GENERIC_TERMS = frozenset({
+ "주요", "관련", "내용", "정의", "기준", "방법", "설명", "개요",
+ "대한", "위한", "대해", "무엇", "어떤", "어떻게", "있는",
+ "하는", "되는", "이런", "그런", "이것", "그것",
+})
+
+
+@dataclass(slots=True)  # result of check(): each flag is a "kind:detail" string
+class GroundingResult:
+    strong_flags: list[str]  # fabricated_number:* / intent_misalignment:* entries
+    weak_flags: list[str]  # intent_misalignment_generic:* / uncited_claim:* / low_overlap:* entries
+
+
+def _extract_number_literals(text: str) -> set[str]:
+    """Collect number literals: unit-suffixed forms, their comma-free numeric part, and bare integers."""
+    found: set[str] = set()
+    for match in re.finditer(r'\d[\d,.]*\s*[명인개%년월일조항호세]\w{0,2}', text):
+        literal = match.group()
+        found.add(literal.strip())
+        # Also keep the leading numeric part with thousands separators removed.
+        numeric = re.match(r'[\d,.]+', literal)
+        if numeric is not None:
+            found.add(numeric.group().replace(',', ''))
+    # Stand-alone digit runs delimited by word boundaries.
+    found.update(re.findall(r'\b\d+\b', text))
+    return found
+
+
+def _extract_content_tokens(text: str) -> set[str]:
+    """Return the distinct Hangul runs (2+ chars) and ASCII-letter runs (3+ chars) found in *text*."""
+    return {m.group() for m in re.finditer(r'[가-힣]{2,}|[a-zA-Z]{3,}', text)}
+
+
+def check(
+    query: str,
+    answer: str,
+    evidence: list[EvidenceItem],
+) -> GroundingResult:
+    """Validate a synthesized answer against its evidence and against the query intent.
+
+    Returns strong flags (fabricated_number, intent_misalignment) and weak flags
+    (intent_misalignment_generic, uncited_claim, low_overlap). An empty answer or
+    empty evidence yields no flags. Purely rule-based: regexes and set operations.
+    """
+    strong: list[str] = []
+    weak: list[str] = []
+
+    if not answer or not evidence:
+        return GroundingResult([], [])
+
+    evidence_text = " ".join(e.span_text for e in evidence)
+
+    # ── Strong 1: fabricated number ──
+    # Citation markers such as "[1]" are references, not numeric claims: strip them
+    # first, otherwise a cited answer over digit-free evidence gets a strong flag.
+    answer_nums = _extract_number_literals(re.sub(r'\[\d+\]', ' ', answer))
+    evidence_digits = [re.sub(r'[^\d]', '', en) for en in _extract_number_literals(evidence_text)]
+    for num in answer_nums:
+        digits_only = re.sub(r'[^\d]', '', num)
+        # Substring match on the digit string, e.g. answer "1000" is backed by evidence "1,000명".
+        if digits_only and not any(digits_only in ed for ed in evidence_digits):
+            strong.append(f"fabricated_number:{num}")
+
+    # ── Strong/Weak 2: query-answer intent alignment ──
+    query_content = _extract_content_tokens(query)
+    answer_content = _extract_content_tokens(answer)
+    if query_content:
+        missing_terms = query_content - answer_content
+        # Content tokens are already 2+ chars, so only the generic-term filter is needed.
+        important_missing = [t for t in missing_terms if t not in GENERIC_TERMS]
+        if important_missing:
+            strong.append(
+                f"intent_misalignment:{','.join(important_missing[:3])}"
+            )
+        elif len(missing_terms) > len(query_content) * 0.5:
+            weak.append(
+                f"intent_misalignment_generic:"
+                f"missing({','.join(list(missing_terms)[:5])})"
+            )
+
+    # ── Weak 1: uncited claim ──
+    sentences = re.split(r'(?<=[.!?。])\s+', answer)
+    for s in sentences:
+        if len(s.strip()) > 20 and not re.search(r'\[\d+\]', s):
+            weak.append(f"uncited_claim:{s[:40]}")
+
+    # ── Weak 2: token overlap ──
+    evidence_tokens = _extract_content_tokens(evidence_text)
+    if answer_content:
+        overlap = len(answer_content & evidence_tokens) / len(answer_content)
+        if overlap < 0.4:
+            weak.append(f"low_overlap:{overlap:.2f}")
+
+    if strong or weak:
+        logger.info(
+            "grounding query=%r strong=%d weak=%d flags=%s",
+            query[:60], len(strong), len(weak), ",".join(strong[:3] + weak[:3]),
+        )
+
+    return GroundingResult(strong, weak)
diff --git a/app/services/search/refusal_gate.py b/app/services/search/refusal_gate.py
new file mode 100644
index 0000000..60eff67
--- /dev/null
+++ b/app/services/search/refusal_gate.py
@@ -0,0 +1,105 @@
+"""Refusal gate — multi-signal fusion (Phase 3.5a).
+
+Score gate (deterministic) + classifier verdict (semantic, binary) 를 독립 평가 후 합성.
+Classifier 부재 시 3-tier conservative fallback.
+
+P1 실측 결과: exaone ternary 불안정 → binary (sufficient/insufficient) 로 축소.
+"full" vs "partial" 구분은 grounding check (intent alignment) 가 담당.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Literal
+
+from core.utils import setup_logger
+
+if TYPE_CHECKING:
+ from .classifier_service import ClassifierResult
+
+logger = setup_logger("refusal_gate")
+
+# Placeholder thresholds — to be tuned from measurements in Phase 3.5b.
+# The score gate ANDs both conditions (max < SCORE_MAX_REFUSE and top-3 sum < SCORE_AGG_REFUSE), which limits false refusals.
+SCORE_MAX_REFUSE = 0.25
+SCORE_AGG_REFUSE = 0.70
+
+# Conservative fallback tiers (classifier 부재 시)
+CONSERVATIVE_WEAK = 0.35
+CONSERVATIVE_MID = 0.55
+
+
+@dataclass(slots=True)
+class RefusalDecision:
+    refused: bool  # True -> the caller skips synthesis
+    confidence_cap: Literal["high", "medium", "low"] | None  # upper bound for answer confidence; None = no cap
+    rule_triggered: str | None  # debugging aid: which signal produced this decision
+
+
+def decide(
+    rerank_scores: list[float],
+    classifier: ClassifierResult | None,
+) -> RefusalDecision:
+    """Combine the deterministic score gate with the binary classifier verdict.
+
+    Decision order:
+        1. no usable verdict -> tiers on the top score (refuse / cap "low" / cap "medium")
+        2. verdict "insufficient" -> refuse
+        3. score gate fails -> refuse
+        4. otherwise -> proceed, no cap
+
+    Args:
+        rerank_scores: relevance scores of the candidates; an empty list is scored 0.0.
+        classifier: classifier outcome, or None. Without a usable verdict the
+            conservative score-only tiers decide.
+
+    Returns:
+        RefusalDecision. refused=True tells the caller to skip synthesis;
+        confidence_cap is an upper bound for the synthesized answer's confidence;
+        rule_triggered names the signal that decided (None on the plain pass path).
+    """
+    top_score = 0.0 if not rerank_scores else max(rerank_scores)
+    top3_sum = sum(sorted(rerank_scores, reverse=True)[:3])
+
+    def _result(
+        refused: bool,
+        cap: Literal["high", "medium", "low"] | None = None,
+        rule: str | None = None,
+    ) -> RefusalDecision:
+        # Single construction point so every branch below stays a one-liner.
+        return RefusalDecision(refused=refused, confidence_cap=cap, rule_triggered=rule)
+
+    # ── No usable verdict → 3-tier conservative fallback on the top score ──
+    if not classifier or classifier.verdict is None:
+        if top_score < CONSERVATIVE_WEAK:
+            return _result(True, rule="conservative_refuse(no_classifier)")
+        if top_score < CONSERVATIVE_MID:
+            return _result(False, "low", "conservative_low(no_classifier)")
+        return _result(False, "medium", "conservative_medium(no_classifier)")
+
+    # ── Verdict available (normal path) ──
+    if classifier.verdict == "insufficient":
+        # The classifier alone is enough to refuse here; scores are only logged.
+        # NOTE(review): an evidence-quality override is expected to live in the
+        # caller — nothing in this function implements it; confirm it exists.
+        logger.info(
+            "refusal gate: classifier=insufficient max=%.2f agg=%.2f",
+            top_score,
+            top3_sum,
+        )
+        return _result(True, rule="classifier_insufficient")
+
+    # The score gate fails only when BOTH the best score and the top-3 sum are
+    # below their thresholds (placeholder values, to be tuned later).
+    score_gate_fails = top_score < SCORE_MAX_REFUSE and top3_sum < SCORE_AGG_REFUSE
+    if score_gate_fails:
+        logger.info(
+            "refusal gate: score_low max=%.2f agg=%.2f classifier=%s",
+            top_score,
+            top3_sum,
+            classifier.verdict,
+        )
+        return _result(True, rule="score_low")
+
+    # Sufficient verdict and acceptable scores → proceed without a cap.
+    return _result(False)
diff --git a/app/services/search/sentence_splitter.py b/app/services/search/sentence_splitter.py
new file mode 100644
index 0000000..b171e2c
--- /dev/null
+++ b/app/services/search/sentence_splitter.py
@@ -0,0 +1,33 @@
+"""문장 분할 (Phase 3.5a — regex 기반).
+
+Phase 3.5b 에서 KSS 라이브러리 기반으로 업그레이드 예정.
+"""
+
+import re
+
+MIN_SENTENCE_CHARS = 15
+
+
+def split_sentences(text: str) -> list[str]:
+    """Split mixed Korean/English text into sentences.
+
+    Boundaries are whitespace following ``. ! ? 。`` and runs of two or more
+    newlines. A fragment shorter than MIN_SENTENCE_CHARS is appended to the
+    sentence before it; a short leading fragment has nothing to join and is kept.
+    """
+    boundary = r'(?<=[.!?。])\s+|(?<=[다됨음함]\.)\s+|\n{2,}'
+    fragments = [piece.strip() for piece in re.split(boundary, text)]
+
+    sentences: list[str] = []
+    for fragment in filter(None, fragments):
+        too_short = len(fragment) < MIN_SENTENCE_CHARS
+        if too_short and sentences:
+            sentences[-1] = f"{sentences[-1]} {fragment}"
+        else:
+            sentences.append(fragment)
+
+    if sentences:
+        return sentences
+    # Defensive fallback kept from the original: whole text if non-blank, else nothing.
+    stripped = text.strip()
+    return [stripped] if stripped else []
diff --git a/config.yaml b/config.yaml
index 7b5d589..9e64b24 100644
--- a/config.yaml
+++ b/config.yaml
@@ -35,6 +35,12 @@ ai:
rerank:
endpoint: "http://ollama:11434/api/rerank"
model: "bge-reranker-v2-m3"
+ # Phase 3.5a: exaone answerability classifier (GPU Ollama, concurrent OK)
+ classifier:
+ endpoint: "http://ollama:11434/v1/chat/completions"
+ model: "exaone3.5:7.8b-instruct-q8_0"
+ max_tokens: 512
+ timeout: 10
nas:
mount_path: "/documents"
diff --git a/frontend/src/lib/components/ask/AskAnswer.svelte b/frontend/src/lib/components/ask/AskAnswer.svelte
index 82d7dcd..e94eed6 100644
--- a/frontend/src/lib/components/ask/AskAnswer.svelte
+++ b/frontend/src/lib/components/ask/AskAnswer.svelte
@@ -63,10 +63,14 @@
};
let tokens = $derived(data?.ai_answer ? splitAnswer(data.ai_answer) : []);
- let showAnswer = $derived(
- !!data && !!data.ai_answer && data.synthesis_status === 'completed' && !data.refused,
+ let showFullAnswer = $derived(
+ !!data && !!data.ai_answer && data.completeness === 'full'
+ && data.synthesis_status === 'completed' && !data.refused,
);
- let showWarning = $derived(!!data && !showAnswer);
+ let showPartial = $derived(
+ !!data && data.completeness === 'partial' && !data.refused,
+ );
+ let showWarning = $derived(!!data && !showFullAnswer && !showPartial);
@@ -107,7 +111,7 @@
근거 기반 답변 생성 중… 약 15초 소요
- {:else if showAnswer && data}
+ {:else if showFullAnswer && data}
{#each tokens as tok}
{#if tok.type === 'cite'}
@@ -124,6 +128,67 @@
{/if}
{/each}
+ {:else if showPartial && data}
+
+
+
일부 답변
+
+ {#if data.ai_answer}
+
+ {#each tokens as tok}
+ {#if tok.type === 'cite'}
+
+ {:else}
+ {tok.value}
+ {/if}
+ {/each}
+
+ {:else if data.confirmed_items?.length}
+
+
✓ 답변 가능
+
+ {#each data.confirmed_items as item}
+ -
+ {item.aspect}:
+ {item.text}
+ {#each item.citations as n}
+
+ {/each}
+
+ {/each}
+
+
+ {/if}
+
+ {#if data.missing_aspects?.length}
+
+
✗ 답변 불가
+
+ {#each data.missing_aspects as aspect}
+ - {aspect} (근거 없음)
+ {/each}
+
+
+ {/if}
+
+
+
+
+
{:else if showWarning && data}