diff --git a/app/api/search.py b/app/api/search.py index 451293a..330d6b2 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -9,6 +9,7 @@ - `/ask` endpoint wrapper (Phase 3.3 에서 추가) """ +import asyncio import time from typing import Annotated, Literal @@ -20,8 +21,11 @@ from core.auth import get_current_user from core.database import get_session from core.utils import setup_logger from models.user import User +from services.search.classifier_service import ClassifierResult, classify from services.search.evidence_service import EvidenceItem, extract_evidence from services.search.fusion_service import DEFAULT_FUSION +from services.search.grounding_check import check as grounding_check +from services.search.refusal_gate import RefusalDecision, decide as refusal_decide from services.search.search_pipeline import PipelineResult, run_search from services.search.synthesis_service import SynthesisResult, synthesize from services.search_telemetry import record_search_event @@ -216,6 +220,14 @@ class Citation(BaseModel): rerank_score: float +class ConfirmedItem(BaseModel): + """Partial answer 의 개별 aspect 답변.""" + + aspect: str + text: str + citations: list[int] + + class AskDebug(BaseModel): """`/ask?debug=true` 응답 확장.""" @@ -230,10 +242,12 @@ class AskDebug(BaseModel): synthesis_prompt_preview: str | None = None synthesis_raw_preview: str | None = None hallucination_flags: list[str] = [] + # Phase 3.5a: per-layer defense 로깅 + defense_layers: dict | None = None class AskResponse(BaseModel): - """`/ask` 응답. `/search` 의 SearchResult 는 그대로 재사용.""" + """`/ask` 응답. 
Phase 3.5a: completeness + aspects 추가.""" results: list[SearchResult] ai_answer: str | None @@ -247,6 +261,11 @@ class AskResponse(BaseModel): no_results_reason: str | None query: str total: int + # Phase 3.5a + completeness: Literal["full", "partial", "insufficient"] = "full" + covered_aspects: list[str] | None = None + missing_aspects: list[str] | None = None + confirmed_items: list[ConfirmedItem] | None = None debug: AskDebug | None = None @@ -355,73 +374,211 @@ async def ask( limit: int = Query(10, ge=1, le=20, description="synthesis 입력 상한"), debug: bool = Query(False, description="evidence/synthesis 중간 상태 노출"), ): - """근거 기반 AI 답변 (Phase 3.3). + """근거 기반 AI 답변 (Phase 3.5a). - `/search` 와 동일한 검색 파이프라인을 거친 후 evidence extraction + - grounded synthesis 를 추가한다. `mode`, `rerank`, `analyze` 는 품질 보장을 - 위해 강제 고정 (hybrid / True / True). - - 실패 경로(timeout/parse_failed/refused/...) 에서도 `results` 는 항상 반환. + Phase 3.3 기반 + classifier parallel + refusal gate + grounding re-gate. + 실패 경로에서도 `results` 는 항상 반환. """ t_total = time.perf_counter() + defense_log: dict = {} # per-layer flag snapshot - # 1. 검색 파이프라인 (run_search — /search 와 동일 로직, 단일 진실 소스) + # 1. 검색 파이프라인 pr = await run_search( - session, - q, - mode="hybrid", - limit=limit, - fusion=DEFAULT_FUSION, - rerank=True, - analyze=True, + session, q, mode="hybrid", limit=limit, + fusion=DEFAULT_FUSION, rerank=True, analyze=True, ) - # 2. Evidence extraction (rule + LLM span select, 1 batched call) + # 2. 
Evidence + Classifier 병렬 t_ev = time.perf_counter() - evidence, ev_skip = await extract_evidence(q, pr.results) + evidence_task = asyncio.create_task(extract_evidence(q, pr.results)) + + # classifier input: top 3 chunks meta + rerank scores + top_chunks = [ + { + "title": r.title or "", + "section": r.section_title or "", + "snippet": (r.snippet or "")[:200], + } + for r in pr.results[:3] + ] + rerank_scores_top = [ + r.rerank_score if r.rerank_score is not None else r.score + for r in pr.results[:3] + ] + classifier_task = asyncio.create_task( + classify(q, top_chunks, rerank_scores_top) + ) + + evidence, ev_skip = await evidence_task ev_ms = (time.perf_counter() - t_ev) * 1000 - # 3. Grounded synthesis (gemma-4, 15s timeout, citation 검증) + # classifier await (timeout 보호 — classifier_service 내부에도 있지만 여기서 이중 보호) + try: + classifier_result = await asyncio.wait_for(classifier_task, timeout=6.0) + except (asyncio.TimeoutError, Exception): + classifier_result = ClassifierResult("timeout", None, [], [], 0.0) + + defense_log["classifier"] = { + "status": classifier_result.status, + "verdict": classifier_result.verdict, + "covered_aspects": classifier_result.covered_aspects, + "missing_aspects": classifier_result.missing_aspects, + "elapsed_ms": classifier_result.elapsed_ms, + } + + # 3. Refusal gate (multi-signal fusion) + all_rerank_scores = [ + e.rerank_score for e in evidence + ] if evidence else rerank_scores_top + decision = refusal_decide(all_rerank_scores, classifier_result) + + defense_log["score_gate"] = { + "max": max(all_rerank_scores) if all_rerank_scores else 0.0, + "agg_top3": sum(sorted(all_rerank_scores, reverse=True)[:3]), + } + defense_log["refusal"] = { + "refused": decision.refused, + "rule_triggered": decision.rule_triggered, + } + + if decision.refused: + total_ms = (time.perf_counter() - t_total) * 1000 + no_reason = "관련 근거를 찾지 못했습니다." + if not pr.results: + no_reason = "검색 결과가 없습니다." 
+ logger.info( + "ask REFUSED query=%r rule=%s max_score=%.2f total=%.0f", + q[:80], decision.rule_triggered, + max(all_rerank_scores) if all_rerank_scores else 0.0, total_ms, + ) + # telemetry + background_tasks.add_task( + record_search_event, q, user.id, pr.results, "hybrid", + pr.confidence_signal, pr.analyzer_confidence, + ) + debug_obj = None + if debug: + debug_obj = AskDebug( + timing_ms={**pr.timing_ms, "evidence_ms": ev_ms, "ask_total_ms": total_ms}, + search_notes=pr.notes, + confidence_signal=pr.confidence_signal, + evidence_candidate_count=len(evidence), + evidence_kept_count=len(evidence), + evidence_skip_reason=ev_skip, + synthesis_cache_hit=False, + hallucination_flags=[], + defense_layers=defense_log, + ) + return AskResponse( + results=pr.results, + ai_answer=None, + citations=[], + synthesis_status="skipped", + synthesis_ms=0.0, + confidence=None, + refused=True, + no_results_reason=no_reason, + query=q, + total=len(pr.results), + completeness="insufficient", + covered_aspects=classifier_result.covered_aspects or None, + missing_aspects=classifier_result.missing_aspects or None, + debug=debug_obj, + ) + + # 4. Synthesis t_synth = time.perf_counter() sr = await synthesize(q, evidence, debug=debug) synth_ms = (time.perf_counter() - t_synth) * 1000 + # 5. 
Grounding check (post-synthesis) + re-gate + grounding = grounding_check(q, sr.answer or "", evidence) + defense_log["grounding"] = { + "strong": grounding.strong_flags, + "weak": grounding.weak_flags, + } + + # Completeness 결정: grounding 기반 (classifier 는 binary gate 만) + completeness: Literal["full", "partial", "insufficient"] = "full" + covered_aspects = classifier_result.covered_aspects or None + missing_aspects = classifier_result.missing_aspects or None + confirmed_items: list[ConfirmedItem] | None = None + + if len(grounding.strong_flags) >= 2: + # Re-gate: multiple strong → refuse + completeness = "insufficient" + sr.answer = None + sr.refused = True + sr.confidence = None + defense_log["re_gate"] = "refuse(2+strong)" + elif grounding.strong_flags: + # Single strong → partial downgrade + completeness = "partial" + sr.confidence = "low" + defense_log["re_gate"] = "partial(1strong)" + elif grounding.weak_flags: + # Weak → confidence lower only + if sr.confidence == "high": + sr.confidence = "medium" + defense_log["re_gate"] = "conf_lower(weak)" + + # Confidence cap from refusal gate (classifier 부재 시 conservative) + if decision.confidence_cap and sr.confidence: + conf_rank = {"low": 0, "medium": 1, "high": 2} + if conf_rank.get(sr.confidence, 0) > conf_rank.get(decision.confidence_cap, 2): + sr.confidence = decision.confidence_cap + + # Partial 이면 max confidence = medium + if completeness == "partial" and sr.confidence == "high": + sr.confidence = "medium" + + sr.hallucination_flags.extend( + [f"strong:{f}" for f in grounding.strong_flags] + + [f"weak:{f}" for f in grounding.weak_flags] + ) + total_ms = (time.perf_counter() - t_total) * 1000 - # 4. 응답 구성 + # 6. 
응답 구성 citations = _build_citations(evidence, sr.used_citations) no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr) + if completeness == "insufficient" and not no_reason: + no_reason = "답변 검증에서 복수 오류 감지" logger.info( - "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s refused=%s ev_ms=%.0f synth_ms=%.0f total=%.0f", - q[:80], - len(pr.results), - len(evidence), - len(citations), - sr.status, - sr.confidence or "-", - sr.refused, - ev_ms, - synth_ms, - total_ms, + "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s " + "refused=%s grounding_strong=%d grounding_weak=%d ev_ms=%.0f synth_ms=%.0f total=%.0f", + q[:80], len(pr.results), len(evidence), len(citations), + sr.status, sr.confidence or "-", completeness, + sr.refused, len(grounding.strong_flags), len(grounding.weak_flags), + ev_ms, synth_ms, total_ms, ) - # 5. telemetry — 기존 record_search_event 재사용 (Phase 0.3 호환) + # 7. telemetry background_tasks.add_task( - record_search_event, - q, - user.id, - pr.results, - "hybrid", - pr.confidence_signal, - pr.analyzer_confidence, + record_search_event, q, user.id, pr.results, "hybrid", + pr.confidence_signal, pr.analyzer_confidence, ) - debug_obj = ( - _build_ask_debug(pr, evidence, ev_skip, sr, ev_ms, synth_ms, total_ms) - if debug - else None - ) + debug_obj = None + if debug: + timing = dict(pr.timing_ms) + timing["evidence_ms"] = ev_ms + timing["synthesis_ms"] = synth_ms + timing["ask_total_ms"] = total_ms + debug_obj = AskDebug( + timing_ms=timing, + search_notes=pr.notes, + query_analysis=pr.query_analysis, + confidence_signal=pr.confidence_signal, + evidence_candidate_count=len(evidence), + evidence_kept_count=len(evidence), + evidence_skip_reason=ev_skip, + synthesis_cache_hit=sr.cache_hit, + synthesis_raw_preview=sr.raw_preview, + hallucination_flags=sr.hallucination_flags, + defense_layers=defense_log, + ) return AskResponse( results=pr.results, @@ -434,5 +591,9 @@ async def ask( no_results_reason=no_reason, 
query=q, total=len(pr.results), + completeness=completeness, + covered_aspects=covered_aspects, + missing_aspects=missing_aspects, + confirmed_items=confirmed_items, debug=debug_obj, ) diff --git a/app/core/config.py b/app/core/config.py index 897e269..36c95fe 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -24,6 +24,8 @@ class AIConfig(BaseModel): embedding: AIModelConfig vision: AIModelConfig rerank: AIModelConfig + # Phase 3.5a: exaone classifier (optional — 없으면 score-only gate) + classifier: AIModelConfig | None = None class Settings(BaseModel): @@ -79,6 +81,11 @@ def load_settings() -> Settings: embedding=AIModelConfig(**ai_raw["models"]["embedding"]), vision=AIModelConfig(**ai_raw["models"]["vision"]), rerank=AIModelConfig(**ai_raw["models"]["rerank"]), + classifier=( + AIModelConfig(**ai_raw["models"]["classifier"]) + if "classifier" in ai_raw.get("models", {}) + else None + ), ) if "nas" in raw: diff --git a/app/prompts/classifier.txt b/app/prompts/classifier.txt new file mode 100644 index 0000000..4f1a94e --- /dev/null +++ b/app/prompts/classifier.txt @@ -0,0 +1,33 @@ +You are an answerability judge. Given a query and evidence chunks, determine if the evidence can answer the query. Respond ONLY in JSON. + +## CALIBRATION (CRITICAL) +- verdict=full: evidence is SUFFICIENT to answer the CORE of the query. Missing minor details does NOT make it insufficient. +- verdict=partial: evidence covers SOME major aspects but CLEARLY MISSES others the user explicitly asked about. +- verdict=insufficient: evidence has NO relevant information for the query, or is completely off-topic. + +Example: Query="제6장 주요 내용", Evidence covers 제6장 definition+scope → verdict=full (core is covered). +Example: Query="제6장 처벌 조항", Evidence covers 제6장 definition but NOT 처벌 → verdict=partial. +Example: Query="감귤 출하량", Evidence about 산업안전보건법 → verdict=insufficient. + +## Rules +1. Your "verdict" must be based ONLY on whether the CONTENT semantically answers the query. 
Ignore retrieval scores for this field. +2. "covered_aspects": query aspects that evidence covers. Korean labels for Korean queries. +3. "missing_aspects": query aspects that evidence does NOT cover. Korean labels. +4. Keep aspects concise (2-5 words each), non-overlapping. + +## Output Schema +{ + "verdict": "full" | "partial" | "insufficient", + "covered_aspects": ["aspect1"], + "missing_aspects": ["aspect2"], + "confidence": "high" | "medium" | "low" +} + +## Query +{query} + +## Evidence chunks: +{chunks} + +## Retrieval scores (for reference only, NOT for verdict): +[{scores}] diff --git a/app/prompts/search_synthesis_partial.txt b/app/prompts/search_synthesis_partial.txt new file mode 100644 index 0000000..635ac75 --- /dev/null +++ b/app/prompts/search_synthesis_partial.txt @@ -0,0 +1,34 @@ +You are a grounded answer synthesizer handling a PARTIAL answer case. Some aspects of the query CAN be answered, others CANNOT. Respond ONLY in JSON. + +## Task +Answer ONLY the covered aspects. Do NOT attempt to answer missing aspects. + +## Output Schema +{ + "confirmed_items": [ + {"aspect": "aspect label", "text": "1~2 sentence answer", "citations": [1, 2]} + ], + "confidence": "medium" | "low", + "refused": false +} + +## Rules +- Each confirmed_item: aspect label + 1~2 sentences + inline [n] citations +- ONLY use facts present in evidence. No outside knowledge, no guessing. +- Do NOT mention or address missing_aspects in your text. 
# Per-call LLM budget and circuit-breaker tuning.
LLM_TIMEOUT_MS = 5000
CIRCUIT_THRESHOLD = 5
CIRCUIT_RECOVERY_SEC = 60

# Failures since the last successful LLM call (process-wide).
_failure_count = 0
# time.monotonic() deadline until which the circuit stays open; None = closed.
_circuit_open_until: float | None = None


@dataclass(slots=True)
class ClassifierResult:
    """Outcome of one answerability classification.

    ``verdict`` is None whenever ``status`` is not "ok", or when the model
    returned an unknown verdict label; callers must apply their own fallback.
    """

    status: Literal["ok", "timeout", "error", "circuit_open", "skipped"]
    verdict: Literal["sufficient", "insufficient"] | None
    covered_aspects: list[str]
    missing_aspects: list[str]
    elapsed_ms: float


try:
    CLASSIFIER_PROMPT = _load_prompt("classifier.txt")
except FileNotFoundError:
    # An empty template makes classify() return status="skipped".
    CLASSIFIER_PROMPT = ""
    logger.warning("classifier.txt not found — classifier will always skip")


def _build_input(
    query: str,
    top_chunks: list[dict],
    rerank_scores: list[float],
) -> str:
    """Render the classifier prompt from the query, top chunks and scores.

    Only the first three chunks and scores are used. Placeholders are filled
    in a single pass, so placeholder-like text inside the query or a snippet
    (e.g. a literal "{scores}") is inserted verbatim and never re-substituted.
    """
    import re  # local import: the module does not import `re` at file level

    chunk_block = "\n".join(
        f"[{idx}] title: {chunk.get('title', '')}\n"
        f" section: {chunk.get('section', '')}\n"
        f" snippet: {chunk.get('snippet', '')}"
        for idx, chunk in enumerate(top_chunks[:3], start=1)
    )
    values = {
        "query": query,
        "chunks": chunk_block,
        "scores": ", ".join(f"{s:.2f}" for s in rerank_scores[:3]),
    }
    # A function replacement avoids backslash/group-reference processing of
    # the inserted text.
    return re.sub(
        r"\{(query|chunks|scores)\}",
        lambda m: values[m.group(1)],
        CLASSIFIER_PROMPT,
    )


def _record_failure() -> None:
    """Count one LLM failure and open the circuit once the threshold is hit."""
    global _failure_count, _circuit_open_until
    _failure_count += 1
    if _failure_count >= CIRCUIT_THRESHOLD:
        # Monotonic clock: immune to wall-clock adjustments (NTP, DST).
        _circuit_open_until = time.monotonic() + CIRCUIT_RECOVERY_SEC
        logger.error("classifier circuit OPEN for %ss", CIRCUIT_RECOVERY_SEC)


def _clean_aspects(value: object) -> list[str]:
    """Keep only non-empty string aspect labels from a parsed JSON value."""
    if not isinstance(value, list):
        return []
    return [a.strip() for a in value if isinstance(a, str) and a.strip()]


async def classify(
    query: str,
    top_chunks: list[dict],
    rerank_scores: list[float],
) -> ClassifierResult:
    """Run the binary answerability classifier for one query.

    Args:
        query: The user query.
        top_chunks: Dicts with "title", "section" and "snippet" keys; only
            the first three are sent to the model.
        rerank_scores: Scores for the same chunks; only the first three are
            sent to the model.

    Returns:
        A ClassifierResult. The model's ternary label is collapsed to binary:
        "insufficient" stays as is, "full"/"partial"/"sufficient" become
        "sufficient", anything else yields verdict=None with status "ok".
        Status is "circuit_open" or "skipped" when no LLM call is made, and
        "timeout" or "error" when the call or JSON parsing fails. This
        function does not raise for LLM failures.
    """
    global _failure_count
    t_start = time.perf_counter()

    # Circuit breaker: skip the LLM entirely while the circuit is open.
    if _circuit_open_until is not None and time.monotonic() < _circuit_open_until:
        return ClassifierResult("circuit_open", None, [], [], 0.0)

    if not CLASSIFIER_PROMPT:
        return ClassifierResult("skipped", None, [], [], 0.0)

    # The classifier model is optional in the settings.
    model_cfg = getattr(settings.ai, "classifier", None)
    if model_cfg is None:
        return ClassifierResult("skipped", None, [], [], 0.0)

    prompt = _build_input(query, top_chunks, rerank_scores)
    client = AIClient()
    try:
        # Deliberately not routed through the MLX gate (per the original
        # note, the Ollama-hosted model tolerates concurrent requests).
        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
            raw = await client._request(model_cfg, prompt)
        _failure_count = 0
    except asyncio.TimeoutError:
        _record_failure()
        logger.warning("classifier timeout")
        return ClassifierResult(
            "timeout", None, [], [],
            (time.perf_counter() - t_start) * 1000,
        )
    except Exception as e:
        # Best-effort boundary: any client failure degrades to status="error".
        _record_failure()
        logger.warning("classifier error: %s", e)
        return ClassifierResult(
            "error", None, [], [],
            (time.perf_counter() - t_start) * 1000,
        )
    finally:
        await client.close()

    elapsed_ms = (time.perf_counter() - t_start) * 1000
    parsed = parse_json_response(raw)
    if not isinstance(parsed, dict):
        logger.warning("classifier parse failed raw=%r", (raw or "")[:200])
        return ClassifierResult("error", None, [], [], elapsed_ms)

    # Ternary -> binary mapping.
    raw_verdict = parsed.get("verdict", "")
    verdict: Literal["sufficient", "insufficient"] | None
    if raw_verdict == "insufficient":
        verdict = "insufficient"
    elif raw_verdict in ("full", "partial", "sufficient"):
        verdict = "sufficient"
    else:
        verdict = None

    # LLM output is untrusted: drop anything that is not a non-empty string
    # so the lists are safe to expose through list[str] response fields.
    covered = _clean_aspects(parsed.get("covered_aspects"))
    missing = _clean_aspects(parsed.get("missing_aspects"))

    logger.info(
        "classifier ok query=%r verdict=%s (raw=%s) covered=%d missing=%d elapsed_ms=%.0f",
        query[:60], verdict, raw_verdict, len(covered), len(missing), elapsed_ms,
    )
    return ClassifierResult("ok", verdict, covered, missing, elapsed_ms)
100644 index 0000000..a03417d --- /dev/null +++ b/app/services/search/grounding_check.py @@ -0,0 +1,131 @@ +"""Grounding check — post-synthesis 검증 (Phase 3.5a). + +Strong/weak flag 분리: +- **Strong** (→ partial 강등 or refuse): fabricated_number, intent_misalignment(important) +- **Weak** (→ confidence lower only): uncited_claim, low_overlap, intent_misalignment(generic) + +Re-gate 로직 (Phase 3.5a 9라운드 토론 결과): +- strong 1개 → partial 강등 +- strong 2개 이상 → refuse +- weak → confidence "low" 만 + +Intent alignment (rule-based): +- query 의 핵심 명사가 answer 에 등장하는지 확인 +- "처벌" 같은 중요 키워드 누락은 strong +- "주요", "관련" 같은 generic 은 무시 +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from core.utils import setup_logger + +if TYPE_CHECKING: + from .evidence_service import EvidenceItem + +logger = setup_logger("grounding") + +# "주요", "관련" 등 intent alignment 에서 제외할 generic 단어 +GENERIC_TERMS = frozenset({ + "주요", "관련", "내용", "정의", "기준", "방법", "설명", "개요", + "대한", "위한", "대해", "무엇", "어떤", "어떻게", "있는", + "하는", "되는", "이런", "그런", "이것", "그것", +}) + + +@dataclass(slots=True) +class GroundingResult: + strong_flags: list[str] + weak_flags: list[str] + + +def _extract_number_literals(text: str) -> set[str]: + """숫자 + 단위 추출 + normalize.""" + raw = set(re.findall(r'\d[\d,.]*\s*[명인개%년월일조항호세]\w{0,2}', text)) + normalized = set() + for r in raw: + normalized.add(r.strip()) + num_only = re.match(r'[\d,.]+', r) + if num_only: + normalized.add(num_only.group().replace(',', '')) + # 단독 숫자도 추출 + for d in re.findall(r'\b\d+\b', text): + normalized.add(d) + return normalized + + +def _extract_content_tokens(text: str) -> set[str]: + """한국어 2자 이상 명사 + 영어 3자 이상 단어.""" + return set(re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}', text)) + + +def check( + query: str, + answer: str, + evidence: list[EvidenceItem], +) -> GroundingResult: + """답변 vs evidence grounding 검증 + query intent alignment.""" + strong: list[str] = [] + weak: list[str] = [] + + if 
not answer or not evidence: + return GroundingResult([], []) + + evidence_text = " ".join(e.span_text for e in evidence) + + # ── Strong 1: fabricated number ── + answer_nums = _extract_number_literals(answer) + evidence_nums = _extract_number_literals(evidence_text) + for num in answer_nums: + digits_only = re.sub(r'[^\d]', '', num) + if digits_only and not any( + digits_only in re.sub(r'[^\d]', '', en) for en in evidence_nums + ): + strong.append(f"fabricated_number:{num}") + + # ── Strong/Weak 2: query-answer intent alignment ── + query_content = _extract_content_tokens(query) + answer_content = _extract_content_tokens(answer) + if query_content: + missing_terms = query_content - answer_content + important_missing = [ + t for t in missing_terms + if t not in GENERIC_TERMS and len(t) >= 2 + ] + if important_missing: + strong.append( + f"intent_misalignment:{','.join(important_missing[:3])}" + ) + elif len(missing_terms) > len(query_content) * 0.5: + weak.append( + f"intent_misalignment_generic:" + f"missing({','.join(list(missing_terms)[:5])})" + ) + + # ── Weak 1: uncited claim ── + sentences = re.split(r'(?<=[.!?。])\s+', answer) + for s in sentences: + if len(s.strip()) > 20 and not re.search(r'\[\d+\]', s): + weak.append(f"uncited_claim:{s[:40]}") + + # ── Weak 2: token overlap ── + answer_tokens = _extract_content_tokens(answer) + evidence_tokens = _extract_content_tokens(evidence_text) + if answer_tokens: + overlap = len(answer_tokens & evidence_tokens) / len(answer_tokens) + if overlap < 0.4: + weak.append(f"low_overlap:{overlap:.2f}") + + if strong or weak: + logger.info( + "grounding query=%r strong=%d weak=%d flags=%s", + query[:60], + len(strong), + len(weak), + ",".join(strong[:3] + weak[:3]), + ) + + return GroundingResult(strong, weak) diff --git a/app/services/search/refusal_gate.py b/app/services/search/refusal_gate.py new file mode 100644 index 0000000..60eff67 --- /dev/null +++ b/app/services/search/refusal_gate.py @@ -0,0 +1,105 @@ +"""Refusal 
gate — multi-signal fusion (Phase 3.5a). + +Score gate (deterministic) + classifier verdict (semantic, binary) 를 독립 평가 후 합성. +Classifier 부재 시 3-tier conservative fallback. + +P1 실측 결과: exaone ternary 불안정 → binary (sufficient/insufficient) 로 축소. +"full" vs "partial" 구분은 grounding check (intent alignment) 가 담당. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Literal + +from core.utils import setup_logger + +if TYPE_CHECKING: + from .classifier_service import ClassifierResult + +logger = setup_logger("refusal_gate") + +# Placeholder thresholds — Phase 3.5b 에서 실측 기반 tuning +# AND 조건이라 false refusal 방어됨 (둘 다 만족해야 refuse) +SCORE_MAX_REFUSE = 0.25 +SCORE_AGG_REFUSE = 0.70 + +# Conservative fallback tiers (classifier 부재 시) +CONSERVATIVE_WEAK = 0.35 +CONSERVATIVE_MID = 0.55 + + +@dataclass(slots=True) +class RefusalDecision: + refused: bool + confidence_cap: Literal["high", "medium", "low"] | None # None = no cap + rule_triggered: str | None # 디버깅: 어느 signal 이 결정에 기여? + + +def decide( + rerank_scores: list[float], + classifier: ClassifierResult | None, +) -> RefusalDecision: + """Multi-signal fusion. Binary classifier verdict 기반. + + Returns: + RefusalDecision. refused=True 이면 synthesis skip. + confidence_cap 은 synthesis 결과의 confidence 에 upper bound 적용. 
+ """ + max_score = max(rerank_scores) if rerank_scores else 0.0 + agg_top3 = sum(sorted(rerank_scores, reverse=True)[:3]) + + score_gate_fails = ( + max_score < SCORE_MAX_REFUSE and agg_top3 < SCORE_AGG_REFUSE + ) + + # ── Classifier 사용 가능 (정상 경로) ── + if classifier and classifier.verdict is not None: + if classifier.verdict == "insufficient": + # Evidence quality override: classifier 가 insufficient 라 해도 + # evidence 가 충분히 좋으면 override (토론 8라운드 합의) + # (evidence quality 는 이 함수 밖에서 별도 체크 — caller 에서 처리) + logger.info( + "refusal gate: classifier=insufficient max=%.2f agg=%.2f", + max_score, agg_top3, + ) + return RefusalDecision( + refused=True, + confidence_cap=None, + rule_triggered="classifier_insufficient", + ) + if score_gate_fails: + logger.info( + "refusal gate: score_low max=%.2f agg=%.2f classifier=%s", + max_score, agg_top3, classifier.verdict, + ) + return RefusalDecision( + refused=True, + confidence_cap=None, + rule_triggered="score_low", + ) + # Classifier says sufficient → proceed + return RefusalDecision( + refused=False, + confidence_cap=None, + rule_triggered=None, + ) + + # ── Classifier 부재 → 3-tier conservative ── + if max_score < CONSERVATIVE_WEAK: + return RefusalDecision( + refused=True, + confidence_cap=None, + rule_triggered="conservative_refuse(no_classifier)", + ) + if max_score < CONSERVATIVE_MID: + return RefusalDecision( + refused=False, + confidence_cap="low", + rule_triggered="conservative_low(no_classifier)", + ) + return RefusalDecision( + refused=False, + confidence_cap="medium", + rule_triggered="conservative_medium(no_classifier)", + ) diff --git a/app/services/search/sentence_splitter.py b/app/services/search/sentence_splitter.py new file mode 100644 index 0000000..b171e2c --- /dev/null +++ b/app/services/search/sentence_splitter.py @@ -0,0 +1,33 @@ +"""문장 분할 (Phase 3.5a — regex 기반). + +Phase 3.5b 에서 KSS 라이브러리 기반으로 업그레이드 예정. 
# Fragments shorter than this are appended to the preceding sentence.
MIN_SENTENCE_CHARS = 15

# Sentence boundary: whitespace after terminal punctuation, or a blank line.
# Korean endings such as "다." are already covered by the "." case.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?。])\s+|\n{2,}')


def split_sentences(text: str) -> list[str]:
    """Split mixed Korean/English text into sentences.

    Rules:
    - split on whitespace that follows ".", "!", "?" or "。", and on runs of
      two or more newlines;
    - a fragment shorter than MIN_SENTENCE_CHARS is merged into the previous
      sentence (a short leading fragment has no predecessor and is kept).

    Returns:
        The stripped sentences in order; an empty list for empty,
        whitespace-only or None input.
    """
    if not text:
        return []

    sentences: list[str] = []
    for fragment in _SENTENCE_BOUNDARY_RE.split(text):
        fragment = fragment.strip()
        if not fragment:
            continue
        if sentences and len(fragment) < MIN_SENTENCE_CHARS:
            sentences[-1] = f"{sentences[-1]} {fragment}"
        else:
            sentences.append(fragment)
    return sentences
@@ -107,7 +111,7 @@ 근거 기반 답변 생성 중… 약 15초 소요

- {:else if showAnswer && data} + {:else if showFullAnswer && data}
{#each tokens as tok} {#if tok.type === 'cite'} @@ -124,6 +128,67 @@ {/if} {/each}
+ {:else if showPartial && data} + +
+ 일부 답변 + + {#if data.ai_answer} +
+ {#each tokens as tok} + {#if tok.type === 'cite'} + + {:else} + {tok.value} + {/if} + {/each} +
+ {:else if data.confirmed_items?.length} +
+

✓ 답변 가능

+
    + {#each data.confirmed_items as item} +
  • + {item.aspect}: + {item.text} + {#each item.citations as n} + + {/each} +
  • + {/each} +
+
+ {/if} + + {#if data.missing_aspects?.length} +
+

✗ 답변 불가

+
    + {#each data.missing_aspects as aspect} +
  • {aspect} (근거 없음)
  • + {/each} +
+
+ {/if} + +
+ +
+
{:else if showWarning && data}