diff --git a/app/ai/client.py b/app/ai/client.py index cd87f5b..60d1b82 100644 --- a/app/ai/client.py +++ b/app/ai/client.py @@ -260,23 +260,6 @@ class AIClient: cfg = self.ai.deep or self.ai.primary return await self._request(cfg, prompt, system=system) - async def call_classifier(self, prompt: str) -> str: - """answerability classifier (config ai.classifier, Mac mini 26B MLX). - - private _request 직접 호출(classifier_service)을 봉인하는 public 진입점. gate 는 - caller(classifier_service)가 acquire_mlx_gate 로 관리 — call_primary 와 동일한 - caller-managed 계약(여기서 self-gate 하면 caller 와 double-acquire 데드락). - """ - return await self._request(self.ai.classifier, prompt) - - async def call_verifier(self, prompt: str) -> str: - """semantic verifier (config ai.verifier, Mac mini 26B MLX). - - private _request 직접 호출(verifier_service)을 봉인. gate 는 caller(verifier_service) - 가 관리(caller-managed — self-gate 금지). - """ - return await self._request(self.ai.verifier, prompt) - # ─── Legacy API (classify_worker 교체 시 제거 예정) ─────────────────── async def classify(self, text: str, cfg=None) -> dict: diff --git a/app/api/search.py b/app/api/search.py index 5d6f559..c4fb1c7 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -3,42 +3,28 @@ 실제 검색 파이프라인(retrieval → fusion → rerank → diversity → confidence) 은 `services/search/search_pipeline.py::run_search()` 로 분리되어 있다. 이 파일은 다음만 담당: - - Pydantic 스키마 (SearchResult / SearchResponse / SearchDebug / DebugCandidate - / Citation / AskResponse / AskDebug) + - Pydantic 스키마 (SearchResult / SearchResponse / SearchDebug / DebugCandidate) - `/search` endpoint wrapper (run_search 호출 + logger + telemetry + 직렬화) - - `/ask` endpoint wrapper (Phase 3.3 에서 추가) """ -import asyncio -import hmac -import time from datetime import date -from typing import Annotated, Literal +from typing import Annotated -from fastapi import APIRouter, BackgroundTasks, Depends, Header, Query +from fastapi import APIRouter, BackgroundTasks, Depends, Query from fastapi.responses import JSONResponse from pydantic import BaseModel from sqlalchemy.ext.asyncio import AsyncSession from core.auth import get_current_user -from core.config import settings from core.database import get_session from core.utils import setup_logger from models.user import User -from services.document_telemetry import sanitize_source -from services.search.classifier_service import ClassifierResult, classify -from services.search.evidence_service import EvidenceItem, extract_evidence from services.search.fusion_service import DEFAULT_FUSION -from services.search.grounding_check import check as grounding_check -from services.search.refusal_gate import RefusalDecision, decide as refusal_decide from services.search import query_rewriter from services.search.retrieval_service import AxisFilter from services.search.result_decorate import compute_facets, decorate_version_status from services.search.search_pipeline import PipelineResult, run_search -from services.search.synthesis_service import SynthesisResult, synthesize -from services.search.verifier_service import VerifierResult, verify -from services.prompt_versions import ASK_PROMPT_VERSION, resolve_primary_model -from services.search_telemetry import record_ask_event, record_search_event +from services.search_telemetry import record_search_event # logs/search.log + stdout 동시 출력 (Phase 0.4) logger = setup_logger("search") @@ -354,241 +340,3 @@ async def search( debug=debug_obj, facets=facets_obj, ) - - -# ═══════════════════════════════════════════════════════════ -# Phase 3.3: /api/search/ask — Evidence + Grounded Synthesis -# ═══════════════════════════════════════════════════════════ - - -class Citation(BaseModel): - """answer 본문의 [n] 에 해당하는 근거 단일 행.""" - - n: int - chunk_id: int | None - doc_id: int - title: str | None - section_title: str | None - span_text: str # evidence LLM 이 추출한 50~300자 - full_snippet: str # 원본 800자 (citation 원문 보기 전용) - relevance: float - rerank_score: float - - -class ConfirmedItem(BaseModel): - """Partial answer 의 개별 aspect 답변.""" - - aspect: str - text: str - citations: list[int] - - -class AskDebug(BaseModel): - """`/ask?debug=true` 응답 확장.""" - - timing_ms: dict[str, float] - search_notes: list[str] - query_analysis: dict | None = None - confidence_signal: float - evidence_candidate_count: int - evidence_kept_count: int - evidence_skip_reason: str | None - synthesis_cache_hit: bool - synthesis_prompt_preview: str | None = None - synthesis_raw_preview: str | None = None - hallucination_flags: list[str] = [] - # Phase 3.5a: per-layer defense 로깅 - defense_layers: dict | None = None - - -class AskResponse(BaseModel): - """`/ask` 응답. Phase 3.5a: completeness + aspects 추가.""" - - results: list[SearchResult] - ai_answer: str | None - citations: list[Citation] - synthesis_status: Literal[ - "completed", "timeout", "skipped", "no_evidence", "parse_failed", "llm_error", - # PR-MacBook-RAG-Backend-1: 200 응답에는 등장하지 않음 (해당 status 는 503 분기). - # Literal 호환성 위해 포함. - "backend_unavailable", - ] - synthesis_ms: float - confidence: Literal["high", "medium", "low"] | None - refused: bool - no_results_reason: str | None - query: str - total: int - # Phase 3.5a - completeness: Literal["full", "partial", "insufficient"] = "full" - covered_aspects: list[str] | None = None - missing_aspects: list[str] | None = None - confirmed_items: list[ConfirmedItem] | None = None - # PR-MacBook-RAG-Backend-1: backend dispatcher metadata. - # backend 미지정 호출은 둘 다 None 으로 유지 (기존 호출자 호환 — Hermes docsrv_ask / - # voice-memo-bot 응답 형식 변동 0). 명시 opt-in 시만 채워짐. - backend_requested: str | None = None - backend_used: str | None = None - debug: AskDebug | None = None - - -def _map_no_results_reason( - pr: PipelineResult, - evidence: list[EvidenceItem], - ev_skip: str | None, - sr: SynthesisResult, -) -> str | None: - """사용자에게 보여줄 한국어 메시지 매핑. - - Failure mode 표 (plan §Failure Modes) 기반. - """ - # LLM 자가 refused → 모델이 준 사유 그대로 - if sr.refused and sr.refuse_reason: - return sr.refuse_reason - - # synthesis 상태 우선 - if sr.status == "no_evidence": - if not pr.results: - return "검색 결과가 없습니다." - return "관련도 높은 근거를 찾지 못했습니다." - if sr.status == "skipped": - return "검색 결과가 없습니다." - if sr.status == "timeout": - return "답변 생성이 지연되어 생략했습니다. 검색 결과를 확인해 주세요." - if sr.status == "parse_failed": - return "답변 형식 오류로 생략했습니다." - if sr.status == "llm_error": - return "AI 서버에 일시적 문제가 있습니다." - - # evidence 단계 실패는 fallback 을 탔더라도 notes 용 - if ev_skip == "all_low_rerank": - return "관련도 높은 근거를 찾지 못했습니다." - if ev_skip == "empty_retrieval": - return "검색 결과가 없습니다." - - return None - - -def _build_citations( - evidence: list[EvidenceItem], used_citations: list[int] -) -> list[Citation]: - """answer 본문에 실제로 등장한 n 만 Citation 으로 변환.""" - by_n = {e.n: e for e in evidence} - out: list[Citation] = [] - for n in used_citations: - e = by_n.get(n) - if e is None: - continue - out.append( - Citation( - n=e.n, - chunk_id=e.chunk_id, - doc_id=e.doc_id, - title=e.title, - section_title=e.section_title, - span_text=e.span_text, - full_snippet=e.full_snippet, - relevance=e.relevance, - rerank_score=e.rerank_score, - ) - ) - return out - - -def _build_ask_debug( - pr: PipelineResult, - evidence: list[EvidenceItem], - ev_skip: str | None, - sr: SynthesisResult, - ev_ms: float, - synth_ms: float, - total_ms: float, -) -> AskDebug: - timing: dict[str, float] = dict(pr.timing_ms) - timing["evidence_ms"] = ev_ms - timing["synthesis_ms"] = synth_ms - timing["ask_total_ms"] = total_ms - - # candidate count 는 rule filter 통과한 수 (recomputable from results) - # 엄밀히는 evidence_service 내부 숫자인데, evidence 길이 ≈ kept, candidate - # 는 관측이 어려움 → kept 는 evidence 길이, candidate 는 별도 필드 없음. - # 단순화: candidate_count = len(evidence) 를 상한 근사로 둠 (debug 전용). - return AskDebug( - timing_ms=timing, - search_notes=pr.notes, - query_analysis=pr.query_analysis, - confidence_signal=pr.confidence_signal, - evidence_candidate_count=len(evidence), - evidence_kept_count=len(evidence), - evidence_skip_reason=ev_skip, - synthesis_cache_hit=sr.cache_hit, - synthesis_prompt_preview=None, # 현재 synthesis_service 에서 노출 안 함 - synthesis_raw_preview=sr.raw_preview, - hallucination_flags=sr.hallucination_flags, - ) - - -def _detect_synthesis_failure(sr: SynthesisResult) -> str | None: - """Synthesis 가 유효한 답을 못 냈으면 re_gate 라벨, 아니면 None. - - 판정 우선순위 (Phase 3.5 fix3): - 1) sr.refused → LLM self-refuse (status="completed") 또는 mechanical fail 후 refused 전파 - - status=="completed" + refused=True → "synthesis_self_refuse" - - 그 외 → f"synthesis_failed({status})" - 2) sr.status ∈ {timeout, parse_failed, llm_error} → f"synthesis_failed({status})" - 3) answer 공백 → f"synthesis_failed({status})" - 4) 유효 → None - """ - if sr.refused: - if sr.status == "completed": - return "synthesis_self_refuse" - return f"synthesis_failed({sr.status})" - if sr.status in ("timeout", "parse_failed", "llm_error"): - return f"synthesis_failed({sr.status})" - if not (sr.answer or "").strip(): - return f"synthesis_failed({sr.status})" - return None - - -def _resolve_eval_identity( - x_source: str | None, - x_eval_case_id: str | None, - x_eval_token: str | None, -) -> tuple[str, str | None]: - """X-Source/X-Eval-Case-Id 신뢰 검증 (Phase 3.5 fix2). - - 규칙: - - 기본값: source='document_server', eval_case_id=None - - X-Source=eval 또는 X-Eval-Case-Id 가 들어왔다면 eval claim 으로 간주 - - eval claim 은 X-Eval-Token == settings.eval_runner_token 일 때만 수용 - (constant-time compare, env 미설정 시 항상 거부) - - 거부 시: 헤더 무시 + warning log + source=sanitize(non-eval) / eval_case_id=None - - 통과 시: source='eval', eval_case_id=x_eval_case_id - - 반환: (source, eval_case_id) - """ - claimed_source = sanitize_source(x_source) - is_eval_claim = (claimed_source == "eval") or bool(x_eval_case_id) - if not is_eval_claim: - # 일반 호출 — eval_case_id 강제 None (source != 'eval' 이면 case_id 의미 없음) - return claimed_source, None - - # eval claim — token 검증 - expected = settings.eval_runner_token - presented = x_eval_token or "" - token_valid = bool(expected) and hmac.compare_digest(presented, expected) - if not token_valid: - logger.warning( - "eval header rejected: source=%s case_id=%s token_present=%s expected_set=%s", - x_source, x_eval_case_id, bool(x_eval_token), bool(expected), - ) - # 일반 호출로 강등 — source='eval' 주장은 무시, case_id 도 무시 - # claimed_source 가 'eval' 이면 default 'document_server' 로 - if claimed_source == "eval": - return "document_server", None - return claimed_source, None - - # token OK — eval 라벨 수용 - return "eval", x_eval_case_id - - diff --git a/app/prompts/classifier.txt b/app/prompts/classifier.txt deleted file mode 100644 index 4f1a94e..0000000 --- a/app/prompts/classifier.txt +++ /dev/null @@ -1,33 +0,0 @@ -You are an answerability judge. Given a query and evidence chunks, determine if the evidence can answer the query. Respond ONLY in JSON. - -## CALIBRATION (CRITICAL) -- verdict=full: evidence is SUFFICIENT to answer the CORE of the query. Missing minor details does NOT make it insufficient. -- verdict=partial: evidence covers SOME major aspects but CLEARLY MISSES others the user explicitly asked about. -- verdict=insufficient: evidence has NO relevant information for the query, or is completely off-topic. - -Example: Query="제6장 주요 내용", Evidence covers 제6장 definition+scope → verdict=full (core is covered). -Example: Query="제6장 처벌 조항", Evidence covers 제6장 definition but NOT 처벌 → verdict=partial. -Example: Query="감귤 출하량", Evidence about 산업안전보건법 → verdict=insufficient. - -## Rules -1. Your "verdict" must be based ONLY on whether the CONTENT semantically answers the query. Ignore retrieval scores for this field. -2. "covered_aspects": query aspects that evidence covers. Korean labels for Korean queries. -3. "missing_aspects": query aspects that evidence does NOT cover. Korean labels. -4. Keep aspects concise (2-5 words each), non-overlapping. - -## Output Schema -{ - "verdict": "full" | "partial" | "insufficient", - "covered_aspects": ["aspect1"], - "missing_aspects": ["aspect2"], - "confidence": "high" | "medium" | "low" -} - -## Query -{query} - -## Evidence chunks: -{chunks} - -## Retrieval scores (for reference only, NOT for verdict): -[{scores}] diff --git a/app/prompts/verifier.txt b/app/prompts/verifier.txt deleted file mode 100644 index aa3fa8b..0000000 --- a/app/prompts/verifier.txt +++ /dev/null @@ -1,42 +0,0 @@ -You are a grounding verifier. Given an answer and its evidence sources, check if the answer contradicts or fabricates information. Respond ONLY in JSON. - -## Contradiction Types (IMPORTANT — severity depends on type) -- **direct_negation** (CRITICAL): Answer directly contradicts evidence. Examples: evidence "의무" but answer "권고"; evidence "금지" but answer "허용"; negation reversal ("~해야 한다" vs "~할 필요 없다"). -- **numeric_conflict**: Answer states a number different from evidence. "50명" in evidence but "100명" in answer. Only flag if the same concept is referenced. severity=critical when the number is the CORE answered quantity (amount/count/rate/date/duration that the query asked for); severity=minor when the number is peripheral (e.g., example/footnote). -- **intent_core_mismatch**: Answer addresses a fundamentally different topic than the query asked about. -- **nuance**: Answer overgeneralizes or adds qualifiers not in evidence (e.g., "모든" when evidence says "일부"). -- **unsupported_claim**: Answer makes a factual claim with no basis in any evidence. - -## Rules -1. Compare each claim in the answer against the cited evidence. A claim with [n] citation should be checked against evidence [n]. -2. NOT a contradiction: Paraphrasing, summarizing, or restating the same fact in different words. Korean formal/informal style (합니다/한다) differences. -3. Numbers must match exactly after normalization (1,000 = 1000). Range values (e.g., "100~200명") satisfy any answer within range. -4. Legal/regulatory terms must preserve original meaning (의무 ≠ 권고, 금지 ≠ 제한, 허용 ≠ 금지). -5. Maximum 5 contradictions (most severe first: direct_negation > numeric_conflict > intent_core_mismatch > nuance > unsupported_claim). - -## Output Schema -{ - "contradictions": [ - { - "type": "direct_negation" | "numeric_conflict" | "intent_core_mismatch" | "nuance" | "unsupported_claim", - "severity": "critical" | "minor", - "claim": "answer 내 해당 구절 (50자 이내)", - "evidence_ref": "대응 근거 내용 (50자 이내, [n] 포함)", - "explanation": "모순 이유 (한국어, 30자 이내)" - } - ], - "verdict": "clean" | "minor_issues" | "major_issues" -} - -severity mapping: -- direct_negation → "critical" -- numeric_conflict → "critical" if the number is the CORE answered quantity, else "minor" -- All other types → "minor" - -If no contradictions: {"contradictions": [], "verdict": "clean"} - -## Answer -{answer} - -## Evidence -{numbered_evidence} diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py deleted file mode 100644 index da44f48..0000000 --- a/app/services/search/classifier_service.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Answerability classifier (Phase 3.5a). - -Mac mini 26B MLX 기반 (config.yaml ai.models.classifier — PR #20 이후 triage/primary/classifier 동일 endpoint). MLX gate 밖 — evidence extraction 과 병렬 실행 (concurrent 안전성 별 검토). - -P1 실측 결과: ternary (full/partial/insufficient) 불안정 → **binary (sufficient/insufficient)**. -"full" vs "partial" 구분은 grounding_check 의 intent alignment 이 담당. - -Classifier verdict 는 "relevant evidence 가 있나" 의 binary 판단. -covered_aspects / missing_aspects 는 로깅용으로 유지 (refusal gate 에서 사용 안 함). -""" - -from __future__ import annotations - -import asyncio -import time -from dataclasses import dataclass -from typing import Literal - -from ai.client import AIClient, _load_prompt, parse_json_response -from core.config import settings -from core.utils import setup_logger - -from .llm_gate import Priority, acquire_mlx_gate - -logger = setup_logger("classifier") - -LLM_TIMEOUT_MS = 30000 -CIRCUIT_THRESHOLD = 5 -CIRCUIT_RECOVERY_SEC = 60 - -_failure_count = 0 -_circuit_open_until: float | None = None - - -@dataclass(slots=True) -class ClassifierResult: - status: Literal["ok", "timeout", "error", "circuit_open", "skipped"] - verdict: Literal["sufficient", "insufficient"] | None - covered_aspects: list[str] - missing_aspects: list[str] - elapsed_ms: float - - -try: - CLASSIFIER_PROMPT = _load_prompt("classifier.txt") -except FileNotFoundError: - CLASSIFIER_PROMPT = "" - logger.warning("classifier.txt not found — classifier will always skip") - - -def _build_input( - query: str, - top_chunks: list[dict], - rerank_scores: list[float], -) -> str: - """Y+ input (content + scores with role separation).""" - chunk_block = "\n".join( - f"[{i+1}] title: {c.get('title','')}\n" - f" section: {c.get('section','')}\n" - f" snippet: {c.get('snippet','')}" - for i, c in enumerate(top_chunks[:3]) - ) - scores_str = ", ".join(f"{s:.2f}" for s in rerank_scores[:3]) - return ( - CLASSIFIER_PROMPT - .replace("{query}", query) - .replace("{chunks}", chunk_block) - .replace("{scores}", scores_str) - ) - - -async def classify( - query: str, - top_chunks: list[dict], - rerank_scores: list[float], -) -> ClassifierResult: - """Always-on binary classifier. Parallel with evidence extraction. - - Returns: - ClassifierResult with verdict=sufficient|insufficient. - Status "ok" 이 아니면 verdict=None (caller 가 fallback 처리). - """ - global _failure_count, _circuit_open_until - t_start = time.perf_counter() - - # Circuit breaker - if _circuit_open_until and time.time() < _circuit_open_until: - return ClassifierResult("circuit_open", None, [], [], 0.0) - - if not CLASSIFIER_PROMPT: - return ClassifierResult("skipped", None, [], [], 0.0) - - if not hasattr(settings.ai, "classifier") or settings.ai.classifier is None: - return ClassifierResult("skipped", None, [], [], 0.0) - - prompt = _build_input(query, top_chunks, rerank_scores) - client = AIClient() - try: - # 2026-05-17: PR #20 이후 endpoint 가 Mac mini 26B → llm_gate Semaphore(1) 필수. - # Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference - # MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰: - # "MLX primary 호출 경로는 예외 없이 gate 획득 필수". - async with acquire_mlx_gate(Priority.FOREGROUND): - async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): - raw = await client.call_classifier(prompt) - _failure_count = 0 - except asyncio.TimeoutError: - _failure_count += 1 - if _failure_count >= CIRCUIT_THRESHOLD: - _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC - logger.error(f"classifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s") - logger.warning("classifier timeout") - return ClassifierResult( - "timeout", None, [], [], - (time.perf_counter() - t_start) * 1000, - ) - except Exception as e: - _failure_count += 1 - if _failure_count >= CIRCUIT_THRESHOLD: - _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC - logger.error(f"classifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s") - logger.warning("classifier error: type=%s repr=%r", type(e).__name__, e) - return ClassifierResult( - "error", None, [], [], - (time.perf_counter() - t_start) * 1000, - ) - finally: - await client.close() - - elapsed_ms = (time.perf_counter() - t_start) * 1000 - parsed = parse_json_response(raw) - if not isinstance(parsed, dict): - logger.warning("classifier parse failed raw=%r", (raw or "")[:200]) - return ClassifierResult("error", None, [], [], elapsed_ms) - - # ternary → binary 매핑 - raw_verdict = parsed.get("verdict", "") - if raw_verdict == "insufficient": - verdict: Literal["sufficient", "insufficient"] | None = "insufficient" - elif raw_verdict in ("full", "partial", "sufficient"): - verdict = "sufficient" - else: - verdict = None - - covered = parsed.get("covered_aspects") or [] - missing = parsed.get("missing_aspects") or [] - if not isinstance(covered, list): - covered = [] - if not isinstance(missing, list): - missing = [] - - logger.info( - "classifier ok query=%r verdict=%s (raw=%s) covered=%d missing=%d elapsed_ms=%.0f", - query[:60], verdict, raw_verdict, len(covered), len(missing), elapsed_ms, - ) - return ClassifierResult("ok", verdict, covered, missing, elapsed_ms) diff --git a/app/services/search/grounding_check.py b/app/services/search/grounding_check.py deleted file mode 100644 index 66108ca..0000000 --- a/app/services/search/grounding_check.py +++ /dev/null @@ -1,505 +0,0 @@ -"""Grounding check — post-synthesis 검증 (Phase 3.5a). - -Strong/weak flag 분리: -- **Strong** (→ partial 강등 or refuse): fabricated_number, intent_misalignment(important) -- **Weak** (→ confidence lower only): uncited_claim, low_overlap, intent_misalignment(generic) - -Re-gate 로직 (Phase 3.5a 9라운드 토론 결과): -- strong 1개 → partial 강등 -- strong 2개 이상 → refuse -- weak → confidence "low" 만 - -Intent alignment (rule-based): -- query 의 핵심 명사가 answer 에 등장하는지 확인 -- "처벌" 같은 중요 키워드 누락은 strong -- "주요", "관련" 같은 generic 은 무시 -""" - -from __future__ import annotations - -import re -from dataclasses import dataclass -from typing import TYPE_CHECKING - -from core.utils import setup_logger - -if TYPE_CHECKING: - from .evidence_service import EvidenceItem - -logger = setup_logger("grounding") - -# "주요", "관련" 등 intent alignment 에서 제외할 generic 단어 -GENERIC_TERMS = frozenset({ - "주요", "관련", "내용", "정의", "기준", "방법", "설명", "개요", - "대한", "위한", "대해", "무엇", "어떤", "어떻게", "있는", - "하는", "되는", "이런", "그런", "이것", "그것", -}) - - -@dataclass(slots=True) -class GroundingResult: - strong_flags: list[str] - weak_flags: list[str] - - -_UNIT_CHARS = r'명인개%년월일조항호세건원회' - -# "이상/이하/초과/미만" — threshold 표현 (numeric conflict 에서 skip 대상) -_THRESHOLD_SUFFIXES = re.compile(r'이상|이하|초과|미만') - -# 약칭/근사치 prefix — 매칭 전 제거 (Phase 3.5 B1). -# ⚠ 최대/최소 는 의도적으로 제외 — 이들은 bound operator 라 의미가 다름 (Phase 3.5 B1 fix3). -# 약/대략/거의/얼추 만 노이즈 prefix 로 strip. -_APPROX_PREFIX_RE = re.compile(r'(약|대략|거의|얼추)\s*') - -# 단위 동의어 dict — 추출 직후 정규화 (Phase 3.5 B1) -# 의미가 동일한 단위는 같은 표기로 통일해서 set 비교/range overlap 안정화. -_UNIT_SYNONYMS: dict[str, str] = { - "인": "명", - "사람": "명", - "퍼센트": "%", - "프로": "%", - "KRW": "원", - "krw": "원", -} - -# tolerance(±1%) 허용 단위 — 양적 측정값 (Phase 3.5 B1) -_TOLERANCE_UNITS: frozenset[str] = frozenset({"명", "원", "%", "건", "개"}) - -# tolerance 미적용 단위 — 식별자성 숫자 (연도/조문/횟수) -_EXACT_ONLY_UNITS: frozenset[str] = frozenset({"년", "월", "일", "조", "항", "호", "회"}) - -# 최대/최소 prefix 패턴 — bound operator (Phase 3.5 B1 fix3). -# 매칭된 숫자는 exact pool 에서 제외하고 one-sided range 로 변환. -# 경계값 자체는 clear 대상 아님 (Codex 권장: "최대 100명" + answer "100명" → flag 유지). -_BOUND_PATTERN_RE = re.compile( - rf'(최대|최소)\s*(\d[\d,.]*)\s*([{_UNIT_CHARS}]|인|사람|퍼센트|프로|KRW|krw)' -) -_RANGE_INF = 10**18 # one-sided range 상한 sentinel - - -def _normalize_unit(unit: str) -> str: - """단위 동의어 → 대표 표기.""" - return _UNIT_SYNONYMS.get(unit, unit) - - -def _extract_unit(literal: str) -> str | None: - """리터럴에서 숫자 뒤 단위(한 글자 또는 동의어) 추출 + 정규화.""" - # 천단위 콤마 + 옵션 소수 + 한글 단위 한 글자 또는 동의어 - m = re.match(rf'[\d,.]+\s*([{_UNIT_CHARS}]|인|사람|퍼센트|프로|KRW|krw)', literal) - if not m: - return None - return _normalize_unit(m.group(1)) - - -def _extract_numeric_corpus(text: str) -> dict: - """단위별 숫자 + 범위 + bound 통합 추출 (Phase 3.5 B1 fix1+fix3). - - Returns: - { - "exact_by_unit": {unit_or_None: set(digits)}, # 평범한 숫자 (bound 제외) - "ranges_by_unit": {unit: [(lo, hi), ...]}, # 양방향(A~B) + 단방향(최대/최소) - } - - None 키는 단위 없는 bare 숫자. - `최대 N ` → ranges[(0, N-1)] (경계값 자체는 cleared 대상 아님) - `최소 N ` → ranges[(N+1, INF)] - """ - cleaned = _APPROX_PREFIX_RE.sub('', text) - - exact_by_unit: dict[str | None, set[str]] = {None: set()} - ranges_by_unit: dict[str, list[tuple[int, int]]] = {} - - # 1) 최대/최소 — bound. exact pool 에서 제외, one-sided range 로 변환. - bound_spans: list[tuple[int, int]] = [] # 매칭 substring 위치 — 이후 단계에서 skip - for m in _BOUND_PATTERN_RE.finditer(cleaned): - bound_kind = m.group(1) - try: - n = int(m.group(2).replace(',', '').split('.')[0]) - except ValueError: - continue - unit = _normalize_unit(m.group(3)) - if bound_kind == "최대": - ranges_by_unit.setdefault(unit, []).append((0, max(0, n - 1))) - else: # 최소 - ranges_by_unit.setdefault(unit, []).append((n + 1, _RANGE_INF)) - bound_spans.append((m.start(), m.end())) - - def _in_bound_span(pos: int) -> bool: - return any(s <= pos < e for s, e in bound_spans) - - # 2) 천단위 콤마 bare number - for m in re.finditer(r'\d{1,3}(?:,\d{3})+(?:\.\d+)?', cleaned): - if _in_bound_span(m.start()): - continue - exact_by_unit[None].add(m.group().replace(',', '')) - - # 3) 단위 있는 숫자 (단위 동의어 포함) - for m in re.finditer( - rf'(\d[\d,.]*)\s*([{_UNIT_CHARS}]|인|사람|퍼센트|프로|KRW|krw)', - cleaned, - ): - if _in_bound_span(m.start()): - continue - digits = m.group(1).replace(',', '').split('.')[0] - if not digits: - continue - unit = _normalize_unit(m.group(2)) - exact_by_unit.setdefault(unit, set()).add(digits) - - # 4) 양방향 범위 표현 (A~B / A 부터 B) - for m in re.finditer( - rf'(\d[\d,.]*)\s*(?:[~\-–]|부터)\s*(\d[\d,.]*)\s*([{_UNIT_CHARS}]|인|사람|퍼센트|프로)', - cleaned, - ): - if _in_bound_span(m.start()): - continue - try: - lo = int(m.group(1).replace(',', '').split('.')[0]) - hi = int(m.group(2).replace(',', '').split('.')[0]) - except ValueError: - continue - unit = _normalize_unit(m.group(3)) - ranges_by_unit.setdefault(unit, []).append((min(lo, hi), max(lo, hi))) - - # 5) bare 2자리+ 단독 숫자 - for m in re.finditer(r'\b(\d{2,})\b', cleaned): - if _in_bound_span(m.start()): - continue - exact_by_unit[None].add(m.group()) - - return { - "exact_by_unit": exact_by_unit, - "ranges_by_unit": ranges_by_unit, - } - - -def _within_unit_range( - n: int, unit: str | None, ranges_by_unit: dict[str, list[tuple[int, int]]] -) -> bool: - """unit-matching range 검증. - - answer unit 이 None (bare 숫자) 면 보수적으로 False — bare 답변은 range clear 대상 아님. - """ - if unit is None: - return False - return any(lo <= n <= hi for lo, hi in ranges_by_unit.get(unit, [])) - - -def _close_to_unit_pool( - n: int, unit: str | None, exact_by_unit: dict[str | None, set[str]], tol: float -) -> bool: - """unit-matching tolerance 검증. - - answer unit 이 None 이면 False — bare 답변은 tolerance 대상 아님. - 같은 unit bucket 안의 후보만 비교. - """ - if unit is None: - return False - candidates = exact_by_unit.get(unit, set()) - for c in candidates: - try: - cn = int(c) - except ValueError: - continue - if cn == 0: - continue - if abs(n - cn) / cn <= tol: - return True - return False - - -def _extract_number_literals(text: str) -> set[str]: - """숫자 + 단위 추출 + normalize (Phase 3.5 B1: 6단계 확장). - - 1) 약칭 prefix 제거 ("약 100명" → "100명") - 2) 천단위 콤마 bare number 우선 ("1,000" → "1000" set 등록) - 3) 한국어 단위 접미사 매칭 (기존) - 4) 범위 표현 양쪽 숫자 추출 (separator: ~, -, –, 부터) - 5) 단위 동의어 정규화 (인→명, 퍼센트→%, KRW→원) - 6) bare 2자리+ 추출 (기존) - """ - # 1. 약칭 prefix 제거 (전체 텍스트에서) - cleaned = _APPROX_PREFIX_RE.sub('', text) - - # 2. 천단위 콤마 bare number — normalize 된 값을 set 에 선등록 - normalized: set[str] = set() - for m in re.finditer(r'\d{1,3}(?:,\d{3})+(?:\.\d+)?', cleaned): - normalized.add(m.group().replace(',', '')) - - # 3. 숫자 + 한국어 단위 접미사 (동의어 포함) - raw: set[str] = set(re.findall( - rf'\d[\d,.]*\s*(?:[{_UNIT_CHARS}]|인|사람|퍼센트|프로|KRW|krw)\w{{0,2}}', - cleaned, - )) - - # 4. 범위 표현 — separator 에 "부터" 추가 - for m in re.finditer( - rf'(\d[\d,.]*)\s*(?:[~\-–]|부터)\s*(\d[\d,.]*)\s*([{_UNIT_CHARS}]|인|사람|퍼센트|프로)', - cleaned, - ): - unit_norm = _normalize_unit(m.group(3)) - raw.add(m.group(1) + unit_norm) - raw.add(m.group(2) + unit_norm) - - # 5. normalize: 단위 동의어 통일 + 콤마 제거 - for r in raw: - # 단위 부분 정규화 - m = re.match(r'([\d,.]+)\s*([^\d\s]+)', r) - if m: - digits_part = m.group(1) - unit_part = _normalize_unit(m.group(2)) - normalized.add(digits_part + unit_part) - normalized.add(digits_part.replace(',', '') + unit_part) - normalized.add(r.strip()) - num_only = re.match(r'[\d,.]+', r) - if num_only: - normalized.add(num_only.group().replace(',', '')) - - # 6. 단독 숫자 (2자리+ 만) - for d in re.findall(r'\b(\d{2,})\b', cleaned): - normalized.add(d) - return normalized - - -def _within_evidence_range(digits: str, raw: str, evidence_text: str) -> bool: - """evidence 에 'A~B 단위' 가 있고 answer 의 숫자가 그 범위 안이면 True. - - 범위 단위는 무시 (단위 비교는 호출 전 단계). digits = 정수 문자열. - """ - try: - n = int(digits) - except ValueError: - return False - cleaned_ev = _APPROX_PREFIX_RE.sub('', evidence_text) - for m in re.finditer( - rf'(\d[\d,.]*)\s*(?:[~\-–]|부터)\s*(\d[\d,.]*)\s*[{_UNIT_CHARS}]', - cleaned_ev, - ): - try: - lo = int(m.group(1).replace(',', '').split('.')[0]) - hi = int(m.group(2).replace(',', '').split('.')[0]) - if min(lo, hi) <= n <= max(lo, hi): - return True - except ValueError: - continue - return False - - -def _close_to_any(n: int, candidates: set[str], tol: float) -> bool: - """candidates 중 하나라도 (1±tol) 배율 안에 들어오면 True. - - n 은 정수, candidates 는 digits-only 문자열 집합. - """ - for c in candidates: - try: - cn = int(c) - except ValueError: - continue - if cn == 0: - continue - if abs(n - cn) / cn <= tol: - return True - return False - - -def _extract_content_tokens(text: str) -> set[str]: - """한국어 2자 이상 명사 + 영어 3자 이상 단어.""" - return set(re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}', text)) - - -def _parse_number_with_unit(literal: str) -> tuple[str, str] | None: - """숫자 리터럴에서 (digits_only, unit) 분리. 단위 없으면 None.""" - m = re.match(rf'([\d,.]+)\s*([{_UNIT_CHARS}])', literal) - if not m: - return None - digits = m.group(1).replace(',', '') - unit = m.group(2) - return (digits, unit) - - -def _check_evidence_numeric_conflicts(evidence: list["EvidenceItem"]) -> list[str]: - """evidence 간 숫자 충돌 감지 (Phase 3.5b). evidence >= 2 일 때만 활성. - - 동일 단위, 다른 숫자 → weak flag. "이상/이하/초과/미만" 포함 시 skip. - bare number 는 비교 안 함 (조항 번호 등 false positive 방지). - """ - if len(evidence) < 2: - return [] - - # 각 evidence 에서 단위 있는 숫자 + threshold 여부 추출 - # {evidence_idx: [(digits, unit, has_threshold), ...]} - per_evidence: dict[int, list[tuple[str, str, bool]]] = {} - for idx, ev in enumerate(evidence): - nums = re.findall( - rf'\d[\d,.]*\s*[{_UNIT_CHARS}]\w{{0,4}}', - ev.span_text, - ) - entries = [] - for raw in nums: - parsed = _parse_number_with_unit(raw) - if not parsed: - continue - has_thr = bool(_THRESHOLD_SUFFIXES.search(raw)) - entries.append((parsed[0], parsed[1], has_thr)) - if entries: - per_evidence[idx] = entries - - if len(per_evidence) < 2: - return [] - - # 단위별로 evidence 간 숫자 비교 - # {unit: {digits: [evidence_idx, ...]}} - unit_map: dict[str, dict[str, list[int]]] = {} - for idx, entries in per_evidence.items(): - for digits, unit, has_thr in entries: - if has_thr: - continue # threshold 표현은 skip - if unit not in unit_map: - unit_map[unit] = {} - if digits not in unit_map[unit]: - unit_map[unit][digits] = [] - if idx not in unit_map[unit][digits]: - unit_map[unit][digits].append(idx) - - flags: list[str] = [] - for unit, digits_map in unit_map.items(): - distinct_values = list(digits_map.keys()) - if len(distinct_values) >= 2: - # 가장 많이 등장하는 2개 비교 - top2 = sorted(distinct_values, key=lambda d: len(digits_map[d]), reverse=True)[:2] - flags.append( - f"evidence_numeric_conflict:{top2[0]}{unit}_vs_{top2[1]}{unit}" - ) - - return flags - - -def check( - query: str, - answer: str, - evidence: list[EvidenceItem], -) -> GroundingResult: - """답변 vs evidence grounding 검증 + query intent alignment.""" - strong: list[str] = [] - weak: list[str] = [] - - if not answer or not evidence: - return GroundingResult([], []) - - # ⚠ citation marker [n] 양측 제거 (대칭성 — Phase 3.5 B1) - evidence_text = re.sub(r'\[\d+\]', '', " ".join(e.span_text for e in evidence)) - - # ── Strong 1: fabricated number (unit-aware 3단계 — Phase 3.5 B1 fix1+fix3) ── - # Codex 지적 반영: - # - fix1: range/tolerance/exact 모두 단위 일치 시에만 clear - # (예: "150원" vs "100~200명" → flag 유지) - # - fix3: 최대/최소 prefix 는 bound 의미 보존 - # (예: "최대 100명" + answer "100명" → flag 유지, "최대 100명" + answer "50명" → cleared) - answer_clean = re.sub(r'\[\d+\]', '', answer) - answer_corpus = _extract_numeric_corpus(answer_clean) - evidence_corpus = _extract_numeric_corpus(evidence_text) - ev_exact_by_unit = evidence_corpus["exact_by_unit"] - ev_ranges_by_unit = evidence_corpus["ranges_by_unit"] - - # cleared 는 (unit, digits) 쌍 단위로 추적 — 단위 충돌 케이스 방어 - cleared_pairs: set[tuple[str | None, str]] = set() - - # Pass 1: 각 (unit, digits) 가 evidence 에서 정당화되는지 판정 - for unit, digits_set in answer_corpus["exact_by_unit"].items(): - for d in digits_set: - # 1) exact match — 같은 unit bucket 내에서만 - if d in ev_exact_by_unit.get(unit, set()): - cleared_pairs.add((unit, d)) - continue - # bare answer (unit=None) 는 evidence bare bucket 도 보조 매칭 - if unit is None and d in ev_exact_by_unit.get(None, set()): - cleared_pairs.add((unit, d)) - continue - try: - n = int(d) - except ValueError: - continue - # 2) range — same-unit 만 (bare answer 는 range clear 대상 아님) - if _within_unit_range(n, unit, ev_ranges_by_unit): - cleared_pairs.add((unit, d)) - continue - # 3) ±1% tolerance — 단위가 양적(_TOLERANCE_UNITS) + 4자리+ + same-unit - if ( - unit in _TOLERANCE_UNITS - and len(d) >= 4 - and _close_to_unit_pool(n, unit, ev_exact_by_unit, tol=0.01) - ): - cleared_pairs.add((unit, d)) - continue - # 식별자성 단위(_EXACT_ONLY_UNITS) 는 tolerance 패스 X. - - # Pass 2: cleared 되지 않은 (unit, digits) 를 strong flag. - # 1자리 무시는 unit 이 식별자성(_EXACT_ONLY_UNITS: 년/월/일/조/항/호/회) 이 아닐 때만 적용. - # bare(None) 답변 숫자는 같은 digit 이 다른 unit 에서 cleared 됐으면 skip — 추출 부산물 방어. - # ⚠ 단위 cross-clear (예: "원" cleared → "명" 도 skip) 은 금지: Codex unit-mismatch 케이스가 깨짐. - unit_anchored_cleared: set[str] = {d for (u, d) in cleared_pairs if u is not None} - flagged_keys: set[tuple[str | None, str]] = set() - for unit, digits_set in answer_corpus["exact_by_unit"].items(): - for d in digits_set: - if (unit, d) in cleared_pairs or (unit, d) in flagged_keys: - continue - # bare(None) 답변 숫자가 임의의 단위 bucket 에서 cleared 됐으면 duplicate 로 처리. - # 사례: "1,000명" → unit bucket "명" 에 1000 + bare bucket None 에 1000 (comma normalize 부산물). - # 이미 ("명", "1000") 가 cleared 라면 (None, "1000") 도 같은 사실을 가리키므로 skip. - if unit is None and d in unit_anchored_cleared: - continue - if len(d) < 2 and unit not in _EXACT_ONLY_UNITS: - continue - flagged_keys.add((unit, d)) - # 사람이 읽기 좋게 "{digits}{unit}" 또는 bare 형태로 표기 - label = f"{d}{unit}" if unit else d - strong.append(f"fabricated_number:{label}") - - # ── Strong/Weak 2: query-answer intent alignment ── - query_content = _extract_content_tokens(query) - answer_content = _extract_content_tokens(answer) - if query_content: - missing_terms = query_content - answer_content - important_missing = [ - t for t in missing_terms - if t not in GENERIC_TERMS and len(t) >= 2 - ] - if important_missing: - strong.append( - f"intent_misalignment:{','.join(important_missing[:3])}" - ) - elif len(missing_terms) > len(query_content) * 0.5: - weak.append( - f"intent_misalignment_generic:" - f"missing({','.join(list(missing_terms)[:5])})" - ) - - # ── Weak 1: uncited claim ── - sentences = re.split(r'(?<=[.!?。])\s+', answer) - for s in sentences: - if len(s.strip()) > 20 and not re.search(r'\[\d+\]', s): - weak.append(f"uncited_claim:{s[:40]}") - - # ── Weak: evidence 간 숫자 충돌 (Phase 3.5b) ── - conflicts = _check_evidence_numeric_conflicts(evidence) - weak.extend(conflicts) - - # ── Weak 2: token overlap ── - answer_tokens = _extract_content_tokens(answer) - evidence_tokens = _extract_content_tokens(evidence_text) - if answer_tokens: - overlap = len(answer_tokens & evidence_tokens) / len(answer_tokens) - if overlap < 0.4: - weak.append(f"low_overlap:{overlap:.2f}") - - if strong or weak: - logger.info( - "grounding query=%r strong=%d weak=%d flags=%s", - query[:60], - len(strong), - len(weak), - ",".join(strong[:3] + weak[:3]), - ) - - return GroundingResult(strong, weak) diff --git a/app/services/search/refusal_gate.py b/app/services/search/refusal_gate.py deleted file mode 100644 index 60eff67..0000000 --- a/app/services/search/refusal_gate.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Refusal gate — multi-signal fusion (Phase 3.5a). - -Score gate (deterministic) + classifier verdict (semantic, binary) 를 독립 평가 후 합성. -Classifier 부재 시 3-tier conservative fallback. - -P1 실측 결과: exaone ternary 불안정 → binary (sufficient/insufficient) 로 축소. -"full" vs "partial" 구분은 grounding check (intent alignment) 가 담당. -""" - -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Literal - -from core.utils import setup_logger - -if TYPE_CHECKING: - from .classifier_service import ClassifierResult - -logger = setup_logger("refusal_gate") - -# Placeholder thresholds — Phase 3.5b 에서 실측 기반 tuning -# AND 조건이라 false refusal 방어됨 (둘 다 만족해야 refuse) -SCORE_MAX_REFUSE = 0.25 -SCORE_AGG_REFUSE = 0.70 - -# Conservative fallback tiers (classifier 부재 시) -CONSERVATIVE_WEAK = 0.35 -CONSERVATIVE_MID = 0.55 - - -@dataclass(slots=True) -class RefusalDecision: - refused: bool - confidence_cap: Literal["high", "medium", "low"] | None # None = no cap - rule_triggered: str | None # 디버깅: 어느 signal 이 결정에 기여? - - -def decide( - rerank_scores: list[float], - classifier: ClassifierResult | None, -) -> RefusalDecision: - """Multi-signal fusion. Binary classifier verdict 기반. - - Returns: - RefusalDecision. refused=True 이면 synthesis skip. - confidence_cap 은 synthesis 결과의 confidence 에 upper bound 적용. - """ - max_score = max(rerank_scores) if rerank_scores else 0.0 - agg_top3 = sum(sorted(rerank_scores, reverse=True)[:3]) - - score_gate_fails = ( - max_score < SCORE_MAX_REFUSE and agg_top3 < SCORE_AGG_REFUSE - ) - - # ── Classifier 사용 가능 (정상 경로) ── - if classifier and classifier.verdict is not None: - if classifier.verdict == "insufficient": - # Evidence quality override: classifier 가 insufficient 라 해도 - # evidence 가 충분히 좋으면 override (토론 8라운드 합의) - # (evidence quality 는 이 함수 밖에서 별도 체크 — caller 에서 처리) - logger.info( - "refusal gate: classifier=insufficient max=%.2f agg=%.2f", - max_score, agg_top3, - ) - return RefusalDecision( - refused=True, - confidence_cap=None, - rule_triggered="classifier_insufficient", - ) - if score_gate_fails: - logger.info( - "refusal gate: score_low max=%.2f agg=%.2f classifier=%s", - max_score, agg_top3, classifier.verdict, - ) - return RefusalDecision( - refused=True, - confidence_cap=None, - rule_triggered="score_low", - ) - # Classifier says sufficient → proceed - return RefusalDecision( - refused=False, - confidence_cap=None, - rule_triggered=None, - ) - - # ── Classifier 부재 → 3-tier conservative ── - if max_score < CONSERVATIVE_WEAK: - return RefusalDecision( - refused=True, - confidence_cap=None, - rule_triggered="conservative_refuse(no_classifier)", - ) - if max_score < CONSERVATIVE_MID: - return RefusalDecision( - refused=False, - confidence_cap="low", - rule_triggered="conservative_low(no_classifier)", - ) - return RefusalDecision( - refused=False, - confidence_cap="medium", - rule_triggered="conservative_medium(no_classifier)", - ) diff --git a/app/services/search/verifier_service.py b/app/services/search/verifier_service.py deleted file mode 100644 index c2a8fe5..0000000 --- a/app/services/search/verifier_service.py +++ /dev/null @@ -1,196 +0,0 @@ -"""Exaone semantic verifier (Phase 3.5b). - -답변-근거 간 의미적 모순(contradiction) 감지. rule-based grounding_check 가 못 잡는 -미묘한 모순 포착. classifier 와 동일 패턴: circuit breaker + timeout + fail open. - -## Severity 3단계 -- strong: direct_negation (완전 모순) → re-gate 교차 자격 -- medium: numeric_conflict, intent_core_mismatch → confidence 하향 (누적 시 강제 low) -- weak: nuance, unsupported_claim → 로깅 + mild confidence 하향 - -## 핵심 원칙 -- **Verifier strong 단독 refuse 금지** — grounding strong 과 교차해야 refuse -- **Timeout 3s** — 느리면 없는 게 낫다 (fail open) -- MLX gate 사용 (Mac mini 26B endpoint — classifier/evidence 와 동일 gate 공유, 동시 race 방지) -""" - -from __future__ import annotations - -import asyncio -import os -import time -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Literal - -from ai.client import AIClient, _load_prompt, parse_json_response -from core.config import settings -from core.utils import setup_logger -from .llm_gate import Priority, acquire_mlx_gate - -if TYPE_CHECKING: - from .evidence_service import EvidenceItem - -logger = setup_logger("verifier") - -LLM_TIMEOUT_MS = 10000 # 2026-05-17 B-3: 3s 시 동시 부하 시 verifier 빈발 skip → grounding 약화. Mac mini 26B 가 verifier-style 짧은 LLM call 도 concurrent 호출 시 3s 초과 빈번 — 10s 로 raise -CIRCUIT_THRESHOLD = 5 -CIRCUIT_RECOVERY_SEC = 60 - -_failure_count = 0 -_circuit_open_until: float | None = None - -# Phase 3.5 B2: numeric_conflict severity promote 실험. -# import time 평가 — env 변경 후 process restart 필수 (docker compose restart fastapi). -# default=0 (off). production 적용은 B3 FP 검증 통과 후만. -_NUMERIC_PROMOTE = os.getenv("VERIFIER_NUMERIC_PROMOTE", "0") == "1" - -# severity 매핑 (프롬프트 "critical"/"minor" → 코드 strong/medium/weak) -# Tier 4 (B2): _NUMERIC_PROMOTE=1 일 때 numeric_conflict critical → strong 으로 격상. -# minor 는 medium 유지 (FP 위험 분리). -_SEVERITY_MAP: dict[str, dict[str, Literal["strong", "medium", "weak"]]] = { - "direct_negation": {"critical": "strong", "minor": "strong"}, - "numeric_conflict": ( - {"critical": "strong", "minor": "medium"} if _NUMERIC_PROMOTE - else {"critical": "medium", "minor": "medium"} - ), - "intent_core_mismatch": {"critical": "medium", "minor": "medium"}, - "nuance": {"critical": "weak", "minor": "weak"}, - "unsupported_claim": {"critical": "weak", "minor": "weak"}, -} - - -@dataclass(slots=True) -class Contradiction: - """개별 모순 발견.""" - type: str # direct_negation / numeric_conflict / intent_core_mismatch / nuance / unsupported_claim - severity: Literal["strong", "medium", "weak"] - claim: str - evidence_ref: str - explanation: str - - -@dataclass(slots=True) -class VerifierResult: - status: Literal["ok", "timeout", "error", "circuit_open", "skipped"] - contradictions: list[Contradiction] - elapsed_ms: float - - -try: - VERIFIER_PROMPT = _load_prompt("verifier.txt") -except FileNotFoundError: - VERIFIER_PROMPT = "" - logger.warning("verifier.txt not found — verifier will always skip") - - -def _build_input( - answer: str, - evidence: list[EvidenceItem], -) -> str: - """답변 + evidence spans → 프롬프트.""" - spans = "\n\n".join( - f"[{e.n}] {(e.title or '').strip()}\n{e.span_text}" - for e in evidence - ) - return ( - VERIFIER_PROMPT - .replace("{answer}", answer) - .replace("{numbered_evidence}", spans) - ) - - -def _map_severity(ctype: str, raw_severity: str) -> Literal["strong", "medium", "weak"]: - """type + raw severity → 코드 severity 3단계.""" - type_map = _SEVERITY_MAP.get(ctype, {"critical": "weak", "minor": "weak"}) - return type_map.get(raw_severity, "weak") - - -async def verify( - query: str, - answer: str, - evidence: list[EvidenceItem], -) -> VerifierResult: - """답변-근거 semantic 검증. Parallel with grounding_check. - - Returns: - VerifierResult. status "ok" 이 아니면 contradictions 빈 리스트 (fail open). - """ - global _failure_count, _circuit_open_until - t_start = time.perf_counter() - - if _circuit_open_until and time.time() < _circuit_open_until: - return VerifierResult("circuit_open", [], 0.0) - - if not VERIFIER_PROMPT: - return VerifierResult("skipped", [], 0.0) - - if not hasattr(settings.ai, "verifier") or settings.ai.verifier is None: - return VerifierResult("skipped", [], 0.0) - - if not answer or not evidence: - return VerifierResult("skipped", [], 0.0) - - prompt = _build_input(answer, evidence) - client = AIClient() - try: - async with acquire_mlx_gate(Priority.FOREGROUND): - async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): - raw = await client.call_verifier(prompt) - _failure_count = 0 - except asyncio.TimeoutError: - _failure_count += 1 - if _failure_count >= CIRCUIT_THRESHOLD: - _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC - logger.error(f"verifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s") - logger.warning("verifier timeout") - return VerifierResult( - "timeout", [], - (time.perf_counter() - t_start) * 1000, - ) - except Exception as e: - _failure_count += 1 - if _failure_count >= CIRCUIT_THRESHOLD: - _circuit_open_until = time.time() + CIRCUIT_RECOVERY_SEC - logger.error(f"verifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s") - logger.warning(f"verifier error: {e}") - return VerifierResult( - "error", [], - (time.perf_counter() - t_start) * 1000, - ) - finally: - await client.close() - - elapsed_ms = (time.perf_counter() - t_start) * 1000 - parsed = parse_json_response(raw) - if not isinstance(parsed, dict): - logger.warning("verifier parse failed raw=%r", (raw or "")[:200]) - return VerifierResult("error", [], elapsed_ms) - - # contradiction 파싱 - raw_items = parsed.get("contradictions") or [] - if not isinstance(raw_items, list): - raw_items = [] - - results: list[Contradiction] = [] - for item in raw_items[:5]: - if not isinstance(item, dict): - continue - ctype = item.get("type", "") - if ctype not in _SEVERITY_MAP: - ctype = "unsupported_claim" - raw_sev = item.get("severity", "minor") - severity = _map_severity(ctype, raw_sev) - claim = str(item.get("claim", ""))[:50] - ev_ref = str(item.get("evidence_ref", ""))[:50] - explanation = str(item.get("explanation", ""))[:30] - results.append(Contradiction(ctype, severity, claim, ev_ref, explanation)) - - logger.info( - "verifier ok query=%r contradictions=%d strong=%d medium=%d elapsed_ms=%.0f", - query[:60], - len(results), - sum(1 for c in results if c.severity == "strong"), - sum(1 for c in results if c.severity == "medium"), - elapsed_ms, - ) - return VerifierResult("ok", results, elapsed_ms) diff --git a/tests/api/test_search_ask_macbook_503.py b/tests/api/test_search_ask_macbook_503.py deleted file mode 100644 index a6cdd19..0000000 --- a/tests/api/test_search_ask_macbook_503.py +++ /dev/null @@ -1,291 +0,0 @@ -"""PR-MacBook-RAG-Backend-1 정정 4 핵심 테스트. - -검증 invariant (synthesize 함수 레벨 — /ask wrapper 의 503 매핑은 search.py 의 -status="backend_unavailable" 분기로 1:1 deterministic): - -1. backend="qwen-macbook" + MacBook URL 죽은 포트 - → synthesize() 가 SynthesisResult(status="backend_unavailable", ...) 반환 - → Gemma backend 의 generate() 가 **단 1번도 호출되지 않음** (자동 fallback 부재) - -2. backend 미지정 (None) - → Gemma backend.generate() 호출, Qwen backend.generate() 호출 0 - → 기존 호출자 (Hermes docsrv_ask / voice-memo-bot) 회귀 0 - -3. backend="qwen-macbook" + MacBook 정상 응답 - → status="completed" + answer 채워짐, Gemma backend 호출 0 - -테스트 전략: -- synthesize() 가 호출하는 backend dispatcher (services.llm.get_backend) 를 - monkeypatch 해서 mock backend 주입. -- Gemma backend 의 generate AsyncMock 호출 횟수를 추적. -- 정정 4 의 핵심 가드: `gemma_backend.generate.assert_not_called()` -""" - -from __future__ import annotations - -import asyncio -import os -import sys -from dataclasses import dataclass -from unittest.mock import AsyncMock - -import pytest - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "app")) - - -# ── 가짜 evidence (synthesize 의 no_evidence 분기 회피용 최소 객체) ───────── - - -@dataclass -class _FakeEvidence: - n: int = 1 - doc_id: int = 100 - chunk_id: int | None = 200 - title: str | None = "fake doc" - span_text: str = "이것은 짧은 근거 텍스트입니다." - source: str = "llm" - - -def _make_evidence(): - return [_FakeEvidence()] - - -# ── backend mock ─────────────────────────────────────────────────────────── - - -def _gemma_mock(content: str = "GEMMA_SHOULD_NEVER_BE_CALLED"): - m = AsyncMock() - m.name = "gemma-macmini" - m.generate = AsyncMock(return_value=content) - return m - - -def _qwen_mock_success(content: str): - m = AsyncMock() - m.name = "qwen-macbook" - m.generate = AsyncMock(return_value=content) - return m - - -def _qwen_mock_unavailable(): - from services.llm import BackendUnavailable - - m = AsyncMock() - m.name = "qwen-macbook" - m.generate = AsyncMock( - side_effect=BackendUnavailable("qwen-macbook", "ConnectError") - ) - return m - - -# ── 공통 fixture: synthesis_service 에 mock backend 주입 ─────────────────── - - -@pytest.fixture -def patched_backends(monkeypatch): - """services.llm.get_backend 를 mock dispatcher 로 치환. - - Returns (gemma_mock, qwen_mock, set_qwen_unavailable_fn). - """ - from services.search import synthesis_service - - gemma = _gemma_mock() - qwen_holder = {"backend": _qwen_mock_success( - '{"answer":"Qwen ok [1]","confidence":"high","refused":false}' - )} - - def _fake_get_backend(name: str | None): - key = (name or "").strip().lower() or "gemma-macmini" - if key == "gemma-macmini": - return gemma - if key == "qwen-macbook": - return qwen_holder["backend"] - raise ValueError(f"unknown backend: {name!r}") - - monkeypatch.setattr(synthesis_service, "get_backend", _fake_get_backend) - # synthesis_service 캐시 비움 (qwen vs gemma 캐시 분리 invariant) - synthesis_service._CACHE.clear() - - def _swap_qwen_unavailable(): - qwen_holder["backend"] = _qwen_mock_unavailable() - - return gemma, qwen_holder, _swap_qwen_unavailable - - -# ── 정정 4 핵심: backend=qwen-macbook + MacBook 비가용 → Gemma 호출 0 ───── - - -def test_qwen_unavailable_yields_backend_unavailable_status_and_gemma_not_called( - patched_backends, -): - """**정정 4 의 핵심 invariant**. - - backend="qwen-macbook" 명시 + Qwen 호출이 BackendUnavailable 로 실패 → - synthesize() 는 status="backend_unavailable" 반환. Gemma backend 의 - generate() 는 **단 한 번도 호출되지 않음** (silent fallback 금지). - """ - from services.search.synthesis_service import synthesize - - gemma, qwen_holder, swap_qwen_unavailable = patched_backends - swap_qwen_unavailable() - qwen = qwen_holder["backend"] - - result = asyncio.run( - synthesize( - query="압력용기 최대허용응력은?", - evidence=_make_evidence(), - backend="qwen-macbook", - ) - ) - - # 1. status - assert result.status == "backend_unavailable" - assert result.answer is None - assert result.confidence is None - assert result.refused is False - - # 2. flag 에 backend 비가용 사유 기록 - assert any( - f.startswith("backend_unavailable:qwen-macbook:") for f in result.hallucination_flags - ), f"expected backend_unavailable flag, got {result.hallucination_flags}" - - # 3. ★ 핵심 가드 ★ — Gemma backend 자동 fallback 금지 - gemma.generate.assert_not_called() - - # 4. Qwen 은 1회만 호출 (재시도 없음) - assert qwen.generate.call_count == 1 - - -def test_qwen_unavailable_result_not_cached(patched_backends): - """비가용 결과는 캐시 X — 다음 호출이 다시 Qwen 시도해야 함.""" - from services.search.synthesis_service import synthesize - - gemma, qwen_holder, swap_qwen_unavailable = patched_backends - swap_qwen_unavailable() - qwen = qwen_holder["backend"] - - asyncio.run( - synthesize( - query="동일 쿼리", - evidence=_make_evidence(), - backend="qwen-macbook", - ) - ) - asyncio.run( - synthesize( - query="동일 쿼리", - evidence=_make_evidence(), - backend="qwen-macbook", - ) - ) - - # 두 번 모두 실제 호출 (캐시 적중 X) — Gemma 는 여전히 0 - assert qwen.generate.call_count == 2 - gemma.generate.assert_not_called() - - -# ── 정정 4: backend 미지정 → 기존 Gemma path (회귀 0) ───────────────────── - - -def test_default_backend_calls_gemma_not_qwen(patched_backends): - """backend 미지정 = 기본 Gemma. Qwen 호출 0.""" - from services.search.synthesis_service import synthesize - - gemma, qwen_holder, _ = patched_backends - qwen = qwen_holder["backend"] - gemma.generate.return_value = ( - '{"answer":"Gemma 답변 [1]","confidence":"high","refused":false}' - ) - - result = asyncio.run( - synthesize( - query="기본 호출", - evidence=_make_evidence(), - backend=None, # 명시 None = default - ) - ) - - assert result.status == "completed" - assert result.answer is not None and "Gemma" in result.answer - - # Qwen 은 호출 0 - qwen.generate.assert_not_called() - # Gemma 는 1회 - assert gemma.generate.call_count == 1 - - -# ── backend="qwen-macbook" + 정상 응답 ────────────────────────────────────── - - -def test_qwen_success_does_not_call_gemma(patched_backends): - """Qwen 정상 응답 시 Gemma 는 호출되지 않음 (대칭 invariant).""" - from services.search.synthesis_service import synthesize - - gemma, qwen_holder, _ = patched_backends - qwen = qwen_holder["backend"] - - result = asyncio.run( - synthesize( - query="정상 호출", - evidence=_make_evidence(), - backend="qwen-macbook", - ) - ) - - assert result.status == "completed" - assert result.answer is not None and "Qwen" in result.answer - - # Gemma 는 0회 - gemma.generate.assert_not_called() - # Qwen 은 1회 - assert qwen.generate.call_count == 1 - - -# ── 캐시 분리 (qwen vs gemma 키 충돌 없음) ───────────────────────────────── - - -def test_qwen_and_gemma_have_separate_caches(patched_backends): - """같은 query 라도 backend 다르면 캐시 분리 — Qwen 결과가 Gemma 호출 답으로 둔갑하지 않음.""" - from services.search.synthesis_service import synthesize - - gemma, qwen_holder, _ = patched_backends - qwen = qwen_holder["backend"] - gemma.generate.return_value = ( - '{"answer":"GEMMA_ANSWER [1]","confidence":"high","refused":false}' - ) - qwen.generate.return_value = ( - '{"answer":"QWEN_ANSWER [1]","confidence":"high","refused":false}' - ) - - r_qwen_1 = asyncio.run( - synthesize( - query="같은 query", - evidence=_make_evidence(), - backend="qwen-macbook", - ) - ) - r_gemma_1 = asyncio.run( - synthesize( - query="같은 query", - evidence=_make_evidence(), - backend=None, - ) - ) - r_qwen_2 = asyncio.run( - synthesize( - query="같은 query", - evidence=_make_evidence(), - backend="qwen-macbook", - ) - ) - - assert "QWEN_ANSWER" in (r_qwen_1.answer or "") - assert "GEMMA_ANSWER" in (r_gemma_1.answer or "") - # 두 번째 Qwen 호출은 캐시 적중 — 결과는 동일하지만 generate 추가 호출 X - assert "QWEN_ANSWER" in (r_qwen_2.answer or "") - assert r_qwen_2.cache_hit is True - - # generate 호출 횟수: Qwen 1 (두번째는 캐시), Gemma 1 - assert qwen.generate.call_count == 1 - assert gemma.generate.call_count == 1 diff --git a/tests/api/test_search_ask_react_endpoint.py b/tests/api/test_search_ask_react_endpoint.py deleted file mode 100644 index 380d70f..0000000 --- a/tests/api/test_search_ask_react_endpoint.py +++ /dev/null @@ -1,218 +0,0 @@ -"""PR-DocSrv-Ask-ToolCalling-ReAct-1: /api/search/ask/react endpoint integration. - -검증 항목 (G0-3 trace exposure + 정정 4 invariant): -- backend unavailable → HTTP 503 + error_reason=macbook_unavailable - + ★ `run_search` mock 호출 횟수 == 0 (search 단계 진입 자체 차단) -- 정상 응답 → 200 + final_answer + sources + debug_trace=null (default) -- debug=true → debug_trace 채워짐 -- max rounds 도달 → iterations=2 + partial=false (final content 정상) - -endpoint 함수 (`api.search.ask_react`) 를 직접 호출하는 lightweight 패턴. -TestClient 없이 FastAPI deps 를 MagicMock 으로 우회. (priority_gate / backend_dispatcher -test 와 동일 service-layer 패턴.) -""" - -from __future__ import annotations - -import asyncio -import json -import os -import sys -from unittest.mock import AsyncMock, MagicMock - -import pytest - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "app")) - - -# ── helpers ──────────────────────────────────────────────────────────────── - - -def _msg_with_tool_call(q: str, tc_id: str = "tc-1") -> dict: - return { - "role": "assistant", - "content": None, - "tool_calls": [ - { - "id": tc_id, - "type": "function", - "function": { - "name": "search", - "arguments": json.dumps({"q": q}, ensure_ascii=False), - }, - } - ], - } - - -def _msg_with_content(text: str) -> dict: - return {"role": "assistant", "content": text, "tool_calls": None} - - -def _fake_chunk(chunk_id: int, doc_id: int = 100): - m = MagicMock() - m.id = chunk_id - m.chunk_id = chunk_id - m.doc_id = doc_id - m.title = f"doc {doc_id}" - m.score = 0.9 - m.snippet = f"snippet {chunk_id}" - m.text = None - return m - - -def _fake_pr(chunks: list): - pr = MagicMock() - pr.results = chunks - return pr - - -@pytest.fixture -def patched_backend_and_search(monkeypatch): - """get_backend + run_search 둘 다 mock. backend 의 generate_with_tools 는 - 각 테스트가 side_effect 설정. - - Returns: (backend_mock, run_search_mock, set_backend_unavailable_fn). - """ - from services.llm.backends import BackendUnavailable, QwenMacBookBackend - from services.llm import backends as backends_mod - from services.search import react_loop - - backend = MagicMock(spec=QwenMacBookBackend) - backend.name = "qwen-macbook" - backend.generate_with_tools = AsyncMock() - - def _fake_get_backend(name): - # endpoint 가 qwen-macbook 만 호출하므로 단일 backend 반환 - return backend - - monkeypatch.setattr(backends_mod, "get_backend", _fake_get_backend) - # search.py 의 ask_react 안에서 `from services.llm.backends import ... get_backend` - # 로 import 하므로 module-level patch 만으로 충분 (지연 import 라 매번 fresh). - - run_search_mock = AsyncMock(return_value=_fake_pr([_fake_chunk(1)])) - monkeypatch.setattr(react_loop, "run_search", run_search_mock) - - def _make_unavailable(): - backend.generate_with_tools.side_effect = BackendUnavailable( - "qwen-macbook", "ConnectError" - ) - - return backend, run_search_mock, _make_unavailable - - -def _call_endpoint(payload): - """ask_react 를 직접 호출. user/session 은 MagicMock 으로 우회.""" - from api.search import ask_react - - user = MagicMock() - session = MagicMock() - return asyncio.run(ask_react(payload, user=user, session=session)) - - -# ── ★ 정정 4 invariant: backend unavailable → 503 + run_search 호출 0 ────── - - -def test_qwen_unavailable_returns_503(patched_backend_and_search): - """backend BackendUnavailable → HTTP 503 + error_reason=macbook_unavailable.""" - from api.search import AskReactRequest - - backend, run_search_mock, make_unavailable = patched_backend_and_search - make_unavailable() - - response = _call_endpoint(AskReactRequest(query="Q")) - - # JSONResponse instance - assert response.status_code == 503 - body = json.loads(response.body) - assert body["error_reason"] == "macbook_unavailable" - assert body["backend_used"] is None - assert body["backend_requested"] == "qwen-macbook" - - # ★ run_search 호출 0 (search 진입 자체 차단) - assert run_search_mock.call_count == 0 - - -# ── 정상 200 + G0-3 default debug_trace=null ────────────────────────────── - - -def test_successful_response_default_no_debug_trace(patched_backend_and_search): - """debug 미지정 (default false) → 200 + debug_trace == null.""" - from api.search import AskReactRequest, AskReactResponse - - backend, run_search_mock, _ = patched_backend_and_search - backend.generate_with_tools.side_effect = [ - _msg_with_tool_call("q1"), - _msg_with_content("최종 답입니다"), - ] - - response = _call_endpoint(AskReactRequest(query="Q")) - - # Pydantic instance (FastAPI response_model 적용 전 raw return) - assert isinstance(response, AskReactResponse) - assert response.final_answer == "최종 답입니다" - assert response.iterations == 2 - assert response.partial is False - assert response.debug_trace is None # ★ G0-3 - assert len(response.sources) == 1 - - -# ── G0-3: debug=true → debug_trace 채워짐 ────────────────────────────────── - - -def test_debug_true_populates_trace(patched_backend_and_search): - from api.search import AskReactRequest - - backend, run_search_mock, _ = patched_backend_and_search - backend.generate_with_tools.side_effect = [ - _msg_with_content("바로 답"), - ] - - response = _call_endpoint(AskReactRequest(query="Q", debug=True)) - - assert response.debug_trace is not None - assert isinstance(response.debug_trace, list) - assert len(response.debug_trace) >= 1 - - -# ── max rounds → final content 정상 → partial=false ────────────────────── - - -def test_max_rounds_with_final_content(patched_backend_and_search): - from api.search import AskReactRequest - - backend, run_search_mock, _ = patched_backend_and_search - backend.generate_with_tools.side_effect = [ - _msg_with_tool_call("q1"), - _msg_with_tool_call("q2", tc_id="tc-2"), - _msg_with_content("정리된 최종 답"), - ] - - response = _call_endpoint(AskReactRequest(query="Q")) - - assert response.iterations == 2 - assert response.partial is False - assert response.final_answer == "정리된 최종 답" - # LLM 호출 3회, search 2회 (G0-2 cap) - assert backend.generate_with_tools.call_count == 3 - assert run_search_mock.call_count == 2 - - -# ── max rounds + final content 빈 string → partial=true ────────────────── - - -def test_max_rounds_with_empty_final_partial(patched_backend_and_search): - from api.search import AskReactRequest - - backend, run_search_mock, _ = patched_backend_and_search - backend.generate_with_tools.side_effect = [ - _msg_with_tool_call("q1"), - _msg_with_tool_call("q2", tc_id="tc-2"), - _msg_with_content(""), - ] - - response = _call_endpoint(AskReactRequest(query="Q")) - - assert response.iterations == 2 - assert response.partial is True - assert response.final_answer == "" diff --git a/tests/test_ask_eval_auth.py b/tests/test_ask_eval_auth.py deleted file mode 100644 index 0492d21..0000000 --- a/tests/test_ask_eval_auth.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Phase 3.5 fix2: /ask 의 X-Source / X-Eval-Case-Id trust boundary. - -`_resolve_eval_identity()` 단위 테스트. -- token 없음/틀림 + X-Source=eval → source='document_server', eval_case_id=None -- token 일치 + X-Source=eval + X-Eval-Case-Id=case_xxx → ('eval', 'case_xxx') -- token 틀림 + X-Eval-Case-Id 만 (X-Source 미지정) → eval_case_id=None -- 일반 호출 (X-Source=ui_search, no eval headers) → ('ui_search', None) -- env 미설정 (eval_runner_token='') 시 모든 eval claim 거부 -""" - -from __future__ import annotations - -import os -import sys - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app")) - -import pytest - - -@pytest.fixture -def resolve_with_token(monkeypatch): - """settings.eval_runner_token 을 monkey-patch 해서 _resolve_eval_identity 테스트.""" - def _make(token: str): - from core import config as cfg_mod - from api import search as search_mod - # 두 모듈 모두에서 settings 객체 참조하므로 직접 attr 변경 - monkeypatch.setattr(search_mod.settings, "eval_runner_token", token) - return search_mod._resolve_eval_identity - return _make - - -def test_no_token_no_eval_headers_default(resolve_with_token): - """일반 호출 — eval 헤더 없음, source 기본값.""" - resolve = resolve_with_token("secret123") - assert resolve(None, None, None) == ("document_server", None) - - -def test_normal_source_with_token(resolve_with_token): - """ui_search 호출 — eval 클레임 아님이라 token 무관.""" - resolve = resolve_with_token("secret123") - assert resolve("ui_search", None, None) == ("ui_search", None) - - -def test_eval_claim_no_token_rejected(resolve_with_token): - """X-Source=eval 인데 token 없음 → 거부, source='document_server'.""" - resolve = resolve_with_token("secret123") - assert resolve("eval", "case_001", None) == ("document_server", None) - - -def test_eval_claim_wrong_token_rejected(resolve_with_token): - """token 틀림 → 거부.""" - resolve = resolve_with_token("secret123") - assert resolve("eval", "case_001", "wrong_token") == ("document_server", None) - - -def test_eval_claim_correct_token_accepted(resolve_with_token): - """token 일치 → 'eval' source + case_id 적재.""" - resolve = resolve_with_token("secret123") - assert resolve("eval", "case_001", "secret123") == ("eval", "case_001") - - -def test_eval_case_id_only_no_source_no_token(resolve_with_token): - """X-Eval-Case-Id 만 있고 token 없음 → 거부, case_id=None.""" - resolve = resolve_with_token("secret123") - assert resolve(None, "case_001", None) == ("document_server", None) - - -def test_eval_case_id_only_wrong_token(resolve_with_token): - """X-Eval-Case-Id 만 + token 틀림 → 거부.""" - resolve = resolve_with_token("secret123") - assert resolve(None, "case_001", "wrong") == ("document_server", None) - - -def test_env_unset_rejects_even_correct_format(resolve_with_token): - """settings.eval_runner_token='' 인 환경 → 모든 eval 클레임 거부.""" - resolve = resolve_with_token("") - # token 헤더가 와도 server side 가 비어있으면 거부 (constant-time False) - assert resolve("eval", "case_001", "") == ("document_server", None) - assert resolve("eval", "case_001", "anything") == ("document_server", None) - - -def test_non_eval_source_forces_case_id_none(resolve_with_token): - """X-Source=ui_detail + X-Eval-Case-Id (실수로 같이 보냄) → case_id=None. - - eval claim 아님 (source != 'eval' 이고 case_id 가 fallback 으로 eval claim 트리거) - 이지만 source claim 이 명시적으로 non-eval 이라 token 검증 후 case_id None. - """ - resolve = resolve_with_token("secret123") - # case_id 가 있으면 eval claim 으로 처리됨 → token 없으면 거부 → ('ui_detail' 클레임, - # 하지만 거부 분기에서 claimed_source != 'eval' 이라 그대로 'ui_detail' 반환, case_id=None) - assert resolve("ui_detail", "case_001", None) == ("ui_detail", None) diff --git a/tests/test_grounding_fabricated_number.py b/tests/test_grounding_fabricated_number.py deleted file mode 100644 index ae76b6d..0000000 --- a/tests/test_grounding_fabricated_number.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Phase 3.5 B1 (fix1+fix3): unit-aware fabricated_number + bound semantics. - -기준: -- 단위 일치 시에만 exact/range/tolerance clear (fix1: Codex unit-mismatch regression 방지) -- 약/대략/거의/얼추 만 approx prefix strip; 최대/최소 는 bound operator 로 보존 (fix3) -- tolerance 는 양적 단위(_TOLERANCE_UNITS) + 4자리+ 만; 식별자성(_EXACT_ONLY_UNITS) 은 strict -""" - -from __future__ import annotations - -import os -import sys - -# tests/ → 프로젝트 루트 → app/ -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app")) - -import pytest - -from services.search.evidence_service import EvidenceItem -from services.search.grounding_check import check - - -def _ev(text: str, n: int = 1) -> EvidenceItem: - return EvidenceItem( - n=n, - chunk_id=None, - doc_id=100 + n, - title=f"doc{n}", - section_title=None, - span_text=text, - relevance=0.9, - rerank_score=0.85, - full_snippet=text, - source="llm", - ) - - -def _has_fabricated(result, sub: str | None = None) -> bool: - for f in result.strong_flags: - if not f.startswith("fabricated_number:"): - continue - if sub is None or sub in f: - return True - return False - - -# ─── 콤마/prefix/range/단위 동의어/citation (기존 17 케이스) ────── - - -def test_comma_thousand_match(): - r = check("질문", "총 1,000명 [1]", [_ev("총원은 1000명입니다.")]) - assert not _has_fabricated(r, "1000") - - -def test_comma_thousand_reverse(): - r = check("질문", "총 1000명 [1]", [_ev("총원은 1,000명입니다.")]) - assert not _has_fabricated(r) - - -def test_approx_prefix_in_answer(): - r = check("질문", "약 100명이 참여 [1]", [_ev("100명이 참여")]) - assert not _has_fabricated(r) - - -def test_approx_prefix_in_evidence(): - r = check("질문", "100명이 참여 [1]", [_ev("약 100명이 참여")]) - assert not _has_fabricated(r) - - -def test_range_inner_value_passes(): - r = check("질문", "약 150명 [1]", [_ev("100~200명 사이 추정")]) - assert not _has_fabricated(r, "150") - - -def test_range_outer_value_flagged(): - r = check("질문", "300명 [1]", [_ev("100~200명 사이 추정")]) - assert _has_fabricated(r, "300") - - -def test_unit_synonym_in_to_myeong(): - r = check("질문", "총 50인이 모임 [1]", [_ev("총 50명이 모임.")]) - assert not _has_fabricated(r) - - -def test_unit_synonym_percent_to_pct(): - r = check("질문", "비율 30퍼센트 [1]", [_ev("비율 30%이다.")]) - assert not _has_fabricated(r) - - -def test_citation_marker_both_sides(): - """bug fix: evidence 측 [n] 미제거로 디지트 합쳐지던 케이스.""" - r = check("질문", "가격 [1] 5,000원", [_ev("[2] 5,000원이 정확")]) - assert not _has_fabricated(r) - - -def test_genuine_fabricated_number(): - r = check("질문", "결과 777명 [1]", [_ev("500명, 300명을 받음.")]) - assert _has_fabricated(r, "777") - - -def test_amount_4digit_tolerance_passes(): - r = check("질문", "9,990원 [1]", [_ev("10,000원입니다.")]) - assert not _has_fabricated(r) - - -def test_year_no_tolerance_flagged(): - r = check("질문", "2024년 [1]", [_ev("2026년에 발효")]) - assert _has_fabricated(r, "2024") - - -def test_article_no_tolerance_flagged(): - r = check("질문", "제5조에 명시 [1]", [_ev("제6조에 따라")]) - assert _has_fabricated(r) - - -def test_count_no_tolerance_flagged(): - r = check("질문", "총 3회 위반 [1]", [_ev("총 4회 적발")]) - assert _has_fabricated(r) - - -def test_three_digit_strict(): - r = check("질문", "총 15개 [1]", [_ev("총 10개")]) - assert _has_fabricated(r, "15") - - -def test_single_digit_ignored(): - """1자리 + 양적 단위 → 무시 (오탐 방지).""" - r = check("질문", "총 3개 발생 [1]", [_ev("관련 통계 별도")]) - assert not _has_fabricated(r, "3개") - - -def test_range_korean_butter_separator(): - r = check("질문", "약 150명 [1]", [_ev("100부터 200명까지 대상.")]) - assert not _has_fabricated(r, "150") - - -# ─── fix1: unit-mismatch (Codex no-ship) ────────────────── - - -def test_won_vs_myeong_range_flagged(): - """answer '150원' vs evidence '100~200명' → 단위 불일치, flag 유지.""" - r = check("질문", "약 150원이 든다 [1]", [_ev("대상은 100~200명")]) - assert _has_fabricated(r, "150") - - -def test_won_vs_myeong_tolerance_flagged(): - """answer '9,990원' vs evidence '10,000명' → tolerance pool 단위 다름, flag 유지.""" - r = check("질문", "9,990원 [1]", [_ev("10,000명입니다.")]) - assert _has_fabricated(r, "9990") - - -def test_pct_vs_myeong_range_flagged(): - """answer '15%' vs evidence '10~20명' → 단위 불일치, flag 유지.""" - r = check("질문", "약 15% [1]", [_ev("대상 10~20명")]) - assert _has_fabricated(r, "15") - - -# ─── fix3: 최대/최소 bound semantics ─────────────────────── - - -def test_choedae_exact_boundary_flagged(): - """evidence '최대 100명' + answer '100명' → 경계값 자체는 cleared 아님.""" - r = check("질문", "100명이다 [1]", [_ev("최대 100명까지 가능")]) - assert _has_fabricated(r, "100") - - -def test_choeso_exact_boundary_flagged(): - """evidence '최소 100명' + answer '100명' → 경계값 자체는 cleared 아님.""" - r = check("질문", "100명이다 [1]", [_ev("최소 100명 이상 필요")]) - assert _has_fabricated(r, "100") - - -def test_choedae_inner_value_passes(): - """evidence '최대 100명' + answer '50명' → bound 안, cleared.""" - r = check("질문", "50명이다 [1]", [_ev("최대 100명까지 가능")]) - assert not _has_fabricated(r, "50") - - -def test_choeso_above_value_passes(): - """evidence '최소 100명' + answer '150명' → bound 안, cleared.""" - r = check("질문", "150명이다 [1]", [_ev("최소 100명 이상 필요")]) - assert not _has_fabricated(r, "150") - - -def test_choedae_outer_value_flagged(): - """evidence '최대 100명' + answer '200명' → bound 밖, flag.""" - r = check("질문", "200명이다 [1]", [_ev("최대 100명까지 가능")]) - assert _has_fabricated(r, "200") diff --git a/tests/test_synthesis_failure_regate.py b/tests/test_synthesis_failure_regate.py deleted file mode 100644 index f06577f..0000000 --- a/tests/test_synthesis_failure_regate.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Phase 3.5 fix3: re-gate Tier 0 — synthesis 자체 실패 처리. - -`_detect_synthesis_failure()` 단위 테스트. - -기존 버그: - synthesis LLM self-refuse (`sr.refused=True, status="completed"`) 또는 - timeout/parse_failed/llm_error 시 grounding/verifier flag 0건 → re-gate else clean - 분기로 빠져 `completeness="full"` 초기값이 남아 `full + refused=True` 모순. - baseline v1-400char 에서 24/223 (10.8%) 해당. - -Tier 0 판정: - - LLM self-refuse (completed + refused) → "synthesis_self_refuse" - - mechanical fail (timeout/parse_failed/llm_error) → "synthesis_failed({status})" - - answer 공백 → "synthesis_failed({status})" - - 유효 답변 → None (기존 tier 1~7 경로) -""" - -from __future__ import annotations - -import os -import sys - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app")) - -from api.search import _detect_synthesis_failure -from services.search.synthesis_service import SynthesisResult - - -def _sr( - status: str = "completed", - answer: str | None = "ok", - refused: bool = False, - refuse_reason: str | None = None, -) -> SynthesisResult: - return SynthesisResult( - status=status, # type: ignore[arg-type] - answer=answer, - used_citations=[], - confidence="low", - refused=refused, - refuse_reason=refuse_reason, - elapsed_ms=100.0, - cache_hit=False, - ) - - -# ─── self-refuse 케이스 ────────────────────────────────── - - -def test_llm_self_refuse_completed(): - """LLM 이 JSON 에 refused=true 반환 → synthesis_self_refuse.""" - sr = _sr(status="completed", answer=None, refused=True, refuse_reason="범위 밖") - assert _detect_synthesis_failure(sr) == "synthesis_self_refuse" - - -def test_llm_self_refuse_with_answer_still_refused(): - """refused=True 면 answer 있어도 Tier 0 처리 (일관성).""" - sr = _sr(status="completed", answer="왜 답변함", refused=True) - assert _detect_synthesis_failure(sr) == "synthesis_self_refuse" - - -# ─── mechanical failure 케이스 ────────────────────────── - - -def test_timeout(): - sr = _sr(status="timeout", answer=None, refused=False) - assert _detect_synthesis_failure(sr) == "synthesis_failed(timeout)" - - -def test_parse_failed(): - sr = _sr(status="parse_failed", answer=None, refused=False) - assert _detect_synthesis_failure(sr) == "synthesis_failed(parse_failed)" - - -def test_llm_error(): - sr = _sr(status="llm_error", answer=None, refused=False) - assert _detect_synthesis_failure(sr) == "synthesis_failed(llm_error)" - - -def test_refused_with_mechanical_fail_propagates_status(): - """refused=True + status!=completed → synthesis_failed({status}) 형식.""" - sr = _sr(status="timeout", answer=None, refused=True) - assert _detect_synthesis_failure(sr) == "synthesis_failed(timeout)" - - -# ─── empty answer 케이스 ─────────────────────────────── - - -def test_empty_answer_completed(): - """status=completed 인데 answer 공백 → synthesis_failed(completed).""" - sr = _sr(status="completed", answer="", refused=False) - assert _detect_synthesis_failure(sr) == "synthesis_failed(completed)" - - -def test_whitespace_only_answer(): - """공백/탭/개행만 있어도 empty 로 간주.""" - sr = _sr(status="completed", answer=" \n\t ", refused=False) - assert _detect_synthesis_failure(sr) == "synthesis_failed(completed)" - - -def test_none_answer_completed(): - """answer=None + status=completed → failed.""" - sr = _sr(status="completed", answer=None, refused=False) - assert _detect_synthesis_failure(sr) == "synthesis_failed(completed)" - - -# ─── 유효 답변 케이스 (None 반환) ────────────────────── - - -def test_valid_answer_returns_none(): - """status=completed + answer 있고 refused=False → Tier 0 통과 (None).""" - sr = _sr(status="completed", answer="교육 시간은 매년 6시간 이상이다 [1].", refused=False) - assert _detect_synthesis_failure(sr) is None - - -def test_skipped_status_with_answer_passes(): - """status=skipped 는 Tier 0 대상 아님 — 초기 refusal gate 에서 이미 early-return 처리됨. - - (skipped 는 여기까지 도달하지 않는다는 전제. 만약 도달하더라도 refused 가 True 일 것.) - """ - sr = _sr(status="skipped", answer="abc", refused=False) - # 이 경우 Tier 0 미발동 (answer 있고 refused 아님) — 정상 경로로 나감. - assert _detect_synthesis_failure(sr) is None diff --git a/tests/test_verifier_numeric_promote.py b/tests/test_verifier_numeric_promote.py deleted file mode 100644 index e7c8443..0000000 --- a/tests/test_verifier_numeric_promote.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Phase 3.5 B2: verifier _SEVERITY_MAP env flag 테스트. - -VERIFIER_NUMERIC_PROMOTE 환경변수에 따른 _SEVERITY_MAP 변화 검증. -모듈은 import time 에 env 평가하므로 reload 필요. -""" - -from __future__ import annotations - -import importlib -import os -import sys - -# tests/ → 프로젝트 루트 → app/ -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app")) - -import pytest - - -def _reload_verifier(monkeypatch, value: str | None): - """env 설정 후 verifier_service 를 reload 하여 _SEVERITY_MAP 재평가.""" - if value is None: - monkeypatch.delenv("VERIFIER_NUMERIC_PROMOTE", raising=False) - else: - monkeypatch.setenv("VERIFIER_NUMERIC_PROMOTE", value) - from services.search import verifier_service - importlib.reload(verifier_service) - return verifier_service - - -def test_severity_map_off_default(monkeypatch): - """env 미설정 → numeric_conflict critical 은 medium (기존 동작).""" - vs = _reload_verifier(monkeypatch, None) - assert vs._SEVERITY_MAP["numeric_conflict"]["critical"] == "medium" - assert vs._SEVERITY_MAP["numeric_conflict"]["minor"] == "medium" - assert vs._NUMERIC_PROMOTE is False - - -def test_severity_map_on_critical_promoted(monkeypatch): - """VERIFIER_NUMERIC_PROMOTE=1 → critical 만 strong, minor 는 medium 유지.""" - vs = _reload_verifier(monkeypatch, "1") - assert vs._SEVERITY_MAP["numeric_conflict"]["critical"] == "strong" - assert vs._SEVERITY_MAP["numeric_conflict"]["minor"] == "medium" - assert vs._NUMERIC_PROMOTE is True - - -def test_severity_map_off_explicit_zero(monkeypatch): - """VERIFIER_NUMERIC_PROMOTE=0 명시 → off (default 와 동일).""" - vs = _reload_verifier(monkeypatch, "0") - assert vs._SEVERITY_MAP["numeric_conflict"]["critical"] == "medium" - assert vs._NUMERIC_PROMOTE is False - - -def test_direct_negation_invariant(monkeypatch): - """direct_negation 은 env 무관 항상 strong (불변 — 안전장치).""" - for value in [None, "0", "1"]: - vs = _reload_verifier(monkeypatch, value) - assert vs._SEVERITY_MAP["direct_negation"]["critical"] == "strong" - assert vs._SEVERITY_MAP["direct_negation"]["minor"] == "strong"