"""하이브리드 검색 API — thin endpoint (Phase 3.1 이후). 실제 검색 파이프라인(retrieval → fusion → rerank → diversity → confidence) 은 `services/search/search_pipeline.py::run_search()` 로 분리되어 있다. 이 파일은 다음만 담당: - Pydantic 스키마 (SearchResult / SearchResponse / SearchDebug / DebugCandidate / Citation / AskResponse / AskDebug) - `/search` endpoint wrapper (run_search 호출 + logger + telemetry + 직렬화) - `/ask` endpoint wrapper (Phase 3.3 에서 추가) """ import asyncio import hmac import time from datetime import date from typing import Annotated, Literal from fastapi import APIRouter, BackgroundTasks, Depends, Header, Query from fastapi.responses import JSONResponse from pydantic import BaseModel from sqlalchemy.ext.asyncio import AsyncSession from core.auth import get_current_user from core.config import settings from core.database import get_session from core.utils import setup_logger from models.user import User from services.document_telemetry import sanitize_source from services.search.classifier_service import ClassifierResult, classify from services.search.evidence_service import EvidenceItem, extract_evidence from services.search.fusion_service import DEFAULT_FUSION from services.search.grounding_check import check as grounding_check from services.search.refusal_gate import RefusalDecision, decide as refusal_decide from services.search import query_rewriter from services.search.retrieval_service import AxisFilter from services.search.result_decorate import compute_facets, decorate_version_status from services.search.search_pipeline import PipelineResult, run_search from services.search.synthesis_service import SynthesisResult, synthesize from services.search.verifier_service import VerifierResult, verify from services.prompt_versions import ASK_PROMPT_VERSION, resolve_primary_model from services.search_telemetry import record_ask_event, record_search_event # logs/search.log + stdout 동시 출력 (Phase 0.4) logger = setup_logger("search") router = APIRouter() class SearchResult(BaseModel): """검색 
class SearchResult(BaseModel):
    """A single search-result row.

    Phase 1.2-C: chunk metadata fields were added when chunk-level vector
    retrieval was introduced. Text-search results leave chunk_id and the
    related fields as None (doc-level); vector-search results fill them
    (chunk-level).
    """

    id: int  # doc_id (shared by text and vector results)
    title: str | None
    ai_domain: str | None
    ai_summary: str | None
    file_format: str
    score: float
    snippet: str | None
    match_reason: str | None = None
    # Phase 1.2-C: chunk metadata (filled for vector search)
    chunk_id: int | None = None
    chunk_index: int | None = None
    section_title: str | None = None
    # Phase 3.1: preserves the raw reranker score (prevents display-score
    # drift). Filled only for chunks that went through the rerank path.
    # normalize_display_scores does not touch this field. Used by the
    # Phase 3 evidence fast-path decision.
    rerank_score: float | None = None
    # PR-RAG-Time-1: freshness-decay debug metadata, filled by
    # apply_freshness_decay. Also filled for rows where decay is not applied
    # (freshness_policy=None). base_score is always preserved.
    freshness_debug: dict | None = None
    # Safety library C-1: classification-axis metadata (filled by the 3-leg
    # SELECT — additive, unrelated to ranking). Prerequisite for per-type
    # result-card rendering in the D-1 UI and for preventing unlabeled mixing
    # of countries once foreign law (B-5) goes live.
    material_type: str | None = None
    jurisdiction: str | None = None
    published_date: date | None = None
    # Safety library C-1 follow-up: law version status
    # (legal_meta.version_status) — decorated once by the wrapper. Filled
    # only for law results (legal_meta satellite); None for everything else
    # and for unmapped laws. Prerequisite for the D-1 version badge.
    version_status: str | None = None


# ─── Phase 0.4: debug response schema ─────────────────────────
class DebugCandidate(BaseModel):
    """Per-stage candidate (exposed only in debug=true responses)."""

    id: int
    rank: int
    score: float
    match_reason: str | None = None


class SearchDebug(BaseModel):
    """Debug payload attached to a search response when debug=true."""

    timing_ms: dict[str, float]
    text_candidates: list[DebugCandidate] | None = None
    vector_candidates: list[DebugCandidate] | None = None
    fused_candidates: list[DebugCandidate] | None = None
    confidence: float
    notes: list[str] = []
    # Placeholders to be filled once Phase 1/2 land
    query_analysis: dict | None = None
    reranker_scores: list[DebugCandidate] | None = None


class SearchResponse(BaseModel):
    """Response body of the search endpoint."""

    results: list[SearchResult]
    total: int
    query: str
    mode: str
    debug: SearchDebug | None = None
    # Safety library C-1 follow-up: filled only when facets=true (not
    # requested = None, response bytes unchanged). Label distribution of the
    # classification axes within the top-K results: {axis: {label: count}}.
    facets: dict[str, dict[str, int]] | None = None
def _to_debug_candidates(rows: list[SearchResult], n: int = 20) -> list[DebugCandidate]:
    """Turn the first ``n`` result rows into debug candidates ranked from 1."""
    candidates: list[DebugCandidate] = []
    for position, row in enumerate(rows[:n], start=1):
        candidates.append(
            DebugCandidate(
                id=row.id,
                rank=position,
                score=row.score,
                match_reason=row.match_reason,
            )
        )
    return candidates


def _build_search_debug(pr: PipelineResult) -> SearchDebug:
    """Map a PipelineResult onto the SearchDebug response schema.

    A candidate list is reported as None (rather than an empty list) when the
    corresponding leg has no rows and the pipeline mode does not involve it.
    """
    # Text leg: shown when it produced rows or the mode is anything but "vector".
    text_candidates = None
    if pr.text_results or pr.mode != "vector":
        text_candidates = _to_debug_candidates(pr.text_results)

    # Vector leg: shown when it produced rows or the mode uses vectors.
    vector_candidates = None
    if pr.vector_results or pr.mode in ("vector", "hybrid"):
        vector_candidates = _to_debug_candidates(pr.vector_results)

    # Fused list only exists for hybrid mode.
    fused_candidates = None
    if pr.mode == "hybrid":
        fused_candidates = _to_debug_candidates(pr.results)

    return SearchDebug(
        timing_ms=pr.timing_ms,
        text_candidates=text_candidates,
        vector_candidates=vector_candidates,
        fused_candidates=fused_candidates,
        confidence=pr.confidence_signal,
        notes=pr.notes,
        query_analysis=pr.query_analysis,
    )
@router.get("/", response_model=SearchResponse)
async def search(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    background_tasks: BackgroundTasks,
    mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
    limit: int = Query(20, ge=1, le=100),
    fusion: str = Query(
        DEFAULT_FUSION,
        pattern="^(legacy|rrf|rrf_boost)$",
        description="hybrid 모드 fusion 전략 (legacy=기존 가중합, rrf=RRF k=60, rrf_boost=RRF+강한신호 boost)",
    ),
    rerank: bool = Query(
        True,
        description="bge-reranker-v2-m3 활성화 (Phase 1.3, hybrid 모드만 동작)",
    ),
    analyze: bool = Query(
        False,
        description="QueryAnalyzer 활성화 (Phase 2.1, LLM 호출). Phase 2.1은 debug 노출만, 검색 경로 영향 X",
    ),
    debug: bool = Query(False, description="단계별 candidates + timing 응답에 포함"),
    embedding_backend: str | None = Query(
        None,
        pattern=r"^(baseline|cand_[a-z0-9_]+)$",
        description="Phase 2A Diagnose dispatcher (R2-2 + R2-B1). slug 만 받음 (raw table name X). baseline|cand_. 미지정/baseline = production path.",
    ),
    snapshot_doc_id_max: int | None = Query(
        None,
        ge=1,
        description="Phase 2A snapshot freeze (R2-D + R2-B2). documents.id <= 값 filter. baseline 측정 시에도 동일 filter 적용.",
    ),
    snapshot_chunk_id_max: int | None = Query(
        None,
        ge=1,
        description="Phase 2A snapshot freeze (R2-D + R2-B2). document_chunks.id <= 값 filter. baseline 측정 시에도 동일 filter 적용.",
    ),
    reranker_backend: str | None = Query(
        None,
        pattern=r"^(baseline|cand_[a-z0-9_]+)$",
        description="Phase 2B Diagnose reranker dispatcher (R2-B1 slug-based). slug 만 받음 (raw endpoint URL X). baseline|cand_. 미지정/baseline = production reranker.",
    ),
    rewrite_backend: str | None = Query(
        None,
        pattern=r"^(baseline|cand_[a-z0-9_]+)$",
        description=(
            "⚠️ EXPERIMENTAL / DEPRECATED (Phase 2Q closed 2026-05-24 as evaluated experiment). "
            "Result-level dedup 정정 후 net gain marginal (NDCG +0.019, Recall t≥2 +0.030) "
            "vs latency cost 큼 (cold +876%, warm +320%). default production rollout 권고 X. "
            "slug-based, no silent fallback. baseline|cand_multi_query_macmini|cand_multi_query_macbook. "
            "미지정/baseline = single-query path (회귀 0 invariant, 권장 default). "
            "opt-in 실험 reference 만 유지 — docs/phase_2q_apply_opt_in.md 의 closed status 참조."
        ),
    ),
    corpus_variant: str | None = Query(
        None,
        pattern=r"^(prehier|hier_sim_raw|hier_sim_clean)$",
        description=(
            "⚠️ EVAL ONLY (Hier-Replace-Diagnose-1). chunk leg 를 측정 뷰로 교체 — "
            "prehier(legacy baseline) | hier_sim_raw | hier_sim_clean(childless-tiny 제외). "
            "doc-level + fts/trgm 는 documents 테이블 = 변종 무관. 미지정 = production corpus_chunks. "
            "embedding_backend cand 와 동시 사용 불가 (400)."
        ),
    ),
    exact_knn: bool = Query(
        False,
        description=(
            "⚠️ EVAL ONLY (Hier-Replace-Diagnose-1). vector leg 에 SET LOCAL enable_indexscan/"
            "bitmapscan=off → ivfflat 근사 제거(exact seqscan). prehier vs hier_sim 의 index 변수 "
            "분리용. production 검색에는 사용 금지 (latency 큼)."
        ),
    ),
    material_type: str | None = Query(
        None, description="안전 자료실 C-1: 자료유형 필터 CSV (law,paper,incident,...). material_type = ANY"),
    jurisdiction: str | None = Query(
        None, description="안전 자료실 C-1: 관할 필터 (KR/US/EU/JP/GB/INT)"),
    year_from: int | None = Query(None, ge=1900, le=2100, description="published_date 연도 하한 (NULL=created_at fallback)"),
    year_to: int | None = Query(None, ge=1900, le=2100, description="published_date 연도 상한"),
    facets: bool = Query(False, description="안전 자료실 C-1 후속: top-K 결과 분류 축 분포(material_type/jurisdiction/version_status)를 응답 facets 에 집계. 미지정=계산/노출 0"),
):
    """Document search — FTS + ILIKE + vector combined (run_search wrapper since Phase 3.1).

    Delegates to ``run_search`` and then handles logging, telemetry,
    result decoration and serialization. A ``ValueError`` from the pipeline
    is answered with HTTP 400 and a ``RuntimeError`` whose message starts
    with ``rewrite_llm_unavailable`` with HTTP 503; any other
    ``RuntimeError`` propagates.
    """
    try:
        # CSV → list of non-empty, stripped tokens; no parameter → no filter.
        wanted_types = None
        if material_type:
            wanted_types = [part.strip() for part in material_type.split(",") if part.strip()]
        axis = AxisFilter(
            material_types=wanted_types,
            jurisdiction=jurisdiction,
            year_from=year_from,
            year_to=year_to,
        )
        pr = await run_search(
            session,
            q,
            mode=mode,  # type: ignore[arg-type]
            limit=limit,
            fusion=fusion,
            rerank=rerank,
            analyze=analyze,
            embedding_backend=embedding_backend,
            snapshot_doc_id_max=snapshot_doc_id_max,
            snapshot_chunk_id_max=snapshot_chunk_id_max,
            reranker_backend=reranker_backend,
            rewrite_backend=rewrite_backend,
            corpus_variant=corpus_variant,
            exact_knn=exact_knn,
            axis=axis,
        )
    except ValueError as exc:
        # Unknown slug from _resolve_backend / _resolve_reranker /
        # _resolve_rewrite_backend / _resolve_corpus_variant → HTTP 400.
        # The message prefix selects the payload; anything unrecognized is
        # reported as an embedding-backend error.
        detail = str(exc)
        if detail.startswith(("unknown_corpus_variant", "corpus_variant_incompatible")):
            payload = {
                "error_reason": detail.split(":")[0].split(" ")[0],
                "corpus_variant_requested": corpus_variant,
                "allowed": ["prehier", "hier_sim_raw", "hier_sim_clean"],
                "detail": detail,
            }
        elif detail.startswith("unknown_rewrite_backend"):
            payload = {
                "error_reason": "unknown_rewrite_backend",
                "backend_requested": rewrite_backend,
                "allowed": query_rewriter.allowed_slugs(),
                "detail": detail,
            }
        elif detail.startswith("unknown_reranker_backend"):
            payload = {
                "error_reason": "unknown_reranker_backend",
                "backend_requested": reranker_backend,
                "allowed": ["baseline", "cand_gte_ml_base"],
                "detail": detail,
            }
        else:
            payload = {
                "error_reason": "unknown_embedding_backend",
                "backend_requested": embedding_backend,
                "allowed": ["baseline", "cand_me5_large_inst", "cand_snowflake_l_v2"],
                "detail": detail,
            }
        return JSONResponse(status_code=400, content=payload)
    except RuntimeError as exc:
        # query_rewriter.rewrite() failure (LLM unavailable / parse fail) → HTTP 503
        detail = str(exc)
        if not detail.startswith("rewrite_llm_unavailable"):
            raise
        return JSONResponse(
            status_code=503,
            content={
                "error_reason": "rewrite_llm_unavailable",
                "backend_requested": rewrite_backend,
                "detail": detail,
            },
        )

    # User feedback: per-stage timing is always logged, independent of the
    # debug response.
    timing_parts = [f"{stage}={elapsed:.0f}" for stage, elapsed in pr.timing_ms.items()]
    timing_str = " ".join(timing_parts)
    fusion_str = ""
    if mode == "hybrid":
        fusion_str = f" fusion={fusion}"
    analyzer_str = ""
    if analyze:
        analyzer_str = (
            f" analyzer=hit={pr.analyzer_cache_hit}/conf={pr.analyzer_confidence:.2f}/tier={pr.analyzer_tier}"
        )
    logger.info(
        "search query=%r mode=%s%s%s results=%d conf=%.2f %s",
        q[:80],
        pr.mode,
        fusion_str,
        analyzer_str,
        len(pr.results),
        pr.confidence_signal,
        timing_str,
    )

    # Phase 0.3: automatic failure logging (background task, so response
    # latency is unaffected).
    # Phase 2.1: analyzer_confidence is forwarded only when analyze=true
    # (otherwise None, keeping the previous behavior).
    analyzer_confidence = pr.analyzer_confidence if analyze else None
    background_tasks.add_task(
        record_search_event,
        q,
        user.id,
        pr.results,
        pr.mode,
        pr.confidence_signal,
        analyzer_confidence,
    )

    debug_obj = None
    if debug:
        debug_obj = _build_search_debug(pr)

    # Safety library C-1 follow-up — wrapper decoration (search core
    # untouched, unrelated to ranking): version_status on law results.
    await decorate_version_status(session, pr.results)
    facets_obj = None
    if facets:
        facets_obj = compute_facets(pr.results)

    return SearchResponse(
        results=pr.results,
        total=len(pr.results),
        query=q,
        mode=pr.mode,
        debug=debug_obj,
        facets=facets_obj,
    )
# ═══════════════════════════════════════════════════════════
# Phase 3.3: /api/search/ask — Evidence + Grounded Synthesis
# ═══════════════════════════════════════════════════════════
class Citation(BaseModel):
    """A single evidence row corresponding to an [n] marker in the answer body."""

    n: int
    chunk_id: int | None
    doc_id: int
    title: str | None
    section_title: str | None
    span_text: str  # 50–300 characters extracted by the evidence LLM
    full_snippet: str  # original 800 characters (only for viewing the citation source)
    relevance: float
    rerank_score: float


class ConfirmedItem(BaseModel):
    """Answer for one individual aspect of a partial answer."""

    aspect: str
    text: str
    citations: list[int]


class AskDebug(BaseModel):
    """Extension of the `/ask?debug=true` response."""

    timing_ms: dict[str, float]
    search_notes: list[str]
    query_analysis: dict | None = None
    confidence_signal: float
    evidence_candidate_count: int
    evidence_kept_count: int
    evidence_skip_reason: str | None
    synthesis_cache_hit: bool
    synthesis_prompt_preview: str | None = None
    synthesis_raw_preview: str | None = None
    hallucination_flags: list[str] = []
    # Phase 3.5a: per-layer defense logging
    defense_layers: dict | None = None


class AskResponse(BaseModel):
    """`/ask` response. Phase 3.5a: completeness + aspects added."""

    results: list[SearchResult]
    ai_answer: str | None
    citations: list[Citation]
    synthesis_status: Literal[
        "completed",
        "timeout",
        "skipped",
        "no_evidence",
        "parse_failed",
        "llm_error",
        # PR-MacBook-RAG-Backend-1: never appears in a 200 response (this
        # status takes the 503 branch). Included for Literal compatibility.
        "backend_unavailable",
    ]
    synthesis_ms: float
    confidence: Literal["high", "medium", "low"] | None
    refused: bool
    no_results_reason: str | None
    query: str
    total: int
    # Phase 3.5a
    completeness: Literal["full", "partial", "insufficient"] = "full"
    covered_aspects: list[str] | None = None
    missing_aspects: list[str] | None = None
    confirmed_items: list[ConfirmedItem] | None = None
    # PR-MacBook-RAG-Backend-1: backend dispatcher metadata.
    # Calls that do not specify a backend keep both as None (compatibility
    # with existing callers — zero change to the Hermes docsrv_ask /
    # voice-memo-bot response format). Filled only on explicit opt-in.
    backend_requested: str | None = None
    backend_used: str | None = None
    debug: AskDebug | None = None
def _map_no_results_reason(
    pr: PipelineResult,
    evidence: list[EvidenceItem],
    ev_skip: str | None,
    sr: SynthesisResult,
) -> str | None:
    """Pick the Korean user-facing message explaining a missing answer.

    Based on the failure-mode table (plan §Failure Modes). Precedence:
    the model's own refusal reason, then the synthesis status, then the
    evidence-stage skip reason. Returns None when nothing applies.
    """
    # LLM self-refusal → pass the model's reason through unchanged.
    if sr.refused and sr.refuse_reason:
        return sr.refuse_reason

    # Synthesis status takes priority over the evidence stage.
    status = sr.status
    if status == "no_evidence":
        return "관련도 높은 근거를 찾지 못했습니다." if pr.results else "검색 결과가 없습니다."
    by_status = {
        "skipped": "검색 결과가 없습니다.",
        "timeout": "답변 생성이 지연되어 생략했습니다. 검색 결과를 확인해 주세요.",
        "parse_failed": "답변 형식 오류로 생략했습니다.",
        "llm_error": "AI 서버에 일시적 문제가 있습니다.",
    }
    if status in by_status:
        return by_status[status]

    # Evidence-stage failure: kept for notes even when a fallback was taken.
    by_skip = {
        "all_low_rerank": "관련도 높은 근거를 찾지 못했습니다.",
        "empty_retrieval": "검색 결과가 없습니다.",
    }
    return by_skip.get(ev_skip)


def _build_citations(
    evidence: list[EvidenceItem], used_citations: list[int]
) -> list[Citation]:
    """Convert only the citation numbers that appear in the answer into Citation rows.

    Numbers without a matching evidence item are dropped; output order
    follows ``used_citations``.
    """
    evidence_by_n = {item.n: item for item in evidence}
    return [
        Citation(
            n=item.n,
            chunk_id=item.chunk_id,
            doc_id=item.doc_id,
            title=item.title,
            section_title=item.section_title,
            span_text=item.span_text,
            full_snippet=item.full_snippet,
            relevance=item.relevance,
            rerank_score=item.rerank_score,
        )
        for number in used_citations
        if (item := evidence_by_n.get(number)) is not None
    ]


def _build_ask_debug(
    pr: PipelineResult,
    evidence: list[EvidenceItem],
    ev_skip: str | None,
    sr: SynthesisResult,
    ev_ms: float,
    synth_ms: float,
    total_ms: float,
) -> AskDebug:
    """Assemble the AskDebug payload from pipeline, evidence and synthesis state."""
    # Pipeline timings extended with the ask-specific stages.
    timing: dict[str, float] = {
        **pr.timing_ms,
        "evidence_ms": ev_ms,
        "synthesis_ms": synth_ms,
        "ask_total_ms": total_ms,
    }
    # The true candidate count (items passing the rule filter) is internal to
    # evidence_service and not observable here, so len(evidence) is reported
    # for both kept and candidate counts (debug-only approximation).
    kept = len(evidence)
    return AskDebug(
        timing_ms=timing,
        search_notes=pr.notes,
        query_analysis=pr.query_analysis,
        confidence_signal=pr.confidence_signal,
        evidence_candidate_count=kept,
        evidence_kept_count=kept,
        evidence_skip_reason=ev_skip,
        synthesis_cache_hit=sr.cache_hit,
        synthesis_prompt_preview=None,  # not currently exposed by synthesis_service
        synthesis_raw_preview=sr.raw_preview,
        hallucination_flags=sr.hallucination_flags,
    )
def _detect_synthesis_failure(sr: SynthesisResult) -> str | None:
    """Return a re_gate label when synthesis produced no usable answer, else None.

    Decision order (Phase 3.5 fix3):
      1) ``sr.refused`` — LLM self-refusal (status "completed") or a
         mechanical failure that propagated as refused:
         - status == "completed" and refused → "synthesis_self_refuse"
         - otherwise → f"synthesis_failed({status})"
      2) status in {timeout, parse_failed, llm_error}
         → f"synthesis_failed({status})"
      3) blank answer → f"synthesis_failed({status})"
      4) valid → None
    """
    if sr.refused:
        if sr.status == "completed":
            return "synthesis_self_refuse"
        return f"synthesis_failed({sr.status})"
    if sr.status in ("timeout", "parse_failed", "llm_error"):
        return f"synthesis_failed({sr.status})"
    # None and whitespace-only answers both count as blank.
    if not (sr.answer or "").strip():
        return f"synthesis_failed({sr.status})"
    return None


def _resolve_eval_identity(
    x_source: str | None,
    x_eval_case_id: str | None,
    x_eval_token: str | None,
) -> tuple[str, str | None]:
    """Validate the trust of X-Source / X-Eval-Case-Id (Phase 3.5 fix2).

    Rules:
      - A request is an "eval claim" when the sanitized X-Source is 'eval'
        or X-Eval-Case-Id is present.
      - An eval claim is accepted only when X-Eval-Token equals
        ``settings.eval_runner_token`` (constant-time comparison; always
        rejected when the setting is empty).
      - On rejection the headers are ignored and a warning is logged: a
        claimed source of 'eval' is replaced by 'document_server', any other
        sanitized source is kept, and eval_case_id is None.
      - On success: source='eval', eval_case_id=x_eval_case_id.

    Args:
        x_source: Raw X-Source header value, if any.
        x_eval_case_id: Raw X-Eval-Case-Id header value, if any.
        x_eval_token: Raw X-Eval-Token header value, if any.

    Returns:
        ``(source, eval_case_id)``.
    """
    claimed_source = sanitize_source(x_source)
    is_eval_claim = (claimed_source == "eval") or bool(x_eval_case_id)

    if not is_eval_claim:
        # Ordinary call — eval_case_id is forced to None (a case id is
        # meaningless when source != 'eval').
        return claimed_source, None

    # Eval claim — verify the token.
    expected = settings.eval_runner_token
    presented = x_eval_token or ""
    # Compare as UTF-8 bytes: hmac.compare_digest raises TypeError for str
    # arguments containing non-ASCII characters, and the presented token is
    # client-controlled, so a str comparison could turn a bad header into an
    # unhandled error instead of a rejection.
    token_valid = bool(expected) and hmac.compare_digest(
        presented.encode("utf-8"), expected.encode("utf-8")
    )
    if not token_valid:
        logger.warning(
            "eval header rejected: source=%s case_id=%s token_present=%s expected_set=%s",
            x_source,
            x_eval_case_id,
            bool(x_eval_token),
            bool(expected),
        )
        # Demote to an ordinary call — the 'eval' claim and the case id are
        # both ignored; a claimed 'eval' falls back to 'document_server'.
        if claimed_source == "eval":
            return "document_server", None
        return claimed_source, None

    # Token OK — accept the eval label.
    return "eval", x_eval_case_id
@router.get("/ask", response_model=AskResponse)
async def ask(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    background_tasks: BackgroundTasks,
    limit: int = Query(10, ge=1, le=20, description="synthesis 입력 상한"),
    debug: bool = Query(False, description="evidence/synthesis 중간 상태 노출"),
    backend: Annotated[
        str | None,
        Query(
            pattern="^(qwen-macbook|gemma-macmini|mac-mini-default|claude-cloud|auto)$",
            description=(
                "PR-2 of DS AI routing policy (2026-05-23) — 명시 backend opt-in via llm-router. "
                "미지정 = mac-mini-default (gemma-macmini alias, default). "
                "'mac-mini-default' = router 가 tier_b (Mac mini gemma-4-26b). "
                "'qwen-macbook' = router 가 named upstream (M5 Max Qwen 3.6 27B). "
                "'claude-cloud' = router 가 503 provider_not_configured (활성화 별 PR). "
                "'auto' = router 의 rule + LLM triage. "
                "backend unavailable 시 503 + error_reason=macbook_unavailable / router_* "
                "(자동 fallback 없음 — 다시 호출하거나 backend 인자 제거 후 재시도)."
            ),
        ),
    ] = None,
    corpus_variant: str | None = Query(
        None,
        pattern=r"^(prehier|hier_sim_raw|hier_sim_clean)$",
        description=(
            "⚠️ EVAL-ONLY (Hier-PassageRAG-Diagnose-1). evidence retrieval 의 chunk leg 를 측정 뷰로 "
            "교체 — prehier(legacy) | hier_sim_raw | hier_sim_clean. 운영 UI 미사용. "
            "미지정 = production corpus_chunks (기존 /ask 동작 동일)."
        ),
    ),
    exact_knn: bool = Query(
        False,
        description=(
            "⚠️ EVAL-ONLY (Hier-PassageRAG-Diagnose-1). vector leg exact KNN (ivfflat 근사 제거). "
            "passage 변종 공정 비교용. 운영 미사용. 미지정(false) = 기존 /ask 동작 동일."
        ),
    ),
    x_source: Annotated[str | None, Header(alias="X-Source")] = None,
    x_eval_case_id: Annotated[str | None, Header(alias="X-Eval-Case-Id")] = None,
    x_eval_token: Annotated[str | None, Header(alias="X-Eval-Token")] = None,
):
    """Evidence-grounded AI answer (Phase 3.5a).

    Built on Phase 3.3 plus a parallel classifier, a refusal gate and a
    grounding re-gate. `results` is returned on the refusal path and on the
    normal path; the backend_unavailable path returns a 503 JSON body
    without `results`.

    Phase 3.5 calibration trust boundary (fix2):
      - X-Source / X-Eval-Case-Id are accepted only from a trusted internal
        eval runner whose X-Eval-Token matches EVAL_RUNNER_TOKEN.
      - An ordinary client's X-Source=eval attempt is ignored and source is
        forced to 'document_server'.
      - eval_case_id is always None when source != 'eval'.
    """
    t_total = time.perf_counter()
    defense_log: dict = {}  # per-layer flag snapshot
    source, eval_case_id = _resolve_eval_identity(x_source, x_eval_case_id, x_eval_token)

    # 1. Search pipeline (corpus_variant/exact_knn are EVAL-ONLY; when
    #    unspecified the behavior is identical to before)
    pr = await run_search(
        session,
        q,
        mode="hybrid",
        limit=limit,
        fusion=DEFAULT_FUSION,
        rerank=True,
        analyze=True,
        corpus_variant=corpus_variant,
        exact_knn=exact_knn,
    )

    # 1.5. Exclude documents with ask_includable=false from the evidence input.
    # The search results themselves are kept (shown to the user); only the
    # evidence input is filtered.
    if pr.results:
        from sqlalchemy import select as sa_select
        from models.document import Document as DocModel

        # NOTE(review): the two names read inverted relative to their content —
        # `excluded_ids` holds every result doc id used for the lookup, while
        # `ask_doc_ids` collects the ids that get dropped from evidence.
        ask_doc_ids = set()
        excluded_ids = {r.id for r in pr.results}
        rows = await session.execute(
            sa_select(DocModel.id, DocModel.ask_includable).where(
                DocModel.id.in_(excluded_ids)
            )
        )
        for doc_id, includable in rows:
            # Only an explicit False excludes; any other value keeps the document.
            if includable is False:
                ask_doc_ids.add(doc_id)
        evidence_results = [r for r in pr.results if r.id not in ask_doc_ids]
    else:
        evidence_results = pr.results

    # 2. Evidence + classifier in parallel
    t_ev = time.perf_counter()
    evidence_task = asyncio.create_task(extract_evidence(q, evidence_results))

    # Classifier input: metadata of the top 3 chunks + rerank scores
    # (taken from the unfiltered pr.results).
    top_chunks = [
        {
            "title": r.title or "",
            "section": r.section_title or "",
            "snippet": (r.snippet or "")[:200],
        }
        for r in pr.results[:3]
    ]
    rerank_scores_top = [
        r.rerank_score if r.rerank_score is not None else r.score
        for r in pr.results[:3]
    ]
    classifier_task = asyncio.create_task(
        classify(q, top_chunks, rerank_scores_top)
    )

    evidence, ev_skip = await evidence_task
    ev_ms = (time.perf_counter() - t_ev) * 1000

    # Await the classifier (timeout guard — classifier_service has one
    # internally too; this is a second layer).
    # 2026-05-17: the 6s outer wrapper overrode classifier_service.LLM_TIMEOUT_MS
    # (30s), so under concurrent load almost every classifier call timed out and
    # took the conservative_refuse(no_classifier) path. Raised to 15s so the
    # classifier actually runs (note the effect on the overall /ask latency
    # bound: ev_ms + max(classifier_wait, evidence_extract) + synth_ms +
    # verifier accumulate).
    # 2026-05-17 B-3: 15s was still insufficient under concurrent load
    # (misaligned with classifier_service LLM_TIMEOUT_MS 30s). Aligned to 30s
    # for stable classifier operation; the higher latency bound is intentional.
    try:
        classifier_result = await asyncio.wait_for(classifier_task, timeout=30.0)
    except (asyncio.TimeoutError, Exception):
        # NOTE(review): `Exception` already covers TimeoutError, so every
        # classifier failure is reported as status "timeout" and nothing is
        # logged here.
        classifier_result = ClassifierResult("timeout", None, [], [], 0.0)

    defense_log["classifier"] = {
        "status": classifier_result.status,
        "verdict": classifier_result.verdict,
        "covered_aspects": classifier_result.covered_aspects,
        "missing_aspects": classifier_result.missing_aspects,
        "elapsed_ms": classifier_result.elapsed_ms,
    }

    # 3. Refusal gate (multi-signal fusion)
    # Uses the evidence rerank scores when evidence exists, otherwise the
    # top-3 search scores.
    all_rerank_scores = [
        e.rerank_score for e in evidence
    ] if evidence else rerank_scores_top
    decision = refusal_decide(all_rerank_scores, classifier_result)

    defense_log["score_gate"] = {
        "max": max(all_rerank_scores) if all_rerank_scores else 0.0,
        "agg_top3": sum(sorted(all_rerank_scores, reverse=True)[:3]),
    }
    defense_log["refusal"] = {
        "refused": decision.refused,
        "rule_triggered": decision.rule_triggered,
    }

    if decision.refused:
        total_ms = (time.perf_counter() - t_total) * 1000
        no_reason = "관련 근거를 찾지 못했습니다."
        if not pr.results:
            no_reason = "검색 결과가 없습니다."

        logger.info(
            "ask REFUSED query=%r rule=%s max_score=%.2f total=%.0f",
            q[:80],
            decision.rule_triggered,
            max(all_rerank_scores) if all_rerank_scores else 0.0,
            total_ms,
        )
        # Telemetry — both the search and the ask_events paths
        background_tasks.add_task(
            record_search_event,
            q,
            user.id,
            pr.results,
            "hybrid",
            pr.confidence_signal,
            pr.analyzer_confidence,
        )
        # input_snapshot (for debugging / reproduction)
        defense_log["input_snapshot"] = {
            "query": q,
            "top_chunks_preview": [
                {"title": c.get("title", ""), "snippet": c.get("snippet", "")[:100]}
                for c in top_chunks[:3]
            ],
            "answer_preview": None,
        }
        background_tasks.add_task(
            record_ask_event,
            q,
            user.id,
            "insufficient",
            "skipped",
            None,
            True,
            classifier_result.verdict,
            max(all_rerank_scores) if all_rerank_scores else 0.0,
            sum(sorted(all_rerank_scores, reverse=True)[:3]),
            [],
            len(evidence),
            0,
            defense_log,
            int(total_ms),
            # Phase E.1 measurement fields
            answer_length=0,
            covered_aspects=classifier_result.covered_aspects or None,
            missing_aspects=classifier_result.missing_aspects or None,
            model_name=resolve_primary_model(),
            prompt_version=ASK_PROMPT_VERSION,
            # Phase 3.5 calibration
            source=source,
            eval_case_id=eval_case_id,
        )
        debug_obj = None
        if debug:
            debug_obj = AskDebug(
                timing_ms={**pr.timing_ms, "evidence_ms": ev_ms, "ask_total_ms": total_ms},
                search_notes=pr.notes,
                confidence_signal=pr.confidence_signal,
                evidence_candidate_count=len(evidence),
                evidence_kept_count=len(evidence),
                evidence_skip_reason=ev_skip,
                synthesis_cache_hit=False,
                hallucination_flags=[],
                defense_layers=defense_log,
            )
        return AskResponse(
            results=pr.results,
            ai_answer=None,
            citations=[],
            synthesis_status="skipped",
            synthesis_ms=0.0,
            confidence=None,
            refused=True,
            no_results_reason=no_reason,
            query=q,
            total=len(pr.results),
            completeness="insufficient",
            covered_aspects=classifier_result.covered_aspects or None,
            missing_aspects=classifier_result.missing_aspects or None,
            # At the refusal-gate stage no backend call happens at all →
            # backend_used = None. backend_requested only records the
            # caller's intent.
            backend_requested=backend,
            backend_used=None,
            debug=debug_obj,
        )

    # 4. Synthesis (backend dispatcher applied — PR-MacBook-RAG-Backend-1)
    t_synth = time.perf_counter()
    sr = await synthesize(q, evidence, debug=debug, backend=backend)
    synth_ms = (time.perf_counter() - t_synth) * 1000

    # 4.1. backend_unavailable → 503 fail-fast (automatic fallback forbidden).
    # Only occurs when an explicitly opted-in backend (e.g. qwen-macbook) is
    # unavailable. The /ask wrapper never retries with another backend; the
    # user drops the backend argument or retries after waking the machine.
    if sr.status == "backend_unavailable":
        backend_requested_val = backend or "gemma-macmini"
        total_ms = (time.perf_counter() - t_total) * 1000
        logger.warning(
            "ask backend_unavailable backend=%s query=%r total_ms=%.0f flags=%s",
            backend_requested_val,
            q[:80],
            total_ms,
            ",".join(sr.hallucination_flags) if sr.hallucination_flags else "-",
        )
        # error_reason naming — only macbook_unavailable is established
        # (no automatic fallback).
        error_reason = (
            "macbook_unavailable"
            if backend_requested_val == "qwen-macbook"
            else "backend_unavailable"
        )
        # Telemetry — search only (ask_events is reserved for the 200 path)
        background_tasks.add_task(
            record_search_event,
            q,
            user.id,
            pr.results,
            "hybrid",
            pr.confidence_signal,
            pr.analyzer_confidence,
        )
        return JSONResponse(
            status_code=503,
            content={
                "error": "backend_unavailable",
                "error_reason": error_reason,
                "backend_requested": backend_requested_val,
                "backend_used": None,
                "query": q,
                "detail": (
                    "명시 선택한 backend 가 일시적으로 응답할 수 없습니다. "
                    "MacBook 깨우거나 backend 인자를 제거하고 (기본 Gemma) 다시 호출하세요."
                ),
            },
        )

    # 5. Grounding check + verifier (conditionally parallel) + re-gate (Phase 3.5b)
    grounding = grounding_check(q, sr.answer or "", evidence)

    # Verifier skip: grounding strong 2+ OR retrieval itself failed
    grounding_only_strong = [
        f for f in grounding.strong_flags if not f.startswith("verifier_")
    ]
    max_rerank = max(all_rerank_scores, default=0.0)
    if len(grounding_only_strong) >= 2 or max_rerank < 0.2:
        verifier_result = VerifierResult("skipped", [], 0.0)
    else:
        verifier_task = asyncio.create_task(
            verify(q, sr.answer or "", evidence)
        )
        # 2026-05-17 B-3: the 4s outer wait_for overrode verifier_service
        # LLM_TIMEOUT_MS (10s) — same pattern as the classifier (the case
        # where search.py:522 was changed from 6s to 15s). Aligned to 10s.
        try:
            verifier_result = await asyncio.wait_for(verifier_task, timeout=10.0)
        except (asyncio.TimeoutError, Exception):
            # NOTE(review): as with the classifier, every failure is reported
            # as "timeout" without logging.
            verifier_result = VerifierResult("timeout", [], 0.0)

    # Verifier contradictions → merged into grounding flags (distinguished by
    # prefix, three severity levels)
    for c in verifier_result.contradictions:
        if c.severity == "strong":
            grounding.strong_flags.append(f"verifier_{c.type}:{c.claim[:30]}")
        elif c.severity == "medium":
            grounding.weak_flags.append(f"verifier_{c.type}_medium:{c.claim[:30]}")
        else:
            grounding.weak_flags.append(f"verifier_{c.type}:{c.claim[:30]}")

    defense_log["evidence"] = {
        "skip_reason": ev_skip,
        "kept_count": len(evidence),
    }
    defense_log["grounding"] = {
        "strong": grounding.strong_flags,
        "weak": grounding.weak_flags,
    }
    defense_log["verifier"] = {
        "status": verifier_result.status,
        "contradictions_count": len(verifier_result.contradictions),
        "strong_count": sum(1 for c in verifier_result.contradictions if c.severity == "strong"),
        "medium_count": sum(1 for c in verifier_result.contradictions if c.severity == "medium"),
        "elapsed_ms": verifier_result.elapsed_ms,
    }

    # ── Re-gate: 7-tier completeness decision (Phase 3.5 B2 — Tier 4 newly
    #    inserted, tiers renumbered) ──
    # Previous 6 tiers (3.5b, 4th review) + Tier 4 (g_strong +
    # v_strong_numeric + low_conf → refuse).
    # Compatibility: the string literals in defense_layers["re_gate"] are kept
    # as they were. Only the new "refuse(grounding+verifier_numeric)" is added.
    completeness: Literal["full", "partial", "insufficient"] = "full"
    covered_aspects = classifier_result.covered_aspects or None
    missing_aspects = classifier_result.missing_aspects or None
    confirmed_items: list[ConfirmedItem] | None = None

    # Separate verifier vs grounding strong flags
    g_strong = [f for f in grounding.strong_flags if not f.startswith("verifier_")]
    v_strong = [f for f in grounding.strong_flags if f.startswith("verifier_")]
    v_medium = [f for f in grounding.weak_flags if f.startswith("verifier_") and "_medium:" in f]
    has_direct_negation = any("direct_negation" in f for f in v_strong)
    # Phase 3.5 B2: count only numeric_conflict among the verifier strong flags.
    # With promote (VERIFIER_NUMERIC_PROMOTE=1) active, a critical
    # numeric_conflict is promoted to strong and is counted here. With promote
    # off this is always 0 → Tier 4 never activates (previous behavior kept).
    v_strong_numeric = sum(
        1 for f in v_strong if f.startswith("verifier_numeric_conflict")
    )

    # ── Tier 0 (Phase 3.5 fix3): handle failure of synthesis itself ──
    # LLM self-refusal, mechanical failure (timeout/parse_failed/llm_error),
    # blank answer. For an empty answer there are zero grounding/verifier
    # flags, so the old chain fell through to "else clean" and kept the
    # initial completeness="full" — that contradiction is blocked here.
    # In the past baseline (v1-400char), 20 (self-refuse) + 4 (timeout)
    # = 24/223 (10.8%) fell into this case.
    tier0_label = _detect_synthesis_failure(sr)
    if tier0_label:
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = tier0_label
    elif len(g_strong) >= 2:
        # Tier 1: grounding strong 2+ → refuse
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding_2+strong)"
    elif g_strong and has_direct_negation:
        # Tier 2: grounding strong + verifier direct_negation → refuse
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding+direct_negation)"
    elif g_strong and sr.confidence == "low" and max_rerank < 0.25:
        # Tier 3: grounding strong 1 + (low confidence AND weak evidence) → refuse
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding+low_conf+weak_ev)"
    elif g_strong and v_strong_numeric >= 1 and sr.confidence == "low":
        # Tier 4 (new in B2): grounding strong + verifier numeric_conflict
        # strong + low conf → refuse.
        # The rule "never refuse on verifier strong alone" still holds —
        # crossing with g_strong is required.
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding+verifier_numeric)"
    elif g_strong or has_direct_negation:
        # Tier 5 (formerly 4): grounding strong 1, or verifier direct_negation
        # alone → partial
        completeness = "partial"
        sr.confidence = "low"
        defense_log["re_gate"] = "partial(strong_or_negation)"
    elif v_medium:
        # Tier 6 (formerly 5): accumulated verifier medium → count-based
        # confidence downgrade
        medium_count = len(v_medium)
        if medium_count >= 3:
            sr.confidence = "low"
            defense_log["re_gate"] = f"conf_low(medium_x{medium_count})"
        elif medium_count == 2 and sr.confidence == "high":
            sr.confidence = "medium"
            defense_log["re_gate"] = "conf_cap_medium(medium_x2)"
        else:
            defense_log["re_gate"] = f"medium_x{medium_count}(no_action)"
    elif grounding.weak_flags:
        # Tier 7 (formerly 6): weak → confidence lowered by one step
        if sr.confidence == "high":
            sr.confidence = "medium"
        defense_log["re_gate"] = "conf_lower(weak)"
    else:
        defense_log["re_gate"] = "clean"

    # Confidence cap from the refusal gate (conservative when the classifier
    # is absent)
    if decision.confidence_cap and sr.confidence:
        conf_rank = {"low": 0, "medium": 1, "high": 2}
        if conf_rank.get(sr.confidence, 0) > conf_rank.get(decision.confidence_cap, 2):
            sr.confidence = decision.confidence_cap

    # When partial, max confidence = medium
    if completeness == "partial" and sr.confidence == "high":
        sr.confidence = "medium"

    sr.hallucination_flags.extend(
        [f"strong:{f}" for f in grounding.strong_flags]
        + [f"weak:{f}" for f in grounding.weak_flags]
    )

    total_ms = (time.perf_counter() - t_total) * 1000

    # 6. Build the response
    citations = _build_citations(evidence, sr.used_citations)
    no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr)
    if completeness == "insufficient" and not no_reason:
        # Tier 0 path: for a synthesis self-refusal the reason given by the
        # LLM is the most accurate.
        if sr.refused and sr.refuse_reason:
            no_reason = sr.refuse_reason
        else:
            no_reason = "답변 검증에서 복수 오류 감지"

    logger.info(
        "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s "
        "refused=%s grounding_strong=%d grounding_weak=%d ev_ms=%.0f synth_ms=%.0f total=%.0f",
        q[:80],
        len(pr.results),
        len(evidence),
        len(citations),
        sr.status,
        sr.confidence or "-",
        completeness,
        sr.refused,
        len(grounding.strong_flags),
        len(grounding.weak_flags),
        ev_ms,
        synth_ms,
        total_ms,
    )

    # 7. Telemetry — both the search and the ask_events paths
    background_tasks.add_task(
        record_search_event,
        q,
        user.id,
        pr.results,
        "hybrid",
        pr.confidence_signal,
        pr.analyzer_confidence,
    )
    # input_snapshot (for debugging / reproduction)
    defense_log["input_snapshot"] = {
        "query": q,
        "top_chunks_preview": [
            {"title": (r.title or "")[:50], "snippet": (r.snippet or "")[:100]}
            for r in pr.results[:3]
        ],
        "answer_preview": (sr.answer or "")[:200],
    }
    background_tasks.add_task(
        record_ask_event,
        q,
        user.id,
        completeness,
        sr.status,
        sr.confidence,
        sr.refused,
        classifier_result.verdict,
        max(all_rerank_scores) if all_rerank_scores else 0.0,
        sum(sorted(all_rerank_scores, reverse=True)[:3]),
        sr.hallucination_flags,
        len(evidence),
        len(citations),
        defense_log,
        int(total_ms),
        # Phase E.1 measurement fields
        answer_length=len(sr.answer or ""),
        covered_aspects=covered_aspects,
        missing_aspects=missing_aspects,
        model_name=resolve_primary_model(),
        prompt_version=ASK_PROMPT_VERSION,
        # Phase 3.5 calibration
        source=source,
        eval_case_id=eval_case_id,
    )

    debug_obj = None
    if debug:
        timing = dict(pr.timing_ms)
        timing["evidence_ms"] = ev_ms
        timing["synthesis_ms"] = synth_ms
        timing["ask_total_ms"] = total_ms
        debug_obj = AskDebug(
            timing_ms=timing,
            search_notes=pr.notes,
            query_analysis=pr.query_analysis,
            confidence_signal=pr.confidence_signal,
            evidence_candidate_count=len(evidence),
            evidence_kept_count=len(evidence),
            evidence_skip_reason=ev_skip,
            synthesis_cache_hit=sr.cache_hit,
            synthesis_raw_preview=sr.raw_preview,
            hallucination_flags=sr.hallucination_flags,
            defense_layers=defense_log,
        )

    # backend_used: the backend synthesize actually called (trusting the
    # backend argument as-is is fine — backend_unavailable already returned
    # in the 503 branch above).
    backend_used_val = backend or "gemma-macmini"

    return AskResponse(
        results=pr.results,
        ai_answer=sr.answer,
        citations=citations,
        synthesis_status=sr.status,
        synthesis_ms=sr.elapsed_ms,
        confidence=sr.confidence,
        refused=sr.refused,
        no_results_reason=no_reason,
        query=q,
        total=len(pr.results),
        completeness=completeness,
        covered_aspects=covered_aspects,
        missing_aspects=missing_aspects,
        confirmed_items=confirmed_items,
        backend_requested=backend,
        backend_used=backend_used_val,
        debug=debug_obj,
    )


# ─── PR-DocSrv-Ask-ToolCalling-ReAct-1 ────────────────────────────────────
# /api/search/ask/react — ReAct loop using Qwen native tool calling.
# This endpoint is qwen-macbook only (the endpoint itself is an implicit opt-in).
# When the MacBook is unavailable: 503 + error_reason=macbook_unavailable.
# No automatic Gemma fallback.
# G0-2 counter semantics: max_tool_rounds=2, max LLM calls=3, search exec ≤ 2.
# G0-3 trace exposure: debug_trace=None in the default response; filled only
# when debug=True.
class AskReactRequest(BaseModel):
    """Request body of the ReAct ask endpoint."""

    query: str
    debug: bool = False


class AskReactResponse(BaseModel):
    """Response body of the ReAct ask endpoint."""

    final_answer: str
    iterations: int
    partial: bool
    sources: list[dict]
    debug_trace: list[dict] | None = None
request body: - query: str (사용자 원본 질의) - debug: bool (default false; true 시 응답 `debug_trace` 채움) response body (성공 200): - final_answer: str (Qwen 종합문, partial 일 수 있음) - iterations: int (실제 진행된 tool round 수) - partial: bool (max_tool_rounds 도달 후 LLM content 비었을 때 true) - sources: list[dict] (검색에서 모인 evidence 메타, id-기준 dedup) - debug_trace: list[dict] | null (debug=true 시 round 별 trace) """ # 지연 import — 순환 의존성 회피 (react_loop 가 api.search.SearchResult 사용 안 함) from services.llm.backends import BackendUnavailable, get_backend from services.search.react_loop import agentic_ask_loop backend_inst = get_backend("qwen-macbook") # PR-2 of DS AI routing policy: backend_inst may be RouterBackend (default) # or QwenMacBookBackend (DS_BACKENDS_VIA_ROUTER=false rollback). Both # implement generate_with_tools so the ReAct loop is identical. assert hasattr(backend_inst, "generate_with_tools") try: result = await agentic_ask_loop( session, payload.query, backend=backend_inst, debug=payload.debug, ) except BackendUnavailable as exc: logger.warning( "ask_react backend unavailable backend=%s reason=%s", exc.backend_name, exc.reason, ) return JSONResponse( status_code=503, content={ "error_reason": "macbook_unavailable", "backend_requested": "qwen-macbook", "backend_used": None, "detail": exc.reason, }, ) return AskReactResponse( final_answer=result.final_answer, iterations=result.iterations, partial=result.partial, sources=result.sources, debug_trace=result.debug_trace, )