3971cf08d2
이전 버그: synthesis LLM self-refuse(status=completed + refused=True) 또는
timeout/parse_failed/llm_error/empty answer 시 grounding/verifier flag 가 0건이라
re-gate 체인이 `else clean` 분기로 빠지며 `completeness="full"` 초기값이 보존됨.
결과: `completeness=full + refused=True + re_gate=clean` 모순 row 생성.
실측: baseline v1-400char (2026-04-17) 223 row 중 24 (10.8%) 해당.
- LLM self-refuse: 20 (completed + refused=True)
- synthesis timeout: 4 (timeout + refused=False + empty answer)
수정: re-gate 최상위에 Tier 0 삽입 + 판정 로직을 `_detect_synthesis_failure()`
helper 로 분리. self-refuse 는 `synthesis_self_refuse`, 메커니즘 실패는
`synthesis_failed({status})` 라벨로 구분. no_reason fallback 도 refuse_reason 우선
활용하도록 보강.
테스트: tests/test_synthesis_failure_regate.py — self-refuse / timeout /
parse_failed / llm_error / empty answer / whitespace / valid answer 총 10 case.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
855 lines
33 KiB
Python
855 lines
33 KiB
Python
"""하이브리드 검색 API — thin endpoint (Phase 3.1 이후).
|
|
|
|
실제 검색 파이프라인(retrieval → fusion → rerank → diversity → confidence)
|
|
은 `services/search/search_pipeline.py::run_search()` 로 분리되어 있다.
|
|
이 파일은 다음만 담당:
|
|
- Pydantic 스키마 (SearchResult / SearchResponse / SearchDebug / DebugCandidate
|
|
/ Citation / AskResponse / AskDebug)
|
|
- `/search` endpoint wrapper (run_search 호출 + logger + telemetry + 직렬화)
|
|
- `/ask` endpoint wrapper (Phase 3.3 에서 추가)
|
|
"""
|
|
|
|
import asyncio
|
|
import hmac
|
|
import time
|
|
from typing import Annotated, Literal
|
|
|
|
from fastapi import APIRouter, BackgroundTasks, Depends, Header, Query
|
|
from pydantic import BaseModel
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from core.config import settings
|
|
from core.database import get_session
|
|
from core.utils import setup_logger
|
|
from models.user import User
|
|
from services.document_telemetry import sanitize_source
|
|
from services.search.classifier_service import ClassifierResult, classify
|
|
from services.search.evidence_service import EvidenceItem, extract_evidence
|
|
from services.search.fusion_service import DEFAULT_FUSION
|
|
from services.search.grounding_check import check as grounding_check
|
|
from services.search.refusal_gate import RefusalDecision, decide as refusal_decide
|
|
from services.search.search_pipeline import PipelineResult, run_search
|
|
from services.search.synthesis_service import SynthesisResult, synthesize
|
|
from services.search.verifier_service import VerifierResult, verify
|
|
from services.prompt_versions import ASK_PROMPT_VERSION, resolve_primary_model
|
|
from services.search_telemetry import record_ask_event, record_search_event
|
|
|
|
# Module logger: emits to logs/search.log and stdout simultaneously (Phase 0.4).
logger = setup_logger("search")

# Router on which the endpoints defined in this module are registered.
router = APIRouter()
|
|
|
|
|
|
class SearchResult(BaseModel):
    """A single search hit.

    Phase 1.2-C introduced chunk-level vector retrieval, which added the chunk
    metadata fields below. Text-search hits are document-level, so their chunk
    fields stay ``None``; vector-search hits are chunk-level and fill them in.
    """

    id: int  # doc_id (shared by text and vector hits)
    title: str | None
    ai_domain: str | None
    ai_summary: str | None
    file_format: str
    score: float
    snippet: str | None
    match_reason: str | None = None

    # Phase 1.2-C: chunk metadata (filled in for vector-search hits).
    chunk_id: int | None = None
    chunk_index: int | None = None
    section_title: str | None = None

    # Phase 3.1: raw reranker score, preserved to prevent display-score drift.
    # Only set for chunks that went through the rerank path;
    # normalize_display_scores does not touch this field. Used for the
    # Phase 3 evidence fast-path decision.
    rerank_score: float | None = None
|
|
|
|
|
|
# ─── Phase 0.4: 디버그 응답 스키마 ─────────────────────────
|
|
|
|
|
|
class DebugCandidate(BaseModel):
    """Per-stage candidate row (only exposed in debug=true responses)."""

    id: int
    rank: int
    score: float
    match_reason: str | None = None
|
|
|
|
|
|
class SearchDebug(BaseModel):
    """Debug payload attached to a search response when debug output is requested."""

    timing_ms: dict[str, float]
    text_candidates: list[DebugCandidate] | None = None
    vector_candidates: list[DebugCandidate] | None = None
    fused_candidates: list[DebugCandidate] | None = None
    confidence: float
    notes: list[str] = []

    # Placeholders to be filled in once Phase 1/2 are introduced.
    query_analysis: dict | None = None
    reranker_scores: list[DebugCandidate] | None = None
|
|
|
|
|
|
class SearchResponse(BaseModel):
    """Response body of the search endpoint."""

    results: list[SearchResult]
    total: int
    query: str
    mode: str
    debug: SearchDebug | None = None  # present only when debug output was requested
|
|
|
|
|
|
def _to_debug_candidates(rows: list[SearchResult], n: int = 20) -> list[DebugCandidate]:
    """Turn the first ``n`` rows into debug candidates with 1-based ranks."""
    candidates: list[DebugCandidate] = []
    for rank, row in enumerate(rows[:n], start=1):
        candidates.append(
            DebugCandidate(
                id=row.id,
                rank=rank,
                score=row.score,
                match_reason=row.match_reason,
            )
        )
    return candidates
|
|
|
|
|
|
def _build_search_debug(pr: PipelineResult) -> SearchDebug:
    """Build the ``SearchDebug`` payload from a ``PipelineResult``.

    A candidate list is ``None`` when its stage was not relevant to the
    requested mode and produced no rows; otherwise it is the (possibly empty)
    list of ranked candidates.
    """
    # Text candidates: shown whenever there are rows, or the mode is not pure vector.
    text_candidates = None
    if pr.text_results or pr.mode != "vector":
        text_candidates = _to_debug_candidates(pr.text_results)

    # Vector candidates: shown whenever there are rows, or the mode uses vectors.
    vector_candidates = None
    if pr.vector_results or pr.mode in ("vector", "hybrid"):
        vector_candidates = _to_debug_candidates(pr.vector_results)

    # Fused candidates only exist in hybrid mode.
    fused_candidates = None
    if pr.mode == "hybrid":
        fused_candidates = _to_debug_candidates(pr.results)

    return SearchDebug(
        timing_ms=pr.timing_ms,
        text_candidates=text_candidates,
        vector_candidates=vector_candidates,
        fused_candidates=fused_candidates,
        confidence=pr.confidence_signal,
        notes=pr.notes,
        query_analysis=pr.query_analysis,
    )
|
|
|
|
|
|
@router.get("/", response_model=SearchResponse)
async def search(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    background_tasks: BackgroundTasks,
    mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
    limit: int = Query(20, ge=1, le=100),
    fusion: str = Query(
        DEFAULT_FUSION,
        pattern="^(legacy|rrf|rrf_boost)$",
        description="hybrid 모드 fusion 전략 (legacy=기존 가중합, rrf=RRF k=60, rrf_boost=RRF+강한신호 boost)",
    ),
    rerank: bool = Query(
        True,
        description="bge-reranker-v2-m3 활성화 (Phase 1.3, hybrid 모드만 동작)",
    ),
    analyze: bool = Query(
        False,
        description="QueryAnalyzer 활성화 (Phase 2.1, LLM 호출). Phase 2.1은 debug 노출만, 검색 경로 영향 X",
    ),
    debug: bool = Query(False, description="단계별 candidates + timing 응답에 포함"),
):
    """Document search combining FTS, ILIKE and vector retrieval.

    Since Phase 3.1 this is a thin wrapper: it delegates to ``run_search``,
    logs the per-stage timings, schedules the search telemetry event as a
    background task and serializes the pipeline result.
    """
    pr = await run_search(
        session,
        q,
        mode=mode,  # type: ignore[arg-type]
        limit=limit,
        fusion=fusion,
        rerank=rerank,
        analyze=analyze,
    )

    # Per-stage timings are always logged, independently of the debug response.
    timing_str = " ".join(
        f"{stage}={elapsed:.0f}" for stage, elapsed in pr.timing_ms.items()
    )

    # The fusion fragment is keyed off the *requested* mode.
    fusion_str = ""
    if mode == "hybrid":
        fusion_str = f" fusion={fusion}"

    # Analyzer details are logged only when the analyzer was requested.
    analyzer_str = ""
    if analyze:
        analyzer_str = (
            f" analyzer=hit={pr.analyzer_cache_hit}"
            f"/conf={pr.analyzer_confidence:.2f}"
            f"/tier={pr.analyzer_tier}"
        )

    logger.info(
        "search query=%r mode=%s%s%s results=%d conf=%.2f %s",
        q[:80],
        pr.mode,
        fusion_str,
        analyzer_str,
        len(pr.results),
        pr.confidence_signal,
        timing_str,
    )

    # Phase 0.3: automatic failure logging, run as a background task so it does
    # not add to response latency.
    # Phase 2.1: analyzer_confidence is forwarded only when analyze=true
    # (None otherwise, which keeps the pre-analyzer behaviour).
    analyzer_confidence = pr.analyzer_confidence if analyze else None
    background_tasks.add_task(
        record_search_event,
        q,
        user.id,
        pr.results,
        pr.mode,
        pr.confidence_signal,
        analyzer_confidence,
    )

    debug_obj = None
    if debug:
        debug_obj = _build_search_debug(pr)

    return SearchResponse(
        results=pr.results,
        total=len(pr.results),
        query=q,
        mode=pr.mode,
        debug=debug_obj,
    )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════
|
|
# Phase 3.3: /api/search/ask — Evidence + Grounded Synthesis
|
|
# ═══════════════════════════════════════════════════════════
|
|
|
|
|
|
class Citation(BaseModel):
    """One piece of supporting evidence, matching an ``[n]`` marker in the answer text."""

    n: int
    chunk_id: int | None
    doc_id: int
    title: str | None
    section_title: str | None
    span_text: str  # 50-300 characters extracted by the evidence LLM
    full_snippet: str  # original 800 characters (only for viewing the citation source)
    relevance: float
    rerank_score: float
|
|
|
|
|
|
class ConfirmedItem(BaseModel):
    """Answer for one individual aspect of a partial answer."""

    aspect: str
    text: str
    citations: list[int]
|
|
|
|
|
|
class AskDebug(BaseModel):
    """Extra payload returned by ``/ask?debug=true``."""

    timing_ms: dict[str, float]
    search_notes: list[str]
    query_analysis: dict | None = None
    confidence_signal: float
    evidence_candidate_count: int
    evidence_kept_count: int
    evidence_skip_reason: str | None
    synthesis_cache_hit: bool
    synthesis_prompt_preview: str | None = None
    synthesis_raw_preview: str | None = None
    hallucination_flags: list[str] = []

    # Phase 3.5a: per-layer defense logging.
    defense_layers: dict | None = None
|
|
|
|
|
|
class AskResponse(BaseModel):
    """Response body of ``/ask``. Phase 3.5a added completeness and aspects."""

    results: list[SearchResult]
    ai_answer: str | None
    citations: list[Citation]
    synthesis_status: Literal[
        "completed", "timeout", "skipped", "no_evidence", "parse_failed", "llm_error"
    ]
    synthesis_ms: float
    confidence: Literal["high", "medium", "low"] | None
    refused: bool
    no_results_reason: str | None
    query: str
    total: int

    # Phase 3.5a additions.
    completeness: Literal["full", "partial", "insufficient"] = "full"
    covered_aspects: list[str] | None = None
    missing_aspects: list[str] | None = None
    confirmed_items: list[ConfirmedItem] | None = None
    debug: AskDebug | None = None
|
|
|
|
|
|
def _map_no_results_reason(
    pr: PipelineResult,
    evidence: list[EvidenceItem],
    ev_skip: str | None,
    sr: SynthesisResult,
) -> str | None:
    """Pick the Korean user-facing message that explains a missing answer.

    Based on the failure-mode table (plan, "Failure Modes" section).

    Precedence:
      1. A refusal that carries a model-supplied reason returns that reason.
      2. The synthesis status (no_evidence / skipped / timeout / parse_failed
         / llm_error).
      3. The evidence-stage skip reason.

    Returns ``None`` when none of the above applies. ``evidence`` is accepted
    for signature compatibility and is not consulted.
    """
    # 1) A self-refusal by the LLM: pass the model's own reason through.
    if sr.refused and sr.refuse_reason:
        return sr.refuse_reason

    # 2) Synthesis status takes priority over the evidence-stage reason.
    status = sr.status
    if status == "no_evidence":
        # Distinguish "nothing retrieved" from "retrieved but not relevant".
        if pr.results:
            return "관련도 높은 근거를 찾지 못했습니다."
        return "검색 결과가 없습니다."

    status_messages = {
        "skipped": "검색 결과가 없습니다.",
        "timeout": "답변 생성이 지연되어 생략했습니다. 검색 결과를 확인해 주세요.",
        "parse_failed": "답변 형식 오류로 생략했습니다.",
        "llm_error": "AI 서버에 일시적 문제가 있습니다.",
    }
    if status in status_messages:
        return status_messages[status]

    # 3) Evidence-stage failures, reported even if a fallback path was taken.
    skip_messages = {
        "all_low_rerank": "관련도 높은 근거를 찾지 못했습니다.",
        "empty_retrieval": "검색 결과가 없습니다.",
    }
    return skip_messages.get(ev_skip)
|
|
|
|
|
|
def _build_citations(
    evidence: list[EvidenceItem], used_citations: list[int]
) -> list[Citation]:
    """Convert only the evidence numbers that appear in the answer into ``Citation`` rows.

    The output follows the order of ``used_citations``; numbers with no
    matching evidence item are skipped.
    """
    evidence_by_n = {ev.n: ev for ev in evidence}
    return [
        Citation(
            n=hit.n,
            chunk_id=hit.chunk_id,
            doc_id=hit.doc_id,
            title=hit.title,
            section_title=hit.section_title,
            span_text=hit.span_text,
            full_snippet=hit.full_snippet,
            relevance=hit.relevance,
            rerank_score=hit.rerank_score,
        )
        for n in used_citations
        if (hit := evidence_by_n.get(n)) is not None
    ]
|
|
|
|
|
|
def _build_ask_debug(
    pr: PipelineResult,
    evidence: list[EvidenceItem],
    ev_skip: str | None,
    sr: SynthesisResult,
    ev_ms: float,
    synth_ms: float,
    total_ms: float,
) -> AskDebug:
    """Assemble the ``AskDebug`` payload from the pipeline, evidence and synthesis results.

    The pipeline timings are copied and extended with the evidence, synthesis
    and total ask durations (all in milliseconds).
    """
    timing: dict[str, float] = {
        **pr.timing_ms,
        "evidence_ms": ev_ms,
        "synthesis_ms": synth_ms,
        "ask_total_ms": total_ms,
    }

    # The true candidate count (items passing the rule filter) lives inside
    # evidence_service and is not observable here. The kept count is the
    # evidence length, and the candidate count is approximated by the same
    # number (debug-only simplification).
    kept_count = len(evidence)

    return AskDebug(
        timing_ms=timing,
        search_notes=pr.notes,
        query_analysis=pr.query_analysis,
        confidence_signal=pr.confidence_signal,
        evidence_candidate_count=kept_count,
        evidence_kept_count=kept_count,
        evidence_skip_reason=ev_skip,
        synthesis_cache_hit=sr.cache_hit,
        synthesis_prompt_preview=None,  # not exposed by synthesis_service at present
        synthesis_raw_preview=sr.raw_preview,
        hallucination_flags=sr.hallucination_flags,
    )
|
|
|
|
|
|
def _detect_synthesis_failure(sr: SynthesisResult) -> str | None:
    """Return a re_gate label when synthesis produced no usable answer, else ``None``.

    Decision order (Phase 3.5 fix3):
      1. ``sr.refused`` is set:
         - status == "completed" -> "synthesis_self_refuse" (LLM self-refusal)
         - any other status      -> "synthesis_failed(<status>)"
      2. status is timeout / parse_failed / llm_error
                                 -> "synthesis_failed(<status>)"
      3. the answer is missing or whitespace-only
                                 -> "synthesis_failed(<status>)"
      4. otherwise the answer is valid -> ``None``
    """
    status = sr.status
    failed_label = f"synthesis_failed({status})"

    if sr.refused:
        return "synthesis_self_refuse" if status == "completed" else failed_label

    mechanical_failure = status in ("timeout", "parse_failed", "llm_error")
    # Short-circuit keeps the answer from being inspected on a mechanical failure.
    if mechanical_failure or not (sr.answer or "").strip():
        return failed_label

    return None
|
|
|
|
|
|
def _resolve_eval_identity(
    x_source: str | None,
    x_eval_case_id: str | None,
    x_eval_token: str | None,
) -> tuple[str, str | None]:
    """Validate the X-Source / X-Eval-Case-Id trust claim (Phase 3.5 fix2).

    Rules:
      - A request counts as an "eval claim" when the sanitized X-Source is
        'eval' or an X-Eval-Case-Id header is present.
      - Without an eval claim, the sanitized source is returned and
        eval_case_id is forced to None (a case id is meaningless unless the
        source is 'eval').
      - An eval claim is accepted only when X-Eval-Token equals
        ``settings.eval_runner_token`` (constant-time comparison; always
        rejected when the setting is empty or unset).
      - On rejection the headers are ignored and a warning is logged: a
        claimed 'eval' source is downgraded to 'document_server', any other
        sanitized source is kept, and eval_case_id is None.
      - On acceptance the result is ('eval', x_eval_case_id).

    Args:
        x_source: Raw X-Source header value, if any.
        x_eval_case_id: Raw X-Eval-Case-Id header value, if any.
        x_eval_token: Raw X-Eval-Token header value, if any.

    Returns:
        A ``(source, eval_case_id)`` tuple.
    """
    claimed_source = sanitize_source(x_source)
    is_eval_claim = (claimed_source == "eval") or bool(x_eval_case_id)
    if not is_eval_claim:
        # Ordinary call: eval_case_id is always None.
        return claimed_source, None

    # Eval claim: verify the token.
    expected = settings.eval_runner_token
    presented = x_eval_token or ""
    # hmac.compare_digest() raises TypeError for str arguments containing
    # non-ASCII characters. The presented token is client-controlled, so
    # compare encoded bytes: a malformed token is then simply rejected instead
    # of raising. ASCII tokens compare exactly as before.
    token_valid = bool(expected) and hmac.compare_digest(
        presented.encode("utf-8"), expected.encode("utf-8")
    )
    if not token_valid:
        logger.warning(
            "eval header rejected: source=%s case_id=%s token_present=%s expected_set=%s",
            x_source, x_eval_case_id, bool(x_eval_token), bool(expected),
        )
        # Downgrade to an ordinary call: the 'eval' source claim and the case
        # id are both ignored. A claimed 'eval' falls back to the default
        # 'document_server'.
        if claimed_source == "eval":
            return "document_server", None
        return claimed_source, None

    # Token OK: accept the eval label.
    return "eval", x_eval_case_id
|
|
|
|
|
|
@router.get("/ask", response_model=AskResponse)
async def ask(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    background_tasks: BackgroundTasks,
    limit: int = Query(10, ge=1, le=20, description="synthesis 입력 상한"),
    debug: bool = Query(False, description="evidence/synthesis 중간 상태 노출"),
    x_source: Annotated[str | None, Header(alias="X-Source")] = None,
    x_eval_case_id: Annotated[str | None, Header(alias="X-Eval-Case-Id")] = None,
    x_eval_token: Annotated[str | None, Header(alias="X-Eval-Token")] = None,
):
    """Evidence-grounded AI answer (Phase 3.5a).

    Builds on Phase 3.3 and adds a parallel classifier, the refusal gate and
    the grounding re-gate. ``results`` is returned on the refusal and failure
    paths as well.

    Flow:
      1. Hybrid search pipeline.
      1.5. Drop documents with ask_includable=False from the evidence input.
      2. Evidence extraction and classifier, started concurrently.
      3. Refusal gate; an early "insufficient" response when it refuses.
      4. Synthesis.
      5. Grounding check, conditional verifier, and the tiered re-gate that
         decides completeness / confidence / refusal.
      6. Response assembly.
      7. Telemetry (search event + ask event) as background tasks.

    Phase 3.5 calibration trust boundary (fix2):
      - X-Source / X-Eval-Case-Id are honoured only from the trusted internal
        eval runner, i.e. when X-Eval-Token matches EVAL_RUNNER_TOKEN.
      - An X-Source=eval attempt from an ordinary client is ignored and the
        source is forced to 'document_server'.
      - eval_case_id is always None when source != 'eval'.
    """
    t_total = time.perf_counter()
    defense_log: dict = {}  # per-layer flag snapshot
    source, eval_case_id = _resolve_eval_identity(x_source, x_eval_case_id, x_eval_token)

    # 1. Search pipeline
    pr = await run_search(
        session, q, mode="hybrid", limit=limit,
        fusion=DEFAULT_FUSION, rerank=True, analyze=True,
    )

    # 1.5. Exclude ask_includable=False documents from the evidence input.
    # The search results themselves are kept (they are shown to the user);
    # only the evidence input is filtered.
    if pr.results:
        from sqlalchemy import select as sa_select
        from models.document import Document as DocModel

        result_doc_ids = {r.id for r in pr.results}
        rows = await session.execute(
            sa_select(DocModel.id, DocModel.ask_includable).where(
                DocModel.id.in_(result_doc_ids)
            )
        )
        # Only an explicit False excludes a document; None keeps it.
        excluded_doc_ids = {
            doc_id for doc_id, includable in rows if includable is False
        }
        evidence_results = [r for r in pr.results if r.id not in excluded_doc_ids]
    else:
        evidence_results = pr.results

    # 2. Evidence + classifier, concurrently
    t_ev = time.perf_counter()
    evidence_task = asyncio.create_task(extract_evidence(q, evidence_results))

    # Classifier input: metadata of the top 3 chunks + their rerank scores.
    # NOTE(review): this uses the unfiltered pr.results, so documents excluded
    # above can still reach the classifier — confirm this is intended.
    top_chunks = [
        {
            "title": r.title or "",
            "section": r.section_title or "",
            "snippet": (r.snippet or "")[:200],
        }
        for r in pr.results[:3]
    ]
    rerank_scores_top = [
        r.rerank_score if r.rerank_score is not None else r.score
        for r in pr.results[:3]
    ]
    classifier_task = asyncio.create_task(
        classify(q, top_chunks, rerank_scores_top)
    )

    evidence, ev_skip = await evidence_task
    ev_ms = (time.perf_counter() - t_ev) * 1000

    # Await the classifier with a second layer of timeout protection
    # (classifier_service has its own internally).
    try:
        classifier_result = await asyncio.wait_for(classifier_task, timeout=6.0)
    except asyncio.TimeoutError:
        logger.warning("ask classifier timed out query=%r", q[:80])
        classifier_result = ClassifierResult("timeout", None, [], [], 0.0)
    except Exception:
        # Best effort: the endpoint keeps going without a classifier verdict,
        # but the failure is logged instead of being silently swallowed. The
        # status label stays "timeout" so downstream consumers are unchanged.
        logger.warning("ask classifier failed query=%r", q[:80], exc_info=True)
        classifier_result = ClassifierResult("timeout", None, [], [], 0.0)

    defense_log["classifier"] = {
        "status": classifier_result.status,
        "verdict": classifier_result.verdict,
        "covered_aspects": classifier_result.covered_aspects,
        "missing_aspects": classifier_result.missing_aspects,
        "elapsed_ms": classifier_result.elapsed_ms,
    }

    # 3. Refusal gate (multi-signal fusion)
    # Evidence rerank scores when evidence exists, otherwise the top-3 search scores.
    all_rerank_scores = [
        e.rerank_score for e in evidence
    ] if evidence else rerank_scores_top
    decision = refusal_decide(all_rerank_scores, classifier_result)

    # Score aggregates reused by logging, telemetry and the re-gate below.
    max_rerank = max(all_rerank_scores, default=0.0)
    agg_top3 = sum(sorted(all_rerank_scores, reverse=True)[:3])

    defense_log["score_gate"] = {
        "max": max_rerank,
        "agg_top3": agg_top3,
    }
    defense_log["refusal"] = {
        "refused": decision.refused,
        "rule_triggered": decision.rule_triggered,
    }

    if decision.refused:
        total_ms = (time.perf_counter() - t_total) * 1000
        no_reason = "관련 근거를 찾지 못했습니다."
        if not pr.results:
            no_reason = "검색 결과가 없습니다."
        logger.info(
            "ask REFUSED query=%r rule=%s max_score=%.2f total=%.0f",
            q[:80], decision.rule_triggered,
            max_rerank, total_ms,
        )
        # Telemetry: both the search and the ask_events paths.
        background_tasks.add_task(
            record_search_event, q, user.id, pr.results, "hybrid",
            pr.confidence_signal, pr.analyzer_confidence,
        )
        # input_snapshot (for debugging / reproduction)
        defense_log["input_snapshot"] = {
            "query": q,
            "top_chunks_preview": [
                {"title": c.get("title", ""), "snippet": c.get("snippet", "")[:100]}
                for c in top_chunks[:3]
            ],
            "answer_preview": None,
        }
        background_tasks.add_task(
            record_ask_event,
            q, user.id, "insufficient", "skipped", None,
            True, classifier_result.verdict,
            max_rerank,
            agg_top3,
            [], len(evidence), 0,
            defense_log, int(total_ms),
            # Phase E.1 measurement fields
            answer_length=0,
            covered_aspects=classifier_result.covered_aspects or None,
            missing_aspects=classifier_result.missing_aspects or None,
            model_name=resolve_primary_model(),
            prompt_version=ASK_PROMPT_VERSION,
            # Phase 3.5 calibration
            source=source,
            eval_case_id=eval_case_id,
        )
        debug_obj = None
        if debug:
            debug_obj = AskDebug(
                timing_ms={**pr.timing_ms, "evidence_ms": ev_ms, "ask_total_ms": total_ms},
                search_notes=pr.notes,
                confidence_signal=pr.confidence_signal,
                evidence_candidate_count=len(evidence),
                evidence_kept_count=len(evidence),
                evidence_skip_reason=ev_skip,
                synthesis_cache_hit=False,
                hallucination_flags=[],
                defense_layers=defense_log,
            )
        return AskResponse(
            results=pr.results,
            ai_answer=None,
            citations=[],
            synthesis_status="skipped",
            synthesis_ms=0.0,
            confidence=None,
            refused=True,
            no_results_reason=no_reason,
            query=q,
            total=len(pr.results),
            completeness="insufficient",
            covered_aspects=classifier_result.covered_aspects or None,
            missing_aspects=classifier_result.missing_aspects or None,
            debug=debug_obj,
        )

    # 4. Synthesis
    t_synth = time.perf_counter()
    sr = await synthesize(q, evidence, debug=debug)
    synth_ms = (time.perf_counter() - t_synth) * 1000

    # 5. Grounding check + verifier (conditional) + re-gate (Phase 3.5b)
    grounding = grounding_check(q, sr.answer or "", evidence)

    # Skip the verifier when grounding already has 2+ strong flags or when
    # retrieval itself is too weak.
    grounding_only_strong = [
        f for f in grounding.strong_flags if not f.startswith("verifier_")
    ]
    if len(grounding_only_strong) >= 2 or max_rerank < 0.2:
        verifier_result = VerifierResult("skipped", [], 0.0)
    else:
        verifier_task = asyncio.create_task(
            verify(q, sr.answer or "", evidence)
        )
        try:
            verifier_result = await asyncio.wait_for(verifier_task, timeout=4.0)
        except asyncio.TimeoutError:
            logger.warning("ask verifier timed out query=%r", q[:80])
            verifier_result = VerifierResult("timeout", [], 0.0)
        except Exception:
            # Best effort, same policy as the classifier: log, then continue
            # with an empty verifier result labelled "timeout".
            logger.warning("ask verifier failed query=%r", q[:80], exc_info=True)
            verifier_result = VerifierResult("timeout", [], 0.0)

    # Merge verifier contradictions into the grounding flags. The "verifier_"
    # prefix tells them apart; severity has three levels.
    for c in verifier_result.contradictions:
        if c.severity == "strong":
            grounding.strong_flags.append(f"verifier_{c.type}:{c.claim[:30]}")
        elif c.severity == "medium":
            grounding.weak_flags.append(f"verifier_{c.type}_medium:{c.claim[:30]}")
        else:
            grounding.weak_flags.append(f"verifier_{c.type}:{c.claim[:30]}")

    defense_log["evidence"] = {
        "skip_reason": ev_skip,
        "kept_count": len(evidence),
    }
    defense_log["grounding"] = {
        "strong": grounding.strong_flags,
        "weak": grounding.weak_flags,
    }
    defense_log["verifier"] = {
        "status": verifier_result.status,
        "contradictions_count": len(verifier_result.contradictions),
        "strong_count": sum(1 for c in verifier_result.contradictions if c.severity == "strong"),
        "medium_count": sum(1 for c in verifier_result.contradictions if c.severity == "medium"),
        "elapsed_ms": verifier_result.elapsed_ms,
    }

    # -- Re-gate: tiered completeness decision (Phase 3.5 B2: Tier 4 inserted, renumbered) --
    # The earlier 6-tier chain plus Tier 4 (g_strong + v_strong_numeric + low_conf -> refuse).
    # Compatibility: the existing defense_layers["re_gate"] string literals are
    # kept as they were; only "refuse(grounding+verifier_numeric)" is new.
    completeness: Literal["full", "partial", "insufficient"] = "full"
    covered_aspects = classifier_result.covered_aspects or None
    missing_aspects = classifier_result.missing_aspects or None
    confirmed_items: list[ConfirmedItem] | None = None

    # Separate grounding-originated flags from verifier-originated ones.
    g_strong = [f for f in grounding.strong_flags if not f.startswith("verifier_")]
    v_strong = [f for f in grounding.strong_flags if f.startswith("verifier_")]
    v_medium = [f for f in grounding.weak_flags if f.startswith("verifier_") and "_medium:" in f]
    has_direct_negation = any("direct_negation" in f for f in v_strong)
    # Phase 3.5 B2: count only numeric_conflict among the verifier strong flags.
    # With promote (VERIFIER_NUMERIC_PROMOTE=1) enabled, critical
    # numeric_conflict items are promoted to strong and counted here; with
    # promote off the count is always 0, so Tier 4 never fires.
    v_strong_numeric = sum(
        1 for f in v_strong if f.startswith("verifier_numeric_conflict")
    )

    # -- Tier 0 (Phase 3.5 fix3): synthesis itself failed --
    # LLM self-refusal, mechanical failure (timeout/parse_failed/llm_error) or
    # a blank answer. A blank answer yields no grounding/verifier flags, so the
    # chain used to fall through to "clean" and keep completeness="full";
    # handling it first removes that contradiction.
    tier0_label = _detect_synthesis_failure(sr)
    if tier0_label:
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = tier0_label
    elif len(g_strong) >= 2:
        # Tier 1: 2+ grounding strong flags -> refuse
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding_2+strong)"
    elif g_strong and has_direct_negation:
        # Tier 2: grounding strong + verifier direct_negation -> refuse
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding+direct_negation)"
    elif g_strong and sr.confidence == "low" and max_rerank < 0.25:
        # Tier 3: one grounding strong + (low confidence AND weak evidence) -> refuse
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding+low_conf+weak_ev)"
    elif g_strong and v_strong_numeric >= 1 and sr.confidence == "low":
        # Tier 4 (new in B2): grounding strong + verifier numeric_conflict
        # strong + low confidence -> refuse. A verifier strong flag alone never
        # refuses; it must be corroborated by g_strong.
        completeness = "insufficient"
        sr.answer = None
        sr.refused = True
        sr.confidence = None
        defense_log["re_gate"] = "refuse(grounding+verifier_numeric)"
    elif g_strong or has_direct_negation:
        # Tier 5 (formerly 4): a single grounding strong flag, or a verifier
        # direct_negation on its own -> partial
        completeness = "partial"
        sr.confidence = "low"
        defense_log["re_gate"] = "partial(strong_or_negation)"
    elif v_medium:
        # Tier 6 (formerly 5): accumulated verifier medium flags lower
        # confidence based on their count.
        medium_count = len(v_medium)
        if medium_count >= 3:
            sr.confidence = "low"
            defense_log["re_gate"] = f"conf_low(medium_x{medium_count})"
        elif medium_count == 2 and sr.confidence == "high":
            sr.confidence = "medium"
            defense_log["re_gate"] = "conf_cap_medium(medium_x2)"
        else:
            defense_log["re_gate"] = f"medium_x{medium_count}(no_action)"
    elif grounding.weak_flags:
        # Tier 7 (formerly 6): weak flags -> confidence lowered by one step
        if sr.confidence == "high":
            sr.confidence = "medium"
        defense_log["re_gate"] = "conf_lower(weak)"
    else:
        defense_log["re_gate"] = "clean"

    # Confidence cap from the refusal gate (conservative when the classifier is absent).
    if decision.confidence_cap and sr.confidence:
        conf_rank = {"low": 0, "medium": 1, "high": 2}
        if conf_rank.get(sr.confidence, 0) > conf_rank.get(decision.confidence_cap, 2):
            sr.confidence = decision.confidence_cap

    # A partial answer is capped at medium confidence.
    if completeness == "partial" and sr.confidence == "high":
        sr.confidence = "medium"

    sr.hallucination_flags.extend(
        [f"strong:{f}" for f in grounding.strong_flags]
        + [f"weak:{f}" for f in grounding.weak_flags]
    )

    total_ms = (time.perf_counter() - t_total) * 1000

    # 6. Response assembly
    citations = _build_citations(evidence, sr.used_citations)
    no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr)
    if completeness == "insufficient" and not no_reason:
        # Tier 0 path: for a synthesis self-refusal the LLM's own reason is
        # the most accurate one.
        if sr.refused and sr.refuse_reason:
            no_reason = sr.refuse_reason
        else:
            no_reason = "답변 검증에서 복수 오류 감지"

    logger.info(
        "ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s "
        "refused=%s grounding_strong=%d grounding_weak=%d ev_ms=%.0f synth_ms=%.0f total=%.0f",
        q[:80], len(pr.results), len(evidence), len(citations),
        sr.status, sr.confidence or "-", completeness,
        sr.refused, len(grounding.strong_flags), len(grounding.weak_flags),
        ev_ms, synth_ms, total_ms,
    )

    # 7. Telemetry: both the search and the ask_events paths.
    background_tasks.add_task(
        record_search_event, q, user.id, pr.results, "hybrid",
        pr.confidence_signal, pr.analyzer_confidence,
    )
    # input_snapshot (for debugging / reproduction)
    defense_log["input_snapshot"] = {
        "query": q,
        "top_chunks_preview": [
            {"title": (r.title or "")[:50], "snippet": (r.snippet or "")[:100]}
            for r in pr.results[:3]
        ],
        "answer_preview": (sr.answer or "")[:200],
    }
    background_tasks.add_task(
        record_ask_event,
        q, user.id, completeness, sr.status, sr.confidence,
        sr.refused, classifier_result.verdict,
        max_rerank,
        agg_top3,
        sr.hallucination_flags, len(evidence), len(citations),
        defense_log, int(total_ms),
        # Phase E.1 measurement fields
        answer_length=len(sr.answer or ""),
        covered_aspects=covered_aspects,
        missing_aspects=missing_aspects,
        model_name=resolve_primary_model(),
        prompt_version=ASK_PROMPT_VERSION,
        # Phase 3.5 calibration
        source=source,
        eval_case_id=eval_case_id,
    )

    debug_obj = None
    if debug:
        timing = dict(pr.timing_ms)
        timing["evidence_ms"] = ev_ms
        timing["synthesis_ms"] = synth_ms
        timing["ask_total_ms"] = total_ms
        debug_obj = AskDebug(
            timing_ms=timing,
            search_notes=pr.notes,
            query_analysis=pr.query_analysis,
            confidence_signal=pr.confidence_signal,
            evidence_candidate_count=len(evidence),
            evidence_kept_count=len(evidence),
            evidence_skip_reason=ev_skip,
            synthesis_cache_hit=sr.cache_hit,
            synthesis_raw_preview=sr.raw_preview,
            hallucination_flags=sr.hallucination_flags,
            defense_layers=defense_log,
        )

    return AskResponse(
        results=pr.results,
        ai_answer=sr.answer,
        citations=citations,
        synthesis_status=sr.status,
        synthesis_ms=sr.elapsed_ms,
        confidence=sr.confidence,
        refused=sr.refused,
        no_results_reason=no_reason,
        query=q,
        total=len(pr.results),
        completeness=completeness,
        covered_aspects=covered_aspects,
        missing_aspects=missing_aspects,
        confirmed_items=confirmed_items,
        debug=debug_obj,
    )
|