refactor(search)!: /ask 고아 service·테스트·프롬프트 정리 (검색 단일화 Phase 2)

/ask 삭제로 0-consumer 된 자산 제거(3-gate 실증): search.py /ask 섹션(Citation/ConfirmedItem/AskDebug/AskResponse 모델 + 헬퍼 + _resolve_eval_identity) + 죽은 import 13개. service 4(classifier/verifier/refusal_gate/grounding_check). AIClient.call_classifier/call_verifier(고아). 프롬프트 2(classifier/verifier.txt). broken test 6. evidence/synthesis 는 공유(documents.py 등)라 유지. 실 pyflakes 클린(이전 세션 pyflakes 미설치로 검증 누락 → 설치 후 실검증).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-06-27 14:39:53 +09:00
parent 86a71ec4d1
commit 495e1c786f
14 changed files with 4 additions and 2280 deletions
-291
View File
@@ -1,291 +0,0 @@
"""PR-MacBook-RAG-Backend-1 정정 4 핵심 테스트.
검증 invariant (synthesize 함수 레벨 — /ask wrapper 의 503 매핑은 search.py 의
status="backend_unavailable" 분기로 1:1 deterministic):
1. backend="qwen-macbook" + MacBook URL 죽은 포트
→ synthesize() 가 SynthesisResult(status="backend_unavailable", ...) 반환
→ Gemma backend 의 generate() 가 **단 1번도 호출되지 않음** (자동 fallback 부재)
2. backend 미지정 (None)
→ Gemma backend.generate() 호출, Qwen backend.generate() 호출 0
→ 기존 호출자 (Hermes docsrv_ask / voice-memo-bot) 회귀 0
3. backend="qwen-macbook" + MacBook 정상 응답
→ status="completed" + answer 채워짐, Gemma backend 호출 0
테스트 전략:
- synthesize() 가 호출하는 backend dispatcher (services.llm.get_backend) 를
monkeypatch 해서 mock backend 주입.
- Gemma backend 의 generate AsyncMock 호출 횟수를 추적.
- 정정 4 의 핵심 가드: `gemma_backend.generate.assert_not_called()`
"""
from __future__ import annotations
import asyncio
import os
import sys
from dataclasses import dataclass
from unittest.mock import AsyncMock
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "app"))
# ── 가짜 evidence (synthesize 의 no_evidence 분기 회피용 최소 객체) ─────────
@dataclass
class _FakeEvidence:
n: int = 1
doc_id: int = 100
chunk_id: int | None = 200
title: str | None = "fake doc"
span_text: str = "이것은 짧은 근거 텍스트입니다."
source: str = "llm"
def _make_evidence():
    """Return a one-element evidence list holding a default ``_FakeEvidence``."""
    single_item = _FakeEvidence()
    return [single_item]
# ── backend mock ───────────────────────────────────────────────────────────
def _gemma_mock(content: str = "GEMMA_SHOULD_NEVER_BE_CALLED"):
    """Build an ``AsyncMock`` posing as the "gemma-macmini" backend.

    Its ``generate`` is an ``AsyncMock`` that returns *content*; the default
    is a sentinel string that stands out if Gemma is reached unexpectedly.
    """
    backend = AsyncMock()
    backend.generate = AsyncMock(return_value=content)
    backend.name = "gemma-macmini"
    return backend
def _qwen_mock_success(content: str):
    """Build an ``AsyncMock`` posing as the "qwen-macbook" backend.

    Its ``generate`` is an ``AsyncMock`` that returns *content*.
    """
    backend = AsyncMock()
    backend.generate = AsyncMock(return_value=content)
    backend.name = "qwen-macbook"
    return backend
def _qwen_mock_unavailable():
    """Build a "qwen-macbook" mock whose ``generate`` raises ``BackendUnavailable``."""
    from services.llm import BackendUnavailable

    failure = BackendUnavailable("qwen-macbook", "ConnectError")
    backend = AsyncMock()
    backend.name = "qwen-macbook"
    backend.generate = AsyncMock(side_effect=failure)
    return backend
# ── 공통 fixture: synthesis_service 에 mock backend 주입 ───────────────────
@pytest.fixture
def patched_backends(monkeypatch):
    """Replace ``synthesis_service.get_backend`` with a mock dispatcher.

    Returns ``(gemma_mock, qwen_holder, swap_qwen_unavailable)``:
    ``qwen_holder["backend"]`` is the Qwen mock currently served by the
    dispatcher, and calling ``swap_qwen_unavailable()`` replaces it with a
    mock whose ``generate`` raises ``BackendUnavailable``.
    """
    from services.search import synthesis_service

    gemma = _gemma_mock()
    initial_qwen = _qwen_mock_success(
        '{"answer":"Qwen ok [1]","confidence":"high","refused":false}'
    )
    qwen_holder = {"backend": initial_qwen}

    def _fake_get_backend(name: str | None):
        key = (name or "").strip().lower()
        # A missing or blank name resolves to the Gemma default.
        if key in ("", "gemma-macmini"):
            return gemma
        if key == "qwen-macbook":
            # Looked up per call so a later swap takes effect.
            return qwen_holder["backend"]
        raise ValueError(f"unknown backend: {name!r}")

    monkeypatch.setattr(synthesis_service, "get_backend", _fake_get_backend)
    # Start from an empty synthesis cache (the qwen/gemma cache-separation
    # invariant is checked by the tests using this fixture).
    synthesis_service._CACHE.clear()

    def _swap_qwen_unavailable():
        qwen_holder["backend"] = _qwen_mock_unavailable()

    return gemma, qwen_holder, _swap_qwen_unavailable
# ── 정정 4 핵심: backend=qwen-macbook + MacBook 비가용 → Gemma 호출 0 ─────
def test_qwen_unavailable_yields_backend_unavailable_status_and_gemma_not_called(
    patched_backends,
):
    """Core invariant: no silent fallback to Gemma when Qwen is unavailable.

    With ``backend="qwen-macbook"`` and a Qwen mock whose ``generate`` raises
    ``BackendUnavailable``, ``synthesize()`` must report
    ``status="backend_unavailable"`` and the Gemma mock's ``generate`` must
    never be called.
    """
    from services.search.synthesis_service import synthesize

    gemma, qwen_holder, swap_qwen_unavailable = patched_backends
    swap_qwen_unavailable()
    qwen = qwen_holder["backend"]

    pending = synthesize(
        query="압력용기 최대허용응력은?",
        evidence=_make_evidence(),
        backend="qwen-macbook",
    )
    result = asyncio.run(pending)

    # 1. Status plus an empty payload.
    assert result.status == "backend_unavailable"
    assert result.answer is None
    assert result.confidence is None
    assert result.refused is False

    # 2. The unavailability reason is recorded as a flag.
    assert any(
        flag.startswith("backend_unavailable:qwen-macbook:")
        for flag in result.hallucination_flags
    ), f"expected backend_unavailable flag, got {result.hallucination_flags}"

    # 3. Key guard: Gemma must not be used as an automatic fallback.
    gemma.generate.assert_not_called()

    # 4. Qwen is called exactly once (no retry).
    assert qwen.generate.call_count == 1
def test_qwen_unavailable_result_not_cached(patched_backends):
    """An unavailable result must not be cached, so the next call tries Qwen again."""
    from services.search.synthesis_service import synthesize

    gemma, qwen_holder, swap_qwen_unavailable = patched_backends
    swap_qwen_unavailable()
    qwen = qwen_holder["backend"]

    # Issue the identical request twice in a row.
    for _attempt in range(2):
        asyncio.run(
            synthesize(
                query="동일 쿼리",
                evidence=_make_evidence(),
                backend="qwen-macbook",
            )
        )

    # Both attempts reached the backend (no cache hit); Gemma stays untouched.
    assert qwen.generate.call_count == 2
    gemma.generate.assert_not_called()
# ── 정정 4: backend 미지정 → 기존 Gemma path (회귀 0) ─────────────────────
def test_default_backend_calls_gemma_not_qwen(patched_backends):
    """With no backend specified, Gemma is used and Qwen is never called."""
    from services.search.synthesis_service import synthesize

    gemma, qwen_holder, _ = patched_backends
    qwen = qwen_holder["backend"]
    gemma_payload = '{"answer":"Gemma 답변 [1]","confidence":"high","refused":false}'
    gemma.generate.return_value = gemma_payload

    # backend=None is passed explicitly to select the default.
    request = synthesize(query="기본 호출", evidence=_make_evidence(), backend=None)
    result = asyncio.run(request)

    assert result.status == "completed"
    assert result.answer is not None and "Gemma" in result.answer
    # Qwen: zero calls. Gemma: exactly one.
    qwen.generate.assert_not_called()
    assert gemma.generate.call_count == 1
# ── backend="qwen-macbook" + 정상 응답 ──────────────────────────────────────
def test_qwen_success_does_not_call_gemma(patched_backends):
    """When Qwen answers normally, Gemma is not called (the symmetric invariant)."""
    from services.search.synthesis_service import synthesize

    gemma, qwen_holder, _ = patched_backends
    qwen = qwen_holder["backend"]

    request = synthesize(
        query="정상 호출",
        evidence=_make_evidence(),
        backend="qwen-macbook",
    )
    result = asyncio.run(request)

    assert result.status == "completed"
    assert result.answer is not None and "Qwen" in result.answer
    # Gemma: zero calls. Qwen: exactly one.
    gemma.generate.assert_not_called()
    assert qwen.generate.call_count == 1
# ── 캐시 분리 (qwen vs gemma 키 충돌 없음) ─────────────────────────────────
def test_qwen_and_gemma_have_separate_caches(patched_backends):
    """The same query under different backends must not share a cache entry.

    A cached Qwen result must never be served as the answer to a Gemma call.
    """
    from services.search.synthesis_service import synthesize

    gemma, qwen_holder, _ = patched_backends
    qwen = qwen_holder["backend"]
    gemma.generate.return_value = (
        '{"answer":"GEMMA_ANSWER [1]","confidence":"high","refused":false}'
    )
    qwen.generate.return_value = (
        '{"answer":"QWEN_ANSWER [1]","confidence":"high","refused":false}'
    )

    def _ask(backend_name):
        # Same query and evidence every time; only the backend differs.
        return asyncio.run(
            synthesize(
                query="같은 query",
                evidence=_make_evidence(),
                backend=backend_name,
            )
        )

    r_qwen_1 = _ask("qwen-macbook")
    r_gemma_1 = _ask(None)
    r_qwen_2 = _ask("qwen-macbook")

    assert "QWEN_ANSWER" in (r_qwen_1.answer or "")
    assert "GEMMA_ANSWER" in (r_gemma_1.answer or "")
    # The second Qwen call is a cache hit: same answer, no extra generate call.
    assert "QWEN_ANSWER" in (r_qwen_2.answer or "")
    assert r_qwen_2.cache_hit is True
    # generate call counts: Qwen 1 (second call served from cache), Gemma 1.
    assert qwen.generate.call_count == 1
    assert gemma.generate.call_count == 1
-218
View File
@@ -1,218 +0,0 @@
"""PR-DocSrv-Ask-ToolCalling-ReAct-1: /api/search/ask/react endpoint integration.
검증 항목 (G0-3 trace exposure + 정정 4 invariant):
- backend unavailable → HTTP 503 + error_reason=macbook_unavailable
+ ★ `run_search` mock 호출 횟수 == 0 (search 단계 진입 자체 차단)
- 정상 응답 → 200 + final_answer + sources + debug_trace=null (default)
- debug=true → debug_trace 채워짐
- max rounds 도달 → iterations=2 + partial=false (final content 정상)
endpoint 함수 (`api.search.ask_react`) 를 직접 호출하는 lightweight 패턴.
TestClient 없이 FastAPI deps 를 MagicMock 으로 우회. (priority_gate / backend_dispatcher
test 와 동일 service-layer 패턴.)
"""
from __future__ import annotations
import asyncio
import json
import os
import sys
from unittest.mock import AsyncMock, MagicMock
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "app"))
# ── helpers ────────────────────────────────────────────────────────────────
def _msg_with_tool_call(q: str, tc_id: str = "tc-1") -> dict:
    """Return an assistant message dict carrying a single ``search`` tool call.

    The call's ``arguments`` is the JSON text of ``{"q": q}`` with non-ASCII
    characters left unescaped.
    """
    arguments = json.dumps({"q": q}, ensure_ascii=False)
    tool_call = {
        "id": tc_id,
        "type": "function",
        "function": {"name": "search", "arguments": arguments},
    }
    return {"role": "assistant", "content": None, "tool_calls": [tool_call]}
def _msg_with_content(text: str) -> dict:
    """Return an assistant message dict with *text* as content and no tool calls."""
    message = {"role": "assistant"}
    message["content"] = text
    message["tool_calls"] = None
    return message
def _fake_chunk(chunk_id: int, doc_id: int = 100):
    """Return a ``MagicMock`` carrying the attributes of one fake search hit."""
    attributes = {
        "id": chunk_id,
        "chunk_id": chunk_id,
        "doc_id": doc_id,
        "title": f"doc {doc_id}",
        "score": 0.9,
        "snippet": f"snippet {chunk_id}",
        "text": None,
    }
    chunk = MagicMock()
    for attr_name, attr_value in attributes.items():
        setattr(chunk, attr_name, attr_value)
    return chunk
def _fake_pr(chunks: list):
    """Return a ``MagicMock`` whose ``results`` attribute is *chunks*."""
    search_result = MagicMock()
    search_result.results = chunks
    return search_result
@pytest.fixture
def patched_backend_and_search(monkeypatch):
    """Mock both ``get_backend`` and ``run_search``.

    Each test configures the ``side_effect`` of the backend's
    ``generate_with_tools`` itself.

    Returns: ``(backend_mock, run_search_mock, set_backend_unavailable_fn)``.
    """
    from services.llm import backends as backends_mod
    from services.llm.backends import BackendUnavailable, QwenMacBookBackend
    from services.search import react_loop

    backend = MagicMock(spec=QwenMacBookBackend)
    backend.name = "qwen-macbook"
    backend.generate_with_tools = AsyncMock()

    def _fake_get_backend(name):
        # Original author's note: the endpoint only ever requests
        # qwen-macbook, so the single mock is returned for any name.
        return backend

    # Original author's note: ask_react in search.py imports get_backend from
    # services.llm.backends lazily inside the function, so patching the
    # module attribute is enough — TODO confirm against api/search.py.
    monkeypatch.setattr(backends_mod, "get_backend", _fake_get_backend)

    search_result = _fake_pr([_fake_chunk(1)])
    run_search_mock = AsyncMock(return_value=search_result)
    monkeypatch.setattr(react_loop, "run_search", run_search_mock)

    def _make_unavailable():
        backend.generate_with_tools.side_effect = BackendUnavailable(
            "qwen-macbook", "ConnectError"
        )

    return backend, run_search_mock, _make_unavailable
def _call_endpoint(payload):
    """Call ``ask_react`` directly, with ``MagicMock`` stand-ins for user and session."""
    from api.search import ask_react

    fake_deps = {"user": MagicMock(), "session": MagicMock()}
    return asyncio.run(ask_react(payload, **fake_deps))
# ── ★ 정정 4 invariant: backend unavailable → 503 + run_search 호출 0 ──────
def test_qwen_unavailable_returns_503(patched_backend_and_search):
    """``BackendUnavailable`` from the backend → HTTP 503 with error_reason=macbook_unavailable."""
    from api.search import AskReactRequest

    _backend, run_search_mock, make_unavailable = patched_backend_and_search
    make_unavailable()

    response = _call_endpoint(AskReactRequest(query="Q"))

    # The response exposes a status code and a JSON-encoded body.
    assert response.status_code == 503
    payload = json.loads(response.body)
    assert payload["error_reason"] == "macbook_unavailable"
    assert payload["backend_used"] is None
    assert payload["backend_requested"] == "qwen-macbook"

    # Key guard: run_search is never called (the search stage is not entered).
    assert run_search_mock.call_count == 0
# ── 정상 200 + G0-3 default debug_trace=null ──────────────────────────────
def test_successful_response_default_no_debug_trace(patched_backend_and_search):
    """Without ``debug`` (default false), the result carries ``debug_trace`` of None."""
    from api.search import AskReactRequest, AskReactResponse

    backend, _run_search_mock, _ = patched_backend_and_search
    scripted_turns = [
        _msg_with_tool_call("q1"),
        _msg_with_content("최종 답입니다"),
    ]
    backend.generate_with_tools.side_effect = scripted_turns

    response = _call_endpoint(AskReactRequest(query="Q"))

    # The function's raw return value (before FastAPI applies response_model).
    assert isinstance(response, AskReactResponse)
    assert response.final_answer == "최종 답입니다"
    assert response.iterations == 2
    assert response.partial is False
    assert response.debug_trace is None  # G0-3: trace hidden by default
    assert len(response.sources) == 1
# ── G0-3: debug=true → debug_trace 채워짐 ──────────────────────────────────
def test_debug_true_populates_trace(patched_backend_and_search):
    """``debug=True`` yields a non-empty list in ``debug_trace``."""
    from api.search import AskReactRequest

    backend, _run_search_mock, _ = patched_backend_and_search
    backend.generate_with_tools.side_effect = [_msg_with_content("바로 답")]

    response = _call_endpoint(AskReactRequest(query="Q", debug=True))

    trace = response.debug_trace
    assert trace is not None
    assert isinstance(trace, list)
    assert len(trace) >= 1
# ── max rounds → final content 정상 → partial=false ──────────────────────
def test_max_rounds_with_final_content(patched_backend_and_search):
    """Two tool-call turns followed by real content → ``partial`` is False."""
    from api.search import AskReactRequest

    backend, run_search_mock, _ = patched_backend_and_search
    scripted_turns = [
        _msg_with_tool_call("q1"),
        _msg_with_tool_call("q2", tc_id="tc-2"),
        _msg_with_content("정리된 최종 답"),
    ]
    backend.generate_with_tools.side_effect = scripted_turns

    response = _call_endpoint(AskReactRequest(query="Q"))

    assert response.iterations == 2
    assert response.partial is False
    assert response.final_answer == "정리된 최종 답"
    # Three LLM calls and two searches (the original note calls this the G0-2 cap).
    assert backend.generate_with_tools.call_count == 3
    assert run_search_mock.call_count == 2
# ── max rounds + final content 빈 string → partial=true ──────────────────
def test_max_rounds_with_empty_final_partial(patched_backend_and_search):
    """Two tool-call turns followed by empty content → ``partial`` is True."""
    from api.search import AskReactRequest

    backend, _run_search_mock, _ = patched_backend_and_search
    scripted_turns = [
        _msg_with_tool_call("q1"),
        _msg_with_tool_call("q2", tc_id="tc-2"),
        _msg_with_content(""),
    ]
    backend.generate_with_tools.side_effect = scripted_turns

    response = _call_endpoint(AskReactRequest(query="Q"))

    assert response.iterations == 2
    assert response.partial is True
    assert response.final_answer == ""
-92
View File
@@ -1,92 +0,0 @@
"""Phase 3.5 fix2: /ask 의 X-Source / X-Eval-Case-Id trust boundary.
`_resolve_eval_identity()` 단위 테스트.
- token 없음/틀림 + X-Source=eval → source='document_server', eval_case_id=None
- token 일치 + X-Source=eval + X-Eval-Case-Id=case_xxx → ('eval', 'case_xxx')
- token 틀림 + X-Eval-Case-Id 만 (X-Source 미지정) → eval_case_id=None
- 일반 호출 (X-Source=ui_search, no eval headers) → ('ui_search', None)
- env 미설정 (eval_runner_token='') 시 모든 eval claim 거부
"""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
import pytest
@pytest.fixture
def resolve_with_token(monkeypatch):
    """Factory fixture for testing ``_resolve_eval_identity`` under a chosen token.

    Returns a callable ``_make(token)`` that monkey-patches
    ``settings.eval_runner_token`` (as seen from ``api.search``) to *token*
    and returns ``api.search._resolve_eval_identity``. The patch is undone by
    ``monkeypatch`` when the test finishes.
    """

    def _make(token: str):
        from api import search as search_mod

        # Patch the attribute on the settings object that api.search holds.
        # The original also imported core.config here but never used it.
        monkeypatch.setattr(search_mod.settings, "eval_runner_token", token)
        return search_mod._resolve_eval_identity

    return _make
def test_no_token_no_eval_headers_default(resolve_with_token):
    """Ordinary call with no eval headers → the default source and no case id."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("document_server", None)
    assert resolve_identity(None, None, None) == expected
def test_normal_source_with_token(resolve_with_token):
    """A ``ui_search`` call is not an eval claim, so the token is irrelevant."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("ui_search", None)
    assert resolve_identity("ui_search", None, None) == expected
def test_eval_claim_no_token_rejected(resolve_with_token):
    """X-Source=eval without a token is rejected: source falls back to 'document_server'."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("document_server", None)
    assert resolve_identity("eval", "case_001", None) == expected
def test_eval_claim_wrong_token_rejected(resolve_with_token):
    """An eval claim with a wrong token is rejected."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("document_server", None)
    assert resolve_identity("eval", "case_001", "wrong_token") == expected
def test_eval_claim_correct_token_accepted(resolve_with_token):
    """A matching token is accepted: source is 'eval' and the case id is kept."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("eval", "case_001")
    assert resolve_identity("eval", "case_001", "secret123") == expected
def test_eval_case_id_only_no_source_no_token(resolve_with_token):
    """Only X-Eval-Case-Id and no token → rejected, case id is None."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("document_server", None)
    assert resolve_identity(None, "case_001", None) == expected
def test_eval_case_id_only_wrong_token(resolve_with_token):
    """Only X-Eval-Case-Id plus a wrong token → rejected."""
    resolve_identity = resolve_with_token("secret123")
    expected = ("document_server", None)
    assert resolve_identity(None, "case_001", "wrong") == expected
def test_env_unset_rejects_even_correct_format(resolve_with_token):
    """With ``settings.eval_runner_token`` set to '', every eval claim is rejected."""
    resolve_identity = resolve_with_token("")
    rejected = ("document_server", None)
    # Even when a token header arrives, an empty server-side token means
    # rejection (the original note describes this as a constant-time False).
    assert resolve_identity("eval", "case_001", "") == rejected
    assert resolve_identity("eval", "case_001", "anything") == rejected
def test_non_eval_source_forces_case_id_none(resolve_with_token):
    """X-Source=ui_detail sent together with X-Eval-Case-Id → case id is None.

    Per the original test notes: the presence of a case id triggers the
    eval-claim path even though the source is not 'eval'; without a token the
    claim is rejected, and because the claimed source is not 'eval' it is
    returned unchanged while the case id is dropped.
    """
    resolve_identity = resolve_with_token("secret123")
    expected = ("ui_detail", None)
    assert resolve_identity("ui_detail", "case_001", None) == expected
-188
View File
@@ -1,188 +0,0 @@
"""Phase 3.5 B1 (fix1+fix3): unit-aware fabricated_number + bound semantics.
기준:
- 단위 일치 시에만 exact/range/tolerance clear (fix1: Codex unit-mismatch regression 방지)
- 약/대략/거의/얼추 만 approx prefix strip; 최대/최소 는 bound operator 로 보존 (fix3)
- tolerance 는 양적 단위(_TOLERANCE_UNITS) + 4자리+ 만; 식별자성(_EXACT_ONLY_UNITS) 은 strict
"""
from __future__ import annotations
import os
import sys
# tests/ → 프로젝트 루트 → app/
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
import pytest
from services.search.evidence_service import EvidenceItem
from services.search.grounding_check import check
def _ev(text: str, n: int = 1) -> EvidenceItem:
    """Build an ``EvidenceItem`` whose ``span_text`` and ``full_snippet`` are both *text*.

    ``doc_id`` and ``title`` are derived from *n*; the remaining fields are
    fixed test values.
    """
    identity = {
        "n": n,
        "chunk_id": None,
        "doc_id": 100 + n,
        "title": f"doc{n}",
        "section_title": None,
    }
    scores = {"relevance": 0.9, "rerank_score": 0.85}
    return EvidenceItem(
        span_text=text,
        full_snippet=text,
        source="llm",
        **identity,
        **scores,
    )
def _has_fabricated(result, sub: str | None = None) -> bool:
for f in result.strong_flags:
if not f.startswith("fabricated_number:"):
continue
if sub is None or sub in f:
return True
return False
# ─── 콤마/prefix/range/단위 동의어/citation (기존 17 케이스) ──────
def test_comma_thousand_match():
    """Answer "1,000명" against evidence "1000명" → no fabricated flag containing "1000"."""
    evidence = [_ev("총원은 1000명입니다.")]
    outcome = check("질문", "총 1,000명 [1]", evidence)
    assert not _has_fabricated(outcome, "1000")
def test_comma_thousand_reverse():
    """Answer "1000명" against evidence "1,000명" → no fabricated flag."""
    evidence = [_ev("총원은 1,000명입니다.")]
    outcome = check("질문", "총 1000명 [1]", evidence)
    assert not _has_fabricated(outcome)
def test_approx_prefix_in_answer():
    """An approximation prefix ("약") only in the answer → no fabricated flag."""
    evidence = [_ev("100명이 참여")]
    outcome = check("질문", "약 100명이 참여 [1]", evidence)
    assert not _has_fabricated(outcome)
def test_approx_prefix_in_evidence():
    """An approximation prefix ("약") only in the evidence → no fabricated flag."""
    evidence = [_ev("약 100명이 참여")]
    outcome = check("질문", "100명이 참여 [1]", evidence)
    assert not _has_fabricated(outcome)
def test_range_inner_value_passes():
    """Answer 150 inside the evidence range 100~200 → no flag containing "150"."""
    evidence = [_ev("100~200명 사이 추정")]
    outcome = check("질문", "약 150명 [1]", evidence)
    assert not _has_fabricated(outcome, "150")
def test_range_outer_value_flagged():
    """Answer 300 outside the evidence range 100~200 → flagged with "300"."""
    evidence = [_ev("100~200명 사이 추정")]
    outcome = check("질문", "300명 [1]", evidence)
    assert _has_fabricated(outcome, "300")
def test_unit_synonym_in_to_myeong():
    """Answer unit "인" against evidence unit "명" with the same number → no flag."""
    evidence = [_ev("총 50명이 모임.")]
    outcome = check("질문", "총 50인이 모임 [1]", evidence)
    assert not _has_fabricated(outcome)
def test_unit_synonym_percent_to_pct():
    """Answer "30퍼센트" against evidence "30%" → no flag."""
    evidence = [_ev("비율 30%이다.")]
    outcome = check("질문", "비율 30퍼센트 [1]", evidence)
    assert not _has_fabricated(outcome)
def test_citation_marker_both_sides():
    """Regression: a ``[n]`` marker left in the evidence used to merge with adjacent digits."""
    evidence = [_ev("[2] 5,000원이 정확")]
    outcome = check("질문", "가격 [1] 5,000원", evidence)
    assert not _has_fabricated(outcome)
def test_genuine_fabricated_number():
    """Answer 777 appears nowhere in the evidence (500, 300) → flagged with "777"."""
    evidence = [_ev("500명, 300명을 받음.")]
    outcome = check("질문", "결과 777명 [1]", evidence)
    assert _has_fabricated(outcome, "777")
def test_amount_4digit_tolerance_passes():
    """Answer "9,990원" against evidence "10,000원" → not flagged (tolerance case)."""
    evidence = [_ev("10,000원입니다.")]
    outcome = check("질문", "9,990원 [1]", evidence)
    assert not _has_fabricated(outcome)
def test_year_no_tolerance_flagged():
    """Answer "2024년" against evidence "2026년" → flagged with "2024" (no tolerance for years)."""
    evidence = [_ev("2026년에 발효")]
    outcome = check("질문", "2024년 [1]", evidence)
    assert _has_fabricated(outcome, "2024")
def test_article_no_tolerance_flagged():
    """Answer "제5조" against evidence "제6조" → flagged (no tolerance for article numbers)."""
    evidence = [_ev("제6조에 따라")]
    outcome = check("질문", "제5조에 명시 [1]", evidence)
    assert _has_fabricated(outcome)
def test_count_no_tolerance_flagged():
    """Answer "3회" against evidence "4회" → flagged (no tolerance for counts)."""
    evidence = [_ev("총 4회 적발")]
    outcome = check("질문", "총 3회 위반 [1]", evidence)
    assert _has_fabricated(outcome)
def test_three_digit_strict():
    """Answer "15개" against evidence "10개" → flagged with "15".

    NOTE(review): the name says "three digit" but both numbers are two-digit;
    confirm which digit-count rule this is meant to pin.
    """
    evidence = [_ev("총 10개")]
    outcome = check("질문", "총 15개 [1]", evidence)
    assert _has_fabricated(outcome, "15")
def test_single_digit_ignored():
    """A single digit with a quantity unit is ignored (false-positive guard)."""
    evidence = [_ev("관련 통계 별도")]
    outcome = check("질문", "총 3개 발생 [1]", evidence)
    assert not _has_fabricated(outcome, "3개")
def test_range_korean_butter_separator():
    """A range written as "100부터 200명까지" covers answer 150 → no flag containing "150"."""
    evidence = [_ev("100부터 200명까지 대상.")]
    outcome = check("질문", "약 150명 [1]", evidence)
    assert not _has_fabricated(outcome, "150")
# ─── fix1: unit-mismatch (Codex no-ship) ──────────────────
def test_won_vs_myeong_range_flagged():
    """Answer "150원" against evidence "100~200명": units differ, so the flag stays."""
    evidence = [_ev("대상은 100~200명")]
    outcome = check("질문", "약 150원이 든다 [1]", evidence)
    assert _has_fabricated(outcome, "150")
def test_won_vs_myeong_tolerance_flagged():
    """Answer "9,990원" against evidence "10,000명": tolerance does not cross units, flag stays."""
    evidence = [_ev("10,000명입니다.")]
    outcome = check("질문", "9,990원 [1]", evidence)
    assert _has_fabricated(outcome, "9990")
def test_pct_vs_myeong_range_flagged():
    """Answer "15%" against evidence "10~20명": units differ, so the flag stays."""
    evidence = [_ev("대상 10~20명")]
    outcome = check("질문", "약 15% [1]", evidence)
    assert _has_fabricated(outcome, "15")
# ─── fix3: 최대/최소 bound semantics ───────────────────────
def test_choedae_exact_boundary_flagged():
    """Evidence "최대 100명" (max) with answer "100명": the boundary value itself is not cleared."""
    evidence = [_ev("최대 100명까지 가능")]
    outcome = check("질문", "100명이다 [1]", evidence)
    assert _has_fabricated(outcome, "100")
def test_choeso_exact_boundary_flagged():
    """Evidence "최소 100명" (min) with answer "100명": the boundary value itself is not cleared."""
    evidence = [_ev("최소 100명 이상 필요")]
    outcome = check("질문", "100명이다 [1]", evidence)
    assert _has_fabricated(outcome, "100")
def test_choedae_inner_value_passes():
    """Evidence "최대 100명" (max) with answer "50명": inside the bound, cleared."""
    evidence = [_ev("최대 100명까지 가능")]
    outcome = check("질문", "50명이다 [1]", evidence)
    assert not _has_fabricated(outcome, "50")
def test_choeso_above_value_passes():
    """Evidence "최소 100명" (min) with answer "150명": inside the bound, cleared."""
    evidence = [_ev("최소 100명 이상 필요")]
    outcome = check("질문", "150명이다 [1]", evidence)
    assert not _has_fabricated(outcome, "150")
def test_choedae_outer_value_flagged():
    """Evidence "최대 100명" (max) with answer "200명": outside the bound, flagged."""
    evidence = [_ev("최대 100명까지 가능")]
    outcome = check("질문", "200명이다 [1]", evidence)
    assert _has_fabricated(outcome, "200")
-123
View File
@@ -1,123 +0,0 @@
"""Phase 3.5 fix3: re-gate Tier 0 — synthesis 자체 실패 처리.
`_detect_synthesis_failure()` 단위 테스트.
기존 버그:
synthesis LLM self-refuse (`sr.refused=True, status="completed"`) 또는
timeout/parse_failed/llm_error 시 grounding/verifier flag 0건 → re-gate else clean
분기로 빠져 `completeness="full"` 초기값이 남아 `full + refused=True` 모순.
baseline v1-400char 에서 24/223 (10.8%) 해당.
Tier 0 판정:
- LLM self-refuse (completed + refused) → "synthesis_self_refuse"
- mechanical fail (timeout/parse_failed/llm_error) → "synthesis_failed({status})"
- answer 공백 → "synthesis_failed({status})"
- 유효 답변 → None (기존 tier 1~7 경로)
"""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
from api.search import _detect_synthesis_failure
from services.search.synthesis_service import SynthesisResult
def _sr(
    status: str = "completed",
    answer: str | None = "ok",
    refused: bool = False,
    refuse_reason: str | None = None,
) -> SynthesisResult:
    """Build a ``SynthesisResult`` for the Tier 0 tests.

    Only the four parameters vary; citations, confidence, timing and the
    cache flag are fixed test values.
    """
    fixed_fields = {
        "used_citations": [],
        "confidence": "low",
        "elapsed_ms": 100.0,
        "cache_hit": False,
    }
    return SynthesisResult(
        status=status,  # type: ignore[arg-type]
        answer=answer,
        refused=refused,
        refuse_reason=refuse_reason,
        **fixed_fields,
    )
# ─── self-refuse 케이스 ──────────────────────────────────
def test_llm_self_refuse_completed():
    """The LLM returned refused=true in its JSON → "synthesis_self_refuse"."""
    synthesis = _sr(status="completed", answer=None, refused=True, refuse_reason="범위 밖")
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_self_refuse"
def test_llm_self_refuse_with_answer_still_refused():
    """refused=True is handled as Tier 0 even when an answer is present (consistency)."""
    synthesis = _sr(status="completed", answer="왜 답변함", refused=True)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_self_refuse"
# ─── mechanical failure 케이스 ──────────────────────────
def test_timeout():
    """status="timeout" → "synthesis_failed(timeout)"."""
    synthesis = _sr(status="timeout", answer=None, refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(timeout)"
def test_parse_failed():
    """status="parse_failed" → "synthesis_failed(parse_failed)"."""
    synthesis = _sr(status="parse_failed", answer=None, refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(parse_failed)"
def test_llm_error():
    """status="llm_error" → "synthesis_failed(llm_error)"."""
    synthesis = _sr(status="llm_error", answer=None, refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(llm_error)"
def test_refused_with_mechanical_fail_propagates_status():
    """refused=True with a non-completed status → the "synthesis_failed({status})" form."""
    synthesis = _sr(status="timeout", answer=None, refused=True)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(timeout)"
# ─── empty answer 케이스 ───────────────────────────────
def test_empty_answer_completed():
    """status="completed" but an empty answer → "synthesis_failed(completed)"."""
    synthesis = _sr(status="completed", answer="", refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(completed)"
def test_whitespace_only_answer():
    """An answer made only of spaces, tabs and newlines counts as empty."""
    synthesis = _sr(status="completed", answer=" \n\t ", refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(completed)"
def test_none_answer_completed():
    """answer=None with status="completed" → failed."""
    synthesis = _sr(status="completed", answer=None, refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict == "synthesis_failed(completed)"
# ─── 유효 답변 케이스 (None 반환) ──────────────────────
def test_valid_answer_returns_none():
    """status="completed" with an answer and refused=False passes Tier 0 (returns None)."""
    synthesis = _sr(
        status="completed",
        answer="교육 시간은 매년 6시간 이상이다 [1].",
        refused=False,
    )
    verdict = _detect_synthesis_failure(synthesis)
    assert verdict is None
def test_skipped_status_with_answer_passes():
    """status="skipped" is not a Tier 0 target.

    Per the original notes, the initial refusal gate already early-returns
    for skipped results, so they are assumed never to reach this point (and
    if one did, refused would be True).
    """
    synthesis = _sr(status="skipped", answer="abc", refused=False)
    verdict = _detect_synthesis_failure(synthesis)
    # Tier 0 does not fire here: there is an answer and it is not refused.
    assert verdict is None
-58
View File
@@ -1,58 +0,0 @@
"""Phase 3.5 B2: verifier _SEVERITY_MAP env flag 테스트.
VERIFIER_NUMERIC_PROMOTE 환경변수에 따른 _SEVERITY_MAP 변화 검증.
모듈은 import time 에 env 평가하므로 reload 필요.
"""
from __future__ import annotations
import importlib
import os
import sys
# tests/ → 프로젝트 루트 → app/
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
import pytest
def _reload_verifier(monkeypatch, value: str | None):
    """Set or clear ``VERIFIER_NUMERIC_PROMOTE``, then reload ``verifier_service``.

    The reload makes the module re-evaluate ``_SEVERITY_MAP`` under the new
    environment. Returns the reloaded module.

    NOTE(review): monkeypatch restores the environment variable after the
    test, but nothing here reloads the module again afterwards — confirm
    that later tests do not depend on its import-time state.
    """
    env_name = "VERIFIER_NUMERIC_PROMOTE"
    if value is not None:
        monkeypatch.setenv(env_name, value)
    else:
        monkeypatch.delenv(env_name, raising=False)

    from services.search import verifier_service

    return importlib.reload(verifier_service)
def test_severity_map_off_default(monkeypatch):
    """Env var unset → numeric_conflict "critical" stays "medium" (existing behavior)."""
    vs = _reload_verifier(monkeypatch, None)
    numeric = vs._SEVERITY_MAP["numeric_conflict"]
    assert numeric["critical"] == "medium"
    assert numeric["minor"] == "medium"
    assert vs._NUMERIC_PROMOTE is False
def test_severity_map_on_critical_promoted(monkeypatch):
    """VERIFIER_NUMERIC_PROMOTE=1 → only "critical" becomes "strong"; "minor" stays "medium"."""
    vs = _reload_verifier(monkeypatch, "1")
    numeric = vs._SEVERITY_MAP["numeric_conflict"]
    assert numeric["critical"] == "strong"
    assert numeric["minor"] == "medium"
    assert vs._NUMERIC_PROMOTE is True
def test_severity_map_off_explicit_zero(monkeypatch):
    """An explicit VERIFIER_NUMERIC_PROMOTE=0 is off, the same as the default."""
    vs = _reload_verifier(monkeypatch, "0")
    numeric = vs._SEVERITY_MAP["numeric_conflict"]
    assert numeric["critical"] == "medium"
    assert vs._NUMERIC_PROMOTE is False
def test_direct_negation_invariant(monkeypatch):
    """direct_negation is always "strong" regardless of the env var (safety invariant)."""
    env_values = [None, "0", "1"]
    for env_value in env_values:
        vs = _reload_verifier(monkeypatch, env_value)
        negation = vs._SEVERITY_MAP["direct_negation"]
        assert negation["critical"] == "strong"
        assert negation["minor"] == "strong"