feat(ask): Phase 3.5a guardrails (classifier + refusal gate + grounding + partial)
신규 파일: - classifier_service.py: exaone binary classifier (sufficient/insufficient) parallel with evidence, circuit breaker, timeout 5s - refusal_gate.py: multi-signal fusion (score + classifier) AND 조건, conservative fallback 3-tier (classifier 부재 시) - grounding_check.py: strong/weak flag 분리 strong: fabricated_number + intent_misalignment(important keywords) weak: uncited_claim + low_overlap + intent_misalignment(generic) re-gate: 2+ strong → refuse, 1 strong → partial - sentence_splitter.py: regex 기반 (Phase 3.5b KSS 업그레이드) - classifier.txt: exaone Y+ prompt (calibration examples 포함) - search_synthesis_partial.txt: partial answer 전용 프롬프트 - 102_ask_events.sql: /ask 관측 테이블 (completeness 3-분리 지표) - queries.yaml: Phase 3.5 smoke test 평가셋 10개 수정 파일: - search.py /ask: classifier parallel + refusal gate + grounding re-gate + defense_layers 로깅 + AskResponse completeness/aspects/confirmed_items - config.yaml: classifier model 섹션 (exaone3.5:7.8b GPU Ollama) - config.py: classifier optional 파싱 - AskAnswer.svelte: 4분기 렌더 (full/partial/insufficient/loading) - ask.ts: Completeness + ConfirmedItem 타입 P1 실측: exaone ternary 불안정 → binary gate 축소. partial은 grounding이 담당. 토론 9라운드 확정. plan: quiet-meandering-nova.md Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
- `/ask` endpoint wrapper (Phase 3.3 에서 추가)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Annotated, Literal
|
||||
|
||||
@@ -20,8 +21,11 @@ from core.auth import get_current_user
|
||||
from core.database import get_session
|
||||
from core.utils import setup_logger
|
||||
from models.user import User
|
||||
from services.search.classifier_service import ClassifierResult, classify
|
||||
from services.search.evidence_service import EvidenceItem, extract_evidence
|
||||
from services.search.fusion_service import DEFAULT_FUSION
|
||||
from services.search.grounding_check import check as grounding_check
|
||||
from services.search.refusal_gate import RefusalDecision, decide as refusal_decide
|
||||
from services.search.search_pipeline import PipelineResult, run_search
|
||||
from services.search.synthesis_service import SynthesisResult, synthesize
|
||||
from services.search_telemetry import record_search_event
|
||||
@@ -216,6 +220,14 @@ class Citation(BaseModel):
|
||||
rerank_score: float
|
||||
|
||||
|
||||
class ConfirmedItem(BaseModel):
    """One per-aspect entry of a partial `/ask` answer."""

    # Label of the query aspect this entry addresses.
    aspect: str
    # Answer text for that aspect.
    text: str
    # Citation numbers attached to this entry.
    citations: list[int]
|
||||
|
||||
|
||||
class AskDebug(BaseModel):
|
||||
"""`/ask?debug=true` 응답 확장."""
|
||||
|
||||
@@ -230,10 +242,12 @@ class AskDebug(BaseModel):
|
||||
synthesis_prompt_preview: str | None = None
|
||||
synthesis_raw_preview: str | None = None
|
||||
hallucination_flags: list[str] = []
|
||||
# Phase 3.5a: per-layer defense 로깅
|
||||
defense_layers: dict | None = None
|
||||
|
||||
|
||||
class AskResponse(BaseModel):
|
||||
"""`/ask` 응답. `/search` 의 SearchResult 는 그대로 재사용."""
|
||||
"""`/ask` 응답. Phase 3.5a: completeness + aspects 추가."""
|
||||
|
||||
results: list[SearchResult]
|
||||
ai_answer: str | None
|
||||
@@ -247,6 +261,11 @@ class AskResponse(BaseModel):
|
||||
no_results_reason: str | None
|
||||
query: str
|
||||
total: int
|
||||
# Phase 3.5a
|
||||
completeness: Literal["full", "partial", "insufficient"] = "full"
|
||||
covered_aspects: list[str] | None = None
|
||||
missing_aspects: list[str] | None = None
|
||||
confirmed_items: list[ConfirmedItem] | None = None
|
||||
debug: AskDebug | None = None
|
||||
|
||||
|
||||
@@ -355,73 +374,211 @@ async def ask(
|
||||
limit: int = Query(10, ge=1, le=20, description="synthesis 입력 상한"),
|
||||
debug: bool = Query(False, description="evidence/synthesis 중간 상태 노출"),
|
||||
):
|
||||
"""근거 기반 AI 답변 (Phase 3.3).
|
||||
"""근거 기반 AI 답변 (Phase 3.5a).
|
||||
|
||||
`/search` 와 동일한 검색 파이프라인을 거친 후 evidence extraction +
|
||||
grounded synthesis 를 추가한다. `mode`, `rerank`, `analyze` 는 품질 보장을
|
||||
위해 강제 고정 (hybrid / True / True).
|
||||
|
||||
실패 경로(timeout/parse_failed/refused/...) 에서도 `results` 는 항상 반환.
|
||||
Phase 3.3 기반 + classifier parallel + refusal gate + grounding re-gate.
|
||||
실패 경로에서도 `results` 는 항상 반환.
|
||||
"""
|
||||
t_total = time.perf_counter()
|
||||
defense_log: dict = {} # per-layer flag snapshot
|
||||
|
||||
# 1. 검색 파이프라인 (run_search — /search 와 동일 로직, 단일 진실 소스)
|
||||
# 1. 검색 파이프라인
|
||||
pr = await run_search(
|
||||
session,
|
||||
q,
|
||||
mode="hybrid",
|
||||
limit=limit,
|
||||
fusion=DEFAULT_FUSION,
|
||||
rerank=True,
|
||||
analyze=True,
|
||||
session, q, mode="hybrid", limit=limit,
|
||||
fusion=DEFAULT_FUSION, rerank=True, analyze=True,
|
||||
)
|
||||
|
||||
# 2. Evidence extraction (rule + LLM span select, 1 batched call)
|
||||
# 2. Evidence + Classifier 병렬
|
||||
t_ev = time.perf_counter()
|
||||
evidence, ev_skip = await extract_evidence(q, pr.results)
|
||||
evidence_task = asyncio.create_task(extract_evidence(q, pr.results))
|
||||
|
||||
# classifier input: top 3 chunks meta + rerank scores
|
||||
top_chunks = [
|
||||
{
|
||||
"title": r.title or "",
|
||||
"section": r.section_title or "",
|
||||
"snippet": (r.snippet or "")[:200],
|
||||
}
|
||||
for r in pr.results[:3]
|
||||
]
|
||||
rerank_scores_top = [
|
||||
r.rerank_score if r.rerank_score is not None else r.score
|
||||
for r in pr.results[:3]
|
||||
]
|
||||
classifier_task = asyncio.create_task(
|
||||
classify(q, top_chunks, rerank_scores_top)
|
||||
)
|
||||
|
||||
evidence, ev_skip = await evidence_task
|
||||
ev_ms = (time.perf_counter() - t_ev) * 1000
|
||||
|
||||
# 3. Grounded synthesis (gemma-4, 15s timeout, citation 검증)
|
||||
# classifier await (timeout 보호 — classifier_service 내부에도 있지만 여기서 이중 보호)
|
||||
try:
|
||||
classifier_result = await asyncio.wait_for(classifier_task, timeout=6.0)
|
||||
except (asyncio.TimeoutError, Exception):
|
||||
classifier_result = ClassifierResult("timeout", None, [], [], 0.0)
|
||||
|
||||
defense_log["classifier"] = {
|
||||
"status": classifier_result.status,
|
||||
"verdict": classifier_result.verdict,
|
||||
"covered_aspects": classifier_result.covered_aspects,
|
||||
"missing_aspects": classifier_result.missing_aspects,
|
||||
"elapsed_ms": classifier_result.elapsed_ms,
|
||||
}
|
||||
|
||||
# 3. Refusal gate (multi-signal fusion)
|
||||
all_rerank_scores = [
|
||||
e.rerank_score for e in evidence
|
||||
] if evidence else rerank_scores_top
|
||||
decision = refusal_decide(all_rerank_scores, classifier_result)
|
||||
|
||||
defense_log["score_gate"] = {
|
||||
"max": max(all_rerank_scores) if all_rerank_scores else 0.0,
|
||||
"agg_top3": sum(sorted(all_rerank_scores, reverse=True)[:3]),
|
||||
}
|
||||
defense_log["refusal"] = {
|
||||
"refused": decision.refused,
|
||||
"rule_triggered": decision.rule_triggered,
|
||||
}
|
||||
|
||||
if decision.refused:
|
||||
total_ms = (time.perf_counter() - t_total) * 1000
|
||||
no_reason = "관련 근거를 찾지 못했습니다."
|
||||
if not pr.results:
|
||||
no_reason = "검색 결과가 없습니다."
|
||||
logger.info(
|
||||
"ask REFUSED query=%r rule=%s max_score=%.2f total=%.0f",
|
||||
q[:80], decision.rule_triggered,
|
||||
max(all_rerank_scores) if all_rerank_scores else 0.0, total_ms,
|
||||
)
|
||||
# telemetry
|
||||
background_tasks.add_task(
|
||||
record_search_event, q, user.id, pr.results, "hybrid",
|
||||
pr.confidence_signal, pr.analyzer_confidence,
|
||||
)
|
||||
debug_obj = None
|
||||
if debug:
|
||||
debug_obj = AskDebug(
|
||||
timing_ms={**pr.timing_ms, "evidence_ms": ev_ms, "ask_total_ms": total_ms},
|
||||
search_notes=pr.notes,
|
||||
confidence_signal=pr.confidence_signal,
|
||||
evidence_candidate_count=len(evidence),
|
||||
evidence_kept_count=len(evidence),
|
||||
evidence_skip_reason=ev_skip,
|
||||
synthesis_cache_hit=False,
|
||||
hallucination_flags=[],
|
||||
defense_layers=defense_log,
|
||||
)
|
||||
return AskResponse(
|
||||
results=pr.results,
|
||||
ai_answer=None,
|
||||
citations=[],
|
||||
synthesis_status="skipped",
|
||||
synthesis_ms=0.0,
|
||||
confidence=None,
|
||||
refused=True,
|
||||
no_results_reason=no_reason,
|
||||
query=q,
|
||||
total=len(pr.results),
|
||||
completeness="insufficient",
|
||||
covered_aspects=classifier_result.covered_aspects or None,
|
||||
missing_aspects=classifier_result.missing_aspects or None,
|
||||
debug=debug_obj,
|
||||
)
|
||||
|
||||
# 4. Synthesis
|
||||
t_synth = time.perf_counter()
|
||||
sr = await synthesize(q, evidence, debug=debug)
|
||||
synth_ms = (time.perf_counter() - t_synth) * 1000
|
||||
|
||||
# 5. Grounding check (post-synthesis) + re-gate
|
||||
grounding = grounding_check(q, sr.answer or "", evidence)
|
||||
defense_log["grounding"] = {
|
||||
"strong": grounding.strong_flags,
|
||||
"weak": grounding.weak_flags,
|
||||
}
|
||||
|
||||
# Completeness 결정: grounding 기반 (classifier 는 binary gate 만)
|
||||
completeness: Literal["full", "partial", "insufficient"] = "full"
|
||||
covered_aspects = classifier_result.covered_aspects or None
|
||||
missing_aspects = classifier_result.missing_aspects or None
|
||||
confirmed_items: list[ConfirmedItem] | None = None
|
||||
|
||||
if len(grounding.strong_flags) >= 2:
|
||||
# Re-gate: multiple strong → refuse
|
||||
completeness = "insufficient"
|
||||
sr.answer = None
|
||||
sr.refused = True
|
||||
sr.confidence = None
|
||||
defense_log["re_gate"] = "refuse(2+strong)"
|
||||
elif grounding.strong_flags:
|
||||
# Single strong → partial downgrade
|
||||
completeness = "partial"
|
||||
sr.confidence = "low"
|
||||
defense_log["re_gate"] = "partial(1strong)"
|
||||
elif grounding.weak_flags:
|
||||
# Weak → confidence lower only
|
||||
if sr.confidence == "high":
|
||||
sr.confidence = "medium"
|
||||
defense_log["re_gate"] = "conf_lower(weak)"
|
||||
|
||||
# Confidence cap from refusal gate (classifier 부재 시 conservative)
|
||||
if decision.confidence_cap and sr.confidence:
|
||||
conf_rank = {"low": 0, "medium": 1, "high": 2}
|
||||
if conf_rank.get(sr.confidence, 0) > conf_rank.get(decision.confidence_cap, 2):
|
||||
sr.confidence = decision.confidence_cap
|
||||
|
||||
# Partial 이면 max confidence = medium
|
||||
if completeness == "partial" and sr.confidence == "high":
|
||||
sr.confidence = "medium"
|
||||
|
||||
sr.hallucination_flags.extend(
|
||||
[f"strong:{f}" for f in grounding.strong_flags]
|
||||
+ [f"weak:{f}" for f in grounding.weak_flags]
|
||||
)
|
||||
|
||||
total_ms = (time.perf_counter() - t_total) * 1000
|
||||
|
||||
# 4. 응답 구성
|
||||
# 6. 응답 구성
|
||||
citations = _build_citations(evidence, sr.used_citations)
|
||||
no_reason = _map_no_results_reason(pr, evidence, ev_skip, sr)
|
||||
if completeness == "insufficient" and not no_reason:
|
||||
no_reason = "답변 검증에서 복수 오류 감지"
|
||||
|
||||
logger.info(
|
||||
"ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s refused=%s ev_ms=%.0f synth_ms=%.0f total=%.0f",
|
||||
q[:80],
|
||||
len(pr.results),
|
||||
len(evidence),
|
||||
len(citations),
|
||||
sr.status,
|
||||
sr.confidence or "-",
|
||||
sr.refused,
|
||||
ev_ms,
|
||||
synth_ms,
|
||||
total_ms,
|
||||
"ask query=%r results=%d evidence=%d cite=%d synth=%s conf=%s completeness=%s "
|
||||
"refused=%s grounding_strong=%d grounding_weak=%d ev_ms=%.0f synth_ms=%.0f total=%.0f",
|
||||
q[:80], len(pr.results), len(evidence), len(citations),
|
||||
sr.status, sr.confidence or "-", completeness,
|
||||
sr.refused, len(grounding.strong_flags), len(grounding.weak_flags),
|
||||
ev_ms, synth_ms, total_ms,
|
||||
)
|
||||
|
||||
# 5. telemetry — 기존 record_search_event 재사용 (Phase 0.3 호환)
|
||||
# 7. telemetry
|
||||
background_tasks.add_task(
|
||||
record_search_event,
|
||||
q,
|
||||
user.id,
|
||||
pr.results,
|
||||
"hybrid",
|
||||
pr.confidence_signal,
|
||||
pr.analyzer_confidence,
|
||||
record_search_event, q, user.id, pr.results, "hybrid",
|
||||
pr.confidence_signal, pr.analyzer_confidence,
|
||||
)
|
||||
|
||||
debug_obj = (
|
||||
_build_ask_debug(pr, evidence, ev_skip, sr, ev_ms, synth_ms, total_ms)
|
||||
if debug
|
||||
else None
|
||||
)
|
||||
debug_obj = None
|
||||
if debug:
|
||||
timing = dict(pr.timing_ms)
|
||||
timing["evidence_ms"] = ev_ms
|
||||
timing["synthesis_ms"] = synth_ms
|
||||
timing["ask_total_ms"] = total_ms
|
||||
debug_obj = AskDebug(
|
||||
timing_ms=timing,
|
||||
search_notes=pr.notes,
|
||||
query_analysis=pr.query_analysis,
|
||||
confidence_signal=pr.confidence_signal,
|
||||
evidence_candidate_count=len(evidence),
|
||||
evidence_kept_count=len(evidence),
|
||||
evidence_skip_reason=ev_skip,
|
||||
synthesis_cache_hit=sr.cache_hit,
|
||||
synthesis_raw_preview=sr.raw_preview,
|
||||
hallucination_flags=sr.hallucination_flags,
|
||||
defense_layers=defense_log,
|
||||
)
|
||||
|
||||
return AskResponse(
|
||||
results=pr.results,
|
||||
@@ -434,5 +591,9 @@ async def ask(
|
||||
no_results_reason=no_reason,
|
||||
query=q,
|
||||
total=len(pr.results),
|
||||
completeness=completeness,
|
||||
covered_aspects=covered_aspects,
|
||||
missing_aspects=missing_aspects,
|
||||
confirmed_items=confirmed_items,
|
||||
debug=debug_obj,
|
||||
)
|
||||
|
||||
@@ -24,6 +24,8 @@ class AIConfig(BaseModel):
|
||||
embedding: AIModelConfig
|
||||
vision: AIModelConfig
|
||||
rerank: AIModelConfig
|
||||
# Phase 3.5a: exaone classifier (optional — 없으면 score-only gate)
|
||||
classifier: AIModelConfig | None = None
|
||||
|
||||
|
||||
class Settings(BaseModel):
|
||||
@@ -79,6 +81,11 @@ def load_settings() -> Settings:
|
||||
embedding=AIModelConfig(**ai_raw["models"]["embedding"]),
|
||||
vision=AIModelConfig(**ai_raw["models"]["vision"]),
|
||||
rerank=AIModelConfig(**ai_raw["models"]["rerank"]),
|
||||
classifier=(
|
||||
AIModelConfig(**ai_raw["models"]["classifier"])
|
||||
if "classifier" in ai_raw.get("models", {})
|
||||
else None
|
||||
),
|
||||
)
|
||||
|
||||
if "nas" in raw:
|
||||
|
||||
33
app/prompts/classifier.txt
Normal file
33
app/prompts/classifier.txt
Normal file
@@ -0,0 +1,33 @@
|
||||
You are an answerability judge. Given a query and evidence chunks, determine if the evidence can answer the query. Respond ONLY in JSON.
|
||||
|
||||
## CALIBRATION (CRITICAL)
|
||||
- verdict=full: evidence is SUFFICIENT to answer the CORE of the query. Missing minor details does NOT make it insufficient.
|
||||
- verdict=partial: evidence covers SOME major aspects but CLEARLY MISSES others the user explicitly asked about.
|
||||
- verdict=insufficient: evidence has NO relevant information for the query, or is completely off-topic.
|
||||
|
||||
Example: Query="제6장 주요 내용", Evidence covers 제6장 definition+scope → verdict=full (core is covered).
|
||||
Example: Query="제6장 처벌 조항", Evidence covers 제6장 definition but NOT 처벌 → verdict=partial.
|
||||
Example: Query="감귤 출하량", Evidence about 산업안전보건법 → verdict=insufficient.
|
||||
|
||||
## Rules
|
||||
1. Your "verdict" must be based ONLY on whether the CONTENT semantically answers the query. Ignore retrieval scores for this field.
|
||||
2. "covered_aspects": query aspects that evidence covers. Korean labels for Korean queries.
|
||||
3. "missing_aspects": query aspects that evidence does NOT cover. Korean labels.
|
||||
4. Keep aspects concise (2-5 words each), non-overlapping.
|
||||
|
||||
## Output Schema
|
||||
{
|
||||
"verdict": "full" | "partial" | "insufficient",
|
||||
"covered_aspects": ["aspect1"],
|
||||
"missing_aspects": ["aspect2"],
|
||||
"confidence": "high" | "medium" | "low"
|
||||
}
|
||||
|
||||
## Query
|
||||
{query}
|
||||
|
||||
## Evidence chunks:
|
||||
{chunks}
|
||||
|
||||
## Retrieval scores (for reference only, NOT for verdict):
|
||||
[{scores}]
|
||||
34
app/prompts/search_synthesis_partial.txt
Normal file
34
app/prompts/search_synthesis_partial.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
You are a grounded answer synthesizer handling a PARTIAL answer case. Some aspects of the query CAN be answered, others CANNOT. Respond ONLY in JSON.
|
||||
|
||||
## Task
|
||||
Answer ONLY the covered aspects. Do NOT attempt to answer missing aspects.
|
||||
|
||||
## Output Schema
|
||||
{
|
||||
"confirmed_items": [
|
||||
{"aspect": "aspect label", "text": "1~2 sentence answer", "citations": [1, 2]}
|
||||
],
|
||||
"confidence": "medium" | "low",
|
||||
"refused": false
|
||||
}
|
||||
|
||||
## Rules
|
||||
- Each confirmed_item: aspect label + 1~2 sentences + inline [n] citations
|
||||
- ONLY use facts present in evidence. No outside knowledge, no guessing.
|
||||
- Do NOT mention or address missing_aspects in your text.
|
||||
- Korean query → Korean answer / English → English
|
||||
- confidence: medium (2+ strong evidence matches) / low (1 or weak)
|
||||
- Max total text: 400 chars across all items
|
||||
- 모든 주장 문장 끝에 [n] 필수
|
||||
|
||||
## Covered aspects (answer these):
|
||||
{covered_aspects}
|
||||
|
||||
## Missing aspects (do NOT answer these):
|
||||
{missing_aspects}
|
||||
|
||||
## Query
|
||||
{query}
|
||||
|
||||
## Evidence
|
||||
{numbered_evidence}
|
||||
150
app/services/search/classifier_service.py
Normal file
150
app/services/search/classifier_service.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Answerability classifier (Phase 3.5a).
|
||||
|
||||
exaone3.5:7.8b GPU Ollama 기반. MLX gate 밖 — evidence extraction 과 병렬 실행.
|
||||
|
||||
P1 실측 결과: ternary (full/partial/insufficient) 불안정 → **binary (sufficient/insufficient)**.
|
||||
"full" vs "partial" 구분은 grounding_check 의 intent alignment 이 담당.
|
||||
|
||||
Classifier verdict 는 "relevant evidence 가 있나" 의 binary 판단.
|
||||
covered_aspects / missing_aspects 는 refusal gate 판단에는 쓰이지 않으며, defense_layers 로깅과 /ask 응답 필드(covered_aspects / missing_aspects) 노출용으로만 유지한다.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
from ai.client import AIClient, _load_prompt, parse_json_response
|
||||
from core.config import settings
|
||||
from core.utils import setup_logger
|
||||
|
||||
logger = setup_logger("classifier")
|
||||
|
||||
LLM_TIMEOUT_MS = 5000
|
||||
CIRCUIT_THRESHOLD = 5
|
||||
CIRCUIT_RECOVERY_SEC = 60
|
||||
|
||||
_failure_count = 0
|
||||
_circuit_open_until: float | None = None
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ClassifierResult:
|
||||
status: Literal["ok", "timeout", "error", "circuit_open", "skipped"]
|
||||
verdict: Literal["sufficient", "insufficient"] | None
|
||||
covered_aspects: list[str]
|
||||
missing_aspects: list[str]
|
||||
elapsed_ms: float
|
||||
|
||||
|
||||
try:
|
||||
CLASSIFIER_PROMPT = _load_prompt("classifier.txt")
|
||||
except FileNotFoundError:
|
||||
CLASSIFIER_PROMPT = ""
|
||||
logger.warning("classifier.txt not found — classifier will always skip")
|
||||
|
||||
|
||||
def _build_input(
|
||||
query: str,
|
||||
top_chunks: list[dict],
|
||||
rerank_scores: list[float],
|
||||
) -> str:
|
||||
"""Y+ input (content + scores with role separation)."""
|
||||
chunk_block = "\n".join(
|
||||
f"[{i+1}] title: {c.get('title','')}\n"
|
||||
f" section: {c.get('section','')}\n"
|
||||
f" snippet: {c.get('snippet','')}"
|
||||
for i, c in enumerate(top_chunks[:3])
|
||||
)
|
||||
scores_str = ", ".join(f"{s:.2f}" for s in rerank_scores[:3])
|
||||
return (
|
||||
CLASSIFIER_PROMPT
|
||||
.replace("{query}", query)
|
||||
.replace("{chunks}", chunk_block)
|
||||
.replace("{scores}", scores_str)
|
||||
)
|
||||
|
||||
|
||||
def _record_failure() -> None:
    """Count one failed classifier call and open the circuit at the threshold."""
    global _failure_count, _circuit_open_until
    _failure_count += 1
    if _failure_count >= CIRCUIT_THRESHOLD:
        # Monotonic clock: the recovery window must not stretch or shrink
        # when the wall clock is adjusted.
        _circuit_open_until = time.monotonic() + CIRCUIT_RECOVERY_SEC
        logger.error(f"classifier circuit OPEN for {CIRCUIT_RECOVERY_SEC}s")


def _clean_aspects(value: object) -> list[str]:
    """Keep only non-empty string items of an LLM-provided aspect list."""
    if not isinstance(value, list):
        return []
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


async def classify(
    query: str,
    top_chunks: list[dict],
    rerank_scores: list[float],
) -> ClassifierResult:
    """Run the binary answerability classifier for one query.

    Intended to run in parallel with evidence extraction.

    Args:
        query: User query.
        top_chunks: Chunk metadata dicts passed to ``_build_input``.
        rerank_scores: Retrieval scores passed to ``_build_input``.

    Returns:
        ClassifierResult. On status "ok" the verdict is "sufficient" or
        "insufficient", or None when the model's verdict is unrecognised.
        For every other status ("circuit_open", "skipped", "timeout",
        "error") the verdict is None and the caller must fall back.
    """
    global _failure_count
    t_start = time.perf_counter()

    # Circuit breaker: skip the call while the recovery window is active.
    if _circuit_open_until is not None and time.monotonic() < _circuit_open_until:
        return ClassifierResult("circuit_open", None, [], [], 0.0)

    if not CLASSIFIER_PROMPT:
        return ClassifierResult("skipped", None, [], [], 0.0)

    model_cfg = getattr(settings.ai, "classifier", None)
    if model_cfg is None:
        return ClassifierResult("skipped", None, [], [], 0.0)

    prompt = _build_input(query, top_chunks, rerank_scores)
    client = AIClient()
    try:
        # The MLX gate is deliberately not used here; per the original note,
        # the Ollama-hosted model handles concurrent requests.
        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
            raw = await client._request(model_cfg, prompt)
    except asyncio.TimeoutError:
        _record_failure()
        logger.warning("classifier timeout")
        return ClassifierResult(
            "timeout", None, [], [],
            (time.perf_counter() - t_start) * 1000,
        )
    except Exception as e:
        # Best-effort component: any failure degrades to the caller's
        # no-classifier fallback instead of failing the request.
        _record_failure()
        logger.warning(f"classifier error: {e}")
        return ClassifierResult(
            "error", None, [], [],
            (time.perf_counter() - t_start) * 1000,
        )
    else:
        _failure_count = 0
    finally:
        await client.close()

    elapsed_ms = (time.perf_counter() - t_start) * 1000
    parsed = parse_json_response(raw)
    if not isinstance(parsed, dict):
        logger.warning("classifier parse failed raw=%r", (raw or "")[:200])
        return ClassifierResult("error", None, [], [], elapsed_ms)

    # Ternary -> binary mapping. Normalise case and whitespace so that
    # "Insufficient" or " full " are not silently treated as "no verdict".
    raw_verdict = parsed.get("verdict", "")
    label = raw_verdict.strip().lower() if isinstance(raw_verdict, str) else ""
    verdict: Literal["sufficient", "insufficient"] | None
    if label == "insufficient":
        verdict = "insufficient"
    elif label in ("full", "partial", "sufficient"):
        verdict = "sufficient"
    else:
        verdict = None

    # The aspect lists end up in list[str] response fields, so drop anything
    # the model emitted that is not a non-empty string.
    covered = _clean_aspects(parsed.get("covered_aspects"))
    missing = _clean_aspects(parsed.get("missing_aspects"))

    logger.info(
        "classifier ok query=%r verdict=%s (raw=%s) covered=%d missing=%d elapsed_ms=%.0f",
        query[:60], verdict, raw_verdict, len(covered), len(missing), elapsed_ms,
    )
    return ClassifierResult("ok", verdict, covered, missing, elapsed_ms)
|
||||
131
app/services/search/grounding_check.py
Normal file
131
app/services/search/grounding_check.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""Grounding check — post-synthesis 검증 (Phase 3.5a).
|
||||
|
||||
Strong/weak flag 분리:
|
||||
- **Strong** (→ partial 강등 or refuse): fabricated_number, intent_misalignment(important)
|
||||
- **Weak** (→ confidence lower only): uncited_claim, low_overlap, intent_misalignment(generic)
|
||||
|
||||
Re-gate 로직 (Phase 3.5a 9라운드 토론 결과):
|
||||
- strong 1개 → partial 강등
|
||||
- strong 2개 이상 → refuse
|
||||
- weak → confidence 하향만 (high → medium; completeness 는 변경 없음)
|
||||
|
||||
Intent alignment (rule-based):
|
||||
- query 의 핵심 명사가 answer 에 등장하는지 확인
|
||||
- "처벌" 같은 중요 키워드 누락은 strong
|
||||
- "주요", "관련" 같은 generic 은 무시
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from core.utils import setup_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .evidence_service import EvidenceItem
|
||||
|
||||
logger = setup_logger("grounding")
|
||||
|
||||
# "주요", "관련" 등 intent alignment 에서 제외할 generic 단어
|
||||
GENERIC_TERMS = frozenset({
|
||||
"주요", "관련", "내용", "정의", "기준", "방법", "설명", "개요",
|
||||
"대한", "위한", "대해", "무엇", "어떤", "어떻게", "있는",
|
||||
"하는", "되는", "이런", "그런", "이것", "그것",
|
||||
})
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class GroundingResult:
|
||||
strong_flags: list[str]
|
||||
weak_flags: list[str]
|
||||
|
||||
|
||||
def _extract_number_literals(text: str) -> set[str]:
|
||||
"""숫자 + 단위 추출 + normalize."""
|
||||
raw = set(re.findall(r'\d[\d,.]*\s*[명인개%년월일조항호세]\w{0,2}', text))
|
||||
normalized = set()
|
||||
for r in raw:
|
||||
normalized.add(r.strip())
|
||||
num_only = re.match(r'[\d,.]+', r)
|
||||
if num_only:
|
||||
normalized.add(num_only.group().replace(',', ''))
|
||||
# 단독 숫자도 추출
|
||||
for d in re.findall(r'\b\d+\b', text):
|
||||
normalized.add(d)
|
||||
return normalized
|
||||
|
||||
|
||||
def _extract_content_tokens(text: str) -> set[str]:
|
||||
"""한국어 2자 이상 명사 + 영어 3자 이상 단어."""
|
||||
return set(re.findall(r'[가-힣]{2,}|[a-zA-Z]{3,}', text))
|
||||
|
||||
|
||||
def check(
    query: str,
    answer: str,
    evidence: list[EvidenceItem],
) -> GroundingResult:
    """Verify a synthesized answer against its evidence and the query intent.

    Strong flags:
        - ``fabricated_number:<literal>``: a number in the answer whose digits
          do not occur inside any number extracted from the evidence.
        - ``intent_misalignment:<terms>``: non-generic query tokens that are
          absent from the answer.

    Weak flags:
        - ``intent_misalignment_generic:missing(<terms>)``: only generic query
          tokens are missing, and they are more than half of the query tokens.
        - ``uncited_claim:<prefix>``: a sentence longer than 20 characters
          without a ``[n]`` citation.
        - ``low_overlap:<ratio>``: under 40% of answer tokens occur in the
          evidence.

    Args:
        query: User query.
        answer: Synthesized answer text; an empty answer yields no flags.
        evidence: Evidence items; only ``span_text`` is read. An empty list
            yields no flags.

    Returns:
        GroundingResult holding the strong and weak flag lists.
    """
    strong: list[str] = []
    weak: list[str] = []

    if not answer or not evidence:
        return GroundingResult([], [])

    evidence_text = " ".join(e.span_text for e in evidence)

    # ── Strong 1: fabricated number ──
    # Citation markers such as "[3]" are references, not factual numbers, so
    # they are removed before number extraction.
    answer_without_cites = re.sub(r'\[\d+\]', ' ', answer)
    evidence_digits = {
        re.sub(r'[^\d]', '', en) for en in _extract_number_literals(evidence_text)
    }
    # _extract_number_literals returns several spellings of one number
    # ("30명" and "30"). Each distinct digit string is judged once, so a single
    # fabricated number produces exactly one strong flag and cannot by itself
    # reach the "2+ strong → refuse" re-gate. Longest spelling first keeps the
    # unit in the reported flag; the sort also makes the output deterministic.
    judged: set[str] = set()
    for num in sorted(
        _extract_number_literals(answer_without_cites),
        key=lambda literal: (-len(literal), literal),
    ):
        digits_only = re.sub(r'[^\d]', '', num)
        if not digits_only or digits_only in judged:
            continue
        judged.add(digits_only)
        if not any(digits_only in ed for ed in evidence_digits):
            strong.append(f"fabricated_number:{num}")

    # ── Strong/Weak 2: query-answer intent alignment ──
    # NOTE(review): this is an exact-token comparison, so a query token with
    # a particle attached ("조항은") will not match "조항" in the answer.
    # Confirm whether that strictness is intended.
    query_content = _extract_content_tokens(query)
    answer_content = _extract_content_tokens(answer)
    if query_content:
        # Sorted so the terms reported in the flag are stable across runs.
        missing_terms = sorted(query_content - answer_content)
        important_missing = [t for t in missing_terms if t not in GENERIC_TERMS]
        if important_missing:
            strong.append(
                f"intent_misalignment:{','.join(important_missing[:3])}"
            )
        elif len(missing_terms) > len(query_content) * 0.5:
            weak.append(
                f"intent_misalignment_generic:"
                f"missing({','.join(missing_terms[:5])})"
            )

    # ── Weak 1: uncited claim ──
    sentences = re.split(r'(?<=[.!?。])\s+', answer)
    for s in sentences:
        if len(s.strip()) > 20 and not re.search(r'\[\d+\]', s):
            weak.append(f"uncited_claim:{s[:40]}")

    # ── Weak 2: token overlap ──
    evidence_tokens = _extract_content_tokens(evidence_text)
    if answer_content:
        overlap = len(answer_content & evidence_tokens) / len(answer_content)
        if overlap < 0.4:
            weak.append(f"low_overlap:{overlap:.2f}")

    if strong or weak:
        logger.info(
            "grounding query=%r strong=%d weak=%d flags=%s",
            query[:60],
            len(strong),
            len(weak),
            ",".join(strong[:3] + weak[:3]),
        )

    return GroundingResult(strong, weak)
|
||||
105
app/services/search/refusal_gate.py
Normal file
105
app/services/search/refusal_gate.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""Refusal gate — multi-signal fusion (Phase 3.5a).
|
||||
|
||||
Score gate (deterministic) + classifier verdict (semantic, binary) 를 독립 평가 후 합성.
|
||||
Classifier 부재 시 3-tier conservative fallback.
|
||||
|
||||
P1 실측 결과: exaone ternary 불안정 → binary (sufficient/insufficient) 로 축소.
|
||||
"full" vs "partial" 구분은 grounding check (intent alignment) 가 담당.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Literal
|
||||
|
||||
from core.utils import setup_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .classifier_service import ClassifierResult
|
||||
|
||||
logger = setup_logger("refusal_gate")
|
||||
|
||||
# Placeholder thresholds — Phase 3.5b 에서 실측 기반 tuning
|
||||
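# Score gate 는 AND 조건: max_score < SCORE_MAX_REFUSE 와 agg_top3 < SCORE_AGG_REFUSE 를 둘 다 만족해야 refuse (false refusal 방어). classifier=insufficient 는 score 와 무관하게 단독으로 refuse.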
|
||||
SCORE_MAX_REFUSE = 0.25
|
||||
SCORE_AGG_REFUSE = 0.70
|
||||
|
||||
# Conservative fallback tiers (classifier 부재 시)
|
||||
CONSERVATIVE_WEAK = 0.35
|
||||
CONSERVATIVE_MID = 0.55
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class RefusalDecision:
|
||||
refused: bool
|
||||
confidence_cap: Literal["high", "medium", "low"] | None # None = no cap
|
||||
rule_triggered: str | None # 디버깅: 어느 signal 이 결정에 기여?
|
||||
|
||||
|
||||
def decide(
    rerank_scores: list[float],
    classifier: ClassifierResult | None,
) -> RefusalDecision:
    """Combine the score gate and the binary classifier verdict.

    Args:
        rerank_scores: Retrieval scores of the candidate evidence; may be empty.
        classifier: Classifier outcome, or None. A result whose verdict is
            None is treated the same as a missing classifier.

    Returns:
        RefusalDecision. ``refused=True`` means synthesis should be skipped;
        ``confidence_cap`` is an upper bound for the synthesis confidence.
    """
    max_score = max(rerank_scores) if rerank_scores else 0.0
    agg_top3 = sum(sorted(rerank_scores, reverse=True)[:3])

    # ── No usable classifier verdict → 3-tier conservative fallback ──
    if not classifier or classifier.verdict is None:
        if max_score < CONSERVATIVE_WEAK:
            return RefusalDecision(
                True, None, "conservative_refuse(no_classifier)",
            )
        if max_score < CONSERVATIVE_MID:
            return RefusalDecision(
                False, "low", "conservative_low(no_classifier)",
            )
        return RefusalDecision(
            False, "medium", "conservative_medium(no_classifier)",
        )

    # ── Classifier verdict available (normal path) ──
    if classifier.verdict == "insufficient":
        # The original note says an evidence-quality override for this case
        # is handled by the caller, outside this function.
        # NOTE(review): the /ask handler returns immediately on refused=True
        # and shows no such override; confirm whether it is still planned.
        logger.info(
            "refusal gate: classifier=insufficient max=%.2f agg=%.2f",
            max_score, agg_top3,
        )
        return RefusalDecision(True, None, "classifier_insufficient")

    # The score gate fails only when BOTH the best score and the top-3 sum
    # are below their thresholds.
    score_gate_fails = (
        max_score < SCORE_MAX_REFUSE and agg_top3 < SCORE_AGG_REFUSE
    )
    if score_gate_fails:
        logger.info(
            "refusal gate: score_low max=%.2f agg=%.2f classifier=%s",
            max_score, agg_top3, classifier.verdict,
        )
        return RefusalDecision(True, None, "score_low")

    # Classifier says sufficient and the scores are acceptable → proceed.
    return RefusalDecision(False, None, None)
|
||||
33
app/services/search/sentence_splitter.py
Normal file
33
app/services/search/sentence_splitter.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""문장 분할 (Phase 3.5a — regex 기반).
|
||||
|
||||
Phase 3.5b 에서 KSS 라이브러리 기반으로 업그레이드 예정.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
MIN_SENTENCE_CHARS = 15


def split_sentences(text: str) -> list[str]:
    """Split mixed Korean/English text into sentences.

    Rules:
        - split on whitespace that follows ``.``, ``!``, ``?`` or ``。``
        - split on whitespace after the Korean endings 다. / 됨. / 음. / 함.
        - split on runs of two or more newlines
        - a piece shorter than MIN_SENTENCE_CHARS is appended to the previous
          sentence; the very first piece is kept even when short
    """
    # Pass 1: punctuation / blank-line based split.
    pieces = re.split(r'(?<=[.!?。])\s+|(?<=[다됨음함]\.)\s+|\n{2,}', text)

    # Pass 2: fold short fragments into the sentence before them.
    sentences: list[str] = []
    for piece in (p.strip() for p in pieces):
        if not piece:
            continue
        too_short = len(piece) < MIN_SENTENCE_CHARS
        if too_short and sentences:
            sentences[-1] = sentences[-1] + " " + piece
        else:
            sentences.append(piece)

    if sentences:
        return sentences
    remainder = text.strip()
    return [remainder] if remainder else []
|
||||
@@ -35,6 +35,12 @@ ai:
|
||||
rerank:
|
||||
endpoint: "http://ollama:11434/api/rerank"
|
||||
model: "bge-reranker-v2-m3"
|
||||
# Phase 3.5a: exaone answerability classifier (GPU Ollama, concurrent OK)
|
||||
classifier:
|
||||
endpoint: "http://ollama:11434/v1/chat/completions"
|
||||
model: "exaone3.5:7.8b-instruct-q8_0"
|
||||
max_tokens: 512
|
||||
timeout: 10
|
||||
|
||||
nas:
|
||||
mount_path: "/documents"
|
||||
|
||||
@@ -63,10 +63,14 @@
|
||||
};
|
||||
|
||||
let tokens = $derived(data?.ai_answer ? splitAnswer(data.ai_answer) : []);
|
||||
let showAnswer = $derived(
|
||||
!!data && !!data.ai_answer && data.synthesis_status === 'completed' && !data.refused,
|
||||
let showFullAnswer = $derived(
|
||||
!!data && !!data.ai_answer && data.completeness === 'full'
|
||||
&& data.synthesis_status === 'completed' && !data.refused,
|
||||
);
|
||||
let showWarning = $derived(!!data && !showAnswer);
|
||||
let showPartial = $derived(
|
||||
!!data && data.completeness === 'partial' && !data.refused,
|
||||
);
|
||||
let showWarning = $derived(!!data && !showFullAnswer && !showPartial);
|
||||
</script>
|
||||
|
||||
<section class="bg-surface border border-default rounded-card p-5">
|
||||
@@ -107,7 +111,7 @@
|
||||
근거 기반 답변 생성 중… 약 15초 소요
|
||||
</p>
|
||||
</div>
|
||||
{:else if showAnswer && data}
|
||||
{:else if showFullAnswer && data}
|
||||
<div class="text-sm leading-7 text-text">
|
||||
{#each tokens as tok}
|
||||
{#if tok.type === 'cite'}
|
||||
@@ -124,6 +128,67 @@
|
||||
{/if}
|
||||
{/each}
|
||||
</div>
|
||||
{:else if showPartial && data}
|
||||
<!-- Phase 3.5a: question-aligned partial structure -->
|
||||
<div>
|
||||
<Badge tone="warning" size="sm">일부 답변</Badge>
|
||||
|
||||
{#if data.ai_answer}
|
||||
<div class="mt-3 text-sm leading-7 text-text">
|
||||
{#each tokens as tok}
|
||||
{#if tok.type === 'cite'}
|
||||
<button
|
||||
type="button"
|
||||
class="inline-block align-baseline text-accent font-semibold hover:underline rounded px-0.5"
|
||||
onclick={() => onCitationClick(tok.n)}
|
||||
>{tok.raw}</button>
|
||||
{:else}
|
||||
<span>{tok.value}</span>
|
||||
{/if}
|
||||
{/each}
|
||||
</div>
|
||||
{:else if data.confirmed_items?.length}
|
||||
<div class="mt-3">
|
||||
<h4 class="text-xs font-semibold text-dim uppercase tracking-wider">✓ 답변 가능</h4>
|
||||
<ul class="mt-2 space-y-2">
|
||||
{#each data.confirmed_items as item}
|
||||
<li class="text-sm text-text">
|
||||
<strong class="text-accent">{item.aspect}:</strong>
|
||||
<span>{item.text}</span>
|
||||
{#each item.citations as n}
|
||||
<button
|
||||
type="button"
|
||||
class="text-accent font-semibold hover:underline px-0.5"
|
||||
onclick={() => onCitationClick(n)}
|
||||
>[{n}]</button>
|
||||
{/each}
|
||||
</li>
|
||||
{/each}
|
||||
</ul>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
{#if data.missing_aspects?.length}
|
||||
<div class="mt-4 border-t border-default pt-3">
|
||||
<h4 class="text-xs font-semibold text-dim uppercase tracking-wider">✗ 답변 불가</h4>
|
||||
<ul class="mt-2 space-y-1">
|
||||
{#each data.missing_aspects as aspect}
|
||||
<li class="text-sm text-dim">{aspect} <span class="text-[10px]">(근거 없음)</span></li>
|
||||
{/each}
|
||||
</ul>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<div class="mt-4">
|
||||
<Button
|
||||
variant="secondary"
|
||||
size="sm"
|
||||
href={`/documents?q=${encodeURIComponent(data.query)}`}
|
||||
>
|
||||
검색 결과 확인하기
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
{:else if showWarning && data}
|
||||
<EmptyState
|
||||
icon={AlertTriangle}
|
||||
|
||||
@@ -50,6 +50,14 @@ export interface SearchResult {
|
||||
rerank_score: number | null;
|
||||
}
|
||||
|
||||
/**
 * Phase 3.5a: how completely an /ask answer covers the question.
 * Carried on `AskResponse.completeness`; one of `'full'`, `'partial'` or
 * `'insufficient'`.
 */
export type Completeness = 'full' | 'partial' | 'insufficient';
|
||||
|
||||
/**
 * One confirmed item of a partial answer (`AskResponse.confirmed_items`).
 *
 * - `aspect`: label of the question aspect this item addresses.
 * - `text`: the answer text for that aspect.
 * - `citations`: citation numbers attached to the item; the UI renders each
 *   as a clickable `[n]` marker.
 */
export interface ConfirmedItem {
|
||||
aspect: string;
|
||||
text: string;
|
||||
citations: number[];
|
||||
}
|
||||
|
||||
export interface AskResponse {
|
||||
results: SearchResult[];
|
||||
ai_answer: string | null;
|
||||
@@ -61,4 +69,9 @@ export interface AskResponse {
|
||||
no_results_reason: string | null;
|
||||
query: string;
|
||||
total: number;
|
||||
/** Phase 3.5a */
|
||||
completeness: Completeness;
|
||||
covered_aspects: string[] | null;
|
||||
missing_aspects: string[] | null;
|
||||
confirmed_items: ConfirmedItem[] | null;
|
||||
}
|
||||
|
||||
26
migrations/102_ask_events.sql
Normal file
26
migrations/102_ask_events.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
-- Phase 3.5a: observability table for /ask calls
|
||||
-- Used to measure the refusal rate, split metrics three ways (full/partial/insufficient), and debug defense layers
|
||||
|
||||
-- One row per recorded /ask event. Only `query` is NOT NULL; every other
-- column is nullable (some with defaults).
CREATE TABLE IF NOT EXISTS ask_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
query TEXT NOT NULL,
|
||||
user_id BIGINT REFERENCES users(id), -- nullable FK to users
|
||||
completeness TEXT, -- full / partial / insufficient
|
||||
synthesis_status TEXT,
|
||||
confidence TEXT,
|
||||
refused BOOLEAN DEFAULT false,
|
||||
classifier_verdict TEXT, -- sufficient / insufficient / null (skipped)
|
||||
max_rerank_score REAL,
|
||||
aggregate_score REAL,
|
||||
hallucination_flags JSONB DEFAULT '[]', -- defaults to an empty JSON array
|
||||
evidence_count INT,
|
||||
citation_count INT,
|
||||
defense_layers JSONB, -- per-layer flag snapshot (score_gate, classifier, grounding)
|
||||
total_ms INT, -- NOTE(review): presumably end-to-end /ask latency in milliseconds; confirm against the writer
|
||||
created_at TIMESTAMPTZ DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_ask_events_created ON ask_events(created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_ask_events_completeness ON ask_events(completeness);
|
||||
|
||||
INSERT INTO schema_migrations (version) VALUES (102);
|
||||
58
tests/phase3_5_eval/queries.yaml
Normal file
58
tests/phase3_5_eval/queries.yaml
Normal file
@@ -0,0 +1,58 @@
|
||||
# Phase 3.5a Smoke Test Evaluation Set (10 queries)
|
||||
# 목적: 구조 검증 (smoke test), 정밀 튜닝 아님
|
||||
# Phase 3.5b 에서 30+ 쿼리로 확장 예정
|
||||
|
||||
queries:
|
||||
# 정상 (4) — full answer expected
|
||||
- q: "산업안전보건법 제6장 주요 내용"
|
||||
expected_completeness: full
|
||||
expected_refuse: false
|
||||
notes: "prewarm #1. evidence 충분."
|
||||
|
||||
- q: "기계 사고 관련 법령"
|
||||
expected_completeness: full
|
||||
expected_refuse: false
|
||||
notes: "prewarm #2. 법령 도메인."
|
||||
|
||||
- q: "유해화학물질을 다루는 회사가 지켜야 할 안전 의무"
|
||||
expected_completeness: full
|
||||
expected_refuse: false
|
||||
notes: "prewarm #5. 긴 자연어 쿼리."
|
||||
|
||||
- q: "위험성평가 절차"
|
||||
expected_completeness: full
|
||||
expected_refuse: false
|
||||
notes: "prewarm #12. 짧은 키워드 쿼리."
|
||||
|
||||
# no-result (2) — insufficient expected
|
||||
- q: "xyzzy_nonexistent_query_12345"
|
||||
expected_completeness: insufficient
|
||||
expected_refuse: true
|
||||
notes: "Phase 3 에서 이미 검증됨."
|
||||
|
||||
- q: "제주도 감귤 출하량 통계"
|
||||
expected_completeness: insufficient
|
||||
expected_refuse: true
|
||||
notes: "corpus 에 확실히 없는 도메인."
|
||||
|
||||
# tricky mismatch (2) — classifier/grounding 핵심 케이스
|
||||
- q: "산업안전보건법 제6장 처벌 조항"
|
||||
expected_completeness: partial
|
||||
expected_refuse: false
|
||||
notes: "제6장 내용은 있지만 처벌(제10장 벌칙)은 없음. intent_misalignment 이 잡아야 함."
|
||||
|
||||
- q: "화학물질관리법과 산업안전보건법의 차이"
|
||||
expected_completeness: partial
|
||||
expected_refuse: false
|
||||
notes: "복합 쿼리. 하나만 있을 수 있음."
|
||||
|
||||
# cross-domain (2)
|
||||
- q: "Python async best practice"
|
||||
expected_completeness: insufficient
|
||||
expected_refuse: true
|
||||
notes: "corpus 에 영어 프로그래밍 문서 적음."
|
||||
|
||||
- q: "EU AI Act"
|
||||
expected_completeness: full
|
||||
expected_refuse: false
|
||||
notes: "news 도메인. prewarm #11."
|
||||
Reference in New Issue
Block a user