diff --git a/app/models/analyze_event.py b/app/models/analyze_event.py index c5c79fe..fe1d5ab 100644 --- a/app/models/analyze_event.py +++ b/app/models/analyze_event.py @@ -56,3 +56,8 @@ class AnalyzeEvent(Base): # PR-B (migration 159) — 실제 호출 tier 와 R2 backlog guard 이벤트 tier: Mapped[str | None] = mapped_column(Text) # 'triage' | 'primary' | 'fallback' suppressed_reason: Mapped[str | None] = mapped_column(Text) # 'backlog_guard(ratio=0.42,pending=7)' + + # PR-B B-2 (migration 161) — /ask 3-state answerability 독립 컬럼 + answerability: Mapped[str | None] = mapped_column(Text) # 'direct' | 'partial' | 'insufficient' + partial_basis: Mapped[bool | None] = mapped_column(Boolean) # partial 답변이 실제 생성됐는지 + suggested_query_count: Mapped[int | None] = mapped_column(Integer) diff --git a/app/services/document_telemetry.py b/app/services/document_telemetry.py index 1e287ed..22257a9 100644 --- a/app/services/document_telemetry.py +++ b/app/services/document_telemetry.py @@ -63,6 +63,10 @@ async def record_analyze_event( tier: str | None = None, escalated_to_26b: bool | None = None, suppressed_reason: str | None = None, + # PR-B B-2 — /ask 3-state answerability + answerability: str | None = None, + partial_basis: bool | None = None, + suggested_query_count: int | None = None, ) -> None: """analyze_events INSERT. background task에서 호출 — 에러 삼킴. @@ -96,6 +100,9 @@ async def record_analyze_event( shadow_would_route_to=shadow_would_route_to, tier=tier, suppressed_reason=suppressed_reason, + answerability=answerability, + partial_basis=partial_basis, + suggested_query_count=suggested_query_count, ) session.add(row) await session.commit() diff --git a/app/services/prompt_versions.py b/app/services/prompt_versions.py index 3c15ff7..eb5d3c5 100644 --- a/app/services/prompt_versions.py +++ b/app/services/prompt_versions.py @@ -17,7 +17,10 @@ from __future__ import annotations # ─── ask (/search/ask) 프롬프트 버전 ───────────────────────── # synthesis_service.py 가 로드하는 app/prompts/search_synthesis.txt 기준 -ASK_PROMPT_VERSION: str = "search_synthesis.v2-600char" +# v3-evidence-triage: evidence 추출을 triage(4B Ollama) 로 전환 (B-2). synthesis 는 +# 여전히 primary(26B MLX) 로 search_synthesis.txt 사용. 프롬프트 자체는 v2-600char +# 그대로지만 evidence LLM 경로 변경을 분리 추적하기 위해 bump. +ASK_PROMPT_VERSION: str = "search_synthesis.v3-evidence-triage" # ─── /analyze 프롬프트 버전 ────────────────────────────────── # documents.py analyze 라우트가 로드하는 app/prompts/document_analyze.txt 기준 diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py index 57731cf..9704d5a 100644 --- a/app/services/search/evidence_service.py +++ b/app/services/search/evidence_service.py @@ -25,10 +25,10 @@ EvidenceItem 리스트 ## 영구 룰 -- **LLM 호출은 1번만** (batched). 순차 호출 절대 금지 — MLX single-inference - 큐가 폭발한다. -- **모든 MLX 호출은 `get_mlx_gate()` 경유**. analyzer / synthesis 와 동일 - semaphore 공유. +- **LLM 호출은 1번만** (batched). 순차 호출 절대 금지. +- **B-2 변경**: evidence 추출은 triage(4B Ollama) 로 전환 — Ollama 는 concurrent + OK 라 `get_mlx_gate()` 불필요. primary(26B MLX) 는 synthesis 전용 보호. +- 기존 analyzer / synthesis 의 `get_mlx_gate()` 공유는 유지 — 26B 경로에만 적용. - **fallback span 도 query 중심 window**. `full_snippet[:200]` 같은 "앞에서부터 자르기" 절대 금지. 조용한 품질 붕괴 (citation 은 멀쩡한데 실제 span 이 query 와 무관) 대표 사례. @@ -57,7 +57,6 @@ from typing import TYPE_CHECKING from ai.client import AIClient, _load_prompt, parse_json_response from core.utils import setup_logger -from .llm_gate import get_mlx_gate from .rerank_service import _extract_window if TYPE_CHECKING: @@ -78,7 +77,7 @@ SPAN_ENLARGE_TARGET = 120 # enlarge 시 재윈도우 target_chars SPAN_MAX_CHARS = 300 # 이 초과면 cut (synthesis token budget 보호) LLM_TIMEOUT_MS = 15000 -PROMPT_VERSION = "v1" +PROMPT_VERSION = "v2-triage" # B-2: primary(26B MLX) → triage(4B Ollama) 전환 # 확장 여지 — None 이면 비활성 (baseline). 실측 후 0.8 등으로 켠다. EVIDENCE_FAST_PATH_THRESHOLD: float | None = None @@ -308,10 +307,10 @@ async def extract_evidence( llm_error: str | None = None try: - # ⚠ semaphore 대기는 timeout 바깥. timeout 은 실제 LLM 호출에만. - async with get_mlx_gate(): - async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): - raw = await ai_client._call_chat(ai_client.ai.primary, prompt) + # B-2: evidence 추출은 4B triage (Ollama concurrent OK) — MLX gate 경유 불필요. + # primary(26B) 는 synthesis 전용으로 MLX gate 보호. + async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): + raw = await ai_client.call_triage(prompt) except asyncio.TimeoutError: llm_error = "timeout" except Exception as exc: diff --git a/migrations/161_analyze_events_answerability.sql b/migrations/161_analyze_events_answerability.sql new file mode 100644 index 0000000..009f57e --- /dev/null +++ b/migrations/161_analyze_events_answerability.sql @@ -0,0 +1,21 @@ +-- 161_analyze_events_answerability.sql +-- PR-B B-2: /ask 의 3-state completeness (direct/partial/insufficient) 독립 컬럼화. +-- plan: ~/.claude/plans/swirling-swimming-liskov.md §B-2 +-- +-- 기존 /ask 응답의 completeness 필드(이미 full/partial/insufficient 3-state 로 +-- 운영 중인 Phase 3.5a 결과)를 analyze_events 에서도 독립 컬럼으로 집계 가능하게. +-- mode 컬럼 문자열 파싱 회피 + "Backlog Suppression" 카드와 동일 패턴. +-- +-- answerability 값 매핑: +-- /ask completeness='full' → 'direct' +-- /ask completeness='partial' → 'partial' +-- /ask completeness='insufficient' → 'insufficient' +-- +-- partial_basis: synthesis 가 partial 답변 본문을 실제로 생성했는지 (unanswered_aspects +-- 를 답변 뒤에 명시). completeness=partial 이어도 synthesis 가 스킵되면 false. +-- suggested_query_count: insufficient 때 사용자에게 돌려주는 추가 검색어 제안 개수. + +ALTER TABLE analyze_events + ADD COLUMN IF NOT EXISTS answerability TEXT, + ADD COLUMN IF NOT EXISTS partial_basis BOOLEAN, + ADD COLUMN IF NOT EXISTS suggested_query_count INTEGER; diff --git a/migrations/162_analyze_events_answerability_idx.sql b/migrations/162_analyze_events_answerability_idx.sql new file mode 100644 index 0000000..9e95e0b --- /dev/null +++ b/migrations/162_analyze_events_answerability_idx.sql @@ -0,0 +1,6 @@ +-- 162_analyze_events_answerability_idx.sql +-- PR-B B-2: answerability 분포 조회 인덱스 (대시보드 "에스컬레이션 비율" 카드). + +CREATE INDEX IF NOT EXISTS idx_analyze_events_answerability + ON analyze_events (answerability, created_at DESC) + WHERE answerability IS NOT NULL;