From 34f79f84f2d4e67c086be7b531040c2251c8c8eb Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Fri, 24 Apr 2026 10:33:32 +0900
Subject: [PATCH] =?UTF-8?q?feat(search):=20B-2=20evidence=20LLM=20?=
 =?UTF-8?q?=E2=86=92=204B=20triage=20=EC=A0=84=ED=99=98=20+=20answerabilit?=
 =?UTF-8?q?y=20=EC=BB=AC=EB=9F=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plan 본래 의도: 근거 선별은 4B, 합성은 26B.

- evidence_service: LLM 호출을 primary(26B MLX) → triage(4B Ollama) 로 전환.
  Ollama 는 동시 호출이 가능하므로 evidence 경로에서 get_mlx_gate() 제거.
  synthesis 는 여전히 llm_gate Semaphore(1) 경유로 MLX 보호.
- prompt_version v3-evidence-triage bump (synthesis 프롬프트 자체는 v2-600char
  그대로, evidence LLM 경로 변경을 분리 추적).
- migrations 161/162: analyze_events 에 answerability / partial_basis /
  suggested_query_count 컬럼 + partial index. /ask 는 이미 ask_events 에
  completeness (full/partial/insufficient) 기록 운영 중이므로, analyze_events
  쪽은 향후 문서 분석에서 answerability 개념 도입 시 활용 예비.
- telemetry record_analyze_event 에 answerability / partial_basis /
  suggested_query_count 파라미터 확장.

기존 /ask 3-state completeness 로직 (classifier_service + 7-tier gate) 은
그대로 유지 — 이미 Phase 3.5a 에서 완성된 상태. B-2 는 LLM 부하 재분배와
관측성 확장에 집중.

MLX 부하 감소 효과: 이전엔 쿼리 1건당 evidence(26B) + synthesis(26B) 2번
MLX 호출. 이제는 evidence(4B Ollama) + synthesis(26B MLX) 로 MLX 호출 절반.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/models/analyze_event.py                   |  5 +++++
 app/services/document_telemetry.py            |  7 +++++++
 app/services/prompt_versions.py               |  5 ++++-
 app/services/search/evidence_service.py       | 19 ++++++++---------
 .../161_analyze_events_answerability.sql      | 21 +++++++++++++++++++
 .../162_analyze_events_answerability_idx.sql  |  6 ++++++
 6 files changed, 52 insertions(+), 11 deletions(-)
 create mode 100644 migrations/161_analyze_events_answerability.sql
 create mode 100644 migrations/162_analyze_events_answerability_idx.sql

diff --git a/app/models/analyze_event.py b/app/models/analyze_event.py
index c5c79fe..fe1d5ab 100644
--- a/app/models/analyze_event.py
+++ b/app/models/analyze_event.py
@@ -56,3 +56,8 @@ class AnalyzeEvent(Base):
     # PR-B (migration 159) — 실제 호출 tier 와 R2 backlog guard 이벤트
     tier: Mapped[str | None] = mapped_column(Text)                  # 'triage' | 'primary' | 'fallback'
     suppressed_reason: Mapped[str | None] = mapped_column(Text)     # 'backlog_guard(ratio=0.42,pending=7)'
+
+    # PR-B B-2 (migration 161) — /ask 3-state answerability 독립 컬럼
+    answerability: Mapped[str | None] = mapped_column(Text)         # 'direct' | 'partial' | 'insufficient'
+    partial_basis: Mapped[bool | None] = mapped_column(Boolean)     # partial 답변이 실제 생성됐는지
+    suggested_query_count: Mapped[int | None] = mapped_column(Integer)
diff --git a/app/services/document_telemetry.py b/app/services/document_telemetry.py
index 1e287ed..22257a9 100644
--- a/app/services/document_telemetry.py
+++ b/app/services/document_telemetry.py
@@ -63,6 +63,10 @@ async def record_analyze_event(
     tier: str | None = None,
     escalated_to_26b: bool | None = None,
     suppressed_reason: str | None = None,
+    # PR-B B-2 — /ask 3-state answerability
+    answerability: str | None = None,
+    partial_basis: bool | None = None,
+    suggested_query_count: int | None = None,
 ) -> None:
     """analyze_events INSERT. background task에서 호출 — 에러 삼킴.
 
@@ -96,6 +100,9 @@ async def record_analyze_event(
                 shadow_would_route_to=shadow_would_route_to,
                 tier=tier,
                 suppressed_reason=suppressed_reason,
+                answerability=answerability,
+                partial_basis=partial_basis,
+                suggested_query_count=suggested_query_count,
             )
             session.add(row)
             await session.commit()
diff --git a/app/services/prompt_versions.py b/app/services/prompt_versions.py
index 3c15ff7..eb5d3c5 100644
--- a/app/services/prompt_versions.py
+++ b/app/services/prompt_versions.py
@@ -17,7 +17,10 @@ from __future__ import annotations
 
 # ─── ask (/search/ask) 프롬프트 버전 ─────────────────────────
 # synthesis_service.py 가 로드하는 app/prompts/search_synthesis.txt 기준
-ASK_PROMPT_VERSION: str = "search_synthesis.v2-600char"
+# v3-evidence-triage: evidence 추출을 triage(4B Ollama) 로 전환 (B-2). synthesis 는
+# 여전히 primary(26B MLX) 로 search_synthesis.txt 사용. 프롬프트 자체는 v2-600char
+# 그대로지만 evidence LLM 경로 변경을 분리 추적하기 위해 bump.
+ASK_PROMPT_VERSION: str = "search_synthesis.v3-evidence-triage"
 
 # ─── /analyze 프롬프트 버전 ──────────────────────────────────
 # documents.py analyze 라우트가 로드하는 app/prompts/document_analyze.txt 기준
diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py
index 57731cf..9704d5a 100644
--- a/app/services/search/evidence_service.py
+++ b/app/services/search/evidence_service.py
@@ -25,10 +25,10 @@ EvidenceItem 리스트
 
 ## 영구 룰
 
-- **LLM 호출은 1번만** (batched). 순차 호출 절대 금지 — MLX single-inference
-  큐가 폭발한다.
-- **모든 MLX 호출은 `get_mlx_gate()` 경유**. analyzer / synthesis 와 동일
-  semaphore 공유.
+- **LLM 호출은 1번만** (batched). 순차 호출 절대 금지.
+- **B-2 변경**: evidence 추출은 triage(4B Ollama) 로 전환 — Ollama 는 동시 호출이
+  가능하므로 `get_mlx_gate()` 불필요. primary(26B MLX) 는 synthesis 전용 보호.
+- 기존 analyzer / synthesis 의 `get_mlx_gate()` 공유는 유지 — 26B 경로에만 적용.
 - **fallback span 도 query 중심 window**. `full_snippet[:200]` 같은 "앞에서부터
   자르기" 절대 금지. 조용한 품질 붕괴 (citation 은 멀쩡한데 실제 span 이 query
   와 무관) 대표 사례.
@@ -57,7 +57,6 @@ from typing import TYPE_CHECKING
 from ai.client import AIClient, _load_prompt, parse_json_response
 from core.utils import setup_logger
 
-from .llm_gate import get_mlx_gate
 from .rerank_service import _extract_window
 
 if TYPE_CHECKING:
@@ -78,7 +77,7 @@ SPAN_ENLARGE_TARGET = 120  # enlarge 시 재윈도우 target_chars
 SPAN_MAX_CHARS = 300  # 이 초과면 cut (synthesis token budget 보호)
 
 LLM_TIMEOUT_MS = 15000
-PROMPT_VERSION = "v1"
+PROMPT_VERSION = "v2-triage"   # B-2: primary(26B MLX) → triage(4B Ollama) 전환
 
 # 확장 여지 — None 이면 비활성 (baseline). 실측 후 0.8 등으로 켠다.
 EVIDENCE_FAST_PATH_THRESHOLD: float | None = None
@@ -308,10 +307,10 @@ async def extract_evidence(
     llm_error: str | None = None
 
     try:
-        # ⚠ semaphore 대기는 timeout 바깥. timeout 은 실제 LLM 호출에만.
-        async with get_mlx_gate():
-            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
-                raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
+        # B-2: evidence 추출은 4B triage (Ollama concurrent OK) — MLX gate 경유 불필요.
+        # primary(26B) 는 synthesis 전용으로 MLX gate 보호.
+        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
+            raw = await ai_client.call_triage(prompt)
     except asyncio.TimeoutError:
         llm_error = "timeout"
     except Exception as exc:
diff --git a/migrations/161_analyze_events_answerability.sql b/migrations/161_analyze_events_answerability.sql
new file mode 100644
index 0000000..009f57e
--- /dev/null
+++ b/migrations/161_analyze_events_answerability.sql
@@ -0,0 +1,21 @@
+-- 161_analyze_events_answerability.sql
+-- PR-B B-2: /ask 의 3-state completeness (full/partial/insufficient) 를 answerability (direct/partial/insufficient) 독립 컬럼으로 기록.
+-- plan: ~/.claude/plans/swirling-swimming-liskov.md §B-2
+--
+-- 기존 /ask 응답의 completeness 필드(이미 full/partial/insufficient 3-state 로
+-- 운영 중인 Phase 3.5a 결과)를 analyze_events 에서도 독립 컬럼으로 집계 가능하게.
+-- mode 컬럼 문자열 파싱 회피 + "Backlog Suppression" 카드와 동일 패턴.
+--
+-- answerability 값 매핑:
+--   /ask completeness='full'         → 'direct'
+--   /ask completeness='partial'      → 'partial'
+--   /ask completeness='insufficient' → 'insufficient'
+--
+-- partial_basis: synthesis 가 partial 답변 본문을 실제로 생성했는지 (unanswered_aspects
+-- 를 답변 뒤에 명시). completeness=partial 이어도 synthesis 가 스킵되면 false.
+-- suggested_query_count: insufficient 때 사용자에게 돌려주는 추가 검색어 제안 개수.
+
+ALTER TABLE analyze_events
+    ADD COLUMN IF NOT EXISTS answerability         TEXT,
+    ADD COLUMN IF NOT EXISTS partial_basis         BOOLEAN,
+    ADD COLUMN IF NOT EXISTS suggested_query_count INTEGER;
diff --git a/migrations/162_analyze_events_answerability_idx.sql b/migrations/162_analyze_events_answerability_idx.sql
new file mode 100644
index 0000000..9e95e0b
--- /dev/null
+++ b/migrations/162_analyze_events_answerability_idx.sql
@@ -0,0 +1,6 @@
+-- 162_analyze_events_answerability_idx.sql
+-- PR-B B-2: answerability 분포 조회 인덱스 (대시보드 "에스컬레이션 비율" 카드).
+
+CREATE INDEX IF NOT EXISTS idx_analyze_events_answerability
+    ON analyze_events (answerability, created_at DESC)
+    WHERE answerability IS NOT NULL;