From 118f32f9b102dc6f8f2c0012760106abfeceb286 Mon Sep 17 00:00:00 2001
From: hyungi <hyun49196@gmail.com>
Date: Fri, 15 May 2026 12:05:36 +0000
Subject: [PATCH] =?UTF-8?q?refactor(ai):=20PR=20#20=20reframe=20cleanup=20?=
 =?UTF-8?q?=E2=80=94=20Ollama=20LLM=20=EC=9E=94=EC=9E=AC=20=EC=A3=BC?=
 =?UTF-8?q?=EC=84=9D=20=EC=A0=95=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #20 (2026-05-14, GPU LLM 제거 + Mac mini 26B MLX 흡수) 의 swap 이
backends.json + 코드 주석/docstring 까지 따라가지 못한 표현 잔재 정리.

- app/ai/client.py: AIClient docstring 및 call_triage / call_fallback
  docstring 의 "4B Ollama" → "Mac mini 26B MLX" / "현재는 triage 와
  동일 엔드포인트" → "Claude Sonnet 4 API (PR #20 swap 완료)"
- app/core/config.py: triage/primary/fallback 주석 통합 + Phase 3.5
  classifier/verifier 주석에 PR #20 endpoint 명시 (history 보존)
- app/services/search/{llm_gate,classifier_service,verifier_service,
  evidence_service}.py: "fallback(Ollama)" / "Ollama concurrent OK"
  / "triage(4B Ollama)" 표현을 Mac mini 26B MLX endpoint 기준으로
  정정 + "concurrent 안전성 별 검토" 마커 추가 (gate 밖 동시 호출 안전성은 별도 검토가 필요하다는 표시)
- app/services/digest/summarizer.py: "MLX hang/Ollama stall 방어"
  → "MLX hang / fallback Claude API stall 방어"
- app/services/prompt_versions.py: SUMMARY_TRIAGE_TASK + ASK_PROMPT_VERSION
  주석의 "4B Ollama" / "4B gemma Ollama" → Mac mini 26B MLX
- app/workers/classify_worker.py: B-1 tier triage docstring 정정

코드 동작 변경 없음 (주석/docstring 만 수정). embed_worker / study_question_embed_worker
의 "Ollama bge-m3" 표현은 여전히 사실과 일치하므로 유지 (embedding 은 Ollama 에 잔존 — 아래 검증 참고).

검증:
- ollama list → bge-m3:latest 잔존 (embedding owner)
- /api/embeddings probe → 1024-dim 200 OK
- fastapi embed/ollama error 0 (last 10min)
- document.hyungi.net 200

plan: ~/.claude/plans/4-stateless-dongarra.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/ai/client.py                          | 10 +++++-----
 app/core/config.py                        |  6 +++---
 app/services/digest/summarizer.py         |  2 +-
 app/services/prompt_versions.py           |  6 +++---
 app/services/search/classifier_service.py |  4 ++--
 app/services/search/evidence_service.py   |  8 ++++----
 app/services/search/llm_gate.py           |  2 +-
 app/services/search/verifier_service.py   |  2 +-
 app/workers/classify_worker.py            |  4 ++--
 9 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/app/ai/client.py b/app/ai/client.py
index 8765cae..45a7401 100644
--- a/app/ai/client.py
+++ b/app/ai/client.py
@@ -149,9 +149,9 @@ class AIClient:
     """AI 모델 통합 클라이언트.
 
     B-0 3-tier routing:
-      - call_triage(): 4B Ollama, 상시 호출 (llm_gate 외부 — 병렬 OK)
-      - call_primary(): 26B MLX, 에스컬레이션 전용 (llm_gate Semaphore(1) 는 **caller 책임**)
-      - call_fallback(): triage/primary 실패 시 최후 방어선 (현재 4B 동일)
+      - call_triage(): Mac mini 26B MLX, 상시 호출 (llm_gate 외부 — concurrent 안전성 별도 검토 필요)
+      - call_primary(): Mac mini 26B MLX, 에스컬레이션 전용 (llm_gate Semaphore(1) 는 **caller 책임**)
+      - call_fallback(): triage/primary 실패 시 최후 방어선. Claude Sonnet 4 API (PR #20 swap 완료)
 
     Legacy: classify() / summarize() 는 기존 호출부(tests/eval runner)를 위해 남겨둠.
     신규 worker 경로는 전부 call_triage / call_primary 사용.
@@ -164,7 +164,7 @@ class AIClient:
     # ─── 3-tier routing (B-0) ───────────────────────────────────────────────
 
     async def call_triage(self, prompt: str) -> str:
-        """4B Ollama 직접 호출. llm_gate 밖 (Ollama 는 concurrent OK).
+        """Mac mini 26B MLX 직접 호출 (config.yaml ai.models.triage). llm_gate 외부 실행 — PR #20 이후 triage/primary 동일 endpoint 라 concurrent 안전성 별 검토.
 
         timeout 은 config.yaml ai.models.triage.timeout (기본 30s).
         실패 시 caller 가 에스컬레이션 또는 fallback 판단.
@@ -180,7 +180,7 @@ class AIClient:
         return await self._request(self.ai.primary, prompt)
 
     async def call_fallback(self, prompt: str) -> str:
-        """triage/primary 실패 시 최후 방어선. 현재는 triage 와 동일 엔드포인트."""
+        """triage/primary 실패 시 최후 방어선. Claude Sonnet 4 API (config.yaml ai.models.fallback) — PR #20 이후 swap 완료."""
         return await self._request(self.ai.fallback, prompt)
 
     # ─── Legacy API (classify_worker 교체 시 제거 예정) ───────────────────
diff --git a/app/core/config.py b/app/core/config.py
index 08f20ed..6bfc1d5 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -37,16 +37,16 @@ class DeepSummaryBacklogConfig(BaseModel):
 
 class AIConfig(BaseModel):
     gateway_endpoint: str
-    # B-0: 3-tier routing. triage(4B) 상시, primary(26B) escalation-only, fallback(4B) 최후.
+    # B-0: 3-tier routing. triage/primary = Mac mini 26B MLX (PR #20 endpoint 통합). fallback = Claude Sonnet 4 API.
     triage: AIModelConfig
     primary: AIModelConfig
     fallback: AIModelConfig
     premium: AIModelConfig
     embedding: AIModelConfig
     rerank: AIModelConfig
-    # Phase 3.5a: exaone classifier (optional — 없으면 score-only gate)
+    # Phase 3.5a: answerability classifier (optional — 없으면 score-only gate). PR #20 이후 Mac mini 26B MLX endpoint (initial = exaone3.5).
     classifier: AIModelConfig | None = None
-    # Phase 3.5b: exaone verifier (optional — 없으면 grounding-only)
+    # Phase 3.5b: semantic verifier (optional — 없으면 grounding-only). PR #20 이후 Mac mini 26B MLX endpoint (initial = exaone3.5).
     verifier: AIModelConfig | None = None
     # Legacy: vision 슬롯 (현재 사용처 0 — Document Server 는 OCR/STT 별도 서비스).
     # 제거 진행 중이므로 optional 로 관대한 로딩 유지.
diff --git a/app/services/digest/summarizer.py b/app/services/digest/summarizer.py
index 35d85fb..173f58c 100644
--- a/app/services/digest/summarizer.py
+++ b/app/services/digest/summarizer.py
@@ -3,7 +3,7 @@
 핵심 결정:
 - AIClient._call_chat 직접 호출 (client.py 수정 회피, fallback 로직 재사용)
 - Semaphore(1) 로 MLX 과부하 회피
-- Per-call timeout 25초 (asyncio.wait_for) — MLX hang/Ollama stall 방어
+- Per-call timeout 25초 (asyncio.wait_for) — MLX hang / fallback Claude API stall 방어
 - JSON 파싱 실패 → 1회 재시도 → 그래도 실패 시 minimal fallback (drop 금지)
 - fallback: topic_label="주요 뉴스 묶음", summary = top member ai_summary[:200]
 """
diff --git a/app/services/prompt_versions.py b/app/services/prompt_versions.py
index eb5d3c5..8367595 100644
--- a/app/services/prompt_versions.py
+++ b/app/services/prompt_versions.py
@@ -17,8 +17,8 @@ from __future__ import annotations
 
 # ─── ask (/search/ask) 프롬프트 버전 ─────────────────────────
 # synthesis_service.py 가 로드하는 app/prompts/search_synthesis.txt 기준
-# v3-evidence-triage: evidence 추출을 triage(4B Ollama) 로 전환 (B-2). synthesis 는
-# 여전히 primary(26B MLX) 로 search_synthesis.txt 사용. 프롬프트 자체는 v2-600char
+# v3-evidence-triage: evidence 추출을 triage path 로 전환 (B-2). PR #20 이후 triage/primary 동일
+# Mac mini 26B endpoint — path 분리는 prompt 레벨. synthesis 는 search_synthesis.txt 사용. 프롬프트 자체는 v2-600char
 # 그대로지만 evidence LLM 경로 변경을 분리 추적하기 위해 bump.
 ASK_PROMPT_VERSION: str = "search_synthesis.v3-evidence-triage"
 
@@ -29,7 +29,7 @@ ANALYZE_PROMPT_VERSION: str = "document_analyze.v1"
 # ─── PR-B B-1: summary tier 분할 task 이름 ─────────────────────
 # classify_worker / deep_summary_worker 가 PR-A 정책 템플릿 + policy_version 해시
 # 조합으로 analyze_events.prompt_version 을 기록한다. (예: "p3a_short_summary@abc123")
-SUMMARY_TRIAGE_TASK: str = "p3a_short_summary"   # 4B gemma Ollama
+SUMMARY_TRIAGE_TASK: str = "p3a_short_summary"   # Mac mini 26B MLX (config.yaml ai.models.triage)
 SUMMARY_DEEP_TASK: str = "p3c_deep_summary"      # 26B MLX
 
 
diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py
index 1202d02..c42ebdf 100644
--- a/app/services/search/classifier_service.py
+++ b/app/services/search/classifier_service.py
@@ -1,6 +1,6 @@
 """Answerability classifier (Phase 3.5a).
 
-exaone3.5:7.8b GPU Ollama 기반. MLX gate 밖 — evidence extraction 과 병렬 실행.
+Mac mini 26B MLX 기반 (config.yaml ai.models.classifier — PR #20 이후 triage/primary/classifier 동일 endpoint). MLX gate 밖 — evidence extraction 과 병렬 실행 (concurrent 안전성 별 검토).
 
 P1 실측 결과: ternary (full/partial/insufficient) 불안정 → **binary (sufficient/insufficient)**.
 "full" vs "partial" 구분은 grounding_check 의 intent alignment 이 담당.
@@ -94,7 +94,7 @@ async def classify(
     prompt = _build_input(query, top_chunks, rerank_scores)
     client = AIClient()
     try:
-        # ⚠ MLX gate 안 씀. Ollama(exaone) 는 concurrent OK.
+        # ⚠ MLX gate 안 씀. TODO: PR #20 이후 endpoint 가 Mac mini 26B MLX 라 gate 밖 동시 호출 안전성 별도 검토 필요.
         async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
             raw = await client._request(settings.ai.classifier, prompt)
         _failure_count = 0
diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py
index 9704d5a..cc56377 100644
--- a/app/services/search/evidence_service.py
+++ b/app/services/search/evidence_service.py
@@ -26,8 +26,8 @@ EvidenceItem 리스트
 ## 영구 룰
 
 - **LLM 호출은 1번만** (batched). 순차 호출 절대 금지.
-- **B-2 변경**: evidence 추출은 triage(4B Ollama) 로 전환 — Ollama 는 concurrent
-  OK 라 `get_mlx_gate()` 불필요. primary(26B MLX) 는 synthesis 전용 보호.
+- **B-2 변경**: evidence 추출은 triage(Mac mini 26B MLX) 로 전환. PR #20 이후 triage/primary 동일 endpoint 라
+  path 분리는 prompt 레벨만 — `get_mlx_gate()` 외부 실행 (concurrent 안전성 별 검토). primary 의 gate 보호는 synthesis 전용.
 - 기존 analyzer / synthesis 의 `get_mlx_gate()` 공유는 유지 — 26B 경로에만 적용.
 - **fallback span 도 query 중심 window**. `full_snippet[:200]` 같은 "앞에서부터
   자르기" 절대 금지. 조용한 품질 붕괴 (citation 은 멀쩡한데 실제 span 이 query
@@ -77,7 +77,7 @@ SPAN_ENLARGE_TARGET = 120  # enlarge 시 재윈도우 target_chars
 SPAN_MAX_CHARS = 300  # 이 초과면 cut (synthesis token budget 보호)
 
 LLM_TIMEOUT_MS = 15000
-PROMPT_VERSION = "v2-triage"   # B-2: primary(26B MLX) → triage(4B Ollama) 전환
+PROMPT_VERSION = "v2-triage"   # B-2: primary(26B MLX) → triage path 전환. PR #20 이후 triage/primary 동일 endpoint (Mac mini 26B).
 
 # 확장 여지 — None 이면 비활성 (baseline). 실측 후 0.8 등으로 켠다.
 EVIDENCE_FAST_PATH_THRESHOLD: float | None = None
@@ -307,7 +307,7 @@ async def extract_evidence(
     llm_error: str | None = None
 
     try:
-        # B-2: evidence 추출은 4B triage (Ollama concurrent OK) — MLX gate 경유 불필요.
+        # B-2: evidence 추출은 triage path (Mac mini 26B MLX) — gate 외부 실행. TODO: PR #20 endpoint 통합 이후 gate 밖 동시 호출 안전성 별도 검토 필요.
         # primary(26B) 는 synthesis 전용으로 MLX gate 보호.
         async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
             raw = await ai_client.call_triage(prompt)
diff --git a/app/services/search/llm_gate.py b/app/services/search/llm_gate.py
index fc36f57..560dc0a 100644
--- a/app/services/search/llm_gate.py
+++ b/app/services/search/llm_gate.py
@@ -16,7 +16,7 @@ Mac mini MLX primary(gemma-4-26b-a4b-it-8bit)는 **single-inference**다.
   동시 실행 발생).
 - **`asyncio.timeout(...)`은 gate 안쪽에서만 적용**. gate 대기 자체에 timeout을
   걸면 "대기만으로 timeout 발동" 버그가 재발한다(query_analyzer 초기 이슈).
-- **fallback(Ollama) 경로는 gate 제외**. GPU Ollama는 concurrent OK. 단 현재
+- **fallback(Claude Sonnet 4 API) 경로는 gate 제외**. PR #20 이후 fallback 은 Mac mini MLX 가 아닌 외부 Claude API 라 single-inference 제약 대상이 아니다. 단 현재
   구현상 `AIClient._call_chat` 내부에서 primary→fallback 전환이 일어나므로
   fallback도 gate 점유 상태로 실행된다. 허용 가능(fallback 빈도 낮음).
 - **MLX concurrency는 `MLX_CONCURRENCY = 1` 고정**. 모델이 바뀌어도 single-
diff --git a/app/services/search/verifier_service.py b/app/services/search/verifier_service.py
index 3606cbf..8cda6bd 100644
--- a/app/services/search/verifier_service.py
+++ b/app/services/search/verifier_service.py
@@ -11,7 +11,7 @@
 ## 핵심 원칙
 - **Verifier strong 단독 refuse 금지** — grounding strong 과 교차해야 refuse
 - **Timeout 3s** — 느리면 없는 게 낫다 (fail open)
-- MLX gate 미사용 (GPU Ollama concurrent OK)
+- MLX gate 미사용 (PR #20 이후 Mac mini 26B MLX endpoint — concurrent 안전성 별도 검토 필요)
 """
 
 from __future__ import annotations
diff --git a/app/workers/classify_worker.py b/app/workers/classify_worker.py
index fe5dbf4..6a3ea78 100644
--- a/app/workers/classify_worker.py
+++ b/app/workers/classify_worker.py
@@ -7,10 +7,10 @@ Legacy 경로 (primary 26B 호출):
   → ai_domain / ai_sub_group / document_type / ai_confidence / ai_tags /
      ai_summary / ai_suggestion / facet_doctype / importance 필드
 
-PR-B B-1 tier triage (신규, 4B gemma Ollama):
+PR-B B-1 tier triage (Mac mini 26B MLX, config.yaml ai.models.triage):
   - policy.routing.decide_routing 으로 RoutingDecision
   - policy.prompt_render.render_4b("p3a_short_summary", subject_domain) 로 프롬프트 렌더
-  - AIClient.call_triage(rendered) 호출 (llm_gate 외부, Ollama concurrent OK)
+  - AIClient.call_triage(rendered) 호출 (llm_gate 외부, Mac mini 26B MLX — concurrent 안전성 별도 검토 필요)
   - TriageOutput pydantic validate + JSON 깨짐 시 fallback escalate (R1)
   - R2 backlog guard: deep_summary 큐 ratio > threshold or pending >= threshold 이면 suppress
   - R3 head/middle/tail: 260k 초과 시 envelope text_ranges 3조각