From ad3d51e3e0a0dcc234289facf18a68565c56e158 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Sat, 16 May 2026 19:54:55 +0900
Subject: [PATCH] =?UTF-8?q?fix(search):=20classifier=20+=20evidence=20gate?=
 =?UTF-8?q?=20=EC=95=88=EC=9C=BC=EB=A1=9C=20=EC=9D=B4=EB=8F=99=20(Mac=20mi?=
 =?UTF-8?q?ni=2026B=20race=20=EC=A2=85=EA=B2=B0)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

llm_gate.py docstring 영구 룰: "MLX primary 호출 경로는 예외 없이 gate 획득 필수".
PR #20 이후 classifier (Mac mini 26B 신규) + evidence (triage→Mac mini 26B 통합)
모두 gate 외부에서 실행 — 당시 코드 주석에 "concurrent 안전성 별도 검토" 로 명시.
1주 관찰 결과: race 빈번.

본 PR-Hermes-Docsrv-Search-1 Layer 1 fixture 측정:
- 8/10 query "conservative_refuse(no_classifier)" — classifier 가 동시 부하 시
  거의 모두 ReadTimeout 또는 wait_for(6s) timeout
- evidence ev_ms=15005 — synthesis 와 race 로 15s 누적

영향:
- ask total 시간 증가 (parallel race → serialized): query_analyzer 5s +
  classifier 3-5s + evidence 5s + synthesis 30s ≈ 43-45s 상한 (현실 평균)
- 응답률 ↑: race timeout 으로 인한 conservative_refuse 해소
- 사용자 체감: 빠른 거절 → 의미있는 답변. 단 대기 시간 ↑

후속:
- skill `docsrv_ask` curl `--max-time 20` → 60s 상향 필요 (별 PR 또는 본 PR
  안의 follow-up)
- 본 메모리 `2026-05-21 Mac mini 26B 1주 부하 측정` observation 의 결정
  outcome: gate 복귀 (triage 용 별도 소형 모델 재도입 옵션은 보류)
---
 app/services/search/classifier_service.py | 12 +++++++++---
 app/services/search/evidence_service.py   | 12 ++++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py
index e6a2b59..23c55ee 100644
--- a/app/services/search/classifier_service.py
+++ b/app/services/search/classifier_service.py
@@ -20,6 +20,8 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger
 
+from .llm_gate import get_mlx_gate
+
 logger = setup_logger("classifier")
 
 LLM_TIMEOUT_MS = 30000
@@ -94,9 +96,13 @@ async def classify(
     prompt = _build_input(query, top_chunks, rerank_scores)
     client = AIClient()
     try:
-        # ⚠ MLX gate 안 씀 (PR #20 이후 endpoint 가 Mac mini 26B 라 concurrent 안전성 별 검토).
-        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
-            raw = await client._request(settings.ai.classifier, prompt)
+        # 2026-05-17: PR #20 이후 endpoint 가 Mac mini 26B → llm_gate Semaphore(1) 필수.
+        # Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference
+        # MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰:
+        # "MLX primary 호출 경로는 예외 없이 gate 획득 필수".
+        async with get_mlx_gate():
+            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
+                raw = await client._request(settings.ai.classifier, prompt)
         _failure_count = 0
     except asyncio.TimeoutError:
         _failure_count += 1
diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py
index cc56377..84ee264 100644
--- a/app/services/search/evidence_service.py
+++ b/app/services/search/evidence_service.py
@@ -57,6 +57,8 @@ from typing import TYPE_CHECKING
 from ai.client import AIClient, _load_prompt, parse_json_response
 from core.utils import setup_logger
 
+from .llm_gate import get_mlx_gate
+
 from .rerank_service import _extract_window
 
 if TYPE_CHECKING:
@@ -307,10 +309,12 @@ async def extract_evidence(
     llm_error: str | None = None
 
     try:
-        # B-2: evidence 추출은 triage path (Mac mini 26B MLX) — gate 외부 실행. PR #20 이후 endpoint 통합으로 concurrent 안전성 별 검토.
-        # primary(26B) 는 synthesis 전용으로 MLX gate 보호.
-        async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
-            raw = await ai_client.call_triage(prompt)
+        # 2026-05-17: PR #20 이후 triage/primary 동일 Mac mini 26B endpoint. gate 외부 실행이 docstring
+        # 영구 룰 ("MLX primary 호출 경로는 예외 없이 gate 획득 필수") 위반 — race condition 으로 동시
+        # 호출 timeout 빈번. gate 안쪽으로 이동.
+        async with get_mlx_gate():
+            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
+                raw = await ai_client.call_triage(prompt)
     except asyncio.TimeoutError:
         llm_error = "timeout"
     except Exception as exc: