diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py index e6a2b59..23c55ee 100644 --- a/app/services/search/classifier_service.py +++ b/app/services/search/classifier_service.py @@ -20,6 +20,8 @@ from ai.client import AIClient, _load_prompt, parse_json_response from core.config import settings from core.utils import setup_logger +from .llm_gate import get_mlx_gate + logger = setup_logger("classifier") LLM_TIMEOUT_MS = 30000 @@ -94,9 +96,13 @@ async def classify( prompt = _build_input(query, top_chunks, rerank_scores) client = AIClient() try: - # ⚠ MLX gate 안 씀 (PR #20 이후 endpoint 가 Mac mini 26B 라 concurrent 안전성 별 검토). - async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): - raw = await client._request(settings.ai.classifier, prompt) + # 2026-05-17: PR #20 이후 endpoint 가 Mac mini 26B → llm_gate Semaphore(1) 필수. + # Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference + # MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰: + # "MLX primary 호출 경로는 예외 없이 gate 획득 필수". + async with get_mlx_gate(): + async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): + raw = await client._request(settings.ai.classifier, prompt) _failure_count = 0 except asyncio.TimeoutError: _failure_count += 1 diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py index cc56377..84ee264 100644 --- a/app/services/search/evidence_service.py +++ b/app/services/search/evidence_service.py @@ -57,6 +57,8 @@ from typing import TYPE_CHECKING from ai.client import AIClient, _load_prompt, parse_json_response from core.utils import setup_logger +from .llm_gate import get_mlx_gate + from .rerank_service import _extract_window if TYPE_CHECKING: @@ -307,10 +309,12 @@ async def extract_evidence( llm_error: str | None = None try: - # B-2: evidence 추출은 triage path (Mac mini 26B MLX) — gate 외부 실행. PR #20 이후 endpoint 통합으로 concurrent 안전성 별 검토. - # primary(26B) 는 synthesis 전용으로 MLX gate 보호. - async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): - raw = await ai_client.call_triage(prompt) + # 2026-05-17: PR #20 이후 triage/primary 동일 Mac mini 26B endpoint. gate 외부 실행이 docstring + # 영구 룰 ("MLX primary 호출 경로는 예외 없이 gate 획득 필수") 위반 — race condition 으로 동시 + # 호출 timeout 빈번. gate 안쪽으로 이동. + async with get_mlx_gate(): + async with asyncio.timeout(LLM_TIMEOUT_MS / 1000): + raw = await ai_client.call_triage(prompt) except asyncio.TimeoutError: llm_error = "timeout" except Exception as exc: