From a08b6208949969a77a9654fc69c320908050e8de Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Sun, 17 May 2026 08:51:57 +0900
Subject: [PATCH] refactor(search): swap 10 call sites to
 acquire_mlx_gate(Priority.*) (B-1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DS-Mac-mini-26B-Priority-Gate-1 — 총 10 site (모듈 위치 기준: api/services 7 + worker 3,
Priority 기준: FOREGROUND 6 + BACKGROUND 4) 의
`async with get_mlx_gate():` → `async with acquire_mlx_gate(Priority.*):` 교체.

Foreground 6 (user-facing path):
- app/services/search/evidence_service.py:315 (/ask evidence stage)
- app/services/search/classifier_service.py:103 (/ask classifier stage)
- app/services/search/synthesis_service.py:299 (/ask synthesis stage)
- app/api/documents.py:1306 (수동 analyze API)
- app/api/study_topics.py:1183 (subject note 동기 생성)
- app/api/study_questions.py:1560 (study explanation 동기 API)

Background 4 (worker queue / fire-and-forget):
- app/services/search/query_analyzer.py:240 (V0 grep 확인: fire-and-forget only,
  search_pipeline.py:179 trigger_background_analysis 만, docstring rule
  "analyze() 동기 호출 금지" 부합 → BACKGROUND 확정)
- app/workers/deep_summary_worker.py:110 (classify-escalate worker)
- app/workers/study_explanation_worker.py:149
- app/workers/study_session_analysis_worker.py:237

Cleanup:
- query_analyzer._get_llm_semaphore() 제거 — 모듈 내부 전용 helper 였고 이번 교체 후 호출처 0.
  반환 타입 annotation (asyncio.Semaphore) 도 실제와 불일치
  (이제 get_mlx_gate 가 Semaphore 아닌 context manager 반환)

기존 get_mlx_gate() legacy wrapper 는 보존 (BACKGROUND 매핑). user-facing path 의
get_mlx_gate() 호출 잔재 0 — closure gate grep 검증 통과 (별도 commit 에서 수행).
---
 app/api/documents.py                         |  4 ++--
 app/api/study_questions.py                   |  4 ++--
 app/api/study_topics.py                      |  4 ++--
 app/services/search/classifier_service.py    |  4 ++--
 app/services/search/evidence_service.py      |  4 ++--
 app/services/search/query_analyzer.py        | 20 ++++++--------------
 app/services/search/synthesis_service.py     |  4 ++--
 app/workers/deep_summary_worker.py           |  4 ++--
 app/workers/study_explanation_worker.py      |  4 ++--
 app/workers/study_session_analysis_worker.py |  4 ++--
 10 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/app/api/documents.py b/app/api/documents.py
index 261781e..441769b 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -38,7 +38,7 @@ from models.queue import ProcessingQueue, enqueue_stage
 from models.user import User
 from services.document_telemetry import record_analyze_event, sanitize_source
 from services.prompt_versions import ANALYZE_PROMPT_VERSION, resolve_primary_model
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 
 router = APIRouter()
 logger = logging.getLogger(__name__)
@@ -1303,7 +1303,7 @@ async def analyze_document(
         ai_client = AIClient()
         raw: str | None = None
         try:
-            async with get_mlx_gate():
+            async with acquire_mlx_gate(Priority.FOREGROUND):
                 async with asyncio.timeout(ANALYZE_TIMEOUT_S):
                     raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
         except asyncio.TimeoutError:
diff --git a/app/api/study_questions.py b/app/api/study_questions.py
index 8a83dec..ab5a4c6 100644
--- a/app/api/study_questions.py
+++ b/app/api/study_questions.py
@@ -30,7 +30,7 @@ from models.study_question_image import StudyQuestionImage
 from models.study_quiz_session import StudyQuizSession
 from models.study_topic import StudyTopic
 from models.user import User
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.explanation_rag import (
     EvidenceItem,
     gather_explanation_context,
@@ -1557,7 +1557,7 @@ async def generate_ai_explanation(
     raw_text: str | None = None
     error_message: str | None = None
     try:
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
             async with asyncio.timeout(LLM_TIMEOUT_S):
                 raw_text = await ai_client.call_primary(prompt)
     except asyncio.TimeoutError:
diff --git a/app/api/study_topics.py b/app/api/study_topics.py
index b528329..89b8aef 100644
--- a/app/api/study_topics.py
+++ b/app/api/study_topics.py
@@ -41,7 +41,7 @@ from models.study_question_image import StudyQuestionImage
 from models.study_quiz_session import StudyQuizSession
 from models.study_topic_subject_note import StudyTopicSubjectNote
 from models.user import User
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.subject_note_rag import (
     SubjectNoteContext,
     gather_subject_note_context,
@@ -1180,7 +1180,7 @@ async def generate_subject_note(
     ai_client = AIClient()
     raw_text: str | None = None
     try:
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
             async with asyncio.timeout(SUBJECT_NOTE_TIMEOUT_S):
                 raw_text = await ai_client.call_primary(prompt)
     except asyncio.TimeoutError:
diff --git a/app/services/search/classifier_service.py b/app/services/search/classifier_service.py
index 23c55ee..626b5c9 100644
--- a/app/services/search/classifier_service.py
+++ b/app/services/search/classifier_service.py
@@ -20,7 +20,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger
 
-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate
 
 logger = setup_logger("classifier")
 
@@ -100,7 +100,7 @@ async def classify(
         # Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference
         # MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰:
         # "MLX primary 호출 경로는 예외 없이 gate 획득 필수".
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
             async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                 raw = await client._request(settings.ai.classifier, prompt)
         _failure_count = 0
diff --git a/app/services/search/evidence_service.py b/app/services/search/evidence_service.py
index 453b29c..71b50bd 100644
--- a/app/services/search/evidence_service.py
+++ b/app/services/search/evidence_service.py
@@ -57,7 +57,7 @@ from typing import TYPE_CHECKING
 from ai.client import AIClient, _load_prompt, parse_json_response
 from core.utils import setup_logger
 
-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate
 
 from .rerank_service import _extract_window
 
@@ -312,7 +312,7 @@ async def extract_evidence(
         # 2026-05-17: PR #20 이후 triage/primary 동일 Mac mini 26B endpoint. gate 외부 실행이 docstring
         # 영구 룰 ("MLX primary 호출 경로는 예외 없이 gate 획득 필수") 위반 — race condition 으로 동시
         # 호출 timeout 빈번. gate 안쪽으로 이동.
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
             async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                 raw = await ai_client.call_triage(prompt)
     except asyncio.TimeoutError:
diff --git a/app/services/search/query_analyzer.py b/app/services/search/query_analyzer.py
index 105383c..78a6f90 100644
--- a/app/services/search/query_analyzer.py
+++ b/app/services/search/query_analyzer.py
@@ -36,7 +36,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger
 
-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate
 
 logger = setup_logger("query_analyzer")
 
@@ -71,16 +71,6 @@ _PENDING: set[asyncio.Task[Any]] = set()
 _INFLIGHT: set[str] = set()
 
 
-def _get_llm_semaphore() -> asyncio.Semaphore:
-    """MLX single-inference gate를 반환. Phase 3.1부터 llm_gate.get_mlx_gate()
-    로 위임 — analyzer / evidence / synthesis 가 동일 semaphore 공유.
-
-    `LLM_CONCURRENCY` 상수는 하위 호환/문서용으로 유지하되, 실제 bound는
-    `llm_gate.MLX_CONCURRENCY` 가 담당한다.
-    """
-    return get_mlx_gate()
-
-
 def _cache_key(query: str) -> str:
     raw = f"{query}|{PROMPT_VERSION}|{_model_version()}"
     return hashlib.sha256(raw.encode("utf-8")).hexdigest()
@@ -237,11 +227,13 @@ async def analyze(query: str, ai_client: AIClient | None = None) -> dict:
         client_owned = True
 
     t_start = time.perf_counter()
-    semaphore = _get_llm_semaphore()
-    # ⚠️ 중요: semaphore 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
+    # 2026-05-17 B-1: query_analyzer 의 analyze() 는 fire-and-forget background only
+    # (search_pipeline.py:179 trigger_background_analysis 만 호출, docstring rule
+    # "analyze() 동기 호출 금지"). 따라서 Priority.BACKGROUND.
+    # ⚠️ 중요: gate 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
     #   timeout은 실제 LLM 호출 구간에만 적용.
     try:
-        async with semaphore:
+        async with acquire_mlx_gate(Priority.BACKGROUND):
             async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                 raw = await ai_client._call_chat(
                     ai_client.ai.primary,
diff --git a/app/services/search/synthesis_service.py b/app/services/search/synthesis_service.py
index f0e5452..abb573e 100644
--- a/app/services/search/synthesis_service.py
+++ b/app/services/search/synthesis_service.py
@@ -31,7 +31,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger
 
-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate
 
 if TYPE_CHECKING:
     from .evidence_service import EvidenceItem
@@ -296,7 +296,7 @@ async def synthesize(
     llm_error: str | None = None
 
     try:
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
             async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                 raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
     except asyncio.TimeoutError:
diff --git a/app/workers/deep_summary_worker.py b/app/workers/deep_summary_worker.py
index 08ff909..d955e39 100644
--- a/app/workers/deep_summary_worker.py
+++ b/app/workers/deep_summary_worker.py
@@ -28,7 +28,7 @@ from models.document import Document
 from models.queue import ProcessingQueue
 from policy.prompt_render import render_26b, policy_version as compute_policy_version
 from services.document_telemetry import record_analyze_event
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 
 logger = setup_logger("deep_summary_worker")
 
@@ -107,7 +107,7 @@ async def process(document_id: int, session: AsyncSession) -> None:
 
     try:
         start = time.perf_counter()
-        async with get_mlx_gate():                     # primary(26B) 보호 Semaphore(1)
+        async with acquire_mlx_gate(Priority.BACKGROUND):  # 2026-05-17 B-1: classify-escalate worker
             raw = await client.call_primary(prompt)
         latency_ms = int((time.perf_counter() - start) * 1000)
     except Exception as exc:
diff --git a/app/workers/study_explanation_worker.py b/app/workers/study_explanation_worker.py
index 2570f71..838494c 100644
--- a/app/workers/study_explanation_worker.py
+++ b/app/workers/study_explanation_worker.py
@@ -27,7 +27,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from ai.client import AIClient, parse_json_response
 from models.study_question import StudyQuestion
 from models.study_question_job import StudyQuestionJob
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.explanation_rag import (
     gather_explanation_context,
     render_evidence_block,
@@ -146,7 +146,7 @@ async def run_explanation_job(session: AsyncSession, job: StudyQuestionJob) -> N
 
         ai_client = AIClient()
         try:
-            async with get_mlx_gate():
+            async with acquire_mlx_gate(Priority.BACKGROUND):  # 2026-05-17 B-1
                 async with asyncio.timeout(LLM_TIMEOUT_S):
                     raw_text = await ai_client.call_primary(prompt)
             primary_name = (
diff --git a/app/workers/study_session_analysis_worker.py b/app/workers/study_session_analysis_worker.py
index c073812..fc14937 100644
--- a/app/workers/study_session_analysis_worker.py
+++ b/app/workers/study_session_analysis_worker.py
@@ -32,7 +32,7 @@ from models.study_question import StudyQuestion, StudyQuestionAttempt
 from models.study_quiz_session import StudyQuizSession
 from models.study_quiz_session_analysis import StudyQuizSessionAnalysis
 from models.study_quiz_session_job import StudyQuizSessionJob
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.session_summary_guard import (
     GUARD_PATTERN,
     calibrate_confidence,
@@ -234,7 +234,7 @@ async def run_session_analysis_job(session: AsyncSession, job: StudyQuizSessionJ
         prompt = _render_session_summary_prompt(qs, prompt_attempts, ctx_docs)
         ai_client = AIClient()
         try:
-            async with get_mlx_gate():
+            async with acquire_mlx_gate(Priority.BACKGROUND):  # 2026-05-17 B-1
                 async with asyncio.timeout(LLM_TIMEOUT_S):
                     raw_text = await ai_client.call_primary(prompt)
             primary_name = (