refactor(search): swap 10 call sites to acquire_mlx_gate(Priority.*) (B-1)

DS-Mac-mini-26B-Priority-Gate-1 — 사용자-facing 모듈(app/services/search, app/api) 7 + worker 3 = 10 site 의 `async with get_mlx_gate():` 를 `async with acquire_mlx_gate(Priority.*):` 로 교체. Priority 기준으로는 FOREGROUND 6 + BACKGROUND 4.

Foreground 6 (user-facing path):
- app/services/search/evidence_service.py:315 (/ask evidence stage)
- app/services/search/classifier_service.py:103 (/ask classifier stage)
- app/services/search/synthesis_service.py:299 (/ask synthesis stage)
- app/api/documents.py:1306 (수동 analyze API)
- app/api/study_topics.py:1183 (subject note 동기 생성)
- app/api/study_questions.py:1560 (study explanation 동기 API)

Background 4 (worker queue / fire-and-forget):
- app/services/search/query_analyzer.py:240 (V0 grep 확인: fire-and-forget only. 호출처는 search_pipeline.py:179 trigger_background_analysis 뿐이며 docstring rule "analyze() 동기 호출 금지" 에 부합 → BACKGROUND 확정)
- app/workers/deep_summary_worker.py:110 (classify-escalate worker)
- app/workers/study_explanation_worker.py:149
- app/workers/study_session_analysis_worker.py:237

Cleanup:
- query_analyzer._get_llm_semaphore() 제거 — 모듈 내부 전용(self-only)이며 이번 교체로 미사용. 반환 타입 표기(`asyncio.Semaphore`)도 실제와 불일치 (이제 get_mlx_gate 가 Semaphore 가 아닌 context manager 반환).

기존 get_mlx_gate() legacy wrapper 는 보존 (BACKGROUND 매핑). user-facing path 의 get_mlx_gate() 잔재 0 — closure gate grep 검증 통과 (검증은 별도 commit 에서).
2026-05-17 08:51:57 +09:00
parent 7c9aff393a
commit a08b620894
10 changed files with 24 additions and 32 deletions
@@ -38,7 +38,7 @@ from models.queue import ProcessingQueue, enqueue_stage
 from models.user import User
 from services.document_telemetry import record_analyze_event, sanitize_source
 from services.prompt_versions import ANALYZE_PROMPT_VERSION, resolve_primary_model
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate

 router = APIRouter()
 logger = logging.getLogger(__name__)
@@ -1303,7 +1303,7 @@ async def analyze_document(
        ai_client = AIClient()
        raw: str | None = None
        try:
-            async with get_mlx_gate():
+            async with acquire_mlx_gate(Priority.FOREGROUND):
                async with asyncio.timeout(ANALYZE_TIMEOUT_S):
                    raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
        except asyncio.TimeoutError:
@@ -30,7 +30,7 @@ from models.study_question_image import StudyQuestionImage
 from models.study_quiz_session import StudyQuizSession
 from models.study_topic import StudyTopic
 from models.user import User
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.explanation_rag import (
    EvidenceItem,
    gather_explanation_context,
@@ -1557,7 +1557,7 @@ async def generate_ai_explanation(
    raw_text: str | None = None
    error_message: str | None = None
    try:
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
            async with asyncio.timeout(LLM_TIMEOUT_S):
                raw_text = await ai_client.call_primary(prompt)
    except asyncio.TimeoutError:
@@ -41,7 +41,7 @@ from models.study_question_image import StudyQuestionImage
 from models.study_quiz_session import StudyQuizSession
 from models.study_topic_subject_note import StudyTopicSubjectNote
 from models.user import User
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.subject_note_rag import (
    SubjectNoteContext,
    gather_subject_note_context,
@@ -1180,7 +1180,7 @@ async def generate_subject_note(
    ai_client = AIClient()
    raw_text: str | None = None
    try:
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
            async with asyncio.timeout(SUBJECT_NOTE_TIMEOUT_S):
                raw_text = await ai_client.call_primary(prompt)
    except asyncio.TimeoutError:
@@ -20,7 +20,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger

-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate

 logger = setup_logger("classifier")

@@ -100,7 +100,7 @@ async def classify(
        # Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference
        # MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰:
        # "MLX primary 호출 경로는 예외 없이 gate 획득 필수".
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                raw = await client._request(settings.ai.classifier, prompt)
        _failure_count = 0
@@ -57,7 +57,7 @@ from typing import TYPE_CHECKING
 from ai.client import AIClient, _load_prompt, parse_json_response
 from core.utils import setup_logger

-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate

 from .rerank_service import _extract_window

@@ -312,7 +312,7 @@ async def extract_evidence(
        # 2026-05-17: PR #20 이후 triage/primary 동일 Mac mini 26B endpoint. gate 외부 실행이 docstring
        # 영구 룰 ("MLX primary 호출 경로는 예외 없이 gate 획득 필수") 위반 — race condition 으로 동시
        # 호출 timeout 빈번. gate 안쪽으로 이동.
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                raw = await ai_client.call_triage(prompt)
    except asyncio.TimeoutError:
@@ -36,7 +36,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger

-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate

 logger = setup_logger("query_analyzer")

@@ -71,16 +71,6 @@ _PENDING: set[asyncio.Task[Any]] = set()
 _INFLIGHT: set[str] = set()


-def _get_llm_semaphore() -> asyncio.Semaphore:
-    """MLX single-inference gate를 반환. Phase 3.1부터 llm_gate.get_mlx_gate()
-    로 위임 — analyzer / evidence / synthesis 가 동일 semaphore 공유.
-
-    `LLM_CONCURRENCY` 상수는 하위 호환/문서용으로 유지하되, 실제 bound는
-    `llm_gate.MLX_CONCURRENCY` 가 담당한다.
-    """
-    return get_mlx_gate()
-
-
 def _cache_key(query: str) -> str:
    raw = f"{query}|{PROMPT_VERSION}|{_model_version()}"
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
@@ -237,11 +227,13 @@ async def analyze(query: str, ai_client: AIClient | None = None) -> dict:
        client_owned = True

    t_start = time.perf_counter()
-    semaphore = _get_llm_semaphore()
-    # ⚠️ 중요: semaphore 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
+    # 2026-05-17 B-1: query_analyzer 의 analyze() 는 fire-and-forget background only
+    # (search_pipeline.py:179 trigger_background_analysis 만 호출, docstring rule
+    # "analyze() 동기 호출 금지"). 따라서 Priority.BACKGROUND.
+    # ⚠️ 중요: gate 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
    #   timeout은 실제 LLM 호출 구간에만 적용.
    try:
-        async with semaphore:
+        async with acquire_mlx_gate(Priority.BACKGROUND):
            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                raw = await ai_client._call_chat(
                    ai_client.ai.primary,
@@ -31,7 +31,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
 from core.config import settings
 from core.utils import setup_logger

-from .llm_gate import get_mlx_gate
+from .llm_gate import Priority, acquire_mlx_gate

 if TYPE_CHECKING:
    from .evidence_service import EvidenceItem
@@ -296,7 +296,7 @@ async def synthesize(
    llm_error: str | None = None

    try:
-        async with get_mlx_gate():
+        async with acquire_mlx_gate(Priority.FOREGROUND):
            async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
                raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
    except asyncio.TimeoutError:
@@ -28,7 +28,7 @@ from models.document import Document
 from models.queue import ProcessingQueue
 from policy.prompt_render import render_26b, policy_version as compute_policy_version
 from services.document_telemetry import record_analyze_event
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate

 logger = setup_logger("deep_summary_worker")

@@ -107,7 +107,7 @@ async def process(document_id: int, session: AsyncSession) -> None:

    try:
        start = time.perf_counter()
-        async with get_mlx_gate():                     # primary(26B) 보호 Semaphore(1)
+        async with acquire_mlx_gate(Priority.BACKGROUND):  # 2026-05-17 B-1: classify-escalate worker
            raw = await client.call_primary(prompt)
        latency_ms = int((time.perf_counter() - start) * 1000)
    except Exception as exc:
@@ -27,7 +27,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from ai.client import AIClient, parse_json_response
 from models.study_question import StudyQuestion
 from models.study_question_job import StudyQuestionJob
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.explanation_rag import (
    gather_explanation_context,
    render_evidence_block,
@@ -146,7 +146,7 @@ async def run_explanation_job(session: AsyncSession, job: StudyQuestionJob) -> N

        ai_client = AIClient()
        try:
-            async with get_mlx_gate():
+            async with acquire_mlx_gate(Priority.BACKGROUND):  # 2026-05-17 B-1
                async with asyncio.timeout(LLM_TIMEOUT_S):
                    raw_text = await ai_client.call_primary(prompt)
            primary_name = (
@@ -32,7 +32,7 @@ from models.study_question import StudyQuestion, StudyQuestionAttempt
 from models.study_quiz_session import StudyQuizSession
 from models.study_quiz_session_analysis import StudyQuizSessionAnalysis
 from models.study_quiz_session_job import StudyQuizSessionJob
-from services.search.llm_gate import get_mlx_gate
+from services.search.llm_gate import Priority, acquire_mlx_gate
 from services.study.session_summary_guard import (
    GUARD_PATTERN,
    calibrate_confidence,
@@ -234,7 +234,7 @@ async def run_session_analysis_job(session: AsyncSession, job: StudyQuizSessionJ
        prompt = _render_session_summary_prompt(qs, prompt_attempts, ctx_docs)
        ai_client = AIClient()
        try:
-            async with get_mlx_gate():
+            async with acquire_mlx_gate(Priority.BACKGROUND):  # 2026-05-17 B-1
                async with asyncio.timeout(LLM_TIMEOUT_S):
                    raw_text = await ai_client.call_primary(prompt)
            primary_name = (