refactor(search): swap 10 call sites to acquire_mlx_gate(Priority.*) (B-1)

DS-Mac-mini-26B-Priority-Gate-1 — 사용자-facing 모듈 7 (이 중 query_analyzer 는
fire-and-forget 이므로 BACKGROUND 매핑 → 우선순위 기준 Foreground 6 / Background 4)
+ worker 3 = 10 site 의
`async with get_mlx_gate():` → `async with acquire_mlx_gate(Priority.*):` 교체.

Foreground 6 (user-facing path):
- app/services/search/evidence_service.py:315 (/ask evidence stage)
- app/services/search/classifier_service.py:103 (/ask classifier stage)
- app/services/search/synthesis_service.py:299 (/ask synthesis stage)
- app/api/documents.py:1306 (수동 analyze API)
- app/api/study_topics.py:1183 (subject note 동기 생성)
- app/api/study_questions.py:1560 (study explanation 동기 API)

Background 4 (worker queue / fire-and-forget):
- app/services/search/query_analyzer.py:240 (V0 grep 확인: fire-and-forget only,
  search_pipeline.py:179 trigger_background_analysis 만, docstring rule
  "analyze() 동기 호출 금지" 부합 → BACKGROUND 확정)
- app/workers/deep_summary_worker.py:110 (classify-escalate worker)
- app/workers/study_explanation_worker.py:149
- app/workers/study_session_analysis_worker.py:237

Cleanup:
- query_analyzer._get_llm_semaphore() 제거 — 모듈 내부 전용 helper 로, 이번 교체 후
  호출처가 없어짐. 반환 타입 annotation (`asyncio.Semaphore`) 도 실제와 불일치
  (이제 get_mlx_gate 가 Semaphore 아닌 context manager 반환)

기존 get_mlx_gate() legacy wrapper 는 보존 (BACKGROUND 매핑). user-facing path
잔재 0 — closure gate grep 검증 통과 (별 commit 에서).
This commit is contained in:
Hyungi Ahn
2026-05-17 08:51:57 +09:00
parent 7c9aff393a
commit a08b620894
10 changed files with 24 additions and 32 deletions
+2 -2
View File
@@ -38,7 +38,7 @@ from models.queue import ProcessingQueue, enqueue_stage
from models.user import User
from services.document_telemetry import record_analyze_event, sanitize_source
from services.prompt_versions import ANALYZE_PROMPT_VERSION, resolve_primary_model
from services.search.llm_gate import get_mlx_gate
from services.search.llm_gate import Priority, acquire_mlx_gate
router = APIRouter()
logger = logging.getLogger(__name__)
@@ -1303,7 +1303,7 @@ async def analyze_document(
ai_client = AIClient()
raw: str | None = None
try:
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.FOREGROUND):
async with asyncio.timeout(ANALYZE_TIMEOUT_S):
raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
except asyncio.TimeoutError:
+2 -2
View File
@@ -30,7 +30,7 @@ from models.study_question_image import StudyQuestionImage
from models.study_quiz_session import StudyQuizSession
from models.study_topic import StudyTopic
from models.user import User
from services.search.llm_gate import get_mlx_gate
from services.search.llm_gate import Priority, acquire_mlx_gate
from services.study.explanation_rag import (
EvidenceItem,
gather_explanation_context,
@@ -1557,7 +1557,7 @@ async def generate_ai_explanation(
raw_text: str | None = None
error_message: str | None = None
try:
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.FOREGROUND):
async with asyncio.timeout(LLM_TIMEOUT_S):
raw_text = await ai_client.call_primary(prompt)
except asyncio.TimeoutError:
+2 -2
View File
@@ -41,7 +41,7 @@ from models.study_question_image import StudyQuestionImage
from models.study_quiz_session import StudyQuizSession
from models.study_topic_subject_note import StudyTopicSubjectNote
from models.user import User
from services.search.llm_gate import get_mlx_gate
from services.search.llm_gate import Priority, acquire_mlx_gate
from services.study.subject_note_rag import (
SubjectNoteContext,
gather_subject_note_context,
@@ -1180,7 +1180,7 @@ async def generate_subject_note(
ai_client = AIClient()
raw_text: str | None = None
try:
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.FOREGROUND):
async with asyncio.timeout(SUBJECT_NOTE_TIMEOUT_S):
raw_text = await ai_client.call_primary(prompt)
except asyncio.TimeoutError:
+2 -2
View File
@@ -20,7 +20,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
from core.config import settings
from core.utils import setup_logger
from .llm_gate import get_mlx_gate
from .llm_gate import Priority, acquire_mlx_gate
logger = setup_logger("classifier")
@@ -100,7 +100,7 @@ async def classify(
# Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference
# MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰:
# "MLX primary 호출 경로는 예외 없이 gate 획득 필수".
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.FOREGROUND):
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
raw = await client._request(settings.ai.classifier, prompt)
_failure_count = 0
+2 -2
View File
@@ -57,7 +57,7 @@ from typing import TYPE_CHECKING
from ai.client import AIClient, _load_prompt, parse_json_response
from core.utils import setup_logger
from .llm_gate import get_mlx_gate
from .llm_gate import Priority, acquire_mlx_gate
from .rerank_service import _extract_window
@@ -312,7 +312,7 @@ async def extract_evidence(
# 2026-05-17: PR #20 이후 triage/primary 동일 Mac mini 26B endpoint. gate 외부 실행이 docstring
# 영구 룰 ("MLX primary 호출 경로는 예외 없이 gate 획득 필수") 위반 — race condition 으로 동시
# 호출 timeout 빈번. gate 안쪽으로 이동.
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.FOREGROUND):
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
raw = await ai_client.call_triage(prompt)
except asyncio.TimeoutError:
+6 -14
View File
@@ -36,7 +36,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
from core.config import settings
from core.utils import setup_logger
from .llm_gate import get_mlx_gate
from .llm_gate import Priority, acquire_mlx_gate
logger = setup_logger("query_analyzer")
@@ -71,16 +71,6 @@ _PENDING: set[asyncio.Task[Any]] = set()
_INFLIGHT: set[str] = set()
def _get_llm_semaphore() -> asyncio.Semaphore:
"""MLX single-inference gate를 반환. Phase 3.1부터 llm_gate.get_mlx_gate()
위임 analyzer / evidence / synthesis 동일 semaphore 공유.
`LLM_CONCURRENCY` 상수는 하위 호환/문서용으로 유지하되, 실제 bound는
`llm_gate.MLX_CONCURRENCY` 담당한다.
"""
return get_mlx_gate()
def _cache_key(query: str) -> str:
raw = f"{query}|{PROMPT_VERSION}|{_model_version()}"
return hashlib.sha256(raw.encode("utf-8")).hexdigest()
@@ -237,11 +227,13 @@ async def analyze(query: str, ai_client: AIClient | None = None) -> dict:
client_owned = True
t_start = time.perf_counter()
semaphore = _get_llm_semaphore()
# ⚠️ 중요: semaphore 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
# 2026-05-17 B-1: query_analyzer 의 analyze() 는 fire-and-forget background only
# (search_pipeline.py:179 trigger_background_analysis 만 호출, docstring rule
# "analyze() 동기 호출 금지"). 따라서 Priority.BACKGROUND.
# ⚠️ 중요: gate 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
# timeout은 실제 LLM 호출 구간에만 적용.
try:
async with semaphore:
async with acquire_mlx_gate(Priority.BACKGROUND):
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
raw = await ai_client._call_chat(
ai_client.ai.primary,
+2 -2
View File
@@ -31,7 +31,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
from core.config import settings
from core.utils import setup_logger
from .llm_gate import get_mlx_gate
from .llm_gate import Priority, acquire_mlx_gate
if TYPE_CHECKING:
from .evidence_service import EvidenceItem
@@ -296,7 +296,7 @@ async def synthesize(
llm_error: str | None = None
try:
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.FOREGROUND):
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
except asyncio.TimeoutError:
+2 -2
View File
@@ -28,7 +28,7 @@ from models.document import Document
from models.queue import ProcessingQueue
from policy.prompt_render import render_26b, policy_version as compute_policy_version
from services.document_telemetry import record_analyze_event
from services.search.llm_gate import get_mlx_gate
from services.search.llm_gate import Priority, acquire_mlx_gate
logger = setup_logger("deep_summary_worker")
@@ -107,7 +107,7 @@ async def process(document_id: int, session: AsyncSession) -> None:
try:
start = time.perf_counter()
async with get_mlx_gate(): # primary(26B) 보호 Semaphore(1)
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1: classify-escalate worker
raw = await client.call_primary(prompt)
latency_ms = int((time.perf_counter() - start) * 1000)
except Exception as exc:
+2 -2
View File
@@ -27,7 +27,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from ai.client import AIClient, parse_json_response
from models.study_question import StudyQuestion
from models.study_question_job import StudyQuestionJob
from services.search.llm_gate import get_mlx_gate
from services.search.llm_gate import Priority, acquire_mlx_gate
from services.study.explanation_rag import (
gather_explanation_context,
render_evidence_block,
@@ -146,7 +146,7 @@ async def run_explanation_job(session: AsyncSession, job: StudyQuestionJob) -> N
ai_client = AIClient()
try:
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1
async with asyncio.timeout(LLM_TIMEOUT_S):
raw_text = await ai_client.call_primary(prompt)
primary_name = (
+2 -2
View File
@@ -32,7 +32,7 @@ from models.study_question import StudyQuestion, StudyQuestionAttempt
from models.study_quiz_session import StudyQuizSession
from models.study_quiz_session_analysis import StudyQuizSessionAnalysis
from models.study_quiz_session_job import StudyQuizSessionJob
from services.search.llm_gate import get_mlx_gate
from services.search.llm_gate import Priority, acquire_mlx_gate
from services.study.session_summary_guard import (
GUARD_PATTERN,
calibrate_confidence,
@@ -234,7 +234,7 @@ async def run_session_analysis_job(session: AsyncSession, job: StudyQuizSessionJ
prompt = _render_session_summary_prompt(qs, prompt_attempts, ctx_docs)
ai_client = AIClient()
try:
async with get_mlx_gate():
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1
async with asyncio.timeout(LLM_TIMEOUT_S):
raw_text = await ai_client.call_primary(prompt)
primary_name = (