refactor(search): swap 10 call sites to acquire_mlx_gate(Priority.*) (B-1)
DS-Mac-mini-26B-Priority-Gate-1 — 사용자-facing 7 + worker 3 = 10 site 의 `async with get_mlx_gate():` → `async with acquire_mlx_gate(Priority.*):` 교체.

Foreground 6 (user-facing path):
- app/services/search/evidence_service.py:315 (/ask evidence stage)
- app/services/search/classifier_service.py:103 (/ask classifier stage)
- app/services/search/synthesis_service.py:299 (/ask synthesis stage)
- app/api/documents.py:1306 (수동 analyze API)
- app/api/study_topics.py:1183 (subject note 동기 생성)
- app/api/study_questions.py:1560 (study explanation 동기 API)

Background 4 (worker queue / fire-and-forget; non-worker 7 site 중 query_analyzer 는 fire-and-forget 전용이라 BACKGROUND 로 분류 → Foreground 6 + Background 4 = 10):
- app/services/search/query_analyzer.py:240 (V0 grep 확인: fire-and-forget only, search_pipeline.py:179 trigger_background_analysis 만, docstring rule "analyze() 동기 호출 금지" 부합 → BACKGROUND 확정)
- app/workers/deep_summary_worker.py:110 (classify-escalate worker)
- app/workers/study_explanation_worker.py:149
- app/workers/study_session_analysis_worker.py:237

Cleanup:
- query_analyzer._get_llm_semaphore() 제거 — self-only, unused, signature 거짓말 (이제 get_mlx_gate 가 Semaphore 아닌 context manager 반환)

기존 get_mlx_gate() legacy wrapper 는 보존 (BACKGROUND 매핑). user-facing path 잔재 0 — closure gate grep 검증 통과 (별도 commit 에서).
This commit is contained in:
@@ -38,7 +38,7 @@ from models.queue import ProcessingQueue, enqueue_stage
|
||||
from models.user import User
|
||||
from services.document_telemetry import record_analyze_event, sanitize_source
|
||||
from services.prompt_versions import ANALYZE_PROMPT_VERSION, resolve_primary_model
|
||||
from services.search.llm_gate import get_mlx_gate
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
router = APIRouter()
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -1303,7 +1303,7 @@ async def analyze_document(
|
||||
ai_client = AIClient()
|
||||
raw: str | None = None
|
||||
try:
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.FOREGROUND):
|
||||
async with asyncio.timeout(ANALYZE_TIMEOUT_S):
|
||||
raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
|
||||
except asyncio.TimeoutError:
|
||||
|
||||
@@ -30,7 +30,7 @@ from models.study_question_image import StudyQuestionImage
|
||||
from models.study_quiz_session import StudyQuizSession
|
||||
from models.study_topic import StudyTopic
|
||||
from models.user import User
|
||||
from services.search.llm_gate import get_mlx_gate
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
from services.study.explanation_rag import (
|
||||
EvidenceItem,
|
||||
gather_explanation_context,
|
||||
@@ -1557,7 +1557,7 @@ async def generate_ai_explanation(
|
||||
raw_text: str | None = None
|
||||
error_message: str | None = None
|
||||
try:
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.FOREGROUND):
|
||||
async with asyncio.timeout(LLM_TIMEOUT_S):
|
||||
raw_text = await ai_client.call_primary(prompt)
|
||||
except asyncio.TimeoutError:
|
||||
|
||||
@@ -41,7 +41,7 @@ from models.study_question_image import StudyQuestionImage
|
||||
from models.study_quiz_session import StudyQuizSession
|
||||
from models.study_topic_subject_note import StudyTopicSubjectNote
|
||||
from models.user import User
|
||||
from services.search.llm_gate import get_mlx_gate
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
from services.study.subject_note_rag import (
|
||||
SubjectNoteContext,
|
||||
gather_subject_note_context,
|
||||
@@ -1180,7 +1180,7 @@ async def generate_subject_note(
|
||||
ai_client = AIClient()
|
||||
raw_text: str | None = None
|
||||
try:
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.FOREGROUND):
|
||||
async with asyncio.timeout(SUBJECT_NOTE_TIMEOUT_S):
|
||||
raw_text = await ai_client.call_primary(prompt)
|
||||
except asyncio.TimeoutError:
|
||||
|
||||
@@ -20,7 +20,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
|
||||
from core.config import settings
|
||||
from core.utils import setup_logger
|
||||
|
||||
from .llm_gate import get_mlx_gate
|
||||
from .llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
logger = setup_logger("classifier")
|
||||
|
||||
@@ -100,7 +100,7 @@ async def classify(
|
||||
# Gate 미사용 시 classifier + evidence + synthesis 가 동시에 single-inference
|
||||
# MLX 에 race → 거의 모두 timeout (실측: 8/10 fixture query). docstring 영구 룰:
|
||||
# "MLX primary 호출 경로는 예외 없이 gate 획득 필수".
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.FOREGROUND):
|
||||
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
|
||||
raw = await client._request(settings.ai.classifier, prompt)
|
||||
_failure_count = 0
|
||||
|
||||
@@ -57,7 +57,7 @@ from typing import TYPE_CHECKING
|
||||
from ai.client import AIClient, _load_prompt, parse_json_response
|
||||
from core.utils import setup_logger
|
||||
|
||||
from .llm_gate import get_mlx_gate
|
||||
from .llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
from .rerank_service import _extract_window
|
||||
|
||||
@@ -312,7 +312,7 @@ async def extract_evidence(
|
||||
# 2026-05-17: PR #20 이후 triage/primary 동일 Mac mini 26B endpoint. gate 외부 실행이 docstring
|
||||
# 영구 룰 ("MLX primary 호출 경로는 예외 없이 gate 획득 필수") 위반 — race condition 으로 동시
|
||||
# 호출 timeout 빈번. gate 안쪽으로 이동.
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.FOREGROUND):
|
||||
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
|
||||
raw = await ai_client.call_triage(prompt)
|
||||
except asyncio.TimeoutError:
|
||||
|
||||
@@ -36,7 +36,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
|
||||
from core.config import settings
|
||||
from core.utils import setup_logger
|
||||
|
||||
from .llm_gate import get_mlx_gate
|
||||
from .llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
logger = setup_logger("query_analyzer")
|
||||
|
||||
@@ -71,16 +71,6 @@ _PENDING: set[asyncio.Task[Any]] = set()
|
||||
_INFLIGHT: set[str] = set()
|
||||
|
||||
|
||||
def _get_llm_semaphore() -> asyncio.Semaphore:
|
||||
"""MLX single-inference gate를 반환. Phase 3.1부터 llm_gate.get_mlx_gate()
|
||||
로 위임 — analyzer / evidence / synthesis 가 동일 semaphore 공유.
|
||||
|
||||
`LLM_CONCURRENCY` 상수는 하위 호환/문서용으로 유지하되, 실제 bound는
|
||||
`llm_gate.MLX_CONCURRENCY` 가 담당한다.
|
||||
"""
|
||||
return get_mlx_gate()
|
||||
|
||||
|
||||
def _cache_key(query: str) -> str:
|
||||
raw = f"{query}|{PROMPT_VERSION}|{_model_version()}"
|
||||
return hashlib.sha256(raw.encode("utf-8")).hexdigest()
|
||||
@@ -237,11 +227,13 @@ async def analyze(query: str, ai_client: AIClient | None = None) -> dict:
|
||||
client_owned = True
|
||||
|
||||
t_start = time.perf_counter()
|
||||
semaphore = _get_llm_semaphore()
|
||||
# ⚠️ 중요: semaphore 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
|
||||
# 2026-05-17 B-1: query_analyzer 의 analyze() 는 fire-and-forget background only
|
||||
# (search_pipeline.py:179 trigger_background_analysis 만 호출, docstring rule
|
||||
# "analyze() 동기 호출 금지"). 따라서 Priority.BACKGROUND.
|
||||
# ⚠️ 중요: gate 대기는 timeout 포함되면 안됨 (대기만 해도 timeout 발동)
|
||||
# timeout은 실제 LLM 호출 구간에만 적용.
|
||||
try:
|
||||
async with semaphore:
|
||||
async with acquire_mlx_gate(Priority.BACKGROUND):
|
||||
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
|
||||
raw = await ai_client._call_chat(
|
||||
ai_client.ai.primary,
|
||||
|
||||
@@ -31,7 +31,7 @@ from ai.client import AIClient, _load_prompt, parse_json_response
|
||||
from core.config import settings
|
||||
from core.utils import setup_logger
|
||||
|
||||
from .llm_gate import get_mlx_gate
|
||||
from .llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .evidence_service import EvidenceItem
|
||||
@@ -296,7 +296,7 @@ async def synthesize(
|
||||
llm_error: str | None = None
|
||||
|
||||
try:
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.FOREGROUND):
|
||||
async with asyncio.timeout(LLM_TIMEOUT_MS / 1000):
|
||||
raw = await ai_client._call_chat(ai_client.ai.primary, prompt)
|
||||
except asyncio.TimeoutError:
|
||||
|
||||
@@ -28,7 +28,7 @@ from models.document import Document
|
||||
from models.queue import ProcessingQueue
|
||||
from policy.prompt_render import render_26b, policy_version as compute_policy_version
|
||||
from services.document_telemetry import record_analyze_event
|
||||
from services.search.llm_gate import get_mlx_gate
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
logger = setup_logger("deep_summary_worker")
|
||||
|
||||
@@ -107,7 +107,7 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
|
||||
try:
|
||||
start = time.perf_counter()
|
||||
async with get_mlx_gate(): # primary(26B) 보호 Semaphore(1)
|
||||
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1: classify-escalate worker
|
||||
raw = await client.call_primary(prompt)
|
||||
latency_ms = int((time.perf_counter() - start) * 1000)
|
||||
except Exception as exc:
|
||||
|
||||
@@ -27,7 +27,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from ai.client import AIClient, parse_json_response
|
||||
from models.study_question import StudyQuestion
|
||||
from models.study_question_job import StudyQuestionJob
|
||||
from services.search.llm_gate import get_mlx_gate
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
from services.study.explanation_rag import (
|
||||
gather_explanation_context,
|
||||
render_evidence_block,
|
||||
@@ -146,7 +146,7 @@ async def run_explanation_job(session: AsyncSession, job: StudyQuestionJob) -> N
|
||||
|
||||
ai_client = AIClient()
|
||||
try:
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1
|
||||
async with asyncio.timeout(LLM_TIMEOUT_S):
|
||||
raw_text = await ai_client.call_primary(prompt)
|
||||
primary_name = (
|
||||
|
||||
@@ -32,7 +32,7 @@ from models.study_question import StudyQuestion, StudyQuestionAttempt
|
||||
from models.study_quiz_session import StudyQuizSession
|
||||
from models.study_quiz_session_analysis import StudyQuizSessionAnalysis
|
||||
from models.study_quiz_session_job import StudyQuizSessionJob
|
||||
from services.search.llm_gate import get_mlx_gate
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
from services.study.session_summary_guard import (
|
||||
GUARD_PATTERN,
|
||||
calibrate_confidence,
|
||||
@@ -234,7 +234,7 @@ async def run_session_analysis_job(session: AsyncSession, job: StudyQuizSessionJ
|
||||
prompt = _render_session_summary_prompt(qs, prompt_attempts, ctx_docs)
|
||||
ai_client = AIClient()
|
||||
try:
|
||||
async with get_mlx_gate():
|
||||
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1
|
||||
async with asyncio.timeout(LLM_TIMEOUT_S):
|
||||
raw_text = await ai_client.call_primary(prompt)
|
||||
primary_name = (
|
||||
|
||||
Reference in New Issue
Block a user