"""PR-MacBook-RAG-Backend-1: explicit backend dispatcher for /api/search/ask.

## Policy (amendment 4)
- Default (`backend` not specified) = Gemma on the Mac mini. The existing code
  path is preserved 100%.
- Only the explicit opt-in `backend="qwen-macbook"` calls the MacBook M5 Max
  mlx-vlm.server.
- When the MacBook is unavailable a `BackendUnavailable` exception is raised →
  the /ask wrapper responds with 503 + `error_reason="macbook_unavailable"`.
  **No automatic fallback to Gemma.**

## Permanent rules
- The Qwen backend must **never occupy the Mac mini llm_gate**. Separate
  endpoint, separate concurrency.
  → A MacBook-only `asyncio.Semaphore(1)` (single-inference assumption) is
  kept separate.
- The Gemma backend keeps the existing path as is
  (acquire_mlx_gate(FOREGROUND) + ai.primary). The llm_gate permanent rule
  (the [[feedback_docstring_invariant_swap_audit]] case) is preserved.
"""

from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, ClassVar

import httpx

from core.config import settings
from core.utils import setup_logger
from services.search.llm_gate import Priority, acquire_mlx_gate

if TYPE_CHECKING:
    from ai.client import AIClient

logger = setup_logger("llm_backend")

# Explicit backend identifiers. None / "gemma-macmini" selects the default
# Gemma path.
QWEN_MACBOOK = "qwen-macbook"
GEMMA_MACMINI = "gemma-macmini"

# Request parameters shared by every MacBook chat-completion call.
_MAX_TOKENS = 4096
_WRITE_TIMEOUT_S = 10.0
_POOL_TIMEOUT_S = 5.0

# httpx transport failures that are reported as "MacBook unavailable".
# Anything not listed here propagates unchanged as a general exception.
_UNAVAILABLE_ERRORS = (
    httpx.ConnectError,
    httpx.ConnectTimeout,
    httpx.ReadTimeout,
    httpx.PoolTimeout,
    httpx.WriteTimeout,
    httpx.RemoteProtocolError,
)


class BackendUnavailable(Exception):
    """The explicitly requested backend is temporarily unavailable.

    The /ask wrapper maps this exception to a 503 response.

    Attributes:
        backend_name: Identifier of the backend that failed.
        reason: Short machine-readable cause (exception class name or
            ``http_<status>``).
    """

    def __init__(self, backend_name: str, reason: str):
        self.backend_name = backend_name
        self.reason = reason
        super().__init__(f"{backend_name} unavailable: {reason}")


class BackendBase(ABC):
    """Common interface implemented by every /ask generation backend."""

    name: str

    @abstractmethod
    async def generate(self, prompt: str, *, timeout_read_s: int) -> str:
        """Turn a prompt into body text (OpenAI-compatible completion content).

        On failure raises `BackendUnavailable` or a general exception. General
        exceptions are mapped by synthesis_service to status="llm_error"
        (existing behaviour). Only `BackendUnavailable` is mapped to 503.
        """


class GemmaMacMiniBackend(BackendBase):
    """The existing Mac mini ai.primary path, unchanged (zero-code-change path)."""

    name = GEMMA_MACMINI

    async def generate(self, prompt: str, *, timeout_read_s: int) -> str:
        """Run the prompt through ai.primary while holding the Mac mini gate.

        The whole call is bounded by ``timeout_read_s`` seconds via
        ``asyncio.timeout``; the client is always closed afterwards.
        """
        # Deferred import — ai.client depends on settings.ai.
        from ai.client import AIClient

        client = AIClient()
        try:
            async with acquire_mlx_gate(Priority.FOREGROUND):
                async with asyncio.timeout(timeout_read_s):
                    return await client._call_chat(client.ai.primary, prompt)
        finally:
            # Deliberate best-effort cleanup: a failing close() must not mask
            # the result or the exception of the generation itself.
            try:
                await client.close()
            except Exception:
                pass


class QwenMacBookBackend(BackendBase):
    """Call the MacBook M5 Max mlx-vlm.server (over Tailscale) directly.

    - Does not occupy the Mac mini llm_gate (pointless for a separate endpoint,
      and not subject to the permanent "no queue splitting" rule).
    - Assumes the MacBook itself runs a single inference at a time → its own
      semaphore(1).
    - Connection refused / DNS / timeout / 5xx → BackendUnavailable.
    """

    name = QWEN_MACBOOK

    # Shared by all instances; created lazily by `_get_gate`.
    _gate: ClassVar[asyncio.Semaphore | None] = None

    def __init__(self, base_url: str, model: str, timeout_connect_s: int):
        # A trailing slash is stripped so URL joining never yields "//v1/...".
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout_connect_s = timeout_connect_s

    @classmethod
    def _get_gate(cls) -> asyncio.Semaphore:
        """Return the class-wide single-slot semaphore, creating it on first use."""
        if cls._gate is None:
            cls._gate = asyncio.Semaphore(1)
        return cls._gate

    async def _post_chat_completion(
        self,
        payload: dict,
        *,
        timeout_read_s: int,
        log_label: str,
    ) -> dict:
        """POST ``payload`` to /v1/chat/completions and return ``choices[0].message``.

        The request runs while holding the MacBook gate, so at most one request
        from this process is in flight at a time.

        Args:
            payload: JSON body of the chat-completion request.
            timeout_read_s: Read timeout in seconds for the HTTP request.
            log_label: Prefix used in warning logs to tell call sites apart.

        Raises:
            BackendUnavailable: On the transport errors in
                ``_UNAVAILABLE_ERRORS`` or on a 5xx response.
            httpx.HTTPStatusError: On a non-5xx error status (e.g. 4xx).
        """
        timeout = httpx.Timeout(
            connect=float(self.timeout_connect_s),
            read=float(timeout_read_s),
            write=_WRITE_TIMEOUT_S,
            pool=_POOL_TIMEOUT_S,
        )
        url = f"{self.base_url}/v1/chat/completions"

        async with self._get_gate():
            try:
                async with httpx.AsyncClient(timeout=timeout) as client:
                    resp = await client.post(url, json=payload)
                    resp.raise_for_status()
                    data = resp.json()
                    return data["choices"][0]["message"]
            except _UNAVAILABLE_ERRORS as exc:
                logger.warning(
                    "%s unavailable url=%s exc=%s",
                    log_label,
                    url,
                    type(exc).__name__,
                )
                raise BackendUnavailable(self.name, type(exc).__name__) from exc
            except httpx.HTTPStatusError as exc:
                status = exc.response.status_code
                # Only 5xx means "unavailable"; 4xx is the caller's fault and
                # propagates as a general exception.
                if 500 <= status < 600:
                    logger.warning("%s 5xx status=%d", log_label, status)
                    raise BackendUnavailable(self.name, f"http_{status}") from exc
                raise

    async def generate(self, prompt: str, *, timeout_read_s: int) -> str:
        """Send ``prompt`` as a single user message and return the reply content.

        Raises:
            BackendUnavailable: When the MacBook cannot be reached or answers
                with a 5xx status.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": _MAX_TOKENS,
        }
        message = await self._post_chat_completion(
            payload,
            timeout_read_s=timeout_read_s,
            log_label="qwen-macbook",
        )
        return message["content"]

    async def generate_with_tools(
        self,
        messages: list[dict],
        tools: list[dict],
        *,
        tool_choice: str = "auto",
        timeout_read_s: int,
    ) -> dict:
        """OpenAI-compatible chat completion with tool calling (for the ReAct loop).

        Returns:
            The `choices[0].message` dict as is — contains both `content`
            (Optional[str]) and `tool_calls` (Optional[list]).

        Response shape follows the G0-1 fixture
        `tests/fixtures/qwen_tool_call_response.json` (mlx-vlm, OpenAI standard
        compatible). tool_calls[].function.arguments arrives as a **JSON
        string** — the caller must json.loads it.

        - `tool_choice="auto"`: the model decides whether to call a tool
        - `tool_choice="none"`: tool calls forbidden, only content is returned
          (final round)
        - `tools=[]` + `tool_choice="none"`: force a final answer without any
          tool definitions

        Raises:
            BackendUnavailable: When the MacBook cannot be reached or answers
                with a 5xx status.
        """
        payload: dict = {
            "model": self.model,
            "messages": messages,
            "max_tokens": _MAX_TOKENS,
        }
        if tools:
            payload["tools"] = tools
        # Sent independently of `tools`; any value other than "auto"/"none"
        # is left out of the request.
        if tool_choice in ("auto", "none"):
            payload["tool_choice"] = tool_choice

        return await self._post_chat_completion(
            payload,
            timeout_read_s=timeout_read_s,
            log_label="qwen-macbook(tools)",
        )


# ── dispatcher ─────────────────────────────────────────────────────────────

# Backend instances keyed by normalized backend name; filled by `get_backend`.
_BACKENDS: dict[str, BackendBase] = {}


def _build_qwen_backend() -> QwenMacBookBackend:
    """Construct the MacBook backend from `settings.search.ask.backend`."""
    b = settings.search.ask.backend
    return QwenMacBookBackend(
        base_url=b.macbook_url,
        model=b.macbook_model,
        timeout_connect_s=b.timeout_connect_s,
    )


def get_backend(name: str | None) -> BackendBase:
    """Return the backend instance for ``name`` (cached).

    The name is stripped and lower-cased before lookup.

    - None / "" / "gemma-macmini" → Gemma Mac mini (default)
    - "qwen-macbook" → MacBook Qwen
    - anything else → ValueError (the caller maps it to 400)
    """
    key = (name or "").strip().lower() or GEMMA_MACMINI
    if key not in (GEMMA_MACMINI, QWEN_MACBOOK):
        raise ValueError(f"unknown backend: {name!r}")

    if key not in _BACKENDS:
        if key == GEMMA_MACMINI:
            _BACKENDS[key] = GemmaMacMiniBackend()
        else:
            _BACKENDS[key] = _build_qwen_backend()
    return _BACKENDS[key]


def reset_backends_for_test() -> None:
    """Drop cached backends and the MacBook gate.

    Called by test fixtures to recreate backend instances after changing
    settings. Not for use in production code.
    """
    _BACKENDS.clear()
    QwenMacBookBackend._gate = None