diff --git a/app/api/search.py b/app/api/search.py index 4d0a37a..101baff 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -457,12 +457,15 @@ async def ask( backend: Annotated[ str | None, Query( - pattern="^(qwen-macbook|gemma-macmini)$", + pattern="^(qwen-macbook|gemma-macmini|mac-mini-default|claude-cloud|auto)$", description=( - "PR-MacBook-RAG-Backend-1: 명시 backend opt-in. " - "미지정 = gemma-macmini (Mac mini, default). " - "'qwen-macbook' = MacBook M5 Max Qwen 3.6 27B. " - "MacBook unavailable 시 503 + error_reason=macbook_unavailable " + "PR-2 of DS AI routing policy (2026-05-23) — 명시 backend opt-in via llm-router. " + "미지정 = mac-mini-default (gemma-macmini alias, default). " + "'mac-mini-default' = router 가 tier_b (Mac mini gemma-4-26b). " + "'qwen-macbook' = router 가 named upstream (M5 Max Qwen 3.6 27B). " + "'claude-cloud' = router 가 503 provider_not_configured (활성화 별 PR). " + "'auto' = router 의 rule + LLM triage. " + "backend unavailable 시 503 + error_reason=macbook_unavailable / router_* " "(자동 fallback 없음 — 다시 호출하거나 backend 인자 제거 후 재시도)." ), ), @@ -979,11 +982,14 @@ async def ask_react( - debug_trace: list[dict] | null (debug=true 시 round 별 trace) """ # 지연 import — 순환 의존성 회피 (react_loop 가 api.search.SearchResult 사용 안 함) - from services.llm.backends import BackendUnavailable, QwenMacBookBackend, get_backend + from services.llm.backends import BackendUnavailable, get_backend from services.search.react_loop import agentic_ask_loop backend_inst = get_backend("qwen-macbook") - assert isinstance(backend_inst, QwenMacBookBackend) # mypy / runtime guard + # PR-2 of DS AI routing policy: backend_inst may be RouterBackend (default) + # or QwenMacBookBackend (DS_BACKENDS_VIA_ROUTER=false rollback). Both + # implement generate_with_tools so the ReAct loop is identical. 
class SearchAskBackendConfig(BaseModel):
    """Settings for the /api/search/ask backend dispatcher.

    PR-2 of the DS AI routing policy ([[document-server-ai-routing-policy]],
    2026-05-23): the dispatcher goes through llm-router (:8890) only.

    - backend unset / "gemma-macmini" / "mac-mini-default" -> router tier_b
    - backend "qwen-macbook" -> router's named upstream (M5 Max)
    - backend "claude-cloud" -> router answers 503 explicitly (scaffold)
    - backend "auto" -> router's rule + LLM triage

    Unavailable -> BackendUnavailable -> explicit 503 (no silent fallback).
    Rollback: DS_BACKENDS_VIA_ROUTER=false selects the legacy direct-call path;
    the legacy macmini_url / macbook_url / macbook_model fields are only used
    on that path.
    """

    # New in PR-2: llm-router URL. When empty, env LLM_ROUTER_URL or a
    # hard-coded default is used instead.
    router_url: str = ""
    # Legacy fields (used only when DS_BACKENDS_VIA_ROUTER=false).
    macmini_url: str = "http://100.76.254.116:8801"
    macbook_url: str = "http://100.118.112.84:8810"
    macbook_model: str = "mlx-community/Qwen3.6-27B-8bit"
    # Timeouts in seconds; the dispatcher passes timeout_connect_s to the
    # backends it builds.
    timeout_connect_s: int = 5
    timeout_read_s: int = 60
+"""PR-2 of DS AI routing policy ([[document-server-ai-routing-policy]], 2026-05-23): +/api/search/ask 의 명시 backend dispatcher. 모든 backend = llm-router :8890 경유. -## 정책 (정정 4) +## 정책 (PR-2 of routing policy, MVP 옵션 C — ask path 만 swap) -- 기본 (`backend` 미지정) = Gemma Mac mini. 기존 코드 경로 100% 보존. -- 명시 opt-in `backend="qwen-macbook"` 만 MacBook M5 Max mlx-vlm.server 호출. -- MacBook unavailable 시 `BackendUnavailable` 예외 → /ask wrapper 가 503 + - `error_reason="macbook_unavailable"` 응답. **Gemma 자동 fallback 금지**. +- 기본 (`backend` 미지정) / `gemma-macmini` / `mac-mini-default` + → RouterBackend(alias="mac-mini-default", requires_gate=True) + → router 가 tier_b (Mac mini :8801 gemma-4-26b) 호출. llm_gate 영구 룰 보존. +- `qwen-macbook` + → RouterBackend(alias="qwen-macbook", requires_gate=False) + → router 가 named upstream (M5 Max :8810 Qwen3.6-27B) 호출. +- `claude-cloud` + → RouterBackend(alias="claude-cloud", requires_gate=False) + → router 가 503 provider_not_configured pass-through. activation = 별 PR. +- `auto` + → RouterBackend(alias=None, requires_gate=True) + → router 가 rule + LLM triage 로 tier 결정. 안전상 Mac mini gate 보호 보수적. +- 그 외 → ValueError (호출자가 400/422 으로 매핑) ## 영구 룰 -- Qwen backend 는 **Mac mini llm_gate 점유 금지**. 별 endpoint, 별 concurrency. - → MacBook 전용 `asyncio.Semaphore(1)` (single-inference 가정) 분리. -- Gemma backend 는 기존 path 그대로 (acquire_mlx_gate(FOREGROUND) + ai.primary). - llm_gate 영구 룰 ([[feedback_docstring_invariant_swap_audit]] 케이스) 보존. +- Mac mini 26B 단일 inference (llm_gate, [[feedback_docstring_invariant_swap_audit]]) + 보존 = requires_gate=True 분기에서 `acquire_mlx_gate(Priority.FOREGROUND)` 유지. + router 경유로도 client-side mutex 효과는 동일. +- BackendUnavailable 매핑 정책 ([[feedback_no_silent_fallback_explicit_opt_in]]) 보존. + silent fallback 0 = router 가 503/502 반환하면 그대로 BackendUnavailable. + +## Rollback + +`DS_BACKENDS_VIA_ROUTER=false` env 로 legacy path (GemmaMacMiniBackend + +QwenMacBookBackend 직접 호출) 즉시 복귀. legacy class 1주 보존 후 별 cleanup PR. 
""" from __future__ import annotations import asyncio +import os from abc import ABC, abstractmethod from typing import TYPE_CHECKING @@ -34,9 +51,15 @@ if TYPE_CHECKING: logger = setup_logger("llm_backend") -# 명시 backend 식별자. None / "gemma-macmini" 는 default Gemma path. +# 명시 backend 식별자. QWEN_MACBOOK = "qwen-macbook" GEMMA_MACMINI = "gemma-macmini" +MAC_MINI_DEFAULT = "mac-mini-default" +CLAUDE_CLOUD = "claude-cloud" +AUTO = "auto" + +# Allowed user-facing alias keys (Query pattern 과 동기 — app/api/search.py:457). +_ALLOWED_ALIASES = {GEMMA_MACMINI, QWEN_MACBOOK, MAC_MINI_DEFAULT, CLAUDE_CLOUD, AUTO} class BackendUnavailable(Exception): @@ -59,9 +82,186 @@ class BackendBase(ABC): 가 status="llm_error" 로 매핑 (기존 동작). BackendUnavailable 만 503 으로 매핑. """ + async def generate_with_tools( + self, + messages: list[dict], + tools: list[dict], + *, + tool_choice: str = "auto", + timeout_read_s: int, + ) -> dict: + """ReAct loop 용 OpenAI 호환 chat completion with tool calling. + + Default = NotImplementedError. RouterBackend 와 QwenMacBookBackend (legacy) + 만 override. ReAct endpoint 가 미지원 backend 호출하면 명확한 에러. + """ + raise NotImplementedError( + f"{type(self).__name__} does not implement generate_with_tools" + ) + + +# ────────────────────────────────────────────────────────────────────────── +# RouterBackend (PR-2 신규, 기본 path) +# ────────────────────────────────────────────────────────────────────────── + + +class RouterBackend(BackendBase): + """모든 ask path 가 llm-router :8890 경유. alias 별 gate 적용. + + response shape = router 가 upstream OpenAI 호환 응답을 그대로 forward. + qwen-macbook tool calling response = mlx-vlm OpenAI 표준 호환 + (tests/fixtures/qwen_tool_call_response.json, [[reference_mlx_vlm_tool_calling]]). 
+ """ + + def __init__( + self, + *, + router_url: str, + alias: str | None, + requires_gate: bool, + timeout_connect_s: int, + ): + self.name = alias or AUTO + self.router_url = router_url.rstrip("/") + self.alias = alias # None means "auto" (router rule + triage) + self.requires_gate = requires_gate + self.timeout_connect_s = timeout_connect_s + + def _build_payload( + self, + messages_or_prompt, + *, + tools: list[dict] | None = None, + tool_choice: str | None = None, + ) -> dict: + if isinstance(messages_or_prompt, str): + payload: dict = { + "messages": [{"role": "user", "content": messages_or_prompt}], + "max_tokens": 4096, + } + else: + payload = { + "messages": messages_or_prompt, + "max_tokens": 4096, + } + if self.alias: + payload["model"] = self.alias + if tools: + payload["tools"] = tools + if tool_choice in ("auto", "none"): + payload["tool_choice"] = tool_choice + return payload + + async def _post(self, payload: dict, *, timeout_read_s: int) -> dict: + timeout = httpx.Timeout( + connect=float(self.timeout_connect_s), + read=float(timeout_read_s), + write=10.0, + pool=5.0, + ) + url = f"{self.router_url}/v1/chat/completions" + try: + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post(url, json=payload) + # router 가 503 (provider_not_configured / 기타 router-side 503) → BackendUnavailable + if resp.status_code == 503: + try: + body = resp.json() + err = body.get("error", {}) if isinstance(body, dict) else {} + reason = ( + err.get("type") + or err.get("error_reason") + or "router_503" + ) + except Exception: + reason = "router_503" + raise BackendUnavailable(self.name, reason) + # router 가 400 unknown_alias → 코드 bug. 
일반 예외 (호출자가 5xx 로 변환) + if resp.status_code == 400: + try: + body = resp.json() + except Exception: + body = {} + raise ValueError( + f"router rejected alias={self.alias!r} body={body!r}" + ) + # router 가 502 (upstream unavailable, M5 cold 등) → BackendUnavailable + if resp.status_code == 502: + try: + body = resp.json() + except Exception: + body = {} + raise BackendUnavailable( + self.name, + f"upstream_502_{body.get('error', 'unknown')[:32]}", + ) + resp.raise_for_status() + return resp.json() + except ( + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadTimeout, + httpx.PoolTimeout, + httpx.WriteTimeout, + httpx.RemoteProtocolError, + ) as exc: + logger.warning( + "router_backend unavailable alias=%s url=%s exc=%s", + self.alias, url, type(exc).__name__, + ) + raise BackendUnavailable( + self.name, f"router_{type(exc).__name__}" + ) from exc + except httpx.HTTPStatusError as exc: + if 500 <= exc.response.status_code < 600: + logger.warning( + "router_backend 5xx alias=%s status=%d", + self.alias, exc.response.status_code, + ) + raise BackendUnavailable( + self.name, f"router_http_{exc.response.status_code}" + ) from exc + raise + + async def generate(self, prompt: str, *, timeout_read_s: int) -> str: + payload = self._build_payload(prompt) + if self.requires_gate: + async with acquire_mlx_gate(Priority.FOREGROUND): + async with asyncio.timeout(timeout_read_s): + data = await self._post(payload, timeout_read_s=timeout_read_s) + else: + data = await self._post(payload, timeout_read_s=timeout_read_s) + return data["choices"][0]["message"]["content"] + + async def generate_with_tools( + self, + messages: list[dict], + tools: list[dict], + *, + tool_choice: str = "auto", + timeout_read_s: int, + ) -> dict: + payload = self._build_payload( + messages, tools=tools, tool_choice=tool_choice, + ) + if self.requires_gate: + async with acquire_mlx_gate(Priority.FOREGROUND): + async with asyncio.timeout(timeout_read_s): + data = await self._post(payload, 
timeout_read_s=timeout_read_s) + else: + data = await self._post(payload, timeout_read_s=timeout_read_s) + return data["choices"][0]["message"] + + +# ────────────────────────────────────────────────────────────────────────── +# Legacy backends (rollback safety, DS_BACKENDS_VIA_ROUTER=false 시만 사용) +# 1주 후 별 cleanup PR 로 폐기 ([[feedback_closure_gate_vs_observation]] — +# dual-path = rollback safety only, 시간 관찰 게이트 0). +# ────────────────────────────────────────────────────────────────────────── + class GemmaMacMiniBackend(BackendBase): - """기존 Mac mini ai.primary 경로 그대로. 코드 변경 0 path.""" + """[LEGACY] 기존 Mac mini ai.primary 직접 호출. DS_BACKENDS_VIA_ROUTER=false 시만.""" name = GEMMA_MACMINI @@ -82,13 +282,7 @@ class GemmaMacMiniBackend(BackendBase): class QwenMacBookBackend(BackendBase): - """MacBook M5 Max mlx-vlm.server (Tailscale) 직접 호출. - - - Mac mini llm_gate 점유 X (별 endpoint 라 의미 없음 + 큐 분할 금지 영구 룰의 - 대상이 아님) - - MacBook 자체 single-inference 가정 → 별 semaphore(1) - - 연결 거부 / DNS / timeout / 5xx → BackendUnavailable - """ + """[LEGACY] MacBook M5 Max mlx-vlm.server (Tailscale) 직접 호출. 
DS_BACKENDS_VIA_ROUTER=false 시만.""" name = QWEN_MACBOOK _gate: asyncio.Semaphore | None = None @@ -134,15 +328,15 @@ class QwenMacBookBackend(BackendBase): httpx.RemoteProtocolError, ) as exc: logger.warning( - "qwen-macbook unavailable url=%s exc=%s", + "qwen-macbook[legacy] unavailable url=%s exc=%s", url, type(exc).__name__, ) raise BackendUnavailable(self.name, type(exc).__name__) from exc except httpx.HTTPStatusError as exc: - # 5xx 만 unavailable, 4xx 는 호출자 잘못 → 일반 예외 전파 if 500 <= exc.response.status_code < 600: logger.warning( - "qwen-macbook 5xx status=%d", exc.response.status_code, + "qwen-macbook[legacy] 5xx status=%d", + exc.response.status_code, ) raise BackendUnavailable( self.name, f"http_{exc.response.status_code}" @@ -157,19 +351,6 @@ class QwenMacBookBackend(BackendBase): tool_choice: str = "auto", timeout_read_s: int, ) -> dict: - """OpenAI 호환 chat completion with tool calling (ReAct loop 용). - - Returns: `choices[0].message` dict 그대로 — `content` (Optional[str]) + - `tool_calls` (Optional[list]) 둘 다 포함. - - Response shape = G0-1 fixture `tests/fixtures/qwen_tool_call_response.json` - 기준 (mlx-vlm OpenAI 표준 호환). tool_calls[].function.arguments 는 - **JSON string** 으로 옴 — 호출자가 json.loads 필요. 
# ──────────────────────────────────────────────────────────────────────────
# Dispatcher (PR-2: dual-path selected by the DS_BACKENDS_VIA_ROUTER env flag)
# ──────────────────────────────────────────────────────────────────────────

# Spellings of DS_BACKENDS_VIA_ROUTER that select the router path.
_TRUTHY_FLAG_VALUES = frozenset({"1", "true", "yes", "on"})

# Hard-coded MVP router address, used when neither settings nor env give one.
_DEFAULT_ROUTER_URL = "http://100.76.254.116:8890"


def _via_router() -> bool:
    """Return True when backends must go through llm-router (the default).

    Reads ``DS_BACKENDS_VIA_ROUTER`` on every call. Unset or blank means
    "use the default" (router). ``1`` / ``true`` / ``yes`` / ``on`` (any case,
    surrounding whitespace ignored) select the router; every other value
    selects the legacy rollback path.
    """
    raw = os.getenv("DS_BACKENDS_VIA_ROUTER", "").strip().lower()
    if not raw:
        return True
    return raw in _TRUTHY_FLAG_VALUES


# Instance caches, one per path; get_backend() returns the same object for the
# same backend name until reset_backends_for_test() is called.
_ROUTER_BACKENDS: dict[str, RouterBackend] = {}
_LEGACY_BACKENDS: dict[str, BackendBase] = {}


def _router_url() -> str:
    """Resolve the router URL: settings first, then env, then the default.

    A blank value at any level counts as "not configured" and falls through
    to the next source, so an empty ``LLM_ROUTER_URL`` cannot produce an
    unusable empty base URL.
    """
    cfg = settings.search.ask.backend
    cfg_url = (getattr(cfg, "router_url", "") or "").strip()
    if cfg_url:
        return cfg_url
    return os.getenv("LLM_ROUTER_URL", "").strip() or _DEFAULT_ROUTER_URL


def _build_router_backend(alias: str | None, requires_gate: bool) -> RouterBackend:
    """Create a RouterBackend for ``alias`` from the current settings."""
    cfg = settings.search.ask.backend
    return RouterBackend(
        router_url=_router_url(),
        alias=alias,
        requires_gate=requires_gate,
        timeout_connect_s=cfg.timeout_connect_s,
    )


def _build_qwen_backend() -> QwenMacBookBackend:
    """Create the legacy direct-call MacBook backend from the current settings."""
    cfg = settings.search.ask.backend
    return QwenMacBookBackend(
        base_url=cfg.macbook_url,
        model=cfg.macbook_model,
        timeout_connect_s=cfg.timeout_connect_s,
    )


def _get_router_backend(name: str | None) -> RouterBackend:
    """Return the cached RouterBackend for ``name`` (router path, PR-2 default).

    Raises:
        ValueError: ``name`` is not a known backend.
    """
    key = (name or "").strip().lower()
    # user-facing name -> (cache key, alias sent to the router, requires_gate)
    specs: dict[str, tuple[str, str | None, bool]] = {
        "": (MAC_MINI_DEFAULT, MAC_MINI_DEFAULT, True),
        GEMMA_MACMINI: (MAC_MINI_DEFAULT, MAC_MINI_DEFAULT, True),
        MAC_MINI_DEFAULT: (MAC_MINI_DEFAULT, MAC_MINI_DEFAULT, True),
        QWEN_MACBOOK: (QWEN_MACBOOK, QWEN_MACBOOK, False),
        CLAUDE_CLOUD: (CLAUDE_CLOUD, CLAUDE_CLOUD, False),
        # auto = router rule + triage; likely to land on tier_b, so keep the
        # gate as the conservative choice.
        AUTO: (AUTO, None, True),
    }
    spec = specs.get(key)
    if spec is None:
        raise ValueError(f"unknown backend: {name!r}")
    cache_key, alias, requires_gate = spec
    backend = _ROUTER_BACKENDS.get(cache_key)
    if backend is None:
        backend = _build_router_backend(alias=alias, requires_gate=requires_gate)
        _ROUTER_BACKENDS[cache_key] = backend
    return backend


def _get_legacy_backend(name: str | None) -> BackendBase:
    """Return the cached legacy backend (only when DS_BACKENDS_VIA_ROUTER=false).

    ``mac-mini-default`` and ``auto`` have no legacy counterpart and resolve
    to the default Gemma backend.

    Raises:
        ValueError: ``name`` is unknown, or is ``claude-cloud`` (router only).
    """
    key = (name or "").strip().lower() or GEMMA_MACMINI
    if key == CLAUDE_CLOUD:
        raise ValueError(
            f"backend {CLAUDE_CLOUD!r} requires DS_BACKENDS_VIA_ROUTER=true"
        )
    if key in (MAC_MINI_DEFAULT, AUTO):
        key = GEMMA_MACMINI
    if key not in (GEMMA_MACMINI, QWEN_MACBOOK):
        raise ValueError(f"unknown backend: {name!r}")
    backend = _LEGACY_BACKENDS.get(key)
    if backend is None:
        if key == GEMMA_MACMINI:
            backend = GemmaMacMiniBackend()
        else:
            backend = _build_qwen_backend()
        _LEGACY_BACKENDS[key] = backend
    return backend


def get_backend(name: str | None) -> BackendBase:
    """Return the (cached) backend instance for ``name``.

    ``DS_BACKENDS_VIA_ROUTER`` enabled (default, PR-2) -> RouterBackend;
    disabled -> legacy GemmaMacMiniBackend / QwenMacBookBackend.

    Raises:
        ValueError: ``name`` is not a known backend for the active path.
    """
    if _via_router():
        return _get_router_backend(name)
    return _get_legacy_backend(name)


def reset_backends_for_test() -> None:
    """Drop all cached backend instances so a test starts from a clean state.

    Not for use in production code.
    """
    _ROUTER_BACKENDS.clear()
    _LEGACY_BACKENDS.clear()
    # Also forget the legacy MacBook backend's class-level gate.
    QwenMacBookBackend._gate = None