Merge pull request 'Feat/ds ai routing policy' (#23) from feat/ds-ai-routing-policy into main

Reviewed-on: #23
2026-05-24 12:20:49 +09:00
parent 711d4952a2 00edd6bff8
commit 1ae7802485
5 changed files with 462 additions and 96 deletions
@@ -532,12 +532,15 @@ async def ask(
    backend: Annotated[
        str | None,
        Query(
-            pattern="^(qwen-macbook|gemma-macmini)$",
+            pattern="^(qwen-macbook|gemma-macmini|mac-mini-default|claude-cloud|auto)$",
            description=(
-                "PR-MacBook-RAG-Backend-1: 명시 backend opt-in. "
-                "미지정 = gemma-macmini (Mac mini, default). "
-                "'qwen-macbook' = MacBook M5 Max Qwen 3.6 27B. "
-                "MacBook unavailable 시 503 + error_reason=macbook_unavailable "
+                "PR-2 of DS AI routing policy (2026-05-23) — 명시 backend opt-in via llm-router. "
+                "미지정 = mac-mini-default (gemma-macmini alias, default). "
+                "'mac-mini-default' = router 가 tier_b (Mac mini gemma-4-26b). "
+                "'qwen-macbook' = router 가 named upstream (M5 Max Qwen 3.6 27B). "
+                "'claude-cloud' = router 가 503 provider_not_configured (활성화 별 PR). "
+                "'auto' = router 의 rule + LLM triage. "
+                "backend unavailable 시 503 + error_reason=macbook_unavailable / router_* "
                "(자동 fallback 없음 — 다시 호출하거나 backend 인자 제거 후 재시도)."
            ),
        ),
@@ -1054,11 +1057,14 @@ async def ask_react(
      - debug_trace: list[dict] | null (debug=true 시 round 별 trace)
    """
    # 지연 import — 순환 의존성 회피 (react_loop 가 api.search.SearchResult 사용 안 함)
-    from services.llm.backends import BackendUnavailable, QwenMacBookBackend, get_backend
+    from services.llm.backends import BackendUnavailable, get_backend
    from services.search.react_loop import agentic_ask_loop

    backend_inst = get_backend("qwen-macbook")
-    assert isinstance(backend_inst, QwenMacBookBackend)  # mypy / runtime guard
+    # PR-2 of DS AI routing policy: backend_inst may be RouterBackend (default)
+    # or QwenMacBookBackend (DS_BACKENDS_VIA_ROUTER=false rollback). Both
+    # implement generate_with_tools so the ReAct loop is identical.
+    assert hasattr(backend_inst, "generate_with_tools")

    try:
        result = await agentic_ask_loop(
@@ -40,18 +40,27 @@ class DeepSummaryBacklogConfig(BaseModel):


 class SearchAskBackendConfig(BaseModel):
-    """PR-MacBook-RAG-Backend-1: /api/search/ask backend dispatcher.
+    """PR-2 of DS AI routing policy ([[document-server-ai-routing-policy]], 2026-05-23):
+    /api/search/ask backend dispatcher 가 llm-router :8890 단일 경유.

-    backend 미지정 = Gemma Mac mini (settings.ai.primary 경로 그대로).
-    backend="qwen-macbook" 명시 opt-in = MacBook M5 Max mlx-vlm.server.
-    MacBook unavailable 시 503 + error_reason=macbook_unavailable (자동 fallback 없음).
+    - backend 미지정 / "gemma-macmini" / "mac-mini-default" → router 가 tier_b
+    - backend "qwen-macbook" → router 가 named upstream (M5 Max)
+    - backend "claude-cloud" → router 가 503 명시 (scaffold)
+    - backend "auto" → router 의 rule + LLM triage
+
+    Unavailable → BackendUnavailable → 503 명시 (silent fallback 0).
+    Rollback: DS_BACKENDS_VIA_ROUTER=false 로 legacy 직접 호출 path.
+    legacy macmini_url / macbook_url / macbook_model 은 fallback 시만 사용.
    """

+    # PR-2 신규: llm-router URL. 비면 env LLM_ROUTER_URL 또는 hardcoded default.
+    router_url: str = ""
+    # Legacy fields (DS_BACKENDS_VIA_ROUTER=false 시만 사용)
    macmini_url: str = "http://100.76.254.116:8801"
    macbook_url: str = "http://100.118.112.84:8810"
    macbook_model: str = "mlx-community/Qwen3.6-27B-8bit"
-    timeout_connect_s: int = 1
-    timeout_read_s: int = 30
+    timeout_connect_s: int = 5
+    timeout_read_s: int = 60


 class SearchAskReactConfig(BaseModel):
@@ -1,23 +1,40 @@
-"""PR-MacBook-RAG-Backend-1: /api/search/ask 의 명시 backend dispatcher.
+"""PR-2 of DS AI routing policy ([[document-server-ai-routing-policy]], 2026-05-23):
+/api/search/ask 의 명시 backend dispatcher. 모든 backend = llm-router :8890 경유.

-## 정책 (정정 4)
+## 정책 (PR-2 of routing policy, MVP 옵션 C — ask path 만 swap)

- 기본 (`backend` 미지정) = Gemma Mac mini. 기존 코드 경로 100% 보존.
- 명시 opt-in `backend="qwen-macbook"` 만 MacBook M5 Max mlx-vlm.server 호출.
- MacBook unavailable 시 `BackendUnavailable` 예외 → /ask wrapper 가 503 +
-  `error_reason="macbook_unavailable"` 응답. **Gemma 자동 fallback 금지**.
+- 기본 (`backend` 미지정) / `gemma-macmini` / `mac-mini-default`
+    → RouterBackend(alias="mac-mini-default", requires_gate=True)
+    → router 가 tier_b (Mac mini :8801 gemma-4-26b) 호출. llm_gate 영구 룰 보존.
+- `qwen-macbook`
+    → RouterBackend(alias="qwen-macbook", requires_gate=False)
+    → router 가 named upstream (M5 Max :8810 Qwen3.6-27B) 호출.
+- `claude-cloud`
+    → RouterBackend(alias="claude-cloud", requires_gate=False)
+    → router 가 503 provider_not_configured pass-through. activation = 별 PR.
+- `auto`
+    → RouterBackend(alias=None, requires_gate=True)
+    → router 가 rule + LLM triage 로 tier 결정. 안전상 Mac mini gate 보호 보수적.
+- 그 외 → ValueError (호출자가 400/422 으로 매핑)

 ## 영구 룰

- Qwen backend 는 **Mac mini llm_gate 점유 금지**. 별 endpoint, 별 concurrency.
-  → MacBook 전용 `asyncio.Semaphore(1)` (single-inference 가정) 분리.
- Gemma backend 는 기존 path 그대로 (acquire_mlx_gate(FOREGROUND) + ai.primary).
-  llm_gate 영구 룰 ([[feedback_docstring_invariant_swap_audit]] 케이스) 보존.
+- Mac mini 26B 단일 inference (llm_gate, [[feedback_docstring_invariant_swap_audit]])
+  보존 = requires_gate=True 분기에서 `acquire_mlx_gate(Priority.FOREGROUND)` 유지.
+  router 경유로도 client-side mutex 효과는 동일.
+- BackendUnavailable 매핑 정책 ([[feedback_no_silent_fallback_explicit_opt_in]]) 보존.
+  silent fallback 0 = router 가 503/502 반환하면 그대로 BackendUnavailable.
+
+## Rollback
+
+`DS_BACKENDS_VIA_ROUTER=false` env 로 legacy path (GemmaMacMiniBackend +
+QwenMacBookBackend 직접 호출) 즉시 복귀. legacy class 1주 보존 후 별 cleanup PR.
 """

 from __future__ import annotations

 import asyncio
+import os
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING

@@ -34,9 +51,15 @@ if TYPE_CHECKING:
 logger = setup_logger("llm_backend")


-# 명시 backend 식별자. None / "gemma-macmini" 는 default Gemma path.
+# 명시 backend 식별자.
 QWEN_MACBOOK = "qwen-macbook"
 GEMMA_MACMINI = "gemma-macmini"
+MAC_MINI_DEFAULT = "mac-mini-default"
+CLAUDE_CLOUD = "claude-cloud"
+AUTO = "auto"
+
+# Allowed user-facing alias keys. Keep in sync with the `backend` Query pattern on ask() in app/api/search.py.
+_ALLOWED_ALIASES = {GEMMA_MACMINI, QWEN_MACBOOK, MAC_MINI_DEFAULT, CLAUDE_CLOUD, AUTO}


 class BackendUnavailable(Exception):
@@ -59,9 +82,186 @@ class BackendBase(ABC):
        가 status="llm_error" 로 매핑 (기존 동작). BackendUnavailable 만 503 으로 매핑.
        """

+    async def generate_with_tools(
+        self,
+        messages: list[dict],
+        tools: list[dict],
+        *,
+        tool_choice: str = "auto",
+        timeout_read_s: int,
+    ) -> dict:
+        """OpenAI-compatible chat completion with tool calling, for the ReAct loop.
+
+        This base implementation always raises ``NotImplementedError``. Backends
+        that support tool calling override it, so asking an unsupported backend
+        fails with an explicit error naming the backend class.
+
+        Raises:
+            NotImplementedError: always, unless overridden by a subclass.
+        """
+        backend_type = type(self).__name__
+        message = f"{backend_type} does not implement generate_with_tools"
+        raise NotImplementedError(message)
+
+
+# ──────────────────────────────────────────────────────────────────────────
+# RouterBackend (PR-2 신규, 기본 path)
+# ──────────────────────────────────────────────────────────────────────────
+
+
+class RouterBackend(BackendBase):
+    """Backend that sends every ask request through llm-router.
+
+    The payload's ``model`` field carries the alias (omitted when ``alias`` is
+    None, which means "auto": the router picks via its rules + triage).
+    ``requires_gate`` decides whether the call runs while holding
+    ``acquire_mlx_gate(Priority.FOREGROUND)``.
+
+    The parsed response is indexed as an OpenAI-style chat completion
+    (``choices[0].message``). qwen-macbook tool-calling responses are expected
+    to follow the mlx-vlm OpenAI-compatible shape
+    (tests/fixtures/qwen_tool_call_response.json).
+    """
+
+    def __init__(
+        self,
+        *,
+        router_url: str,
+        alias: str | None,
+        requires_gate: bool,
+        timeout_connect_s: int,
+    ):
+        # Name reported in BackendUnavailable; an alias-less backend reports as "auto".
+        self.name = alias or AUTO
+        # Trailing slash removed so the endpoint path can be appended verbatim.
+        self.router_url = router_url.rstrip("/")
+        self.alias = alias  # None means "auto" (router rule + triage)
+        self.requires_gate = requires_gate
+        self.timeout_connect_s = timeout_connect_s
+
+    def _build_payload(
+        self,
+        messages_or_prompt: str | list[dict],
+        *,
+        tools: list[dict] | None = None,
+        tool_choice: str | None = None,
+    ) -> dict:
+        """Build the chat-completions request body.
+
+        A plain string is wrapped as a single user message; anything else is
+        passed through as the ``messages`` list. ``model`` is set only when an
+        alias exists, ``tools`` only when non-empty, and ``tool_choice`` only for
+        the values "auto" / "none".
+        """
+        if isinstance(messages_or_prompt, str):
+            messages = [{"role": "user", "content": messages_or_prompt}]
+        else:
+            messages = messages_or_prompt
+        payload: dict = {"messages": messages, "max_tokens": 4096}
+        if self.alias:
+            payload["model"] = self.alias
+        if tools:
+            payload["tools"] = tools
+        if tool_choice in ("auto", "none"):
+            payload["tool_choice"] = tool_choice
+        return payload
+
+    @staticmethod
+    def _json_or(resp, default):
+        """Parse the response body as JSON, returning ``default`` if that fails."""
+        try:
+            return resp.json()
+        except Exception:
+            # Best-effort: the body only enriches an error reason, so any parse
+            # failure falls back to the default instead of masking the status.
+            return default
+
+    @staticmethod
+    def _upstream_error_tag(body: object) -> str:
+        """Condense the ``error`` field of a 502 body into a tag of at most 32 chars.
+
+        Accepts a string error, an error object (its ``type`` / ``error_reason``
+        is used), a missing/null error, or a non-dict body. The original code
+        sliced the raw value and raised for everything except a string.
+        """
+        err = body.get("error", "unknown") if isinstance(body, dict) else "unknown"
+        if isinstance(err, dict):
+            err = err.get("type") or err.get("error_reason") or "unknown"
+        if err is None:
+            err = "unknown"
+        return str(err)[:32]
+
+    async def _post(self, payload: dict, *, timeout_read_s: int) -> dict:
+        """POST ``payload`` to the router's ``/v1/chat/completions`` endpoint.
+
+        Returns:
+            The parsed JSON response body.
+
+        Raises:
+            BackendUnavailable: the router answered 503, 502 or another 5xx, or
+                the request failed with a connect/read/write/pool timeout or a
+                connection/protocol error.
+            ValueError: the router answered 400 (it rejected the alias).
+            httpx.HTTPStatusError: any other error status reported by
+                ``raise_for_status``.
+        """
+        timeout = httpx.Timeout(
+            connect=float(self.timeout_connect_s),
+            read=float(timeout_read_s),
+            write=10.0,
+            pool=5.0,
+        )
+        url = f"{self.router_url}/v1/chat/completions"
+        try:
+            async with httpx.AsyncClient(timeout=timeout) as client:
+                resp = await client.post(url, json=payload)
+                status = resp.status_code
+                if status == 503:
+                    # Router-side 503 (provider_not_configured and similar).
+                    body = self._json_or(resp, None)
+                    err = body.get("error") if isinstance(body, dict) else None
+                    reason = None
+                    if isinstance(err, dict):
+                        reason = err.get("type") or err.get("error_reason")
+                    raise BackendUnavailable(self.name, reason or "router_503")
+                if status == 400:
+                    # Router rejected the request (unknown alias): a code bug,
+                    # surfaced as a plain exception rather than "unavailable".
+                    body = self._json_or(resp, {})
+                    raise ValueError(
+                        f"router rejected alias={self.alias!r} body={body!r}"
+                    )
+                if status == 502:
+                    # Upstream behind the router is unavailable.
+                    tag = self._upstream_error_tag(self._json_or(resp, None))
+                    raise BackendUnavailable(self.name, f"upstream_502_{tag}")
+                resp.raise_for_status()
+                return resp.json()
+        except (
+            httpx.ConnectError,
+            httpx.ConnectTimeout,
+            httpx.ReadTimeout,
+            httpx.PoolTimeout,
+            httpx.WriteTimeout,
+            httpx.RemoteProtocolError,
+        ) as exc:
+            logger.warning(
+                "router_backend unavailable alias=%s url=%s exc=%s",
+                self.alias, url, type(exc).__name__,
+            )
+            raise BackendUnavailable(
+                self.name, f"router_{type(exc).__name__}"
+            ) from exc
+        except httpx.HTTPStatusError as exc:
+            # Only 5xx counts as "unavailable"; other statuses propagate unchanged.
+            if 500 <= exc.response.status_code < 600:
+                logger.warning(
+                    "router_backend 5xx alias=%s status=%d",
+                    self.alias, exc.response.status_code,
+                )
+                raise BackendUnavailable(
+                    self.name, f"router_http_{exc.response.status_code}"
+                ) from exc
+            raise
+
+    async def _post_gated(self, payload: dict, *, timeout_read_s: int) -> dict:
+        """Run ``_post``, entering the MLX gate first when ``requires_gate`` is set."""
+        if not self.requires_gate:
+            return await self._post(payload, timeout_read_s=timeout_read_s)
+        async with acquire_mlx_gate(Priority.FOREGROUND):
+            # The deadline is entered inside the gate context, so it covers the
+            # POST only.
+            async with asyncio.timeout(timeout_read_s):
+                return await self._post(payload, timeout_read_s=timeout_read_s)
+
+    async def generate(self, prompt: str, *, timeout_read_s: int) -> str:
+        """Send ``prompt`` as a single user message and return the reply content."""
+        payload = self._build_payload(prompt)
+        data = await self._post_gated(payload, timeout_read_s=timeout_read_s)
+        return data["choices"][0]["message"]["content"]
+
+    async def generate_with_tools(
+        self,
+        messages: list[dict],
+        tools: list[dict],
+        *,
+        tool_choice: str = "auto",
+        timeout_read_s: int,
+    ) -> dict:
+        """Chat completion with tool calling; returns the ``choices[0].message`` dict."""
+        payload = self._build_payload(
+            messages, tools=tools, tool_choice=tool_choice,
+        )
+        data = await self._post_gated(payload, timeout_read_s=timeout_read_s)
+        return data["choices"][0]["message"]
+
+
+# ──────────────────────────────────────────────────────────────────────────
+# Legacy backends (rollback safety, DS_BACKENDS_VIA_ROUTER=false 시만 사용)
+# 1주 후 별 cleanup PR 로 폐기 ([[feedback_closure_gate_vs_observation]] —
+# dual-path = rollback safety only, 시간 관찰 게이트 0).
+# ──────────────────────────────────────────────────────────────────────────
+

 class GemmaMacMiniBackend(BackendBase):
-    """기존 Mac mini ai.primary 경로 그대로. 코드 변경 0 path."""
+    """[LEGACY] 기존 Mac mini ai.primary 직접 호출. DS_BACKENDS_VIA_ROUTER=false 시만."""

    name = GEMMA_MACMINI

@@ -82,13 +282,7 @@ class GemmaMacMiniBackend(BackendBase):


 class QwenMacBookBackend(BackendBase):
-    """MacBook M5 Max mlx-vlm.server (Tailscale) 직접 호출.
-
-    - Mac mini llm_gate 점유 X (별 endpoint 라 의미 없음 + 큐 분할 금지 영구 룰의
-      대상이 아님)
-    - MacBook 자체 single-inference 가정 → 별 semaphore(1)
-    - 연결 거부 / DNS / timeout / 5xx → BackendUnavailable
-    """
+    """[LEGACY] MacBook M5 Max mlx-vlm.server (Tailscale) 직접 호출. DS_BACKENDS_VIA_ROUTER=false 시만."""

    name = QWEN_MACBOOK
    _gate: asyncio.Semaphore | None = None
@@ -134,15 +328,15 @@ class QwenMacBookBackend(BackendBase):
                httpx.RemoteProtocolError,
            ) as exc:
                logger.warning(
-                    "qwen-macbook unavailable url=%s exc=%s",
+                    "qwen-macbook[legacy] unavailable url=%s exc=%s",
                    url, type(exc).__name__,
                )
                raise BackendUnavailable(self.name, type(exc).__name__) from exc
            except httpx.HTTPStatusError as exc:
-                # 5xx 만 unavailable, 4xx 는 호출자 잘못 → 일반 예외 전파
                if 500 <= exc.response.status_code < 600:
                    logger.warning(
-                        "qwen-macbook 5xx status=%d", exc.response.status_code,
+                        "qwen-macbook[legacy] 5xx status=%d",
+                        exc.response.status_code,
                    )
                    raise BackendUnavailable(
                        self.name, f"http_{exc.response.status_code}"
@@ -157,19 +351,6 @@ class QwenMacBookBackend(BackendBase):
        tool_choice: str = "auto",
        timeout_read_s: int,
    ) -> dict:
-        """OpenAI 호환 chat completion with tool calling (ReAct loop 용).
-
-        Returns: `choices[0].message` dict 그대로 — `content` (Optional[str]) +
-        `tool_calls` (Optional[list]) 둘 다 포함.
-
-        Response shape = G0-1 fixture `tests/fixtures/qwen_tool_call_response.json`
-        기준 (mlx-vlm OpenAI 표준 호환). tool_calls[].function.arguments 는
-        **JSON string** 으로 옴 — 호출자가 json.loads 필요.
-
-        - `tool_choice="auto"`: 모델이 tool 호출 여부 결정
-        - `tool_choice="none"`: tool 호출 금지, content 만 반환 (final round)
-        - `tools=[]` + `tool_choice="none"`: tool 정의 없이 final answer 강제
-        """
        gate = self._get_gate()
        timeout = httpx.Timeout(
            connect=float(self.timeout_connect_s),
@@ -203,14 +384,15 @@ class QwenMacBookBackend(BackendBase):
                httpx.RemoteProtocolError,
            ) as exc:
                logger.warning(
-                    "qwen-macbook(tools) unavailable url=%s exc=%s",
+                    "qwen-macbook[legacy](tools) unavailable url=%s exc=%s",
                    url, type(exc).__name__,
                )
                raise BackendUnavailable(self.name, type(exc).__name__) from exc
            except httpx.HTTPStatusError as exc:
                if 500 <= exc.response.status_code < 600:
                    logger.warning(
-                        "qwen-macbook(tools) 5xx status=%d", exc.response.status_code,
+                        "qwen-macbook[legacy](tools) 5xx status=%d",
+                        exc.response.status_code,
                    )
                    raise BackendUnavailable(
                        self.name, f"http_{exc.response.status_code}"
@@ -218,38 +400,113 @@ class QwenMacBookBackend(BackendBase):
                raise


-# ── dispatcher ─────────────────────────────────────────────────────────────
+# ──────────────────────────────────────────────────────────────────────────
+# Dispatcher (PR-2: dual-path with DS_BACKENDS_VIA_ROUTER env flag)
+# ──────────────────────────────────────────────────────────────────────────

-_BACKENDS: dict[str, BackendBase] = {}
+
+def _via_router() -> bool:
+    """Return True when backends should go through llm-router (RouterBackend).
+
+    Reads ``DS_BACKENDS_VIA_ROUTER`` (default ``"true"``). Case and surrounding
+    whitespace are ignored, and the spellings ``true`` / ``1`` / ``yes`` / ``on``
+    all select the router. Any other value selects the legacy
+    GemmaMacMiniBackend / QwenMacBookBackend path (rollback safety).
+    """
+    flag = os.getenv("DS_BACKENDS_VIA_ROUTER", "true").strip().lower()
+    return flag in ("true", "1", "yes", "on")
+
+
+_ROUTER_BACKENDS: dict[str, RouterBackend] = {}
+_LEGACY_BACKENDS: dict[str, BackendBase] = {}
+
+
+def _router_url() -> str:
+    """Resolve the llm-router base URL.
+
+    Precedence: ``settings.search.ask.backend.router_url`` when non-empty, then
+    the ``LLM_ROUTER_URL`` environment variable when non-empty, then the
+    hard-coded MVP default.
+    """
+    cfg = settings.search.ask.backend
+    # getattr with a default tolerates a config object that lacks router_url.
+    cfg_url = getattr(cfg, "router_url", "") or ""
+    if cfg_url:
+        return cfg_url
+    # `or` instead of a getenv default: an exported-but-empty variable must also
+    # fall through, otherwise the backend would be built with an empty base URL.
+    return os.getenv("LLM_ROUTER_URL") or "http://100.76.254.116:8890"
+
+
+def _build_router_backend(alias: str | None, requires_gate: bool) -> RouterBackend:
+    """Construct a RouterBackend for ``alias`` using the current ask-backend settings.
+
+    The router URL comes from ``_router_url()`` and the connect timeout from
+    ``settings.search.ask.backend.timeout_connect_s``.
+    """
+    connect_timeout_s = settings.search.ask.backend.timeout_connect_s
+    backend = RouterBackend(
+        router_url=_router_url(),
+        alias=alias,
+        requires_gate=requires_gate,
+        timeout_connect_s=connect_timeout_s,
+    )
+    return backend


 def _build_qwen_backend() -> QwenMacBookBackend:
-    b = settings.search.ask.backend
+    cfg = settings.search.ask.backend
    return QwenMacBookBackend(
-        base_url=b.macbook_url,
-        model=b.macbook_model,
-        timeout_connect_s=b.timeout_connect_s,
+        base_url=cfg.macbook_url,
+        model=cfg.macbook_model,
+        timeout_connect_s=cfg.timeout_connect_s,
    )


+def _get_router_backend(name: str | None) -> RouterBackend:
+    """Return the cached RouterBackend for ``name``, building it on first use.
+
+    ``name`` is stripped and lower-cased. None, "", "gemma-macmini" and
+    "mac-mini-default" all share one backend cached under "mac-mini-default".
+
+    Raises:
+        ValueError: ``name`` is not a known backend key.
+    """
+    requested = (name or "").strip().lower()
+    # requested key -> (cache key, alias sent to the router, requires_gate)
+    mac_mini_spec = (MAC_MINI_DEFAULT, MAC_MINI_DEFAULT, True)
+    specs = {
+        "": mac_mini_spec,
+        GEMMA_MACMINI: mac_mini_spec,
+        MAC_MINI_DEFAULT: mac_mini_spec,
+        QWEN_MACBOOK: (QWEN_MACBOOK, QWEN_MACBOOK, False),
+        CLAUDE_CLOUD: (CLAUDE_CLOUD, CLAUDE_CLOUD, False),
+        # auto = router rule + triage; likely to land on tier_b, so the gate is
+        # taken conservatively.
+        AUTO: (AUTO, None, True),
+    }
+    spec = specs.get(requested)
+    if spec is None:
+        raise ValueError(f"unknown backend: {name!r}")
+
+    cache_key, alias, requires_gate = spec
+    backend = _ROUTER_BACKENDS.get(cache_key)
+    if backend is None:
+        backend = _build_router_backend(alias=alias, requires_gate=requires_gate)
+        _ROUTER_BACKENDS[cache_key] = backend
+    return backend
+
+
+def _get_legacy_backend(name: str | None) -> BackendBase:
+    """Return the cached legacy backend for ``name`` (rollback path).
+
+    Used only when DS_BACKENDS_VIA_ROUTER disables the router. None, "",
+    "mac-mini-default" and "auto" all resolve to the Gemma Mac mini backend,
+    because the legacy path has no such aliases.
+
+    Raises:
+        ValueError: ``name`` is "claude-cloud" (router-only) or an unknown key.
+    """
+    normalized = (name or "").strip().lower()
+    if normalized in ("", MAC_MINI_DEFAULT, AUTO):
+        normalized = GEMMA_MACMINI
+    if normalized == CLAUDE_CLOUD:
+        raise ValueError(
+            f"backend {CLAUDE_CLOUD!r} requires DS_BACKENDS_VIA_ROUTER=true"
+        )
+    if normalized != GEMMA_MACMINI and normalized != QWEN_MACBOOK:
+        raise ValueError(f"unknown backend: {name!r}")
+
+    backend = _LEGACY_BACKENDS.get(normalized)
+    if backend is None:
+        if normalized == GEMMA_MACMINI:
+            backend = GemmaMacMiniBackend()
+        else:
+            backend = _build_qwen_backend()
+        _LEGACY_BACKENDS[normalized] = backend
+    return backend
+
+
 def get_backend(name: str | None) -> BackendBase:
    """name 으로 backend 인스턴스 반환 (캐싱).

-    - None / "" / "gemma-macmini" → Gemma Mac mini (default)
-    - "qwen-macbook" → MacBook Qwen
-    - 그 외 → ValueError (호출자가 400 으로 매핑)
+    DS_BACKENDS_VIA_ROUTER=true (default, PR-2) → RouterBackend
+    DS_BACKENDS_VIA_ROUTER=false → legacy GemmaMacMiniBackend / QwenMacBookBackend
    """
-    key = (name or "").strip().lower() or GEMMA_MACMINI
-
-    if key not in (GEMMA_MACMINI, QWEN_MACBOOK):
-        raise ValueError(f"unknown backend: {name!r}")
-
-    if key not in _BACKENDS:
-        if key == GEMMA_MACMINI:
-            _BACKENDS[key] = GemmaMacMiniBackend()
-        else:
-            _BACKENDS[key] = _build_qwen_backend()
-    return _BACKENDS[key]
+    if _via_router():
+        return _get_router_backend(name)
+    return _get_legacy_backend(name)


 def reset_backends_for_test() -> None:
@@ -257,5 +514,6 @@ def reset_backends_for_test() -> None:

    production code 에서 사용 X.
    """
-    _BACKENDS.clear()
+    _ROUTER_BACKENDS.clear()
+    _LEGACY_BACKENDS.clear()
    QwenMacBookBackend._gate = None