fix: backend_model_id 매핑 추가 — MLX 모델 ID 불일치 해결

MLX 서버 모델 ID(mlx-community/Qwen3.5-35B-A3B-4bit)와
사용자 노출 ID(qwen3.5:35b-a3b)가 달라 500 에러 발생.
registry에 backend_model_id 필드 추가하여 프록시 시 변환.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-03-31 15:14:55 +09:00
parent 7b28252d4f
commit 2dab682e21
3 changed files with 10 additions and 5 deletions

View File

@@ -14,7 +14,7 @@
   "type": "openai-compat",
   "url": "http://192.168.1.122:8800",
   "models": [
-    { "id": "qwen3.5:35b-a3b", "capabilities": ["chat"], "priority": 1 }
+    { "id": "qwen3.5:35b-a3b", "backend_model_id": "mlx-community/Qwen3.5-35B-A3B-4bit", "capabilities": ["chat"], "priority": 1 }
   ],
   "access": "all",
   "rate_limit": null

View File

@@ -60,12 +60,15 @@ async def chat_completions(body: ChatRequest, request: Request):
     if body.temperature is not None:
         kwargs["temperature"] = body.temperature
+    # Use backend-specific model ID if configured, otherwise use the user-facing ID
+    actual_model = model_info.backend_model_id or body.model
     # Route to appropriate proxy
     if backend.type == "ollama":
         if body.stream:
             return StreamingResponse(
                 proxy_ollama.stream_chat(
-                    backend.url, body.model, messages, **kwargs
+                    backend.url, actual_model, messages, **kwargs
                 ),
                 media_type="text/event-stream",
                 headers={
@@ -75,7 +78,7 @@ async def chat_completions(body: ChatRequest, request: Request):
             )
         else:
             result = await proxy_ollama.complete_chat(
-                backend.url, body.model, messages, **kwargs
+                backend.url, actual_model, messages, **kwargs
             )
             return JSONResponse(content=result)
@@ -83,7 +86,7 @@ async def chat_completions(body: ChatRequest, request: Request):
         if body.stream:
             return StreamingResponse(
                 proxy_openai.stream_chat(
-                    backend.url, body.model, messages, **kwargs
+                    backend.url, actual_model, messages, **kwargs
                 ),
                 media_type="text/event-stream",
                 headers={
@@ -93,7 +96,7 @@ async def chat_completions(body: ChatRequest, request: Request):
             )
         else:
             result = await proxy_openai.complete_chat(
-                backend.url, body.model, messages, **kwargs
+                backend.url, actual_model, messages, **kwargs
             )
             return JSONResponse(content=result)

View File

@@ -17,6 +17,7 @@ class ModelInfo:
     id: str
     capabilities: list[str]
     priority: int = 1
+    backend_model_id: str = ""  # actual model ID sent to backend (if different from id)

 @dataclass
@@ -68,6 +69,7 @@ class Registry:
                 id=m["id"],
                 capabilities=m.get("capabilities", ["chat"]),
                 priority=m.get("priority", 1),
+                backend_model_id=m.get("backend_model_id", ""),
             )
             for m in entry.get("models", [])
         ]