From 2dab682e2148677e7b5736740e703599bfc67253 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Tue, 31 Mar 2026 15:14:55 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20backend=5Fmodel=5Fid=20=EB=A7=A4?=
 =?UTF-8?q?=ED=95=91=20=EC=B6=94=EA=B0=80=20=E2=80=94=20MLX=20=EB=AA=A8?=
 =?UTF-8?q?=EB=8D=B8=20ID=20=EB=B6=88=EC=9D=BC=EC=B9=98=20=ED=95=B4?=
 =?UTF-8?q?=EA=B2=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MLX 서버의 실제 모델 ID(mlx-community/Qwen3.5-35B-A3B-4bit)와
사용자 노출 ID(qwen3.5:35b-a3b)가 달라 500 에러 발생.
registry에 선택 필드 backend_model_id를 추가하여 프록시 시 해당 ID로 변환.
backend_model_id가 비어 있으면 기존처럼 사용자 노출 ID를 그대로 전달.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backends.json                |  2 +-
 hub-api/routers/chat.py      | 11 +++++++----
 hub-api/services/registry.py |  2 ++
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/backends.json b/backends.json
index 4641464..60bf7b0 100644
--- a/backends.json
+++ b/backends.json
@@ -14,7 +14,7 @@
     "type": "openai-compat",
     "url": "http://192.168.1.122:8800",
     "models": [
-      { "id": "qwen3.5:35b-a3b", "capabilities": ["chat"], "priority": 1 }
+      { "id": "qwen3.5:35b-a3b", "backend_model_id": "mlx-community/Qwen3.5-35B-A3B-4bit", "capabilities": ["chat"], "priority": 1 }
     ],
     "access": "all",
     "rate_limit": null
diff --git a/hub-api/routers/chat.py b/hub-api/routers/chat.py
index 2d1d8a0..a0e43f4 100644
--- a/hub-api/routers/chat.py
+++ b/hub-api/routers/chat.py
@@ -60,12 +60,15 @@ async def chat_completions(body: ChatRequest, request: Request):
     if body.temperature is not None:
         kwargs["temperature"] = body.temperature
 
+    # Use backend-specific model ID if configured, otherwise use the user-facing ID
+    actual_model = model_info.backend_model_id or body.model
+
     # Route to appropriate proxy
     if backend.type == "ollama":
         if body.stream:
             return StreamingResponse(
                 proxy_ollama.stream_chat(
-                    backend.url, body.model, messages, **kwargs
+                    backend.url, actual_model, messages, **kwargs
                 ),
                 media_type="text/event-stream",
                 headers={
@@ -75,7 +78,7 @@ async def chat_completions(body: ChatRequest, request: Request):
             )
         else:
             result = await proxy_ollama.complete_chat(
-                backend.url, body.model, messages, **kwargs
+                backend.url, actual_model, messages, **kwargs
             )
             return JSONResponse(content=result)
 
@@ -83,7 +86,7 @@ async def chat_completions(body: ChatRequest, request: Request):
         if body.stream:
             return StreamingResponse(
                 proxy_openai.stream_chat(
-                    backend.url, body.model, messages, **kwargs
+                    backend.url, actual_model, messages, **kwargs
                 ),
                 media_type="text/event-stream",
                 headers={
@@ -93,7 +96,7 @@ async def chat_completions(body: ChatRequest, request: Request):
             )
         else:
             result = await proxy_openai.complete_chat(
-                backend.url, body.model, messages, **kwargs
+                backend.url, actual_model, messages, **kwargs
             )
             return JSONResponse(content=result)
 
diff --git a/hub-api/services/registry.py b/hub-api/services/registry.py
index a689a43..815422c 100644
--- a/hub-api/services/registry.py
+++ b/hub-api/services/registry.py
@@ -17,6 +17,7 @@ class ModelInfo:
     id: str
     capabilities: list[str]
     priority: int = 1
+    backend_model_id: str = ""  # actual model ID sent to backend (if different from id)
 
 
 @dataclass
@@ -68,6 +69,7 @@ class Registry:
                     id=m["id"],
                     capabilities=m.get("capabilities", ["chat"]),
                     priority=m.get("priority", 1),
+                    backend_model_id=m.get("backend_model_id", ""),
                 )
                 for m in entry.get("models", [])
             ]