diff --git a/backends.json b/backends.json index c6cb860..4641464 100644 --- a/backends.json +++ b/backends.json @@ -4,8 +4,17 @@ "type": "ollama", "url": "http://host.docker.internal:11434", "models": [ - { "id": "qwen3.5:9b-q8_0", "capabilities": ["chat"], "priority": 1 }, - { "id": "qwen3-vl:8b", "capabilities": ["chat", "vision"], "priority": 1 } + { "id": "bge-m3", "capabilities": ["embed"], "priority": 1 } + ], + "access": "all", + "rate_limit": null + }, + { + "id": "mlx-mac", + "type": "openai-compat", + "url": "http://192.168.1.122:8800", + "models": [ + { "id": "qwen3.5:35b-a3b", "capabilities": ["chat"], "priority": 1 } ], "access": "all", "rate_limit": null diff --git a/hub-api/routers/chat.py b/hub-api/routers/chat.py index f5400cc..2d1d8a0 100644 --- a/hub-api/routers/chat.py +++ b/hub-api/routers/chat.py @@ -5,7 +5,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel from middleware.rate_limit import check_backend_rate_limit -from services import proxy_ollama +from services import proxy_ollama, proxy_openai from services.registry import registry router = APIRouter(prefix="/v1", tags=["chat"]) @@ -79,7 +79,24 @@ async def chat_completions(body: ChatRequest, request: Request): ) return JSONResponse(content=result) - # Placeholder for other backend types + if backend.type == "openai-compat": + if body.stream: + return StreamingResponse( + proxy_openai.stream_chat( + backend.url, body.model, messages, **kwargs + ), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + }, + ) + else: + result = await proxy_openai.complete_chat( + backend.url, body.model, messages, **kwargs + ) + return JSONResponse(content=result) + raise HTTPException( status_code=501, detail={ diff --git a/hub-api/services/proxy_openai.py b/hub-api/services/proxy_openai.py new file mode 100644 index 0000000..5157172 --- /dev/null +++ b/hub-api/services/proxy_openai.py @@ -0,0 +1,83 @@ +"""OpenAI-compatible proxy (MLX server, vLLM, etc.) — SSE passthrough.""" + +from __future__ import annotations + +import json +import logging +from collections.abc import AsyncGenerator + +import httpx + +logger = logging.getLogger(__name__) + + +async def stream_chat( + base_url: str, + model: str, + messages: list[dict], + **kwargs, +) -> AsyncGenerator[str, None]: + """Proxy OpenAI-compatible chat streaming. SSE passthrough with model field override.""" + payload = { + "model": model, + "messages": messages, + "stream": True, + **{k: v for k, v in kwargs.items() if v is not None}, + } + + async with httpx.AsyncClient(timeout=120.0) as client: + async with client.stream( + "POST", + f"{base_url}/v1/chat/completions", + json=payload, + ) as resp: + if resp.status_code != 200: + body = await resp.aread() + error_msg = body.decode("utf-8", errors="replace") + yield _error_event(f"Backend error ({resp.status_code}): {error_msg}") + return + + async for line in resp.aiter_lines(): + if not line.strip(): + continue + # Pass through SSE lines as-is (already in OpenAI format) + if line.startswith("data: "): + yield f"{line}\n\n" + elif line == "data: [DONE]": + yield "data: [DONE]\n\n" + + +async def complete_chat( + base_url: str, + model: str, + messages: list[dict], + **kwargs, +) -> dict: + """Non-streaming OpenAI-compatible chat.""" + payload = { + "model": model, + "messages": messages, + "stream": False, + **{k: v for k, v in kwargs.items() if v is not None}, + } + + async with httpx.AsyncClient(timeout=120.0) as client: + resp = await client.post(f"{base_url}/v1/chat/completions", json=payload) + resp.raise_for_status() + return resp.json() + + +def _error_event(message: str) -> str: + error = { + "id": "chatcmpl-gateway", + "object": "chat.completion.chunk", + "model": "error", + "choices": [ + { + "index": 0, + "delta": {"content": f"[Error] {message}"}, + "finish_reason": "stop", + } + ], + } + return f"data: {json.dumps(error)}\n\ndata: [DONE]\n\n"