# gpu-services/hub-api/routers/chat.py

from typing import List, Optional

from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel

from middleware.rate_limit import check_backend_rate_limit
from services import proxy_ollama, proxy_openai
from services.registry import registry

router = APIRouter(prefix="/v1", tags=["chat"])


# One entry of the conversation sent in a chat completion request.
# The handler in this module copies both fields verbatim into the plain
# dicts it forwards to the backend proxy.
class ChatMessage(BaseModel):
    # Speaker of the message; accepted as any string, no validation here.
    role: str
    # Text of the message.
    content: str


# Request body for POST /v1/chat/completions.
class ChatRequest(BaseModel):
    # User-facing model ID; resolved to a backend through the registry.
    model: str
    # Conversation to forward to the backend.
    messages: List[ChatMessage]
    # When true the handler answers with a text/event-stream response,
    # otherwise with a single JSON document.
    stream: bool = False
    # Forwarded to the backend proxy only when explicitly provided.
    temperature: Optional[float] = None
    # NOTE(review): accepted by the schema but not read by the handler
    # visible in this module — confirm whether it should be forwarded.
    max_tokens: Optional[int] = None


@router.post("/chat/completions")
async def chat_completions(body: ChatRequest, request: Request):
    """Proxy a chat completion request to the backend that serves the model.

    The caller's role is read from ``request.state.role`` (defaulting to
    ``"anonymous"``), the requested model is resolved to a backend through
    the registry, and the call is forwarded to the proxy module matching
    the backend type.

    Returns a ``StreamingResponse`` (``text/event-stream``) when
    ``body.stream`` is true, otherwise a ``JSONResponse`` wrapping the
    proxy's result.

    Raises ``HTTPException`` with status 401 for anonymous callers, 404
    when the registry cannot resolve the model for the role, and 501 when
    the backend type has no proxy. Anything raised by
    ``check_backend_rate_limit`` propagates unchanged.
    """
    role = getattr(request.state, "role", "anonymous")
    if role == "anonymous":
        raise HTTPException(
            status_code=401,
            detail={"error": {"message": "Authentication required", "type": "auth_error", "code": "unauthorized"}},
        )

    # Resolve model to backend
    resolved = registry.resolve_model(body.model, role)
    if not resolved:
        raise HTTPException(
            status_code=404,
            detail={
                "error": {
                    "message": f"Model '{body.model}' not found or not available",
                    "type": "invalid_request_error",
                    "code": "model_not_found",
                }
            },
        )

    backend, model_info = resolved

    # Pick the proxy module for this backend type. Built per call so the
    # module names are looked up at request time, exactly as before.
    proxies = {
        "ollama": proxy_ollama,
        "openai-compat": proxy_openai,
    }
    proxy = proxies.get(backend.type)
    if proxy is None:
        # Reject unsupported backends before touching the rate limiter so
        # a request that can never be served does not consume quota.
        raise HTTPException(
            status_code=501,
            detail={
                "error": {
                    "message": f"Backend type '{backend.type}' not yet implemented",
                    "type": "api_error",
                    "code": "not_implemented",
                }
            },
        )

    # Check rate limit
    # NOTE(review): called without await — assumes this is a synchronous
    # function that raises when the limit is exceeded; confirm.
    check_backend_rate_limit(backend.id)

    # Record request for rate limiting
    registry.record_request(backend.id)

    messages = [{"role": m.role, "content": m.content} for m in body.messages]
    kwargs = {}
    if body.temperature is not None:
        kwargs["temperature"] = body.temperature
    # NOTE(review): body.max_tokens is accepted by the schema but is not
    # forwarded; confirm the proxies accept it before adding it here.

    # Use backend-specific model ID if configured, otherwise use the user-facing ID
    actual_model = model_info.backend_model_id or body.model

    if body.stream:
        return StreamingResponse(
            proxy.stream_chat(backend.url, actual_model, messages, **kwargs),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                # Ask buffering reverse proxies to pass chunks through as
                # they are produced.
                "X-Accel-Buffering": "no",
            },
        )

    completion = await proxy.complete_chat(
        backend.url, actual_model, messages, **kwargs
    )
    return JSONResponse(content=completion)