from typing import List, Optional

from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel

from middleware.rate_limit import check_backend_rate_limit
from services import proxy_ollama, proxy_openai
from services.registry import registry

router = APIRouter(prefix="/v1", tags=["chat"])

# Dispatch table: backend type -> proxy module implementing stream_chat /
# complete_chat for that protocol. Unknown types fall through to a 501.
_PROXIES = {
    "ollama": proxy_ollama,
    "openai-compat": proxy_openai,
}


class ChatMessage(BaseModel):
    """One message of an OpenAI-style chat conversation."""

    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str
    messages: List[ChatMessage]
    stream: bool = False
    temperature: Optional[float] = None
    max_tokens: Optional[int] = None


@router.post("/chat/completions")
async def chat_completions(body: ChatRequest, request: Request):
    """Proxy an OpenAI-style chat completion to the backend serving the model.

    Resolves ``body.model`` against the registry (honoring the caller's role
    from ``request.state``), enforces the backend's rate limit, and forwards
    the request to the matching proxy — as a Server-Sent-Events stream when
    ``body.stream`` is true, otherwise as a single JSON response.

    Raises:
        HTTPException 401: caller is not authenticated.
        HTTPException 404: model unknown or not visible to this role.
        HTTPException 501: the backend's type has no proxy implementation.
    """
    role = getattr(request.state, "role", "anonymous")
    if role == "anonymous":
        raise HTTPException(
            status_code=401,
            detail={"error": {"message": "Authentication required", "type": "auth_error", "code": "unauthorized"}},
        )

    # Resolve the user-facing model name to a (backend, model_info) pair.
    resolved = registry.resolve_model(body.model, role)
    if not resolved:
        raise HTTPException(
            status_code=404,
            detail={
                "error": {
                    "message": f"Model '{body.model}' not found or not available",
                    "type": "invalid_request_error",
                    "code": "model_not_found",
                }
            },
        )
    backend, model_info = resolved

    # Enforce the backend's rate limit, then count this request against it.
    check_backend_rate_limit(backend.id)
    registry.record_request(backend.id)

    messages = [{"role": m.role, "content": m.content} for m in body.messages]

    # Forward only the generation options the caller actually set.
    kwargs = {}
    if body.temperature is not None:
        kwargs["temperature"] = body.temperature
    if body.max_tokens is not None:
        # Fix: max_tokens was accepted by ChatRequest but silently dropped.
        # NOTE(review): assumes both proxy modules accept a max_tokens kwarg —
        # confirm against proxy_ollama/proxy_openai signatures.
        kwargs["max_tokens"] = body.max_tokens

    # Use backend-specific model ID if configured, otherwise the user-facing ID.
    actual_model = model_info.backend_model_id or body.model

    proxy = _PROXIES.get(backend.type)
    if proxy is None:
        raise HTTPException(
            status_code=501,
            detail={
                "error": {
                    "message": f"Backend type '{backend.type}' not yet implemented",
                    "type": "api_error",
                    "code": "not_implemented",
                }
            },
        )

    if body.stream:
        return StreamingResponse(
            proxy.stream_chat(backend.url, actual_model, messages, **kwargs),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                # Disable reverse-proxy buffering so tokens flush immediately.
                "X-Accel-Buffering": "no",
            },
        )

    completion = await proxy.complete_chat(backend.url, actual_model, messages, **kwargs)
    return JSONResponse(content=completion)