feat: AI Gateway Phase 1 - FastAPI 코어 구현

GPU 서버 중앙 AI 라우팅 서비스 초기 구현:
- OpenAI 호환 API (/v1/chat/completions, /v1/models, /v1/embeddings)
- 모델 레지스트리 + 백엔드 헬스체크 (30초 루프)
- Ollama SSE 프록시 (NDJSON → OpenAI SSE 변환)
- JWT 인증 이중 경로 (httpOnly 쿠키 + Bearer 토큰)
- owner/guest 역할 분리, 로그인 rate limiting
- 백엔드별 rate limiting (NanoClaude 대비)
- SQLite 스키마 사전 정의 (aiosqlite + WAL)
- Docker Compose + Caddy 리버스 프록시

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-03-31 13:41:46 +09:00
commit 3794afff95
27 changed files with 1121 additions and 0 deletions

16
hub-api/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
# AI Gateway hub API container image.
FROM python:3.12-slim
# curl is installed for container health probes — presumably used by a
# compose/Caddy healthcheck; confirm against docker-compose.yml.
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy the dependency manifest first so the pip layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# SQLite database directory (mount a volume here to persist data).
RUN mkdir -p /app/data
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

21
hub-api/config.py Normal file
View File

@@ -0,0 +1,21 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application settings, loaded from environment variables and .env.

    Every field can be overridden by an environment variable of the same
    name (pydantic-settings behavior).
    """

    # Shared login passwords for the two roles. Defaults are placeholders —
    # both must be overridden in production.
    owner_password: str = "changeme"
    guest_password: str = "guest"
    # JWT signing configuration (symmetric HS256 secret).
    jwt_secret: str = "dev-secret-change-in-production"
    jwt_algorithm: str = "HS256"
    jwt_expire_hours: int = 24
    # Path to the backend registry JSON consumed by services.registry.
    backends_config: str = "/app/config/backends.json"
    # Comma-separated list of allowed CORS origins (split in main.py).
    cors_origins: str = "http://localhost:5173"
    nvidia_smi_path: str = "/usr/bin/nvidia-smi"
    db_path: str = "/app/data/gateway.db"
    # Load overrides from .env; silently ignore unknown keys.
    model_config = {"env_file": ".env", "extra": "ignore"}


# Module-level singleton imported throughout the app.
settings = Settings()

0
hub-api/db/__init__.py Normal file
View File

50
hub-api/db/database.py Normal file
View File

@@ -0,0 +1,50 @@
import aiosqlite
from config import settings
# Database schema, applied idempotently at startup (all statements use
# IF NOT EXISTS). chat_messages declares a REFERENCES constraint on
# chat_sessions, but SQLite only enforces foreign keys when
# PRAGMA foreign_keys=ON is set on each connection.
SCHEMA = """
CREATE TABLE IF NOT EXISTS chat_sessions (
id TEXT PRIMARY KEY,
title TEXT,
model TEXT NOT NULL,
role TEXT NOT NULL DEFAULT 'guest',
created_at REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS chat_messages (
id TEXT PRIMARY KEY,
session_id TEXT NOT NULL REFERENCES chat_sessions(id),
role TEXT NOT NULL,
content TEXT NOT NULL,
created_at REAL NOT NULL
);
CREATE TABLE IF NOT EXISTS usage_logs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
backend_id TEXT NOT NULL,
model TEXT NOT NULL,
prompt_tokens INTEGER DEFAULT 0,
completion_tokens INTEGER DEFAULT 0,
latency_ms REAL DEFAULT 0,
user_role TEXT NOT NULL DEFAULT 'guest',
created_at REAL NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_messages_session ON chat_messages(session_id);
CREATE INDEX IF NOT EXISTS idx_usage_created ON usage_logs(created_at);
"""
async def init_db():
    """Create the schema on first run, with WAL journaling for concurrent reads."""
    async with aiosqlite.connect(settings.db_path) as conn:
        await conn.execute("PRAGMA journal_mode=WAL")
        await conn.executescript(SCHEMA)
        await conn.commit()
async def get_db() -> aiosqlite.Connection:
    """Open a new database connection; the caller must close it.

    Returns:
        An aiosqlite connection with WAL journaling and foreign-key
        enforcement enabled.
    """
    db = await aiosqlite.connect(settings.db_path)
    await db.execute("PRAGMA journal_mode=WAL")
    # SQLite ships with foreign-key enforcement OFF per connection; the
    # schema declares REFERENCES constraints, so enable it here.
    await db.execute("PRAGMA foreign_keys=ON")
    return db

2
hub-api/db/models.py Normal file
View File

@@ -0,0 +1,2 @@
# DB model helpers — used in Phase 3 for logging
# Schema defined in database.py

46
hub-api/main.py Normal file
View File

@@ -0,0 +1,46 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from config import settings
from middleware.auth import AuthMiddleware
from routers import auth, chat, embeddings, gpu, health, models
from services.registry import registry
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan: load the backend registry, then poll health in the background.

    Backends are read from the JSON file at settings.backends_config;
    the health loop runs until shutdown.
    """
    await registry.load_backends(settings.backends_config)
    registry.start_health_loop()
    yield
    # Shutdown: cancel the background health-check task.
    registry.stop_health_loop()
app = FastAPI(
    title="AI Gateway",
    version="0.1.0",
    lifespan=lifespan,
)
# CORS for the web UI. allow_credentials=True is required because auth can
# ride on an httpOnly cookie; origins come from a comma-separated setting.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins.split(","),
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Added after CORS, so it is the outermost middleware (Starlette runs the
# most recently added middleware first). It only annotates
# request.state.role; each router enforces authorization itself.
app.add_middleware(AuthMiddleware)
# Auth, OpenAI-compatible API, and operational routers.
app.include_router(auth.router)
app.include_router(chat.router)
app.include_router(models.router)
app.include_router(embeddings.router)
app.include_router(health.router)
app.include_router(gpu.router)


@app.get("/")
async def root():
    """Service banner."""
    return {"service": "AI Gateway", "version": "0.1.0"}

View File

View File

@@ -0,0 +1,96 @@
from __future__ import annotations
import time
from jose import JWTError, jwt
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from config import settings
# Paths that don't require authentication
PUBLIC_PATHS = {"/", "/health", "/auth/login", "/docs", "/openapi.json"}
PUBLIC_PREFIXES = ("/health/",)


class AuthMiddleware(BaseHTTPMiddleware):
    """Tag every request with request.state.role; never rejects by itself.

    Routers are responsible for turning an "anonymous" role into a 401.
    """

    async def dispatch(self, request: Request, call_next):
        url_path = request.url.path

        # Public endpoints are tagged anonymous and passed straight through.
        is_public = url_path in PUBLIC_PATHS or any(
            url_path.startswith(prefix) for prefix in PUBLIC_PREFIXES
        )
        if is_public:
            request.state.role = "anonymous"
            return await call_next(request)

        # CORS preflight carries no credentials worth inspecting.
        if request.method == "OPTIONS":
            return await call_next(request)

        # Bearer header takes precedence over the httpOnly cookie.
        token = _extract_token(request)
        claims = _verify_token(token) if token else None
        request.state.role = claims.get("role", "guest") if claims else "anonymous"
        return await call_next(request)
def create_token(role: str) -> str:
    """Create a signed JWT carrying the caller's role.

    Args:
        role: "owner" or "guest".

    Returns:
        Encoded JWT whose "exp" is jwt_expire_hours from now.
    """
    # Sample the clock once so "iat" and "exp" are consistent with each
    # other (the original called time.time() twice).
    now = time.time()
    payload = {
        "role": role,
        "exp": now + settings.jwt_expire_hours * 3600,
        "iat": now,
    }
    return jwt.encode(payload, settings.jwt_secret, algorithm=settings.jwt_algorithm)
def _extract_token(request: Request) -> str | None:
    """Return the JWT from the Authorization header, else the httpOnly cookie."""
    bearer = request.headers.get("authorization", "")
    if bearer.startswith("Bearer "):
        return bearer.removeprefix("Bearer ")
    return request.cookies.get("token")
def _verify_token(token: str) -> dict | None:
    """Decode a JWT; return its claims, or None on any validation failure."""
    try:
        claims = jwt.decode(
            token, settings.jwt_secret, algorithms=[settings.jwt_algorithm]
        )
    except JWTError:
        return None
    # Explicit expiry check (jose normally validates "exp" during decode as
    # well, so this is belt-and-braces).
    if claims.get("exp", 0) < time.time():
        return None
    return claims
# Login rate limiting (IP-based)
# NOTE(review): per-IP entries are pruned on check, but IPs that stop
# attempting logins are never evicted — unbounded growth across many clients.
_login_attempts: dict[str, list[float]] = {}
MAX_ATTEMPTS = 5
LOCKOUT_SECONDS = 60


def check_login_rate_limit(ip: str) -> bool:
    """Return True if another login attempt is allowed for this IP.

    Prunes attempts older than LOCKOUT_SECONDS, then compares the
    remaining count against MAX_ATTEMPTS.
    """
    now = time.time()
    recent = [t for t in _login_attempts.get(ip, []) if now - t < LOCKOUT_SECONDS]
    _login_attempts[ip] = recent
    return len(recent) < MAX_ATTEMPTS


def record_login_attempt(ip: str):
    """Record one login attempt for this IP (the caller records successes too)."""
    _login_attempts.setdefault(ip, []).append(time.time())

View File

@@ -0,0 +1,18 @@
from fastapi import HTTPException
from services.registry import registry
def check_backend_rate_limit(backend_id: str):
    """Raise an OpenAI-style 429 when this backend's rate limit is exhausted."""
    if registry.check_rate_limit(backend_id):
        return
    detail = {
        "error": {
            "message": f"Rate limit exceeded for backend '{backend_id}'",
            "type": "rate_limit_error",
            "code": "rate_limit_exceeded",
        }
    }
    raise HTTPException(status_code=429, detail=detail)

7
hub-api/requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
fastapi==0.115.0
uvicorn[standard]==0.30.0
httpx==0.27.0
pydantic-settings==2.5.0
python-jose[cryptography]==3.3.0
python-multipart==0.0.9
aiosqlite==0.20.0

View File

79
hub-api/routers/auth.py Normal file
View File

@@ -0,0 +1,79 @@
from fastapi import APIRouter, Request, Response
from pydantic import BaseModel
from config import settings
from middleware.auth import (
check_login_rate_limit,
create_token,
record_login_attempt,
)
router = APIRouter(prefix="/auth", tags=["auth"])
class LoginRequest(BaseModel):
    # Shared password; which role it maps to is decided in the login handler.
    password: str


class LoginResponse(BaseModel):
    # Resolved role ("owner" or "guest") and the issued JWT.
    role: str
    token: str
@router.post("/login")
async def login(body: LoginRequest, request: Request, response: Response):
ip = request.client.host if request.client else "unknown"
if not check_login_rate_limit(ip):
return _error_response(429, "Too many login attempts. Try again in 1 minute.")
record_login_attempt(ip)
if body.password == settings.owner_password:
role = "owner"
elif body.password == settings.guest_password:
role = "guest"
else:
return _error_response(401, "Invalid password")
token = create_token(role)
# Set httpOnly cookie for web UI
response.set_cookie(
key="token",
value=token,
httponly=True,
samesite="lax",
max_age=settings.jwt_expire_hours * 3600,
)
return LoginResponse(role=role, token=token)
@router.get("/me")
async def me(request: Request):
role = getattr(request.state, "role", "anonymous")
if role == "anonymous":
return _error_response(401, "Not authenticated")
return {"role": role}
@router.post("/logout")
async def logout(response: Response):
response.delete_cookie("token")
return {"ok": True}
def _error_response(status_code: int, message: str):
    """Build an OpenAI-style error JSON response for auth failures."""
    from fastapi.responses import JSONResponse

    body = {
        "error": {
            "message": message,
            "type": "auth_error",
            "code": f"auth_{status_code}",
        }
    }
    return JSONResponse(status_code=status_code, content=body)

92
hub-api/routers/chat.py Normal file
View File

@@ -0,0 +1,92 @@
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel
from middleware.rate_limit import check_backend_rate_limit
from services import proxy_ollama
from services.registry import registry
router = APIRouter(prefix="/v1", tags=["chat"])
class ChatMessage(BaseModel):
    # OpenAI-style message; role (e.g. "user" / "assistant") is not validated here.
    role: str
    content: str


class ChatRequest(BaseModel):
    """Subset of the OpenAI chat-completions request body."""

    model: str
    messages: List[ChatMessage]
    stream: bool = False
    temperature: Optional[float] = None
    # Accepted for OpenAI-client compatibility.
    max_tokens: Optional[int] = None
@router.post("/chat/completions")
async def chat_completions(body: ChatRequest, request: Request):
role = getattr(request.state, "role", "anonymous")
if role == "anonymous":
raise HTTPException(
status_code=401,
detail={"error": {"message": "Authentication required", "type": "auth_error", "code": "unauthorized"}},
)
# Resolve model to backend
result = registry.resolve_model(body.model, role)
if not result:
raise HTTPException(
status_code=404,
detail={
"error": {
"message": f"Model '{body.model}' not found or not available",
"type": "invalid_request_error",
"code": "model_not_found",
}
},
)
backend, model_info = result
# Check rate limit
check_backend_rate_limit(backend.id)
# Record request for rate limiting
registry.record_request(backend.id)
messages = [{"role": m.role, "content": m.content} for m in body.messages]
kwargs = {}
if body.temperature is not None:
kwargs["temperature"] = body.temperature
# Route to appropriate proxy
if backend.type == "ollama":
if body.stream:
return StreamingResponse(
proxy_ollama.stream_chat(
backend.url, body.model, messages, **kwargs
),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
},
)
else:
result = await proxy_ollama.complete_chat(
backend.url, body.model, messages, **kwargs
)
return JSONResponse(content=result)
# Placeholder for other backend types
raise HTTPException(
status_code=501,
detail={
"error": {
"message": f"Backend type '{backend.type}' not yet implemented",
"type": "api_error",
"code": "not_implemented",
}
},
)

View File

@@ -0,0 +1,67 @@
from typing import List, Union
from fastapi import APIRouter, HTTPException, Request
from pydantic import BaseModel
from services import proxy_ollama
from services.registry import registry
router = APIRouter(prefix="/v1", tags=["embeddings"])
class EmbeddingRequest(BaseModel):
    """OpenAI-compatible embeddings request: a single string or a batch."""

    model: str
    input: Union[str, List[str]]
@router.post("/embeddings")
async def create_embedding(body: EmbeddingRequest, request: Request):
role = getattr(request.state, "role", "anonymous")
if role == "anonymous":
raise HTTPException(
status_code=401,
detail={"error": {"message": "Authentication required", "type": "auth_error", "code": "unauthorized"}},
)
result = registry.resolve_model(body.model, role)
if not result:
raise HTTPException(
status_code=404,
detail={
"error": {
"message": f"Model '{body.model}' not found or not available",
"type": "invalid_request_error",
"code": "model_not_found",
}
},
)
backend, model_info = result
if "embed" not in model_info.capabilities:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": f"Model '{body.model}' does not support embeddings",
"type": "invalid_request_error",
"code": "capability_mismatch",
}
},
)
if backend.type == "ollama":
return await proxy_ollama.generate_embedding(
backend.url, body.model, body.input
)
raise HTTPException(
status_code=501,
detail={
"error": {
"message": f"Embedding not supported for backend type '{backend.type}'",
"type": "api_error",
"code": "not_implemented",
}
},
)

13
hub-api/routers/gpu.py Normal file
View File

@@ -0,0 +1,13 @@
from fastapi import APIRouter
from services.gpu_monitor import get_gpu_info
router = APIRouter(tags=["gpu"])
@router.get("/gpu")
async def gpu_status():
info = await get_gpu_info()
if not info:
return {"error": {"message": "GPU info unavailable", "type": "api_error", "code": "gpu_unavailable"}}
return info

31
hub-api/routers/health.py Normal file
View File

@@ -0,0 +1,31 @@
from fastapi import APIRouter
from services.gpu_monitor import get_gpu_info
from services.registry import registry
router = APIRouter(tags=["health"])
@router.get("/health")
async def health():
gpu = await get_gpu_info()
return {
"status": "ok",
"backends": registry.get_health_summary(),
"gpu": gpu,
}
@router.get("/health/{backend_id}")
async def backend_health(backend_id: str):
backend = registry.backends.get(backend_id)
if not backend:
return {"error": {"message": f"Backend '{backend_id}' not found"}}
return {
"id": backend.id,
"type": backend.type,
"status": "healthy" if backend.healthy else "down",
"models": [m.id for m in backend.models],
"latency_ms": backend.latency_ms,
}

12
hub-api/routers/models.py Normal file
View File

@@ -0,0 +1,12 @@
from fastapi import APIRouter, Request
from services.registry import registry
router = APIRouter(prefix="/v1", tags=["models"])


@router.get("/models")
async def list_models(request: Request):
    """OpenAI-compatible model listing, filtered by the caller's role."""
    caller_role = getattr(request.state, "role", "anonymous")
    available = registry.list_models(caller_role)
    return {"object": "list", "data": available}

View File

View File

@@ -0,0 +1,41 @@
from __future__ import annotations
import asyncio
import logging
from config import settings
logger = logging.getLogger(__name__)
async def get_gpu_info() -> dict | None:
    """Query nvidia-smi for stats on the first GPU.

    Returns:
        Dict with utilization (%), temperature, vram_used/vram_total
        (unitless per --format=nounits; MiB on typical nvidia-smi builds —
        confirm), power_draw and device name, or None if nvidia-smi is
        missing, times out, fails, or emits unparseable output.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            settings.nvidia_smi_path,
            "--query-gpu=utilization.gpu,temperature.gpu,memory.used,memory.total,power.draw,name",
            "--format=csv,noheader,nounits",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=5.0)
        except asyncio.TimeoutError:
            # The original leaked the subprocess on timeout; reap it here.
            proc.kill()
            return None
        if proc.returncode != 0:
            logger.debug("nvidia-smi failed: %s", stderr.decode())
            return None
        line = stdout.decode().strip().split("\n")[0]
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 6:
            return None
        return {
            "utilization": int(parts[0]),
            "temperature": int(parts[1]),
            "vram_used": int(parts[2]),
            "vram_total": int(parts[3]),
            "power_draw": float(parts[4]),
            "name": parts[5],
        }
    except FileNotFoundError:
        return None
    except ValueError:
        # nvidia-smi can emit non-numeric fields (e.g. "[N/A]"); the original
        # let this propagate and crash the caller.
        logger.debug("Could not parse nvidia-smi output")
        return None

View File

@@ -0,0 +1,156 @@
from __future__ import annotations
import json
import logging
from collections.abc import AsyncGenerator
import httpx
logger = logging.getLogger(__name__)
async def stream_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> AsyncGenerator[str, None]:
    """Proxy Ollama chat streaming, converting NDJSON lines to OpenAI SSE.

    Yields "data: ..." SSE frames and always terminates with "data: [DONE]".

    Args:
        base_url: Backend base URL without a trailing slash.
        model: Model id to request.
        messages: OpenAI-style message dicts.
        **kwargs: Optional sampling parameters (temperature, max_tokens, ...).
    """
    # Ollama reads sampling parameters from an "options" object, not the
    # payload top level (where the original put them, so they were ignored).
    # max_tokens is OpenAI's name for Ollama's num_predict.
    options = {k: v for k, v in kwargs.items() if v is not None}
    if "max_tokens" in options:
        options["num_predict"] = options.pop("max_tokens")
    payload = {
        "model": model,
        "messages": messages,
        "stream": True,
    }
    if options:
        payload["options"] = options
    async with httpx.AsyncClient(timeout=120.0) as client:
        async with client.stream(
            "POST",
            f"{base_url}/api/chat",
            json=payload,
        ) as resp:
            if resp.status_code != 200:
                body = await resp.aread()
                error_msg = body.decode("utf-8", errors="replace")
                yield _error_event(f"Ollama error: {error_msg}")
                return
            async for line in resp.aiter_lines():
                if not line.strip():
                    continue
                try:
                    chunk = json.loads(line)
                except json.JSONDecodeError:
                    # Skip garbled NDJSON lines rather than abort the stream.
                    continue
                if chunk.get("done"):
                    # Final chunk — close the SSE stream.
                    yield "data: [DONE]\n\n"
                    return
                content = chunk.get("message", {}).get("content", "")
                if content:
                    openai_chunk = {
                        "id": "chatcmpl-gateway",
                        "object": "chat.completion.chunk",
                        "model": model,
                        "choices": [
                            {
                                "index": 0,
                                "delta": {"content": content},
                                "finish_reason": None,
                            }
                        ],
                    }
                    yield f"data: {json.dumps(openai_chunk)}\n\n"
    # Upstream ended without a done marker (the original left the SSE
    # stream unterminated in that case); still emit [DONE].
    yield "data: [DONE]\n\n"
async def complete_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> dict:
    """Non-streaming Ollama chat, returned as an OpenAI-compatible response.

    Args:
        base_url: Backend base URL without a trailing slash.
        model: Model id to request.
        messages: OpenAI-style message dicts.
        **kwargs: Optional sampling parameters (temperature, max_tokens, ...).
    """
    # Ollama reads sampling parameters from an "options" object, not the
    # payload top level (where the original put them, so they were ignored).
    # max_tokens is OpenAI's name for Ollama's num_predict.
    options = {k: v for k, v in kwargs.items() if v is not None}
    if "max_tokens" in options:
        options["num_predict"] = options.pop("max_tokens")
    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
    }
    if options:
        payload["options"] = options
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(f"{base_url}/api/chat", json=payload)
        resp.raise_for_status()
        data = resp.json()
    return {
        "id": "chatcmpl-gateway",
        "object": "chat.completion",
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": data.get("message", {}).get("content", ""),
                },
                "finish_reason": "stop",
            }
        ],
        # Ollama reports token counts as prompt_eval_count / eval_count.
        "usage": {
            "prompt_tokens": data.get("prompt_eval_count", 0),
            "completion_tokens": data.get("eval_count", 0),
            "total_tokens": data.get("prompt_eval_count", 0)
            + data.get("eval_count", 0),
        },
    }
async def generate_embedding(
    base_url: str,
    model: str,
    input_text: str | list[str],
) -> dict:
    """Ollama embedding via /api/embed, returned in OpenAI list format.

    A bare string is wrapped in a one-element batch before the call.
    """
    texts = [input_text] if isinstance(input_text, str) else input_text
    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.post(
            f"{base_url}/api/embed",
            json={"model": model, "input": texts},
        )
        resp.raise_for_status()
        data = resp.json()
    # usage is hardcoded to zero; the Ollama response is not inspected
    # for token counts here.
    return {
        "object": "list",
        "data": [
            {"object": "embedding", "embedding": vec, "index": idx}
            for idx, vec in enumerate(data.get("embeddings", []))
        ],
        "model": model,
        "usage": {"prompt_tokens": 0, "total_tokens": 0},
    }
def _error_event(message: str) -> str:
    """Format an error as a final SSE chunk followed by the [DONE] marker."""
    chunk = {
        "id": "chatcmpl-gateway",
        "object": "chat.completion.chunk",
        "model": "error",
        "choices": [
            {
                "index": 0,
                "delta": {"content": f"[Error] {message}"},
                "finish_reason": "stop",
            }
        ],
    }
    return f"data: {json.dumps(chunk)}\n\ndata: [DONE]\n\n"

View File

@@ -0,0 +1,225 @@
from __future__ import annotations
import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
import httpx
logger = logging.getLogger(__name__)
@dataclass
class ModelInfo:
    """A model exposed by a backend."""

    id: str
    # Capability tags, e.g. "chat" / "embed" — checked by the routers.
    capabilities: list[str]
    # Lower number wins when several backends serve the same model id
    # (resolve_model sorts ascending and takes the first candidate).
    priority: int = 1


@dataclass
class RateLimitConfig:
    """Request-rate caps for one backend; 0 disables the respective cap."""

    rpm: int = 0  # requests per minute
    rph: int = 0  # requests per hour
    # NOTE(review): scope is parsed from config but never consulted by
    # Registry — limits are effectively global.
    scope: str = "global"


@dataclass
class BackendInfo:
    """Static config plus runtime health state for one upstream backend."""

    id: str
    type: str  # "ollama", "openai-compat", "anthropic"
    url: str  # base URL; stored without a trailing slash
    models: list[ModelInfo]
    access: str = "all"  # "all" or "owner"
    rate_limit: RateLimitConfig | None = None
    # runtime state (mutated by the health loop)
    healthy: bool = False
    last_check: float = 0  # epoch seconds of the last health probe
    latency_ms: float = 0


@dataclass
class RateLimitState:
    """Sliding-window request timestamps (epoch seconds) for one backend."""

    minute_timestamps: list[float] = field(default_factory=list)
    hour_timestamps: list[float] = field(default_factory=list)
class Registry:
    """Backend registry: config loading, background health polling, model
    routing, and per-backend sliding-window rate limiting.

    State is shared without locks — assumes a single asyncio event loop.
    """

    def __init__(self):
        self.backends: dict[str, BackendInfo] = {}
        self._health_task: asyncio.Task | None = None
        self._rate_limits: dict[str, RateLimitState] = {}

    async def load_backends(self, config_path: str):
        """Populate self.backends from a JSON config file.

        A missing file only logs a warning and leaves the registry empty.
        Uses blocking file I/O — acceptable because it runs once at startup.
        """
        path = Path(config_path)
        if not path.exists():
            logger.warning("Backends config not found: %s", config_path)
            return
        with open(path) as f:
            data = json.load(f)
        for entry in data:
            models = [
                ModelInfo(
                    id=m["id"],
                    capabilities=m.get("capabilities", ["chat"]),
                    priority=m.get("priority", 1),
                )
                for m in entry.get("models", [])
            ]
            rl_data = entry.get("rate_limit")
            rate_limit = (
                RateLimitConfig(
                    rpm=rl_data.get("rpm", 0),
                    rph=rl_data.get("rph", 0),
                    scope=rl_data.get("scope", "global"),
                )
                if rl_data
                else None
            )
            backend = BackendInfo(
                id=entry["id"],
                type=entry["type"],
                url=entry["url"].rstrip("/"),
                models=models,
                access=entry.get("access", "all"),
                rate_limit=rate_limit,
            )
            self.backends[backend.id] = backend
            # Only rate-limited backends get tracking state.
            if rate_limit:
                self._rate_limits[backend.id] = RateLimitState()
        logger.info("Loaded %d backends", len(self.backends))

    def start_health_loop(self, interval: float = 30.0):
        """Start the background health-check task (call from a running loop)."""
        self._health_task = asyncio.create_task(self._health_loop(interval))

    def stop_health_loop(self):
        """Cancel the health-check task; does not await its completion."""
        if self._health_task:
            self._health_task.cancel()

    async def _health_loop(self, interval: float):
        # Checks first, then sleeps — backends get probed immediately at startup.
        while True:
            await self._check_all_backends()
            await asyncio.sleep(interval)

    async def _check_all_backends(self):
        # Probe all backends concurrently; return_exceptions keeps one
        # failing probe from cancelling the others.
        async with httpx.AsyncClient(timeout=5.0) as client:
            tasks = [
                self._check_backend(client, backend)
                for backend in self.backends.values()
            ]
            await asyncio.gather(*tasks, return_exceptions=True)

    async def _check_backend(self, client: httpx.AsyncClient, backend: BackendInfo):
        """Probe one backend with a type-appropriate endpoint; update its state."""
        try:
            start = time.monotonic()
            if backend.type == "ollama":
                resp = await client.get(f"{backend.url}/api/tags")
            elif backend.type in ("openai-compat", "anthropic"):
                resp = await client.get(f"{backend.url}/v1/models")
            else:
                resp = await client.get(f"{backend.url}/health")
            elapsed = (time.monotonic() - start) * 1000
            # Any response below 500 (including 4xx) counts as alive.
            backend.healthy = resp.status_code < 500
            backend.latency_ms = round(elapsed, 1)
            backend.last_check = time.time()
        except Exception:
            backend.healthy = False
            backend.latency_ms = 0
            backend.last_check = time.time()
            logger.debug("Health check failed for %s", backend.id)

    def resolve_model(self, model_id: str, role: str) -> tuple[BackendInfo, ModelInfo] | None:
        """Find the best backend for a given model ID. Returns (backend, model) or None.

        Skips unhealthy backends and owner-only backends for non-owners;
        among candidates, the lowest priority number wins.
        """
        candidates: list[tuple[BackendInfo, ModelInfo, int]] = []
        for backend in self.backends.values():
            if not backend.healthy:
                continue
            if backend.access == "owner" and role != "owner":
                continue
            for model in backend.models:
                if model.id == model_id:
                    candidates.append((backend, model, model.priority))
        if not candidates:
            return None
        candidates.sort(key=lambda x: x[2])
        return candidates[0][0], candidates[0][1]

    def list_models(self, role: str) -> list[dict]:
        """List all available models for a given role."""
        result = []
        for backend in self.backends.values():
            if not backend.healthy:
                continue
            if backend.access == "owner" and role != "owner":
                continue
            for model in backend.models:
                result.append({
                    "id": model.id,
                    "object": "model",
                    "owned_by": backend.id,
                    "capabilities": model.capabilities,
                    "backend_id": backend.id,
                    # NOTE(review): always "healthy" here — unhealthy
                    # backends were already skipped above.
                    "backend_status": "healthy" if backend.healthy else "down",
                })
        return result

    def check_rate_limit(self, backend_id: str) -> bool:
        """Check if a request to this backend is within rate limits. Returns True if allowed.

        Backends without a configured limit (or tracking state) are always
        allowed. Pruning of stale timestamps happens here as a side effect.
        """
        backend = self.backends.get(backend_id)
        if not backend or not backend.rate_limit:
            return True
        state = self._rate_limits.get(backend_id)
        if not state:
            return True
        now = time.time()
        rl = backend.rate_limit
        # Clean old timestamps
        if rl.rpm > 0:
            state.minute_timestamps = [t for t in state.minute_timestamps if now - t < 60]
            if len(state.minute_timestamps) >= rl.rpm:
                return False
        if rl.rph > 0:
            state.hour_timestamps = [t for t in state.hour_timestamps if now - t < 3600]
            if len(state.hour_timestamps) >= rl.rph:
                return False
        return True

    def record_request(self, backend_id: str):
        """Record a request timestamp for rate limiting (no-op if untracked)."""
        state = self._rate_limits.get(backend_id)
        if not state:
            return
        now = time.time()
        state.minute_timestamps.append(now)
        state.hour_timestamps.append(now)

    def get_health_summary(self) -> list[dict]:
        """Snapshot of every backend's health state for the /health endpoint."""
        return [
            {
                "id": b.id,
                "type": b.type,
                "status": "healthy" if b.healthy else "down",
                "models": [m.id for m in b.models],
                "latency_ms": b.latency_ms,
                "last_check": b.last_check,
            }
            for b in self.backends.values()
        ]


# Module-level singleton shared by main.py and the routers.
registry = Registry()