feat: AI Gateway Phase 1 - FastAPI 코어 구현

GPU 서버 중앙 AI 라우팅 서비스 초기 구현: - OpenAI 호환 API (/v1/chat/completions, /v1/models, /v1/embeddings) - 모델 레지스트리 + 백엔드 헬스체크 (30초 루프) - Ollama SSE 프록시 (NDJSON → OpenAI SSE 변환) - JWT 인증 이중 경로 (httpOnly 쿠키 + Bearer 토큰) - owner/guest 역할 분리, 로그인 rate limiting - 백엔드별 rate limiting (NanoClaude 대비) - SQLite 스키마 사전 정의 (aiosqlite + WAL) - Docker Compose + Caddy 리버스 프록시 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 13:41:46 +09:00
commit 3794afff95
27 changed files with 1121 additions and 0 deletions
--- a/hub-api/services/registry.py
+++ b/hub-api/services/registry.py
@@ -0,0 +1,225 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelInfo:
+    id: str
+    capabilities: list[str]
+    priority: int = 1
+
+
+@dataclass
+class RateLimitConfig:
+    rpm: int = 0
+    rph: int = 0
+    scope: str = "global"
+
+
+@dataclass
+class BackendInfo:
+    id: str
+    type: str  # "ollama", "openai-compat", "anthropic"
+    url: str
+    models: list[ModelInfo]
+    access: str = "all"  # "all" or "owner"
+    rate_limit: RateLimitConfig | None = None
+
+    # runtime state
+    healthy: bool = False
+    last_check: float = 0
+    latency_ms: float = 0
+
+
+@dataclass
+class RateLimitState:
+    minute_timestamps: list[float] = field(default_factory=list)
+    hour_timestamps: list[float] = field(default_factory=list)
+
+
+class Registry:
+    def __init__(self):
+        self.backends: dict[str, BackendInfo] = {}
+        self._health_task: asyncio.Task | None = None
+        self._rate_limits: dict[str, RateLimitState] = {}
+
+    async def load_backends(self, config_path: str):
+        path = Path(config_path)
+        if not path.exists():
+            logger.warning("Backends config not found: %s", config_path)
+            return
+
+        with open(path) as f:
+            data = json.load(f)
+
+        for entry in data:
+            models = [
+                ModelInfo(
+                    id=m["id"],
+                    capabilities=m.get("capabilities", ["chat"]),
+                    priority=m.get("priority", 1),
+                )
+                for m in entry.get("models", [])
+            ]
+            rl_data = entry.get("rate_limit")
+            rate_limit = (
+                RateLimitConfig(
+                    rpm=rl_data.get("rpm", 0),
+                    rph=rl_data.get("rph", 0),
+                    scope=rl_data.get("scope", "global"),
+                )
+                if rl_data
+                else None
+            )
+            backend = BackendInfo(
+                id=entry["id"],
+                type=entry["type"],
+                url=entry["url"].rstrip("/"),
+                models=models,
+                access=entry.get("access", "all"),
+                rate_limit=rate_limit,
+            )
+            self.backends[backend.id] = backend
+            if rate_limit:
+                self._rate_limits[backend.id] = RateLimitState()
+
+        logger.info("Loaded %d backends", len(self.backends))
+
+    def start_health_loop(self, interval: float = 30.0):
+        self._health_task = asyncio.create_task(self._health_loop(interval))
+
+    def stop_health_loop(self):
+        if self._health_task:
+            self._health_task.cancel()
+
+    async def _health_loop(self, interval: float):
+        while True:
+            await self._check_all_backends()
+            await asyncio.sleep(interval)
+
+    async def _check_all_backends(self):
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            tasks = [
+                self._check_backend(client, backend)
+                for backend in self.backends.values()
+            ]
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+    async def _check_backend(self, client: httpx.AsyncClient, backend: BackendInfo):
+        try:
+            start = time.monotonic()
+            if backend.type == "ollama":
+                resp = await client.get(f"{backend.url}/api/tags")
+            elif backend.type in ("openai-compat", "anthropic"):
+                resp = await client.get(f"{backend.url}/v1/models")
+            else:
+                resp = await client.get(f"{backend.url}/health")
+            elapsed = (time.monotonic() - start) * 1000
+
+            backend.healthy = resp.status_code < 500
+            backend.latency_ms = round(elapsed, 1)
+            backend.last_check = time.time()
+        except Exception:
+            backend.healthy = False
+            backend.latency_ms = 0
+            backend.last_check = time.time()
+            logger.debug("Health check failed for %s", backend.id)
+
+    def resolve_model(self, model_id: str, role: str) -> tuple[BackendInfo, ModelInfo] | None:
+        """Find the best backend for a given model ID. Returns (backend, model) or None."""
+        candidates: list[tuple[BackendInfo, ModelInfo, int]] = []
+
+        for backend in self.backends.values():
+            if not backend.healthy:
+                continue
+            if backend.access == "owner" and role != "owner":
+                continue
+            for model in backend.models:
+                if model.id == model_id:
+                    candidates.append((backend, model, model.priority))
+
+        if not candidates:
+            return None
+
+        candidates.sort(key=lambda x: x[2])
+        return candidates[0][0], candidates[0][1]
+
+    def list_models(self, role: str) -> list[dict]:
+        """List all available models for a given role."""
+        result = []
+        for backend in self.backends.values():
+            if not backend.healthy:
+                continue
+            if backend.access == "owner" and role != "owner":
+                continue
+            for model in backend.models:
+                result.append({
+                    "id": model.id,
+                    "object": "model",
+                    "owned_by": backend.id,
+                    "capabilities": model.capabilities,
+                    "backend_id": backend.id,
+                    "backend_status": "healthy" if backend.healthy else "down",
+                })
+        return result
+
+    def check_rate_limit(self, backend_id: str) -> bool:
+        """Check if a request to this backend is within rate limits. Returns True if allowed."""
+        backend = self.backends.get(backend_id)
+        if not backend or not backend.rate_limit:
+            return True
+
+        state = self._rate_limits.get(backend_id)
+        if not state:
+            return True
+
+        now = time.time()
+        rl = backend.rate_limit
+
+        # Clean old timestamps
+        if rl.rpm > 0:
+            state.minute_timestamps = [t for t in state.minute_timestamps if now - t < 60]
+            if len(state.minute_timestamps) >= rl.rpm:
+                return False
+
+        if rl.rph > 0:
+            state.hour_timestamps = [t for t in state.hour_timestamps if now - t < 3600]
+            if len(state.hour_timestamps) >= rl.rph:
+                return False
+
+        return True
+
+    def record_request(self, backend_id: str):
+        """Record a request timestamp for rate limiting."""
+        state = self._rate_limits.get(backend_id)
+        if not state:
+            return
+        now = time.time()
+        state.minute_timestamps.append(now)
+        state.hour_timestamps.append(now)
+
+    def get_health_summary(self) -> list[dict]:
+        return [
+            {
+                "id": b.id,
+                "type": b.type,
+                "status": "healthy" if b.healthy else "down",
+                "models": [m.id for m in b.models],
+                "latency_ms": b.latency_ms,
+                "last_check": b.last_check,
+            }
+            for b in self.backends.values()
+        ]
+
+
+registry = Registry()