feat: home-gateway 초기 구성 — Mac mini에서 GPU 서버로 전면 이전
OrbStack 라이선스 만료로 Mac mini Docker 서비스를 GPU 서버로 통합. nginx → Caddy 전환, 12개 서브도메인 자동 HTTPS, fail2ban Caddy JSON 연동. 주요 변경: - home-caddy: Caddy 리버스 프록시 (Let's Encrypt 자동 HTTPS) - home-fail2ban: Caddy JSON 로그 기반 보안 모니터링 - home-ddns: Cloudflare DDNS (API 키 .env 분리) - gpu-hub-api/web: AI 백엔드 라우터 + 웹 UI (gpu-services에서 이전) - AI 런타임(Ollama) 내부망 전용, 외부는 gpu-hub 인증 게이트웨이 경유 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
hub-api/services/__init__.py
Normal file
0
hub-api/services/__init__.py
Normal file
41
hub-api/services/gpu_monitor.py
Normal file
41
hub-api/services/gpu_monitor.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""GPU telemetry helpers: query the local GPU via nvidia-smi (async subprocess)."""

from __future__ import annotations

import asyncio
import logging

from config import settings

logger = logging.getLogger(__name__)
async def get_gpu_info() -> dict | None:
    """Run nvidia-smi and parse GPU info for the first GPU.

    Returns a dict with keys ``utilization``, ``temperature``, ``vram_used``,
    ``vram_total`` (ints), ``power_draw`` (float) and ``name`` (str), or
    ``None`` when nvidia-smi is missing, times out, exits non-zero, or emits
    output that cannot be parsed.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            settings.nvidia_smi_path,
            "--query-gpu=utilization.gpu,temperature.gpu,memory.used,memory.total,power.draw,name",
            "--format=csv,noheader,nounits",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=5.0)
        except asyncio.TimeoutError:
            # BUG FIX: previously a timed-out nvidia-smi was abandoned and kept
            # running — kill it and reap it before giving up.
            proc.kill()
            await proc.wait()
            return None

        if proc.returncode != 0:
            logger.debug("nvidia-smi failed: %s", stderr.decode())
            return None

        # First output line only (single-GPU host assumed — TODO confirm);
        # CSV fields arrive in the exact order of the --query-gpu list.
        line = stdout.decode().strip().split("\n")[0]
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 6:
            return None

        try:
            return {
                "utilization": int(parts[0]),
                "temperature": int(parts[1]),
                "vram_used": int(parts[2]),
                "vram_total": int(parts[3]),
                "power_draw": float(parts[4]),
                "name": parts[5],
            }
        except ValueError:
            # BUG FIX: nvidia-smi can emit "[N/A]" for unsupported fields,
            # which previously raised out of this function uncaught.
            logger.debug("Unparseable nvidia-smi output: %r", line)
            return None
    except FileNotFoundError:
        # nvidia-smi binary not present (non-GPU host).
        return None
|
||||
156
hub-api/services/proxy_ollama.py
Normal file
156
hub-api/services/proxy_ollama.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Ollama proxy: converts Ollama's NDJSON chat/embed APIs to OpenAI-compatible shapes."""

from __future__ import annotations

import json
import logging
from collections.abc import AsyncGenerator

import httpx

logger = logging.getLogger(__name__)
async def stream_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> AsyncGenerator[str, None]:
    """Proxy Ollama chat streaming, converting NDJSON to OpenAI SSE format."""
    extra = {k: v for k, v in kwargs.items() if v is not None}
    payload = {"model": model, "messages": messages, "stream": True, **extra}

    async with httpx.AsyncClient(timeout=120.0) as client:
        async with client.stream(
            "POST",
            f"{base_url}/api/chat",
            json=payload,
        ) as response:
            # Surface upstream failures as a single SSE error chunk.
            if response.status_code != 200:
                raw = await response.aread()
                error_msg = raw.decode("utf-8", errors="replace")
                yield _error_event(f"Ollama error: {error_msg}")
                return

            async for raw_line in response.aiter_lines():
                if not raw_line.strip():
                    continue
                try:
                    event = json.loads(raw_line)
                except json.JSONDecodeError:
                    # Skip malformed NDJSON lines rather than aborting the stream.
                    continue

                if event.get("done"):
                    # Final chunk — send [DONE]
                    yield "data: [DONE]\n\n"
                    return

                delta_text = event.get("message", {}).get("content", "")
                if not delta_text:
                    continue
                openai_chunk = {
                    "id": "chatcmpl-gateway",
                    "object": "chat.completion.chunk",
                    "model": model,
                    "choices": [
                        {
                            "index": 0,
                            "delta": {"content": delta_text},
                            "finish_reason": None,
                        }
                    ],
                }
                yield f"data: {json.dumps(openai_chunk)}\n\n"
|
||||
|
||||
|
||||
async def complete_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> dict:
    """Non-streaming Ollama chat, returns OpenAI-compatible response."""
    options = {k: v for k, v in kwargs.items() if v is not None}
    payload = {"model": model, "messages": messages, "stream": False, **options}

    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(f"{base_url}/api/chat", json=payload)
        resp.raise_for_status()
        data = resp.json()

    # Map Ollama's eval counters onto the OpenAI usage fields.
    prompt_tokens = data.get("prompt_eval_count", 0)
    completion_tokens = data.get("eval_count", 0)
    answer_text = data.get("message", {}).get("content", "")

    return {
        "id": "chatcmpl-gateway",
        "object": "chat.completion",
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": answer_text},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
|
||||
|
||||
|
||||
async def generate_embedding(
    base_url: str,
    model: str,
    input_text: str | list[str],
) -> dict:
    """Ollama embedding, returns OpenAI-compatible response.

    Accepts a single string or a list of strings; a bare string is wrapped
    so /api/embed always receives a list.
    """
    texts = [input_text] if isinstance(input_text, str) else input_text

    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.post(
            f"{base_url}/api/embed",
            json={"model": model, "input": texts},
        )
        resp.raise_for_status()
        data = resp.json()

    embeddings_data = [
        {"object": "embedding", "embedding": emb, "index": i}
        for i, emb in enumerate(data.get("embeddings", []))
    ]

    # BUG FIX: usage was hard-coded to 1/1. Ollama's /api/embed reports
    # prompt_eval_count for the batch — surface that instead (0 if absent).
    prompt_tokens = data.get("prompt_eval_count", 0)
    return {
        "object": "list",
        "data": embeddings_data,
        "model": model,
        "usage": {"prompt_tokens": prompt_tokens, "total_tokens": prompt_tokens},
    }
|
||||
|
||||
|
||||
def _error_event(message: str) -> str:
|
||||
error = {
|
||||
"id": "chatcmpl-gateway",
|
||||
"object": "chat.completion.chunk",
|
||||
"model": "error",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"delta": {"content": f"[Error] {message}"},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
}
|
||||
return f"data: {json.dumps(error)}\n\ndata: [DONE]\n\n"
|
||||
83
hub-api/services/proxy_openai.py
Normal file
83
hub-api/services/proxy_openai.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""OpenAI-compatible proxy (MLX server, vLLM, etc.) — SSE passthrough."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def stream_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> AsyncGenerator[str, None]:
    """Proxy OpenAI-compatible chat streaming. SSE passthrough with model field override."""
    payload = {
        "model": model,
        "messages": messages,
        "stream": True,
        **{k: v for k, v in kwargs.items() if v is not None},
    }

    async with httpx.AsyncClient(timeout=120.0) as client:
        async with client.stream(
            "POST",
            f"{base_url}/v1/chat/completions",
            json=payload,
        ) as resp:
            if resp.status_code != 200:
                body = await resp.aread()
                error_msg = body.decode("utf-8", errors="replace")
                yield _error_event(f"Backend error ({resp.status_code}): {error_msg}")
                return

            async for line in resp.aiter_lines():
                if not line.strip():
                    continue
                # BUG FIX: the original tested startswith("data: ") first, so its
                # `elif line == "data: [DONE]"` branch was unreachable and the
                # generator kept reading after the terminator. Check the
                # sentinel first and stop once the stream is done.
                if line == "data: [DONE]":
                    yield "data: [DONE]\n\n"
                    return
                # Pass through SSE data lines as-is (already in OpenAI format).
                if line.startswith("data: "):
                    yield f"{line}\n\n"
|
||||
|
||||
|
||||
async def complete_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> dict:
    """Non-streaming OpenAI-compatible chat."""
    extra = {k: v for k, v in kwargs.items() if v is not None}
    payload = {"model": model, "messages": messages, "stream": False, **extra}

    async with httpx.AsyncClient(timeout=120.0) as client:
        # Response body is already OpenAI-shaped; forward it unchanged.
        resp = await client.post(f"{base_url}/v1/chat/completions", json=payload)
        resp.raise_for_status()
        return resp.json()
|
||||
|
||||
|
||||
def _error_event(message: str) -> str:
|
||||
error = {
|
||||
"id": "chatcmpl-gateway",
|
||||
"object": "chat.completion.chunk",
|
||||
"model": "error",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"delta": {"content": f"[Error] {message}"},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
}
|
||||
return f"data: {json.dumps(error)}\n\ndata: [DONE]\n\n"
|
||||
227
hub-api/services/registry.py
Normal file
227
hub-api/services/registry.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Backend registry: config loading, periodic health checks, model routing, rate limits."""

from __future__ import annotations

import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path

import httpx

logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ModelInfo:
    """A model exposed by a backend, as declared in the backends config."""

    id: str
    capabilities: list[str]  # e.g. ["chat"] (the config default)
    priority: int = 1  # lower number wins when several backends serve the same id
    backend_model_id: str = ""  # actual model ID sent to backend (if different from id)
|
||||
|
||||
|
||||
@dataclass
class RateLimitConfig:
    """Request-rate caps for one backend; 0 disables the corresponding limit."""

    rpm: int = 0  # max requests per minute (0 = unlimited)
    rph: int = 0  # max requests per hour (0 = unlimited)
    scope: str = "global"  # NOTE(review): only read from config; no per-scope logic here yet
|
||||
|
||||
|
||||
@dataclass
class BackendInfo:
    """A configured inference backend plus its last-observed health state."""

    id: str
    type: str  # "ollama", "openai-compat", "anthropic"
    url: str  # base URL; stored with any trailing slash stripped
    models: list[ModelInfo]
    access: str = "all"  # "all" or "owner"
    rate_limit: RateLimitConfig | None = None

    # runtime state (mutated by the registry's health checks)
    healthy: bool = False
    last_check: float = 0  # time.time() of the most recent probe
    latency_ms: float = 0  # probe round-trip in ms; reset to 0 on failure
|
||||
|
||||
|
||||
@dataclass
class RateLimitState:
    """Sliding-window request timestamps for one rate-limited backend."""

    minute_timestamps: list[float] = field(default_factory=list)  # requests within the last 60s
    hour_timestamps: list[float] = field(default_factory=list)  # requests within the last 3600s
|
||||
|
||||
|
||||
class Registry:
    """Tracks inference backends: JSON config loading, a background health-check
    loop, model-to-backend routing, and sliding-window rate limiting."""

    def __init__(self):
        self.backends: dict[str, BackendInfo] = {}
        self._health_task: asyncio.Task | None = None
        self._rate_limits: dict[str, RateLimitState] = {}

    async def load_backends(self, config_path: str):
        """Populate ``self.backends`` from a JSON config file (a list of entries).

        Missing file logs a warning and leaves the registry empty.
        """
        path = Path(config_path)
        if not path.exists():
            logger.warning("Backends config not found: %s", config_path)
            return

        data = json.loads(path.read_text())

        for entry in data:
            models = [
                ModelInfo(
                    id=m["id"],
                    capabilities=m.get("capabilities", ["chat"]),
                    priority=m.get("priority", 1),
                    backend_model_id=m.get("backend_model_id", ""),
                )
                for m in entry.get("models", [])
            ]
            rl_data = entry.get("rate_limit")
            rate_limit = (
                RateLimitConfig(
                    rpm=rl_data.get("rpm", 0),
                    rph=rl_data.get("rph", 0),
                    scope=rl_data.get("scope", "global"),
                )
                if rl_data
                else None
            )
            backend = BackendInfo(
                id=entry["id"],
                type=entry["type"],
                url=entry["url"].rstrip("/"),
                models=models,
                access=entry.get("access", "all"),
                rate_limit=rate_limit,
            )
            self.backends[backend.id] = backend
            if rate_limit:
                self._rate_limits[backend.id] = RateLimitState()

        logger.info("Loaded %d backends", len(self.backends))

    def start_health_loop(self, interval: float = 30.0):
        """Start the background health-check task (requires a running event loop)."""
        self._health_task = asyncio.create_task(self._health_loop(interval))

    def stop_health_loop(self):
        """Cancel the background health-check task, if one is running."""
        if self._health_task:
            self._health_task.cancel()
            # BUG FIX: drop the cancelled task so a later start works cleanly
            # and we don't hold a dead Task reference.
            self._health_task = None

    async def _health_loop(self, interval: float):
        # Probe immediately on startup, then every `interval` seconds.
        while True:
            await self._check_all_backends()
            await asyncio.sleep(interval)

    async def _check_all_backends(self):
        """Probe every backend concurrently with a shared short-timeout client."""
        async with httpx.AsyncClient(timeout=5.0) as client:
            checks = [
                self._check_backend(client, backend)
                for backend in self.backends.values()
            ]
            # return_exceptions so one failing backend never aborts the sweep.
            await asyncio.gather(*checks, return_exceptions=True)

    async def _check_backend(self, client: httpx.AsyncClient, backend: BackendInfo):
        """Probe one backend's well-known endpoint and record health + latency."""
        try:
            start = time.monotonic()
            if backend.type == "ollama":
                resp = await client.get(f"{backend.url}/api/tags")
            elif backend.type in ("openai-compat", "anthropic"):
                resp = await client.get(f"{backend.url}/v1/models")
            else:
                resp = await client.get(f"{backend.url}/health")
            elapsed = (time.monotonic() - start) * 1000

            # 4xx still means "reachable"; only 5xx or transport errors mark it down.
            backend.healthy = resp.status_code < 500
            backend.latency_ms = round(elapsed, 1)
            backend.last_check = time.time()
        except Exception:
            backend.healthy = False
            backend.latency_ms = 0
            backend.last_check = time.time()
            logger.debug("Health check failed for %s", backend.id)

    def resolve_model(self, model_id: str, role: str) -> tuple[BackendInfo, ModelInfo] | None:
        """Find the best backend for a given model ID. Returns (backend, model) or None.

        Only healthy backends the caller's role may access are considered;
        the candidate with the lowest priority number wins (ties: first seen).
        """
        candidates: list[tuple[BackendInfo, ModelInfo]] = []
        for backend in self.backends.values():
            if not backend.healthy:
                continue
            if backend.access == "owner" and role != "owner":
                continue
            candidates.extend(
                (backend, model) for model in backend.models if model.id == model_id
            )

        if not candidates:
            return None
        # min() replaces the original build-then-sort: same stable result, O(n).
        return min(candidates, key=lambda pair: pair[1].priority)

    def list_models(self, role: str) -> list[dict]:
        """List all available models for a given role (healthy backends only)."""
        result = []
        for backend in self.backends.values():
            if not backend.healthy:
                continue
            if backend.access == "owner" and role != "owner":
                continue
            for model in backend.models:
                result.append({
                    "id": model.id,
                    "object": "model",
                    "owned_by": backend.id,
                    "capabilities": model.capabilities,
                    "backend_id": backend.id,
                    # Unhealthy backends are filtered above, so this is always
                    # "healthy" — the original's "down" arm was unreachable.
                    "backend_status": "healthy",
                })
        return result

    def check_rate_limit(self, backend_id: str) -> bool:
        """Check if a request to this backend is within rate limits. Returns True if allowed."""
        backend = self.backends.get(backend_id)
        if not backend or not backend.rate_limit:
            return True

        state = self._rate_limits.get(backend_id)
        if not state:
            return True

        now = time.time()
        rl = backend.rate_limit

        # Prune timestamps that fell out of each window, then compare counts.
        if rl.rpm > 0:
            state.minute_timestamps = [t for t in state.minute_timestamps if now - t < 60]
            if len(state.minute_timestamps) >= rl.rpm:
                return False

        if rl.rph > 0:
            state.hour_timestamps = [t for t in state.hour_timestamps if now - t < 3600]
            if len(state.hour_timestamps) >= rl.rph:
                return False

        return True

    def record_request(self, backend_id: str):
        """Record a request timestamp for rate limiting (no-op if not rate limited)."""
        state = self._rate_limits.get(backend_id)
        if not state:
            return
        now = time.time()
        state.minute_timestamps.append(now)
        state.hour_timestamps.append(now)

    def get_health_summary(self) -> list[dict]:
        """Snapshot of every backend's health state for status endpoints."""
        return [
            {
                "id": b.id,
                "type": b.type,
                "status": "healthy" if b.healthy else "down",
                "models": [m.id for m in b.models],
                "latency_ms": b.latency_ms,
                "last_check": b.last_check,
            }
            for b in self.backends.values()
        ]
|
||||
|
||||
|
||||
# Module-level singleton shared by the application's routers.
registry = Registry()
|
||||
Reference in New Issue
Block a user