feat: home-gateway 초기 구성 — Mac mini에서 GPU 서버로 전면 이전

OrbStack 라이선스 만료로 Mac mini Docker 서비스를 GPU 서버로 통합.
nginx → Caddy 전환, 12개 서브도메인 자동 HTTPS, fail2ban Caddy JSON 연동.

주요 변경:
- home-caddy: Caddy 리버스 프록시 (Let's Encrypt 자동 HTTPS)
- home-fail2ban: Caddy JSON 로그 기반 보안 모니터링
- home-ddns: Cloudflare DDNS (API 키 .env 분리)
- gpu-hub-api/web: AI 백엔드 라우터 + 웹 UI (gpu-services에서 이전)
- AI 런타임(Ollama) 내부망 전용, 외부는 gpu-hub 인증 게이트웨이 경유

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-05 04:55:28 +00:00
commit 79c09cede4
52 changed files with 6847 additions and 0 deletions

View File

View File

@@ -0,0 +1,41 @@
from __future__ import annotations
import asyncio
import logging
from config import settings
logger = logging.getLogger(__name__)
async def get_gpu_info() -> dict | None:
    """Query the first GPU via nvidia-smi.

    Returns a dict with keys ``utilization``, ``temperature``,
    ``vram_used`` / ``vram_total``, ``power_draw`` and ``name``, or None
    when nvidia-smi is missing, times out, exits non-zero, or emits
    output that cannot be parsed (e.g. "[N/A]" fields on some GPUs).
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            settings.nvidia_smi_path,
            "--query-gpu=utilization.gpu,temperature.gpu,memory.used,memory.total,power.draw,name",
            "--format=csv,noheader,nounits",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=5.0)
        except asyncio.TimeoutError:
            # wait_for cancels communicate() but the child keeps running;
            # kill it so a hung nvidia-smi doesn't leak a process.
            proc.kill()
            await proc.wait()
            return None
        if proc.returncode != 0:
            logger.debug("nvidia-smi failed: %s", stderr.decode())
            return None
        # Only the first GPU's CSV row is reported.
        line = stdout.decode().strip().split("\n")[0]
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 6:
            return None
        return {
            "utilization": int(parts[0]),
            "temperature": int(parts[1]),
            "vram_used": int(parts[2]),
            "vram_total": int(parts[3]),
            "power_draw": float(parts[4]),
            "name": parts[5],
        }
    except (FileNotFoundError, ValueError):
        # FileNotFoundError: nvidia-smi not at the configured path.
        # ValueError: a numeric field was "[N/A]" or otherwise unparseable.
        return None

View File

@@ -0,0 +1,156 @@
from __future__ import annotations
import json
import logging
from collections.abc import AsyncGenerator
import httpx
logger = logging.getLogger(__name__)
async def stream_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> AsyncGenerator[str, None]:
    """Proxy Ollama chat streaming, converting NDJSON to OpenAI SSE format.

    Yields OpenAI-style ``data: {...}`` chunks and always terminates the
    stream with ``data: [DONE]``. Backend/transport failures are surfaced
    as an SSE error chunk instead of raising through the response.
    """
    payload = {
        "model": model,
        "messages": messages,
        "stream": True,
        # None values would override backend defaults; drop them.
        **{k: v for k, v in kwargs.items() if v is not None},
    }
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{base_url}/api/chat",
                json=payload,
            ) as resp:
                if resp.status_code != 200:
                    body = await resp.aread()
                    error_msg = body.decode("utf-8", errors="replace")
                    yield _error_event(f"Ollama error: {error_msg}")
                    return
                async for line in resp.aiter_lines():
                    if not line.strip():
                        continue
                    try:
                        chunk = json.loads(line)
                    except json.JSONDecodeError:
                        # Tolerate partial/garbage NDJSON lines.
                        continue
                    if chunk.get("done"):
                        # Final chunk — send [DONE]
                        yield "data: [DONE]\n\n"
                        return
                    content = chunk.get("message", {}).get("content", "")
                    if content:
                        openai_chunk = {
                            "id": "chatcmpl-gateway",
                            "object": "chat.completion.chunk",
                            "model": model,
                            "choices": [
                                {
                                    "index": 0,
                                    "delta": {"content": content},
                                    "finish_reason": None,
                                }
                            ],
                        }
                        yield f"data: {json.dumps(openai_chunk)}\n\n"
                # Stream ended without a done chunk (backend dropped the
                # connection) — still close the SSE stream properly.
                yield "data: [DONE]\n\n"
    except httpx.HTTPError as exc:
        # Connection refused/reset mid-stream: report as SSE instead of
        # blowing up the in-flight response.
        yield _error_event(f"Ollama unreachable: {exc}")
async def complete_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> dict:
    """Non-streaming Ollama chat, returns OpenAI-compatible response."""
    # Drop None-valued options so backend defaults are not overridden.
    options = {k: v for k, v in kwargs.items() if v is not None}
    payload = {"model": model, "messages": messages, "stream": False, **options}
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(f"{base_url}/api/chat", json=payload)
        resp.raise_for_status()
        data = resp.json()
    prompt_tokens = data.get("prompt_eval_count", 0)
    completion_tokens = data.get("eval_count", 0)
    answer = data.get("message", {}).get("content", "")
    return {
        "id": "chatcmpl-gateway",
        "object": "chat.completion",
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": answer},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
async def generate_embedding(
    base_url: str,
    model: str,
    input_text: str | list[str],
) -> dict:
    """Ollama embedding, returns OpenAI-compatible response.

    Accepts a single string or a list of strings; always sends a list to
    the backend. Usage is reported from Ollama's ``prompt_eval_count``
    (falls back to 0 if the backend omits it).
    """
    texts = [input_text] if isinstance(input_text, str) else input_text
    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.post(
            f"{base_url}/api/embed",
            json={"model": model, "input": texts},
        )
        resp.raise_for_status()
        data = resp.json()
    embeddings_data = [
        {"object": "embedding", "embedding": emb, "index": i}
        for i, emb in enumerate(data.get("embeddings", []))
    ]
    # Previously hard-coded to 1; use the backend-reported token count.
    token_count = data.get("prompt_eval_count", 0)
    return {
        "object": "list",
        "data": embeddings_data,
        "model": model,
        "usage": {"prompt_tokens": token_count, "total_tokens": token_count},
    }
def _error_event(message: str) -> str:
    """Render *message* as one OpenAI-style SSE error chunk followed by [DONE]."""
    delta = {"content": f"[Error] {message}"}
    chunk = {
        "id": "chatcmpl-gateway",
        "object": "chat.completion.chunk",
        "model": "error",
        "choices": [{"index": 0, "delta": delta, "finish_reason": "stop"}],
    }
    return "data: " + json.dumps(chunk) + "\n\ndata: [DONE]\n\n"

View File

@@ -0,0 +1,83 @@
"""OpenAI-compatible proxy (MLX server, vLLM, etc.) — SSE passthrough."""
from __future__ import annotations
import json
import logging
from collections.abc import AsyncGenerator
import httpx
logger = logging.getLogger(__name__)
async def stream_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> AsyncGenerator[str, None]:
    """Proxy OpenAI-compatible chat streaming as an SSE passthrough.

    Data lines are forwarded unchanged (already OpenAI format); the
    generator stops after ``data: [DONE]``. Transport failures surface as
    an SSE error chunk instead of raising through the response.
    """
    payload = {
        "model": model,
        "messages": messages,
        "stream": True,
        # None values would override backend defaults; drop them.
        **{k: v for k, v in kwargs.items() if v is not None},
    }
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{base_url}/v1/chat/completions",
                json=payload,
            ) as resp:
                if resp.status_code != 200:
                    body = await resp.aread()
                    error_msg = body.decode("utf-8", errors="replace")
                    yield _error_event(f"Backend error ({resp.status_code}): {error_msg}")
                    return
                async for line in resp.aiter_lines():
                    if not line.strip():
                        continue
                    # Pass through SSE data lines as-is, restoring the
                    # blank-line event terminator aiter_lines() strips.
                    # (The previous `elif line == "data: [DONE]"` branch was
                    # unreachable: it was shadowed by this prefix check.)
                    if line.startswith("data: "):
                        yield f"{line}\n\n"
                        if line == "data: [DONE]":
                            return
    except httpx.HTTPError as exc:
        yield _error_event(f"Backend unreachable: {exc}")
async def complete_chat(
    base_url: str,
    model: str,
    messages: list[dict],
    **kwargs,
) -> dict:
    """Non-streaming OpenAI-compatible chat."""
    # Drop None-valued options so backend defaults are not overridden.
    extra = {k: v for k, v in kwargs.items() if v is not None}
    body = {"model": model, "messages": messages, "stream": False}
    body.update(extra)
    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(f"{base_url}/v1/chat/completions", json=body)
        response.raise_for_status()
        return response.json()
def _error_event(message: str) -> str:
    """Format *message* as a single SSE error chunk plus the [DONE] marker."""
    payload = {
        "id": "chatcmpl-gateway",
        "object": "chat.completion.chunk",
        "model": "error",
        "choices": [
            {
                "index": 0,
                "delta": {"content": "[Error] " + message},
                "finish_reason": "stop",
            }
        ],
    }
    encoded = json.dumps(payload)
    return f"data: {encoded}\n\ndata: [DONE]\n\n"

View File

@@ -0,0 +1,227 @@
from __future__ import annotations
import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
import httpx
logger = logging.getLogger(__name__)
@dataclass
class ModelInfo:
    """One model exposed by a backend, as declared in the backends config."""

    id: str  # public model ID clients request
    capabilities: list[str]  # e.g. ["chat"]; config default per load_backends
    priority: int = 1  # lower value wins when several backends serve the same id
    backend_model_id: str = ""  # actual model ID sent to backend (if different from id)
@dataclass
class RateLimitConfig:
    """Per-backend request rate limits; 0 disables the corresponding limit."""

    rpm: int = 0  # max requests per rolling 60-second window
    rph: int = 0  # max requests per rolling 3600-second window
    scope: str = "global"  # NOTE(review): only a global scope is visibly enforced — confirm
@dataclass
class BackendInfo:
    """Static config plus runtime health state for one inference backend."""

    id: str
    type: str  # "ollama", "openai-compat", "anthropic"
    url: str  # base URL, stored without trailing slash (see load_backends)
    models: list[ModelInfo]
    access: str = "all"  # "all" or "owner"
    rate_limit: RateLimitConfig | None = None
    # runtime state (updated by the health-check loop)
    healthy: bool = False
    last_check: float = 0  # epoch seconds of the last health probe
    latency_ms: float = 0  # last probe round-trip; 0 when the probe failed
@dataclass
class RateLimitState:
    """Sliding-window request timestamps for one backend's rate limiting."""

    minute_timestamps: list[float] = field(default_factory=list)  # last-60s window
    hour_timestamps: list[float] = field(default_factory=list)  # last-3600s window
class Registry:
    """Backend/model registry with periodic health checks and rate limiting."""

    def __init__(self):
        # backend id -> config + runtime health state
        self.backends: dict[str, BackendInfo] = {}
        self._health_task: asyncio.Task | None = None
        # backend id -> request timestamps (only for rate-limited backends)
        self._rate_limits: dict[str, RateLimitState] = {}

    async def load_backends(self, config_path: str):
        """Load backend definitions from a JSON config file.

        A missing file is tolerated (warning logged) so the gateway can
        start with an empty registry.
        """
        path = Path(config_path)
        if not path.exists():
            logger.warning("Backends config not found: %s", config_path)
            return
        # Explicit encoding: the config may contain non-ASCII model names.
        data = json.loads(path.read_text(encoding="utf-8"))
        for entry in data:
            models = [
                ModelInfo(
                    id=m["id"],
                    capabilities=m.get("capabilities", ["chat"]),
                    priority=m.get("priority", 1),
                    backend_model_id=m.get("backend_model_id", ""),
                )
                for m in entry.get("models", [])
            ]
            rl_data = entry.get("rate_limit")
            rate_limit = (
                RateLimitConfig(
                    rpm=rl_data.get("rpm", 0),
                    rph=rl_data.get("rph", 0),
                    scope=rl_data.get("scope", "global"),
                )
                if rl_data
                else None
            )
            backend = BackendInfo(
                id=entry["id"],
                type=entry["type"],
                url=entry["url"].rstrip("/"),
                models=models,
                access=entry.get("access", "all"),
                rate_limit=rate_limit,
            )
            self.backends[backend.id] = backend
            if rate_limit:
                self._rate_limits[backend.id] = RateLimitState()
        logger.info("Loaded %d backends", len(self.backends))

    def start_health_loop(self, interval: float = 30.0):
        """Start the background health-check task (every *interval* seconds)."""
        self._health_task = asyncio.create_task(self._health_loop(interval))

    def stop_health_loop(self):
        """Cancel the background health-check task, if running."""
        if self._health_task:
            self._health_task.cancel()
            # Drop the reference so a later start_health_loop() is clean.
            self._health_task = None

    async def _health_loop(self, interval: float):
        while True:
            try:
                await self._check_all_backends()
            except Exception:
                # Never let an unexpected error silently kill the background
                # task; cancellation still propagates normally.
                logger.exception("Health check sweep failed")
            await asyncio.sleep(interval)

    async def _check_all_backends(self):
        # One shared client per sweep; per-backend failures are isolated
        # by return_exceptions=True.
        async with httpx.AsyncClient(timeout=5.0) as client:
            tasks = [
                self._check_backend(client, backend)
                for backend in self.backends.values()
            ]
            await asyncio.gather(*tasks, return_exceptions=True)

    async def _check_backend(self, client: httpx.AsyncClient, backend: BackendInfo):
        """Probe one backend's lightweight listing endpoint and record state."""
        try:
            start = time.monotonic()
            if backend.type == "ollama":
                resp = await client.get(f"{backend.url}/api/tags")
            elif backend.type in ("openai-compat", "anthropic"):
                resp = await client.get(f"{backend.url}/v1/models")
            else:
                resp = await client.get(f"{backend.url}/health")
            elapsed = (time.monotonic() - start) * 1000
            # 4xx still proves the server is up; only 5xx marks it down.
            backend.healthy = resp.status_code < 500
            backend.latency_ms = round(elapsed, 1)
            backend.last_check = time.time()
        except Exception:
            backend.healthy = False
            backend.latency_ms = 0
            backend.last_check = time.time()
            logger.debug("Health check failed for %s", backend.id)

    def resolve_model(self, model_id: str, role: str) -> tuple[BackendInfo, ModelInfo] | None:
        """Find the best backend for a given model ID.

        Only healthy backends visible to *role* are considered; among
        candidates the lowest priority value wins. Returns (backend, model)
        or None.
        """
        candidates: list[tuple[BackendInfo, ModelInfo, int]] = []
        for backend in self.backends.values():
            if not backend.healthy:
                continue
            if backend.access == "owner" and role != "owner":
                continue
            for model in backend.models:
                if model.id == model_id:
                    candidates.append((backend, model, model.priority))
        if not candidates:
            return None
        candidates.sort(key=lambda c: c[2])
        best = candidates[0]
        return best[0], best[1]

    def list_models(self, role: str) -> list[dict]:
        """List all healthy, role-visible models in OpenAI /v1/models shape."""
        result = []
        for backend in self.backends.values():
            if not backend.healthy:
                continue
            if backend.access == "owner" and role != "owner":
                continue
            for model in backend.models:
                result.append({
                    "id": model.id,
                    "object": "model",
                    "owned_by": backend.id,
                    "capabilities": model.capabilities,
                    "backend_id": backend.id,
                    # Unhealthy backends are filtered above, so this is
                    # always "healthy" for listed entries.
                    "backend_status": "healthy",
                })
        return result

    def check_rate_limit(self, backend_id: str) -> bool:
        """Return True if a request to this backend is within rate limits.

        Prunes expired timestamps as a side effect. Callers must also call
        record_request() once the request actually proceeds.
        """
        backend = self.backends.get(backend_id)
        if not backend or not backend.rate_limit:
            return True
        state = self._rate_limits.get(backend_id)
        if not state:
            return True
        now = time.time()
        rl = backend.rate_limit
        # Clean old timestamps before counting against each window.
        if rl.rpm > 0:
            state.minute_timestamps = [t for t in state.minute_timestamps if now - t < 60]
            if len(state.minute_timestamps) >= rl.rpm:
                return False
        if rl.rph > 0:
            state.hour_timestamps = [t for t in state.hour_timestamps if now - t < 3600]
            if len(state.hour_timestamps) >= rl.rph:
                return False
        return True

    def record_request(self, backend_id: str):
        """Record a request timestamp for rate limiting (no-op if unlimited)."""
        state = self._rate_limits.get(backend_id)
        if not state:
            return
        now = time.time()
        state.minute_timestamps.append(now)
        state.hour_timestamps.append(now)

    def get_health_summary(self) -> list[dict]:
        """Snapshot of every configured backend's health state."""
        return [
            {
                "id": b.id,
                "type": b.type,
                "status": "healthy" if b.healthy else "down",
                "models": [m.id for m in b.models],
                "latency_ms": b.latency_ms,
                "last_check": b.last_check,
            }
            for b in self.backends.values()
        ]
# Module-level singleton instance shared across the application.
registry = Registry()