b1f9e87d6a
mcp-infra-server를 gpu-services/infra/로 통합. core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능. 도구: docker_status, docker_logs, service_health, disk_usage, tailscale_status, ollama_models, mlx_models. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
202 lines
6.6 KiB
Python
202 lines
6.6 KiB
Python
"""Service health checks with per-service validators."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
from ..config import HOSTS
|
|
from ..schemas import HealthResult
|
|
from .ssh import run_command, SSHError
|
|
|
|
|
|
def _now() -> str:
    """Return the current UTC time as an ISO 8601 formatted string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
|
|
|
|
async def _validate_document_server() -> HealthResult:
    """Check the Document Server through its ``/health`` endpoint.

    Runs ``curl -sf http://localhost:8000/health`` on the ``gpu`` host via
    ``run_command`` and parses the body as JSON. The service is healthy only
    when the payload reports ``status == "ok"`` and
    ``database == "connected"``.

    Returns:
        A ``HealthResult`` whose ``status`` is

        * ``"healthy"`` / ``"degraded"`` when a JSON object was received,
        * ``"down"`` with the ``SSHError``'s ``error_type`` when the remote
          command failed,
        * ``"down"`` with ``error_type="parse_error"`` when the body is not
          valid JSON or is not a JSON object.
    """
    cfg = HOSTS["gpu"]

    # Only the remote call can raise SSHError, so keep the try body minimal.
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type=e.error_type, error=str(e),
        )

    raw = stdout.strip()

    # A syntactically valid but non-object payload (e.g. `[]` or `"ok"`) has
    # no `.get`; report it as a parse error instead of letting an
    # AttributeError escape the validator.
    parse_error = None
    data = None
    try:
        data = json.loads(stdout)
    except json.JSONDecodeError as e:
        parse_error = str(e)
    else:
        if not isinstance(data, dict):
            parse_error = f"JSON object expected, got {type(data).__name__}"

    if parse_error is not None:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type="parse_error",
            error=f"응답 파싱 실패: {parse_error}",
            raw=raw,
        )

    db_ok = data.get("database") == "connected"
    status_ok = data.get("status") == "ok"
    healthy = status_ok and db_ok

    warnings = []
    if not db_ok:
        warnings.append("database disconnected")

    return HealthResult(
        ok=healthy,
        checked_at=_now(),
        service="document-server",
        status="healthy" if healthy else "degraded",
        details={
            "status": data.get("status"),
            "database": data.get("database"),
            "version": data.get("version"),
            "latency_ms": latency_ms,
        },
        warnings=warnings,
        raw=raw,
    )
|
|
|
|
|
|
async def _validate_mlx() -> HealthResult:
    """Check the MLX server through ``/v1/models`` on port 8800.

    Runs ``curl -sf http://localhost:8800/v1/models`` on the ``macmini`` host
    via ``run_command``. The service is healthy when the JSON payload's
    ``"data"`` list holds at least one model and the round trip took no more
    than 5000 ms.

    Returns:
        A ``HealthResult`` whose ``status`` is

        * ``"healthy"`` / ``"degraded"`` when a well-formed payload was
          received (``details`` carries model count, model ids and latency),
        * ``"down"`` with the ``SSHError``'s ``error_type`` when the remote
          command failed,
        * ``"down"`` with ``error_type="parse_error"`` when the body is not
          valid JSON, is not a JSON object, or its ``"data"`` is not a list.
    """
    threshold_ms = 5000  # slower responses are reported as degraded
    cfg = HOSTS["macmini"]

    # Only the remote call can raise SSHError, so keep the try body minimal.
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx",
            status="down", error_type=e.error_type, error=str(e),
        )

    raw = stdout.strip()

    # Malformed responses are reported as a structured parse error rather
    # than escaping as JSONDecodeError / AttributeError / TypeError.
    parse_error = None
    models = []
    try:
        data = json.loads(stdout)
    except json.JSONDecodeError as e:
        parse_error = str(e)
    else:
        if not isinstance(data, dict):
            parse_error = f"JSON object expected, got {type(data).__name__}"
        else:
            models = data.get("data", [])
            if not isinstance(models, list):
                parse_error = f"'data' must be a list, got {type(models).__name__}"

    if parse_error is not None:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx",
            status="down", error_type="parse_error",
            error=f"응답 파싱 실패: {parse_error}",
            raw=raw,
        )

    # Entries that are not JSON objects have no id to report.
    model_ids = [
        m.get("id", "unknown") if isinstance(m, dict) else "unknown"
        for m in models
    ]

    warnings = []
    if latency_ms > threshold_ms:
        warnings.append(f"응답 지연 {latency_ms}ms (임계값 {threshold_ms}ms)")

    healthy = len(models) > 0 and latency_ms <= threshold_ms

    return HealthResult(
        ok=healthy,
        checked_at=_now(),
        service="mlx",
        status="healthy" if healthy else "degraded",
        details={
            "model_count": len(models),
            "models": model_ids,
            "latency_ms": latency_ms,
        },
        warnings=warnings,
        raw=raw,
    )
|
|
|
|
|
|
async def _validate_mlx_proxy() -> HealthResult:
    """Check the MLX proxy through ``/v1/models`` on port 8801.

    Runs ``curl -sf http://localhost:8801/v1/models`` on the ``macmini`` host
    via ``run_command``. The proxy is healthy when the JSON payload's
    ``"data"`` list holds at least one model.

    Returns:
        A ``HealthResult`` whose ``status`` is

        * ``"healthy"`` when at least one model is listed, ``"down"`` when
          the list is empty,
        * ``"down"`` with the ``SSHError``'s ``error_type`` when the remote
          command failed,
        * ``"down"`` with ``error_type="parse_error"`` when the body is not
          valid JSON, is not a JSON object, or its ``"data"`` is not a list.
    """
    cfg = HOSTS["macmini"]

    # Only the remote call can raise SSHError, so keep the try body minimal.
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type=e.error_type, error=str(e),
        )

    raw = stdout.strip()

    # Malformed responses are reported as a structured parse error rather
    # than escaping as JSONDecodeError / AttributeError / TypeError.
    parse_error = None
    models = []
    try:
        data = json.loads(stdout)
    except json.JSONDecodeError as e:
        parse_error = str(e)
    else:
        if not isinstance(data, dict):
            parse_error = f"JSON object expected, got {type(data).__name__}"
        else:
            models = data.get("data", [])
            if not isinstance(models, list):
                parse_error = f"'data' must be a list, got {type(models).__name__}"

    if parse_error is not None:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type="parse_error",
            error=f"응답 파싱 실패: {parse_error}",
            raw=raw,
        )

    return HealthResult(
        ok=len(models) > 0,
        checked_at=_now(),
        service="mlx-proxy",
        status="healthy" if models else "down",
        details={"model_count": len(models), "latency_ms": latency_ms},
        raw=raw,
    )
|
|
|
|
|
|
async def _validate_nanoclaude() -> HealthResult:
    """Check NanoClaude through ``/health`` on port 8100 of the ``gpu`` host.

    Any successful ``curl -sf`` run counts as healthy; the body is not
    parsed, only echoed back (first 200 characters in ``details``, the full
    stripped text in ``raw``). An ``SSHError`` from ``run_command`` yields a
    ``"down"`` result carrying the error's type and message.
    """
    host_cfg = HOSTS["gpu"]
    try:
        started = time.monotonic()
        stdout, _ = await run_command(host_cfg, "curl -sf http://localhost:8100/health")
        elapsed_ms = int((time.monotonic() - started) * 1000)
    except SSHError as exc:
        return HealthResult(
            ok=False,
            checked_at=_now(),
            service="nanoclaude",
            status="down",
            error_type=exc.error_type,
            error=str(exc),
        )

    body = stdout.strip()
    return HealthResult(
        ok=True,
        checked_at=_now(),
        service="nanoclaude",
        status="healthy",
        details={"latency_ms": elapsed_ms, "response": body[:200]},
        raw=body,
    )
|
|
|
|
|
|
async def _validate_ollama(host: str) -> HealthResult:
    """Check Ollama on ``host`` by running ``ollama list``.

    Args:
        host: Key into ``HOSTS`` selecting where the command runs; the
            reported service name is ``"ollama-<host>"``.

    Returns:
        A ``HealthResult`` that is ``"healthy"`` when the listing has at
        least one non-blank line after the first (header) line,
        ``"degraded"`` with a warning when it has none, and ``"down"`` when
        ``run_command`` raises ``SSHError``.
    """
    service_name = f"ollama-{host}"
    cfg = HOSTS[host]
    try:
        started = time.monotonic()
        stdout, _ = await run_command(cfg, "ollama list")
        latency_ms = int((time.monotonic() - started) * 1000)
    except SSHError as exc:
        return HealthResult(
            ok=False,
            checked_at=_now(),
            service=service_name,
            status="down",
            error_type=exc.error_type,
            error=str(exc),
        )

    listing = stdout.strip()
    # The first line is the column header; every other non-blank row is a model.
    model_rows = [row for row in listing.splitlines()[1:] if row.strip()]
    model_count = len(model_rows)
    has_models = model_count > 0

    warnings = [] if has_models else ["모델 없음"]

    return HealthResult(
        ok=has_models,
        checked_at=_now(),
        service=service_name,
        status="healthy" if has_models else "degraded",
        details={"model_count": model_count, "latency_ms": latency_ms},
        warnings=warnings,
        raw=listing,
    )
|
|
|
|
|
|
# Validator registry: maps each public service name to a zero-argument
# callable whose result is awaited to obtain that service's HealthResult.
VALIDATORS: dict[str, object] = {
    "document-server": _validate_document_server,
    "mlx": _validate_mlx,
    "mlx-proxy": _validate_mlx_proxy,
    "nanoclaude": _validate_nanoclaude,
    # _validate_ollama takes a host key, so bind it behind a lambda.
    "ollama-gpu": lambda: _validate_ollama("gpu"),
    "ollama-macmini": lambda: _validate_ollama("macmini"),
}

# Service names accepted by service_health(), in registry order.
VALID_SERVICES = list(VALIDATORS)
|
|
|
|
|
|
async def service_health(service: str) -> HealthResult:
    """Run the registered health check for ``service``.

    Args:
        service: A key of ``VALIDATORS``.

    Returns:
        The validator's ``HealthResult``. For an unregistered name, a result
        with ``status="unknown"`` whose error message lists the accepted
        service names.
    """
    validator = VALIDATORS.get(service)
    if validator:
        return await validator()

    allowed = ", ".join(VALID_SERVICES)
    return HealthResult(
        ok=False,
        checked_at=_now(),
        service=service,
        status="unknown",
        error_type="parse_error",
        error=f"알 수 없는 서비스: '{service}'. 허용: {allowed}",
    )
|