"""Service health checks with per-service validators.""" from __future__ import annotations import json import time from datetime import datetime, timezone from ..config import HOSTS from ..schemas import HealthResult from .ssh import run_command, SSHError def _now() -> str: return datetime.now(timezone.utc).isoformat() async def _validate_document_server() -> HealthResult: """Document Server: /health endpoint must return ok + database connected.""" cfg = HOSTS["gpu"] try: t0 = time.monotonic() stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health") latency_ms = int((time.monotonic() - t0) * 1000) data = json.loads(stdout) db_ok = data.get("database") == "connected" status_ok = data.get("status") == "ok" warnings = [] if not db_ok: warnings.append("database disconnected") return HealthResult( ok=status_ok and db_ok, checked_at=_now(), service="document-server", status="healthy" if (status_ok and db_ok) else "degraded", details={ "status": data.get("status"), "database": data.get("database"), "version": data.get("version"), "latency_ms": latency_ms, }, warnings=warnings, raw=stdout.strip(), ) except SSHError as e: return HealthResult( ok=False, checked_at=_now(), service="document-server", status="down", error_type=e.error_type, error=str(e), ) except (json.JSONDecodeError, KeyError) as e: return HealthResult( ok=False, checked_at=_now(), service="document-server", status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}", raw=stdout.strip() if 'stdout' in dir() else None, ) async def _validate_mlx() -> HealthResult: """MLX Server: /v1/models must return at least 1 model within 5s.""" cfg = HOSTS["macmini"] try: t0 = time.monotonic() stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models") latency_ms = int((time.monotonic() - t0) * 1000) data = json.loads(stdout) models = data.get("data", []) model_ids = [m.get("id", "unknown") for m in models] warnings = [] if latency_ms > 5000: warnings.append(f"응답 지연 {latency_ms}ms (임계값 5000ms)") return HealthResult( ok=len(models) > 0 and latency_ms <= 5000, checked_at=_now(), service="mlx", status="healthy" if (len(models) > 0 and latency_ms <= 5000) else "degraded", details={ "model_count": len(models), "models": model_ids, "latency_ms": latency_ms, }, warnings=warnings, raw=stdout.strip(), ) except SSHError as e: return HealthResult( ok=False, checked_at=_now(), service="mlx", status="down", error_type=e.error_type, error=str(e), ) async def _validate_mlx_proxy() -> HealthResult: """MLX Proxy (:8801): must return models via proxy.""" cfg = HOSTS["macmini"] try: t0 = time.monotonic() stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models") latency_ms = int((time.monotonic() - t0) * 1000) data = json.loads(stdout) models = data.get("data", []) return HealthResult( ok=len(models) > 0, checked_at=_now(), service="mlx-proxy", status="healthy" if models else "down", details={"model_count": len(models), "latency_ms": latency_ms}, raw=stdout.strip(), ) except SSHError as e: return HealthResult( ok=False, checked_at=_now(), service="mlx-proxy", status="down", error_type=e.error_type, error=str(e), ) async def _validate_nanoclaude() -> HealthResult: """NanoClaude: /health on port 8100.""" cfg = HOSTS["gpu"] try: t0 = time.monotonic() stdout, _ = await run_command(cfg, "curl -sf http://localhost:8100/health") latency_ms = int((time.monotonic() - t0) * 1000) return HealthResult( ok=True, checked_at=_now(), service="nanoclaude", status="healthy", details={"latency_ms": latency_ms, "response": stdout.strip()[:200]}, raw=stdout.strip(), ) except SSHError as e: return HealthResult( ok=False, checked_at=_now(), service="nanoclaude", status="down", error_type=e.error_type, error=str(e), ) async def _validate_ollama(host: str) -> HealthResult: """Ollama: `ollama list` must succeed and return non-empty.""" service_name = f"ollama-{host}" if host != "gpu" else "ollama-gpu" cfg = HOSTS[host] try: t0 = time.monotonic() stdout, _ = await run_command(cfg, "ollama list") latency_ms = int((time.monotonic() - t0) * 1000) lines = [l for l in stdout.strip().splitlines()[1:] if l.strip()] # skip header model_count = len(lines) warnings = [] if model_count == 0: warnings.append("모델 없음") return HealthResult( ok=model_count > 0, checked_at=_now(), service=service_name, status="healthy" if model_count > 0 else "degraded", details={"model_count": model_count, "latency_ms": latency_ms}, warnings=warnings, raw=stdout.strip(), ) except SSHError as e: return HealthResult( ok=False, checked_at=_now(), service=service_name, status="down", error_type=e.error_type, error=str(e), ) # Validator registry VALIDATORS: dict[str, object] = { "document-server": _validate_document_server, "mlx": _validate_mlx, "mlx-proxy": _validate_mlx_proxy, "nanoclaude": _validate_nanoclaude, "ollama-gpu": lambda: _validate_ollama("gpu"), "ollama-macmini": lambda: _validate_ollama("macmini"), } VALID_SERVICES = list(VALIDATORS.keys()) async def service_health(service: str) -> HealthResult: """Run health check for a specific service.""" validator = VALIDATORS.get(service) if not validator: return HealthResult( ok=False, checked_at=_now(), service=service, status="unknown", error_type="parse_error", error=f"알 수 없는 서비스: '{service}'. 허용: {', '.join(VALID_SERVICES)}", ) return await validator()