feat(infra): MCP 인프라 서버 통합 — 7개 도구 + core/ 분리

mcp-infra-server를 gpu-services/infra/로 통합. core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능. 도구: docker_status, docker_logs, service_health, disk_usage, tailscale_status, ollama_models, mlx_models. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 13:11:54 +09:00
parent 6b36063010
commit b1f9e87d6a
14 changed files with 996 additions and 0 deletions
@@ -0,0 +1,113 @@
+"""Docker status and logs tools."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+from ..config import validate_host, HOSTS
+from ..schemas import DockerStatusResult, DockerLogsResult, ContainerInfo
+from .ssh import run_command, SSHError
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+async def docker_status(host: str) -> DockerStatusResult:
+    """List all Docker containers on a host with structured status."""
+    try:
+        cfg = validate_host("docker_status", host)
+    except ValueError as e:
+        return DockerStatusResult(
+            ok=False, checked_at=_now(), host=host,
+            error_type="parse_error", error=str(e),
+        )
+
+    docker = cfg.docker_path
+    fmt = '{{.Names}}|{{.Status}}|{{.Ports}}|{{.Image}}'
+    cmd = f"{docker} ps -a --format '{fmt}'"
+
+    try:
+        stdout, _ = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo)
+    except SSHError as e:
+        return DockerStatusResult(
+            ok=False, checked_at=_now(), host=host,
+            error_type=e.error_type, error=str(e),
+        )
+
+    containers: list[ContainerInfo] = []
+    for line in stdout.strip().splitlines():
+        parts = line.split("|", 3)
+        if len(parts) < 4:
+            continue
+        name, status_str, ports, image = parts
+        # Extract running state from status string
+        state = "running" if status_str.startswith("Up") else "exited"
+        if "Restarting" in status_str:
+            state = "restarting"
+        containers.append(ContainerInfo(
+            name=name, status=state, uptime=status_str, ports=ports, image=image,
+        ))
+
+    running = sum(1 for c in containers if c.status == "running")
+    total = len(containers)
+    summary = f"{running}/{total} running"
+    if running < total:
+        non_running = [c.name for c in containers if c.status != "running"]
+        summary += f", down: {', '.join(non_running)}"
+
+    warnings: list[str] = []
+    for c in containers:
+        if c.status == "restarting":
+            warnings.append(f"{c.name} is restarting")
+        elif c.status == "exited":
+            warnings.append(f"{c.name} is exited")
+
+    return DockerStatusResult(
+        ok=running == total,
+        checked_at=_now(),
+        host=host,
+        containers=containers,
+        summary=summary,
+        warnings=warnings,
+        raw=stdout.strip(),
+    )
+
+
+async def docker_logs(host: str, container: str, lines: int = 50) -> DockerLogsResult:
+    """Get recent logs from a container."""
+    try:
+        cfg = validate_host("docker_logs", host)
+    except ValueError as e:
+        return DockerLogsResult(
+            ok=False, checked_at=_now(), host=host, container=container,
+            lines=lines, error_type="parse_error", error=str(e),
+        )
+
+    docker = cfg.docker_path
+    # Request one extra line to detect truncation
+    cmd = f"{docker} logs --tail {lines + 1} {container} 2>&1"
+
+    try:
+        stdout, stderr = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo, timeout=15)
+    except SSHError as e:
+        return DockerLogsResult(
+            ok=False, checked_at=_now(), host=host, container=container,
+            lines=lines, error_type=e.error_type, error=str(e),
+        )
+
+    all_lines = stdout.strip().splitlines()
+    truncated = len(all_lines) > lines
+    content = "\n".join(all_lines[:lines]) if truncated else "\n".join(all_lines)
+
+    return DockerLogsResult(
+        ok=True,
+        checked_at=_now(),
+        host=host,
+        container=container,
+        lines=lines,
+        truncated=truncated,
+        content=content,
+        stderr=stderr.strip() if stderr else "",
+        raw=stdout.strip(),
+    )
@@ -0,0 +1,201 @@
+"""Service health checks with per-service validators."""
+
+from __future__ import annotations
+
+import json
+import time
+from datetime import datetime, timezone
+
+from ..config import HOSTS
+from ..schemas import HealthResult
+from .ssh import run_command, SSHError
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+async def _validate_document_server() -> HealthResult:
+    """Document Server: /health endpoint must return ok + database connected."""
+    cfg = HOSTS["gpu"]
+    try:
+        t0 = time.monotonic()
+        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
+        latency_ms = int((time.monotonic() - t0) * 1000)
+
+        data = json.loads(stdout)
+        db_ok = data.get("database") == "connected"
+        status_ok = data.get("status") == "ok"
+
+        warnings = []
+        if not db_ok:
+            warnings.append("database disconnected")
+
+        return HealthResult(
+            ok=status_ok and db_ok,
+            checked_at=_now(),
+            service="document-server",
+            status="healthy" if (status_ok and db_ok) else "degraded",
+            details={
+                "status": data.get("status"),
+                "database": data.get("database"),
+                "version": data.get("version"),
+                "latency_ms": latency_ms,
+            },
+            warnings=warnings,
+            raw=stdout.strip(),
+        )
+    except SSHError as e:
+        return HealthResult(
+            ok=False, checked_at=_now(), service="document-server",
+            status="down", error_type=e.error_type, error=str(e),
+        )
+    except (json.JSONDecodeError, KeyError) as e:
+        return HealthResult(
+            ok=False, checked_at=_now(), service="document-server",
+            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
+            raw=stdout.strip() if 'stdout' in dir() else None,
+        )
+
+
+async def _validate_mlx() -> HealthResult:
+    """MLX Server: /v1/models must return at least 1 model within 5s."""
+    cfg = HOSTS["macmini"]
+    try:
+        t0 = time.monotonic()
+        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
+        latency_ms = int((time.monotonic() - t0) * 1000)
+
+        data = json.loads(stdout)
+        models = data.get("data", [])
+        model_ids = [m.get("id", "unknown") for m in models]
+
+        warnings = []
+        if latency_ms > 5000:
+            warnings.append(f"응답 지연 {latency_ms}ms (임계값 5000ms)")
+
+        return HealthResult(
+            ok=len(models) > 0 and latency_ms <= 5000,
+            checked_at=_now(),
+            service="mlx",
+            status="healthy" if (len(models) > 0 and latency_ms <= 5000) else "degraded",
+            details={
+                "model_count": len(models),
+                "models": model_ids,
+                "latency_ms": latency_ms,
+            },
+            warnings=warnings,
+            raw=stdout.strip(),
+        )
+    except SSHError as e:
+        return HealthResult(
+            ok=False, checked_at=_now(), service="mlx",
+            status="down", error_type=e.error_type, error=str(e),
+        )
+
+
+async def _validate_mlx_proxy() -> HealthResult:
+    """MLX Proxy (:8801): must return models via proxy."""
+    cfg = HOSTS["macmini"]
+    try:
+        t0 = time.monotonic()
+        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
+        latency_ms = int((time.monotonic() - t0) * 1000)
+
+        data = json.loads(stdout)
+        models = data.get("data", [])
+
+        return HealthResult(
+            ok=len(models) > 0,
+            checked_at=_now(),
+            service="mlx-proxy",
+            status="healthy" if models else "down",
+            details={"model_count": len(models), "latency_ms": latency_ms},
+            raw=stdout.strip(),
+        )
+    except SSHError as e:
+        return HealthResult(
+            ok=False, checked_at=_now(), service="mlx-proxy",
+            status="down", error_type=e.error_type, error=str(e),
+        )
+
+
+async def _validate_nanoclaude() -> HealthResult:
+    """NanoClaude: /health on port 8100."""
+    cfg = HOSTS["gpu"]
+    try:
+        t0 = time.monotonic()
+        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8100/health")
+        latency_ms = int((time.monotonic() - t0) * 1000)
+
+        return HealthResult(
+            ok=True,
+            checked_at=_now(),
+            service="nanoclaude",
+            status="healthy",
+            details={"latency_ms": latency_ms, "response": stdout.strip()[:200]},
+            raw=stdout.strip(),
+        )
+    except SSHError as e:
+        return HealthResult(
+            ok=False, checked_at=_now(), service="nanoclaude",
+            status="down", error_type=e.error_type, error=str(e),
+        )
+
+
+async def _validate_ollama(host: str) -> HealthResult:
+    """Ollama: `ollama list` must succeed and return non-empty."""
+    service_name = f"ollama-{host}" if host != "gpu" else "ollama-gpu"
+    cfg = HOSTS[host]
+    try:
+        t0 = time.monotonic()
+        stdout, _ = await run_command(cfg, "ollama list")
+        latency_ms = int((time.monotonic() - t0) * 1000)
+
+        lines = [l for l in stdout.strip().splitlines()[1:] if l.strip()]  # skip header
+        model_count = len(lines)
+
+        warnings = []
+        if model_count == 0:
+            warnings.append("모델 없음")
+
+        return HealthResult(
+            ok=model_count > 0,
+            checked_at=_now(),
+            service=service_name,
+            status="healthy" if model_count > 0 else "degraded",
+            details={"model_count": model_count, "latency_ms": latency_ms},
+            warnings=warnings,
+            raw=stdout.strip(),
+        )
+    except SSHError as e:
+        return HealthResult(
+            ok=False, checked_at=_now(), service=service_name,
+            status="down", error_type=e.error_type, error=str(e),
+        )
+
+
+# Validator registry
+VALIDATORS: dict[str, object] = {
+    "document-server": _validate_document_server,
+    "mlx": _validate_mlx,
+    "mlx-proxy": _validate_mlx_proxy,
+    "nanoclaude": _validate_nanoclaude,
+    "ollama-gpu": lambda: _validate_ollama("gpu"),
+    "ollama-macmini": lambda: _validate_ollama("macmini"),
+}
+
+VALID_SERVICES = list(VALIDATORS.keys())
+
+
+async def service_health(service: str) -> HealthResult:
+    """Run health check for a specific service."""
+    validator = VALIDATORS.get(service)
+    if not validator:
+        return HealthResult(
+            ok=False, checked_at=_now(), service=service,
+            status="unknown",
+            error_type="parse_error",
+            error=f"알 수 없는 서비스: '{service}'. 허용: {', '.join(VALID_SERVICES)}",
+        )
+    return await validator()
@@ -0,0 +1,97 @@
+"""Model inventory tools — Ollama and MLX model listing."""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+
+from ..config import validate_host, HOSTS
+from ..schemas import ModelsResult, ModelInfo
+from .ssh import run_command, SSHError
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _parse_ollama_list(output: str) -> list[ModelInfo]:
+    """Parse `ollama list` output."""
+    models = []
+    for line in output.strip().splitlines()[1:]:  # skip header
+        parts = line.split()
+        if len(parts) < 2:
+            continue
+        model_id = parts[0]
+        # Remaining fields vary: ID, SIZE, MODIFIED
+        size = parts[2] + " " + parts[3] if len(parts) > 3 else ""
+        modified = " ".join(parts[4:]) if len(parts) > 4 else ""
+        models.append(ModelInfo(id=model_id, size=size, modified=modified))
+    return models
+
+
+async def ollama_models(host: str) -> ModelsResult:
+    """List Ollama models on a host."""
+    try:
+        cfg = validate_host("ollama_models", host)
+    except ValueError as e:
+        return ModelsResult(
+            ok=False, checked_at=_now(), host=host, source="ollama",
+            error_type="parse_error", error=str(e),
+        )
+
+    try:
+        stdout, _ = await run_command(cfg, "ollama list")
+    except SSHError as e:
+        return ModelsResult(
+            ok=False, checked_at=_now(), host=host, source="ollama",
+            error_type=e.error_type, error=str(e),
+        )
+
+    models = _parse_ollama_list(stdout)
+    return ModelsResult(
+        ok=True,
+        checked_at=_now(),
+        host=host,
+        source="ollama",
+        models=models,
+        raw=stdout.strip(),
+    )
+
+
+async def mlx_models() -> ModelsResult:
+    """List MLX models loaded on Mac mini."""
+    cfg = HOSTS["macmini"]
+    try:
+        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
+    except SSHError as e:
+        return ModelsResult(
+            ok=False, checked_at=_now(), host="macmini", source="mlx",
+            error_type=e.error_type, error=str(e),
+        )
+
+    try:
+        data = json.loads(stdout)
+        model_list = data.get("data", [])
+        models = [
+            ModelInfo(
+                id=m.get("id", "unknown"),
+                size=str(m.get("size", "")),
+                modified=str(m.get("created", "")),
+            )
+            for m in model_list
+        ]
+    except (json.JSONDecodeError, KeyError) as e:
+        return ModelsResult(
+            ok=False, checked_at=_now(), host="macmini", source="mlx",
+            error_type="parse_error", error=f"JSON 파싱 실패: {e}",
+            raw=stdout.strip(),
+        )
+
+    return ModelsResult(
+        ok=True,
+        checked_at=_now(),
+        host="macmini",
+        source="mlx",
+        models=models,
+        raw=stdout.strip(),
+    )
@@ -0,0 +1,83 @@
+"""Network tools — Tailscale status."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+from ..schemas import TailscaleResult, TailscalePeer
+from .ssh import run_local, SSHError
+
+TAILSCALE_BIN = "/Applications/Tailscale.app/Contents/MacOS/Tailscale"
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _parse_tailscale(output: str) -> list[TailscalePeer]:
+    """Parse `tailscale status` output into peer list.
+
+    Format: IP  HOSTNAME  USER@  OS  STATUS_INFO
+    Status examples: "-" (connected/active), "idle, tx ... rx ...", "offline, last seen ..."
+    """
+    peers = []
+    for line in output.strip().splitlines():
+        parts = line.split()
+        if len(parts) < 4:
+            continue
+        # Skip header-like lines
+        if parts[0].startswith("#") or parts[0] == "IP":
+            continue
+
+        ip = parts[0]
+        hostname = parts[1]
+        # parts[2] = user@ (skip)
+        os_name = parts[3] if len(parts) > 3 else ""
+
+        # Remaining is status info
+        status_text = " ".join(parts[4:]) if len(parts) > 4 else ""
+
+        if "offline" in status_text:
+            status = "offline"
+        elif "idle" in status_text:
+            status = "idle"
+        elif status_text == "-" or status_text == "":
+            status = "active"
+        else:
+            status = "active"
+
+        peers.append(TailscalePeer(
+            hostname=hostname,
+            ip=ip,
+            status=status,
+            os=os_name,
+        ))
+    return peers
+
+
+async def tailscale_status() -> TailscaleResult:
+    """Get Tailscale network status (runs locally)."""
+    try:
+        stdout, _ = await run_local(f"{TAILSCALE_BIN} status")
+    except SSHError as e:
+        return TailscaleResult(
+            ok=False, checked_at=_now(),
+            error_type=e.error_type, error=str(e),
+        )
+
+    peers = _parse_tailscale(stdout)
+
+    warnings = []
+    expected_hosts = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}
+    found_hosts = {p.hostname for p in peers}
+    missing = expected_hosts - found_hosts
+    for h in missing:
+        warnings.append(f"{h} not found in tailnet")
+
+    return TailscaleResult(
+        ok=True,
+        checked_at=_now(),
+        peers=peers,
+        warnings=warnings,
+        raw=stdout.strip(),
+    )
@@ -0,0 +1,123 @@
+"""SSH connection layer — asyncssh based.
+
+Provides run_command() which handles:
+- Key-based auth (GPU, Mac mini)
+- Password auth + sudo (company NAS)
+- Timeout / retry
+- Structured error classification
+"""
+
+from __future__ import annotations
+
+import asyncio
+from datetime import datetime, timezone
+
+import asyncssh
+
+from ..config import HostConfig, SSH_TIMEOUT, CMD_TIMEOUT, MAX_RETRIES
+
+
+class SSHError(Exception):
+    """Typed SSH error with error_type classification."""
+
+    def __init__(self, error_type: str, message: str):
+        self.error_type = error_type
+        super().__init__(message)
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+async def _connect(host: HostConfig) -> asyncssh.SSHClientConnection:
+    """Open SSH connection with appropriate auth method."""
+    kwargs: dict = {
+        "host": host.ip,
+        "username": host.user,
+        "connect_timeout": SSH_TIMEOUT,
+        "known_hosts": None,  # accept any host key (Tailscale internal network)
+    }
+    if host.auth == "password" and host.password:
+        kwargs["password"] = host.password
+        kwargs["client_keys"] = []  # don't try key auth
+    # key auth is the default (uses ~/.ssh/)
+
+    return await asyncssh.connect(**kwargs)
+
+
+async def run_command(
+    host: HostConfig,
+    command: str,
+    timeout: int = CMD_TIMEOUT,
+    use_sudo: bool = False,
+) -> tuple[str, str]:
+    """Run a command on remote host. Returns (stdout, stderr).
+
+    For NAS with sudo: wraps command with sudo using password via stdin.
+    Raises SSHError with typed error_type on failure.
+    """
+    if use_sudo and host.needs_sudo and host.password:
+        # Pipe password to sudo via stdin
+        command = f"echo '{host.password}' | sudo -S {command}"
+
+    last_error: Exception | None = None
+    for attempt in range(1 + MAX_RETRIES):
+        try:
+            conn = await _connect(host)
+            async with conn:
+                result = await asyncio.wait_for(
+                    conn.run(command, check=False),
+                    timeout=timeout,
+                )
+                stdout = result.stdout or ""
+                stderr = result.stderr or ""
+
+                if result.exit_status != 0:
+                    # Command ran but returned non-zero
+                    # Filter out sudo password prompt from stderr
+                    stderr_clean = "\n".join(
+                        line for line in stderr.splitlines()
+                        if "[sudo]" not in line and "Password:" not in line
+                    )
+                    raise SSHError(
+                        "command_failed",
+                        f"exit {result.exit_status}: {stderr_clean.strip() or stdout.strip()}"
+                    )
+                return stdout, stderr
+
+        except SSHError:
+            raise
+        except asyncio.TimeoutError:
+            raise SSHError("timeout", f"명령 실행 시간 초과 ({timeout}초)")
+        except asyncssh.PermissionDenied:
+            raise SSHError("auth", f"SSH 인증 실패: {host.user}@{host.ip}")
+        except (OSError, asyncssh.Error) as e:
+            last_error = e
+            if attempt < MAX_RETRIES:
+                await asyncio.sleep(1)
+                continue
+            raise SSHError("timeout", f"SSH 연결 실패: {host.ip} — {e}")
+
+    raise SSHError("timeout", f"SSH 최대 재시도 초과: {host.ip}")
+
+
+async def run_local(command: str, timeout: int = CMD_TIMEOUT) -> tuple[str, str]:
+    """Run a command locally. Returns (stdout, stderr)."""
+    try:
+        proc = await asyncio.create_subprocess_shell(
+            command,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout_bytes, stderr_bytes = await asyncio.wait_for(
+            proc.communicate(), timeout=timeout
+        )
+        stdout = stdout_bytes.decode() if stdout_bytes else ""
+        stderr = stderr_bytes.decode() if stderr_bytes else ""
+
+        if proc.returncode != 0:
+            raise SSHError("command_failed", f"exit {proc.returncode}: {stderr.strip() or stdout.strip()}")
+
+        return stdout, stderr
+    except asyncio.TimeoutError:
+        raise SSHError("timeout", f"로컬 명령 시간 초과 ({timeout}초)")
@@ -0,0 +1,79 @@
+"""System tools — disk usage."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+from ..config import validate_host
+from ..schemas import DiskResult, FileSystemInfo
+from .ssh import run_command, SSHError
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _parse_df(output: str) -> list[FileSystemInfo]:
+    """Parse `df -h` output into structured filesystem info."""
+    filesystems = []
+    for line in output.strip().splitlines()[1:]:  # skip header
+        parts = line.split()
+        if len(parts) < 6:
+            continue
+        # df -h columns: Filesystem Size Used Avail Use% Mounted
+        mount = parts[-1]
+        # Skip pseudo-filesystems
+        if mount.startswith(("/dev", "/sys", "/proc", "/run", "/snap")):
+            continue
+        if parts[0] in ("tmpfs", "devtmpfs", "overlay", "shm", "none"):
+            continue
+
+        try:
+            used_pct = int(parts[4].rstrip("%"))
+        except ValueError:
+            continue
+
+        filesystems.append(FileSystemInfo(
+            mount=mount,
+            total=parts[1],
+            used=parts[2],
+            avail=parts[3],
+            used_pct=used_pct,
+        ))
+    return filesystems
+
+
+async def disk_usage(host: str) -> DiskResult:
+    """Get disk usage for a host with structured filesystem info."""
+    try:
+        cfg = validate_host("disk_usage", host)
+    except ValueError as e:
+        return DiskResult(
+            ok=False, checked_at=_now(), host=host,
+            error_type="parse_error", error=str(e),
+        )
+
+    try:
+        stdout, _ = await run_command(cfg, "df -h", use_sudo=cfg.needs_sudo)
+    except SSHError as e:
+        return DiskResult(
+            ok=False, checked_at=_now(), host=host,
+            error_type=e.error_type, error=str(e),
+        )
+
+    filesystems = _parse_df(stdout)
+
+    warnings = []
+    WARN_THRESHOLD = 85
+    for fs in filesystems:
+        if fs.used_pct >= WARN_THRESHOLD:
+            warnings.append(f"{fs.mount} 사용률 {fs.used_pct}% — 임계값 {WARN_THRESHOLD}% 초과")
+
+    return DiskResult(
+        ok=True,
+        checked_at=_now(),
+        host=host,
+        filesystems=filesystems,
+        warnings=warnings,
+        raw=stdout.strip(),
+    )