feat(infra): MCP 인프라 서버 통합 — 7개 도구 + core/ 분리

mcp-infra-server를 gpu-services/infra/로 통합. core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능. 도구: docker_status, docker_logs, service_health, disk_usage, tailscale_status, ollama_models, mlx_models.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 13:11:54 +09:00
parent 6b36063010
commit b1f9e87d6a
14 changed files with 996 additions and 0 deletions
@@ -0,0 +1,2 @@
 # Copy to .env and fill in values
 # NAS_COMPANY_PASSWORD: password of the "nas-company" host; the host config
 # reads it with os.getenv after load_dotenv().
 NAS_COMPANY_PASSWORD=
@@ -0,0 +1,73 @@
 """Host configuration and tool-host validation.
 All host IPs are Tailscale IPs (except nas-company which also works via Tailscale).
 """
 from __future__ import annotations
 import os
 from dataclasses import dataclass, field
 from dotenv import load_dotenv
 load_dotenv()
@dataclass(frozen=True)
 class HostConfig:
    """Immutable connection settings for one managed host."""
    ip: str                            # address the SSH layer connects to
    user: str                          # SSH login name
    auth: str                          # "key" | "password"
    docker_path: str = "docker"        # docker executable used to build docker commands
    needs_sudo: bool = False           # True when commands on this host are wrapped in sudo
    password: str | None = None        # only for auth="password"; also piped to sudo
 # Registry of managed hosts, keyed by the short name tools accept as `host`.
 # The nas-company password is read from the NAS_COMPANY_PASSWORD environment
 # variable when this module is imported; it is None when the variable is unset.
 HOSTS: dict[str, HostConfig] = {
    "gpu": HostConfig(
        ip="100.111.160.84",
        user="hyungi",
        auth="key",
    ),
    "macmini": HostConfig(
        ip="100.76.254.116",
        user="hyungi",
        auth="key",
    ),
    "nas-company": HostConfig(
        ip="100.71.132.52",
        user="hyungi",
        auth="password",
        docker_path="/usr/local/bin/docker",
        needs_sudo=True,
        password=os.getenv("NAS_COMPANY_PASSWORD"),
    ),
 }
 # Per-tool allowed hosts. validate_host() raises ValueError when a host is not
 # listed for the tool; a tool with no entry here is not restricted by this map.
 TOOL_HOST_MAP: dict[str, list[str]] = {
    "docker_status":  ["gpu", "nas-company"],
    "docker_logs":    ["gpu", "nas-company"],
    "disk_usage":     ["gpu", "macmini", "nas-company"],
    "ollama_models":  ["gpu", "macmini"],
    "mlx_models":     ["macmini"],
 }
 # SSH timeouts and retry budget consumed by the SSH layer
 SSH_TIMEOUT = 5       # connection timeout (seconds)
 CMD_TIMEOUT = 10      # default command execution timeout (seconds)
 MAX_RETRIES = 1       # extra attempts after a connection-level failure
 def validate_host(tool: str, host: str) -> HostConfig:
    """Return the config for `host` after checking it is allowed for `tool`.

    Raises:
        ValueError: if the tool has a host allow-list that excludes `host`,
            or if `host` is not a known host at all.
    """
    permitted = TOOL_HOST_MAP.get(tool)
    if permitted and host not in permitted:
        choices = ', '.join(permitted)
        raise ValueError(f"'{host}'는 {tool}에서 지원하지 않습니다. 허용: {choices}")
    if host in HOSTS:
        return HOSTS[host]
    known = ', '.join(HOSTS)
    raise ValueError(f"알 수 없는 호스트: '{host}'. 허용: {known}")
@@ -0,0 +1,113 @@
 """Docker status and logs tools."""
 from __future__ import annotations
 from datetime import datetime, timezone
 from ..config import validate_host, HOSTS
 from ..schemas import DockerStatusResult, DockerLogsResult, ContainerInfo
 from .ssh import run_command, SSHError
 def _now() -> str:
    return datetime.now(timezone.utc).isoformat()
 def _container_state(status_str: str) -> str:
    """Map a `docker ps` Status column value to a coarse state label.

    Returns one of "running", "paused", "restarting", "created", "dead" or
    "exited" (the fallback for anything unrecognised).
    """
    if "Restarting" in status_str:
        return "restarting"
    if status_str.startswith("Up"):
        # Docker reports a paused container as "Up <duration> (Paused)".
        return "paused" if "(Paused)" in status_str else "running"
    if status_str.startswith("Created"):
        return "created"
    if status_str.startswith("Dead"):
        return "dead"
    return "exited"
 async def docker_status(host: str) -> DockerStatusResult:
    """List all Docker containers on a host with structured status.

    Runs `docker ps -a` over SSH (through sudo when the host config requires
    it) and parses one `name|status|ports|image` record per output line.

    Args:
        host: Host key; must be allowed for "docker_status" by validate_host.

    Returns:
        DockerStatusResult. `ok` is True only when every listed container is
        running. Host-validation and SSH failures are returned as ok=False
        with error_type/error set instead of being raised.
    """
    try:
        cfg = validate_host("docker_status", host)
    except ValueError as e:
        return DockerStatusResult(
            ok=False, checked_at=_now(), host=host,
            error_type="parse_error", error=str(e),
        )
    docker = cfg.docker_path
    # Plain (non-f) string: the doubled braces are Go-template syntax for docker.
    fmt = '{{.Names}}|{{.Status}}|{{.Ports}}|{{.Image}}'
    cmd = f"{docker} ps -a --format '{fmt}'"
    try:
        stdout, _ = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo)
    except SSHError as e:
        return DockerStatusResult(
            ok=False, checked_at=_now(), host=host,
            error_type=e.error_type, error=str(e),
        )
    containers: list[ContainerInfo] = []
    for line in stdout.strip().splitlines():
        parts = line.split("|", 3)
        if len(parts) < 4:
            continue  # not a record line
        name, status_str, ports, image = parts
        containers.append(ContainerInfo(
            name=name, status=_container_state(status_str),
            uptime=status_str, ports=ports, image=image,
        ))
    non_running = [c for c in containers if c.status != "running"]
    total = len(containers)
    running = total - len(non_running)
    summary = f"{running}/{total} running"
    if non_running:
        summary += f", down: {', '.join(c.name for c in non_running)}"
    # One warning per container that is not plainly running.
    warnings = [f"{c.name} is {c.status}" for c in non_running]
    return DockerStatusResult(
        ok=running == total,
        checked_at=_now(),
        host=host,
        containers=containers,
        summary=summary,
        warnings=warnings,
        raw=stdout.strip(),
    )
 async def docker_logs(host: str, container: str, lines: int = 50) -> DockerLogsResult:
    """Get the most recent log lines of a container.

    The remote command redirects the container's stderr into stdout (`2>&1`),
    so `content` holds both streams interleaved and `stderr` only carries
    whatever the SSH layer itself captured on stderr.

    Args:
        host: Host key; must be allowed for "docker_logs" by validate_host.
        container: Container name or id. It is shell-quoted before use.
        lines: Number of trailing lines wanted; values below 1 are treated as 1.

    Returns:
        DockerLogsResult. `truncated` is True when the container had more
        output than was returned. Validation and SSH failures are returned
        as ok=False with error_type/error set instead of being raised.
    """
    import shlex  # function-scope import: only this function needs it

    try:
        cfg = validate_host("docker_logs", host)
    except ValueError as e:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type="parse_error", error=str(e),
        )
    docker = cfg.docker_path
    wanted = max(int(lines), 1)
    # Request one extra line to detect truncation. The container name comes
    # from the tool caller, so quote it to keep it a single shell argument.
    cmd = f"{docker} logs --tail {wanted + 1} {shlex.quote(container)} 2>&1"
    try:
        stdout, stderr = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo, timeout=15)
    except SSHError as e:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type=e.error_type, error=str(e),
        )
    all_lines = stdout.strip().splitlines()
    truncated = len(all_lines) > wanted
    # `--tail` already returns the newest lines; when the extra probe line is
    # present it is the OLDEST one, so keep the last `wanted` lines.
    kept = all_lines[-wanted:] if truncated else all_lines
    return DockerLogsResult(
        ok=True,
        checked_at=_now(),
        host=host,
        container=container,
        lines=lines,
        truncated=truncated,
        content="\n".join(kept),
        stderr=stderr.strip() if stderr else "",
        raw=stdout.strip(),
    )
@@ -0,0 +1,201 @@
 """Service health checks with per-service validators."""
 from __future__ import annotations
 import json
 import time
 from datetime import datetime, timezone
 from ..config import HOSTS
 from ..schemas import HealthResult
 from .ssh import run_command, SSHError
 def _now() -> str:
    return datetime.now(timezone.utc).isoformat()
 async def _validate_document_server() -> HealthResult:
    """Document Server: /health endpoint must return ok + database connected.

    Runs `curl -sf http://localhost:8000/health` on the gpu host over SSH.
    The measured latency covers the whole run_command call.

    Returns:
        HealthResult with status "healthy" when the JSON body has
        status == "ok" and database == "connected", "degraded" when either
        check fails, and "down" when the command fails or the body is not a
        JSON object.
    """
    cfg = HOSTS["gpu"]
    stdout = ""  # defined up front so the parse-error handler can always report it
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
        latency_ms = int((time.monotonic() - t0) * 1000)
        data = json.loads(stdout)
        if not isinstance(data, dict):
            # Valid JSON that is not an object (e.g. a list) has no .get().
            raise ValueError(f"expected a JSON object, got {type(data).__name__}")
        db_ok = data.get("database") == "connected"
        status_ok = data.get("status") == "ok"
        healthy = status_ok and db_ok
        warnings = []
        if not db_ok:
            warnings.append("database disconnected")
        return HealthResult(
            ok=healthy,
            checked_at=_now(),
            service="document-server",
            status="healthy" if healthy else "degraded",
            details={
                "status": data.get("status"),
                "database": data.get("database"),
                "version": data.get("version"),
                "latency_ms": latency_ms,
            },
            warnings=warnings,
            raw=stdout.strip(),
        )
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type=e.error_type, error=str(e),
        )
    except (ValueError, KeyError) as e:  # json.JSONDecodeError is a ValueError
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )
 async def _validate_mlx() -> HealthResult:
    """MLX Server: /v1/models must return at least 1 model within 5s.

    Runs `curl -sf http://localhost:8800/v1/models` on the macmini host over
    SSH. The measured latency covers the whole run_command call.

    Returns:
        HealthResult with status "healthy" when at least one model is listed
        and latency is at most 5000 ms, "degraded" otherwise, and "down" when
        the command fails or the response cannot be parsed.
    """
    cfg = HOSTS["macmini"]
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx",
            status="down", error_type=e.error_type, error=str(e),
        )
    try:
        data = json.loads(stdout)
        models = data.get("data", [])
        model_ids = [m.get("id", "unknown") for m in models]
    except (ValueError, AttributeError, TypeError) as e:
        # ValueError: invalid JSON; AttributeError/TypeError: valid JSON whose
        # shape is not {"data": [ {...}, ... ]}.
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )
    healthy = len(models) > 0 and latency_ms <= 5000
    warnings = []
    if latency_ms > 5000:
        warnings.append(f"응답 지연 {latency_ms}ms (임계값 5000ms)")
    return HealthResult(
        ok=healthy,
        checked_at=_now(),
        service="mlx",
        status="healthy" if healthy else "degraded",
        details={
            "model_count": len(models),
            "models": model_ids,
            "latency_ms": latency_ms,
        },
        warnings=warnings,
        raw=stdout.strip(),
    )
 async def _validate_mlx_proxy() -> HealthResult:
    """MLX Proxy (:8801): must return models via proxy.

    Runs `curl -sf http://localhost:8801/v1/models` on the macmini host over
    SSH.

    Returns:
        HealthResult with status "healthy" when at least one model is listed
        and "down" when none is listed, the command fails, or the response
        cannot be parsed.
    """
    cfg = HOSTS["macmini"]
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type=e.error_type, error=str(e),
        )
    try:
        data = json.loads(stdout)
        models = data.get("data", [])
        model_count = len(models)
    except (ValueError, AttributeError, TypeError) as e:
        # ValueError: invalid JSON; AttributeError/TypeError: the body is not
        # an object or its "data" member has no length.
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )
    return HealthResult(
        ok=model_count > 0,
        checked_at=_now(),
        service="mlx-proxy",
        status="healthy" if models else "down",
        details={"model_count": model_count, "latency_ms": latency_ms},
        raw=stdout.strip(),
    )
 async def _validate_nanoclaude() -> HealthResult:
    """NanoClaude: `curl -sf` against /health on port 8100 of the gpu host.

    A successful command counts as healthy; the response body is not
    inspected, only echoed back (first 200 characters in `details`, the full
    stripped text in `raw`). A failing command yields status "down".
    """
    service = "nanoclaude"
    target = HOSTS["gpu"]
    try:
        started = time.monotonic()
        stdout, _ = await run_command(target, "curl -sf http://localhost:8100/health")
        elapsed_ms = int((time.monotonic() - started) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service=service,
            status="down", error_type=e.error_type, error=str(e),
        )
    body = stdout.strip()
    return HealthResult(
        ok=True,
        checked_at=_now(),
        service=service,
        status="healthy",
        details={"latency_ms": elapsed_ms, "response": body[:200]},
        raw=body,
    )
 async def _validate_ollama(host: str) -> HealthResult:
    """Ollama: `ollama list` must succeed and list at least one model.

    Args:
        host: Key into HOSTS; the reported service name is "ollama-<host>".

    Returns:
        HealthResult with status "healthy" when at least one model row
        follows the header line, "degraded" when none does, and "down" when
        the command fails.
    """
    service_name = f"ollama-{host}"
    target = HOSTS[host]
    try:
        started = time.monotonic()
        stdout, _ = await run_command(target, "ollama list")
        elapsed_ms = int((time.monotonic() - started) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service=service_name,
            status="down", error_type=e.error_type, error=str(e),
        )
    body = stdout.strip()
    # First line is the column header; count the non-blank rows after it.
    model_count = sum(1 for row in body.splitlines()[1:] if row.strip())
    has_models = model_count > 0
    notes = [] if has_models else ["모델 없음"]
    return HealthResult(
        ok=has_models,
        checked_at=_now(),
        service=service_name,
        status="healthy" if has_models else "degraded",
        details={"model_count": model_count, "latency_ms": elapsed_ms},
        warnings=notes,
        raw=body,
    )
 # Validator registry: maps each service name accepted by service_health() to a
 # zero-argument callable whose result is awaited to obtain the HealthResult.
 # The two lambdas bind the host argument of _validate_ollama.
 VALIDATORS: dict[str, object] = {
    "document-server": _validate_document_server,
    "mlx": _validate_mlx,
    "mlx-proxy": _validate_mlx_proxy,
    "nanoclaude": _validate_nanoclaude,
    "ollama-gpu": lambda: _validate_ollama("gpu"),
    "ollama-macmini": lambda: _validate_ollama("macmini"),
 }
 # Service names in registry order; listed in the "unknown service" error.
 VALID_SERVICES = list(VALIDATORS.keys())
 async def service_health(service: str) -> HealthResult:
    """Run the registered health check for `service`.

    An unregistered name does not raise: it yields ok=False with status
    "unknown" and an error message listing the valid service names.
    """
    if service in VALIDATORS:
        check = VALIDATORS[service]
        return await check()
    allowed = ', '.join(VALID_SERVICES)
    return HealthResult(
        ok=False,
        checked_at=_now(),
        service=service,
        status="unknown",
        error_type="parse_error",
        error=f"알 수 없는 서비스: '{service}'. 허용: {allowed}",
    )
@@ -0,0 +1,97 @@
 """Model inventory tools — Ollama and MLX model listing."""
 from __future__ import annotations
 import json
 from datetime import datetime, timezone
 from ..config import validate_host, HOSTS
 from ..schemas import ModelsResult, ModelInfo
 from .ssh import run_command, SSHError
 def _now() -> str:
    return datetime.now(timezone.utc).isoformat()
 def _parse_ollama_list(output: str) -> list[ModelInfo]:
    """Parse `ollama list` output into ModelInfo records.

    The first line is treated as the header and skipped. Each remaining line
    is split on whitespace: field 0 is the model id, fields 2-3 are joined as
    the size, and everything from field 4 on is the modified text. Lines with
    fewer than two fields are ignored.
    """
    parsed = []
    rows = output.strip().splitlines()[1:]
    for row in rows:
        fields = row.split()
        count = len(fields)
        if count < 2:
            continue
        size_text = f"{fields[2]} {fields[3]}" if count > 3 else ""
        modified_text = " ".join(fields[4:]) if count > 4 else ""
        parsed.append(ModelInfo(id=fields[0], size=size_text, modified=modified_text))
    return parsed
 async def ollama_models(host: str) -> ModelsResult:
    """List Ollama models on a host by running `ollama list` over SSH.

    Host-validation and SSH failures are returned as ok=False results with
    error_type/error set instead of being raised.
    """
    def _failed(error_type: str, message: str) -> ModelsResult:
        # Shared shape of every failure result produced by this tool.
        return ModelsResult(
            ok=False, checked_at=_now(), host=host, source="ollama",
            error_type=error_type, error=message,
        )
    try:
        cfg = validate_host("ollama_models", host)
    except ValueError as e:
        return _failed("parse_error", str(e))
    try:
        stdout, _ = await run_command(cfg, "ollama list")
    except SSHError as e:
        return _failed(e.error_type, str(e))
    found = _parse_ollama_list(stdout)
    return ModelsResult(
        ok=True,
        checked_at=_now(),
        host=host,
        source="ollama",
        models=found,
        raw=stdout.strip(),
    )
 async def mlx_models() -> ModelsResult:
    """List MLX models loaded on Mac mini.

    Runs `curl -sf http://localhost:8800/v1/models` on the macmini host over
    SSH and maps each entry of the response's "data" array to a ModelInfo
    (id, size, and the "created" value as the modified text).

    Returns:
        ModelsResult. SSH failures and unparsable responses are returned as
        ok=False with error_type/error set instead of being raised.
    """
    cfg = HOSTS["macmini"]
    try:
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
    except SSHError as e:
        return ModelsResult(
            ok=False, checked_at=_now(), host="macmini", source="mlx",
            error_type=e.error_type, error=str(e),
        )
    try:
        data = json.loads(stdout)
        model_list = data.get("data", [])
        models = [
            ModelInfo(
                id=m.get("id", "unknown"),
                size=str(m.get("size", "")),
                modified=str(m.get("created", "")),
            )
            for m in model_list
        ]
    except (ValueError, KeyError, AttributeError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; AttributeError/TypeError
        # cover valid JSON whose shape is not {"data": [ {...}, ... ]}.
        return ModelsResult(
            ok=False, checked_at=_now(), host="macmini", source="mlx",
            error_type="parse_error", error=f"JSON 파싱 실패: {e}",
            raw=stdout.strip(),
        )
    return ModelsResult(
        ok=True,
        checked_at=_now(),
        host="macmini",
        source="mlx",
        models=models,
        raw=stdout.strip(),
    )
@@ -0,0 +1,83 @@
 """Network tools — Tailscale status."""
 from __future__ import annotations
 from datetime import datetime, timezone
 from ..schemas import TailscaleResult, TailscalePeer
 from .ssh import run_local, SSHError
 # Absolute path of the Tailscale CLI inside the macOS application bundle;
 # tailscale_status() runs "<this path> status" on the local machine.
 TAILSCALE_BIN = "/Applications/Tailscale.app/Contents/MacOS/Tailscale"
 def _now() -> str:
    return datetime.now(timezone.utc).isoformat()
 def _parse_tailscale(output: str) -> list[TailscalePeer]:
    """Parse `tailscale status` output into a peer list.

    Expected row format: IP  HOSTNAME  USER@  OS  STATUS_INFO
    Rows with fewer than four whitespace-separated fields, rows starting with
    "#", and a row whose first field is "IP" are skipped. The status text
    (fields after the OS) maps to "offline" if it contains "offline", to
    "idle" if it contains "idle", and to "active" otherwise.
    """
    found = []
    for row in output.strip().splitlines():
        fields = row.split()
        if len(fields) < 4:
            continue
        first = fields[0]
        if first.startswith("#") or first == "IP":
            continue  # header-like line
        # fields[2] is the user@ column and is not reported.
        status_text = " ".join(fields[4:])
        if "offline" in status_text:
            state = "offline"
        elif "idle" in status_text:
            state = "idle"
        else:
            state = "active"
        found.append(TailscalePeer(
            hostname=fields[1],
            ip=first,
            status=state,
            os=fields[3],
        ))
    return found
 async def tailscale_status() -> TailscaleResult:
    """Get Tailscale network status by running the CLI locally.

    A failing command is returned as ok=False with error_type/error set.
    On success one warning is added for each expected hostname that does not
    appear among the parsed peers.
    """
    try:
        stdout, _ = await run_local(f"{TAILSCALE_BIN} status")
    except SSHError as e:
        return TailscaleResult(
            ok=False, checked_at=_now(),
            error_type=e.error_type, error=str(e),
        )
    peers = _parse_tailscale(stdout)
    expected_hosts = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}
    found_hosts = {p.hostname for p in peers}
    missing = expected_hosts - found_hosts
    notes = [f"{name} not found in tailnet" for name in missing]
    return TailscaleResult(
        ok=True,
        checked_at=_now(),
        peers=peers,
        warnings=notes,
        raw=stdout.strip(),
    )
@@ -0,0 +1,123 @@
 """SSH connection layer — asyncssh based.
 Provides run_command() which handles:
 - Key-based auth (GPU, Mac mini)
 - Password auth + sudo (company NAS)
 - Timeout / retry
 - Structured error classification
 """
 from __future__ import annotations
 import asyncio
 from datetime import datetime, timezone
 import asyncssh
 from ..config import HostConfig, SSH_TIMEOUT, CMD_TIMEOUT, MAX_RETRIES
 class SSHError(Exception):
    """Command-execution failure tagged with a machine-readable category.

    `error_type` holds the category string supplied by the raiser; the
    message is available through str(exc) as for any exception.
    """
    def __init__(self, error_type: str, message: str):
        super().__init__(message)
        self.error_type = error_type
 def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()
 async def _connect(host: HostConfig) -> asyncssh.SSHClientConnection:
    """Open an SSH connection to `host` using its configured auth method.

    Password auth is used only when the host is configured for it AND a
    password is present; otherwise asyncssh's default key handling applies.
    """
    options: dict = dict(
        host=host.ip,
        username=host.user,
        connect_timeout=SSH_TIMEOUT,
        # Host keys are not verified (Tailscale internal network).
        known_hosts=None,
    )
    wants_password = host.auth == "password" and host.password
    if wants_password:
        # Empty client_keys: don't try key auth.
        options.update(password=host.password, client_keys=[])
    return await asyncssh.connect(**options)
 async def run_command(
    host: HostConfig,
    command: str,
    timeout: int = CMD_TIMEOUT,
    use_sudo: bool = False,
 ) -> tuple[str, str]:
    """Run a command on a remote host over SSH and return (stdout, stderr).

    When `use_sudo` is set and the host is configured with needs_sudo and a
    password, the command is wrapped in `sudo -S` with the password supplied
    on stdin. A fresh connection is opened for every call.

    Args:
        host: Connection settings of the target host.
        command: Shell command line executed on the remote host.
        timeout: Seconds allowed for the command itself to finish.
        use_sudo: Request sudo wrapping (see above).

    Returns:
        Tuple of the command's stdout and stderr text.

    Raises:
        SSHError: "command_failed" on a non-zero exit status, "timeout" when
            the command times out or the connection fails after the retry
            budget, "auth" when SSH authentication is rejected.
    """
    import shlex  # function-scope import keeps the module's import block untouched

    if use_sudo and host.needs_sudo and host.password:
        # Pipe the password to sudo via stdin. shlex.quote keeps quotes and
        # shell metacharacters in the password from terminating the string;
        # printf '%s\n' emits it verbatim (no escape or option interpretation).
        quoted_pw = shlex.quote(host.password)
        command = f"printf '%s\\n' {quoted_pw} | sudo -S {command}"
    for attempt in range(1 + MAX_RETRIES):
        try:
            conn = await _connect(host)
            async with conn:
                result = await asyncio.wait_for(
                    conn.run(command, check=False),
                    timeout=timeout,
                )
                stdout = result.stdout or ""
                stderr = result.stderr or ""
                if result.exit_status != 0:
                    # Command ran but returned non-zero.
                    # Filter out the sudo password prompt from stderr.
                    stderr_clean = "\n".join(
                        line for line in stderr.splitlines()
                        if "[sudo]" not in line and "Password:" not in line
                    )
                    raise SSHError(
                        "command_failed",
                        f"exit {result.exit_status}: {stderr_clean.strip() or stdout.strip()}"
                    )
                return stdout, stderr
        except SSHError:
            raise
        except asyncio.TimeoutError as e:
            raise SSHError("timeout", f"명령 실행 시간 초과 ({timeout}초)") from e
        except asyncssh.PermissionDenied as e:
            raise SSHError("auth", f"SSH 인증 실패: {host.user}@{host.ip}") from e
        except (OSError, asyncssh.Error) as e:
            # Connection-level failure: retry after a short pause while
            # attempts remain, otherwise report it.
            if attempt < MAX_RETRIES:
                await asyncio.sleep(1)
                continue
            raise SSHError("timeout", f"SSH 연결 실패: {host.ip} — {e}") from e
    # Only reached when the loop body never ran (negative MAX_RETRIES).
    raise SSHError("timeout", f"SSH 최대 재시도 초과: {host.ip}")
 async def run_local(command: str, timeout: int = CMD_TIMEOUT) -> tuple[str, str]:
    """Run a shell command on the local machine and return (stdout, stderr).

    Args:
        command: Command line passed to the shell.
        timeout: Seconds allowed for the command to finish.

    Raises:
        SSHError: "command_failed" on a non-zero exit status, "timeout" when
            the command does not finish in time (the child is killed first).
    """
    proc = await asyncio.create_subprocess_shell(
        command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            proc.communicate(), timeout=timeout
        )
    except asyncio.TimeoutError as e:
        # wait_for only cancels communicate(); the child keeps running unless
        # it is killed and reaped here.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # child exited between the timeout and the kill
        await proc.wait()
        raise SSHError("timeout", f"로컬 명령 시간 초과 ({timeout}초)") from e
    # Undecodable bytes must not turn a finished command into a crash.
    stdout = stdout_bytes.decode(errors="replace") if stdout_bytes else ""
    stderr = stderr_bytes.decode(errors="replace") if stderr_bytes else ""
    if proc.returncode != 0:
        raise SSHError("command_failed", f"exit {proc.returncode}: {stderr.strip() or stdout.strip()}")
    return stdout, stderr
@@ -0,0 +1,79 @@
 """System tools — disk usage."""
 from __future__ import annotations
 from datetime import datetime, timezone
 from ..config import validate_host
 from ..schemas import DiskResult, FileSystemInfo
 from .ssh import run_command, SSHError
 def _now() -> str:
    return datetime.now(timezone.utc).isoformat()
 def _parse_df(output: str) -> list[FileSystemInfo]:
    """Parse `df -h` output into structured filesystem records.

    The header line is skipped. A row is dropped when it has fewer than six
    fields, when its mount point starts with one of the pseudo-filesystem
    prefixes, when its source column is a pseudo-filesystem name, or when its
    fifth column is not an integer percentage.
    """
    pseudo_mount_prefixes = ("/dev", "/sys", "/proc", "/run", "/snap")
    pseudo_sources = ("tmpfs", "devtmpfs", "overlay", "shm", "none")
    entries = []
    rows = output.strip().splitlines()[1:]
    for row in rows:
        cols = row.split()
        if len(cols) < 6:
            continue
        # Columns: Filesystem Size Used Avail Use% ... Mounted (last field).
        source, total, used, avail, pct_text = cols[:5]
        mount_point = cols[-1]
        if mount_point.startswith(pseudo_mount_prefixes) or source in pseudo_sources:
            continue
        try:
            pct = int(pct_text.rstrip("%"))
        except ValueError:
            continue
        entries.append(FileSystemInfo(
            mount=mount_point,
            total=total,
            used=used,
            avail=avail,
            used_pct=pct,
        ))
    return entries
 async def disk_usage(host: str) -> DiskResult:
    """Get disk usage for a host by running `df -h` over SSH.

    Host-validation and SSH failures are returned as ok=False results with
    error_type/error set. On success a warning is added for every filesystem
    whose usage is at or above 85%.
    """
    WARN_THRESHOLD = 85

    def _failed(error_type: str, message: str) -> DiskResult:
        # Shared shape of every failure result produced by this tool.
        return DiskResult(
            ok=False, checked_at=_now(), host=host,
            error_type=error_type, error=message,
        )
    try:
        cfg = validate_host("disk_usage", host)
    except ValueError as e:
        return _failed("parse_error", str(e))
    try:
        stdout, _ = await run_command(cfg, "df -h", use_sudo=cfg.needs_sudo)
    except SSHError as e:
        return _failed(e.error_type, str(e))
    filesystems = _parse_df(stdout)
    alerts = [
        f"{fs.mount} 사용률 {fs.used_pct}% — 임계값 {WARN_THRESHOLD}% 초과"
        for fs in filesystems
        if fs.used_pct >= WARN_THRESHOLD
    ]
    return DiskResult(
        ok=True,
        checked_at=_now(),
        host=host,
        filesystems=filesystems,
        warnings=alerts,
        raw=stdout.strip(),
    )
@@ -0,0 +1,107 @@
 """MCP Infra Server — thin wrapper over core/ functions.
 This file ONLY does:
 1. MCP tool registration (decorators)
 2. Parameter validation
 3. Call core/ functions
 4. Return results as JSON text
 All actual logic lives in src/core/.
 """
 from __future__ import annotations
 from mcp.server.fastmcp import FastMCP
 from .core.docker import docker_status, docker_logs
 from .core.health import service_health, VALID_SERVICES
 from .core.system import disk_usage
 from .core.network import tailscale_status
 from .core.models import ollama_models, mlx_models
 # Server instance named "infra"; the @mcp.tool() functions below register on it.
 # The instructions text is passed to FastMCP as-is, so it is left untranslated.
 mcp = FastMCP(
    "infra",
    instructions=(
        "인프라 모니터링 도구. GPU 서버, Mac mini, 회사 NAS의 "
        "Docker 상태, 서비스 헬스체크, 디스크 사용량, 네트워크, 모델 목록을 확인합니다."
    ),
 )
@mcp.tool()
async def check_docker_status(host: str) -> str:
    """Docker 컨테이너 상태 확인.
    Args:
        host: 대상 호스트 (gpu | nas-company)
    """
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await docker_status(host)
    return report.model_dump_json(indent=2)
@mcp.tool()
async def check_docker_logs(host: str, container: str, lines: int = 50) -> str:
    """Docker 컨테이너 최근 로그 조회.
    Args:
        host: 대상 호스트 (gpu | nas-company)
        container: 컨테이너 이름
        lines: 조회할 줄 수 (기본 50)
    """
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await docker_logs(host, container, lines)
    return report.model_dump_json(indent=2)
@mcp.tool()
async def check_service_health(service: str) -> str:
    """서비스 헬스체크. 서비스별 정상 판정 기준이 다름.
    Args:
        service: 서비스 이름 (document-server | mlx | mlx-proxy | nanoclaude | ollama-gpu | ollama-macmini)
    """
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await service_health(service)
    return report.model_dump_json(indent=2)
@mcp.tool()
async def check_disk_usage(host: str) -> str:
    """디스크 사용량 확인. 85% 초과 시 경고.
    Args:
        host: 대상 호스트 (gpu | macmini | nas-company)
    """
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await disk_usage(host)
    return report.model_dump_json(indent=2)
@mcp.tool()
async def check_tailscale() -> str:
    """Tailscale 네트워크 상태 확인. 모든 피어 연결 상태를 반환."""
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await tailscale_status()
    return report.model_dump_json(indent=2)
@mcp.tool()
async def check_ollama_models(host: str) -> str:
    """Ollama 설치 모델 목록 조회.
    Args:
        host: 대상 호스트 (gpu | macmini)
    """
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await ollama_models(host)
    return report.model_dump_json(indent=2)
@mcp.tool()
async def check_mlx_models() -> str:
    """Mac mini MLX 서버에 로드된 모델 목록 조회."""
    # NOTE(review): docstring kept verbatim; FastMCP presumably publishes it
    # as the tool description, so translating it would change runtime output.
    # Delegate to core and serialise the result model as indented JSON.
    report = await mlx_models()
    return report.model_dump_json(indent=2)
 def main() -> None:
    """Start the MCP server using the stdio transport."""
    mcp.run(transport="stdio")
 # Allow running this module directly as a script.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,14 @@
 # Packaging metadata for the MCP infra server.
 [project]
 name = "mcp-infra-server"
 version = "0.1.0"
 description = "MCP server for infrastructure monitoring — GPU server, Mac mini, NAS"
 requires-python = ">=3.11"
 dependencies = [
    "mcp>=1.27.0",            # FastMCP, imported by the server module
    "asyncssh>=2.22.0",       # SSH connection layer
    "pydantic>=2.12.0",       # result models
    "python-dotenv>=1.0.0",   # load_dotenv() in the host configuration
 ]
 # Extras for running the test suite.
 [project.optional-dependencies]
 dev = ["pytest", "pytest-asyncio"]
@@ -0,0 +1,3 @@
 #!/bin/bash
 # Launch the MCP infra server as a module from the gpu-services checkout.
 # Abort if the checkout is missing: otherwise `-m infra.mcp_server` would be
 # resolved against whatever directory the caller happened to start in.
 cd /Users/hyungiahn/Documents/code/gpu-services || exit 1
 exec /opt/homebrew/bin/python3.11 -m infra.mcp_server
@@ -0,0 +1,101 @@
 """Pydantic models for all tool results.
 Every tool returns a subclass of BaseResult.
 - ok=true + warnings: 성공이지만 주의 필요
 - ok=false + error_type + error: 실패
 - raw: 디버깅 전용 보조 필드 (상위 레이어에서 기본 숨김)
 - checked_at: 모든 결과에 포함 (수집 시점 ISO timestamp)
 """
 from __future__ import annotations
 from pydantic import BaseModel, Field
 class BaseResult(BaseModel):
    """Fields shared by every tool result."""
    ok: bool                       # overall success flag
    checked_at: str                # collection time; tools pass an ISO-8601 UTC string
    warnings: list[str] = Field(default_factory=list)  # non-fatal notices
    error_type: str | None = None  # "timeout" | "auth" | "command_failed" | "parse_error"
    error: str | None = None       # human-readable failure message
 # -- Docker ------------------------------------------------------------------
 class ContainerInfo(BaseModel):
    """One container row parsed from `docker ps -a`."""
    name: str
    status: str       # "running" | "exited" | "restarting" | ...
    uptime: str       # raw Status column text, e.g. "Up 3 days"
    ports: str        # published ports summary
    image: str
 class DockerStatusResult(BaseResult):
    """Result of the docker_status tool for one host."""
    host: str
    containers: list[ContainerInfo] = Field(default_factory=list)
    summary: str = ""   # e.g. "5/5 running" | "4/5 running, down: <names>"
    raw: str = ""       # stripped command output, for debugging
 class DockerLogsResult(BaseResult):
    """Result of the docker_logs tool for one container."""
    host: str
    container: str
    lines: int             # requested line count
    truncated: bool = False  # True when more output existed than was returned
    content: str = ""      # log text; docker_logs merges the container's stderr into it (2>&1)
    stderr: str = ""       # stderr returned by the SSH layer (separate from content)
    raw: str = ""          # stripped command output, for debugging
 # -- Health -------------------------------------------------------------------
 class HealthResult(BaseResult):
    """Result of one service health check."""
    service: str
    status: str = "unknown"   # "healthy" | "degraded" | "down" | "unknown"
    details: dict = Field(default_factory=dict)  # validator-specific facts (e.g. latency_ms)
    raw: str | None = None    # stripped response text when one was received
 # -- System -------------------------------------------------------------------
 class FileSystemInfo(BaseModel):
    """One filesystem row parsed from `df -h`."""
    mount: str      # mount point (last column)
    used_pct: int   # usage percentage with the "%" removed
    used: str       # human-readable sizes exactly as printed by df
    avail: str
    total: str
 class DiskResult(BaseResult):
    """Result of the disk_usage tool for one host."""
    host: str
    filesystems: list[FileSystemInfo] = Field(default_factory=list)
    raw: str = ""   # stripped `df -h` output, for debugging
 # -- Network ------------------------------------------------------------------
 class TailscalePeer(BaseModel):
    """One peer row parsed from `tailscale status`."""
    hostname: str
    ip: str
    status: str        # "active" | "idle" | "offline"
    os: str
 class TailscaleResult(BaseResult):
    """Result of the tailscale_status tool."""
    peers: list[TailscalePeer] = Field(default_factory=list)
    raw: str = ""   # stripped `tailscale status` output, for debugging
 # -- Models -------------------------------------------------------------------
 class ModelInfo(BaseModel):
    """One model entry reported by Ollama or the MLX server."""
    id: str
    size: str = ""       # size text as reported by the source; may be empty
    modified: str = ""   # modified/created text as reported by the source; may be empty
 class ModelsResult(BaseResult):
    """Result of the ollama_models / mlx_models tools."""
    host: str
    source: str           # "ollama" | "mlx"
    models: list[ModelInfo] = Field(default_factory=list)
    raw: str = ""         # stripped command output, for debugging
		`@@ -0,0 +1,2 @@`
							`# Copy to .env and fill in values`
							`NAS_COMPANY_PASSWORD=`