From b1f9e87d6a2dd6ee0d29f2ecdbc978e2a8b77aab Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Mon, 13 Apr 2026 13:11:54 +0900 Subject: [PATCH] =?UTF-8?q?feat(infra):=20MCP=20=EC=9D=B8=ED=94=84?= =?UTF-8?q?=EB=9D=BC=20=EC=84=9C=EB=B2=84=20=ED=86=B5=ED=95=A9=20=E2=80=94?= =?UTF-8?q?=207=EA=B0=9C=20=EB=8F=84=EA=B5=AC=20+=20core/=20=EB=B6=84?= =?UTF-8?q?=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mcp-infra-server를 gpu-services/infra/로 통합. core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능. 도구: docker_status, docker_logs, service_health, disk_usage, tailscale_status, ollama_models, mlx_models. Co-Authored-By: Claude Opus 4.6 (1M context) --- infra/.env.example | 2 + infra/__init__.py | 0 infra/config.py | 73 +++++++++++++++ infra/core/__init__.py | 0 infra/core/docker.py | 113 +++++++++++++++++++++++ infra/core/health.py | 201 +++++++++++++++++++++++++++++++++++++++++ infra/core/models.py | 97 ++++++++++++++++++++ infra/core/network.py | 83 +++++++++++++++++ infra/core/ssh.py | 123 +++++++++++++++++++++++++ infra/core/system.py | 79 ++++++++++++++++ infra/mcp_server.py | 107 ++++++++++++++++++++++ infra/pyproject.toml | 14 +++ infra/run.sh | 3 + infra/schemas.py | 101 +++++++++++++++++++++ 14 files changed, 996 insertions(+) create mode 100644 infra/.env.example create mode 100644 infra/__init__.py create mode 100644 infra/config.py create mode 100644 infra/core/__init__.py create mode 100644 infra/core/docker.py create mode 100644 infra/core/health.py create mode 100644 infra/core/models.py create mode 100644 infra/core/network.py create mode 100644 infra/core/ssh.py create mode 100644 infra/core/system.py create mode 100644 infra/mcp_server.py create mode 100644 infra/pyproject.toml create mode 100755 infra/run.sh create mode 100644 infra/schemas.py diff --git a/infra/.env.example b/infra/.env.example new file mode 100644 index 0000000..d3faeda --- /dev/null +++ b/infra/.env.example @@ -0,0 +1,2 @@ +# Copy to .env 
def validate_host(tool: str, host: str) -> HostConfig:
    """Return the configuration of ``host`` if ``tool`` may target it.

    Args:
        tool: Tool name used as the key into ``TOOL_HOST_MAP``.
        host: Host alias used as the key into ``HOSTS``.

    Returns:
        The ``HostConfig`` registered for ``host``.

    Raises:
        ValueError: If ``tool`` has an allow-list that does not contain
            ``host``, or if ``host`` is not a key of ``HOSTS``.
    """
    allowed = TOOL_HOST_MAP.get(tool)
    # Compare against None rather than truthiness: an empty allow-list means
    # "no host is allowed", not "unrestricted". Tools without an entry in
    # TOOL_HOST_MAP stay unrestricted, as before.
    if allowed is not None and host not in allowed:
        raise ValueError(
            f"'{host}'는 {tool}에서 지원하지 않습니다. 허용: {', '.join(allowed)}"
        )

    config = HOSTS.get(host)
    if config is None:
        raise ValueError(
            f"알 수 없는 호스트: '{host}'. 허용: {', '.join(HOSTS.keys())}"
        )
    return config
async def docker_logs(host: str, container: str, lines: int = 50) -> DockerLogsResult:
    """Return the most recent log lines of a container on a remote host.

    Args:
        host: Host alias; validated against the ``docker_logs`` allow-list.
        container: Container name or ID passed to ``docker logs``.
        lines: Number of trailing log lines to return (must be >= 1).

    Returns:
        A ``DockerLogsResult``. Failures never raise: an invalid host or
        ``lines`` value yields ``ok=False`` with ``error_type="parse_error"``,
        and an ``SSHError`` yields ``ok=False`` with that error's type.
    """
    import shlex  # local import: only needed to quote the container name

    try:
        cfg = validate_host("docker_logs", host)
    except ValueError as e:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type="parse_error", error=str(e),
        )

    # A non-positive count would make the tail slice below return the wrong
    # range (``[-0:]`` is the whole list), so reject it up front.
    if lines < 1:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type="parse_error",
            error=f"lines는 1 이상이어야 합니다 (입력: {lines})",
        )

    docker = cfg.docker_path
    # Request one extra line to detect truncation. The container name comes
    # from the tool caller, so quote it before it reaches the remote shell.
    cmd = f"{docker} logs --tail {lines + 1} {shlex.quote(container)} 2>&1"

    try:
        stdout, stderr = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo, timeout=15)
    except SSHError as e:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type=e.error_type, error=str(e),
        )

    all_lines = stdout.strip().splitlines()
    truncated = len(all_lines) > lines
    # ``--tail`` returns the newest lines, oldest first. When the probe line
    # is present it is the OLDEST one, so keep the last ``lines`` entries
    # (slicing from the front would drop the most recent log line).
    content = "\n".join(all_lines[-lines:] if truncated else all_lines)

    return DockerLogsResult(
        ok=True,
        checked_at=_now(),
        host=host,
        container=container,
        lines=lines,
        truncated=truncated,
        content=content,
        stderr=stderr.strip() if stderr else "",
        raw=stdout.strip(),
    )
async def _validate_document_server() -> HealthResult:
    """Check the Document Server through its ``/health`` endpoint on the gpu host.

    The service is healthy only when the JSON response has
    ``status == "ok"`` and ``database == "connected"``.

    Returns:
        A ``HealthResult`` for service ``"document-server"``:
        ``status="down"`` when the command fails or the response is not a
        JSON object, ``"degraded"`` when either check fails, ``"healthy"``
        otherwise.
    """
    cfg = HOSTS["gpu"]

    t0 = time.monotonic()
    try:
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type=e.error_type, error=str(e),
        )
    # Measured around the whole remote call, so it includes SSH setup time.
    latency_ms = int((time.monotonic() - t0) * 1000)

    # Parse separately from the remote call so ``stdout`` is always bound in
    # the error path. json.JSONDecodeError is a ValueError subclass.
    try:
        data = json.loads(stdout)
        if not isinstance(data, dict):
            raise ValueError(f"expected JSON object, got {type(data).__name__}")
    except ValueError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    db_ok = data.get("database") == "connected"
    status_ok = data.get("status") == "ok"
    healthy = status_ok and db_ok

    warnings = []
    if not db_ok:
        warnings.append("database disconnected")

    return HealthResult(
        ok=healthy,
        checked_at=_now(),
        service="document-server",
        status="healthy" if healthy else "degraded",
        details={
            "status": data.get("status"),
            "database": data.get("database"),
            "version": data.get("version"),
            "latency_ms": latency_ms,
        },
        warnings=warnings,
        raw=stdout.strip(),
    )
async def _validate_mlx_proxy() -> HealthResult:
    """Check the MLX proxy (port 8801) on the macmini host via ``/v1/models``.

    Returns:
        A ``HealthResult`` for service ``"mlx-proxy"``: ``"healthy"`` when
        the response lists at least one model, ``"down"`` otherwise. A
        failed command or an unparseable response is reported as
        ``ok=False`` with the matching ``error_type`` instead of raising.
    """
    cfg = HOSTS["macmini"]

    t0 = time.monotonic()
    try:
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type=e.error_type, error=str(e),
        )
    latency_ms = int((time.monotonic() - t0) * 1000)

    # A malformed body must become a structured parse_error, not an unhandled
    # exception escaping the tool. ValueError covers json.JSONDecodeError;
    # AttributeError/TypeError cover a payload that is not the expected
    # ``{"data": [...]}`` object.
    try:
        data = json.loads(stdout)
        model_count = len(data.get("data") or [])
    except (ValueError, AttributeError, TypeError) as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    return HealthResult(
        ok=model_count > 0,
        checked_at=_now(),
        service="mlx-proxy",
        status="healthy" if model_count else "down",
        details={"model_count": model_count, "latency_ms": latency_ms},
        raw=stdout.strip(),
    )
# Validator registry: maps a service name to a zero-argument callable whose
# result is awaited by service_health(). The ollama entries are lambdas that
# bind the host alias, so every value can be invoked the same way.
VALIDATORS: dict[str, object] = {
    "document-server": _validate_document_server,
    "mlx": _validate_mlx,
    "mlx-proxy": _validate_mlx_proxy,
    "nanoclaude": _validate_nanoclaude,
    "ollama-gpu": lambda: _validate_ollama("gpu"),
    "ollama-macmini": lambda: _validate_ollama("macmini"),
}

# Service names in registry order; used in the "unknown service" error text.
VALID_SERVICES = list(VALIDATORS.keys())
async def mlx_models() -> ModelsResult:
    """List the models reported by the MLX server (port 8800) on the macmini host.

    Returns:
        A ``ModelsResult`` with ``source="mlx"``. A failed remote command
        yields ``ok=False`` with the ``SSHError`` type; a response that is
        not the expected ``{"data": [{...}, ...]}`` JSON yields ``ok=False``
        with ``error_type="parse_error"`` and the raw body attached.
    """
    cfg = HOSTS["macmini"]
    try:
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
    except SSHError as e:
        return ModelsResult(
            ok=False, checked_at=_now(), host="macmini", source="mlx",
            error_type=e.error_type, error=str(e),
        )

    try:
        data = json.loads(stdout)
        models = [
            ModelInfo(
                id=m.get("id", "unknown"),
                size=str(m.get("size", "")),
                modified=str(m.get("created", "")),
            )
            for m in data.get("data", [])
        ]
    # ValueError covers json.JSONDecodeError. AttributeError/TypeError cover
    # a payload of the wrong shape (top level or an entry is not an object,
    # or "data" is null), which previously escaped as an unhandled exception.
    except (ValueError, AttributeError, TypeError) as e:
        return ModelsResult(
            ok=False, checked_at=_now(), host="macmini", source="mlx",
            error_type="parse_error", error=f"JSON 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    return ModelsResult(
        ok=True,
        checked_at=_now(),
        host="macmini",
        source="mlx",
        models=models,
        raw=stdout.strip(),
    )
async def tailscale_status() -> TailscaleResult:
    """Get Tailscale network status by running the local Tailscale CLI.

    Returns:
        A ``TailscaleResult`` with the parsed peers. ``ok`` is True whenever
        the CLI call succeeds; expected hosts missing from the peer list are
        reported through ``warnings`` (in alphabetical order), not as a
        failure. A failed CLI call yields ``ok=False`` with the error type.
    """
    try:
        stdout, _ = await run_local(f"{TAILSCALE_BIN} status")
    except SSHError as e:
        return TailscaleResult(
            ok=False, checked_at=_now(),
            error_type=e.error_type, error=str(e),
        )

    peers = _parse_tailscale(stdout)

    expected_hosts = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}
    found_hosts = {p.hostname for p in peers}
    # Sort the set difference: set iteration order for strings varies between
    # processes, which made the warning order nondeterministic.
    warnings = [
        f"{h} not found in tailnet" for h in sorted(expected_hosts - found_hosts)
    ]

    return TailscaleResult(
        ok=True,
        checked_at=_now(),
        peers=peers,
        warnings=warnings,
        raw=stdout.strip(),
    )
async def run_command(
    host: HostConfig,
    command: str,
    timeout: int = CMD_TIMEOUT,
    use_sudo: bool = False,
) -> tuple[str, str]:
    """Run a command on a remote host over SSH and return ``(stdout, stderr)``.

    Args:
        host: Connection settings of the target host.
        command: Shell command line executed on the remote host.
        timeout: Seconds allowed for the command itself (not the connect).
        use_sudo: Wrap the command in ``sudo -S`` when the host also has
            ``needs_sudo`` set and a password configured.

    Returns:
        The command's stdout and stderr as strings (empty string when unset).

    Raises:
        SSHError: ``"command_failed"`` on a non-zero exit status,
            ``"timeout"`` when the command exceeds ``timeout`` or the
            connection still fails after ``MAX_RETRIES`` retries, and
            ``"auth"`` when authentication is rejected.
    """
    import shlex  # local import: only needed to quote the sudo password

    if use_sudo and host.needs_sudo and host.password:
        # Feed the password to sudo on stdin. shlex.quote keeps passwords
        # containing quotes or shell metacharacters intact, and printf avoids
        # echo's backslash/option handling. ``-p ''`` silences the prompt,
        # which otherwise lands in stdout when the command ends in ``2>&1``.
        # NOTE(review): the password is still part of the remote command
        # line; passing it via the SSH channel's stdin would avoid that.
        command = (
            f"printf '%s\\n' {shlex.quote(host.password)} | sudo -S -p '' {command}"
        )

    for attempt in range(1 + MAX_RETRIES):
        try:
            conn = await _connect(host)
            async with conn:
                result = await asyncio.wait_for(
                    conn.run(command, check=False),
                    timeout=timeout,
                )
                stdout = result.stdout or ""
                stderr = result.stderr or ""

                if result.exit_status != 0:
                    # Command ran but returned non-zero. Drop any sudo
                    # password-prompt lines so they do not mask the error.
                    stderr_clean = "\n".join(
                        line for line in stderr.splitlines()
                        if "[sudo]" not in line and "Password:" not in line
                    )
                    raise SSHError(
                        "command_failed",
                        f"exit {result.exit_status}: {stderr_clean.strip() or stdout.strip()}"
                    )
                return stdout, stderr

        except SSHError:
            # Already classified above; never retry a command that ran.
            raise
        except asyncio.TimeoutError as e:
            raise SSHError("timeout", f"명령 실행 시간 초과 ({timeout}초)") from e
        except asyncssh.PermissionDenied as e:
            raise SSHError("auth", f"SSH 인증 실패: {host.user}@{host.ip}") from e
        except (OSError, asyncssh.Error) as e:
            # Transport-level failure: retry after a short pause, then give up.
            if attempt < MAX_RETRIES:
                await asyncio.sleep(1)
                continue
            raise SSHError("timeout", f"SSH 연결 실패: {host.ip} — {e}") from e

    # Not reachable in practice (the last attempt always returns or raises);
    # kept so every path visibly ends in a return or raise.
    raise SSHError("timeout", f"SSH 최대 재시도 초과: {host.ip}")
Returns (stdout, stderr).""" + try: + proc = await asyncio.create_subprocess_shell( + command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout_bytes, stderr_bytes = await asyncio.wait_for( + proc.communicate(), timeout=timeout + ) + stdout = stdout_bytes.decode() if stdout_bytes else "" + stderr = stderr_bytes.decode() if stderr_bytes else "" + + if proc.returncode != 0: + raise SSHError("command_failed", f"exit {proc.returncode}: {stderr.strip() or stdout.strip()}") + + return stdout, stderr + except asyncio.TimeoutError: + raise SSHError("timeout", f"로컬 명령 시간 초과 ({timeout}초)") diff --git a/infra/core/system.py b/infra/core/system.py new file mode 100644 index 0000000..26ecc45 --- /dev/null +++ b/infra/core/system.py @@ -0,0 +1,79 @@ +"""System tools — disk usage.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +from ..config import validate_host +from ..schemas import DiskResult, FileSystemInfo +from .ssh import run_command, SSHError + + +def _now() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _parse_df(output: str) -> list[FileSystemInfo]: + """Parse `df -h` output into structured filesystem info.""" + filesystems = [] + for line in output.strip().splitlines()[1:]: # skip header + parts = line.split() + if len(parts) < 6: + continue + # df -h columns: Filesystem Size Used Avail Use% Mounted + mount = parts[-1] + # Skip pseudo-filesystems + if mount.startswith(("/dev", "/sys", "/proc", "/run", "/snap")): + continue + if parts[0] in ("tmpfs", "devtmpfs", "overlay", "shm", "none"): + continue + + try: + used_pct = int(parts[4].rstrip("%")) + except ValueError: + continue + + filesystems.append(FileSystemInfo( + mount=mount, + total=parts[1], + used=parts[2], + avail=parts[3], + used_pct=used_pct, + )) + return filesystems + + +async def disk_usage(host: str) -> DiskResult: + """Get disk usage for a host with structured filesystem info.""" + try: + cfg = 
# NOTE: the docstring below is kept verbatim because the MCP framework
# exposes it to clients as the tool description.
@mcp.tool()
async def check_docker_status(host: str) -> str:
    """Docker 컨테이너 상태 확인.

    Args:
        host: 대상 호스트 (gpu | nas-company)
    """
    # Delegate to the core function and serialise its result model as
    # indented JSON text.
    return (await docker_status(host)).model_dump_json(indent=2)
# NOTE: the docstring below is kept verbatim because the MCP framework
# exposes it to clients as the tool description.
@mcp.tool()
async def check_service_health(service: str) -> str:
    """서비스 헬스체크. 서비스별 정상 판정 기준이 다름.

    Args:
        service: 서비스 이름 (document-server | mlx | mlx-proxy | nanoclaude | ollama-gpu | ollama-macmini)
    """
    # Delegate to the core function and serialise its result model as
    # indented JSON text.
    return (await service_health(service)).model_dump_json(indent=2)
# Common envelope shared by every tool result model in this module.
# Field order and defaults are part of the serialised output.
class BaseResult(BaseModel):
    ok: bool  # overall success flag; the condition is decided by each tool
    checked_at: str  # collection time as an ISO timestamp string
    warnings: list[str] = Field(default_factory=list)  # notes that accompany a result without failing it
    error_type: str | None = None  # "timeout" | "auth" | "command_failed" | "parse_error"
    error: str | None = None  # human-readable failure message; None when no error is reported
# Result of the docker_logs tool: the log text plus the request parameters.
class DockerLogsResult(BaseResult):
    host: str  # host alias the logs were requested from
    container: str  # container name as passed by the caller
    lines: int  # requested line count
    truncated: bool = False  # True when the container had more output than `lines`
    content: str = ""  # stdout
    # stderr (separate). NOTE(review): docker_logs appends `2>&1` to its
    # command, so the container's stderr output arrives in `content` and
    # this field is presumably empty in practice — confirm.
    stderr: str = ""
    raw: str = ""  # unprocessed command output, kept for debugging