feat(infra): MCP 인프라 서버 통합 — 7개 도구 + core/ 분리

mcp-infra-server를 gpu-services/infra/로 통합.
core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능.
도구: docker_status, docker_logs, service_health, disk_usage,
tailscale_status, ollama_models, mlx_models.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-13 13:11:54 +09:00
parent 6b36063010
commit b1f9e87d6a
14 changed files with 996 additions and 0 deletions
View File
+113
View File
@@ -0,0 +1,113 @@
"""Docker status and logs tools."""
from __future__ import annotations
from datetime import datetime, timezone
from ..config import validate_host, HOSTS
from ..schemas import DockerStatusResult, DockerLogsResult, ContainerInfo
from .ssh import run_command, SSHError
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
async def docker_status(host: str) -> DockerStatusResult:
    """Report every Docker container on ``host`` (running or not) in structured form.

    Runs ``docker ps -a`` on the host and converts each output row into a
    ``ContainerInfo``. Host validation errors and SSH errors are returned as
    ``ok=False`` results instead of being raised.
    """
    try:
        cfg = validate_host("docker_status", host)
    except ValueError as exc:
        return DockerStatusResult(
            ok=False, checked_at=_now(), host=host,
            error_type="parse_error", error=str(exc),
        )

    # One row per container: name|status|ports|image.
    row_format = '{{.Names}}|{{.Status}}|{{.Ports}}|{{.Image}}'
    cmd = f"{cfg.docker_path} ps -a --format '{row_format}'"
    try:
        stdout, _ = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo)
    except SSHError as exc:
        return DockerStatusResult(
            ok=False, checked_at=_now(), host=host,
            error_type=exc.error_type, error=str(exc),
        )

    containers: list[ContainerInfo] = []
    for row in stdout.strip().splitlines():
        fields = row.split("|", 3)
        if len(fields) < 4:
            continue  # not a complete row
        name, status_text, ports, image = fields
        # Derive a coarse state from docker's human-readable status text;
        # "Restarting" wins over the Up/other distinction.
        if "Restarting" in status_text:
            state = "restarting"
        elif status_text.startswith("Up"):
            state = "running"
        else:
            state = "exited"
        containers.append(ContainerInfo(
            name=name, status=state, uptime=status_text, ports=ports, image=image,
        ))

    not_running = [c for c in containers if c.status != "running"]
    total = len(containers)
    running = total - len(not_running)

    summary = f"{running}/{total} running"
    if not_running:
        summary += f", down: {', '.join(c.name for c in not_running)}"

    warnings: list[str] = [
        f"{c.name} is {c.status}"
        for c in containers
        if c.status in ("restarting", "exited")
    ]

    return DockerStatusResult(
        ok=running == total,
        checked_at=_now(),
        host=host,
        containers=containers,
        summary=summary,
        warnings=warnings,
        raw=stdout.strip(),
    )
async def docker_logs(host: str, container: str, lines: int = 50) -> DockerLogsResult:
    """Fetch the most recent log lines of a container.

    Args:
        host: Host key, checked with ``validate_host``.
        container: Container name or ID. It is shell-quoted before being
            placed in the command line.
        lines: Maximum number of log lines to return.

    Returns:
        A ``DockerLogsResult``. ``truncated`` is True when more than ``lines``
        lines were available; ``content`` then holds the newest ``lines``
        lines. Host validation errors and SSH errors are returned with
        ``ok=False`` instead of being raised.
    """
    import shlex  # local import: only needed to quote the container argument

    try:
        cfg = validate_host("docker_logs", host)
    except ValueError as e:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type="parse_error", error=str(e),
        )

    # Request one extra line to detect truncation. The container argument is
    # caller-supplied and ends up in a shell command, so it must be quoted.
    cmd = f"{cfg.docker_path} logs --tail {lines + 1} {shlex.quote(container)} 2>&1"
    try:
        stdout, stderr = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo, timeout=15)
    except SSHError as e:
        return DockerLogsResult(
            ok=False, checked_at=_now(), host=host, container=container,
            lines=lines, error_type=e.error_type, error=str(e),
        )

    all_lines = stdout.strip().splitlines()
    truncated = len(all_lines) > lines
    if truncated:
        # `--tail` prints the newest lines last, so the surplus probe line is
        # the OLDEST one: drop from the front and keep the newest `lines`.
        # (Indexing from len() rather than with -lines keeps lines == 0 empty.)
        kept = all_lines[len(all_lines) - lines:]
    else:
        kept = all_lines

    return DockerLogsResult(
        ok=True,
        checked_at=_now(),
        host=host,
        container=container,
        lines=lines,
        truncated=truncated,
        content="\n".join(kept),
        stderr=stderr.strip() if stderr else "",
        raw=stdout.strip(),
    )
+201
View File
@@ -0,0 +1,201 @@
"""Service health checks with per-service validators."""
from __future__ import annotations
import json
import time
from datetime import datetime, timezone
from ..config import HOSTS
from ..schemas import HealthResult
from .ssh import run_command, SSHError
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
async def _validate_document_server() -> HealthResult:
    """Document Server: /health endpoint must return ok + database connected.

    Runs ``curl`` against ``http://localhost:8000/health`` on the ``gpu`` host.

    Returns:
        ``healthy`` when the body reports ``status == "ok"`` and
        ``database == "connected"``, ``degraded`` when either is missing, and
        ``down`` when the command fails or the body is not a JSON object.
    """
    cfg = HOSTS["gpu"]
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type=e.error_type, error=str(e),
        )

    try:
        data = json.loads(stdout)
        # .get() raises AttributeError when the body is valid JSON but not an
        # object (e.g. a list or a string); treat that as a parse failure too.
        status = data.get("status")
        database = data.get("database")
        version = data.get("version")
    except (json.JSONDecodeError, AttributeError) as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="document-server",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    db_ok = database == "connected"
    status_ok = status == "ok"
    healthy = status_ok and db_ok
    warnings = [] if db_ok else ["database disconnected"]

    return HealthResult(
        ok=healthy,
        checked_at=_now(),
        service="document-server",
        status="healthy" if healthy else "degraded",
        details={
            "status": status,
            "database": database,
            "version": version,
            "latency_ms": latency_ms,
        },
        warnings=warnings,
        raw=stdout.strip(),
    )
async def _validate_mlx() -> HealthResult:
    """MLX Server: /v1/models must return at least 1 model within 5s.

    Runs ``curl`` against ``http://localhost:8800/v1/models`` on the
    ``macmini`` host.

    Returns:
        ``healthy`` when at least one model is listed and the round trip took
        at most 5000 ms, ``degraded`` otherwise, and ``down`` when the command
        fails or the body cannot be parsed as the expected JSON structure.
    """
    cfg = HOSTS["macmini"]
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx",
            status="down", error_type=e.error_type, error=str(e),
        )

    try:
        data = json.loads(stdout)
        models = data.get("data", [])
        model_ids = [m.get("id", "unknown") for m in models]
    except (json.JSONDecodeError, AttributeError, TypeError) as e:
        # Malformed JSON, a non-object body, or a "data" value that is not a
        # list of objects: report it instead of letting the exception escape.
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    warnings = []
    if latency_ms > 5000:
        warnings.append(f"응답 지연 {latency_ms}ms (임계값 5000ms)")

    healthy = len(models) > 0 and latency_ms <= 5000
    return HealthResult(
        ok=healthy,
        checked_at=_now(),
        service="mlx",
        status="healthy" if healthy else "degraded",
        details={
            "model_count": len(models),
            "models": model_ids,
            "latency_ms": latency_ms,
        },
        warnings=warnings,
        raw=stdout.strip(),
    )
async def _validate_mlx_proxy() -> HealthResult:
    """MLX Proxy (:8801): must return models via proxy.

    Runs ``curl`` against ``http://localhost:8801/v1/models`` on the
    ``macmini`` host.

    Returns:
        ``healthy`` when at least one model is listed, otherwise ``down``
        (also when the command fails or the body cannot be parsed).
    """
    cfg = HOSTS["macmini"]
    try:
        t0 = time.monotonic()
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
        latency_ms = int((time.monotonic() - t0) * 1000)
    except SSHError as e:
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type=e.error_type, error=str(e),
        )

    try:
        data = json.loads(stdout)
        model_count = len(data.get("data", []))
    except (json.JSONDecodeError, AttributeError, TypeError) as e:
        # Malformed JSON, a non-object body, or an unsized "data" value:
        # report it instead of letting the exception escape.
        return HealthResult(
            ok=False, checked_at=_now(), service="mlx-proxy",
            status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    return HealthResult(
        ok=model_count > 0,
        checked_at=_now(),
        service="mlx-proxy",
        status="healthy" if model_count > 0 else "down",
        details={"model_count": model_count, "latency_ms": latency_ms},
        raw=stdout.strip(),
    )
async def _validate_nanoclaude() -> HealthResult:
    """NanoClaude: probe ``/health`` on port 8100 of the ``gpu`` host.

    Any successful ``curl`` counts as healthy; the response body (first 200
    characters) is included in ``details`` for reference.
    """
    cfg = HOSTS["gpu"]
    started = time.monotonic()
    try:
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8100/health")
    except SSHError as exc:
        return HealthResult(
            ok=False, checked_at=_now(), service="nanoclaude",
            status="down", error_type=exc.error_type, error=str(exc),
        )
    elapsed_ms = int((time.monotonic() - started) * 1000)

    body = stdout.strip()
    return HealthResult(
        ok=True,
        checked_at=_now(),
        service="nanoclaude",
        status="healthy",
        details={"latency_ms": elapsed_ms, "response": body[:200]},
        raw=body,
    )
async def _validate_ollama(host: str) -> HealthResult:
    """Ollama: `ollama list` must succeed and list at least one model.

    ``host`` is a key of ``HOSTS``; the reported service name is
    ``ollama-<host>``.
    """
    service_name = f"ollama-{host}"
    cfg = HOSTS[host]

    started = time.monotonic()
    try:
        stdout, _ = await run_command(cfg, "ollama list")
    except SSHError as exc:
        return HealthResult(
            ok=False, checked_at=_now(), service=service_name,
            status="down", error_type=exc.error_type, error=str(exc),
        )
    elapsed_ms = int((time.monotonic() - started) * 1000)

    # First line is the table header; every non-blank line after it is a model.
    body = stdout.strip()
    model_rows = [row for row in body.splitlines()[1:] if row.strip()]
    model_count = len(model_rows)
    has_models = model_count > 0

    return HealthResult(
        ok=has_models,
        checked_at=_now(),
        service=service_name,
        status="healthy" if has_models else "degraded",
        details={"model_count": model_count, "latency_ms": elapsed_ms},
        warnings=[] if has_models else ["모델 없음"],
        raw=body,
    )
# Validator registry: service name -> zero-argument coroutine function that
# performs the check. The Ollama entries bind the host through a lambda so
# every entry can be called the same way.
VALIDATORS: dict[str, object] = {
    "document-server": _validate_document_server,
    "mlx": _validate_mlx,
    "mlx-proxy": _validate_mlx_proxy,
    "nanoclaude": _validate_nanoclaude,
    "ollama-gpu": lambda: _validate_ollama("gpu"),
    "ollama-macmini": lambda: _validate_ollama("macmini"),
}

# Accepted service names, in registry order (used in the "unknown service" error).
VALID_SERVICES = list(VALIDATORS)
async def service_health(service: str) -> HealthResult:
    """Run the registered health check for ``service``.

    Unknown service names yield an ``ok=False`` result with status
    ``unknown`` that lists the accepted names.
    """
    validator = VALIDATORS.get(service)
    if validator:
        return await validator()

    allowed = ', '.join(VALID_SERVICES)
    return HealthResult(
        ok=False,
        checked_at=_now(),
        service=service,
        status="unknown",
        error_type="parse_error",
        error=f"알 수 없는 서비스: '{service}'. 허용: {allowed}",
    )
+97
View File
@@ -0,0 +1,97 @@
"""Model inventory tools — Ollama and MLX model listing."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from ..config import validate_host, HOSTS
from ..schemas import ModelsResult, ModelInfo
from .ssh import run_command, SSHError
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
def _parse_ollama_list(output: str) -> list[ModelInfo]:
    """Turn the table printed by `ollama list` into ``ModelInfo`` entries.

    The first line (header) is skipped. Each remaining row is split on
    whitespace: column 0 is the model id, columns 2-3 form the size (value and
    unit) and everything from column 4 on is the "modified" text. Rows with
    fewer than two columns are ignored.
    """
    data_rows = (row.split() for row in output.strip().splitlines()[1:])
    return [
        ModelInfo(
            id=cols[0],
            size=f"{cols[2]} {cols[3]}" if len(cols) > 3 else "",
            modified=" ".join(cols[4:]),  # empty string when there is no such column
        )
        for cols in data_rows
        if len(cols) >= 2
    ]
async def ollama_models(host: str) -> ModelsResult:
    """List the Ollama models installed on ``host``.

    Host validation errors and SSH errors are returned as ``ok=False``
    results instead of being raised.
    """
    def _failure(error_type: str, message: str) -> ModelsResult:
        # Shared shape of every error result produced by this tool.
        return ModelsResult(
            ok=False, checked_at=_now(), host=host, source="ollama",
            error_type=error_type, error=message,
        )

    try:
        cfg = validate_host("ollama_models", host)
    except ValueError as exc:
        return _failure("parse_error", str(exc))

    try:
        stdout, _ = await run_command(cfg, "ollama list")
    except SSHError as exc:
        return _failure(exc.error_type, str(exc))

    return ModelsResult(
        ok=True,
        checked_at=_now(),
        host=host,
        source="ollama",
        models=_parse_ollama_list(stdout),
        raw=stdout.strip(),
    )
async def mlx_models() -> ModelsResult:
    """List MLX models loaded on Mac mini.

    Queries ``http://localhost:8800/v1/models`` on the ``macmini`` host with
    ``curl`` and maps each entry of the response's ``data`` list to a
    ``ModelInfo`` (``id``, ``size``, and ``created`` as the modified value).

    Returns:
        A ``ModelsResult``; SSH failures and unparseable responses are
        returned with ``ok=False`` instead of being raised.
    """
    cfg = HOSTS["macmini"]
    try:
        stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
    except SSHError as e:
        return ModelsResult(
            ok=False, checked_at=_now(), host="macmini", source="mlx",
            error_type=e.error_type, error=str(e),
        )

    try:
        data = json.loads(stdout)
        model_list = data.get("data", [])
        models = [
            ModelInfo(
                id=m.get("id", "unknown"),
                size=str(m.get("size", "")),
                modified=str(m.get("created", "")),
            )
            for m in model_list
        ]
    except (json.JSONDecodeError, KeyError, AttributeError, TypeError) as e:
        # AttributeError/TypeError cover valid JSON with an unexpected shape
        # (non-object body, or "data" that is not a list of objects).
        return ModelsResult(
            ok=False, checked_at=_now(), host="macmini", source="mlx",
            error_type="parse_error", error=f"JSON 파싱 실패: {e}",
            raw=stdout.strip(),
        )

    return ModelsResult(
        ok=True,
        checked_at=_now(),
        host="macmini",
        source="mlx",
        models=models,
        raw=stdout.strip(),
    )
+83
View File
@@ -0,0 +1,83 @@
"""Network tools — Tailscale status."""
from __future__ import annotations
from datetime import datetime, timezone
from ..schemas import TailscaleResult, TailscalePeer
from .ssh import run_local, SSHError
# Absolute path of the Tailscale CLI inside the macOS app bundle; used to
# build the local `status` command.
TAILSCALE_BIN = "/Applications/Tailscale.app/Contents/MacOS/Tailscale"
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
def _parse_tailscale(output: str) -> list[TailscalePeer]:
    """Parse `tailscale status` output into a list of peers.

    Expected row layout: IP HOSTNAME USER@ OS STATUS_INFO
    Status examples: "-" (connected/active), "idle, tx ... rx ...",
    "offline, last seen ...". Rows with fewer than four columns, comment rows
    (first column starting with "#") and a header row (first column "IP") are
    skipped.
    """
    peers = []
    for row in output.strip().splitlines():
        cols = row.split()
        if len(cols) < 4:
            continue
        first = cols[0]
        if first == "IP" or first.startswith("#"):
            continue  # header-like line

        # Third column is the owning user and is not reported.
        ip, hostname, _user, os_name = cols[:4]
        status_text = " ".join(cols[4:])

        # Anything that is neither offline nor idle (including "-" and an
        # empty status) counts as active.
        if "offline" in status_text:
            status = "offline"
        elif "idle" in status_text:
            status = "idle"
        else:
            status = "active"

        peers.append(TailscalePeer(
            hostname=hostname,
            ip=ip,
            status=status,
            os=os_name,
        ))
    return peers
async def tailscale_status() -> TailscaleResult:
    """Get Tailscale network status (runs locally).

    Runs ``<TAILSCALE_BIN> status`` on the local machine, parses the peers and
    adds one warning per expected host that does not appear in the output.

    Returns:
        A ``TailscaleResult``; a failing local command is returned with
        ``ok=False`` instead of being raised.
    """
    try:
        stdout, _ = await run_local(f"{TAILSCALE_BIN} status")
    except SSHError as e:
        return TailscaleResult(
            ok=False, checked_at=_now(),
            error_type=e.error_type, error=str(e),
        )

    peers = _parse_tailscale(stdout)

    expected_hosts = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}
    found_hosts = {p.hostname for p in peers}
    # Sort the set difference: plain set iteration order differs between
    # runs, which made the warning list order unstable.
    warnings = [
        f"{h} not found in tailnet"
        for h in sorted(expected_hosts - found_hosts)
    ]

    return TailscaleResult(
        ok=True,
        checked_at=_now(),
        peers=peers,
        warnings=warnings,
        raw=stdout.strip(),
    )
+123
View File
@@ -0,0 +1,123 @@
"""SSH connection layer — asyncssh based.
Provides run_command() which handles:
- Key-based auth (GPU, Mac mini)
- Password auth + sudo (company NAS)
- Timeout / retry
- Structured error classification
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
import asyncssh
from ..config import HostConfig, SSH_TIMEOUT, CMD_TIMEOUT, MAX_RETRIES
class SSHError(Exception):
    """Command-execution failure tagged with a machine-readable category.

    ``error_type`` carries the category label; the human-readable text is the
    regular exception message (``str(err)``).
    """

    def __init__(self, error_type: str, message: str):
        super().__init__(message)
        self.error_type = error_type
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
async def _connect(host: HostConfig) -> asyncssh.SSHClientConnection:
    """Open an SSH connection to ``host`` using its configured auth method.

    Password auth is used only when ``host.auth`` is ``"password"`` and a
    password is set; otherwise no explicit credentials are passed and key
    auth is left to asyncssh's defaults (keys from ~/.ssh/).
    """
    auth_options: dict = {}
    if host.auth == "password" and host.password:
        auth_options = {
            "password": host.password,
            "client_keys": [],  # don't try key auth
        }

    return await asyncssh.connect(
        host=host.ip,
        username=host.user,
        connect_timeout=SSH_TIMEOUT,
        known_hosts=None,  # accept any host key (Tailscale internal network)
        **auth_options,
    )
async def run_command(
    host: HostConfig,
    command: str,
    timeout: int = CMD_TIMEOUT,
    use_sudo: bool = False,
) -> tuple[str, str]:
    """Run a command on a remote host and return ``(stdout, stderr)``.

    Args:
        host: Connection settings of the target host.
        command: Shell command line to execute remotely.
        timeout: Maximum number of seconds to wait for the command.
        use_sudo: When true (and the host needs sudo and has a password), the
            command is run through ``sudo -S`` with the password piped in on
            stdin.

    Returns:
        The command's stdout and stderr as strings (empty string when absent).

    Raises:
        SSHError: ``command_failed`` for a non-zero exit status, ``timeout``
            when the command exceeds ``timeout`` or the connection keeps
            failing after ``MAX_RETRIES`` retries, ``auth`` when SSH
            authentication is rejected.
    """
    import shlex  # local import: only needed to quote the sudo password

    if use_sudo and host.needs_sudo and host.password:
        # Pipe the password to sudo via stdin. It is shell-quoted so quotes or
        # other metacharacters in it cannot break out of the command line, and
        # printed with printf so its content is never interpreted by echo.
        command = f"printf '%s\\n' {shlex.quote(host.password)} | sudo -S {command}"

    last_error: Exception | None = None
    for attempt in range(1 + MAX_RETRIES):
        try:
            conn = await _connect(host)
            async with conn:
                result = await asyncio.wait_for(
                    conn.run(command, check=False),
                    timeout=timeout,
                )
        except asyncio.TimeoutError as e:
            # Must stay ahead of the OSError clause: on Python 3.11+
            # asyncio.TimeoutError is the builtin TimeoutError, an OSError.
            raise SSHError("timeout", f"명령 실행 시간 초과 ({timeout}초)") from e
        except asyncssh.PermissionDenied as e:
            raise SSHError("auth", f"SSH 인증 실패: {host.user}@{host.ip}") from e
        except (OSError, asyncssh.Error) as e:
            # Connection-level failure: retry after a short pause, then give up.
            last_error = e
            if attempt < MAX_RETRIES:
                await asyncio.sleep(1)
                continue
            raise SSHError("timeout", f"SSH 연결 실패: {host.ip}: {e}") from e

        stdout = result.stdout or ""
        stderr = result.stderr or ""
        if result.exit_status != 0:
            # Command ran but returned non-zero.
            # Filter out sudo password prompt from stderr.
            stderr_clean = "\n".join(
                line for line in stderr.splitlines()
                if "[sudo]" not in line and "Password:" not in line
            )
            raise SSHError(
                "command_failed",
                f"exit {result.exit_status}: {stderr_clean.strip() or stdout.strip()}"
            )
        return stdout, stderr

    # Only reachable when the loop body never ran or never returned/raised.
    raise SSHError("timeout", f"SSH 최대 재시도 초과: {host.ip}") from last_error
async def run_local(command: str, timeout: int = CMD_TIMEOUT) -> tuple[str, str]:
    """Run a shell command on the local machine and return ``(stdout, stderr)``.

    Args:
        command: Shell command line to execute.
        timeout: Maximum number of seconds to wait for the command.

    Returns:
        The decoded stdout and stderr of the command.

    Raises:
        SSHError: ``command_failed`` for a non-zero exit status, ``timeout``
            when the command does not finish within ``timeout`` seconds.
    """
    proc = await asyncio.create_subprocess_shell(
        command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            proc.communicate(), timeout=timeout
        )
    except asyncio.TimeoutError as e:
        # wait_for only cancels communicate(); without an explicit kill the
        # child would keep running after we report the timeout.
        try:
            proc.kill()
            await asyncio.wait_for(proc.wait(), timeout=1)
        except (ProcessLookupError, asyncio.TimeoutError):
            pass  # best-effort cleanup: already exited, or slow to be reaped
        raise SSHError("timeout", f"로컬 명령 시간 초과 ({timeout}초)") from e

    # errors="replace": tool output is not guaranteed to be valid UTF-8.
    stdout = stdout_bytes.decode(errors="replace") if stdout_bytes else ""
    stderr = stderr_bytes.decode(errors="replace") if stderr_bytes else ""

    if proc.returncode != 0:
        raise SSHError("command_failed", f"exit {proc.returncode}: {stderr.strip() or stdout.strip()}")
    return stdout, stderr
+79
View File
@@ -0,0 +1,79 @@
"""System tools — disk usage."""
from __future__ import annotations
from datetime import datetime, timezone
from ..config import validate_host
from ..schemas import DiskResult, FileSystemInfo
from .ssh import run_command, SSHError
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string (with UTC offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
def _parse_df(output: str) -> list[FileSystemInfo]:
    """Parse `df -h` output into structured filesystem info.

    The header line is skipped. A row is ignored when it has fewer than six
    columns, when it is mounted at or below a pseudo-filesystem root
    (/dev, /sys, /proc, /run, /snap), when its filesystem name is a known
    pseudo filesystem, or when its use-percentage column is not an integer.
    """
    pseudo_mount_roots = ("/dev", "/sys", "/proc", "/run", "/snap")
    pseudo_fs_names = ("tmpfs", "devtmpfs", "overlay", "shm", "none")

    filesystems = []
    for line in output.strip().splitlines()[1:]:  # skip header
        parts = line.split()
        if len(parts) < 6:
            continue
        # df -h columns: Filesystem Size Used Avail Use% ... Mounted
        mount = parts[-1]
        # Skip pseudo-filesystems. Match on a path boundary so that real
        # mounts which merely share a prefix (e.g. /devdata, /runner) are kept.
        if any(mount == root or mount.startswith(root + "/") for root in pseudo_mount_roots):
            continue
        if parts[0] in pseudo_fs_names:
            continue
        try:
            used_pct = int(parts[4].rstrip("%"))
        except ValueError:
            continue
        filesystems.append(FileSystemInfo(
            mount=mount,
            total=parts[1],
            used=parts[2],
            avail=parts[3],
            used_pct=used_pct,
        ))
    return filesystems
async def disk_usage(host: str) -> DiskResult:
    """Collect `df -h` disk usage for ``host`` as structured filesystem info.

    A warning is added for every filesystem whose usage is at or above 85%.
    Host validation errors and SSH errors are returned as ``ok=False``
    results instead of being raised.
    """
    try:
        cfg = validate_host("disk_usage", host)
    except ValueError as exc:
        return DiskResult(
            ok=False, checked_at=_now(), host=host,
            error_type="parse_error", error=str(exc),
        )

    try:
        stdout, _ = await run_command(cfg, "df -h", use_sudo=cfg.needs_sudo)
    except SSHError as exc:
        return DiskResult(
            ok=False, checked_at=_now(), host=host,
            error_type=exc.error_type, error=str(exc),
        )

    filesystems = _parse_df(stdout)

    warn_threshold = 85  # percent used at which a filesystem is flagged
    warnings = [
        f"{fs.mount} 사용률 {fs.used_pct}% — 임계값 {warn_threshold}% 초과"
        for fs in filesystems
        if fs.used_pct >= warn_threshold
    ]

    return DiskResult(
        ok=True,
        checked_at=_now(),
        host=host,
        filesystems=filesystems,
        warnings=warnings,
        raw=stdout.strip(),
    )