feat(infra): MCP 인프라 서버 통합 — 7개 도구 + core/ 분리
mcp-infra-server를 gpu-services/infra/로 통합. core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능. 도구: docker_status, docker_logs, service_health, disk_usage, tailscale_status, ollama_models, mlx_models. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,2 @@
|
|||||||
|
# Copy to .env and fill in values
|
||||||
|
NAS_COMPANY_PASSWORD=
|
||||||
@@ -0,0 +1,73 @@
|
|||||||
|
"""Host configuration and tool-host validation.
|
||||||
|
|
||||||
|
All host IPs are Tailscale IPs (except nas-company which also works via Tailscale).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class HostConfig:
|
||||||
|
ip: str
|
||||||
|
user: str
|
||||||
|
auth: str # "key" | "password"
|
||||||
|
docker_path: str = "docker"
|
||||||
|
needs_sudo: bool = False
|
||||||
|
password: str | None = None # only for auth="password"
|
||||||
|
|
||||||
|
|
||||||
|
HOSTS: dict[str, HostConfig] = {
|
||||||
|
"gpu": HostConfig(
|
||||||
|
ip="100.111.160.84",
|
||||||
|
user="hyungi",
|
||||||
|
auth="key",
|
||||||
|
),
|
||||||
|
"macmini": HostConfig(
|
||||||
|
ip="100.76.254.116",
|
||||||
|
user="hyungi",
|
||||||
|
auth="key",
|
||||||
|
),
|
||||||
|
"nas-company": HostConfig(
|
||||||
|
ip="100.71.132.52",
|
||||||
|
user="hyungi",
|
||||||
|
auth="password",
|
||||||
|
docker_path="/usr/local/bin/docker",
|
||||||
|
needs_sudo=True,
|
||||||
|
password=os.getenv("NAS_COMPANY_PASSWORD"),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Per-tool allowed hosts — invalid host → immediate error
|
||||||
|
TOOL_HOST_MAP: dict[str, list[str]] = {
|
||||||
|
"docker_status": ["gpu", "nas-company"],
|
||||||
|
"docker_logs": ["gpu", "nas-company"],
|
||||||
|
"disk_usage": ["gpu", "macmini", "nas-company"],
|
||||||
|
"ollama_models": ["gpu", "macmini"],
|
||||||
|
"mlx_models": ["macmini"],
|
||||||
|
}
|
||||||
|
|
||||||
|
# SSH timeouts
|
||||||
|
SSH_TIMEOUT = 5 # connection timeout (seconds)
|
||||||
|
CMD_TIMEOUT = 10 # command execution timeout (seconds)
|
||||||
|
MAX_RETRIES = 1 # retry once on failure
|
||||||
|
|
||||||
|
|
||||||
|
def validate_host(tool: str, host: str) -> HostConfig:
|
||||||
|
"""Validate host is allowed for tool and return config. Raises ValueError if invalid."""
|
||||||
|
allowed = TOOL_HOST_MAP.get(tool)
|
||||||
|
if allowed and host not in allowed:
|
||||||
|
raise ValueError(
|
||||||
|
f"'{host}'는 {tool}에서 지원하지 않습니다. 허용: {', '.join(allowed)}"
|
||||||
|
)
|
||||||
|
config = HOSTS.get(host)
|
||||||
|
if not config:
|
||||||
|
raise ValueError(
|
||||||
|
f"알 수 없는 호스트: '{host}'. 허용: {', '.join(HOSTS.keys())}"
|
||||||
|
)
|
||||||
|
return config
|
||||||
@@ -0,0 +1,113 @@
|
|||||||
|
"""Docker status and logs tools."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from ..config import validate_host, HOSTS
|
||||||
|
from ..schemas import DockerStatusResult, DockerLogsResult, ContainerInfo
|
||||||
|
from .ssh import run_command, SSHError
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
async def docker_status(host: str) -> DockerStatusResult:
|
||||||
|
"""List all Docker containers on a host with structured status."""
|
||||||
|
try:
|
||||||
|
cfg = validate_host("docker_status", host)
|
||||||
|
except ValueError as e:
|
||||||
|
return DockerStatusResult(
|
||||||
|
ok=False, checked_at=_now(), host=host,
|
||||||
|
error_type="parse_error", error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
docker = cfg.docker_path
|
||||||
|
fmt = '{{.Names}}|{{.Status}}|{{.Ports}}|{{.Image}}'
|
||||||
|
cmd = f"{docker} ps -a --format '{fmt}'"
|
||||||
|
|
||||||
|
try:
|
||||||
|
stdout, _ = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo)
|
||||||
|
except SSHError as e:
|
||||||
|
return DockerStatusResult(
|
||||||
|
ok=False, checked_at=_now(), host=host,
|
||||||
|
error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
containers: list[ContainerInfo] = []
|
||||||
|
for line in stdout.strip().splitlines():
|
||||||
|
parts = line.split("|", 3)
|
||||||
|
if len(parts) < 4:
|
||||||
|
continue
|
||||||
|
name, status_str, ports, image = parts
|
||||||
|
# Extract running state from status string
|
||||||
|
state = "running" if status_str.startswith("Up") else "exited"
|
||||||
|
if "Restarting" in status_str:
|
||||||
|
state = "restarting"
|
||||||
|
containers.append(ContainerInfo(
|
||||||
|
name=name, status=state, uptime=status_str, ports=ports, image=image,
|
||||||
|
))
|
||||||
|
|
||||||
|
running = sum(1 for c in containers if c.status == "running")
|
||||||
|
total = len(containers)
|
||||||
|
summary = f"{running}/{total} running"
|
||||||
|
if running < total:
|
||||||
|
non_running = [c.name for c in containers if c.status != "running"]
|
||||||
|
summary += f", down: {', '.join(non_running)}"
|
||||||
|
|
||||||
|
warnings: list[str] = []
|
||||||
|
for c in containers:
|
||||||
|
if c.status == "restarting":
|
||||||
|
warnings.append(f"{c.name} is restarting")
|
||||||
|
elif c.status == "exited":
|
||||||
|
warnings.append(f"{c.name} is exited")
|
||||||
|
|
||||||
|
return DockerStatusResult(
|
||||||
|
ok=running == total,
|
||||||
|
checked_at=_now(),
|
||||||
|
host=host,
|
||||||
|
containers=containers,
|
||||||
|
summary=summary,
|
||||||
|
warnings=warnings,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def docker_logs(host: str, container: str, lines: int = 50) -> DockerLogsResult:
|
||||||
|
"""Get recent logs from a container."""
|
||||||
|
try:
|
||||||
|
cfg = validate_host("docker_logs", host)
|
||||||
|
except ValueError as e:
|
||||||
|
return DockerLogsResult(
|
||||||
|
ok=False, checked_at=_now(), host=host, container=container,
|
||||||
|
lines=lines, error_type="parse_error", error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
docker = cfg.docker_path
|
||||||
|
# Request one extra line to detect truncation
|
||||||
|
cmd = f"{docker} logs --tail {lines + 1} {container} 2>&1"
|
||||||
|
|
||||||
|
try:
|
||||||
|
stdout, stderr = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo, timeout=15)
|
||||||
|
except SSHError as e:
|
||||||
|
return DockerLogsResult(
|
||||||
|
ok=False, checked_at=_now(), host=host, container=container,
|
||||||
|
lines=lines, error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
all_lines = stdout.strip().splitlines()
|
||||||
|
truncated = len(all_lines) > lines
|
||||||
|
content = "\n".join(all_lines[:lines]) if truncated else "\n".join(all_lines)
|
||||||
|
|
||||||
|
return DockerLogsResult(
|
||||||
|
ok=True,
|
||||||
|
checked_at=_now(),
|
||||||
|
host=host,
|
||||||
|
container=container,
|
||||||
|
lines=lines,
|
||||||
|
truncated=truncated,
|
||||||
|
content=content,
|
||||||
|
stderr=stderr.strip() if stderr else "",
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
@@ -0,0 +1,201 @@
|
|||||||
|
"""Service health checks with per-service validators."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from ..config import HOSTS
|
||||||
|
from ..schemas import HealthResult
|
||||||
|
from .ssh import run_command, SSHError
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
async def _validate_document_server() -> HealthResult:
|
||||||
|
"""Document Server: /health endpoint must return ok + database connected."""
|
||||||
|
cfg = HOSTS["gpu"]
|
||||||
|
try:
|
||||||
|
t0 = time.monotonic()
|
||||||
|
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
data = json.loads(stdout)
|
||||||
|
db_ok = data.get("database") == "connected"
|
||||||
|
status_ok = data.get("status") == "ok"
|
||||||
|
|
||||||
|
warnings = []
|
||||||
|
if not db_ok:
|
||||||
|
warnings.append("database disconnected")
|
||||||
|
|
||||||
|
return HealthResult(
|
||||||
|
ok=status_ok and db_ok,
|
||||||
|
checked_at=_now(),
|
||||||
|
service="document-server",
|
||||||
|
status="healthy" if (status_ok and db_ok) else "degraded",
|
||||||
|
details={
|
||||||
|
"status": data.get("status"),
|
||||||
|
"database": data.get("database"),
|
||||||
|
"version": data.get("version"),
|
||||||
|
"latency_ms": latency_ms,
|
||||||
|
},
|
||||||
|
warnings=warnings,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
except SSHError as e:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service="document-server",
|
||||||
|
status="down", error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
except (json.JSONDecodeError, KeyError) as e:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service="document-server",
|
||||||
|
status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
|
||||||
|
raw=stdout.strip() if 'stdout' in dir() else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _validate_mlx() -> HealthResult:
|
||||||
|
"""MLX Server: /v1/models must return at least 1 model within 5s."""
|
||||||
|
cfg = HOSTS["macmini"]
|
||||||
|
try:
|
||||||
|
t0 = time.monotonic()
|
||||||
|
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
data = json.loads(stdout)
|
||||||
|
models = data.get("data", [])
|
||||||
|
model_ids = [m.get("id", "unknown") for m in models]
|
||||||
|
|
||||||
|
warnings = []
|
||||||
|
if latency_ms > 5000:
|
||||||
|
warnings.append(f"응답 지연 {latency_ms}ms (임계값 5000ms)")
|
||||||
|
|
||||||
|
return HealthResult(
|
||||||
|
ok=len(models) > 0 and latency_ms <= 5000,
|
||||||
|
checked_at=_now(),
|
||||||
|
service="mlx",
|
||||||
|
status="healthy" if (len(models) > 0 and latency_ms <= 5000) else "degraded",
|
||||||
|
details={
|
||||||
|
"model_count": len(models),
|
||||||
|
"models": model_ids,
|
||||||
|
"latency_ms": latency_ms,
|
||||||
|
},
|
||||||
|
warnings=warnings,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
except SSHError as e:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service="mlx",
|
||||||
|
status="down", error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _validate_mlx_proxy() -> HealthResult:
|
||||||
|
"""MLX Proxy (:8801): must return models via proxy."""
|
||||||
|
cfg = HOSTS["macmini"]
|
||||||
|
try:
|
||||||
|
t0 = time.monotonic()
|
||||||
|
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
data = json.loads(stdout)
|
||||||
|
models = data.get("data", [])
|
||||||
|
|
||||||
|
return HealthResult(
|
||||||
|
ok=len(models) > 0,
|
||||||
|
checked_at=_now(),
|
||||||
|
service="mlx-proxy",
|
||||||
|
status="healthy" if models else "down",
|
||||||
|
details={"model_count": len(models), "latency_ms": latency_ms},
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
except SSHError as e:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service="mlx-proxy",
|
||||||
|
status="down", error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _validate_nanoclaude() -> HealthResult:
|
||||||
|
"""NanoClaude: /health on port 8100."""
|
||||||
|
cfg = HOSTS["gpu"]
|
||||||
|
try:
|
||||||
|
t0 = time.monotonic()
|
||||||
|
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8100/health")
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
return HealthResult(
|
||||||
|
ok=True,
|
||||||
|
checked_at=_now(),
|
||||||
|
service="nanoclaude",
|
||||||
|
status="healthy",
|
||||||
|
details={"latency_ms": latency_ms, "response": stdout.strip()[:200]},
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
except SSHError as e:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service="nanoclaude",
|
||||||
|
status="down", error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _validate_ollama(host: str) -> HealthResult:
|
||||||
|
"""Ollama: `ollama list` must succeed and return non-empty."""
|
||||||
|
service_name = f"ollama-{host}" if host != "gpu" else "ollama-gpu"
|
||||||
|
cfg = HOSTS[host]
|
||||||
|
try:
|
||||||
|
t0 = time.monotonic()
|
||||||
|
stdout, _ = await run_command(cfg, "ollama list")
|
||||||
|
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
|
lines = [l for l in stdout.strip().splitlines()[1:] if l.strip()] # skip header
|
||||||
|
model_count = len(lines)
|
||||||
|
|
||||||
|
warnings = []
|
||||||
|
if model_count == 0:
|
||||||
|
warnings.append("모델 없음")
|
||||||
|
|
||||||
|
return HealthResult(
|
||||||
|
ok=model_count > 0,
|
||||||
|
checked_at=_now(),
|
||||||
|
service=service_name,
|
||||||
|
status="healthy" if model_count > 0 else "degraded",
|
||||||
|
details={"model_count": model_count, "latency_ms": latency_ms},
|
||||||
|
warnings=warnings,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
except SSHError as e:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service=service_name,
|
||||||
|
status="down", error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Validator registry
|
||||||
|
VALIDATORS: dict[str, object] = {
|
||||||
|
"document-server": _validate_document_server,
|
||||||
|
"mlx": _validate_mlx,
|
||||||
|
"mlx-proxy": _validate_mlx_proxy,
|
||||||
|
"nanoclaude": _validate_nanoclaude,
|
||||||
|
"ollama-gpu": lambda: _validate_ollama("gpu"),
|
||||||
|
"ollama-macmini": lambda: _validate_ollama("macmini"),
|
||||||
|
}
|
||||||
|
|
||||||
|
VALID_SERVICES = list(VALIDATORS.keys())
|
||||||
|
|
||||||
|
|
||||||
|
async def service_health(service: str) -> HealthResult:
|
||||||
|
"""Run health check for a specific service."""
|
||||||
|
validator = VALIDATORS.get(service)
|
||||||
|
if not validator:
|
||||||
|
return HealthResult(
|
||||||
|
ok=False, checked_at=_now(), service=service,
|
||||||
|
status="unknown",
|
||||||
|
error_type="parse_error",
|
||||||
|
error=f"알 수 없는 서비스: '{service}'. 허용: {', '.join(VALID_SERVICES)}",
|
||||||
|
)
|
||||||
|
return await validator()
|
||||||
@@ -0,0 +1,97 @@
|
|||||||
|
"""Model inventory tools — Ollama and MLX model listing."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from ..config import validate_host, HOSTS
|
||||||
|
from ..schemas import ModelsResult, ModelInfo
|
||||||
|
from .ssh import run_command, SSHError
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_ollama_list(output: str) -> list[ModelInfo]:
|
||||||
|
"""Parse `ollama list` output."""
|
||||||
|
models = []
|
||||||
|
for line in output.strip().splitlines()[1:]: # skip header
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) < 2:
|
||||||
|
continue
|
||||||
|
model_id = parts[0]
|
||||||
|
# Remaining fields vary: ID, SIZE, MODIFIED
|
||||||
|
size = parts[2] + " " + parts[3] if len(parts) > 3 else ""
|
||||||
|
modified = " ".join(parts[4:]) if len(parts) > 4 else ""
|
||||||
|
models.append(ModelInfo(id=model_id, size=size, modified=modified))
|
||||||
|
return models
|
||||||
|
|
||||||
|
|
||||||
|
async def ollama_models(host: str) -> ModelsResult:
|
||||||
|
"""List Ollama models on a host."""
|
||||||
|
try:
|
||||||
|
cfg = validate_host("ollama_models", host)
|
||||||
|
except ValueError as e:
|
||||||
|
return ModelsResult(
|
||||||
|
ok=False, checked_at=_now(), host=host, source="ollama",
|
||||||
|
error_type="parse_error", error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
stdout, _ = await run_command(cfg, "ollama list")
|
||||||
|
except SSHError as e:
|
||||||
|
return ModelsResult(
|
||||||
|
ok=False, checked_at=_now(), host=host, source="ollama",
|
||||||
|
error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
models = _parse_ollama_list(stdout)
|
||||||
|
return ModelsResult(
|
||||||
|
ok=True,
|
||||||
|
checked_at=_now(),
|
||||||
|
host=host,
|
||||||
|
source="ollama",
|
||||||
|
models=models,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def mlx_models() -> ModelsResult:
|
||||||
|
"""List MLX models loaded on Mac mini."""
|
||||||
|
cfg = HOSTS["macmini"]
|
||||||
|
try:
|
||||||
|
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
|
||||||
|
except SSHError as e:
|
||||||
|
return ModelsResult(
|
||||||
|
ok=False, checked_at=_now(), host="macmini", source="mlx",
|
||||||
|
error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(stdout)
|
||||||
|
model_list = data.get("data", [])
|
||||||
|
models = [
|
||||||
|
ModelInfo(
|
||||||
|
id=m.get("id", "unknown"),
|
||||||
|
size=str(m.get("size", "")),
|
||||||
|
modified=str(m.get("created", "")),
|
||||||
|
)
|
||||||
|
for m in model_list
|
||||||
|
]
|
||||||
|
except (json.JSONDecodeError, KeyError) as e:
|
||||||
|
return ModelsResult(
|
||||||
|
ok=False, checked_at=_now(), host="macmini", source="mlx",
|
||||||
|
error_type="parse_error", error=f"JSON 파싱 실패: {e}",
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
|
|
||||||
|
return ModelsResult(
|
||||||
|
ok=True,
|
||||||
|
checked_at=_now(),
|
||||||
|
host="macmini",
|
||||||
|
source="mlx",
|
||||||
|
models=models,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
"""Network tools — Tailscale status."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from ..schemas import TailscaleResult, TailscalePeer
|
||||||
|
from .ssh import run_local, SSHError
|
||||||
|
|
||||||
|
TAILSCALE_BIN = "/Applications/Tailscale.app/Contents/MacOS/Tailscale"
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tailscale(output: str) -> list[TailscalePeer]:
|
||||||
|
"""Parse `tailscale status` output into peer list.
|
||||||
|
|
||||||
|
Format: IP HOSTNAME USER@ OS STATUS_INFO
|
||||||
|
Status examples: "-" (connected/active), "idle, tx ... rx ...", "offline, last seen ..."
|
||||||
|
"""
|
||||||
|
peers = []
|
||||||
|
for line in output.strip().splitlines():
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) < 4:
|
||||||
|
continue
|
||||||
|
# Skip header-like lines
|
||||||
|
if parts[0].startswith("#") or parts[0] == "IP":
|
||||||
|
continue
|
||||||
|
|
||||||
|
ip = parts[0]
|
||||||
|
hostname = parts[1]
|
||||||
|
# parts[2] = user@ (skip)
|
||||||
|
os_name = parts[3] if len(parts) > 3 else ""
|
||||||
|
|
||||||
|
# Remaining is status info
|
||||||
|
status_text = " ".join(parts[4:]) if len(parts) > 4 else ""
|
||||||
|
|
||||||
|
if "offline" in status_text:
|
||||||
|
status = "offline"
|
||||||
|
elif "idle" in status_text:
|
||||||
|
status = "idle"
|
||||||
|
elif status_text == "-" or status_text == "":
|
||||||
|
status = "active"
|
||||||
|
else:
|
||||||
|
status = "active"
|
||||||
|
|
||||||
|
peers.append(TailscalePeer(
|
||||||
|
hostname=hostname,
|
||||||
|
ip=ip,
|
||||||
|
status=status,
|
||||||
|
os=os_name,
|
||||||
|
))
|
||||||
|
return peers
|
||||||
|
|
||||||
|
|
||||||
|
async def tailscale_status() -> TailscaleResult:
|
||||||
|
"""Get Tailscale network status (runs locally)."""
|
||||||
|
try:
|
||||||
|
stdout, _ = await run_local(f"{TAILSCALE_BIN} status")
|
||||||
|
except SSHError as e:
|
||||||
|
return TailscaleResult(
|
||||||
|
ok=False, checked_at=_now(),
|
||||||
|
error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
peers = _parse_tailscale(stdout)
|
||||||
|
|
||||||
|
warnings = []
|
||||||
|
expected_hosts = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}
|
||||||
|
found_hosts = {p.hostname for p in peers}
|
||||||
|
missing = expected_hosts - found_hosts
|
||||||
|
for h in missing:
|
||||||
|
warnings.append(f"{h} not found in tailnet")
|
||||||
|
|
||||||
|
return TailscaleResult(
|
||||||
|
ok=True,
|
||||||
|
checked_at=_now(),
|
||||||
|
peers=peers,
|
||||||
|
warnings=warnings,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
"""SSH connection layer — asyncssh based.
|
||||||
|
|
||||||
|
Provides run_command() which handles:
|
||||||
|
- Key-based auth (GPU, Mac mini)
|
||||||
|
- Password auth + sudo (company NAS)
|
||||||
|
- Timeout / retry
|
||||||
|
- Structured error classification
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import asyncssh
|
||||||
|
|
||||||
|
from ..config import HostConfig, SSH_TIMEOUT, CMD_TIMEOUT, MAX_RETRIES
|
||||||
|
|
||||||
|
|
||||||
|
class SSHError(Exception):
|
||||||
|
"""Typed SSH error with error_type classification."""
|
||||||
|
|
||||||
|
def __init__(self, error_type: str, message: str):
|
||||||
|
self.error_type = error_type
|
||||||
|
super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
async def _connect(host: HostConfig) -> asyncssh.SSHClientConnection:
|
||||||
|
"""Open SSH connection with appropriate auth method."""
|
||||||
|
kwargs: dict = {
|
||||||
|
"host": host.ip,
|
||||||
|
"username": host.user,
|
||||||
|
"connect_timeout": SSH_TIMEOUT,
|
||||||
|
"known_hosts": None, # accept any host key (Tailscale internal network)
|
||||||
|
}
|
||||||
|
if host.auth == "password" and host.password:
|
||||||
|
kwargs["password"] = host.password
|
||||||
|
kwargs["client_keys"] = [] # don't try key auth
|
||||||
|
# key auth is the default (uses ~/.ssh/)
|
||||||
|
|
||||||
|
return await asyncssh.connect(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_command(
|
||||||
|
host: HostConfig,
|
||||||
|
command: str,
|
||||||
|
timeout: int = CMD_TIMEOUT,
|
||||||
|
use_sudo: bool = False,
|
||||||
|
) -> tuple[str, str]:
|
||||||
|
"""Run a command on remote host. Returns (stdout, stderr).
|
||||||
|
|
||||||
|
For NAS with sudo: wraps command with sudo using password via stdin.
|
||||||
|
Raises SSHError with typed error_type on failure.
|
||||||
|
"""
|
||||||
|
if use_sudo and host.needs_sudo and host.password:
|
||||||
|
# Pipe password to sudo via stdin
|
||||||
|
command = f"echo '{host.password}' | sudo -S {command}"
|
||||||
|
|
||||||
|
last_error: Exception | None = None
|
||||||
|
for attempt in range(1 + MAX_RETRIES):
|
||||||
|
try:
|
||||||
|
conn = await _connect(host)
|
||||||
|
async with conn:
|
||||||
|
result = await asyncio.wait_for(
|
||||||
|
conn.run(command, check=False),
|
||||||
|
timeout=timeout,
|
||||||
|
)
|
||||||
|
stdout = result.stdout or ""
|
||||||
|
stderr = result.stderr or ""
|
||||||
|
|
||||||
|
if result.exit_status != 0:
|
||||||
|
# Command ran but returned non-zero
|
||||||
|
# Filter out sudo password prompt from stderr
|
||||||
|
stderr_clean = "\n".join(
|
||||||
|
line for line in stderr.splitlines()
|
||||||
|
if "[sudo]" not in line and "Password:" not in line
|
||||||
|
)
|
||||||
|
raise SSHError(
|
||||||
|
"command_failed",
|
||||||
|
f"exit {result.exit_status}: {stderr_clean.strip() or stdout.strip()}"
|
||||||
|
)
|
||||||
|
return stdout, stderr
|
||||||
|
|
||||||
|
except SSHError:
|
||||||
|
raise
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
raise SSHError("timeout", f"명령 실행 시간 초과 ({timeout}초)")
|
||||||
|
except asyncssh.PermissionDenied:
|
||||||
|
raise SSHError("auth", f"SSH 인증 실패: {host.user}@{host.ip}")
|
||||||
|
except (OSError, asyncssh.Error) as e:
|
||||||
|
last_error = e
|
||||||
|
if attempt < MAX_RETRIES:
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
continue
|
||||||
|
raise SSHError("timeout", f"SSH 연결 실패: {host.ip} — {e}")
|
||||||
|
|
||||||
|
raise SSHError("timeout", f"SSH 최대 재시도 초과: {host.ip}")
|
||||||
|
|
||||||
|
|
||||||
|
async def run_local(command: str, timeout: int = CMD_TIMEOUT) -> tuple[str, str]:
|
||||||
|
"""Run a command locally. Returns (stdout, stderr)."""
|
||||||
|
try:
|
||||||
|
proc = await asyncio.create_subprocess_shell(
|
||||||
|
command,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
)
|
||||||
|
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||||
|
proc.communicate(), timeout=timeout
|
||||||
|
)
|
||||||
|
stdout = stdout_bytes.decode() if stdout_bytes else ""
|
||||||
|
stderr = stderr_bytes.decode() if stderr_bytes else ""
|
||||||
|
|
||||||
|
if proc.returncode != 0:
|
||||||
|
raise SSHError("command_failed", f"exit {proc.returncode}: {stderr.strip() or stdout.strip()}")
|
||||||
|
|
||||||
|
return stdout, stderr
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
raise SSHError("timeout", f"로컬 명령 시간 초과 ({timeout}초)")
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
"""System tools — disk usage."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from ..config import validate_host
|
||||||
|
from ..schemas import DiskResult, FileSystemInfo
|
||||||
|
from .ssh import run_command, SSHError
|
||||||
|
|
||||||
|
|
||||||
|
def _now() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_df(output: str) -> list[FileSystemInfo]:
|
||||||
|
"""Parse `df -h` output into structured filesystem info."""
|
||||||
|
filesystems = []
|
||||||
|
for line in output.strip().splitlines()[1:]: # skip header
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) < 6:
|
||||||
|
continue
|
||||||
|
# df -h columns: Filesystem Size Used Avail Use% Mounted
|
||||||
|
mount = parts[-1]
|
||||||
|
# Skip pseudo-filesystems
|
||||||
|
if mount.startswith(("/dev", "/sys", "/proc", "/run", "/snap")):
|
||||||
|
continue
|
||||||
|
if parts[0] in ("tmpfs", "devtmpfs", "overlay", "shm", "none"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
used_pct = int(parts[4].rstrip("%"))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
filesystems.append(FileSystemInfo(
|
||||||
|
mount=mount,
|
||||||
|
total=parts[1],
|
||||||
|
used=parts[2],
|
||||||
|
avail=parts[3],
|
||||||
|
used_pct=used_pct,
|
||||||
|
))
|
||||||
|
return filesystems
|
||||||
|
|
||||||
|
|
||||||
|
async def disk_usage(host: str) -> DiskResult:
|
||||||
|
"""Get disk usage for a host with structured filesystem info."""
|
||||||
|
try:
|
||||||
|
cfg = validate_host("disk_usage", host)
|
||||||
|
except ValueError as e:
|
||||||
|
return DiskResult(
|
||||||
|
ok=False, checked_at=_now(), host=host,
|
||||||
|
error_type="parse_error", error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
stdout, _ = await run_command(cfg, "df -h", use_sudo=cfg.needs_sudo)
|
||||||
|
except SSHError as e:
|
||||||
|
return DiskResult(
|
||||||
|
ok=False, checked_at=_now(), host=host,
|
||||||
|
error_type=e.error_type, error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
|
filesystems = _parse_df(stdout)
|
||||||
|
|
||||||
|
warnings = []
|
||||||
|
WARN_THRESHOLD = 85
|
||||||
|
for fs in filesystems:
|
||||||
|
if fs.used_pct >= WARN_THRESHOLD:
|
||||||
|
warnings.append(f"{fs.mount} 사용률 {fs.used_pct}% — 임계값 {WARN_THRESHOLD}% 초과")
|
||||||
|
|
||||||
|
return DiskResult(
|
||||||
|
ok=True,
|
||||||
|
checked_at=_now(),
|
||||||
|
host=host,
|
||||||
|
filesystems=filesystems,
|
||||||
|
warnings=warnings,
|
||||||
|
raw=stdout.strip(),
|
||||||
|
)
|
||||||
@@ -0,0 +1,107 @@
|
|||||||
|
"""MCP Infra Server — thin wrapper over core/ functions.
|
||||||
|
|
||||||
|
This file ONLY does:
|
||||||
|
1. MCP tool registration (decorators)
|
||||||
|
2. Parameter validation
|
||||||
|
3. Call core/ functions
|
||||||
|
4. Return results as JSON text
|
||||||
|
|
||||||
|
All actual logic lives in src/core/.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from mcp.server.fastmcp import FastMCP
|
||||||
|
|
||||||
|
from .core.docker import docker_status, docker_logs
|
||||||
|
from .core.health import service_health, VALID_SERVICES
|
||||||
|
from .core.system import disk_usage
|
||||||
|
from .core.network import tailscale_status
|
||||||
|
from .core.models import ollama_models, mlx_models
|
||||||
|
|
||||||
|
mcp = FastMCP(
|
||||||
|
"infra",
|
||||||
|
instructions=(
|
||||||
|
"인프라 모니터링 도구. GPU 서버, Mac mini, 회사 NAS의 "
|
||||||
|
"Docker 상태, 서비스 헬스체크, 디스크 사용량, 네트워크, 모델 목록을 확인합니다."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_docker_status(host: str) -> str:
|
||||||
|
"""Docker 컨테이너 상태 확인.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host: 대상 호스트 (gpu | nas-company)
|
||||||
|
"""
|
||||||
|
result = await docker_status(host)
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_docker_logs(host: str, container: str, lines: int = 50) -> str:
|
||||||
|
"""Docker 컨테이너 최근 로그 조회.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host: 대상 호스트 (gpu | nas-company)
|
||||||
|
container: 컨테이너 이름
|
||||||
|
lines: 조회할 줄 수 (기본 50)
|
||||||
|
"""
|
||||||
|
result = await docker_logs(host, container, lines)
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_service_health(service: str) -> str:
|
||||||
|
"""서비스 헬스체크. 서비스별 정상 판정 기준이 다름.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
service: 서비스 이름 (document-server | mlx | mlx-proxy | nanoclaude | ollama-gpu | ollama-macmini)
|
||||||
|
"""
|
||||||
|
result = await service_health(service)
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_disk_usage(host: str) -> str:
|
||||||
|
"""디스크 사용량 확인. 85% 초과 시 경고.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host: 대상 호스트 (gpu | macmini | nas-company)
|
||||||
|
"""
|
||||||
|
result = await disk_usage(host)
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_tailscale() -> str:
|
||||||
|
"""Tailscale 네트워크 상태 확인. 모든 피어 연결 상태를 반환."""
|
||||||
|
result = await tailscale_status()
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_ollama_models(host: str) -> str:
|
||||||
|
"""Ollama 설치 모델 목록 조회.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host: 대상 호스트 (gpu | macmini)
|
||||||
|
"""
|
||||||
|
result = await ollama_models(host)
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
@mcp.tool()
|
||||||
|
async def check_mlx_models() -> str:
|
||||||
|
"""Mac mini MLX 서버에 로드된 모델 목록 조회."""
|
||||||
|
result = await mlx_models()
|
||||||
|
return result.model_dump_json(indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
mcp.run(transport="stdio")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
[project]
|
||||||
|
name = "mcp-infra-server"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "MCP server for infrastructure monitoring — GPU server, Mac mini, NAS"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"mcp>=1.27.0",
|
||||||
|
"asyncssh>=2.22.0",
|
||||||
|
"pydantic>=2.12.0",
|
||||||
|
"python-dotenv>=1.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = ["pytest", "pytest-asyncio"]
|
||||||
Executable
+3
@@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
cd /Users/hyungiahn/Documents/code/gpu-services
|
||||||
|
exec /opt/homebrew/bin/python3.11 -m infra.mcp_server
|
||||||
@@ -0,0 +1,101 @@
|
|||||||
|
"""Pydantic models for all tool results.
|
||||||
|
|
||||||
|
Every tool returns a subclass of BaseResult.
|
||||||
|
- ok=true + warnings: 성공이지만 주의 필요
|
||||||
|
- ok=false + error_type + error: 실패
|
||||||
|
- raw: 디버깅 전용 보조 필드 (상위 레이어에서 기본 숨김)
|
||||||
|
- checked_at: 모든 결과에 포함 (수집 시점 ISO timestamp)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class BaseResult(BaseModel):
|
||||||
|
ok: bool
|
||||||
|
checked_at: str
|
||||||
|
warnings: list[str] = Field(default_factory=list)
|
||||||
|
error_type: str | None = None # "timeout" | "auth" | "command_failed" | "parse_error"
|
||||||
|
error: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# -- Docker ------------------------------------------------------------------
|
||||||
|
|
||||||
|
class ContainerInfo(BaseModel):
|
||||||
|
name: str
|
||||||
|
status: str # "running" | "exited" | "restarting" | ...
|
||||||
|
uptime: str # "Up 3 days" etc.
|
||||||
|
ports: str # published ports summary
|
||||||
|
image: str
|
||||||
|
|
||||||
|
|
||||||
|
class DockerStatusResult(BaseResult):
|
||||||
|
host: str
|
||||||
|
containers: list[ContainerInfo] = Field(default_factory=list)
|
||||||
|
summary: str = "" # "5/5 running" | "4/5 running, 1 exited"
|
||||||
|
raw: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class DockerLogsResult(BaseResult):
|
||||||
|
host: str
|
||||||
|
container: str
|
||||||
|
lines: int # requested line count
|
||||||
|
truncated: bool = False
|
||||||
|
content: str = "" # stdout
|
||||||
|
stderr: str = "" # stderr (separate)
|
||||||
|
raw: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# -- Health -------------------------------------------------------------------
|
||||||
|
|
||||||
|
class HealthResult(BaseResult):
|
||||||
|
service: str
|
||||||
|
status: str = "unknown" # "healthy" | "degraded" | "down"
|
||||||
|
details: dict = Field(default_factory=dict)
|
||||||
|
raw: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# -- System -------------------------------------------------------------------
|
||||||
|
|
||||||
|
class FileSystemInfo(BaseModel):
|
||||||
|
mount: str
|
||||||
|
used_pct: int
|
||||||
|
used: str
|
||||||
|
avail: str
|
||||||
|
total: str
|
||||||
|
|
||||||
|
|
||||||
|
class DiskResult(BaseResult):
|
||||||
|
host: str
|
||||||
|
filesystems: list[FileSystemInfo] = Field(default_factory=list)
|
||||||
|
raw: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# -- Network ------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TailscalePeer(BaseModel):
|
||||||
|
hostname: str
|
||||||
|
ip: str
|
||||||
|
status: str # "active" | "idle" | "offline"
|
||||||
|
os: str
|
||||||
|
|
||||||
|
|
||||||
|
class TailscaleResult(BaseResult):
|
||||||
|
peers: list[TailscalePeer] = Field(default_factory=list)
|
||||||
|
raw: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# -- Models -------------------------------------------------------------------
|
||||||
|
|
||||||
|
class ModelInfo(BaseModel):
|
||||||
|
id: str
|
||||||
|
size: str = ""
|
||||||
|
modified: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class ModelsResult(BaseResult):
|
||||||
|
host: str
|
||||||
|
source: str # "ollama" | "mlx"
|
||||||
|
models: list[ModelInfo] = Field(default_factory=list)
|
||||||
|
raw: str = ""
|
||||||
Reference in New Issue
Block a user