feat(infra): MCP 인프라 서버 통합 — 7개 도구 + core/ 분리
mcp-infra-server를 gpu-services/infra/로 통합. core/ 순수 로직은 Agent/NanoClaude에서도 직접 import 가능. 도구: docker_status, docker_logs, service_health, disk_usage, tailscale_status, ollama_models, mlx_models. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,113 @@
|
||||
"""Docker status and logs tools."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ..config import validate_host, HOSTS
|
||||
from ..schemas import DockerStatusResult, DockerLogsResult, ContainerInfo
|
||||
from .ssh import run_command, SSHError
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
async def docker_status(host: str) -> DockerStatusResult:
|
||||
"""List all Docker containers on a host with structured status."""
|
||||
try:
|
||||
cfg = validate_host("docker_status", host)
|
||||
except ValueError as e:
|
||||
return DockerStatusResult(
|
||||
ok=False, checked_at=_now(), host=host,
|
||||
error_type="parse_error", error=str(e),
|
||||
)
|
||||
|
||||
docker = cfg.docker_path
|
||||
fmt = '{{.Names}}|{{.Status}}|{{.Ports}}|{{.Image}}'
|
||||
cmd = f"{docker} ps -a --format '{fmt}'"
|
||||
|
||||
try:
|
||||
stdout, _ = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo)
|
||||
except SSHError as e:
|
||||
return DockerStatusResult(
|
||||
ok=False, checked_at=_now(), host=host,
|
||||
error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
containers: list[ContainerInfo] = []
|
||||
for line in stdout.strip().splitlines():
|
||||
parts = line.split("|", 3)
|
||||
if len(parts) < 4:
|
||||
continue
|
||||
name, status_str, ports, image = parts
|
||||
# Extract running state from status string
|
||||
state = "running" if status_str.startswith("Up") else "exited"
|
||||
if "Restarting" in status_str:
|
||||
state = "restarting"
|
||||
containers.append(ContainerInfo(
|
||||
name=name, status=state, uptime=status_str, ports=ports, image=image,
|
||||
))
|
||||
|
||||
running = sum(1 for c in containers if c.status == "running")
|
||||
total = len(containers)
|
||||
summary = f"{running}/{total} running"
|
||||
if running < total:
|
||||
non_running = [c.name for c in containers if c.status != "running"]
|
||||
summary += f", down: {', '.join(non_running)}"
|
||||
|
||||
warnings: list[str] = []
|
||||
for c in containers:
|
||||
if c.status == "restarting":
|
||||
warnings.append(f"{c.name} is restarting")
|
||||
elif c.status == "exited":
|
||||
warnings.append(f"{c.name} is exited")
|
||||
|
||||
return DockerStatusResult(
|
||||
ok=running == total,
|
||||
checked_at=_now(),
|
||||
host=host,
|
||||
containers=containers,
|
||||
summary=summary,
|
||||
warnings=warnings,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
|
||||
|
||||
async def docker_logs(host: str, container: str, lines: int = 50) -> DockerLogsResult:
|
||||
"""Get recent logs from a container."""
|
||||
try:
|
||||
cfg = validate_host("docker_logs", host)
|
||||
except ValueError as e:
|
||||
return DockerLogsResult(
|
||||
ok=False, checked_at=_now(), host=host, container=container,
|
||||
lines=lines, error_type="parse_error", error=str(e),
|
||||
)
|
||||
|
||||
docker = cfg.docker_path
|
||||
# Request one extra line to detect truncation
|
||||
cmd = f"{docker} logs --tail {lines + 1} {container} 2>&1"
|
||||
|
||||
try:
|
||||
stdout, stderr = await run_command(cfg, cmd, use_sudo=cfg.needs_sudo, timeout=15)
|
||||
except SSHError as e:
|
||||
return DockerLogsResult(
|
||||
ok=False, checked_at=_now(), host=host, container=container,
|
||||
lines=lines, error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
all_lines = stdout.strip().splitlines()
|
||||
truncated = len(all_lines) > lines
|
||||
content = "\n".join(all_lines[:lines]) if truncated else "\n".join(all_lines)
|
||||
|
||||
return DockerLogsResult(
|
||||
ok=True,
|
||||
checked_at=_now(),
|
||||
host=host,
|
||||
container=container,
|
||||
lines=lines,
|
||||
truncated=truncated,
|
||||
content=content,
|
||||
stderr=stderr.strip() if stderr else "",
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
@@ -0,0 +1,201 @@
|
||||
"""Service health checks with per-service validators."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ..config import HOSTS
|
||||
from ..schemas import HealthResult
|
||||
from .ssh import run_command, SSHError
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
async def _validate_document_server() -> HealthResult:
|
||||
"""Document Server: /health endpoint must return ok + database connected."""
|
||||
cfg = HOSTS["gpu"]
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8000/health")
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
data = json.loads(stdout)
|
||||
db_ok = data.get("database") == "connected"
|
||||
status_ok = data.get("status") == "ok"
|
||||
|
||||
warnings = []
|
||||
if not db_ok:
|
||||
warnings.append("database disconnected")
|
||||
|
||||
return HealthResult(
|
||||
ok=status_ok and db_ok,
|
||||
checked_at=_now(),
|
||||
service="document-server",
|
||||
status="healthy" if (status_ok and db_ok) else "degraded",
|
||||
details={
|
||||
"status": data.get("status"),
|
||||
"database": data.get("database"),
|
||||
"version": data.get("version"),
|
||||
"latency_ms": latency_ms,
|
||||
},
|
||||
warnings=warnings,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
except SSHError as e:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service="document-server",
|
||||
status="down", error_type=e.error_type, error=str(e),
|
||||
)
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service="document-server",
|
||||
status="down", error_type="parse_error", error=f"응답 파싱 실패: {e}",
|
||||
raw=stdout.strip() if 'stdout' in dir() else None,
|
||||
)
|
||||
|
||||
|
||||
async def _validate_mlx() -> HealthResult:
|
||||
"""MLX Server: /v1/models must return at least 1 model within 5s."""
|
||||
cfg = HOSTS["macmini"]
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
data = json.loads(stdout)
|
||||
models = data.get("data", [])
|
||||
model_ids = [m.get("id", "unknown") for m in models]
|
||||
|
||||
warnings = []
|
||||
if latency_ms > 5000:
|
||||
warnings.append(f"응답 지연 {latency_ms}ms (임계값 5000ms)")
|
||||
|
||||
return HealthResult(
|
||||
ok=len(models) > 0 and latency_ms <= 5000,
|
||||
checked_at=_now(),
|
||||
service="mlx",
|
||||
status="healthy" if (len(models) > 0 and latency_ms <= 5000) else "degraded",
|
||||
details={
|
||||
"model_count": len(models),
|
||||
"models": model_ids,
|
||||
"latency_ms": latency_ms,
|
||||
},
|
||||
warnings=warnings,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
except SSHError as e:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service="mlx",
|
||||
status="down", error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
|
||||
async def _validate_mlx_proxy() -> HealthResult:
|
||||
"""MLX Proxy (:8801): must return models via proxy."""
|
||||
cfg = HOSTS["macmini"]
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8801/v1/models")
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
data = json.loads(stdout)
|
||||
models = data.get("data", [])
|
||||
|
||||
return HealthResult(
|
||||
ok=len(models) > 0,
|
||||
checked_at=_now(),
|
||||
service="mlx-proxy",
|
||||
status="healthy" if models else "down",
|
||||
details={"model_count": len(models), "latency_ms": latency_ms},
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
except SSHError as e:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service="mlx-proxy",
|
||||
status="down", error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
|
||||
async def _validate_nanoclaude() -> HealthResult:
|
||||
"""NanoClaude: /health on port 8100."""
|
||||
cfg = HOSTS["gpu"]
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8100/health")
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
return HealthResult(
|
||||
ok=True,
|
||||
checked_at=_now(),
|
||||
service="nanoclaude",
|
||||
status="healthy",
|
||||
details={"latency_ms": latency_ms, "response": stdout.strip()[:200]},
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
except SSHError as e:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service="nanoclaude",
|
||||
status="down", error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
|
||||
async def _validate_ollama(host: str) -> HealthResult:
|
||||
"""Ollama: `ollama list` must succeed and return non-empty."""
|
||||
service_name = f"ollama-{host}" if host != "gpu" else "ollama-gpu"
|
||||
cfg = HOSTS[host]
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
stdout, _ = await run_command(cfg, "ollama list")
|
||||
latency_ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
lines = [l for l in stdout.strip().splitlines()[1:] if l.strip()] # skip header
|
||||
model_count = len(lines)
|
||||
|
||||
warnings = []
|
||||
if model_count == 0:
|
||||
warnings.append("모델 없음")
|
||||
|
||||
return HealthResult(
|
||||
ok=model_count > 0,
|
||||
checked_at=_now(),
|
||||
service=service_name,
|
||||
status="healthy" if model_count > 0 else "degraded",
|
||||
details={"model_count": model_count, "latency_ms": latency_ms},
|
||||
warnings=warnings,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
except SSHError as e:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service=service_name,
|
||||
status="down", error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
|
||||
# Validator registry
|
||||
VALIDATORS: dict[str, object] = {
|
||||
"document-server": _validate_document_server,
|
||||
"mlx": _validate_mlx,
|
||||
"mlx-proxy": _validate_mlx_proxy,
|
||||
"nanoclaude": _validate_nanoclaude,
|
||||
"ollama-gpu": lambda: _validate_ollama("gpu"),
|
||||
"ollama-macmini": lambda: _validate_ollama("macmini"),
|
||||
}
|
||||
|
||||
VALID_SERVICES = list(VALIDATORS.keys())
|
||||
|
||||
|
||||
async def service_health(service: str) -> HealthResult:
|
||||
"""Run health check for a specific service."""
|
||||
validator = VALIDATORS.get(service)
|
||||
if not validator:
|
||||
return HealthResult(
|
||||
ok=False, checked_at=_now(), service=service,
|
||||
status="unknown",
|
||||
error_type="parse_error",
|
||||
error=f"알 수 없는 서비스: '{service}'. 허용: {', '.join(VALID_SERVICES)}",
|
||||
)
|
||||
return await validator()
|
||||
@@ -0,0 +1,97 @@
|
||||
"""Model inventory tools — Ollama and MLX model listing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ..config import validate_host, HOSTS
|
||||
from ..schemas import ModelsResult, ModelInfo
|
||||
from .ssh import run_command, SSHError
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _parse_ollama_list(output: str) -> list[ModelInfo]:
|
||||
"""Parse `ollama list` output."""
|
||||
models = []
|
||||
for line in output.strip().splitlines()[1:]: # skip header
|
||||
parts = line.split()
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
model_id = parts[0]
|
||||
# Remaining fields vary: ID, SIZE, MODIFIED
|
||||
size = parts[2] + " " + parts[3] if len(parts) > 3 else ""
|
||||
modified = " ".join(parts[4:]) if len(parts) > 4 else ""
|
||||
models.append(ModelInfo(id=model_id, size=size, modified=modified))
|
||||
return models
|
||||
|
||||
|
||||
async def ollama_models(host: str) -> ModelsResult:
|
||||
"""List Ollama models on a host."""
|
||||
try:
|
||||
cfg = validate_host("ollama_models", host)
|
||||
except ValueError as e:
|
||||
return ModelsResult(
|
||||
ok=False, checked_at=_now(), host=host, source="ollama",
|
||||
error_type="parse_error", error=str(e),
|
||||
)
|
||||
|
||||
try:
|
||||
stdout, _ = await run_command(cfg, "ollama list")
|
||||
except SSHError as e:
|
||||
return ModelsResult(
|
||||
ok=False, checked_at=_now(), host=host, source="ollama",
|
||||
error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
models = _parse_ollama_list(stdout)
|
||||
return ModelsResult(
|
||||
ok=True,
|
||||
checked_at=_now(),
|
||||
host=host,
|
||||
source="ollama",
|
||||
models=models,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
|
||||
|
||||
async def mlx_models() -> ModelsResult:
|
||||
"""List MLX models loaded on Mac mini."""
|
||||
cfg = HOSTS["macmini"]
|
||||
try:
|
||||
stdout, _ = await run_command(cfg, "curl -sf http://localhost:8800/v1/models")
|
||||
except SSHError as e:
|
||||
return ModelsResult(
|
||||
ok=False, checked_at=_now(), host="macmini", source="mlx",
|
||||
error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
try:
|
||||
data = json.loads(stdout)
|
||||
model_list = data.get("data", [])
|
||||
models = [
|
||||
ModelInfo(
|
||||
id=m.get("id", "unknown"),
|
||||
size=str(m.get("size", "")),
|
||||
modified=str(m.get("created", "")),
|
||||
)
|
||||
for m in model_list
|
||||
]
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
return ModelsResult(
|
||||
ok=False, checked_at=_now(), host="macmini", source="mlx",
|
||||
error_type="parse_error", error=f"JSON 파싱 실패: {e}",
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
|
||||
return ModelsResult(
|
||||
ok=True,
|
||||
checked_at=_now(),
|
||||
host="macmini",
|
||||
source="mlx",
|
||||
models=models,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Network tools — Tailscale status."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ..schemas import TailscaleResult, TailscalePeer
|
||||
from .ssh import run_local, SSHError
|
||||
|
||||
TAILSCALE_BIN = "/Applications/Tailscale.app/Contents/MacOS/Tailscale"
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _parse_tailscale(output: str) -> list[TailscalePeer]:
|
||||
"""Parse `tailscale status` output into peer list.
|
||||
|
||||
Format: IP HOSTNAME USER@ OS STATUS_INFO
|
||||
Status examples: "-" (connected/active), "idle, tx ... rx ...", "offline, last seen ..."
|
||||
"""
|
||||
peers = []
|
||||
for line in output.strip().splitlines():
|
||||
parts = line.split()
|
||||
if len(parts) < 4:
|
||||
continue
|
||||
# Skip header-like lines
|
||||
if parts[0].startswith("#") or parts[0] == "IP":
|
||||
continue
|
||||
|
||||
ip = parts[0]
|
||||
hostname = parts[1]
|
||||
# parts[2] = user@ (skip)
|
||||
os_name = parts[3] if len(parts) > 3 else ""
|
||||
|
||||
# Remaining is status info
|
||||
status_text = " ".join(parts[4:]) if len(parts) > 4 else ""
|
||||
|
||||
if "offline" in status_text:
|
||||
status = "offline"
|
||||
elif "idle" in status_text:
|
||||
status = "idle"
|
||||
elif status_text == "-" or status_text == "":
|
||||
status = "active"
|
||||
else:
|
||||
status = "active"
|
||||
|
||||
peers.append(TailscalePeer(
|
||||
hostname=hostname,
|
||||
ip=ip,
|
||||
status=status,
|
||||
os=os_name,
|
||||
))
|
||||
return peers
|
||||
|
||||
|
||||
async def tailscale_status() -> TailscaleResult:
|
||||
"""Get Tailscale network status (runs locally)."""
|
||||
try:
|
||||
stdout, _ = await run_local(f"{TAILSCALE_BIN} status")
|
||||
except SSHError as e:
|
||||
return TailscaleResult(
|
||||
ok=False, checked_at=_now(),
|
||||
error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
peers = _parse_tailscale(stdout)
|
||||
|
||||
warnings = []
|
||||
expected_hosts = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}
|
||||
found_hosts = {p.hostname for p in peers}
|
||||
missing = expected_hosts - found_hosts
|
||||
for h in missing:
|
||||
warnings.append(f"{h} not found in tailnet")
|
||||
|
||||
return TailscaleResult(
|
||||
ok=True,
|
||||
checked_at=_now(),
|
||||
peers=peers,
|
||||
warnings=warnings,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
@@ -0,0 +1,123 @@
|
||||
"""SSH connection layer — asyncssh based.
|
||||
|
||||
Provides run_command() which handles:
|
||||
- Key-based auth (GPU, Mac mini)
|
||||
- Password auth + sudo (company NAS)
|
||||
- Timeout / retry
|
||||
- Structured error classification
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import asyncssh
|
||||
|
||||
from ..config import HostConfig, SSH_TIMEOUT, CMD_TIMEOUT, MAX_RETRIES
|
||||
|
||||
|
||||
class SSHError(Exception):
|
||||
"""Typed SSH error with error_type classification."""
|
||||
|
||||
def __init__(self, error_type: str, message: str):
|
||||
self.error_type = error_type
|
||||
super().__init__(message)
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
async def _connect(host: HostConfig) -> asyncssh.SSHClientConnection:
|
||||
"""Open SSH connection with appropriate auth method."""
|
||||
kwargs: dict = {
|
||||
"host": host.ip,
|
||||
"username": host.user,
|
||||
"connect_timeout": SSH_TIMEOUT,
|
||||
"known_hosts": None, # accept any host key (Tailscale internal network)
|
||||
}
|
||||
if host.auth == "password" and host.password:
|
||||
kwargs["password"] = host.password
|
||||
kwargs["client_keys"] = [] # don't try key auth
|
||||
# key auth is the default (uses ~/.ssh/)
|
||||
|
||||
return await asyncssh.connect(**kwargs)
|
||||
|
||||
|
||||
async def run_command(
|
||||
host: HostConfig,
|
||||
command: str,
|
||||
timeout: int = CMD_TIMEOUT,
|
||||
use_sudo: bool = False,
|
||||
) -> tuple[str, str]:
|
||||
"""Run a command on remote host. Returns (stdout, stderr).
|
||||
|
||||
For NAS with sudo: wraps command with sudo using password via stdin.
|
||||
Raises SSHError with typed error_type on failure.
|
||||
"""
|
||||
if use_sudo and host.needs_sudo and host.password:
|
||||
# Pipe password to sudo via stdin
|
||||
command = f"echo '{host.password}' | sudo -S {command}"
|
||||
|
||||
last_error: Exception | None = None
|
||||
for attempt in range(1 + MAX_RETRIES):
|
||||
try:
|
||||
conn = await _connect(host)
|
||||
async with conn:
|
||||
result = await asyncio.wait_for(
|
||||
conn.run(command, check=False),
|
||||
timeout=timeout,
|
||||
)
|
||||
stdout = result.stdout or ""
|
||||
stderr = result.stderr or ""
|
||||
|
||||
if result.exit_status != 0:
|
||||
# Command ran but returned non-zero
|
||||
# Filter out sudo password prompt from stderr
|
||||
stderr_clean = "\n".join(
|
||||
line for line in stderr.splitlines()
|
||||
if "[sudo]" not in line and "Password:" not in line
|
||||
)
|
||||
raise SSHError(
|
||||
"command_failed",
|
||||
f"exit {result.exit_status}: {stderr_clean.strip() or stdout.strip()}"
|
||||
)
|
||||
return stdout, stderr
|
||||
|
||||
except SSHError:
|
||||
raise
|
||||
except asyncio.TimeoutError:
|
||||
raise SSHError("timeout", f"명령 실행 시간 초과 ({timeout}초)")
|
||||
except asyncssh.PermissionDenied:
|
||||
raise SSHError("auth", f"SSH 인증 실패: {host.user}@{host.ip}")
|
||||
except (OSError, asyncssh.Error) as e:
|
||||
last_error = e
|
||||
if attempt < MAX_RETRIES:
|
||||
await asyncio.sleep(1)
|
||||
continue
|
||||
raise SSHError("timeout", f"SSH 연결 실패: {host.ip} — {e}")
|
||||
|
||||
raise SSHError("timeout", f"SSH 최대 재시도 초과: {host.ip}")
|
||||
|
||||
|
||||
async def run_local(command: str, timeout: int = CMD_TIMEOUT) -> tuple[str, str]:
|
||||
"""Run a command locally. Returns (stdout, stderr)."""
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_shell(
|
||||
command,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
stdout_bytes, stderr_bytes = await asyncio.wait_for(
|
||||
proc.communicate(), timeout=timeout
|
||||
)
|
||||
stdout = stdout_bytes.decode() if stdout_bytes else ""
|
||||
stderr = stderr_bytes.decode() if stderr_bytes else ""
|
||||
|
||||
if proc.returncode != 0:
|
||||
raise SSHError("command_failed", f"exit {proc.returncode}: {stderr.strip() or stdout.strip()}")
|
||||
|
||||
return stdout, stderr
|
||||
except asyncio.TimeoutError:
|
||||
raise SSHError("timeout", f"로컬 명령 시간 초과 ({timeout}초)")
|
||||
@@ -0,0 +1,79 @@
|
||||
"""System tools — disk usage."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ..config import validate_host
|
||||
from ..schemas import DiskResult, FileSystemInfo
|
||||
from .ssh import run_command, SSHError
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _parse_df(output: str) -> list[FileSystemInfo]:
|
||||
"""Parse `df -h` output into structured filesystem info."""
|
||||
filesystems = []
|
||||
for line in output.strip().splitlines()[1:]: # skip header
|
||||
parts = line.split()
|
||||
if len(parts) < 6:
|
||||
continue
|
||||
# df -h columns: Filesystem Size Used Avail Use% Mounted
|
||||
mount = parts[-1]
|
||||
# Skip pseudo-filesystems
|
||||
if mount.startswith(("/dev", "/sys", "/proc", "/run", "/snap")):
|
||||
continue
|
||||
if parts[0] in ("tmpfs", "devtmpfs", "overlay", "shm", "none"):
|
||||
continue
|
||||
|
||||
try:
|
||||
used_pct = int(parts[4].rstrip("%"))
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
filesystems.append(FileSystemInfo(
|
||||
mount=mount,
|
||||
total=parts[1],
|
||||
used=parts[2],
|
||||
avail=parts[3],
|
||||
used_pct=used_pct,
|
||||
))
|
||||
return filesystems
|
||||
|
||||
|
||||
async def disk_usage(host: str) -> DiskResult:
|
||||
"""Get disk usage for a host with structured filesystem info."""
|
||||
try:
|
||||
cfg = validate_host("disk_usage", host)
|
||||
except ValueError as e:
|
||||
return DiskResult(
|
||||
ok=False, checked_at=_now(), host=host,
|
||||
error_type="parse_error", error=str(e),
|
||||
)
|
||||
|
||||
try:
|
||||
stdout, _ = await run_command(cfg, "df -h", use_sudo=cfg.needs_sudo)
|
||||
except SSHError as e:
|
||||
return DiskResult(
|
||||
ok=False, checked_at=_now(), host=host,
|
||||
error_type=e.error_type, error=str(e),
|
||||
)
|
||||
|
||||
filesystems = _parse_df(stdout)
|
||||
|
||||
warnings = []
|
||||
WARN_THRESHOLD = 85
|
||||
for fs in filesystems:
|
||||
if fs.used_pct >= WARN_THRESHOLD:
|
||||
warnings.append(f"{fs.mount} 사용률 {fs.used_pct}% — 임계값 {WARN_THRESHOLD}% 초과")
|
||||
|
||||
return DiskResult(
|
||||
ok=True,
|
||||
checked_at=_now(),
|
||||
host=host,
|
||||
filesystems=filesystems,
|
||||
warnings=warnings,
|
||||
raw=stdout.strip(),
|
||||
)
|
||||
Reference in New Issue
Block a user