Files
gpu-services/nanoclaude/tools/infra_tool.py
T
Hyungi Ahn 03e3df058f feat(infra): docker_restart 쓰기 도구 추가
보호 컨테이너(home-caddy, home-fail2ban, nanoclaude) 재시작 차단.
MCP 11개 도구 + NanoClaude wrapper.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 15:06:40 +09:00

144 lines
5.3 KiB
Python

"""Infra tool — NanoClaude wrapper over infra.core/ functions.
Converts infra.core results to NanoClaude tool return format:
{"ok": bool, "tool": "infra", "operation": str, "data": list, "summary": str, "error": str}
"""
from __future__ import annotations
import asyncio
import logging
from infra.core.docker import docker_status, docker_restart as _docker_restart
from infra.core.health import service_health, VALID_SERVICES
from infra.core.system import disk_usage
from infra.core.network import tailscale_status
from infra.core.models import ollama_models, mlx_models
from infra.core.docserver import scheduler_status as _scheduler_status, queue_status as _queue_status
from infra.core.verify import run_verify as _run_verify, VERIFY_COMMANDS
logger = logging.getLogger(__name__)
async def status(host: str = "gpu") -> dict:
    """Return a Docker container status overview for ``host``.

    Awaits ``docker_status(host)`` and reshapes the result into the
    NanoClaude tool envelope.

    Args:
        host: Host identifier forwarded unchanged to ``docker_status``.

    Returns:
        A tool-result dict. ``ok`` is False only when the core result has a
        truthy ``error_type``; otherwise ``data`` holds one
        ``{"name", "status", "uptime"}`` dict per reported container.
    """
    outcome = await docker_status(host)

    # Only an SSH/connection failure is reported as an error. Containers that
    # are merely exited are still passed through as data.
    if not outcome.error_type:
        containers = []
        for entry in outcome.containers:
            containers.append(
                {"name": entry.name, "status": entry.status, "uptime": entry.uptime}
            )
        return {"ok": True, "tool": "infra", "operation": "status",
                "data": containers, "summary": outcome.summary, "error": ""}

    return {"ok": False, "tool": "infra", "operation": "status",
            "data": [], "summary": "", "error": outcome.error or "확인 실패"}
async def health(service: str = "") -> dict:
    """Check service health and report it in NanoClaude tool format.

    Args:
        service: Name of a single service to check. When empty, the three
            critical services ("document-server", "mlx", "ollama-gpu") are
            checked in that order.

    Returns:
        A tool-result dict. ``data`` holds one
        ``{"service", "status", "ok", "details"}`` dict per checked service,
        ``summary`` lists every service with its state, and ``ok`` is True
        only when every service reported ok. When ``ok`` is False, ``error``
        names the unhealthy services, so a failed result always carries a
        non-empty error like the other operations in this module.
    """
    targets = [service] if service else ["document-server", "mlx", "ollama-gpu"]

    results = []
    for name in targets:
        # Checks run one at a time, in the order listed above.
        r = await service_health(name)
        results.append({
            "service": r.service, "status": r.status, "ok": r.ok,
            "details": r.details,
        })

    summary_parts = []
    failing_parts = []
    for entry in results:
        if entry["ok"]:
            summary_parts.append(f"{entry['service']}: 정상")
        else:
            label = f"{entry['service']}: 이상"
            summary_parts.append(label)
            failing_parts.append(label)

    return {"ok": not failing_parts, "tool": "infra", "operation": "health",
            "data": results, "summary": ", ".join(summary_parts),
            # Previously always "", which left failed results unexplained.
            "error": ", ".join(failing_parts)}
async def disk(host: str = "") -> dict:
    """Report disk usage in NanoClaude tool format.

    Args:
        host: Host to inspect. When empty, both "gpu" and "macmini" are
            inspected, in that order.

    Returns:
        A tool-result dict. ``data`` holds one row per filesystem of every
        host whose lookup succeeded, ``summary`` shows at most the first five
        rows, and ``error`` joins all collected warnings with "; ". ``ok`` is
        True only when no warning was collected, whether from a failed host
        lookup or from the core result's own ``warnings``.
    """
    targets = ["gpu", "macmini"] if not host else [host]
    rows = []
    problems = []

    for target in targets:
        report = await disk_usage(target)
        if report.ok:
            rows.extend(
                {"host": target, "mount": fs.mount,
                 "used_pct": fs.used_pct, "used": fs.used, "total": fs.total}
                for fs in report.filesystems
            )
            problems.extend(report.warnings)
        else:
            # A failed host contributes a warning but does not abort the scan.
            problems.append(f"{target}: {report.error}")

    headline = ", ".join(
        f"{row['host']}:{row['mount']} {row['used_pct']}%" for row in rows[:5]
    )
    return {"ok": not problems, "tool": "infra", "operation": "disk",
            "data": rows, "summary": headline,
            "error": "; ".join(problems)}
async def network() -> dict:
    """Report Tailscale network status in NanoClaude tool format.

    Returns:
        A tool-result dict. On failure (core result not ok) ``data`` is empty
        and ``error`` carries the core error or a generic fallback. On
        success ``data`` holds one ``{"hostname", "ip", "status", "os"}``
        dict per peer and ``summary`` reports how many peers have a status
        other than "offline" out of the total.
    """
    ts = await tailscale_status()
    if not ts.ok:
        return {"ok": False, "tool": "infra", "operation": "network",
                "data": [], "summary": "", "error": ts.error or "확인 실패"}

    peers = ts.peers
    rows = []
    online = 0
    for peer in peers:
        rows.append({"hostname": peer.hostname, "ip": peer.ip,
                     "status": peer.status, "os": peer.os})
        # Any status other than "offline" counts as online.
        if peer.status != "offline":
            online += 1

    return {"ok": True, "tool": "infra", "operation": "network",
            "data": rows, "summary": f"{online}/{len(peers)} 온라인", "error": ""}
async def models(host: str = "gpu") -> dict:
    """Report the model inventory in NanoClaude tool format.

    Args:
        host: "mlx" or "macmini" selects the MLX inventory (``mlx_models``).
            Any other value is forwarded to ``ollama_models``.

    Returns:
        A tool-result dict. On failure ``data`` is empty and ``error`` holds
        the core error or a generic fallback. On success ``data`` holds one
        ``{"id", "size"}`` dict per model and ``summary`` states the source,
        host and model count taken from the core result.
    """
    if host in ("mlx", "macmini"):
        inventory = await mlx_models()
    else:
        inventory = await ollama_models(host)

    if inventory.ok:
        entries = [{"id": model.id, "size": model.size} for model in inventory.models]
        headline = f"{inventory.source} on {inventory.host}: {len(inventory.models)}개 모델"
        return {"ok": True, "tool": "infra", "operation": "models",
                "data": entries, "summary": headline, "error": ""}

    return {"ok": False, "tool": "infra", "operation": "models",
            "data": [], "summary": "", "error": inventory.error or "확인 실패"}
async def scheduler() -> dict:
    """Return the Document Server scheduler status.

    Delegates to ``infra.core.docserver.scheduler_status`` and passes its
    result through without reshaping it.
    """
    report = await _scheduler_status()
    return report
async def queue() -> dict:
    """Return the Document Server queue status.

    Delegates to ``infra.core.docserver.queue_status`` and passes its result
    through without reshaping it.
    """
    report = await _queue_status()
    return report
async def verify(check_name: str = "gpu-snapshot") -> dict:
    """Run a predefined verify command and return its result.

    Args:
        check_name: Name of the check, forwarded unchanged to
            ``infra.core.verify.run_verify``. Defaults to "gpu-snapshot".

    Returns:
        Whatever ``run_verify`` returns, passed through without reshaping.
    """
    outcome = await _run_verify(check_name)
    return outcome
async def restart(host: str = "gpu", container: str = "") -> dict:
    """Restart a Docker container and report the outcome in tool format.

    Args:
        host: Host identifier forwarded unchanged to the core restart helper.
        container: Name of the container to restart. Surrounding whitespace
            is trimmed. An empty or whitespace-only name is rejected without
            contacting the host.

    Returns:
        A tool-result dict. On success ``data`` is the core result's
        ``warnings`` and ``summary`` is its first entry, if any. On failure
        ``error`` carries the core error or a generic fallback message.
    """
    # Trim stray padding so a blank or padded name is never forwarded to the
    # restart helper. NOTE(review): assumes valid container names never
    # contain leading/trailing whitespace, per Docker's naming rules.
    name = (container or "").strip()
    if not name:
        return {"ok": False, "tool": "infra", "operation": "restart",
                "data": [], "summary": "", "error": "컨테이너 이름을 지정해주세요."}

    result = await _docker_restart(host, name)
    ok = result.ok
    return {"ok": ok, "tool": "infra", "operation": "restart",
            "data": result.warnings if ok else [],
            "summary": result.warnings[0] if ok and result.warnings else "",
            "error": result.error or ("재시작 실패" if not ok else "")}