# gpu-services/infra/agent.py

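"""Infra Monitoring Agent — rule-first, Gemma second.

Runs every 5 minutes and, on each run:
1. Collects state through the core/ functions
2. Makes a first-pass, rule-based verdict (thresholds/patterns)
3. When an anomaly is detected → Gemma 4 generates a natural-language explanation
4. Sends a Synology Chat alert (rule summary + Gemma explanation)
5. Logs → stderr, the logging.basicConfig default stream (captured by launchd)
"""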

from __future__ import annotations

import asyncio
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import httpx
from dotenv import load_dotenv

# gpu-services 루트에서 실행되므로 infra.core를 import 가능
from infra.core.docker import docker_status
from infra.core.health import service_health
from infra.core.system import disk_usage
from infra.core.network import tailscale_status

# Load environment variables from the .env file that sits next to this module.
# This has to happen before the config section below reads os.getenv().
load_dotenv(Path(__file__).parent / ".env")

# Root logger at INFO with a timestamped "[LEVEL] message" format. No stream or
# filename is passed, so records go to logging's default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# asyncssh is extremely verbose at INFO level
# NOTE(review): asyncssh is not imported in this file — presumably it is used by
# the infra.core helpers; confirm.
logging.getLogger("asyncssh").setLevel(logging.WARNING)
# Logger used by every function in this module.
log = logging.getLogger("infra-agent")

# --- Config ---

# Synology Chat incoming-webhook URL, read from the SYNOLOGY_INCOMING_URL
# environment variable. Empty string when unset, in which case send_alert()
# only logs the alert instead of posting it.
SYNOLOGY_WEBHOOK_URL = os.getenv("SYNOLOGY_INCOMING_URL", "")
# A filesystem whose used_pct is >= this value triggers a disk alert.
DISK_WARN_PCT = 85
# Tailscale hostnames that check_network_rules() expects to find among the
# peers whose status is not "offline".
EXPECTED_TAILSCALE_HOSTS = {"sub-server", "hyungi-macmini", "hyungi-macbookpro"}

# Docker containers known to be intentionally stopped
# (check_docker_rules() skips these by name).
IGNORED_CONTAINERS = {
    "hyungi_document_server-ai-gateway-1",
    "hyungi_document_server-ollama-1",
}

# Services to health-check (each name is passed to service_health()).
HEALTH_SERVICES = ["document-server", "mlx", "ollama-gpu"]

# Gemma 4 — Mac mini MLX proxy (localhost on Mac mini)
# Chat-completions endpoint and model name used by generate_explanation().
GEMMA_URL = "http://localhost:8801/v1/chat/completions"
GEMMA_MODEL = "mlx-community/gemma-4-26b-a4b-it-8bit"
GEMMA_TIMEOUT = 30  # seconds (passed as the httpx client timeout)


# --- Gemma ---

async def generate_explanation(alerts: list[str]) -> str:
    """Ask Gemma to explain the alerts and suggest remediation.

    This is a best-effort step: any failure is logged and reported as an
    empty string so the caller can still send the rule-based alert alone.

    Args:
        alerts: Alert lines produced by the rule checks.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``""`` when
        ``alerts`` is empty, the endpoint answers with a non-200 status, the
        reply has no content, or the request/parsing raises.
    """
    if not alerts:
        # Nothing to explain — skip the model round-trip entirely.
        return ""

    prompt = (
        "당신은 서버 인프라 모니터링 AI입니다. "
        "아래 이상 항목들을 분석해서 간결하게 설명하고 권장 조치를 알려주세요.\n\n"
        "이상 항목:\n" + "\n".join(f"- {a}" for a in alerts) + "\n\n"
        "형식: 1~3문장으로 원인 분석 + 권장 조치. 한국어로."
    )
    try:
        async with httpx.AsyncClient(timeout=GEMMA_TIMEOUT) as client:
            resp = await client.post(
                GEMMA_URL,
                json={
                    "model": GEMMA_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200,
                    "temperature": 0.3,
                },
            )
            if resp.status_code != 200:
                # Previously dropped silently; surface it so a missing
                # explanation can be traced in the logs.
                log.warning(
                    "Gemma 응답 비정상 (HTTP %d) — rule 결과만 전송",
                    resp.status_code,
                )
                return ""
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            # The content field may be null; treat that as "no explanation".
            return (content or "").strip()
    except Exception as exc:
        # Deliberately broad: the explanation is optional and must never stop
        # the alert itself. Logged at WARNING because the root logger is
        # configured at INFO, where a DEBUG record would never be shown.
        log.warning("Gemma 설명 생성 실패 — rule 결과만 전송: %r", exc)
    return ""


# --- Alert ---

async def send_alert(message: str) -> bool:
    """Send an alert to Synology Chat through the incoming webhook.

    Args:
        message: Alert text. It is prefixed with ``[infra-agent]`` before
            being sent.

    Returns:
        True when the webhook accepted the message. False when the webhook
        URL is not configured, the HTTP status is not 200, the response body
        reports ``"success": false``, or the request raised. Never raises.
    """
    if not SYNOLOGY_WEBHOOK_URL:
        log.warning("SYNOLOGY_INCOMING_URL not set — alert skipped")
        # Keep the alert text in the log so it is not lost entirely.
        log.info("Alert content: %s", message)
        return False

    # The JSON document is sent as the form field "payload".
    payload = json.dumps({"text": f"[infra-agent] {message}"}, ensure_ascii=False)
    try:
        # NOTE(security): verify=False disables TLS certificate verification.
        # Presumably the Synology endpoint uses a self-signed certificate —
        # confirm, and prefer pinning the CA bundle if possible.
        async with httpx.AsyncClient(verify=False, timeout=10.0) as client:
            resp = await client.post(
                SYNOLOGY_WEBHOOK_URL,
                data={"payload": payload},
            )
            if resp.status_code != 200:
                log.error("Alert failed: %d %s", resp.status_code, resp.text)
                return False

            # The webhook can answer HTTP 200 while rejecting the post, with
            # the outcome carried in the JSON body ({"success": false, ...}).
            # A body that is not JSON keeps the old "200 means sent" behavior.
            try:
                body = resp.json()
            except ValueError:
                body = None
            if isinstance(body, dict) and body.get("success") is False:
                log.error("Alert rejected: %d %s", resp.status_code, resp.text)
                return False

            log.info("Alert sent: %s", message[:100])
            return True
    except Exception:
        # Boundary handler: a failed notification must not crash the agent.
        log.exception("Failed to send alert")
        return False


# --- Rules ---

async def check_docker_rules() -> list[str]:
    """Check Docker containers on the GPU server.

    Returns:
        One alert line per problem: a single "check failed" line when the
        status query itself failed, otherwise one line for every non-ignored
        container that is restarting or not running.
    """
    alerts: list[str] = []
    result = await docker_status("gpu")

    # NOTE(review): a not-ok result WITHOUT error_type intentionally falls
    # through to the container loop, as in the original code — presumably
    # "not ok" can also mean "some container is unhealthy"; confirm against
    # docker_status().
    if not result.ok and result.error_type:
        alerts.append(f"GPU Docker 확인 실패: {result.error}")
        return alerts

    for c in result.containers:
        if c.name in IGNORED_CONTAINERS:
            continue
        # "restarting" must be tested first: it is also != "running", so the
        # original ordering made the restart-specific message unreachable.
        if c.status == "restarting":
            alerts.append(f"GPU 컨테이너 재시작 중: {c.name}")
        elif c.status != "running":
            alerts.append(f"GPU 컨테이너 다운: {c.name} ({c.status})")

    return alerts


async def check_disk_rules() -> list[str]:
    """Collect disk-usage alerts for the "gpu" and "macmini" hosts.

    Hosts are queried one after another. A host whose query fails yields a
    single failure line; otherwise every filesystem at or above
    DISK_WARN_PCT yields a warning line.
    """
    findings = []
    for host in ("gpu", "macmini"):
        report = await disk_usage(host)
        if report.ok:
            # One warning per filesystem that crossed the threshold.
            findings.extend(
                f"{host} 디스크 경고: {fs.mount} {fs.used_pct}% "
                f"(사용 {fs.used}/{fs.total})"
                for fs in report.filesystems
                if fs.used_pct >= DISK_WARN_PCT
            )
        else:
            findings.append(f"{host} 디스크 확인 실패: {report.error}")
    return findings


async def check_health_rules() -> list[str]:
    """Collect health alerts for every service listed in HEALTH_SERVICES.

    A service whose result is not ok is reported as down (with its error,
    or its status when no error is set); an ok service whose status is
    "degraded" is reported as degraded.
    """
    findings = []
    for name in HEALTH_SERVICES:
        probe = await service_health(name)
        if probe.ok:
            if probe.status == "degraded":
                findings.append(f"서비스 저하: {name} — {probe.details}")
            continue
        # Prefer the error text; fall back to the raw status when it is empty.
        reason = probe.error or probe.status
        findings.append(f"서비스 다운: {name} — {reason}")
    return findings


async def check_network_rules() -> list[str]:
    """Check Tailscale connectivity for the expected hosts.

    Returns:
        A single failure line when the Tailscale status query failed,
        otherwise one line per host in EXPECTED_TAILSCALE_HOSTS that is not
        among the peers whose status differs from "offline". Hosts are
        reported in sorted order.
    """
    alerts: list[str] = []
    result = await tailscale_status()
    if not result.ok:
        alerts.append(f"Tailscale 확인 실패: {result.error}")
        return alerts

    online_hosts = {p.hostname for p in result.peers if p.status != "offline"}
    # Iterating the set directly gives an order that can change between runs
    # (string hashing is randomized per process); sorting keeps the alert
    # text stable from one run to the next.
    missing = sorted(EXPECTED_TAILSCALE_HOSTS - online_hosts)
    alerts.extend(f"Tailscale 오프라인: {host}" for host in missing)

    return alerts


# --- Main ---

async def run_checks() -> None:
    """Run all rule checks once and send a single alert if anything is wrong.

    The four checks run concurrently. A check that raises does not abort the
    others: it is logged with its traceback and turned into an alert line of
    its own. When at least one alert exists, Gemma is asked for an
    explanation (optional) and the combined message is sent to Synology Chat.
    """
    now = datetime.now(timezone.utc).isoformat()
    log.info("=== 상태 수집 시작 (%s) ===", now)

    # Name each check so a crash can be attributed in the alert text.
    checks = {
        "docker": check_docker_rules,
        "disk": check_disk_rules,
        "health": check_health_rules,
        "network": check_network_rules,
    }

    # Run all checks concurrently
    results = await asyncio.gather(
        *(check() for check in checks.values()),
        return_exceptions=True,
    )

    all_alerts: list[str] = []
    for name, result in zip(checks, results):
        # BaseException, not Exception: gather() can also hand back a
        # CancelledError, which would otherwise reach extend() and raise
        # TypeError.
        if isinstance(result, BaseException):
            log.error("체크 실패 (%s)", name, exc_info=result)
            # repr() keeps the exception type, so the line is still
            # informative when the exception carries no message.
            all_alerts.append(f"체크 실패 ({name}): {result!r}")
        else:
            all_alerts.extend(result)

    if not all_alerts:
        log.info("전체 정상")
        return

    log.warning("이상 감지 %d건:", len(all_alerts))
    for a in all_alerts:
        log.warning("  - %s", a)

    # Gemma 2nd: generate explanation (empty string when unavailable)
    explanation = await generate_explanation(all_alerts)
    if explanation:
        log.info("Gemma 설명: %s", explanation[:100])

    # Build alert message: header, one bullet per alert, optional analysis
    lines = [f"이상 감지 {len(all_alerts)}건:"]
    lines.extend(f"- {a}" for a in all_alerts)
    if explanation:
        # The leading newline leaves a blank line before the analysis.
        lines.append(f"\n분석: {explanation}")
    message = "\n".join(lines)

    await send_alert(message)


def main():
    """Synchronous entry point: run one full check cycle on a new event loop."""
    cycle = run_checks()
    asyncio.run(cycle)


# Run a check cycle only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()