async def check_health_rules() -> dict[str, str]:
    """Collect health alerts for every service listed in ``HEALTH_SERVICES``.

    Services are probed one at a time through ``service_health``. A result
    that is not ``ok`` yields a "down" alert; an ``ok`` result whose status is
    ``"degraded"`` yields a "degraded" alert. Alerts for the ``mlx`` service
    are skipped (and logged at info level instead) while the current hour in
    Asia/Seoul is 0 through 6.

    Returns:
        Mapping from the key produced by ``_health_key`` to the alert message.
    """
    # 00:00-07:00 KST is when the Document Server tier_backfill job runs
    # (policy window 0-6h plus a 1h buffer for leftover work). While the 26B
    # model is occupied by the batch, /v1/models responses locking up for
    # 5-10 seconds is normal, so only the mlx alerts are toned down.
    # Policy: ~/Documents/code/hyungi_Document_Server/app/workers/tier_backfill.py NIGHT_START/END_HOUR
    kst_hour = datetime.now(tz=ZoneInfo("Asia/Seoul")).hour
    in_backfill_window = kst_hour in range(7)

    alerts: dict[str, str] = {}
    for svc in HEALTH_SERVICES:
        probe = await service_health(svc)
        # Only mlx is muted, and only inside the backfill window.
        mute_mlx = in_backfill_window and svc == "mlx"

        if not probe.ok:
            if mute_mlx:
                log.info("[mute] mlx down — KST %d시 backfill window", kst_hour)
            else:
                # Prefer the explicit error text; fall back to the status.
                detail = probe.error or probe.status
                key = _health_key(svc, "down")
                alerts[key] = f"서비스 다운: {svc} — {detail}"
            continue

        if probe.status == "degraded":
            if mute_mlx:
                log.info("[mute] mlx degraded — KST %d시 backfill window", kst_hour)
            else:
                key = _health_key(svc, "degraded")
                alerts[key] = f"서비스 저하: {svc}"

    return alerts