feat: Hybrid 부하 판단 — health latency baseline + 조건부 inference
- model_adapter: measure_inference_latency() (max_tokens=1, 최소 부하) - backend_registry: - health latency baseline 학습 (초기 5회 max, 이후 EMA) - get_load_status(): inference 우선, health/queue 보조 - cache 30s + cooldown 10s + asyncio.Lock으로 자기증폭 루프 방지 - 조건: health > baseline*3 또는 사용자 명시 요청 시에만 ping - worker: - "system_status" 액션 — 사용자 상태 조회 시 force_measure - _build_system_status() 응답 빌더 (health/baseline/ping/queue) - route busy 안내를 get_load_status 기반으로 변경 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -68,6 +68,11 @@ class BackendRegistry:
|
||||
self.reasoner: ModelAdapter | None = None # Gemma4: 추론
|
||||
self._health: dict[str, bool] = {"classifier": False, "reasoner": False}
|
||||
self._latency: dict[str, float] = {"classifier": 0.0, "reasoner": 0.0}
|
||||
self._health_baseline: dict[str, float] = {}
|
||||
self._sample_count: dict[str, int] = {}
|
||||
self._inference_latency: dict[str, float | None] = {}
|
||||
self._inference_latency_at: dict[str, float] = {}
|
||||
self._measure_lock = asyncio.Lock()
|
||||
self._health_task: asyncio.Task | None = None
|
||||
|
||||
def init_from_settings(self, settings) -> None:
|
||||
@@ -111,10 +116,95 @@ class BackendRegistry:
|
||||
prev = self._health[role]
|
||||
self._health[role] = healthy
|
||||
self._latency[role] = elapsed
|
||||
if healthy:
|
||||
self._update_baseline(role, elapsed)
|
||||
if prev != healthy:
|
||||
status = "UP" if healthy else "DOWN"
|
||||
logger.warning("%s (%s) → %s (%.0fms)", adapter.name, role, status, elapsed)
|
||||
|
||||
def _update_baseline(self, role: str, latency: float) -> None:
|
||||
"""baseline EMA 업데이트. 초기 5회는 max로 spike 보호."""
|
||||
sample_count = self._sample_count.get(role, 0) + 1
|
||||
self._sample_count[role] = sample_count
|
||||
current = self._health_baseline.get(role, latency)
|
||||
|
||||
if sample_count < 5:
|
||||
new_baseline = max(current, latency)
|
||||
else:
|
||||
new_baseline = current * 0.9 + latency * 0.1
|
||||
|
||||
# 절대 최솟값 (10ms 이하면 의미 없음)
|
||||
self._health_baseline[role] = max(new_baseline, 10)
|
||||
|
||||
async def _measure_inference(self, role: str) -> float | None:
|
||||
"""inference latency 측정 (lock으로 동시 1개 제한)."""
|
||||
adapter = self.classifier if role == "classifier" else self.reasoner
|
||||
if not adapter:
|
||||
return None
|
||||
async with self._measure_lock:
|
||||
latency = await adapter.measure_inference_latency()
|
||||
if latency >= 0:
|
||||
self._inference_latency[role] = latency
|
||||
self._inference_latency_at[role] = time.time()
|
||||
return latency
|
||||
return None
|
||||
|
||||
async def get_load_status(self, role: str, force_measure: bool = False) -> dict:
    """Judge the current load level of *role*.

    Inference latency is served from a 30 s cache, or measured on demand
    — only when health latency exceeds 3x its baseline or the caller
    explicitly asks — and never within the 10 s cooldown, so the probe
    itself cannot amplify load. Inference latency takes priority in the
    verdict; health latency is the fallback, and the local job queue
    nudges the result up one level (best effort).

    Returns:
        dict with keys "load", "health_ms", "health_baseline_ms",
        "inference_ms", "measured".
    """
    health_ms = self._latency.get(role, 0)
    baseline_ms = self._health_baseline.get(role, 50)
    cached_value = self._inference_latency.get(role)
    measured_at = self._inference_latency_at.get(role, 0)

    elapsed = time.time() - measured_at
    cache_ok = elapsed < 30
    in_cooldown = elapsed < 10

    # Conditional probe: explicit request, or stale cache + health clearly
    # above baseline — but never inside the cooldown window.
    wants_probe = force_measure or (not cache_ok and health_ms > baseline_ms * 3)
    if wants_probe and not in_cooldown:
        inference_ms = await self._measure_inference(role)
    else:
        inference_ms = cached_value if cache_ok else None

    # Verdict: inference latency first, health latency as fallback.
    if inference_ms is not None and inference_ms > 8000:
        load = "매우 바쁨"
    elif inference_ms is not None and inference_ms > 4000:
        load = "바쁨"
    elif health_ms > baseline_ms * 5:
        load = "바쁨"
    elif health_ms > baseline_ms * 2:
        load = "보통"
    else:
        load = "여유"

    # Local queue activity bumps the verdict one level (best effort only).
    try:
        from services import job_queue as jq_module
        if jq_module.job_queue:
            if jq_module.job_queue.stats.get("active", 0) > 0:
                load = {"보통": "바쁨", "여유": "보통"}.get(load, load)
    except Exception:
        pass  # queue module unavailable — skip the adjustment

    return {
        "load": load,
        "health_ms": health_ms,
        "health_baseline_ms": round(baseline_ms, 1),
        "inference_ms": inference_ms,
        "measured": inference_ms is not None,
    }
|
||||
|
||||
def is_healthy(self, role: str) -> bool:
    """Return the last recorded health-check result for *role*.

    Unknown roles report False (never checked = not healthy).
    """
    return self._health.get(role, False)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user