feat(services): crawl-24x7 A-8 헬스 패널 + D-1 stt/marker idle-unload

A-8 1차: crawl-health 컨테이너(100.110.63.63:8765 Tailscale 바인딩 전용, 읽기 전용 SELECT, caddy 라우트 금지). D-1 전제 작업: STT_PRELOAD=0+30분 유휴 해제(lock+inflight+reaper), marker MARKER_PRELOAD=0+idle-unload, /ready idle=200(503=warmup_failed 한정 — fastapi depends_on 정합), healthcheck cuda 기준 전환.
2026-06-10 13:03:31 +09:00
parent 7cd8cfde0a
commit 3df0ca53ab
6 changed files with 437 additions and 58 deletions
@@ -64,6 +64,11 @@ services:
    environment:
      - HF_HOME=/models/huggingface
      - TORCH_HOME=/models/torch
+      # D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~3.5GB) 해제가 90% 봉투의 전제.
+      # /ready 는 idle 에서도 200 (fastapi depends_on service_healthy 유지).
+      # 롤백 = MARKER_PRELOAD=1 + MARKER_IDLE_UNLOAD_MINUTES=0.
+      - MARKER_PRELOAD=0
+      - MARKER_IDLE_UNLOAD_MINUTES=${MARKER_IDLE_UNLOAD_MINUTES:-30}
    volumes:
      - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
      - marker_models:/models
@@ -97,6 +102,11 @@ services:
      - WHISPER_MODEL=${WHISPER_MODEL:-large-v3}
      - WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
      - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
+      # D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~4GB) 해제가 90% 봉투의 전제.
+      # 콜드로드 수초~수십 초는 배치 작업이라 무방 (stt_worker read=1800s 가 흡수).
+      # 롤백 = STT_PRELOAD=1 + STT_IDLE_UNLOAD_MINUTES=0.
+      - STT_PRELOAD=0
+      - STT_IDLE_UNLOAD_MINUTES=${STT_IDLE_UNLOAD_MINUTES:-30}
    deploy:
      resources:
        reservations:
@@ -105,9 +115,9 @@ services:
              count: 1
              capabilities: [gpu]
    healthcheck:
-      # /ready: CUDA 디바이스 + 모델 적재 둘 다 확인. ready=true 만 healthy 처리.
-      # /health 는 단순 liveness 라 모델 미적재 상태도 healthy 로 잡혀 운영 신호로 부적합.
-      test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('ready') else 1)"]
+      # D-1: idle-unload 도입으로 '모델 적재' 는 더 이상 상시 상태가 아님 — cuda 가용성만
+      # healthy 기준. 모델 적재 여부는 /ready 의 models_loaded 필드로 관측(정보성).
+      test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('cuda') else 1)"]
      interval: 30s
      timeout: 10s
      retries: 3
@@ -229,6 +239,21 @@ services:
      - fastapi
    restart: unless-stopped

+  # crawl-24x7 A-8 1차: 전 소스 헬스 패널 — 내부 전용 (읽기 전용 SELECT 만).
+  # '내부 전용' 성립 구현 = 별도 바인딩뿐 (r4 결정): Tailscale 인터페이스에만 publish.
+  # 기존 SvelteKit 라우트(vhost=Host 헤더 검사=앱 가드 환원)나 프록시 경로 차단(경로 가드
+  # 회귀)으로 옮기지 말 것. caddy/home-caddy 라우트 추가 금지. fastapi/postgres 바인딩 선례.
+  crawl-health:
+    build: ./services/crawl-health
+    ports:
+      - "100.110.63.63:8765:8765"
+    environment:
+      - CRAWL_HEALTH_DSN=postgresql://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
+    depends_on:
+      postgres:
+        condition: service_healthy
+    restart: unless-stopped
+
  caddy:
    image: caddy:2
    ports:
@@ -0,0 +1,12 @@
+FROM python:3.12-slim
+
+WORKDIR /srv
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY server.py .
+
+EXPOSE 8765
+HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
+  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8765/health')"
+
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8765"]
@@ -0,0 +1,3 @@
+fastapi>=0.111.0
+uvicorn>=0.30.0
+asyncpg>=0.29.0
@@ -0,0 +1,202 @@
+"""crawl-health — 전 소스 헬스 패널 1차 (A-8, plan crawl-24x7-1)
+
+읽기 전용 내부 운영 패널. 의존 = 기존 수집 상태(news_sources/source_health/documents/
+processing_queue SELECT 만) — 쓰기 0.
+
+[1차] 소스별 last success / 수집 건수 추이(24h/7d) / 연속 실패 / circuit 상태 /
+빈 피드 streak + fulltext 승격/격하 통계 + 큐 백로그. 비-RSS 소스(C-2 sitemap 등)도
+같은 표면이 수용 (fetch_method 컬럼 표시 — '구독 소스 패널' 로 좁히지 않는 전 소스 일반화).
+[2차 범위 외] B-3 상태 계약 도착 시 세션 열 + [재로그인 시도] 버튼(enqueue 방식).
+
+노출: 별도 바인딩만 — compose 가 Tailscale 인터페이스(100.110.63.63)에만 publish.
+vhost/경로 가드 방식 금지 (r4: 둘 다 '덜 깨짐' 속성 상실). 앱 레벨 인증 없음 =
+Tailscale 도달성만이 경계 (fab-server 선례).
+"""
+
+import html
+import logging
+import os
+from contextlib import asynccontextmanager
+
+import asyncpg
+from fastapi import FastAPI
+from fastapi.responses import HTMLResponse, JSONResponse
+
+logger = logging.getLogger("crawl_health")
+
+DSN = os.environ.get("CRAWL_HEALTH_DSN", "")
+
+_pool: asyncpg.Pool | None = None
+
+
+@asynccontextmanager
+async def lifespan(_app: FastAPI):
+    global _pool
+    _pool = await asyncpg.create_pool(DSN, min_size=1, max_size=3)
+    yield
+    await _pool.close()
+
+
+app = FastAPI(lifespan=lifespan)
+
+
+async def _collect_data() -> dict:
+    async with _pool.acquire() as conn:
+        sources = await conn.fetch(
+            """
+            SELECT s.id, s.name, s.country, s.enabled, s.feed_type, s.fetch_method,
+                   s.fulltext_policy, s.last_fetched_at,
+                   h.circuit_state, h.consecutive_failures, h.last_success_at,
+                   h.last_error, h.last_error_at, h.last_fetch_items, h.empty_streak,
+                   h.total_fetches, h.total_failures
+            FROM news_sources s
+            LEFT JOIN source_health h ON h.source_id = s.id
+            ORDER BY s.enabled DESC, s.name
+            """
+        )
+        counts = await conn.fetch(
+            """
+            SELECT s.id,
+                   count(d.id) FILTER (WHERE d.extracted_at > now() - interval '24 hours') AS items_24h,
+                   count(d.id) AS items_7d
+            FROM news_sources s
+            LEFT JOIN documents d
+              ON d.source_channel = 'news'
+             AND d.extracted_at > now() - interval '7 days'
+             AND d.file_path LIKE 'news/' || s.name || '/%'
+            GROUP BY s.id
+            """
+        )
+        queue = await conn.fetch(
+            """
+            SELECT stage::text AS stage, status::text AS status, count(*) AS n,
+                   min(created_at) FILTER (WHERE status = 'pending') AS oldest_pending
+            FROM processing_queue
+            WHERE stage IN ('fulltext', 'summarize', 'embed', 'chunk')
+              AND status IN ('pending', 'processing', 'failed')
+            GROUP BY 1, 2
+            ORDER BY 1, 2
+            """
+        )
+        fulltext = await conn.fetch(
+            """
+            SELECT extract_meta -> 'fulltext' ->> 'status' AS status, count(*) AS n
+            FROM documents
+            WHERE source_channel = 'news' AND extract_meta ? 'fulltext'
+            GROUP BY 1
+            """
+        )
+    count_map = {r["id"]: r for r in counts}
+    return {
+        "sources": [
+            {**dict(r),
+             "items_24h": count_map.get(r["id"], {}).get("items_24h", 0),
+             "items_7d": count_map.get(r["id"], {}).get("items_7d", 0)}
+            for r in sources
+        ],
+        "queue": [dict(r) for r in queue],
+        "fulltext": [dict(r) for r in fulltext],
+    }
+
+
+@app.get("/health")
+async def health():
+    """Liveness — Docker healthcheck 용 (DB 미접근, 프로세스 생존만)."""
+    return {"status": "ok", "service": "crawl-health"}
+
+
+@app.get("/api/health.json")
+async def api_health():
+    data = await _collect_data()
+    # asyncpg Record 의 datetime → isoformat 직렬화
+    def _ser(v):
+        return v.isoformat() if hasattr(v, "isoformat") else v
+    return JSONResponse({
+        k: [{kk: _ser(vv) for kk, vv in row.items()} for row in v]
+        for k, v in data.items()
+    })
+
+
+def _chip(state: str | None, enabled: bool) -> str:
+    if not enabled:
+        return '<span class="chip off">OFF</span>'
+    if state == "disabled":
+        return '<span class="chip err">DISABLED</span>'
+    if state == "open":
+        return '<span class="chip warn">OPEN</span>'
+    return '<span class="chip ok">OK</span>'
+
+
+def _fmt_ts(v) -> str:
+    return v.strftime("%m-%d %H:%M") if v else "-"
+
+
+@app.get("/", response_class=HTMLResponse)
+async def index():
+    data = await _collect_data()
+    rows = []
+    for s in data["sources"]:
+        err = html.escape((s.get("last_error") or "")[:80])
+        warn_cls = ""
+        if s["enabled"] and (s.get("consecutive_failures") or 0) >= 3:
+            warn_cls = ' class="row-warn"'
+        elif s["enabled"] and (s.get("empty_streak") or 0) >= 8:
+            warn_cls = ' class="row-warn"'
+        rows.append(
+            f"<tr{warn_cls}>"
+            f"<td>{html.escape(s['name'])}</td>"
+            f"<td>{_chip(s.get('circuit_state'), s['enabled'])}</td>"
+            f"<td>{html.escape(s.get('fetch_method') or 'rss')}</td>"
+            f"<td>{html.escape(s.get('fulltext_policy') or 'none')}</td>"
+            f"<td class='num'>{s['items_24h']}</td>"
+            f"<td class='num'>{s['items_7d']}</td>"
+            f"<td class='num'>{s.get('consecutive_failures') or 0}</td>"
+            f"<td class='num'>{s.get('empty_streak') or 0}</td>"
+            f"<td>{_fmt_ts(s.get('last_success_at'))}</td>"
+            f"<td>{_fmt_ts(s.get('last_fetched_at'))}</td>"
+            f"<td class='err-text'>{err}</td>"
+            f"</tr>"
+        )
+    qrows = [
+        f"<tr><td>{html.escape(q['stage'])}</td><td>{html.escape(q['status'])}</td>"
+        f"<td class='num'>{q['n']}</td><td>{_fmt_ts(q.get('oldest_pending'))}</td></tr>"
+        for q in data["queue"]
+    ]
+    frows = [
+        f"<tr><td>{html.escape(f['status'] or '-')}</td><td class='num'>{f['n']}</td></tr>"
+        for f in data["fulltext"]
+    ]
+    body = f"""<!DOCTYPE html>
+<html lang="ko"><head><meta charset="utf-8">
+<title>crawl-health — 전 소스 헬스 패널</title>
+<style>
+  body {{ font-family: -apple-system, 'Apple SD Gothic Neo', sans-serif; background: #f5f1e8;
+         color: #3d3a33; margin: 0; padding: 28px; }}
+  h1 {{ font-size: 19px; margin: 0 0 4px; }} h2 {{ font-size: 14px; margin: 26px 0 8px; }}
+  .sub {{ color: #8a8474; font-size: 12px; margin-bottom: 18px; }}
+  table {{ border-collapse: collapse; width: 100%; background: #fffdf8; font-size: 12.5px; }}
+  th, td {{ border: 1px solid #e3ddcd; padding: 5px 9px; text-align: left; }}
+  th {{ background: #ece6d6; font-weight: 600; white-space: nowrap; }}
+  td.num {{ text-align: right; font-variant-numeric: tabular-nums; }}
+  td.err-text {{ color: #9a4a3a; font-size: 11.5px; max-width: 320px; }}
+  tr.row-warn td {{ background: #fbf0e4; }}
+  .chip {{ display: inline-block; padding: 1px 8px; border-radius: 9px; font-size: 11px; font-weight: 600; }}
+  .chip.ok {{ background: #dce8d4; color: #3c5a2e; }}
+  .chip.warn {{ background: #f3e0b8; color: #7a5a14; }}
+  .chip.err {{ background: #eecfc6; color: #8a2f1d; }}
+  .chip.off {{ background: #e3ddcd; color: #6e6859; }}
+</style></head><body>
+<h1>crawl-health — 전 소스 헬스 패널</h1>
+<div class="sub">A-8 1차 (피드 수집 헬스) · 내부 전용 (Tailscale 바인딩) · 새로고침 = 실시간 조회</div>
+<h2>소스 ({len(rows)})</h2>
+<table><tr><th>소스</th><th>circuit</th><th>fetch</th><th>fulltext</th><th>24h</th><th>7d</th>
+<th>연속실패</th><th>빈피드</th><th>last success</th><th>last fetch</th><th>last error</th></tr>
+{''.join(rows)}</table>
+<h2>처리 큐 (fulltext / summarize / embed / chunk)</h2>
+<table><tr><th>stage</th><th>status</th><th>건수</th><th>oldest pending</th></tr>
+{''.join(qrows) or '<tr><td colspan="4">백로그 없음</td></tr>'}</table>
+<h2>fulltext 승격 누적</h2>
+<table><tr><th>status</th><th>건수</th></tr>
+{''.join(frows) or '<tr><td colspan="2">기록 없음 (파일럿 전환 전)</td></tr>'}</table>
+</body></html>"""
+    return HTMLResponse(body)
@@ -1,12 +1,18 @@
 """marker-service — POST /convert: PDF → markdown + 추출 이미지 base64.

 Phase 1B   (2026-05-01) — 텍스트만 응답, 이미지 폐기.
-Phase 1B.5 (본 변경)    — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이
+Phase 1B.5             — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이
                         없는 stateless 변환기 유지 (fastapi 가 NAS persist 담당).
+D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환:
+  MARKER_PRELOAD=0           : startup warmup 끔 (첫 /convert 시 lazy load)
+  MARKER_IDLE_UNLOAD_MINUTES : N분 유휴 시 모델 해제 (0=비활성, 기존 동작)
+  /ready 는 idle(미적재)에서도 200 — fastapi 의 depends_on service_healthy 가
+  lazy 모드에서 영구 미기동으로 굳는 것 방지. 503 은 warmup_failed 한정.

 plan: ~/.claude/plans/piped-humming-crystal.md
 """
 import base64
+import gc
 import hashlib
 import io
 import logging
@@ -40,6 +46,12 @@ _warmup_done = False
 _warmup_error: str | None = None
 _warmup_lock = threading.Lock()

+# D-1 idle-unload 상태 — 전이는 전부 _warmup_lock 아래
+_PRELOAD = os.getenv("MARKER_PRELOAD", "1") != "0"
+_IDLE_UNLOAD_MINUTES = int(os.getenv("MARKER_IDLE_UNLOAD_MINUTES", "0"))
+_inflight = 0
+_last_used = time.monotonic()
+
 # 이미지 응답 cap. base64 응답 크기 폭주 방지. 사용자 PDF 풀 측정 (Phase 1D) 시
 # 가장 이미지 많은 문서가 ~30건 수준 → 200 은 안전 마진. 초과 시 truncate flag 응답.
 MAX_IMAGES_PER_DOC = int(os.getenv("MARKER_MAX_IMAGES_PER_DOC", "200"))
@@ -68,11 +80,67 @@ def _ensure_warmup() -> None:
            raise


+def _acquire_models():
+    """warmup 보장 + inflight 진입을 원자적으로 — ensure 직후 reaper 가 해제하는 경합 차단."""
+    global _inflight
+    while True:
+        _ensure_warmup()
+        with _warmup_lock:
+            if _warmup_done:
+                _inflight += 1
+                return
+        # ensure 와 lock 재진입 사이에 unload 가 끼어든 희귀 경합 — 재시도
+
+
+def _release_models():
+    global _inflight, _last_used
+    with _warmup_lock:
+        _inflight -= 1
+        _last_used = time.monotonic()
+
+
+def _maybe_unload() -> None:
+    """유휴 시 모델 해제. 변환 중(inflight>0)이면 절대 해제하지 않는다.
+
+    split 변환의 배치 사이 간격은 초 단위 — N>=1분 임계면 배치 사이 해제 없음.
+    """
+    global _models, _converter, _warmup_done
+    with _warmup_lock:
+        if not _warmup_done or _inflight > 0:
+            return
+        if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60:
+            return
+        _models = None
+        _converter = None
+        _warmup_done = False
+    gc.collect()
+    try:
+        import torch
+        torch.cuda.empty_cache()
+    except Exception:
+        pass
+    logger.info(f"[marker-service] idle-unload: 모델 해제 (유휴 {_IDLE_UNLOAD_MINUTES}분 초과)")
+
+
+async def _idle_reaper():
+    import asyncio
+    while True:
+        await asyncio.sleep(60)
+        try:
+            _maybe_unload()
+        except Exception:
+            logger.exception("[marker-service] idle reaper 오류")
+
+
@app.on_event("startup")
 async def startup():
-    """startup hook — async warmup 백그라운드. /ready 가 완료 여부 노출."""
+    """startup hook — warmup 은 MARKER_PRELOAD 게이트 (D-1: lazy 기본 전환은 compose 가)."""
    import asyncio
-    asyncio.create_task(asyncio.to_thread(_ensure_warmup))
+    if _PRELOAD:
+        asyncio.create_task(asyncio.to_thread(_ensure_warmup))
+    if _IDLE_UNLOAD_MINUTES > 0:
+        asyncio.create_task(_idle_reaper())
+        logger.info(f"[marker-service] idle-unload 활성: {_IDLE_UNLOAD_MINUTES}분")


 class ConvertRequest(BaseModel):
@@ -111,7 +179,12 @@ def health():

@app.get("/ready")
 async def ready(response: Response):
-    """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출."""
+    """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출.
+
+    D-1: idle(미적재) = 200. 503 은 warmup_failed 한정 — lazy 모드에서 fastapi
+    depends_on service_healthy 가 영구 미기동으로 굳지 않게. 배포 검증에서
+    'status=ready' 단언하던 runbook 은 강제 warm 호출(/convert 1건)로 대체.
+    """
    if _warmup_error:
        response.status_code = 503
        return {
@@ -121,31 +194,28 @@ async def ready(response: Response):
            "error": _warmup_error,
        }
    if not _warmup_done:
-        response.status_code = 503
        return {
-            "status": "warming_up",
+            "status": "warming_up" if _PRELOAD else "idle",
            "engine": "marker",
            "engine_version": _engine_version,
+            "models_loaded": False,
+            "idle_unload_minutes": _IDLE_UNLOAD_MINUTES,
        }
    return {
        "status": "ready",
        "engine": "marker",
        "engine_version": _engine_version,
+        "models_loaded": True,
+        "inflight": _inflight,
+        "idle_unload_minutes": _IDLE_UNLOAD_MINUTES,
    }


@app.post("/convert", response_model=ConvertResponse)
 async def convert(req: ConvertRequest):
-    _ensure_warmup()
-
    p = Path(req.file_path)
    if not p.is_file():
        raise HTTPException(404, detail={"code": "file_not_found", "message": str(p)})
-
-    start = time.monotonic()
-    # page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음).
-    # invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환.
-    converter = _converter
    if req.start_page is not None and req.end_page is not None:
        if req.start_page < 1 or req.end_page < req.start_page:
            raise HTTPException(
@@ -155,22 +225,33 @@ async def convert(req: ConvertRequest):
                    "message": f"start_page={req.start_page} end_page={req.end_page}",
                },
            )
-        page_range = list(range(req.start_page - 1, req.end_page))  # 0-based inclusive
-        converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range})
-    try:
-        rendered = converter(str(p))
-    except Exception as exc:
-        logger.exception(f"[marker-service] conversion failed path={p}: {exc}")
-        raise HTTPException(
-            status_code=422,
-            detail={
-                "code": "conversion_failed",
-                "message": f"{type(exc).__name__}: {exc}",
-            },
-        ) from exc

-    md_text, _meta, raw_images = text_from_rendered(rendered)
-    elapsed_ms = int((time.monotonic() - start) * 1000)
+    # D-1: warmup 보장 + inflight 진입 원자화 — 변환 중 reaper 해제 차단. 해제는 finally.
+    _acquire_models()
+    try:
+        start = time.monotonic()
+        # page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음).
+        # invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환.
+        converter = _converter
+        if req.start_page is not None and req.end_page is not None:
+            page_range = list(range(req.start_page - 1, req.end_page))  # 0-based inclusive
+            converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range})
+        try:
+            rendered = converter(str(p))
+        except Exception as exc:
+            logger.exception(f"[marker-service] conversion failed path={p}: {exc}")
+            raise HTTPException(
+                status_code=422,
+                detail={
+                    "code": "conversion_failed",
+                    "message": f"{type(exc).__name__}: {exc}",
+                },
+            ) from exc
+
+        md_text, _meta, raw_images = text_from_rendered(rendered)
+        elapsed_ms = int((time.monotonic() - start) * 1000)
+    finally:
+        _release_models()

    images_payload, truncated = _serialize_images(raw_images, str(p))

@@ -1,14 +1,23 @@
 """STT 마이크로서비스 — faster-whisper (GPU) 기반 음성 전사.

 filePath → {text, segments:[{start,end,text}]}.
-모델은 startup 에서 eager preload (Docker /ready healthcheck 가 모델 적재까지 검증).
 기본 모델 large-v3 (VRAM ~3GB, float16). 환경변수로 교체 가능.

-환경변수 `STT_PRELOAD=0` 으로 lazy 로 강제 가능 (개발/테스트용).
+D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환:
+  STT_PRELOAD=0          : startup eager preload 끔 (첫 요청 시 lazy load)
+  STT_IDLE_UNLOAD_MINUTES: N분 유휴 시 모델 해제 (0=비활성, 기존 동작).
+                           faster-whisper=CTranslate2 라 torch 미설치 — 해제는
+                           참조 제거 + gc (CTranslate2 가 소멸 시 VRAM 반환).
+콜드로드 수초~수십 초는 호출측(stt_worker read=1800s)이 흡수. healthcheck 는
+cuda 가용성 기준 (compose) — 모델 적재는 더 이상 상시 상태가 아니다.
 """

+import asyncio
+import gc
 import logging
 import os
+import threading
+import time
 import unicodedata
 from contextlib import asynccontextmanager
 from pathlib import Path
@@ -17,18 +26,26 @@ from fastapi import FastAPI

 logger = logging.getLogger("stt")

+_IDLE_UNLOAD_MINUTES = int(os.getenv("STT_IDLE_UNLOAD_MINUTES", "0"))
+

@asynccontextmanager
 async def lifespan(_app: FastAPI):
    # startup: 모델 eager preload 시도. 실패해도 프로세스는 살아 있고
-    # /ready 가 false 로 남아 healthcheck 가 unhealthy 처리.
+    # /ready 의 models_loaded 가 false 로 남는다.
    if os.getenv("STT_PRELOAD", "1") != "0":
        try:
            _load_model()
            logger.info("stt model preloaded: %s (%s, %s)", _MODEL_NAME, _DEVICE, _COMPUTE_TYPE)
        except Exception as e:
            logger.exception("stt model preload failed: %s", e)
+    reaper = None
+    if _IDLE_UNLOAD_MINUTES > 0:
+        reaper = asyncio.create_task(_idle_reaper())
+        logger.info("stt idle-unload 활성: %d분", _IDLE_UNLOAD_MINUTES)
    yield
+    if reaper:
+        reaper.cancel()


 app = FastAPI(lifespan=lifespan)
@@ -38,6 +55,11 @@ _MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
 _DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
 _COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")

+# load/unload/inflight 상태 전이는 전부 이 lock 아래 (cold 동시 요청 이중 로드 방지 포함)
+_model_lock = threading.Lock()
+_inflight = 0
+_last_used = time.monotonic()
+

 def _resolve_path(file_path: str) -> Path | None:
    """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수. OCR 서비스와 동일 패턴."""
@@ -61,14 +83,38 @@ def _resolve_path(file_path: str) -> Path | None:


 def _load_model():
-    """faster-whisper lazy loading — 첫 호출 시만 VRAM 점유."""
+    """faster-whisper lazy loading — 첫 호출 시만 VRAM 점유. lock 으로 이중 로드 방지."""
    global _model
    if _model is not None:
        return _model
-    from faster_whisper import WhisperModel
+    with _model_lock:
+        if _model is None:
+            from faster_whisper import WhisperModel
+            logger.info("stt model loading: %s (%s, %s)", _MODEL_NAME, _DEVICE, _COMPUTE_TYPE)
+            _model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE)
+        return _model

-    _model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE)
-    return _model
+
+def _maybe_unload() -> None:
+    """유휴 시 모델 해제. 처리 중(inflight>0)이면 절대 해제하지 않는다."""
+    global _model
+    with _model_lock:
+        if _model is None or _inflight > 0:
+            return
+        if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60:
+            return
+        _model = None
+    gc.collect()
+    logger.info("stt idle-unload: whisper 모델 해제 (유휴 %d분 초과)", _IDLE_UNLOAD_MINUTES)
+
+
+async def _idle_reaper():
+    while True:
+        await asyncio.sleep(60)
+        try:
+            _maybe_unload()
+        except Exception:
+            logger.exception("stt idle reaper 오류")


 def _cuda_device_count() -> int:
@@ -87,7 +133,7 @@ def health():

@app.get("/ready")
 def ready():
-    """Readiness — CUDA + 모델 상태. 배포 검증용."""
+    """Readiness — CUDA + 모델 상태. healthcheck 는 cuda 만 본다 (D-1 idle-unload)."""
    count = _cuda_device_count()
    cuda_ok = count > 0
    models_loaded = _model is not None
@@ -98,6 +144,8 @@ def ready():
        "models_loaded": models_loaded,
        "model": _MODEL_NAME,
        "compute_type": _COMPUTE_TYPE,
+        "idle_unload_minutes": _IDLE_UNLOAD_MINUTES,
+        "inflight": _inflight,
    }


@@ -121,6 +169,7 @@ async def transcribe(body: dict):
        "duration": 1832.5
      }
    """
+    global _inflight, _last_used
    raw_path = body["filePath"]
    langs = body.get("langs")
    beam_size = int(body.get("beamSize", 5))
@@ -129,28 +178,35 @@ async def transcribe(body: dict):
    if resolved is None:
        return {"error": f"파일 없음: {raw_path}", "text": "", "segments": []}

-    model = _load_model()
+    with _model_lock:
+        _inflight += 1
+    try:
+        model = _load_model()

-    language = None
-    if isinstance(langs, list) and len(langs) == 1:
-        language = langs[0]
+        language = None
+        if isinstance(langs, list) and len(langs) == 1:
+            language = langs[0]

-    segments_iter, info = model.transcribe(
-        str(resolved),
-        beam_size=beam_size,
-        language=language,
-        vad_filter=True,
-    )
+        segments_iter, info = model.transcribe(
+            str(resolved),
+            beam_size=beam_size,
+            language=language,
+            vad_filter=True,
+        )

-    segments = []
-    parts = []
-    for seg in segments_iter:
-        segments.append({
-            "start": round(float(seg.start), 2),
-            "end": round(float(seg.end), 2),
-            "text": seg.text.strip(),
-        })
-        parts.append(seg.text)
+        segments = []
+        parts = []
+        for seg in segments_iter:
+            segments.append({
+                "start": round(float(seg.start), 2),
+                "end": round(float(seg.end), 2),
+                "text": seg.text.strip(),
+            })
+            parts.append(seg.text)
+    finally:
+        with _model_lock:
+            _inflight -= 1
+            _last_used = time.monotonic()

    return {
        "text": " ".join(p.strip() for p in parts).strip(),