From 3df0ca53aba9aa1ede1cb92bd873570922fd2b84 Mon Sep 17 00:00:00 2001 From: hyungi Date: Wed, 10 Jun 2026 13:03:31 +0900 Subject: [PATCH] =?UTF-8?q?feat(services):=20crawl-24x7=20A-8=20=ED=97=AC?= =?UTF-8?q?=EC=8A=A4=20=ED=8C=A8=EB=84=90=20+=20D-1=20stt/marker=20idle-un?= =?UTF-8?q?load?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A-8 1차: crawl-health 컨테이너(100.110.63.63:8765 Tailscale 바인딩 전용, 읽기 전용 SELECT, caddy 라우트 금지). D-1 전제 작업: STT_PRELOAD=0+30분 유휴 해제(lock+inflight+reaper), marker MARKER_PRELOAD=0+idle-unload, /ready idle=200(503=warmup_failed 한정 — fastapi depends_on 정합), healthcheck cuda 기준 전환. --- docker-compose.yml | 31 +++- services/crawl-health/Dockerfile | 12 ++ services/crawl-health/requirements.txt | 3 + services/crawl-health/server.py | 202 +++++++++++++++++++++++++ services/marker/server.py | 137 +++++++++++++---- services/stt/server.py | 110 ++++++++++---- 6 files changed, 437 insertions(+), 58 deletions(-) create mode 100644 services/crawl-health/Dockerfile create mode 100644 services/crawl-health/requirements.txt create mode 100644 services/crawl-health/server.py diff --git a/docker-compose.yml b/docker-compose.yml index 8e67995..7e1283e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -64,6 +64,11 @@ services: environment: - HF_HOME=/models/huggingface - TORCH_HOME=/models/torch + # D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~3.5GB) 해제가 90% 봉투의 전제. + # /ready 는 idle 에서도 200 (fastapi depends_on service_healthy 유지). + # 롤백 = MARKER_PRELOAD=1 + MARKER_IDLE_UNLOAD_MINUTES=0. + - MARKER_PRELOAD=0 + - MARKER_IDLE_UNLOAD_MINUTES=${MARKER_IDLE_UNLOAD_MINUTES:-30} volumes: - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro - marker_models:/models @@ -97,6 +102,11 @@ services: - WHISPER_MODEL=${WHISPER_MODEL:-large-v3} - WHISPER_DEVICE=${WHISPER_DEVICE:-cuda} - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16} + # D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~4GB) 해제가 90% 봉투의 전제. + # 콜드로드 수초~수십 초는 배치 작업이라 무방 (stt_worker read=1800s 가 흡수). + # 롤백 = STT_PRELOAD=1 + STT_IDLE_UNLOAD_MINUTES=0. + - STT_PRELOAD=0 + - STT_IDLE_UNLOAD_MINUTES=${STT_IDLE_UNLOAD_MINUTES:-30} deploy: resources: reservations: @@ -105,9 +115,9 @@ services: count: 1 capabilities: [gpu] healthcheck: - # /ready: CUDA 디바이스 + 모델 적재 둘 다 확인. ready=true 만 healthy 처리. - # /health 는 단순 liveness 라 모델 미적재 상태도 healthy 로 잡혀 운영 신호로 부적합. - test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('ready') else 1)"] + # D-1: idle-unload 도입으로 '모델 적재' 는 더 이상 상시 상태가 아님 — cuda 가용성만 + # healthy 기준. 모델 적재 여부는 /ready 의 models_loaded 필드로 관측(정보성). + test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('cuda') else 1)"] interval: 30s timeout: 10s retries: 3 @@ -229,6 +239,21 @@ services: - fastapi restart: unless-stopped + # crawl-24x7 A-8 1차: 전 소스 헬스 패널 — 내부 전용 (읽기 전용 SELECT 만). + # '내부 전용' 성립 구현 = 별도 바인딩뿐 (r4 결정): Tailscale 인터페이스에만 publish. + # 기존 SvelteKit 라우트(vhost=Host 헤더 검사=앱 가드 환원)나 프록시 경로 차단(경로 가드 + # 회귀)으로 옮기지 말 것. caddy/home-caddy 라우트 추가 금지. fastapi/postgres 바인딩 선례. + crawl-health: + build: ./services/crawl-health + ports: + - "100.110.63.63:8765:8765" + environment: + - CRAWL_HEALTH_DSN=postgresql://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm + depends_on: + postgres: + condition: service_healthy + restart: unless-stopped + caddy: image: caddy:2 ports: diff --git a/services/crawl-health/Dockerfile b/services/crawl-health/Dockerfile new file mode 100644 index 0000000..4851237 --- /dev/null +++ b/services/crawl-health/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.12-slim + +WORKDIR /srv +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY server.py . + +EXPOSE 8765 +HEALTHCHECK --interval=30s --timeout=5s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8765/health')" + +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8765"] diff --git a/services/crawl-health/requirements.txt b/services/crawl-health/requirements.txt new file mode 100644 index 0000000..70e1fdf --- /dev/null +++ b/services/crawl-health/requirements.txt @@ -0,0 +1,3 @@ +fastapi>=0.111.0 +uvicorn>=0.30.0 +asyncpg>=0.29.0 diff --git a/services/crawl-health/server.py b/services/crawl-health/server.py new file mode 100644 index 0000000..ef8dc11 --- /dev/null +++ b/services/crawl-health/server.py @@ -0,0 +1,202 @@ +"""crawl-health — 전 소스 헬스 패널 1차 (A-8, plan crawl-24x7-1) + +읽기 전용 내부 운영 패널. 의존 = 기존 수집 상태(news_sources/source_health/documents/ +processing_queue SELECT 만) — 쓰기 0. + +[1차] 소스별 last success / 수집 건수 추이(24h/7d) / 연속 실패 / circuit 상태 / +빈 피드 streak + fulltext 승격/격하 통계 + 큐 백로그. 비-RSS 소스(C-2 sitemap 등)도 +같은 표면이 수용 (fetch_method 컬럼 표시 — '구독 소스 패널' 로 좁히지 않는 전 소스 일반화). +[2차 범위 외] B-3 상태 계약 도착 시 세션 열 + [재로그인 시도] 버튼(enqueue 방식). + +노출: 별도 바인딩만 — compose 가 Tailscale 인터페이스(100.110.63.63)에만 publish. +vhost/경로 가드 방식 금지 (r4: 둘 다 '덜 깨짐' 속성 상실). 앱 레벨 인증 없음 = +Tailscale 도달성만이 경계 (fab-server 선례). +""" + +import html +import logging +import os +from contextlib import asynccontextmanager + +import asyncpg +from fastapi import FastAPI +from fastapi.responses import HTMLResponse, JSONResponse + +logger = logging.getLogger("crawl_health") + +DSN = os.environ.get("CRAWL_HEALTH_DSN", "") + +_pool: asyncpg.Pool | None = None + + +@asynccontextmanager +async def lifespan(_app: FastAPI): + global _pool + _pool = await asyncpg.create_pool(DSN, min_size=1, max_size=3) + yield + await _pool.close() + + +app = FastAPI(lifespan=lifespan) + + +async def _collect_data() -> dict: + async with _pool.acquire() as conn: + sources = await conn.fetch( + """ + SELECT s.id, s.name, s.country, s.enabled, s.feed_type, s.fetch_method, + s.fulltext_policy, s.last_fetched_at, + h.circuit_state, h.consecutive_failures, h.last_success_at, + h.last_error, h.last_error_at, h.last_fetch_items, h.empty_streak, + h.total_fetches, h.total_failures + FROM news_sources s + LEFT JOIN source_health h ON h.source_id = s.id + ORDER BY s.enabled DESC, s.name + """ + ) + counts = await conn.fetch( + """ + SELECT s.id, + count(d.id) FILTER (WHERE d.extracted_at > now() - interval '24 hours') AS items_24h, + count(d.id) AS items_7d + FROM news_sources s + LEFT JOIN documents d + ON d.source_channel = 'news' + AND d.extracted_at > now() - interval '7 days' + AND d.file_path LIKE 'news/' || s.name || '/%' + GROUP BY s.id + """ + ) + queue = await conn.fetch( + """ + SELECT stage::text AS stage, status::text AS status, count(*) AS n, + min(created_at) FILTER (WHERE status = 'pending') AS oldest_pending + FROM processing_queue + WHERE stage IN ('fulltext', 'summarize', 'embed', 'chunk') + AND status IN ('pending', 'processing', 'failed') + GROUP BY 1, 2 + ORDER BY 1, 2 + """ + ) + fulltext = await conn.fetch( + """ + SELECT extract_meta -> 'fulltext' ->> 'status' AS status, count(*) AS n + FROM documents + WHERE source_channel = 'news' AND extract_meta ? 'fulltext' + GROUP BY 1 + """ + ) + count_map = {r["id"]: r for r in counts} + return { + "sources": [ + {**dict(r), + "items_24h": count_map.get(r["id"], {}).get("items_24h", 0), + "items_7d": count_map.get(r["id"], {}).get("items_7d", 0)} + for r in sources + ], + "queue": [dict(r) for r in queue], + "fulltext": [dict(r) for r in fulltext], + } + + +@app.get("/health") +async def health(): + """Liveness — Docker healthcheck 용 (DB 미접근, 프로세스 생존만).""" + return {"status": "ok", "service": "crawl-health"} + + +@app.get("/api/health.json") +async def api_health(): + data = await _collect_data() + # asyncpg Record 의 datetime → isoformat 직렬화 + def _ser(v): + return v.isoformat() if hasattr(v, "isoformat") else v + return JSONResponse({ + k: [{kk: _ser(vv) for kk, vv in row.items()} for row in v] + for k, v in data.items() + }) + + +def _chip(state: str | None, enabled: bool) -> str: + if not enabled: + return 'OFF' + if state == "disabled": + return 'DISABLED' + if state == "open": + return 'OPEN' + return 'OK' + + +def _fmt_ts(v) -> str: + return v.strftime("%m-%d %H:%M") if v else "-" + + +@app.get("/", response_class=HTMLResponse) +async def index(): + data = await _collect_data() + rows = [] + for s in data["sources"]: + err = html.escape((s.get("last_error") or "")[:80]) + warn_cls = "" + if s["enabled"] and (s.get("consecutive_failures") or 0) >= 3: + warn_cls = ' class="row-warn"' + elif s["enabled"] and (s.get("empty_streak") or 0) >= 8: + warn_cls = ' class="row-warn"' + rows.append( + f"" + f"{html.escape(s['name'])}" + f"{_chip(s.get('circuit_state'), s['enabled'])}" + f"{html.escape(s.get('fetch_method') or 'rss')}" + f"{html.escape(s.get('fulltext_policy') or 'none')}" + f"{s['items_24h']}" + f"{s['items_7d']}" + f"{s.get('consecutive_failures') or 0}" + f"{s.get('empty_streak') or 0}" + f"{_fmt_ts(s.get('last_success_at'))}" + f"{_fmt_ts(s.get('last_fetched_at'))}" + f"{err}" + f"" + ) + qrows = [ + f"{html.escape(q['stage'])}{html.escape(q['status'])}" + f"{q['n']}{_fmt_ts(q.get('oldest_pending'))}" + for q in data["queue"] + ] + frows = [ + f"{html.escape(f['status'] or '-')}{f['n']}" + for f in data["fulltext"] + ] + body = f""" + +crawl-health — 전 소스 헬스 패널 + +

crawl-health — 전 소스 헬스 패널

+
A-8 1차 (피드 수집 헬스) · 내부 전용 (Tailscale 바인딩) · 새로고침 = 실시간 조회
+

소스 ({len(rows)})

+ + +{''.join(rows)}
소스circuitfetchfulltext24h7d연속실패빈피드last successlast fetchlast error
+

처리 큐 (fulltext / summarize / embed / chunk)

+ +{''.join(qrows) or ''}
stagestatus건수oldest pending
백로그 없음
+

fulltext 승격 누적

+ +{''.join(frows) or ''}
status건수
기록 없음 (파일럿 전환 전)
+""" + return HTMLResponse(body) diff --git a/services/marker/server.py b/services/marker/server.py index 3e6fb5d..da7738a 100644 --- a/services/marker/server.py +++ b/services/marker/server.py @@ -1,12 +1,18 @@ """marker-service — POST /convert: PDF → markdown + 추출 이미지 base64. Phase 1B (2026-05-01) — 텍스트만 응답, 이미지 폐기. -Phase 1B.5 (본 변경) — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이 +Phase 1B.5 — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이 없는 stateless 변환기 유지 (fastapi 가 NAS persist 담당). +D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환: + MARKER_PRELOAD=0 : startup warmup 끔 (첫 /convert 시 lazy load) + MARKER_IDLE_UNLOAD_MINUTES : N분 유휴 시 모델 해제 (0=비활성, 기존 동작) + /ready 는 idle(미적재)에서도 200 — fastapi 의 depends_on service_healthy 가 + lazy 모드에서 영구 미기동으로 굳는 것 방지. 503 은 warmup_failed 한정. plan: ~/.claude/plans/piped-humming-crystal.md """ import base64 +import gc import hashlib import io import logging @@ -40,6 +46,12 @@ _warmup_done = False _warmup_error: str | None = None _warmup_lock = threading.Lock() +# D-1 idle-unload 상태 — 전이는 전부 _warmup_lock 아래 +_PRELOAD = os.getenv("MARKER_PRELOAD", "1") != "0" +_IDLE_UNLOAD_MINUTES = int(os.getenv("MARKER_IDLE_UNLOAD_MINUTES", "0")) +_inflight = 0 +_last_used = time.monotonic() + # 이미지 응답 cap. base64 응답 크기 폭주 방지. 사용자 PDF 풀 측정 (Phase 1D) 시 # 가장 이미지 많은 문서가 ~30건 수준 → 200 은 안전 마진. 초과 시 truncate flag 응답. MAX_IMAGES_PER_DOC = int(os.getenv("MARKER_MAX_IMAGES_PER_DOC", "200")) @@ -68,11 +80,67 @@ def _ensure_warmup() -> None: raise +def _acquire_models(): + """warmup 보장 + inflight 진입을 원자적으로 — ensure 직후 reaper 가 해제하는 경합 차단.""" + global _inflight + while True: + _ensure_warmup() + with _warmup_lock: + if _warmup_done: + _inflight += 1 + return + # ensure 와 lock 재진입 사이에 unload 가 끼어든 희귀 경합 — 재시도 + + +def _release_models(): + global _inflight, _last_used + with _warmup_lock: + _inflight -= 1 + _last_used = time.monotonic() + + +def _maybe_unload() -> None: + """유휴 시 모델 해제. 변환 중(inflight>0)이면 절대 해제하지 않는다. + + split 변환의 배치 사이 간격은 초 단위 — N>=1분 임계면 배치 사이 해제 없음. + """ + global _models, _converter, _warmup_done + with _warmup_lock: + if not _warmup_done or _inflight > 0: + return + if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60: + return + _models = None + _converter = None + _warmup_done = False + gc.collect() + try: + import torch + torch.cuda.empty_cache() + except Exception: + pass + logger.info(f"[marker-service] idle-unload: 모델 해제 (유휴 {_IDLE_UNLOAD_MINUTES}분 초과)") + + +async def _idle_reaper(): + import asyncio + while True: + await asyncio.sleep(60) + try: + _maybe_unload() + except Exception: + logger.exception("[marker-service] idle reaper 오류") + + @app.on_event("startup") async def startup(): - """startup hook — async warmup 백그라운드. /ready 가 완료 여부 노출.""" + """startup hook — warmup 은 MARKER_PRELOAD 게이트 (D-1: lazy 기본 전환은 compose 가).""" import asyncio - asyncio.create_task(asyncio.to_thread(_ensure_warmup)) + if _PRELOAD: + asyncio.create_task(asyncio.to_thread(_ensure_warmup)) + if _IDLE_UNLOAD_MINUTES > 0: + asyncio.create_task(_idle_reaper()) + logger.info(f"[marker-service] idle-unload 활성: {_IDLE_UNLOAD_MINUTES}분") class ConvertRequest(BaseModel): @@ -111,7 +179,12 @@ def health(): @app.get("/ready") async def ready(response: Response): - """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출.""" + """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출. + + D-1: idle(미적재) = 200. 503 은 warmup_failed 한정 — lazy 모드에서 fastapi + depends_on service_healthy 가 영구 미기동으로 굳지 않게. 배포 검증에서 + 'status=ready' 단언하던 runbook 은 강제 warm 호출(/convert 1건)로 대체. + """ if _warmup_error: response.status_code = 503 return { @@ -121,31 +194,28 @@ async def ready(response: Response): "error": _warmup_error, } if not _warmup_done: - response.status_code = 503 return { - "status": "warming_up", + "status": "warming_up" if _PRELOAD else "idle", "engine": "marker", "engine_version": _engine_version, + "models_loaded": False, + "idle_unload_minutes": _IDLE_UNLOAD_MINUTES, } return { "status": "ready", "engine": "marker", "engine_version": _engine_version, + "models_loaded": True, + "inflight": _inflight, + "idle_unload_minutes": _IDLE_UNLOAD_MINUTES, } @app.post("/convert", response_model=ConvertResponse) async def convert(req: ConvertRequest): - _ensure_warmup() - p = Path(req.file_path) if not p.is_file(): raise HTTPException(404, detail={"code": "file_not_found", "message": str(p)}) - - start = time.monotonic() - # page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음). - # invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환. - converter = _converter if req.start_page is not None and req.end_page is not None: if req.start_page < 1 or req.end_page < req.start_page: raise HTTPException( @@ -155,22 +225,33 @@ async def convert(req: ConvertRequest): "message": f"start_page={req.start_page} end_page={req.end_page}", }, ) - page_range = list(range(req.start_page - 1, req.end_page)) # 0-based inclusive - converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range}) - try: - rendered = converter(str(p)) - except Exception as exc: - logger.exception(f"[marker-service] conversion failed path={p}: {exc}") - raise HTTPException( - status_code=422, - detail={ - "code": "conversion_failed", - "message": f"{type(exc).__name__}: {exc}", - }, - ) from exc - md_text, _meta, raw_images = text_from_rendered(rendered) - elapsed_ms = int((time.monotonic() - start) * 1000) + # D-1: warmup 보장 + inflight 진입 원자화 — 변환 중 reaper 해제 차단. 해제는 finally. + _acquire_models() + try: + start = time.monotonic() + # page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음). + # invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환. + converter = _converter + if req.start_page is not None and req.end_page is not None: + page_range = list(range(req.start_page - 1, req.end_page)) # 0-based inclusive + converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range}) + try: + rendered = converter(str(p)) + except Exception as exc: + logger.exception(f"[marker-service] conversion failed path={p}: {exc}") + raise HTTPException( + status_code=422, + detail={ + "code": "conversion_failed", + "message": f"{type(exc).__name__}: {exc}", + }, + ) from exc + + md_text, _meta, raw_images = text_from_rendered(rendered) + elapsed_ms = int((time.monotonic() - start) * 1000) + finally: + _release_models() images_payload, truncated = _serialize_images(raw_images, str(p)) diff --git a/services/stt/server.py b/services/stt/server.py index decc778..fd2f300 100644 --- a/services/stt/server.py +++ b/services/stt/server.py @@ -1,14 +1,23 @@ """STT 마이크로서비스 — faster-whisper (GPU) 기반 음성 전사. filePath → {text, segments:[{start,end,text}]}. -모델은 startup 에서 eager preload (Docker /ready healthcheck 가 모델 적재까지 검증). 기본 모델 large-v3 (VRAM ~3GB, float16). 환경변수로 교체 가능. -환경변수 `STT_PRELOAD=0` 으로 lazy 로 강제 가능 (개발/테스트용). +D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환: + STT_PRELOAD=0 : startup eager preload 끔 (첫 요청 시 lazy load) + STT_IDLE_UNLOAD_MINUTES: N분 유휴 시 모델 해제 (0=비활성, 기존 동작). + faster-whisper=CTranslate2 라 torch 미설치 — 해제는 + 참조 제거 + gc (CTranslate2 가 소멸 시 VRAM 반환). +콜드로드 수초~수십 초는 호출측(stt_worker read=1800s)이 흡수. healthcheck 는 +cuda 가용성 기준 (compose) — 모델 적재는 더 이상 상시 상태가 아니다. """ +import asyncio +import gc import logging import os +import threading +import time import unicodedata from contextlib import asynccontextmanager from pathlib import Path @@ -17,18 +26,26 @@ from fastapi import FastAPI logger = logging.getLogger("stt") +_IDLE_UNLOAD_MINUTES = int(os.getenv("STT_IDLE_UNLOAD_MINUTES", "0")) + @asynccontextmanager async def lifespan(_app: FastAPI): # startup: 모델 eager preload 시도. 실패해도 프로세스는 살아 있고 - # /ready 가 false 로 남아 healthcheck 가 unhealthy 처리. + # /ready 의 models_loaded 가 false 로 남는다. if os.getenv("STT_PRELOAD", "1") != "0": try: _load_model() logger.info("stt model preloaded: %s (%s, %s)", _MODEL_NAME, _DEVICE, _COMPUTE_TYPE) except Exception as e: logger.exception("stt model preload failed: %s", e) + reaper = None + if _IDLE_UNLOAD_MINUTES > 0: + reaper = asyncio.create_task(_idle_reaper()) + logger.info("stt idle-unload 활성: %d분", _IDLE_UNLOAD_MINUTES) yield + if reaper: + reaper.cancel() app = FastAPI(lifespan=lifespan) @@ -38,6 +55,11 @@ _MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3") _DEVICE = os.getenv("WHISPER_DEVICE", "cuda") _COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16") +# load/unload/inflight 상태 전이는 전부 이 lock 아래 (cold 동시 요청 이중 로드 방지 포함) +_model_lock = threading.Lock() +_inflight = 0 +_last_used = time.monotonic() + def _resolve_path(file_path: str) -> Path | None: """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수. OCR 서비스와 동일 패턴.""" @@ -61,14 +83,38 @@ def _resolve_path(file_path: str) -> Path | None: def _load_model(): - """faster-whisper lazy loading — 첫 호출 시만 VRAM 점유.""" + """faster-whisper lazy loading — 첫 호출 시만 VRAM 점유. lock 으로 이중 로드 방지.""" global _model if _model is not None: return _model - from faster_whisper import WhisperModel + with _model_lock: + if _model is None: + from faster_whisper import WhisperModel + logger.info("stt model loading: %s (%s, %s)", _MODEL_NAME, _DEVICE, _COMPUTE_TYPE) + _model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE) + return _model - _model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE) - return _model + +def _maybe_unload() -> None: + """유휴 시 모델 해제. 처리 중(inflight>0)이면 절대 해제하지 않는다.""" + global _model + with _model_lock: + if _model is None or _inflight > 0: + return + if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60: + return + _model = None + gc.collect() + logger.info("stt idle-unload: whisper 모델 해제 (유휴 %d분 초과)", _IDLE_UNLOAD_MINUTES) + + +async def _idle_reaper(): + while True: + await asyncio.sleep(60) + try: + _maybe_unload() + except Exception: + logger.exception("stt idle reaper 오류") def _cuda_device_count() -> int: @@ -87,7 +133,7 @@ def health(): @app.get("/ready") def ready(): - """Readiness — CUDA + 모델 상태. 배포 검증용.""" + """Readiness — CUDA + 모델 상태. healthcheck 는 cuda 만 본다 (D-1 idle-unload).""" count = _cuda_device_count() cuda_ok = count > 0 models_loaded = _model is not None @@ -98,6 +144,8 @@ def ready(): "models_loaded": models_loaded, "model": _MODEL_NAME, "compute_type": _COMPUTE_TYPE, + "idle_unload_minutes": _IDLE_UNLOAD_MINUTES, + "inflight": _inflight, } @@ -121,6 +169,7 @@ async def transcribe(body: dict): "duration": 1832.5 } """ + global _inflight, _last_used raw_path = body["filePath"] langs = body.get("langs") beam_size = int(body.get("beamSize", 5)) @@ -129,28 +178,35 @@ async def transcribe(body: dict): if resolved is None: return {"error": f"파일 없음: {raw_path}", "text": "", "segments": []} - model = _load_model() + with _model_lock: + _inflight += 1 + try: + model = _load_model() - language = None - if isinstance(langs, list) and len(langs) == 1: - language = langs[0] + language = None + if isinstance(langs, list) and len(langs) == 1: + language = langs[0] - segments_iter, info = model.transcribe( - str(resolved), - beam_size=beam_size, - language=language, - vad_filter=True, - ) + segments_iter, info = model.transcribe( + str(resolved), + beam_size=beam_size, + language=language, + vad_filter=True, + ) - segments = [] - parts = [] - for seg in segments_iter: - segments.append({ - "start": round(float(seg.start), 2), - "end": round(float(seg.end), 2), - "text": seg.text.strip(), - }) - parts.append(seg.text) + segments = [] + parts = [] + for seg in segments_iter: + segments.append({ + "start": round(float(seg.start), 2), + "end": round(float(seg.end), 2), + "text": seg.text.strip(), + }) + parts.append(seg.text) + finally: + with _model_lock: + _inflight -= 1 + _last_used = time.monotonic() return { "text": " ".join(p.strip() for p in parts).strip(),