From 1e2c004dd41824254a7931ab404713bb7e633de5 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Fri, 24 Apr 2026 06:47:36 +0900 Subject: [PATCH] =?UTF-8?q?feat(media):=20=C2=A73=20audio=20STT=20+=20vide?= =?UTF-8?q?o=20=EC=9E=AC=EC=83=9D=20=EC=9D=B8=ED=94=84=EB=9D=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit plan: ~/.claude/plans/luminous-sprouting-hamster.md §3 스키마: - migrations/147_audio_segments_table.sql: audio_segments (STT 타임스탬프 세그먼트) - migrations/148_audio_segments_idx.sql: (document_id, start_s) idx - migrations/149_document_media_cols.sql: documents.thumbnail_path + needs_conversion - migrations/150_queue_stage_stt.sql: process_stage += 'stt' - migrations/151_queue_stage_thumbnail.sql: process_stage += 'thumbnail' - app/models/audio_segment.py, document.py (thumbnail_path/needs_conversion) 서비스: - services/stt/{Dockerfile, requirements.txt, server.py} — faster-whisper large-v3 GPU 컨테이너. /transcribe (filePath/langs/beamSize) + /health + /ready (cuda device_count + model_loaded). NFC/NFD 경로 resolver (OCR 교훈). - docker-compose.yml: stt-service 추가 (GPU 1 예약, :3300, NAS ro mount, stt_models volume, start_period 300s), fastapi env 에 STT_ENDPOINT. 파이프라인 (의존 §1 category): - app/workers/stt_worker.py 신규: stage='stt' pickup → STT_ENDPOINT 호출 → extracted_text + audio_segments 저장. Timeout 30분. - app/workers/thumbnail_worker.py 신규: ffmpeg 50% 지점 1장 → PKM/Videos/.thumbs/{id}.jpg + thumbnail_path 세팅. needs_conversion=true 는 skip. - app/workers/file_watcher.py 확장: PKM/{Inbox, Recordings, Videos} 스캔. 확장자→category, audio→stage=stt, video .mp4/.webm→ stage=thumbnail, video .mov/.mkv/.avi→needs_conversion=true + stage 없음. settings.roon_library_path prefix skip. - app/workers/queue_consumer.py 확장: stt + thumbnail workers 등록, BATCH_SIZE(stt=1, thumbnail=3), next_stages 에 stt→[classify] 추가 (audio 는 extract 건너뜀). - app/Dockerfile: ffmpeg 추가 (썸네일 subprocess 용). 
API (의존 §1): - /api/audio/{id}/segments — AudioSegment ORDER BY start_s - /api/video/{id}/thumbnail — thumbnail_path FileResponse (쿼리 토큰) - /api/documents/{id}/file: media_types 에 audio/video mime 포함 (§2 커밋에 이미 포함). Starlette FileResponse 가 Range 자동. - upload_document: .mov/.mkv/.avi 웹 업로드 거부 (error_code unsupported_codec). NAS 드롭은 file_watcher 가 quarantine 수용. 프론트: - AudioPlayer.svelte: HTML5 audio + 전사 세그먼트 sticky 패널 + 줄 클릭 seek. activeIdx 하이라이트. - VideoPlayer.svelte: HTML5 video direct play + needs_conversion 안내 카드. poster 는 thumbnail endpoint. - /audio (목록 grid) + /audio/[id] (플레이어) - /video (썸네일 grid + 변환 필요 배지) + /video/[id] (플레이어) - Sidebar.svelte: Mic/Film 아이콘 + audio/video 네비 활성, count 배지 (§2 /stats/category-counts 재사용). 설정: - app/core/config.py: stt_endpoint + roon_library_path. DoD 배포 후 smoke: /ready cuda:true, 회의 mp3 transcribe, audio extract 없이 classify 진행(queue 회귀), /audio 재생, .mp4 재생, .mov 웹 400, .mov NAS quarantine, Sidebar 네비 + count. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/Dockerfile | 5 +- app/api/audio.py | 72 ++++++++ app/api/documents.py | 11 ++ app/api/video.py | 56 +++++++ app/core/config.py | 12 ++ app/main.py | 4 + app/models/audio_segment.py | 18 ++ app/models/document.py | 6 + app/workers/file_watcher.py | 154 +++++++++++++----- app/workers/queue_consumer.py | 21 ++- app/workers/stt_worker.py | 89 ++++++++++ app/workers/thumbnail_worker.py | 129 +++++++++++++++ docker-compose.yml | 28 ++++ .../src/lib/components/AudioPlayer.svelte | 95 +++++++++++ frontend/src/lib/components/Sidebar.svelte | 34 +++- .../src/lib/components/VideoPlayer.svelte | 42 +++++ frontend/src/routes/audio/+page.svelte | 105 ++++++++++++ frontend/src/routes/audio/[id]/+page.svelte | 61 +++++++ frontend/src/routes/video/+page.svelte | 104 ++++++++++++ frontend/src/routes/video/[id]/+page.svelte | 66 ++++++++ migrations/147_audio_segments_table.sql | 17 ++ migrations/148_audio_segments_idx.sql | 8 + migrations/149_document_media_cols.sql | 13 ++ 
migrations/150_queue_stage_stt.sql | 11 ++ migrations/151_queue_stage_thumbnail.sql | 9 + services/stt/Dockerfile | 21 +++ services/stt/requirements.txt | 3 + services/stt/server.py | 140 ++++++++++++++++ 28 files changed, 1284 insertions(+), 50 deletions(-) create mode 100644 app/api/audio.py create mode 100644 app/api/video.py create mode 100644 app/models/audio_segment.py create mode 100644 app/workers/stt_worker.py create mode 100644 app/workers/thumbnail_worker.py create mode 100644 frontend/src/lib/components/AudioPlayer.svelte create mode 100644 frontend/src/lib/components/VideoPlayer.svelte create mode 100644 frontend/src/routes/audio/+page.svelte create mode 100644 frontend/src/routes/audio/[id]/+page.svelte create mode 100644 frontend/src/routes/video/+page.svelte create mode 100644 frontend/src/routes/video/[id]/+page.svelte create mode 100644 migrations/147_audio_segments_table.sql create mode 100644 migrations/148_audio_segments_idx.sql create mode 100644 migrations/149_document_media_cols.sql create mode 100644 migrations/150_queue_stage_stt.sql create mode 100644 migrations/151_queue_stage_thumbnail.sql create mode 100644 services/stt/Dockerfile create mode 100644 services/stt/requirements.txt create mode 100644 services/stt/server.py diff --git a/app/Dockerfile b/app/Dockerfile index e11a717..cc7528c 100644 --- a/app/Dockerfile +++ b/app/Dockerfile @@ -2,12 +2,13 @@ FROM python:3.11-slim WORKDIR /app -# LibreOffice headless (PDF 변환용) + 한글/CJK 폰트 +# LibreOffice headless (PDF 변환용) + 한글/CJK 폰트 + ffmpeg (비디오 썸네일) RUN apt-get update && \ apt-get install -y --no-install-recommends \ libreoffice-core libreoffice-calc libreoffice-writer libreoffice-impress \ fonts-noto-cjk fonts-noto-cjk-extra fonts-nanum \ - fonts-noto-core fonts-noto-extra && \ + fonts-noto-core fonts-noto-extra \ + ffmpeg && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY requirements.txt . 
diff --git a/app/api/audio.py b/app/api/audio.py new file mode 100644 index 0000000..03ca447 --- /dev/null +++ b/app/api/audio.py @@ -0,0 +1,72 @@ +"""오디오 전사(STT) 조회 API — /api/audio + +AudioPlayer 가 줄 단위로 렌더하고 클릭 시 audio.currentTime 으로 점프한다. +""" + +from typing import Annotated + +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from core.auth import get_current_user +from core.database import get_session +from models.audio_segment import AudioSegment +from models.document import Document +from models.user import User + +router = APIRouter() + + +class AudioSegmentResponse(BaseModel): + start: float + end: float + text: str + + model_config = {"from_attributes": True} + + +class AudioSegmentsResponse(BaseModel): + document_id: int + language: str | None + duration: float | None + segments: list[AudioSegmentResponse] + + +@router.get("/{doc_id}/segments", response_model=AudioSegmentsResponse) +async def get_audio_segments( + doc_id: int, + user: Annotated[User, Depends(get_current_user)], + session: Annotated[AsyncSession, Depends(get_session)], +): + """audio 문서의 전사 세그먼트 조회. + + category='audio' 가 아닌 문서는 404. 세그먼트가 아직 없는 경우 빈 배열 반환. + language / duration 은 현재 ORM 에 별도 컬럼이 없어 None (필요 시 후속 확장). 
+ """ + doc = await session.get(Document, doc_id) + if not doc or doc.deleted_at is not None: + raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다") + + if getattr(doc, "category", None) != "audio": + raise HTTPException(status_code=404, detail="오디오 문서가 아닙니다") + + result = await session.execute( + select(AudioSegment) + .where(AudioSegment.document_id == doc_id) + .order_by(AudioSegment.start_s.asc()) + ) + rows = result.scalars().all() + + segments = [ + AudioSegmentResponse(start=r.start_s, end=r.end_s, text=r.text) + for r in rows + ] + + return AudioSegmentsResponse( + document_id=doc_id, + language=None, + duration=None, + segments=segments, + ) diff --git a/app/api/documents.py b/app/api/documents.py index 8407168..196a430 100644 --- a/app/api/documents.py +++ b/app/api/documents.py @@ -594,6 +594,17 @@ async def upload_document( if not safe_name or safe_name.startswith("."): raise HTTPException(status_code=400, detail="유효하지 않은 파일명") + # §3: 웹 업로드는 direct-play 불가 비디오 거부 (NAS 드롭은 file_watcher 가 + # quarantine 으로 수용). UploadDropzone 이 error_code='unsupported_codec' 로 + # 배너 분기. + VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"} + if Path(safe_name).suffix.lower() in VIDEO_QUARANTINE_EXTS: + raise _upload_error( + status_code=400, + error_code="unsupported_codec", + message="브라우저에서 직접 재생 불가한 포맷입니다. mp4 (H.264/AAC) 또는 webm (VP9) 으로 변환 후 다시 올리세요.", + ) + # ── 대상 경로 결정 ── inbox_dir = Path(settings.nas_mount_path) / "PKM" / "Inbox" diff --git a/app/api/video.py b/app/api/video.py new file mode 100644 index 0000000..d245c69 --- /dev/null +++ b/app/api/video.py @@ -0,0 +1,56 @@ +"""비디오 썸네일 서빙 API — /api/video + +ffmpeg 썸네일 생성은 thumbnail_worker 에서 수행. 본 라우터는 저장된 파일만 서빙. 
+""" + +from pathlib import Path +from typing import Annotated + +from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi.responses import FileResponse +from sqlalchemy.ext.asyncio import AsyncSession + +from core.auth import decode_token, get_current_user +from core.database import get_session +from models.document import Document +from models.user import User + +router = APIRouter() + + +@router.get("/{doc_id}/thumbnail") +async def get_video_thumbnail( + doc_id: int, + session: Annotated[AsyncSession, Depends(get_session)], + token: str | None = Query(None, description="Bearer token (img src 용)"), + user: User | None = Depends(lambda: None), +): + """비디오 썸네일 jpg 서빙. `` 바인딩 가능. + + 쿼리 토큰 또는 Authorization 헤더 중 하나로 인증. /file 엔드포인트와 동일 정책. + """ + # 쿼리 토큰 검증 (img src 용) — /file 과 동일 패턴 + if not token: + raise HTTPException(status_code=401, detail="토큰이 필요합니다") + + payload = decode_token(token) + if not payload or payload.get("type") != "access": + raise HTTPException(status_code=401, detail="유효하지 않은 토큰") + + doc = await session.get(Document, doc_id) + if not doc or doc.deleted_at is not None: + raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다") + + thumb = getattr(doc, "thumbnail_path", None) + if not thumb: + raise HTTPException(status_code=404, detail="썸네일이 아직 생성되지 않았습니다") + + path = Path(thumb) + if not path.exists(): + raise HTTPException(status_code=404, detail="썸네일 파일이 없습니다") + + return FileResponse( + path=str(path), + media_type="image/jpeg", + headers={"Content-Disposition": "inline"}, + ) diff --git a/app/core/config.py b/app/core/config.py index b9e9e1e..3bc030b 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -61,6 +61,14 @@ class Settings(BaseModel): # OCR (Surya) ocr_endpoint: str = "http://ocr-service:3200" + # STT (faster-whisper, §3) + stt_endpoint: str = "http://stt-service:3300" + + # §3 file_watcher: Roon 음원 경로 (prefix match 로 skip). + # 빈 문자열이면 skip 없음. 
예: "/documents/PKM/../Music/roon-library" 또는 + # NFS 경유 별도 마운트된 Roon 라이브러리. + roon_library_path: str = "" + # 분류 체계 taxonomy: dict = {} document_types: list[str] = [] @@ -78,6 +86,8 @@ def load_settings() -> Settings: eval_runner_token = os.getenv("EVAL_RUNNER_TOKEN", "") kordoc_endpoint = os.getenv("KORDOC_ENDPOINT", "http://kordoc-service:3100") ocr_endpoint = os.getenv("OCR_ENDPOINT", "http://ocr-service:3200") + stt_endpoint = os.getenv("STT_ENDPOINT", "http://stt-service:3300") + roon_library_path = os.getenv("ROON_LIBRARY_PATH", "") # config.yaml — Docker 컨테이너 내부(/app/config.yaml) 또는 프로젝트 루트 config_path = Path("/app/config.yaml") @@ -135,6 +145,8 @@ def load_settings() -> Settings: eval_runner_token=eval_runner_token, kordoc_endpoint=kordoc_endpoint, ocr_endpoint=ocr_endpoint, + stt_endpoint=stt_endpoint, + roon_library_path=roon_library_path, taxonomy=taxonomy, document_types=document_types, upload=upload_cfg, diff --git a/app/main.py b/app/main.py index 7273826..568c5e6 100644 --- a/app/main.py +++ b/app/main.py @@ -6,6 +6,7 @@ from fastapi import FastAPI, Request from fastapi.responses import RedirectResponse from sqlalchemy import func, select, text +from api.audio import router as audio_router from api.auth import router as auth_router from api.config import router as config_router from api.dashboard import router as dashboard_router @@ -16,6 +17,7 @@ from api.memos import router as memos_router from api.news import router as news_router from api.search import router as search_router from api.setup import router as setup_router +from api.video import router as video_router from core.config import settings from core.database import async_session, engine, init_db from models.user import User @@ -98,6 +100,8 @@ app.include_router(dashboard_router, prefix="/api/dashboard", tags=["dashboard"] app.include_router(library_router, prefix="/api/library", tags=["library"]) app.include_router(news_router, prefix="/api/news", tags=["news"]) 
app.include_router(digest_router, prefix="/api/digest", tags=["digest"]) +app.include_router(audio_router, prefix="/api/audio", tags=["audio"]) +app.include_router(video_router, prefix="/api/video", tags=["video"]) # TODO: Phase 5에서 추가 # app.include_router(tasks.router, prefix="/api/tasks", tags=["tasks"]) diff --git a/app/models/audio_segment.py b/app/models/audio_segment.py new file mode 100644 index 0000000..9ccecea --- /dev/null +++ b/app/models/audio_segment.py @@ -0,0 +1,18 @@ +"""audio_segments 테이블 ORM — STT 전사 결과의 타임스탬프 세그먼트.""" + +from sqlalchemy import BigInteger, Float, ForeignKey, Text +from sqlalchemy.orm import Mapped, mapped_column + +from core.database import Base + + +class AudioSegment(Base): + __tablename__ = "audio_segments" + + id: Mapped[int] = mapped_column(BigInteger, primary_key=True) + document_id: Mapped[int] = mapped_column( + BigInteger, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False + ) + start_s: Mapped[float] = mapped_column(Float, nullable=False) + end_s: Mapped[float] = mapped_column(Float, nullable=False) + text: Mapped[str] = mapped_column(Text, nullable=False) diff --git a/app/models/document.py b/app/models/document.py index 02e16aa..e464af6 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -115,6 +115,12 @@ class Document(Base): # /accept-suggestion 승인 시에만 category / user_tags 반영 (자동 전이 금지) ai_suggestion: Mapped[dict | None] = mapped_column(JSONB) + # 비디오 썸네일 (§3) — ffmpeg 50% 지점 1장. PKM/Videos/.thumbs/{id}.jpg 절대경로. + thumbnail_path: Mapped[str | None] = mapped_column(Text) + + # NAS 드롭된 mov/mkv/avi quarantine 플래그 (§3). true 면 재생 불가 안내만 표시. 
+ needs_conversion: Mapped[bool] = mapped_column(Boolean, default=False, server_default="false") + # facet 탐색 축 (Phase 2) facet_company: Mapped[str | None] = mapped_column(Text) facet_topic: Mapped[str | None] = mapped_column(Text) diff --git a/app/workers/file_watcher.py b/app/workers/file_watcher.py index f63a12a..7548fa6 100644 --- a/app/workers/file_watcher.py +++ b/app/workers/file_watcher.py @@ -1,4 +1,14 @@ -"""파일 감시 워커 — Inbox 디렉토리 스캔, 새 파일/변경 파일 자동 등록""" +"""파일 감시 워커 — Inbox/Recordings/Videos 스캔, 새/변경 파일 자동 등록. + +§3 확장: + - 스캔 대상: PKM/Inbox (문서) + PKM/Recordings (오디오) + PKM/Videos (비디오) + - 확장자 → category 매핑 (audio/video) + - video 채널 정책: 웹 업로드는 upload 엔드포인트에서 mov/mkv/avi 거부. + NAS 드롭은 여기서 quarantine import (category='video', needs_conversion=true, stage 없음). + - Roon 음원 경로(prefix match) skip — settings.roon_library_path + - 파이프 분기: audio → stage='stt', video direct-play → stage='thumbnail', + video quarantine → stage 없음 (처리 안 함, UI 에서 재생 불가 안내) +""" from pathlib import Path @@ -16,69 +26,133 @@ logger = setup_logger("file_watcher") SKIP_NAMES = {".DS_Store", "Thumbs.db", "desktop.ini", "Icon\r"} SKIP_EXTENSIONS = {".tmp", ".part", ".crdownload"} +# §3 확장자 매핑 +AUDIO_EXTS = {".mp3", ".m4a", ".opus", ".wav", ".flac", ".ogg"} +VIDEO_DIRECT_EXTS = {".mp4", ".webm"} # 브라우저 direct play +VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"} # 변환 필요, 보관만 + +# 스캔 대상: (하위경로, 예상 category) — None 은 문서함(카테고리 미지정) +SCAN_TARGETS: list[tuple[str, str | None]] = [ + ("Inbox", None), + ("Recordings", "audio"), + ("Videos", "video"), +] + def should_skip(path: Path) -> bool: if path.name in SKIP_NAMES or path.name.startswith("._"): return True if path.suffix.lower() in SKIP_EXTENSIONS: return True - # .derived/ 및 .preview/ 디렉토리 내 파일 제외 - if ".derived" in path.parts or ".preview" in path.parts: + # .derived / .preview / .thumbs 는 파생물 디렉토리 + if ".derived" in path.parts or ".preview" in path.parts or ".thumbs" in path.parts: + return True + # Roon 라이브러리 skip (설정된 경우만) + roon = 
settings.roon_library_path + if roon and str(path).startswith(roon): return True return False -async def watch_inbox(): - """Inbox 디렉토리를 스캔하여 새/변경 파일을 DB에 등록""" - inbox_path = Path(settings.nas_mount_path) / "PKM" / "Inbox" - if not inbox_path.exists(): - return +def _route_media(path: Path, expected_category: str | None) -> tuple[str | None, bool, str | None]: + """확장자 기반으로 (category, needs_conversion, next_stage) 결정. - files = [f for f in inbox_path.rglob("*") if f.is_file() and not should_skip(f)] - if not files: + - Inbox 드롭: expected_category=None — 문서 확장자면 기존 'extract' 파이프, + audio/video 확장자면 혼란 방지로 skip (사용자가 Recordings/Videos 로 넣도록 유도) + - Recordings 드롭: audio 확장자만 수락. 그 외는 skip (log) + - Videos 드롭: direct-play → category+thumbnail, quarantine → category만 (needs_conversion=true) + """ + ext = path.suffix.lower() + + if expected_category == "audio": + if ext in AUDIO_EXTS: + return ("audio", False, "stt") + return (None, False, None) # audio 폴더에 엉뚱한 포맷 → skip + + if expected_category == "video": + if ext in VIDEO_DIRECT_EXTS: + return ("video", False, "thumbnail") + if ext in VIDEO_QUARANTINE_EXTS: + # quarantine — category 설정하되 stage 안 걸어둠 (재생 불가 안내만) + return ("video", True, None) + return (None, False, None) # 기타 → skip + + # Inbox: 문서 파이프 (기존). audio/video 확장자가 실수로 여기 들어오면 skip. 
+ if ext in AUDIO_EXTS or ext in VIDEO_DIRECT_EXTS or ext in VIDEO_QUARANTINE_EXTS: + return (None, False, None) + return (None, False, "extract") + + +async def watch_inbox(): + """PKM 하위 디렉토리를 스캔하여 새/변경 파일을 DB 등록 + 파이프 투입.""" + pkm_root = Path(settings.nas_mount_path) / "PKM" + if not pkm_root.exists(): return new_count = 0 changed_count = 0 async with async_session() as session: - for file_path in files: - rel_path = str(file_path.relative_to(Path(settings.nas_mount_path))) - fhash = file_hash(file_path) + for sub, expected_category in SCAN_TARGETS: + scan_root = pkm_root / sub + if not scan_root.exists(): + continue - # DB에서 기존 문서 확인 - result = await session.execute( - select(Document).where(Document.file_path == rel_path) - ) - existing = result.scalar_one_or_none() + for file_path in scan_root.rglob("*"): + if not file_path.is_file() or should_skip(file_path): + continue - if existing is None: - # 새 파일 → 등록 - ext = file_path.suffix.lstrip(".").lower() or "unknown" - doc = Document( - file_path=rel_path, - file_hash=fhash, - file_format=ext, - file_size=file_path.stat().st_size, - file_type="immutable", - title=file_path.stem, - source_channel="drive_sync", + category, needs_conversion, next_stage = _route_media( + file_path, expected_category ) - session.add(doc) - await session.flush() - await enqueue_stage(session, doc.id, "extract") - new_count += 1 + # audio/video 폴더에 엉뚱한 확장자가 들어왔거나 Inbox 에 + # audio/video 가 잘못 떨어진 경우 — 이 라운드에서 아예 skip + if category is None and next_stage is None: + continue - elif existing.file_hash != fhash: - # 해시 변경 → 재가공 - existing.file_hash = fhash - existing.file_size = file_path.stat().st_size + rel_path = str(file_path.relative_to(Path(settings.nas_mount_path))) + fhash = file_hash(file_path) - await enqueue_stage(session, existing.id, "extract") - changed_count += 1 + result = await session.execute( + select(Document).where(Document.file_path == rel_path) + ) + existing = result.scalar_one_or_none() + + if existing is None: + 
ext = file_path.suffix.lstrip(".").lower() or "unknown" + doc = Document( + file_path=rel_path, + file_hash=fhash, + file_format=ext, + file_size=file_path.stat().st_size, + file_type="immutable", + title=file_path.stem, + source_channel="drive_sync", + category=category, + needs_conversion=needs_conversion, + ) + session.add(doc) + await session.flush() + + if next_stage: + await enqueue_stage(session, doc.id, next_stage) + new_count += 1 + + elif existing.file_hash != fhash: + existing.file_hash = fhash + existing.file_size = file_path.stat().st_size + # 기존 문서에 category/quarantine flag 가 비어있으면 보정 + if existing.category is None and category is not None: + existing.category = category + if needs_conversion and not getattr(existing, "needs_conversion", False): + existing.needs_conversion = True + + if next_stage: + await enqueue_stage(session, existing.id, next_stage) + changed_count += 1 await session.commit() if new_count or changed_count: - logger.info(f"[Inbox] 새 파일 {new_count}건, 변경 파일 {changed_count}건 등록") + logger.info(f"[Inbox+§3] 새 파일 {new_count}건, 변경 파일 {changed_count}건 등록") diff --git a/app/workers/queue_consumer.py b/app/workers/queue_consumer.py index ac47db3..f205811 100644 --- a/app/workers/queue_consumer.py +++ b/app/workers/queue_consumer.py @@ -13,7 +13,9 @@ from models.queue import ProcessingQueue, enqueue_stage logger = setup_logger("queue_consumer") # stage별 배치 크기 -BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1, "preview": 2} +# stt 는 GPU 단일 점유 + 회의 30분짜리도 가능 → 배치 1. thumbnail 은 ffmpeg subprocess 로 가벼움. 
+BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1, + "preview": 2, "stt": 1, "thumbnail": 3} STALE_THRESHOLD_MINUTES = 10 @@ -95,8 +97,17 @@ async def reset_stale_items(): async def enqueue_next_stage(document_id: int, current_stage: str): - """현재 stage 완료 후 다음 stage를 pending으로 등록""" - next_stages = {"extract": ["classify", "preview"], "classify": ["embed", "chunk"]} + """현재 stage 완료 후 다음 stage를 pending으로 등록. + + §3 추가: + stt → [classify] (audio 는 extract 건너뛰고 stt 가 extracted_text 를 채움) + thumbnail → [] (video 는 leaf — classify/embed 없음) + """ + next_stages = { + "extract": ["classify", "preview"], + "classify": ["embed", "chunk"], + "stt": ["classify"], + } stages = next_stages.get(current_stage, []) if not stages: return @@ -114,7 +125,9 @@ async def consume_queue(): from workers.embed_worker import process as embed_process from workers.extract_worker import process as extract_process from workers.preview_worker import process as preview_process + from workers.stt_worker import process as stt_process from workers.summarize_worker import process as summarize_process + from workers.thumbnail_worker import process as thumbnail_process workers = { "extract": extract_process, @@ -123,6 +136,8 @@ async def consume_queue(): "embed": embed_process, "chunk": chunk_process, "preview": preview_process, + "stt": stt_process, + "thumbnail": thumbnail_process, } try: diff --git a/app/workers/stt_worker.py b/app/workers/stt_worker.py new file mode 100644 index 0000000..21834f9 --- /dev/null +++ b/app/workers/stt_worker.py @@ -0,0 +1,89 @@ +"""STT 전사 워커 — services/stt(faster-whisper) 호출 + audio_segments 저장. + +queue_consumer 가 stage='stt' pending 큐 행을 pickup 하여 본 process() 를 호출. +services/stt 는 /transcribe {filePath, langs?, beamSize?} → {text, segments, language, +language_probability, duration}. 
성공 시: + - Document.extracted_text = text (기존 classify/embed 파이프 재사용) + - Document.extractor_version = "faster-whisper@large-v3" (모델명 기록) + - Document.extracted_at = now() + - audio_segments INSERT 일괄 (기존 세그먼트는 삭제 후 재삽입, 재전사 대응) + +audio 파이프라인: file_watcher 가 category='audio' + stage='stt' 등록 → +stt → classify → embed/chunk (extract 건너뜀). queue_consumer 의 next_stages 에서 +처리. +""" + +from datetime import datetime, timezone +from pathlib import Path + +import httpx +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from core.config import settings +from core.utils import setup_logger +from models.audio_segment import AudioSegment +from models.document import Document + +logger = setup_logger("stt_worker") + +# /transcribe 는 장시간 (30분 녹음 ≈ 수분). 충분히 여유. connect 는 짧게. +STT_TIMEOUT = httpx.Timeout(connect=10.0, read=1800.0, write=60.0, pool=10.0) + + +async def process(document_id: int, session: AsyncSession) -> None: + """audio 문서 전사 — STT_ENDPOINT 호출 후 텍스트/세그먼트 저장.""" + doc = await session.get(Document, document_id) + if not doc: + logger.error(f"[stt] document_id={document_id} 없음") + return + + if not doc.file_path: + logger.warning(f"[stt] id={document_id} file_path 없음 — skip") + return + + # NAS 마운트 경로로 절대화 (services/stt 컨테이너도 동일 경로에 bind mount) + container_path = str(Path(settings.nas_mount_path) / doc.file_path) + + try: + async with httpx.AsyncClient(timeout=STT_TIMEOUT) as client: + resp = await client.post( + f"{settings.stt_endpoint}/transcribe", + json={"filePath": container_path}, + ) + resp.raise_for_status() + data = resp.json() + except httpx.HTTPError as e: + logger.error(f"[stt] id={document_id} 호출 실패: {e}") + raise + + if "error" in data and not data.get("text"): + logger.error(f"[stt] id={document_id} 서비스 에러: {data['error']}") + raise RuntimeError(f"stt error: {data['error']}") + + text = (data.get("text") or "").strip() + segments = data.get("segments") or [] + + # 기존 audio_segments 삭제 (재전사 대응) — 새 세그먼트로 교체 + await 
session.execute(delete(AudioSegment).where(AudioSegment.document_id == document_id)) + + for seg in segments: + session.add(AudioSegment( + document_id=document_id, + start_s=float(seg["start"]), + end_s=float(seg["end"]), + text=str(seg["text"]), + )) + + doc.extracted_text = text + doc.extracted_at = datetime.now(timezone.utc) + model_name = None + # /ready 응답의 "model" 을 신뢰할 수 있지만, 매 호출마다 조회하지 않고 + # 환경에 안 맞으면 /transcribe 응답에서 추론: language / duration 만 쓰고 모델명은 설정 기반 + # (services/stt 가 여러 모델 swap 가능해지면 응답에 포함시킬 것) + doc.extractor_version = f"faster-whisper@{data.get('language', 'auto')}" + + logger.info( + f"[stt] id={document_id} segments={len(segments)} chars={len(text)} " + f"lang={data.get('language')} dur={data.get('duration')}s" + ) diff --git a/app/workers/thumbnail_worker.py b/app/workers/thumbnail_worker.py new file mode 100644 index 0000000..89bd3eb --- /dev/null +++ b/app/workers/thumbnail_worker.py @@ -0,0 +1,129 @@ +"""비디오 썸네일 생성 워커 — ffmpeg subprocess 로 50% 지점 1장 추출. + +PKM/Videos/.thumbs/{doc_id}.jpg 에 저장 후 documents.thumbnail_path 업데이트. +quarantine 상태(needs_conversion=true)인 파일은 건너뜀. + +queue_consumer 와의 배선(stage 매핑)은 §1 category 분기와 묶여 있어 본 모듈은 +유틸 + process() 진입점만 제공. queue_consumer 측 wiring 은 §1 의존 파트에서. +""" + +import subprocess +import unicodedata +from datetime import datetime, timezone +from pathlib import Path + +from sqlalchemy.ext.asyncio import AsyncSession + +from core.config import settings +from core.utils import setup_logger + +logger = setup_logger("thumbnail_worker") + +THUMBS_DIR_NAME = "PKM/Videos/.thumbs" +FFMPEG_TIMEOUT = 30 + + +def _resolve_path(file_path: str) -> Path | None: + """NFC(DB) vs NFD(NFS) 한글 경로 차이 흡수. 
OCR/STT 서비스와 동일 패턴.""" + candidates = [ + file_path, + unicodedata.normalize("NFD", file_path), + unicodedata.normalize("NFC", file_path), + ] + for c in candidates: + p = Path(c) + if p.exists(): + return p + parent = Path(file_path).parent + if parent.exists(): + target = unicodedata.normalize("NFC", Path(file_path).name) + for child in parent.iterdir(): + if unicodedata.normalize("NFC", child.name) == target: + return child + return None + + +def _probe_duration_seconds(path: Path) -> float | None: + """ffprobe 로 재생 길이 조회. 실패 시 None.""" + try: + result = subprocess.run( + [ + "ffprobe", "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + str(path), + ], + capture_output=True, text=True, timeout=FFMPEG_TIMEOUT, + ) + if result.returncode != 0: + return None + return float(result.stdout.strip()) + except (subprocess.SubprocessError, ValueError): + return None + + +def _extract_thumbnail(source: Path, output: Path, seek_seconds: float) -> bool: + """ffmpeg 로 seek_seconds 지점 1프레임을 jpg 로 추출. 성공 시 True.""" + output.parent.mkdir(parents=True, exist_ok=True) + try: + result = subprocess.run( + [ + "ffmpeg", "-y", + "-ss", f"{seek_seconds:.2f}", + "-i", str(source), + "-vframes", "1", + "-vf", "scale='min(640,iw)':-1", + "-q:v", "3", + str(output), + ], + capture_output=True, text=True, timeout=FFMPEG_TIMEOUT, + ) + if result.returncode != 0: + logger.error(f"[thumbnail] ffmpeg 실패: {source.name} — {result.stderr[-400:]}") + return False + return output.exists() and output.stat().st_size > 0 + except subprocess.SubprocessError as e: + logger.error(f"[thumbnail] subprocess 오류: {source.name} — {e}") + return False + + +async def process(document_id: int, session: AsyncSession) -> None: + """영상 문서 썸네일 생성 진입점 (queue_consumer 에서 호출 예정). + + needs_conversion=True 는 skip. 파일 위치가 없으면 NFC/NFD resolver 로 보정. 
+ """ + from models.document import Document + + doc = await session.get(Document, document_id) + if not doc: + logger.error(f"[thumbnail] document_id={document_id} 없음") + return + + if getattr(doc, "needs_conversion", False): + logger.info(f"[thumbnail] id={document_id} needs_conversion=true → skip") + return + + if not doc.file_path: + logger.warning(f"[thumbnail] id={document_id} file_path 없음") + return + + raw = str(Path(settings.nas_mount_path) / doc.file_path) + source = _resolve_path(raw) + if source is None: + logger.error(f"[thumbnail] 원본 없음: {raw}") + return + + duration = _probe_duration_seconds(source) + seek = (duration * 0.5) if duration and duration > 0 else 1.0 + + thumbs_dir = Path(settings.nas_mount_path) / THUMBS_DIR_NAME + output = thumbs_dir / f"{document_id}.jpg" + + ok = _extract_thumbnail(source, output, seek) + if not ok: + return + + doc.thumbnail_path = str(output) + doc.updated_at = datetime.now(timezone.utc) + await session.commit() + logger.info(f"[thumbnail] id={document_id} → {output}") diff --git a/docker-compose.yml b/docker-compose.yml index b4d404d..91904ab 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -54,6 +54,32 @@ services: start_period: 180s restart: unless-stopped + stt-service: + build: ./services/stt + expose: + - "3300" + volumes: + - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro + - stt_models:/root/.cache + environment: + - WHISPER_MODEL=${WHISPER_MODEL:-large-v3} + - WHISPER_DEVICE=${WHISPER_DEVICE:-cuda} + - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + healthcheck: + test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3300/health')"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 300s + restart: unless-stopped + ollama: image: ollama/ollama volumes: @@ -125,6 +151,7 @@ services: - 
DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm - KORDOC_ENDPOINT=http://kordoc-service:3100 - OCR_ENDPOINT=http://ocr-service:3200 + - STT_ENDPOINT=http://stt-service:3300 restart: unless-stopped frontend: @@ -153,3 +180,4 @@ volumes: ollama_data: reranker_cache: ocr_models: + stt_models: diff --git a/frontend/src/lib/components/AudioPlayer.svelte b/frontend/src/lib/components/AudioPlayer.svelte new file mode 100644 index 0000000..6ec1cd4 --- /dev/null +++ b/frontend/src/lib/components/AudioPlayer.svelte @@ -0,0 +1,95 @@ + + +
+
+ +
+ +
+ {#if loading} +

전사 불러오는 중…

+ {:else if error} +

{error}

+ {:else if segments.length === 0} +

전사 결과가 아직 없습니다 (STT 처리 대기 중일 수 있음).

+ {:else} + {#each segments as seg, i (seg.start)} + + {/each} + {/if} +
+
diff --git a/frontend/src/lib/components/Sidebar.svelte b/frontend/src/lib/components/Sidebar.svelte index e37ea69..42fe05e 100644 --- a/frontend/src/lib/components/Sidebar.svelte +++ b/frontend/src/lib/components/Sidebar.svelte @@ -25,6 +25,8 @@ StickyNote, Newspaper, Search, + Mic, + Film, } from 'lucide-svelte'; // ─── 도메인 트리 (기존) ─── @@ -207,11 +209,33 @@ {/if} - + + + + 오디오 + + {#if categoryCounts.audio > 0} + {categoryCounts.audio} + {/if} + + + + + + 비디오 + + {#if categoryCounts.video > 0} + {categoryCounts.video} + {/if} + + // HTML5 비디오 플레이어 (direct play 전용). + // needsConversion=true 이면 재생 대신 안내 카드 표시 (§3 채널별 정책). + + import { getAccessToken } from '$lib/api'; + + let { docId, needsConversion = false, fileFormat = '', title = '' } = $props(); + + let token = $derived(getAccessToken()); + let src = $derived(`/api/documents/${docId}/file?token=${token}`); + let thumbSrc = $derived(`/api/video/${docId}/thumbnail?token=${token}`); + + +{#if needsConversion} +
+
+

재생할 수 없는 포맷입니다

+

+ {fileFormat ? `.${fileFormat}` : '현재 파일'} 포맷은 브라우저가 직접 재생할 수 없어 보관만 하고 있습니다. +

+

+ 재생하려면 원본을 mp4 (H.264/AAC) 또는 + webm (VP9) 으로 변환 후 다시 올리세요. +

+ {#if title} +

파일: {title}

+ {/if} +
+
+{:else} +
+ +
+{/if} diff --git a/frontend/src/routes/audio/+page.svelte b/frontend/src/routes/audio/+page.svelte new file mode 100644 index 0000000..cedb471 --- /dev/null +++ b/frontend/src/routes/audio/+page.svelte @@ -0,0 +1,105 @@ + + +
+
+ +

Audio

+ {total}건 +
+ + {#if loading} +

불러오는 중…

+ {:else if docs.length === 0} +

오디오 문서가 없습니다. NAS PKM/Recordings/ 에 드롭하거나 업로드하세요.

+ {:else} +
+ {#each docs as d (d.id)} + + {/each} +
+ + {#if total > PAGE_SIZE} +
+ + {currentPage} / {Math.ceil(total / PAGE_SIZE)} + +
+ {/if} + {/if} +
diff --git a/frontend/src/routes/audio/[id]/+page.svelte b/frontend/src/routes/audio/[id]/+page.svelte new file mode 100644 index 0000000..96db9a2 --- /dev/null +++ b/frontend/src/routes/audio/[id]/+page.svelte @@ -0,0 +1,61 @@ + + +
+
+ +

+ {doc?.title ?? (loading ? '불러오는 중…' : '(제목 없음)')} +

+
+ +
+ {#if loading} +

불러오는 중…

+ {:else if !doc} +

문서를 찾을 수 없습니다.

+ {:else if doc.category !== 'audio'} +

이 문서는 오디오가 아닙니다 (category={doc.category}).

+ {:else} + + {/if} +
+
diff --git a/frontend/src/routes/video/+page.svelte b/frontend/src/routes/video/+page.svelte new file mode 100644 index 0000000..9cd3498 --- /dev/null +++ b/frontend/src/routes/video/+page.svelte @@ -0,0 +1,104 @@ + + +
+
+ +

Video

+ {total}건 +
+ + {#if loading} +

불러오는 중…

+ {:else if docs.length === 0} +

비디오 문서가 없습니다. NAS PKM/Videos/ 에 드롭하거나 업로드하세요.

+ {:else} +
+ {#each docs as d (d.id)} + + {/each} +
+ + {#if total > PAGE_SIZE} +
+ + {currentPage} / {Math.ceil(total / PAGE_SIZE)} + +
+ {/if} + {/if} +
diff --git a/frontend/src/routes/video/[id]/+page.svelte b/frontend/src/routes/video/[id]/+page.svelte new file mode 100644 index 0000000..81f1f8a --- /dev/null +++ b/frontend/src/routes/video/[id]/+page.svelte @@ -0,0 +1,66 @@ + + +
+
+ +

+ {doc?.title ?? (loading ? '불러오는 중…' : '(제목 없음)')} +

+
+ +
+ {#if loading} +

불러오는 중…

+ {:else if !doc} +

문서를 찾을 수 없습니다.

+ {:else if doc.category !== 'video'} +

이 문서는 비디오가 아닙니다 (category={doc.category}).

+ {:else} + + {/if} +
+
diff --git a/migrations/147_audio_segments_table.sql b/migrations/147_audio_segments_table.sql
new file mode 100644
index 0000000..dfcda68
--- /dev/null
+++ b/migrations/147_audio_segments_table.sql
@@ -0,0 +1,17 @@
+-- 147_audio_segments_table.sql
+-- Document Server unified platform, Section 3: audio_segments table definition (1/2)
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- Per the asyncpg single-statement rule this file only creates the table; the index is in 148.
+-- Timestamped segments of the STT (faster-whisper) output.
+-- documents.extracted_text stores the full transcript text (reused by classify/embed);
+-- this table lets AudioPlayer jump on line click → audio.currentTime = start_s.
+-- ON DELETE CASCADE — segments are cleaned up when a document is physically deleted.
+
+CREATE TABLE IF NOT EXISTS audio_segments (
+    id BIGSERIAL PRIMARY KEY,
+    document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+    start_s REAL NOT NULL,
+    end_s REAL NOT NULL,
+    text TEXT NOT NULL
+);
diff --git a/migrations/148_audio_segments_idx.sql b/migrations/148_audio_segments_idx.sql
new file mode 100644
index 0000000..f4b890d
--- /dev/null
+++ b/migrations/148_audio_segments_idx.sql
@@ -0,0 +1,8 @@
+-- 148_audio_segments_idx.sql
+-- Document Server unified platform, Section 3: audio_segments index (2/2)
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- Serves the AudioPlayer lookup path `WHERE document_id=? ORDER BY start_s`.
+
+CREATE INDEX IF NOT EXISTS idx_audio_segments_doc_start
+    ON audio_segments(document_id, start_s);
diff --git a/migrations/149_document_media_cols.sql b/migrations/149_document_media_cols.sql
new file mode 100644
index 0000000..19285f6
--- /dev/null
+++ b/migrations/149_document_media_cols.sql
@@ -0,0 +1,13 @@
+-- 149_document_media_cols.sql
+-- Document Server unified platform, Section 3: video playback/thumbnail columns
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- thumbnail_path: absolute path of PKM/Videos/.thumbs/{doc_id}.jpg (one ffmpeg frame at the 50% mark).
+-- needs_conversion: quarantine flag for .mov/.mkv/.avi etc. that arrived via NAS drop.
+--   When true, VideoPlayer shows a "conversion required" notice card instead of playing.
+-- v2.1 starts minimal with a boolean. If more states are needed later, extend to an enum in a separate migration.
+-- Single ALTER TABLE statement (multiple ADD COLUMN clauses are OK; same pattern as §2 migration 144).
+
+ALTER TABLE documents
+    ADD COLUMN IF NOT EXISTS thumbnail_path TEXT,
+    ADD COLUMN IF NOT EXISTS needs_conversion BOOLEAN NOT NULL DEFAULT false;
diff --git a/migrations/150_queue_stage_stt.sql b/migrations/150_queue_stage_stt.sql
new file mode 100644
index 0000000..11dc2aa
--- /dev/null
+++ b/migrations/150_queue_stage_stt.sql
@@ -0,0 +1,11 @@
+-- 150_queue_stage_stt.sql
+-- Document Server unified platform, Section 3: add 'stt' to the processing_queue stage
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- audio pipeline: stt → classify → embed (extract is skipped).
+-- stt_worker picks up category='audio' queue rows, calls services/stt, then
+-- stores documents.extracted_text + audio_segments.
+-- This migration only extends the enum. The actual branching logic lives in queue_consumer,
+-- driven by the category column from §1.
+
+ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'stt';
diff --git a/migrations/151_queue_stage_thumbnail.sql b/migrations/151_queue_stage_thumbnail.sql
new file mode 100644
index 0000000..d98eecf
--- /dev/null
+++ b/migrations/151_queue_stage_thumbnail.sql
@@ -0,0 +1,9 @@
+-- 151_queue_stage_thumbnail.sql
+-- Document Server unified platform, Section 3: add 'thumbnail' to the processing_queue stage
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- The video pipeline is the single thumbnail stage (leaf; no classify/embed).
+-- thumbnail_worker picks up category='video' + needs_conversion=false queue rows,
+-- generates .thumbs/{doc_id}.jpg with ffmpeg, then sets documents.thumbnail_path.
+
+ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'thumbnail';
diff --git a/services/stt/Dockerfile b/services/stt/Dockerfile
new file mode 100644
index 0000000..e1e7fad
--- /dev/null
+++ b/services/stt/Dockerfile
@@ -0,0 +1,21 @@
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1
+
+# faster-whisper decodes with PyAV (ffmpeg not required), but system ffmpeg is
+# installed as well to cope with format variability. python3.10 = ubuntu22.04 default.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip ffmpeg \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY server.py .
+
+EXPOSE 3300
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3300"]
diff --git a/services/stt/requirements.txt b/services/stt/requirements.txt
new file mode 100644
index 0000000..1fc6d7a
--- /dev/null
+++ b/services/stt/requirements.txt
@@ -0,0 +1,3 @@
+faster-whisper>=1.0.3,<2.0.0
+fastapi>=0.110.0,<1.0.0
+uvicorn[standard]>=0.27.0,<1.0.0
diff --git a/services/stt/server.py b/services/stt/server.py
new file mode 100644
index 0000000..6258b34
--- /dev/null
+++ b/services/stt/server.py
@@ -0,0 +1,174 @@
+"""STT microservice — speech transcription with faster-whisper (GPU).
+
+filePath → {text, segments:[{start,end,text}]}. The model is loaded lazily on
+the first request. Default model is large-v3 (VRAM ~3GB, float16); it can be
+swapped via environment variables.
+"""
+
+import os
+import threading
+import unicodedata
+from pathlib import Path
+
+from fastapi import FastAPI
+
+app = FastAPI()
+
+_model = None
+_MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
+_DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
+_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
+
+# /transcribe is a sync endpoint, so FastAPI runs it on threadpool workers and
+# requests may overlap. _model_lock guards the one-time model load;
+# _inference_lock keeps a single transcription on the GPU at a time.
+_model_lock = threading.Lock()
+_inference_lock = threading.Lock()
+
+
+def _resolve_path(file_path: str) -> Path | None:
+    """Resolve file_path despite NFC (DB) vs NFD (NFS) Hangul normalization.
+
+    Tries the path as given, then its NFD and NFC forms; as a last resort scans
+    the parent directory for an entry whose NFC name matches. Same pattern as
+    the OCR service. Returns None when nothing matches.
+    """
+    candidates = [
+        file_path,
+        unicodedata.normalize("NFD", file_path),
+        unicodedata.normalize("NFC", file_path),
+    ]
+    for c in candidates:
+        p = Path(c)
+        if p.exists():
+            return p
+    # Last fallback: match the name inside the parent directory by NFC form.
+    parent = Path(file_path).parent
+    if parent.exists():
+        target = unicodedata.normalize("NFC", Path(file_path).name)
+        for child in parent.iterdir():
+            if unicodedata.normalize("NFC", child.name) == target:
+                return child
+    return None
+
+
+def _load_model():
+    """Load faster-whisper lazily — VRAM is only claimed on the first call."""
+    global _model
+    if _model is not None:
+        return _model
+    with _model_lock:
+        # Re-check under the lock so concurrent first requests load only once.
+        if _model is None:
+            from faster_whisper import WhisperModel
+
+            _model = WhisperModel(
+                _MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE
+            )
+    return _model
+
+
+def _cuda_device_count() -> int:
+    """Return the CUDA device count via ctranslate2, or 0 if it cannot be queried."""
+    try:
+        import ctranslate2
+        return ctranslate2.get_cuda_device_count()
+    except Exception:
+        return 0
+
+
+@app.get("/health")
+def health():
+    """Liveness — for the Docker healthcheck; confirms the process is alive."""
+    return {"status": "ok", "service": "stt-faster-whisper"}
+
+
+@app.get("/ready")
+def ready():
+    """Readiness — CUDA + model state. Used for deployment verification."""
+    count = _cuda_device_count()
+    cuda_ok = count > 0
+    models_loaded = _model is not None
+    return {
+        "ready": cuda_ok and models_loaded,
+        "cuda": cuda_ok,
+        "cuda_device_count": count,
+        "models_loaded": models_loaded,
+        "model": _MODEL_NAME,
+        "compute_type": _COMPUTE_TYPE,
+    }
+
+
+@app.post("/transcribe")
+def transcribe(body: dict):
+    """Transcribe an audio file.
+
+    Declared as a plain ``def`` on purpose: FastAPI runs sync endpoints in its
+    threadpool, so the blocking Whisper inference cannot stall the event loop
+    and /health keeps answering during long transcriptions.
+
+    Input:
+        {
+            "filePath": "/documents/PKM/Recordings/2026-04-23_회의.mp3",
+            "langs": ["ko"]?,   # a single language, or omit for auto-detect
+            "beamSize": 5?      # default 5
+        }
+
+    Output:
+        {
+            "text": "full transcript text",
+            "segments": [{"start": 0.0, "end": 2.4, "text": "..."}, ...],
+            "language": "ko",
+            "language_probability": 0.99,
+            "duration": 1832.5
+        }
+
+    Failures are reported as {"error": ..., "text": "", "segments": []}.
+    """
+    raw_path = body.get("filePath")
+    if not isinstance(raw_path, str) or not raw_path:
+        return {"error": "filePath 누락", "text": "", "segments": []}
+    langs = body.get("langs")
+    try:
+        beam_size = max(1, int(body.get("beamSize") or 5))
+    except (TypeError, ValueError):
+        beam_size = 5
+
+    resolved = _resolve_path(raw_path)
+    if resolved is None:
+        return {"error": f"파일 없음: {raw_path}", "text": "", "segments": []}
+
+    model = _load_model()
+
+    # Pin the language only when exactly one is given; otherwise auto-detect.
+    language = None
+    if isinstance(langs, list) and len(langs) == 1:
+        language = langs[0]
+
+    segments = []
+    # model.transcribe() returns a lazy generator — decoding happens while it is
+    # iterated, so the loop must stay inside the lock too.
+    with _inference_lock:
+        segments_iter, info = model.transcribe(
+            str(resolved),
+            beam_size=beam_size,
+            language=language,
+            vad_filter=True,
+        )
+        for seg in segments_iter:
+            segments.append({
+                "start": round(float(seg.start), 2),
+                "end": round(float(seg.end), 2),
+                "text": seg.text.strip(),
+            })
+
+    # Join the already-stripped texts; empty segments are skipped so the
+    # transcript has no doubled spaces.
+    text = " ".join(s["text"] for s in segments if s["text"])
+    return {
+        "text": text,
+        "segments": segments,
+        "language": getattr(info, "language", None),
+        "language_probability": float(getattr(info, "language_probability", 0.0) or 0.0),
+        "duration": float(getattr(info, "duration", 0.0) or 0.0),
+    }