From 1e2c004dd41824254a7931ab404713bb7e633de5 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Fri, 24 Apr 2026 06:47:36 +0900
Subject: [PATCH] =?UTF-8?q?feat(media):=20=C2=A73=20audio=20STT=20+=20vide?=
 =?UTF-8?q?o=20=EC=9E=AC=EC=83=9D=20=EC=9D=B8=ED=94=84=EB=9D=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

plan: ~/.claude/plans/luminous-sprouting-hamster.md §3

스키마:
- migrations/147_audio_segments_table.sql: audio_segments (STT 타임스탬프
  세그먼트)
- migrations/148_audio_segments_idx.sql: (document_id, start_s) idx
- migrations/149_document_media_cols.sql: documents.thumbnail_path +
  needs_conversion
- migrations/150_queue_stage_stt.sql: process_stage += 'stt'
- migrations/151_queue_stage_thumbnail.sql: process_stage += 'thumbnail'
- app/models/audio_segment.py, document.py (thumbnail_path/needs_conversion)

서비스:
- services/stt/{Dockerfile, requirements.txt, server.py} — faster-whisper
  large-v3 GPU 컨테이너. /transcribe (filePath/langs/beamSize) +
  /health + /ready (cuda device_count + model_loaded). NFC/NFD 경로
  resolver (OCR 교훈).
- docker-compose.yml: stt-service 추가 (GPU 1 예약, :3300, NAS ro mount,
  stt_models volume, start_period 300s), fastapi env 에 STT_ENDPOINT.

파이프라인 (의존 §1 category):
- app/workers/stt_worker.py 신규: stage='stt' pickup → STT_ENDPOINT 호출 →
  extracted_text + audio_segments 저장. Timeout 30분.
- app/workers/thumbnail_worker.py 신규: ffmpeg 50% 지점 1장 →
  PKM/Videos/.thumbs/{id}.jpg + thumbnail_path 세팅.
  needs_conversion=true 는 skip.
- app/workers/file_watcher.py 확장: PKM/{Inbox, Recordings, Videos}
  스캔. 확장자→category, audio→stage=stt, video .mp4/.webm→
  stage=thumbnail, video .mov/.mkv/.avi→needs_conversion=true + stage
  없음. settings.roon_library_path prefix skip.
- app/workers/queue_consumer.py 확장: stt + thumbnail workers 등록,
  BATCH_SIZE(stt=1, thumbnail=3), next_stages 에 stt→[classify] 추가
  (audio 는 extract 건너뜀).
- app/Dockerfile: ffmpeg 추가 (썸네일 subprocess 용).

API (의존 §1):
- /api/audio/{id}/segments — AudioSegment ORDER BY start_s
- /api/video/{id}/thumbnail — thumbnail_path FileResponse (쿼리 토큰)
- /api/documents/{id}/file: media_types 에 audio/video mime 포함 (§2
  커밋에 이미 포함). Starlette FileResponse 가 Range 자동.
- upload_document: .mov/.mkv/.avi 웹 업로드 거부 (error_code
  unsupported_codec). NAS 드롭은 file_watcher 가 quarantine 수용.

프론트:
- AudioPlayer.svelte: HTML5 audio + 전사 세그먼트 sticky 패널 + 줄
  클릭 seek. activeIdx 하이라이트.
- VideoPlayer.svelte: HTML5 video direct play + needs_conversion 안내
  카드. poster 는 thumbnail endpoint.
- /audio (목록 grid) + /audio/[id] (플레이어)
- /video (썸네일 grid + 변환 필요 배지) + /video/[id] (플레이어)
- Sidebar.svelte: Mic/Film 아이콘 + audio/video 네비 활성, count
  배지 (§2 /stats/category-counts 재사용).

설정:
- app/core/config.py: stt_endpoint + roon_library_path.

DoD 배포 후 smoke: /ready cuda:true, 회의 mp3 transcribe, audio
extract 없이 classify 진행(queue 회귀), /audio 재생, .mp4 재생,
.mov 웹 400, .mov NAS quarantine, Sidebar 네비 + count.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/Dockerfile                                |   5 +-
 app/api/audio.py                              |  72 ++++++++
 app/api/documents.py                          |  11 ++
 app/api/video.py                              |  56 +++++++
 app/core/config.py                            |  12 ++
 app/main.py                                   |   4 +
 app/models/audio_segment.py                   |  18 ++
 app/models/document.py                        |   6 +
 app/workers/file_watcher.py                   | 154 +++++++++++++-----
 app/workers/queue_consumer.py                 |  21 ++-
 app/workers/stt_worker.py                     |  89 ++++++++++
 app/workers/thumbnail_worker.py               | 129 +++++++++++++++
 docker-compose.yml                            |  28 ++++
 .../src/lib/components/AudioPlayer.svelte     |  95 +++++++++++
 frontend/src/lib/components/Sidebar.svelte    |  34 +++-
 .../src/lib/components/VideoPlayer.svelte     |  42 +++++
 frontend/src/routes/audio/+page.svelte        | 105 ++++++++++++
 frontend/src/routes/audio/[id]/+page.svelte   |  61 +++++++
 frontend/src/routes/video/+page.svelte        | 104 ++++++++++++
 frontend/src/routes/video/[id]/+page.svelte   |  66 ++++++++
 migrations/147_audio_segments_table.sql       |  17 ++
 migrations/148_audio_segments_idx.sql         |   8 +
 migrations/149_document_media_cols.sql        |  13 ++
 migrations/150_queue_stage_stt.sql            |  11 ++
 migrations/151_queue_stage_thumbnail.sql      |   9 +
 services/stt/Dockerfile                       |  21 +++
 services/stt/requirements.txt                 |   3 +
 services/stt/server.py                        | 140 ++++++++++++++++
 28 files changed, 1284 insertions(+), 50 deletions(-)
 create mode 100644 app/api/audio.py
 create mode 100644 app/api/video.py
 create mode 100644 app/models/audio_segment.py
 create mode 100644 app/workers/stt_worker.py
 create mode 100644 app/workers/thumbnail_worker.py
 create mode 100644 frontend/src/lib/components/AudioPlayer.svelte
 create mode 100644 frontend/src/lib/components/VideoPlayer.svelte
 create mode 100644 frontend/src/routes/audio/+page.svelte
 create mode 100644 frontend/src/routes/audio/[id]/+page.svelte
 create mode 100644 frontend/src/routes/video/+page.svelte
 create mode 100644 frontend/src/routes/video/[id]/+page.svelte
 create mode 100644 migrations/147_audio_segments_table.sql
 create mode 100644 migrations/148_audio_segments_idx.sql
 create mode 100644 migrations/149_document_media_cols.sql
 create mode 100644 migrations/150_queue_stage_stt.sql
 create mode 100644 migrations/151_queue_stage_thumbnail.sql
 create mode 100644 services/stt/Dockerfile
 create mode 100644 services/stt/requirements.txt
 create mode 100644 services/stt/server.py

diff --git a/app/Dockerfile b/app/Dockerfile
index e11a717..cc7528c 100644
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -2,12 +2,13 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-# LibreOffice headless (PDF 변환용) + 한글/CJK 폰트
+# LibreOffice headless (PDF 변환용) + 한글/CJK 폰트 + ffmpeg (비디오 썸네일)
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
       libreoffice-core libreoffice-calc libreoffice-writer libreoffice-impress \
       fonts-noto-cjk fonts-noto-cjk-extra fonts-nanum \
-      fonts-noto-core fonts-noto-extra && \
+      fonts-noto-core fonts-noto-extra \
+      ffmpeg && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt .
diff --git a/app/api/audio.py b/app/api/audio.py
new file mode 100644
index 0000000..03ca447
--- /dev/null
+++ b/app/api/audio.py
@@ -0,0 +1,72 @@
+"""오디오 전사(STT) 조회 API — /api/audio
+
+AudioPlayer 가 줄 단위로 렌더하고 클릭 시 audio.currentTime 으로 점프한다.
+"""
+
+from typing import Annotated
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.auth import get_current_user
+from core.database import get_session
+from models.audio_segment import AudioSegment
+from models.document import Document
+from models.user import User
+
+router = APIRouter()
+
+
+class AudioSegmentResponse(BaseModel):  # one transcript line as returned to the client
+    start: float  # filled from AudioSegment.start_s
+    end: float  # filled from AudioSegment.end_s
+    text: str  # transcribed text of the segment
+
+    model_config = {"from_attributes": True}  # pydantic: allow validation from object attributes
+
+
+class AudioSegmentsResponse(BaseModel):  # response body of GET /{doc_id}/segments
+    document_id: int
+    language: str | None  # currently always None (see get_audio_segments)
+    duration: float | None  # currently always None (see get_audio_segments)
+    segments: list[AudioSegmentResponse]  # ordered by segment start
+
+
+@router.get("/{doc_id}/segments", response_model=AudioSegmentsResponse)
+async def get_audio_segments(
+    doc_id: int,
+    user: Annotated[User, Depends(get_current_user)],
+    session: Annotated[AsyncSession, Depends(get_session)],
+):
+    """Return the transcript segments of an audio document, ordered by start time.
+
+    Responds 404 for a missing, soft-deleted, or non-audio document; a document with
+    no segments yet yields an empty list. `language` and `duration` are always None.
+    """
+    document = await session.get(Document, doc_id)
+    if not document or document.deleted_at is not None:
+        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
+
+    category = getattr(document, "category", None)
+    if category != "audio":
+        raise HTTPException(status_code=404, detail="오디오 문서가 아닙니다")
+
+    query = (
+        select(AudioSegment)
+        .where(AudioSegment.document_id == doc_id)
+        .order_by(AudioSegment.start_s.asc())
+    )
+    ordered_rows = (await session.execute(query)).scalars().all()
+
+    payload = []
+    for row in ordered_rows:
+        payload.append(AudioSegmentResponse(start=row.start_s, end=row.end_s, text=row.text))
+
+    return AudioSegmentsResponse(
+        document_id=doc_id,
+        language=None,
+        duration=None,
+        segments=payload,
+    )
diff --git a/app/api/documents.py b/app/api/documents.py
index 8407168..196a430 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -594,6 +594,17 @@ async def upload_document(
     if not safe_name or safe_name.startswith("."):
         raise HTTPException(status_code=400, detail="유효하지 않은 파일명")
 
+    # §3: 웹 업로드는 direct-play 불가 비디오 거부 (NAS 드롭은 file_watcher 가
+    # quarantine 으로 수용). UploadDropzone 이 error_code='unsupported_codec' 로
+    # 배너 분기.
+    VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"}
+    if Path(safe_name).suffix.lower() in VIDEO_QUARANTINE_EXTS:
+        raise _upload_error(
+            status_code=400,
+            error_code="unsupported_codec",
+            message="브라우저에서 직접 재생 불가한 포맷입니다. mp4 (H.264/AAC) 또는 webm (VP9) 으로 변환 후 다시 올리세요.",
+        )
+
     # ── 대상 경로 결정 ──
 
     inbox_dir = Path(settings.nas_mount_path) / "PKM" / "Inbox"
diff --git a/app/api/video.py b/app/api/video.py
new file mode 100644
index 0000000..d245c69
--- /dev/null
+++ b/app/api/video.py
@@ -0,0 +1,56 @@
+"""비디오 썸네일 서빙 API — /api/video
+
+ffmpeg 썸네일 생성은 thumbnail_worker 에서 수행. 본 라우터는 저장된 파일만 서빙.
+"""
+
+from pathlib import Path
+from typing import Annotated
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import FileResponse
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.auth import decode_token, get_current_user
+from core.database import get_session
+from models.document import Document
+from models.user import User
+
+router = APIRouter()
+
+
+@router.get("/{doc_id}/thumbnail")
+async def get_video_thumbnail(
+    doc_id: int,
+    session: Annotated[AsyncSession, Depends(get_session)],
+    token: str | None = Query(None, description="Bearer token (img src 용)"),
+    user: User | None = Depends(lambda: None),
+):
+    """Serve the stored JPEG thumbnail of a video document (usable as `<img src="...?token=...">`).
+
+    Only the `token` query parameter authenticates here; no Authorization header is read.
+    """
+    # <img> cannot send headers, so the access token arrives in the query string; `user` is always None.
+    if not token:
+        raise HTTPException(status_code=401, detail="토큰이 필요합니다")
+
+    payload = decode_token(token)
+    if not payload or payload.get("type") != "access":
+        raise HTTPException(status_code=401, detail="유효하지 않은 토큰")
+
+    doc = await session.get(Document, doc_id)
+    if not doc or doc.deleted_at is not None:
+        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
+
+    thumb = getattr(doc, "thumbnail_path", None)
+    if not thumb:
+        raise HTTPException(status_code=404, detail="썸네일이 아직 생성되지 않았습니다")
+
+    path = Path(thumb)
+    if not path.is_file():  # also rejects a directory, which FileResponse cannot send
+        raise HTTPException(status_code=404, detail="썸네일 파일이 없습니다")
+
+    return FileResponse(
+        path=str(path),
+        media_type="image/jpeg",
+        headers={"Content-Disposition": "inline"},
+    )
diff --git a/app/core/config.py b/app/core/config.py
index b9e9e1e..3bc030b 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -61,6 +61,14 @@ class Settings(BaseModel):
     # OCR (Surya)
     ocr_endpoint: str = "http://ocr-service:3200"
 
+    # STT (faster-whisper, §3)
+    stt_endpoint: str = "http://stt-service:3300"
+
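+    # §3 file_watcher: Roon music library path (or a separately NFS-mounted Roon library);
+    # files whose path starts with it are skipped. Empty string disables the skip. Compared as-is,
+    # without resolving "..", so write it as scanned, e.g. "/documents/PKM/Recordings/roon".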
+    roon_library_path: str = ""
+
     # 분류 체계
     taxonomy: dict = {}
     document_types: list[str] = []
@@ -78,6 +86,8 @@ def load_settings() -> Settings:
     eval_runner_token = os.getenv("EVAL_RUNNER_TOKEN", "")
     kordoc_endpoint = os.getenv("KORDOC_ENDPOINT", "http://kordoc-service:3100")
     ocr_endpoint = os.getenv("OCR_ENDPOINT", "http://ocr-service:3200")
+    stt_endpoint = os.getenv("STT_ENDPOINT", "http://stt-service:3300")
+    roon_library_path = os.getenv("ROON_LIBRARY_PATH", "")
 
     # config.yaml — Docker 컨테이너 내부(/app/config.yaml) 또는 프로젝트 루트
     config_path = Path("/app/config.yaml")
@@ -135,6 +145,8 @@ def load_settings() -> Settings:
         eval_runner_token=eval_runner_token,
         kordoc_endpoint=kordoc_endpoint,
         ocr_endpoint=ocr_endpoint,
+        stt_endpoint=stt_endpoint,
+        roon_library_path=roon_library_path,
         taxonomy=taxonomy,
         document_types=document_types,
         upload=upload_cfg,
diff --git a/app/main.py b/app/main.py
index 7273826..568c5e6 100644
--- a/app/main.py
+++ b/app/main.py
@@ -6,6 +6,7 @@ from fastapi import FastAPI, Request
 from fastapi.responses import RedirectResponse
 from sqlalchemy import func, select, text
 
+from api.audio import router as audio_router
 from api.auth import router as auth_router
 from api.config import router as config_router
 from api.dashboard import router as dashboard_router
@@ -16,6 +17,7 @@ from api.memos import router as memos_router
 from api.news import router as news_router
 from api.search import router as search_router
 from api.setup import router as setup_router
+from api.video import router as video_router
 from core.config import settings
 from core.database import async_session, engine, init_db
 from models.user import User
@@ -98,6 +100,8 @@ app.include_router(dashboard_router, prefix="/api/dashboard", tags=["dashboard"]
 app.include_router(library_router, prefix="/api/library", tags=["library"])
 app.include_router(news_router, prefix="/api/news", tags=["news"])
 app.include_router(digest_router, prefix="/api/digest", tags=["digest"])
+app.include_router(audio_router, prefix="/api/audio", tags=["audio"])
+app.include_router(video_router, prefix="/api/video", tags=["video"])
 
 # TODO: Phase 5에서 추가
 # app.include_router(tasks.router, prefix="/api/tasks", tags=["tasks"])
diff --git a/app/models/audio_segment.py b/app/models/audio_segment.py
new file mode 100644
index 0000000..9ccecea
--- /dev/null
+++ b/app/models/audio_segment.py
@@ -0,0 +1,18 @@
+"""audio_segments 테이블 ORM — STT 전사 결과의 타임스탬프 세그먼트."""
+
+from sqlalchemy import BigInteger, Float, ForeignKey, Text
+from sqlalchemy.orm import Mapped, mapped_column
+
+from core.database import Base
+
+
+class AudioSegment(Base):  # one timestamped STT transcript segment of a document
+    __tablename__ = "audio_segments"
+
+    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
+    document_id: Mapped[int] = mapped_column(  # owning document; FK declared with ON DELETE CASCADE
+        BigInteger, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
+    )
+    start_s: Mapped[float] = mapped_column(Float, nullable=False)  # segment start (presumably seconds — confirm)
+    end_s: Mapped[float] = mapped_column(Float, nullable=False)  # segment end (presumably seconds — confirm)
+    text: Mapped[str] = mapped_column(Text, nullable=False)  # transcribed text of the segment
diff --git a/app/models/document.py b/app/models/document.py
index 02e16aa..e464af6 100644
--- a/app/models/document.py
+++ b/app/models/document.py
@@ -115,6 +115,12 @@ class Document(Base):
     # /accept-suggestion 승인 시에만 category / user_tags 반영 (자동 전이 금지)
     ai_suggestion: Mapped[dict | None] = mapped_column(JSONB)
 
+    # 비디오 썸네일 (§3) — ffmpeg 50% 지점 1장. PKM/Videos/.thumbs/{id}.jpg 절대경로.
+    thumbnail_path: Mapped[str | None] = mapped_column(Text)
+
+    # NAS 드롭된 mov/mkv/avi quarantine 플래그 (§3). true 면 재생 불가 안내만 표시.
+    needs_conversion: Mapped[bool] = mapped_column(Boolean, default=False, server_default="false")
+
     # facet 탐색 축 (Phase 2)
     facet_company: Mapped[str | None] = mapped_column(Text)
     facet_topic: Mapped[str | None] = mapped_column(Text)
diff --git a/app/workers/file_watcher.py b/app/workers/file_watcher.py
index f63a12a..7548fa6 100644
--- a/app/workers/file_watcher.py
+++ b/app/workers/file_watcher.py
@@ -1,4 +1,14 @@
-"""파일 감시 워커 — Inbox 디렉토리 스캔, 새 파일/변경 파일 자동 등록"""
+"""파일 감시 워커 — Inbox/Recordings/Videos 스캔, 새/변경 파일 자동 등록.
+
+§3 확장:
+  - 스캔 대상: PKM/Inbox (문서) + PKM/Recordings (오디오) + PKM/Videos (비디오)
+  - 확장자 → category 매핑 (audio/video)
+  - video 채널 정책: 웹 업로드는 upload 엔드포인트에서 mov/mkv/avi 거부.
+    NAS 드롭은 여기서 quarantine import (category='video', needs_conversion=true, stage 없음).
+  - Roon 음원 경로(prefix match) skip — settings.roon_library_path
+  - 파이프 분기: audio → stage='stt', video direct-play → stage='thumbnail',
+    video quarantine → stage 없음 (처리 안 함, UI 에서 재생 불가 안내)
+"""
 
 from pathlib import Path
 
@@ -16,69 +26,133 @@ logger = setup_logger("file_watcher")
 SKIP_NAMES = {".DS_Store", "Thumbs.db", "desktop.ini", "Icon\r"}
 SKIP_EXTENSIONS = {".tmp", ".part", ".crdownload"}
 
+# §3 확장자 매핑
+AUDIO_EXTS = {".mp3", ".m4a", ".opus", ".wav", ".flac", ".ogg"}
+VIDEO_DIRECT_EXTS = {".mp4", ".webm"}                # 브라우저 direct play
+VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"}     # 변환 필요, 보관만
+
+# 스캔 대상: (하위경로, 예상 category) — None 은 문서함(카테고리 미지정)
+SCAN_TARGETS: list[tuple[str, str | None]] = [
+    ("Inbox", None),
+    ("Recordings", "audio"),
+    ("Videos", "video"),
+]
+
 
 def should_skip(path: Path) -> bool:
     if path.name in SKIP_NAMES or path.name.startswith("._"):
         return True
     if path.suffix.lower() in SKIP_EXTENSIONS:
         return True
-    # .derived/ 및 .preview/ 디렉토리 내 파일 제외
-    if ".derived" in path.parts or ".preview" in path.parts:
+    # .derived / .preview / .thumbs hold derived artifacts, never source files.
+    if ".derived" in path.parts or ".preview" in path.parts or ".thumbs" in path.parts:
+        return True
+    # Roon library skip (only when configured); component-wise so "/x/roon" != "/x/roon-old".
+    roon = settings.roon_library_path
+    if roon and path.is_relative_to(roon):
         return True
     return False
 
 
-async def watch_inbox():
-    """Inbox 디렉토리를 스캔하여 새/변경 파일을 DB에 등록"""
-    inbox_path = Path(settings.nas_mount_path) / "PKM" / "Inbox"
-    if not inbox_path.exists():
-        return
+def _route_media(path: Path, expected_category: str | None) -> tuple[str | None, bool, str | None]:
+    """Decide (category, needs_conversion, next_stage) from the file extension.
 
-    files = [f for f in inbox_path.rglob("*") if f.is_file() and not should_skip(f)]
-    if not files:
+    - Inbox drop (expected_category=None): non-media files go to the existing 'extract' stage;
+      audio/video extensions are skipped so users put them in Recordings/Videos instead.
+    - Recordings drop: only audio extensions are accepted (stage 'stt'); anything else is skipped.
+    - Videos drop: direct-play -> 'thumbnail' stage; quarantine -> category only, needs_conversion=True.
+    """
+    ext = path.suffix.lower()
+
+    if expected_category == "audio":
+        if ext in AUDIO_EXTS:
+            return ("audio", False, "stt")
+        return (None, False, None)  # wrong format in the audio folder -> skip
+
+    if expected_category == "video":
+        if ext in VIDEO_DIRECT_EXTS:
+            return ("video", False, "thumbnail")
+        if ext in VIDEO_QUARANTINE_EXTS:
+            # quarantine: the category is set but no stage is enqueued
+            return ("video", True, None)
+        return (None, False, None)  # anything else -> skip
+
+    # Inbox: existing document pipeline; audio/video extensions dropped here by mistake are skipped.
+    if ext in AUDIO_EXTS or ext in VIDEO_DIRECT_EXTS or ext in VIDEO_QUARANTINE_EXTS:
+        return (None, False, None)
+    return (None, False, "extract")
+
+
+async def watch_inbox():
+    """Scan the PKM subdirectories, register new/changed files in the DB and enqueue their first stage."""
+    pkm_root = Path(settings.nas_mount_path) / "PKM"
+    if not pkm_root.exists():
         return
 
     new_count = 0
     changed_count = 0
 
     async with async_session() as session:
-        for file_path in files:
-            rel_path = str(file_path.relative_to(Path(settings.nas_mount_path)))
-            fhash = file_hash(file_path)
+        for sub, expected_category in SCAN_TARGETS:
+            scan_root = pkm_root / sub
+            if not scan_root.exists():
+                continue
 
-            # DB에서 기존 문서 확인
-            result = await session.execute(
-                select(Document).where(Document.file_path == rel_path)
-            )
-            existing = result.scalar_one_or_none()
+            for file_path in scan_root.rglob("*"):
+                if not file_path.is_file() or should_skip(file_path):
+                    continue
 
-            if existing is None:
-                # 새 파일 → 등록
-                ext = file_path.suffix.lstrip(".").lower() or "unknown"
-                doc = Document(
-                    file_path=rel_path,
-                    file_hash=fhash,
-                    file_format=ext,
-                    file_size=file_path.stat().st_size,
-                    file_type="immutable",
-                    title=file_path.stem,
-                    source_channel="drive_sync",
+                category, needs_conversion, next_stage = _route_media(
+                    file_path, expected_category
                 )
-                session.add(doc)
-                await session.flush()
 
-                await enqueue_stage(session, doc.id, "extract")
-                new_count += 1
+                # A stray extension in the audio/video folders, or audio/video dropped
+                # into Inbox: skip the file entirely in this round.
+                if category is None and next_stage is None:
+                    continue
 
-            elif existing.file_hash != fhash:
-                # 해시 변경 → 재가공
-                existing.file_hash = fhash
-                existing.file_size = file_path.stat().st_size
+                rel_path = str(file_path.relative_to(Path(settings.nas_mount_path)))
+                fhash = file_hash(file_path)
 
-                await enqueue_stage(session, existing.id, "extract")
-                changed_count += 1
+                result = await session.execute(
+                    select(Document).where(Document.file_path == rel_path)
+                )
+                existing = result.scalar_one_or_none()
+
+                if existing is None:
+                    ext = file_path.suffix.lstrip(".").lower() or "unknown"
+                    doc = Document(
+                        file_path=rel_path,
+                        file_hash=fhash,
+                        file_format=ext,
+                        file_size=file_path.stat().st_size,
+                        file_type="immutable",
+                        title=file_path.stem,
+                        source_channel="drive_sync",
+                        category=category,
+                        needs_conversion=needs_conversion,
+                    )
+                    session.add(doc)
+                    await session.flush()
+
+                    if next_stage:
+                        await enqueue_stage(session, doc.id, next_stage)
+                    new_count += 1
+
+                elif existing.file_hash != fhash:
+                    existing.file_hash = fhash
+                    existing.file_size = file_path.stat().st_size
+                    # backfill category / quarantine flag on an existing document when not set yet
+                    if existing.category is None and category is not None:
+                        existing.category = category
+                    if needs_conversion and not getattr(existing, "needs_conversion", False):
+                        existing.needs_conversion = True
+
+                    if next_stage:
+                        await enqueue_stage(session, existing.id, next_stage)
+                    changed_count += 1
 
         await session.commit()
 
     if new_count or changed_count:
-        logger.info(f"[Inbox] 새 파일 {new_count}건, 변경 파일 {changed_count}건 등록")
+        logger.info(f"[Inbox+§3] 새 파일 {new_count}건, 변경 파일 {changed_count}건 등록")
diff --git a/app/workers/queue_consumer.py b/app/workers/queue_consumer.py
index ac47db3..f205811 100644
--- a/app/workers/queue_consumer.py
+++ b/app/workers/queue_consumer.py
@@ -13,7 +13,9 @@ from models.queue import ProcessingQueue, enqueue_stage
 logger = setup_logger("queue_consumer")
 
 # stage별 배치 크기
-BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1, "preview": 2}
+# stt 는 GPU 단일 점유 + 회의 30분짜리도 가능 → 배치 1. thumbnail 은 ffmpeg subprocess 로 가벼움.
+BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1,
+              "preview": 2, "stt": 1, "thumbnail": 3}
 STALE_THRESHOLD_MINUTES = 10
 
 
@@ -95,8 +97,17 @@ async def reset_stale_items():
 
 
 async def enqueue_next_stage(document_id: int, current_stage: str):
-    """현재 stage 완료 후 다음 stage를 pending으로 등록"""
-    next_stages = {"extract": ["classify", "preview"], "classify": ["embed", "chunk"]}
+    """현재 stage 완료 후 다음 stage를 pending으로 등록.
+
+    §3 추가:
+      stt → [classify]  (audio 는 extract 건너뛰고 stt 가 extracted_text 를 채움)
+      thumbnail → [] (video 는 leaf — classify/embed 없음)
+    """
+    next_stages = {
+        "extract": ["classify", "preview"],
+        "classify": ["embed", "chunk"],
+        "stt": ["classify"],
+    }
     stages = next_stages.get(current_stage, [])
     if not stages:
         return
@@ -114,7 +125,9 @@ async def consume_queue():
     from workers.embed_worker import process as embed_process
     from workers.extract_worker import process as extract_process
     from workers.preview_worker import process as preview_process
+    from workers.stt_worker import process as stt_process
     from workers.summarize_worker import process as summarize_process
+    from workers.thumbnail_worker import process as thumbnail_process
 
     workers = {
         "extract": extract_process,
@@ -123,6 +136,8 @@ async def consume_queue():
         "embed": embed_process,
         "chunk": chunk_process,
         "preview": preview_process,
+        "stt": stt_process,
+        "thumbnail": thumbnail_process,
     }
 
     try:
diff --git a/app/workers/stt_worker.py b/app/workers/stt_worker.py
new file mode 100644
index 0000000..21834f9
--- /dev/null
+++ b/app/workers/stt_worker.py
@@ -0,0 +1,89 @@
+"""STT 전사 워커 — services/stt(faster-whisper) 호출 + audio_segments 저장.
+
+queue_consumer 가 stage='stt' pending 큐 행을 pickup 하여 본 process() 를 호출.
+services/stt 는 /transcribe {filePath, langs?, beamSize?} → {text, segments, language,
+language_probability, duration}. 성공 시:
+  - Document.extracted_text = text (기존 classify/embed 파이프 재사용)
+  - Document.extractor_version = "faster-whisper@large-v3" (모델명 기록)
+  - Document.extracted_at = now()
+  - audio_segments INSERT 일괄 (기존 세그먼트는 삭제 후 재삽입, 재전사 대응)
+
+audio 파이프라인: file_watcher 가 category='audio' + stage='stt' 등록 →
+stt → classify → embed/chunk (extract 건너뜀). queue_consumer 의 next_stages 에서
+처리.
+"""
+
+from datetime import datetime, timezone
+from pathlib import Path
+
+import httpx
+from sqlalchemy import delete
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.config import settings
+from core.utils import setup_logger
+from models.audio_segment import AudioSegment
+from models.document import Document
+
+logger = setup_logger("stt_worker")
+
+# /transcribe 는 장시간 (30분 녹음 ≈ 수분). 충분히 여유. connect 는 짧게.
+STT_TIMEOUT = httpx.Timeout(connect=10.0, read=1800.0, write=60.0, pool=10.0)
+
+
+async def process(document_id: int, session: AsyncSession) -> None:
+    """Transcribe one audio document through the STT service and store text and segments.
+
+    Returns without changes when the document or its file_path is missing. Re-raises
+    httpx.HTTPError on a failed call; raises RuntimeError on a service error with no text.
+    """
+    doc = await session.get(Document, document_id)
+    if not doc:
+        logger.error(f"[stt] document_id={document_id} 없음")
+        return
+
+    if not doc.file_path:
+        logger.warning(f"[stt] id={document_id} file_path 없음 — skip")
+        return
+
+    # Path under the NAS mount; the stt-service container is expected to mount it at the same location.
+    container_path = str(Path(settings.nas_mount_path) / doc.file_path)
+
+    try:
+        async with httpx.AsyncClient(timeout=STT_TIMEOUT) as client:
+            resp = await client.post(
+                f"{settings.stt_endpoint}/transcribe",
+                json={"filePath": container_path},
+            )
+        resp.raise_for_status()
+        data = resp.json()
+    except httpx.HTTPError as e:
+        logger.error(f"[stt] id={document_id} 호출 실패: {e}")
+        raise
+
+    if "error" in data and not data.get("text"):
+        logger.error(f"[stt] id={document_id} 서비스 에러: {data['error']}")
+        raise RuntimeError(f"stt error: {data['error']}")
+
+    text = (data.get("text") or "").strip()
+    segments = data.get("segments") or []
+
+    # Replace any previous segments so a re-transcription does not leave duplicates.
+    await session.execute(delete(AudioSegment).where(AudioSegment.document_id == document_id))
+
+    for seg in segments:
+        session.add(AudioSegment(document_id=document_id, start_s=float(seg["start"]),
+                                 end_s=float(seg["end"]), text=str(seg["text"])))
+
+    doc.extracted_text = text
+    doc.extracted_at = datetime.now(timezone.utc)
+    # Record the model, not the detected language. Per this module's docstring /transcribe does not
+    # return a model name, so fall back to the deployment default (WHISPER_MODEL=large-v3).
+    # TODO(review): have services/stt echo its model so a swapped model is recorded correctly.
+    model_name = data.get("model") or "large-v3"
+    doc.extractor_version = f"faster-whisper@{model_name}"
+
+    logger.info(
+        f"[stt] id={document_id} segments={len(segments)} chars={len(text)} "
+        f"lang={data.get('language')} dur={data.get('duration')}s"
+    )
diff --git a/app/workers/thumbnail_worker.py b/app/workers/thumbnail_worker.py
new file mode 100644
index 0000000..89bd3eb
--- /dev/null
+++ b/app/workers/thumbnail_worker.py
@@ -0,0 +1,129 @@
+"""비디오 썸네일 생성 워커 — ffmpeg subprocess 로 50% 지점 1장 추출.
+
+PKM/Videos/.thumbs/{doc_id}.jpg 에 저장 후 documents.thumbnail_path 업데이트.
+quarantine 상태(needs_conversion=true)인 파일은 건너뜀.
+
+This module only provides helpers plus the process() entry point; queue_consumer registers
+process() as the worker for stage 'thumbnail' (file_watcher enqueues that stage for .mp4/.webm).
+"""
+
+import subprocess
+import unicodedata
+from datetime import datetime, timezone
+from pathlib import Path
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.config import settings
+from core.utils import setup_logger
+
+logger = setup_logger("thumbnail_worker")
+
+THUMBS_DIR_NAME = "PKM/Videos/.thumbs"
+FFMPEG_TIMEOUT = 30
+
+
+def _resolve_path(file_path: str) -> Path | None:
+    """Find the existing on-disk path for `file_path` despite NFC/NFD normalization differences."""
+    # Try the path as given first, then its NFD and NFC spellings.
+    for form in (None, "NFD", "NFC"):
+        spelled = file_path if form is None else unicodedata.normalize(form, file_path)
+        candidate = Path(spelled)
+        if candidate.exists():
+            return candidate
+    # Fall back to scanning the parent directory for an entry whose NFC name matches.
+    original = Path(file_path)
+    folder = original.parent
+    if not folder.exists():
+        return None
+    wanted = unicodedata.normalize("NFC", original.name)
+    for entry in folder.iterdir():
+        if unicodedata.normalize("NFC", entry.name) == wanted:
+            return entry
+    return None
+
+
+def _probe_duration_seconds(path: Path) -> float | None:
+    """Return the media duration in seconds reported by ffprobe, or None on any failure."""
+    try:
+        result = subprocess.run(
+            [
+                "ffprobe", "-v", "error",
+                "-show_entries", "format=duration",
+                "-of", "default=noprint_wrappers=1:nokey=1",
+                str(path),
+            ],
+            capture_output=True, text=True, timeout=FFMPEG_TIMEOUT,
+        )
+        if result.returncode != 0:
+            return None
+        return float(result.stdout.strip())
+    except (OSError, subprocess.SubprocessError, ValueError):  # OSError: ffprobe missing / not executable
+        return None
+
+
+def _extract_thumbnail(source: Path, output: Path, seek_seconds: float) -> bool:
+    """Extract one frame at seek_seconds into a JPEG at `output`; return True only on success."""
+    try:
+        output.parent.mkdir(parents=True, exist_ok=True)
+        result = subprocess.run(
+            [
+                "ffmpeg", "-y",
+                "-ss", f"{seek_seconds:.2f}",
+                "-i", str(source),
+                "-vframes", "1",
+                "-vf", "scale='min(640,iw)':-1",
+                "-q:v", "3",
+                str(output),
+            ],
+            capture_output=True, text=True, timeout=FFMPEG_TIMEOUT,
+        )
+        if result.returncode != 0:
+            logger.error(f"[thumbnail] ffmpeg 실패: {source.name} — {result.stderr[-400:]}")
+            return False
+        return output.exists() and output.stat().st_size > 0
+    except (OSError, subprocess.SubprocessError) as e:  # OSError: ffmpeg missing or output dir not writable
+        logger.error(f"[thumbnail] subprocess 오류: {source.name} — {e}")
+        return False
+
+
+async def process(document_id: int, session: AsyncSession) -> None:
+    """Generate the thumbnail of a video document (queue_consumer registers this for stage 'thumbnail').
+
+    Documents with needs_conversion=True are skipped; the source path goes through the NFC/NFD resolver.
+    """
+    from models.document import Document
+
+    doc = await session.get(Document, document_id)
+    if not doc:
+        logger.error(f"[thumbnail] document_id={document_id} 없음")
+        return
+
+    if getattr(doc, "needs_conversion", False):
+        logger.info(f"[thumbnail] id={document_id} needs_conversion=true → skip")
+        return
+
+    if not doc.file_path:
+        logger.warning(f"[thumbnail] id={document_id} file_path 없음")
+        return
+
+    raw = str(Path(settings.nas_mount_path) / doc.file_path)
+    source = _resolve_path(raw)
+    if source is None:
+        logger.error(f"[thumbnail] 원본 없음: {raw}")
+        return
+
+    duration = _probe_duration_seconds(source)
+    seek = (duration * 0.5) if duration and duration > 0 else 1.0  # midpoint; 1.0 s when duration is unknown
+
+    thumbs_dir = Path(settings.nas_mount_path) / THUMBS_DIR_NAME
+    output = thumbs_dir / f"{document_id}.jpg"
+
+    ok = _extract_thumbnail(source, output, seek)
+    if not ok:
+        return  # thumbnail_path stays unchanged when extraction fails
+
+    doc.thumbnail_path = str(output)  # full path under settings.nas_mount_path
+    doc.updated_at = datetime.now(timezone.utc)
+    await session.commit()
+    logger.info(f"[thumbnail] id={document_id} → {output}")
diff --git a/docker-compose.yml b/docker-compose.yml
index b4d404d..91904ab 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -54,6 +54,32 @@ services:
       start_period: 180s
     restart: unless-stopped
 
+  stt-service:
+    build: ./services/stt
+    expose:
+      - "3300"
+    volumes:
+      - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
+      - stt_models:/root/.cache
+    environment:
+      - WHISPER_MODEL=${WHISPER_MODEL:-large-v3}
+      - WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
+      - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3300/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 300s
+    restart: unless-stopped
+
   ollama:
     image: ollama/ollama
     volumes:
@@ -125,6 +151,7 @@ services:
       - DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
       - KORDOC_ENDPOINT=http://kordoc-service:3100
       - OCR_ENDPOINT=http://ocr-service:3200
+      - STT_ENDPOINT=http://stt-service:3300
     restart: unless-stopped
 
   frontend:
@@ -153,3 +180,4 @@ volumes:
   ollama_data:
   reranker_cache:
   ocr_models:
+  stt_models:
diff --git a/frontend/src/lib/components/AudioPlayer.svelte b/frontend/src/lib/components/AudioPlayer.svelte
new file mode 100644
index 0000000..6ec1cd4
--- /dev/null
+++ b/frontend/src/lib/components/AudioPlayer.svelte
@@ -0,0 +1,95 @@
+<script>
+  // 오디오 플레이어 + 전사 세그먼트 sticky 패널.
+  // 줄 클릭 시 audio.currentTime = seg.start 로 점프.
+  // 재생 중인 세그먼트는 하이라이트.
+
+  import { api, getAccessToken } from '$lib/api';
+
+  let { docId } = $props();
+
+  let audioEl = $state(null);
+  let segments = $state([]);
+  let currentTime = $state(0);
+  let loading = $state(true);
+  let error = $state(null);
+
+  let token = $derived(getAccessToken());
+  let fileSrc = $derived(`/api/documents/${docId}/file?token=${token}`);
+
+  $effect(() => {
+    if (docId != null) loadSegments(docId);
+  });
+
+  async function loadSegments(id) { // fetch transcript segments for `id`; on failure empty the list and set `error`
+    error = null;
+    loading = true;
+    try {
+      const { segments: fetched } = (await api(`/audio/${id}/segments`)) ?? {};
+      segments = fetched ?? [];
+    } catch {
+      error = '전사 세그먼트를 불러오지 못했습니다';
+      segments = [];
+    } finally {
+      loading = false;
+    }
+  }
+
+  function seekTo(start) { // jump to `start` seconds (floored at 0), then play
+    const player = audioEl;
+    if (player) player.currentTime = Math.max(0, start);
+    player?.play().catch(() => {}); // a rejected play() is deliberately ignored
+  }
+
+  function handleTimeUpdate() { // mirror the element's playback position into state
+    currentTime = audioEl?.currentTime ?? currentTime;
+  }
+
+  function formatTime(sec) { // seconds → zero-padded "MM:SS"; null/NaN → "00:00"
+    if (sec == null || isNaN(sec)) return '00:00';
+    const pad2 = (n) => String(n).padStart(2, '0');
+    const minutes = Math.floor(sec / 60);
+    return `${pad2(minutes)}:${pad2(Math.floor(sec % 60))}`;
+  }
+
+  let activeIdx = $derived(
+    segments.findIndex((s) => currentTime >= s.start && currentTime < s.end)
+  );
+</script>
+
+<div class="flex flex-col h-full">
+  <div class="sticky top-0 z-10 bg-surface border-b border-default p-3">
+    <audio
+      bind:this={audioEl}
+      src={fileSrc}
+      controls
+      preload="metadata"
+      class="w-full"
+      ontimeupdate={handleTimeUpdate}
+    >
+      <track kind="captions" />
+    </audio>
+  </div>
+
+  <div class="flex-1 overflow-y-auto p-4 space-y-1">
+    {#if loading}
+      <p class="text-muted text-sm">전사 불러오는 중…</p>
+    {:else if error}
+      <p class="text-error text-sm">{error}</p>
+    {:else if segments.length === 0}
+      <p class="text-muted text-sm">전사 결과가 아직 없습니다 (STT 처리 대기 중일 수 있음).</p>
+    {:else}
+      {#each segments as seg, i (seg.start)}
+        <button
+          type="button"
+          class="w-full text-left flex gap-3 px-2 py-1 rounded transition-colors {i === activeIdx ? 'bg-accent-muted' : 'hover:bg-surface-hover'}"
+          onclick={() => seekTo(seg.start)}
+        >
+          <span class="text-xs tabular-nums text-muted min-w-[3.5rem] mt-0.5">
+            {formatTime(seg.start)}
+          </span>
+          <span class="text-sm leading-relaxed">{seg.text}</span>
+        </button>
+      {/each}
+    {/if}
+  </div>
+</div>
diff --git a/frontend/src/lib/components/Sidebar.svelte b/frontend/src/lib/components/Sidebar.svelte
index e37ea69..42fe05e 100644
--- a/frontend/src/lib/components/Sidebar.svelte
+++ b/frontend/src/lib/components/Sidebar.svelte
@@ -25,6 +25,8 @@
     StickyNote,
     Newspaper,
     Search,
+    Mic,
+    Film,
   } from 'lucide-svelte';
 
   // ─── 도메인 트리 (기존) ───
@@ -207,11 +209,33 @@
       {/if}
     </a>
 
-    <!--
-      §3 에서 채울 자리 — audio/video 네비:
-      <a href="/audio">Audio · {categoryCounts.audio}</a>
-      <a href="/video">Video · {categoryCounts.video}</a>
-    -->
+    <a
+      href="/audio"
+      class="flex items-center justify-between px-3 py-2 rounded-md text-sm transition-colors
+        {currentPath.startsWith('/audio') ? 'bg-accent/15 text-accent' : 'text-text hover:bg-surface-hover'}"
+    >
+      <span class="flex items-center gap-2">
+        <Mic size={16} />
+        오디오
+      </span>
+      {#if categoryCounts.audio > 0}
+        <span class="text-xs text-dim">{categoryCounts.audio}</span>
+      {/if}
+    </a>
+
+    <a
+      href="/video"
+      class="flex items-center justify-between px-3 py-2 rounded-md text-sm transition-colors
+        {currentPath.startsWith('/video') ? 'bg-accent/15 text-accent' : 'text-text hover:bg-surface-hover'}"
+    >
+      <span class="flex items-center gap-2">
+        <Film size={16} />
+        비디오
+      </span>
+      {#if categoryCounts.video > 0}
+        <span class="text-xs text-dim">{categoryCounts.video}</span>
+      {/if}
+    </a>
 
     <a
       href="/ask"
diff --git a/frontend/src/lib/components/VideoPlayer.svelte b/frontend/src/lib/components/VideoPlayer.svelte
new file mode 100644
index 0000000..48765de
--- /dev/null
+++ b/frontend/src/lib/components/VideoPlayer.svelte
@@ -0,0 +1,42 @@
+<script>
+  // HTML5 비디오 플레이어 (direct play 전용).
+  // needsConversion=true 이면 재생 대신 안내 카드 표시 (§3 채널별 정책).
+
+  import { getAccessToken } from '$lib/api';
+
+  let { docId, needsConversion = false, fileFormat = '', title = '' } = $props();
+
+  let token = $derived(getAccessToken());
+  let src = $derived(`/api/documents/${docId}/file?token=${token}`);
+  let thumbSrc = $derived(`/api/video/${docId}/thumbnail?token=${token}`);
+</script>
+
+{#if needsConversion}
+  <div class="flex flex-col items-center justify-center h-full p-8 bg-surface text-center">
+    <div class="max-w-md space-y-3">
+      <p class="text-lg font-semibold">재생할 수 없는 포맷입니다</p>
+      <p class="text-sm text-muted">
+        {fileFormat ? `.${fileFormat}` : '현재 파일'} 포맷은 브라우저가 직접 재생할 수 없어 보관만 하고 있습니다.
+      </p>
+      <p class="text-sm text-muted">
+        재생하려면 원본을 <code class="px-1.5 py-0.5 bg-surface-hover rounded">mp4 (H.264/AAC)</code> 또는
+        <code class="px-1.5 py-0.5 bg-surface-hover rounded">webm (VP9)</code> 으로 변환 후 다시 올리세요.
+      </p>
+      {#if title}
+        <p class="text-xs text-muted pt-2">파일: {title}</p>
+      {/if}
+    </div>
+  </div>
+{:else}
+  <div class="flex flex-col h-full bg-black">
+    <video
+      {src}
+      controls
+      preload="metadata"
+      poster={thumbSrc}
+      class="w-full h-full object-contain"
+    >
+      <track kind="captions" />
+    </video>
+  </div>
+{/if}
diff --git a/frontend/src/routes/audio/+page.svelte b/frontend/src/routes/audio/+page.svelte
new file mode 100644
index 0000000..cedb471
--- /dev/null
+++ b/frontend/src/routes/audio/+page.svelte
@@ -0,0 +1,105 @@
+<script>
+  // /audio — 오디오 문서 목록. §3
+  // `GET /api/documents?category=audio` 로 조회 후 카드 그리드로 렌더.
+
+  import { onMount } from 'svelte';
+  import { goto } from '$app/navigation';
+  import { api } from '$lib/api';
+  import { addToast } from '$lib/stores/toast';
+  import { Mic, FileAudio } from 'lucide-svelte';
+
+  let docs = $state([]);
+  let total = $state(0);
+  let loading = $state(true);
+  let currentPage = $state(1);
+  const PAGE_SIZE = 30;
+
+  onMount(load);
+
+  async function load() { // fetch the current page of audio documents into docs/total
+    const query = new URLSearchParams([
+      ['category', 'audio'],
+      ['page', `${currentPage}`],
+      ['page_size', `${PAGE_SIZE}`],
+      ['sort', 'updated_desc'],
+    ]);
+    loading = true;
+    try {
+      const { items, total: count } = await api(`/documents?${query}`);
+      docs = items ?? [];
+      total = count ?? 0;
+    } catch {
+      addToast('error', '오디오 목록 불러오기 실패');
+      docs = [];
+    } finally {
+      loading = false;
+    }
+  }
+
+  function pickOpen(id) { // navigate to the detail page of the clicked document
+    goto('/audio/' + id);
+  }
+
+  function fmtSize(bytes) { // '-' when falsy, whole KB below 1 MiB, otherwise MB with one decimal
+    if (!bytes) return '-';
+    const KIB = 1024, MIB = KIB * KIB;
+    return bytes < MIB ? `${(bytes / KIB).toFixed(0)} KB` : `${(bytes / MIB).toFixed(1)} MB`;
+  }
+
+  // Locale-formatted date for a timestamp; empty string when the value is absent.
+  function fmtDate(s) {
+    const shown = s ? new Date(s).toLocaleDateString() : '';
+    return shown;
+  }
+</script>
+
+<div class="p-6 max-w-[1200px] mx-auto">
+  <header class="flex items-center gap-2 mb-4">
+    <Mic size={20} />
+    <h1 class="text-xl font-semibold">Audio</h1>
+    <span class="text-sm text-muted ml-2">{total}건</span>
+  </header>
+
+  {#if loading}
+    <p class="text-muted">불러오는 중…</p>
+  {:else if docs.length === 0}
+    <p class="text-muted">오디오 문서가 없습니다. NAS <code>PKM/Recordings/</code> 에 드롭하거나 업로드하세요.</p>
+  {:else}
+    <div class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-3">
+      {#each docs as d (d.id)}
+        <button
+          type="button"
+          class="text-left p-4 rounded-lg border border-default bg-surface hover:bg-surface-hover transition-colors"
+          onclick={() => pickOpen(d.id)}
+        >
+          <div class="flex items-start gap-2">
+            <FileAudio size={18} class="text-muted mt-0.5 shrink-0" />
+            <div class="flex-1 min-w-0">
+              <p class="font-medium truncate">{d.title || '(제목 없음)'}</p>
+              <p class="text-xs text-muted mt-1">.{d.file_format} · {fmtSize(d.file_size)}</p>
+              <p class="text-xs text-dim mt-0.5">{fmtDate(d.updated_at)}</p>
+            </div>
+          </div>
+        </button>
+      {/each}
+    </div>
+
+    {#if total > PAGE_SIZE}
+      <div class="flex justify-center gap-2 mt-6">
+        <button
+          type="button"
+          class="px-3 py-1 rounded border border-default disabled:opacity-50"
+          disabled={currentPage === 1}
+          onclick={() => { currentPage -= 1; load(); }}
+        >이전</button>
+        <span class="px-3 py-1 text-sm text-muted">{currentPage} / {Math.ceil(total / PAGE_SIZE)}</span>
+        <button
+          type="button"
+          class="px-3 py-1 rounded border border-default disabled:opacity-50"
+          disabled={currentPage * PAGE_SIZE >= total}
+          onclick={() => { currentPage += 1; load(); }}
+        >다음</button>
+      </div>
+    {/if}
+  {/if}
+</div>
diff --git a/frontend/src/routes/audio/[id]/+page.svelte b/frontend/src/routes/audio/[id]/+page.svelte
new file mode 100644
index 0000000..96db9a2
--- /dev/null
+++ b/frontend/src/routes/audio/[id]/+page.svelte
@@ -0,0 +1,61 @@
+<script>
+  // /audio/[id] — 단건 오디오 재생 + 전사 세그먼트.
+
+  import { page } from '$app/stores';
+  import { onMount } from 'svelte';
+  import { api } from '$lib/api';
+  import { addToast } from '$lib/stores/toast';
+  import AudioPlayer from '$lib/components/AudioPlayer.svelte';
+  import { ArrowLeft } from 'lucide-svelte';
+  import { goto } from '$app/navigation';
+
+  let docId = $derived(Number($page.params.id));
+  let doc = $state(null);
+  let loading = $state(true);
+
+  // $effect also runs on mount, so the extra onMount(load) fetched the document twice.
+  $effect(() => {
+    if (docId) load();
+    else loading = false; // non-numeric id: stop the spinner so "not found" is shown
+  });
+
+  async function load() {
+    loading = true;
+    try {
+      doc = await api(`/documents/${docId}`);
+    } catch (err) {
+      addToast('error', '문서를 불러올 수 없습니다');
+      doc = null;
+    } finally {
+      loading = false;
+    }
+  }
+</script>
+
+<div class="flex flex-col h-full">
+  <header class="flex items-center gap-2 px-4 py-2 border-b border-default">
+    <button
+      type="button"
+      class="p-1 rounded hover:bg-surface-hover"
+      onclick={() => goto('/audio')}
+      aria-label="목록으로"
+    >
+      <ArrowLeft size={18} />
+    </button>
+    <h1 class="font-medium truncate flex-1">
+      {doc?.title ?? (loading ? '불러오는 중…' : '(제목 없음)')}
+    </h1>
+  </header>
+
+  <div class="flex-1 overflow-hidden">
+    {#if loading}
+      <p class="p-4 text-muted">불러오는 중…</p>
+    {:else if !doc}
+      <p class="p-4 text-error">문서를 찾을 수 없습니다.</p>
+    {:else if doc.category !== 'audio'}
+      <p class="p-4 text-error">이 문서는 오디오가 아닙니다 (category={doc.category}).</p>
+    {:else}
+      <AudioPlayer docId={doc.id} />
+    {/if}
+  </div>
+</div>
diff --git a/frontend/src/routes/video/+page.svelte b/frontend/src/routes/video/+page.svelte
new file mode 100644
index 0000000..9cd3498
--- /dev/null
+++ b/frontend/src/routes/video/+page.svelte
@@ -0,0 +1,104 @@
+<script>
+  // /video — 비디오 문서 목록. §3
+  // needs_conversion=true 는 재생 불가 배지 표시, 썸네일은 /api/video/{id}/thumbnail?token=.
+
+  import { onMount } from 'svelte';
+  import { goto } from '$app/navigation';
+  import { api, getAccessToken } from '$lib/api';
+  import { addToast } from '$lib/stores/toast';
+  import { Film, AlertTriangle } from 'lucide-svelte';
+
+  let docs = $state([]);
+  let total = $state(0);
+  let loading = $state(true);
+  let currentPage = $state(1);
+  const PAGE_SIZE = 30;
+
+  onMount(load);
+
+  async function load() { // fetch the current page of video documents into docs/total
+    const query = new URLSearchParams([
+      ['category', 'video'],
+      ['page', `${currentPage}`],
+      ['page_size', `${PAGE_SIZE}`],
+      ['sort', 'updated_desc'],
+    ]);
+    loading = true;
+    try {
+      const { items, total: count } = await api(`/documents?${query}`);
+      docs = items ?? [];
+      total = count ?? 0;
+    } catch {
+      addToast('error', '비디오 목록 불러오기 실패');
+      docs = [];
+    } finally {
+      loading = false;
+    }
+  }
+
+  let token = $derived(getAccessToken());
+
+  function thumbSrc(id) { // thumbnail URL for document `id`, carrying the access token as a query param
+    return '/api/video/' + id + '/thumbnail?token=' + token;
+  }
+
+  function fmtSize(bytes) { // '-' when falsy, whole MB below 1 GiB, otherwise GB with one decimal
+    if (!bytes) return '-';
+    const megabytes = bytes / 1024 / 1024;
+    const belowGib = megabytes < 1024;
+    return belowGib ? `${megabytes.toFixed(0)} MB` : `${(megabytes / 1024).toFixed(1)} GB`;
+  }
+</script>
+
+<div class="p-6 max-w-[1400px] mx-auto">
+  <header class="flex items-center gap-2 mb-4">
+    <Film size={20} />
+    <h1 class="text-xl font-semibold">Video</h1>
+    <span class="text-sm text-muted ml-2">{total}건</span>
+  </header>
+
+  {#if loading}
+    <p class="text-muted">불러오는 중…</p>
+  {:else if docs.length === 0}
+    <p class="text-muted">비디오 문서가 없습니다. NAS <code>PKM/Videos/</code> 에 드롭하거나 업로드하세요.</p>
+  {:else}
+    <div class="grid grid-cols-2 sm:grid-cols-3 md:grid-cols-4 gap-4">
+      {#each docs as d (d.id)}
+        <button
+          type="button"
+          class="text-left rounded-lg overflow-hidden border border-default bg-surface hover:bg-surface-hover transition-colors"
+          onclick={() => goto(`/video/${d.id}`)}
+        >
+          <div class="aspect-video bg-black relative flex items-center justify-center">
+            {#if d.thumbnail_path}
+              <img src={thumbSrc(d.id)} alt="" class="w-full h-full object-cover" loading="lazy" />
+            {:else}
+              <Film size={32} class="text-dim" />
+            {/if}
+            {#if d.needs_conversion}
+              <span class="absolute top-1 left-1 px-1.5 py-0.5 rounded bg-warning text-xs flex items-center gap-1">
+                <AlertTriangle size={12} /> 변환 필요
+              </span>
+            {/if}
+          </div>
+          <div class="p-2">
+            <p class="text-sm font-medium truncate">{d.title || '(제목 없음)'}</p>
+            <p class="text-xs text-muted mt-0.5">.{d.file_format} · {fmtSize(d.file_size)}</p>
+          </div>
+        </button>
+      {/each}
+    </div>
+
+    {#if total > PAGE_SIZE}
+      <div class="flex justify-center gap-2 mt-6">
+        <button type="button" class="px-3 py-1 rounded border border-default disabled:opacity-50"
+          disabled={currentPage === 1}
+          onclick={() => { currentPage -= 1; load(); }}>이전</button>
+        <span class="px-3 py-1 text-sm text-muted">{currentPage} / {Math.ceil(total / PAGE_SIZE)}</span>
+        <button type="button" class="px-3 py-1 rounded border border-default disabled:opacity-50"
+          disabled={currentPage * PAGE_SIZE >= total}
+          onclick={() => { currentPage += 1; load(); }}>다음</button>
+      </div>
+    {/if}
+  {/if}
+</div>
diff --git a/frontend/src/routes/video/[id]/+page.svelte b/frontend/src/routes/video/[id]/+page.svelte
new file mode 100644
index 0000000..81f1f8a
--- /dev/null
+++ b/frontend/src/routes/video/[id]/+page.svelte
@@ -0,0 +1,66 @@
+<script>
+  // /video/[id] — 단건 비디오 재생 (direct play) 또는 quarantine 안내.
+
+  import { page } from '$app/stores';
+  import { onMount } from 'svelte';
+  import { api } from '$lib/api';
+  import { addToast } from '$lib/stores/toast';
+  import VideoPlayer from '$lib/components/VideoPlayer.svelte';
+  import { ArrowLeft } from 'lucide-svelte';
+  import { goto } from '$app/navigation';
+
+  let docId = $derived(Number($page.params.id));
+  let doc = $state(null);
+  let loading = $state(true);
+
+  // $effect also runs on mount, so the extra onMount(load) fetched the document twice.
+  $effect(() => {
+    if (docId) load();
+    else loading = false; // non-numeric id: stop the spinner so "not found" is shown
+  });
+
+  async function load() {
+    loading = true;
+    try {
+      doc = await api(`/documents/${docId}`);
+    } catch (err) {
+      addToast('error', '문서를 불러올 수 없습니다');
+      doc = null;
+    } finally {
+      loading = false;
+    }
+  }
+</script>
+
+<div class="flex flex-col h-full">
+  <header class="flex items-center gap-2 px-4 py-2 border-b border-default">
+    <button
+      type="button"
+      class="p-1 rounded hover:bg-surface-hover"
+      onclick={() => goto('/video')}
+      aria-label="목록으로"
+    >
+      <ArrowLeft size={18} />
+    </button>
+    <h1 class="font-medium truncate flex-1">
+      {doc?.title ?? (loading ? '불러오는 중…' : '(제목 없음)')}
+    </h1>
+  </header>
+
+  <div class="flex-1 overflow-hidden">
+    {#if loading}
+      <p class="p-4 text-muted">불러오는 중…</p>
+    {:else if !doc}
+      <p class="p-4 text-error">문서를 찾을 수 없습니다.</p>
+    {:else if doc.category !== 'video'}
+      <p class="p-4 text-error">이 문서는 비디오가 아닙니다 (category={doc.category}).</p>
+    {:else}
+      <VideoPlayer
+        docId={doc.id}
+        needsConversion={doc.needs_conversion ?? false}
+        fileFormat={doc.file_format}
+        title={doc.title}
+      />
+    {/if}
+  </div>
+</div>
diff --git a/migrations/147_audio_segments_table.sql b/migrations/147_audio_segments_table.sql
new file mode 100644
index 0000000..dfcda68
--- /dev/null
+++ b/migrations/147_audio_segments_table.sql
@@ -0,0 +1,17 @@
+-- 147_audio_segments_table.sql
+-- Document Server 통합 플랫폼 Section 3: audio_segments 테이블 정의 (1/2)
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- asyncpg single-statement 규칙에 따라 테이블 생성만 담당. 인덱스는 148.
+-- STT (faster-whisper) 결과의 타임스탬프 단위 세그먼트.
+-- documents.extracted_text 에는 전체 전사 텍스트를 저장 (classify/embed 재사용),
+-- 본 테이블은 AudioPlayer 에서 줄 클릭 → audio.currentTime = start_s 점프에 사용.
+-- ON DELETE CASCADE — 문서 물리 삭제 시 세그먼트 정리.
+
+CREATE TABLE IF NOT EXISTS audio_segments (
+  id          BIGSERIAL PRIMARY KEY,
+  document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
+  start_s     REAL NOT NULL,
+  end_s       REAL NOT NULL,
+  text        TEXT NOT NULL
+);
diff --git a/migrations/148_audio_segments_idx.sql b/migrations/148_audio_segments_idx.sql
new file mode 100644
index 0000000..f4b890d
--- /dev/null
+++ b/migrations/148_audio_segments_idx.sql
@@ -0,0 +1,8 @@
+-- 148_audio_segments_idx.sql
+-- Document Server 통합 플랫폼 Section 3: audio_segments 인덱스 (2/2)
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- AudioPlayer 가 `WHERE document_id=? ORDER BY start_s` 로 조회하는 경로용.
+
+CREATE INDEX IF NOT EXISTS idx_audio_segments_doc_start
+  ON audio_segments(document_id, start_s);
diff --git a/migrations/149_document_media_cols.sql b/migrations/149_document_media_cols.sql
new file mode 100644
index 0000000..19285f6
--- /dev/null
+++ b/migrations/149_document_media_cols.sql
@@ -0,0 +1,13 @@
+-- 149_document_media_cols.sql
+-- Document Server 통합 플랫폼 Section 3: video 재생/썸네일 컬럼
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- thumbnail_path: PKM/Videos/.thumbs/{doc_id}.jpg 절대경로 (ffmpeg 50% 지점 1장).
+-- needs_conversion: NAS 드롭으로 들어온 .mov/.mkv/.avi 등 quarantine 플래그.
+--   true 이면 VideoPlayer 가 재생 대신 "변환 필요" 안내 카드를 표시.
+-- v2.1 은 boolean 으로 최소 시작. 후속 상태 필요 시 별도 migration 에서 enum 확장.
+-- ALTER TABLE 단일 statement (ADD COLUMN 절이 여러 개여도 무방, §2 144 와 동일 패턴).
+
+ALTER TABLE documents
+  ADD COLUMN IF NOT EXISTS thumbnail_path    TEXT,
+  ADD COLUMN IF NOT EXISTS needs_conversion  BOOLEAN NOT NULL DEFAULT false;
diff --git a/migrations/150_queue_stage_stt.sql b/migrations/150_queue_stage_stt.sql
new file mode 100644
index 0000000..11dc2aa
--- /dev/null
+++ b/migrations/150_queue_stage_stt.sql
@@ -0,0 +1,11 @@
+-- 150_queue_stage_stt.sql
+-- Document Server 통합 플랫폼 Section 3: processing_queue stage 에 'stt' 추가
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- audio 파이프: stt → classify → embed (extract 건너뜀).
+-- stt_worker 는 category='audio' 큐 행을 pickup 하여 services/stt 호출 후
+-- documents.extracted_text + audio_segments 저장.
+-- 본 migration 은 enum 확장만 담당. 실제 분기 로직은 queue_consumer 에서 §1 의
+-- category 컬럼 기반으로 처리.
+
+ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'stt';
diff --git a/migrations/151_queue_stage_thumbnail.sql b/migrations/151_queue_stage_thumbnail.sql
new file mode 100644
index 0000000..d98eecf
--- /dev/null
+++ b/migrations/151_queue_stage_thumbnail.sql
@@ -0,0 +1,9 @@
+-- 151_queue_stage_thumbnail.sql
+-- Document Server 통합 플랫폼 Section 3: processing_queue stage 에 'thumbnail' 추가
+-- plan: luminous-sprouting-hamster.md §3
+--
+-- video 파이프는 thumbnail 단일 stage (leaf, classify/embed 없음).
+-- thumbnail_worker 가 category='video' + needs_conversion=false 큐 행을 pickup
+-- 하여 ffmpeg 로 .thumbs/{doc_id}.jpg 생성 후 documents.thumbnail_path 세팅.
+
+ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'thumbnail';
diff --git a/services/stt/Dockerfile b/services/stt/Dockerfile
new file mode 100644
index 0000000..e1e7fad
--- /dev/null
+++ b/services/stt/Dockerfile
@@ -0,0 +1,21 @@
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1
+
+# faster-whisper 는 PyAV 로 디코드 (ffmpeg 필요 없음) 하지만,
+# 포맷 가변성 대비 시스템 ffmpeg 도 설치. python3.10 = ubuntu22.04 기본.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip ffmpeg \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY server.py .
+
+EXPOSE 3300
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3300"]
diff --git a/services/stt/requirements.txt b/services/stt/requirements.txt
new file mode 100644
index 0000000..1fc6d7a
--- /dev/null
+++ b/services/stt/requirements.txt
@@ -0,0 +1,3 @@
+faster-whisper>=1.0.3,<2.0.0
+fastapi>=0.110.0,<1.0.0
+uvicorn[standard]>=0.27.0,<1.0.0
diff --git a/services/stt/server.py b/services/stt/server.py
new file mode 100644
index 0000000..6258b34
--- /dev/null
+++ b/services/stt/server.py
@@ -0,0 +1,140 @@
+"""STT 마이크로서비스 — faster-whisper (GPU) 기반 음성 전사.
+
+filePath → {text, segments:[{start,end,text}]}. 모델은 첫 요청 시 lazy loading.
+기본 모델 large-v3 (VRAM ~3GB, float16). 환경변수로 교체 가능.
+"""
+
+import os
+import unicodedata
+from pathlib import Path
+
+from fastapi import FastAPI
+
+app = FastAPI()
+
+_model = None
+_MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
+_DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
+_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
+
+
+def _resolve_path(file_path: str) -> Path | None:
+    """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수. OCR 서비스와 동일 패턴."""
+    candidates = [
+        file_path,
+        unicodedata.normalize("NFD", file_path),
+        unicodedata.normalize("NFC", file_path),
+    ]
+    for c in candidates:
+        p = Path(c)
+        if p.exists():
+            return p
+    # 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭
+    parent = Path(file_path).parent
+    if parent.exists():
+        target = unicodedata.normalize("NFC", Path(file_path).name)
+        for child in parent.iterdir():
+            if unicodedata.normalize("NFC", child.name) == target:
+                return child
+    return None
+
+
+def _load_model():
+    """faster-whisper lazy loading — 첫 호출 시만 VRAM 점유."""
+    global _model
+    if _model is not None:
+        return _model
+    from faster_whisper import WhisperModel
+
+    _model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE)
+    return _model
+
+
+def _cuda_device_count() -> int:
+    try:
+        import ctranslate2
+        return ctranslate2.get_cuda_device_count()
+    except Exception:
+        return 0
+
+
+@app.get("/health")
+def health():
+    """Liveness — Docker healthcheck 용, 프로세스 생존 확인."""
+    return {"status": "ok", "service": "stt-faster-whisper"}
+
+
+@app.get("/ready")
+def ready():
+    """Readiness — CUDA + 모델 상태. 배포 검증용."""
+    count = _cuda_device_count()
+    cuda_ok = count > 0
+    models_loaded = _model is not None
+    return {
+        "ready": cuda_ok and models_loaded,
+        "cuda": cuda_ok,
+        "cuda_device_count": count,
+        "models_loaded": models_loaded,
+        "model": _MODEL_NAME,
+        "compute_type": _COMPUTE_TYPE,
+    }
+
+
+@app.post("/transcribe")
+async def transcribe(body: dict):
+    """오디오 파일 전사.
+
+    입력:
+      {
+        "filePath": "/documents/PKM/Recordings/2026-04-23_회의.mp3",
+        "langs":    ["ko"]?,     # 단일 언어 지정 or 생략(자동감지)
+        "beamSize": 5?           # 기본 5
+      }
+
+    출력:
+      {
+        "text": "전체 전사 텍스트",
+        "segments": [{"start": 0.0, "end": 2.4, "text": "..."}, ...],
+        "language": "ko",
+        "language_probability": 0.99,
+        "duration": 1832.5
+      }
+    """
+    raw_path = body["filePath"]
+    langs = body.get("langs")
+    beam_size = int(body.get("beamSize", 5))
+
+    resolved = _resolve_path(raw_path)
+    if resolved is None:
+        return {"error": f"파일 없음: {raw_path}", "text": "", "segments": []}
+
+    model = _load_model()
+
+    language = None
+    if isinstance(langs, list) and len(langs) == 1:
+        language = langs[0]
+
+    segments_iter, info = model.transcribe(
+        str(resolved),
+        beam_size=beam_size,
+        language=language,
+        vad_filter=True,
+    )
+
+    segments = []
+    parts = []
+    for seg in segments_iter:
+        segments.append({
+            "start": round(float(seg.start), 2),
+            "end": round(float(seg.end), 2),
+            "text": seg.text.strip(),
+        })
+        parts.append(seg.text)
+
+    return {
+        "text": " ".join(p.strip() for p in parts).strip(),
+        "segments": segments,
+        "language": getattr(info, "language", None),
+        "language_probability": float(getattr(info, "language_probability", 0.0) or 0.0),
+        "duration": float(getattr(info, "duration", 0.0) or 0.0),
+    }