feat(media): §3 audio STT + video 재생 인프라

plan: ~/.claude/plans/luminous-sprouting-hamster.md §3

스키마:
- migrations/147_audio_segments_table.sql: audio_segments (STT 타임스탬프
  세그먼트)
- migrations/148_audio_segments_idx.sql: (document_id, start_s) idx
- migrations/149_document_media_cols.sql: documents.thumbnail_path +
  needs_conversion
- migrations/150_queue_stage_stt.sql: process_stage += 'stt'
- migrations/151_queue_stage_thumbnail.sql: process_stage += 'thumbnail'
- app/models/audio_segment.py, document.py (thumbnail_path/needs_conversion)

서비스:
- services/stt/{Dockerfile, requirements.txt, server.py} — faster-whisper
  large-v3 GPU 컨테이너. /transcribe (filePath/langs/beamSize) +
  /health + /ready (cuda device_count + model_loaded). NFC/NFD 경로
  resolver (OCR 교훈).
- docker-compose.yml: stt-service 추가 (GPU 1 예약, :3300, NAS ro mount,
  stt_models volume, start_period 300s), fastapi env 에 STT_ENDPOINT.

파이프라인 (의존 §1 category):
- app/workers/stt_worker.py 신규: stage='stt' pickup → STT_ENDPOINT 호출 →
  extracted_text + audio_segments 저장. Timeout 30분.
- app/workers/thumbnail_worker.py 신규: ffmpeg 50% 지점 1장 →
  PKM/Videos/.thumbs/{id}.jpg + thumbnail_path 세팅.
  needs_conversion=true 는 skip.
- app/workers/file_watcher.py 확장: PKM/{Inbox, Recordings, Videos}
  스캔. 확장자→category, audio→stage=stt, video .mp4/.webm→
  stage=thumbnail, video .mov/.mkv/.avi→needs_conversion=true + stage
  없음. settings.roon_library_path prefix skip.
- app/workers/queue_consumer.py 확장: stt + thumbnail workers 등록,
  BATCH_SIZE(stt=1, thumbnail=3), next_stages 에 stt→[classify] 추가
  (audio 는 extract 건너뜀).
- app/Dockerfile: ffmpeg 추가 (썸네일 subprocess 용).

API (의존 §1):
- /api/audio/{id}/segments — AudioSegment ORDER BY start_s
- /api/video/{id}/thumbnail — thumbnail_path FileResponse (쿼리 토큰)
- /api/documents/{id}/file: media_types 에 audio/video mime 포함 (§2
  커밋에 이미 포함). Starlette FileResponse 가 Range 자동.
- upload_document: .mov/.mkv/.avi 웹 업로드 거부 (error_code
  unsupported_codec). NAS 드롭은 file_watcher 가 quarantine 수용.

프론트:
- AudioPlayer.svelte: HTML5 audio + 전사 세그먼트 sticky 패널 + 줄
  클릭 seek. activeIdx 하이라이트.
- VideoPlayer.svelte: HTML5 video direct play + needs_conversion 안내
  카드. poster 는 thumbnail endpoint.
- /audio (목록 grid) + /audio/[id] (플레이어)
- /video (썸네일 grid + 변환 필요 배지) + /video/[id] (플레이어)
- Sidebar.svelte: Mic/Film 아이콘 + audio/video 네비 활성, count
  배지 (§2 /stats/category-counts 재사용).

설정:
- app/core/config.py: stt_endpoint + roon_library_path.

DoD 배포 후 smoke: /ready cuda:true, 회의 mp3 transcribe, audio
extract 없이 classify 진행(queue 회귀), /audio 재생, .mp4 재생,
.mov 웹 400, .mov NAS quarantine, Sidebar 네비 + count.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-24 06:47:36 +09:00
parent aceb54e586
commit 1e2c004dd4
28 changed files with 1284 additions and 50 deletions
+3 -2
View File
@@ -2,12 +2,13 @@ FROM python:3.11-slim
WORKDIR /app
# LibreOffice headless (PDF 변환용) + 한글/CJK 폰트
# LibreOffice headless (PDF 변환용) + 한글/CJK 폰트 + ffmpeg (비디오 썸네일)
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libreoffice-core libreoffice-calc libreoffice-writer libreoffice-impress \
fonts-noto-cjk fonts-noto-cjk-extra fonts-nanum \
fonts-noto-core fonts-noto-extra && \
fonts-noto-core fonts-noto-extra \
ffmpeg && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
+72
View File
@@ -0,0 +1,72 @@
"""오디오 전사(STT) 조회 API — /api/audio
AudioPlayer 가 줄 단위로 렌더하고 클릭 시 audio.currentTime 으로 점프한다.
"""
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from core.database import get_session
from models.audio_segment import AudioSegment
from models.document import Document
from models.user import User
router = APIRouter()
class AudioSegmentResponse(BaseModel):
    """One transcript segment as sent to the client.

    The segments endpoint in this module builds these from AudioSegment rows,
    mapping start_s -> start and end_s -> end.
    """

    # Segment start offset (presumably seconds, matching audio.currentTime — confirm).
    start: float
    # Segment end offset, same unit as `start`.
    end: float
    # Transcribed text of the segment.
    text: str
    # Let pydantic read values from object attributes as well as from dicts.
    model_config = {"from_attributes": True}
class AudioSegmentsResponse(BaseModel):
    """Response envelope for the transcript segments of one audio document."""

    # Id of the document the segments belong to.
    document_id: int
    # Detected language; the endpoint in this module currently always sends None.
    language: str | None
    # Audio duration; the endpoint in this module currently always sends None.
    duration: float | None
    # Segments in the order produced by the endpoint (sorted by start offset).
    segments: list[AudioSegmentResponse]
@router.get("/{doc_id}/segments", response_model=AudioSegmentsResponse)
async def get_audio_segments(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return the transcript segments of an audio document, ordered by start offset.

    Responds 404 when the document is missing, soft-deleted, or its category is
    not 'audio'. A document without any segments yet yields an empty list.
    `language` and `duration` are always None because the ORM has no columns
    for them.
    """
    document = await session.get(Document, doc_id)
    if not document or document.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    category = getattr(document, "category", None)
    if category != "audio":
        raise HTTPException(status_code=404, detail="오디오 문서가 아닙니다")

    query = (
        select(AudioSegment)
        .where(AudioSegment.document_id == doc_id)
        .order_by(AudioSegment.start_s.asc())
    )
    rows = (await session.execute(query)).scalars().all()

    return AudioSegmentsResponse(
        document_id=doc_id,
        language=None,
        duration=None,
        segments=[
            AudioSegmentResponse(start=row.start_s, end=row.end_s, text=row.text)
            for row in rows
        ],
    )
+11
View File
@@ -594,6 +594,17 @@ async def upload_document(
if not safe_name or safe_name.startswith("."):
raise HTTPException(status_code=400, detail="유효하지 않은 파일명")
# §3: 웹 업로드는 direct-play 불가 비디오 거부 (NAS 드롭은 file_watcher 가
# quarantine 으로 수용). UploadDropzone 이 error_code='unsupported_codec' 로
# 배너 분기.
VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"}
if Path(safe_name).suffix.lower() in VIDEO_QUARANTINE_EXTS:
raise _upload_error(
status_code=400,
error_code="unsupported_codec",
message="브라우저에서 직접 재생 불가한 포맷입니다. mp4 (H.264/AAC) 또는 webm (VP9) 으로 변환 후 다시 올리세요.",
)
# ── 대상 경로 결정 ──
inbox_dir = Path(settings.nas_mount_path) / "PKM" / "Inbox"
+56
View File
@@ -0,0 +1,56 @@
"""비디오 썸네일 서빙 API — /api/video
ffmpeg 썸네일 생성은 thumbnail_worker 에서 수행. 본 라우터는 저장된 파일만 서빙.
"""
from pathlib import Path
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import decode_token, get_current_user
from core.database import get_session
from models.document import Document
from models.user import User
router = APIRouter()
@router.get("/{doc_id}/thumbnail")
async def get_video_thumbnail(
    doc_id: int,
    session: Annotated[AsyncSession, Depends(get_session)],
    token: str | None = Query(None, description="Bearer token (img src 용)"),
    user: User | None = Depends(lambda: None),
):
    """Serve the stored jpg thumbnail of a video document.

    Authentication is by the `token` query parameter only, so the URL can be
    bound directly to `<img src="...?token=...">`. The Authorization header is
    NOT consulted here; `user` is a no-op placeholder dependency kept for
    signature compatibility.

    Raises:
        HTTPException 401: token missing, undecodable, or not an access token.
        HTTPException 404: document missing or soft-deleted, thumbnail not yet
            generated, or the recorded path is not a regular file.
    """
    # Query-token validation (an <img> tag cannot send an Authorization header).
    if not token:
        raise HTTPException(status_code=401, detail="토큰이 필요합니다")
    payload = decode_token(token)
    if not payload or payload.get("type") != "access":
        raise HTTPException(status_code=401, detail="유효하지 않은 토큰")

    doc = await session.get(Document, doc_id)
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    thumb = getattr(doc, "thumbnail_path", None)
    if not thumb:
        raise HTTPException(status_code=404, detail="썸네일이 아직 생성되지 않았습니다")

    path = Path(thumb)
    # is_file() rather than exists(): FileResponse fails at send time when the
    # path is a directory, which would surface as a 500 instead of a 404.
    if not path.is_file():
        raise HTTPException(status_code=404, detail="썸네일 파일이 없습니다")

    return FileResponse(
        path=str(path),
        media_type="image/jpeg",
        headers={"Content-Disposition": "inline"},
    )
+12
View File
@@ -61,6 +61,14 @@ class Settings(BaseModel):
# OCR (Surya)
ocr_endpoint: str = "http://ocr-service:3200"
# STT (faster-whisper, §3)
stt_endpoint: str = "http://stt-service:3300"
# §3 file_watcher: Roon 음원 경로 (prefix match 로 skip).
# 빈 문자열이면 skip 없음. 예: "/documents/PKM/../Music/roon-library" 또는
# NFS 경유 별도 마운트된 Roon 라이브러리.
roon_library_path: str = ""
# 분류 체계
taxonomy: dict = {}
document_types: list[str] = []
@@ -78,6 +86,8 @@ def load_settings() -> Settings:
eval_runner_token = os.getenv("EVAL_RUNNER_TOKEN", "")
kordoc_endpoint = os.getenv("KORDOC_ENDPOINT", "http://kordoc-service:3100")
ocr_endpoint = os.getenv("OCR_ENDPOINT", "http://ocr-service:3200")
stt_endpoint = os.getenv("STT_ENDPOINT", "http://stt-service:3300")
roon_library_path = os.getenv("ROON_LIBRARY_PATH", "")
# config.yaml — Docker 컨테이너 내부(/app/config.yaml) 또는 프로젝트 루트
config_path = Path("/app/config.yaml")
@@ -135,6 +145,8 @@ def load_settings() -> Settings:
eval_runner_token=eval_runner_token,
kordoc_endpoint=kordoc_endpoint,
ocr_endpoint=ocr_endpoint,
stt_endpoint=stt_endpoint,
roon_library_path=roon_library_path,
taxonomy=taxonomy,
document_types=document_types,
upload=upload_cfg,
+4
View File
@@ -6,6 +6,7 @@ from fastapi import FastAPI, Request
from fastapi.responses import RedirectResponse
from sqlalchemy import func, select, text
from api.audio import router as audio_router
from api.auth import router as auth_router
from api.config import router as config_router
from api.dashboard import router as dashboard_router
@@ -16,6 +17,7 @@ from api.memos import router as memos_router
from api.news import router as news_router
from api.search import router as search_router
from api.setup import router as setup_router
from api.video import router as video_router
from core.config import settings
from core.database import async_session, engine, init_db
from models.user import User
@@ -98,6 +100,8 @@ app.include_router(dashboard_router, prefix="/api/dashboard", tags=["dashboard"]
app.include_router(library_router, prefix="/api/library", tags=["library"])
app.include_router(news_router, prefix="/api/news", tags=["news"])
app.include_router(digest_router, prefix="/api/digest", tags=["digest"])
app.include_router(audio_router, prefix="/api/audio", tags=["audio"])
app.include_router(video_router, prefix="/api/video", tags=["video"])
# TODO: Phase 5에서 추가
# app.include_router(tasks.router, prefix="/api/tasks", tags=["tasks"])
+18
View File
@@ -0,0 +1,18 @@
"""audio_segments 테이블 ORM — STT 전사 결과의 타임스탬프 세그먼트."""
from sqlalchemy import BigInteger, Float, ForeignKey, Text
from sqlalchemy.orm import Mapped, mapped_column
from core.database import Base
class AudioSegment(Base):
    """ORM mapping for the audio_segments table: one timestamped STT transcript segment per row."""

    __tablename__ = "audio_segments"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    # Owning document; the foreign key is declared with ON DELETE CASCADE.
    document_id: Mapped[int] = mapped_column(
        BigInteger, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False
    )
    # Segment start offset (presumably seconds, per the _s suffix — confirm against the STT service).
    start_s: Mapped[float] = mapped_column(Float, nullable=False)
    # Segment end offset, same unit as start_s.
    end_s: Mapped[float] = mapped_column(Float, nullable=False)
    # Transcribed text of the segment.
    text: Mapped[str] = mapped_column(Text, nullable=False)
+6
View File
@@ -115,6 +115,12 @@ class Document(Base):
# /accept-suggestion 승인 시에만 category / user_tags 반영 (자동 전이 금지)
ai_suggestion: Mapped[dict | None] = mapped_column(JSONB)
# 비디오 썸네일 (§3) — ffmpeg 50% 지점 1장. PKM/Videos/.thumbs/{id}.jpg 절대경로.
thumbnail_path: Mapped[str | None] = mapped_column(Text)
# NAS 드롭된 mov/mkv/avi quarantine 플래그 (§3). true 면 재생 불가 안내만 표시.
needs_conversion: Mapped[bool] = mapped_column(Boolean, default=False, server_default="false")
# facet 탐색 축 (Phase 2)
facet_company: Mapped[str | None] = mapped_column(Text)
facet_topic: Mapped[str | None] = mapped_column(Text)
+114 -40
View File
@@ -1,4 +1,14 @@
"""파일 감시 워커 — Inbox 디렉토리 스캔, 새 파일/변경 파일 자동 등록"""
"""파일 감시 워커 — Inbox/Recordings/Videos 스캔, 새/변경 파일 자동 등록.
§3 확장:
- 스캔 대상: PKM/Inbox (문서) + PKM/Recordings (오디오) + PKM/Videos (비디오)
- 확장자 → category 매핑 (audio/video)
- video 채널 정책: 웹 업로드는 upload 엔드포인트에서 mov/mkv/avi 거부.
NAS 드롭은 여기서 quarantine import (category='video', needs_conversion=true, stage 없음).
- Roon 음원 경로(prefix match) skip — settings.roon_library_path
- 파이프 분기: audio → stage='stt', video direct-play → stage='thumbnail',
video quarantine → stage 없음 (처리 안 함, UI 에서 재생 불가 안내)
"""
from pathlib import Path
@@ -16,69 +26,133 @@ logger = setup_logger("file_watcher")
SKIP_NAMES = {".DS_Store", "Thumbs.db", "desktop.ini", "Icon\r"}
SKIP_EXTENSIONS = {".tmp", ".part", ".crdownload"}
# §3 확장자 매핑
AUDIO_EXTS = {".mp3", ".m4a", ".opus", ".wav", ".flac", ".ogg"}
VIDEO_DIRECT_EXTS = {".mp4", ".webm"} # 브라우저 direct play
VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"} # 변환 필요, 보관만
# 스캔 대상: (하위경로, 예상 category) — None 은 문서함(카테고리 미지정)
SCAN_TARGETS: list[tuple[str, str | None]] = [
("Inbox", None),
("Recordings", "audio"),
("Videos", "video"),
]
def should_skip(path: Path) -> bool:
    """Return True when the watcher must ignore `path`.

    A file is skipped when any of the following holds:
    - its name is in SKIP_NAMES or starts with "._";
    - its extension (case-insensitive) is in SKIP_EXTENSIONS;
    - it lives under a derived-artifact directory (.derived / .preview / .thumbs);
    - settings.roon_library_path is set and the file lies inside that directory.
    """
    if path.name in SKIP_NAMES or path.name.startswith("._"):
        return True
    if path.suffix.lower() in SKIP_EXTENSIONS:
        return True
    # .derived / .preview / .thumbs hold generated artifacts, never source files.
    if not {".derived", ".preview", ".thumbs"}.isdisjoint(path.parts):
        return True
    # Roon library skip (only when configured). Compare whole path components:
    # a plain string prefix test would also match sibling directories such as
    # "<roon>-backup".
    roon = settings.roon_library_path
    if roon and path.is_relative_to(Path(roon)):
        return True
    return False
async def watch_inbox():
"""Inbox 디렉토리를 스캔하여 새/변경 파일을 DB에 등록"""
inbox_path = Path(settings.nas_mount_path) / "PKM" / "Inbox"
if not inbox_path.exists():
return
def _route_media(path: Path, expected_category: str | None) -> tuple[str | None, bool, str | None]:
"""확장자 기반으로 (category, needs_conversion, next_stage) 결정.
files = [f for f in inbox_path.rglob("*") if f.is_file() and not should_skip(f)]
if not files:
- Inbox 드롭: expected_category=None — 문서 확장자면 기존 'extract' 파이프,
audio/video 확장자면 혼란 방지로 skip (사용자가 Recordings/Videos 로 넣도록 유도)
- Recordings 드롭: audio 확장자만 수락. 그 외는 skip (log)
- Videos 드롭: direct-play → category+thumbnail, quarantine → category만 (needs_conversion=true)
"""
ext = path.suffix.lower()
if expected_category == "audio":
if ext in AUDIO_EXTS:
return ("audio", False, "stt")
return (None, False, None) # audio 폴더에 엉뚱한 포맷 → skip
if expected_category == "video":
if ext in VIDEO_DIRECT_EXTS:
return ("video", False, "thumbnail")
if ext in VIDEO_QUARANTINE_EXTS:
# quarantine — category 설정하되 stage 안 걸어둠 (재생 불가 안내만)
return ("video", True, None)
return (None, False, None) # 기타 → skip
# Inbox: 문서 파이프 (기존). audio/video 확장자가 실수로 여기 들어오면 skip.
if ext in AUDIO_EXTS or ext in VIDEO_DIRECT_EXTS or ext in VIDEO_QUARANTINE_EXTS:
return (None, False, None)
return (None, False, "extract")
async def watch_inbox():
"""PKM 하위 디렉토리를 스캔하여 새/변경 파일을 DB 등록 + 파이프 투입."""
pkm_root = Path(settings.nas_mount_path) / "PKM"
if not pkm_root.exists():
return
new_count = 0
changed_count = 0
async with async_session() as session:
for file_path in files:
rel_path = str(file_path.relative_to(Path(settings.nas_mount_path)))
fhash = file_hash(file_path)
for sub, expected_category in SCAN_TARGETS:
scan_root = pkm_root / sub
if not scan_root.exists():
continue
# DB에서 기존 문서 확인
result = await session.execute(
select(Document).where(Document.file_path == rel_path)
)
existing = result.scalar_one_or_none()
for file_path in scan_root.rglob("*"):
if not file_path.is_file() or should_skip(file_path):
continue
if existing is None:
# 새 파일 → 등록
ext = file_path.suffix.lstrip(".").lower() or "unknown"
doc = Document(
file_path=rel_path,
file_hash=fhash,
file_format=ext,
file_size=file_path.stat().st_size,
file_type="immutable",
title=file_path.stem,
source_channel="drive_sync",
category, needs_conversion, next_stage = _route_media(
file_path, expected_category
)
session.add(doc)
await session.flush()
await enqueue_stage(session, doc.id, "extract")
new_count += 1
# audio/video 폴더에 엉뚱한 확장자가 들어왔거나 Inbox 에
# audio/video 가 잘못 떨어진 경우 — 이 라운드에서 아예 skip
if category is None and next_stage is None:
continue
elif existing.file_hash != fhash:
# 해시 변경 → 재가공
existing.file_hash = fhash
existing.file_size = file_path.stat().st_size
rel_path = str(file_path.relative_to(Path(settings.nas_mount_path)))
fhash = file_hash(file_path)
await enqueue_stage(session, existing.id, "extract")
changed_count += 1
result = await session.execute(
select(Document).where(Document.file_path == rel_path)
)
existing = result.scalar_one_or_none()
if existing is None:
ext = file_path.suffix.lstrip(".").lower() or "unknown"
doc = Document(
file_path=rel_path,
file_hash=fhash,
file_format=ext,
file_size=file_path.stat().st_size,
file_type="immutable",
title=file_path.stem,
source_channel="drive_sync",
category=category,
needs_conversion=needs_conversion,
)
session.add(doc)
await session.flush()
if next_stage:
await enqueue_stage(session, doc.id, next_stage)
new_count += 1
elif existing.file_hash != fhash:
existing.file_hash = fhash
existing.file_size = file_path.stat().st_size
# 기존 문서에 category/quarantine flag 가 비어있으면 보정
if existing.category is None and category is not None:
existing.category = category
if needs_conversion and not getattr(existing, "needs_conversion", False):
existing.needs_conversion = True
if next_stage:
await enqueue_stage(session, existing.id, next_stage)
changed_count += 1
await session.commit()
if new_count or changed_count:
logger.info(f"[Inbox] 새 파일 {new_count}건, 변경 파일 {changed_count}건 등록")
logger.info(f"[Inbox+§3] 새 파일 {new_count}건, 변경 파일 {changed_count}건 등록")
+18 -3
View File
@@ -13,7 +13,9 @@ from models.queue import ProcessingQueue, enqueue_stage
logger = setup_logger("queue_consumer")
# stage별 배치 크기
BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1, "preview": 2}
# stt 는 GPU 단일 점유 + 회의 30분짜리도 가능 → 배치 1. thumbnail 은 ffmpeg subprocess 로 가벼움.
BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1,
"preview": 2, "stt": 1, "thumbnail": 3}
STALE_THRESHOLD_MINUTES = 10
@@ -95,8 +97,17 @@ async def reset_stale_items():
async def enqueue_next_stage(document_id: int, current_stage: str):
"""현재 stage 완료 후 다음 stage를 pending으로 등록"""
next_stages = {"extract": ["classify", "preview"], "classify": ["embed", "chunk"]}
"""현재 stage 완료 후 다음 stage를 pending으로 등록.
§3 추가:
stt → [classify] (audio 는 extract 건너뛰고 stt 가 extracted_text 를 채움)
thumbnail → [] (video 는 leaf — classify/embed 없음)
"""
next_stages = {
"extract": ["classify", "preview"],
"classify": ["embed", "chunk"],
"stt": ["classify"],
}
stages = next_stages.get(current_stage, [])
if not stages:
return
@@ -114,7 +125,9 @@ async def consume_queue():
from workers.embed_worker import process as embed_process
from workers.extract_worker import process as extract_process
from workers.preview_worker import process as preview_process
from workers.stt_worker import process as stt_process
from workers.summarize_worker import process as summarize_process
from workers.thumbnail_worker import process as thumbnail_process
workers = {
"extract": extract_process,
@@ -123,6 +136,8 @@ async def consume_queue():
"embed": embed_process,
"chunk": chunk_process,
"preview": preview_process,
"stt": stt_process,
"thumbnail": thumbnail_process,
}
try:
+89
View File
@@ -0,0 +1,89 @@
"""STT 전사 워커 — services/stt(faster-whisper) 호출 + audio_segments 저장.
queue_consumer 가 stage='stt' pending 큐 행을 pickup 하여 본 process() 를 호출.
services/stt 는 /transcribe {filePath, langs?, beamSize?} → {text, segments, language,
language_probability, duration}. 성공 시:
- Document.extracted_text = text (기존 classify/embed 파이프 재사용)
- Document.extractor_version = "faster-whisper@large-v3" (모델명 기록)
- Document.extracted_at = now()
- audio_segments INSERT 일괄 (기존 세그먼트는 삭제 후 재삽입, 재전사 대응)
audio 파이프라인: file_watcher 가 category='audio' + stage='stt' 등록 →
stt → classify → embed/chunk (extract 건너뜀). queue_consumer 의 next_stages 에서
처리.
"""
from datetime import datetime, timezone
from pathlib import Path
import httpx
from sqlalchemy import delete
from sqlalchemy.ext.asyncio import AsyncSession
from core.config import settings
from core.utils import setup_logger
from models.audio_segment import AudioSegment
from models.document import Document
logger = setup_logger("stt_worker")
# /transcribe 는 장시간 (30분 녹음 ≈ 수분). 충분히 여유. connect 는 짧게.
STT_TIMEOUT = httpx.Timeout(connect=10.0, read=1800.0, write=60.0, pool=10.0)
async def process(document_id: int, session: AsyncSession) -> None:
    """Transcribe one audio document via the STT service and store the result.

    Posts the document's absolute path to `{settings.stt_endpoint}/transcribe`,
    then replaces the document's audio_segments rows and fills
    extracted_text / extracted_at / extractor_version on the Document.

    Returns without doing anything when the document does not exist or has no
    file_path.

    Raises:
        httpx.HTTPError: the request failed or returned an error status.
        RuntimeError: the service answered with an error payload and no text,
            or with a body that is not a JSON object.
    """
    doc = await session.get(Document, document_id)
    if not doc:
        logger.error(f"[stt] document_id={document_id} 없음")
        return
    if not doc.file_path:
        logger.warning(f"[stt] id={document_id} file_path 없음 — skip")
        return

    # Absolute path under the NAS mount; the request sends this path, not the bytes.
    container_path = str(Path(settings.nas_mount_path) / doc.file_path)

    try:
        async with httpx.AsyncClient(timeout=STT_TIMEOUT) as client:
            resp = await client.post(
                f"{settings.stt_endpoint}/transcribe",
                json={"filePath": container_path},
            )
            resp.raise_for_status()
            data = resp.json()
    except httpx.HTTPError as e:
        logger.error(f"[stt] id={document_id} 호출 실패: {e}")
        raise

    if not isinstance(data, dict):
        # Every access below assumes a JSON object.
        raise RuntimeError("stt error: unexpected response shape")
    if "error" in data and not data.get("text"):
        logger.error(f"[stt] id={document_id} 서비스 에러: {data['error']}")
        raise RuntimeError(f"stt error: {data['error']}")

    text = (data.get("text") or "").strip()
    segments = data.get("segments") or []

    # Drop existing segments first so a re-transcription replaces them.
    await session.execute(delete(AudioSegment).where(AudioSegment.document_id == document_id))
    for seg in segments:
        session.add(AudioSegment(
            document_id=document_id,
            start_s=float(seg["start"]),
            end_s=float(seg["end"]),
            text=str(seg["text"]),
        ))

    doc.extracted_text = text
    doc.extracted_at = datetime.now(timezone.utc)
    # extractor_version records the MODEL, not the detected language. Prefer a
    # "model" field if the service response carries one; otherwise fall back to
    # "large-v3", the name this module's docstring documents.
    # NOTE(review): the fallback is wrong if the service is deployed with a
    # different model — confirm, or have the service return its model name.
    model_name = data.get("model") or "large-v3"
    doc.extractor_version = f"faster-whisper@{model_name}"

    logger.info(
        f"[stt] id={document_id} segments={len(segments)} chars={len(text)} "
        f"lang={data.get('language')} dur={data.get('duration')}s"
    )
+129
View File
@@ -0,0 +1,129 @@
"""비디오 썸네일 생성 워커 — ffmpeg subprocess 로 50% 지점 1장 추출.
PKM/Videos/.thumbs/{doc_id}.jpg 에 저장 후 documents.thumbnail_path 업데이트.
quarantine 상태(needs_conversion=true)인 파일은 건너뜀.
queue_consumer 가 stage='thumbnail' pending 큐 행을 pickup 하여 본 모듈의 process() 를 호출한다.
(file_watcher 가 PKM/Videos 의 direct-play 비디오(.mp4/.webm)에 stage='thumbnail' 을 등록.)
"""
import subprocess
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from sqlalchemy.ext.asyncio import AsyncSession
from core.config import settings
from core.utils import setup_logger
logger = setup_logger("thumbnail_worker")
THUMBS_DIR_NAME = "PKM/Videos/.thumbs"
FFMPEG_TIMEOUT = 30
def _resolve_path(file_path: str) -> Path | None:
"""NFC(DB) vs NFD(NFS) 한글 경로 차이 흡수. OCR/STT 서비스와 동일 패턴."""
candidates = [
file_path,
unicodedata.normalize("NFD", file_path),
unicodedata.normalize("NFC", file_path),
]
for c in candidates:
p = Path(c)
if p.exists():
return p
parent = Path(file_path).parent
if parent.exists():
target = unicodedata.normalize("NFC", Path(file_path).name)
for child in parent.iterdir():
if unicodedata.normalize("NFC", child.name) == target:
return child
return None
def _probe_duration_seconds(path: Path) -> float | None:
"""ffprobe 로 재생 길이 조회. 실패 시 None."""
try:
result = subprocess.run(
[
"ffprobe", "-v", "error",
"-show_entries", "format=duration",
"-of", "default=noprint_wrappers=1:nokey=1",
str(path),
],
capture_output=True, text=True, timeout=FFMPEG_TIMEOUT,
)
if result.returncode != 0:
return None
return float(result.stdout.strip())
except (subprocess.SubprocessError, ValueError):
return None
def _extract_thumbnail(source: Path, output: Path, seek_seconds: float) -> bool:
    """Extract one frame at `seek_seconds` from `source` into `output` as jpg.

    Creates the output directory if needed. The frame is scaled down to at most
    640 px wide, keeping the aspect ratio.

    Returns True only when ffmpeg exits 0 and `output` exists with a non-zero
    size. Every failure (missing binary, filesystem error, timeout, non-zero
    exit) is logged and reported as False.
    """
    try:
        output.parent.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            [
                "ffmpeg", "-y",
                "-ss", f"{seek_seconds:.2f}",
                "-i", str(source),
                "-vframes", "1",
                "-vf", "scale='min(640,iw)':-1",
                "-q:v", "3",
                str(output),
            ],
            capture_output=True, text=True, timeout=FFMPEG_TIMEOUT,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing ffmpeg binary and mkdir failures; neither
        # is a SubprocessError.
        logger.error(f"[thumbnail] subprocess 오류: {source.name}: {e}")
        return False
    if result.returncode != 0:
        # Only the tail of stderr: ffmpeg prints a long banner before the error.
        logger.error(f"[thumbnail] ffmpeg 실패: {source.name}: {result.stderr[-400:]}")
        return False
    return output.exists() and output.stat().st_size > 0
async def process(document_id: int, session: AsyncSession) -> None:
    """Generate the thumbnail for one video document and record its path.

    Skips documents that are missing, flagged needs_conversion, or have no
    file_path. The source file is located through the NFC/NFD resolver. The
    frame is taken at 50% of the probed duration, or at 1.0 s when the duration
    is unknown. On success sets thumbnail_path (absolute path) and updated_at
    and commits; on failure returns without touching the document.
    """
    import asyncio

    from models.document import Document

    doc = await session.get(Document, document_id)
    if not doc:
        logger.error(f"[thumbnail] document_id={document_id} 없음")
        return
    if getattr(doc, "needs_conversion", False):
        logger.info(f"[thumbnail] id={document_id} needs_conversion=true → skip")
        return
    if not doc.file_path:
        logger.warning(f"[thumbnail] id={document_id} file_path 없음")
        return

    raw = str(Path(settings.nas_mount_path) / doc.file_path)
    source = _resolve_path(raw)
    if source is None:
        logger.error(f"[thumbnail] 원본 없음: {raw}")
        return

    # ffprobe/ffmpeg are blocking subprocess calls (up to FFMPEG_TIMEOUT each);
    # run them in a worker thread so the event loop stays responsive.
    duration = await asyncio.to_thread(_probe_duration_seconds, source)
    seek = (duration * 0.5) if duration and duration > 0 else 1.0

    thumbs_dir = Path(settings.nas_mount_path) / THUMBS_DIR_NAME
    output = thumbs_dir / f"{document_id}.jpg"

    ok = await asyncio.to_thread(_extract_thumbnail, source, output, seek)
    if not ok:
        return

    doc.thumbnail_path = str(output)
    doc.updated_at = datetime.now(timezone.utc)
    await session.commit()
    logger.info(f"[thumbnail] id={document_id} → {output}")
+28
View File
@@ -54,6 +54,32 @@ services:
start_period: 180s
restart: unless-stopped
stt-service:
build: ./services/stt
expose:
- "3300"
volumes:
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
- stt_models:/root/.cache
environment:
- WHISPER_MODEL=${WHISPER_MODEL:-large-v3}
- WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
healthcheck:
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3300/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 300s
restart: unless-stopped
ollama:
image: ollama/ollama
volumes:
@@ -125,6 +151,7 @@ services:
- DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
- KORDOC_ENDPOINT=http://kordoc-service:3100
- OCR_ENDPOINT=http://ocr-service:3200
- STT_ENDPOINT=http://stt-service:3300
restart: unless-stopped
frontend:
@@ -153,3 +180,4 @@ volumes:
ollama_data:
reranker_cache:
ocr_models:
stt_models:
@@ -0,0 +1,95 @@
<script>
// 오디오 플레이어 + 전사 세그먼트 sticky 패널.
// 줄 클릭 시 audio.currentTime = seg.start 로 점프.
// 재생 중인 세그먼트는 하이라이트.
import { api, getAccessToken } from '$lib/api';
let { docId } = $props();
let audioEl = $state(null);
let segments = $state([]);
let currentTime = $state(0);
let loading = $state(true);
let error = $state(null);
let token = $derived(getAccessToken());
let fileSrc = $derived(`/api/documents/${docId}/file?token=${token}`);
$effect(() => {
if (docId != null) loadSegments(docId);
});
// Fetch the transcript segments for document `id` into component state.
// A response is applied only if `id` is still the current docId, so a slow
// reply for a previously shown document cannot overwrite the current one.
async function loadSegments(id) {
	loading = true;
	error = null;
	try {
		const resp = await api(`/audio/${id}/segments`);
		if (id !== docId) return; // stale response — a newer load owns the state
		segments = resp?.segments ?? [];
	} catch (err) {
		if (id !== docId) return;
		segments = [];
		error = '전사 세그먼트를 불러오지 못했습니다';
	} finally {
		// Only the load for the current document may clear the spinner.
		if (id === docId) loading = false;
	}
}
// Jump playback to `start` (clamped to >= 0) and start playing.
// A rejected play() promise is ignored on purpose.
function seekTo(start) {
	if (audioEl) {
		const target = Math.max(0, start);
		audioEl.currentTime = target;
		audioEl.play().catch(() => {});
	}
}
// Mirror the element's playback position into reactive state (drives activeIdx).
function handleTimeUpdate() {
	if (!audioEl) return;
	currentTime = audioEl.currentTime;
}
// Format a second count as zero-padded "MM:SS"; null/undefined/NaN give "00:00".
// Minutes are not wrapped into hours (3725 s -> "62:05").
function formatTime(sec) {
	if (sec == null || isNaN(sec)) {
		return '00:00';
	}
	const pad = (n) => n.toString().padStart(2, '0');
	const minutes = Math.floor(sec / 60);
	const seconds = Math.floor(sec % 60);
	return `${pad(minutes)}:${pad(seconds)}`;
}
let activeIdx = $derived(
segments.findIndex((s) => currentTime >= s.start && currentTime < s.end)
);
</script>
<div class="flex flex-col h-full">
<div class="sticky top-0 z-10 bg-surface border-b border-default p-3">
<audio
bind:this={audioEl}
src={fileSrc}
controls
preload="metadata"
class="w-full"
ontimeupdate={handleTimeUpdate}
>
<track kind="captions" />
</audio>
</div>
<div class="flex-1 overflow-y-auto p-4 space-y-1">
{#if loading}
<p class="text-muted text-sm">전사 불러오는 중…</p>
{:else if error}
<p class="text-error text-sm">{error}</p>
{:else if segments.length === 0}
<p class="text-muted text-sm">전사 결과가 아직 없습니다 (STT 처리 대기 중일 수 있음).</p>
{:else}
{#each segments as seg, i (seg.start)}
<button
type="button"
class="w-full text-left flex gap-3 px-2 py-1 rounded transition-colors {i === activeIdx ? 'bg-accent-muted' : 'hover:bg-surface-hover'}"
onclick={() => seekTo(seg.start)}
>
<span class="text-xs tabular-nums text-muted min-w-[3.5rem] mt-0.5">
{formatTime(seg.start)}
</span>
<span class="text-sm leading-relaxed">{seg.text}</span>
</button>
{/each}
{/if}
</div>
</div>
+29 -5
View File
@@ -25,6 +25,8 @@
StickyNote,
Newspaper,
Search,
Mic,
Film,
} from 'lucide-svelte';
// ─── 도메인 트리 (기존) ───
@@ -207,11 +209,33 @@
{/if}
</a>
<!--
§3 에서 채울 자리 — audio/video 네비:
<a href="/audio">Audio · {categoryCounts.audio}</a>
<a href="/video">Video · {categoryCounts.video}</a>
-->
<a
href="/audio"
class="flex items-center justify-between px-3 py-2 rounded-md text-sm transition-colors
{currentPath.startsWith('/audio') ? 'bg-accent/15 text-accent' : 'text-text hover:bg-surface-hover'}"
>
<span class="flex items-center gap-2">
<Mic size={16} />
오디오
</span>
{#if categoryCounts.audio > 0}
<span class="text-xs text-dim">{categoryCounts.audio}</span>
{/if}
</a>
<a
href="/video"
class="flex items-center justify-between px-3 py-2 rounded-md text-sm transition-colors
{currentPath.startsWith('/video') ? 'bg-accent/15 text-accent' : 'text-text hover:bg-surface-hover'}"
>
<span class="flex items-center gap-2">
<Film size={16} />
비디오
</span>
{#if categoryCounts.video > 0}
<span class="text-xs text-dim">{categoryCounts.video}</span>
{/if}
</a>
<a
href="/ask"
@@ -0,0 +1,42 @@
<script>
// HTML5 비디오 플레이어 (direct play 전용).
// needsConversion=true 이면 재생 대신 안내 카드 표시 (§3 채널별 정책).
import { getAccessToken } from '$lib/api';
let { docId, needsConversion = false, fileFormat = '', title = '' } = $props();
let token = $derived(getAccessToken());
let src = $derived(`/api/documents/${docId}/file?token=${token}`);
let thumbSrc = $derived(`/api/video/${docId}/thumbnail?token=${token}`);
</script>
{#if needsConversion}
<div class="flex flex-col items-center justify-center h-full p-8 bg-surface text-center">
<div class="max-w-md space-y-3">
<p class="text-lg font-semibold">재생할 수 없는 포맷입니다</p>
<p class="text-sm text-muted">
{fileFormat ? `.${fileFormat}` : '현재 파일'} 포맷은 브라우저가 직접 재생할 수 없어 보관만 하고 있습니다.
</p>
<p class="text-sm text-muted">
재생하려면 원본을 <code class="px-1.5 py-0.5 bg-surface-hover rounded">mp4 (H.264/AAC)</code> 또는
<code class="px-1.5 py-0.5 bg-surface-hover rounded">webm (VP9)</code> 으로 변환 후 다시 올리세요.
</p>
{#if title}
<p class="text-xs text-muted pt-2">파일: {title}</p>
{/if}
</div>
</div>
{:else}
<div class="flex flex-col h-full bg-black">
<video
{src}
controls
preload="metadata"
poster={thumbSrc}
class="w-full h-full object-contain"
>
<track kind="captions" />
</video>
</div>
{/if}
+105
View File
@@ -0,0 +1,105 @@
<script>
// /audio — 오디오 문서 목록. §3
// `GET /api/documents?category=audio` 로 조회 후 카드 그리드로 렌더.
import { onMount } from 'svelte';
import { goto } from '$app/navigation';
import { api } from '$lib/api';
import { addToast } from '$lib/stores/toast';
import { Mic, FileAudio } from 'lucide-svelte';
let docs = $state([]);
let total = $state(0);
let loading = $state(true);
let currentPage = $state(1);
const PAGE_SIZE = 30;
onMount(load);
// Load the current page of audio documents (newest update first) into state.
// On failure a toast is shown and the list is emptied.
async function load() {
	loading = true;
	try {
		const query = new URLSearchParams({
			category: 'audio',
			page: String(currentPage),
			page_size: String(PAGE_SIZE),
			sort: 'updated_desc',
		});
		const data = await api(`/documents?${query}`);
		const items = data.items;
		const count = data.total;
		docs = items ?? [];
		total = count ?? 0;
	} catch (err) {
		addToast('error', '오디오 목록 불러오기 실패');
		docs = [];
	} finally {
		loading = false;
	}
}
// Navigate to the player page of the chosen audio document.
function pickOpen(id) {
	const target = `/audio/${id}`;
	goto(target);
}
// Human-readable file size: "-" for a falsy value, whole KB below 1 MiB,
// otherwise MB with one decimal.
function fmtSize(bytes) {
	if (!bytes) return '-';
	const megabytes = bytes / (1024 * 1024);
	if (megabytes < 1) {
		const kilobytes = bytes / 1024;
		return `${kilobytes.toFixed(0)} KB`;
	}
	return `${megabytes.toFixed(1)} MB`;
}
// Locale-formatted date for a timestamp value; empty string for a falsy value.
function fmtDate(s) {
	return s ? new Date(s).toLocaleDateString() : '';
}
</script>
<!-- Audio list page: header with total count, then loading / empty / grid states. -->
<div class="p-6 max-w-[1200px] mx-auto">
<header class="flex items-center gap-2 mb-4">
<Mic size={20} />
<h1 class="text-xl font-semibold">Audio</h1>
<span class="text-sm text-muted ml-2">{total}</span>
</header>
{#if loading}
<p class="text-muted">불러오는 중…</p>
{:else if docs.length === 0}
<p class="text-muted">오디오 문서가 없습니다. NAS <code>PKM/Recordings/</code> 에 드롭하거나 업로드하세요.</p>
{:else}
<!-- One card per document, keyed by id; clicking a card opens its detail page. -->
<div class="grid grid-cols-1 sm:grid-cols-2 md:grid-cols-3 gap-3">
{#each docs as d (d.id)}
<button
type="button"
class="text-left p-4 rounded-lg border border-default bg-surface hover:bg-surface-hover transition-colors"
onclick={() => pickOpen(d.id)}
>
<div class="flex items-start gap-2">
<FileAudio size={18} class="text-muted mt-0.5 shrink-0" />
<div class="flex-1 min-w-0">
<p class="font-medium truncate">{d.title || '(제목 없음)'}</p>
<p class="text-xs text-muted mt-1">.{d.file_format} · {fmtSize(d.file_size)}</p>
<p class="text-xs text-dim mt-0.5">{fmtDate(d.updated_at)}</p>
</div>
</div>
</button>
{/each}
</div>
<!-- Pager is rendered only when there is more than one page. -->
{#if total > PAGE_SIZE}
<div class="flex justify-center gap-2 mt-6">
<button
type="button"
class="px-3 py-1 rounded border border-default disabled:opacity-50"
disabled={currentPage === 1}
onclick={() => { currentPage -= 1; load(); }}
>이전</button>
<span class="px-3 py-1 text-sm text-muted">{currentPage} / {Math.ceil(total / PAGE_SIZE)}</span>
<button
type="button"
class="px-3 py-1 rounded border border-default disabled:opacity-50"
disabled={currentPage * PAGE_SIZE >= total}
onclick={() => { currentPage += 1; load(); }}
>다음</button>
</div>
{/if}
{/if}
</div>
@@ -0,0 +1,61 @@
<script>
// /audio/[id] — 단건 오디오 재생 + 전사 세그먼트.
import { page } from '$app/stores';
import { onMount } from 'svelte';
import { api } from '$lib/api';
import { addToast } from '$lib/stores/toast';
import AudioPlayer from '$lib/components/AudioPlayer.svelte';
import { ArrowLeft } from 'lucide-svelte';
import { goto } from '$app/navigation';
// Document id parsed from the `id` route parameter; Number() yields NaN for non-numeric text.
let docId = $derived(Number($page.params.id));
// The fetched document, or null before the first response and after a failed fetch.
let doc = $state(null);
// True while a fetch is running; the template shows a loading line meanwhile.
let loading = $state(true);
// Sequence number of the most recent load() call; lets a late response for a
// previously viewed id be recognised as stale and dropped.
let loadSeq = 0;

// $effect runs once after mount and again whenever `docId` changes, so it is
// the single trigger for fetching. A separate onMount(load) would issue the
// same request twice on first render (and show two error toasts on failure).
$effect(() => {
  if (Number.isInteger(docId) && docId > 0) {
    load(docId);
  } else {
    // Non-numeric or empty route parameter: skip the request and let the
    // template fall through to its "not found" state.
    loadSeq += 1; // invalidate any request still in flight
    doc = null;
    loading = false;
  }
});

/**
 * Fetch one document and store it in `doc`.
 *
 * On failure an error toast is shown and `doc` is reset to null. Results of a
 * call that has been superseded by a newer one are ignored.
 *
 * @param {number} [id=docId] Document id to fetch.
 * @returns {Promise<void>}
 */
async function load(id = docId) {
  const seq = ++loadSeq;
  loading = true;
  try {
    const fetched = await api(`/documents/${id}`);
    if (seq !== loadSeq) return; // a newer navigation took over
    doc = fetched;
  } catch {
    if (seq !== loadSeq) return;
    addToast('error', '문서를 불러올 수 없습니다');
    doc = null;
  } finally {
    // Only the latest request may clear the loading flag.
    if (seq === loadSeq) loading = false;
  }
}
</script>
<!-- Audio detail page: back button + title bar, then the player or a status message. -->
<div class="flex flex-col h-full">
<header class="flex items-center gap-2 px-4 py-2 border-b border-default">
<button
type="button"
class="p-1 rounded hover:bg-surface-hover"
onclick={() => goto('/audio')}
aria-label="목록으로"
>
<ArrowLeft size={18} />
</button>
<h1 class="font-medium truncate flex-1">
{doc?.title ?? (loading ? '불러오는 중…' : '(제목 없음)')}
</h1>
</header>
<div class="flex-1 overflow-hidden">
<!-- States in order: loading, fetch failed / missing, wrong category, playable audio. -->
{#if loading}
<p class="p-4 text-muted">불러오는 중…</p>
{:else if !doc}
<p class="p-4 text-error">문서를 찾을 수 없습니다.</p>
{:else if doc.category !== 'audio'}
<p class="p-4 text-error">이 문서는 오디오가 아닙니다 (category={doc.category}).</p>
{:else}
<AudioPlayer docId={doc.id} />
{/if}
</div>
</div>
+104
View File
@@ -0,0 +1,104 @@
<script>
// /video — 비디오 문서 목록. §3
// needs_conversion=true 는 재생 불가 배지 표시, 썸네일은 /api/video/{id}/thumbnail?token=.
import { onMount } from 'svelte';
import { goto } from '$app/navigation';
import { api, getAccessToken } from '$lib/api';
import { addToast } from '$lib/stores/toast';
import { Film, AlertTriangle } from 'lucide-svelte';
// Documents of the page currently shown (the `items` array of the list response).
let docs = $state([]);
// Total number of matching documents reported by the API; drives the header count and the pager.
let total = $state(0);
// True while a list request is running; the template shows a loading line instead of the grid.
let loading = $state(true);
// Page number sent as the `page` query parameter; starts at 1 and "previous" is disabled on 1.
let currentPage = $state(1);
// Page size sent as `page_size`; also used to compute the page count.
const PAGE_SIZE = 30;
// Fetch the first page once the component has mounted.
onMount(load);
// Sequence number of the most recent load() call. A response is applied only if
// no newer call has started, so a slow reply for an earlier page cannot
// overwrite the page the user navigated to afterwards.
let loadSeq = 0;

/**
 * Fetch the page selected by `currentPage` from `/documents?category=video`
 * and store the result in `docs` / `total`.
 *
 * On failure an error toast is shown and `docs` is emptied. Responses (and
 * failures) that belong to a superseded call are ignored entirely.
 *
 * @returns {Promise<void>}
 */
async function load() {
  const seq = ++loadSeq;
  loading = true;
  try {
    const params = new URLSearchParams({
      category: 'video',
      page: String(currentPage),
      page_size: String(PAGE_SIZE),
      sort: 'updated_desc',
    });
    const data = await api(`/documents?${params}`);
    if (seq !== loadSeq) return; // a newer request is in charge now
    docs = data.items ?? [];
    total = data.total ?? 0;
  } catch {
    if (seq !== loadSeq) return; // do not toast for a request nobody waits for
    addToast('error', '비디오 목록 불러오기 실패');
    docs = [];
  } finally {
    // Only the latest request may clear the loading flag.
    if (seq === loadSeq) loading = false;
  }
}
// Access token that is appended to thumbnail URLs as the `token` query parameter.
let token = $derived(getAccessToken());

/**
 * Build the thumbnail URL for one video document.
 *
 * The token is percent-encoded so characters such as `+`, `/`, `=` or `&`
 * cannot corrupt the query string, and a missing token becomes an empty value
 * instead of the literal text "null"/"undefined".
 *
 * @param {number|string} id Document id.
 * @returns {string} URL of `/api/video/{id}/thumbnail` including the token.
 */
function thumbSrc(id) {
  const encodedToken = encodeURIComponent(token ?? '');
  return `/api/video/${id}/thumbnail?token=${encodedToken}`;
}
/**
 * Format a byte count for the video card subtitle.
 * Falsy input (0, null, undefined, NaN) yields '-'; values below 1024 MiB are
 * shown as whole MB, larger ones as GB with one decimal.
 * @param {number} bytes
 * @returns {string}
 */
function fmtSize(bytes) {
  if (!bytes) {
    return '-';
  }
  const megabytes = bytes / (1024 * 1024);
  return megabytes < 1024
    ? `${megabytes.toFixed(0)} MB`
    : `${(megabytes / 1024).toFixed(1)} GB`;
}
</script>
<!-- Video list page: header with total count, then loading / empty / grid states. -->
<div class="p-6 max-w-[1400px] mx-auto">
<header class="flex items-center gap-2 mb-4">
<Film size={20} />
<h1 class="text-xl font-semibold">Video</h1>
<span class="text-sm text-muted ml-2">{total}</span>
</header>
{#if loading}
<p class="text-muted">불러오는 중…</p>
{:else if docs.length === 0}
<p class="text-muted">비디오 문서가 없습니다. NAS <code>PKM/Videos/</code> 에 드롭하거나 업로드하세요.</p>
{:else}
<!-- One card per document, keyed by id; clicking a card opens its detail page. -->
<div class="grid grid-cols-2 sm:grid-cols-3 md:grid-cols-4 gap-4">
{#each docs as d (d.id)}
<button
type="button"
class="text-left rounded-lg overflow-hidden border border-default bg-surface hover:bg-surface-hover transition-colors"
onclick={() => goto(`/video/${d.id}`)}
>
<div class="aspect-video bg-black relative flex items-center justify-center">
<!-- Thumbnail image when the document has one, otherwise a placeholder icon. -->
{#if d.thumbnail_path}
<img src={thumbSrc(d.id)} alt="" class="w-full h-full object-cover" loading="lazy" />
{:else}
<Film size={32} class="text-dim" />
{/if}
<!-- Badge for documents flagged needs_conversion. -->
{#if d.needs_conversion}
<span class="absolute top-1 left-1 px-1.5 py-0.5 rounded bg-warning text-xs flex items-center gap-1">
<AlertTriangle size={12} /> 변환 필요
</span>
{/if}
</div>
<div class="p-2">
<p class="text-sm font-medium truncate">{d.title || '(제목 없음)'}</p>
<p class="text-xs text-muted mt-0.5">.{d.file_format} · {fmtSize(d.file_size)}</p>
</div>
</button>
{/each}
</div>
<!-- Pager is rendered only when there is more than one page. -->
{#if total > PAGE_SIZE}
<div class="flex justify-center gap-2 mt-6">
<button type="button" class="px-3 py-1 rounded border border-default disabled:opacity-50"
disabled={currentPage === 1}
onclick={() => { currentPage -= 1; load(); }}>이전</button>
<span class="px-3 py-1 text-sm text-muted">{currentPage} / {Math.ceil(total / PAGE_SIZE)}</span>
<button type="button" class="px-3 py-1 rounded border border-default disabled:opacity-50"
disabled={currentPage * PAGE_SIZE >= total}
onclick={() => { currentPage += 1; load(); }}>다음</button>
</div>
{/if}
{/if}
</div>
@@ -0,0 +1,66 @@
<script>
// /video/[id] — 단건 비디오 재생 (direct play) 또는 quarantine 안내.
import { page } from '$app/stores';
import { onMount } from 'svelte';
import { api } from '$lib/api';
import { addToast } from '$lib/stores/toast';
import VideoPlayer from '$lib/components/VideoPlayer.svelte';
import { ArrowLeft } from 'lucide-svelte';
import { goto } from '$app/navigation';
// Document id parsed from the `id` route parameter; Number() yields NaN for non-numeric text.
let docId = $derived(Number($page.params.id));
// The fetched document, or null before the first response and after a failed fetch.
let doc = $state(null);
// True while a fetch is running; the template shows a loading line meanwhile.
let loading = $state(true);
// Sequence number of the most recent load() call; lets a late response for a
// previously viewed id be recognised as stale and dropped.
let loadSeq = 0;

// $effect runs once after mount and again whenever `docId` changes, so it is
// the single trigger for fetching. A separate onMount(load) would issue the
// same request twice on first render (and show two error toasts on failure).
$effect(() => {
  if (Number.isInteger(docId) && docId > 0) {
    load(docId);
  } else {
    // Non-numeric or empty route parameter: skip the request and let the
    // template fall through to its "not found" state.
    loadSeq += 1; // invalidate any request still in flight
    doc = null;
    loading = false;
  }
});

/**
 * Fetch one document and store it in `doc`.
 *
 * On failure an error toast is shown and `doc` is reset to null. Results of a
 * call that has been superseded by a newer one are ignored.
 *
 * @param {number} [id=docId] Document id to fetch.
 * @returns {Promise<void>}
 */
async function load(id = docId) {
  const seq = ++loadSeq;
  loading = true;
  try {
    const fetched = await api(`/documents/${id}`);
    if (seq !== loadSeq) return; // a newer navigation took over
    doc = fetched;
  } catch {
    if (seq !== loadSeq) return;
    addToast('error', '문서를 불러올 수 없습니다');
    doc = null;
  } finally {
    // Only the latest request may clear the loading flag.
    if (seq === loadSeq) loading = false;
  }
}
</script>
<!-- Video detail page: back button + title bar, then the player or a status message. -->
<div class="flex flex-col h-full">
<header class="flex items-center gap-2 px-4 py-2 border-b border-default">
<button
type="button"
class="p-1 rounded hover:bg-surface-hover"
onclick={() => goto('/video')}
aria-label="목록으로"
>
<ArrowLeft size={18} />
</button>
<h1 class="font-medium truncate flex-1">
{doc?.title ?? (loading ? '불러오는 중…' : '(제목 없음)')}
</h1>
</header>
<div class="flex-1 overflow-hidden">
<!-- States in order: loading, fetch failed / missing, wrong category, video player. -->
{#if loading}
<p class="p-4 text-muted">불러오는 중…</p>
{:else if !doc}
<p class="p-4 text-error">문서를 찾을 수 없습니다.</p>
{:else if doc.category !== 'video'}
<p class="p-4 text-error">이 문서는 비디오가 아닙니다 (category={doc.category}).</p>
{:else}
<!-- needs_conversion defaults to false when the field is absent from the response. -->
<VideoPlayer
docId={doc.id}
needsConversion={doc.needs_conversion ?? false}
fileFormat={doc.file_format}
title={doc.title}
/>
{/if}
</div>
</div>
+17
View File
@@ -0,0 +1,17 @@
-- 147_audio_segments_table.sql
-- Document Server unified platform, Section 3: audio_segments table definition (1/2)
-- plan: luminous-sprouting-hamster.md §3
--
-- Per the asyncpg single-statement rule this file only creates the table; the index lives in 148.
-- Stores the timestamped segments of an STT (faster-whisper) result.
-- The full transcript is stored in documents.extracted_text (reused by classify/embed);
-- this table lets AudioPlayer jump on a line click via audio.currentTime = start_s.
-- ON DELETE CASCADE — segments are cleaned up when the document row is physically deleted.
CREATE TABLE IF NOT EXISTS audio_segments (
id BIGSERIAL PRIMARY KEY,
document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
start_s REAL NOT NULL,
end_s REAL NOT NULL,
text TEXT NOT NULL
);
+8
View File
@@ -0,0 +1,8 @@
-- 148_audio_segments_idx.sql
-- Document Server unified platform, Section 3: audio_segments index (2/2)
-- plan: luminous-sprouting-hamster.md §3
--
-- Serves the AudioPlayer query path `WHERE document_id=? ORDER BY start_s`.
CREATE INDEX IF NOT EXISTS idx_audio_segments_doc_start
ON audio_segments(document_id, start_s);
+13
View File
@@ -0,0 +1,13 @@
-- 149_document_media_cols.sql
-- Document Server unified platform, Section 3: video playback / thumbnail columns
-- plan: luminous-sprouting-hamster.md §3
--
-- thumbnail_path: absolute path of PKM/Videos/.thumbs/{doc_id}.jpg (one ffmpeg frame at the 50% mark).
-- needs_conversion: quarantine flag for .mov/.mkv/.avi etc. that arrived via a NAS drop.
-- When true, VideoPlayer shows a "conversion required" card instead of playing.
-- v2.1 starts minimal with a boolean. If more states are needed later, extend to an enum in a separate migration.
-- Single ALTER TABLE statement (multiple ADD COLUMN clauses are OK; same pattern as §2 migration 144).
ALTER TABLE documents
ADD COLUMN IF NOT EXISTS thumbnail_path TEXT,
ADD COLUMN IF NOT EXISTS needs_conversion BOOLEAN NOT NULL DEFAULT false;
+11
View File
@@ -0,0 +1,11 @@
-- 150_queue_stage_stt.sql
-- Document Server unified platform, Section 3: add 'stt' to the processing_queue stage enum
-- plan: luminous-sprouting-hamster.md §3
--
-- Audio pipeline: stt → classify → embed (extract is skipped).
-- stt_worker picks up queue rows with category='audio', calls services/stt, then
-- stores documents.extracted_text + audio_segments.
-- This migration only extends the enum. The actual branching is done in queue_consumer,
-- based on the category column from §1.
-- NOTE(review): PostgreSQL restricts ALTER TYPE ... ADD VALUE inside transaction blocks
-- (details are version-dependent) — confirm how the migration runner wraps statements.
ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'stt';
+9
View File
@@ -0,0 +1,9 @@
-- 151_queue_stage_thumbnail.sql
-- Document Server unified platform, Section 3: add 'thumbnail' to the processing_queue stage enum
-- plan: luminous-sprouting-hamster.md §3
--
-- The video pipeline is a single thumbnail stage (leaf; no classify/embed).
-- thumbnail_worker picks up queue rows with category='video' + needs_conversion=false,
-- generates .thumbs/{doc_id}.jpg with ffmpeg and then sets documents.thumbnail_path.
-- NOTE(review): PostgreSQL restricts ALTER TYPE ... ADD VALUE inside transaction blocks
-- (details are version-dependent) — confirm how the migration runner wraps statements.
ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'thumbnail';
+21
View File
@@ -0,0 +1,21 @@
# Image for the STT service, based on the CUDA 12.4.1 + cuDNN runtime image for Ubuntu 22.04.
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
# Non-interactive apt, unbuffered Python output, and no pip download cache.
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1
# faster-whisper decodes through PyAV (ffmpeg not required), but the system
# ffmpeg is installed as well to cope with format variability.
# python3.10 = the Ubuntu 22.04 default.
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 python3-pip ffmpeg \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Install dependencies before copying the code so the pip layer is reused when only server.py changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY server.py .
# The service listens on port 3300 on all interfaces.
EXPOSE 3300
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3300"]
+3
View File
@@ -0,0 +1,3 @@
faster-whisper>=1.0.3,<2.0.0
fastapi>=0.110.0,<1.0.0
uvicorn[standard]>=0.27.0,<1.0.0
+140
View File
@@ -0,0 +1,140 @@
"""STT 마이크로서비스 — faster-whisper (GPU) 기반 음성 전사.
filePath → {text, segments:[{start,end,text}]}. 모델은 첫 요청 시 lazy loading.
기본 모델 large-v3 (VRAM ~3GB, float16). 환경변수로 교체 가능.
"""
import asyncio
import os
import threading
import unicodedata
from pathlib import Path

from fastapi import FastAPI
# ASGI application object; the route handlers in this module register on it.
app = FastAPI()
# Process-wide model instance; stays None until _load_model() creates it.
_model = None
# Model name, device and compute type; each can be overridden through its
# environment variable and otherwise falls back to the default shown here.
_MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
_DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
def _resolve_path(file_path: str) -> Path | None:
"""NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수. OCR 서비스와 동일 패턴."""
candidates = [
file_path,
unicodedata.normalize("NFD", file_path),
unicodedata.normalize("NFC", file_path),
]
for c in candidates:
p = Path(c)
if p.exists():
return p
# 마지막 fallback: parent 디렉토리에서 이름을 NFC 로 매칭
parent = Path(file_path).parent
if parent.exists():
target = unicodedata.normalize("NFC", Path(file_path).name)
for child in parent.iterdir():
if unicodedata.normalize("NFC", child.name) == target:
return child
return None
def _load_model():
    """Return the shared WhisperModel, creating it on the first call.

    The model is built from ``_MODEL_NAME``, ``_DEVICE`` and ``_COMPUTE_TYPE``
    and cached in the module-level ``_model``; later calls return the cached
    instance. ``faster_whisper`` is imported only when the model actually has
    to be created.
    """
    global _model
    if _model is None:
        from faster_whisper import WhisperModel

        _model = WhisperModel(
            _MODEL_NAME,
            device=_DEVICE,
            compute_type=_COMPUTE_TYPE,
        )
    return _model
def _cuda_device_count() -> int:
    """Return the CUDA device count reported by ctranslate2.

    Best effort: any failure (ctranslate2 not importable, the query raising)
    is reported as 0 devices.
    """
    try:
        import ctranslate2 as ct2

        device_count = ct2.get_cuda_device_count()
    except Exception:
        device_count = 0
    return device_count
@app.get("/health")
def health():
    """Liveness probe for the Docker healthcheck: a fixed payload, no model or GPU access."""
    payload = {
        "status": "ok",
        "service": "stt-faster-whisper",
    }
    return payload
@app.get("/ready")
def ready():
    """Readiness report for deployment checks: CUDA availability and model state.

    ``ready`` is true only when at least one CUDA device is visible and the
    model has already been loaded into ``_model``.
    """
    device_count = _cuda_device_count()
    has_cuda = device_count > 0
    model_is_loaded = _model is not None
    report = {
        "ready": has_cuda and model_is_loaded,
        "cuda": has_cuda,
        "cuda_device_count": device_count,
        "models_loaded": model_is_loaded,
        "model": _MODEL_NAME,
        "compute_type": _COMPUTE_TYPE,
    }
    return report
# Serializes model loading and decoding: only one worker thread uses the model
# at a time, and the lazy first load cannot run twice concurrently.
_TRANSCRIBE_LOCK = threading.Lock()


def _error_response(message: str) -> dict:
    """Build the error payload shape /transcribe uses for a missing file."""
    return {"error": message, "text": "", "segments": []}


def _transcribe_blocking(resolved: Path, language: str | None, beam_size: int) -> dict:
    """Run the blocking part of /transcribe: load the model if needed and decode the file."""
    with _TRANSCRIBE_LOCK:
        model = _load_model()
        segments_iter, info = model.transcribe(
            str(resolved),
            beam_size=beam_size,
            language=language,
            vad_filter=True,
        )
        # faster-whisper yields segments lazily (per its README), so the
        # iterator is consumed while the lock is still held.
        segments = [
            {
                "start": round(float(seg.start), 2),
                "end": round(float(seg.end), 2),
                "text": seg.text.strip(),
            }
            for seg in segments_iter
        ]
    return {
        "text": " ".join(s["text"] for s in segments).strip(),
        "segments": segments,
        "language": getattr(info, "language", None),
        "language_probability": float(getattr(info, "language_probability", 0.0) or 0.0),
        "duration": float(getattr(info, "duration", 0.0) or 0.0),
    }


@app.post("/transcribe")
async def transcribe(body: dict):
    """Transcribe an audio file.

    Input:
        {
            "filePath": "/documents/PKM/Recordings/2026-04-23_회의.mp3",
            "langs": ["ko"]?,   # exactly one language, or omit for auto-detection
            "beamSize": 5?      # default 5
        }

    Output:
        {
            "text": "full transcript",
            "segments": [{"start": 0.0, "end": 2.4, "text": "..."}, ...],
            "language": "ko",
            "language_probability": 0.99,
            "duration": 1832.5
        }

    Invalid input (missing/empty ``filePath``, non-integer or non-positive
    ``beamSize``, file not found) is reported as
    ``{"error": ..., "text": "", "segments": []}``. ``langs`` is honoured only
    when it is a list with exactly one entry; otherwise the language is left
    to auto-detection.

    The decoding itself runs in a worker thread so the event loop stays free
    to answer /health and /ready while a long transcription is in progress.
    """
    raw_path = body.get("filePath")
    if not isinstance(raw_path, str) or not raw_path:
        return _error_response("filePath 누락")

    try:
        beam_size = int(body.get("beamSize", 5))
    except (TypeError, ValueError):
        return _error_response(f"beamSize 값 오류: {body.get('beamSize')!r}")
    if beam_size < 1:
        return _error_response(f"beamSize 값 오류: {beam_size}")

    langs = body.get("langs")
    language = None
    if isinstance(langs, list) and len(langs) == 1:
        language = langs[0]

    resolved = _resolve_path(raw_path)
    if resolved is None:
        return _error_response(f"파일 없음: {raw_path}")

    return await asyncio.to_thread(_transcribe_blocking, resolved, language, beam_size)