feat: Markdown 편집기 + PDF 변환 파이프라인 + 뷰어 포맷 분기

- Markdown split editor: textarea + marked preview, Ctrl+S 저장
- PUT /api/documents/{id}/content: 원본 파일 저장 + extracted_text 갱신
- GET /api/documents/{id}/preview: PDF 미리보기 캐시 서빙
- preview_worker: LibreOffice headless → PDF 변환 (timeout 60s, retry 1회)
- queue_consumer: preview stage 추가 (embed 후 자동 트리거)
- DocumentViewer: 포맷별 분기 (markdown/pdf/preview-pdf/image/text/cad)
- 오피스/CAD 문서: 새 탭 편집 버튼
- Dockerfile: LibreOffice headless 설치
- migration 005: preview_status, preview_hash, preview_at 컬럼

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-03 10:10:03 +09:00
parent 3546c8cefb
commit 4bea408bbd
7 changed files with 354 additions and 45 deletions

View File

@@ -2,6 +2,11 @@ FROM python:3.11-slim
WORKDIR /app
# LibreOffice headless (used for PDF conversion).
# Installs only the core/calc/writer/impress packages without recommended
# extras, and removes the apt cache and package lists in the same RUN.
RUN apt-get update && \
apt-get install -y --no-install-recommends libreoffice-core libreoffice-calc libreoffice-writer libreoffice-impress && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

View File

@@ -37,6 +37,7 @@ class DocumentResponse(BaseModel):
ai_tags: list | None
ai_summary: str | None
user_note: str | None
preview_status: str | None
source_channel: str | None
data_origin: str | None
extracted_at: datetime | None
@@ -298,6 +299,66 @@ async def update_document(
return DocumentResponse.model_validate(doc)
@router.put("/{doc_id}/content")
async def save_document_content(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    body: dict = None,
):
    """Save edited text back to the original file and refresh its metadata.

    Only documents whose ``file_format`` is ``md`` or ``txt`` are editable.
    The request body must carry the new text as a string under the
    ``content`` key. The text is written as UTF-8 to
    ``settings.nas_mount_path / doc.file_path``; ``file_size``,
    ``file_hash``, ``extracted_text`` (first 15000 characters) and
    ``updated_at`` are then updated and committed.

    Args:
        doc_id: Primary key of the document to overwrite.
        user: Resolved through the ``get_current_user`` dependency; not
            otherwise used in this handler.
        session: Database session used to load and update the document.
        body: Request body; expected shape is ``{"content": "<text>"}``.

    Returns:
        The updated document serialized as ``DocumentResponse``.

    Raises:
        HTTPException: 404 when the document does not exist; 400 when the
            format is not editable or ``content`` is missing or not a string.
    """
    doc = await session.get(Document, doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
    if doc.file_format not in ("md", "txt"):
        raise HTTPException(status_code=400, detail="편집 가능한 포맷이 아닙니다 (md, txt만 가능)")

    # Reject a missing/malformed payload instead of silently truncating the
    # stored file to zero bytes (or crashing with TypeError on non-strings).
    # An explicit empty string is still accepted: it means "clear the file".
    content = body.get("content") if isinstance(body, dict) else None
    if not isinstance(content, str):
        raise HTTPException(status_code=400, detail="content 필드(문자열)가 필요합니다")

    # Encode once so the recorded size is exactly the number of bytes written.
    encoded = content.encode("utf-8")
    file_path = Path(settings.nas_mount_path) / doc.file_path
    file_path.write_bytes(encoded)

    # Refresh metadata so it matches the file that is now on disk.
    doc.file_size = len(encoded)
    doc.file_hash = file_hash(file_path)
    doc.extracted_text = content[:15000]
    doc.updated_at = datetime.now(timezone.utc)
    await session.commit()
    return DocumentResponse.model_validate(doc)
@router.get("/{doc_id}/preview")
async def get_document_preview(
    doc_id: int,
    session: Annotated[AsyncSession, Depends(get_session)],
    token: str | None = Query(None, description="Bearer token (iframe용)"),
):
    """Serve the cached PDF preview of a document.

    The access token is taken from the ``token`` query parameter rather than
    a header (the parameter description marks it as intended for iframes).

    Raises:
        HTTPException: 401 when the token is absent, cannot be decoded, or
            is not of type ``access``; 404 when the document does not exist
            or no preview file has been generated for it yet.
    """
    from core.auth import decode_token

    # Guard: a token must be supplied at all.
    if not token:
        raise HTTPException(status_code=401, detail="토큰이 필요합니다")

    # Guard: the token must decode to an access-type payload.
    claims = decode_token(token)
    if not (claims and claims.get("type") == "access"):
        raise HTTPException(status_code=401, detail="유효하지 않은 토큰")

    # Guard: the document itself must exist.
    document = await session.get(Document, doc_id)
    if not document:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    # Previews are cached as <nas_mount_path>/PKM/.preview/<doc_id>.pdf.
    cached_pdf = Path(settings.nas_mount_path, "PKM", ".preview", f"{doc_id}.pdf")
    if not cached_pdf.exists():
        raise HTTPException(status_code=404, detail="미리보기가 아직 생성되지 않았습니다")

    # "inline" asks the browser to render the PDF instead of downloading it.
    return FileResponse(
        path=str(cached_pdf),
        media_type="application/pdf",
        headers={"Content-Disposition": "inline"},
    )
@router.delete("/{doc_id}")
async def delete_document(
doc_id: int,

View File

@@ -47,6 +47,11 @@ class Document(Base):
# 사용자 메모
user_note: Mapped[str | None] = mapped_column(Text)
# 미리보기
preview_status: Mapped[str | None] = mapped_column(String(20), default="none")
preview_hash: Mapped[str | None] = mapped_column(String(64))
preview_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
# 메타데이터
source_channel: Mapped[str | None] = mapped_column(
Enum("law_monitor", "devonagent", "email", "web_clip",

View File

@@ -0,0 +1,110 @@
"""PDF 미리보기 생성 워커 — LibreOffice Headless로 문서→PDF 변환"""
import subprocess
import shutil
from datetime import datetime, timezone
from pathlib import Path
from sqlalchemy.ext.asyncio import AsyncSession
from core.config import settings
from core.utils import setup_logger
logger = setup_logger("preview_worker")
# Formats that are converted to PDF.
CONVERTIBLE_FORMATS = {
    # Stable support
    "docx",
    "xlsx",
    "pptx",
    "odt",
    "ods",
    "odp",
    # Needs verification
    "odoc",
    "osheet",
    "hwp",
    "hwpx",
}

# Formats that are already PDF or need no conversion.
NATIVE_PDF = {"pdf"}
NATIVE_IMAGE = {
    "jpg",
    "jpeg",
    "png",
    "gif",
    "bmp",
    "tiff",
}
TEXT_FORMATS = {
    "md",
    "txt",
    "csv",
    "json",
    "xml",
    "html",
}

# Preview cache directory, relative to the NAS mount path.
PREVIEW_DIR_NAME = "PKM/.preview"
# Upper bound, in seconds, for a single conversion run.
TIMEOUT_SECONDS = 60
async def process(document_id: int, session: AsyncSession) -> None:
    """Generate the cached PDF preview for one document.

    Documents whose format is in ``NATIVE_PDF``, ``NATIVE_IMAGE`` or
    ``TEXT_FORMATS`` are not converted: PDFs are marked ``ready``, the others
    ``none``. Formats outside ``CONVERTIBLE_FORMATS`` (including a missing
    format) are marked ``none``. Everything else is converted with
    LibreOffice in headless mode and stored as
    ``<nas_mount_path>/PKM/.preview/<document_id>.pdf``.

    ``preview_status`` is set to ``processing`` before the conversion starts
    and to ``ready`` or ``failed`` afterwards. Conversion errors are logged
    and recorded on the document; they are not re-raised.

    Args:
        document_id: Primary key of the document to process.
        session: Database session used to load and update the document.
    """
    import asyncio
    import tempfile

    from models.document import Document

    doc = await session.get(Document, document_id)
    if not doc:
        logger.error(f"[preview] document_id={document_id} 없음")
        return

    # A missing format falls through to the "not convertible" branch below
    # instead of raising AttributeError on None.
    fmt = (doc.file_format or "").lower()

    # PDF / image / text documents need no conversion.
    if fmt in NATIVE_PDF or fmt in NATIVE_IMAGE or fmt in TEXT_FORMATS:
        doc.preview_status = "ready" if fmt in NATIVE_PDF else "none"
        doc.preview_at = datetime.now(timezone.utc)
        await session.commit()
        return

    if fmt not in CONVERTIBLE_FORMATS:
        doc.preview_status = "none"
        await session.commit()
        logger.info(f"[preview] {doc.title} — 변환 불가 포맷: {fmt}")
        return

    # Locate the original file.
    source = Path(settings.nas_mount_path) / doc.file_path
    if not source.exists():
        doc.preview_status = "failed"
        await session.commit()
        logger.error(f"[preview] 원본 없음: {source}")
        return

    # Make sure the preview cache directory exists.
    preview_dir = Path(settings.nas_mount_path) / PREVIEW_DIR_NAME
    preview_dir.mkdir(parents=True, exist_ok=True)
    output_path = preview_dir / f"{document_id}.pdf"

    doc.preview_status = "processing"
    await session.commit()

    try:
        # LibreOffice names its output "<source stem>.pdf", so a shared work
        # directory would let two documents with the same file stem overwrite
        # or pick up each other's result. Give every run its own directory.
        with tempfile.TemporaryDirectory(prefix="preview_work_") as work_dir:
            tmp_dir = Path(work_dir)

            # subprocess.run blocks for up to TIMEOUT_SECONDS; run it in a
            # worker thread so the event loop stays responsive meanwhile.
            result = await asyncio.to_thread(
                subprocess.run,
                [
                    "libreoffice", "--headless", "--convert-to", "pdf",
                    "--outdir", str(tmp_dir),
                    str(source),
                ],
                capture_output=True,
                text=True,
                timeout=TIMEOUT_SECONDS,
            )
            if result.returncode != 0:
                raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:200]}")

            # Find the conversion result.
            converted = tmp_dir / f"{source.stem}.pdf"
            if not converted.exists():
                raise RuntimeError(f"변환 결과물 없음: {converted}")

            # Move it into the cache before the work directory is removed.
            shutil.move(str(converted), str(output_path))

        doc.preview_status = "ready"
        # Remember which source hash this preview was generated from.
        doc.preview_hash = doc.file_hash
        doc.preview_at = datetime.now(timezone.utc)
        await session.commit()
        logger.info(f"[preview] {doc.title} → PDF 변환 완료")
    except subprocess.TimeoutExpired:
        doc.preview_status = "failed"
        await session.commit()
        logger.error(f"[preview] {doc.title} — 변환 timeout ({TIMEOUT_SECONDS}s)")
    except Exception as e:
        # Worker boundary: record the failure on the document and log it
        # rather than propagating the error to the caller.
        doc.preview_status = "failed"
        await session.commit()
        logger.error(f"[preview] {doc.title} — 변환 실패: {e}")

View File

@@ -11,7 +11,7 @@ from models.queue import ProcessingQueue
logger = setup_logger("queue_consumer")
# stage별 배치 크기
BATCH_SIZE = {"extract": 5, "classify": 3, "embed": 1}
BATCH_SIZE = {"extract": 5, "classify": 3, "embed": 1, "preview": 2}
STALE_THRESHOLD_MINUTES = 10
@@ -34,7 +34,7 @@ async def reset_stale_items():
async def enqueue_next_stage(document_id: int, current_stage: str):
"""현재 stage 완료 후 다음 stage를 pending으로 등록"""
next_stages = {"extract": "classify", "classify": "embed"}
next_stages = {"extract": "classify", "classify": "embed", "embed": "preview"}
next_stage = next_stages.get(current_stage)
if not next_stage:
return
@@ -63,11 +63,13 @@ async def consume_queue():
from workers.classify_worker import process as classify_process
from workers.embed_worker import process as embed_process
from workers.extract_worker import process as extract_process
from workers.preview_worker import process as preview_process
workers = {
"extract": extract_process,
"classify": classify_process,
"embed": embed_process,
"preview": preview_process,
}
await reset_stale_items()