diff --git a/app/Dockerfile b/app/Dockerfile
index 628cc31..b7dbea5 100644
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -2,6 +2,11 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
+# LibreOffice headless (for PDF conversion)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libreoffice-core libreoffice-calc libreoffice-writer libreoffice-impress && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
diff --git a/app/api/documents.py b/app/api/documents.py
index 714e3d0..31444d7 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -37,6 +37,7 @@ class DocumentResponse(BaseModel):
     ai_tags: list | None
     ai_summary: str | None
     user_note: str | None
+    preview_status: str | None
     source_channel: str | None
     data_origin: str | None
     extracted_at: datetime | None
@@ -298,6 +299,86 @@ async def update_document(
     return DocumentResponse.model_validate(doc)
 
 
+@router.put("/{doc_id}/content")
+async def save_document_content(
+    doc_id: int,
+    user: Annotated[User, Depends(get_current_user)],
+    session: Annotated[AsyncSession, Depends(get_session)],
+    body: dict = None,
+):
+    """Save an edited md/txt source file and refresh extracted_text.
+
+    Expects a JSON body of the form {"content": "<text>"}.
+
+    Raises:
+        HTTPException: 404 if the document does not exist; 400 if its
+            format is not editable or "content" is missing / not a string.
+    """
+    doc = await session.get(Document, doc_id)
+    if not doc:
+        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
+
+    if doc.file_format not in ("md", "txt"):
+        raise HTTPException(status_code=400, detail="편집 가능한 포맷이 아닙니다 (md, txt만 가능)")
+
+    # Reject a missing or malformed body instead of silently overwriting
+    # the stored file with an empty string.
+    content = body.get("content") if body else None
+    if not isinstance(content, str):
+        raise HTTPException(status_code=400, detail="content 필드가 필요합니다")
+
+    file_path = Path(settings.nas_mount_path) / doc.file_path
+    file_path.write_text(content, encoding="utf-8")
+
+    # Refresh metadata so it matches what was just written
+    doc.file_size = len(content.encode("utf-8"))
+    doc.file_hash = file_hash(file_path)
+    doc.extracted_text = content[:15000]
+    doc.updated_at = datetime.now(timezone.utc)
+    await session.commit()
+
+    return DocumentResponse.model_validate(doc)
+
+
+@router.get("/{doc_id}/preview")
+async def get_document_preview(
+    doc_id: int,
+    session: Annotated[AsyncSession, Depends(get_session)],
+    token: str | None = Query(None, description="Bearer token (iframe용)"),
+):
+    """Serve the cached PDF preview of a document.
+
+    The access token is read from the query string (the parameter is
+    described as being for iframe use) and must decode to type "access".
+
+    Raises:
+        HTTPException: 401 if the token is missing or not a valid access
+            token; 404 if the document or its cached preview is missing.
+    """
+    from core.auth import decode_token
+
+    if not token:
+        raise HTTPException(status_code=401, detail="토큰이 필요합니다")
+
+    payload = decode_token(token)
+    if not payload or payload.get("type") != "access":
+        raise HTTPException(status_code=401, detail="유효하지 않은 토큰")
+
+    doc = await session.get(Document, doc_id)
+    if not doc:
+        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
+
+    preview_path = Path(settings.nas_mount_path) / "PKM" / ".preview" / f"{doc_id}.pdf"
+    if not preview_path.exists():
+        raise HTTPException(status_code=404, detail="미리보기가 아직 생성되지 않았습니다")
+
+    return FileResponse(
+        path=str(preview_path),
+        media_type="application/pdf",
+        headers={"Content-Disposition": "inline"},
+    )
+
+
 @router.delete("/{doc_id}")
 async def delete_document(
     doc_id: int,
diff --git a/app/models/document.py b/app/models/document.py
index ce56d94..99e7499 100644
--- a/app/models/document.py
+++ b/app/models/document.py
@@ -47,6 +47,11 @@ class Document(Base):
     # 사용자 메모
     user_note: Mapped[str | None] = mapped_column(Text)
 
+    # Preview
+    preview_status: Mapped[str | None] = mapped_column(String(20), default="none")
+    preview_hash: Mapped[str | None] = mapped_column(String(64))
+    preview_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
+
     # 메타데이터
     source_channel: Mapped[str | None] = mapped_column(
         Enum("law_monitor", "devonagent", "email", "web_clip",
diff --git a/app/workers/preview_worker.py b/app/workers/preview_worker.py
new file mode 100644
index 0000000..997cf77
--- /dev/null
+++ b/app/workers/preview_worker.py
@@ -0,0 +1,135 @@
+"""PDF preview worker: converts documents to PDF with headless LibreOffice."""
+
+import asyncio
+import shutil
+import subprocess
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.config import settings
+from core.utils import setup_logger
+
+logger = setup_logger("preview_worker")
+
+# Formats converted to PDF
+CONVERTIBLE_FORMATS = {
+    "docx", "xlsx", "pptx", "odt", "ods", "odp",  # stable support
+    "odoc", "osheet", "hwp", "hwpx",  # needs verification
+}
+# Formats that are already PDF or need no conversion
+NATIVE_PDF = {"pdf"}
+NATIVE_IMAGE = {"jpg", "jpeg", "png", "gif", "bmp", "tiff"}
+TEXT_FORMATS = {"md", "txt", "csv", "json", "xml", "html"}
+
+PREVIEW_DIR_NAME = "PKM/.preview"
+TIMEOUT_SECONDS = 60
+
+
+def _convert_to_pdf(source: Path, output_path: Path) -> None:
+    """Convert *source* to PDF with LibreOffice and move it to *output_path*.
+
+    Blocking (runs a subprocess); call it from a worker thread.
+
+    Raises:
+        subprocess.TimeoutExpired: the conversion exceeded TIMEOUT_SECONDS.
+        RuntimeError: LibreOffice failed or produced no output file.
+    """
+    # A private scratch directory per run: a leftover PDF from an earlier
+    # conversion with the same file stem can never be mistaken for this
+    # run's result, and the directory is removed even on failure.
+    with tempfile.TemporaryDirectory(prefix="preview_work_") as work_dir:
+        result = subprocess.run(
+            [
+                "libreoffice", "--headless", "--convert-to", "pdf",
+                "--outdir", work_dir,
+                str(source),
+            ],
+            capture_output=True,
+            text=True,
+            timeout=TIMEOUT_SECONDS,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:200]}")
+
+        # The result is expected under the source file's stem
+        converted = Path(work_dir) / f"{source.stem}.pdf"
+        if not converted.exists():
+            raise RuntimeError(f"변환 결과물 없음: {converted}")
+
+        # Move into the preview cache
+        shutil.move(str(converted), str(output_path))
+
+
+async def process(document_id: int, session: AsyncSession) -> None:
+    """Generate the PDF preview for a document and record the outcome.
+
+    Sets preview_status to "ready", "none" or "failed" (and "processing"
+    while the conversion runs); conversion errors are logged, not raised.
+    """
+    from models.document import Document
+
+    doc = await session.get(Document, document_id)
+    if not doc:
+        logger.error(f"[preview] document_id={document_id} 없음")
+        return
+
+    # Read everything needed later before the first commit, in case the
+    # session expires loaded attributes on commit.
+    title = doc.title
+    source_hash = doc.file_hash
+    # A missing format falls through to the "not convertible" branch
+    fmt = (doc.file_format or "").lower()
+
+    # PDF, image and text formats need no conversion
+    if fmt in NATIVE_PDF or fmt in NATIVE_IMAGE or fmt in TEXT_FORMATS:
+        doc.preview_status = "ready" if fmt in NATIVE_PDF else "none"
+        doc.preview_at = datetime.now(timezone.utc)
+        await session.commit()
+        return
+
+    if fmt not in CONVERTIBLE_FORMATS:
+        doc.preview_status = "none"
+        await session.commit()
+        logger.info(f"[preview] {title} — 변환 불가 포맷: {fmt}")
+        return
+
+    # Source file path
+    source = Path(settings.nas_mount_path) / doc.file_path
+    if not source.exists():
+        doc.preview_status = "failed"
+        await session.commit()
+        logger.error(f"[preview] 원본 없음: {source}")
+        return
+
+    # Preview cache directory
+    preview_dir = Path(settings.nas_mount_path) / PREVIEW_DIR_NAME
+    preview_dir.mkdir(parents=True, exist_ok=True)
+    output_path = preview_dir / f"{document_id}.pdf"
+
+    doc.preview_status = "processing"
+    await session.commit()
+
+    try:
+        # LibreOffice can run for up to TIMEOUT_SECONDS; keep the event
+        # loop free by converting in a worker thread.
+        await asyncio.to_thread(_convert_to_pdf, source, output_path)
+
+        doc.preview_status = "ready"
+        doc.preview_hash = source_hash
+        doc.preview_at = datetime.now(timezone.utc)
+        await session.commit()
+        logger.info(f"[preview] {title} → PDF 변환 완료")
+
+    except subprocess.TimeoutExpired:
+        doc.preview_status = "failed"
+        await session.commit()
+        logger.error(f"[preview] {title} — 변환 timeout ({TIMEOUT_SECONDS}s)")
+
+    except Exception as e:
+        doc.preview_status = "failed"
+        await session.commit()
+        logger.error(f"[preview] {title} — 변환 실패: {e}")
diff --git a/app/workers/queue_consumer.py b/app/workers/queue_consumer.py
index 4117b89..589349e 100644
--- a/app/workers/queue_consumer.py
+++ b/app/workers/queue_consumer.py
@@ -11,7 +11,7 @@ from models.queue import ProcessingQueue
 logger = setup_logger("queue_consumer")
 
 # stage별 배치 크기
-BATCH_SIZE = {"extract": 5, "classify": 3, "embed": 1}
+BATCH_SIZE = {"extract": 5, "classify": 3, "embed": 1, "preview": 2}
 STALE_THRESHOLD_MINUTES = 10
 
 
@@ -34,7 +34,7 @@ async def reset_stale_items():
 
 async def enqueue_next_stage(document_id: int, current_stage: str):
     """현재 stage 완료 후 다음 stage를 pending으로 등록"""
-    next_stages = {"extract": "classify", "classify": "embed"}
+    next_stages = {"extract": "classify", "classify": "embed", "embed": "preview"}
     next_stage = next_stages.get(current_stage)
     if not next_stage:
         return
@@ -63,11 +63,13 @@ async def consume_queue():
     from workers.classify_worker import process as classify_process
     from workers.embed_worker import process as embed_process
     from workers.extract_worker import process as extract_process
+    from workers.preview_worker import process as preview_process
 
     workers = {
         "extract": extract_process,
         "classify": classify_process,
         "embed": embed_process,
+        "preview": preview_process,
     }
 
     await reset_stale_items()
diff --git a/frontend/src/lib/components/DocumentViewer.svelte b/frontend/src/lib/components/DocumentViewer.svelte
index 447e150..216d649 100644
--- a/frontend/src/lib/components/DocumentViewer.svelte
+++ b/frontend/src/lib/components/DocumentViewer.svelte
@@ -1,25 +1,44 @@
-
- {#if loading} -
-

로딩 중...

+ + +
+ + {#if fullDoc && !loading} +
+ {fullDoc.title || '제목 없음'} +
+ {#if viewerType === 'markdown'} + {#if editMode} + + + {:else} + + {/if} + {/if} + {#if editUrl} + + 편집 + + {/if} + 전체 보기 +
- {:else if fullDoc} - {#if viewerType === 'markdown' || viewerType === 'hwp-markdown'} -
- {@html marked(fullDoc.extracted_text || '*텍스트 추출 대기 중*')} -
- {:else if viewerType === 'pdf'} - - {:else if viewerType === 'image'} -
- {fullDoc.title} -
- {:else if viewerType === 'synology'} - - {:else} -
-
-

미리보기를 지원하지 않는 형식입니다

-

{fullDoc.file_format}

-
-
- {/if} {/if} + + +
+ {#if loading} +
+

로딩 중...

+
+ {:else if fullDoc} + {#if viewerType === 'markdown'} + {#if editMode} + +
+ +
+ {@html marked(editContent)} +
+
+ {:else} +
+ {@html marked(fullDoc.extracted_text || '*텍스트 추출 대기 중*')} +
+ {/if} + {:else if viewerType === 'pdf'} + + {:else if viewerType === 'preview-pdf'} + + {:else if viewerType === 'image'} +
+ {fullDoc.title} +
+ {:else if viewerType === 'text'} +
+
{fullDoc.extracted_text || '텍스트 없음'}
+
+ {:else if viewerType === 'cad'} +
+

CAD 미리보기 (향후 지원 예정)

+ AutoCAD Web에서 열기 +
+ {:else} +
+

미리보기를 지원하지 않는 형식입니다 ({fullDoc.file_format})

+
+ {/if} + {/if} +
diff --git a/migrations/005_preview_fields.sql b/migrations/005_preview_fields.sql
new file mode 100644
index 0000000..b153a36
--- /dev/null
+++ b/migrations/005_preview_fields.sql
@@ -0,0 +1,4 @@
+-- Add document preview status fields
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS preview_status VARCHAR(20) DEFAULT 'none';
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS preview_hash VARCHAR(64);
+ALTER TABLE documents ADD COLUMN IF NOT EXISTS preview_at TIMESTAMPTZ;