feat(documents): S1-ADD dedup·원본명 3컬럼 + md_status success→completed 매핑 (A) + office→md PoC (C-1)

plan ds-s1-backend-1 (r5 수렴). 코드만 스테이징 — migration 미적용(restart 보류, E-2 Soft Lock 예외창).

A (앱 v1 디코딩 비파괴 최소선):
- A-1 migrations/287_documents_dedup_fields.sql: original_filename TEXT / duplicate_of BIGINT FK ON DELETE SET NULL / duplicate_count INTEGER NOT NULL DEFAULT 0. 단일 statement·PG16 fast-path·BEGIN/COMMIT 금지. backfill 미포함(B-4).
- A-2 app/models/document.py: 1계층 블록에 3 mapped_column (+ ForeignKey import). md_* 는 기존.
- A-3 app/api/documents.py: DocumentResponse 3필드(duplicate_count=0 non-opt) + DocumentDetailResponse field_validator(success→completed, mode=before) — read-time DB→API 단방향, write(ORM) 미적용.
- A-4 tests/test_s1_dedup_shape.py: success→completed 동작 + 비-success 통과 + 3필드 디폴트/roundtrip + ds-app contract fixture 디코드(skip-if-absent). py_compile OK.
★ backend 절반 — 전체 비파괴는 S3 render 테스트와 AND.

C-1 PoC (워커 미연결 — C-2 에서 marker_worker 분기 연결):
- app/workers/office_md.py: OOXML=markitdown(신규 dep, lazy) / hwp·hwpx=LibreOffice headless→HTML→markdownify(기존 dep). 실패·빈출력·타임아웃·dep부재 → OfficeMdError raise (success+빈md 금지 = C-5 postcondition 의 변환기 계약).
- scripts/poc_office_md.py: 표 fidelity 측정 하니스. E-1 = prod LibreOffice 버전핀 안전컨텍스트 실행(hwpx 필터 버전 의존).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 22:09:12 +09:00
parent 5a19cde38c
commit 68e2d7ea04
6 changed files with 426 additions and 2 deletions
@@ -22,7 +22,7 @@ from fastapi import (
    status,
 )
 from fastapi.responses import FileResponse
-from pydantic import BaseModel
+from pydantic import BaseModel, field_validator
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from starlette.requests import ClientDisconnect
@@ -113,6 +113,10 @@ class DocumentResponse(BaseModel):
    # 회독 추적 (자료실 등) — 현재 사용자 기준. 다른 endpoint 응답에선 0/None.
    read_count: int = 0
    last_read_at: datetime | None = None
+    # S1-ADD (migration 287): 원본 파일명 + 중복검사. 앱은 옵셔널 디코딩, 없으면 폴백.
+    original_filename: str | None = None   # 다운로드 라벨용. 없으면 file_path basename 폴백(앱 측).
+    duplicate_of: int | None = None        # canonical doc id (자기 자신이 canonical 이면 None).
+    duplicate_count: int = 0               # 본인 제외 동일 판정 사본 수 (canonical 행 기준).

    class Config:
        from_attributes = True
@@ -140,6 +144,16 @@ class DocumentDetailResponse(DocumentResponse):
    md_extraction_engine_version: str | None = None
    md_generated_at: datetime | None = None

+    @field_validator("md_status", mode="before")
+    @classmethod
+    def _db_success_to_completed(cls, v: str | None) -> str | None:
+        """Return 'completed' when v == 'success', otherwise v unchanged: the DB CHECK enum uses 'success', while the contract/fixture and the app's MD-first render trigger use 'completed'.
+        One-way read-time (DB→API) mapping only — the write path (ORM) does not pass through this model, so it is not applied there.
+        pending/processing/partial/failed/skipped are identical on both sides, so only 'success' is mapped.
+        (Invariant: md_status ∈ {success,partial} ⟹ md_content non-blank = worker postcondition, C-5.)
+        """
+        return "completed" if v == "success" else v
+

 class AcceptSuggestionRequest(BaseModel):
    """§1 accept-suggestion 요청 body — stale payload / doc 수정 검출."""
@@ -3,7 +3,7 @@
 from datetime import datetime

 from pgvector.sqlalchemy import Vector
-from sqlalchemy import BigInteger, Boolean, DateTime, Enum, Integer, String, Text
+from sqlalchemy import BigInteger, Boolean, DateTime, Enum, ForeignKey, Integer, String, Text
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.orm import Mapped, mapped_column

@@ -28,6 +28,19 @@ class Document(Base):
    )
    import_source: Mapped[str | None] = mapped_column(Text)

+    # 1계층: 원본명 + 중복검사 (S1-ADD, migration 287)
+    # original_filename = 업로드 원본 파일명(다운로드 라벨용). file_path 는 충돌 시 _N 리네임됨.
+    #   cf. original_format(ODF 변환용) / original_path·original_hash(007 legacy dead) 와 의미 구분.
+    # duplicate_of = canonical doc id (자기 자신이 canonical 이면 NULL). FK ON DELETE SET NULL.
+    # duplicate_count = canonical 행에 담는 '본인 제외 동일 판정 사본 수' (group_size-1). 업로드/backfill 가 갱신.
+    original_filename: Mapped[str | None] = mapped_column(Text)
+    duplicate_of: Mapped[int | None] = mapped_column(
+        BigInteger, ForeignKey("documents.id", ondelete="SET NULL")
+    )
+    duplicate_count: Mapped[int] = mapped_column(
+        Integer, nullable=False, default=0, server_default="0"
+    )
+
    # 2계층: 텍스트 추출
    extracted_text: Mapped[str | None] = mapped_column(Text)
    extracted_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
@@ -0,0 +1,134 @@
+"""Hybrid office/hwp → Markdown converter (plan ds-s1-backend-1, C-1 PoC).
+
+★ PoC status — not wired into marker_worker yet (that is C-2). This module provides only the conversion
+   *contract* and the plain functions called by the PoC harness (scripts/poc_office_md.py).
+
+Strategy (hybrid):
+  - OOXML(.docx/.xlsx/.pptx) → markitdown        ← new dependency (pip install markitdown). lazy import.
+  - .hwp/.hwpx               → LibreOffice(headless) → HTML → markdownify  ← markdownify is an existing dependency.
+    (LibreOffice ships an hwp import filter. .hwpx uses a different, version-dependent filter than .hwp → E-1: run the PoC
+     in a safe context pinned to the prod LibreOffice version. Table fidelity is the real risk — the harness measures it.)
+
+Failure contract (the backend half of the C-5 postcondition):
+  Conversion failure, empty output, timeout, or a missing dependency → raise OfficeMdError.
+  **Never return success + empty md** — the caller (C-2 marker_worker) catches it and
+  routes to md_status='failed' (¬success·¬skipped). Invariant: md_status ∈ {success,partial} ⟹ md_content non-blank.
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+OOXML_FORMATS = {".docx", ".xlsx", ".pptx"}  # lower-case suffixes handled by the markitdown branch
+HWP_FORMATS = {".hwp", ".hwpx"}  # lower-case suffixes handled by the LibreOffice branch
+SUPPORTED = OOXML_FORMATS | HWP_FORMATS
+
+# Empty-output threshold — after strip(), fewer characters than this is treated as 'failure (empty conversion)'.
+_MIN_BODY_CHARS = 16
+
+_SOFFICE_BIN = os.environ.get("LIBREOFFICE_BIN", "soffice")  # LibreOffice executable; env var is read once at import time
+
+
+class OfficeMdError(Exception):
+    """Signals a failed office/hwp → md conversion. Callers are expected to route it to md_status='failed'."""
+
+
+def convert_office_to_md(path: str | Path, *, timeout: int = 90) -> str:
+    """Convert an office/hwp file to a Markdown string; raise OfficeMdError for an unsupported suffix, a missing path, a converter failure, or output shorter than _MIN_BODY_CHARS after strip(). `timeout` (seconds) is forwarded only to the LibreOffice branch."""
+    p = Path(path)
+    suffix = p.suffix.lower()  # case-insensitive suffix match
+    if suffix not in SUPPORTED:
+        raise OfficeMdError(f"unsupported suffix for office_md: {suffix!r}")
+    if not p.exists():  # NOTE(review): exists() is also true for a directory; is_file() may be the intent — confirm
+        raise OfficeMdError(f"file not found: {p}")
+
+    if suffix in OOXML_FORMATS:
+        md = _via_markitdown(p)
+    else:  # .hwp / .hwpx
+        md = _via_libreoffice_html(p, timeout=timeout)
+
+    md = (md or "").strip()
+    if len(md) < _MIN_BODY_CHARS:  # enforces the "never return empty markdown" contract
+        raise OfficeMdError(f"empty/too-short conversion ({len(md)} chars) for {p.name}")
+    return md
+
+
+def _via_markitdown(path: Path) -> str:  # OOXML branch: returns the result's text_content, or "" when that attribute is missing or falsy
+    try:
+        from markitdown import MarkItDown  # lazy — new dependency
+    except ImportError as e:  # noqa: BLE001
+        raise OfficeMdError(
+            "markitdown 미설치 (OOXML 변환에 필요) — `pip install markitdown`. "
+            "C-1 PoC 는 prod worker 이미지/버전핀 컨텍스트에서 실행(E-1)."
+        ) from e
+    try:
+        result = MarkItDown().convert(str(path))
+    except Exception as e:  # noqa: BLE001 — any conversion exception is routed to failed
+        raise OfficeMdError(f"markitdown 변환 실패: {path.name}: {e}") from e
+    return getattr(result, "text_content", "") or ""
+
+
+def _via_libreoffice_html(path: Path, *, timeout: int) -> str:
+    """Convert to HTML with headless LibreOffice, then markdownify the result. Used for hwp/hwpx. Raises OfficeMdError when markdownify or the binary is missing, on timeout, on a non-zero exit code, or when the expected .html file is absent."""
+    try:
+        from markdownify import markdownify  # existing dependency
+    except ImportError as e:  # noqa: BLE001
+        raise OfficeMdError("markdownify 미설치(기존 의존성이어야 함)") from e
+
+    with tempfile.TemporaryDirectory(prefix="office_md_") as tmp:
+        tmpdir = Path(tmp)
+        # Avoid user-profile lock clashes when soffice runs concurrently — isolated profile per call.
+        profile = tmpdir / "lo_profile"
+        cmd = [
+            _SOFFICE_BIN,
+            "--headless",
+            "--nologo",
+            "--nofirststartwizard",
+            f"-env:UserInstallation=file://{profile}",  # NOTE(review): hand-built file URL, path is not percent-encoded (cf. Path.as_uri()) — confirm
+            "--convert-to",
+            "html",
+            "--outdir",
+            str(tmpdir),
+            str(path),
+        ]
+        try:
+            proc = subprocess.run(
+                cmd, capture_output=True, text=True, timeout=timeout, check=False
+            )
+        except FileNotFoundError as e:
+            raise OfficeMdError(
+                f"LibreOffice 바이너리 부재({_SOFFICE_BIN}) — LIBREOFFICE_BIN 설정 또는 설치 필요"
+            ) from e
+        except subprocess.TimeoutExpired as e:
+            raise OfficeMdError(f"LibreOffice 변환 타임아웃({timeout}s): {path.name}") from e
+
+        html_path = tmpdir / f"{path.stem}.html"  # presumably LibreOffice names the output after the input stem — verify
+        if proc.returncode != 0 or not html_path.exists():
+            raise OfficeMdError(
+                f"LibreOffice html 변환 실패: {path.name} (rc={proc.returncode}): "
+                f"{(proc.stderr or proc.stdout or '').strip()[:300]}"
+            )
+        html = html_path.read_text(encoding="utf-8", errors="replace")  # undecodable bytes are replaced instead of raising
+        # To preserve tables, markdownify is meant to render them as GFM — heading_style ATX.
+        return markdownify(html, heading_style="ATX", strip=["span", "font"])
+
+
+def table_fidelity(md: str) -> dict:
+    """Crude indicator of E-1 table fidelity — counts GFM table rows / separator rows (not a precise evaluation, only a regression signal)."""
+    lines = md.splitlines()
+    pipe_rows = sum(1 for ln in lines if ln.strip().startswith("|") and ln.strip().endswith("|"))
+    sep_rows = sum(
+        1 for ln in lines
+        if ln.strip().startswith("|") and set(ln.strip()) <= set("|-: ")  # NOTE(review): '-' is not required, so a row of only pipes/spaces (e.g. "| |") also counts
+    )
+    return {
+        "chars": len(md),
+        "lines": len(lines),
+        "table_pipe_rows": pipe_rows,
+        "table_separator_rows": sep_rows,  # approximates the number of tables
+        "has_heading": any(ln.lstrip().startswith("#") for ln in lines),
+    }