Files
hyungi_document_server/services/marker/server.py
T
Hyungi Ahn 68fa86ea52 feat(markdown): persist extracted images with auth routes
Markdown Canonical Phase 1B.5 — marker 가 추출하던 이미지를 NAS 에 영구 저장하고
DB 메타 + 인증 라우트 + 프론트 swap 까지 wiring.

핵심 변경:
- marker-service /convert 응답에 base64 image 리스트 포함 (stateless 유지, NAS write 권한 X)
- marker_worker 가 NAS `/documents/extracted_images/{doc_id}/` 에 persist + UPSERT +
  고아 row DELETE + md_content ref 를 `docimg:img_NNN` stable scheme 으로 정규화
- /api/documents/{id}/images/{key}/raw 인증 라우트 (Cache-Control private + ETag = content_hash)
- frontend MarkdownDoc 가 placeholder card 안의 docimg ref 를 실제 <img> 로 swap

원칙:
- 이미지 binary = NAS, metadata = Postgres (학습 섹션 패턴 동일)
- image_key sequence 기반 결정적 → 재변환 idempotent
- MARKDOWN_IMAGE_PERSIST=false env 로 rollback 가능 (placeholder card 폴백 자연 유지)

기존 28건 marker success 문서는 본 PR 에서 건드리지 않음 — deploy + 신규 업로드 1건 +
sample 5건 검증 후 scripts/marker_reprocess_existing_success.py 로 targeted reprocess.

plan: ~/.claude/plans/piped-humming-crystal.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 14:05:41 +09:00

222 lines
7.5 KiB
Python

"""marker-service — POST /convert: PDF → markdown + 추출 이미지 base64.
Phase 1B (2026-05-01) — 텍스트만 응답, 이미지 폐기.
Phase 1B.5 (본 변경) — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이
없는 stateless 변환기 유지 (fastapi 가 NAS persist 담당).
plan: ~/.claude/plans/piped-humming-crystal.md
"""
import base64
import hashlib
import importlib.metadata
import io
import logging
import os
import threading
import time
from pathlib import Path

# Cache locations must be exported BEFORE marker and its dependencies are
# imported: libraries that resolve their cache directory at import time
# (huggingface_hub does this for HF_HOME) would otherwise ignore these
# defaults. setdefault keeps any value already provided by the environment.
os.environ.setdefault("HF_HOME", "/models/huggingface")
os.environ.setdefault("TORCH_HOME", "/models/torch")

from fastapi import FastAPI, HTTPException, Response  # noqa: E402
from pydantic import BaseModel, Field  # noqa: E402
from marker.converters.pdf import PdfConverter  # noqa: E402
from marker.models import create_model_dict  # noqa: E402
from marker.output import text_from_rendered  # noqa: E402
import marker as marker_module  # noqa: E402

logger = logging.getLogger(__name__)
app = FastAPI()

# Populated by _ensure_warmup(); None until the models have been loaded.
_models = None
_converter = None

# Installed marker-pdf distribution version, reported in every response.
# Best-effort: any lookup failure degrades to "unknown" instead of
# preventing the service from starting.
try:
    _engine_version = importlib.metadata.version("marker-pdf")
except Exception:
    _engine_version = "unknown"

# Warmup state shared between the startup task, /ready and /convert.
_warmup_done = False
_warmup_error: str | None = None
_warmup_lock = threading.Lock()

# Cap on the number of images returned per document, to keep the base64
# response from blowing up. When the user PDF pool was measured (Phase 1D) the
# most image-heavy document had roughly 30 images, so 200 leaves a safe margin.
# When the cap is exceeded the response carries a truncation flag.
MAX_IMAGES_PER_DOC = int(os.getenv("MARKER_MAX_IMAGES_PER_DOC", "200"))

# Maximum raw size of a single encoded image (before base64). Guards against
# graphics-heavy full-page scans.
MAX_BYTES_PER_IMAGE = int(os.getenv("MARKER_MAX_BYTES_PER_IMAGE", str(10 * 1024 * 1024)))
def _ensure_warmup() -> None:
    """Load the marker models and build the shared converter, at most once.

    Invoked by the startup hook and by /convert. Once a warmup has succeeded
    the call is a no-op. On failure the error text is stored in
    ``_warmup_error``, the traceback is logged and the exception is re-raised;
    ``_warmup_done`` stays False, so a later call attempts the warmup again.
    """
    global _models, _converter, _warmup_done, _warmup_error

    if not _warmup_done:
        with _warmup_lock:
            # Re-check while holding the lock: another caller may have
            # completed the warmup while this one was waiting.
            if not _warmup_done:
                try:
                    logger.info("[marker-service] warmup start")
                    _models = create_model_dict()
                    _converter = PdfConverter(artifact_dict=_models)
                    # Flip the flag only after the converter exists, and clear
                    # any error left behind by an earlier failed attempt.
                    _warmup_done = True
                    _warmup_error = None
                    logger.info(f"[marker-service] warmup done engine_version={_engine_version}")
                except Exception as exc:
                    _warmup_error = f"{type(exc).__name__}: {exc}"
                    logger.exception("[marker-service] warmup failed")
                    raise
@app.on_event("startup")
async def startup():
    """Start model warmup in a worker thread when the application starts.

    The hook returns immediately so the server can begin answering requests;
    /ready reports whether the warmup has finished or failed.
    """
    import asyncio

    def _retrieve_outcome(task) -> None:
        # _ensure_warmup already logs the failure and records it in
        # _warmup_error. Fetching the exception here only marks it as
        # retrieved, so asyncio does not additionally report
        # "Task exception was never retrieved".
        if not task.cancelled():
            task.exception()

    warmup_task = asyncio.create_task(asyncio.to_thread(_ensure_warmup))
    # The event loop keeps only weak references to tasks; hold a strong
    # reference for the lifetime of the app so the task cannot be garbage
    # collected before it completes.
    app.state.warmup_task = warmup_task
    warmup_task.add_done_callback(_retrieve_outcome)
class ConvertRequest(BaseModel):
    """Request body accepted by POST /convert."""

    # Filesystem path of the document to convert; the handler checks that it
    # is an existing regular file before handing it to the converter.
    file_path: str
    # Optional page limit.
    # NOTE(review): the /convert handler in this file never reads this field —
    # confirm whether it is meant to be wired into the converter.
    max_pages: int | None = Field(default=None)
class ConvertImage(BaseModel):
    """A single image extracted by marker, carried inline as base64.

    The consuming fastapi side writes it to NAS and normalizes the markdown
    references to the ``docimg:img_NNN`` scheme.
    """

    # Original slug assigned by marker (e.g. '_page_0_Picture_3.jpeg').
    slug: str
    # Lower-case encoding name: 'png' | 'jpeg' | 'webp' | 'gif'.
    format: str
    # Pixel dimensions, when known.
    width: int | None = Field(default=None)
    height: int | None = Field(default=None)
    # Base64-encoded raw image bytes.
    bytes_b64: str
class ConvertResponse(BaseModel):
    """Response body returned by POST /convert."""

    # Converted markdown text and the SHA-256 hex digest of its UTF-8 bytes.
    md_content: str
    md_content_hash: str
    # Engine identification ("marker" plus the installed package version).
    engine: str
    engine_version: str
    # Wall-clock time spent converting, in milliseconds.
    elapsed_ms: int
    # Free-form counters (page count, images extracted / returned).
    raw_metrics: dict
    # Extracted images; empty when the document yielded none.
    images: list[ConvertImage] = Field(default_factory=list)
    # True when more images were extracted than the per-document cap allows.
    images_truncated: bool = Field(default=False)
@app.get("/ready")
async def ready(response: Response):
    """Report warmup state: 200 once ready, 503 while warming up or after a failure.

    A recorded warmup error takes precedence over the "still warming up"
    state and is echoed back in the ``error`` field.
    """
    failure = _warmup_error
    if failure:
        state = "warmup_failed"
    elif not _warmup_done:
        state = "warming_up"
    else:
        state = "ready"

    payload = {
        "status": state,
        "engine": "marker",
        "engine_version": _engine_version,
    }
    if failure:
        payload["error"] = failure
    if state != "ready":
        # Set the status on the injected Response so the JSON body is still
        # returned alongside the 503.
        response.status_code = 503
    return payload
@app.post("/convert", response_model=ConvertResponse)
async def convert(req: ConvertRequest):
    """Convert the document at ``req.file_path`` to markdown plus base64 images.

    Returns:
        ConvertResponse with the markdown, its SHA-256 hash, engine info,
        elapsed conversion time, counters and the serialized images.

    Raises:
        HTTPException(503): model warmup failed (``code="warmup_failed"``).
        HTTPException(404): the path is not an existing regular file
            (``code="file_not_found"``).
        HTTPException(422): the converter raised
            (``code="conversion_failed"``).
    """
    import asyncio

    if not _warmup_done:
        try:
            # Warmup may wait on the warmup lock or load models for a long
            # time; run it in a worker thread so the event loop (and /ready)
            # stays responsive while a request waits for it.
            await asyncio.to_thread(_ensure_warmup)
        except Exception as exc:
            # _ensure_warmup has already logged the traceback. Report the
            # failure the same way /ready does (503) instead of letting it
            # surface as an opaque 500.
            raise HTTPException(
                status_code=503,
                detail={
                    "code": "warmup_failed",
                    "message": f"{type(exc).__name__}: {exc}",
                },
            ) from exc

    p = Path(req.file_path)
    if not p.is_file():
        raise HTTPException(404, detail={"code": "file_not_found", "message": str(p)})

    # NOTE(review): req.max_pages is accepted by the request model but is not
    # applied here — confirm whether it should be.
    # NOTE(review): the conversion itself still runs on the event loop, which
    # also serializes concurrent conversions. Offloading it to a thread would
    # allow concurrent use of the shared converter — confirm that is safe
    # before changing it.
    start = time.monotonic()
    try:
        rendered = _converter(str(p))
    except Exception as exc:
        logger.exception(f"[marker-service] conversion failed path={p}: {exc}")
        raise HTTPException(
            status_code=422,
            detail={
                "code": "conversion_failed",
                "message": f"{type(exc).__name__}: {exc}",
            },
        ) from exc

    md_text, _meta, raw_images = text_from_rendered(rendered)
    # Timing covers conversion and text extraction, not image serialization.
    elapsed_ms = int((time.monotonic() - start) * 1000)
    images_payload, truncated = _serialize_images(raw_images, str(p))

    return ConvertResponse(
        md_content=md_text,
        md_content_hash=hashlib.sha256(md_text.encode("utf-8")).hexdigest(),
        engine="marker",
        engine_version=_engine_version,
        elapsed_ms=elapsed_ms,
        raw_metrics={
            "page_count": getattr(rendered, "page_count", None),
            "image_count_extracted": len(raw_images) if raw_images else 0,
            "image_count_returned": len(images_payload),
        },
        images=images_payload,
        images_truncated=truncated,
    )
def _serialize_images(raw_images, src_path: str) -> tuple[list[ConvertImage], bool]:
    """Encode marker's extracted images as a list of base64 ``ConvertImage``.

    ``raw_images`` is iterated via ``.items()`` as (slug, image) pairs; each
    image is re-encoded in memory through its ``save`` method.
    ``src_path`` is used only in log messages.

    Guards:
    - more than MAX_IMAGES_PER_DOC entries: only the head is processed and
      the returned flag is True
    - an image that fails to serialize is skipped with a warning; the rest
      of the document is unaffected
    - an image whose encoded size exceeds MAX_BYTES_PER_IMAGE is skipped
      with a warning

    Returns:
        (images, truncated) — the serialized images and whether the
        per-document cap cut the list short.
    """
    if not raw_images:
        return [], False

    items = list(raw_images.items())
    over_cap = len(items) > MAX_IMAGES_PER_DOC
    if over_cap:
        logger.warning(
            f"[marker-service] images truncated path={src_path} "
            f"total={len(items)} cap={MAX_IMAGES_PER_DOC}"
        )
        items = items[:MAX_IMAGES_PER_DOC]

    # Formats passed through as-is; anything else (or a missing format) is
    # re-encoded as PNG, which is lossless.
    passthrough = {"PNG", "JPEG", "WEBP", "GIF"}

    serialized: list[ConvertImage] = []
    for slug, image in items:
        try:
            declared = (image.format or "PNG").upper()
            encoding = declared if declared in passthrough else "PNG"

            sink = io.BytesIO()
            image.save(sink, format=encoding)
            raw_bytes = sink.getvalue()

            if len(raw_bytes) <= MAX_BYTES_PER_IMAGE:
                serialized.append(
                    ConvertImage(
                        slug=slug,
                        format=encoding.lower(),
                        width=image.width,
                        height=image.height,
                        bytes_b64=base64.b64encode(raw_bytes).decode("ascii"),
                    )
                )
            else:
                logger.warning(
                    f"[marker-service] image too large skipped path={src_path} "
                    f"slug={slug} bytes={len(raw_bytes)} cap={MAX_BYTES_PER_IMAGE}"
                )
        except Exception as exc:
            # Best-effort: one bad image must not fail the whole conversion.
            logger.warning(
                f"[marker-service] image serialize failed path={src_path} "
                f"slug={slug}: {type(exc).__name__}: {exc}"
            )

    return serialized, over_cap