feat(canonical): Phase 1B marker-service + marker_worker for PDF→markdown (222)
신규 컨테이너 marker-service (port 3300, Marker 1.10.2 + surya 0.17.1 + HF cache volume). marker_worker 가 markdown stage 큐 소비: classify_worker → enqueue 'markdown' (leaf, embed/chunk 와 독립) → SKIP_DOC_TYPES (발주서/세금계산서/명세표) 스킵 → 확장자 != .pdf 스킵 (Phase 1B = PDF only) → page_count > 200 스킵 → marker-service POST /convert → 422/404 = doc-level failed, 5xx = queue retry 안정성 장치: - migration 222: ALTER TYPE process_stage ADD VALUE markdown (단일 statement) - md_extraction_quality JSONB dict 직접 저장 - skip 시 md_content/hash NULL 클리어 - /ready Response.status_code + warmup_error 가시화 - HF cache volume (build-time download 0) - file_path 는 NAS 상대경로 → /documents prefix prepend 성공 기준: 파이프라인 안정성. markdown 품질은 Phase 1D pilot. Pre-flight (2026-05-01): - marker-pdf 1.10.2 stable - file_path 9503건 NAS 상대경로 - DOCUMENT_TYPES 한국어 7종 → SKIP alias 보강 - queue retry max_attempts=3 + reset_stale_items 확인 - main 220/221 study_q_related 선점 → 222 rebump Plan: ~/.claude/plans/plan-idempotent-sundae.md (Round 5 approved) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,22 @@
|
||||
# Image for marker-service: installs the Python dependencies, copies server.py
# and serves it with uvicorn on port 3300.
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# System packages; curl is what the HEALTHCHECK below invokes.
# NOTE(review): libgl1 / libglib2.0-0 are presumably native runtime libraries
# needed by the imaging stack - confirm before removing.
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libgl1 libglib2.0-0 curl \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# requirements.txt is copied on its own, before server.py.
COPY requirements.txt .
|
||||
# The extra index is the PyTorch cu126 wheel index; requirements.txt pins
# torch/torchvision with the matching +cu126 local version tag.
RUN pip install --no-cache-dir \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu126 \
|
||||
-r requirements.txt
|
||||
|
||||
# Models are NOT downloaded at build time (HF cache volume -> they are loaded on the first call / warmup).
|
||||
|
||||
COPY server.py .
|
||||
|
||||
EXPOSE 3300
|
||||
# Health is probed through GET /ready; `curl -f` exits non-zero on an HTTP error
# status, and server.py answers 503 until warmup has completed.
# NOTE(review): the 300s start period presumably covers model warmup - confirm.
HEALTHCHECK --start-period=300s --interval=30s --timeout=10s --retries=3 \
|
||||
CMD curl -f http://localhost:3300/ready || exit 1
|
||||
|
||||
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3300"]
|
||||
@@ -0,0 +1,9 @@
|
||||
torch==2.11.0+cu126
|
||||
torchvision==0.26.0+cu126
|
||||
transformers==4.57.6
|
||||
surya-ocr==0.17.1
|
||||
marker-pdf==1.10.2
|
||||
pymupdf>=1.24.0,<2.0.0
|
||||
fastapi>=0.110.0,<1.0.0
|
||||
uvicorn[standard]>=0.27.0,<1.0.0
|
||||
pillow>=10.0.0,<12.0.0
|
||||
@@ -0,0 +1,136 @@
|
||||
"""marker-service — POST /convert: PDF → markdown (텍스트만, 이미지 제외).
|
||||
|
||||
Phase 1B Round 5 — /ready 정확한 status code, warmup 실패 가시화, 변환 실패 = 422.
|
||||
plan: ~/.claude/plans/plan-idempotent-sundae.md
|
||||
"""
|
||||
import hashlib
import importlib.metadata
import logging
import os
import threading
import time
from pathlib import Path

from fastapi import FastAPI, HTTPException, Response
from pydantic import BaseModel

# Cache locations must be in the environment BEFORE the model libraries are
# imported: libraries that resolve their cache directory at import time
# (huggingface_hub does this for HF_HOME) would otherwise never see these
# defaults. setdefault keeps any value already provided by the container.
os.environ.setdefault("HF_HOME", "/models/huggingface")
os.environ.setdefault("TORCH_HOME", "/models/torch")

from marker.converters.pdf import PdfConverter  # noqa: E402
from marker.models import create_model_dict  # noqa: E402
from marker.output import text_from_rendered  # noqa: E402
import marker as marker_module  # noqa: E402

logger = logging.getLogger(__name__)
app = FastAPI()


def _detect_engine_version() -> str:
    """Return the Marker version reported in /ready and /convert responses.

    Prefers ``marker.__version__``; when the package does not expose it, falls
    back to the installed ``marker-pdf`` distribution metadata, and only then
    to the literal ``"unknown"``.
    """
    declared = getattr(marker_module, "__version__", None)
    if declared is not None:
        return declared
    try:
        return importlib.metadata.version("marker-pdf")
    except importlib.metadata.PackageNotFoundError:
        return "unknown"


# Lazily populated by _ensure_warmup(); None until the models are loaded.
_models = None
_converter = None
_engine_version = _detect_engine_version()
# Warmup state read by /ready: done flag plus the last failure, if any.
_warmup_done = False
_warmup_error: str | None = None
# Serialises warmup so the models are only loaded by one thread at a time.
_warmup_lock = threading.Lock()
|
||||
|
||||
|
||||
def _ensure_warmup() -> None:
    """Load the Marker models and build the shared converter exactly once.

    Called from the startup hook and from /convert. The done flag is checked
    once without the lock (cheap path after warmup) and again while holding
    ``_warmup_lock`` so that concurrent callers do not load the models twice.

    Raises:
        Exception: whatever model loading or converter construction raised.
            The failure is also recorded in ``_warmup_error`` and logged, and
            ``_warmup_done`` stays False so a later call tries again.
    """
    global _models, _converter, _warmup_done, _warmup_error

    if not _warmup_done:
        with _warmup_lock:
            # Re-check: another thread may have finished while we waited.
            if not _warmup_done:
                try:
                    logger.info("[marker-service] warmup start")
                    _models = create_model_dict()
                    _converter = PdfConverter(artifact_dict=_models)
                    _warmup_done = True
                    _warmup_error = None
                    logger.info(f"[marker-service] warmup done engine_version={_engine_version}")
                except Exception as exc:
                    # Keep the reason around so /ready can report it.
                    _warmup_error = f"{type(exc).__name__}: {exc}"
                    logger.exception("[marker-service] warmup failed")
                    raise
|
||||
|
||||
|
||||
# Strong references to fire-and-forget tasks. The event loop itself only keeps
# weak references, so an otherwise unreferenced task may be garbage-collected
# before it finishes.
_background_tasks: set = set()


def _finish_warmup_task(task) -> None:
    """Release the finished warmup task and mark its outcome as retrieved."""
    _background_tasks.discard(task)
    if not task.cancelled():
        # _ensure_warmup has already logged the failure and stored it in
        # _warmup_error; reading the exception here only prevents asyncio's
        # "Task exception was never retrieved" report.
        task.exception()


@app.on_event("startup")
async def startup():
    """Startup hook: start model warmup in a background thread.

    The hook returns immediately; /ready exposes whether warmup has completed
    or failed.
    """
    import asyncio

    task = asyncio.create_task(asyncio.to_thread(_ensure_warmup))
    _background_tasks.add(task)
    task.add_done_callback(_finish_warmup_task)
|
||||
|
||||
|
||||
class ConvertRequest(BaseModel):
    """JSON body accepted by POST /convert."""

    # Path of the file to convert; the handler checks it on this service's
    # own filesystem.
    file_path: str

    # Optional page limit, None when omitted.
    # NOTE(review): the /convert handler in this file does not read this
    # field - confirm whether it is meant to be enforced here.
    max_pages: int | None = None
|
||||
|
||||
|
||||
class ConvertResponse(BaseModel):
    """JSON body returned by a successful POST /convert."""

    # Markdown text produced for the document.
    md_content: str

    # SHA-256 hex digest of md_content encoded as UTF-8.
    md_content_hash: str

    # Engine name; the handler always reports "marker".
    engine: str

    # Engine version string, the same value /ready reports.
    engine_version: str

    # Conversion time in whole milliseconds, measured with a monotonic clock.
    elapsed_ms: int

    # Extra counters; the handler fills "page_count" and
    # "image_count_extracted".
    raw_metrics: dict
|
||||
|
||||
|
||||
@app.get("/ready")
async def ready(response: Response):
    """Report warmup state: 200 when ready, 503 while warming up or failed.

    The body always carries ``status``, ``engine`` and ``engine_version``; a
    failed warmup additionally carries ``error`` with the recorded reason.
    """

    def _payload(status: str) -> dict:
        # Shared shape of every /ready answer, in a fixed key order.
        return {
            "status": status,
            "engine": "marker",
            "engine_version": _engine_version,
        }

    if _warmup_error:
        response.status_code = 503
        body = _payload("warmup_failed")
        body["error"] = _warmup_error
        return body

    if _warmup_done:
        return _payload("ready")

    response.status_code = 503
    return _payload("warming_up")
|
||||
|
||||
|
||||
# Only one conversion runs at a time. The handler used to get this implicitly
# by blocking the event loop; now that the work happens in worker threads the
# guarantee is kept with an explicit lock.
_convert_lock = threading.Lock()


def _run_conversion(path: str):
    """Convert *path* with the shared converter, one document at a time.

    Returns:
        A ``(started, rendered)`` tuple: the monotonic timestamp taken once the
        lock was acquired (so queueing time is not counted as conversion time)
        and the converter's result.
    """
    with _convert_lock:
        started = time.monotonic()
        return started, _converter(path)


@app.post("/convert", response_model=ConvertResponse)
async def convert(req: ConvertRequest):
    """Convert the PDF at ``req.file_path`` to markdown.

    Model warmup and the conversion itself are blocking, so both run in a
    worker thread; the event loop stays free to answer /ready (the container
    health check) while a document is being processed.

    ``req.max_pages`` is accepted by the request model but is not consulted
    here.

    Raises:
        HTTPException: 503 ``warmup_failed`` when the models cannot be loaded,
            404 ``file_not_found`` when the path is not an existing file,
            422 ``conversion_failed`` when the converter raises.
    """
    import asyncio

    try:
        await asyncio.to_thread(_ensure_warmup)
    except Exception as exc:
        # Still a 5xx, so callers that retry on server errors keep retrying,
        # but with a structured reason instead of a bare 500.
        raise HTTPException(
            status_code=503,
            detail={
                "code": "warmup_failed",
                "message": f"{type(exc).__name__}: {exc}",
            },
        ) from exc

    p = Path(req.file_path)
    if not p.is_file():
        raise HTTPException(404, detail={"code": "file_not_found", "message": str(p)})

    try:
        started, rendered = await asyncio.to_thread(_run_conversion, str(p))
    except Exception as exc:
        logger.exception("[marker-service] conversion failed path=%s: %s", p, exc)
        raise HTTPException(
            status_code=422,
            detail={
                "code": "conversion_failed",
                "message": f"{type(exc).__name__}: {exc}",
            },
        ) from exc

    # Only the markdown text and the image mapping are used; the middle
    # element of the tuple is ignored.
    md_text, _, images = text_from_rendered(rendered)
    elapsed_ms = int((time.monotonic() - started) * 1000)

    return ConvertResponse(
        md_content=md_text,
        md_content_hash=hashlib.sha256(md_text.encode("utf-8")).hexdigest(),
        engine="marker",
        engine_version=_engine_version,
        elapsed_ms=elapsed_ms,
        raw_metrics={
            # NOTE(review): falls back to None when the rendered object has no
            # page_count attribute - confirm the converter output exposes it.
            "page_count": getattr(rendered, "page_count", None),
            "image_count_extracted": len(images) if images else 0,
        },
    )
|
||||
Reference in New Issue
Block a user