` +
`` +
`🖼️` +
`${safeLabel}` +
@@ -65,8 +70,18 @@ docMarked.use({
const SANITIZE_OPTS = {
USE_PROFILES: { html: true },
// KaTeX (style + aria-hidden), heading anchor (id), 이미지 마킹 (data-md-img,
- // data-md-image-src — 1B.5 ImgAuth selector 용), figure caption (figure/figcaption)
- ADD_ATTR: ['style', 'aria-hidden', 'id', 'data-md-img', 'data-md-image-src', 'loading'],
+ // data-md-image-src + data-md-image-internal + data-md-image-alt — 1B.5 ImgAuth
+ // selector 용), figure caption (figure/figcaption)
+ ADD_ATTR: [
+ 'style',
+ 'aria-hidden',
+ 'id',
+ 'data-md-img',
+ 'data-md-image-src',
+ 'data-md-image-internal',
+ 'data-md-image-alt',
+ 'loading',
+ ],
ADD_TAGS: ['figure', 'figcaption'],
FORBID_TAGS: ['script', 'iframe', 'object', 'embed', 'link', 'meta'],
FORBID_ATTR: ['onerror', 'onclick', 'onload', 'onmouseover', 'onfocus'],
diff --git a/frontend/src/routes/documents/[id]/+page.svelte b/frontend/src/routes/documents/[id]/+page.svelte
index 3b950e3..bd39baa 100644
--- a/frontend/src/routes/documents/[id]/+page.svelte
+++ b/frontend/src/routes/documents/[id]/+page.svelte
@@ -249,6 +249,7 @@
{#if viewerType === 'markdown' || viewerType === 'hwp-markdown'}
{#if pdfViewMode === 'markdown' && canShowMarkdown}
{#if doc.md_content || doc.extracted_text}
dict:
+ quality = row.md_extraction_quality
+ return {
+ "id": row.id,
+ "md_status": row.md_status,
+ "md_content_hash": row.md_content_hash,
+ "md_extraction_engine": row.md_extraction_engine,
+ "md_extraction_engine_version": row.md_extraction_engine_version,
+ "md_extraction_quality": json.dumps(quality, ensure_ascii=False) if quality else "",
+ "md_generated_at": row.md_generated_at.isoformat() if row.md_generated_at else "",
+ "file_format": row.file_format,
+ "file_path": row.file_path,
+ "title": row.title or "",
+ }
+
+
+async def run(*, apply: bool, only_ids: set[int] | None, snapshot_csv: str | None) -> int:
+    """Snapshot the candidate documents and optionally enqueue them for markdown reprocessing.
+
+    Args:
+        apply: When false (dry-run) only the pre-snapshot is produced and
+            nothing is written to ``processing_queue``.
+        only_ids: Optional whitelist of document ids; candidates whose id is
+            not in the set are dropped. ``None`` or an empty set keeps all.
+        snapshot_csv: Path the pre-snapshot CSV is written to. When falsy the
+            CSV is printed to stdout instead.
+
+    Returns:
+        0 on every path that completes.
+    """
+    database_url = os.getenv(
+        "DATABASE_URL",
+        "postgresql+asyncpg://pkm:pkm@localhost:5432/pkm",
+    )
+
+    engine = create_async_engine(database_url)
+    session_factory = async_sessionmaker(
+        engine, class_=AsyncSession, expire_on_commit=False
+    )
+
+    try:
+        async with session_factory() as session:
+            rows = (await session.execute(text(CANDIDATES_SQL))).all()
+            if only_ids:
+                rows = [r for r in rows if r.id in only_ids]
+
+            print(f"=== marker success 후보 = {len(rows)}건 ===")
+            if not rows:
+                print("후보 없음 — 종료.")
+                return 0
+
+            # Build the pre-snapshot CSV in memory so it can go to a file or stdout.
+            buf = StringIO()
+            writer = csv.DictWriter(
+                buf,
+                fieldnames=[
+                    "id", "md_status", "md_content_hash", "md_extraction_engine",
+                    "md_extraction_engine_version", "md_extraction_quality",
+                    "md_generated_at", "file_format", "file_path", "title",
+                ],
+            )
+            writer.writeheader()
+            writer.writerows(_serialize_row(row) for row in rows)
+            csv_text = buf.getvalue()
+
+            if snapshot_csv:
+                # The csv writer already emitted its own line terminators;
+                # newline="" keeps the text layer from translating them again.
+                with open(snapshot_csv, "w", encoding="utf-8", newline="") as f:
+                    f.write(csv_text)
+                print(f"[snapshot] {snapshot_csv} 에 {len(rows)}행 기록")
+            else:
+                print("\n=== Pre-snapshot CSV ===")
+                print(csv_text)
+
+            if not apply:
+                print(f"\n[dry-run] {len(rows)}건 영향. --apply 로 실제 enqueue.")
+                return 0
+
+            # Enqueue. Per the original note, processing_queue has
+            # UNIQUE(document_id, stage) WHERE status IN ('pending', 'processing'),
+            # so only documents without an active markdown row are inserted and
+            # conflicts are skipped silently.
+            # CAST(... AS jsonb) is used instead of the `::jsonb` shorthand:
+            # a `::` directly after a `:name` bind collides with the
+            # bind-parameter parsing done by text().
+            enqueue_sql = text("""
+                INSERT INTO processing_queue (document_id, stage, status, payload)
+                VALUES (:doc_id, 'markdown', 'pending', CAST(:payload AS jsonb))
+                ON CONFLICT DO NOTHING
+            """)
+
+            payload = json.dumps({
+                "force_reprocess": True,
+                "reason": "phase_1b5_imgauth_targeted_reprocess",
+            })
+
+            inserted = 0
+            for row in rows:
+                result = await session.execute(
+                    enqueue_sql, {"doc_id": row.id, "payload": payload}
+                )
+                # rowcount stays 0 when ON CONFLICT DO NOTHING skipped the insert.
+                if result.rowcount > 0:
+                    inserted += 1
+
+            await session.commit()
+            print(f"\n[apply] enqueue 완료 — {inserted}/{len(rows)} 건 신규 markdown 큐 추가")
+            print("        (skip = 이미 활성 markdown 큐 행이 있는 문서)")
+            return 0
+    finally:
+        await engine.dispose()
+
+
+def _parse_only_ids(arg: str | None) -> set[int] | None:
+    """Parse a comma-separated id list into a set of ints.
+
+    Blank segments are ignored. Returns ``None`` when ``arg`` is empty or
+    contains no ids. A non-numeric segment raises ``ValueError``.
+    """
+    if not arg:
+        return None
+    tokens = (piece.strip() for piece in arg.split(","))
+    ids = {int(token) for token in tokens if token}
+    return ids if ids else None
+
+
+def main() -> int:
+    """Parse the command-line flags and run the job, returning its exit code."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--apply", action="store_true", help="실제 enqueue (기본 dry-run)")
+    cli.add_argument("--dry-run", action="store_true", help="명시적 dry-run (default 동등)")
+    cli.add_argument(
+        "--only",
+        type=str,
+        default=None,
+        help="쉼표 구분 doc_id 화이트리스트 (sample 검증용, 예: 4809,5127,5180)",
+    )
+    cli.add_argument(
+        "--snapshot-csv",
+        type=str,
+        default=None,
+        help="pre-snapshot 을 stdout 대신 이 경로의 CSV 파일로 저장",
+    )
+    opts = cli.parse_args()
+
+    # The two mode flags contradict each other, so reject the combination.
+    if opts.apply and opts.dry_run:
+        cli.error("--apply 와 --dry-run 동시 지정 불가")
+
+    job = run(
+        apply=opts.apply,
+        only_ids=_parse_only_ids(opts.only),
+        snapshot_csv=opts.snapshot_csv,
+    )
+    return asyncio.run(job)
+
+
+# Script entry point: use main()'s return value as the process exit status.
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/services/marker/server.py b/services/marker/server.py
index 0d566f0..39e35f9 100644
--- a/services/marker/server.py
+++ b/services/marker/server.py
@@ -1,9 +1,14 @@
-"""marker-service — POST /convert: PDF → markdown (텍스트만, 이미지 제외).
+"""marker-service — POST /convert: PDF → markdown + 추출 이미지 base64.
-Phase 1B Round 5 — /ready 정확한 status code, warmup 실패 가시화, 변환 실패 = 422.
-plan: ~/.claude/plans/plan-idempotent-sundae.md
+Phase 1B (2026-05-01) — 텍스트만 응답, 이미지 폐기.
+Phase 1B.5 (본 변경) — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이
+ 없는 stateless 변환기 유지 (fastapi 가 NAS persist 담당).
+
+plan: ~/.claude/plans/piped-humming-crystal.md
"""
+import base64
import hashlib
+import io
import logging
import os
import threading
@@ -11,7 +16,7 @@ import time
from pathlib import Path
from fastapi import FastAPI, HTTPException, Response
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
@@ -35,6 +40,12 @@ _warmup_done = False
_warmup_error: str | None = None
_warmup_lock = threading.Lock()
+# 이미지 응답 cap. base64 응답 크기 폭주 방지. 사용자 PDF 풀 측정 (Phase 1D) 시
+# 가장 이미지 많은 문서가 ~30건 수준 → 200 은 안전 마진. 초과 시 truncate flag 응답.
+MAX_IMAGES_PER_DOC = int(os.getenv("MARKER_MAX_IMAGES_PER_DOC", "200"))
+# per-image 최대 raw bytes (base64 전). 그래픽이 많은 풀페이지 스캔 회피.
+MAX_BYTES_PER_IMAGE = int(os.getenv("MARKER_MAX_BYTES_PER_IMAGE", str(10 * 1024 * 1024)))
+
def _ensure_warmup() -> None:
"""첫 /convert 또는 startup hook 시 모델 로드. HF cache volume 활용."""
@@ -69,6 +80,15 @@ class ConvertRequest(BaseModel):
max_pages: int | None = None
+class ConvertImage(BaseModel):
+ """One image extracted by marker.
+
+ Per the original note, the FastAPI side writes it to the NAS and
+ normalizes references to ``docimg:img_NNN``.
+ """
+ slug: str # original marker slug (e.g. '_page_0_Picture_3.jpeg')
+ format: str # 'png' | 'jpeg' | 'webp' | 'gif'
+ width: int | None = None
+ height: int | None = None
+ bytes_b64: str # base64-encoded raw bytes
+
+
class ConvertResponse(BaseModel):
md_content: str
md_content_hash: str
@@ -76,6 +96,8 @@ class ConvertResponse(BaseModel):
engine_version: str
elapsed_ms: int
raw_metrics: dict
+ images: list[ConvertImage] = Field(default_factory=list)
+ images_truncated: bool = False
@app.get("/ready")
@@ -124,9 +146,11 @@ async def convert(req: ConvertRequest):
},
) from exc
- md_text, _meta, _images = text_from_rendered(rendered)
+ md_text, _meta, raw_images = text_from_rendered(rendered)
elapsed_ms = int((time.monotonic() - start) * 1000)
+ images_payload, truncated = _serialize_images(raw_images, str(p))
+
return ConvertResponse(
md_content=md_text,
md_content_hash=hashlib.sha256(md_text.encode("utf-8")).hexdigest(),
@@ -135,6 +159,63 @@ async def convert(req: ConvertRequest):
elapsed_ms=elapsed_ms,
raw_metrics={
"page_count": getattr(rendered, "page_count", None),
- "image_count_extracted": len(_images) if _images else 0,
+ "image_count_extracted": len(raw_images) if raw_images else 0,
+ "image_count_returned": len(images_payload),
},
+ images=images_payload,
+ images_truncated=truncated,
)
+
+
+def _serialize_images(raw_images, src_path: str) -> tuple[list[ConvertImage], bool]:
+    """Turn marker's image mapping (slug -> PIL image) into base64 ``ConvertImage`` items.
+
+    Guards:
+    - more than MAX_IMAGES_PER_DOC entries: only the leading entries are
+      serialized and the returned flag is True
+    - an image that fails to serialize is skipped with a warning; the rest
+      are still processed
+    - an image whose encoded size exceeds MAX_BYTES_PER_IMAGE is skipped
+      with a warning
+
+    Args:
+        raw_images: Mapping of slug to image object; falsy means no images.
+        src_path: Source document path, used only in log messages.
+
+    Returns:
+        ``(images, truncated)``.
+    """
+    if not raw_images:
+        return [], False
+
+    entries = list(raw_images.items())
+    total = len(entries)
+    truncated = total > MAX_IMAGES_PER_DOC
+    if truncated:
+        logger.warning(
+            f"[marker-service] images truncated path={src_path} "
+            f"total={total} cap={MAX_IMAGES_PER_DOC}"
+        )
+        entries = entries[:MAX_IMAGES_PER_DOC]
+
+    known_formats = {"PNG", "JPEG", "WEBP", "GIF"}
+    serialized: list[ConvertImage] = []
+    for slug, image in entries:
+        try:
+            declared = (image.format or "PNG").upper()
+            # Anything outside the known set is re-encoded as PNG (lossless).
+            fmt = declared if declared in known_formats else "PNG"
+            sink = io.BytesIO()
+            image.save(sink, format=fmt)
+            encoded = sink.getvalue()
+            size = len(encoded)
+            if size <= MAX_BYTES_PER_IMAGE:
+                serialized.append(
+                    ConvertImage(
+                        slug=slug,
+                        format=fmt.lower(),
+                        width=image.width,
+                        height=image.height,
+                        bytes_b64=base64.b64encode(encoded).decode("ascii"),
+                    )
+                )
+            else:
+                logger.warning(
+                    f"[marker-service] image too large skipped path={src_path} "
+                    f"slug={slug} bytes={size} cap={MAX_BYTES_PER_IMAGE}"
+                )
+        except Exception as exc:
+            # Best effort: one bad image must not fail the whole conversion.
+            logger.warning(
+                f"[marker-service] image serialize failed path={src_path} "
+                f"slug={slug}: {type(exc).__name__}: {exc}"
+            )
+    return serialized, truncated
diff --git a/tests/test_marker_image_persist.py b/tests/test_marker_image_persist.py
new file mode 100644
index 0000000..a30fc51
--- /dev/null
+++ b/tests/test_marker_image_persist.py
@@ -0,0 +1,169 @@
+"""Phase 1B.5 ImgAuth — marker_worker 의 순수 헬퍼 단위 테스트.
+
+DB / NAS / marker-service 실접속이 필요한 통합 테스트는 별 파일 (배포 후 실행).
+본 파일은 image-bytes mocking 만으로 검증 가능한 부분 (rewrite 로직 + persist 매핑).
+
+plan: ~/.claude/plans/piped-humming-crystal.md
+"""
+
+from __future__ import annotations
+
+import base64
+import os
+import sys
+
+import pytest
+
+# tests/ → 프로젝트 루트 → app/
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
+
+from workers.marker_worker import (
+ _persist_images_to_nas,
+ _rewrite_image_refs,
+)
+
+
+# ─── _rewrite_image_refs ───
+
+
+def test_rewrite_exact_slug_match():
+    """A ref whose href equals a known slug is rewritten to its docimg key."""
+    md = "본문\n\n![](_page_0_Picture_3.jpeg)\n\n뒤"
+    out = _rewrite_image_refs(md, {"_page_0_Picture_3.jpeg": "img_001"})
+    assert "![](docimg:img_001)" in out
+    assert "_page_0_Picture_3.jpeg" not in out
+
+
+def test_rewrite_basename_match_with_subdir_href():
+    """An href with a directory prefix is matched on its basename."""
+    md = "![fig](images/_page_2_Figure_1.png)"
+    out = _rewrite_image_refs(md, {"_page_2_Figure_1.png": "img_007"})
+    assert out == "![fig](docimg:img_007)"
+
+
+def test_rewrite_preserves_external_urls():
+    """External URLs stay untouched; only the internal slug becomes a docimg ref."""
+    md = "외부 ![ext](https://example.com/x.png) 와 내부 ![int](slug.png)"
+    out = _rewrite_image_refs(md, {"slug.png": "img_002"})
+    assert "https://example.com/x.png" in out
+    assert "(docimg:img_002)" in out
+
+
+def test_rewrite_preserves_alt_text():
+    """The alt text survives the href rewrite unchanged."""
+    md = "![그림 1 구조도](slug.jpeg)"
+    out = _rewrite_image_refs(md, {"slug.jpeg": "img_001"})
+    assert out == "![그림 1 구조도](docimg:img_001)"
+
+
+def test_rewrite_no_slug_map_is_noop():
+    """With an empty slug map the markdown is returned unchanged."""
+    md = "![alt](slug.png)"
+    assert _rewrite_image_refs(md, {}) == md
+
+
+def test_rewrite_unknown_slug_kept():
+    """A ref whose slug is not in the map is left as-is."""
+    md = "![alt](unknown.png)"
+    out = _rewrite_image_refs(md, {"other.png": "img_001"})
+    assert out == md
+
+
+def test_rewrite_idempotent_on_already_normalized():
+    """A ref already in docimg:img_NNN form matches no slug, so it is unchanged (re-conversion is idempotent)."""
+    md = "![alt](docimg:img_001)"
+    out = _rewrite_image_refs(md, {"_page_0.jpeg": "img_001"})
+    assert out == md
+
+
+def test_rewrite_multiple_images():
+    """Every known slug in a document with several images is rewritten."""
+    md = "![a](s1.png) text ![b](s2.png) ![c](s3.jpg)"
+    out = _rewrite_image_refs(md, {
+        "s1.png": "img_001",
+        "s2.png": "img_002",
+        "s3.jpg": "img_003",
+    })
+    assert "(docimg:img_001)" in out
+    assert "(docimg:img_002)" in out
+    assert "(docimg:img_003)" in out
+
+
+# ─── _persist_images_to_nas ───
+
+
+def _make_png_bytes() -> bytes:
+    """Return a small hard-coded PNG fixture (intended as a 1x1 transparent image)."""
+    hex_chunks = (
+        "89504e470d0a1a0a",  # signature
+        "0000000d49484452",  # IHDR len + type
+        "00000001000000010806000000",  # 1x1 RGBA
+        "1f15c4890000000d4944415478",
+        "9c626001000000ffff03000006",
+        "00057ce4ec5d0000000049454e44ae426082",
+    )
+    return bytes.fromhex("".join(hex_chunks))
+
+
+def test_persist_sequential_image_keys(tmp_path, monkeypatch):
+    """Three valid images are persisted under sequential keys img_001..img_003."""
+    from pathlib import Path
+
+    # Redirect the NAS root to tmp_path.
+    monkeypatch.setattr(
+        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
+        tmp_path / "extracted_images",
+    )
+
+    encoded = base64.b64encode(_make_png_bytes()).decode("ascii")
+    payload = [
+        {"slug": f"_page_{page}.png", "format": "png", "bytes_b64": encoded}
+        for page in range(3)
+    ]
+    saved = _persist_images_to_nas(document_id=999, images_resp=payload)
+
+    keys = [entry["image_key"] for entry in saved]
+    assert keys == ["img_001", "img_002", "img_003"]
+    for entry in saved:
+        assert entry["mime_type"] == "image/png"
+        assert entry["file_size"] > 0
+        assert entry["source_slug"].startswith("_page_")
+    # Each reported path must exist as a real file under the redirected root.
+    for entry in saved:
+        assert Path(entry["file_path"]).is_file()
+
+
+def test_persist_idempotent_on_rerun(tmp_path, monkeypatch):
+    """Persisting the same doc_id twice yields the same image_key, path and content hash."""
+    monkeypatch.setattr(
+        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
+        tmp_path / "extracted_images",
+    )
+    encoded = base64.b64encode(_make_png_bytes()).decode("ascii")
+    payload = [{"slug": "_page_0.png", "format": "png", "bytes_b64": encoded}]
+
+    first_run = _persist_images_to_nas(document_id=42, images_resp=payload)
+    second_run = _persist_images_to_nas(document_id=42, images_resp=payload)
+
+    first, second = first_run[0], second_run[0]
+    assert first["image_key"] == second["image_key"] == "img_001"
+    assert first["file_path"] == second["file_path"]
+    assert first["content_hash"] == second["content_hash"]
+
+
+def test_persist_skips_invalid_base64(tmp_path, monkeypatch):
+    """A broken base64 payload is skipped while the remaining images are still processed."""
+    monkeypatch.setattr(
+        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
+        tmp_path / "extracted_images",
+    )
+    broken = {"slug": "_page_0.png", "format": "png", "bytes_b64": "@@@invalid@@@"}
+    valid = {
+        "slug": "_page_1.png",
+        "format": "png",
+        "bytes_b64": base64.b64encode(_make_png_bytes()).decode("ascii"),
+    }
+    saved = _persist_images_to_nas(document_id=7, images_resp=[broken, valid])
+
+    # The invalid first entry is skipped and only the second is stored; the
+    # sequence number still advances, so the survivor is img_002.
+    assert len(saved) == 1
+    only = saved[0]
+    assert only["image_key"] == "img_002"
+    assert only["source_slug"] == "_page_1.png"
+
+
+def test_persist_empty_images_returns_empty(tmp_path, monkeypatch):
+    """An empty image list yields an empty result."""
+    nas_root = tmp_path / "extracted_images"
+    monkeypatch.setattr("workers.marker_worker.EXTRACTED_IMAGES_ROOT", nas_root)
+
+    saved = _persist_images_to_nas(document_id=1, images_resp=[])
+    assert saved == []