68fa86ea52
Markdown Canonical Phase 1B.5 — marker 가 추출하던 이미지를 NAS 에 영구 저장하고
DB 메타 + 인증 라우트 + 프론트 swap 까지 wiring.
핵심 변경:
- marker-service /convert 응답에 base64 image 리스트 포함 (stateless 유지, NAS write 권한 X)
- marker_worker 가 NAS `/documents/extracted_images/{doc_id}/` 에 persist + UPSERT +
고아 row DELETE + md_content ref 를 `docimg:img_NNN` stable scheme 으로 정규화
- /api/documents/{id}/images/{key}/raw 인증 라우트 (Cache-Control private + ETag = content_hash)
- frontend MarkdownDoc 가 placeholder card 안의 docimg ref 를 실제 <img> 로 swap
원칙:
- 이미지 binary = NAS, metadata = Postgres (학습 섹션 패턴 동일)
- image_key sequence 기반 결정적 → 재변환 idempotent
- MARKDOWN_IMAGE_PERSIST=false env 로 rollback 가능 (placeholder card 폴백 자연 유지)
기존 28건 marker success 문서는 본 PR 에서 건드리지 않음 — deploy + 신규 업로드 1건 +
sample 5건 검증 후 scripts/marker_reprocess_existing_success.py 로 targeted reprocess.
plan: ~/.claude/plans/piped-humming-crystal.md
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
170 lines
5.8 KiB
Python
170 lines
5.8 KiB
Python
"""Phase 1B.5 ImgAuth — marker_worker 의 순수 헬퍼 단위 테스트.
|
|
|
|
DB / NAS / marker-service 실접속이 필요한 통합 테스트는 별 파일 (배포 후 실행).
|
|
본 파일은 image-bytes mocking 만으로 검증 가능한 부분 (rewrite 로직 + persist 매핑).
|
|
|
|
plan: ~/.claude/plans/piped-humming-crystal.md
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import os
|
|
import sys
|
|
|
|
import pytest
|
|
|
|
# tests/ → 프로젝트 루트 → app/
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
|
|
|
|
from workers.marker_worker import (
|
|
_persist_images_to_nas,
|
|
_rewrite_image_refs,
|
|
)
|
|
|
|
|
|
# ─── _rewrite_image_refs ───
|
|
|
|
|
|
def test_rewrite_exact_slug_match():
    """A ref whose href equals a mapped slug is rewritten to ``docimg:<image_key>``."""
    # NOTE(review): the Markdown image literals below were reconstructed from the
    # test name and the slug map — confirm against the repository copy.
    md = "본문\n\n![](_page_0_Picture_3.jpeg)\n\n뒤"
    out = _rewrite_image_refs(md, {"_page_0_Picture_3.jpeg": "img_001"})
    assert "![](docimg:img_001)" in out
    # The original marker slug must not survive the rewrite.
    assert "_page_0_Picture_3.jpeg" not in out
|
|
|
|
|
|
def test_rewrite_basename_match_with_subdir_href():
    """An href with a directory prefix is matched against the slug map by basename."""
    # NOTE(review): the Markdown image literals below were reconstructed from the
    # test name and the slug map — confirm against the repository copy.
    md = "![](images/_page_2_Figure_1.png)"
    out = _rewrite_image_refs(md, {"_page_2_Figure_1.png": "img_007"})
    assert out == "![](docimg:img_007)"
|
|
|
|
|
|
def test_rewrite_preserves_external_urls():
    """Absolute URLs are left alone while a mapped internal slug is rewritten."""
    # NOTE(review): the Markdown image literals below were reconstructed from the
    # surviving assertions and the slug map — confirm against the repository copy.
    md = "외부 ![](https://example.com/x.png) 와 내부 ![](slug.png)"
    out = _rewrite_image_refs(md, {"slug.png": "img_002"})
    # External URLs stay untouched; only the internal slug becomes a docimg ref.
    assert "https://example.com/x.png" in out
    assert "(docimg:img_002)" in out
|
|
|
|
|
|
def test_rewrite_preserves_alt_text():
    """Only the href is replaced; the alt text inside ``![...]`` is kept verbatim."""
    # NOTE(review): the Markdown image literals (including the alt text) were
    # reconstructed from the test name — confirm against the repository copy.
    md = "![Figure 1](slug.jpeg)"
    out = _rewrite_image_refs(md, {"slug.jpeg": "img_001"})
    assert out == "![Figure 1](docimg:img_001)"
|
|
|
|
|
|
def test_rewrite_no_slug_map_is_noop():
    """With an empty slug map the markdown is returned unchanged."""
    # NOTE(review): the Markdown image literal was reconstructed from the test
    # name — confirm against the repository copy.
    md = "![](slug.png)"
    assert _rewrite_image_refs(md, {}) == md
|
|
|
|
|
|
def test_rewrite_unknown_slug_kept():
    """A ref whose slug is absent from the map is left exactly as written."""
    # NOTE(review): the Markdown image literal was reconstructed from the test
    # name — confirm against the repository copy.
    md = "![](unknown.png)"
    out = _rewrite_image_refs(md, {"other.png": "img_001"})
    assert out == md
|
|
|
|
|
|
def test_rewrite_idempotent_on_already_normalized():
    """A ref already in ``docimg:img_NNN`` form matches no slug, so it stays unchanged.

    This is what makes re-conversion of an already-normalized document idempotent.
    """
    # NOTE(review): the Markdown image literal was reconstructed from the
    # docstring — confirm against the repository copy.
    md = "![](docimg:img_001)"
    out = _rewrite_image_refs(md, {"_page_0.jpeg": "img_001"})
    assert out == md
|
|
|
|
|
|
def test_rewrite_multiple_images():
    """Every mapped ref in a document is rewritten, not just the first one."""
    # NOTE(review): the Markdown image literals below were reconstructed from the
    # slug map and the surviving assertions — confirm against the repository copy.
    md = "![](s1.png) text ![](s2.png) ![](s3.jpg)"
    slug_to_key = {
        "s1.png": "img_001",
        "s2.png": "img_002",
        "s3.jpg": "img_003",
    }
    out = _rewrite_image_refs(md, slug_to_key)
    assert "(docimg:img_001)" in out
    assert "(docimg:img_002)" in out
    assert "(docimg:img_003)" in out
|
|
|
|
|
|
# ─── _persist_images_to_nas ───
|
|
|
|
|
|
def _make_png_bytes() -> bytes:
    """Return a well-formed 1x1 fully transparent RGBA PNG.

    The file is assembled as signature + IHDR + IDAT + IEND. Chunk lengths and
    CRCs are computed rather than hard-coded: the previous literal hex declared
    an IDAT length of 13 while carrying 16 payload bytes, so it was not a
    decodable PNG.

    Returns:
        The encoded PNG file contents.
    """
    # Function-scope imports keep this helper self-contained.
    import struct
    import zlib

    def _chunk(tag: bytes, payload: bytes) -> bytes:
        # PNG chunk layout: 4-byte big-endian length, 4-byte type, payload,
        # then a CRC-32 computed over type + payload.
        checksum = zlib.crc32(tag + payload) & 0xFFFFFFFF
        return struct.pack(">I", len(payload)) + tag + payload + struct.pack(">I", checksum)

    # width=1, height=1, bit depth 8, colour type 6 (RGBA), default
    # compression / filter / interlace methods.
    header = struct.pack(">IIBBBBB", 1, 1, 8, 6, 0, 0, 0)
    # One scanline: filter-type byte 0 followed by a single transparent RGBA pixel.
    scanline = b"\x00" + b"\x00\x00\x00\x00"

    return (
        b"\x89PNG\r\n\x1a\n"
        + _chunk(b"IHDR", header)
        + _chunk(b"IDAT", zlib.compress(scanline))
        + _chunk(b"IEND", b"")
    )
|
|
|
|
|
|
def test_persist_sequential_image_keys(tmp_path, monkeypatch):
    """Three valid images are persisted with sequential keys ``img_001``..``img_003``.

    Also checks the reported mime type, a non-zero file size, the recorded source
    slug, and that each reported ``file_path`` exists on disk.
    """
    # Imported once at function scope instead of on every loop iteration.
    from pathlib import Path

    # Redirect the NAS root to tmp_path.
    monkeypatch.setattr(
        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
        tmp_path / "extracted_images",
    )

    # Every entry carries the same image bytes, so encode them once.
    encoded = base64.b64encode(_make_png_bytes()).decode("ascii")
    payload = [
        {"slug": f"_page_{page}.png", "format": "png", "bytes_b64": encoded}
        for page in range(3)
    ]
    saved = _persist_images_to_nas(document_id=999, images_resp=payload)

    assert [s["image_key"] for s in saved] == ["img_001", "img_002", "img_003"]
    assert all(s["mime_type"] == "image/png" for s in saved)
    assert all(s["file_size"] > 0 for s in saved)
    assert all(s["source_slug"].startswith("_page_") for s in saved)
    # Confirm the files really exist under the redirected root.
    for s in saved:
        assert Path(s["file_path"]).is_file()
|
|
|
|
|
|
def test_persist_idempotent_on_rerun(tmp_path, monkeypatch):
    """Persisting the same doc_id twice reports the same key, path and content hash."""
    images_root = tmp_path / "extracted_images"
    monkeypatch.setattr("workers.marker_worker.EXTRACTED_IMAGES_ROOT", images_root)

    encoded = base64.b64encode(_make_png_bytes()).decode("ascii")
    images = [{"slug": "_page_0.png", "format": "png", "bytes_b64": encoded}]

    first_run = _persist_images_to_nas(document_id=42, images_resp=images)
    second_run = _persist_images_to_nas(document_id=42, images_resp=images)

    before, after = first_run[0], second_run[0]
    assert before["image_key"] == after["image_key"]
    assert after["image_key"] == "img_001"
    assert before["file_path"] == after["file_path"]
    assert before["content_hash"] == after["content_hash"]
|
|
|
|
|
|
def test_persist_skips_invalid_base64(tmp_path, monkeypatch):
    """A broken base64 entry is skipped while the remaining images are still processed."""
    images_root = tmp_path / "extracted_images"
    monkeypatch.setattr("workers.marker_worker.EXTRACTED_IMAGES_ROOT", images_root)

    broken = {"slug": "_page_0.png", "format": "png", "bytes_b64": "@@@invalid@@@"}
    valid = {
        "slug": "_page_1.png",
        "format": "png",
        "bytes_b64": base64.b64encode(_make_png_bytes()).decode("ascii"),
    }
    result = _persist_images_to_nas(document_id=7, images_resp=[broken, valid])

    # The invalid first entry is skipped and only the second is stored; the
    # sequence counter still advances, so the survivor is img_002.
    assert len(result) == 1
    (kept,) = result
    assert kept["image_key"] == "img_002"
    assert kept["source_slug"] == "_page_1.png"
|
|
|
|
|
|
def test_persist_empty_images_returns_empty(tmp_path, monkeypatch):
    """An empty image list produces an empty result."""
    images_root = tmp_path / "extracted_images"
    monkeypatch.setattr("workers.marker_worker.EXTRACTED_IMAGES_ROOT", images_root)

    result = _persist_images_to_nas(document_id=1, images_resp=[])
    assert result == []
|