Files
hyungi_document_server/tests/test_marker_image_persist.py
Hyungi Ahn 68fa86ea52 feat(markdown): persist extracted images with auth routes
Markdown Canonical Phase 1B.5 — marker 가 추출하던 이미지를 NAS 에 영구 저장하고
DB 메타 + 인증 라우트 + 프론트 swap 까지 wiring.

핵심 변경:
- marker-service /convert 응답에 base64 image 리스트 포함 (stateless 유지, NAS write 권한 X)
- marker_worker 가 NAS `/documents/extracted_images/{doc_id}/` 에 persist + UPSERT +
  고아 row DELETE + md_content ref 를 `docimg:img_NNN` stable scheme 으로 정규화
- /api/documents/{id}/images/{key}/raw 인증 라우트 (Cache-Control private + ETag = content_hash)
- frontend MarkdownDoc 가 placeholder card 안의 docimg ref 를 실제 <img> 로 swap

원칙:
- 이미지 binary = NAS, metadata = Postgres (학습 섹션 패턴 동일)
- image_key sequence 기반 결정적 → 재변환 idempotent
- MARKDOWN_IMAGE_PERSIST=false env 로 rollback 가능 (placeholder card 폴백 자연 유지)

기존 28건 marker success 문서는 본 PR 에서 건드리지 않음 — deploy + 신규 업로드 1건 +
sample 5건 검증 후 scripts/marker_reprocess_existing_success.py 로 targeted reprocess.

plan: ~/.claude/plans/piped-humming-crystal.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 14:05:41 +09:00

170 lines
5.8 KiB
Python

"""Phase 1B.5 ImgAuth — marker_worker 의 순수 헬퍼 단위 테스트.
DB / NAS / marker-service 실접속이 필요한 통합 테스트는 별 파일 (배포 후 실행).
본 파일은 image-bytes mocking 만으로 검증 가능한 부분 (rewrite 로직 + persist 매핑).
plan: ~/.claude/plans/piped-humming-crystal.md
"""
from __future__ import annotations
import base64
import os
import sys
import pytest
# tests/ → 프로젝트 루트 → app/
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
from workers.marker_worker import (
_persist_images_to_nas,
_rewrite_image_refs,
)
# ─── _rewrite_image_refs ───
def test_rewrite_exact_slug_match():
    """A ref whose href equals a mapped slug is rewritten to its ``docimg:`` key."""
    slug_to_key = {"_page_0_Picture_3.jpeg": "img_001"}
    source = "본문\n\n![도식 1](_page_0_Picture_3.jpeg)\n\n"

    rewritten = _rewrite_image_refs(source, slug_to_key)

    # The alt text is kept and the original slug must be gone entirely.
    assert "![도식 1](docimg:img_001)" in rewritten
    assert "_page_0_Picture_3.jpeg" not in rewritten
def test_rewrite_basename_match_with_subdir_href():
    """An href with a directory prefix is matched by its basename."""
    slug_to_key = {"_page_2_Figure_1.png": "img_007"}
    source = "![](sub/_page_2_Figure_1.png)"

    rewritten = _rewrite_image_refs(source, slug_to_key)

    assert rewritten == "![](docimg:img_007)"
def test_rewrite_preserves_external_urls():
    """External URLs are left alone while the internal slug is rewritten."""
    slug_to_key = {"slug.png": "img_002"}
    source = "외부 ![logo](https://example.com/x.png) 와 내부 ![](slug.png)"

    rewritten = _rewrite_image_refs(source, slug_to_key)

    # The external URL stays as-is; only the internal slug becomes a docimg ref.
    assert "https://example.com/x.png" in rewritten
    assert "(docimg:img_002)" in rewritten
def test_rewrite_preserves_alt_text():
    """Alt text (non-ASCII, with ``$math$``) survives the rewrite unchanged."""
    slug_to_key = {"slug.jpeg": "img_001"}
    source = "![긴 한국어 alt 설명 with $math$](slug.jpeg)"
    expected = "![긴 한국어 alt 설명 with $math$](docimg:img_001)"

    assert _rewrite_image_refs(source, slug_to_key) == expected
def test_rewrite_no_slug_map_is_noop():
    """With an empty slug map the markdown comes back unchanged."""
    source = "![](slug.png)"

    rewritten = _rewrite_image_refs(source, {})

    assert rewritten == source
def test_rewrite_unknown_slug_kept():
    """A ref whose slug is absent from the map is left untouched."""
    slug_to_key = {"other.png": "img_001"}
    source = "![](unknown_slug.png)"

    assert _rewrite_image_refs(source, slug_to_key) == source
def test_rewrite_idempotent_on_already_normalized():
    """A ref already in ``docimg:img_NNN`` form matches no slug, so it is unchanged.

    This is what keeps a re-conversion idempotent.
    """
    slug_to_key = {"_page_0.jpeg": "img_001"}
    source = "![alt](docimg:img_001)"

    rewritten = _rewrite_image_refs(source, slug_to_key)

    assert rewritten == source
def test_rewrite_multiple_images():
    """Every mapped ref in a document with several images is rewritten."""
    slug_to_key = {
        "s1.png": "img_001",
        "s2.png": "img_002",
        "s3.jpg": "img_003",
    }
    source = "![a](s1.png) text ![b](s2.png) ![c](s3.jpg)"

    rewritten = _rewrite_image_refs(source, slug_to_key)

    for image_key in ("img_001", "img_002", "img_003"):
        assert f"(docimg:{image_key})" in rewritten
# ─── _persist_images_to_nas ───
def _make_png_bytes() -> bytes:
"""1x1 transparent PNG (signature + IHDR + IDAT + IEND)."""
return bytes.fromhex(
"89504e470d0a1a0a" # signature
"0000000d49484452" # IHDR len + type
"00000001000000010806000000" # 1x1 RGBA
"1f15c4890000000d4944415478"
"9c626001000000ffff03000006"
"00057ce4ec5d0000000049454e44ae426082"
)
def test_persist_sequential_image_keys(tmp_path, monkeypatch):
    """Three images get ``img_001``..``img_003`` in order and land on disk.

    Checks the metadata returned for each saved image (key, MIME type, size,
    source slug) and that every reported ``file_path`` is a real file.
    """
    # Imported once up front rather than on every loop iteration.
    from pathlib import Path

    # Redirect the NAS root to tmp_path.
    monkeypatch.setattr(
        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
        tmp_path / "extracted_images",
    )
    encoded = base64.b64encode(_make_png_bytes()).decode("ascii")
    payload = [
        {"slug": f"_page_{page}.png", "format": "png", "bytes_b64": encoded}
        for page in range(3)
    ]

    saved = _persist_images_to_nas(document_id=999, images_resp=payload)

    assert [s["image_key"] for s in saved] == ["img_001", "img_002", "img_003"]
    assert all(s["mime_type"] == "image/png" for s in saved)
    assert all(s["file_size"] > 0 for s in saved)
    assert all(s["source_slug"].startswith("_page_") for s in saved)
    # Confirm the files really exist under the redirected NAS root.
    for s in saved:
        assert Path(s["file_path"]).is_file()
def test_persist_idempotent_on_rerun(tmp_path, monkeypatch):
    """Persisting the same doc_id twice yields the same key, path and hash."""
    monkeypatch.setattr(
        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
        tmp_path / "extracted_images",
    )
    encoded = base64.b64encode(_make_png_bytes()).decode("ascii")
    payload = [{"slug": "_page_0.png", "format": "png", "bytes_b64": encoded}]

    first_run = _persist_images_to_nas(document_id=42, images_resp=payload)
    second_run = _persist_images_to_nas(document_id=42, images_resp=payload)

    first, second = first_run[0], second_run[0]
    assert first["image_key"] == "img_001"
    assert second["image_key"] == "img_001"
    for field in ("file_path", "content_hash"):
        assert first[field] == second[field]
def test_persist_skips_invalid_base64(tmp_path, monkeypatch):
    """A broken base64 entry is skipped; the remaining images are still saved."""
    monkeypatch.setattr(
        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
        tmp_path / "extracted_images",
    )
    valid_b64 = base64.b64encode(_make_png_bytes()).decode("ascii")
    broken_entry = {"slug": "_page_0.png", "format": "png", "bytes_b64": "@@@invalid@@@"}
    valid_entry = {"slug": "_page_1.png", "format": "png", "bytes_b64": valid_b64}

    saved = _persist_images_to_nas(
        document_id=7,
        images_resp=[broken_entry, valid_entry],
    )

    # The invalid first entry is skipped and only the second is stored. The
    # sequence number still advances, so the survivor is img_002.
    assert len(saved) == 1
    only = saved[0]
    assert only["image_key"] == "img_002"
    assert only["source_slug"] == "_page_1.png"
def test_persist_empty_images_returns_empty(tmp_path, monkeypatch):
    """An empty image list produces an empty result."""
    monkeypatch.setattr(
        "workers.marker_worker.EXTRACTED_IMAGES_ROOT",
        tmp_path / "extracted_images",
    )

    saved = _persist_images_to_nas(document_id=1, images_resp=[])

    assert saved == []