Files
hyungi_document_server/tests/test_s1_dedup_b2.py
T
hyungi daf6a0ade9 feat(documents): S1 dedup·office-md·storage scaffold (B/C/D/E)
plan ds-s1-backend-1 잔여 구현 (A·C-1 은 16b0fe1):
- B 중복검사: services/dedup.py (OFF-list law_monitor 공용) + 업로드 채움(B-1)
  + GET /documents/duplicates(B-2) + post-upload near-dup 비동기(B-3)
  + backfill_dedup.py(B-4) + 야간 dedup_reconcile 잡(03:30 KST 멱등 재계산)
- C MD-first: marker_worker office/hwp 분기 _process_office(C-2) + md_status
  상태머신 postcondition success|failed(C-5) + backfill_nonpdf_markdown.py(C-4)
  + requirements markitdown
- D 스토리지: services/storage ABC+Range 계약 / LocalBackend / NasApiBackend 503
  (D-1) + /file resolver 경유, 로컬 동작 불변(D-2)
- E 운영: pre-change pg_dump + rollback_287.sql + apply runbook(E-3) + 테스트(E-1)

비파괴 불변식 유지(기존 응답 shape 무변경, md_status success→completed read-time 매핑).
어드버서리얼 리뷰 확정 1건(soft-delete canonical 승격 시 stale duplicate_of) → B-1
승격 정규화 + 야간 재계산으로 정합.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 03:05:30 +00:00

97 lines
3.9 KiB
Python

"""S1-ADD (plan ds-s1-backend-1) B-2 /duplicates shape + D-2 Range 파서 + dedup 상수 단위 검증.
순수 단위(DB 불요). 실행 환경 = app/ 의존성 설치 컨텍스트(devsbx/GPU) — 기존
test_s1_dedup_shape.py 와 동일 부트스트랩. DB 를 타는 검증(find_canonical/near_dup/엔드포인트)은
GPU read-only/통합 매트릭스(E-1)에서.
"""
from __future__ import annotations
import json
import logging
import os
import sys
from pathlib import Path
import pytest
# Test-only guard: when logs/ is owned by the production daemon, building a
# FileHandler at import time raises PermissionError. Swap in a tolerant factory.
_orig_file_handler = logging.FileHandler


def _safe_file_handler(filename, *args, **kwargs):  # type: ignore[no-untyped-def]
    """Create a file handler, degrading to a no-op handler on PermissionError.

    All arguments are forwarded unchanged to the original
    ``logging.FileHandler``. Only ``PermissionError`` is intercepted; any
    other exception propagates to the caller.

    Returns:
        The real ``FileHandler`` on success, otherwise a ``logging.NullHandler``.
    """
    try:
        handler = _orig_file_handler(filename, *args, **kwargs)
    except PermissionError:
        handler = logging.NullHandler()
    return handler


# Install the factory process-wide so modules imported afterwards pick it up.
logging.FileHandler = _safe_file_handler  # type: ignore[assignment]
# Supply a placeholder DATABASE_URL only when the environment has none
# (setdefault never overrides an existing value). These tests do not touch a
# database; presumably the app needs the variable at import time - TODO confirm.
os.environ.setdefault("DATABASE_URL", "postgresql+asyncpg://test:test@localhost:5432/test")
# Put the sibling ../app directory first on sys.path; presumably that is where
# the `api` and `services` packages imported below live - verify against the
# repository layout. Must run before those imports.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))
from api.documents import ( # noqa: E402
DuplicateGroup,
DuplicatesResponse,
_parse_byte_range,
)
from services.dedup import DEDUP_OFF_CHANNELS # noqa: E402
# Directory of contract fixtures under the current user's home (ds-app checkout).
# The location is machine-specific, so it may not exist on every host.
_FIXDIR = Path(os.path.expanduser("~/Documents/code/ds-app/contract/fixtures"))
# ── 1. /duplicates 응답 shape = contract fixture ───────────────────────────────
def test_duplicates_response_shape_matches_total_formula():
    """Build a DuplicatesResponse by hand and check its totals.

    Endpoint definition: total_duplicate_docs is the sum over groups of
    (member count - 1). The contract fixture is expected to follow the same rule.
    """
    pair = DuplicateGroup(canonical_id=4912, members=[4912, 4977], reason="content_hash")
    triple = DuplicateGroup(
        canonical_id=5120, members=[5120, 5121, 5260], reason="content_hash"
    )
    groups = [pair, triple]

    # Accumulate (member count - 1) per group, as the endpoint defines it.
    extra_members = 0
    for group in groups:
        extra_members += len(group.members) - 1

    resp = DuplicatesResponse(
        groups=groups,
        total_groups=len(groups),
        total_duplicate_docs=extra_members,
    )

    assert resp.total_groups == 2
    assert resp.total_duplicate_docs == 3  # (2-1) + (3-1)
@pytest.mark.skipif(not _FIXDIR.exists(), reason="ds-app contract fixtures 미존재")
def test_duplicates_contract_fixture_decodes():
    """Decode the shared ``documents_duplicates.json`` fixture into the response model.

    Skipped when the fixture directory is absent. Verifies that the model
    round-trips the fixture totals, that the fixture is self-consistent with
    the sum-of-(member count - 1) definition, and that every group keeps its
    canonical id in order.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the host locale's default encoding.
    raw = (_FIXDIR / "documents_duplicates.json").read_text(encoding="utf-8")
    payload = json.loads(raw)
    m = DuplicatesResponse.model_validate(payload)

    assert m.total_groups == payload["total_groups"]
    assert m.total_duplicate_docs == payload["total_duplicate_docs"]
    # The sum-of-(member count - 1) definition must match the fixture total
    # (contract self-consistency).
    assert sum(len(g.members) - 1 for g in m.groups) == payload["total_duplicate_docs"]
    # Compare every group rather than indexing [0], so a fixture with no groups
    # yields a clean comparison instead of an IndexError.
    assert [g.canonical_id for g in m.groups] == [
        g["canonical_id"] for g in payload["groups"]
    ]
# ── 2. D-2 Range 파서 (원격 백엔드 pass-through; local 은 FileResponse 자동) ──────
@pytest.mark.parametrize(
    "header,size,expected",
    [
        # No usable header.
        (None, 1000, (None, None)),
        ("", 1000, (None, None)),
        # Well-formed ranges.
        ("bytes=0-99", 1000, (0, 99)),
        ("bytes=100-", 1000, (100, 999)),  # open-ended: through the last byte
        ("bytes=-200", 1000, (800, 999)),  # suffix form: the last 200 bytes
        ("bytes=0-99999", 1000, (0, 999)),  # end is clamped
        # Rejected inputs.
        ("bytes=2000-3000", 1000, (None, None)),  # start >= size -> invalid (whole file)
        ("bytes=abc-def", 1000, (None, None)),  # parse failure
        ("bytes=50-10", 1000, (None, None)),  # start > end
        ("bytes=0-99", 0, (None, None)),  # empty file
    ],
)
def test_parse_byte_range(header, size, expected):
    """Check the (start, end) pair ``_parse_byte_range`` returns for each header/size case."""
    parsed = _parse_byte_range(header, size)
    assert parsed == expected
# ── 3. dedup OFF-whitelist 단일 source ─────────────────────────────────────────
def test_dedup_off_channels_is_law_monitor_only():
    """Pin the dedup OFF-list to exactly the ``law_monitor`` channel."""
    # P0-2 decision: the single OFF-list is law_monitor (to preserve amended law
    # revisions). It should only be extended by a deliberate decision.
    only_law_monitor = ("law_monitor",)
    assert DEDUP_OFF_CHANNELS == only_law_monitor