daf6a0ade9
plan ds-s1-backend-1 잔여 구현 (A·C-1 은 16b0fe1):
- B 중복검사: services/dedup.py (OFF-list law_monitor 공용) + 업로드 채움(B-1)
+ GET /documents/duplicates(B-2) + post-upload near-dup 비동기(B-3)
+ backfill_dedup.py(B-4) + 야간 dedup_reconcile 잡(03:30 KST 멱등 재계산)
- C MD-first: marker_worker office/hwp 분기 _process_office(C-2) + md_status
상태머신 postcondition success|failed(C-5) + backfill_nonpdf_markdown.py(C-4)
+ requirements markitdown
- D 스토리지: services/storage ABC+Range 계약 / LocalBackend / NasApiBackend 503
(D-1) + /file resolver 경유, 로컬 동작 불변(D-2)
- E 운영: pre-change pg_dump + rollback_287.sql + apply runbook(E-3) + 테스트(E-1)
비파괴 불변식 유지(기존 응답 shape 무변경, md_status success→completed read-time 매핑).
어드버서리얼 리뷰 확정 1건(soft-delete canonical 승격 시 stale duplicate_of) → B-1
승격 정규화 + 야간 재계산으로 정합.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
97 lines
3.9 KiB
Python
97 lines
3.9 KiB
Python
"""S1-ADD (plan ds-s1-backend-1) B-2 /duplicates shape + D-2 Range 파서 + dedup 상수 단위 검증.
|
|
|
|
순수 단위(DB 불요). 실행 환경 = app/ 의존성 설치 컨텍스트(devsbx/GPU) — 기존
test_s1_dedup_shape.py 와 동일 부트스트랩. DB 를 타는 검증(find_canonical/near_dup/엔드포인트)은
GPU read-only/통합 매트릭스(E-1)에서.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Test-only guard: when logs/ is owned by the production daemon, creating a
# FileHandler at import time raises PermissionError. Keep a reference to the
# real class so the wrapper below can delegate to it.
_orig_file_handler = logging.FileHandler


def _safe_file_handler(filename, *args, **kwargs):  # type: ignore[no-untyped-def]
    """Build a real ``FileHandler``, or a ``NullHandler`` if access is denied.

    Every argument is forwarded unchanged to the original
    ``logging.FileHandler``. Only ``PermissionError`` is intercepted; any
    other exception propagates to the caller.
    """
    try:
        handler = _orig_file_handler(filename, *args, **kwargs)
    except PermissionError:
        handler = logging.NullHandler()
    return handler


# NOTE(review): after this assignment ``logging.FileHandler`` is a plain
# function, so code that subclasses it or uses it in ``isinstance`` checks
# would stop working — confirm nothing imported later relies on that.
logging.FileHandler = _safe_file_handler  # type: ignore[assignment]
|
|
|
|
# Supply a dummy DSN only when the environment has none; an existing value
# is left untouched. Presumably the app modules imported below read it at
# import time — confirm against the app configuration.
os.environ.setdefault(
    "DATABASE_URL",
    "postgresql+asyncpg://test:test@localhost:5432/test",
)
# Put the sibling ``app`` directory (relative to this test file) first on
# the import path so the ``api`` / ``services`` imports below resolve.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), "..", "app"),
)
|
|
|
|
from api.documents import ( # noqa: E402
|
|
DuplicateGroup,
|
|
DuplicatesResponse,
|
|
_parse_byte_range,
|
|
)
|
|
from services.dedup import DEDUP_OFF_CHANNELS # noqa: E402
|
|
|
|
# Directory of the ds-app contract fixtures, resolved under the current
# user's home directory. The fixture-based test in this module is skipped
# when this directory does not exist.
_FIXDIR = Path(os.path.expanduser("~/Documents/code/ds-app/contract/fixtures"))
|
|
|
|
|
|
# ── 1. /duplicates 응답 shape = contract fixture ───────────────────────────────
|
|
|
|
def test_duplicates_response_shape_matches_total_formula():
    """Build a ``DuplicatesResponse`` by hand and check its two totals.

    Endpoint definition under test: ``total_duplicate_docs`` is the sum over
    groups of (member count - 1), which must agree with the contract fixture.
    """
    # canonical id -> member ids (the canonical document is itself a member).
    members_by_canonical = {
        4912: [4912, 4977],
        5120: [5120, 5121, 5260],
    }
    dup_groups = [
        DuplicateGroup(canonical_id=canonical, members=ids, reason="content_hash")
        for canonical, ids in members_by_canonical.items()
    ]

    # Each group contributes every member except one to the duplicate count.
    extra_docs = 0
    for grp in dup_groups:
        extra_docs += len(grp.members) - 1

    response = DuplicatesResponse(
        groups=dup_groups,
        total_groups=len(dup_groups),
        total_duplicate_docs=extra_docs,
    )

    assert response.total_groups == 2
    assert response.total_duplicate_docs == 3  # (2-1)+(3-1)
|
|
|
|
|
|
@pytest.mark.skipif(not _FIXDIR.exists(), reason="ds-app contract fixtures 미존재")
def test_duplicates_contract_fixture_decodes():
    """Validate the ``documents_duplicates.json`` contract fixture.

    The fixture must decode into ``DuplicatesResponse`` with its totals
    preserved, and must be self-consistent: the sum of (member count - 1)
    over all groups equals ``total_duplicate_docs``.

    Skipped when the fixture directory is absent on this machine.
    """
    fixture_path = _FIXDIR / "documents_duplicates.json"
    # Decode explicitly as UTF-8: JSON interchange is UTF-8, while the
    # default for ``read_text()`` depends on the platform locale and could
    # fail or mis-decode non-ASCII fixture content.
    payload = json.loads(fixture_path.read_text(encoding="utf-8"))
    m = DuplicatesResponse.model_validate(payload)

    assert m.total_groups == payload["total_groups"]
    assert m.total_duplicate_docs == payload["total_duplicate_docs"]
    # The Σ(member count - 1) definition must match the fixture total
    # (contract self-consistency).
    assert sum(len(g.members) - 1 for g in m.groups) == payload["total_duplicate_docs"]
    assert m.groups[0].canonical_id == payload["groups"][0]["canonical_id"]
|
|
|
|
|
|
# ── 2. D-2 Range 파서 (원격 백엔드 pass-through; local 은 FileResponse 자동) ──────
|
|
|
|
@pytest.mark.parametrize(
    ("header", "size", "expected"),
    [
        (None, 1000, (None, None)),
        ("", 1000, (None, None)),
        ("bytes=0-99", 1000, (0, 99)),
        ("bytes=100-", 1000, (100, 999)),  # open-ended: through the last byte
        ("bytes=-200", 1000, (800, 999)),  # suffix form: the last 200 bytes
        ("bytes=0-99999", 1000, (0, 999)),  # end clamped to size - 1
        ("bytes=2000-3000", 1000, (None, None)),  # start >= size -> invalid (whole file)
        ("bytes=abc-def", 1000, (None, None)),  # unparseable
        ("bytes=50-10", 1000, (None, None)),  # start > end
        ("bytes=0-99", 0, (None, None)),  # empty file
    ],
)
def test_parse_byte_range(header, size, expected):
    """Check ``_parse_byte_range`` against a table of Range header inputs.

    Each case supplies a Range header value (or ``None``), the total file
    size, and the expected ``(start, end)`` pair. ``(None, None)`` is the
    expected result whenever no usable range is given.
    """
    parsed = _parse_byte_range(header, size)
    assert parsed == expected
|
|
|
|
|
|
# ── 3. dedup OFF-whitelist 단일 source ─────────────────────────────────────────
|
|
|
|
def test_dedup_off_channels_is_law_monitor_only():
    """Pin the dedup OFF-list to exactly one channel, ``law_monitor``.

    P0-2 decision: the single OFF-list entry is law_monitor (to preserve
    amended versions of statutes). Extending it must be a deliberate choice,
    so any change to the constant should fail here.
    """
    expected_channels = ("law_monitor",)
    assert DEDUP_OFF_CHANNELS == expected_channels
|