"""S1-ADD (plan ds-s1-backend-1): unit checks for B-2, D-2 and the dedup constant.

Covers the B-2 ``/duplicates`` response shape, the D-2 Range header parser,
and the dedup OFF-channel constant.

Pure unit tests (no DB required). Execution environment = a context with the
``app/`` dependencies installed (devsbx/GPU) — same bootstrap as the existing
``test_s1_dedup_shape.py``. Checks that hit the DB
(find_canonical / near_dup / endpoints) run in the GPU read-only/integration
matrix (E-1).
"""

from __future__ import annotations

import json
import logging
import os
import sys
from pathlib import Path

import pytest

# Guard against an import-time FileHandler PermissionError when logs/ is owned
# by the production daemon (tests only).
_orig_file_handler = logging.FileHandler


def _safe_file_handler(filename, *args, **kwargs):  # type: ignore[no-untyped-def]
    """Build a real ``FileHandler``, or a ``NullHandler`` if the file is not writable.

    All arguments are forwarded unchanged to the original ``logging.FileHandler``.
    Only ``PermissionError`` is swallowed; any other error propagates.
    """
    try:
        return _orig_file_handler(filename, *args, **kwargs)
    except PermissionError:
        return logging.NullHandler()


# NOTE(review): this swaps a class for a plain function, so code that subclasses
# or isinstance-checks ``logging.FileHandler`` after this point sees a function.
# Confirm nothing imported below relies on that.
logging.FileHandler = _safe_file_handler  # type: ignore[assignment]

# The app modules are imported below; provide a placeholder DATABASE_URL unless
# the environment already defines one.
os.environ.setdefault("DATABASE_URL", "postgresql+asyncpg://test:test@localhost:5432/test")
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "app"))

from api.documents import (  # noqa: E402
    DuplicateGroup,
    DuplicatesResponse,
    _parse_byte_range,
)
from services.dedup import DEDUP_OFF_CHANNELS  # noqa: E402

_FIXDIR = Path(os.path.expanduser("~/Documents/code/ds-app/contract/fixtures"))
# The one fixture file this module actually reads; the skip guard checks the
# file itself so a partially populated fixtures directory skips, not errors.
_DUPLICATES_FIXTURE = _FIXDIR / "documents_duplicates.json"


# ── 1. /duplicates response shape = contract fixture ──────────────────────────
def test_duplicates_response_shape_matches_total_formula():
    """Check total_duplicate_docs against the Σ(member count - 1) formula."""
    # Endpoint definition: total_duplicate_docs = Σ(member count - 1).
    # Must be identical to the fixture.
    groups = [
        DuplicateGroup(canonical_id=4912, members=[4912, 4977], reason="content_hash"),
        DuplicateGroup(canonical_id=5120, members=[5120, 5121, 5260], reason="content_hash"),
    ]
    total_dup = sum(len(g.members) - 1 for g in groups)
    resp = DuplicatesResponse(
        groups=groups, total_groups=len(groups), total_duplicate_docs=total_dup
    )
    assert resp.total_groups == 2
    assert resp.total_duplicate_docs == 3  # (2-1)+(3-1)


@pytest.mark.skipif(
    not _DUPLICATES_FIXTURE.is_file(), reason="ds-app contract fixtures 미존재"
)
def test_duplicates_contract_fixture_decodes():
    """Validate the contract fixture against ``DuplicatesResponse`` and its totals."""
    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding.
    payload = json.loads(_DUPLICATES_FIXTURE.read_text(encoding="utf-8"))
    m = DuplicatesResponse.model_validate(payload)
    assert m.total_groups == payload["total_groups"]
    assert m.total_duplicate_docs == payload["total_duplicate_docs"]
    # The Σ(member count - 1) definition agrees with the fixture total
    # (contract self-consistency).
    assert sum(len(g.members) - 1 for g in m.groups) == payload["total_duplicate_docs"]
    assert m.groups[0].canonical_id == payload["groups"][0]["canonical_id"]


# ── 2. D-2 Range parser (pass-through for remote backends; local is handled
#       automatically by FileResponse) ────────────────────────────────────────
@pytest.mark.parametrize(
    "header,size,expected",
    [
        (None, 1000, (None, None)),
        ("", 1000, (None, None)),
        ("bytes=0-99", 1000, (0, 99)),
        ("bytes=100-", 1000, (100, 999)),  # open-ended: through the last byte
        ("bytes=-200", 1000, (800, 999)),  # suffix: last 200 bytes
        ("bytes=0-99999", 1000, (0, 999)),  # end clamp
        ("bytes=2000-3000", 1000, (None, None)),  # start >= size → invalid (full body)
        ("bytes=abc-def", 1000, (None, None)),  # parse failure
        ("bytes=50-10", 1000, (None, None)),  # start > end
        ("bytes=0-99", 0, (None, None)),  # empty file
    ],
)
def test_parse_byte_range(header, size, expected):
    """Check ``_parse_byte_range(header, size)`` against each expected pair."""
    assert _parse_byte_range(header, size) == expected


# ── 3. dedup OFF-whitelist: single source ─────────────────────────────────────
def test_dedup_off_channels_is_law_monitor_only():
    """Pin the dedup OFF-channel list to exactly ``("law_monitor",)``."""
    # P0-2 decision: the single OFF-list is law_monitor (preserve amended
    # statute revisions). Extend it only by deliberate decision.
    assert DEDUP_OFF_CHANNELS == ("law_monitor",)