"""PR-G2-3 — presegment LLM 경계 폴백 단위 테스트. scaffold-first 안전성 박제: (a) parse_json_response + SegmentationOutput 가 대표 fixture(ToC-less 120p → 3 segments) 검증 (b) 검증 게이트(_is_clear_bundle)가 정상 응답 수락 / 비정상(중첩·gap·tiny child·N>MAX) 거부 (c) flag OFF(기본) → LLM 절대 호출 안 함(call_deep count==0), flag ON → 호출됨(positive control) DB·PyMuPDF 불요(unit) — AsyncSession 은 최소 fake, fitz 는 sys.modules 주입 fake. 라이브 LLM 호출 없음(call_deep 는 fixture 반환 monkeypatch). worker-process 레벨 E2E(실 PDF 번들 분할, 보류 백오프 DB 기록)는 GPU 라이브 게이트에서 별도 실측. [[feedback_external_api_fixture_first]] / [[feedback_scaffold_first_for_external_cost_pr]] """ from __future__ import annotations import json import sys import types from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).parent.parent / "app")) from ai.client import parse_json_response # noqa: E402 import workers.presegment_worker as pw # noqa: E402 from workers.presegment_worker import ( # noqa: E402 SegmentationOutput, _is_clear_bundle, _segments_from_output, ) # ─── 대표 fixture: ToC-less 120p 번들 → 3 segments (1-based inclusive, 전범위·무중첩) ─── GOOD_LLM_JSON = json.dumps( { "is_bundle": True, "segments": [ {"start_page": 1, "end_page": 40, "title": "문서 A"}, {"start_page": 41, "end_page": 85, "title": "문서 B"}, {"start_page": 86, "end_page": 120, "title": "문서 C"}, ], "confidence": 0.82, }, ensure_ascii=False, ) PAGE_COUNT = 120 # ─── (a) parse_json_response + SegmentationOutput 검증 ────────────────────── def test_parse_and_validate_good_fixture(): parsed = parse_json_response(GOOD_LLM_JSON) assert parsed is not None out = SegmentationOutput.model_validate(parsed) assert out.is_bundle is True assert len(out.segments) == 3 assert out.segments[0].start_page == 1 assert out.segments[-1].end_page == PAGE_COUNT assert out.confidence == pytest.approx(0.82) def test_parse_tolerates_think_and_fence(): """house parse_json_response 가 + ```json fence 를 벗겨낸다.""" wrapped = f"분석중...\n```json\n{GOOD_LLM_JSON}\n```" parsed = parse_json_response(wrapped) out = SegmentationOutput.model_validate(parsed) assert out.is_bundle is True and len(out.segments) == 3 # ─── (b) 검증 게이트 accept / reject ──────────────────────────────────────── def _segments(*spans): return [{"start_page": s, "end_page": e, "title": ""} for (s, e) in spans] def test_gate_accepts_good(): out = SegmentationOutput.model_validate(parse_json_response(GOOD_LLM_JSON)) segs = _segments_from_output(out) clear, reason = _is_clear_bundle(segs, PAGE_COUNT) assert clear is True, reason assert reason == "" def test_gate_rejects_overlap(): # 41 이어야 할 두번째 start 가 40 으로 중첩 clear, reason = _is_clear_bundle(_segments((1, 40), (40, 85), (86, 120)), PAGE_COUNT) assert clear is False assert "non_contiguous" in reason def test_gate_rejects_gap(): # 40 다음이 42 로 시작 → 41 빈틈 (non_contiguous 로 검출) clear, reason = _is_clear_bundle(_segments((1, 40), (42, 85), (86, 120)), PAGE_COUNT) assert clear is False assert "non_contiguous" in reason def test_gate_rejects_tiny_child(): # 두번째 자식 41..43 = 3p < MIN_CHILD_PAGES(5) clear, reason = _is_clear_bundle(_segments((1, 40), (41, 43), (44, 120)), PAGE_COUNT) assert clear is False assert "child_too_small" in reason def test_gate_rejects_coverage_not_full(): # 마지막이 page_count 에 못 미침 clear, reason = _is_clear_bundle(_segments((1, 40), (41, 85), (86, 110)), PAGE_COUNT) assert clear is False assert "last_end_not_page_count" in reason def test_gate_rejects_too_many_children(): # N > MAX_CHILDREN — 각 자식 MIN_CHILD_PAGES 만족시키되 개수만 초과 n = pw.MAX_CHILDREN + 1 pc = n * pw.MIN_CHILD_PAGES spans = [ (i * pw.MIN_CHILD_PAGES + 1, (i + 1) * pw.MIN_CHILD_PAGES) for i in range(n) ] clear, reason = _is_clear_bundle(_segments(*spans), pc) assert clear is False assert "too_many_children" in reason def test_gate_rejects_single_segment(): clear, reason = _is_clear_bundle(_segments((1, 120)), PAGE_COUNT) assert clear is False assert "too_few_level1_entries" in reason # ─── 공통 fake (DB / PyMuPDF) ────────────────────────────────────────────── class _FakeDoc: """presegment 가 읽는 Document 필드만 가진 최소 stand-in.""" def __init__(self, doc_id=1): self.id = doc_id self.file_path = "PKM/bundle.pdf" self.file_hash = "deadbeef" self.file_format = "pdf" self.file_size = 123 self.file_type = "document" self.import_source = "upload" self.original_filename = "bundle.pdf" self.source_channel = None self.category = None self.data_origin = None self.doc_purpose = None self.material_type = None self.jurisdiction = None self.title = "번들" self.presegment_role = None self.bundle_page_start = None self.bundle_page_end = None self.extracted_at = None self.extracted_text = None class _ScalarResult: def __init__(self, rows): self._rows = rows def scalars(self): return self def all(self): return list(self._rows) class _FakeSession: """_create_children / process 가 쓰는 AsyncSession 표면만 구현. execute() = 기존 자식 lineage 조회 → 빈 결과(첫 분할). add/flush 로 child.id 부여. get() = document_id → 미리 등록한 doc, child_id → 생성된 child. """ def __init__(self, doc): self._docs = {doc.id: doc} self.added = [] self.commits = 0 self.enqueued = [] # enqueue_stage monkeypatch 가 채움 self._next_id = 1000 async def get(self, _model, oid): return self._docs.get(oid) async def execute(self, _stmt): # _create_children 의 기존 자식 조회 → 항상 빈(첫 분할). enqueue_stage 는 monkeypatch. return _ScalarResult([]) def add(self, obj): self.added.append(obj) # child Document 에 id 부여 (flush 대용 — _FakeDoc/실 Document 모두 setattr 가능) if getattr(obj, "id", None) is None and hasattr(obj, "presegment_role"): self._next_id += 1 obj.id = self._next_id self._docs[obj.id] = obj async def flush(self): for obj in self.added: if getattr(obj, "id", None) is None and hasattr(obj, "presegment_role"): self._next_id += 1 obj.id = self._next_id self._docs[obj.id] = obj async def commit(self): self.commits += 1 def _install_fake_fitz(monkeypatch, *, page_count=PAGE_COUNT, toc=None, first_lines=None): """sys.modules['fitz'] 에 fake 주입 — worker 의 `import fitz` 가 이걸 받게 한다.""" toc = toc or [] class _FakePage: def __init__(self, idx): self._idx = idx def get_text(self): if first_lines and self._idx < len(first_lines): return first_lines[self._idx] return f"page {self._idx + 1} body text" class _FakePdf: def __init__(self): self.page_count = page_count def get_toc(self, simple=True): return list(toc) def __getitem__(self, idx): return _FakePage(idx) def __enter__(self): return self def __exit__(self, *exc): return False fake = types.ModuleType("fitz") fake.open = lambda *_a, **_k: _FakePdf() monkeypatch.setitem(sys.modules, "fitz", fake) return fake class _SpyClient: """AIClient stand-in — call_deep 호출 횟수 카운트 + 지정 응답 반환.""" calls = 0 response = GOOD_LLM_JSON def __init__(self): type(self).calls += 1 # 인스턴스화 자체는 비용 아님 — 호출 카운트는 call_deep 기준 async def call_deep(self, prompt, system=None): type(self)._deep_calls += 1 return type(self).response async def close(self): pass @pytest.fixture(autouse=True) def _reset_spy(): _SpyClient.calls = 0 _SpyClient._deep_calls = 0 _SpyClient.response = GOOD_LLM_JSON yield # ─── (b) _llm_boundary_fallback 수락/거부 (mocked LLM) ────────────────────── @pytest.mark.asyncio async def test_fallback_accepts_good_and_creates_children(monkeypatch): """정상 LLM 응답 → 게이트 통과 → _create_children 가 3 자식 + parent 표식.""" _install_fake_fitz(monkeypatch) monkeypatch.setattr(pw, "AIClient", _SpyClient) # enqueue_stage 는 DB 의존 — no-op 으로 대체 (호출 인자만 기록) enq = [] async def _fake_enqueue(session, doc_id, stage, **kw): enq.append((doc_id, stage)) return True monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue) doc = _FakeDoc() session = _FakeSession(doc) ok = await pw._llm_boundary_fallback(doc, Path("/tmp/bundle.pdf"), PAGE_COUNT, session) assert ok is True assert _SpyClient._deep_calls == 1 # 자식 3개 생성 + parent 표식 + lineage 3 + commit children = [o for o in session.added if getattr(o, "presegment_role", None) == "child"] assert len(children) == 3 assert doc.presegment_role == "parent" assert sum(1 for o in session.added if o.__class__.__name__ == "DocumentLineage") == 3 assert {s for (_id, s) in enq} == {"extract"} @pytest.mark.asyncio async def test_fallback_rejects_bad_segments(monkeypatch): """LLM 이 중첩 경계 반환 → 게이트 거부 → False + 자식 0 (단일문서).""" _install_fake_fitz(monkeypatch) bad = json.dumps({ "is_bundle": True, "segments": [ {"start_page": 1, "end_page": 40}, {"start_page": 40, "end_page": 85}, # 중첩 {"start_page": 86, "end_page": 120}, ], }) _SpyClient.response = bad monkeypatch.setattr(pw, "AIClient", _SpyClient) async def _fake_enqueue(*a, **k): return True monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue) doc = _FakeDoc() session = _FakeSession(doc) ok = await pw._llm_boundary_fallback(doc, Path("/tmp/b.pdf"), PAGE_COUNT, session) assert ok is False assert _SpyClient._deep_calls == 1 assert [o for o in session.added if getattr(o, "presegment_role", None) == "child"] == [] assert doc.presegment_role is None @pytest.mark.asyncio async def test_fallback_rejects_is_bundle_false(monkeypatch): """is_bundle=false → 호출은 했으나 분할 안 함(False, 자식 0).""" _install_fake_fitz(monkeypatch) _SpyClient.response = json.dumps({"is_bundle": False, "segments": []}) monkeypatch.setattr(pw, "AIClient", _SpyClient) async def _fake_enqueue(*a, **k): return True monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue) doc = _FakeDoc() session = _FakeSession(doc) ok = await pw._llm_boundary_fallback(doc, Path("/tmp/b.pdf"), PAGE_COUNT, session) assert ok is False assert _SpyClient._deep_calls == 1 assert doc.presegment_role is None # ─── (c) flag gating — OFF=호출 0 (deployed default 무변), ON=호출됨 ─────────── @pytest.mark.asyncio async def test_flag_off_never_calls_llm(monkeypatch): """PRESEGMENT_LLM_FALLBACK=False(기본) → 큰 ToC-less PDF 도 LLM 미호출 = 오늘과 동일.""" monkeypatch.setattr(pw, "PRESEGMENT_LLM_FALLBACK", False) _install_fake_fitz(monkeypatch, page_count=120, toc=[]) # 대형 + level-1 ToC 없음 = 애매 monkeypatch.setattr(pw, "AIClient", _SpyClient) monkeypatch.setattr(pw, "_resolve_path", lambda raw: Path("/tmp/bundle.pdf")) async def _fake_enqueue(*a, **k): return True monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue) doc = _FakeDoc() session = _FakeSession(doc) await pw.process(doc.id, session) assert _SpyClient._deep_calls == 0 # ★ LLM 절대 호출 안 됨 assert doc.presegment_role is None # 단일문서 (분할 안 함) assert session.commits == 0 @pytest.mark.asyncio async def test_flag_on_calls_llm_and_splits(monkeypatch): """positive control — flag ON 이면 같은 입력에 LLM 호출 + 게이트 통과 시 분할.""" monkeypatch.setattr(pw, "PRESEGMENT_LLM_FALLBACK", True) _install_fake_fitz(monkeypatch, page_count=120, toc=[]) _SpyClient.response = GOOD_LLM_JSON monkeypatch.setattr(pw, "AIClient", _SpyClient) monkeypatch.setattr(pw, "_resolve_path", lambda raw: Path("/tmp/bundle.pdf")) async def _fake_enqueue(*a, **k): return True monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue) doc = _FakeDoc() session = _FakeSession(doc) await pw.process(doc.id, session) assert _SpyClient._deep_calls == 1 # LLM 호출됨 assert doc.presegment_role == "parent" # 분할 수행 children = [o for o in session.added if getattr(o, "presegment_role", None) == "child"] assert len(children) == 3