Files
hyungi_document_server/tests/test_presegment_llm.py
T
hyungi 23bb5ac9c9 feat(presegment): G2 PR-3 — LLM 경계 폴백 (flag-gated, 기본 OFF, scaffold-first)
ToC 없는/게이트 미달 대형 PDF(>=60p)에 한해 off-card Qwen(맥북, call_deep_or_defer,
StageDeferred-safe) 경계 제안 → 동일 검증게이트(_is_clear_bundle) 통과 시에만 deterministic 과
공유하는 _create_children 로 분할. is_bundle=false/파싱·검증 실패=단일문서(오늘과 동일)+로깅.
- env PRESEGMENT_LLM_FALLBACK 기본 false → 배포 동작 무변(LLM 미호출, 검증=unit test)
- 자식생성 _create_children 공유 헬퍼로 리팩터(deterministic+LLM 단일 경로, 동작 동일)
- SegmentationOutput Pydantic + parse_json_response(house 패턴) + per-page heading 샘플(본문 미전송)
- prompt app/prompts/presegment_boundaries.txt + tests/test_presegment_llm.py(14, fitz/DB/LLM mock)
no direct HTTP·no silent fallback. 활성=flag ON + 실 router fixture 검증 후.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-18 17:52:27 +09:00

401 lines
14 KiB
Python

"""PR-G2-3 — presegment LLM 경계 폴백 단위 테스트.
scaffold-first 안전성 박제:
(a) parse_json_response + SegmentationOutput 가 대표 fixture(ToC-less 120p → 3 segments) 검증
(b) 검증 게이트(_is_clear_bundle)가 정상 응답 수락 / 비정상(중첩·gap·tiny child·N>MAX) 거부
(c) flag OFF(기본) → LLM 절대 호출 안 함(call_deep count==0), flag ON → 호출됨(positive control)
DB·PyMuPDF 불요(unit) — AsyncSession 은 최소 fake, fitz 는 sys.modules 주입 fake.
라이브 LLM 호출 없음(call_deep 는 fixture 반환 monkeypatch). worker-process 레벨 E2E(실 PDF
번들 분할, 보류 백오프 DB 기록)는 GPU 라이브 게이트에서 별도 실측.
[[feedback_external_api_fixture_first]] / [[feedback_scaffold_first_for_external_cost_pr]]
"""
from __future__ import annotations
import json
import sys
import types
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "app"))
from ai.client import parse_json_response # noqa: E402
import workers.presegment_worker as pw # noqa: E402
from workers.presegment_worker import ( # noqa: E402
SegmentationOutput,
_is_clear_bundle,
_segments_from_output,
)
# ─── 대표 fixture: ToC-less 120p 번들 → 3 segments (1-based inclusive, 전범위·무중첩) ───
# Representative LLM reply for a 120-page bundle split into three segments.
# Page numbers are 1-based and inclusive; the spans below are contiguous and
# end exactly on PAGE_COUNT.
PAGE_COUNT = 120
GOOD_LLM_JSON = json.dumps(
    dict(
        is_bundle=True,
        segments=[
            dict(start_page=first, end_page=last, title=label)
            for first, last, label in (
                (1, 40, "문서 A"),
                (41, 85, "문서 B"),
                (86, 120, "문서 C"),
            )
        ],
        confidence=0.82,
    ),
    ensure_ascii=False,  # keep the Korean titles readable instead of \u-escaped
)
# ─── (a) parse_json_response + SegmentationOutput 검증 ──────────────────────
def test_parse_and_validate_good_fixture():
    """The representative fixture parses and validates into three segments."""
    payload = parse_json_response(GOOD_LLM_JSON)
    assert payload is not None

    result = SegmentationOutput.model_validate(payload)
    assert result.is_bundle is True

    segments = result.segments
    assert len(segments) == 3
    # The first segment starts on page 1 and the last one ends on the final page.
    assert segments[0].start_page == 1
    assert segments[-1].end_page == PAGE_COUNT
    assert result.confidence == pytest.approx(0.82)
def test_parse_tolerates_think_and_fence():
    """A <think> preamble plus a ```json fence around the payload still parses."""
    raw = f"<think>분석중...</think>\n```json\n{GOOD_LLM_JSON}\n```"
    result = SegmentationOutput.model_validate(parse_json_response(raw))
    assert result.is_bundle is True
    assert len(result.segments) == 3
# ─── (b) 검증 게이트 accept / reject ────────────────────────────────────────
def _segments(*spans):
    """Build segment dicts with empty titles from (start_page, end_page) pairs."""
    built = []
    for first, last in spans:
        built.append({"start_page": first, "end_page": last, "title": ""})
    return built
def test_gate_accepts_good():
    """The validation gate accepts the representative fixture with an empty reason."""
    validated = SegmentationOutput.model_validate(parse_json_response(GOOD_LLM_JSON))
    verdict, why = _is_clear_bundle(_segments_from_output(validated), PAGE_COUNT)
    # Surface the gate's reason in the failure message if it unexpectedly rejects.
    assert verdict is True, why
    assert why == ""
def test_gate_rejects_overlap():
    """Overlapping neighbours are rejected with a non_contiguous reason."""
    # The second segment should start at 41; starting at 40 overlaps the first.
    overlapping = _segments((1, 40), (40, 85), (86, 120))
    verdict, why = _is_clear_bundle(overlapping, PAGE_COUNT)
    assert verdict is False
    assert "non_contiguous" in why
def test_gate_rejects_gap():
    """A missing page between neighbours is rejected with a non_contiguous reason."""
    # The first segment ends at 40 but the next starts at 42, so page 41 is uncovered.
    gapped = _segments((1, 40), (42, 85), (86, 120))
    verdict, why = _is_clear_bundle(gapped, PAGE_COUNT)
    assert verdict is False
    assert "non_contiguous" in why
def test_gate_rejects_tiny_child():
    """A child that is too short is rejected with a child_too_small reason."""
    # The middle child spans 41..43 (3 pages); the original note states this is
    # below MIN_CHILD_PAGES (5).
    with_tiny_child = _segments((1, 40), (41, 43), (44, 120))
    verdict, why = _is_clear_bundle(with_tiny_child, PAGE_COUNT)
    assert verdict is False
    assert "child_too_small" in why
def test_gate_rejects_coverage_not_full():
    """Segments that stop short of the last page are rejected."""
    # The final segment ends at 110 while the document has PAGE_COUNT (120) pages.
    truncated = _segments((1, 40), (41, 85), (86, 110))
    verdict, why = _is_clear_bundle(truncated, PAGE_COUNT)
    assert verdict is False
    assert "last_end_not_page_count" in why
def test_gate_rejects_too_many_children():
    """One segment more than MAX_CHILDREN is rejected with too_many_children."""
    width = pw.MIN_CHILD_PAGES
    count = pw.MAX_CHILDREN + 1
    # Every child is exactly MIN_CHILD_PAGES wide, so only the count is over the limit.
    spans = []
    for index in range(count):
        start = index * width + 1
        spans.append((start, start + width - 1))
    verdict, why = _is_clear_bundle(_segments(*spans), count * width)
    assert verdict is False
    assert "too_many_children" in why
def test_gate_rejects_single_segment():
    """A single segment covering the whole document is not treated as a bundle."""
    whole_document = _segments((1, 120))
    verdict, why = _is_clear_bundle(whole_document, PAGE_COUNT)
    assert verdict is False
    assert "too_few_level1_entries" in why
# ─── 공통 fake (DB / PyMuPDF) ──────────────────────────────────────────────
class _FakeDoc:
    """Minimal stand-in holding only the Document fields presegment reads."""

    def __init__(self, doc_id=1):
        self.id = doc_id

        # Fields that start with a concrete value.
        populated = {
            "file_path": "PKM/bundle.pdf",
            "file_hash": "deadbeef",
            "file_format": "pdf",
            "file_size": 123,
            "file_type": "document",
            "import_source": "upload",
            "original_filename": "bundle.pdf",
            "title": "번들",
        }
        for name, value in populated.items():
            setattr(self, name, value)

        # Fields that start unset.
        unset = (
            "source_channel",
            "category",
            "data_origin",
            "doc_purpose",
            "material_type",
            "jurisdiction",
            "presegment_role",
            "bundle_page_start",
            "bundle_page_end",
            "extracted_at",
            "extracted_text",
        )
        for name in unset:
            setattr(self, name, None)
class _ScalarResult:
    """Tiny result object supporting the ``scalars().all()`` chain over fixed rows."""

    def __init__(self, rows):
        self._rows = rows

    def scalars(self):
        # No separate scalar view is needed; the same object answers all().
        return self

    def all(self):
        # Hand back a fresh list so callers cannot mutate the stored rows.
        return [*self._rows]
class _FakeSession:
    """Implements only the AsyncSession surface these tests exercise.

    execute() always returns an empty result; per the original notes it stands
    in for the existing-children lineage lookup (i.e. a first-time split).
    add()/flush() give id-less objects that have a ``presegment_role``
    attribute a fresh id and register them, so get() can find both the
    pre-registered document and any created child.
    """

    def __init__(self, doc):
        self._docs = {doc.id: doc}
        self.added = []
        self.commits = 0
        self.enqueued = []  # available for recording enqueue calls; nothing in this class fills it
        self._next_id = 1000

    def _assign_id(self, obj):
        """Give ``obj`` the next id and register it, if it is an id-less document."""
        lacks_id = getattr(obj, "id", None) is None
        if lacks_id and hasattr(obj, "presegment_role"):
            self._next_id += 1
            obj.id = self._next_id
            self._docs[obj.id] = obj

    async def get(self, _model, oid):
        return self._docs.get(oid)

    async def execute(self, _stmt):
        # Always empty, whatever the statement is.
        return _ScalarResult([])

    def add(self, obj):
        self.added.append(obj)
        # Assign the id right away so callers need not wait for flush().
        self._assign_id(obj)

    async def flush(self):
        # Catches anything that still has no id (e.g. appended to `added` directly).
        for pending in self.added:
            self._assign_id(pending)

    async def commit(self):
        self.commits += 1
def _install_fake_fitz(monkeypatch, *, page_count=PAGE_COUNT, toc=None, first_lines=None):
    """Install a fake ``fitz`` module in ``sys.modules`` and return it.

    The intent is that a later ``import fitz`` picks up this fake instead of
    PyMuPDF.

    Args:
        monkeypatch: pytest's monkeypatch fixture; used so the ``sys.modules``
            entry is restored after the test.
        page_count: value exposed as ``page_count`` on the fake PDF.
        toc: entries returned by ``get_toc()``; ``None`` or empty means no ToC.
        first_lines: optional per-page texts; page ``i`` returns
            ``first_lines[i]`` when present, otherwise a generic body string.

    Returns:
        The fake module object that was installed.
    """
    toc = toc or []

    class _FakePage:
        def __init__(self, idx):
            self._idx = idx

        def get_text(self):
            if first_lines and self._idx < len(first_lines):
                return first_lines[self._idx]
            return f"page {self._idx + 1} body text"

    class _FakePdf:
        def __init__(self):
            self.page_count = page_count

        def get_toc(self, simple=True):
            # Return a copy so callers cannot mutate the configured ToC.
            return list(toc)

        def __getitem__(self, idx):
            # Bound integer indices: without an IndexError, iterating the fake
            # via the sequence protocol (`for page in pdf`, `list(pdf)`) would
            # never terminate. Other index values keep their previous behaviour.
            if isinstance(idx, int) and idx >= self.page_count:
                raise IndexError(f"page {idx} not in document")
            return _FakePage(idx)

        def __enter__(self):
            return self

        def __exit__(self, *exc):
            # Never suppress exceptions raised inside the `with` body.
            return False

    fake = types.ModuleType("fitz")
    fake.open = lambda *_a, **_k: _FakePdf()
    monkeypatch.setitem(sys.modules, "fitz", fake)
    return fake
class _SpyClient:
    """AIClient stand-in that counts calls and returns a configurable reply.

    All state lives on the class, so tests can inspect it without holding on
    to the instance that the code under test creates.
    """

    calls = 0  # number of instantiations
    # Declared here so call_deep works even when the reset fixture has not run;
    # previously this attribute existed only after the fixture assigned it.
    _deep_calls = 0  # number of call_deep invocations (the count tests assert on)
    response = GOOD_LLM_JSON  # text returned by call_deep

    def __init__(self):
        # Instantiation itself is not the costly part; LLM usage is counted in call_deep.
        type(self).calls += 1

    async def call_deep(self, prompt, system=None):
        type(self)._deep_calls += 1
        return type(self).response

    async def close(self):
        pass
@pytest.fixture(autouse=True)
def _reset_spy():
    """Reset the class-level _SpyClient counters and reply before every test."""
    defaults = (
        ("calls", 0),
        ("_deep_calls", 0),
        ("response", GOOD_LLM_JSON),
    )
    for attr, value in defaults:
        setattr(_SpyClient, attr, value)
    yield
# ─── (b) _llm_boundary_fallback 수락/거부 (mocked LLM) ──────────────────────
@pytest.mark.asyncio
async def test_fallback_accepts_good_and_creates_children(monkeypatch):
    """A valid LLM reply passes the gate: three children are created and the parent is marked."""
    _install_fake_fitz(monkeypatch)
    monkeypatch.setattr(pw, "AIClient", _SpyClient)

    # enqueue_stage depends on the DB; swap in a recorder that keeps (doc_id, stage).
    recorded = []

    async def _fake_enqueue(session, doc_id, stage, **kw):
        recorded.append((doc_id, stage))
        return True

    monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue)

    parent = _FakeDoc()
    session = _FakeSession(parent)
    split = await pw._llm_boundary_fallback(parent, Path("/tmp/bundle.pdf"), PAGE_COUNT, session)

    assert split is True
    assert _SpyClient._deep_calls == 1

    # Three child documents, the parent marker, and three lineage rows.
    children = [obj for obj in session.added if getattr(obj, "presegment_role", None) == "child"]
    assert len(children) == 3
    assert parent.presegment_role == "parent"
    lineage_rows = [obj for obj in session.added if obj.__class__.__name__ == "DocumentLineage"]
    assert len(lineage_rows) == 3
    # Every recorded enqueue targets the "extract" stage.
    assert {stage for (_doc_id, stage) in recorded} == {"extract"}
@pytest.mark.asyncio
async def test_fallback_rejects_bad_segments(monkeypatch):
    """Overlapping boundaries from the LLM are rejected: False, no children, no parent marker."""
    _install_fake_fitz(monkeypatch)

    overlapping_reply = {
        "is_bundle": True,
        "segments": [
            {"start_page": 1, "end_page": 40},
            {"start_page": 40, "end_page": 85},  # overlaps the previous segment
            {"start_page": 86, "end_page": 120},
        ],
    }
    _SpyClient.response = json.dumps(overlapping_reply)
    monkeypatch.setattr(pw, "AIClient", _SpyClient)

    async def _fake_enqueue(*a, **k):
        return True

    monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue)

    parent = _FakeDoc()
    session = _FakeSession(parent)
    split = await pw._llm_boundary_fallback(parent, Path("/tmp/b.pdf"), PAGE_COUNT, session)

    assert split is False
    # The LLM was consulted once; only its answer was refused.
    assert _SpyClient._deep_calls == 1
    children = [obj for obj in session.added if getattr(obj, "presegment_role", None) == "child"]
    assert children == []
    assert parent.presegment_role is None
@pytest.mark.asyncio
async def test_fallback_rejects_is_bundle_false(monkeypatch):
    """is_bundle=false: the LLM is called once but nothing is split (False, zero children)."""
    _install_fake_fitz(monkeypatch)
    _SpyClient.response = json.dumps({"is_bundle": False, "segments": []})
    monkeypatch.setattr(pw, "AIClient", _SpyClient)

    async def _fake_enqueue(*a, **k):
        return True

    monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue)

    doc = _FakeDoc()
    session = _FakeSession(doc)
    ok = await pw._llm_boundary_fallback(doc, Path("/tmp/b.pdf"), PAGE_COUNT, session)

    assert ok is False
    assert _SpyClient._deep_calls == 1
    # Pin the "zero children" half of the contract as well, matching the
    # bad-segments rejection test; previously only the parent marker was checked.
    assert [o for o in session.added if getattr(o, "presegment_role", None) == "child"] == []
    assert doc.presegment_role is None
# ─── (c) flag gating — OFF=호출 0 (deployed default 무변), ON=호출됨 ───────────
@pytest.mark.asyncio
async def test_flag_off_never_calls_llm(monkeypatch):
    """With PRESEGMENT_LLM_FALLBACK off (the default), a large ToC-less PDF triggers no LLM call."""

    def _resolved(raw):
        return Path("/tmp/bundle.pdf")

    async def _fake_enqueue(*a, **k):
        return True

    monkeypatch.setattr(pw, "PRESEGMENT_LLM_FALLBACK", False)
    # Large document with an empty ToC: the ambiguous case the fallback targets.
    _install_fake_fitz(monkeypatch, page_count=120, toc=[])
    monkeypatch.setattr(pw, "AIClient", _SpyClient)
    monkeypatch.setattr(pw, "_resolve_path", _resolved)
    monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue)

    parent = _FakeDoc()
    session = _FakeSession(parent)
    await pw.process(parent.id, session)

    assert _SpyClient._deep_calls == 0  # the LLM must never be called
    assert parent.presegment_role is None  # stays a single document
    assert session.commits == 0
@pytest.mark.asyncio
async def test_flag_on_calls_llm_and_splits(monkeypatch):
    """Positive control: with the flag on, the same input calls the LLM and splits."""

    def _resolved(raw):
        return Path("/tmp/bundle.pdf")

    async def _fake_enqueue(*a, **k):
        return True

    monkeypatch.setattr(pw, "PRESEGMENT_LLM_FALLBACK", True)
    _install_fake_fitz(monkeypatch, page_count=120, toc=[])
    _SpyClient.response = GOOD_LLM_JSON
    monkeypatch.setattr(pw, "AIClient", _SpyClient)
    monkeypatch.setattr(pw, "_resolve_path", _resolved)
    monkeypatch.setattr(pw, "enqueue_stage", _fake_enqueue)

    parent = _FakeDoc()
    session = _FakeSession(parent)
    await pw.process(parent.id, session)

    assert _SpyClient._deep_calls == 1  # the LLM was called
    assert parent.presegment_role == "parent"  # the split happened
    children = [obj for obj in session.added if getattr(obj, "presegment_role", None) == "child"]
    assert len(children) == 3