feat(summarize): presegment PR1 — summarize_units 순수함수(greedy-pack + 3-way 게이트)

plan ds-presegment-mapreduce-2 PR1. CAP 12K tok/unit · TRIGGER 25K ·
over% 게이트(0=auto/<=40=hybrid/>40=whole). 토큰추정=PR0 실 Qwen 캘리브
(KO 0.529/기타 0.217 tok/char). leaf=hier_decomp.builder 재사용
(leaf_hard_max=inf 로 window-split 억제). 순수함수·DB/IO 0·배선은 PR2.
tests/summarize_units 15 passed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-07-01 23:07:40 +00:00
parent a182def9e6
commit 61e70864e4
2 changed files with 346 additions and 0 deletions
+180
View File
@@ -0,0 +1,180 @@
"""summarize_units 단위테스트 (presegment PR1 — 순수함수·fixture).
핵심 불변식:
- estimate_tokens = PR0 캘리브레이션(한글 0.529 · 기타 0.217 tok/char) 정확 재현.
- greedy_pack: 순서 보존·인접만·cap 준수·단독 초과 leaf=over_cap 전용 유닛·텍스트 손실 0
(구 deep_summary head/mid/tail 가운데 폐기 버그의 반대 성질).
- gate 3-way: 0=auto / (0,40]=hybrid / >40=whole (경계 포함).
- plan_summarize_units: trigger 이하=single(현행 단일콜 유지=무회귀) / 초과=map_reduce.
pytest + 단독 실행 양쪽 지원:
PYTHONPATH=. .venv/bin/pytest tests/summarize_units/ -q
"""
from __future__ import annotations
from app.services.hier_decomp.builder import HierNode
from app.services.summarize_units import (
CAP_TOKENS,
TRIGGER_TOKENS,
SummarizeUnit,
estimate_tokens,
extract_leaves,
gate,
greedy_pack,
over_pct,
plan_summarize_units,
)
def _leaf(idx: int, text: str, title: str | None = None) -> HierNode:
    """Build a parentless level-1 ``HierNode`` fixture.

    ``title`` is used for both ``section_title`` and ``heading_path``;
    ``node_type`` and ``parent_idx`` are left as ``None``.
    """
    fields = {
        "idx": idx,
        "parent_idx": None,
        "level": 1,
        "node_type": None,
        "section_title": title,
        "heading_path": title,
        "text": text,
    }
    return HierNode(**fields)
# ---------- estimate_tokens ----------
def test_estimate_tokens_korean_calibration():
    """1000 Hangul characters must estimate to 529 tokens (PR0: 0.529 tok/char)."""
    # The filler has to be an actual Hangul syllable: an empty literal
    # repeated 1000 times is still "" and would estimate to 0 tokens.
    hangul_text = "가" * 1000
    assert estimate_tokens(hangul_text) == 529
def test_estimate_tokens_english_calibration():
    """1000 non-Hangul characters must estimate to 217 tokens (PR0: 0.217 tok/char)."""
    ascii_text = "a" * 1000
    expected_tokens = 217
    assert estimate_tokens(ascii_text) == expected_tokens
def test_estimate_tokens_mixed_and_empty():
    """Empty text is 0 tokens; mixed text blends the two calibrated rates."""
    assert estimate_tokens("") == 0
    # 100 Hangul chars + 100 ASCII chars. The Hangul half must be non-empty,
    # otherwise only the 0.217 rate is exercised and the expectation is wrong.
    mixed = "가" * 100 + "a" * 100
    assert estimate_tokens(mixed) == round(100 * 0.529 + 100 * 0.217)
# ---------- greedy_pack ----------
def test_greedy_pack_adjacency_and_cap():
    """Adjacent leaves are packed in order up to (and including) the cap."""
    # Four Hangul leaves of ≈4000 tok each (4000 / 0.529 ≈ 7562 chars).
    # 3 × 4000 = 12000 lands exactly on the cap (<= cap is allowed), so the
    # expected packing is [s0, s1, s2] + [s3].
    body = "가" * 7562  # ≈ 3999~4000 tok; must be Hangul, not an empty filler
    leaves = [_leaf(i, body, f"s{i}") for i in range(4)]
    units = greedy_pack(leaves, cap=12_000)
    assert len(units) == 2
    assert [len(u.section_titles) for u in units] == [3, 1]
    # Order is preserved.
    assert units[0].section_titles == ["s0", "s1", "s2"]
    assert units[1].section_titles == ["s3"]
    # The cap is respected.
    assert all(u.est_tokens <= 12_000 for u in units)
def test_greedy_pack_oversized_leaf_gets_own_unit():
    """A leaf that alone exceeds the cap becomes its own ``over_cap`` unit."""
    # Both fillers must be real Hangul text; empty fillers would make every
    # leaf 0 tokens and nothing would ever exceed the cap.
    small = "가" * 1000  # ≈ 529 tok
    big = "가" * 30_000  # ≈ 15,870 tok > CAP
    leaves = [_leaf(0, small, "a"), _leaf(1, big, "mega"), _leaf(2, small, "b")]
    units = greedy_pack(leaves, cap=CAP_TOKENS)
    assert len(units) == 3
    assert units[1].over_cap and units[1].section_titles == ["mega"]
    assert not units[0].over_cap and not units[2].over_cap
    # Adjacency: the packs before and after the oversized leaf are not merged
    # across it.
    assert units[0].section_titles == ["a"] and units[2].section_titles == ["b"]
def test_greedy_pack_no_text_loss():
    """Every leaf's text must appear somewhere in the packed units."""
    # Each leaf carries 500 Hangul filler chars (≈ 265 tok), so seven leaves
    # total well above cap=1000. With an empty filler the leaves would be a
    # few characters each and the cap would never come into play.
    leaves = [_leaf(i, f"본문{i} " + "가" * 500, f"s{i}") for i in range(7)]
    units = greedy_pack(leaves, cap=1_000)
    joined = "\n\n".join(u.text for u in units)
    for leaf in leaves:
        assert leaf.text in joined  # coverage: nothing dropped from the middle
def test_greedy_pack_empty():
    """Packing an empty list of leaves yields an empty list of units."""
    packed = greedy_pack([])
    assert packed == []
# ---------- over_pct + gate ----------
def test_over_pct_and_gate_boundaries():
    """Check the gate's three-way split at and around its boundaries.

    0 -> "auto", (0, 40] -> "hybrid", > 40 -> "whole". Only ``gate`` is
    exercised here, despite the test's name.
    """
    expectations = (
        (0.0, "auto"),
        (0.01, "hybrid"),
        (40.0, "hybrid"),
        (40.01, "whole"),
        (100.0, "whole"),
    )
    for pct, expected_tier in expectations:
        assert gate(pct) == expected_tier
def test_over_pct_computation():
    """over_pct is the over-cap leaves' share of the total, as a percentage."""
    # Leaves of ≈6000 tok and ≈18000 tok (over cap): 18000 / 24000 = 75%.
    # The texts must be Hangul filler sized via the 0.529 tok/char rate; an
    # empty filler would produce two empty leaves.
    l_small = _leaf(0, "가" * round(6000 / 0.529), "a")
    l_big = _leaf(1, "가" * round(18000 / 0.529), "b")
    pct = over_pct([l_small, l_big], cap=CAP_TOKENS)
    assert 74.0 < pct < 76.0
    # No leaves, or no leaf above the cap, means 0%.
    assert over_pct([], cap=CAP_TOKENS) == 0.0
    assert over_pct([l_small], cap=CAP_TOKENS) == 0.0
# ---------- plan_summarize_units (fixture md) ----------
def _md_doc(sections: int, chars_per_section: int, ch: str = "가") -> str:
    """Build a markdown fixture of ``sections`` level-1 headed sections.

    Each section is a ``# 제{n}장 섹션{i}`` heading, a blank line, and a body
    of ``ch`` repeated ``chars_per_section`` times. Sections are separated
    by a blank line.

    Args:
        sections: Number of sections to generate; 0 yields an empty string.
        chars_per_section: Number of repetitions of ``ch`` in each body.
        ch: Filler text for the bodies. Defaults to a Hangul syllable so the
            bodies are non-empty; an empty default would produce
            heading-only documents.

    Returns:
        The assembled markdown text.
    """
    return "\n\n".join(
        f"# 제{i+1}장 섹션{i}\n\n" + ch * chars_per_section
        for i in range(sections)
    )
def test_plan_small_doc_stays_single():
    """A document at or below the trigger keeps the single-call plan."""
    # 3 sections × 1000 Hangul chars ≈ 3 × 529 tok, far below the trigger.
    # The filler is passed explicitly so the bodies are never empty.
    md = _md_doc(3, 1000, "가")
    plan = plan_summarize_units(md)
    assert plan.mode == "single" and plan.tier is None and plan.units == []
    assert plan.total_est_tokens <= TRIGGER_TOKENS
def test_plan_large_doc_auto_tier():
    """Above the trigger with no over-cap section, the plan is map_reduce/auto."""
    # 20 sections × ≈4000 tok ≈ 80K tok > trigger, every section < cap → auto.
    # The filler is passed explicitly: with empty bodies the document would
    # be a handful of headings and never cross the trigger.
    md = _md_doc(20, 7562, "가")
    plan = plan_summarize_units(md)
    assert plan.mode == "map_reduce"
    assert plan.tier == "auto" and plan.over_pct == 0.0
    assert len(plan.units) >= 2
    assert all(u.est_tokens <= CAP_TOKENS for u in plan.units)
def test_plan_mega_section_whole_tier():
    """One section holding more than 40% of the tokens pushes the plan to whole."""
    # Two small sections + one huge section (≈53K tok, >40% of the total).
    # Both the section bodies and the mega body need real Hangul filler.
    md = (_md_doc(2, 7562, "가")
          + "\n\n# 메가섹션\n\n" + "가" * 100_000)
    plan = plan_summarize_units(md)
    assert plan.mode == "map_reduce"
    assert plan.tier == "whole" and plan.over_pct > 40.0
    assert any(u.over_cap for u in plan.units)
def test_plan_hybrid_tier():
    """An over-cap share in (0, 40] percent selects the hybrid tier."""
    # 15 regular sections (≈60K tok) + 1 over-cap section (≈15.9K tok)
    # → over% ≈ 21% → hybrid. Fillers are explicit Hangul text so the token
    # estimates match these figures.
    md = _md_doc(15, 7562, "가") + "\n\n# 초과섹션\n\n" + "가" * 30_000
    plan = plan_summarize_units(md)
    assert plan.mode == "map_reduce"
    assert plan.tier == "hybrid"
    assert 0.0 < plan.over_pct <= 40.0
    over_units = [u for u in plan.units if u.over_cap]
    # Per the original note: in hybrid, only these units are the Claude targets.
    assert len(over_units) == 1
def test_plan_headingless_giant_is_whole():
    """A huge heading-less English document plans as map_reduce/whole.

    With no headings the text is a single leaf that exceeds the cap, so the
    over-cap share is 100% (PR0 note: many EN books look like this).
    """
    giant_text = "x" * 200_000  # ≈ 43K tok > trigger, single leaf > cap
    plan = plan_summarize_units(giant_text)
    assert plan.mode == "map_reduce"
    assert plan.tier == "whole"
def test_plan_deterministic():
    """Planning the same document twice yields equal plans."""
    # 10 sections × ≈4000 tok. The filler is explicit so the comparison
    # covers a document with real section bodies, not just headings.
    md = _md_doc(10, 7562, "가")
    p1, p2 = plan_summarize_units(md), plan_summarize_units(md)
    assert p1 == p2
if __name__ == "__main__":
    # Standalone runner: call every module-level ``test_*`` object in name
    # order, report each one, then exit with status 0. A failing assertion
    # propagates and aborts the run.
    import sys

    namespace = globals()
    fns = [namespace[name] for name in sorted(namespace) if name.startswith("test_")]
    for fn in fns:
        fn()
        print(f"ok {fn.__name__}")
    print(f"{len(fns)} passed (standalone)")
    sys.exit(0)