feat(documents): hier 절 char_start offset (Path B) — md_content 점프 builder offset
플랜 ds-outline-anchor-b5 (g1~g6 코드). 핵심 ASME/법령 windowed 절의 0% 점프를
서버계산 char_start(builder offset)로 100% deterministic 점프로 전환.
- g1 migration 318: document_chunks.char_start INTEGER NULL (단일 statement, 멱등)
- g2 builder: char_start emit = FE 라인/offset 모델 미러(split('\n')+UTF-16 code unit+코드펜스 skip).
window-child=NULL, split-parent=heading offset, preamble=NULL, CR 미strip, NFC=telemetry.
node.text 보존(라인모델 hash-neutral) → hash_stable doc 보존. 단위테스트 7건.
- g3 persist+backfill 하이브리드:
* persist INSERT char_start
* update-char-start (g3-tU): hash_stable doc 비파괴 — 100% jump-target VERIFY(NEW-1) +
position-aligned PK UPDATE(NEW-2), 미달 doc DEMOTE → re-decompose 합류(NEW-4)
* --reprocess (g3-t2): md_content 출처(g0-t1) + jump-target-set 완료마커(B1) + B_jumptarget>=1(B3),
--doc 필수 else REFUSE. self-heal sweep(g3-t3).
- g4 /sections: char_start inner+outer SELECT + split-parent 노출(is_leaf OR %_split)
- g5 FE: resolveAnchorMap(BE-first, NEW-5 jump-target-candidate-scoped 폴백, C1 OR-exclude),
per-render-site basis guard(C3), endsWith('_split') 정정 + collapseWindows split-parent 흡수(C2).
단위테스트 25건(NEW-5/B4/C1/C2 포함).
- g6 hier_outline_quality_gate.py: read-only g-measure(verdict/B_jumptarget/hash_stable/dup/fence)
배포(g7: --no-deps, 스냅샷, UPDATE-only 32 + re-decompose 230∪demote, 정확도 게이트)는 별 ops 단계.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
"""builder.py char_start emit 단위테스트 (플랜 ds-outline-anchor-b5 g2 / g0-t2).
|
||||
|
||||
핵심 불변식:
|
||||
- char_start = FE outlineAnchors.ts 라인/offset 모델(split('\n') + UTF-16 code unit + 코드펜스)과 동일.
|
||||
- astral(BMP 밖) prefix 가 있어도 UTF-16 code unit offset 이어야 함 (#2 SILENT 단위버그 게이트).
|
||||
- window-child char_start=None, split-parent char_start=heading offset (B1/#1).
|
||||
- 코드펜스 내부 heading 미탐지 (O3).
|
||||
- 라인모델 변경이 node.text 를 바꾸지 않음(hash-neutral) — hash_stable doc 보존.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
|
||||
from app.services.hier_decomp.builder import build_hier_tree, coverage_stats, _utf16_units
|
||||
|
||||
|
||||
def _fe_offset_of_line(md: str, target_line: str) -> int | None:
|
||||
"""FE outlineAnchors.ts:55-65 재현 — char_start 가 이 값과 같아야 함."""
|
||||
off = 0
|
||||
for raw in md.split("\n"):
|
||||
if raw == target_line:
|
||||
return off
|
||||
off += len(raw.encode("utf-16-le")) // 2 + 1
|
||||
return None
|
||||
|
||||
|
||||
def _u16_slice(md: str, cs: int, n: int) -> str:
|
||||
return md.encode("utf-16-le")[2 * cs: 2 * (cs + n)].decode("utf-16-le")
|
||||
|
||||
|
||||
def test_char_start_matches_fe_offset_and_slices():
    """Every node with a ``char_start`` must agree with the FE offset model.

    For each such node, the first line of its text must sit at exactly
    ``char_start`` (in UTF-16 code units) inside the source markdown, and at
    least two nodes must carry an offset.
    """
    md = "# Alpha\nbody alpha here\n\n## Beta\nbody beta\n# Gamma\nlast line"
    anchored = [
        node
        for node in build_hier_tree(md, leaf_hard_max=100000)
        if node.char_start is not None
    ]
    for node in anchored:
        heading, _, _ = node.text.partition("\n")
        expected = _fe_offset_of_line(md, heading)
        assert node.char_start == expected, node.section_title
        # Slicing the source at the reported offset must give back the heading.
        width = _utf16_units(heading)
        assert _u16_slice(md, node.char_start, width) == heading
    assert len(anchored) >= 2
|
||||
|
||||
|
||||
def test_astral_prefix_offset_is_utf16_not_codepoint():
    """An astral character before the heading must shift the offset by two units.

    U+1F4C4 is a single code point but a UTF-16 surrogate pair (two code
    units), so a code-point-based offset would differ from the expected
    UTF-16 offset by one.
    """
    heading = "# Section One"
    md = "\U0001F4C4 manifest\n\n# Section One\nbody"
    tree = build_hier_tree(md)
    section = next(node for node in tree if node.section_title == "Section One")
    expected = _fe_offset_of_line(md, "# Section One")
    assert section.char_start == expected
    # A UTF-16 code-unit slice lands exactly on the heading.
    sliced = _u16_slice(md, section.char_start, _utf16_units("# Section One"))
    assert sliced == "# Section One"
    # A code-point slice must be misaligned because of the astral prefix; if
    # the offset were counted in code points, this slice would match instead.
    begin = section.char_start
    end = begin + len(heading)
    assert md[begin:end] != "# Section One"
|
||||
|
||||
|
||||
def test_fenced_heading_not_detected():
    """A ``#`` line inside a code fence must not become a section title."""
    md = "# Real\nintro\n```\n# Fake In Fence\n```\n# Real Two\nx"
    titles = []
    for node in build_hier_tree(md):
        if node.section_title:
            titles.append(node.section_title)
    assert "Fake In Fence" not in titles
    # Both headings outside the fence are still detected.
    assert "Real" in titles
    assert "Real Two" in titles
|
||||
|
||||
|
||||
def test_window_child_null_split_parent_has_offset():
    """Split-parent nodes carry a ``char_start``; window children do not.

    The single section body is roughly 25k characters, well above the
    ``leaf_hard_max=5000`` passed to the builder, so both ``*_split`` and
    ``window`` nodes are expected in the output.
    """
    md = "# BigSection\n" + ("paragraph text here. " * 20 + "\n\n") * 60
    tree = build_hier_tree(md, leaf_hard_max=5000, leaf_target_max=3000)
    split_parents = []
    window_children = []
    for node in tree:
        kind = node.node_type
        if kind and kind.endswith("_split"):
            split_parents.append(node)
        if kind == "window":
            window_children.append(node)
    assert split_parents and split_parents[0].char_start is not None
    assert window_children and all(
        child.char_start is None for child in window_children
    )
|
||||
|
||||
|
||||
def test_node_text_preserved_hash_neutral():
    """Concatenating leaf / split-parent texts must reproduce the source exactly.

    Per the original note: the line-model change (``split`` vs ``splitlines``)
    must leave ``node.text`` untouched so that content hashes stay stable.
    """
    md = "# A\nl1\nl2\n# B\nl3\n# C\nl4\n"
    tree = build_hier_tree(md, leaf_hard_max=100000)
    pieces = []
    for node in tree:
        is_split_parent = node.node_type and node.node_type.endswith("_split")
        if node.is_leaf or is_split_parent:
            pieces.append(node.text)
    rebuilt = "".join(pieces)
    assert rebuilt == md
|
||||
|
||||
|
||||
def test_preamble_char_start_none():
    """Text before the first heading yields a node with ``char_start`` of None.

    The preamble is identified here as a node with no section title at
    level 0.
    """
    md = "intro paragraph with no heading\nmore intro\n# First\nbody"
    tree = build_hier_tree(md, leaf_hard_max=100000)
    untitled_roots = [
        node
        for node in tree
        if node.section_title is None and node.level == 0
    ]
    assert untitled_roots and untitled_roots[0].char_start is None
|
||||
|
||||
|
||||
def test_coverage_stats_char_start_telemetry():
    """``coverage_stats`` reports char_start telemetry for a simple document.

    Checks that at least two offsets are counted, that the verified count
    equals the total, and that the ``non_nfc`` counter is zero.
    """
    md = "# Alpha\nbody\n# Beta\nbody2"
    tree = build_hier_tree(md, leaf_hard_max=100000)
    stats = coverage_stats(md, tree)
    total = stats["char_start_total"]
    assert total >= 2
    # Every counted offset passes verification ("O5" in the original note).
    assert stats["char_start_verified"] == total
    assert stats["non_nfc"] == 0
|
||||
Reference in New Issue
Block a user