"""Unit tests for builder.py char_start emission (plan ds-outline-anchor-b5 g2 / g0-t2).

Core invariants:
- char_start equals the FE outlineAnchors.ts line/offset model
  (split('\\n') + UTF-16 code units + code fences).
- Even with an astral (non-BMP) prefix the offset must be expressed in UTF-16
  code units (gate for the #2 SILENT unit bug).
- window-child char_start=None, split-parent char_start=heading offset (B1/#1).
- Headings inside a code fence are not detected (O3).
- The line-model change does not alter node.text (hash-neutral) — hash_stable
  docs are preserved.
"""
from __future__ import annotations

import hashlib

from app.services.hier_decomp.builder import build_hier_tree, coverage_stats, _utf16_units


def _fe_offset_of_line(md: str, target_line: str) -> int | None:
    """Reproduce FE outlineAnchors.ts:55-65 — char_start must equal this value.

    Walks ``md`` line by line (split on "\\n") and returns the UTF-16 code-unit
    offset of the first line equal to ``target_line``, or ``None`` when no line
    matches.
    """
    offset = 0
    for line in md.split("\n"):
        if line == target_line:
            return offset
        # Width of the line in UTF-16 code units, plus one for the separator.
        offset += len(line.encode("utf-16-le")) // 2 + 1
    return None


def _u16_slice(md: str, cs: int, n: int) -> str:
    """Return ``n`` UTF-16 code units of ``md`` starting at code-unit offset ``cs``."""
    encoded = md.encode("utf-16-le")
    # Each UTF-16 code unit occupies two bytes in the little-endian encoding.
    first_byte = 2 * cs
    last_byte = 2 * (cs + n)
    return encoded[first_byte:last_byte].decode("utf-16-le")


def test_char_start_matches_fe_offset_and_slices():
    """Every emitted char_start equals the FE offset and slices back to the heading."""
    md = "# Alpha\nbody alpha here\n\n## Beta\nbody beta\n# Gamma\nlast line"
    anchored = [
        node
        for node in build_hier_tree(md, leaf_hard_max=100000)
        if node.char_start is not None
    ]
    for node in anchored:
        # The first line of the node text is compared against the FE model.
        heading = node.text.partition("\n")[0]
        assert node.char_start == _fe_offset_of_line(md, heading), node.section_title
        assert _u16_slice(md, node.char_start, _utf16_units(heading)) == heading
    assert len(anchored) >= 2


def test_astral_prefix_offset_is_utf16_not_codepoint():
    """An astral prefix must shift char_start by UTF-16 units, not code points."""
    # 📄 = U+1F4C4 is a single code point but a UTF-16 surrogate pair (2 code units).
    heading = "# Section One"
    md = "\U0001F4C4 manifest\n\n# Section One\nbody"
    tree = build_hier_tree(md)
    section = next(node for node in tree if node.section_title == "Section One")
    expected = _fe_offset_of_line(md, heading)
    assert section.char_start == expected
    # The UTF-16 slice lands exactly on the heading.
    assert _u16_slice(md, section.char_start, _utf16_units(heading)) == heading
    # The code-point slice must be misaligned (because of the astral char) —
    # had the unit bug been present, the code-point slice would have matched.
    begin = section.char_start
    assert md[begin: begin + len(heading)] != heading


def test_fenced_heading_not_detected():
    """A '#' line inside a code fence must not become a section title."""
    md = "# Real\nintro\n```\n# Fake In Fence\n```\n# Real Two\nx"
    titles = [node.section_title for node in build_hier_tree(md) if node.section_title]
    assert "Fake In Fence" not in titles
    assert all(wanted in titles for wanted in ("Real", "Real Two"))


def test_window_child_null_split_parent_has_offset():
    """Split parents carry a char_start; their window children carry None."""
    md = "# BigSection\n" + ("paragraph text here. " * 20 + "\n\n") * 60
    nodes = build_hier_tree(md, leaf_hard_max=5000, leaf_target_max=3000)
    split_parents = [
        node for node in nodes if node.node_type and node.node_type.endswith("_split")
    ]
    window_children = [node for node in nodes if node.node_type == "window"]
    assert split_parents
    assert split_parents[0].char_start is not None
    assert window_children
    assert all(child.char_start is None for child in window_children)


def test_node_text_preserved_hash_neutral():
    """Concatenating the selected node texts reproduces the input exactly."""
    # Even with the line-model change (split vs splitlines), joining the leaves
    # reconstructs the original text, so the hash is unchanged.
    md = "# A\nl1\nl2\n# B\nl3\n# C\nl4\n"
    nodes = build_hier_tree(md, leaf_hard_max=100000)
    pieces = []
    for node in nodes:
        if node.is_leaf or (node.node_type and node.node_type.endswith("_split")):
            pieces.append(node.text)
    assert "".join(pieces) == md


def test_preamble_char_start_none():
    """Text before the first heading yields a level-0 node with no char_start."""
    md = "intro paragraph with no heading\nmore intro\n# First\nbody"
    nodes = build_hier_tree(md, leaf_hard_max=100000)
    preamble = [
        node for node in nodes if node.section_title is None and node.level == 0
    ]
    assert preamble
    assert preamble[0].char_start is None


def test_coverage_stats_char_start_telemetry():
    """coverage_stats reports char_start totals, verification count and NFC status."""
    md = "# Alpha\nbody\n# Beta\nbody2"
    nodes = build_hier_tree(md, leaf_hard_max=100000)
    stats = coverage_stats(md, nodes)
    total = stats["char_start_total"]
    assert total >= 2
    assert stats["char_start_verified"] == total  # every one passes O5
    assert stats["non_nfc"] == 0