diff --git a/app/api/documents.py b/app/api/documents.py
index ab46978..1259fc6 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -663,8 +663,9 @@ class SectionItem(BaseModel):
section_title: str | None = None # raw 마크다운 포함 — 정제는 프런트(headingPath.ts)
heading_path: str | None = None # raw
level: int | None = None
- node_type: str | None = None # window | section_split | null
+ node_type: str | None = None # window | chapter_split | clause_split | section_split | null
is_leaf: bool
+ char_start: int | None = None # md_content 내 heading offset(UTF-16). jump-target 만 값, 그 외 None (Path B)
section_type: str | None = None
summary: str | None = None # status='summarized' 인 분석행에만, 그 외 None
confidence: float | None = None
@@ -703,12 +704,12 @@ async def get_document_sections(
await session.execute(
sql_text(
"""
- SELECT chunk_id, section_title, heading_path, level, node_type, is_leaf,
+ SELECT chunk_id, section_title, heading_path, level, node_type, is_leaf, char_start,
section_type, summary, confidence
FROM (
SELECT DISTINCT ON (c.id)
c.id AS chunk_id, c.chunk_index, c.section_title, c.heading_path,
- c.level, c.node_type, c.is_leaf,
+ c.level, c.node_type, c.is_leaf, c.char_start,
a.section_type,
CASE WHEN a.status = 'summarized' THEN a.summary ELSE NULL END AS summary,
a.confidence
@@ -717,7 +718,7 @@ async def get_document_sections(
ON a.chunk_id = c.id AND a.status = 'summarized'
WHERE c.doc_id = :doc_id
AND c.source_type = 'hier_section'
- AND c.is_leaf = true
+ AND (c.is_leaf = true OR c.node_type LIKE '%\\_split' ESCAPE '\\')
ORDER BY c.id, a.created_at DESC, a.id DESC
) t
ORDER BY t.chunk_index
diff --git a/app/services/hier_decomp/builder.py b/app/services/hier_decomp/builder.py
index 721eab7..3718e54 100644
--- a/app/services/hier_decomp/builder.py
+++ b/app/services/hier_decomp/builder.py
@@ -13,6 +13,7 @@
from __future__ import annotations
import re
import hashlib
+import unicodedata
from dataclasses import dataclass, field
STRUCTURE_SPLIT_THRESHOLD = 4000
@@ -27,6 +28,17 @@ _KO_JEOL = re.compile(r'^\s*(?P
제\s*\d+\s*절\b.*)$')
_KO_JO = re.compile(r'^\s*(?P제\s*\d+\s*조\b.*)$')
_ENG = re.compile(r'^\s*(?P(?:Chapter|Section|Article|Part|PART)\s+[\dIVXLA-Z]+\b.*)$')
+# 코드펜스 경계 (FE outlineAnchors.ts:60 `/^\s{0,3}(```|~~~)/` 와 동일). 펜스 내부 라인은
+# heading 미탐지 — 코드블록 안 '# foo' 가 가짜 절을 만들지 않게(O3).
+_FENCE = re.compile(r'^\s{0,3}(```|~~~)')
+
+
+def _utf16_units(s: str) -> int:
+ """Return the number of UTF-16 code units in s (same as a JS string's .length); astral (non-BMP) chars = surrogate pair = 2 units.
+ The FE's `raw.length` / `out.slice(off)` work in UTF-16 code units, so char_start must use the same unit.
+ len(s.encode('utf-16-le')) // 2 = code-unit count (utf-16-le does not prepend a BOM)."""
+ return len(s.encode("utf-16-le")) // 2
+
@dataclass
class HierNode:
@@ -39,6 +51,9 @@ class HierNode:
text: str
is_leaf: bool = True
chunk_content_hash: str = field(default="")
+ # md_content 내 heading 라인 시작 offset(UTF-16 code unit). jump-target(비-window leaf / %_split parent)만
+ # 값 보유; window-child / preamble(title None) = None(점프 타깃 아님, g0-t2/g2-t3).
+ char_start: int | None = None
def finalize_hash(self):
self.chunk_content_hash = hashlib.sha256(self.text.encode("utf-8")).hexdigest()
@@ -57,33 +72,64 @@ def _detect_heading(line: str) -> tuple[int, str, str] | None:
return None
-def _segment(text: str) -> list[tuple[int, str | None, str | None, str]]:
- """heading 경계로 분할 → [(level, title, node_type, segment_text), ...].
+def _segment(text: str) -> list[tuple[int, str | None, str | None, str, int | None]]:
+ """Split text on heading boundaries -> [(level, title, node_type, segment_text, char_start), ...].
- preamble(첫 heading 이전 본문) = (0, None, None, text).
+ Line model mirrors FE outlineAnchors.ts:55-65: split on LF only + UTF-16 code-unit offsets +
+ code-fence tracking (splitlines(keepends=True) was dropped because it also breaks lines on VT/FF/FS-style separators, which the JS side does not).
+ char_start = UTF-16 offset of the segment's first line (= the heading line). Preamble = None (not a jump target).
+ node.text is preserved (hash-neutral across the line-model change): each group is LF-joined and, unless it is the last group, the separating LF
+ is appended back to the end of that group (= what splitlines(keepends) did by leaving the LF on the last line), so the text equals the original.
+ CR is not stripped (with CRLF the CR stays, matching FE raw.length); no NFC normalization is applied.
"""
- lines = text.splitlines(keepends=True)
- segs: list[tuple[int, str | None, str | None, list[str]]] = []
- cur: tuple[int, str | None, str | None, list[str]] | None = None
- preamble: list[str] = []
- for ln in lines:
- h = _detect_heading(ln.rstrip("\n"))
- if h:
- if cur is not None:
- segs.append(cur)
- elif preamble and "".join(preamble).strip():
- segs.append((0, None, None, preamble))
- cur = (h[0], h[1], h[2], [ln])
+ raw_lines = text.split("\n")
+ n = len(raw_lines)
+ # Precompute (offset, heading) per line — lines inside a fence or on a fence boundary are never detected as headings.
+ offs: list[int] = []
+ headings: list[tuple[int, str, str | None] | None] = []
+ off = 0
+ in_fence = False
+ for raw in raw_lines:
+ fence_toggle = bool(_FENCE.match(raw))
+ fenced_here = in_fence or fence_toggle
+ offs.append(off)
+ headings.append(None if fenced_here else _detect_heading(raw))
+ if fence_toggle:
+ in_fence = not in_fence
+ off += _utf16_units(raw) + 1 # +1 for the LF separator
+
+ # Group boundaries = the preamble before the first heading + every heading line; starts/metas are parallel lists.
+ first_heading = next((i for i in range(n) if headings[i] is not None), None)
+ starts: list[int] = []
+ metas: list[tuple[int, str | None, str | None] | None] = []
+ if first_heading is None:
+ starts.append(0)
+ metas.append(None) # whole text = preamble
+ else:
+ if first_heading > 0:
+ starts.append(0)
+ metas.append(None)
+ for i in range(first_heading, n):
+ h = headings[i]
+ if h is not None:
+ starts.append(i)
+ metas.append((h[0], h[1], h[2]))
+
+ segs: list[tuple[int, str | None, str | None, str, int | None]] = []
+ for gi, s_idx in enumerate(starts):
+ e_idx = starts[gi + 1] if gi + 1 < len(starts) else n
+ seg_text = "\n".join(raw_lines[s_idx:e_idx])
+ if e_idx < n:
+ seg_text += "\n" # give the separator LF to the preceding group (equivalent to splitlines keepends)
+ meta = metas[gi]
+ if meta is None:
+ if not seg_text.strip(): # drop a whitespace-only preamble (pre-existing behaviour)
+ continue
+ segs.append((0, None, None, seg_text, None))
else:
- if cur is None:
- preamble.append(ln)
- else:
- cur[3].append(ln)
- if cur is not None:
- segs.append(cur)
- elif preamble and "".join(preamble).strip():
- segs.append((0, None, None, preamble))
- return [(lvl, title, nt, "".join(body)) for (lvl, title, nt, body) in segs]
+ lvl, title, nt = meta
+ segs.append((lvl, title, nt, seg_text, offs[s_idx]))
+ return segs
def _window_split(body: str, target: int) -> list[str]:
@@ -139,7 +185,7 @@ def build_hier_tree(
chain.append(title)
return " > ".join(chain) if chain else None
- for lvl, title, nt, body in segs:
+ for lvl, title, nt, body, cstart in segs:
norm = 0 if lvl == 0 else min(level_map[lvl], max_depth)
# 부모 = 스택에서 norm 보다 작은 가장 가까운 노드
while stack and stack[-1][0] >= norm:
@@ -147,8 +193,11 @@ def build_hier_tree(
parent_idx = stack[-1][1] if stack else None
idx = len(nodes)
hp = _heading_path(parent_idx, title)
+ # char_start = 생성 시점 할당(window-split 가 n.text 를 heading 라인으로 truncate 하기 전에 박제).
+ # split-parent 가 돼도 이 값(heading 라인 offset)이 windowed section 단일 jump target 으로 보존된다.
node = HierNode(idx=idx, parent_idx=parent_idx, level=norm, node_type=nt,
- section_title=title, heading_path=hp, text=body, is_leaf=True)
+ section_title=title, heading_path=hp, text=body, is_leaf=True,
+ char_start=cstart)
nodes.append(node)
if norm > 0:
stack.append((norm, idx))
@@ -178,14 +227,17 @@ def build_hier_tree(
n.is_leaf = False
heading_line = (n.text.splitlines() or [""])[0]
n.text = heading_line # 중복 저장 회피 (full body 는 window child 가 보유)
- n.node_type = (n.node_type or "section") + "_split"
+ n.node_type = (n.node_type or "section") + "_split" # chapter_split/clause_split/section_split
+ # n.char_start 보존 = windowed section 의 단일 jump target(생성시점 heading offset).
base_level = min(n.level + 1, max_depth)
for wtext in wins:
ci = len(final)
+ # window child = char_start None(_window_split 가 whitespace buf 를 drop 해
+ # char-preserving 이 아니므로 합산 offset 이 거짓; 점프 타깃도 아님, B1/#1).
final.append(HierNode(
idx=ci, parent_idx=n.idx, level=base_level, node_type="window",
section_title=n.section_title, heading_path=n.heading_path,
- text=wtext, is_leaf=True))
+ text=wtext, is_leaf=True, char_start=None))
for n in final:
n.finalize_hash()
return final
@@ -209,6 +261,24 @@ def coverage_stats(text: str, nodes: list[HierNode]) -> dict:
# 일반 네비: 자식 level > 부모 level 만 보장
if n.level <= nodes[n.parent_idx].level and nodes[n.parent_idx].level > 0:
bad_level += 1
+ # char_start O5 검증 (UTF-16 슬라이스 == heading 라인) + NFC telemetry (g2-t4).
+ # 검증은 FE 가 실제 쓰는 방식과 동일: md.encode('utf-16-le')[2*cs:2*(cs+n)].decode == heading_line
+ # (Python code-point 슬라이스 md[cs:cs+n] 가 아님 — astral 시 어긋남).
+ md_u16 = text.encode("utf-16-le")
+ cs_total = cs_verified = 0
+ for n in nodes:
+ if n.char_start is None:
+ continue
+ cs_total += 1
+ first_line = n.text.split("\n", 1)[0]
+ nu = _utf16_units(first_line)
+ seg = md_u16[2 * n.char_start: 2 * (n.char_start + nu)]
+ try:
+ if seg.decode("utf-16-le") == first_line:
+ cs_verified += 1
+ except UnicodeDecodeError:
+ pass
+ non_nfc = 1 if unicodedata.normalize("NFC", text) != text else 0
return {
"nodes": len(nodes), "leaves": len(leaves),
"coverage_ratio": round(leaf_chars / base, 4) if base else 0,
@@ -217,4 +287,6 @@ def coverage_stats(text: str, nodes: list[HierNode]) -> dict:
"level_dist": {l: sum(1 for n in nodes if n.level == l) for l in sorted({n.level for n in nodes})},
"leaf_len_min": min((len(n.text) for n in leaves), default=0),
"leaf_len_max": max((len(n.text) for n in leaves), default=0),
+ "char_start_total": cs_total, "char_start_verified": cs_verified,
+ "non_nfc": non_nfc,
}
diff --git a/app/services/hier_decomp/persist.py b/app/services/hier_decomp/persist.py
index ec32bf7..4c4153b 100644
--- a/app/services/hier_decomp/persist.py
+++ b/app/services/hier_decomp/persist.py
@@ -58,16 +58,16 @@ async def persist_hier_tree(
INSERT INTO document_chunks
(doc_id, chunk_index, chunk_type, section_title, heading_path, domain_category,
text, embedding, source_type, chunker_version, chunk_content_hash,
- parent_id, level, node_type, is_leaf, in_corpus)
+ parent_id, level, node_type, is_leaf, in_corpus, char_start)
VALUES (:d, :ci, :ct, :stt, :hp, :dc, :tx,
cast(cast(:emb AS text) AS vector),
- :src, :cv, :hash, :pid, :lvl, :nt, :leaf, false)
+ :src, :cv, :hash, :pid, :lvl, :nt, :leaf, false, :cs)
RETURNING id"""), {
"d": doc_id, "ci": base + n.idx, "ct": chunk_type,
"stt": n.section_title, "hp": n.heading_path, "dc": domain_category,
"tx": n.text, "emb": emb_str, "src": SOURCE_TYPE, "cv": CHUNKER_VERSION,
"hash": n.chunk_content_hash, "pid": parent_db, "lvl": n.level,
- "nt": n.node_type, "leaf": n.is_leaf})
+ "nt": n.node_type, "leaf": n.is_leaf, "cs": n.char_start})
idx_to_dbid[n.idx] = db_id
await session.commit()
diff --git a/frontend/src/lib/components/DocumentViewer.svelte b/frontend/src/lib/components/DocumentViewer.svelte
index ee4f3f4..4725474 100644
--- a/frontend/src/lib/components/DocumentViewer.svelte
+++ b/frontend/src/lib/components/DocumentViewer.svelte
@@ -10,7 +10,7 @@
import SectionOutline from '$lib/components/SectionOutline.svelte';
import { getViewerType } from '$lib/utils/viewerType';
import { isMdSuccess } from '$lib/utils/mdStatus';
- import { buildAnchorMap } from '$lib/utils/outlineAnchors';
+ import { resolveAnchorMap } from '$lib/utils/resolveAnchorMap';
import { cleanHeading } from '$lib/utils/headingPath';
// 편집 미리보기 전용 plain marked (본문 렌더는 MarkdownDoc 가 담당).
@@ -109,7 +109,7 @@
(s) => !!(cleanHeading(s.section_title) || cleanHeading((s.heading_path || '').split('>').pop() || '')),
),
);
- // MarkdownDoc 가 실제 렌더하는 텍스트(anchor offset 기준과 일치해야 함).
+ // MarkdownDoc 가 실제 렌더하는 텍스트(rail 표시 게이트용).
let mdRenderText = $derived.by(() => {
if (!fullDoc) return '';
if (viewerType === 'pdf') return pdfViewMode === 'markdown' && canShowMarkdown ? (fullDoc.md_content || '') : '';
@@ -117,7 +117,26 @@
if (viewerType === 'hwp-markdown' || viewerType === 'article') return fullDoc.md_content || fullDoc.extracted_text || '';
return '';
});
- let anchorMap = $derived(sections.length && mdRenderText ? buildAnchorMap(mdRenderText, sections).anchors : {});
+ // [g5-t3] basis 는 RENDER SITE 별. anchorMap 을 basis 별로 분리 — 같은 component 가 두 basis 를
+ // 공유하면(md_content vs extracted_text) trustBE 가 어긋난다.
+ // - md_content site(pdf-markdown): trustBE=true (BE char_start 1순위, 비면 내부 string-match 폴백).
+ // - extracted_text site(3-pane markdown): trustBE=false (char_start 는 md_content offset 이라 무효 → 무조건 폴백).
+ let mdBasisText = $derived.by(() => {
+ if (!fullDoc) return '';
+ if (viewerType === 'pdf') return pdfViewMode === 'markdown' && canShowMarkdown ? (fullDoc.md_content || '') : '';
+ return '';
+ });
+ let extractedBasisText = $derived.by(() => {
+ if (!fullDoc) return '';
+ if (viewerType === 'markdown') return fullDoc.extracted_text || rawMarkdown || '';
+ return '';
+ });
+ let anchorMapMd = $derived(
+ sections.length && mdBasisText ? resolveAnchorMap(mdBasisText, sections, { trustBE: true }).anchors : {},
+ );
+ let anchorMapExtracted = $derived(
+ sections.length && extractedBasisText ? resolveAnchorMap(extractedBasisText, sections, { trustBE: false }).anchors : {},
+ );
let showRail = $derived(outlineSections.length > 0 && !!mdRenderText);
let scrollEl = $state();
@@ -128,7 +147,8 @@
}
// scroll-spy: scrollEl 내 .md-anchor 중 컨테이너 상단(+120) 지난 마지막 = 현재 절.
$effect(() => {
- void anchorMap;
+ void anchorMapMd;
+ void anchorMapExtracted;
const el = scrollEl;
if (!el) return;
let raf = 0;
@@ -255,7 +275,7 @@
mdStatus={fullDoc.md_status}
mdExtractionError={fullDoc.md_extraction_error}
mdExtractionQuality={fullDoc.md_extraction_quality}
- anchorMap={anchorMap}
+ anchorMap={anchorMapExtracted}
extractedText={fullDoc.extracted_text || rawMarkdown}
class={PROSE}
/>
@@ -280,7 +300,7 @@
mdStatus={fullDoc.md_status}
mdExtractionError={fullDoc.md_extraction_error}
mdExtractionQuality={fullDoc.md_extraction_quality}
- anchorMap={anchorMap}
+ anchorMap={anchorMapMd}
extractedText={fullDoc.extracted_text}
class={PROSE}
/>
diff --git a/frontend/src/lib/components/MarkdownDoc.svelte b/frontend/src/lib/components/MarkdownDoc.svelte
index bf00d3f..be83685 100644
--- a/frontend/src/lib/components/MarkdownDoc.svelte
+++ b/frontend/src/lib/components/MarkdownDoc.svelte
@@ -50,7 +50,9 @@
}: Props = $props();
// 개요 anchor 주입: body 의 각 offset(내림차순)에 빈 삽입(점프 타깃).
- // offset 은 buildAnchorMap 이 body 와 동일 문자열 기준으로 산출했어야 함(호출측 책임).
+ // [C3 불변식] char_start(BE) 는 호출측이 넘긴 md_content(raw, untransformed)에 대한 UTF-16 offset 이다.
+ // 이 함수는 그 동일 문자열을 'out' 으로 받아 trim/CRLF-normalize/replace 없이 slice 해야 한다 —
+ // prop→out 사이 어떤 변환도 char_start 를 drift 시킨다. (현재 out = text(=body=mdContent prop) 무변환.)
function spliceAnchors(text: string, map: Record | null): string {
if (!map) return text;
const ents = Object.entries(map)
diff --git a/frontend/src/lib/utils/headingPath.test.ts b/frontend/src/lib/utils/headingPath.test.ts
index bdbfdd1..72334e6 100644
--- a/frontend/src/lib/utils/headingPath.test.ts
+++ b/frontend/src/lib/utils/headingPath.test.ts
@@ -69,6 +69,20 @@ test('collapseWindows: 연속 동일 heading window 만 dedupe, 순서 유지',
);
});
+test('[C2] collapseWindows: split-parent + window 들 → rail 1행, 대표=split-parent(char_start 보유)', () => {
+ const input = [
+ sec({ section_title: 'Article 5', heading_path: 'Article 5', node_type: 'chapter_split', is_leaf: false, char_start: 120 }),
+ sec({ section_title: 'Article 5', heading_path: 'Article 5', node_type: 'window', is_leaf: true, char_start: null }),
+ sec({ section_title: 'Article 5', heading_path: 'Article 5', node_type: 'window', is_leaf: true, char_start: null }),
+ ];
+ const out = collapseWindows(input);
+ assert.equal(out.length, 1, 'split-parent + 2 window → rail 1행');
+ // Representative row = the split-parent (which carries char_start) → the jump works
+ assert.equal(out[0].section.node_type, 'chapter_split');
+ assert.equal(out[0].section.char_start, 120);
+ assert.equal(out[0].fragmentCount, 2, 'window 조각 수 = 2 (split-parent 자신 제외)');
+});
+
test('groupOrFlat: 적은 그룹 + 낮은 기타% → group (5140-류)', () => {
// 3 top segment × 4 = 12절, window 없음 → group_count 3, 기타 0%
const sections: DocumentSection[] = [];
diff --git a/frontend/src/lib/utils/headingPath.ts b/frontend/src/lib/utils/headingPath.ts
index 597fc48..b8132f0 100644
--- a/frontend/src/lib/utils/headingPath.ts
+++ b/frontend/src/lib/utils/headingPath.ts
@@ -12,8 +12,10 @@ export interface DocumentSection {
section_title: string | null;
heading_path: string | null;
level: number | null;
- node_type: string | null; // 'window' | 'section_split' | null
+ node_type: string | null; // 'window' | 'chapter_split' | 'clause_split' | 'section_split' | null
is_leaf: boolean;
+ /** md_content 내 heading offset(UTF-16). jump-target 만 값, window-child/preamble/Path A = null (Path B). */
+ char_start?: number | null;
section_type: string | null;
summary: string | null;
confidence: number | null;
@@ -87,32 +89,38 @@ export function pathSegments(hp: string | null | undefined): string[] {
.filter(Boolean);
}
-/** 그룹 키: window/section_split(인공 조각) 또는 path 없음/깨짐 → OTHER. */
+/** Group key: window / %_split nodes (artificial fragments, windowed split-parents) or a heading_path with no segments → OTHER. */
function topSegment(s: DocumentSection): string {
- if (s.node_type === 'window' || s.node_type === 'section_split') return OTHER;
+ if (s.node_type === 'window' || !!s.node_type?.endsWith('_split')) return OTHER;
const segs = pathSegments(s.heading_path);
return segs.length === 0 ? OTHER : segs[0];
}
/**
* 서버 chunk_index 순서를 유지한 채(정렬 변경 금지), 연속된 동일 cleaned heading_path 의
- * node_type='window' 절을 1 항목으로 dedupe. 대표 = 첫 조각(요약 사용), fragmentCount 누적.
+ * node_type='window' 절을 1 항목으로 dedupe. fragmentCount = window 조각 수.
+ *
+ * [C2] g4-t2 가 split-parent(%_split, char_start 보유)를 그 window child 들보다 먼저(낮은 chunk_index)
+ * 노출하므로, 후속 window child 를 직전 split-parent(또는 legacy window 대표)에 흡수해 rail 1행으로 만든다.
+ * merged row 의 대표 section = split-parent 여야 jump(anchorMap[split-parent char_start])가 성립한다 —
+ * window-child(char_start NULL, anchorMap 부재)가 대표면 windowed section 이 점프 안 됨.
+ * fragmentCount: split-parent 대표는 0 에서 시작(자신은 조각 아님) + 흡수 child 수 = 실제 조각 수;
+ * legacy window 대표는 1 에서 시작(자신이 첫 조각).
*/
export function collapseWindows(sections: DocumentSection[]): OutlineItem[] {
const out: OutlineItem[] = [];
for (const s of sections) {
const prev = out[out.length - 1];
const h = cleanHeading(s.heading_path);
- if (
- s.node_type === 'window' &&
+ const prevAbsorbs =
prev &&
- prev.section.node_type === 'window' &&
+ (prev.section.node_type === 'window' || !!prev.section.node_type?.endsWith('_split')) &&
h !== '' &&
- cleanHeading(prev.section.heading_path) === h
- ) {
- prev.fragmentCount += 1;
+ cleanHeading(prev.section.heading_path) === h;
+ if (s.node_type === 'window' && prevAbsorbs) {
+ prev!.fragmentCount += 1; // window child 흡수 — 대표(split-parent 우선)는 그대로 유지
} else {
- out.push({ section: s, fragmentCount: 1 });
+ out.push({ section: s, fragmentCount: s.node_type?.endsWith('_split') ? 0 : 1 });
}
}
return out;
diff --git a/frontend/src/lib/utils/outlineAnchors.ts b/frontend/src/lib/utils/outlineAnchors.ts
index 64ecd53..f4c29fc 100644
--- a/frontend/src/lib/utils/outlineAnchors.ts
+++ b/frontend/src/lib/utils/outlineAnchors.ts
@@ -69,8 +69,9 @@ export function buildAnchorMap(
let matched = 0;
for (const s of sections) {
- // window/section_split 조각은 자체 heading 없음(부모 제목 상속) → 건너뜀.
- if (s.node_type === 'window' || s.node_type === 'section_split') continue;
+ // window 조각 + %_split parent(chapter_split/clause_split/section_split)는 string-match 대상 아님 →
+ // 건너뜀. (split-parent jump 은 Path B 의 BE char_start 로만 성립; Path A 폴백선 windowed 절 무점프=무회귀.)
+ if (s.node_type === 'window' || s.node_type?.endsWith('_split')) continue;
let nt = norm(s.section_title);
if (!nt && s.heading_path) {
const last = s.heading_path.split('>').pop();
diff --git a/frontend/src/lib/utils/resolveAnchorMap.test.ts b/frontend/src/lib/utils/resolveAnchorMap.test.ts
new file mode 100644
index 0000000..db3abe0
--- /dev/null
+++ b/frontend/src/lib/utils/resolveAnchorMap.test.ts
@@ -0,0 +1,95 @@
+// resolveAnchorMap 회귀 테스트 (플랜 ds-outline-anchor-b5 g5-t1 / NEW-5 / B4 / C1).
+// 실행: node --test src/lib/utils/resolveAnchorMap.test.ts
+import { test } from 'node:test';
+import assert from 'node:assert/strict';
+import { resolveAnchorMap, isJumpTargetCandidate } from './resolveAnchorMap.ts';
+import { type DocumentSection } from './headingPath.ts';
+
+let _id = 0;
+function sec(p: Partial<DocumentSection>): DocumentSection { // fixture builder: fields in p override the defaults below
+  return {
+    chunk_id: ++_id,
+    section_title: null,
+    heading_path: null,
+    level: null,
+    node_type: null,
+    is_leaf: true,
+    char_start: null,
+    section_type: null,
+    summary: null,
+    confidence: null,
+    ...p,
+  };
+}
+
+const LONG = 'x'.repeat(500);
+
+test('trustBE=false → 무조건 string-match 폴백(fellBack=true)', () => {
+ const md = '# Alpha\nbody\n# Beta\nx';
+ const secs = [sec({ section_title: 'Alpha', char_start: 999 }), sec({ section_title: 'Beta', char_start: 999 })];
+ const r = resolveAnchorMap(md, secs, { trustBE: false });
+ assert.equal(r.fellBack, true);
+ // char_start (999) is ignored; string-match offsets are used instead
+ assert.ok(Object.values(r.anchors).every((o) => o < 50));
+});
+
+test('trustBE=true + 모든 jump-target candidate char_start 보유 → BE 채택(fellBack=false)', () => {
+ const secs = [
+ sec({ section_title: 'A', char_start: 5, is_leaf: true }),
+ sec({ section_title: 'B', char_start: 42, is_leaf: true }),
+ ];
+ const r = resolveAnchorMap(LONG, secs, { trustBE: true });
+ assert.equal(r.fellBack, false);
+ assert.equal(r.anchors[secs[0].chunk_id], 5);
+ assert.equal(r.anchors[secs[1].chunk_id], 42);
+ assert.equal(r.matched, 2);
+});
+
+test('[NEW-5] windowed doc — window-child char_start NULL 이 폴백을 유발하지 않음(split-parent BE 사용)', () => {
+ const secs = [
+ sec({ section_title: 'Big', heading_path: 'Big', node_type: 'chapter_split', is_leaf: false, char_start: 10 }),
+ sec({ section_title: 'Big', heading_path: 'Big', node_type: 'window', is_leaf: true, char_start: null }),
+ sec({ section_title: 'Big', heading_path: 'Big', node_type: 'window', is_leaf: true, char_start: null }),
+ ];
+ const r = resolveAnchorMap(LONG, secs, { trustBE: true });
+ // A NULL window child is not a candidate, so it does not trigger the fallback → BE is used and the split-parent jump is kept
+ assert.equal(r.fellBack, false, 'window-child NULL 이 whole-doc 폴백을 유발하면 안 됨(NEW-5)');
+ assert.equal(r.anchors[secs[0].chunk_id], 10, 'split-parent char_start 가 BE 맵에 있어야 함');
+ // Window children get no anchor
+ assert.equal(r.anchors[secs[1].chunk_id], undefined);
+});
+
+test('[B4] non-PASS doc — jump-target candidate char_start NULL → string-match 폴백', () => {
+ const md = '# Gamma\nbody text here\n# Delta\nmore';
+ const secs = [
+ sec({ section_title: 'Gamma', is_leaf: true, char_start: null }),
+ sec({ section_title: 'Delta', is_leaf: true, char_start: null }),
+ ];
+ const r = resolveAnchorMap(md, secs, { trustBE: true });
+ assert.equal(r.fellBack, true, 'candidate char_start NULL 이면 폴백해야 함(BE-first not BE-only)');
+ // The string-match fallback must produce real jumps (non-zero count)
+ assert.ok(r.matched >= 1, 'md-aligned doc 는 폴백 string-match 로 jump 비-0');
+});
+
+test('char_start > splicedText.length → 그 anchor 만 비활성, 폴백 안 함', () => {
+ const secs = [
+ sec({ section_title: 'A', char_start: 3, is_leaf: true }),
+ sec({ section_title: 'B', char_start: 100000, is_leaf: true }), // out of range (truncated tail)
+ ];
+ const short = 'hello world';
+ const r = resolveAnchorMap(short, secs, { trustBE: true });
+ assert.equal(r.fellBack, false, '범위 초과는 폴백 트리거 아님(candidate char_start NOT NULL)');
+ assert.equal(r.anchors[secs[0].chunk_id], 3);
+ assert.equal(r.anchors[secs[1].chunk_id], undefined, '초과 anchor 는 비활성');
+});
+
+test('preamble(title 없음, is_leaf) char_start NULL 은 candidate 아님 → 폴백 유발 X', () => {
+ const secs = [
+ sec({ section_title: null, heading_path: null, is_leaf: true, char_start: null }), // preamble
+ sec({ section_title: 'Real', is_leaf: true, char_start: 7 }),
+ ];
+ const r = resolveAnchorMap(LONG, secs, { trustBE: true });
+ assert.equal(isJumpTargetCandidate(secs[0]), false, 'preamble 은 candidate 아님');
+ assert.equal(r.fellBack, false);
+ assert.equal(r.anchors[secs[1].chunk_id], 7);
+});
diff --git a/frontend/src/lib/utils/resolveAnchorMap.ts b/frontend/src/lib/utils/resolveAnchorMap.ts
new file mode 100644
index 0000000..c33ff9d
--- /dev/null
+++ b/frontend/src/lib/utils/resolveAnchorMap.ts
@@ -0,0 +1,82 @@
+// 개요(절 목차) → 본문 점프 anchor 산출 공유 헬퍼 (경로 B: BE char_start primary + string-match 폴백).
+//
+// render-site 가 md_content 를 splice 할 때(trustBE=true)는 BE 가 builder 단계에서 박은 char_start 를
+// 1순위로 쓰고, 비-md basis(3-pane extracted_text 등, trustBE=false)는 무조건 string-match(buildAnchorMap)로
+// 폴백한다. char_start 가 비어 있으면(non-PASS doc, 또는 multi-night 재처리 중 아직 미백필 PASS doc) BE-only
+// 가 아니라 string-match 로 graceful degrade 한다(B4: BE-first, NOT BE-only).
+//
+// ★ NEW-5 (must-not-miss): 폴백 트리거는 JUMP-TARGET-CANDIDATE 한정이다.
+// window-child(node_type='window')와 preamble(title 없음)은 char_start=NULL **BY DESIGN**(g2).
+// 트리거가 'NULL char_start 가 하나라도 있으면 whole-doc 폴백' 이면, window-child 를 항상 보유한 windowed
+// doc 은 매번 폴백 → split-parent char_start(windowed 절의 단일 jump target)를 영영 안 쓰고 →
+// buildAnchorMap 은 split-parent 를 skip → windowed 코어 절이 영원히 점프 안 됨 = 이 플랜이 겨냥한
+// 바로 그 절에서 Path A 0% 회귀. 따라서 트리거 분모 = jump-target-candidate 뿐.
+
+import { buildAnchorMap } from './outlineAnchors.ts';
+import { cleanHeading, type DocumentSection } from './headingPath.ts';
+
+export interface ResolveResult {
+  /** chunk_id → char offset (UTF-16 code units) into splicedText. */
+  anchors: Record<number, number>;
+  /** Number of jump-target candidates (BE path) or buildAnchorMap.total (fallback path). */
+  total: number;
+  /** Number of anchors actually assigned. */
+  matched: number;
+  /** Whether the string-match (buildAnchorMap) fallback was used — for V-rail / verification. */
+  fellBack: boolean;
+}
+
+/** True when cleanHeading yields a non-empty result for section_title, or for the last '>'-separated segment of heading_path. */
+function hasTitle(s: DocumentSection): boolean {
+ if (cleanHeading(s.section_title)) return true;
+ const last = (s.heading_path || '').split('>').pop() || '';
+ return !!cleanHeading(last);
+}
+
+/**
+ * Jump-target candidate = a section that is expected to carry a char_start.
+ * = (non-window leaf) OR (%_split parent), and it must have a title.
+ * Window children (node_type='window') and preambles (no title) have char_start NULL by design → not candidates (NEW-5).
+ */
+export function isJumpTargetCandidate(s: DocumentSection): boolean {
+ const structural = (s.is_leaf && s.node_type !== 'window') || !!s.node_type?.endsWith('_split');
+ return structural && hasTitle(s);
+}
+
+export function resolveAnchorMap(
+  splicedText: string | null | undefined,
+  sections: DocumentSection[] | null | undefined,
+  opts: { trustBE: boolean },
+): ResolveResult {
+  const secs = sections ?? [];
+  // Resolves chunk_id → offset anchors: BE char_start when trusted and complete, otherwise string-match.
+  // Basis mismatch (e.g. the 3-pane extracted_text view) → always string-match.
+  if (!opts.trustBE) {
+    const r = buildAnchorMap(splicedText, secs);
+    return { ...r, fellBack: true };
+  }
+
+  // [B4 + NEW-5] BE-first: fall back when there are no jump-target candidates, or when any candidate lacks char_start.
+  // NULLs on window children / preambles are not candidates, so they never trigger the fallback.
+  const candidates = secs.filter(isJumpTargetCandidate);
+  const beUnusable = candidates.length === 0 || candidates.some((s) => s.char_start == null);
+  if (beUnusable) {
+    const r = buildAnchorMap(splicedText, secs);
+    return { ...r, fellBack: true };
+  }
+
+  // Adopt BE char_start (C1: window / null / untitled sections are excluded = exactly the candidate set).
+  const anchors: Record<number, number> = {};
+  const limit = (splicedText ?? '').length;
+  let matched = 0;
+  for (const s of candidates) {
+    const cs = s.char_start as number;
+    // Guard char_start <= splicedText.length (see MarkdownDoc.svelte:58). Beyond the end = tail truncated when served to the FE →
+    // only that anchor is disabled (no fallback — string-match could not find a truncated tail either).
+    if (Number.isFinite(cs) && cs >= 0 && cs <= limit) {
+      anchors[s.chunk_id] = cs;
+      matched++;
+    }
+  }
+  return { anchors, total: candidates.length, matched, fellBack: false };
+}
diff --git a/frontend/src/routes/documents/[id]/+page.svelte b/frontend/src/routes/documents/[id]/+page.svelte
index a1b86ba..745f6a9 100644
--- a/frontend/src/routes/documents/[id]/+page.svelte
+++ b/frontend/src/routes/documents/[id]/+page.svelte
@@ -7,7 +7,7 @@
import { goto } from '$app/navigation';
import { api, getAccessToken } from '$lib/api';
import { isMdSuccess } from '$lib/utils/mdStatus';
- import { buildAnchorMap } from '$lib/utils/outlineAnchors';
+ import { resolveAnchorMap } from '$lib/utils/resolveAnchorMap';
import { addToast } from '$lib/stores/toast';
import { marked } from 'marked';
import DOMPurify from 'dompurify';
@@ -164,11 +164,12 @@
}
});
- // ── 개요 점프 (outlineAnchors, 경로 A) ──
- // anchorMap = md_content 의 각 절 heading offset. MarkdownDoc 가 주입.
+ // ── 개요 점프 (경로 B: BE char_start primary + string-match 폴백) ──
+ // 이 사이트는 항상 md_content basis(canShowMarkdown && doc.md_content) → trustBE=true.
+ // BE char_start 가 있으면 채택, 비면(non-PASS/미백필) resolveAnchorMap 내부에서 buildAnchorMap 로 폴백.
let anchorMap = $derived(
hasSections && canShowMarkdown && doc?.md_content
- ? buildAnchorMap(doc.md_content, sections).anchors
+ ? resolveAnchorMap(doc.md_content, sections, { trustBE: true }).anchors
: {}
);
let activeKey = $state(null);
diff --git a/migrations/318_document_chunks_char_start.sql b/migrations/318_document_chunks_char_start.sql
new file mode 100644
index 0000000..0fc1594
--- /dev/null
+++ b/migrations/318_document_chunks_char_start.sql
@@ -0,0 +1,15 @@
+-- 318_document_chunks_char_start.sql
+-- Plan ds-outline-anchor-b5 (Path B, g1-t1): offset column for jumping from a hier section to the md_content body.
+--
+-- char_start = start offset of the heading line within md_content, measured in **UTF-16 code units**
+-- (same unit as FE outlineAnchors.ts:64 `off += raw.length + 1` / MarkdownDoc.svelte:63 `out.slice(off)`).
+-- NULL is allowed for (a) no md_content (legacy/news/Path A) (b) window children (node_type='window') (c) preamble (title NULL).
+-- → only jump targets (non-window leaf OR %_split parent) receive a NOT NULL value (BY DESIGN; basis of the B1/B3 completion marker).
+--
+-- Shared prerequisite of both backfill paths:
+-- - UPDATE-only path (g3-tU, hash_stable): only UPDATE char_start on the stored hier rows (0 DELETE/CASCADE/re-embedding).
+-- - re-decompose path (g3-t2, hash_changed): char_start is included in the persist INSERT.
+--
+-- Idempotent: ADD COLUMN IF NOT EXISTS + init_db version-skip + pg_advisory_xact_lock. No BEGIN/COMMIT (single statement).
+
+ALTER TABLE document_chunks ADD COLUMN IF NOT EXISTS char_start INTEGER NULL;
diff --git a/scripts/hier_outline_quality_gate.py b/scripts/hier_outline_quality_gate.py
new file mode 100644
index 0000000..656671a
--- /dev/null
+++ b/scripts/hier_outline_quality_gate.py
@@ -0,0 +1,196 @@
+"""hier 개요 keep-better 게이트 + g-measure 엔진 (플랜 ds-outline-anchor-b5 g6-t1 / gm-t1).
+
+READ-ONLY dry-run. doc 별로:
+ (A) 현 저장 hier 절제목 (source_type='hier_section', char_start IS NULL = extracted_text 산)
+ (B) build_hier_tree(md_content) 절제목 (= 새 g2 builder: split('\n')+UTF-16+fence skip)
+를 비교해 산출:
+ - verdict {B_better, A_better, equivalent} (+ junk-heading 검출 → A_better 보호)
+ - B_jumptarget_count (build 후 jump-target node 수) — B3 게이트 입력
+ - hash_stable 판정 — UPDATE-only(g3-tU) vs re-decompose(g3-t2) 라우팅:
+ * hash_stable_strict = build(md) 가 저장 hier hash 를 position-by-position 100% 재현
+ (= 런타임 g3-tU 가 UPDATE-only 로 처리할 정확한 집합; demote 안 함)
+ * hash_stable_99 = >=99% 재현 (원 MEASURE2 분류 기준 — 비교용)
+ - dup_title_count / has_fence (measure3 budget note: fence 보유 doc 은 새 builder 에서 hash_changed flip 가능)
+ - REFINED PASS = (verdict B>=A) AND (B_jumptarget>=1)
+
+★ gm-t1 재확인(이 빌드의 유일 잔여 측정): g2 builder 코딩 후 1회 실행 → REFINED PASS 중
+ hash_changed(=re-decompose) count 가 ~230 인지 확인(코드펜스-skip 으로 32 중 ≤2 flip → 최대 ~232 수용).
+
+실행 (GPU 서버, 컨테이너):
+ docker compose exec -T fastapi python /app/scripts/hier_outline_quality_gate.py run
+ docker compose exec -T fastapi python /app/scripts/hier_outline_quality_gate.py run --json /tmp/measure.json
+ docker compose exec -T fastapi python /app/scripts/hier_outline_quality_gate.py run --doc 5140,5209,5165 # 코어 spot-check
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import re
+import sys
+from collections import Counter
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
+
+from services.hier_decomp.builder import build_hier_tree
+
+
+def _is_jump_target(node) -> bool:
+ """Return True for a jump-target: a non-window leaf OR a ``*_split`` parent, and only when it has a section title (original note: meant to mirror resolveAnchorMap / _JUMP_TARGET_PRED — TODO confirm)."""
+ structural = (node.is_leaf and node.node_type != "window") or bool(
+ node.node_type and node.node_type.endswith("_split"))
+ return structural and bool(node.section_title)
+
+
+# cover/TOC org-이름 junk 검출 (g6-t1 high-recall): 회사명 접미사 + 거의-전부-대문자.
+_JUNK_ORG = re.compile(r"\b(INC\.?|LLC|L\.L\.C|CORP\.?|CO\.,?\s*LTD|CONSULTING|COMPANY|LIMITED|LTD\.?)\b", re.I)
+_FENCE_ANY = re.compile(r"(?m)^\s{0,3}(```|~~~)")
+
+
+def _looks_junk(title: str | None) -> bool:  # heuristic: True when the title looks like a junk heading
+ if not title:  # None or empty title is never flagged
+ return False
+ if _JUNK_ORG.search(title):  # title matches the _JUNK_ORG pattern
+ return True
+ letters = [c for c in title if c.isalpha()]  # alphabetic characters only
+ if len(letters) >= 6 and sum(1 for c in letters if c.isupper()) / len(letters) >= 0.85:  # at least 6 letters, >= 85% of them uppercase
+ return True
+ return False
+
+
+def _make_engine():  # build an async SQLAlchemy engine from the DATABASE_URL environment variable
+ return create_async_engine(os.environ["DATABASE_URL"], pool_pre_ping=True)  # raises KeyError when DATABASE_URL is unset
+
+
+async def _measure_doc(session, doc_id):  # returns a per-doc result dict, or None when the doc has no stored hier_section rows
+ md = await session.scalar(text("SELECT md_content FROM documents WHERE id=:d"), {"d": doc_id})
+ stored = (await session.execute(text("""
+ SELECT chunk_index, chunk_content_hash, node_type, is_leaf, section_title, char_start
+ FROM document_chunks WHERE doc_id=:d AND source_type='hier_section'
+ ORDER BY chunk_index"""), {"d": doc_id})).mappings().all()
+ if not stored:
+ return None
+ res = {"doc_id": doc_id, "n_stored": len(stored)}
+ if not md or not md.strip():  # missing or whitespace-only md_content: nothing to build, report A_better
+ res.update({"md_null": True, "verdict": "A_better", "b_jumptarget": 0,
+ "hash_stable_strict": False, "refined_pass": False})
+ return res
+
+ nodes = build_hier_tree(md)
+ jt = [n for n in nodes if _is_jump_target(n)]
+ titles = [n.section_title for n in jt]
+ res["n_build"] = len(nodes)
+ res["b_jumptarget"] = len(jt)
+ res["dup_title"] = len(titles) - len(set(titles))
+ res["has_fence"] = bool(_FENCE_ANY.search(md))
+ res["len_md"] = len(md)
+
+ # Hash comparison, position-aligned (the criterion used by the runtime g3-tU path).
+ if len(nodes) == len(stored):
+ mism = sum(1 for n, s in zip(nodes, stored)
+ if n.chunk_content_hash != s["chunk_content_hash"])
+ frac = (len(stored) - mism) / len(stored)
+ res["hash_match_frac"] = round(frac, 4)
+ res["hash_stable_strict"] = (mism == 0)
+ res["hash_stable_99"] = (frac >= 0.99)
+ else:
+ res["hash_match_frac"] = 0.0
+ res["hash_stable_strict"] = False
+ res["hash_stable_99"] = False
+
+ stored_titles = {s["section_title"] for s in stored if s["section_title"]}
+ res["junk_b"] = any(_looks_junk(n.section_title) and n.section_title not in stored_titles for n in nodes)
+
+ # Verdict heuristic (high-recall junk protection; absent structure → A_better).
+ # MEASURE2 already froze the canonical distribution — this verdict is for reproduction/audit only. Ambiguous cases (notes:ambiguous) do not block PASS.
+ n_a = sum(1 for s in stored if s["is_leaf"])  # NOTE(review): counts every stored leaf (window children included) while n_b counts titled jump-targets only — confirm the asymmetry is intended
+ n_b = res["b_jumptarget"]
+ if n_b == 0:
+ res["verdict"] = "A_better" # B has no outline (no jump-targets)
+ elif res["junk_b"]:
+ res["verdict"] = "A_better" # B introduces cover junk
+ elif n_b >= max(1, n_a * 0.7):
+ res["verdict"] = "B_better" if n_b > n_a else "equivalent"
+ else:
+ res["verdict"] = "A_better" # B lost structure (5209 absent-class)
+ res["notes"] = "absent_or_degraded"
+
+ res["refined_pass"] = res["verdict"] in ("B_better", "equivalent") and n_b >= 1
+ return res
+
+
+async def cmd_run(args):  # measure every hier doc (or only --doc ids), print the distribution, optionally dump JSON
+ doc_ids = [int(x) for x in args.doc.split(",") if x.strip()] if args.doc else None
+ engine = _make_engine()
+ sm = async_sessionmaker(engine, expire_on_commit=False)
+ try:
+ async with sm() as session:
+ if doc_ids is None:  # no --doc given: measure every doc that has hier_section rows
+ doc_ids = [r[0] for r in (await session.execute(text(
+ "SELECT DISTINCT doc_id FROM document_chunks WHERE source_type='hier_section' ORDER BY doc_id"))).all()]
+ results = []
+ for d in doc_ids:
+ r = await _measure_doc(session, d)
+ if r is not None:  # None = doc has no stored hier rows; left out of the totals
+ results.append(r)
+ finally:
+ await engine.dispose()
+
+ total = len(results)
+ md_null = [r for r in results if r.get("md_null")]
+ measured = [r for r in results if not r.get("md_null")]
+ passes = [r for r in measured if r.get("refined_pass")]
+ pass_jt0 = [r for r in measured if r["verdict"] in ("B_better", "equivalent") and r["b_jumptarget"] == 0]  # NOTE(review): _measure_doc assigns A_better whenever b_jumptarget == 0, so this list looks always empty — confirm intent
+ hash_stable = [r for r in passes if r.get("hash_stable_strict")]
+ hash_stable_99 = [r for r in passes if r.get("hash_stable_99")]
+ hash_changed = [r for r in passes if not r.get("hash_stable_strict")]
+ verdict_dist = Counter(r["verdict"] for r in measured)
+ dup_among_stable = [r for r in hash_stable if r.get("dup_title", 0) > 0]
+ fence_among_stable = [r for r in hash_stable if r.get("has_fence")]
+
+ print("=" * 64)
+ print(f"hier doc 측정: {total} (md_null {len(md_null)}, measured {len(measured)})")
+ print(f"verdict 분포: {dict(verdict_dist)}")
+ print(f"B_jumptarget==0 (PASS-verdict 이나 빈 jump-target, B3 HOLD): {len(pass_jt0)}")
+ print("-" * 64)
+ print(f"REFINED PASS = (verdict B>=A) AND (B_jumptarget>=1): {len(passes)}")
+ print(f" ├─ hash_stable (strict 100% position 재현 = g3-tU UPDATE-only): {len(hash_stable)}")
+ print(f" │ dup_title>0: {len(dup_among_stable)} / has_fence: {len(fence_among_stable)}")
+ print(f" │ (참고) hash_stable_99(원 MEASURE2 기준): {len(hash_stable_99)}")
+ print(f" └─ hash_changed (re-decompose 대상, g3-t2 --reprocess): {len(hash_changed)} ← ★ '230' 재확인 수치")
+ print("-" * 64)
+ print(f" re-decompose --doc(B_jumptarget>=1) = {','.join(str(r['doc_id']) for r in hash_changed) or '(없음)'}")
+ print(f" UPDATE-only --doc(hash_stable) = {','.join(str(r['doc_id']) for r in hash_stable) or '(없음)'}")
+ if md_null:
+ print(f" md_null(suspect, V4): {[r['doc_id'] for r in md_null]}")
+ print("=" * 64)
+ print("NOTE: '230' 은 hash_changed PASS 수치. 코드펜스-skip 으로 hash_stable 32 중 fence 보유분(measure3=2)이 "
+ "hash_changed 로 flip 가능 → 230~232 수용(NEW-3 budget-only, 정확성은 g3-tU 런타임 100% VERIFY 가 보증).")
+
+ if args.json:
+ with open(args.json, "w", encoding="utf-8") as f:  # explicit UTF-8: ensure_ascii=False must not depend on the locale encoding
+ json.dump({"summary": {
+ "total": total, "measured": len(measured), "refined_pass": len(passes),
+ "hash_stable": len(hash_stable), "hash_changed": len(hash_changed),
+ "b_jumptarget_0": len(pass_jt0), "md_null": [r["doc_id"] for r in md_null],
+ "hash_changed_doc_ids": [r["doc_id"] for r in hash_changed],
+ "hash_stable_doc_ids": [r["doc_id"] for r in hash_stable],
+ }, "docs": results}, f, ensure_ascii=False, indent=2)
+ print(f"[json] {args.json} 기록 ({len(results)} doc)")
+
+
+def main():  # CLI entry point: parses the "run" sub-command and executes it on a fresh event loop
+ ap = argparse.ArgumentParser(description="hier 개요 keep-better 게이트 + g-measure (read-only)")
+ sub = ap.add_subparsers(dest="cmd", required=True)
+ p = sub.add_parser("run", help="전체(또는 --doc) 측정 + 분포 출력")
+ p.add_argument("--doc", default=None, help="comma-sep doc id (미지정=전 hier doc)")
+ p.add_argument("--json", default=None, help="per-doc 결과 JSON 덤프 경로")
+ args = ap.parse_args()
+ asyncio.run({"run": cmd_run}[args.cmd](args))  # dispatch table keyed by sub-command name
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/hier_overnight_backfill.py b/scripts/hier_overnight_backfill.py
index 5270541..7b0c94a 100644
--- a/scripts/hier_overnight_backfill.py
+++ b/scripts/hier_overnight_backfill.py
@@ -29,6 +29,7 @@ from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from ai.client import AIClient, parse_json_response, strip_thinking
from core.config import settings
+from services.hier_decomp.builder import build_hier_tree
from services.hier_decomp.persist import persist_hier_tree
from services.search.llm_gate import Priority, acquire_mlx_gate
@@ -42,27 +43,48 @@ DOC_MIN_CHARS = 4000 # hier 분해가 의미 있는 doc 크기 하한(STRUCTUR
BUFFER_MIN = 10 # deadline 이 만큼 전 안전 중단
-def _candidate_sql(allowlist, doc_ids=None):
- """allowlist 있으면 그 domain 만, 없으면 EXCLUDE_DOMAINS(news) 제외 전부.
- doc_ids 명시 시 = 그 doc 만(크기 게이트 DOC_MIN_CHARS + domain 필터 우회 —
- 구조화 소형 문서(법령 등) eval coverage 보정용. NOT EXISTS hier 멱등 가드는 유지).
- 작은 doc 먼저 = 완료 doc 수 최대화 + 단일 mega-doc 예산 독식 방지."""
+# jump-target = non-window leaf OR %_split parent (B1/B3 completion marker + B_jumptarget denominator, plan g3-t2).
+# Only this set receives char_start (window-child/preamble rows are NULL by design).
+_JUMP_TARGET_PRED = r"((c.is_leaf AND c.node_type IS DISTINCT FROM 'window') OR c.node_type LIKE '%\_split' ESCAPE '\')"
+
+
+def _candidate_sql(allowlist, doc_ids=None, reprocess=False):
+ """body = d.md_content (g0-t1: hier source permanently fixed to md_content — extracted_text retired. char_start is an
+ md_content offset and must match the FE splice basis, so the decomposition source must be md_content too [F1]).
+
+ reprocess=False (additive): only docs that have no hier yet are decomposed (NOT EXISTS hier_section, idempotent).
+ reprocess=True (re-decompose): docs that have hier but whose jump-target char_start is not filled in yet.
+ [B1] completion marker = a jump-target row with char_start NOT NULL exists (one re-decompose fills them all atomically);
+ window-child/preamble rows are NULL by design, which avoids the infinite trap of an 'all-leaf NOT NULL' marker.
+ [B3] for a doc with no jump-targets (B_jumptarget==0) the NOT EXISTS is vacuously TRUE → permanent re-selection trap →
+ the caller restricts --doc to REFINED PASS (B_jumptarget>=1) to block it (--reprocess requires --doc, else REFUSE).
+ When doc_ids is given the size gate is bypassed. Smaller docs first = maximize the number of completed docs."""
 if doc_ids:
 cond, gate = "d.id = ANY(:doc_ids)", "" # 명시 doc = 크기 게이트 우회
 else:
 cond = ("lower(split_part(coalesce(d.ai_domain,''), '/', 1)) = ANY(:domains)"
 if allowlist else
 "lower(split_part(coalesce(d.ai_domain,''), '/', 1)) <> ALL(:exclude)")
- gate = "AND length(d.extracted_text) > :minchars"
+ gate = "AND length(d.md_content) > :minchars"
+ if reprocess:
+ marker = f"""
+ AND EXISTS (SELECT 1 FROM document_chunks dc
+ WHERE dc.doc_id = d.id AND dc.source_type = 'hier_section')
+ AND NOT EXISTS (SELECT 1 FROM document_chunks c
+ WHERE c.doc_id = d.id AND c.source_type = 'hier_section'
+ AND c.char_start IS NOT NULL AND {_JUMP_TARGET_PRED})"""
+ else:
+ marker = """
+ AND NOT EXISTS (SELECT 1 FROM document_chunks dc
+ WHERE dc.doc_id = d.id AND dc.source_type = 'hier_section')"""
 return text(f"""
- SELECT d.id AS doc_id, d.extracted_text AS body, d.ai_domain AS ai_domain
+ SELECT d.id AS doc_id, d.md_content AS body, d.ai_domain AS ai_domain
 FROM documents d
- WHERE d.extracted_text IS NOT NULL
+ WHERE d.md_content IS NOT NULL AND length(d.md_content) > 0
 {gate}
 AND {cond}
- AND NOT EXISTS (SELECT 1 FROM document_chunks dc
- WHERE dc.doc_id = d.id AND dc.source_type = 'hier_section')
- ORDER BY length(d.extracted_text) ASC
+ {marker}
+ ORDER BY length(d.md_content) ASC
 """)
@@ -77,10 +99,11 @@ def _candidate_params(allowlist, doc_ids=None):
return p
-def _scope_label(allowlist, doc_ids=None):
+def _scope_label(allowlist, doc_ids=None, reprocess=False):  # human-readable scope string used in log/dry-run output
+ tag = "RE-DECOMPOSE" if reprocess else "additive"  # mode tag appended to every label variant
 if doc_ids:
- return f"doc-list={len(doc_ids)}건(크기게이트 우회)"
- return f"allowlist={allowlist}" if allowlist else f"all-except={EXCLUDE_DOMAINS}"
+ return f"doc-list={len(doc_ids)}건(크기게이트 우회, {tag})"
+ return (f"allowlist={allowlist}" if allowlist else f"all-except={EXCLUDE_DOMAINS}") + f" ({tag})"
# 멱등 leaf 선별 (재실행 시 이미 분석된 leaf 제외)
LEAF_SQL = text("""
@@ -177,14 +200,19 @@ def _parse_doc_ids(args):
async def cmd_dry_run(args):
allowlist = args.domains.split(",") if args.domains else None
doc_ids = _parse_doc_ids(args)
+ reprocess = getattr(args, "reprocess", False)
+ if reprocess and not doc_ids:
+ print("REFUSE: --reprocess 는 --doc 필수 (B3 빈 jump-target trap 차단 — REFINED PASS 리스트만)")
+ sys.exit(2)
engine = _make_engine()
sm = async_sessionmaker(engine, expire_on_commit=False)
async with sm() as session:
- rows = (await session.execute(_candidate_sql(allowlist, doc_ids),
+ rows = (await session.execute(_candidate_sql(allowlist, doc_ids, reprocess),
_candidate_params(allowlist, doc_ids))).mappings().all()
await engine.dispose()
gate_lbl = "doc-list" if doc_ids else f">{DOC_MIN_CHARS}자"
- print(f"[dry-run] 후보 doc {len(rows)} ({_scope_label(allowlist, doc_ids)}, {gate_lbl}, 미분해)")
+ state_lbl = "재분해 미완료(jump-target char_start 부재)" if reprocess else "미분해"
+ print(f"[dry-run] 후보 doc {len(rows)} ({_scope_label(allowlist, doc_ids, reprocess)}, {gate_lbl}, {state_lbl})")
if rows:
lens = [len(r["body"]) for r in rows]
print(f" 본문길이: min={min(lens)} p50={int(statistics.median(lens))} max={max(lens)} 합={sum(lens):,}")
@@ -196,11 +224,16 @@ async def cmd_dry_run(args):
async def cmd_run(args):
allowlist = args.domains.split(",") if args.domains else None
doc_ids = _parse_doc_ids(args)
+ reprocess = getattr(args, "reprocess", False)
+ if reprocess and not doc_ids:
+ _log("REFUSE: --reprocess 는 --doc 필수 (B3 빈 jump-target trap 차단 — REFINED PASS 리스트만)")
+ sys.exit(2)
skip_analysis = getattr(args, "skip_analysis", False)
deadline = _compute_deadline(args.deadline)
stop_at = (deadline - timedelta(minutes=BUFFER_MIN)).timestamp()
_log(f"deadline={deadline:%m-%d %H:%M} (buffer {BUFFER_MIN}m → stop_at={datetime.fromtimestamp(stop_at):%H:%M}) "
- f"{_scope_label(allowlist, doc_ids)}{' [SKIP-ANALYSIS: 분해+임베딩만]' if skip_analysis else ''}")
+ f"{_scope_label(allowlist, doc_ids, reprocess)}{' [SKIP-ANALYSIS: 분해+임베딩만]' if skip_analysis else ''}"
+ f"{' [RE-DECOMPOSE: 기존 hier DELETE→CASCADE chunk_section_analysis→재INSERT; 스냅샷 선행 필수]' if reprocess else ''}")
engine = _make_engine()
sm = async_sessionmaker(engine, expire_on_commit=False)
@@ -219,7 +252,7 @@ async def cmd_run(args):
run_start = time.time()
try:
async with sm() as session:
- cands = (await session.execute(_candidate_sql(allowlist, doc_ids),
+ cands = (await session.execute(_candidate_sql(allowlist, doc_ids, reprocess),
_candidate_params(allowlist, doc_ids))).mappings().all()
_log(f"후보 doc {len(cands)} 선별. 시작.")
@@ -268,6 +301,101 @@ async def cmd_run(args):
d = Counter(all_types)
_log(f" section_type: {dict(d.most_common())} other={d.get('other',0)/len(all_types):.1%}")
+ # [g3-t3/g3-t4] post-run sweep: 처리한 doc 중 미분석 leaf 잔여 집계(반쪽상태/stall 검출).
+ # GOAL(jump=char_start)/rail-summary(re-analyze) DECOUPLE — 잔여는 다음 실행이 LEAF_SQL 멱등으로 흡수.
+ if doc_ids:
+ try:
+ async with sm() as session:
+ pending = (await session.execute(text(f"""
+ SELECT dc.doc_id, count(*) AS unanalyzed
+ FROM document_chunks dc
+ WHERE dc.doc_id = ANY(:ids) AND dc.source_type='hier_section' AND dc.is_leaf=true
+ AND NOT EXISTS (SELECT 1 FROM chunk_section_analysis a
+ WHERE a.chunk_id = dc.id AND a.prompt_version = :pv
+ AND a.source_content_hash = dc.chunk_content_hash)
+ GROUP BY dc.doc_id ORDER BY unanalyzed DESC"""),
+ {"ids": doc_ids, "pv": PROMPT_VERSION})).mappings().all()
+ if pending:
+ tot = sum(r["unanalyzed"] for r in pending)
+ _log(f" [sweep] 미분석 leaf 잔여: {tot} (doc {len(pending)}) — 다음 실행이 이어서 분석(멱등). "
+ f"상위: {[(r['doc_id'], r['unanalyzed']) for r in pending[:5]]}")
+ else:
+ _log(" [sweep] 미분석 leaf 잔여 0 — 분석 수렴.")
+ except Exception as exc:
+ _log(f" [sweep] 잔여 집계 실패(무해): {type(exc).__name__}")
+
+
+def _is_jump_target(node) -> bool:
+ """Return True for a jump-target: a non-window leaf OR a ``*_split`` parent (evaluated on the builder's HierNode; original note: matches _JUMP_TARGET_PRED)."""
+ return ((node.is_leaf and node.node_type != "window")
+ or bool(node.node_type and node.node_type.endswith("_split")))
+
+
+async def cmd_update_char_start(args):
+ """[g3-tU] Non-destructive char_start UPDATE, intended for hash_stable docs only.
+
+ Per doc: build(md_content) → align with the stored hier rows position-by-position (chunk_index order) →
+ [NEW-1] VERIFY that the hash matches at every position (ALL-OR-NOTHING). A single mismatch → DEMOTE.
+ [NEW-2] Never WHERE on the hash (avoids collisions between sections with identical bodies) — UPDATE by the stored row PK (id) at that position.
+ Passing doc: UPDATE document_chunks SET char_start (no DELETE/CASCADE/embed/analyze; reversible).
+ Failing doc: emitted to the DEMOTE list → to be UNIONed into the re-decompose batch (NEW-4). DEMOTE_DOC_IDS= is printed last on stdout.
+ """
+ doc_ids = _parse_doc_ids(args)
+ if not doc_ids:
+ _log("REFUSE: update-char-start 는 --doc 필수 (hash_stable 32 = gm-t1 산출)")
+ sys.exit(2)
+ engine = _make_engine()
+ sm = async_sessionmaker(engine, expire_on_commit=False)
+ updated, demoted, noop = [], [], []
+ try:
+ for doc_id in doc_ids:
+ async with sm() as session:
+ md = await session.scalar(text("SELECT md_content FROM documents WHERE id=:d"), {"d": doc_id})
+ if not md or not md.strip():
+ noop.append(doc_id)
+ _log(f" doc={doc_id} md_content 없음 → no-op(suspect, V4)")
+ continue
+ nodes = build_hier_tree(md)
+ stored = (await session.execute(text("""
+ SELECT id, chunk_index, chunk_content_hash, node_type, is_leaf
+ FROM document_chunks
+ WHERE doc_id=:d AND source_type='hier_section'
+ ORDER BY chunk_index"""), {"d": doc_id})).mappings().all()
+ # [NEW-2] Positional alignment: build node[i] ↔ stored[i] (original note: chunk_index = base + idx, hence the same order).
+ # A differing node count means the structure changed (= hash_changed) → DEMOTE.
+ if len(nodes) != len(stored):
+ demoted.append(doc_id)
+ _log(f" doc={doc_id} 노드수 build {len(nodes)} ≠ stored {len(stored)} → DEMOTE(re-decompose)")
+ continue
+ # [NEW-1] VERIFY the hash at every position (positional alignment also validates ordering).
+ # Any mismatching position → DEMOTE (100% required: per the original note even a 1% jump-target miss causes a whole-doc fallback regression).
+ mismatch = next((i for i, (nd, sr) in enumerate(zip(nodes, stored))
+ if nd.chunk_content_hash != sr["chunk_content_hash"]), None)
+ if mismatch is not None:
+ demoted.append(doc_id)
+ _log(f" doc={doc_id} position {mismatch} hash 불일치 → DEMOTE(re-decompose, NEW-1)")
+ continue
+ # Passed → UPDATE char_start of the jump-targets via the stored row PK.
+ n_upd = 0
+ for nd, sr in zip(nodes, stored):
+ if _is_jump_target(nd) and nd.char_start is not None:
+ await session.execute(
+ text("UPDATE document_chunks SET char_start=:cs WHERE id=:id"),
+ {"cs": nd.char_start, "id": sr["id"]})
+ n_upd += 1
+ await session.commit()
+ updated.append(doc_id)
+ _log(f" ✓ doc={doc_id} char_start UPDATE {n_upd} jump-target (VERIFY 100%, 비파괴)")
+ finally:
+ await engine.dispose()
+ _log(f"=== update-char-start: updated={len(updated)} demoted={len(demoted)} noop={len(noop)} ===")
+ if demoted:
+ _log(f" DEMOTE(re-decompose 배치 합류, NEW-4): {demoted}")
+ if noop:
+ _log(f" NO-OP(md_content NULL suspect, V4): {noop}")
+ # Machine-readable: re-decompose --doc = (gm-t1 hash_changed 230) UNION (this list)
+ print("DEMOTE_DOC_IDS=" + ",".join(str(x) for x in demoted), flush=True)
+
def main():
ap = argparse.ArgumentParser(description="오버나이트 hier 분해+절 분석 backfill (additive)")
@@ -275,13 +403,20 @@ def main():
p_dry = sub.add_parser("dry-run", help="후보 doc 집계 (작업 0)")
p_dry.add_argument("--domains", default=None, help="comma-sep allowlist (미지정=뉴스 제외 전부)")
p_dry.add_argument("--doc", default=None, help="comma-sep doc id (크기 게이트 우회 — 구조화 소형 문서 coverage 보정)")
+ p_dry.add_argument("--reprocess", action="store_true", help="재분해 후보(기존 hier+jump-target char_start 부재) — --doc 필수")
p_run = sub.add_parser("run", help="분해+분석 실행 (deadline time-box)")
p_run.add_argument("--deadline", default="07:00", help="HH:MM (기본 07:00 — 컨테이너 UTC 주의, 07:00 KST=22:00 UTC)")
p_run.add_argument("--domains", default=None, help="comma-sep allowlist (미지정=뉴스 제외 전부)")
p_run.add_argument("--doc", default=None, help="comma-sep doc id (크기 게이트 우회 — 구조화 소형 문서 coverage 보정)")
p_run.add_argument("--skip-analysis", action="store_true", help="절 분석(Mac mini) 생략, 분해+임베딩만 (retrieval go/no-go 측정 준비용)")
+ p_run.add_argument("--reprocess", action="store_true",
+ help="[g3-t2] RE-DECOMPOSE: 기존 hier DELETE→CASCADE→재INSERT (md_content 출처, char_start). "
+ "--doc(REFINED PASS hash_changed∪demote) 필수 / 스냅샷 선행 필수")
+ p_upd = sub.add_parser("update-char-start",
+ help="[g3-tU] hash_stable doc 비파괴 char_start UPDATE (100% VERIFY, --doc 필수)")
+ p_upd.add_argument("--doc", default=None, help="comma-sep doc id (gm-t1 hash_stable 32)")
args = ap.parse_args()
- fn = {"dry-run": cmd_dry_run, "run": cmd_run}[args.cmd]
+ fn = {"dry-run": cmd_dry_run, "run": cmd_run, "update-char-start": cmd_update_char_start}[args.cmd]
asyncio.run(fn(args))
diff --git a/tests/hier_decomp/test_builder_char_start.py b/tests/hier_decomp/test_builder_char_start.py
new file mode 100644
index 0000000..d5f2815
--- /dev/null
+++ b/tests/hier_decomp/test_builder_char_start.py
@@ -0,0 +1,95 @@
+"""builder.py char_start emit 단위테스트 (플랜 ds-outline-anchor-b5 g2 / g0-t2).
+
+핵심 불변식:
+ - char_start = FE outlineAnchors.ts 라인/offset 모델(split('\n') + UTF-16 code unit + 코드펜스)과 동일.
+ - astral(BMP 밖) prefix 가 있어도 UTF-16 code unit offset 이어야 함 (#2 SILENT 단위버그 게이트).
+ - window-child char_start=None, split-parent char_start=heading offset (B1/#1).
+ - 코드펜스 내부 heading 미탐지 (O3).
+ - 라인모델 변경이 node.text 를 바꾸지 않음(hash-neutral) — hash_stable doc 보존.
+"""
+from __future__ import annotations
+
+import hashlib
+
+from app.services.hier_decomp.builder import build_hier_tree, coverage_stats, _utf16_units
+
+
+def _fe_offset_of_line(md: str, target_line: str) -> int | None:
+ """Reproduces FE outlineAnchors.ts:55-65 — char_start must equal this value."""
+ off = 0
+ for raw in md.split("\n"):
+ if raw == target_line:  # first exact line match wins
+ return off
+ off += len(raw.encode("utf-16-le")) // 2 + 1  # UTF-16 code units in the line, +1 for the "\n" separator
+ return None
+
+
+def _u16_slice(md: str, cs: int, n: int) -> str:  # take n UTF-16 code units of md starting at code-unit offset cs
+ return md.encode("utf-16-le")[2 * cs: 2 * (cs + n)].decode("utf-16-le")  # 2 bytes per UTF-16 code unit
+
+
+def test_char_start_matches_fe_offset_and_slices():  # each emitted char_start equals the FE-model offset of the node's first text line
+ md = "# Alpha\nbody alpha here\n\n## Beta\nbody beta\n# Gamma\nlast line"
+ nodes = build_hier_tree(md, leaf_hard_max=100000)
+ seen = 0
+ for n in nodes:
+ if n.char_start is None:  # nodes without an offset are not checked here
+ continue
+ seen += 1
+ head = n.text.split("\n", 1)[0]  # first line of the node text
+ assert n.char_start == _fe_offset_of_line(md, head), n.section_title
+ assert _u16_slice(md, n.char_start, _utf16_units(head)) == head
+ assert seen >= 2  # at least two nodes must have carried a char_start
+
+
+def test_astral_prefix_offset_is_utf16_not_codepoint():
+ # 📄 = U+1F4C4 is 1 code point but a UTF-16 surrogate pair (2 code units).
+ md = "\U0001F4C4 manifest\n\n# Section One\nbody"
+ nodes = build_hier_tree(md)
+ sec = next(n for n in nodes if n.section_title == "Section One")
+ fe = _fe_offset_of_line(md, "# Section One")
+ assert sec.char_start == fe
+ # The UTF-16 slice is exact.
+ assert _u16_slice(md, sec.char_start, _utf16_units("# Section One")) == "# Section One"
+ # The code-point slice must be off (because of the astral char) — with the unit bug, the code-point slice would have matched.
+ assert md[sec.char_start: sec.char_start + len("# Section One")] != "# Section One"
+
+
+def test_fenced_heading_not_detected():  # a heading-looking line inside a ``` fence must not become a section title
+ md = "# Real\nintro\n```\n# Fake In Fence\n```\n# Real Two\nx"
+ titles = [n.section_title for n in build_hier_tree(md) if n.section_title]
+ assert "Fake In Fence" not in titles
+ assert "Real" in titles and "Real Two" in titles
+
+
+def test_window_child_null_split_parent_has_offset():  # *_split parents carry char_start; window children carry None
+ md = "# BigSection\n" + ("paragraph text here. " * 20 + "\n\n") * 60
+ nodes = build_hier_tree(md, leaf_hard_max=5000, leaf_target_max=3000)
+ sp = [n for n in nodes if n.node_type and n.node_type.endswith("_split")]
+ wc = [n for n in nodes if n.node_type == "window"]
+ assert sp and sp[0].char_start is not None
+ assert wc and all(w.char_start is None for w in wc)
+
+
+def test_node_text_preserved_hash_neutral():
+ # Even with the line-model change (split vs splitlines), concatenating the leaves reconstructs the source → hashes unchanged.
+ md = "# A\nl1\nl2\n# B\nl3\n# C\nl4\n"
+ nodes = build_hier_tree(md, leaf_hard_max=100000)
+ recon = "".join(n.text for n in nodes if n.is_leaf or (n.node_type and n.node_type.endswith("_split")))
+ assert recon == md
+
+
+def test_preamble_char_start_none():  # a title-less level-0 node (the text before the first heading) has char_start None
+ md = "intro paragraph with no heading\nmore intro\n# First\nbody"
+ nodes = build_hier_tree(md, leaf_hard_max=100000)
+ preamble = [n for n in nodes if n.section_title is None and n.level == 0]
+ assert preamble and preamble[0].char_start is None
+
+
+def test_coverage_stats_char_start_telemetry():  # coverage_stats reports char_start_total / char_start_verified / non_nfc counters
+ md = "# Alpha\nbody\n# Beta\nbody2"
+ nodes = build_hier_tree(md, leaf_hard_max=100000)
+ st = coverage_stats(md, nodes)
+ assert st["char_start_total"] >= 2
+ assert st["char_start_verified"] == st["char_start_total"] # all pass O5
+ assert st["non_nfc"] == 0