aeb9290cbd
플랜 ds-outline-anchor-b5 (g1~g6 코드). 핵심 ASME/법령 windowed 절의 0% 점프를
서버계산 char_start(builder offset)로 100% deterministic 점프로 전환.
- g1 migration 318: document_chunks.char_start INTEGER NULL (단일 statement, 멱등)
- g2 builder: char_start emit = FE 라인/offset 모델 미러(split('\n')+UTF-16 code unit+코드펜스 skip).
window-child=NULL, split-parent=heading offset, preamble=NULL, CR 미strip, NFC=telemetry.
node.text 보존(라인모델 hash-neutral) → hash_stable doc 보존. 단위테스트 7건.
- g3 persist+backfill 하이브리드:
* persist INSERT char_start
* update-char-start (g3-tU): hash_stable doc 비파괴 — 100% jump-target VERIFY(NEW-1) +
position-aligned PK UPDATE(NEW-2), 미달 doc DEMOTE → re-decompose 합류(NEW-4)
* --reprocess (g3-t2): md_content 출처(g0-t1) + jump-target-set 완료마커(B1) + B_jumptarget>=1(B3),
--doc 필수 else REFUSE. self-heal sweep(g3-t3).
- g4 /sections: char_start inner+outer SELECT + split-parent 노출(is_leaf OR %_split)
- g5 FE: resolveAnchorMap(BE-first, NEW-5 jump-target-candidate-scoped 폴백, C1 OR-exclude),
per-render-site basis guard(C3), endsWith('_split') 정정 + collapseWindows split-parent 흡수(C2).
단위테스트 25건(NEW-5/B4/C1/C2 포함).
- g6 hier_outline_quality_gate.py: read-only g-measure(verdict/B_jumptarget/hash_stable/dup/fence)
배포(g7: --no-deps, 스냅샷, UPDATE-only 32 + re-decompose 230∪demote, 정확도 게이트)는 별 ops 단계.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
80 lines
3.7 KiB
Python
80 lines
3.7 KiB
Python
"""Hier tree → document_chunks 영속화 (PR-DocSrv-Hierarchical-Decomposition-1 c4).
|
|
|
|
build_hier_tree 결과를 document_chunks 에 insert. source_type='hier_section',
|
|
in_corpus=false(검색 비활성), is_leaf 노드만 embedding. 재실행 idempotent(기존 hier 행 삭제 후 재삽입).
|
|
chunk_index = doc 별 (max+1) offset → 기존 legacy 와 (doc_id,chunk_index) unique 충돌 회피.
|
|
c4(pilot)/c6(replace)/향후 backfill 공용.
|
|
"""
|
|
from __future__ import annotations
|
|
from typing import Awaitable, Callable
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from services.hier_decomp.builder import build_hier_tree, coverage_stats
|
|
|
|
# Stored in the chunker_version column of every row inserted by persist_hier_tree,
# and used (together with SOURCE_TYPE) to select the rows deleted on a re-run.
CHUNKER_VERSION = "hier-rule-v1"
|
|
# Stored in the source_type column of every row inserted by persist_hier_tree,
# and used (together with CHUNKER_VERSION) to select the rows deleted on a re-run.
SOURCE_TYPE = "hier_section"
|
|
|
|
|
|
async def persist_hier_tree(
    session: AsyncSession,
    doc_id: int,
    source_text: str,
    embed_leaf: Callable[[str], Awaitable[list[float] | None]],
    *,
    domain_category: str | None = None,
) -> dict:
    """Rebuild the hier_section rows of one document and return a stats dict.

    Builds the tree from ``source_text`` with ``build_hier_tree``, deletes the
    document's rows that carry this module's source type and chunker version,
    inserts one row per node (``in_corpus`` is always false), and commits.

    Args:
        session: Async SQLAlchemy session. Every statement runs on it and it
            is committed once, after the last insert.
        doc_id: Value written to the ``doc_id`` column and used in all lookups.
        source_text: Text handed to ``build_hier_tree`` and ``coverage_stats``.
        embed_leaf: Awaited with the text of each leaf node. A falsy result
            leaves that row's embedding NULL and is not counted as embedded.
        domain_category: Value for the ``domain_category`` column. When None,
            the most frequent value among the document's existing rows is
            used, falling back to ``"general"``.

    Returns:
        The ``coverage_stats`` result extended with ``doc_id``,
        ``base_chunk_index``, ``embedded_leaves``, ``embed_coverage`` and
        ``domain_category``. When the builder yields no nodes, a short
        "skipped" dict is returned instead and nothing is written.
    """
    nodes = build_hier_tree(source_text)
    if not nodes:
        return {"doc_id": doc_id, "nodes": 0, "leaves": 0, "skipped": "empty"}

    # domain_category is NOT NULL: take the majority value among the
    # document's existing chunks, falling back to 'general'.
    if domain_category is None:
        majority = await session.scalar(
            text("""
                SELECT domain_category FROM document_chunks WHERE doc_id=:d
                GROUP BY domain_category ORDER BY count(*) DESC LIMIT 1"""),
            {"d": doc_id},
        )
        domain_category = majority or "general"

    # Idempotency: drop the hier rows left by a previous run.
    await session.execute(
        text("DELETE FROM document_chunks WHERE doc_id=:d AND source_type=:st AND chunker_version=:cv"),
        {"d": doc_id, "st": SOURCE_TYPE, "cv": CHUNKER_VERSION},
    )

    # New rows are numbered after the highest chunk_index still present for
    # this document, so they cannot collide with the remaining rows.
    next_free = await session.scalar(
        text("SELECT COALESCE(MAX(chunk_index),-1)+1 FROM document_chunks WHERE doc_id=:d"),
        {"d": doc_id},
    )
    index_offset = next_free or 0

    insert_sql = """
        INSERT INTO document_chunks
        (doc_id, chunk_index, chunk_type, section_title, heading_path, domain_category,
        text, embedding, source_type, chunker_version, chunk_content_hash,
        parent_id, level, node_type, is_leaf, in_corpus, char_start)
        VALUES (:d, :ci, :ct, :stt, :hp, :dc, :tx,
        cast(cast(:emb AS text) AS vector),
        :src, :cv, :hash, :pid, :lvl, :nt, :leaf, false, :cs)
        RETURNING id"""

    db_id_by_idx: dict[int, int] = {}
    embedded_count = 0
    for node in nodes:  # parent always precedes child in list order
        if node.parent_idx is None:
            parent_db_id = None
        else:
            parent_db_id = db_id_by_idx.get(node.parent_idx)

        # Only leaves are embedded; the vector is sent as a '[v1,v2,...]'
        # text literal that the INSERT casts to the vector type.
        vector_literal = None
        if node.is_leaf:
            vector = await embed_leaf(node.text)
            if vector:
                components = [repr(float(value)) for value in vector]
                vector_literal = "[" + ",".join(components) + "]"
                embedded_count += 1

        row = {
            "d": doc_id,
            "ci": index_offset + node.idx,
            "ct": "section_md" if node.is_leaf else "section_container",
            "stt": node.section_title,
            "hp": node.heading_path,
            "dc": domain_category,
            "tx": node.text,
            "emb": vector_literal,
            "src": SOURCE_TYPE,
            "cv": CHUNKER_VERSION,
            "hash": node.chunk_content_hash,
            "pid": parent_db_id,
            "lvl": node.level,
            "nt": node.node_type,
            "leaf": node.is_leaf,
            "cs": node.char_start,
        }
        # Remember the generated primary key so children can reference it.
        db_id_by_idx[node.idx] = await session.scalar(text(insert_sql), row)

    await session.commit()

    leaf_total = sum(1 for node in nodes if node.is_leaf)
    stats = coverage_stats(source_text, nodes)
    stats.update({
        "doc_id": doc_id,
        "base_chunk_index": index_offset,
        "embedded_leaves": embedded_count,
        "embed_coverage": round(embedded_count / leaf_total, 4) if leaf_total else 0,
        "domain_category": domain_category,
    })
    return stats
|