feat(search): hier persist + partial ivfflat index on in_corpus (Hier-Decomp-1 c4)

persist_hier_tree(): build_hier_tree → document_chunks insert. source_type=hier_section,
in_corpus=false, is_leaf 노드만 bge-m3 embedding. idempotent(기존 hier 행 삭제 후 재삽입).
chunk_index = doc 별 (max+1) offset → 기존 (doc_id,chunk_index) unique 충돌 회피.
embedding NULL 파라미터 asyncpg 타입추론 → cast(cast(:emb AS text) AS vector) 이중캐스트.

migration 284/285: ivfflat 오염 fix. full 인덱스는 in_corpus=false hier 벡터까지 색인 →
근사 검색이 비활성 벡터에 오염(corpus_chunks 필터해도 근사 이웃 셋 흔들림). partial index
(WHERE in_corpus=true)로 교체 → in_corpus=false 는 검색 인덱스에 부재 = 무영향 인덱스 레벨 보장.

c4 pilot(5140/5186/5225) G3: 트리 insert, embed_coverage 1.0(doc-local 100%), in_corpus_true=0,
dangling_parent=0, dup 0. **부분인덱스 후 검색 baseline IDENTICAL to 원래(pre-hier)** = 691 hier
행 영향 0 검증(오염 fix 효과). replace 는 c5/c6.

plan: hierarchical-decomposition-tiered-nesting-marmot.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-05-24 13:12:42 +00:00
parent d982dce7d1
commit fa82bd495b
3 changed files with 91 additions and 0 deletions
+79
View File
@@ -0,0 +1,79 @@
"""Persist a hier tree into document_chunks (PR-DocSrv-Hierarchical-Decomposition-1 c4).
Inserts the result of build_hier_tree into document_chunks with source_type='hier_section' and
in_corpus=false (inactive for search); only is_leaf nodes get an embedding. Re-running is idempotent
(existing hier rows are deleted, then re-inserted).
chunk_index is offset per doc by (max+1) to avoid (doc_id, chunk_index) unique collisions with existing legacy rows.
Shared by c4 (pilot), c6 (replace), and future backfills.
"""
from __future__ import annotations
from typing import Awaitable, Callable
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from services.hier_decomp.builder import build_hier_tree, coverage_stats
# Written to document_chunks.chunker_version on every inserted row, and used
# (together with SOURCE_TYPE) to select the rows removed by the idempotent DELETE.
CHUNKER_VERSION = "hier-rule-v1"
# Written to document_chunks.source_type; marks rows produced by this module.
SOURCE_TYPE = "hier_section"
async def persist_hier_tree(
    session: AsyncSession,
    doc_id: int,
    source_text: str,
    embed_leaf: Callable[[str], Awaitable[list[float] | None]],
    *,
    domain_category: str | None = None,
) -> dict:
    """Rebuild the hier_section rows of one document in document_chunks.

    The tree is produced by ``build_hier_tree(source_text)``. Existing rows of
    this document with ``source_type=SOURCE_TYPE`` and
    ``chunker_version=CHUNKER_VERSION`` are deleted and the tree is re-inserted,
    so calling this again for the same document replaces the previous result.
    All rows are inserted with ``in_corpus=false``; only leaf nodes get an
    embedding. The function commits the session before returning.

    Args:
        session: Async SQLAlchemy session used for all statements and the commit.
        doc_id: ``document_chunks.doc_id`` of the document being rebuilt.
        source_text: Text handed to ``build_hier_tree`` and ``coverage_stats``.
        embed_leaf: Awaitable callback returning the embedding for a leaf's
            text, or ``None``/empty when no embedding is available (the row is
            then stored with a NULL embedding).
        domain_category: Value for the ``domain_category`` column. When
            ``None``, the most frequent ``domain_category`` among the
            document's existing rows is used, falling back to ``'general'``.

    Returns:
        ``{"doc_id", "nodes": 0, "leaves": 0, "skipped": "empty"}`` when the
        builder yields no nodes (nothing is written). Otherwise the mapping
        returned by ``coverage_stats`` updated with ``doc_id``,
        ``base_chunk_index``, ``embedded_leaves``, ``embed_coverage`` and
        ``domain_category``.
    """
    nodes = build_hier_tree(source_text)
    if not nodes:
        return {"doc_id": doc_id, "nodes": 0, "leaves": 0, "skipped": "empty"}

    # Embed every leaf BEFORE issuing any SQL. Embedding is the slow, failure-prone
    # step; doing it first keeps it out of the window between the destructive
    # DELETE and the commit, and an embedding error leaves the table untouched.
    # Entries are kept positionally parallel to `nodes` (None = no embedding).
    vector_literals: list[str | None] = []
    leaf_count = 0
    for n in nodes:
        literal = None
        if n.is_leaf:
            leaf_count += 1
            emb = await embed_leaf(n.text)
            if emb:
                # pgvector text literal: "[v1,v2,...]"
                literal = "[" + ",".join(repr(float(x)) for x in emb) + "]"
        vector_literals.append(literal)
    embedded = sum(1 for v in vector_literals if v is not None)

    # domain_category is NOT NULL: majority vote over the doc's existing rows,
    # falling back to 'general' when the doc has none.
    if domain_category is None:
        domain_category = await session.scalar(text("""
            SELECT domain_category FROM document_chunks WHERE doc_id=:d
            GROUP BY domain_category ORDER BY count(*) DESC LIMIT 1"""), {"d": doc_id}) or "general"

    # Idempotency: drop the hier rows written by a previous run of this chunker version.
    await session.execute(
        text("DELETE FROM document_chunks WHERE doc_id=:d AND source_type=:st AND chunker_version=:cv"),
        {"d": doc_id, "st": SOURCE_TYPE, "cv": CHUNKER_VERSION},
    )

    # Offset chunk_index past whatever remains for this doc so the new rows do
    # not collide on (doc_id, chunk_index).
    base = (await session.scalar(
        text("SELECT COALESCE(MAX(chunk_index),-1)+1 FROM document_chunks WHERE doc_id=:d"),
        {"d": doc_id},
    )) or 0

    # Loop-invariant statement, built once. The double cast lets a NULL :emb
    # parameter be typed as text first and only then converted to vector.
    insert_stmt = text("""
        INSERT INTO document_chunks
        (doc_id, chunk_index, chunk_type, section_title, heading_path, domain_category,
        text, embedding, source_type, chunker_version, chunk_content_hash,
        parent_id, level, node_type, is_leaf, in_corpus)
        VALUES (:d, :ci, :ct, :stt, :hp, :dc, :tx,
        cast(cast(:emb AS text) AS vector),
        :src, :cv, :hash, :pid, :lvl, :nt, :leaf, false)
        RETURNING id""")

    idx_to_dbid: dict[int, int] = {}
    for n, emb_str in zip(nodes, vector_literals):
        # A parent always precedes its children in list order, so its DB id is
        # already recorded by the time a child is inserted.
        parent_db = idx_to_dbid.get(n.parent_idx) if n.parent_idx is not None else None
        chunk_type = "section_md" if n.is_leaf else "section_container"
        db_id = await session.scalar(insert_stmt, {
            "d": doc_id, "ci": base + n.idx, "ct": chunk_type,
            "stt": n.section_title, "hp": n.heading_path, "dc": domain_category,
            "tx": n.text, "emb": emb_str, "src": SOURCE_TYPE, "cv": CHUNKER_VERSION,
            "hash": n.chunk_content_hash, "pid": parent_db, "lvl": n.level,
            "nt": n.node_type, "leaf": n.is_leaf})
        idx_to_dbid[n.idx] = db_id
    await session.commit()

    st = coverage_stats(source_text, nodes)
    st.update({
        "doc_id": doc_id,
        "base_chunk_index": base,
        "embedded_leaves": embedded,
        "embed_coverage": round(embedded / leaf_count, 4) if leaf_count else 0,
        "domain_category": domain_category,
    })
    return st
@@ -0,0 +1,5 @@
-- PR-DocSrv-Hierarchical-Decomposition-1 (c4): ivfflat contamination fix 1/2.
-- The existing full ivfflat index (idx_chunks_embedding) also contains in_corpus=false hier leaf vectors,
-- so approximate search is contaminated by inactive vectors (even when corpus_chunks filters the
-- results, the approximate neighbour set shifts).
-- Drop it first so it can be replaced by a partial index (WHERE in_corpus=true); re-created in migration 285.
DROP INDEX IF EXISTS idx_chunks_embedding;
@@ -0,0 +1,7 @@
-- PR-DocSrv-Hierarchical-Decomposition-1 (c4): ivfflat contamination fix 2/2.
-- Partial ivfflat index: only in_corpus=true vectors are indexed, so in_corpus=false rows
-- (inactive hier leaves) are absent from the search index, i.e. "no effect on search" is guaranteed
-- at the index level. On replace, toggling in_corpus automatically includes/excludes a row.
-- The corpus_chunks query (WHERE in_corpus=true) is intended to use this partial index.
CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks
USING ivfflat (embedding vector_cosine_ops) WITH (lists = '100')
WHERE in_corpus = true;