From fa82bd495bff45fb3b53488d3afbbec7275fa0bf Mon Sep 17 00:00:00 2001
From: hyungi
Date: Sun, 24 May 2026 13:12:42 +0000
Subject: [PATCH] feat(search): hier persist + partial ivfflat index on
 in_corpus (Hier-Decomp-1 c4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

persist_hier_tree(): build_hier_tree → document_chunks insert.
source_type=hier_section, in_corpus=false, is_leaf 노드만 bge-m3 embedding.
idempotent(기존 hier 행 삭제 후 재삽입). chunk_index = doc 별 (max+1) offset →
기존 (doc_id,chunk_index) unique 충돌 회피. embedding NULL 파라미터 asyncpg
타입추론 → cast(cast(:emb AS text) AS vector) 이중캐스트.

migration 284/285: ivfflat 오염 fix. full 인덱스는 in_corpus=false hier 벡터까지
색인 → 근사 검색이 비활성 벡터에 오염(corpus_chunks 필터해도 근사 이웃 셋 흔들림).
partial index (WHERE in_corpus=true)로 교체 → in_corpus=false 는 검색 인덱스에
부재 = 무영향 인덱스 레벨 보장.

c4 pilot(5140/5186/5225) G3: 트리 insert, embed_coverage 1.0(doc-local 100%),
in_corpus_true=0, dangling_parent=0, dup 0. **부분인덱스 후 검색 baseline
IDENTICAL to 원래(pre-hier)** = 691 hier 행 영향 0 검증(오염 fix 효과).
replace 는 c5/c6.

plan: hierarchical-decomposition-tiered-nesting-marmot.md

Co-Authored-By: Claude Opus 4.7 (1M context)
---
Review notes (ignored by git am, not part of the commit message):
- persist_hier_tree now embeds all leaves before the first SQL statement, so
  the transaction and the DELETE are no longer held open while the embedder
  runs. Inserted rows and returned stats are unchanged.
- Code comments and docstrings were translated to English.
- The "index" blob ids below come from the original patch and are stale.

 app/services/hier_decomp/persist.py           | 111 ++++++++++++++++++
 migrations/284_drop_full_embedding_index.sql  |   5 +
 .../285_create_partial_embedding_index.sql    |   7 +
 3 files changed, 123 insertions(+)
 create mode 100644 app/services/hier_decomp/persist.py
 create mode 100644 migrations/284_drop_full_embedding_index.sql
 create mode 100644 migrations/285_create_partial_embedding_index.sql

diff --git a/app/services/hier_decomp/persist.py b/app/services/hier_decomp/persist.py
new file mode 100644
index 0000000..ec32bf7
--- /dev/null
+++ b/app/services/hier_decomp/persist.py
@@ -0,0 +1,111 @@
+"""Persist a hier tree into document_chunks (PR-DocSrv-Hierarchical-Decomposition-1 c4).
+
+Inserts the build_hier_tree result into document_chunks with
+source_type='hier_section' and in_corpus=false (inactive for search); only
+is_leaf nodes get an embedding. Re-running is idempotent: the existing hier
+rows are deleted and inserted again.
+chunk_index is offset by the per-doc (max+1), so it cannot collide with legacy
+rows on the (doc_id, chunk_index) unique constraint.
+Shared by c4 (pilot), c6 (replace) and future backfills.
+"""
+from __future__ import annotations
+from typing import Awaitable, Callable
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from services.hier_decomp.builder import build_hier_tree, coverage_stats
+
+CHUNKER_VERSION = "hier-rule-v1"
+SOURCE_TYPE = "hier_section"
+
+
+async def persist_hier_tree(
+    session: AsyncSession,
+    doc_id: int,
+    source_text: str,
+    embed_leaf: Callable[[str], Awaitable[list[float] | None]],
+    *,
+    domain_category: str | None = None,
+) -> dict:
+    """Rebuild the hier_section rows of one document and return a stats dict.
+
+    Args:
+        session: Async SQLAlchemy session; committed here on success.
+        doc_id: Document whose hier_section rows are replaced.
+        source_text: Text handed to build_hier_tree and coverage_stats.
+        embed_leaf: Awaited once per leaf node with the node text; a falsy
+            result leaves that leaf's embedding NULL.
+        domain_category: Value for the domain_category column. When None, the
+            most frequent value among the doc's existing chunks is used,
+            falling back to 'general'.
+
+    Returns:
+        A dict with doc_id, nodes=0, leaves=0 and skipped='empty' when the
+        builder yields no nodes; otherwise the coverage_stats dict extended
+        with doc_id, base_chunk_index, embedded_leaves, embed_coverage and
+        domain_category.
+    """
+    nodes = build_hier_tree(source_text)
+    if not nodes:
+        return {"doc_id": doc_id, "nodes": 0, "leaves": 0, "skipped": "empty"}
+
+    # Embed every leaf before the first SQL statement. A SQLAlchemy session
+    # begins its transaction on first use, so awaiting the embedder inside the
+    # insert loop would hold that transaction (and the pending DELETE) open for
+    # the whole embedding run, and an embedder error would hit mid-rewrite.
+    emb_by_idx: dict[int, str] = {}
+    embedded = 0
+    for n in nodes:
+        if not n.is_leaf:
+            continue
+        emb = await embed_leaf(n.text)
+        if emb:
+            # pgvector text form "[v1,v2,...]"; cast to vector inside the INSERT
+            emb_by_idx[n.idx] = "[" + ",".join(repr(float(x)) for x in emb) + "]"
+            embedded += 1
+
+    # domain_category is NOT NULL: majority vote over the doc's existing chunks,
+    # falling back to 'general'
+    if domain_category is None:
+        domain_category = await session.scalar(text("""
+            SELECT domain_category FROM document_chunks WHERE doc_id=:d
+            GROUP BY domain_category ORDER BY count(*) DESC LIMIT 1"""), {"d": doc_id}) or "general"
+
+    # idempotency: delete the hier rows previously written by this chunker version
+    await session.execute(text(
+        "DELETE FROM document_chunks WHERE doc_id=:d AND source_type=:st AND chunker_version=:cv"),
+        {"d": doc_id, "st": SOURCE_TYPE, "cv": CHUNKER_VERSION})
+
+    # first free chunk_index for this doc, read after the DELETE above
+    base = (await session.scalar(text(
+        "SELECT COALESCE(MAX(chunk_index),-1)+1 FROM document_chunks WHERE doc_id=:d"), {"d": doc_id})) or 0
+
+    idx_to_dbid: dict[int, int] = {}
+    for n in nodes:  # parent always precedes child in list order
+        parent_db = idx_to_dbid.get(n.parent_idx) if n.parent_idx is not None else None
+        chunk_type = "section_md" if n.is_leaf else "section_container"
+        # :emb is bound as text first so a NULL parameter still gets a type
+        # (asyncpg type inference), then cast to vector
+        db_id = await session.scalar(text("""
+            INSERT INTO document_chunks
+              (doc_id, chunk_index, chunk_type, section_title, heading_path, domain_category,
+               text, embedding, source_type, chunker_version, chunk_content_hash,
+               parent_id, level, node_type, is_leaf, in_corpus)
+            VALUES (:d, :ci, :ct, :stt, :hp, :dc, :tx,
+                    cast(cast(:emb AS text) AS vector),
+                    :src, :cv, :hash, :pid, :lvl, :nt, :leaf, false)
+            RETURNING id"""), {
+            "d": doc_id, "ci": base + n.idx, "ct": chunk_type,
+            "stt": n.section_title, "hp": n.heading_path, "dc": domain_category,
+            "tx": n.text, "emb": emb_by_idx.get(n.idx), "src": SOURCE_TYPE, "cv": CHUNKER_VERSION,
+            "hash": n.chunk_content_hash, "pid": parent_db, "lvl": n.level,
+            "nt": n.node_type, "leaf": n.is_leaf})
+        idx_to_dbid[n.idx] = db_id
+    await session.commit()
+
+    leaves = [n for n in nodes if n.is_leaf]
+    st = coverage_stats(source_text, nodes)
+    st.update({"doc_id": doc_id, "base_chunk_index": base, "embedded_leaves": embedded,
+               "embed_coverage": round(embedded / len(leaves), 4) if leaves else 0,
+               "domain_category": domain_category})
+    return st
diff --git a/migrations/284_drop_full_embedding_index.sql b/migrations/284_drop_full_embedding_index.sql
new file mode 100644
index 0000000..a3b749a
--- /dev/null
+++ b/migrations/284_drop_full_embedding_index.sql
@@ -0,0 +1,5 @@
+-- PR-DocSrv-Hierarchical-Decomposition-1 (c4): ivfflat contamination fix 1/2.
+-- The existing full ivfflat index (idx_chunks_embedding) also holds in_corpus=false hier leaf vectors,
+-- so approximate search is skewed by inactive vectors (the neighbour set shifts even after filtering to corpus_chunks).
+-- Drop it first so it can be replaced by a partial index (WHERE in_corpus=true); 285 recreates it.
+DROP INDEX IF EXISTS idx_chunks_embedding;
diff --git a/migrations/285_create_partial_embedding_index.sql b/migrations/285_create_partial_embedding_index.sql
new file mode 100644
index 0000000..9f13a3b
--- /dev/null
+++ b/migrations/285_create_partial_embedding_index.sql
@@ -0,0 +1,7 @@
+-- PR-DocSrv-Hierarchical-Decomposition-1 (c4): ivfflat contamination fix 2/2.
+-- Partial ivfflat: only in_corpus=true vectors are indexed, so in_corpus=false rows (inactive hier leaves)
+-- are absent from the search index: "no effect on search" holds at index level. Toggling in_corpus on replace includes/excludes rows automatically.
+-- corpus_chunks queries (WHERE in_corpus=true) use this partial index.
+CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON document_chunks
+    USING ivfflat (embedding vector_cosine_ops) WITH (lists = '100')
+    WHERE in_corpus = true;