"""Persist a hierarchical section tree into ``document_chunks``.

(PR-DocSrv-Hierarchical-Decomposition-1, c4.)

The nodes produced by ``build_hier_tree`` are inserted with
``source_type='hier_section'`` and ``in_corpus=false`` (search disabled); only
leaf nodes receive an embedding. Re-running is idempotent: the document's
existing hier rows are deleted and re-inserted. ``chunk_index`` is offset by the
document's current ``MAX(chunk_index) + 1`` so the new rows cannot collide with
legacy rows on the ``(doc_id, chunk_index)`` unique constraint.

Shared by c4 (pilot), c6 (replace) and future backfills.
"""
from __future__ import annotations

from typing import Awaitable, Callable

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from services.hier_decomp.builder import build_hier_tree, coverage_stats

CHUNKER_VERSION = "hier-rule-v1"
SOURCE_TYPE = "hier_section"


def _vector_literal(embedding: list[float] | None) -> str | None:
    """Render *embedding* as a ``[x1,x2,...]`` text literal, or None if missing/empty."""
    if embedding is None or len(embedding) == 0:
        return None
    return "[" + ",".join(repr(float(x)) for x in embedding) + "]"


async def persist_hier_tree(
    session: AsyncSession,
    doc_id: int,
    source_text: str,
    embed_leaf: Callable[[str], Awaitable[list[float] | None]],
    *,
    domain_category: str | None = None,
) -> dict:
    """Rebuild the ``hier_section`` tree of one document (idempotent).

    Args:
        session: Async session used for every statement. This function commits
            it on success; on failure the exception propagates and rolling back
            is left to the caller.
        doc_id: ``document_chunks.doc_id`` the tree belongs to.
        source_text: Full document text handed to ``build_hier_tree``.
        embed_leaf: Awaited once per leaf node with the node text. A ``None``
            or empty result stores the leaf without an embedding.
        domain_category: Explicit value for the NOT NULL ``domain_category``
            column. When omitted, the most frequent category among the
            document's existing chunks is used, falling back to ``'general'``.

    Returns:
        ``{"doc_id", "nodes": 0, "leaves": 0, "skipped": "empty"}`` when the
        builder yields no nodes (nothing is written). Otherwise the mapping
        returned by ``coverage_stats`` extended with ``doc_id``,
        ``base_chunk_index``, ``embedded_leaves``, ``embed_coverage`` and
        ``domain_category``.
    """
    nodes = build_hier_tree(source_text)
    if not nodes:
        return {"doc_id": doc_id, "nodes": 0, "leaves": 0, "skipped": "empty"}

    # Compute every leaf embedding BEFORE issuing any SQL. The embedder is an
    # arbitrary awaitable (presumably a remote call - confirm with the caller);
    # if it raised between the DELETE and the commit, the session would be left
    # holding a deleted-but-only-partly-reinserted tree, and the write
    # transaction would stay open across every embedding call.
    literals: list[str | None] = []
    leaf_count = 0
    embedded = 0
    for node in nodes:
        literal = None
        if node.is_leaf:
            leaf_count += 1
            literal = _vector_literal(await embed_leaf(node.text))
            if literal is not None:
                embedded += 1
        literals.append(literal)

    # Resolve domain_category (NOT NULL): majority vote of the legacy chunks.
    # Rows written by an earlier hier run must not out-vote the legacy rows, so
    # candidates are ranked by their legacy count first; the total count only
    # breaks ties and covers the case where no legacy rows remain.
    if domain_category is None:
        domain_category = await session.scalar(
            text("""
                SELECT domain_category
                FROM document_chunks
                WHERE doc_id = :d
                GROUP BY domain_category
                ORDER BY count(*) FILTER (WHERE source_type IS DISTINCT FROM :st) DESC,
                         count(*) DESC
                LIMIT 1"""),
            {"d": doc_id, "st": SOURCE_TYPE},
        ) or "general"

    # Idempotency: drop the hier rows a previous run of this chunker wrote.
    await session.execute(
        text(
            "DELETE FROM document_chunks "
            "WHERE doc_id=:d AND source_type=:st AND chunker_version=:cv"
        ),
        {"d": doc_id, "st": SOURCE_TYPE, "cv": CHUNKER_VERSION},
    )

    # First free chunk_index after the delete; keeps (doc_id, chunk_index) unique.
    base = (
        await session.scalar(
            text(
                "SELECT COALESCE(MAX(chunk_index),-1)+1 "
                "FROM document_chunks WHERE doc_id=:d"
            ),
            {"d": doc_id},
        )
    ) or 0

    # Built once: the statement is identical for every node.
    insert_stmt = text("""
        INSERT INTO document_chunks
            (doc_id, chunk_index, chunk_type, section_title, heading_path, domain_category,
             text, embedding, source_type, chunker_version, chunk_content_hash,
             parent_id, level, node_type, is_leaf, in_corpus)
        VALUES (:d, :ci, :ct, :stt, :hp, :dc, :tx,
                cast(cast(:emb AS text) AS vector),
                :src, :cv, :hash, :pid, :lvl, :nt, :leaf, false)
        RETURNING id""")

    idx_to_dbid: dict[int, int] = {}
    for node, literal in zip(nodes, literals):  # parent always precedes child in list order
        parent_db = (
            idx_to_dbid.get(node.parent_idx) if node.parent_idx is not None else None
        )
        db_id = await session.scalar(insert_stmt, {
            "d": doc_id,
            "ci": base + node.idx,
            "ct": "section_md" if node.is_leaf else "section_container",
            "stt": node.section_title,
            "hp": node.heading_path,
            "dc": domain_category,
            "tx": node.text,
            "emb": literal,  # NULL for containers and for leaves without a vector
            "src": SOURCE_TYPE,
            "cv": CHUNKER_VERSION,
            "hash": node.chunk_content_hash,
            "pid": parent_db,
            "lvl": node.level,
            "nt": node.node_type,
            "leaf": node.is_leaf,
        })
        idx_to_dbid[node.idx] = db_id
    await session.commit()

    st = coverage_stats(source_text, nodes)
    st.update({
        "doc_id": doc_id,
        "base_chunk_index": base,
        "embedded_leaves": embedded,
        "embed_coverage": round(embedded / leaf_count, 4) if leaf_count else 0,
        "domain_category": domain_category,
    })
    return st