Files
hyungi_document_server/app/services/hier_decomp/replace.py
T
hyungi a7b16b63db feat(search): doc-level atomic corpus replace + isolation test (Hier-Decomp-1 c5)
replace_doc_corpus(dry_run): G5 precond(doc-local embed 100% + parent 무결성 + leaf>0) 검증 후
단일 트랜잭션 atomic 교체(legacy in_corpus=false / hier leaf in_corpus=true,
predicate=is_leaf AND embedding NOT NULL, node_type 미사용). 물리삭제 없음. rollback_doc_corpus 역토글.
precond 미충족 시 변경 0(legacy 유지).

tests/hier_decomp/test_corpus_isolation.py: in_corpus=false leaf 가 corpus_chunks 누출 0 단언
(부분 ivfflat + 뷰 이중 choke point 회귀 가드).

c5: dry-run 3 pilot precond_ok(5140 158L→271leaf / 5186 381→199 / 5225 18→164), 격리 테스트 PASS.
실제 replace 는 c6(1-doc-first).

plan: hierarchical-decomposition-tiered-nesting-marmot.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 13:14:36 +00:00

73 lines
3.6 KiB
Python

"""Per-document atomic corpus replacement (PR-DocSrv-Hierarchical-Decomposition-1 c5/c6).

Switches the search corpus from legacy window chunks to hier_section leaf
chunks by toggling ``in_corpus``.

- No physical deletion (only the in_corpus flag changes). The partial ivfflat
  index reflects the change automatically.
- G5 precondition (doc-local): hier leaf > 0 + every leaf has an embedding
  (doc-local 100%) + parent integrity (0 dangling).
- Atomic, single transaction. Failure / unmet precondition -> zero changes
  (legacy stays active).
- rollback: reverse the in_corpus toggle (see rollback_doc_corpus below).
"""
from __future__ import annotations
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

# Chunker version tag. It is bound as :cv in the hier-leaf activation UPDATE of
# replace_doc_corpus, so only rows with this chunker_version are switched in.
CHUNKER_VERSION = "hier-rule-v1"
async def precheck(session: AsyncSession, doc_id: int) -> dict:
    """Evaluate the doc-local preconditions for swapping one document's corpus.

    Runs a single aggregate query over ``document_chunks`` for ``doc_id``.

    The hier leaf counts are restricted to ``chunker_version = CHUNKER_VERSION``,
    the same filter ``replace_doc_corpus`` applies when it activates leaves, so
    the guard (and the dry-run ``would_activate`` figure) describes exactly the
    rows that a real replace would switch on.

    Args:
        session: Async SQLAlchemy session used to run the query (read-only here).
        doc_id: Document whose chunks are inspected.

    Returns:
        A dict with:
        - ``doc_id``
        - ``hier_leaves``: hier_section leaves of the current chunker version
        - ``hier_leaves_embedded``: those leaves that have a non-NULL embedding
        - ``doc_local_embed_100``: True when there is at least one leaf and all
          leaves are embedded
        - ``legacy_active``: legacy chunks currently ``in_corpus``
        - ``dangling_parent``: hier_section rows whose ``parent_id`` does not
          point at a hier_section row of the same document
        - ``precond_ok``: True when all preconditions hold
        - ``reason``: ``None`` when ok, otherwise the first failing check:
          ``"no_hier_leaves"``, ``"embed_incomplete"`` or ``"dangling_parent"``
    """
    stats = (
        await session.execute(
            text(
                """
                SELECT
                  count(*) FILTER (
                    WHERE source_type='hier_section' AND chunker_version=:cv AND is_leaf
                  ) AS hier_leaves,
                  count(*) FILTER (
                    WHERE source_type='hier_section' AND chunker_version=:cv AND is_leaf
                      AND embedding IS NOT NULL
                  ) AS hier_leaves_emb,
                  count(*) FILTER (WHERE source_type='legacy' AND in_corpus) AS legacy_active,
                  count(*) FILTER (
                    WHERE source_type='hier_section' AND parent_id IS NOT NULL
                      AND parent_id NOT IN (
                        SELECT id FROM document_chunks
                        WHERE doc_id=:d AND source_type='hier_section'
                      )
                  ) AS dangling
                FROM document_chunks WHERE doc_id=:d
                """
            ),
            {"d": doc_id, "cv": CHUNKER_VERSION},
        )
    ).one()

    leaves = stats.hier_leaves
    leaves_emb = stats.hier_leaves_emb
    doc_local_100 = leaves > 0 and leaves_emb == leaves

    # Report the first failing check, in the same priority order as the guard.
    if leaves == 0:
        reason = "no_hier_leaves"
    elif not doc_local_100:
        reason = "embed_incomplete"
    elif stats.dangling != 0:
        reason = "dangling_parent"
    else:
        reason = None

    return {
        "doc_id": doc_id,
        "hier_leaves": leaves,
        "hier_leaves_embedded": leaves_emb,
        "doc_local_embed_100": doc_local_100,
        "legacy_active": stats.legacy_active,
        "dangling_parent": stats.dangling,
        "precond_ok": reason is None,
        "reason": reason,
    }
async def replace_doc_corpus(session: AsyncSession, doc_id: int, *, dry_run: bool = True) -> dict:
    """Swap one document's search corpus from legacy chunks to hier leaves.

    Runs ``precheck`` first. If the precondition fails, nothing is changed.
    With ``dry_run=True`` (the default) nothing is changed either; the result
    only reports what would be toggled. Otherwise the legacy chunks are taken
    out of the corpus and the embedded hier_section leaves of
    ``CHUNKER_VERSION`` are put in, and the session is committed.

    Rows are never deleted; only ``in_corpus`` is toggled.

    Args:
        session: Async SQLAlchemy session. On a real replace this function
            commits it; on failure or an aborted swap it rolls it back.
        doc_id: Document whose corpus is replaced.
        dry_run: When True, report only and change nothing.

    Returns:
        The ``precheck`` dict extended with ``dry_run`` and ``action``:
        - ``"aborted"``: precondition not met, or the swap left no hier leaf
          active (then ``reason`` is ``"no_hier_leaves_activated"``)
        - ``"dry_run"``: adds ``would_deactivate_legacy`` and
          ``would_activate_hier_leaves``
        - ``"replaced"``: adds ``legacy_deactivated`` and ``hier_activated``
          (row counts of the two UPDATEs)

    Raises:
        Any exception raised by the session while executing or committing is
        re-raised after the session has been rolled back.
    """
    pc = await precheck(session, doc_id)
    pc["dry_run"] = dry_run

    if not pc["precond_ok"]:
        pc["action"] = "aborted"
        return pc

    if dry_run:
        pc["action"] = "dry_run"
        pc["would_deactivate_legacy"] = pc["legacy_active"]
        pc["would_activate_hier_leaves"] = pc["hier_leaves"]
        return pc

    leaf_params = {"d": doc_id, "cv": CHUNKER_VERSION}
    try:
        # Both toggles and the sanity check run before the single commit.
        deact = (await session.execute(text(
            "UPDATE document_chunks SET in_corpus=false"
            " WHERE doc_id=:d AND source_type='legacy' AND in_corpus=true"),
            {"d": doc_id})).rowcount
        act = (await session.execute(text(
            "UPDATE document_chunks SET in_corpus=true WHERE doc_id=:d AND source_type='hier_section'"
            " AND chunker_version=:cv AND is_leaf=true AND embedding IS NOT NULL AND in_corpus=false"),
            leaf_params)).rowcount
        # Count leaves that are active now (newly or already), so a repeated
        # replace with act == 0 is still recognised as a valid state.
        active_leaves = (await session.execute(text(
            "SELECT count(*) FROM document_chunks WHERE doc_id=:d AND source_type='hier_section'"
            " AND chunker_version=:cv AND is_leaf=true AND embedding IS NOT NULL AND in_corpus=true"),
            leaf_params)).scalar_one()
        swap_ok = active_leaves > 0
        if swap_ok:
            await session.commit()
    except Exception:
        # Guarantee "failure -> zero changes" and leave the session reusable.
        await session.rollback()
        raise

    if not swap_ok:
        # Legacy would be off with no hier leaf on: undo instead of committing
        # an empty corpus for this document.
        await session.rollback()
        pc.update({"action": "aborted", "reason": "no_hier_leaves_activated"})
        return pc

    pc.update({"action": "replaced", "legacy_deactivated": deact, "hier_activated": act})
    return pc
async def rollback_doc_corpus(session: AsyncSession, doc_id: int) -> dict:
    """Reverse a corpus swap for one document.

    Puts the document's legacy chunks back into the corpus, takes its
    hier_section chunks out of it, and commits the session.

    Args:
        session: Async SQLAlchemy session; committed by this function.
        doc_id: Document whose corpus toggle is reversed.

    Returns:
        A dict with ``doc_id``, ``action`` (``"rolled_back"``),
        ``legacy_reactivated`` and ``hier_deactivated`` (row counts of the two
        UPDATE statements).
    """
    bind = {"d": doc_id}
    restore_legacy_sql = text(
        "UPDATE document_chunks SET in_corpus=true WHERE doc_id=:d AND source_type='legacy' AND in_corpus=false"
    )
    retire_hier_sql = text(
        "UPDATE document_chunks SET in_corpus=false WHERE doc_id=:d AND source_type='hier_section' AND in_corpus=true"
    )

    # Legacy first, then hier; each row count is read right after its statement.
    legacy_result = await session.execute(restore_legacy_sql, bind)
    reactivated = legacy_result.rowcount

    hier_result = await session.execute(retire_hier_sql, bind)
    deactivated = hier_result.rowcount

    await session.commit()

    outcome = {"doc_id": doc_id, "action": "rolled_back"}
    outcome["legacy_reactivated"] = reactivated
    outcome["hier_deactivated"] = deactivated
    return outcome