a7b16b63db
replace_doc_corpus(dry_run): G5 precond(doc-local embed 100% + parent 무결성 + leaf>0) 검증 후 단일 트랜잭션 atomic 교체(legacy in_corpus=false / hier leaf in_corpus=true, predicate=is_leaf AND embedding NOT NULL, node_type 미사용). 물리삭제 없음. rollback_doc_corpus 역토글. precond 미충족 시 변경 0(legacy 유지). tests/hier_decomp/test_corpus_isolation.py: in_corpus=false leaf 가 corpus_chunks 누출 0 단언 (부분 ivfflat + 뷰 이중 choke point 회귀 가드). c5: dry-run 3 pilot precond_ok(5140 158L→271leaf / 5186 381→199 / 5225 18→164), 격리 테스트 PASS. 실제 replace 는 c6(1-doc-first). plan: hierarchical-decomposition-tiered-nesting-marmot.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
73 lines
3.6 KiB
Python
73 lines
3.6 KiB
Python
"""Atomic per-document corpus replacement (PR-DocSrv-Hierarchical-Decomposition-1 c5/c6).

Swaps the search corpus from legacy window chunks to hier_section leaf chunks
by toggling the in_corpus flag.

- No physical deletion (only the in_corpus flag changes). The partial ivfflat
  index reflects the change automatically.
- G5 precondition (doc-local): hier leaf > 0 + every leaf has an embedding
  (doc-local 100%) + parent integrity (0 dangling).
- Single-transaction atomic. Failure / unmet precondition -> zero changes
  (legacy stays active).
- rollback: reverse in_corpus toggle (see rollback_doc_corpus below).
"""
|
|
from __future__ import annotations
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
# Chunker version tag. The activation UPDATE in replace_doc_corpus only flips
# hier_section leaves whose chunker_version column equals this value.
CHUNKER_VERSION = "hier-rule-v1"
|
|
|
|
|
|
async def precheck(session: AsyncSession, doc_id: int) -> dict:
    """Evaluate the G5 (doc-local) precondition for swapping one document's corpus.

    Runs a single read-only aggregate query over ``document_chunks`` for
    ``doc_id``; nothing is modified.

    The leaf counts are restricted to ``chunker_version = CHUNKER_VERSION`` so
    that the gate uses the same predicate as the activation UPDATE in
    ``replace_doc_corpus``. Without that filter the gate could pass on leaves
    that the activation step would never switch on.

    Args:
        session: Async SQLAlchemy session used to run the query.
        doc_id: Document whose chunks are inspected.

    Returns:
        A dict with the keys ``doc_id``, ``hier_leaves``,
        ``hier_leaves_embedded``, ``doc_local_embed_100``, ``legacy_active``,
        ``dangling_parent``, ``precond_ok`` and ``reason``. ``reason`` is
        ``None`` when the precondition holds, otherwise one of
        ``"no_hier_leaves"``, ``"embed_incomplete"`` or ``"dangling_parent"``.
    """
    stmt = text("""
        SELECT
          count(*) FILTER (WHERE source_type='hier_section' AND chunker_version=:cv
              AND is_leaf) AS hier_leaves,
          count(*) FILTER (WHERE source_type='hier_section' AND chunker_version=:cv
              AND is_leaf AND embedding IS NOT NULL) AS hier_leaves_emb,
          count(*) FILTER (WHERE source_type='legacy' AND in_corpus) AS legacy_active,
          count(*) FILTER (WHERE source_type='hier_section' AND parent_id IS NOT NULL
              AND parent_id NOT IN (SELECT id FROM document_chunks
                                    WHERE doc_id=:d AND source_type='hier_section')) AS dangling
        FROM document_chunks WHERE doc_id=:d""")
    row = (await session.execute(stmt, {"d": doc_id, "cv": CHUNKER_VERSION})).one()

    leaf_total = row.hier_leaves
    leaf_embedded = row.hier_leaves_emb
    # "doc-local 100%": at least one leaf exists and every leaf has an embedding.
    embed_complete = leaf_total > 0 and leaf_embedded == leaf_total
    precond_ok = embed_complete and row.dangling == 0

    # Report the first failing condition, in the same priority order as before.
    if precond_ok:
        reason = None
    elif leaf_total == 0:
        reason = "no_hier_leaves"
    elif not embed_complete:
        reason = "embed_incomplete"
    else:
        reason = "dangling_parent"

    return {
        "doc_id": doc_id,
        "hier_leaves": leaf_total,
        "hier_leaves_embedded": leaf_embedded,
        "doc_local_embed_100": embed_complete,
        "legacy_active": row.legacy_active,
        "dangling_parent": row.dangling,
        "precond_ok": precond_ok,
        "reason": reason,
    }
|
|
|
|
|
|
async def replace_doc_corpus(session: AsyncSession, doc_id: int, *, dry_run: bool = True) -> dict:
    """Swap one document's search corpus from legacy chunks to hier leaves.

    The swap only toggles ``in_corpus``; no rows are deleted. Both UPDATEs run
    in the session's current transaction and are committed together.

    Args:
        session: Async SQLAlchemy session. This function commits (or rolls
            back) the session's transaction itself.
        doc_id: Document whose corpus is replaced.
        dry_run: When true (the default), only report what would change.

    Returns:
        The ``precheck`` dict extended with ``dry_run`` and ``action``:

        * ``action == "aborted"``: the precondition failed, or the activation
          step found no leaf to activate; nothing was changed.
        * ``action == "dry_run"``: adds ``would_deactivate_legacy`` and
          ``would_activate_hier_leaves``; nothing was changed.
        * ``action == "replaced"``: adds ``legacy_deactivated`` and
          ``hier_activated`` (row counts of the two UPDATEs).

    Raises:
        Exception: Any error raised while executing or committing is re-raised
            after the transaction has been rolled back.
    """
    pc = await precheck(session, doc_id)
    pc["dry_run"] = dry_run

    if not pc["precond_ok"]:
        pc["action"] = "aborted"
        return pc

    if dry_run:
        pc["action"] = "dry_run"
        pc["would_deactivate_legacy"] = pc["legacy_active"]
        pc["would_activate_hier_leaves"] = pc["hier_leaves"]
        return pc

    hier_params = {"d": doc_id, "cv": CHUNKER_VERSION}
    try:
        # Atomic swap: both UPDATEs share one transaction.
        deact = (await session.execute(
            text("UPDATE document_chunks SET in_corpus=false"
                 " WHERE doc_id=:d AND source_type='legacy' AND in_corpus=true"),
            {"d": doc_id},
        )).rowcount
        act = (await session.execute(
            text("UPDATE document_chunks SET in_corpus=true"
                 " WHERE doc_id=:d AND source_type='hier_section'"
                 " AND chunker_version=:cv AND is_leaf=true"
                 " AND embedding IS NOT NULL AND in_corpus=false"),
            hier_params,
        )).rowcount

        if act == 0:
            # Zero rows activated is fine on an idempotent re-run (leaves are
            # already active), but if no matching leaf is active either,
            # committing would leave the document with no corpus entries.
            already_active = (await session.execute(
                text("SELECT count(*) FROM document_chunks"
                     " WHERE doc_id=:d AND source_type='hier_section'"
                     " AND chunker_version=:cv AND is_leaf=true"
                     " AND embedding IS NOT NULL AND in_corpus=true"),
                hier_params,
            )).scalar_one()
            if already_active == 0:
                # Undo the legacy deactivation so legacy stays active.
                await session.rollback()
                pc.update({
                    "precond_ok": False,
                    "action": "aborted",
                    "reason": "no_activatable_hier_leaves",
                })
                return pc

        await session.commit()
    except Exception:
        # Failure must leave zero changes and a usable session.
        await session.rollback()
        raise

    pc.update({"action": "replaced", "legacy_deactivated": deact, "hier_activated": act})
    return pc
|
|
|
|
|
|
async def rollback_doc_corpus(session: AsyncSession, doc_id: int) -> dict:
    """Reverse the corpus swap for one document (legacy back on, hier off).

    Sets ``in_corpus=true`` on every legacy chunk of ``doc_id`` that is
    currently inactive, and ``in_corpus=false`` on every active
    ``hier_section`` chunk of ``doc_id``, then commits. Note that it
    reactivates all inactive legacy chunks for the document, not only those a
    previous replacement switched off.

    Args:
        session: Async SQLAlchemy session. This function commits (or rolls
            back) the session's transaction itself.
        doc_id: Document whose corpus is restored.

    Returns:
        A dict with ``doc_id``, ``action`` (always ``"rolled_back"``),
        ``legacy_reactivated`` and ``hier_deactivated`` (row counts of the two
        UPDATEs).

    Raises:
        Exception: Any error raised while executing or committing is re-raised
            after the transaction has been rolled back.
    """
    params = {"d": doc_id}
    try:
        reactivated = (await session.execute(
            text("UPDATE document_chunks SET in_corpus=true"
                 " WHERE doc_id=:d AND source_type='legacy' AND in_corpus=false"),
            params,
        )).rowcount
        deactivated = (await session.execute(
            text("UPDATE document_chunks SET in_corpus=false"
                 " WHERE doc_id=:d AND source_type='hier_section' AND in_corpus=true"),
            params,
        )).rowcount
        await session.commit()
    except Exception:
        # Keep the reverse toggle all-or-nothing and leave the session usable.
        await session.rollback()
        raise

    return {
        "doc_id": doc_id,
        "action": "rolled_back",
        "legacy_reactivated": reactivated,
        "hier_deactivated": deactivated,
    }
|