ops(hier): Phase A law/library decompose + snapshot freeze (replace-diagnose c3)

47 eval-target undecomposed non-news docs (law21+library24+document2) 분해+임베딩
(--skip-analysis, additive). 1005 leaf 생성 fail0, in_corpus 634 무손상 검증.
snapshot doc_id_max=25912 chunk_id_max=71164 docs_decomposed 301->348. 측정 drift 0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hyungi
2026-05-25 05:23:38 +00:00
parent fc9e0f1d8f
commit e860baa179
@@ -0,0 +1,42 @@
{
"version": "v0.2-hier-replace",
"label": "snapshot_after_phaseA",
"date": "2026-05-25",
"plan": "hier-hazy-waffle.md (PR-DocSrv-Hier-Replace-Diagnose-1)",
"purpose": "Corpus drift freeze for hier vs prehier (legacy) go/no-go measurement. Frozen AFTER Phase A law/library decomposition so the 47 newly-decomposed eval-target docs' leaves are included.",
"snapshot": {
"doc_id_max": 25912,
"chunk_id_max": 71164,
"documents_n": 22097,
"prehier_chunks_n": 31327,
"hier_leaves_emb": 12697,
"docs_decomposed": 348,
"hier_in_corpus": 634
},
"phase_a": {
"newly_decomposed_docs": 47,
"leaves_created": 1005,
"null_embedding_leaves": 9,
"skip_analysis": true,
"in_corpus_unchanged": "634 (invariant preserved)",
"doc_ids": [3776,3853,3854,3855,3856,3865,3867,3868,3878,3879,3886,3887,3888,3897,3916,3917,3920,3921,3980,3981,3982,4041,5205,11495,11496,11500,11503,11504,11505,11514,11515,11591,11617,11620,11625,11627,11644,11645,11646,11647,11669,11689,11691,11711,11712,13305,13652],
"note": "47 = eval-target non-news docs that were still undecomposed before Phase A (law 21 + library 24 + document 2). DOC_MIN_CHARS=4000 gate bypassed via --doc; section analysis deferred (Mac mini, separate axis)."
},
"eval_set": {
"total_cases": 51,
"target_docs": 101,
"target_docs_decomposed_after_phaseA": "33 + 47 = 80 of 101 target docs (news 21 + single-chunk excluded by design)"
},
"model_config": {
"embedding": "BAAI/bge-m3 (production)",
"reranker": "BAAI/bge-reranker-v2-m3",
"search_mode": "hybrid",
"knn": "exact (enable_indexscan=off and enable_bitmapscan=off in eval) — isolates chunking from ivfflat approximation"
},
"variants": {
"prehier": "document_chunks WHERE source_type IS DISTINCT FROM 'hier_section' (legacy 30952 + null-source 375)",
"hier_sim_raw": "post-replace simulation: hier leaf if doc decomposed else prehier (per-doc fallback), current builder output incl tiny leaves",
"hier_sim_clean": "raw + exclude childless leaves with length(trim(text)) < 30 (A1 held-out finding, measured as variable vs raw)"
},
"baseline_reference": "PR-Eval-V0_2 production baseline graded NDCG 0.659 (v0_2_baseline_2026-05-23.json) — harness sanity target (production ivfflat path)"
}