ops(hier): Phase A law/library decompose + snapshot freeze (replace-diagnose c3)
47 eval-target undecomposed non-news docs (law21+library24+document2) 분해+임베딩 (--skip-analysis, additive). 1005 leaf 생성 fail0, in_corpus 634 무손상 검증. snapshot doc_id_max=25912 chunk_id_max=71164 docs_decomposed 301->348. 측정 drift 0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,42 @@
{
  "version": "v0.2-hier-replace",
  "label": "snapshot_after_phaseA",
  "date": "2026-05-25",
  "plan": "hier-hazy-waffle.md (PR-DocSrv-Hier-Replace-Diagnose-1)",
  "purpose": "Corpus drift freeze for hier vs prehier (legacy) go/no-go measurement. Frozen AFTER Phase A law/library decomposition so the 47 newly-decomposed eval-target docs' leaves are included.",
  "snapshot": {
    "doc_id_max": 25912,
    "chunk_id_max": 71164,
    "documents_n": 22097,
    "prehier_chunks_n": 31327,
    "hier_leaves_emb": 12697,
    "docs_decomposed": 348,
    "hier_in_corpus": 634
  },
  "phase_a": {
    "newly_decomposed_docs": 47,
    "leaves_created": 1005,
    "null_embedding_leaves": 9,
    "skip_analysis": true,
    "in_corpus_unchanged": "634 (invariant preserved)",
    "doc_ids": [3776,3853,3854,3855,3856,3865,3867,3868,3878,3879,3886,3887,3888,3897,3916,3917,3920,3921,3980,3981,3982,4041,5205,11495,11496,11500,11503,11504,11505,11514,11515,11591,11617,11620,11625,11627,11644,11645,11646,11647,11669,11689,11691,11711,11712,13305,13652],
    "note": "47 = eval-target undecomposed non-news docs (law 21 + library 24 + document 2). DOC_MIN_CHARS=4000 gate bypassed via --doc; section analysis deferred (Mac mini, separate axis)."
  },
  "eval_set": {
    "total_cases": 51,
    "target_docs": 101,
    "target_docs_decomposed_after_phaseA": "33 + 47 (news 21 + single-chunk excluded by design)"
  },
  "model_config": {
    "embedding": "BAAI/bge-m3 (production)",
    "reranker": "BAAI/bge-reranker-v2-m3",
    "search_mode": "hybrid",
    "knn": "exact (enable_indexscan/bitmapscan=off in eval) — isolates chunking from ivfflat approximation"
  },
  "variants": {
    "prehier": "document_chunks WHERE source_type IS DISTINCT FROM 'hier_section' (legacy 30952 + null-source 375)",
    "hier_sim_raw": "post-replace simulation: hier leaf if doc decomposed else prehier (per-doc fallback), current builder output incl tiny leaves",
    "hier_sim_clean": "raw + exclude childless leaves with length(trim(text)) < 30 (A1 held-out finding, measured as variable vs raw)"
  },
  "baseline_reference": "PR-Eval-V0_2 production baseline graded NDCG 0.659 (v0_2_baseline_2026-05-23.json) — harness sanity target (production ivfflat path)"
}
||||
Reference in New Issue
Block a user