Files
hyungi_document_server/tests/search_eval/hier_measure_views.sql
T
hyungi 100aaa3b0c feat(search): corpus_variant + exact_knn measurement dispatch (replace-diagnose c4+c5)
PR-DocSrv-Hier-Replace-Diagnose-1 c4+c5. hier vs prehier(legacy) go/no-go 비파괴 측정 hook.
- 측정 뷰 3종 (hier_measure_views.sql, additive/droppable): corpus_chunks_prehier
  (legacy+null-source 375 포함) / hier_sim_raw / hier_sim_clean (childless-tiny<30 제외,
  all-tiny doc 은 legacy fallback 정합).
- retrieval_service: _resolve_corpus_variant + CORPUS_VARIANT_MAP + _VALID_CHUNKS_TABLE
  3 뷰 추가 + exact_knn(SET LOCAL enable_indexscan/bitmapscan=off, eval 전용).
  chunk leg 만 영향 (doc-level + fts/trgm = documents 무관). baseline/None path 회귀 0.
- search_pipeline.run_search + search.py: corpus_variant/exact_knn 전달, unknown→400,
  embedding_backend cand 와 동시 사용 금지(400).
- run_eval: --corpus-variant + --exact-knn flag.
- tests/test_corpus_variant.py 22 PASS (resolver/map/allowlist + SQL injection 거부).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 05:37:15 +00:00

38 lines
1.9 KiB
SQL

-- PR-DocSrv-Hier-Replace-Diagnose-1 c4: measurement-only views (additive, droppable, independent of in_corpus).
-- prehier = pre-hier baseline (legacy + null-source). hier_sim_* = post-replace simulation (per-document fallback).
-- clean = excludes childless-tiny (<30 chars) leaves (A1 held-out finding). kept-leaf = is_leaf AND (len>=30 OR has child).
-- Drop any earlier definitions first so the CREATE VIEW statements that follow can be re-run;
-- IF EXISTS keeps a first run (when no view exists yet) from failing.
DROP VIEW IF EXISTS corpus_chunks_prehier;
DROP VIEW IF EXISTS corpus_chunks_hier_sim_raw;
DROP VIEW IF EXISTS corpus_chunks_hier_sim_clean;
-- Pre-hier baseline corpus: every embedded chunk that is not a 'hier_section' chunk.
-- IS DISTINCT FROM treats a NULL source_type as "not hier_section", so null-source
-- rows are kept alongside the other non-hier rows.
CREATE VIEW corpus_chunks_prehier AS
SELECT legacy.*
FROM document_chunks AS legacy
WHERE legacy.embedding IS NOT NULL
    AND legacy.source_type IS DISTINCT FROM 'hier_section';
-- Raw post-replace simulation. An embedded chunk is kept when either
--   (a) it is a 'hier_section' leaf, or
--   (b) it is a non-hier chunk (NULL source_type included) whose document has
--       no embedded 'hier_section' leaf at all (document-level fallback).
CREATE VIEW corpus_chunks_hier_sim_raw AS
SELECT chunk.*
FROM document_chunks AS chunk
WHERE chunk.embedding IS NOT NULL
    AND (
        -- (a) hier leaf
        (
            chunk.is_leaf = true
            AND chunk.source_type = 'hier_section'
        )
        -- (b) non-hier fallback: only when the same doc_id has no embedded hier leaf
        OR (
            chunk.source_type IS DISTINCT FROM 'hier_section'
            AND NOT EXISTS (
                SELECT 1
                FROM document_chunks AS hier_leaf
                WHERE hier_leaf.doc_id = chunk.doc_id
                    AND hier_leaf.source_type = 'hier_section'
                    AND hier_leaf.is_leaf = true
                    AND hier_leaf.embedding IS NOT NULL
            )
        )
    );
-- Clean post-replace simulation. Same shape as the raw simulation, except that a
-- hier leaf only counts as "kept" when its trimmed text is at least 30 characters
-- long OR some row references it through parent_id. Non-hier chunks are used as a
-- fallback only for documents that have no kept hier leaf.
CREATE VIEW corpus_chunks_hier_sim_clean AS
SELECT chunk.*
FROM document_chunks AS chunk
WHERE chunk.embedding IS NOT NULL
    AND (
        -- kept hier leaf: is_leaf AND NOT childless-tiny
        (
            chunk.is_leaf = true
            AND chunk.source_type = 'hier_section'
            AND (
                length(trim(chunk.text)) >= 30
                OR EXISTS (
                    SELECT 1
                    FROM document_chunks AS child
                    WHERE child.parent_id = chunk.id
                )
            )
        )
        -- non-hier fallback: only when the document has no kept (clean) hier leaf
        OR (
            chunk.source_type IS DISTINCT FROM 'hier_section'
            AND NOT EXISTS (
                SELECT 1
                FROM document_chunks AS kept_leaf
                WHERE kept_leaf.doc_id = chunk.doc_id
                    AND kept_leaf.source_type = 'hier_section'
                    AND kept_leaf.is_leaf = true
                    AND kept_leaf.embedding IS NOT NULL
                    AND (
                        length(trim(kept_leaf.text)) >= 30
                        OR EXISTS (
                            SELECT 1
                            FROM document_chunks AS kept_child
                            WHERE kept_child.parent_id = kept_leaf.id
                        )
                    )
            )
        )
    );