feat(search): corpus_variant + exact_knn measurement dispatch (replace-diagnose c4+c5)
PR-DocSrv-Hier-Replace-Diagnose-1 c4+c5. hier vs prehier(legacy) go/no-go 비파괴 측정 hook.

- 측정 뷰 3종 (hier_measure_views.sql, additive/droppable): corpus_chunks_prehier (legacy+null-source 375 포함) / hier_sim_raw / hier_sim_clean (childless-tiny<30 제외, all-tiny doc 은 legacy fallback 정합).
- retrieval_service: _resolve_corpus_variant + CORPUS_VARIANT_MAP + _VALID_CHUNKS_TABLE 3 뷰 추가 + exact_knn(SET LOCAL enable_indexscan/bitmapscan=off, eval 전용). chunk leg 만 영향 (doc-level + fts/trgm = documents 무관). baseline/None path 회귀 0.
- search_pipeline.run_search + search.py: corpus_variant/exact_knn 전달, unknown→400, embedding_backend cand 와 동시 사용 금지(400).
- run_eval: --corpus-variant + --exact-knn flag.
- tests/test_corpus_variant.py 22 PASS (resolver/map/allowlist + SQL injection 거부).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
-- PR-DocSrv-Hier-Replace-Diagnose-1 c4: measurement-only views (additive, droppable, independent of in_corpus)
|
||||
-- prehier = pre-hier baseline (legacy + null-source). hier_sim_* = post-replace simulation (per-document fallback).
|
||||
-- clean = excludes childless-tiny (<30 chars) leaves (found in the A1 held-out set). kept-leaf = is_leaf AND (trimmed length >= 30 OR has child).
|
||||
|
||||
-- Drop any earlier definitions of the three measurement views before they are
-- recreated below; IF EXISTS makes this a no-op when a view is absent.
DROP VIEW IF EXISTS corpus_chunks_prehier;
|
||||
DROP VIEW IF EXISTS corpus_chunks_hier_sim_raw;
|
||||
DROP VIEW IF EXISTS corpus_chunks_hier_sim_clean;
|
||||
|
||||
-- Pre-hier baseline view: every document_chunks row that has an embedding and
-- whose source_type is not 'hier_section'.
-- IS DISTINCT FROM (rather than <>) also keeps rows whose source_type is NULL.
CREATE VIEW corpus_chunks_prehier AS
|
||||
SELECT * FROM document_chunks
|
||||
WHERE source_type IS DISTINCT FROM 'hier_section' AND embedding IS NOT NULL;
|
||||
|
||||
-- Post-replace simulation (raw): embedded rows only, chosen per document.
--   * 'hier_section' rows are kept when is_leaf is true.
--   * non-hier rows (including NULL source_type) are kept only as a fallback,
--     i.e. when the same doc_id has no embedded 'hier_section' leaf at all.
CREATE VIEW corpus_chunks_hier_sim_raw AS
|
||||
SELECT * FROM document_chunks dc
|
||||
WHERE dc.embedding IS NOT NULL AND (
|
||||
-- hier leaf rows
(dc.source_type = 'hier_section' AND dc.is_leaf = true)
|
||||
-- legacy fallback: only when the document has no embedded hier leaf
OR (dc.source_type IS DISTINCT FROM 'hier_section'
|
||||
AND NOT EXISTS (SELECT 1 FROM document_chunks h
|
||||
WHERE h.doc_id = dc.doc_id AND h.source_type = 'hier_section'
|
||||
AND h.is_leaf = true AND h.embedding IS NOT NULL))
|
||||
);
|
||||
|
||||
-- Post-replace simulation (clean): like the raw simulation, but a hier leaf is
-- "kept" only if its trimmed text is at least 30 characters long OR some row
-- references it through parent_id (i.e. it is not a childless-tiny leaf).
-- The same kept-leaf predicate is repeated inside the fallback's NOT EXISTS so
-- that a document whose hier leaves are all childless-tiny falls back to its
-- non-hier rows instead of disappearing from the view.
CREATE VIEW corpus_chunks_hier_sim_clean AS
|
||||
SELECT * FROM document_chunks dc
|
||||
WHERE dc.embedding IS NOT NULL AND (
|
||||
-- kept hier leaf: is_leaf AND NOT childless-tiny
|
||||
(dc.source_type = 'hier_section' AND dc.is_leaf = true
|
||||
AND (length(trim(dc.text)) >= 30
|
||||
OR EXISTS (SELECT 1 FROM document_chunks ch WHERE ch.parent_id = dc.id)))
|
||||
-- legacy fallback: only when the doc has no kept (clean) hier leaf at all
|
||||
OR (dc.source_type IS DISTINCT FROM 'hier_section'
|
||||
AND NOT EXISTS (SELECT 1 FROM document_chunks h
|
||||
WHERE h.doc_id = dc.doc_id AND h.source_type = 'hier_section'
|
||||
AND h.is_leaf = true AND h.embedding IS NOT NULL
|
||||
AND (length(trim(h.text)) >= 30
|
||||
OR EXISTS (SELECT 1 FROM document_chunks ch2 WHERE ch2.parent_id = h.id))))
|
||||
);
|
||||
@@ -243,6 +243,8 @@ async def call_search(
|
||||
snapshot_chunk_id_max: int | None = None,
|
||||
reranker_backend: str | None = None,
|
||||
rewrite_backend: str | None = None,
|
||||
corpus_variant: str | None = None,
|
||||
exact_knn: bool = False,
|
||||
) -> tuple[list[int], float]:
|
||||
"""검색 API 호출 → (doc_ids, latency_ms)."""
|
||||
url = f"{base_url.rstrip('/')}/api/search/"
|
||||
@@ -264,6 +266,10 @@ async def call_search(
|
||||
params["reranker_backend"] = reranker_backend
|
||||
if rewrite_backend is not None:
|
||||
params["rewrite_backend"] = rewrite_backend
|
||||
if corpus_variant is not None:
|
||||
params["corpus_variant"] = corpus_variant
|
||||
if exact_knn:
|
||||
params["exact_knn"] = "true"
|
||||
|
||||
import time
|
||||
|
||||
@@ -296,6 +302,8 @@ async def evaluate(
|
||||
snapshot_chunk_id_max: int | None = None,
|
||||
reranker_backend: str | None = None,
|
||||
rewrite_backend: str | None = None,
|
||||
corpus_variant: str | None = None,
|
||||
exact_knn: bool = False,
|
||||
) -> list[QueryResult]:
|
||||
"""전체 쿼리셋 평가."""
|
||||
results: list[QueryResult] = []
|
||||
@@ -310,6 +318,8 @@ async def evaluate(
|
||||
snapshot_chunk_id_max=snapshot_chunk_id_max,
|
||||
reranker_backend=reranker_backend,
|
||||
rewrite_backend=rewrite_backend,
|
||||
corpus_variant=corpus_variant,
|
||||
exact_knn=exact_knn,
|
||||
)
|
||||
dedup_count = count_dedup(returned_ids, 10)
|
||||
if dedup_count > 0:
|
||||
@@ -1392,6 +1402,18 @@ def main() -> int:
|
||||
default=None,
|
||||
help="Phase 2Q Diagnose query rewrite dispatcher slug (baseline | cand_multi_query_macmini | cand_multi_query_macbook). 미지정 = single-query path. Phase 1B scaffold = variants 박제만, retrieval 합성은 Phase 2.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--corpus-variant",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["prehier", "hier_sim_raw", "hier_sim_clean"],
|
||||
help="Hier-Replace-Diagnose-1: chunk leg 측정 뷰 (prehier=legacy baseline | hier_sim_raw | hier_sim_clean). 미지정 = production corpus_chunks.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--exact-knn",
|
||||
action="store_true",
|
||||
help="Hier-Replace-Diagnose-1: vector leg exact KNN (ivfflat 근사 제거). prehier vs hier_sim 공정 비교용. eval 전용.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -1445,21 +1467,21 @@ def main() -> int:
|
||||
if args.base_url:
|
||||
print(f"\n>>> evaluating: {args.base_url}")
|
||||
results = asyncio.run(
|
||||
evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend)
|
||||
evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend, corpus_variant=args.corpus_variant, exact_knn=args.exact_knn)
|
||||
)
|
||||
print_summary("single", results, eval_version=args.eval_version)
|
||||
all_results.extend(results)
|
||||
else:
|
||||
print(f"\n>>> baseline: {args.baseline_url}")
|
||||
baseline_results = asyncio.run(
|
||||
evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend)
|
||||
evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend, corpus_variant=args.corpus_variant, exact_knn=args.exact_knn)
|
||||
)
|
||||
baseline_summary = print_summary("baseline", baseline_results, eval_version=args.eval_version)
|
||||
|
||||
print(f"\n>>> candidate: {args.candidate_url}")
|
||||
candidate_results = asyncio.run(
|
||||
evaluate(
|
||||
queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend
|
||||
queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, embedding_backend=args.embedding_backend, snapshot_doc_id_max=args.snapshot_doc_id_max, snapshot_chunk_id_max=args.snapshot_chunk_id_max, reranker_backend=args.reranker_backend, rewrite_backend=args.rewrite_backend, corpus_variant=args.corpus_variant, exact_knn=args.exact_knn
|
||||
)
|
||||
)
|
||||
candidate_summary = print_summary("candidate", candidate_results, eval_version=args.eval_version)
|
||||
|
||||
Reference in New Issue
Block a user