feat(search): Phase 1.2-C chunks 기반 vector retrieval + raw chunks 보존
retrieval_service.search_vector를 documents.embedding → document_chunks.embedding으로 전환.
fetch_limit = limit*5로 raw chunks를 넓게 가져온 후 doc 기준으로 압축.

신규: compress_chunks_to_docs(chunks, limit) → (doc_results, chunks_by_doc)
- doc_id별 best score chunk만 doc_results에 포함 (fusion 입력)
- 모든 raw chunks는 chunks_by_doc dict에 보존 (Phase 1.3 reranker용)
- '같은 doc 중복으로 RRF가 false boost' 방지

SearchResult: chunk_id / chunk_index / section_title optional 필드 추가.
- text 검색 결과는 None (doc-level)
- vector 검색 결과는 채워짐 (chunk-level)

search.py 흐름:
1. raw_chunks = await search_vector(...)
2. vector_results, chunks_by_doc = compress_chunks_to_docs(raw_chunks, limit)
3. fusion(text_results, vector_results) — doc 기준
4. (Phase 1.3) chunks_by_doc → reranker — chunk 기준

debug notes: raw=N compressed=M unique_docs=K로 흐름 검증.

데이터 의존: 재인덱싱(reindex_all_chunks.py 진행 중) 완료 후 평가셋으로 검증.
This commit is contained in:
@@ -16,7 +16,7 @@ from core.database import get_session
|
||||
from core.utils import setup_logger
|
||||
from models.user import User
|
||||
from services.search.fusion_service import DEFAULT_FUSION, get_strategy, normalize_display_scores
|
||||
from services.search.retrieval_service import search_text, search_vector
|
||||
from services.search.retrieval_service import compress_chunks_to_docs, search_text, search_vector
|
||||
from services.search_telemetry import (
|
||||
compute_confidence,
|
||||
compute_confidence_hybrid,
|
||||
@@ -30,7 +30,14 @@ router = APIRouter()
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
|
||||
id: int
|
||||
"""검색 결과 단일 행.
|
||||
|
||||
Phase 1.2-C: chunk-level vector retrieval 도입으로 chunk 메타 필드 추가.
|
||||
text 검색 결과는 chunk_id 등이 None (doc-level).
|
||||
vector 검색 결과는 chunk_id 등이 채워짐 (chunk-level).
|
||||
"""
|
||||
|
||||
id: int # doc_id (text/vector 공통)
|
||||
title: str | None
|
||||
ai_domain: str | None
|
||||
ai_summary: str | None
|
||||
@@ -38,6 +45,10 @@ class SearchResult(BaseModel):
|
||||
score: float
|
||||
snippet: str | None
|
||||
match_reason: str | None = None
|
||||
# Phase 1.2-C: chunk 메타 (vector 검색 시 채워짐)
|
||||
chunk_id: int | None = None
|
||||
chunk_index: int | None = None
|
||||
section_title: str | None = None
|
||||
|
||||
|
||||
# ─── Phase 0.4: 디버그 응답 스키마 ─────────────────────────
|
||||
@@ -99,16 +110,20 @@ async def search(
|
||||
timing: dict[str, float] = {}
|
||||
notes: list[str] = []
|
||||
text_results: list[SearchResult] = []
|
||||
vector_results: list[SearchResult] = []
|
||||
vector_results: list[SearchResult] = [] # doc-level (압축 후, fusion 입력)
|
||||
raw_chunks: list[SearchResult] = [] # chunk-level (raw, Phase 1.3 reranker용)
|
||||
chunks_by_doc: dict[int, list[SearchResult]] = {} # Phase 1.3 reranker용 보존
|
||||
|
||||
t_total = time.perf_counter()
|
||||
|
||||
if mode == "vector":
|
||||
t0 = time.perf_counter()
|
||||
vector_results = await search_vector(session, q, limit)
|
||||
raw_chunks = await search_vector(session, q, limit)
|
||||
timing["vector_ms"] = (time.perf_counter() - t0) * 1000
|
||||
if not vector_results:
|
||||
if not raw_chunks:
|
||||
notes.append("vector_search_returned_empty (AI client error or no embeddings)")
|
||||
# vector 단독 모드도 doc 압축해서 다양성 확보 (chunk 중복 방지)
|
||||
vector_results, chunks_by_doc = compress_chunks_to_docs(raw_chunks, limit)
|
||||
results = vector_results
|
||||
else:
|
||||
t0 = time.perf_counter()
|
||||
@@ -117,8 +132,14 @@ async def search(
|
||||
|
||||
if mode == "hybrid":
|
||||
t1 = time.perf_counter()
|
||||
vector_results = await search_vector(session, q, limit)
|
||||
raw_chunks = await search_vector(session, q, limit)
|
||||
timing["vector_ms"] = (time.perf_counter() - t1) * 1000
|
||||
|
||||
# chunk-level → doc-level 압축 (raw chunks는 chunks_by_doc에 보존)
|
||||
t1b = time.perf_counter()
|
||||
vector_results, chunks_by_doc = compress_chunks_to_docs(raw_chunks, limit)
|
||||
timing["compress_ms"] = (time.perf_counter() - t1b) * 1000
|
||||
|
||||
if not vector_results:
|
||||
notes.append("vector_search_returned_empty — text-only fallback")
|
||||
|
||||
@@ -127,6 +148,10 @@ async def search(
|
||||
results = strategy.fuse(text_results, vector_results, q, limit)
|
||||
timing["fusion_ms"] = (time.perf_counter() - t2) * 1000
|
||||
notes.append(f"fusion={strategy.name}")
|
||||
notes.append(
|
||||
f"chunks raw={len(raw_chunks)} compressed={len(vector_results)} "
|
||||
f"unique_docs={len(chunks_by_doc)}"
|
||||
)
|
||||
else:
|
||||
results = text_results
|
||||
|
||||
|
||||
Reference in New Issue
Block a user