diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py
index 568ec4a..e690ef6 100644
--- a/app/services/search/retrieval_service.py
+++ b/app/services/search/retrieval_service.py
@@ -121,16 +121,24 @@ async def search_text(
 async def search_vector(
     session: AsyncSession, query: str, limit: int
 ) -> list["SearchResult"]:
-    """벡터 유사도 검색 — chunk-level (Phase 1.2-C).
+    """벡터 유사도 검색 — chunk-level + doc 다양성 보장 (Phase 1.2-C).
 
-    document_chunks 테이블에서 cosine similarity로 raw chunks 반환.
-    같은 doc에서 여러 chunks가 들어올 수 있음 (압축 안 함).
-    fusion 직전에 compress_chunks_to_docs() helper로 doc 기준 압축 필요.
-    Phase 1.3 reranker는 raw chunks를 그대로 활용.
+    Phase 1.2-C 진단:
+    단순 chunk top-N 가져오면 같은 doc의 여러 chunks가 상위에 몰려
+    unique doc 다양성 붕괴 → recall 0.788 → 0.531 (catastrophic).
 
-    SearchResult.id = doc_id (fusion 호환)
-    SearchResult.chunk_id / chunk_index / section_title = chunk 메타
-    snippet = chunk의 text 앞 200자
+    해결 (사용자 추천 C 방식):
+    Window function으로 doc_id 기준 PARTITION → 각 doc의 top 2 chunks만 반환.
+    raw_chunks(chunks_by_doc 보존)와 doc-level 압축 둘 다 만족.
+
+    SQL 흐름:
+    1. inner CTE: ivfflat 인덱스로 top-K chunks 빠르게 추출
+    2. ranked CTE: doc_id PARTITION 후 score 내림차순 ROW_NUMBER
+    3. outer: rn <= 2 (doc당 max 2 chunks) + JOIN documents
+
+    Returns:
+    list[SearchResult] — chunk-level, 각 doc 최대 2개. compress_chunks_to_docs로
+    doc-level 압축 + chunks_by_doc 보존.
     """
     from api.search import SearchResult  # 순환 import 회피
 
@@ -141,29 +149,49 @@ async def search_vector(
     except Exception:
         return []
 
-    # raw chunks를 doc 메타와 join. limit * 5 정도 넓게 → 압축 후 doc 다양성.
-    fetch_limit = limit * 5
+    # ivfflat 인덱스로 top-K chunks 추출 후 doc 단위 partition
+    # inner_k = limit * 10 정도로 충분 unique doc 확보 (~30~50 docs)
+    inner_k = max(limit * 10, 200)
     result = await session.execute(
         text("""
+            WITH topk AS (
+                SELECT
+                    c.id AS chunk_id,
+                    c.doc_id,
+                    c.chunk_index,
+                    c.section_title,
+                    c.text,
+                    c.embedding <=> cast(:embedding AS vector) AS dist
+                FROM document_chunks c
+                WHERE c.embedding IS NOT NULL
+                ORDER BY c.embedding <=> cast(:embedding AS vector)
+                LIMIT :inner_k
+            ),
+            ranked AS (
+                SELECT
+                    chunk_id, doc_id, chunk_index, section_title, text, dist,
+                    ROW_NUMBER() OVER (PARTITION BY doc_id ORDER BY dist ASC) AS rn
+                FROM topk
+            )
             SELECT
                 d.id AS id,
                 d.title AS title,
                 d.ai_domain AS ai_domain,
                 d.ai_summary AS ai_summary,
                 d.file_format AS file_format,
-                (1 - (c.embedding <=> cast(:embedding AS vector))) AS score,
-                left(c.text, 200) AS snippet,
+                (1 - r.dist) AS score,
+                left(r.text, 200) AS snippet,
                 'vector' AS match_reason,
-                c.id AS chunk_id,
-                c.chunk_index AS chunk_index,
-                c.section_title AS section_title
-            FROM document_chunks c
-            JOIN documents d ON d.id = c.doc_id
-            WHERE c.embedding IS NOT NULL AND d.deleted_at IS NULL
-            ORDER BY c.embedding <=> cast(:embedding AS vector)
+                r.chunk_id AS chunk_id,
+                r.chunk_index AS chunk_index,
+                r.section_title AS section_title
+            FROM ranked r
+            JOIN documents d ON d.id = r.doc_id
+            WHERE r.rn <= 2 AND d.deleted_at IS NULL
+            ORDER BY r.dist
             LIMIT :limit
         """),
-        {"embedding": str(query_embedding), "limit": fetch_limit},
+        {"embedding": str(query_embedding), "inner_k": inner_k, "limit": limit * 4},
     )
 
     return [SearchResult(**row._mapping) for row in result]