From fab3c81a0feedbe76994232c585dee38ddf6fcc4 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Wed, 8 Apr 2026 11:51:06 +0900 Subject: [PATCH] =?UTF-8?q?fix(search):=20Phase=201.2-B=20UNION=20?= =?UTF-8?q?=EB=B6=84=ED=95=B4=EB=A1=9C=20trigram/FTS=20=EC=9D=B8=EB=8D=B1?= =?UTF-8?q?=EC=8A=A4=20=EA=B0=95=EC=A0=9C=20=ED=99=9C=EC=9A=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EXPLAIN 진단: OR 통합 WHERE는 PostgreSQL planner가 인덱스 결합 못 함 (small table 765 docs라 Seq Scan 선택). Filter 524ms. 해결: WHERE OR을 CTE candidates UNION으로 분해. - title trigram → idx_documents_title_trgm (0.5ms) - ai_summary trigram → idx_documents_ai_summary_trgm (length>0 매치 추가) - FTS @@ → idx_documents_fts_full (0.05ms) EXPLAIN 측정: 525ms → 26ms (95% 감소). 본 SELECT(similarity 가중 합산 + ORDER BY) 추가하면 100~150ms 예상. --- app/services/search/retrieval_service.py | 109 +++++++++++++---------- 1 file changed, 63 insertions(+), 46 deletions(-) diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py index a628f49..13b7f69 100644 --- a/app/services/search/retrieval_service.py +++ b/app/services/search/retrieval_service.py @@ -24,65 +24,82 @@ if TYPE_CHECKING: async def search_text( session: AsyncSession, query: str, limit: int ) -> list["SearchResult"]: - """FTS + trigram 필드별 가중치 검색 (Phase 1.2-B). + """FTS + trigram 필드별 가중치 검색 (Phase 1.2-B UNION 분해). - WHERE: 인덱스 있는 trigram 컬럼(title, ai_summary)으로 후보 필터 + FTS 통합 인덱스 - - idx_documents_title_trgm - - idx_documents_ai_summary_trgm - - idx_documents_fts_full (title + ai_tags + ai_summary + user_note + extracted_text) - - extracted_text는 trigram threshold 0.3에서 매우 낮은 similarity → WHERE에선 FTS만 - ORDER BY: 5개 컬럼 similarity 가중 합산 + ts_rank * 2.0 + Phase 1.2-B 진단: + OR로 묶은 단일 SELECT는 PostgreSQL planner가 OR 결합 인덱스를 못 만들고 + Seq Scan을 선택 (small table 765 docs). EXPLAIN으로 측정 시 525ms. + → CTE + UNION으로 분해하면 각 branch가 자기 인덱스 활용 → 26ms (95% 감소). + + 구조: + candidates CTE + ├─ title % → idx_documents_title_trgm + ├─ ai_summary % → idx_documents_ai_summary_trgm + │ (length > 0 partial index 매치 조건 포함) + └─ FTS @@ plainto_tsquery → idx_documents_fts_full + JOIN documents d ON d.id = c.id + ORDER BY 5컬럼 similarity 가중 합산 + ts_rank * 2.0 가중치: title 3.0 / ai_tags 2.5 / user_note 2.0 / ai_summary 1.5 / extracted_text 1.0 """ from api.search import SearchResult # 순환 import 회피 result = await session.execute( text(""" - SELECT id, title, ai_domain, ai_summary, file_format, - left(extracted_text, 200) AS snippet, - ( - -- 컬럼별 trigram similarity 가중 합산 - similarity(coalesce(title, ''), :q) * 3.0 - + similarity(coalesce(ai_tags::text, ''), :q) * 2.5 - + similarity(coalesce(user_note, ''), :q) * 2.0 - + similarity(coalesce(ai_summary, ''), :q) * 1.5 - + similarity(coalesce(extracted_text, ''), :q) * 1.0 - -- FTS 보너스 (idx_documents_fts_full 활용) - + coalesce(ts_rank( - to_tsvector('simple', - coalesce(title, '') || ' ' || - coalesce(ai_tags::text, '') || ' ' || - coalesce(ai_summary, '') || ' ' || - coalesce(user_note, '') || ' ' || - coalesce(extracted_text, '') - ), - plainto_tsquery('simple', :q) - ), 0) * 2.0 - ) AS score, - -- match_reason: similarity 가장 큰 컬럼 또는 FTS - CASE - WHEN similarity(coalesce(title, ''), :q) >= 0.3 THEN 'title' - WHEN similarity(coalesce(ai_tags::text, ''), :q) >= 0.3 THEN 'tags' - WHEN similarity(coalesce(user_note, ''), :q) >= 0.3 THEN 'note' - WHEN similarity(coalesce(ai_summary, ''), :q) >= 0.3 THEN 'summary' - WHEN similarity(coalesce(extracted_text, ''), :q) >= 0.3 THEN 'content' - ELSE 'fts' - END AS match_reason - FROM documents - WHERE deleted_at IS NULL - AND ( - -- trigram 후보 필터 (인덱스 있는 짧은 컬럼만) - title % :q - OR (ai_summary IS NOT NULL AND ai_summary % :q) - -- FTS 통합 인덱스 - OR to_tsvector('simple', + WITH candidates AS ( + -- title trigram (idx_documents_title_trgm) + SELECT id FROM documents + WHERE deleted_at IS NULL AND title %% :q + UNION + -- ai_summary trigram (idx_documents_ai_summary_trgm 부분 인덱스 매치) + SELECT id FROM documents + WHERE deleted_at IS NULL + AND ai_summary IS NOT NULL + AND length(ai_summary) > 0 + AND ai_summary %% :q + UNION + -- FTS 통합 인덱스 (idx_documents_fts_full) + SELECT id FROM documents + WHERE deleted_at IS NULL + AND to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(ai_tags::text, '') || ' ' || coalesce(ai_summary, '') || ' ' || coalesce(user_note, '') || ' ' || coalesce(extracted_text, '') ) @@ plainto_tsquery('simple', :q) - ) + ) + SELECT d.id, d.title, d.ai_domain, d.ai_summary, d.file_format, + left(d.extracted_text, 200) AS snippet, + ( + -- 컬럼별 trigram similarity 가중 합산 + similarity(coalesce(d.title, ''), :q) * 3.0 + + similarity(coalesce(d.ai_tags::text, ''), :q) * 2.5 + + similarity(coalesce(d.user_note, ''), :q) * 2.0 + + similarity(coalesce(d.ai_summary, ''), :q) * 1.5 + + similarity(coalesce(d.extracted_text, ''), :q) * 1.0 + -- FTS 보너스 (idx_documents_fts_full 활용) + + coalesce(ts_rank( + to_tsvector('simple', + coalesce(d.title, '') || ' ' || + coalesce(d.ai_tags::text, '') || ' ' || + coalesce(d.ai_summary, '') || ' ' || + coalesce(d.user_note, '') || ' ' || + coalesce(d.extracted_text, '') + ), + plainto_tsquery('simple', :q) + ), 0) * 2.0 + ) AS score, + -- match_reason: similarity 가장 큰 컬럼 또는 FTS + CASE + WHEN similarity(coalesce(d.title, ''), :q) >= 0.3 THEN 'title' + WHEN similarity(coalesce(d.ai_tags::text, ''), :q) >= 0.3 THEN 'tags' + WHEN similarity(coalesce(d.user_note, ''), :q) >= 0.3 THEN 'note' + WHEN similarity(coalesce(d.ai_summary, ''), :q) >= 0.3 THEN 'summary' + WHEN similarity(coalesce(d.extracted_text, ''), :q) >= 0.3 THEN 'content' + ELSE 'fts' + END AS match_reason + FROM documents d + JOIN candidates c ON d.id = c.id ORDER BY score DESC LIMIT :limit """),