fix(search): Phase 1.2-B UNION 분해로 trigram/FTS 인덱스 강제 활용
EXPLAIN 진단: WHERE 절을 OR로 통합하면 PostgreSQL planner가 인덱스를 결합하지 못하고, 765 docs 규모의 작은 테이블이라 Seq Scan을 선택함 (Filter 524ms).
해결: WHERE의 OR 조건을 candidates CTE의 UNION으로 분해.
- title trigram → idx_documents_title_trgm (0.5ms)
- ai_summary trigram → idx_documents_ai_summary_trgm (부분 인덱스 매치를 위해 length > 0 조건 추가)
- FTS @@ → idx_documents_fts_full (0.05ms)
EXPLAIN 측정: 525ms → 26ms (95% 감소). 본 SELECT(similarity 가중 합산 + ORDER BY)를 추가하면 100~150ms 예상.
This commit is contained in:
@@ -24,65 +24,82 @@ if TYPE_CHECKING:
|
||||
async def search_text(
|
||||
session: AsyncSession, query: str, limit: int
|
||||
) -> list["SearchResult"]:
|
||||
"""FTS + trigram 필드별 가중치 검색 (Phase 1.2-B).
|
||||
"""FTS + trigram 필드별 가중치 검색 (Phase 1.2-B UNION 분해).
|
||||
|
||||
WHERE: 인덱스 있는 trigram 컬럼(title, ai_summary)으로 후보 필터 + FTS 통합 인덱스
|
||||
- idx_documents_title_trgm
|
||||
- idx_documents_ai_summary_trgm
|
||||
- idx_documents_fts_full (title + ai_tags + ai_summary + user_note + extracted_text)
|
||||
- extracted_text는 trigram threshold 0.3에서 매우 낮은 similarity → WHERE에선 FTS만
|
||||
ORDER BY: 5개 컬럼 similarity 가중 합산 + ts_rank * 2.0
|
||||
Phase 1.2-B 진단:
|
||||
OR로 묶은 단일 SELECT는 PostgreSQL planner가 OR 결합 인덱스를 못 만들고
|
||||
Seq Scan을 선택 (small table 765 docs). EXPLAIN으로 측정 시 525ms.
|
||||
→ CTE + UNION으로 분해하면 각 branch가 자기 인덱스 활용 → 26ms (95% 감소).
|
||||
|
||||
구조:
|
||||
candidates CTE
|
||||
├─ title % → idx_documents_title_trgm
|
||||
├─ ai_summary % → idx_documents_ai_summary_trgm
|
||||
│ (length > 0 partial index 매치 조건 포함)
|
||||
└─ FTS @@ plainto_tsquery → idx_documents_fts_full
|
||||
JOIN documents d ON d.id = c.id
|
||||
ORDER BY 5컬럼 similarity 가중 합산 + ts_rank * 2.0
|
||||
가중치: title 3.0 / ai_tags 2.5 / user_note 2.0 / ai_summary 1.5 / extracted_text 1.0
|
||||
"""
|
||||
from api.search import SearchResult # 순환 import 회피
|
||||
|
||||
result = await session.execute(
|
||||
text("""
|
||||
SELECT id, title, ai_domain, ai_summary, file_format,
|
||||
left(extracted_text, 200) AS snippet,
|
||||
(
|
||||
-- 컬럼별 trigram similarity 가중 합산
|
||||
similarity(coalesce(title, ''), :q) * 3.0
|
||||
+ similarity(coalesce(ai_tags::text, ''), :q) * 2.5
|
||||
+ similarity(coalesce(user_note, ''), :q) * 2.0
|
||||
+ similarity(coalesce(ai_summary, ''), :q) * 1.5
|
||||
+ similarity(coalesce(extracted_text, ''), :q) * 1.0
|
||||
-- FTS 보너스 (idx_documents_fts_full 활용)
|
||||
+ coalesce(ts_rank(
|
||||
to_tsvector('simple',
|
||||
coalesce(title, '') || ' ' ||
|
||||
coalesce(ai_tags::text, '') || ' ' ||
|
||||
coalesce(ai_summary, '') || ' ' ||
|
||||
coalesce(user_note, '') || ' ' ||
|
||||
coalesce(extracted_text, '')
|
||||
),
|
||||
plainto_tsquery('simple', :q)
|
||||
), 0) * 2.0
|
||||
) AS score,
|
||||
-- match_reason: similarity 가장 큰 컬럼 또는 FTS
|
||||
CASE
|
||||
WHEN similarity(coalesce(title, ''), :q) >= 0.3 THEN 'title'
|
||||
WHEN similarity(coalesce(ai_tags::text, ''), :q) >= 0.3 THEN 'tags'
|
||||
WHEN similarity(coalesce(user_note, ''), :q) >= 0.3 THEN 'note'
|
||||
WHEN similarity(coalesce(ai_summary, ''), :q) >= 0.3 THEN 'summary'
|
||||
WHEN similarity(coalesce(extracted_text, ''), :q) >= 0.3 THEN 'content'
|
||||
ELSE 'fts'
|
||||
END AS match_reason
|
||||
FROM documents
|
||||
WHERE deleted_at IS NULL
|
||||
AND (
|
||||
-- trigram 후보 필터 (인덱스 있는 짧은 컬럼만)
|
||||
title % :q
|
||||
OR (ai_summary IS NOT NULL AND ai_summary % :q)
|
||||
-- FTS 통합 인덱스
|
||||
OR to_tsvector('simple',
|
||||
WITH candidates AS (
|
||||
-- title trigram (idx_documents_title_trgm)
|
||||
SELECT id FROM documents
|
||||
WHERE deleted_at IS NULL AND title %% :q
|
||||
UNION
|
||||
-- ai_summary trigram (idx_documents_ai_summary_trgm 부분 인덱스 매치)
|
||||
SELECT id FROM documents
|
||||
WHERE deleted_at IS NULL
|
||||
AND ai_summary IS NOT NULL
|
||||
AND length(ai_summary) > 0
|
||||
AND ai_summary %% :q
|
||||
UNION
|
||||
-- FTS 통합 인덱스 (idx_documents_fts_full)
|
||||
SELECT id FROM documents
|
||||
WHERE deleted_at IS NULL
|
||||
AND to_tsvector('simple',
|
||||
coalesce(title, '') || ' ' ||
|
||||
coalesce(ai_tags::text, '') || ' ' ||
|
||||
coalesce(ai_summary, '') || ' ' ||
|
||||
coalesce(user_note, '') || ' ' ||
|
||||
coalesce(extracted_text, '')
|
||||
) @@ plainto_tsquery('simple', :q)
|
||||
)
|
||||
)
|
||||
SELECT d.id, d.title, d.ai_domain, d.ai_summary, d.file_format,
|
||||
left(d.extracted_text, 200) AS snippet,
|
||||
(
|
||||
-- 컬럼별 trigram similarity 가중 합산
|
||||
similarity(coalesce(d.title, ''), :q) * 3.0
|
||||
+ similarity(coalesce(d.ai_tags::text, ''), :q) * 2.5
|
||||
+ similarity(coalesce(d.user_note, ''), :q) * 2.0
|
||||
+ similarity(coalesce(d.ai_summary, ''), :q) * 1.5
|
||||
+ similarity(coalesce(d.extracted_text, ''), :q) * 1.0
|
||||
-- FTS 보너스 (idx_documents_fts_full 활용)
|
||||
+ coalesce(ts_rank(
|
||||
to_tsvector('simple',
|
||||
coalesce(d.title, '') || ' ' ||
|
||||
coalesce(d.ai_tags::text, '') || ' ' ||
|
||||
coalesce(d.ai_summary, '') || ' ' ||
|
||||
coalesce(d.user_note, '') || ' ' ||
|
||||
coalesce(d.extracted_text, '')
|
||||
),
|
||||
plainto_tsquery('simple', :q)
|
||||
), 0) * 2.0
|
||||
) AS score,
|
||||
-- match_reason: similarity 가장 큰 컬럼 또는 FTS
|
||||
CASE
|
||||
WHEN similarity(coalesce(d.title, ''), :q) >= 0.3 THEN 'title'
|
||||
WHEN similarity(coalesce(d.ai_tags::text, ''), :q) >= 0.3 THEN 'tags'
|
||||
WHEN similarity(coalesce(d.user_note, ''), :q) >= 0.3 THEN 'note'
|
||||
WHEN similarity(coalesce(d.ai_summary, ''), :q) >= 0.3 THEN 'summary'
|
||||
WHEN similarity(coalesce(d.extracted_text, ''), :q) >= 0.3 THEN 'content'
|
||||
ELSE 'fts'
|
||||
END AS match_reason
|
||||
FROM documents d
|
||||
JOIN candidates c ON d.id = c.id
|
||||
ORDER BY score DESC
|
||||
LIMIT :limit
|
||||
"""),
|
||||
|
||||
Reference in New Issue
Block a user