migration 016: documents FTS 확장 + trigram 인덱스 (1.5초 빌드) - idx_documents_fts_full — title+ai_tags+ai_summary+user_note+extracted_text 통합 FTS - idx_documents_title_trgm — title 단독 trigram - idx_documents_extracted_text_trgm — 본문 trigram (NULL 제외) - idx_documents_ai_summary_trgm — AI 요약 trigram - CONCURRENTLY 불필요 (765 docs / 6.5MB) retrieval_service.search_text: ILIKE 완전 제거 → trigram % + similarity() - WHERE: title %, ai_summary %, FTS @@ (모두 인덱스 활용) - ORDER BY: 5컬럼 similarity 가중 합산 + ts_rank * 2.0 - 가중치 그대로 (title 3.0 / tags 2.5 / note 2.0 / summary 1.5 / extracted 1.0) - threshold default 0.3 (필요 시 set_limit으로 조정) 목표: text_ms 470ms → 100~200ms (ILIKE 풀스캔 제거 효과)
125 lines
5.2 KiB
Python
125 lines
5.2 KiB
Python
"""검색 후보 수집 서비스 (Phase 1.2).
|
|
|
|
text(documents FTS + trigram) + vector(documents.embedding → chunks) 후보를
|
|
SearchResult 리스트로 반환.
|
|
|
|
Phase 1.1a: search.py의 _search_text/_search_vector를 이전 (ILIKE 그대로).
|
|
Phase 1.2-B: ILIKE → trigram `%` + `similarity()`. ILIKE 풀 스캔 제거.
|
|
Phase 1.2-B 이후: vector retrieval을 document_chunks 테이블 기반으로 전환.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from ai.client import AIClient
|
|
|
|
if TYPE_CHECKING:
|
|
from api.search import SearchResult
|
|
|
|
|
|
async def search_text(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Collect text-search candidates via FTS + per-column trigram scoring (Phase 1.2-B).

    Candidate filter (WHERE), restricted to non-deleted documents:
      - ``title % :q`` and ``ai_summary % :q`` (trigram operator; intended to
        be served by idx_documents_title_trgm / idx_documents_ai_summary_trgm)
      - a 'simple' FTS match over title + ai_tags + ai_summary + user_note +
        extracted_text (intended to be served by idx_documents_fts_full)
      - extracted_text is not trigram-filtered: at the 0.3 threshold its
        similarity is very low, so it only participates through FTS.

    Ranking (ORDER BY score DESC): weighted sum of trigram similarities
    (title 3.0 / ai_tags 2.5 / user_note 2.0 / ai_summary 1.5 /
    extracted_text 1.0) plus ``ts_rank * 2.0``.

    ``match_reason`` is the first column, in the priority order title, tags,
    note, summary, content, whose similarity is >= 0.3; otherwise 'fts'.
    It is not necessarily the column with the highest similarity.
    NOTE(review): the 0.3 in the CASE is hard-coded, while ``%`` follows the
    session's pg_trgm threshold; they diverge if the threshold is changed.

    Args:
        session: Async SQLAlchemy session used to run the query.
        query: Raw user query, bound as ``:q``.
        limit: Maximum number of rows, bound as ``:limit``.

    Returns:
        One SearchResult per row, built from the row mapping (id, title,
        ai_domain, ai_summary, file_format, snippet, score, match_reason).
        An empty list for a blank query.
    """
    # A blank query yields no trigrams and no FTS lexemes, so with the default
    # threshold it cannot match any row; skip the database round trip.
    if not query or not query.strip():
        return []

    from api.search import SearchResult  # deferred to avoid a circular import

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   left(extracted_text, 200) AS snippet,
                   (
                     -- 컬럼별 trigram similarity 가중 합산
                     similarity(coalesce(title, ''), :q) * 3.0
                     + similarity(coalesce(ai_tags::text, ''), :q) * 2.5
                     + similarity(coalesce(user_note, ''), :q) * 2.0
                     + similarity(coalesce(ai_summary, ''), :q) * 1.5
                     + similarity(coalesce(extracted_text, ''), :q) * 1.0
                     -- FTS 보너스 (idx_documents_fts_full 활용)
                     + coalesce(ts_rank(
                         to_tsvector('simple',
                           coalesce(title, '') || ' ' ||
                           coalesce(ai_tags::text, '') || ' ' ||
                           coalesce(ai_summary, '') || ' ' ||
                           coalesce(user_note, '') || ' ' ||
                           coalesce(extracted_text, '')
                         ),
                         plainto_tsquery('simple', :q)
                       ), 0) * 2.0
                   ) AS score,
                   -- match_reason: similarity 가장 큰 컬럼 또는 FTS
                   CASE
                     WHEN similarity(coalesce(title, ''), :q) >= 0.3 THEN 'title'
                     WHEN similarity(coalesce(ai_tags::text, ''), :q) >= 0.3 THEN 'tags'
                     WHEN similarity(coalesce(user_note, ''), :q) >= 0.3 THEN 'note'
                     WHEN similarity(coalesce(ai_summary, ''), :q) >= 0.3 THEN 'summary'
                     WHEN similarity(coalesce(extracted_text, ''), :q) >= 0.3 THEN 'content'
                     ELSE 'fts'
                   END AS match_reason
            FROM documents
            WHERE deleted_at IS NULL
              AND (
                -- trigram 후보 필터 (인덱스 있는 짧은 컬럼만)
                title % :q
                OR (ai_summary IS NOT NULL AND ai_summary % :q)
                -- FTS 통합 인덱스
                OR to_tsvector('simple',
                     coalesce(title, '') || ' ' ||
                     coalesce(ai_tags::text, '') || ' ' ||
                     coalesce(ai_summary, '') || ' ' ||
                     coalesce(user_note, '') || ' ' ||
                     coalesce(extracted_text, '')
                   ) @@ plainto_tsquery('simple', :q)
              )
            ORDER BY score DESC
            LIMIT :limit
        """),
        {"q": query, "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
|
|
|
|
|
|
async def search_vector(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Collect vector-search candidates ordered by cosine distance.

    Embeds ``query`` with AIClient, then ranks non-deleted documents that have
    an embedding by ``embedding <=> :embedding``; ``score`` is
    ``1 - distance``. Planned to move to the document_chunks table in
    Phase 1.2; currently reads documents.embedding.

    Embedding is best-effort: if creating the client, embedding the query, or
    closing the client raises, the failure is logged and an empty list is
    returned so that callers can still use text results.

    Args:
        session: Async SQLAlchemy session used to run the query.
        query: Raw user query to embed.
        limit: Maximum number of rows, bound as ``:limit``.

    Returns:
        One SearchResult per row with ``match_reason`` set to 'vector', or an
        empty list when the embedding step fails.
    """
    import logging

    from api.search import SearchResult  # deferred to avoid a circular import

    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Always release the client, including when embed() raises.
            await client.close()
    except Exception:
        # Keep the best-effort contract, but leave a trace of the failure.
        logging.getLogger(__name__).warning(
            "search_vector: query embedding failed; returning no vector candidates",
            exc_info=True,
        )
        return []

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> cast(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet,
                   'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL AND deleted_at IS NULL
            ORDER BY embedding <=> cast(:embedding AS vector)
            LIMIT :limit
        """),
        # The embedding is bound as text and cast to vector in SQL.
        # Assumes str(query_embedding) is a valid pgvector literal, e.g. a
        # list of floats -- TODO confirm against AIClient.embed.
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
|