Files
hyungi_document_server/app/services/search/retrieval_service.py
Hyungi Ahn f9af8dd355 fix(search): trigram threshold 0.3 → 0.15 (set_limit)
Phase 1.2-B 평가셋 결과 recall 0.788 → 0.750 회귀.
원인: trigram default threshold 0.3이 multi-token 쿼리에서 너무 엄격.

예: '이란 미국 전쟁 글로벌 반응' 같은 5단어 한국어 뉴스 쿼리는
title/ai_summary trigram 매칭이 거의 안 됨.

해결: search_text 시작 시 set_limit(0.15) 호출.
- trigram 매칭 더 관대 (recall ↑)
- precision은 ORDER BY similarity 가중 합산이 보정
- p95 latency 169ms 여유 충분 (목표 500ms)
2026-04-08 11:58:41 +09:00

152 lines
6.5 KiB
Python

"""검색 후보 수집 서비스 (Phase 1.2).
Returns text (documents FTS + trigram) and vector (documents.embedding → chunks)
candidates as lists of SearchResult.

Phase 1.1a: moved _search_text/_search_vector here from search.py (ILIKE unchanged).
Phase 1.2-B: ILIKE → trigram `%` + `similarity()`; removes the ILIKE full scan.
After Phase 1.2-B (planned): switch vector retrieval to the document_chunks table;
search_vector currently still queries documents.embedding.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from ai.client import AIClient
if TYPE_CHECKING:
from api.search import SearchResult
async def search_text(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Collect text-search candidates with trigram + FTS and rank them by weighted similarity.

    Query structure (Phase 1.2-B UNION decomposition):
        candidates CTE
          ├─ title % :q                → idx_documents_title_trgm
          ├─ ai_summary % :q           → idx_documents_ai_summary_trgm
          │    (repeats the partial index's ``length > 0`` predicate)
          └─ FTS @@ plainto_tsquery    → idx_documents_fts_full
        JOIN documents d ON d.id = c.id
        ORDER BY weighted sum of five per-column similarities + ts_rank * 2.0

    Why a UNION: per the Phase 1.2-B diagnosis, a single SELECT with the three
    predicates OR-ed together made the planner pick a Seq Scan (measured at
    525 ms on a 765-document table); splitting into UNION branches lets each
    branch use its own index (measured at 26 ms).

    Score weights: title 3.0 / ai_tags 2.5 / user_note 2.0 / ai_summary 1.5 /
    extracted_text 1.0, plus ``ts_rank * 2.0`` as an FTS bonus.

    ``match_reason`` is the first column, checked in the order title, tags,
    note, summary, content, whose similarity is >= 0.3; otherwise ``'fts'``.
    Note that this 0.3 cut-off is independent of the 0.15 candidate threshold
    below, so a row admitted by a trigram match between 0.15 and 0.3 is
    reported as ``'fts'``.

    Trigram threshold: the pg_trgm default of 0.3 was too strict for
    multi-token Korean news queries (e.g. "이란 미국 전쟁 글로벌 반응"); too few
    candidates were collected and recall dropped from 0.788 to 0.750. The
    threshold is lowered to 0.15 to recover recall; precision is left to the
    similarity-weighted ORDER BY.

    Args:
        session: Async SQLAlchemy session used for both statements.
        query: Raw user query, bound as ``:q``.
        limit: Maximum number of rows to return.

    Returns:
        One SearchResult per row, ordered by descending score.
    """
    from api.search import SearchResult  # local import avoids a circular import

    # Lower the `%` operator threshold for this transaction only.
    # set_limit() is deprecated in pg_trgm and changes the setting for the whole
    # database session, so on a pooled connection the 0.15 value would leak into
    # unrelated later queries. set_config(..., is_local => true) is reverted at
    # transaction end.
    # NOTE(review): relies on both execute() calls running in the same
    # transaction (the AsyncSession autobegin default) — confirm the engine is
    # not configured for driver-level AUTOCOMMIT.
    await session.execute(
        text("SELECT set_config('pg_trgm.similarity_threshold', '0.15', true)")
    )

    result = await session.execute(
        text("""
            WITH candidates AS (
                -- title trigram (idx_documents_title_trgm)
                SELECT id FROM documents
                WHERE deleted_at IS NULL AND title % :q
                UNION
                -- ai_summary trigram (idx_documents_ai_summary_trgm 부분 인덱스 매치)
                SELECT id FROM documents
                WHERE deleted_at IS NULL
                AND ai_summary IS NOT NULL
                AND length(ai_summary) > 0
                AND ai_summary % :q
                UNION
                -- FTS 통합 인덱스 (idx_documents_fts_full)
                SELECT id FROM documents
                WHERE deleted_at IS NULL
                AND to_tsvector('simple',
                    coalesce(title, '') || ' ' ||
                    coalesce(ai_tags::text, '') || ' ' ||
                    coalesce(ai_summary, '') || ' ' ||
                    coalesce(user_note, '') || ' ' ||
                    coalesce(extracted_text, '')
                ) @@ plainto_tsquery('simple', :q)
            )
            SELECT d.id, d.title, d.ai_domain, d.ai_summary, d.file_format,
                left(d.extracted_text, 200) AS snippet,
                (
                    -- 컬럼별 trigram similarity 가중 합산
                    similarity(coalesce(d.title, ''), :q) * 3.0
                    + similarity(coalesce(d.ai_tags::text, ''), :q) * 2.5
                    + similarity(coalesce(d.user_note, ''), :q) * 2.0
                    + similarity(coalesce(d.ai_summary, ''), :q) * 1.5
                    + similarity(coalesce(d.extracted_text, ''), :q) * 1.0
                    -- FTS 보너스 (idx_documents_fts_full 활용)
                    + coalesce(ts_rank(
                        to_tsvector('simple',
                            coalesce(d.title, '') || ' ' ||
                            coalesce(d.ai_tags::text, '') || ' ' ||
                            coalesce(d.ai_summary, '') || ' ' ||
                            coalesce(d.user_note, '') || ' ' ||
                            coalesce(d.extracted_text, '')
                        ),
                        plainto_tsquery('simple', :q)
                    ), 0) * 2.0
                ) AS score,
                -- match_reason: similarity 가장 큰 컬럼 또는 FTS
                CASE
                    WHEN similarity(coalesce(d.title, ''), :q) >= 0.3 THEN 'title'
                    WHEN similarity(coalesce(d.ai_tags::text, ''), :q) >= 0.3 THEN 'tags'
                    WHEN similarity(coalesce(d.user_note, ''), :q) >= 0.3 THEN 'note'
                    WHEN similarity(coalesce(d.ai_summary, ''), :q) >= 0.3 THEN 'summary'
                    WHEN similarity(coalesce(d.extracted_text, ''), :q) >= 0.3 THEN 'content'
                    ELSE 'fts'
                END AS match_reason
            FROM documents d
            JOIN candidates c ON d.id = c.id
            ORDER BY score DESC
            LIMIT :limit
        """),
        {"q": query, "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
async def search_vector(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Return the documents nearest to the query embedding (cosine distance).

    The query is embedded through AIClient, then documents are ordered by the
    pgvector ``<=>`` distance to that embedding; ``score`` is ``1 - distance``
    and ``match_reason`` is always ``'vector'``.

    A switch to the document_chunks table is planned for Phase 1.2; this
    function currently queries ``documents.embedding``.

    Args:
        session: Async SQLAlchemy session used to run the query.
        query: Raw user query to embed.
        limit: Maximum number of rows to return.

    Returns:
        One SearchResult per row, nearest first. An empty list if creating the
        client, embedding the query, or closing the client fails (best-effort:
        vector search must not break the overall search).
    """
    import logging  # function-scope import: the module defines no logger

    from api.search import SearchResult  # local import avoids a circular import

    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Always release the client, including when embed() raises;
            # previously close() was skipped on failure.
            await client.close()
    except Exception:
        # Deliberate best-effort: degrade to "no vector candidates", but leave
        # a trace so an embedding outage is diagnosable.
        logging.getLogger(__name__).warning(
            "query embedding failed; skipping vector search", exc_info=True
        )
        return []

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                (1 - (embedding <=> cast(:embedding AS vector))) AS score,
                left(extracted_text, 200) AS snippet,
                'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL AND deleted_at IS NULL
            ORDER BY embedding <=> cast(:embedding AS vector)
            LIMIT :limit
        """),
        # The embedding is passed as text and cast to vector in SQL.
        # NOTE(review): assumes str(query_embedding) yields a pgvector-compatible
        # literal such as "[0.1, 0.2, ...]" (i.e. embed() returns a list of
        # floats) — confirm against AIClient.embed.
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]