feat: 검색 전면 개편 — 필드별 가중치 + 벡터 합산 + match reason
검색 대상: title > ai_tags > user_note > ai_summary > extracted_text

- 필드별 가중치: title(3.0), tags(2.5), note(2.0), summary(1.5), text(1.0)
- 벡터 검색: 별도 쿼리로 분리, 결과 합산 (asyncpg 충돌 방지)
- match_reason: 어떤 필드에서 매칭됐는지 반환
- 중복 제거 + 점수 합산

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""하이브리드 검색 API — FTS + 트리그램 + 벡터"""
|
||||
"""하이브리드 검색 API — FTS + ILIKE + 벡터 (필드별 가중치)"""
|
||||
|
||||
from typing import Annotated
|
||||
|
||||
@@ -14,11 +14,6 @@ from models.user import User
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# 가중치 (초기값, 튜닝 가능)
|
||||
W_FTS = 0.4
|
||||
W_TRGM = 0.2
|
||||
W_VECTOR = 0.4
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
|
||||
id: int
|
||||
@@ -28,6 +23,7 @@ class SearchResult(BaseModel):
|
||||
file_format: str
|
||||
score: float
|
||||
snippet: str | None
|
||||
match_reason: str | None = None
|
||||
|
||||
|
||||
class SearchResponse(BaseModel):
|
||||
@@ -45,22 +41,16 @@ async def search(
|
||||
mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
|
||||
limit: int = Query(20, ge=1, le=100),
|
||||
):
|
||||
"""문서 검색
|
||||
|
||||
mode:
|
||||
- fts: PostgreSQL 전문검색 (GIN 인덱스)
|
||||
- trgm: 트리그램 부분매칭 (한국어 지원)
|
||||
- vector: 벡터 유사도 검색 (의미검색)
|
||||
- hybrid: FTS + 트리그램 + 벡터 결합 (기본)
|
||||
"""
|
||||
if mode == "fts":
|
||||
results = await _search_fts(session, q, limit)
|
||||
elif mode == "trgm":
|
||||
results = await _search_trgm(session, q, limit)
|
||||
elif mode == "vector":
|
||||
"""문서 검색 — FTS + ILIKE + 벡터 결합"""
|
||||
if mode == "vector":
|
||||
results = await _search_vector(session, q, limit)
|
||||
else:
|
||||
results = await _search_hybrid(session, q, limit)
|
||||
results = await _search_text(session, q, limit)
|
||||
|
||||
# hybrid: 벡터 결과도 합산
|
||||
if mode == "hybrid":
|
||||
vector_results = await _search_vector(session, q, limit)
|
||||
results = _merge_results(results, vector_results, limit)
|
||||
|
||||
return SearchResponse(
|
||||
results=results,
|
||||
@@ -70,68 +60,69 @@ async def search(
|
||||
)
|
||||
|
||||
|
||||
async def _search_fts(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
|
||||
"""PostgreSQL 전문검색 (GIN 인덱스)"""
|
||||
# simple 설정으로 한국어 토큰화 없이 공백 기반 분리
|
||||
async def _search_text(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
|
||||
"""FTS + ILIKE — 필드별 가중치 적용"""
|
||||
result = await session.execute(
|
||||
text("""
|
||||
SELECT id, title, ai_domain, ai_summary, file_format,
|
||||
ts_rank(
|
||||
to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
|
||||
plainto_tsquery('simple', :query)
|
||||
left(extracted_text, 200) AS snippet,
|
||||
(
|
||||
-- title 매칭 (가중치 최고)
|
||||
CASE WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 3.0 ELSE 0 END
|
||||
-- ai_tags 매칭 (가중치 높음)
|
||||
+ CASE WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 2.5 ELSE 0 END
|
||||
-- user_note 매칭 (가중치 높음)
|
||||
+ CASE WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 2.0 ELSE 0 END
|
||||
-- ai_summary 매칭 (가중치 중상)
|
||||
+ CASE WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 1.5 ELSE 0 END
|
||||
-- extracted_text 매칭 (가중치 중간)
|
||||
+ CASE WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 1.0 ELSE 0 END
|
||||
-- FTS 점수 (보너스)
|
||||
+ coalesce(ts_rank(
|
||||
to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
|
||||
plainto_tsquery('simple', :q)
|
||||
), 0) * 2.0
|
||||
) AS score,
|
||||
left(extracted_text, 200) AS snippet
|
||||
-- match reason
|
||||
CASE
|
||||
WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 'title'
|
||||
WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 'tags'
|
||||
WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 'note'
|
||||
WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 'summary'
|
||||
WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 'content'
|
||||
ELSE 'fts'
|
||||
END AS match_reason
|
||||
FROM documents
|
||||
WHERE to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
|
||||
@@ plainto_tsquery('simple', :query)
|
||||
WHERE coalesce(title, '') ILIKE '%%' || :q || '%%'
|
||||
OR coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%'
|
||||
OR coalesce(user_note, '') ILIKE '%%' || :q || '%%'
|
||||
OR coalesce(ai_summary, '') ILIKE '%%' || :q || '%%'
|
||||
OR coalesce(extracted_text, '') ILIKE '%%' || :q || '%%'
|
||||
OR to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
|
||||
@@ plainto_tsquery('simple', :q)
|
||||
ORDER BY score DESC
|
||||
LIMIT :limit
|
||||
"""),
|
||||
{"query": query, "limit": limit},
|
||||
)
|
||||
return [SearchResult(**row._mapping) for row in result]
|
||||
|
||||
|
||||
async def _search_trgm(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
|
||||
"""트리그램 부분매칭 + ILIKE fallback (한국어 지원)"""
|
||||
# threshold 낮춰서 한국어 매칭 향상
|
||||
await session.execute(text("SET pg_trgm.similarity_threshold = 0.1"))
|
||||
result = await session.execute(
|
||||
text("""
|
||||
SELECT id, title, ai_domain, ai_summary, file_format,
|
||||
GREATEST(
|
||||
similarity(coalesce(title, '') || ' ' || coalesce(extracted_text, ''), :query),
|
||||
CASE WHEN (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) ILIKE '%%' || :query || '%%'
|
||||
THEN 0.5 ELSE 0 END
|
||||
) AS score,
|
||||
left(extracted_text, 200) AS snippet
|
||||
FROM documents
|
||||
WHERE (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) %% :query
|
||||
OR (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) ILIKE '%%' || :query || '%%'
|
||||
ORDER BY score DESC
|
||||
LIMIT :limit
|
||||
"""),
|
||||
{"query": query, "limit": limit},
|
||||
{"q": query, "limit": limit},
|
||||
)
|
||||
return [SearchResult(**row._mapping) for row in result]
|
||||
|
||||
|
||||
async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
|
||||
"""벡터 유사도 검색 (코사인 거리)"""
|
||||
client = AIClient()
|
||||
try:
|
||||
client = AIClient()
|
||||
query_embedding = await client.embed(query)
|
||||
except Exception:
|
||||
return [] # GPU 서버 불가 시 빈 결과
|
||||
finally:
|
||||
await client.close()
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
# pgvector 코사인 거리 (0=동일, 2=반대)
|
||||
result = await session.execute(
|
||||
text("""
|
||||
SELECT id, title, ai_domain, ai_summary, file_format,
|
||||
(1 - (embedding <=> :embedding::vector)) AS score,
|
||||
left(extracted_text, 200) AS snippet
|
||||
left(extracted_text, 200) AS snippet,
|
||||
'vector' AS match_reason
|
||||
FROM documents
|
||||
WHERE embedding IS NOT NULL
|
||||
ORDER BY embedding <=> :embedding::vector
|
||||
@@ -142,28 +133,33 @@ async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[
|
||||
return [SearchResult(**row._mapping) for row in result]
|
||||
|
||||
|
||||
async def _search_hybrid(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Run the combined FTS + ILIKE text search against ``documents``.

    Each row's score is the larger of:
      * the ``ts_rank`` of ``title`` + ``extracted_text`` against the query
        (using the ``simple`` text-search configuration, NULL rank treated as 0), and
      * a flat 0.5 when ``title`` + ``extracted_text`` contains the query
        as a case-insensitive substring.

    A row qualifies when either the FTS match or the substring match holds.
    The snippet is the first 200 characters of ``extracted_text``.

    Args:
        session: Async database session used to execute the statement.
        query: Raw search string, bound as the ``:query`` parameter.
        limit: Maximum number of rows, bound as the ``:limit`` parameter.

    Returns:
        Matching documents as ``SearchResult`` objects, highest score first.
    """
    statement = text("""
        SELECT id, title, ai_domain, ai_summary, file_format,
               GREATEST(
                   coalesce(ts_rank(
                       to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                       plainto_tsquery('simple', :query)
                   ), 0),
                   CASE WHEN (coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                        ILIKE '%%' || :query || '%%' THEN 0.5 ELSE 0 END
               ) AS score,
               left(extracted_text, 200) AS snippet
        FROM documents
        WHERE to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
              @@ plainto_tsquery('simple', :query)
           OR (coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
              ILIKE '%%' || :query || '%%'
        ORDER BY score DESC
        LIMIT :limit
    """)
    bind_params = {"query": query, "limit": limit}

    rows = await session.execute(statement, bind_params)

    # Column names in the SELECT list line up with SearchResult field names,
    # so each row mapping can be splatted straight into the model.
    return [SearchResult(**record._mapping) for record in rows]
|
||||
def _merge_results(
    text_results: list[SearchResult],
    vector_results: list[SearchResult],
    limit: int,
    *,
    vector_weight: float = 0.5,
    min_vector_score: float = 0.3,
) -> list[SearchResult]:
    """Merge text and vector hits into one de-duplicated, score-ranked list.

    Documents are keyed by ``id``. A document found by both searches keeps
    the text hit's fields and has ``vector_weight`` times its vector score
    added to the text score; its ``match_reason`` gets a ``+vector`` suffix.
    A document found only by the vector search is kept only when its score
    is strictly greater than ``min_vector_score``.

    Args:
        text_results: Hits from the text (FTS/ILIKE) search.
        vector_results: Hits from the vector similarity search.
        limit: Maximum number of merged results to return.
        vector_weight: Multiplier applied to a vector score before it is
            added to an existing text score.
        min_vector_score: Exclusive lower bound a vector-only hit must
            exceed to be included.

    Returns:
        Up to ``limit`` results sorted by descending combined score.
    """
    # If the same id appears twice in text_results, the later entry wins.
    merged: dict[int, SearchResult] = {hit.id: hit for hit in text_results}

    for hit in vector_results:
        existing = merged.get(hit.id)

        if existing is None:
            # Vector-only hit: drop weak similarities so noise does not leak in.
            if hit.score > min_vector_score:
                merged[hit.id] = hit
            continue

        # match_reason defaults to None on the model; avoid emitting the
        # literal string "None+vector" in that case.
        if existing.match_reason:
            reason = f"{existing.match_reason}+vector"
        else:
            reason = "vector"

        # Build a new object rather than mutating the caller's instance.
        merged[hit.id] = SearchResult(
            id=existing.id,
            title=existing.title,
            ai_domain=existing.ai_domain,
            ai_summary=existing.ai_summary,
            file_format=existing.file_format,
            score=existing.score + hit.score * vector_weight,
            snippet=existing.snippet,
            match_reason=reason,
        )

    ranked = sorted(merged.values(), key=lambda item: item.score, reverse=True)
    return ranked[:limit]
|
||||
|
||||
Reference in New Issue
Block a user