- 프론트: debounce 자동검색 제거 → Enter 키로만 검색 (한글 조합 문제 해결) - 백엔드: trgm threshold 0.1로 낮춤 + ILIKE '%검색어%' fallback 추가 - hybrid 검색 score threshold 0.01 → 0.001로 낮춤 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
198 lines
7.0 KiB
Python
198 lines
7.0 KiB
Python
"""하이브리드 검색 API — FTS + 트리그램 + 벡터"""
|
|
|
|
from typing import Annotated
|
|
|
|
from fastapi import APIRouter, Depends, Query
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from ai.client import AIClient
|
|
from core.auth import get_current_user
|
|
from core.database import get_session
|
|
from models.user import User
|
|
|
|
# Router that the search endpoint in this module is registered on.
router = APIRouter()

# Per-signal weights used by the hybrid score (initial values, open to tuning).
W_FTS = 0.4  # weight of the full-text ts_rank
W_TRGM = 0.2  # weight of the trigram similarity
W_VECTOR = 0.4  # weight of the vector similarity
|
|
|
|
|
|
class SearchResult(BaseModel):
    """A single document hit produced by one of the search queries."""

    id: int
    title: str | None
    ai_domain: str | None
    ai_summary: str | None
    file_format: str
    # Relevance score; how it is computed depends on the search mode.
    score: float
    # Leading 200 characters of the document's extracted text, if any.
    snippet: str | None
|
|
|
|
|
|
class SearchResponse(BaseModel):
    """Response body returned by the search endpoint."""

    # Hits in the order delivered by the selected search backend.
    results: list[SearchResult]
    # Number of entries in ``results``.
    total: int
    # The search string exactly as it was received.
    query: str
    # The search mode that was requested.
    mode: str
|
|
|
|
|
|
@router.get("/", response_model=SearchResponse)
async def search(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
    limit: int = Query(20, ge=1, le=100),
):
    """Search documents.

    mode:
    - fts: PostgreSQL full-text search (GIN index)
    - trgm: trigram partial matching (works for Korean)
    - vector: vector similarity search (semantic search)
    - hybrid: FTS + trigram + vector combined (default)
    """
    # Every mode other than the three listed here is served by hybrid search.
    backends = {
        "fts": _search_fts,
        "trgm": _search_trgm,
        "vector": _search_vector,
    }
    run_search = backends.get(mode, _search_hybrid)
    hits = await run_search(session, q, limit)

    return SearchResponse(results=hits, total=len(hits), query=q, mode=mode)
|
|
|
|
|
|
async def _search_fts(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Run a PostgreSQL full-text search over title + extracted text.

    Args:
        session: Database session the statement is executed on.
        query: Raw search string; turned into a tsquery by plainto_tsquery.
        limit: Maximum number of rows to return.

    Returns:
        Matching documents ordered by ts_rank, best first.
    """
    # The 'simple' configuration splits on whitespace only, i.e. no
    # language-specific (Korean) tokenization is applied.
    statement = text("""
        SELECT id, title, ai_domain, ai_summary, file_format,
               ts_rank(
                   to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                   plainto_tsquery('simple', :query)
               ) AS score,
               left(extracted_text, 200) AS snippet
        FROM documents
        WHERE to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
              @@ plainto_tsquery('simple', :query)
        ORDER BY score DESC
        LIMIT :limit
    """)
    bind_values = {"query": query, "limit": limit}

    rows = await session.execute(statement, bind_values)
    return [SearchResult(**hit._mapping) for hit in rows]
|
|
|
|
|
|
async def _search_trgm(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Trigram partial matching with an ILIKE fallback (works for Korean).

    A document matches when its title + extracted text either passes the
    pg_trgm ``%`` similarity operator or contains ``query`` as a
    case-insensitive substring. The score is the trigram similarity, raised
    to at least 0.5 for substring hits.

    Args:
        session: Database session the statements are executed on.
        query: Raw search string.
        limit: Maximum number of rows to return.

    Returns:
        Matching documents ordered by score, best first.
    """
    # Lower the threshold used by the % operator to improve Korean matching.
    # NOTE(review): plain SET is session-scoped, so the value stays on the
    # pooled connection if the surrounding transaction commits. Consider
    # SET LOCAL once it is confirmed that requests always run in a transaction.
    await session.execute(text("SET pg_trgm.similarity_threshold = 0.1"))

    # Escape LIKE metacharacters so the fallback is a literal substring test
    # (backslash is PostgreSQL's default LIKE escape character).
    escaped = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    params = {"query": query, "pattern": f"%{escaped}%", "limit": limit}

    # The similarity operator is written as a single '%': text() takes care of
    # any driver-level percent escaping itself, so a doubled '%%' would reach
    # PostgreSQL as a different, non-existent operator.
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   GREATEST(
                       similarity(coalesce(title, '') || ' ' || coalesce(extracted_text, ''), :query),
                       CASE WHEN (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) ILIKE :pattern
                            THEN 0.5 ELSE 0 END
                   ) AS score,
                   left(extracted_text, 200) AS snippet
            FROM documents
            WHERE (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) % :query
               OR (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) ILIKE :pattern
            ORDER BY score DESC
            LIMIT :limit
        """),
        params,
    )
    return [SearchResult(**row._mapping) for row in result]
|
|
|
|
|
|
async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Vector similarity search using pgvector cosine distance.

    Args:
        session: Database session the statement is executed on.
        query: Raw search string; embedded through ``AIClient.embed``.
        limit: Maximum number of rows to return.

    Returns:
        Documents that have an embedding, nearest first, scored as
        ``1 - cosine distance``. An empty list when no embedding could be
        obtained for the query.
    """
    client = AIClient()
    try:
        query_embedding = await client.embed(query)
    except Exception:
        return []  # embedding backend (GPU server) unavailable: no results
    finally:
        await client.close()

    # Same guard the hybrid search applies: without a usable embedding there
    # is nothing to compare against.
    if not query_embedding:
        return []

    # pgvector cosine distance: 0 = identical, 2 = opposite.
    # CAST(... AS vector) is used instead of the '::' shorthand because a bind
    # parameter immediately followed by '::' is not parsed reliably by text().
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> CAST(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet
            FROM documents
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> CAST(:embedding AS vector)
            LIMIT :limit
        """),
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
|
|
|
|
|
|
async def _search_hybrid(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Hybrid search: weighted sum of FTS rank, trigram similarity and vector similarity.

    The score is ``w_fts * ts_rank + w_trgm * similarity + w_vector * (1 -
    cosine distance)``. When no query embedding can be obtained the vector
    term is dropped and the weights become 0.6 (FTS) / 0.4 (trigram).

    Args:
        session: Database session the statement is executed on.
        query: Raw search string.
        limit: Maximum number of rows to return.

    Returns:
        Documents whose combined score exceeds 0.001, best first.
    """
    # Build the query embedding; on any failure fall back to FTS + trigram.
    query_embedding = None
    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Release the client even when embed() raises.
            await client.close()
    except Exception:
        # Deliberate best-effort: the embedding backend being unavailable
        # must not fail the search.
        pass

    # Escape LIKE metacharacters so the ILIKE filter is a literal substring
    # test (backslash is PostgreSQL's default LIKE escape character).
    escaped = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    vector_clause = ""
    vector_score = "0"
    params = {
        "query": query,
        "pattern": f"%{escaped}%",
        "limit": limit,
        "w_fts": W_FTS,
        "w_trgm": W_TRGM,
        "w_vector": W_VECTOR,
    }

    if query_embedding:
        # CAST(... AS vector) instead of '::' because a bind parameter
        # immediately followed by '::' is not parsed reliably by text().
        vector_clause = (
            "LEFT JOIN LATERAL "
            "(SELECT 1 - (d.embedding <=> CAST(:embedding AS vector)) AS vscore) v ON true"
        )
        vector_score = "coalesce(v.vscore, 0)"
        params["embedding"] = str(query_embedding)
    else:
        # No embedding: score with FTS + trigram only.
        params["w_fts"] = 0.6
        params["w_trgm"] = 0.4
        params["w_vector"] = 0.0

    # Only the two fixed fragments chosen above are interpolated into the SQL;
    # every user-supplied value travels as a bound parameter.
    result = await session.execute(
        text(f"""
            SELECT * FROM (
                SELECT d.id, d.title, d.ai_domain, d.ai_summary, d.file_format,
                       (
                           :w_fts * coalesce(ts_rank(
                               to_tsvector('simple', coalesce(d.title, '') || ' ' || coalesce(d.extracted_text, '')),
                               plainto_tsquery('simple', :query)
                           ), 0)
                           + :w_trgm * coalesce(similarity(
                               coalesce(d.title, '') || ' ' || coalesce(d.extracted_text, ''),
                               :query
                           ), 0)
                           + :w_vector * {vector_score}
                       ) AS score,
                       left(d.extracted_text, 200) AS snippet
                FROM documents d
                {vector_clause}
                WHERE coalesce(d.extracted_text, '') != ''
                   OR (coalesce(d.title, '') || ' ' || coalesce(d.extracted_text, '')) ILIKE :pattern
            ) sub
            WHERE sub.score > 0.001
            ORDER BY sub.score DESC
            LIMIT :limit
        """),
        params,
    )
    return [SearchResult(**row._mapping) for row in result]
|