Files
hyungi_document_server/app/api/search.py
Hyungi Ahn e7cd710e69 fix: hybrid 검색 단순화 — FTS + ILIKE (vector/trgm 복잡 쿼리 제거)
asyncpg 파라미터 바인딩 충돌 문제 근본 해결.
한국어 검색: ILIKE fallback으로 안정 동작.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 14:16:36 +09:00

170 lines
6.1 KiB
Python

"""하이브리드 검색 API — FTS + 트리그램 + 벡터"""
from typing import Annotated
from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from ai.client import AIClient
from core.auth import get_current_user
from core.database import get_session
from models.user import User
router = APIRouter()
# Ranking weights (initial values, tunable).
# NOTE(review): none of the search functions visible in this file read these
# constants; confirm whether they are still needed.
W_FTS = 0.4
W_TRGM = 0.2
W_VECTOR = 0.4
class SearchResult(BaseModel):
    # One document hit. Every field corresponds to a column (or computed
    # expression) selected by the search queries in this module.

    # Value of documents.id.
    id: int
    # Document title; may be NULL in the database.
    title: str | None
    # Value of documents.ai_domain; may be NULL.
    ai_domain: str | None
    # Value of documents.ai_summary; may be NULL.
    ai_summary: str | None
    # Value of documents.file_format.
    file_format: str
    # Relevance score; its scale depends on the search mode that produced it.
    score: float
    # Leading 200 characters of the extracted text, or None when there is none.
    snippet: str | None
class SearchResponse(BaseModel):
    # Response envelope produced by the search endpoint.

    # Hits in the order returned by the query (best score first).
    results: list[SearchResult]
    # Number of hits contained in `results` (not a database-wide match count).
    total: int
    # The search text exactly as received.
    query: str
    # The search mode that was requested.
    mode: str
@router.get("/", response_model=SearchResponse)
async def search(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
    limit: int = Query(20, ge=1, le=100),
):
    """Search documents.

    mode:
    - fts: PostgreSQL full-text search
    - trgm: trigram partial matching with an ILIKE fallback
    - vector: vector similarity search (semantic search)
    - hybrid: full-text search combined with ILIKE substring matching (default)

    `limit` caps the number of returned hits (1-100, default 20).
    """
    # `user` is not read here; it is declared so the get_current_user
    # dependency is resolved for every request.
    # Modes with a dedicated implementation; anything else runs hybrid.
    handlers = {
        "fts": _search_fts,
        "trgm": _search_trgm,
        "vector": _search_vector,
    }
    run_search = handlers.get(mode, _search_hybrid)
    hits = await run_search(session, q, limit)
    return SearchResponse(results=hits, total=len(hits), query=q, mode=mode)
async def _search_fts(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Run a PostgreSQL full-text search over document title and extracted text.

    The title and extracted text are concatenated, converted with
    to_tsvector('simple', ...) and matched against plainto_tsquery('simple', ...).
    Hits are ranked by ts_rank (highest first) and capped at `limit`.

    Args:
        session: Async database session used to execute the statement.
        query: Raw search text, bound as a parameter.
        limit: Maximum number of rows to return.

    Returns:
        One SearchResult per matching row, best score first.
    """
    # The 'simple' configuration is used so that text is split on whitespace
    # without any language-specific (Korean) tokenization.
    # NOTE(review): the original notes mention a GIN index backing this query;
    # the index definition is not visible in this file.
    bind_values = {"query": query, "limit": limit}
    statement = text("""
SELECT id, title, ai_domain, ai_summary, file_format,
ts_rank(
to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
plainto_tsquery('simple', :query)
) AS score,
left(extracted_text, 200) AS snippet
FROM documents
WHERE to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
@@ plainto_tsquery('simple', :query)
ORDER BY score DESC
LIMIT :limit
""")
    rows = await session.execute(statement, bind_values)
    hits: list[SearchResult] = []
    for row in rows:
        hits.append(SearchResult(**row._mapping))
    return hits
async def _search_trgm(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Trigram similarity search with a case-insensitive substring fallback.

    A document matches when the concatenation of its title and extracted text
    is trigram-similar to `query` (pg_trgm `%` operator) or contains `query`
    as a literal, case-insensitive substring. The score is the larger of the
    trigram similarity and 0.5 (awarded when the substring matches).

    Args:
        session: Async database session used to execute the statements.
        query: Raw search text; treated literally, never as a LIKE pattern.
        limit: Maximum number of rows to return.

    Returns:
        One SearchResult per matching row, best score first.
    """
    # Escape LIKE metacharacters so user input such as "100%" or "a_b" is
    # matched literally (backslash is PostgreSQL's default LIKE escape
    # character). The backslash itself must be escaped first.
    escaped = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    bind_values = {"query": query, "pattern": f"%{escaped}%", "limit": limit}

    # Lower the similarity threshold to improve matching of Korean text.
    # SET LOCAL confines the change to the current transaction so it cannot
    # persist on a pooled connection after this request.
    await session.execute(text("SET LOCAL pg_trgm.similarity_threshold = 0.1"))

    # text() needs no manual percent escaping: the pg_trgm similarity operator
    # is a single "%". A doubled "%%" would reach PostgreSQL as a different,
    # non-existent operator.
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   GREATEST(
                       similarity(coalesce(title, '') || ' ' || coalesce(extracted_text, ''), :query),
                       CASE WHEN (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) ILIKE :pattern
                            THEN 0.5 ELSE 0 END
                   ) AS score,
                   left(extracted_text, 200) AS snippet
            FROM documents
            WHERE (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) % :query
               OR (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) ILIKE :pattern
            ORDER BY score DESC
            LIMIT :limit
        """),
        bind_values,
    )
    return [SearchResult(**row._mapping) for row in result]
async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Vector similarity search using pgvector cosine distance.

    The query text is embedded through AIClient, then documents that have an
    embedding are ordered by cosine distance to it. The reported score is
    1 - distance, so larger means more similar.

    Args:
        session: Async database session used to execute the statement.
        query: Raw search text to embed.
        limit: Maximum number of rows to return.

    Returns:
        One SearchResult per row, nearest first; an empty list when the
        embedding could not be obtained.
    """
    client = AIClient()
    try:
        query_embedding = await client.embed(query)
    except Exception:
        # Deliberate best-effort: when the embedding backend (GPU server) is
        # unavailable, vector search degrades to "no results" instead of
        # failing the request.
        return []
    finally:
        # Always release the client, on success and on failure.
        await client.close()

    # pgvector cosine distance: 0 = identical, 2 = opposite.
    # The cast is spelled CAST(:embedding AS vector) because text() does not
    # recognise a bind parameter that is immediately followed by "::"; the
    # PostgreSQL shorthand ":embedding::vector" would be mis-parsed.
    # NOTE(review): assumes embed() returns a plain list of floats, so that
    # str() yields "[x, y, ...]" (pgvector's text input format) - confirm.
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> CAST(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet
            FROM documents
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> CAST(:embedding AS vector)
            LIMIT :limit
        """),
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
async def _search_hybrid(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Hybrid search: full-text search combined with a literal substring match.

    A document matches when the concatenation of its title and extracted text
    satisfies the 'simple' full-text query or contains `query` as a literal,
    case-insensitive substring. The score is the larger of the ts_rank value
    and 0.5 (awarded when the substring matches).

    Args:
        session: Async database session used to execute the statement.
        query: Raw search text; treated literally, never as a LIKE pattern.
        limit: Maximum number of rows to return.

    Returns:
        One SearchResult per matching row, best score first.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally rather than acting as wildcards (a bare "%" would otherwise
    # match every document). Backslash is PostgreSQL's default LIKE escape
    # character and must itself be escaped first.
    escaped = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    bind_values = {"query": query, "pattern": f"%{escaped}%", "limit": limit}

    # The complete ILIKE pattern is bound as a parameter, so the statement
    # text contains no percent signs at all.
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   GREATEST(
                       coalesce(ts_rank(
                           to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                           plainto_tsquery('simple', :query)
                       ), 0),
                       CASE WHEN (coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                                 ILIKE :pattern THEN 0.5 ELSE 0 END
                   ) AS score,
                   left(extracted_text, 200) AS snippet
            FROM documents
            WHERE to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                  @@ plainto_tsquery('simple', :query)
               OR (coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                  ILIKE :pattern
            ORDER BY score DESC
            LIMIT :limit
        """),
        bind_values,
    )
    return [SearchResult(**row._mapping) for row in result]