"""Hybrid search API — FTS + ILIKE + vector similarity (per-field weights)."""


def _escape_like(term: str) -> str:
    """Escape ILIKE pattern metacharacters in a user-supplied search term.

    ``%``, ``_`` and the escape character ``\\`` itself are metacharacters in
    PostgreSQL LIKE/ILIKE; left unescaped, a query such as "100%" would match
    every document. PostgreSQL's default LIKE escape character is backslash,
    so backslash-escaping the bound parameter is sufficient.
    """
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


async def _search_text(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """FTS + ILIKE text search with per-field weights.

    Each field contributes a fixed weight when it contains the query
    (title 3.0 > tags 2.5 > note 2.0 > summary 1.5 > body 1.0), and the
    FTS rank is added as a bonus. ``match_reason`` reports the first
    (highest-weighted) field that matched, falling back to 'fts'.

    Args:
        session: async database session.
        query: raw user query string.
        limit: maximum number of rows to return.

    Returns:
        Matching documents ordered by combined score, descending.
    """
    params = {
        "q": query,                  # raw term for plainto_tsquery
        "pat": _escape_like(query),  # wildcard-escaped term for ILIKE
        "limit": limit,
    }
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   left(extracted_text, 200) AS snippet,
                   (
                       -- title match (highest weight)
                       CASE WHEN coalesce(title, '') ILIKE '%%' || :pat || '%%' THEN 3.0 ELSE 0 END
                       -- ai_tags match
                       + CASE WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :pat || '%%' THEN 2.5 ELSE 0 END
                       -- user_note match
                       + CASE WHEN coalesce(user_note, '') ILIKE '%%' || :pat || '%%' THEN 2.0 ELSE 0 END
                       -- ai_summary match
                       + CASE WHEN coalesce(ai_summary, '') ILIKE '%%' || :pat || '%%' THEN 1.5 ELSE 0 END
                       -- extracted_text match
                       + CASE WHEN coalesce(extracted_text, '') ILIKE '%%' || :pat || '%%' THEN 1.0 ELSE 0 END
                       -- FTS rank bonus
                       + coalesce(ts_rank(
                           to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                           plainto_tsquery('simple', :q)
                       ), 0) * 2.0
                   ) AS score,
                   -- first (highest-weighted) field that matched
                   CASE
                       WHEN coalesce(title, '') ILIKE '%%' || :pat || '%%' THEN 'title'
                       WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :pat || '%%' THEN 'tags'
                       WHEN coalesce(user_note, '') ILIKE '%%' || :pat || '%%' THEN 'note'
                       WHEN coalesce(ai_summary, '') ILIKE '%%' || :pat || '%%' THEN 'summary'
                       WHEN coalesce(extracted_text, '') ILIKE '%%' || :pat || '%%' THEN 'content'
                       ELSE 'fts'
                   END AS match_reason
            FROM documents
            WHERE coalesce(title, '') ILIKE '%%' || :pat || '%%'
               OR coalesce(ai_tags::text, '') ILIKE '%%' || :pat || '%%'
               OR coalesce(user_note, '') ILIKE '%%' || :pat || '%%'
               OR coalesce(ai_summary, '') ILIKE '%%' || :pat || '%%'
               OR coalesce(extracted_text, '') ILIKE '%%' || :pat || '%%'
               OR to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                  @@ plainto_tsquery('simple', :q)
            ORDER BY score DESC
            LIMIT :limit
        """),
        params,
    )
    return [SearchResult(**row._mapping) for row in result]


async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Vector similarity search (pgvector cosine distance; score = 1 - distance).

    Returns an empty list when the embedding backend is unavailable so that
    search degrades to text-only rather than failing the whole request.
    """
    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Always release the client: the revision under review only
            # closed it on success, leaking it whenever embed() raised.
            await client.close()
    except Exception:
        return []  # embedding backend unreachable — degrade gracefully

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> :embedding::vector)) AS score,
                   left(extracted_text, 200) AS snippet,
                   'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> :embedding::vector
            LIMIT :limit
        """),
        # NOTE(review): the tail of this call (LIMIT line and bind dict) was
        # elided between hunks in the diff under review — confirm the
        # embedding bind format against the pre-existing code.
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
text(""" - SELECT id, title, ai_domain, ai_summary, file_format, - GREATEST( - coalesce(ts_rank( - to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')), - plainto_tsquery('simple', :query) - ), 0), - CASE WHEN (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) - ILIKE '%%' || :query || '%%' THEN 0.5 ELSE 0 END - ) AS score, - left(extracted_text, 200) AS snippet - FROM documents - WHERE to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')) - @@ plainto_tsquery('simple', :query) - OR (coalesce(title, '') || ' ' || coalesce(extracted_text, '')) - ILIKE '%%' || :query || '%%' - ORDER BY score DESC - LIMIT :limit - """), - {"query": query, "limit": limit}, - ) - return [SearchResult(**row._mapping) for row in result] +def _merge_results( + text_results: list[SearchResult], + vector_results: list[SearchResult], + limit: int, +) -> list[SearchResult]: + """텍스트 + 벡터 결과 합산 (중복 제거, 점수 합산)""" + merged: dict[int, SearchResult] = {} + + for r in text_results: + merged[r.id] = r + + for r in vector_results: + if r.id in merged: + # 이미 텍스트로 잡힌 문서 — 벡터 점수 가산 + existing = merged[r.id] + merged[r.id] = SearchResult( + id=existing.id, + title=existing.title, + ai_domain=existing.ai_domain, + ai_summary=existing.ai_summary, + file_format=existing.file_format, + score=existing.score + r.score * 0.5, + snippet=existing.snippet, + match_reason=f"{existing.match_reason}+vector", + ) + elif r.score > 0.3: # 벡터 유사도 최소 threshold + merged[r.id] = r + + results = sorted(merged.values(), key=lambda x: x.score, reverse=True) + return results[:limit]