"""Search candidate retrieval service (Phase 1.2).

Returns text candidates (documents FTS + trigram) and vector candidates
as lists of ``SearchResult``.

History:
- Phase 1.1a: ``_search_text`` / ``_search_vector`` moved here from
  ``search.py`` (ILIKE kept as-is).
- Phase 1.2-B: ILIKE replaced by trigram ``%`` + ``similarity()``; the ILIKE
  full scan was removed.
- After Phase 1.2-B: vector retrieval is meant to switch to the
  ``document_chunks`` table. NOTE: ``search_vector`` in this module still
  queries ``documents.embedding`` directly.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from ai.client import AIClient

if TYPE_CHECKING:
    from api.search import SearchResult

logger = logging.getLogger(__name__)


async def search_text(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Run the field-weighted FTS + trigram search (Phase 1.2-B).

    WHERE clause: candidates are filtered through the trigram operator on the
    short columns (``title``, ``ai_summary``) plus the combined FTS expression.
    Indexes this is intended to use:

    - ``idx_documents_title_trgm``
    - ``idx_documents_ai_summary_trgm``
    - ``idx_documents_fts_full``
      (title + ai_tags + ai_summary + user_note + extracted_text)

    ``extracted_text`` yields very low trigram similarity at the 0.3
    threshold, so in the WHERE clause it is only reachable through FTS.

    ORDER BY: weighted sum of the five per-column similarities plus
    ``ts_rank * 2.0``. Weights: title 3.0 / ai_tags 2.5 / user_note 2.0 /
    ai_summary 1.5 / extracted_text 1.0.

    ``match_reason`` is the first column, in the priority order
    title → tags → note → summary → content, whose similarity is >= 0.3;
    it falls back to ``'fts'`` when none reaches that threshold.

    Args:
        session: Async SQLAlchemy session used to execute the query.
        query: Raw user query, bound as the ``:q`` parameter.
        limit: Maximum number of rows to return.

    Returns:
        ``SearchResult`` objects built from the rows, ordered by descending
        score.
    """
    from api.search import SearchResult  # local import avoids a circular import

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   left(extracted_text, 200) AS snippet,
                   (
                     -- 컬럼별 trigram similarity 가중 합산
                     similarity(coalesce(title, ''), :q) * 3.0
                     + similarity(coalesce(ai_tags::text, ''), :q) * 2.5
                     + similarity(coalesce(user_note, ''), :q) * 2.0
                     + similarity(coalesce(ai_summary, ''), :q) * 1.5
                     + similarity(coalesce(extracted_text, ''), :q) * 1.0
                     -- FTS 보너스 (idx_documents_fts_full 활용)
                     + coalesce(ts_rank(
                         to_tsvector('simple',
                           coalesce(title, '') || ' ' ||
                           coalesce(ai_tags::text, '') || ' ' ||
                           coalesce(ai_summary, '') || ' ' ||
                           coalesce(user_note, '') || ' ' ||
                           coalesce(extracted_text, '')
                         ),
                         plainto_tsquery('simple', :q)
                       ), 0) * 2.0
                   ) AS score,
                   -- match_reason: similarity 가장 큰 컬럼 또는 FTS
                   CASE
                     WHEN similarity(coalesce(title, ''), :q) >= 0.3 THEN 'title'
                     WHEN similarity(coalesce(ai_tags::text, ''), :q) >= 0.3 THEN 'tags'
                     WHEN similarity(coalesce(user_note, ''), :q) >= 0.3 THEN 'note'
                     WHEN similarity(coalesce(ai_summary, ''), :q) >= 0.3 THEN 'summary'
                     WHEN similarity(coalesce(extracted_text, ''), :q) >= 0.3 THEN 'content'
                     ELSE 'fts'
                   END AS match_reason
            FROM documents
            WHERE deleted_at IS NULL
              AND (
                -- trigram 후보 필터 (인덱스 있는 짧은 컬럼만)
                title % :q
                OR (ai_summary IS NOT NULL AND ai_summary % :q)
                -- FTS 통합 인덱스
                OR to_tsvector('simple',
                     coalesce(title, '') || ' ' ||
                     coalesce(ai_tags::text, '') || ' ' ||
                     coalesce(ai_summary, '') || ' ' ||
                     coalesce(user_note, '') || ' ' ||
                     coalesce(extracted_text, '')
                   ) @@ plainto_tsquery('simple', :q)
              )
            ORDER BY score DESC
            LIMIT :limit
        """),
        {"q": query, "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]


async def search_vector(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Run the vector similarity search (cosine distance).

    The query is embedded through ``AIClient`` and compared against
    ``documents.embedding``; the score is ``1 - cosine distance``. A switch to
    the ``document_chunks`` table is planned (Phase 1.2) but this function
    currently reads ``documents.embedding``.

    Vector search is best-effort: if creating the client, embedding the
    query, or closing the client raises, the failure is logged and an empty
    list is returned so the caller can still use the text candidates.

    Args:
        session: Async SQLAlchemy session used to execute the query.
        query: Raw user query to embed.
        limit: Maximum number of rows to return.

    Returns:
        ``SearchResult`` objects ordered by ascending cosine distance, with
        ``match_reason`` set to ``'vector'``; ``[]`` when embedding fails.
    """
    from api.search import SearchResult  # local import avoids a circular import

    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Always release the client, even when embed() raises; previously
            # a failed embed skipped close() and leaked the client.
            await client.close()
    except Exception:
        # Deliberate best-effort fallback, but no longer a silent one.
        logger.warning(
            "query embedding failed; skipping vector search", exc_info=True
        )
        return []

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> cast(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet,
                   'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL AND deleted_at IS NULL
            ORDER BY embedding <=> cast(:embedding AS vector)
            LIMIT :limit
        """),
        # str() renders the embedding as a bracketed list literal that the
        # SQL casts to the vector type.
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]