"""검색 후보 수집 서비스 (Phase 1.2). text(documents FTS + trigram) + vector(documents.embedding → chunks) 후보를 SearchResult 리스트로 반환. Phase 1.1a: search.py의 _search_text/_search_vector를 이전 (ILIKE 그대로). Phase 1.2-B: ILIKE → trigram `%` + `similarity()`. ILIKE 풀 스캔 제거. Phase 1.2-B 이후: vector retrieval을 document_chunks 테이블 기반으로 전환. """ from __future__ import annotations from typing import TYPE_CHECKING from sqlalchemy import text from sqlalchemy.ext.asyncio import AsyncSession from ai.client import AIClient if TYPE_CHECKING: from api.search import SearchResult async def search_text( session: AsyncSession, query: str, limit: int ) -> list["SearchResult"]: """FTS + trigram 필드별 가중치 검색 (Phase 1.2-B UNION 분해). Phase 1.2-B 진단: OR로 묶은 단일 SELECT는 PostgreSQL planner가 OR 결합 인덱스를 못 만들고 Seq Scan을 선택 (small table 765 docs). EXPLAIN으로 측정 시 525ms. → CTE + UNION으로 분해하면 각 branch가 자기 인덱스 활용 → 26ms (95% 감소). 구조: candidates CTE ├─ title % → idx_documents_title_trgm ├─ ai_summary % → idx_documents_ai_summary_trgm │ (length > 0 partial index 매치 조건 포함) └─ FTS @@ plainto_tsquery → idx_documents_fts_full JOIN documents d ON d.id = c.id ORDER BY 5컬럼 similarity 가중 합산 + ts_rank * 2.0 가중치: title 3.0 / ai_tags 2.5 / user_note 2.0 / ai_summary 1.5 / extracted_text 1.0 """ from api.search import SearchResult # 순환 import 회피 result = await session.execute( text(""" WITH candidates AS ( -- title trigram (idx_documents_title_trgm) SELECT id FROM documents WHERE deleted_at IS NULL AND title %% :q UNION -- ai_summary trigram (idx_documents_ai_summary_trgm 부분 인덱스 매치) SELECT id FROM documents WHERE deleted_at IS NULL AND ai_summary IS NOT NULL AND length(ai_summary) > 0 AND ai_summary %% :q UNION -- FTS 통합 인덱스 (idx_documents_fts_full) SELECT id FROM documents WHERE deleted_at IS NULL AND to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(ai_tags::text, '') || ' ' || coalesce(ai_summary, '') || ' ' || coalesce(user_note, '') || ' ' || coalesce(extracted_text, '') ) @@ plainto_tsquery('simple', :q) ) SELECT d.id, d.title, d.ai_domain, d.ai_summary, d.file_format, left(d.extracted_text, 200) AS snippet, ( -- 컬럼별 trigram similarity 가중 합산 similarity(coalesce(d.title, ''), :q) * 3.0 + similarity(coalesce(d.ai_tags::text, ''), :q) * 2.5 + similarity(coalesce(d.user_note, ''), :q) * 2.0 + similarity(coalesce(d.ai_summary, ''), :q) * 1.5 + similarity(coalesce(d.extracted_text, ''), :q) * 1.0 -- FTS 보너스 (idx_documents_fts_full 활용) + coalesce(ts_rank( to_tsvector('simple', coalesce(d.title, '') || ' ' || coalesce(d.ai_tags::text, '') || ' ' || coalesce(d.ai_summary, '') || ' ' || coalesce(d.user_note, '') || ' ' || coalesce(d.extracted_text, '') ), plainto_tsquery('simple', :q) ), 0) * 2.0 ) AS score, -- match_reason: similarity 가장 큰 컬럼 또는 FTS CASE WHEN similarity(coalesce(d.title, ''), :q) >= 0.3 THEN 'title' WHEN similarity(coalesce(d.ai_tags::text, ''), :q) >= 0.3 THEN 'tags' WHEN similarity(coalesce(d.user_note, ''), :q) >= 0.3 THEN 'note' WHEN similarity(coalesce(d.ai_summary, ''), :q) >= 0.3 THEN 'summary' WHEN similarity(coalesce(d.extracted_text, ''), :q) >= 0.3 THEN 'content' ELSE 'fts' END AS match_reason FROM documents d JOIN candidates c ON d.id = c.id ORDER BY score DESC LIMIT :limit """), {"q": query, "limit": limit}, ) return [SearchResult(**row._mapping) for row in result] async def search_vector( session: AsyncSession, query: str, limit: int ) -> list["SearchResult"]: """벡터 유사도 검색 (코사인 거리). Phase 1.2에서 document_chunks 테이블 기반으로 전환 예정. 현재는 documents.embedding 사용. """ from api.search import SearchResult # 순환 import 회피 try: client = AIClient() query_embedding = await client.embed(query) await client.close() except Exception: return [] result = await session.execute( text(""" SELECT id, title, ai_domain, ai_summary, file_format, (1 - (embedding <=> cast(:embedding AS vector))) AS score, left(extracted_text, 200) AS snippet, 'vector' AS match_reason FROM documents WHERE embedding IS NOT NULL AND deleted_at IS NULL ORDER BY embedding <=> cast(:embedding AS vector) LIMIT :limit """), {"embedding": str(query_embedding), "limit": limit}, ) return [SearchResult(**row._mapping) for row in result]