"""Search candidate retrieval service (Phase 1.1).

Returns text (documents FTS + keyword) and vector (documents.embedding)
candidates as lists of SearchResult.

Phase 1.1: moved _search_text/_search_vector here from search.py.
Phase 1.1 follow-up substep: ILIKE -> trigram `similarity()` + `gin_trgm_ops`.
Phase 1.2: switch vector retrieval to the document_chunks table.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from ai.client import AIClient

if TYPE_CHECKING:
    from api.search import SearchResult

logger = logging.getLogger(__name__)


async def search_text(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Run the per-field weighted FTS + ILIKE search over ``documents``.

    Weights: title 3.0 / ai_tags 2.5 / user_note 2.0 / ai_summary 1.5 /
    extracted_text 1.0, plus a ``ts_rank * 2.0`` bonus. Soft-deleted rows
    (``deleted_at IS NOT NULL``) are excluded.

    Args:
        session: Async SQLAlchemy session used to execute the query.
        query: Raw search string; bound as ``:q`` for both the ILIKE
            substring checks and ``plainto_tsquery``.
        limit: Maximum number of rows to return.

    Returns:
        ``SearchResult`` objects ordered by descending score. Each row's
        ``match_reason`` names the highest-priority field that matched
        (``title``/``tags``/``note``/``summary``/``content``), or ``fts``
        when only the full-text predicate matched.
    """
    from api.search import SearchResult  # avoid circular import

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   left(extracted_text, 200) AS snippet,
                   (
                     -- title 매칭 (가중치 최고)
                     CASE WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 3.0 ELSE 0 END
                     -- ai_tags 매칭 (가중치 높음)
                     + CASE WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 2.5 ELSE 0 END
                     -- user_note 매칭 (가중치 높음)
                     + CASE WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 2.0 ELSE 0 END
                     -- ai_summary 매칭 (가중치 중상)
                     + CASE WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 1.5 ELSE 0 END
                     -- extracted_text 매칭 (가중치 중간)
                     + CASE WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 1.0 ELSE 0 END
                     -- FTS 점수 (보너스)
                     + coalesce(ts_rank(
                         to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                         plainto_tsquery('simple', :q)
                       ), 0) * 2.0
                   ) AS score,
                   -- match reason
                   CASE
                     WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 'title'
                     WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 'tags'
                     WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 'note'
                     WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 'summary'
                     WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 'content'
                     ELSE 'fts'
                   END AS match_reason
            FROM documents
            WHERE deleted_at IS NULL
              AND (coalesce(title, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(user_note, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(ai_summary, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(extracted_text, '') ILIKE '%%' || :q || '%%'
                   OR to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                      @@ plainto_tsquery('simple', :q))
            ORDER BY score DESC
            LIMIT :limit
        """),
        {"q": query, "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]


async def search_vector(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Run the vector similarity search (cosine distance) over ``documents``.

    Planned to move to the document_chunks table in Phase 1.2; for now it
    uses ``documents.embedding``.

    Args:
        session: Async SQLAlchemy session used to execute the query.
        query: Raw search string; embedded through ``AIClient.embed``.
        limit: Maximum number of rows to return.

    Returns:
        ``SearchResult`` objects ordered by ascending cosine distance, with
        ``score = 1 - distance`` and ``match_reason = 'vector'``. Returns an
        empty list when the query embedding cannot be obtained, so that
        callers can still fall back to text-only results.
    """
    from api.search import SearchResult  # avoid circular import

    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Release the client even when embed() raises; previously the
            # close() call was skipped on failure and the client leaked.
            await client.close()
    except Exception:
        # Best-effort: vector search is optional, so degrade to "no vector
        # candidates" — but leave a trace instead of failing silently.
        logger.warning(
            "query embedding failed; skipping vector search", exc_info=True
        )
        return []

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> cast(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet,
                   'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL AND deleted_at IS NULL
            ORDER BY embedding <=> cast(:embedding AS vector)
            LIMIT :limit
        """),
        # The embedding is passed in its string form and cast to `vector` in SQL.
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]