검색 로직을 services/search/* 모듈로 분리. trigram 도입은 Phase 1.2 인덱스와 함께.
신규:
- services/search/{__init__,retrieval_service,rerank_service,query_analyzer,evidence_service,synthesis_service}.py
- retrieval_service는 search_text/search_vector 이전 (ILIKE 동작 그대로)
- 나머지는 Phase 1.3/2/3 placeholder
이동:
- services/search_fusion.py → services/search/fusion_service.py (R100)
수정:
- api/search.py — thin orchestrator로 축소 (251줄 → 178줄)
동작 변경 없음 — 구조만 분리. 회귀 검증 후 Phase 1.2 진입.
112 lines
4.6 KiB
Python
112 lines
4.6 KiB
Python
"""검색 후보 수집 서비스 (Phase 1.1).
|
|
|
|
text(documents FTS + 키워드) + vector(documents.embedding) 후보를
|
|
SearchResult 리스트로 반환.
|
|
|
|
Phase 1.1: search.py의 _search_text/_search_vector를 이전.
|
|
Phase 1.1 후속 substep: ILIKE → trigram `similarity()` + `gin_trgm_ops`.
|
|
Phase 1.2: vector retrieval을 document_chunks 테이블 기반으로 전환.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from ai.client import AIClient
|
|
|
|
if TYPE_CHECKING:
|
|
from api.search import SearchResult
|
|
|
|
|
|
async def search_text(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Collect keyword candidates from ``documents`` via ILIKE plus FTS.

    A row qualifies when ``deleted_at IS NULL`` and ``query`` occurs as a
    case-insensitive substring of title, ai_tags, user_note, ai_summary or
    extracted_text, or when the ``'simple'`` tsvector built from
    title + extracted_text matches ``plainto_tsquery('simple', query)``.

    The score is the sum of the per-field substring weights
    (title 3.0, ai_tags 2.5, user_note 2.0, ai_summary 1.5,
    extracted_text 1.0) plus ``ts_rank * 2.0``. ``match_reason`` is the
    first field that matched in that same order, or ``'fts'`` when only
    the full-text predicate matched.

    Args:
        session: Async SQLAlchemy session the statement is executed on.
        query: Raw search string, bound as ``:q``.
        limit: Maximum number of rows, bound as ``:limit``.

    Returns:
        One ``SearchResult`` per returned row, ordered by score descending.

    NOTE(review): ``query`` is bound without escaping, so ``%`` and ``_``
    inside it act as LIKE wildcards, and an empty string matches every
    non-deleted row. Confirm whether callers filter such input.
    """
    # Imported lazily to avoid a circular import with api.search.
    from api.search import SearchResult

    statement = text("""
        SELECT id, title, ai_domain, ai_summary, file_format,
               left(extracted_text, 200) AS snippet,
               (
                   -- title 매칭 (가중치 최고)
                   CASE WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 3.0 ELSE 0 END
                   -- ai_tags 매칭 (가중치 높음)
                   + CASE WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 2.5 ELSE 0 END
                   -- user_note 매칭 (가중치 높음)
                   + CASE WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 2.0 ELSE 0 END
                   -- ai_summary 매칭 (가중치 중상)
                   + CASE WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 1.5 ELSE 0 END
                   -- extracted_text 매칭 (가중치 중간)
                   + CASE WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 1.0 ELSE 0 END
                   -- FTS 점수 (보너스)
                   + coalesce(ts_rank(
                       to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                       plainto_tsquery('simple', :q)
                   ), 0) * 2.0
               ) AS score,
               -- match reason
               CASE
                   WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 'title'
                   WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 'tags'
                   WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 'note'
                   WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 'summary'
                   WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 'content'
                   ELSE 'fts'
               END AS match_reason
        FROM documents
        WHERE deleted_at IS NULL
          AND (coalesce(title, '') ILIKE '%%' || :q || '%%'
               OR coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%'
               OR coalesce(user_note, '') ILIKE '%%' || :q || '%%'
               OR coalesce(ai_summary, '') ILIKE '%%' || :q || '%%'
               OR coalesce(extracted_text, '') ILIKE '%%' || :q || '%%'
               OR to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                  @@ plainto_tsquery('simple', :q))
        ORDER BY score DESC
        LIMIT :limit
    """)
    bind_params = {"q": query, "limit": limit}

    rows = await session.execute(statement, bind_params)
    # Column aliases in the SELECT line up with SearchResult's keyword arguments.
    return [SearchResult(**record._mapping) for record in rows]
|
|
|
|
|
|
async def search_vector(
    session: AsyncSession, query: str, limit: int
) -> list["SearchResult"]:
    """Collect vector-similarity candidates from ``documents.embedding``.

    The query is embedded through ``AIClient.embed`` and compared against
    ``documents.embedding`` with the ``<=>`` distance operator; the score
    reported for each row is ``1 - distance``. Only rows with a non-NULL
    embedding and ``deleted_at IS NULL`` are considered.

    Per the original module notes, Phase 1.2 plans to switch this to the
    ``document_chunks`` table; for now ``documents.embedding`` is used.

    Args:
        session: Async SQLAlchemy session the statement is executed on.
        query: Raw search string to embed.
        limit: Maximum number of rows, bound as ``:limit``.

    Returns:
        One ``SearchResult`` per returned row with ``match_reason`` set to
        ``'vector'``, nearest first. An empty list when creating the
        client, embedding the query, or closing the client fails
        (best-effort: vector search must not break the overall search).
    """
    import logging

    # Imported lazily to avoid a circular import with api.search.
    from api.search import SearchResult

    try:
        client = AIClient()
        try:
            query_embedding = await client.embed(query)
        finally:
            # Always release the client; previously a failing embed()
            # skipped close() and leaked the client's resources.
            await client.close()
    except Exception:
        # Deliberate best-effort: degrade to "no vector candidates", but
        # leave a trace so embedding outages are not invisible.
        logging.getLogger(__name__).warning(
            "search_vector: query embedding failed; returning no vector results",
            exc_info=True,
        )
        return []

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> cast(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet,
                   'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL AND deleted_at IS NULL
            ORDER BY embedding <=> cast(:embedding AS vector)
            LIMIT :limit
        """),
        # The embedding is sent as its str() form and cast to vector in SQL.
        {"embedding": str(query_embedding), "limit": limit},
    )
    return [SearchResult(**row._mapping) for row in result]
|