1. [critical] config.yaml → settings 객체에서 taxonomy 로드 (import crash 방지) 2. [high] ODF 변환: file_path 유지, derived_path 별도 필드 (무한 중복 방지) 3. [high] 법령 분할: 첫 장 이전 조문을 "서문"으로 보존 4. [high] Inbox: review_status 필드 분리 (pending/approved/rejected) 5. [high] 삭제: soft-delete (deleted_at) + worker 방어 + active_documents 뷰 - 모든 조회에 deleted_at IS NULL 일관 적용 - queue_consumer: row 없으면 gracefully skip Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
167 lines
6.3 KiB
Python
167 lines
6.3 KiB
Python
"""하이브리드 검색 API — FTS + ILIKE + 벡터 (필드별 가중치)"""
|
|
|
|
from typing import Annotated
|
|
|
|
from fastapi import APIRouter, Depends, Query
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import text
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from ai.client import AIClient
|
|
from core.auth import get_current_user
|
|
from core.database import get_session
|
|
from models.user import User
|
|
|
|
# Search endpoints live on this router; the URL prefix is set by the app that includes it.
router = APIRouter()
|
|
|
|
|
|
class SearchResult(BaseModel):
    """A single document hit returned by the search endpoint."""

    id: int
    title: str | None
    ai_domain: str | None
    ai_summary: str | None
    file_format: str
    # Combined relevance score: weighted ILIKE/FTS score for text search,
    # cosine similarity (1 - distance) for vector search, summed for hybrid.
    score: float
    # First 200 chars of extracted_text (see the SQL queries below).
    snippet: str | None
    # Which field produced the match: 'title' / 'tags' / 'note' / 'summary' /
    # 'content' / 'fts' / 'vector', or e.g. 'title+vector' after hybrid merge.
    match_reason: str | None = None
|
|
|
|
|
|
class SearchResponse(BaseModel):
    """Envelope for search results."""

    results: list[SearchResult]
    # Number of results actually returned (len(results)), not a full-corpus count.
    total: int
    # Echo of the query string the client sent.
    query: str
    # Echo of the search mode used: fts | trgm | vector | hybrid.
    mode: str
|
|
|
|
|
|
@router.get("/", response_model=SearchResponse)
async def search(
    q: str,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
    limit: int = Query(20, ge=1, le=100),
):
    """Search documents — FTS + ILIKE text matching combined with vector similarity.

    Modes: 'vector' uses embeddings only; 'fts'/'trgm' use the weighted text
    query only; 'hybrid' (default) runs both and merges the result sets.
    Requires an authenticated user.
    """
    if mode == "hybrid":
        # Run both strategies and fold the vector hits into the text hits.
        text_hits = await _search_text(session, q, limit)
        vector_hits = await _search_vector(session, q, limit)
        hits = _merge_results(text_hits, vector_hits, limit)
    elif mode == "vector":
        hits = await _search_vector(session, q, limit)
    else:
        hits = await _search_text(session, q, limit)

    return SearchResponse(results=hits, total=len(hits), query=q, mode=mode)
|
|
|
|
|
|
async def _search_text(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """FTS + ILIKE search over documents, with per-field weights.

    Score = sum of fixed bonuses per matched field (title 3.0, ai_tags 2.5,
    user_note 2.0, ai_summary 1.5, extracted_text 1.0) plus 2x the Postgres
    ts_rank over title+body. match_reason reports the highest-priority field
    that matched ('fts' when only the tsvector matched). Soft-deleted rows
    (deleted_at set) are excluded.

    NOTE(review): `query` is concatenated into the ILIKE patterns unescaped,
    so a user-supplied '%' or '_' acts as a wildcard — confirm this is intended.
    """
    # In a LIKE pattern, '%%' is simply two adjacent wildcards (same effect as
    # one '%'); the doubling also survives %-formatting DBAPI drivers unchanged.
    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   left(extracted_text, 200) AS snippet,
                   (
                       -- title 매칭 (가중치 최고)
                       CASE WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 3.0 ELSE 0 END
                       -- ai_tags 매칭 (가중치 높음)
                       + CASE WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 2.5 ELSE 0 END
                       -- user_note 매칭 (가중치 높음)
                       + CASE WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 2.0 ELSE 0 END
                       -- ai_summary 매칭 (가중치 중상)
                       + CASE WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 1.5 ELSE 0 END
                       -- extracted_text 매칭 (가중치 중간)
                       + CASE WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 1.0 ELSE 0 END
                       -- FTS 점수 (보너스)
                       + coalesce(ts_rank(
                           to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
                           plainto_tsquery('simple', :q)
                       ), 0) * 2.0
                   ) AS score,
                   -- match reason
                   CASE
                       WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 'title'
                       WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 'tags'
                       WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 'note'
                       WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 'summary'
                       WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 'content'
                       ELSE 'fts'
                   END AS match_reason
            FROM documents
            WHERE deleted_at IS NULL
              AND (coalesce(title, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(user_note, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(ai_summary, '') ILIKE '%%' || :q || '%%'
                   OR coalesce(extracted_text, '') ILIKE '%%' || :q || '%%'
                   OR to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
                       @@ plainto_tsquery('simple', :q))
            ORDER BY score DESC
            LIMIT :limit
        """),
        {"q": query, "limit": limit},
    )
    # Column aliases in the SELECT line up one-to-one with SearchResult fields.
    return [SearchResult(**row._mapping) for row in result]
|
|
|
|
|
|
async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
    """Vector similarity search over document embeddings (cosine distance).

    Embeds `query` via AIClient and ranks documents by pgvector cosine
    distance; score is reported as similarity (1 - distance). Soft-deleted
    rows and rows without an embedding are skipped.

    Returns an empty list when the embedding step fails for any reason, so
    hybrid search degrades gracefully to text-only results.
    """
    client = None
    try:
        client = AIClient()
        query_embedding = await client.embed(query)
    except Exception:
        # Best-effort: an unreachable or failing AI service must not break search.
        return []
    finally:
        # Fix: the original only closed the client on the success path, leaking
        # it whenever embed() raised. Close failures are swallowed so cleanup
        # never masks (or replaces) the result.
        if client is not None:
            try:
                await client.close()
            except Exception:
                pass

    result = await session.execute(
        text("""
            SELECT id, title, ai_domain, ai_summary, file_format,
                   (1 - (embedding <=> cast(:embedding AS vector))) AS score,
                   left(extracted_text, 200) AS snippet,
                   'vector' AS match_reason
            FROM documents
            WHERE embedding IS NOT NULL AND deleted_at IS NULL
            ORDER BY embedding <=> cast(:embedding AS vector)
            LIMIT :limit
        """),
        {"embedding": str(query_embedding), "limit": limit},
    )
    # Column aliases match SearchResult's field names.
    return [SearchResult(**row._mapping) for row in result]
|
|
|
|
|
|
def _merge_results(
    text_results: list[SearchResult],
    vector_results: list[SearchResult],
    limit: int,
    *,
    vector_weight: float = 0.5,
    vector_threshold: float = 0.3,
) -> list[SearchResult]:
    """Merge text and vector hits: de-duplicate by id, combine scores.

    A document found by both searches keeps the text result's fields and
    gains `vector_weight` times its vector score, with '+vector' appended to
    match_reason. Vector-only hits are kept only when their similarity
    exceeds `vector_threshold` (filters out weak semantic matches). Results
    are sorted by score descending and truncated to `limit`.

    The weight/threshold keyword-only parameters generalize the previously
    hard-coded 0.5 / 0.3 constants; defaults preserve the original behavior.
    """
    merged: dict[int, SearchResult] = {r.id: r for r in text_results}

    for r in vector_results:
        existing = merged.get(r.id)
        if existing is not None:
            # Document already found by text search — add weighted vector score.
            merged[r.id] = SearchResult(
                id=existing.id,
                title=existing.title,
                ai_domain=existing.ai_domain,
                ai_summary=existing.ai_summary,
                file_format=existing.file_format,
                score=existing.score + r.score * vector_weight,
                snippet=existing.snippet,
                match_reason=f"{existing.match_reason}+vector",
            )
        elif r.score > vector_threshold:
            merged[r.id] = r

    ranked = sorted(merged.values(), key=lambda x: x.score, reverse=True)
    return ranked[:limit]
|