Files
Hyungi Ahn 24142ea605 fix: Codex 리뷰 5건 수정 (critical 1 + high 4)
1. [critical] config.yaml → settings 객체에서 taxonomy 로드 (import crash 방지)
2. [high] ODF 변환: file_path 유지, derived_path 별도 필드 (무한 중복 방지)
3. [high] 법령 분할: 첫 장 이전 조문을 "서문"으로 보존
4. [high] Inbox: review_status 필드 분리 (pending/approved/rejected)
5. [high] 삭제: soft-delete (deleted_at) + worker 방어 + active_documents 뷰
   - 모든 조회에 deleted_at IS NULL 일관 적용
   - queue_consumer: row 없으면 gracefully skip

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 07:15:13 +09:00

167 lines
6.3 KiB
Python

"""하이브리드 검색 API — FTS + ILIKE + 벡터 (필드별 가중치)"""
from typing import Annotated
from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from ai.client import AIClient
from core.auth import get_current_user
from core.database import get_session
from models.user import User
router = APIRouter()
class SearchResult(BaseModel):
id: int
title: str | None
ai_domain: str | None
ai_summary: str | None
file_format: str
score: float
snippet: str | None
match_reason: str | None = None
class SearchResponse(BaseModel):
results: list[SearchResult]
total: int
query: str
mode: str
@router.get("/", response_model=SearchResponse)
async def search(
q: str,
user: Annotated[User, Depends(get_current_user)],
session: Annotated[AsyncSession, Depends(get_session)],
mode: str = Query("hybrid", pattern="^(fts|trgm|vector|hybrid)$"),
limit: int = Query(20, ge=1, le=100),
):
"""문서 검색 — FTS + ILIKE + 벡터 결합"""
if mode == "vector":
results = await _search_vector(session, q, limit)
else:
results = await _search_text(session, q, limit)
# hybrid: 벡터 결과도 합산
if mode == "hybrid":
vector_results = await _search_vector(session, q, limit)
results = _merge_results(results, vector_results, limit)
return SearchResponse(
results=results,
total=len(results),
query=q,
mode=mode,
)
async def _search_text(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
"""FTS + ILIKE — 필드별 가중치 적용"""
result = await session.execute(
text("""
SELECT id, title, ai_domain, ai_summary, file_format,
left(extracted_text, 200) AS snippet,
(
-- title 매칭 (가중치 최고)
CASE WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 3.0 ELSE 0 END
-- ai_tags 매칭 (가중치 높음)
+ CASE WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 2.5 ELSE 0 END
-- user_note 매칭 (가중치 높음)
+ CASE WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 2.0 ELSE 0 END
-- ai_summary 매칭 (가중치 중상)
+ CASE WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 1.5 ELSE 0 END
-- extracted_text 매칭 (가중치 중간)
+ CASE WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 1.0 ELSE 0 END
-- FTS 점수 (보너스)
+ coalesce(ts_rank(
to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, '')),
plainto_tsquery('simple', :q)
), 0) * 2.0
) AS score,
-- match reason
CASE
WHEN coalesce(title, '') ILIKE '%%' || :q || '%%' THEN 'title'
WHEN coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%' THEN 'tags'
WHEN coalesce(user_note, '') ILIKE '%%' || :q || '%%' THEN 'note'
WHEN coalesce(ai_summary, '') ILIKE '%%' || :q || '%%' THEN 'summary'
WHEN coalesce(extracted_text, '') ILIKE '%%' || :q || '%%' THEN 'content'
ELSE 'fts'
END AS match_reason
FROM documents
WHERE deleted_at IS NULL
AND (coalesce(title, '') ILIKE '%%' || :q || '%%'
OR coalesce(ai_tags::text, '') ILIKE '%%' || :q || '%%'
OR coalesce(user_note, '') ILIKE '%%' || :q || '%%'
OR coalesce(ai_summary, '') ILIKE '%%' || :q || '%%'
OR coalesce(extracted_text, '') ILIKE '%%' || :q || '%%'
OR to_tsvector('simple', coalesce(title, '') || ' ' || coalesce(extracted_text, ''))
@@ plainto_tsquery('simple', :q))
ORDER BY score DESC
LIMIT :limit
"""),
{"q": query, "limit": limit},
)
return [SearchResult(**row._mapping) for row in result]
async def _search_vector(session: AsyncSession, query: str, limit: int) -> list[SearchResult]:
"""벡터 유사도 검색 (코사인 거리)"""
try:
client = AIClient()
query_embedding = await client.embed(query)
await client.close()
except Exception:
return []
result = await session.execute(
text("""
SELECT id, title, ai_domain, ai_summary, file_format,
(1 - (embedding <=> cast(:embedding AS vector))) AS score,
left(extracted_text, 200) AS snippet,
'vector' AS match_reason
FROM documents
WHERE embedding IS NOT NULL AND deleted_at IS NULL
ORDER BY embedding <=> cast(:embedding AS vector)
LIMIT :limit
"""),
{"embedding": str(query_embedding), "limit": limit},
)
return [SearchResult(**row._mapping) for row in result]
def _merge_results(
text_results: list[SearchResult],
vector_results: list[SearchResult],
limit: int,
) -> list[SearchResult]:
"""텍스트 + 벡터 결과 합산 (중복 제거, 점수 합산)"""
merged: dict[int, SearchResult] = {}
for r in text_results:
merged[r.id] = r
for r in vector_results:
if r.id in merged:
# 이미 텍스트로 잡힌 문서 — 벡터 점수 가산
existing = merged[r.id]
merged[r.id] = SearchResult(
id=existing.id,
title=existing.title,
ai_domain=existing.ai_domain,
ai_summary=existing.ai_summary,
file_format=existing.file_format,
score=existing.score + r.score * 0.5,
snippet=existing.snippet,
match_reason=f"{existing.match_reason}+vector",
)
elif r.score > 0.3: # 벡터 유사도 최소 threshold
merged[r.id] = r
results = sorted(merged.values(), key=lambda x: x.score, reverse=True)
return results[:limit]