document-server/backend/src/api/routes/search.py

"""
검색 API 라우터
"""
from fastapi import APIRouter, Depends, Query
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, or_, and_, text
from sqlalchemy.orm import joinedload, selectinload
from typing import List, Optional, Dict, Any
from datetime import datetime

from ...core.database import get_db
from ...models.user import User
from ...models.document import Document, Tag
from ...models.highlight import Highlight
from ...models.note import Note
from ...models.memo_tree import MemoTree, MemoNode
from ...models.note_document import NoteDocument
from ..dependencies import get_current_active_user
from pydantic import BaseModel


class SearchResult(BaseModel):
    """A single hit returned by the search endpoints."""
    # Result category. The search helpers in this module emit "document",
    # "note", "memo", "highlight", "highlight_note" and "document_content".
    type: str
    id: str
    title: str
    content: str
    # ID of the document the hit belongs to. Note documents reuse their own ID
    # and memo nodes use their tree's ID here.
    document_id: str
    document_title: str
    created_at: datetime
    # Higher scores are listed first by the unified search endpoint.
    relevance_score: float = 0.0
    # Optional per-type extras (e.g. highlight offsets, content-match statistics).
    highlight_info: Optional[Dict[str, Any]] = None

    class Config:
        # Pydantic config flag; presumably allows building the model from
        # attribute-bearing (ORM-style) objects — confirm against the
        # pydantic version in use.
        from_attributes = True


class SearchResponse(BaseModel):
    """Response body of the unified search endpoint."""
    # The search term as received from the client.
    query: str
    # Number of hits before pagination is applied.
    total_results: int
    # The requested page of hits.
    results: List[SearchResult]
    # Facet name -> list of {"name": ..., "count": ...} entries; the facet
    # builder in this module fills the "types" and "documents" facets.
    facets: Dict[str, List[Dict[str, Any]]] = {}


# Router on which this module's search endpoints are registered.
router = APIRouter()


@router.get("/", response_model=SearchResponse)
async def search_all(
    q: str = Query(..., description="검색어"),
    type_filter: Optional[str] = Query(None, description="검색 타입 필터: document, note, memo, highlight"),
    document_id: Optional[str] = Query(None, description="특정 문서 내 검색"),
    tag: Optional[str] = Query(None, description="태그 필터"),
    skip: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=100),
    current_user: User = Depends(get_current_active_user),
    db: AsyncSession = Depends(get_db)
):
    """Unified search across documents, notes, memo nodes, highlights and document bodies.

    Every search source whose type matches ``type_filter`` (or all sources
    when no filter is given) is queried in turn. The combined hits are
    ordered by descending relevance score, facets are computed over the full
    hit list, and the ``skip``/``limit`` window is returned.
    """
    # (accepted type_filter value, deferred search call), in the order their
    # hits are appended. "note" is listed twice: once for note documents and
    # once for the legacy highlight-attached notes.
    sources = (
        ("document", lambda: search_documents(q, document_id, tag, current_user, db)),
        ("note", lambda: search_note_documents(q, current_user, db)),
        ("memo", lambda: search_memo_nodes(q, current_user, db)),
        ("note", lambda: search_notes(q, document_id, tag, current_user, db)),
        ("highlight", lambda: search_highlights(q, document_id, current_user, db)),
        ("highlight_note", lambda: search_highlight_notes(q, document_id, current_user, db)),
        ("document_content", lambda: search_document_content(q, document_id, current_user, db)),
    )

    hits = []
    for accepted_type, run_search in sources:
        if type_filter and type_filter != accepted_type:
            continue
        hits += await run_search()

    # Stable sort: hits with equal scores keep their source order.
    ranked = sorted(hits, key=lambda hit: hit.relevance_score, reverse=True)

    # Facets describe the whole result set, not just the returned page.
    facet_data = await generate_search_facets(ranked, current_user, db)

    return SearchResponse(
        query=q,
        total_results=len(ranked),
        results=ranked[skip:skip + limit],
        facets=facet_data
    )


async def search_documents(
    query: str,
    document_id: Optional[str],
    tag: Optional[str],
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search documents by title and description.

    The term is matched case-insensitively as a literal substring. Non-admin
    users only see public documents and documents they uploaded.

    Args:
        query: Text to look for in the document title and description.
        document_id: When given, restrict the search to this document.
        tag: When given, only documents carrying a tag with this name match.
        current_user: Requesting user; drives the visibility filter.
        db: Async session used to execute the query.

    Returns:
        One ``"document"`` result per matching row. A title hit scores 2.0
        and a description hit adds 1.0.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally, consistent with the substring scoring below.
    escaped = query.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    query_obj = select(Document).options(
        selectinload(Document.uploader),
        selectinload(Document.tags)
    )

    # Visibility filter.
    if not current_user.is_admin:
        query_obj = query_obj.where(
            or_(
                Document.is_public == True,  # noqa: E712 - SQL expression
                Document.uploaded_by == current_user.id
            )
        )

    # Single-document filter.
    if document_id:
        query_obj = query_obj.where(Document.id == document_id)

    # Tag filter.
    if tag:
        query_obj = query_obj.join(Document.tags).where(Tag.name == tag)

    # Text match.
    query_obj = query_obj.where(
        or_(
            Document.title.ilike(pattern, escape="/"),
            Document.description.ilike(pattern, escape="/")
        )
    )

    result = await db.execute(query_obj)
    documents = result.scalars().all()

    needle = query.lower()
    search_results = []
    for doc in documents:
        # Title matches weigh more than description matches.
        score = 0.0
        if needle in doc.title.lower():
            score += 2.0
        if doc.description and needle in doc.description.lower():
            score += 1.0

        search_results.append(SearchResult(
            type="document",
            id=str(doc.id),
            title=doc.title,
            content=doc.description or "",
            document_id=str(doc.id),
            document_title=doc.title,
            created_at=doc.created_at,
            relevance_score=score
        ))

    return search_results


async def search_notes(
    query: str,
    document_id: Optional[str],
    tag: Optional[str],
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search highlight-attached notes by note content or highlighted text.

    Only notes whose highlight belongs to ``current_user`` are considered.
    The term is matched case-insensitively as a literal substring.

    Args:
        query: Text to look for in the note content and the highlighted text.
        document_id: When given, restrict the search to highlights of this document.
        tag: When given, only notes whose ``tags`` contain this value match.
        current_user: Requesting user; owner of the highlights searched.
        db: Async session used to execute the query.

    Returns:
        One ``"note"`` result per matching note. A content hit scores 2.0 and
        a highlighted-text hit adds 1.5.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally, consistent with the substring scoring below.
    escaped = query.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    query_obj = (
        select(Note)
        .options(
            joinedload(Note.highlight).joinedload(Highlight.document)
        )
        .join(Highlight)
        .where(Highlight.user_id == current_user.id)
    )

    # Single-document filter.
    if document_id:
        query_obj = query_obj.where(Highlight.document_id == document_id)

    # Tag filter.
    if tag:
        query_obj = query_obj.where(Note.tags.contains([tag]))

    # Text match on note content or highlighted text.
    query_obj = query_obj.where(
        or_(
            Note.content.ilike(pattern, escape="/"),
            Highlight.selected_text.ilike(pattern, escape="/")
        )
    )

    result = await db.execute(query_obj)
    notes = result.scalars().all()

    needle = query.lower()
    search_results = []
    for note in notes:
        # A row can match through the highlighted text alone, in which case
        # the note content may be empty/NULL; treat that as "".
        note_content = note.content or ""
        selected_text = note.highlight.selected_text

        score = 0.0
        if needle in note_content.lower():
            score += 2.0
        if needle in selected_text.lower():
            score += 1.5

        search_results.append(SearchResult(
            type="note",
            id=str(note.id),
            title=f"메모: {selected_text[:50]}...",
            content=note_content,
            document_id=str(note.highlight.document.id),
            document_title=note.highlight.document.title,
            created_at=note.created_at,
            relevance_score=score,
            highlight_info={
                "highlight_id": str(note.highlight.id),
                "selected_text": selected_text,
                "start_offset": note.highlight.start_offset,
                "end_offset": note.highlight.end_offset
            }
        ))

    return search_results


async def search_highlights(
    query: str,
    document_id: Optional[str],
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search the current user's highlights by their highlighted text.

    The term is matched case-insensitively as a literal substring.

    Args:
        query: Text to look for in ``Highlight.selected_text``.
        document_id: When given, restrict the search to highlights of this document.
        current_user: Requesting user; owner of the highlights searched.
        db: Async session used to execute the query.

    Returns:
        One ``"highlight"`` result per matching highlight, including offsets
        and colour in ``highlight_info``.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally, consistent with the substring scoring below.
    escaped = query.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    query_obj = (
        select(Highlight)
        .options(joinedload(Highlight.document))
        .where(Highlight.user_id == current_user.id)
    )

    # Single-document filter.
    if document_id:
        query_obj = query_obj.where(Highlight.document_id == document_id)

    # Text match.
    query_obj = query_obj.where(Highlight.selected_text.ilike(pattern, escape="/"))

    result = await db.execute(query_obj)
    highlights = result.scalars().all()

    needle = query.lower()
    search_results = []
    for highlight in highlights:
        # 0.5 is a fallback for rows the database matched but the Python-side
        # case-insensitive comparison does not confirm.
        score = 1.0 if needle in highlight.selected_text.lower() else 0.5

        search_results.append(SearchResult(
            type="highlight",
            id=str(highlight.id),
            title=f"하이라이트: {highlight.selected_text[:50]}...",
            content=highlight.selected_text,
            document_id=str(highlight.document.id),
            document_title=highlight.document.title,
            created_at=highlight.created_at,
            relevance_score=score,
            highlight_info={
                "highlight_id": str(highlight.id),
                "selected_text": highlight.selected_text,
                "start_offset": highlight.start_offset,
                "end_offset": highlight.end_offset,
                "highlight_color": highlight.highlight_color
            }
        ))

    return search_results


async def generate_search_facets(
    results: List[SearchResult],
    current_user: User,
    db: AsyncSession
) -> Dict[str, List[Dict[str, Any]]]:
    """Build facet counts for a list of search hits.

    Returns a dict with two facets:

    * ``"types"``: one ``{"name", "count"}`` entry per result type, in order
      of first appearance.
    * ``"documents"``: the ten most frequent document titles, most frequent
      first; ties keep first-appearance order.

    ``current_user`` and ``db`` are accepted but not used.
    """
    def tally(values):
        # Count occurrences while remembering first-seen order.
        seen = {}
        for value in values:
            seen[value] = seen[value] + 1 if value in seen else 1
        return seen

    per_type = tally(hit.type for hit in results)
    per_document = tally(hit.document_title for hit in results)

    # sorted() is stable, so equally frequent titles stay in first-seen order.
    top_documents = sorted(
        per_document.items(), key=lambda entry: entry[1], reverse=True
    )[:10]

    return {
        "types": [
            {"name": kind, "count": total} for kind, total in per_type.items()
        ],
        "documents": [
            {"name": title, "count": total} for title, total in top_documents
        ],
    }


@router.get("/suggestions")
async def get_search_suggestions(
    q: str = Query(..., min_length=2, description="검색어 (최소 2글자)"),
    current_user: User = Depends(get_current_active_user),
    db: AsyncSession = Depends(get_db)
):
    """Return up to ten autocomplete suggestions for the search box.

    Suggestions are gathered, in priority order, from document titles visible
    to the user (max 5), tag names (max 5) and tags on the user's own notes
    (max 5). The combined list is then cut to ten entries, so note tags only
    appear when the first two sources leave room.

    Returns:
        ``{"suggestions": [{"text": ..., "type": ...}, ...]}`` where ``type``
        is ``"document"``, ``"tag"`` or ``"note_tag"``.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally rather than as wildcards.
    escaped = q.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    suggestions = []

    # Document titles; non-admins only see public or self-uploaded documents.
    title_query = select(Document.title).where(Document.title.ilike(pattern, escape="/"))
    if not current_user.is_admin:
        title_query = title_query.where(
            or_(
                Document.is_public == True,  # noqa: E712 - SQL expression
                Document.uploaded_by == current_user.id
            )
        )
    doc_result = await db.execute(title_query.limit(5))
    suggestions.extend(
        {"text": title, "type": "document"} for title in doc_result.scalars().all()
    )

    # Tag names.
    tag_result = await db.execute(
        select(Tag.name)
        .where(Tag.name.ilike(pattern, escape="/"))
        .limit(5)
    )
    suggestions.extend(
        {"text": name, "type": "tag"} for name in tag_result.scalars().all()
    )

    # Tags stored on the user's own notes; each row is that note's tag list.
    note_result = await db.execute(
        select(Note.tags)
        .join(Highlight)
        .where(Highlight.user_id == current_user.id)
    )

    needle = q.lower()
    note_tags = set()
    for tag_list in note_result.scalars().all():
        if tag_list and isinstance(tag_list, list):
            note_tags.update(
                tag for tag in tag_list
                if isinstance(tag, str) and needle in tag.lower()
            )

    # Sort before slicing so the selection is deterministic across requests.
    suggestions.extend(
        {"text": tag, "type": "note_tag"} for tag in sorted(note_tags)[:5]
    )

    return {"suggestions": suggestions[:10]}


async def search_highlight_notes(
    query: str,
    document_id: Optional[str],
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search the content of notes attached to the current user's highlights.

    The term is matched case-insensitively as a literal substring of the
    note content.

    Args:
        query: Text to look for in ``Note.content``.
        document_id: When given, restrict the search to highlights of this document.
        current_user: Requesting user; owner of the highlights searched.
        db: Async session used to execute the query.

    Returns:
        One ``"highlight_note"`` result per matching note. Notes whose
        highlight or document could not be loaded are skipped.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally, consistent with the substring scoring below.
    escaped = query.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    query_obj = select(Note).options(
        selectinload(Note.highlight).selectinload(Highlight.document)
    )

    # Only notes that are attached to a highlight.
    query_obj = query_obj.where(Note.highlight_id.isnot(None))

    # Join Highlight so ownership and document filters can be applied.
    query_obj = query_obj.join(Highlight)

    # Ownership filter.
    query_obj = query_obj.where(Highlight.user_id == current_user.id)

    # Single-document filter.
    if document_id:
        query_obj = query_obj.where(Highlight.document_id == document_id)

    # Text match on the note content.
    query_obj = query_obj.where(Note.content.ilike(pattern, escape="/"))

    result = await db.execute(query_obj)
    notes = result.scalars().all()

    needle = query.lower()
    search_results = []
    for note in notes:
        if not note.highlight or not note.highlight.document:
            continue

        # Base score for a note hit, plus a bonus when the Python-side
        # comparison confirms the match.
        score = 1.5
        if needle in (note.content or "").lower():
            score += 2.0

        search_results.append(SearchResult(
            type="highlight_note",
            id=str(note.id),
            title=f"하이라이트 메모: {note.highlight.selected_text[:30]}...",
            content=note.content or "",
            document_id=str(note.highlight.document.id),
            document_title=note.highlight.document.title,
            created_at=note.created_at,
            relevance_score=score,
            highlight_info={
                "highlight_id": str(note.highlight.id),
                "selected_text": note.highlight.selected_text,
                "start_offset": note.highlight.start_offset,
                "end_offset": note.highlight.end_offset,
                "note_content": note.content
            }
        ))

    return search_results


async def search_note_documents(
    query: str,
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search note documents by title and content.

    Only rows whose ``created_by`` equals the current user's email are
    considered. The term is matched case-insensitively as a literal substring.

    Args:
        query: Text to look for in the note title and content.
        current_user: Requesting user; their email selects the notes searched.
        db: Async session used to execute the query.

    Returns:
        One ``"note"`` result per matching note document. The base score is
        1.0; a title hit adds 2.0 and a content hit adds 1.0. The note's own
        ID and title are reported as ``document_id`` / ``document_title``.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally, consistent with the substring scoring below.
    escaped = query.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    query_obj = select(NoteDocument).where(
        or_(
            NoteDocument.title.ilike(pattern, escape="/"),
            NoteDocument.content.ilike(pattern, escape="/")
        )
    )

    # Ownership filter.
    query_obj = query_obj.where(NoteDocument.created_by == current_user.email)

    result = await db.execute(query_obj)
    notes = result.scalars().all()

    needle = query.lower()
    search_results = []
    for note in notes:
        score = 1.0
        if needle in note.title.lower():
            score += 2.0
        if note.content and needle in note.content.lower():
            score += 1.0

        search_results.append(SearchResult(
            type="note",
            id=str(note.id),
            title=note.title,
            content=note.content or "",
            document_id=str(note.id),  # the note itself acts as the document
            document_title=note.title,
            created_at=note.created_at,
            relevance_score=score
        ))

    return search_results


async def search_memo_nodes(
    query: str,
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search memo-tree nodes by title and content.

    Only nodes belonging to trees owned by ``current_user`` are considered.
    The term is matched case-insensitively as a literal substring.

    Args:
        query: Text to look for in the node title and content.
        current_user: Requesting user; owner of the trees searched.
        db: Async session used to execute the query.

    Returns:
        One ``"memo"`` result per matching node. The base score is 1.0; a
        title hit adds 2.0 and a content hit adds 1.0. The owning tree's ID
        and title are reported as ``document_id`` / ``document_title``.
    """
    # Escape LIKE metacharacters so "%" and "_" typed by the user are matched
    # literally, consistent with the substring scoring below.
    escaped = query.replace("/", "//").replace("%", "/%").replace("_", "/_")
    pattern = f"%{escaped}%"

    query_obj = select(MemoNode).options(
        selectinload(MemoNode.tree)
    ).where(
        or_(
            MemoNode.title.ilike(pattern, escape="/"),
            MemoNode.content.ilike(pattern, escape="/")
        )
    )

    # Ownership filter: only nodes in the user's own trees.
    query_obj = query_obj.join(MemoTree).where(MemoTree.user_id == current_user.id)

    result = await db.execute(query_obj)
    nodes = result.scalars().all()

    needle = query.lower()
    search_results = []
    for node in nodes:
        score = 1.0
        if needle in node.title.lower():
            score += 2.0
        if node.content and needle in node.content.lower():
            score += 1.0

        search_results.append(SearchResult(
            type="memo",
            id=str(node.id),
            title=node.title,
            content=node.content or "",
            document_id=str(node.tree.id),  # the tree ID stands in for a document ID
            document_title=f"📚 {node.tree.title}",
            created_at=node.created_at,
            relevance_score=score
        ))

    return search_results


def _resolve_storage_path(stored_path: str) -> str:
    """Return *stored_path* if it is absolute, otherwise the path under ``/app``."""
    import os

    # os.path.join discards "/app" when the second component is absolute.
    return os.path.join("/app", stored_path)


def _read_html_text(html_path: str) -> str:
    """Return the text of the HTML file at *html_path*, or "" if the file is missing."""
    from bs4 import BeautifulSoup

    try:
        with open(_resolve_storage_path(html_path), "r", encoding="utf-8") as html_file:
            markup = html_file.read()
    except FileNotFoundError:
        return ""
    return BeautifulSoup(markup, "html.parser").get_text()


def _read_pdf_text(pdf_path: str) -> str:
    """Return the page-labelled text of the PDF at *pdf_path*, or "" if the file is missing."""
    import PyPDF2

    try:
        pdf_file = open(_resolve_storage_path(pdf_path), "rb")
    except FileNotFoundError:
        return ""

    with pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = []
        for page_number, page in enumerate(reader.pages, start=1):
            # Defensive: treat a missing extraction result as an empty page.
            page_text = page.extract_text() or ""
            if page_text.strip():
                page_texts.append(f"[페이지 {page_number}]\n{page_text}")
    return "\n\n".join(page_texts)


async def search_document_content(
    query: str,
    document_id: Optional[str],
    current_user: User,
    db: AsyncSession
) -> List[SearchResult]:
    """Search inside document bodies (converted HTML, or the PDF itself).

    For every document visible to the user, the body text is loaded from
    ``html_path`` when set, otherwise extracted from ``pdf_path``, and
    searched case-insensitively for ``query``. Documents whose file is
    missing or cannot be read are skipped.

    NOTE(review): every visible document's file is read and parsed
    synchronously on each call, which blocks the event loop while it runs.
    Consider an index or moving the work off the loop.

    Args:
        query: Text to look for in the document body.
        document_id: When given, restrict the search to this document.
        current_user: Requesting user; drives the visibility filter.
        db: Async session used to execute the document query.

    Returns:
        One ``"document_content"`` result per document containing the term.
        The score is 2.0 plus 0.1 per occurrence, with the bonus capped at 1.0.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Visibility filter.
    doc_query = select(Document)
    if not current_user.is_admin:
        doc_query = doc_query.where(
            or_(
                Document.is_public == True,  # noqa: E712 - SQL expression
                Document.uploaded_by == current_user.id
            )
        )

    if document_id:
        doc_query = doc_query.where(Document.id == document_id)

    result = await db.execute(doc_query)
    documents = result.scalars().all()

    needle = query.lower()
    search_results = []

    for doc in documents:
        # Prefer the HTML rendition (OCR output or book HTML); fall back to
        # extracting text from the PDF only when no HTML path is recorded.
        if doc.html_path:
            source_label, source_path, read_text = "HTML", doc.html_path, _read_html_text
            file_type = "PDF" if doc.pdf_path else "HTML"
        elif doc.pdf_path:
            source_label, source_path, read_text = "PDF", doc.pdf_path, _read_pdf_text
            file_type = "PDF (직접추출)"
        else:
            continue

        try:
            text_content = read_text(source_path)
        except Exception as exc:
            # Best effort: one unreadable file must not fail the whole search.
            logger.warning("%s 파일 읽기 오류 (%s): %s", source_label, source_path, exc)
            continue

        if not text_content:
            continue

        # Lower-case the (potentially large) body once and reuse the count
        # both as the match test and for scoring.
        match_count = text_content.lower().count(needle)
        if not match_count:
            continue

        context = extract_search_context(text_content, query, context_length=300)

        # Base score for a body hit plus up to 1.0 for repeated occurrences.
        score = 2.0 + min(match_count * 0.1, 1.0)

        search_results.append(SearchResult(
            type="document_content",
            id=str(doc.id),
            title=f"📄 {doc.title} ({file_type} 본문)",
            content=context,
            document_id=str(doc.id),
            document_title=doc.title,
            created_at=doc.created_at,
            relevance_score=score,
            highlight_info={
                "file_type": file_type,
                "match_count": match_count,
                "has_pdf": bool(doc.pdf_path),
                "has_html": bool(doc.html_path)
            }
        ))

    return search_results


def extract_search_context(text: str, query: str, context_length: int = 200) -> str:
    """Return a snippet of *text* around the first occurrence of *query*.

    The match is case-insensitive and *query* is treated as a literal string.
    The snippet spans ``context_length // 2`` characters on each side of the
    match; ``"..."`` is added on a side only when text was cut off there.

    Args:
        text: Full text to take the snippet from.
        query: Search term to locate.
        context_length: Total number of context characters around the match,
            or the prefix length returned when there is no match.

    Returns:
        The snippet. When *query* does not occur, the first ``context_length``
        characters of *text*, followed by ``"..."`` only if truncated.
    """
    import re

    # Search the original text rather than a lower-cased copy: lower-casing
    # can change string length (e.g. "İ"), which would shift the offsets.
    match = re.search(re.escape(query), text, re.IGNORECASE)
    if match is None:
        head = text[:context_length]
        return head + "..." if len(text) > context_length else head

    half_window = context_length // 2
    start = max(0, match.start() - half_window)
    end = min(len(text), match.end() + half_window)

    snippet = text[start:end]

    # Mark truncation on either side.
    if start > 0:
        snippet = "..." + snippet
    if end < len(text):
        snippet += "..."

    return snippet