📄 PDF 본문 검색 및 미리보기 완성

🔍 PDF/HTML 본문 검색 개선: - PDF OCR 데이터 전체 텍스트 검색 (BeautifulSoup + PyPDF2) - 서적 HTML 파일 본문 검색 지원 - 파일 타입 구분 (PDF/HTML/PDF직접추출) - 검색어 매치 횟수 기반 관련성 점수 - 절대/상대 경로 처리 개선 📱 PDF 미리보기 기능: - 검색 결과에서 PDF 직접 미리보기 (iframe) - PDF에서 검색 버튼으로 페이지 이동 - 검색어 위치 기반 뷰어 연동 - PDF 로드 실패 시 fallback UI 🎯 백엔드 API 추가: - GET /documents/{id}/pdf: PDF 파일 직접 제공 - GET /documents/{id}/search-in-content: 문서 내 검색 - 페이지별 검색 결과 및 컨텍스트 제공 - 권한 확인 및 에러 처리 🎨 프론트엔드 UX: - PDF/HTML 타입별 배지 표시 - 검색 통계에 본문 검색 결과 포함 - 미리보기 모달에서 PDF 뷰어 통합 - 검색어 하이라이트 및 컨텍스트 표시
2025-09-02 17:09:32 +09:00
parent 0afe6dcf65
commit 4b65d45584
4 changed files with 332 additions and 21 deletions
--- a/backend/src/api/routes/documents.py
+++ b/backend/src/api/routes/documents.py
@@ -507,6 +507,173 @@ async def get_document_content(
        raise HTTPException(status_code=500, detail=f"Error reading document: {str(e)}")


+@router.get("/{document_id}/pdf")
+async def get_document_pdf(
+    document_id: str,
+    current_user: User = Depends(get_current_active_user),
+    db: AsyncSession = Depends(get_db)
+):
+    """Serve the PDF file attached to a document for inline display.
+
+    Args:
+        document_id: Document UUID in string form.
+        current_user: Authenticated user, resolved by the dependency.
+        db: Async database session, resolved by the dependency.
+
+    Returns:
+        A ``FileResponse`` with media type ``application/pdf`` and an
+        ``inline`` content disposition so that browsers render it inside an
+        iframe instead of starting a download.
+
+    Raises:
+        HTTPException: 400 if ``document_id`` is not a valid UUID; 404 if the
+            document does not exist, has no ``pdf_path``, or the file is not
+            present on disk; 403 if the user is not an admin, the document
+            is not public, and the user is not the uploader.
+    """
+    import os
+    from fastapi.responses import FileResponse
+
+    try:
+        doc_uuid = UUID(document_id)
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid document ID format")
+
+    # Look up the document
+    query = select(Document).where(Document.id == doc_uuid)
+    result = await db.execute(query)
+    document = result.scalar_one_or_none()
+
+    if not document:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    # Permission check: admins, public documents, or the uploader only
+    if not current_user.is_admin and not document.is_public and document.uploaded_by != current_user.id:
+        raise HTTPException(status_code=403, detail="Access denied")
+
+    # The document must have a PDF attached
+    if not document.pdf_path:
+        raise HTTPException(status_code=404, detail="PDF file not found for this document")
+
+    # Stored paths may be absolute, or relative to the documents directory
+    if document.pdf_path.startswith('/'):
+        file_path = document.pdf_path
+    else:
+        file_path = os.path.join("/app/data/documents", document.pdf_path)
+
+    # isfile (not exists): a directory at this path must not reach FileResponse
+    if not os.path.isfile(file_path):
+        raise HTTPException(status_code=404, detail="PDF file not found on disk")
+
+    return FileResponse(
+        path=file_path,
+        media_type='application/pdf',
+        filename=f"{document.title}.pdf",
+        # Passing ``filename`` defaults the disposition to "attachment",
+        # which makes the iframe preview download the file; force inline.
+        content_disposition_type="inline",
+    )
+
+
+def _find_text_matches(text, query, page, limit):
+    """Collect up to ``limit`` case-insensitive occurrences of ``query`` in ``text``.
+
+    Each match is a dict with the page number, the character offset of the
+    hit, up to 100 characters of context on either side, and the matched
+    text as it appears in ``text``. Overlapping occurrences are reported
+    (the scan resumes one character after each hit). An empty ``query``
+    yields no matches.
+    """
+    matches = []
+    if not query:
+        return matches
+
+    # Lowercase once instead of on every loop iteration.
+    # NOTE(review): str.lower() can change the length of a few characters
+    # (e.g. U+0130), which would shift offsets relative to ``text`` - confirm
+    # whether such input matters here.
+    haystack = text.lower()
+    needle = query.lower()
+    query_len = len(query)
+
+    start = 0
+    while len(matches) < limit:
+        pos = haystack.find(needle, start)
+        if pos == -1:
+            break
+
+        end = pos + query_len
+        matches.append({
+            "page": page,
+            "position": pos,
+            "context": text[max(0, pos - 100):min(len(text), end + 100)],
+            "match_text": text[pos:end]
+        })
+        start = pos + 1
+
+    return matches
+
+
+@router.get("/{document_id}/search-in-content")
+async def search_in_document_content(
+    document_id: str,
+    q: str,
+    current_user: User = Depends(get_current_active_user),
+    db: AsyncSession = Depends(get_db)
+):
+    """Search for text inside one document and report where it occurs.
+
+    The document's HTML file is parsed and searched case-insensitively. When
+    the HTML contains ``div``/``section`` elements whose class matches
+    ``page|Page``, each such element is searched separately (at most 5
+    matches per element, numbered from 1); otherwise the whole text is
+    searched as page 1 (at most 10 matches).
+
+    Args:
+        document_id: Document UUID in string form.
+        q: Search text. A blank query produces an empty result instead of
+            matching at every offset.
+        current_user: Authenticated user, resolved by the dependency.
+        db: Async database session, resolved by the dependency.
+
+    Returns:
+        A dict with ``document_id``, ``query``, ``total_matches`` (number of
+        collected matches), ``matches`` (first 20), ``has_pdf`` and
+        ``has_html``.
+
+    Raises:
+        HTTPException: 400 if ``document_id`` is not a valid UUID; 404 if the
+            document does not exist; 403 if the user is not an admin, the
+            document is not public, and the user is not the uploader.
+    """
+    try:
+        doc_uuid = UUID(document_id)
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid document ID format")
+
+    # Look up the document
+    query = select(Document).where(Document.id == doc_uuid)
+    result = await db.execute(query)
+    document = result.scalar_one_or_none()
+
+    if not document:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    # Permission check: admins, public documents, or the uploader only
+    if not current_user.is_admin and not document.is_public and document.uploaded_by != current_user.id:
+        raise HTTPException(status_code=403, detail="Access denied")
+
+    search_results = []
+
+    # Search the HTML file; skip entirely for a blank query, which would
+    # otherwise "match" at every character position.
+    if document.html_path and q.strip():
+        try:
+            import os
+            import re
+            from bs4 import BeautifulSoup
+
+            # Stored paths may be absolute, or relative to the documents directory
+            if document.html_path.startswith('/'):
+                html_file_path = document.html_path
+            else:
+                html_file_path = os.path.join("/app/data/documents", document.html_path)
+
+            if os.path.exists(html_file_path):
+                with open(html_file_path, 'r', encoding='utf-8') as f:
+                    html_content = f.read()
+
+                soup = BeautifulSoup(html_content, 'html.parser')
+
+                # Page containers: div/section elements whose class matches the pattern.
+                # NOTE(review): the pattern also matches classes that merely
+                # contain "page" (e.g. "pagination") - confirm against real files.
+                pages = soup.find_all(['div', 'section'], class_=re.compile(r'page|Page'))
+
+                if not pages:
+                    # No page markers: treat the whole text as page 1
+                    search_results.extend(
+                        _find_text_matches(soup.get_text(), q, page=1, limit=10)
+                    )
+                else:
+                    # Search each page element separately
+                    for page_num, page_elem in enumerate(pages, 1):
+                        search_results.extend(
+                            _find_text_matches(page_elem.get_text(), q, page=page_num, limit=5)
+                        )
+
+        except Exception as e:
+            # Best effort: a read/parse failure is reported as "no matches"
+            print(f"HTML 검색 오류: {e}")
+
+    return {
+        "document_id": document_id,
+        "query": q,
+        "total_matches": len(search_results),
+        "matches": search_results[:20],  # return at most 20 matches
+        "has_pdf": bool(document.pdf_path),
+        "has_html": bool(document.html_path)
+    }
+
+
 class UpdateDocumentRequest(BaseModel):
    """문서 업데이트 요청"""
    title: Optional[str] = None
--- a/backend/src/api/routes/search.py
+++ b/backend/src/api/routes/search.py
@@ -547,13 +547,21 @@ async def search_document_content(
    search_results = []
    
    for doc in documents:
-        # HTML 파일에서 텍스트 검색
+        text_content = ""
+        file_type = ""
+        
+        # HTML 파일에서 텍스트 검색 (PDF OCR 결과 또는 서적 HTML)
        if doc.html_path:
            try:
                import os
                from bs4 import BeautifulSoup
                
-                html_file_path = os.path.join("/app/data/documents", doc.html_path)
+                # 절대 경로 처리
+                if doc.html_path.startswith('/'):
+                    html_file_path = doc.html_path
+                else:
+                    html_file_path = os.path.join("/app/data/documents", doc.html_path)
+                
                if os.path.exists(html_file_path):
                    with open(html_file_path, 'r', encoding='utf-8') as f:
                        html_content = f.read()
@@ -562,27 +570,75 @@ async def search_document_content(
                    soup = BeautifulSoup(html_content, 'html.parser')
                    text_content = soup.get_text()
                    
-                    # 검색어가 포함된 경우
-                    if query.lower() in text_content.lower():
-                        # 검색어 주변 컨텍스트 추출
-                        context = extract_search_context(text_content, query)
+                    # PDF인지 서적인지 구분
+                    if doc.pdf_path:
+                        file_type = "PDF"
+                    else:
+                        file_type = "HTML"
                        
-                        # 관련성 점수 계산
-                        score = 2.0  # 본문 매치는 높은 점수
-                        
-                        search_results.append(SearchResult(
-                            type="document_content",
-                            id=str(doc.id),
-                            title=f"📄 {doc.title} (본문)",
-                            content=context,
-                            document_id=str(doc.id),
-                            document_title=doc.title,
-                            created_at=doc.created_at,
-                            relevance_score=score
-                        ))
            except Exception as e:
-                print(f"문서 본문 검색 오류: {e}")
+                print(f"HTML 파일 읽기 오류 ({doc.html_path}): {e}")
                continue
+        
+        # PDF 파일 직접 텍스트 추출 (HTML이 없는 경우)
+        elif doc.pdf_path:
+            try:
+                import os
+                import PyPDF2
+                
+                # 절대 경로 처리
+                if doc.pdf_path.startswith('/'):
+                    pdf_file_path = doc.pdf_path
+                else:
+                    pdf_file_path = os.path.join("/app/data/documents", doc.pdf_path)
+                
+                if os.path.exists(pdf_file_path):
+                    with open(pdf_file_path, 'rb') as f:
+                        pdf_reader = PyPDF2.PdfReader(f)
+                        text_pages = []
+                        
+                        # 모든 페이지에서 텍스트 추출
+                        for page_num in range(len(pdf_reader.pages)):
+                            page = pdf_reader.pages[page_num]
+                            page_text = page.extract_text()
+                            if page_text.strip():
+                                text_pages.append(f"[페이지 {page_num + 1}]\n{page_text}")
+                        
+                        text_content = "\n\n".join(text_pages)
+                        file_type = "PDF (직접추출)"
+                        
+            except Exception as e:
+                print(f"PDF 파일 읽기 오류 ({doc.pdf_path}): {e}")
+                continue
+        
+        # 검색어가 포함된 경우
+        if text_content and query.lower() in text_content.lower():
+            # 검색어 주변 컨텍스트 추출
+            context = extract_search_context(text_content, query, context_length=300)
+            
+            # 관련성 점수 계산
+            score = 2.0  # 본문 매치는 높은 점수
+            
+            # 검색어 매치 횟수로 점수 조정
+            match_count = text_content.lower().count(query.lower())
+            score += min(match_count * 0.1, 1.0)  # 최대 1점 추가
+            
+            search_results.append(SearchResult(
+                type="document_content",
+                id=str(doc.id),
+                title=f"📄 {doc.title} ({file_type} 본문)",
+                content=context,
+                document_id=str(doc.id),
+                document_title=doc.title,
+                created_at=doc.created_at,
+                relevance_score=score,
+                highlight_info={
+                    "file_type": file_type,
+                    "match_count": match_count,
+                    "has_pdf": bool(doc.pdf_path),
+                    "has_html": bool(doc.html_path)
+                }
+            ))
    
    return search_results

--- a/frontend/search.html
+++ b/frontend/search.html
@@ -281,6 +281,12 @@
                            <span x-show="getResultCount('highlight') > 0">
                                🖍️ 하이라이트 <strong x-text="getResultCount('highlight')"></strong>개
                            </span>
+                            <span x-show="getResultCount('highlight_note') > 0">
+                                💬 메모 <strong x-text="getResultCount('highlight_note')"></strong>개
+                            </span>
+                            <span x-show="getResultCount('document_content') > 0">
+                                📖 본문 <strong x-text="getResultCount('document_content')"></strong>개
+                            </span>
                        </div>
                    </div>
                    <div class="text-xs text-gray-500">
@@ -450,6 +456,39 @@
            
            <!-- 모달 내용 -->
            <div class="p-6 overflow-y-auto max-h-[60vh]">
+                <!-- PDF 미리보기 -->
+                <div x-show="previewResult?.type === 'document_content' && previewResult?.highlight_info?.has_pdf" 
+                     class="mb-4">
+                    <div class="flex items-center justify-between mb-3">
+                        <div class="text-sm font-medium text-gray-800">
+                            <i class="fas fa-file-pdf mr-2 text-red-600"></i>PDF 미리보기
+                        </div>
+                        <div class="flex items-center space-x-2">
+                            <button @click="searchInPdf()" 
+                                    class="px-3 py-1 bg-blue-600 text-white rounded text-xs hover:bg-blue-700">
+                                <i class="fas fa-search mr-1"></i>PDF에서 검색
+                            </button>
+                        </div>
+                    </div>
+                    <div class="border rounded-lg overflow-hidden bg-gray-100" style="height: 400px;">
+                        <!-- x-show only hides the element; the src binding is still evaluated, so it must not
+                             build a URL (".../undefined/pdf") when no PDF-backed result is being previewed. -->
+                        <iframe :src="previewResult?.type === 'document_content' && previewResult?.highlight_info?.has_pdf ? `/api/documents/${previewResult.document_id}/pdf` : 'about:blank'" 
+                                class="w-full h-full"
+                                x-show="!pdfError"
+                                @error="pdfError = true">
+                        </iframe>
+                        <div x-show="pdfError" class="flex items-center justify-center h-full text-gray-500">
+                            <div class="text-center">
+                                <i class="fas fa-exclamation-triangle text-2xl mb-2"></i>
+                                <p>PDF를 로드할 수 없습니다</p>
+                                <button @click="openResult(previewResult)" 
+                                        class="mt-2 px-3 py-1 bg-blue-600 text-white rounded text-sm">
+                                    뷰어에서 열기
+                                </button>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+                
                <!-- 하이라이트 정보 -->
                <div x-show="previewResult?.type === 'highlight' && previewResult?.highlight_info" 
                     class="mb-4 p-4 bg-yellow-50 border border-yellow-200 rounded-lg">
@@ -473,8 +512,23 @@
                    <div class="text-sm font-medium text-blue-800 mb-1">메모 내용:</div>
                </div>
                
+                <!-- 본문 검색 결과 정보 -->
+                <div x-show="previewResult?.type === 'document_content' && previewResult?.highlight_info" 
+                     class="mb-4 p-4 bg-gray-50 border border-gray-200 rounded-lg">
+                    <div class="flex items-center justify-between mb-2">
+                        <div class="text-sm font-medium text-gray-800">
+                            <i class="fas fa-search mr-2"></i>본문 검색 결과
+                        </div>
+                        <div class="text-xs text-gray-600">
+                            <span x-text="previewResult?.highlight_info?.file_type"></span>
+                            • <span x-text="previewResult?.highlight_info?.match_count"></span>개 매치
+                        </div>
+                    </div>
+                </div>
+                
                <!-- 본문 내용 -->
-                <div class="prose max-w-none">
+                <div x-show="!previewResult?.highlight_info?.has_pdf || previewResult?.type !== 'document_content'" 
+                     class="prose max-w-none">
                    <div class="text-gray-700 leading-relaxed" 
                         style="white-space: pre-wrap; word-wrap: break-word; max-height: 400px; overflow-y: auto;"
                         x-html="highlightText(previewResult?.content || '', searchQuery)"></div>
--- a/frontend/static/js/search.js
+++ b/frontend/static/js/search.js
@@ -24,6 +24,7 @@ window.searchApp = function() {
        showPreviewModal: false,
        previewResult: null,
        previewLoading: false,
+        pdfError: false,
        
        // 인증 상태
        isAuthenticated: false,
@@ -281,6 +282,39 @@ window.searchApp = function() {
            this.showPreviewModal = false;
            this.previewResult = null;
            this.previewLoading = false;
+            this.pdfError = false;
+        },
+
+        // Search the previewed document for the current query and open the
+        // viewer at the first match (adds &page= only when the hit is past page 1).
+        async searchInPdf() {
+            const ready = this.previewResult && this.searchQuery;
+            if (!ready) return;
+
+            try {
+                const endpoint = `/documents/${this.previewResult.document_id}/search-in-content?q=${encodeURIComponent(this.searchQuery)}`;
+                const response = await this.api.get(endpoint);
+
+                // Nothing found: tell the user and keep the preview open
+                if (!(response.total_matches > 0)) {
+                    alert('PDF에서 검색 결과를 찾을 수 없습니다.');
+                    return;
+                }
+
+                const topHit = response.matches[0];
+                const pageParam = topHit.page > 1 ? `&page=${topHit.page}` : '';
+
+                // The search parameter lets the viewer highlight the query
+                const target = `/viewer.html?id=${this.previewResult.document_id}${pageParam}&search=${encodeURIComponent(this.searchQuery)}`;
+
+                window.open(target, '_blank');
+                this.closePreview();
+            } catch (error) {
+                console.error('PDF 검색 실패:', error);
+                alert('PDF 검색 중 오류가 발생했습니다.');
+            }
        },

        // 검색 결과 열기