From 4b65d4558442086d85b21b834b8c3b417d79b68e Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Tue, 2 Sep 2025 17:09:32 +0900 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=84=20PDF=20=EB=B3=B8=EB=AC=B8=20?= =?UTF-8?q?=EA=B2=80=EC=83=89=20=EB=B0=8F=20=EB=AF=B8=EB=A6=AC=EB=B3=B4?= =?UTF-8?q?=EA=B8=B0=20=EC=99=84=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ๐Ÿ” PDF/HTML ๋ณธ๋ฌธ ๊ฒ€์ƒ‰ ๊ฐœ์„ : - PDF OCR ๋ฐ์ดํ„ฐ ์ „์ฒด ํ…์ŠคํŠธ ๊ฒ€์ƒ‰ (BeautifulSoup + PyPDF2) - ์„œ์  HTML ํŒŒ์ผ ๋ณธ๋ฌธ ๊ฒ€์ƒ‰ ์ง€์› - ํŒŒ์ผ ํƒ€์ž… ๊ตฌ๋ถ„ (PDF/HTML/PDF์ง์ ‘์ถ”์ถœ) - ๊ฒ€์ƒ‰์–ด ๋งค์น˜ ํšŸ์ˆ˜ ๊ธฐ๋ฐ˜ ๊ด€๋ จ์„ฑ ์ ์ˆ˜ - ์ ˆ๋Œ€/์ƒ๋Œ€ ๊ฒฝ๋กœ ์ฒ˜๋ฆฌ ๊ฐœ์„  ๐Ÿ“ฑ PDF ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๊ธฐ๋Šฅ: - ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ PDF ์ง์ ‘ ๋ฏธ๋ฆฌ๋ณด๊ธฐ (iframe) - PDF์—์„œ ๊ฒ€์ƒ‰ ๋ฒ„ํŠผ์œผ๋กœ ํŽ˜์ด์ง€ ์ด๋™ - ๊ฒ€์ƒ‰์–ด ์œ„์น˜ ๊ธฐ๋ฐ˜ ๋ทฐ์–ด ์—ฐ๋™ - PDF ๋กœ๋“œ ์‹คํŒจ ์‹œ fallback UI ๐ŸŽฏ ๋ฐฑ์—”๋“œ API ์ถ”๊ฐ€: - GET /documents/{id}/pdf: PDF ํŒŒ์ผ ์ง์ ‘ ์ œ๊ณต - GET /documents/{id}/search-in-content: ๋ฌธ์„œ ๋‚ด ๊ฒ€์ƒ‰ - ํŽ˜์ด์ง€๋ณ„ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ฐ ์ปจํ…์ŠคํŠธ ์ œ๊ณต - ๊ถŒํ•œ ํ™•์ธ ๋ฐ ์—๋Ÿฌ ์ฒ˜๋ฆฌ ๐ŸŽจ ํ”„๋ก ํŠธ์—”๋“œ UX: - PDF/HTML ํƒ€์ž…๋ณ„ ๋ฐฐ์ง€ ํ‘œ์‹œ - ๊ฒ€์ƒ‰ ํ†ต๊ณ„์— ๋ณธ๋ฌธ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ํฌํ•จ - ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๋ชจ๋‹ฌ์—์„œ PDF ๋ทฐ์–ด ํ†ตํ•ฉ - ๊ฒ€์ƒ‰์–ด ํ•˜์ด๋ผ์ดํŠธ ๋ฐ ์ปจํ…์ŠคํŠธ ํ‘œ์‹œ --- backend/src/api/routes/documents.py | 167 ++++++++++++++++++++++++++++ backend/src/api/routes/search.py | 96 ++++++++++++---- frontend/search.html | 56 +++++++++- frontend/static/js/search.js | 34 ++++++ 4 files changed, 332 insertions(+), 21 deletions(-) diff --git a/backend/src/api/routes/documents.py b/backend/src/api/routes/documents.py index ee9b0da..5c597e8 100644 --- a/backend/src/api/routes/documents.py +++ b/backend/src/api/routes/documents.py @@ -507,6 +507,173 @@ async def get_document_content( raise HTTPException(status_code=500, detail=f"Error reading document: {str(e)}") +@router.get("/{document_id}/pdf") +async def get_document_pdf( + 
document_id: str, + current_user: User = Depends(get_current_active_user), + db: AsyncSession = Depends(get_db) +): + """๋ฌธ์„œ PDF ํŒŒ์ผ ์กฐํšŒ""" + try: + doc_uuid = UUID(document_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid document ID format") + + # ๋ฌธ์„œ ์กฐํšŒ + query = select(Document).where(Document.id == doc_uuid) + result = await db.execute(query) + document = result.scalar_one_or_none() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + # ๊ถŒํ•œ ํ™•์ธ + if not current_user.is_admin and not document.is_public and document.uploaded_by != current_user.id: + raise HTTPException(status_code=403, detail="Access denied") + + # PDF ํŒŒ์ผ ํ™•์ธ + if not document.pdf_path: + raise HTTPException(status_code=404, detail="PDF file not found for this document") + + # PDF ํŒŒ์ผ ๊ฒฝ๋กœ ์ฒ˜๋ฆฌ + import os + from fastapi.responses import FileResponse + + if document.pdf_path.startswith('/'): + file_path = document.pdf_path + else: + file_path = os.path.join("/app/data/documents", document.pdf_path) + + if not os.path.exists(file_path): + raise HTTPException(status_code=404, detail="PDF file not found on disk") + + return FileResponse( + path=file_path, + media_type='application/pdf', + filename=f"{document.title}.pdf" + ) + + +@router.get("/{document_id}/search-in-content") +async def search_in_document_content( + document_id: str, + q: str, + current_user: User = Depends(get_current_active_user), + db: AsyncSession = Depends(get_db) +): + """ํŠน์ • ๋ฌธ์„œ ๋‚ด์—์„œ ํ…์ŠคํŠธ ๊ฒ€์ƒ‰ ๋ฐ ํŽ˜์ด์ง€ ์œ„์น˜ ๋ฐ˜ํ™˜""" + try: + doc_uuid = UUID(document_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid document ID format") + + # ๋ฌธ์„œ ์กฐํšŒ + query = select(Document).where(Document.id == doc_uuid) + result = await db.execute(query) + document = result.scalar_one_or_none() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + # 
๊ถŒํ•œ ํ™•์ธ + if not current_user.is_admin and not document.is_public and document.uploaded_by != current_user.id: + raise HTTPException(status_code=403, detail="Access denied") + + search_results = [] + + # HTML ํŒŒ์ผ์—์„œ ๊ฒ€์ƒ‰ (OCR ๊ฒฐ๊ณผ) + if document.html_path: + try: + import os + from bs4 import BeautifulSoup + import re + + # ์ ˆ๋Œ€ ๊ฒฝ๋กœ ์ฒ˜๋ฆฌ + if document.html_path.startswith('/'): + html_file_path = document.html_path + else: + html_file_path = os.path.join("/app/data/documents", document.html_path) + + if os.path.exists(html_file_path): + with open(html_file_path, 'r', encoding='utf-8') as f: + html_content = f.read() + + # HTML์—์„œ ํŽ˜์ด์ง€๋ณ„๋กœ ๊ฒ€์ƒ‰ + soup = BeautifulSoup(html_content, 'html.parser') + + # ํŽ˜์ด์ง€ ๊ตฌ๋ถ„์ž ์ฐพ๊ธฐ (OCR ๊ฒฐ๊ณผ์—์„œ ํŽ˜์ด์ง€ ์ •๋ณด) + pages = soup.find_all(['div', 'section'], class_=re.compile(r'page|Page')) + + if not pages: + # ํŽ˜์ด์ง€ ๊ตฌ๋ถ„์ด ์—†์œผ๋ฉด ์ „์ฒด ํ…์ŠคํŠธ์—์„œ ๊ฒ€์ƒ‰ + text_content = soup.get_text() + matches = [] + start = 0 + while True: + pos = text_content.lower().find(q.lower(), start) + if pos == -1: + break + + # ์ปจํ…์ŠคํŠธ ์ถ”์ถœ + context_start = max(0, pos - 100) + context_end = min(len(text_content), pos + len(q) + 100) + context = text_content[context_start:context_end] + + matches.append({ + "page": 1, + "position": pos, + "context": context, + "match_text": text_content[pos:pos + len(q)] + }) + + start = pos + 1 + if len(matches) >= 10: # ์ตœ๋Œ€ 10๊ฐœ ๊ฒฐ๊ณผ + break + + search_results.extend(matches) + else: + # ํŽ˜์ด์ง€๋ณ„๋กœ ๊ฒ€์ƒ‰ + for page_num, page_elem in enumerate(pages, 1): + page_text = page_elem.get_text() + matches = [] + start = 0 + + while True: + pos = page_text.lower().find(q.lower(), start) + if pos == -1: + break + + # ์ปจํ…์ŠคํŠธ ์ถ”์ถœ + context_start = max(0, pos - 100) + context_end = min(len(page_text), pos + len(q) + 100) + context = page_text[context_start:context_end] + + matches.append({ + "page": page_num, + "position": pos, + 
"context": context, + "match_text": page_text[pos:pos + len(q)] + }) + + start = pos + 1 + if len(matches) >= 5: # ํŽ˜์ด์ง€๋‹น ์ตœ๋Œ€ 5๊ฐœ + break + + search_results.extend(matches) + + except Exception as e: + print(f"HTML ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {e}") + + return { + "document_id": document_id, + "query": q, + "total_matches": len(search_results), + "matches": search_results[:20], # ์ตœ๋Œ€ 20๊ฐœ ๊ฒฐ๊ณผ + "has_pdf": bool(document.pdf_path), + "has_html": bool(document.html_path) + } + + class UpdateDocumentRequest(BaseModel): """๋ฌธ์„œ ์—…๋ฐ์ดํŠธ ์š”์ฒญ""" title: Optional[str] = None diff --git a/backend/src/api/routes/search.py b/backend/src/api/routes/search.py index 149e50a..bdc9d15 100644 --- a/backend/src/api/routes/search.py +++ b/backend/src/api/routes/search.py @@ -547,13 +547,21 @@ async def search_document_content( search_results = [] for doc in documents: - # HTML ํŒŒ์ผ์—์„œ ํ…์ŠคํŠธ ๊ฒ€์ƒ‰ + text_content = "" + file_type = "" + + # HTML ํŒŒ์ผ์—์„œ ํ…์ŠคํŠธ ๊ฒ€์ƒ‰ (PDF OCR ๊ฒฐ๊ณผ ๋˜๋Š” ์„œ์  HTML) if doc.html_path: try: import os from bs4 import BeautifulSoup - html_file_path = os.path.join("/app/data/documents", doc.html_path) + # ์ ˆ๋Œ€ ๊ฒฝ๋กœ ์ฒ˜๋ฆฌ + if doc.html_path.startswith('/'): + html_file_path = doc.html_path + else: + html_file_path = os.path.join("/app/data/documents", doc.html_path) + if os.path.exists(html_file_path): with open(html_file_path, 'r', encoding='utf-8') as f: html_content = f.read() @@ -562,27 +570,75 @@ async def search_document_content( soup = BeautifulSoup(html_content, 'html.parser') text_content = soup.get_text() - # ๊ฒ€์ƒ‰์–ด๊ฐ€ ํฌํ•จ๋œ ๊ฒฝ์šฐ - if query.lower() in text_content.lower(): - # ๊ฒ€์ƒ‰์–ด ์ฃผ๋ณ€ ์ปจํ…์ŠคํŠธ ์ถ”์ถœ - context = extract_search_context(text_content, query) + # PDF์ธ์ง€ ์„œ์ ์ธ์ง€ ๊ตฌ๋ถ„ + if doc.pdf_path: + file_type = "PDF" + else: + file_type = "HTML" - # ๊ด€๋ จ์„ฑ ์ ์ˆ˜ ๊ณ„์‚ฐ - score = 2.0 # ๋ณธ๋ฌธ ๋งค์น˜๋Š” ๋†’์€ ์ ์ˆ˜ - - search_results.append(SearchResult( - 
type="document_content", - id=str(doc.id), - title=f"๐Ÿ“„ {doc.title} (๋ณธ๋ฌธ)", - content=context, - document_id=str(doc.id), - document_title=doc.title, - created_at=doc.created_at, - relevance_score=score - )) except Exception as e: - print(f"๋ฌธ์„œ ๋ณธ๋ฌธ ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {e}") + print(f"HTML ํŒŒ์ผ ์ฝ๊ธฐ ์˜ค๋ฅ˜ ({doc.html_path}): {e}") continue + + # PDF ํŒŒ์ผ ์ง์ ‘ ํ…์ŠคํŠธ ์ถ”์ถœ (HTML์ด ์—†๋Š” ๊ฒฝ์šฐ) + elif doc.pdf_path: + try: + import os + import PyPDF2 + + # ์ ˆ๋Œ€ ๊ฒฝ๋กœ ์ฒ˜๋ฆฌ + if doc.pdf_path.startswith('/'): + pdf_file_path = doc.pdf_path + else: + pdf_file_path = os.path.join("/app/data/documents", doc.pdf_path) + + if os.path.exists(pdf_file_path): + with open(pdf_file_path, 'rb') as f: + pdf_reader = PyPDF2.PdfReader(f) + text_pages = [] + + # ๋ชจ๋“  ํŽ˜์ด์ง€์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + page_text = page.extract_text() + if page_text.strip(): + text_pages.append(f"[ํŽ˜์ด์ง€ {page_num + 1}]\n{page_text}") + + text_content = "\n\n".join(text_pages) + file_type = "PDF (์ง์ ‘์ถ”์ถœ)" + + except Exception as e: + print(f"PDF ํŒŒ์ผ ์ฝ๊ธฐ ์˜ค๋ฅ˜ ({doc.pdf_path}): {e}") + continue + + # ๊ฒ€์ƒ‰์–ด๊ฐ€ ํฌํ•จ๋œ ๊ฒฝ์šฐ + if text_content and query.lower() in text_content.lower(): + # ๊ฒ€์ƒ‰์–ด ์ฃผ๋ณ€ ์ปจํ…์ŠคํŠธ ์ถ”์ถœ + context = extract_search_context(text_content, query, context_length=300) + + # ๊ด€๋ จ์„ฑ ์ ์ˆ˜ ๊ณ„์‚ฐ + score = 2.0 # ๋ณธ๋ฌธ ๋งค์น˜๋Š” ๋†’์€ ์ ์ˆ˜ + + # ๊ฒ€์ƒ‰์–ด ๋งค์น˜ ํšŸ์ˆ˜๋กœ ์ ์ˆ˜ ์กฐ์ • + match_count = text_content.lower().count(query.lower()) + score += min(match_count * 0.1, 1.0) # ์ตœ๋Œ€ 1์  ์ถ”๊ฐ€ + + search_results.append(SearchResult( + type="document_content", + id=str(doc.id), + title=f"๐Ÿ“„ {doc.title} ({file_type} ๋ณธ๋ฌธ)", + content=context, + document_id=str(doc.id), + document_title=doc.title, + created_at=doc.created_at, + relevance_score=score, + highlight_info={ + "file_type": file_type, + "match_count": match_count, + 
"has_pdf": bool(doc.pdf_path), + "has_html": bool(doc.html_path) + } + )) return search_results diff --git a/frontend/search.html b/frontend/search.html index 802aae6..d5a178c 100644 --- a/frontend/search.html +++ b/frontend/search.html @@ -281,6 +281,12 @@ ๐Ÿ–๏ธ ํ•˜์ด๋ผ์ดํŠธ ๊ฐœ + + ๐Ÿ’ฌ ๋ฉ”๋ชจ ๊ฐœ + + + ๐Ÿ“– ๋ณธ๋ฌธ ๊ฐœ +
@@ -450,6 +456,39 @@
+ +
+
+
+ PDF 미리보기
+
+ +
+
+
+ +
+
+ +

PDF를 로드할 수 없습니다

+ +
+
+
+
+
@@ -473,8 +512,23 @@
메모 내용:
+ +
+
+
+ 본문 검색 결과
+
+ + • 개 매치
+
+
+ -
+
diff --git a/frontend/static/js/search.js b/frontend/static/js/search.js index 6ea1008..00b931d 100644 --- a/frontend/static/js/search.js +++ b/frontend/static/js/search.js @@ -24,6 +24,7 @@ window.searchApp = function() { showPreviewModal: false, previewResult: null, previewLoading: false, + pdfError: false, // ์ธ์ฆ ์ƒํƒœ isAuthenticated: false, @@ -281,6 +282,39 @@ window.searchApp = function() { this.showPreviewModal = false; this.previewResult = null; this.previewLoading = false; + this.pdfError = false; + }, + + // PDF์—์„œ ๊ฒ€์ƒ‰ + async searchInPdf() { + if (!this.previewResult || !this.searchQuery) return; + + try { + const searchResults = await this.api.get( + `/documents/${this.previewResult.document_id}/search-in-content?q=${encodeURIComponent(this.searchQuery)}` + ); + + if (searchResults.total_matches > 0) { + // ์ฒซ ๋ฒˆ์งธ ๋งค์น˜๋กœ ์ด๋™ํ•˜์—ฌ ๋ทฐ์–ด์—์„œ ์—ด๊ธฐ + const firstMatch = searchResults.matches[0]; + let url = `/viewer.html?id=${this.previewResult.document_id}`; + + if (firstMatch.page > 1) { + url += `&page=${firstMatch.page}`; + } + + // ๊ฒ€์ƒ‰์–ด ํ•˜์ด๋ผ์ดํŠธ๋ฅผ ์œ„ํ•œ ํŒŒ๋ผ๋ฏธํ„ฐ ์ถ”๊ฐ€ + url += `&search=${encodeURIComponent(this.searchQuery)}`; + + window.open(url, '_blank'); + this.closePreview(); + } else { + alert('PDF์—์„œ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.'); + } + } catch (error) { + console.error('PDF ๊ฒ€์ƒ‰ ์‹คํŒจ:', error); + alert('PDF ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค.'); + } }, // ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์—ด๊ธฐ