From 45448b4036887e7e115e7bd20abb6071211cb69b Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Fri, 3 Apr 2026 11:12:19 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20extract=5Fworker=EC=97=90=20LibreOffice?= =?UTF-8?q?=20=ED=85=8D=EC=8A=A4=ED=8A=B8=20=EC=B6=94=EC=B6=9C=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80=20(=EC=98=A4=ED=94=BC=EC=8A=A4=20=ED=8F=AC=EB=A7=B7)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - xlsx, docx, pptx, odt, ods, odp, odoc, osheet 지원 - LibreOffice --convert-to txt로 텍스트 추출 (60s timeout) - 추가 의존성 없음 (Docker에 이미 설치된 LibreOffice 사용) Co-Authored-By: Claude Opus 4.6 (1M context) --- app/workers/extract_worker.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/app/workers/extract_worker.py b/app/workers/extract_worker.py index 9bf38f6..9a6e32f 100644 --- a/app/workers/extract_worker.py +++ b/app/workers/extract_worker.py @@ -1,5 +1,6 @@ -"""텍스트 추출 워커 — kordoc 호출 또는 직접 파일 읽기""" +"""텍스트 추출 워커 — kordoc / LibreOffice / 직접 읽기""" +import subprocess from datetime import datetime, timezone from pathlib import Path @@ -16,6 +17,8 @@ logger = setup_logger("extract_worker") KORDOC_FORMATS = {"hwp", "hwpx", "pdf"} # 직접 읽기 가능한 텍스트 포맷 TEXT_FORMATS = {"md", "txt", "csv", "json", "xml", "html"} +# LibreOffice로 텍스트 추출 가능한 포맷 +OFFICE_FORMATS = {"xlsx", "xls", "docx", "doc", "pptx", "ppt", "odt", "ods", "odp", "odoc", "osheet"} # OCR 필요 이미지 포맷 (Phase 2) IMAGE_FORMATS = {"jpg", "jpeg", "png", "tiff", "tif", "bmp", "gif"} @@ -73,6 +76,33 @@ async def process(document_id: int, session: AsyncSession) -> None: logger.info(f"[kordoc] {doc.file_path} ({len(doc.extracted_text)}자)") return + # 오피스 포맷 — LibreOffice 텍스트 변환 + if fmt in OFFICE_FORMATS: + if not full_path.exists(): + raise FileNotFoundError(f"파일 없음: {full_path}") + + tmp_dir = Path("/tmp/extract_work") + tmp_dir.mkdir(exist_ok=True) + + try: + result = subprocess.run( + ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", str(tmp_dir), str(full_path)], + capture_output=True, text=True, timeout=60, + ) + txt_file = tmp_dir / f"{full_path.stem}.txt" + if txt_file.exists(): + text = txt_file.read_text(encoding="utf-8", errors="replace") + doc.extracted_text = text[:15000] + doc.extracted_at = datetime.now(timezone.utc) + doc.extractor_version = "libreoffice" + txt_file.unlink() + logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)") + return + else: + raise RuntimeError(f"LibreOffice 변환 결과물 없음: {result.stderr[:200]}") + except subprocess.TimeoutExpired: + raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)") + # 미지원 포맷 doc.extracted_text = "" doc.extracted_at = datetime.now(timezone.utc)