diff --git a/app/workers/extract_worker.py b/app/workers/extract_worker.py index 9bf29a2..6f9db5d 100644 --- a/app/workers/extract_worker.py +++ b/app/workers/extract_worker.py @@ -89,21 +89,30 @@ async def process(document_id: int, session: AsyncSession) -> None: tmp_input = tmp_dir / f"input_{document_id}.{fmt}" shutil.copy2(str(full_path), str(tmp_input)) + # 스프레드시트는 csv, 나머지는 txt + CALC_FORMATS = {"xlsx", "xls", "ods", "osheet"} + if fmt in CALC_FORMATS: + convert_to = "csv:Text - txt - csv (StarCalc):44,34,76,1" + out_ext = "csv" + else: + convert_to = "txt:Text" + out_ext = "txt" + try: result = subprocess.run( - ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", str(tmp_dir), str(tmp_input)], + ["libreoffice", "--headless", "--convert-to", convert_to, "--outdir", str(tmp_dir), str(tmp_input)], capture_output=True, text=True, timeout=60, ) - txt_file = tmp_dir / f"input_{document_id}.txt" - if txt_file.exists(): - text = txt_file.read_text(encoding="utf-8", errors="replace") + out_file = tmp_dir / f"input_{document_id}.{out_ext}" + if out_file.exists(): + text = out_file.read_text(encoding="utf-8", errors="replace") doc.extracted_text = text[:15000] doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = "libreoffice" - txt_file.unlink() + out_file.unlink() logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)") else: - raise RuntimeError(f"LibreOffice 변환 결과물 없음: {result.stderr[:200]}") + raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:300]}") except subprocess.TimeoutExpired: raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)") finally: