From a5186bf4aa8f039586cd7d72d96d1ecc3b3e86c6 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Fri, 3 Apr 2026 11:21:29 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20=EC=8A=A4=ED=94=84=EB=A0=88=EB=93=9C?= =?UTF-8?q?=EC=8B=9C=ED=8A=B8=20=ED=85=8D=EC=8A=A4=ED=8A=B8=20=EC=B6=94?= =?UTF-8?q?=EC=B6=9C=20=E2=80=94=20csv=20=ED=95=84=ED=84=B0=20=EC=82=AC?= =?UTF-8?q?=EC=9A=A9=20(txt:Text=EB=8A=94=20Calc=20=EB=AF=B8=EC=A7=80?= =?UTF-8?q?=EC=9B=90)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- app/workers/extract_worker.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/app/workers/extract_worker.py b/app/workers/extract_worker.py index 9bf29a2..6f9db5d 100644 --- a/app/workers/extract_worker.py +++ b/app/workers/extract_worker.py @@ -89,21 +89,30 @@ async def process(document_id: int, session: AsyncSession) -> None: tmp_input = tmp_dir / f"input_{document_id}.{fmt}" shutil.copy2(str(full_path), str(tmp_input)) + # 스프레드시트는 csv, 나머지는 txt + CALC_FORMATS = {"xlsx", "xls", "ods", "osheet"} + if fmt in CALC_FORMATS: + convert_to = "csv:Text - txt - csv (StarCalc):44,34,76,1" + out_ext = "csv" + else: + convert_to = "txt:Text" + out_ext = "txt" + try: result = subprocess.run( - ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", str(tmp_dir), str(tmp_input)], + ["libreoffice", "--headless", "--convert-to", convert_to, "--outdir", str(tmp_dir), str(tmp_input)], capture_output=True, text=True, timeout=60, ) - txt_file = tmp_dir / f"input_{document_id}.txt" - if txt_file.exists(): - text = txt_file.read_text(encoding="utf-8", errors="replace") + out_file = tmp_dir / f"input_{document_id}.{out_ext}" + if out_file.exists(): + text = out_file.read_text(encoding="utf-8", errors="replace") doc.extracted_text = text[:15000] doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = "libreoffice" - txt_file.unlink() + out_file.unlink() logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)") else: - raise RuntimeError(f"LibreOffice 변환 결과물 없음: {result.stderr[:200]}") + raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:300]}") except subprocess.TimeoutExpired: raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)") finally: