fix: 스프레드시트 텍스트 추출 — csv 필터 사용 (txt:Text는 Calc 미지원)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 11:21:29 +09:00
parent b37043d651
commit a5186bf4aa
1 changed files with 15 additions and 6 deletions
@@ -89,21 +89,30 @@ async def process(document_id: int, session: AsyncSession) -> None:
        tmp_input = tmp_dir / f"input_{document_id}.{fmt}"
        shutil.copy2(str(full_path), str(tmp_input))

+        # 스프레드시트는 csv, 나머지는 txt
+        CALC_FORMATS = {"xlsx", "xls", "ods", "osheet"}
+        if fmt in CALC_FORMATS:
+            convert_to = "csv:Text - txt - csv (StarCalc):44,34,76,1"
+            out_ext = "csv"
+        else:
+            convert_to = "txt:Text"
+            out_ext = "txt"
+
        try:
            result = subprocess.run(
-                ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", str(tmp_dir), str(tmp_input)],
+                ["libreoffice", "--headless", "--convert-to", convert_to, "--outdir", str(tmp_dir), str(tmp_input)],
                capture_output=True, text=True, timeout=60,
            )
-            txt_file = tmp_dir / f"input_{document_id}.txt"
-            if txt_file.exists():
-                text = txt_file.read_text(encoding="utf-8", errors="replace")
+            out_file = tmp_dir / f"input_{document_id}.{out_ext}"
+            if out_file.exists():
+                text = out_file.read_text(encoding="utf-8", errors="replace")
                doc.extracted_text = text[:15000]
                doc.extracted_at = datetime.now(timezone.utc)
                doc.extractor_version = "libreoffice"
-                txt_file.unlink()
+                out_file.unlink()
                logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)")
            else:
-                raise RuntimeError(f"LibreOffice 변환 결과물 없음: {result.stderr[:200]}")
+                raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:300]}")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)")
        finally: