fix: 스프레드시트 텍스트 추출 — csv 필터 사용 (txt:Text는 Calc 미지원)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -89,21 +89,30 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
tmp_input = tmp_dir / f"input_{document_id}.{fmt}"
|
||||
shutil.copy2(str(full_path), str(tmp_input))
|
||||
|
||||
# 스프레드시트는 csv, 나머지는 txt
|
||||
CALC_FORMATS = {"xlsx", "xls", "ods", "osheet"}
|
||||
if fmt in CALC_FORMATS:
|
||||
convert_to = "csv:Text - txt - csv (StarCalc):44,34,76,1"
|
||||
out_ext = "csv"
|
||||
else:
|
||||
convert_to = "txt:Text"
|
||||
out_ext = "txt"
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", str(tmp_dir), str(tmp_input)],
|
||||
["libreoffice", "--headless", "--convert-to", convert_to, "--outdir", str(tmp_dir), str(tmp_input)],
|
||||
capture_output=True, text=True, timeout=60,
|
||||
)
|
||||
txt_file = tmp_dir / f"input_{document_id}.txt"
|
||||
if txt_file.exists():
|
||||
text = txt_file.read_text(encoding="utf-8", errors="replace")
|
||||
out_file = tmp_dir / f"input_{document_id}.{out_ext}"
|
||||
if out_file.exists():
|
||||
text = out_file.read_text(encoding="utf-8", errors="replace")
|
||||
doc.extracted_text = text[:15000]
|
||||
doc.extracted_at = datetime.now(timezone.utc)
|
||||
doc.extractor_version = "libreoffice"
|
||||
txt_file.unlink()
|
||||
out_file.unlink()
|
||||
logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)")
|
||||
else:
|
||||
raise RuntimeError(f"LibreOffice 변환 결과물 없음: {result.stderr[:200]}")
|
||||
raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:300]}")
|
||||
except subprocess.TimeoutExpired:
|
||||
raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)")
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user