fix: 스프레드시트 텍스트 추출 — csv 필터 사용 (txt:Text는 Calc 미지원)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -89,21 +89,30 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
|||||||
tmp_input = tmp_dir / f"input_{document_id}.{fmt}"
|
tmp_input = tmp_dir / f"input_{document_id}.{fmt}"
|
||||||
shutil.copy2(str(full_path), str(tmp_input))
|
shutil.copy2(str(full_path), str(tmp_input))
|
||||||
|
|
||||||
|
# 스프레드시트는 csv, 나머지는 txt
|
||||||
|
CALC_FORMATS = {"xlsx", "xls", "ods", "osheet"}
|
||||||
|
if fmt in CALC_FORMATS:
|
||||||
|
convert_to = "csv:Text - txt - csv (StarCalc):44,34,76,1"
|
||||||
|
out_ext = "csv"
|
||||||
|
else:
|
||||||
|
convert_to = "txt:Text"
|
||||||
|
out_ext = "txt"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", str(tmp_dir), str(tmp_input)],
|
["libreoffice", "--headless", "--convert-to", convert_to, "--outdir", str(tmp_dir), str(tmp_input)],
|
||||||
capture_output=True, text=True, timeout=60,
|
capture_output=True, text=True, timeout=60,
|
||||||
)
|
)
|
||||||
txt_file = tmp_dir / f"input_{document_id}.txt"
|
out_file = tmp_dir / f"input_{document_id}.{out_ext}"
|
||||||
if txt_file.exists():
|
if out_file.exists():
|
||||||
text = txt_file.read_text(encoding="utf-8", errors="replace")
|
text = out_file.read_text(encoding="utf-8", errors="replace")
|
||||||
doc.extracted_text = text[:15000]
|
doc.extracted_text = text[:15000]
|
||||||
doc.extracted_at = datetime.now(timezone.utc)
|
doc.extracted_at = datetime.now(timezone.utc)
|
||||||
doc.extractor_version = "libreoffice"
|
doc.extractor_version = "libreoffice"
|
||||||
txt_file.unlink()
|
out_file.unlink()
|
||||||
logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)")
|
logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)")
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f"LibreOffice 변환 결과물 없음: {result.stderr[:200]}")
|
raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:300]}")
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)")
|
raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)")
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
Reference in New Issue
Block a user