"""텍스트 추출 워커 — kordoc / PyMuPDF / Surya OCR / LibreOffice / 직접 읽기 / 웹 HTML""" import hashlib import re import subprocess from datetime import datetime, timezone from pathlib import Path import httpx from sqlalchemy.ext.asyncio import AsyncSession from core.config import settings from core.utils import setup_logger from models.document import Document logger = setup_logger("extract_worker") # kordoc으로 파싱 가능한 포맷 KORDOC_FORMATS = {"hwp", "hwpx", "pdf"} # 직접 읽기 가능한 텍스트 포맷 TEXT_FORMATS = {"md", "txt", "csv", "json", "xml", "html"} # LibreOffice로 텍스트 추출 가능한 포맷 OFFICE_FORMATS = {"xlsx", "xls", "docx", "doc", "pptx", "ppt", "odt", "ods", "odp", "odoc", "osheet"} # OCR 대상 이미지 포맷 IMAGE_FORMATS = {"jpg", "jpeg", "png", "tiff", "tif", "bmp", "gif", "webp"} EXTRACTOR_VERSION = "kordoc@1.7" PYMUPDF_VERSION = "pymupdf" # ─── OCR 판정 함수 ─── def _should_ocr(text: str, page_count: int) -> tuple[bool, str]: """텍스트 추출 결과로 OCR 필요 여부 판정 — 2단계""" total = len(text.strip()) if total < 300: return True, "no_text_layer" avg = total / max(page_count, 1) if avg < 80 and total < 3000: return True, "low_text_density" return False, "" def _ocr_skip_reason(file_size: int, page_count: int) -> str | None: """OCR 상한 체크""" if page_count > 200: return "page_limit" if file_size > 150 * 1024 * 1024: return "size_limit" return None def _ocr_quality_ok(text: str, page_count: int, is_image: bool) -> bool: """OCR 결과 품질 검증 — 유형별 차등""" chars = len(text.strip()) if is_image: return chars >= 50 if page_count > 0: return chars >= 200 or (chars / max(page_count, 1)) >= 30 return chars >= 200 def _postprocess_ocr(text: str) -> str: """OCR 후처리 — NUL 제거 + 과도한 공백 정리""" text = text.replace("\x00", "") text = re.sub(r'\s{3,}', '\n', text) return text.strip() def _extract_pdf_pymupdf( file_path: Path, start_page: int | None = None, end_page: int | None = None ) -> str: """PyMuPDF fallback — 페이지 단위 스트리밍으로 대형 PDF도 저메모리 처리. G2 (PR-G2-2): start_page/end_page(1-based inclusive) 가 주어지면 그 범위만 추출 (번들 자식 doc = 부모 파일 공유 + 자기 page 범위). 둘 다 None = 전체(기존 동작 동일). """ import fitz text_parts = [] with fitz.open(str(file_path)) as doc: if start_page is None and end_page is None: for page in doc: text_parts.append(page.get_text()) else: # 1-based inclusive → 0-based range. 범위는 [0, page_count] 로 클램프(방어). total = doc.page_count lo = max(1, start_page or 1) - 1 hi = min(total, end_page or total) # inclusive 끝 (0-based 마지막 인덱스 = hi-1) for i in range(lo, hi): text_parts.append(doc.load_page(i).get_text()) return "\n".join(text_parts) def _get_pdf_page_count( file_path: Path, start_page: int | None = None, end_page: int | None = None ) -> int: """PDF 페이지 수 확인. G2: 범위가 주어지면 그 범위의 페이지 수(자식 doc 밀도 계산용). 둘 다 None = 전체 페이지 수(기존 동작 동일). """ import fitz with fitz.open(str(file_path)) as doc: total = len(doc) if start_page is None and end_page is None: return total lo = max(1, start_page or 1) hi = min(total, end_page or total) return max(0, hi - lo + 1) async def _call_ocr(file_path: Path, is_image: bool, max_pages: int = 200) -> str | None: """OCR 서비스 호출 — 타임아웃 페이지 수 비례""" container_path = f"/documents/{file_path.relative_to(Path(settings.nas_mount_path))}" timeout = 60 if is_image else min(600, max(120, max_pages * 3)) try: async with httpx.AsyncClient(timeout=timeout) as client: resp = await client.post( f"{settings.ocr_endpoint}/ocr", json={"filePath": container_path, "langs": ["ko", "en"], "maxPages": max_pages}, ) if resp.status_code == 200: data = resp.json() return data.get("text", "") except Exception as e: logger.error(f"[ocr] OCR 서비스 호출 실패: {e}") return None # ─── Web/Blog ingest (devonagent 트랙) — HTML → markdown 4-tier ──────────── _WEB_MIN_BODY_LEN = 200 # 4-tier fallback 전환 임계 def _extract_web_with_trafilatura(html: str) -> tuple[str, str | None]: """trafilatura 로 본문 markdown 추출. (body, engine_version) 반환. 실패 시 ("", None).""" try: import trafilatura except ImportError: logger.warning("[web] trafilatura 미설치 — 다음 fallback 시도") return "", None try: body = trafilatura.extract( html, output_format="markdown", include_comments=False, include_tables=True, with_metadata=True, deduplicate=True, favor_precision=True, ) return (body or "", getattr(trafilatura, "__version__", "unknown")) except Exception as e: logger.warning(f"[web] trafilatura 실패: {e}") return "", None def _extract_web_with_readability(html: str) -> tuple[str, str | None]: """readability-lxml 로 본문 추출 + markdownify 로 markdown 변환.""" try: from readability import Document as ReadabilityDocument from markdownify import markdownify except ImportError: logger.warning("[web] readability/markdownify 미설치 — 다음 fallback 시도") return "", None try: rd = ReadabilityDocument(html) body_html = rd.summary() or "" if not body_html: return "", None body_md = markdownify(body_html, heading_style="ATX") return (body_md or "", "readability+markdownify") except Exception as e: logger.warning(f"[web] readability 실패: {e}") return "", None def _extract_web_with_bs4(html: str) -> tuple[str, str | None]: """최종 fallback — BeautifulSoup 으로 script/style 제거 후 get_text.""" try: from bs4 import BeautifulSoup except ImportError: logger.warning("[web] beautifulsoup4 미설치 — 빈 본문 반환") return "", None try: soup = BeautifulSoup(html, "lxml") for tag in soup(["script", "style", "noscript", "nav", "footer", "aside"]): tag.decompose() text = soup.get_text(" ", strip=True) return (text or "", "bs4_text") except Exception as e: logger.warning(f"[web] bs4 실패: {e}") return "", None async def _extract_web_html(doc: Document, html_path: Path) -> None: """devonagent HTML → markdown 4-tier fallback. md_* 컬럼 전체 채움.""" html_bytes = html_path.read_bytes() html_text = html_bytes.decode("utf-8", errors="replace") src_hash = hashlib.sha256(html_bytes).hexdigest() # 1) trafilatura body, engine_ver = _extract_web_with_trafilatura(html_text) engine = "trafilatura" if body and len(body) >= _WEB_MIN_BODY_LEN else None # 2) sibling .md (DEVONthink rendered) if not engine: md_path = html_path.with_suffix(".md") if md_path.is_file(): try: md_body = md_path.read_text(encoding="utf-8", errors="replace") if md_body and len(md_body) >= _WEB_MIN_BODY_LEN: body = md_body engine = "devonthink_export" engine_ver = "smart_rule" except Exception as e: logger.warning(f"[web] sibling .md 읽기 실패 {md_path}: {e}") # 3) readability + markdownify if not engine: body2, ver2 = _extract_web_with_readability(html_text) if body2 and len(body2) >= _WEB_MIN_BODY_LEN: body = body2 engine = "readability" engine_ver = ver2 # 4) bs4 get_text (최종 fallback) if not engine: body3, ver3 = _extract_web_with_bs4(html_text) if body3: body = body3 engine = "bs4_text" engine_ver = ver3 else: body = "" engine = "empty" engine_ver = None clean_body = (body or "").replace("\x00", "") now = datetime.now(timezone.utc) doc.extracted_text = clean_body doc.extracted_at = now doc.extractor_version = f"web@{engine}" doc.md_content = clean_body doc.md_status = "success" if clean_body else "failed" doc.md_extraction_engine = engine doc.md_extraction_engine_version = engine_ver doc.md_format_version = "1.0" doc.md_generated_at = now doc.md_source_hash = src_hash doc.md_content_hash = hashlib.sha256(clean_body.encode("utf-8")).hexdigest() doc.content_origin = "extracted" # extract_meta 의 web_meta 는 file_watcher 가 박은 그대로 유지 (sidecar 출처) logger.info( f"[web/{engine}] {doc.file_path} ({len(clean_body)}자, engine_ver={engine_ver})" ) # ─── 메인 처리 ─── async def process(document_id: int, session: AsyncSession) -> None: """문서 텍스트 추출""" doc = await session.get(Document, document_id) if not doc: raise ValueError(f"문서 ID {document_id}를 찾을 수 없음") fmt = doc.file_format.lower() full_path = Path(settings.nas_mount_path) / doc.file_path # ─── Web/Blog ingest (devonagent 트랙) — HTML 본문 정화 4-tier fallback ─── # plan: ~/.claude/plans/db-snuggly-petal.md # 1) trafilatura (markdown body) # 2) sibling .md (DEVONthink rendered, >= 200 char) # 3) readability-lxml + markdownify # 4) BeautifulSoup get_text # md_extraction_engine 으로 어느 경로로 추출됐는지 기록 → 품질 모니터링용 if fmt == "html" and doc.source_channel == "devonagent": if not full_path.exists(): raise FileNotFoundError(f"파일 없음: {full_path}") await _extract_web_html(doc, full_path) return # ─── 텍스트 파일 — 직접 읽기 ─── if fmt in TEXT_FORMATS: if not full_path.exists(): raise FileNotFoundError(f"파일 없음: {full_path}") text = full_path.read_text(encoding="utf-8", errors="replace") doc.extracted_text = text.replace("\x00", "") doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = "direct_read" logger.info(f"[텍스트] {doc.file_path} ({len(text)}자)") return # ─── 이미지 — OCR ─── if fmt in IMAGE_FORMATS: meta = doc.extract_meta or {} # OCR 1회 제한 if meta.get("ocr_attempted"): meta["ocr_skip_reason"] = "already_attempted" doc.extract_meta = meta logger.info(f"[이미지] {doc.file_path} — OCR 이미 시도됨, 스킵") return # 상한 체크 skip = _ocr_skip_reason(doc.file_size or 0, 1) if skip: doc.extracted_text = "" doc.extractor_version = None doc.extract_meta = {**meta, "ocr_skip_reason": skip, "ocr_terminal": True} doc.extracted_at = datetime.now(timezone.utc) logger.warning(f"[이미지] {doc.file_path} — OCR 스킵 ({skip})") return # OCR 서비스 호출 ocr_text = await _call_ocr(full_path, is_image=True) meta["ocr_attempted"] = True meta["ocr_reason"] = "image_file" if ocr_text and _ocr_quality_ok(ocr_text, 1, is_image=True): doc.extracted_text = _postprocess_ocr(ocr_text) doc.extractor_version = "surya_ocr" meta["ocr_chars"] = len(doc.extracted_text) logger.info(f"[surya_ocr] {doc.file_path} ({len(doc.extracted_text)}자)") else: doc.extracted_text = "" doc.extractor_version = None meta["ocr_quality_ok"] = False meta["ocr_terminal"] = True logger.warning(f"[이미지] {doc.file_path} — OCR 결과 품질 미달") doc.extract_meta = meta doc.extracted_at = datetime.now(timezone.utc) return # ─── G2 (PR-G2-2): 번들 자식 PDF — 부모 파일 공유 + 자기 page 범위만 추출 ─── # kordoc 서비스는 page-range 파라미터가 없어 전체 파일을 파싱한다(자식엔 부적합) → kordoc # 우회, PyMuPDF 로 [bundle_page_start, bundle_page_end] 범위만 추출. range OCR 은 본 PR 범위 # 밖(자식은 ToC 존재 = digital text layer 전제 → 대개 OCR 불필요). PyMuPDF 텍스트가 빈약해도 # 그대로 보존하고 사유를 남긴다. if fmt == "pdf" and doc.bundle_page_start is not None and doc.bundle_page_end is not None: # 후보 A: 자식 file_path 는 합성값(`{부모}#p{s}-{e}`) → 실파일 = bundle_source_path 로 부모경로 # 복원 + NFC/NFD resolve. (자식 file_path 는 디스크에 없음.) from workers.presegment_worker import _resolve_path as _resolve_bundle_path from workers.presegment_worker import bundle_source_path real_rel = bundle_source_path(doc.file_path) src = _resolve_bundle_path(str(Path(settings.nas_mount_path) / real_rel)) if src is None: raise FileNotFoundError(f"번들 원본 파일 없음: {real_rel}") start, end = doc.bundle_page_start, doc.bundle_page_end try: pymupdf_text = _extract_pdf_pymupdf(src, start, end) page_count = _get_pdf_page_count(src, start, end) except Exception as e: logger.error(f"[pymupdf:child] {doc.file_path} pages={start}-{end} 실패: {e}") raise meta = doc.extract_meta or {} meta["presegment_child_range"] = {"start_page": start, "end_page": end} meta["pymupdf_chars"] = len(pymupdf_text.strip()) should, reason = _should_ocr(pymupdf_text, page_count) if should: # range OCR 미지원(후속 PR) — PyMuPDF 결과 유지 + 사유 기록(silent skip 아님). meta["ocr_skip_reason"] = "presegment_child_range_ocr_unsupported" meta["ocr_reason"] = reason logger.warning( f"[pymupdf:child] {doc.file_path} pages={start}-{end} " f"OCR 필요({reason})하나 range OCR 미지원 → PyMuPDF 결과 유지" ) doc.extracted_text = pymupdf_text.replace("\x00", "") doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = PYMUPDF_VERSION if pymupdf_text.strip() else None doc.extract_meta = meta logger.info( f"[pymupdf:child] {doc.file_path} pages={start}-{end} ({len(pymupdf_text)}자)" ) return # ─── kordoc 파싱 (HWP/HWPX/PDF) + PyMuPDF fallback + OCR ─── if fmt in KORDOC_FORMATS: container_path = f"/documents/{doc.file_path}" kordoc_timeout = min(300, max(60, (doc.file_size or 0) // (10 * 1024 * 1024) * 60 + 60)) kordoc_ok = False try: async with httpx.AsyncClient(timeout=kordoc_timeout) as client: resp = await client.post( f"{settings.kordoc_endpoint}/parse", json={"filePath": container_path}, ) if resp.status_code == 404: raise FileNotFoundError(f"kordoc: 파일 없음 — {container_path}") if resp.status_code == 200: data = resp.json() text = data.get("markdown", "").replace("\x00", "") if text: doc.extracted_text = text doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = EXTRACTOR_VERSION logger.info(f"[kordoc] {doc.file_path} ({len(text)}자)") kordoc_ok = True except FileNotFoundError: raise except Exception as e: logger.warning(f"[kordoc] {doc.file_path} 실패 ({e.__class__.__name__}), fallback 시도") if kordoc_ok: return # ─── PyMuPDF fallback (PDF만) ─── if fmt == "pdf" and full_path.exists(): try: pymupdf_text = _extract_pdf_pymupdf(full_path) page_count = _get_pdf_page_count(full_path) except Exception as e: logger.error(f"[pymupdf] {doc.file_path} 실패: {e}") pymupdf_text = "" page_count = 0 meta = doc.extract_meta or {} meta["pymupdf_chars"] = len(pymupdf_text.strip()) # PyMuPDF 텍스트 충분 여부 판정 should, reason = _should_ocr(pymupdf_text, page_count) if not should: # PyMuPDF 텍스트 충분 → OCR 불필요 doc.extracted_text = pymupdf_text.replace("\x00", "") doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = PYMUPDF_VERSION doc.extract_meta = meta logger.info(f"[pymupdf] {doc.file_path} ({len(pymupdf_text)}자)") return # ─── OCR 필요 ─── # OCR 1회 제한 if meta.get("ocr_attempted"): doc.extracted_text = pymupdf_text.replace("\x00", "") or "" doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = "pymupdf" if pymupdf_text.strip() else None meta["ocr_skip_reason"] = "already_attempted" doc.extract_meta = meta logger.info(f"[pdf] {doc.file_path} — OCR 이미 시도됨, PyMuPDF 결과 유지") return # 상한 체크 skip = _ocr_skip_reason(doc.file_size or 0, page_count) if skip: doc.extracted_text = pymupdf_text.replace("\x00", "") or "" doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = "pymupdf" if pymupdf_text.strip() else None doc.extract_meta = {**meta, "ocr_skip_reason": skip, "ocr_terminal": not pymupdf_text.strip()} logger.warning(f"[pdf] {doc.file_path} — OCR 스킵 ({skip}), PyMuPDF 결과 유지") return # OCR 서비스 호출 meta["ocr_attempted"] = True meta["ocr_reason"] = reason logger.info(f"[pdf] {doc.file_path} — OCR 시도 (reason={reason}, pages={page_count})") ocr_text = await _call_ocr(full_path, is_image=False, max_pages=min(page_count, 200)) if ocr_text and _ocr_quality_ok(ocr_text, page_count, is_image=False): doc.extracted_text = _postprocess_ocr(ocr_text) doc.extractor_version = "surya_ocr" meta["ocr_chars"] = len(doc.extracted_text) logger.info(f"[surya_ocr] {doc.file_path} ({len(doc.extracted_text)}자)") else: # OCR 실패 → PyMuPDF 텍스트라도 보존 doc.extracted_text = pymupdf_text.replace("\x00", "") or "" doc.extractor_version = "pymupdf" if pymupdf_text.strip() else None meta["ocr_quality_ok"] = False if not pymupdf_text.strip(): meta["ocr_terminal"] = True logger.warning(f"[pdf] {doc.file_path} — OCR 결과 품질 미달, PyMuPDF 결과 유지") doc.extract_meta = meta doc.extracted_at = datetime.now(timezone.utc) return # HWP/HWPX는 kordoc 전용 — fallback 없음 if fmt != "pdf": raise ValueError(f"kordoc 파싱 실패 (HWP/HWPX는 fallback 없음)") raise ValueError(f"PDF 텍스트 추출 실패 — kordoc + PyMuPDF 모두 실패") # ─── 오피스 포맷 — LibreOffice 텍스트 변환 ─── if fmt in OFFICE_FORMATS: if not full_path.exists(): raise FileNotFoundError(f"파일 없음: {full_path}") import shutil tmp_dir = Path("/tmp/extract_work") tmp_dir.mkdir(exist_ok=True) tmp_input = tmp_dir / f"input_{document_id}.{fmt}" shutil.copy2(str(full_path), str(tmp_input)) CALC_FORMATS = {"xlsx", "xls", "ods", "osheet"} if fmt in CALC_FORMATS: convert_to = "csv:Text - txt - csv (StarCalc):44,34,76,1" out_ext = "csv" else: convert_to = "txt:Text" out_ext = "txt" try: result = subprocess.run( ["libreoffice", "--headless", "--convert-to", convert_to, "--outdir", str(tmp_dir), str(tmp_input)], capture_output=True, text=True, timeout=60, ) out_file = tmp_dir / f"input_{document_id}.{out_ext}" if out_file.exists(): text = out_file.read_text(encoding="utf-8", errors="replace") # 설계 원칙: extract는 전체 텍스트 저장. classify/summarize가 자체 상한으로 slice. doc.extracted_text = text.replace("\x00", "") doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = "libreoffice" out_file.unlink() logger.info(f"[LibreOffice] {doc.file_path} ({len(text)}자)") else: raise RuntimeError(f"LibreOffice 변환 실패: {result.stderr[:300]}") except subprocess.TimeoutExpired: raise RuntimeError(f"LibreOffice 텍스트 추출 timeout (60s)") finally: tmp_input.unlink(missing_ok=True) # ─── ODF 변환 (편집용) ─── CONVERT_MAP = { 'xlsx': 'ods', 'xls': 'ods', 'docx': 'odt', 'doc': 'odt', 'pptx': 'odp', 'ppt': 'odp', } target_fmt = CONVERT_MAP.get(fmt) if target_fmt: try: derived_dir = full_path.parent / ".derived" derived_dir.mkdir(exist_ok=True) tmp_input2 = tmp_dir / f"convert_{document_id}.{fmt}" shutil.copy2(str(full_path), str(tmp_input2)) conv_result = subprocess.run( ["libreoffice", "--headless", "--convert-to", target_fmt, "--outdir", str(tmp_dir), str(tmp_input2)], capture_output=True, text=True, timeout=60, ) tmp_input2.unlink(missing_ok=True) conv_file = tmp_dir / f"convert_{document_id}.{target_fmt}" if conv_file.exists(): final_path = derived_dir / f"{document_id}.{target_fmt}" shutil.move(str(conv_file), str(final_path)) nas_root = Path(settings.nas_mount_path) doc.derived_path = str(final_path.relative_to(nas_root)) doc.original_format = doc.file_format doc.conversion_status = "done" logger.info(f"[ODF변환] {doc.file_path} → derived: {doc.derived_path}") else: doc.conversion_status = "failed" logger.warning(f"[ODF변환] 실패: {conv_result.stderr[:200]}") except Exception as e: doc.conversion_status = "failed" logger.error(f"[ODF변환] {doc.file_path} 에러: {e}") else: doc.conversion_status = "none" return # ─── 미지원 포맷 ─── doc.extracted_text = "" doc.extracted_at = datetime.now(timezone.utc) doc.extractor_version = f"unsupported_{fmt}" logger.warning(f"[미지원] {doc.file_path} (format={fmt})")