From cf0d75fe84519ffb87d36d91eb7faa0ab57a23c6 Mon Sep 17 00:00:00 2001 From: hyungi Date: Sun, 24 May 2026 08:02:30 +0000 Subject: [PATCH] fix(search): handle markdown/fileless docs without marker conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR-DocSrv-LargeDoc-Split-Markdown-1 commit 5 (plan brisk-paging-quokka.md). 이미 마크다운인 문서는 marker 변환 불필요 → _process_markdown_passthrough 로 파일 내용(없으면 extracted_text)을 md_content 에 직접 적재(success), 비면 skipped. - _is_markdown_doc: file_format=md/markdown 또는 .md/.markdown 확장자 - 분기 위치 = file_path validation 이전 (fileless md = file_path NULL 처리 위함) - engine=passthrough 로 marker 변환본과 구분 기존 버그 해소: fileless md 43건=「no file_path」 fail / .md 파일=unsupported extension skip → 둘 다 md_content 미생성이었음. 검증(docker cp 격리): 13948(.md+file_path)→success md_len=1805(파일) / 23409(fileless 931자)→success(extracted_text) / 20237(fileless 6자)→success. PDF 경로 무영향(_is_markdown_doc=False). Co-Authored-By: Claude Opus 4.7 (1M context) --- app/workers/marker_worker.py | 71 +++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/app/workers/marker_worker.py b/app/workers/marker_worker.py index d8a0ef9..05ff158 100644 --- a/app/workers/marker_worker.py +++ b/app/workers/marker_worker.py @@ -3,7 +3,9 @@ 플로우: classify_worker 완료 → enqueue 'markdown' stage (또는 reprocess 스크립트가 force=True 로 enqueue) → marker_worker.process() - → doc_type / 확장자 / page_count gauge + → doc_type / handwritten skip + → 이미 마크다운(file_format=md / .md) = _process_markdown_passthrough (변환 없이 내용 직접 적재) + → PDF: 확장자 / page_count gauge → 소형(≤SPLIT_THRESHOLD_PAGES) = _process_single 통째 1-shot /convert 대형(>SPLIT_THRESHOLD_PAGES) = _process_split page-range BATCH_PAGES 윈도우 분할 → 응답 이미지 NAS persist + document_images UPSERT + md_content ref 정규화 @@ -118,6 +120,16 @@ def _get_page_count(file_path: str) -> int | None: return None +def _is_markdown_doc(doc: Document) -> bool: + """이미 마크다운 형식인 문서 판정 (file_format=md/markdown 또는 .md/.markdown 확장자).""" + fmt = (doc.file_format or "").lower() + if fmt in ("md", "markdown"): + return True + if doc.file_path: + return Path(doc.file_path).suffix.lower() in (".md", ".markdown") + return False + + async def process(document_id: int, session: AsyncSession) -> None: """markdown stage 워커 진입점. queue_consumer 가 호출.""" doc = await session.get(Document, document_id) @@ -151,6 +163,13 @@ async def process(document_id: int, session: AsyncSession) -> None: ) return + # ---- (1.7) 이미 마크다운인 문서 = marker 변환 불필요, 내용 직접 적재 (commit 5) ---- + # fileless md(file_path NULL) + .md/.markdown 파일 둘 다 처리. 기존엔 전자=「no file_path」 + # fail, 후자=unsupported extension skip → md_content 미생성이었음. + if _is_markdown_doc(doc): + await _process_markdown_passthrough(doc, document_id, session) + return + # ---- (2) file_path validation ---- if not doc.file_path: await _fail(session, document_id, "no file_path") @@ -298,6 +317,56 @@ async def _process_single( ) +async def _process_markdown_passthrough( + doc: Document, document_id: int, session: AsyncSession +) -> None: + """마크다운 문서 — 변환 없이 파일 내용(없으면 extracted_text)을 md_content 로 직접 적재. + + 내용이 비면 skipped. engine='passthrough' 로 marker 변환본과 구분. + """ + content = "" + if doc.file_path: + path = _to_marker_path(doc.file_path) + try: + content = Path(path).read_text(encoding="utf-8", errors="replace") + except OSError as exc: + logger.warning( + f"[marker] md passthrough file read failed id={document_id} " + f"path={path}: {type(exc).__name__}: {exc}" + ) + content = "" + if not content.strip(): + content = doc.extracted_text or "" # fileless md = extracted_text 사용 + content = content.strip() + + if not content: + await _set_skipped(session, document_id, "skipped: markdown doc with no content") + return + + quality = _compute_quality(content, doc.extracted_text or "", {"page_count": None}) + await session.execute( + update(Document).where(Document.id == document_id).values( + md_content=content, + md_status="success", + md_extraction_engine="passthrough", + md_extraction_engine_version=None, + md_extraction_quality=quality, + md_content_hash=hashlib.sha256(content.encode("utf-8")).hexdigest(), + md_source_hash=doc.file_hash, + md_generated_at=_now(), + md_extraction_error=None, + md_frontmatter=doc.md_frontmatter or {}, + md_format_version="1.0", + content_origin="extracted", + ) + ) + await session.commit() + logger.info( + f"[marker] md passthrough success id={document_id} len={len(content)} " + f"src={'file' if doc.file_path else 'extracted_text'}" + ) + + async def _process_split( doc: Document, document_id: int,