From cd694e7386409a5b625f60640a8f06cebdcd6c57 Mon Sep 17 00:00:00 2001 From: hyungi Date: Wed, 17 Jun 2026 23:29:19 +0000 Subject: [PATCH 1/5] =?UTF-8?q?refactor(ds):=20vestigial=20ai-gateway=20?= =?UTF-8?q?=ED=8F=90=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 예산캡 LLM 게이트웨이(2026-04-03 GPU 이관 최초 커밋부터 존재). config.ai.gateway 파싱만·소비코드 0줄·established 0·요청 이력 0 = vestigial 입증. docker-compose.yml ai-gateway 서비스블록 + config.yaml ai.gateway 블록 제거. 컨테이너+image(256MB) 제거, fastapi 무손상(재생성 안 함). dangling CLAUDE_API_KEY env 노출 동반 제거(credentials.env=gitignore 별도). Co-Authored-By: Claude Opus 4.8 (1M context) --- config.yaml | 2 -- docker-compose.yml | 13 ------------- 2 files changed, 15 deletions(-) diff --git a/config.yaml b/config.yaml index 9be6e5e..a6f8b3f 100644 --- a/config.yaml +++ b/config.yaml @@ -1,8 +1,6 @@ # hyungi_Document_Server 설정 ai: - gateway: - endpoint: "http://ai-gateway:8080" models: # ─── 단일 generation 호스트 routing (2026-05-14 GPU LLM 제거) ─── diff --git a/docker-compose.yml b/docker-compose.yml index 8bcf06d..4b986d5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -168,19 +168,6 @@ services: start_period: 120s restart: unless-stopped - ai-gateway: - build: ./gpu-server/services/ai-gateway - ports: - - "127.0.0.1:8081:8080" - environment: - - PRIMARY_ENDPOINT=http://100.76.254.116:8801/v1/chat/completions - - FALLBACK_ENDPOINT=http://ollama:11434/v1/chat/completions - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - - DAILY_BUDGET_USD=${DAILY_BUDGET_USD:-5.00} - # depends_on: ollama 제거 (2026-06-08) — ollama 서비스가 standalone 으로 이관됨. - # FALLBACK_ENDPOINT 의 ollama:11434 는 standalone(동일 hostname, DS 망 부착)으로 해소. - restart: unless-stopped - fastapi: build: ./app ports: -- 2.52.0 From 5cabf728e66b2201c5feeda95ad2b8409dbfa254 Mon Sep 17 00:00:00 2001 From: hyungi Date: Wed, 17 Jun 2026 23:35:43 +0000 Subject: [PATCH 2/5] =?UTF-8?q?fix(search):=20reranker=20MAX=5FCLIENT=5FBA?= =?UTF-8?q?TCH=5FSIZE=2064=E2=86=92256?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rerank_service.py 가 후보를 MAX_RERANK_INPUT=200 까지 청크 없이 한 번에 TEI 로 POST → TEI 한도 64 초과(85) 시 HTTPError → RRF silent fallback(리랭크 누락=검색 품질 저하, 48h 4회). MAX_BATCH_TOKENS=16384 가 VRAM 상한이라 client batch entries 한도만 256(MAX_RERANK_INPUT 200 커버)으로 상향, reranker 만 재생성. 검증: 85-text rerank HTTP 200, batch 에러 0. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4b986d5..0f0b349 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -149,7 +149,7 @@ services: # → 32 한도 초과 → 413. 64 로 늘림. # GPU VRAM free 6199MiB 충분. baseline path (MAX_RERANK_INPUT=200) 영향 0. - MAX_BATCH_TOKENS=16384 - - MAX_CLIENT_BATCH_SIZE=64 + - MAX_CLIENT_BATCH_SIZE=256 # 2026-06-18 fix: 64→256, MAX_RERANK_INPUT=200 커버 (batch>64 ERROR=RRF silent fallback 해소; MAX_BATCH_TOKENS가 VRAM 상한이라 entries 증가는 VRAM 무관) - MAX_CONCURRENT_REQUESTS=4 volumes: - reranker_cache:/data -- 2.52.0 From bb929f88d0f021b8425105acf0635f9ec8870c15 Mon Sep 17 00:00:00 2001 From: hyungi Date: Thu, 18 Jun 2026 15:58:55 +0900 Subject: [PATCH 3/5] =?UTF-8?q?feat(extraction):=20MinerU=202.5=20VLM=20?= =?UTF-8?q?=EC=B6=94=EC=B6=9C=20=EC=84=9C=EB=B9=84=EC=8A=A4=20+=20?= =?UTF-8?q?=EC=9B=8C=EC=BB=A4=20=EC=97=94=EB=93=9C=ED=8F=AC=EC=9D=B8?= =?UTF-8?q?=ED=8A=B8=20env=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit marker-service(Surya, ~10GB) 대체 후보. MinerU2.5-Pro-2605-1.2B VLM(vllm-async-engine, ~5.9GB 고정). marker /convert 계약 복제(file_path·start/end·md+base64 images) → 워커는 MARKER_ENDPOINT env 플립만으로 전환. 단일카드(16GB) 검색스택 공존, 40p 윈도우 무변. - services/mineru: Dockerfile(vllm/vllm-openai:v0.21.0 + mineru[core]) + async server.py (NFC/NFD 한글경로 resolver, PyMuPDF page 슬라이스, gpu_memory_utilization 캡) - docker-compose: mineru-service profile-gated(기본 미기동=marker 무영향) + mineru_models vol - marker_worker: MARKER_ENDPOINT 하드코딩 → env(기본 marker, 무변) 격리 PoC A/B 8/8 게이트 PASS (한국어/표/수식LaTeX/heading/figure/40p VRAM). 컷오버(env 플립+marker 제거)는 별 단계(읽기뷰 회귀 0 게이트). Co-Authored-By: Claude Opus 4.8 (1M context) --- app/workers/marker_worker.py | 6 +- docker-compose.yml | 38 +++++ services/mineru/Dockerfile | 45 +++++ services/mineru/server.py | 315 +++++++++++++++++++++++++++++++++++ 4 files changed, 403 insertions(+), 1 deletion(-) create mode 100644 services/mineru/Dockerfile create mode 100644 services/mineru/server.py diff --git a/app/workers/marker_worker.py b/app/workers/marker_worker.py index cc2c70d..6f1055f 100644 --- a/app/workers/marker_worker.py +++ b/app/workers/marker_worker.py @@ -39,7 +39,11 @@ from models.queue import ProcessingQueue logger = logging.getLogger(__name__) -MARKER_ENDPOINT = "http://marker-service:3300/convert" +# 마크다운 추출 엔드포인트. compose env `MARKER_ENDPOINT`(base URL)에서 읽는다 — +# 기본=marker(무변), 컷오버=`http://mineru-service:3301` 로 env 플립만으로 전환. +# marker/mineru 가 동일 /convert 계약(file_path·start/end·md+base64 images)이라 워커 무변. +_MARKDOWN_BASE = os.getenv("MARKER_ENDPOINT", "http://marker-service:3300").rstrip("/") +MARKER_ENDPOINT = _MARKDOWN_BASE if _MARKDOWN_BASE.endswith("/convert") else _MARKDOWN_BASE + "/convert" MARKER_TIMEOUT = 300 # 큰 PDF 5 분 한도 MAX_PAGES = 200 # 소형 1-shot 경로 /convert max_pages 안전장치 diff --git a/docker-compose.yml b/docker-compose.yml index 0f0b349..5fd088a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -87,6 +87,43 @@ services: start_period: 300s restart: unless-stopped + # MinerU 2.5 VLM PDF→markdown 추출 — marker-service 대체 후보(단일카드 markdown VRAM ~10→~5GB). + # profile-gated: 기본 미기동 = marker 무영향. 활성 = `docker compose --profile mineru up -d mineru-service`. + # 컷오버(A/B 8게이트 PASS) 전까지 fastapi depends_on 에 넣지 않는다(격리). 포트 3301 (marker=3300). + mineru-service: + build: ./services/mineru + profiles: ["mineru"] + ports: + - "127.0.0.1:3301:3301" + expose: + - "3301" + environment: + # vlm-engine = 순수 VLM 단일모델. 기본 hybrid-engine 은 다중모델 로드 = OOM(반드시 명시). + - MINERU_BACKEND=vlm-engine + - MINERU_LANG=${MINERU_LANG:-korean} + # 공유 16GB 카드 공존: 절대 VRAM 캡(GB, 공유카드 robust) + vLLM 분율 캡 병용. + - MINERU_VIRTUAL_VRAM_SIZE=${MINERU_VIRTUAL_VRAM_SIZE:-6} + - MINERU_GPU_MEMORY_UTILIZATION=${MINERU_GPU_MEMORY_UTILIZATION:-0.40} + - MINERU_PRELOAD=${MINERU_PRELOAD:-1} + volumes: + - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro + - mineru_models:/root/.cache + ipc: host # vLLM 공유메모리 — 공식 run 의 --ipc=host 대응. + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:3301/ready"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 900s # VLM 모델 lazy 다운로드(~2.4GB)+엔진 로드 여유. + restart: unless-stopped + stt-service: # 2026-05-08 (D9 Track B revised): GPU is canonical STT owner. # 정책: Mac mini = Gemma 26B 전용 우선이므로 STT/Whisper 는 호출량 무관 GPU 서버 소유. @@ -271,3 +308,4 @@ volumes: ocr_models: stt_models: marker_models: + mineru_models: diff --git a/services/mineru/Dockerfile b/services/mineru/Dockerfile new file mode 100644 index 0000000..8ed4732 --- /dev/null +++ b/services/mineru/Dockerfile @@ -0,0 +1,45 @@ +# mineru-service — MinerU 2.5 VLM 기반 PDF→markdown 추출기. marker-service 대체. +# 단일카드(RTX 4070 Ti S 16GB→PRO 4000 24GB) markdown VRAM ~10GB(marker)→~5GB(MinerU VLM). +# +# 공식 opendatalab/MinerU global Dockerfile 기반: +# FROM vllm/vllm-openai:v0.21.0 (CUDA 13.0). GPU 호스트 드라이버 595.71.05 / CUDA 13.2 가 +# 13.0 런타임 지원 → cu129 폴백 불필요. vLLM 은 base 이미지가 제공하므로 mineru 는 [core] 만. +# +# 모델은 이미지에 굽지 않고 런타임 warmup 시 HF cache 볼륨으로 lazy 다운로드 (marker/ocr 선례 = +# 서버 .cache 볼륨). 이미지 슬림 유지 + server.py 반복 빌드 빠름 + 모델 볼륨 영속. +FROM vllm/vllm-openai:v0.21.0 + +# base 이미지의 ENTRYPOINT(vLLM OpenAI 서버)를 제거 — 우리는 uvicorn 으로 자체 FastAPI 기동. +ENTRYPOINT [] + +# opencv(libgl) + CJK 폰트(레이아웃/렌더 안전) + curl(healthcheck). 공식 Dockerfile 동일. +RUN apt-get update && apt-get install -y --no-install-recommends \ + fonts-noto-core fonts-noto-cjk fontconfig libgl1 curl \ + && fc-cache -fv \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# mineru[core] — 공식 설치 라인. vLLM(vlm-engine 백엔드)은 base 가 이미 제공. +RUN python3 -m pip install -U 'mineru[core]>=3.2.1' --break-system-packages \ + && python3 -m pip cache purge + +# 서비스 wrapper 의존성. base(vllm-openai)+mineru 가 fastapi/uvicorn/pillow 를 이미 제공 → +# pymupdf 만 추가(나머지 명시 핀은 base 의 pillow 12.x 를 불필요하게 다운그레이드해서 제거). +RUN python3 -m pip install --no-cache-dir --break-system-packages \ + 'pymupdf>=1.24.0,<2.0.0' + +# MINERU_MODEL_SOURCE=huggingface = warmup 시 lazy 다운로드 (HF cache 볼륨에 영속). +# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True = 단편화 완화(연구 권고, 거대 입력 OOM 완충). +ENV MINERU_MODEL_SOURCE=huggingface \ + HF_HOME=/root/.cache/huggingface \ + PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +WORKDIR /app +# server.py = 무거운 pip 레이어 뒤에 COPY → 반복 빌드 시 캐시 적중(빠른 재빌드). +COPY server.py /app/server.py + +EXPOSE 3301 +# VLM 모델 lazy 다운로드(~2.4GB)+엔진 로드 여유로 start-period 길게. +HEALTHCHECK --start-period=900s --interval=30s --timeout=10s --retries=3 \ + CMD curl -f http://localhost:3301/ready || exit 1 + +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3301"] diff --git a/services/mineru/server.py b/services/mineru/server.py new file mode 100644 index 0000000..dd86cde --- /dev/null +++ b/services/mineru/server.py @@ -0,0 +1,315 @@ +"""mineru-service — POST /convert: PDF → markdown + 추출 이미지 base64. + +marker-service 대체(MinerU 2.5 VLM). **marker 의 /convert 계약을 그대로 복제**해서 +marker_worker 가 엔드포인트만 바꾸면 되도록 한다(요청/응답 동일 shape): + + 요청: {file_path, max_pages?, start_page?, end_page?} (page = 1-based inclusive) + 응답: {md_content, md_content_hash, engine, engine_version, elapsed_ms, + raw_metrics, images:[{slug, format, width, height, bytes_b64}], images_truncated} + +설계 노트: +- **page range 는 PyMuPDF 로 직접 슬라이스**해서 MinerU 에 넘긴다(start_page..end_page → + 0-based [a,b] 페이지만 담은 새 PDF bytes). MinerU 의 `end_page_id=0 falsy 무시` 버그 회피. + 40p 윈도우 분할은 marker_worker 가 그대로 담당. (검증: fitz 슬라이스 렌더 = 원본과 동일 품질.) +- **★ 반드시 async 엔진(`aio_do_parse`) 사용.** 동기 `do_parse`(vllm-engine sync)는 본 모델 + (MinerU2.5-Pro-2605-1.2B)에서 layout 토큰 malformed → 빈 md 산출(실측 G1-2). async + (`aio_do_parse` = vllm-async-engine, mineru CLI 가 쓰는 정상 경로) = 정상 출력. +- **이미지 = stateless**: marker 처럼 NAS write 안 함. MinerU 가 md 에 박는 `![](images/.jpg)` + href 를 그대로 slug 으로 반환 → fastapi(marker_worker)의 `_rewrite_image_refs` 가 basename + 매칭으로 `docimg:img_NNN` 정규화 + NAS persist. (계약 무변) +- **VRAM 캡**: `MINERU_GPU_MEMORY_UTILIZATION`(vLLM 분율, 0.40→~6GB 실측). compose 의 + `MINERU_VIRTUAL_VRAM_SIZE` 도 무해(실측 정상)하나 출력엔 무관 — 캡은 분율로 충분. + backend=`vlm-engine`(기본 hybrid-engine 은 다중모델 로드 OOM, 반드시 명시). + +엔진은 첫 변환(또는 startup warmup) 시 1회 로드 — MinerU ModelSingleton 캐시. 단일 GPU 라 +변환은 _engine_lock 으로 직렬화. +""" +import asyncio +import base64 +import hashlib +import inspect +import io +import logging +import os +import time +import unicodedata +from pathlib import Path + +import fitz # PyMuPDF — page 슬라이스 + 페이지수 +from fastapi import FastAPI, HTTPException, Response +from PIL import Image +from pydantic import BaseModel, Field + +logger = logging.getLogger("mineru-service") +logging.basicConfig(level=logging.INFO) +app = FastAPI() + +try: + import importlib.metadata + _engine_version = importlib.metadata.version("mineru") +except Exception: + _engine_version = "unknown" + +# ---- 설정 (compose env 로 override) ----------------------------------------- +MINERU_BACKEND = os.getenv("MINERU_BACKEND", "vlm-engine") +MINERU_LANG = os.getenv("MINERU_LANG", "korean") +GPU_MEM_UTIL = float(os.getenv("MINERU_GPU_MEMORY_UTILIZATION", "0.40")) + +MAX_IMAGES_PER_DOC = int(os.getenv("MINERU_MAX_IMAGES_PER_DOC", "200")) +MAX_BYTES_PER_IMAGE = int(os.getenv("MINERU_MAX_BYTES_PER_IMAGE", str(10 * 1024 * 1024))) +MAX_PAGES_HARD = int(os.getenv("MINERU_MAX_PAGES_HARD", "200")) # 1-shot max_pages 안전장치 + +_PRELOAD = os.getenv("MINERU_PRELOAD", "1") != "0" + +# ---- 엔진 상태 --------------------------------------------------------------- +_warmup_done = False +_warmup_error: str | None = None +# 단일 GPU async 엔진 — warmup + convert 직렬화(엔진 1개, 임시디렉토리/싱글톤 경합 차단). +_engine_lock = asyncio.Lock() + + +async def _run_mineru(pdf_bytes: bytes, lang: str) -> tuple[str, list[dict]]: + """슬라이스된 PDF bytes → (markdown, 이미지 dict 리스트). **async 엔진 경로.** + + 호출자(_ensure_warmup / convert)가 _engine_lock 을 잡은 상태로 호출한다. + 이미지 dict: {slug, format, width, height, raw_bytes}. slug = md href 그대로. + """ + import glob + import tempfile + + from mineru.cli.common import aio_do_parse + + with tempfile.TemporaryDirectory(prefix="mineru_") as td: + candidate = { + "output_dir": td, + "pdf_file_names": ["doc"], + "pdf_bytes_list": [pdf_bytes], + "p_lang_list": [lang], + "backend": MINERU_BACKEND, + "formula_enable": True, + "table_enable": True, + "f_dump_md": True, + "f_dump_content_list": True, + "f_dump_middle_json": False, + "f_dump_model_output": False, + "f_dump_orig_pdf": False, + "f_draw_layout_bbox": False, + "f_draw_span_bbox": False, + "gpu_memory_utilization": GPU_MEM_UTIL, + } + sig = inspect.signature(aio_do_parse) + has_var_kw = any( + p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values() + ) + kwargs = candidate if has_var_kw else { + k: v for k, v in candidate.items() if k in sig.parameters + } + await aio_do_parse(**kwargs) + + md_files = sorted(glob.glob(f"{td}/**/*.md", recursive=True)) + if not md_files: + raise RuntimeError("mineru produced no markdown output") + md_path = Path(md_files[0]) + md_text = md_path.read_text(encoding="utf-8", errors="replace") + + images: list[dict] = [] + img_dir = md_path.parent / "images" + if img_dir.is_dir(): + for img_file in sorted(img_dir.iterdir()): + if not img_file.is_file(): + continue + raw = img_file.read_bytes() + slug = f"images/{img_file.name}" # md href 와 정확히 일치 + w = h = None + try: + with Image.open(io.BytesIO(raw)) as im: + w, h = im.width, im.height + fmt = (im.format or "JPEG").lower() + except Exception: + fmt = img_file.suffix.lstrip(".").lower() or "jpeg" + images.append( + {"slug": slug, "format": fmt, "width": w, "height": h, "raw_bytes": raw} + ) + return md_text, images + + +async def _ensure_warmup() -> None: + """첫 /convert 또는 startup hook 시 1-page 합성 PDF 로 엔진+모델 적재.""" + global _warmup_done, _warmup_error + if _warmup_done: + return + async with _engine_lock: + if _warmup_done: + return + try: + logger.info("[mineru-service] warmup start (async engine load + model fetch)") + doc = fitz.open() + page = doc.new_page() + page.insert_text((72, 72), "MinerU warmup.") + warmup_bytes = doc.tobytes() + doc.close() + await _run_mineru(warmup_bytes, MINERU_LANG) + _warmup_done = True + _warmup_error = None + logger.info(f"[mineru-service] warmup done engine_version={_engine_version}") + except Exception as exc: + _warmup_error = f"{type(exc).__name__}: {exc}" + logger.exception("[mineru-service] warmup failed") + raise + + +@app.on_event("startup") +async def startup(): + if _PRELOAD: + asyncio.create_task(_ensure_warmup()) + + +# ---- 계약 모델 (marker 와 동일 shape) ---------------------------------------- +class ConvertRequest(BaseModel): + file_path: str + max_pages: int | None = None + start_page: int | None = None # 1-based inclusive + end_page: int | None = None # 1-based inclusive + + +class ConvertImage(BaseModel): + slug: str + format: str + width: int | None = None + height: int | None = None + bytes_b64: str + + +class ConvertResponse(BaseModel): + md_content: str + md_content_hash: str + engine: str + engine_version: str + elapsed_ms: int + raw_metrics: dict + images: list[ConvertImage] = Field(default_factory=list) + images_truncated: bool = False + + +@app.get("/health") +def health(): + return {"status": "ok", "service": "mineru-service"} + + +@app.get("/ready") +async def ready(response: Response): + """marker /ready 의미 복제: warmup_failed 만 503, idle/warming=200(depends_on 굳음 방지).""" + if _warmup_error: + response.status_code = 503 + return {"status": "warmup_failed", "engine": "mineru", + "engine_version": _engine_version, "error": _warmup_error} + if not _warmup_done: + return {"status": "warming_up" if _PRELOAD else "idle", "engine": "mineru", + "engine_version": _engine_version, "models_loaded": False} + return {"status": "ready", "engine": "mineru", + "engine_version": _engine_version, "models_loaded": True} + + +def _resolve_path(file_path: str) -> Path | None: + """NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수. ocr/server.py 와 동일 패턴 + (필수 — 한글명 파일은 NFS=NFD 저장이라 DB 의 NFC 경로로는 is_file=False).""" + for c in (file_path, + unicodedata.normalize("NFD", file_path), + unicodedata.normalize("NFC", file_path)): + p = Path(c) + if p.exists(): + return p + parent = Path(file_path).parent + if parent.exists(): + target = unicodedata.normalize("NFC", Path(file_path).name) + for child in parent.iterdir(): + if unicodedata.normalize("NFC", child.name) == target: + return child + return None + + +def _slice_pdf(src_path: Path, start_page: int | None, end_page: int | None, + max_pages: int | None) -> tuple[bytes, int]: + """요청 page 범위(1-based inclusive)만 담은 새 PDF bytes + 변환 페이지수 반환.""" + with fitz.open(src_path) as src: + n = src.page_count + if start_page is not None and end_page is not None: + a = max(0, start_page - 1) + b = min(n - 1, end_page - 1) + else: + a = 0 + cap = max_pages if max_pages is not None else MAX_PAGES_HARD + b = min(n - 1, cap - 1) + if b < a: + raise HTTPException(422, detail={"code": "bad_page_range", + "message": f"a={a} b={b} n={n}"}) + out = fitz.open() + out.insert_pdf(src, from_page=a, to_page=b) + pdf_bytes = out.tobytes() + out.close() + return pdf_bytes, (b - a + 1) + + +def _serialize_images(images: list[dict], src_path: str) -> tuple[list[ConvertImage], bool]: + """이미지 dict 리스트 → base64 ConvertImage 리스트 (marker 가드 동일).""" + truncated = len(images) > MAX_IMAGES_PER_DOC + if truncated: + logger.warning(f"[mineru-service] images truncated path={src_path} " + f"total={len(images)} cap={MAX_IMAGES_PER_DOC}") + images = images[:MAX_IMAGES_PER_DOC] + out: list[ConvertImage] = [] + for img in images: + raw = img["raw_bytes"] + if len(raw) > MAX_BYTES_PER_IMAGE: + logger.warning(f"[mineru-service] image too large skipped path={src_path} " + f"slug={img['slug']} bytes={len(raw)} cap={MAX_BYTES_PER_IMAGE}") + continue + out.append(ConvertImage( + slug=img["slug"], format=img["format"], + width=img.get("width"), height=img.get("height"), + bytes_b64=base64.b64encode(raw).decode("ascii"), + )) + return out, truncated + + +@app.post("/convert", response_model=ConvertResponse) +async def convert(req: ConvertRequest): + p = _resolve_path(req.file_path) + if p is None or not p.is_file(): + raise HTTPException(404, detail={"code": "file_not_found", "message": req.file_path}) + if req.start_page is not None and req.end_page is not None: + if req.start_page < 1 or req.end_page < req.start_page: + raise HTTPException(422, detail={"code": "bad_page_range", + "message": f"start_page={req.start_page} end_page={req.end_page}"}) + + pdf_bytes, page_count = _slice_pdf(p, req.start_page, req.end_page, req.max_pages) + + await _ensure_warmup() # 엔진 로드 보장(내부에서 _engine_lock 잡았다 놓음) + async with _engine_lock: # 실제 변환 직렬화(단일 GPU) + start = time.monotonic() + try: + md_text, raw_images = await _run_mineru(pdf_bytes, MINERU_LANG) + except HTTPException: + raise + except Exception as exc: + logger.exception(f"[mineru-service] conversion failed path={p}: {exc}") + raise HTTPException(422, detail={"code": "conversion_failed", + "message": f"{type(exc).__name__}: {exc}"}) from exc + elapsed_ms = int((time.monotonic() - start) * 1000) + + images_payload, truncated = _serialize_images(raw_images, str(p)) + + return ConvertResponse( + md_content=md_text, + md_content_hash=hashlib.sha256(md_text.encode("utf-8")).hexdigest(), + engine="mineru", + engine_version=_engine_version, + elapsed_ms=elapsed_ms, + raw_metrics={ + "page_count": page_count, + "image_count_extracted": len(raw_images), + "image_count_returned": len(images_payload), + }, + images=images_payload, + images_truncated=truncated, + ) -- 2.52.0 From 28b8afc748932becfea9981a0d6538cb0a999e53 Mon Sep 17 00:00:00 2001 From: hyungi Date: Thu, 18 Jun 2026 16:11:38 +0900 Subject: [PATCH 4/5] =?UTF-8?q?feat(extraction):=20=EC=BB=B7=EC=98=A4?= =?UTF-8?q?=EB=B2=84=20Phase=201=20=E2=80=94=20mineru-service=20=EB=A5=BC?= =?UTF-8?q?=20=EB=A7=88=ED=81=AC=EB=8B=A4=EC=9A=B4=20=EC=97=94=EC=A7=84?= =?UTF-8?q?=EC=9C=BC=EB=A1=9C=20(marker=20=EC=9E=94=EC=A1=B4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mineru-service profile-gate 해제(상시 기동) + fastapi depends_on 추가 + MARKER_ENDPOINT 을 mineru-service:3301 로 flip. marker-service 는 롤백 대비 Phase 2 까지 잔존(depends_on 유지, 호출만 안 됨 → idle-unload). 동일 /convert 계약. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-compose.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5fd088a..d9f897e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -87,12 +87,11 @@ services: start_period: 300s restart: unless-stopped - # MinerU 2.5 VLM PDF→markdown 추출 — marker-service 대체 후보(단일카드 markdown VRAM ~10→~5GB). - # profile-gated: 기본 미기동 = marker 무영향. 활성 = `docker compose --profile mineru up -d mineru-service`. - # 컷오버(A/B 8게이트 PASS) 전까지 fastapi depends_on 에 넣지 않는다(격리). 포트 3301 (marker=3300). + # MinerU 2.5 VLM PDF→markdown 추출 — ★ marker-service 대체(컷오버 2026-06-18, A/B 8/8 PASS). + # 단일카드 markdown VRAM ~10GB(marker)→~5.9GB 고정. fastapi 가 MARKER_ENDPOINT 로 호출. + # 동기 do_parse 버그 회피 위해 server.py 는 async aio_do_parse 사용. 포트 3301. mineru-service: build: ./services/mineru - profiles: ["mineru"] ports: - "127.0.0.1:3301:3301" expose: @@ -221,6 +220,9 @@ services: condition: service_healthy kordoc-service: condition: service_healthy + # 컷오버: mineru-service 가 마크다운 엔진. marker-service 는 Phase 2 에서 제거(롤백 대비 잔존). + mineru-service: + condition: service_healthy marker-service: condition: service_healthy env_file: @@ -229,7 +231,8 @@ services: - DATABASE_URL=postgresql+asyncpg://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm - KORDOC_ENDPOINT=http://kordoc-service:3100 - OCR_ENDPOINT=http://ocr-service:3200 - - MARKER_ENDPOINT=http://marker-service:3300 + # ★ 컷오버 2026-06-18: marker-service:3300 → mineru-service:3301 (동일 /convert 계약). + - MARKER_ENDPOINT=http://mineru-service:3301 - MARKER_CONTAINER_PATH_PREFIX=/documents # 2026-05-08 (D9 Track B revised): GPU stt-service 정식 승격, 내부 DNS 사용. - STT_ENDPOINT=http://stt-service:3300 -- 2.52.0 From a77ac38e92e33fd9e8b9d43052c5fe777c355042 Mon Sep 17 00:00:00 2001 From: hyungi Date: Thu, 18 Jun 2026 16:27:26 +0900 Subject: [PATCH 5/5] =?UTF-8?q?feat(extraction):=20=EC=BB=B7=EC=98=A4?= =?UTF-8?q?=EB=B2=84=20Phase=202=20=E2=80=94=20marker-service=20=EC=A0=9C?= =?UTF-8?q?=EA=B1=B0=20(MinerU=20=EB=8B=A8=EB=8F=85)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 읽기뷰 회귀 0 확인(doc 39464 재처리 → engine=mineru success, 71 imgs, docimg ref/NAS persist 정상) 후 marker 제거. compose 에서 marker-service 블록 + fastapi depends_on + marker_models 볼륨 + services/marker/ 소스 삭제. 롤백 = git history + ~/.local/share/marker-decommission-backups. 마크다운 엔진 = mineru-service 단독. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-compose.yml | 38 +--- services/marker/Dockerfile | 22 --- services/marker/requirements.txt | 9 - services/marker/server.py | 325 ------------------------------- 4 files changed, 1 insertion(+), 393 deletions(-) delete mode 100644 services/marker/Dockerfile delete mode 100644 services/marker/requirements.txt delete mode 100644 services/marker/server.py diff --git a/docker-compose.yml b/docker-compose.yml index d9f897e..a3bae71 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -54,39 +54,6 @@ services: start_period: 180s restart: unless-stopped - # Phase 1B (2026-05-01): PDF → markdown 변환. ocr-service 와 별도 컨테이너 (deps 충돌 회피). - marker-service: - build: ./services/marker - ports: - - "127.0.0.1:3300:3300" - expose: - - "3300" - environment: - - HF_HOME=/models/huggingface - - TORCH_HOME=/models/torch - # D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~3.5GB) 해제가 90% 봉투의 전제. - # /ready 는 idle 에서도 200 (fastapi depends_on service_healthy 유지). - # 롤백 = MARKER_PRELOAD=1 + MARKER_IDLE_UNLOAD_MINUTES=0. - - MARKER_PRELOAD=0 - - MARKER_IDLE_UNLOAD_MINUTES=${MARKER_IDLE_UNLOAD_MINUTES:-30} - volumes: - - ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro - - marker_models:/models - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:3300/ready"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 300s - restart: unless-stopped - # MinerU 2.5 VLM PDF→markdown 추출 — ★ marker-service 대체(컷오버 2026-06-18, A/B 8/8 PASS). # 단일카드 markdown VRAM ~10GB(marker)→~5.9GB 고정. fastapi 가 MARKER_ENDPOINT 로 호출. # 동기 do_parse 버그 회피 위해 server.py 는 async aio_do_parse 사용. 포트 3301. @@ -220,11 +187,9 @@ services: condition: service_healthy kordoc-service: condition: service_healthy - # 컷오버: mineru-service 가 마크다운 엔진. marker-service 는 Phase 2 에서 제거(롤백 대비 잔존). + # 마크다운 엔진 = mineru-service (marker-service 제거 2026-06-18, 롤백=git history). mineru-service: condition: service_healthy - marker-service: - condition: service_healthy env_file: - credentials.env environment: @@ -310,5 +275,4 @@ volumes: reranker_cache: ocr_models: stt_models: - marker_models: mineru_models: diff --git a/services/marker/Dockerfile b/services/marker/Dockerfile deleted file mode 100644 index 33ddfa4..0000000 --- a/services/marker/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM python:3.12-slim - -WORKDIR /app - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libgl1 libglib2.0-0 curl \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -COPY requirements.txt . -RUN pip install --no-cache-dir \ - --extra-index-url https://download.pytorch.org/whl/cu126 \ - -r requirements.txt - -# 모델 미다운로드 (HF cache volume → 첫 호출/warmup 시 적재). - -COPY server.py . - -EXPOSE 3300 -HEALTHCHECK --start-period=300s --interval=30s --timeout=10s --retries=3 \ - CMD curl -f http://localhost:3300/ready || exit 1 - -CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3300"] diff --git a/services/marker/requirements.txt b/services/marker/requirements.txt deleted file mode 100644 index ef7cc06..0000000 --- a/services/marker/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -torch==2.11.0+cu126 -torchvision==0.26.0+cu126 -transformers==4.57.6 -surya-ocr==0.17.1 -marker-pdf==1.10.2 -pymupdf>=1.24.0,<2.0.0 -fastapi>=0.110.0,<1.0.0 -uvicorn[standard]>=0.27.0,<1.0.0 -pillow>=10.0.0,<12.0.0 diff --git a/services/marker/server.py b/services/marker/server.py deleted file mode 100644 index da7738a..0000000 --- a/services/marker/server.py +++ /dev/null @@ -1,325 +0,0 @@ -"""marker-service — POST /convert: PDF → markdown + 추출 이미지 base64. - -Phase 1B (2026-05-01) — 텍스트만 응답, 이미지 폐기. -Phase 1B.5 — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이 - 없는 stateless 변환기 유지 (fastapi 가 NAS persist 담당). -D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환: - MARKER_PRELOAD=0 : startup warmup 끔 (첫 /convert 시 lazy load) - MARKER_IDLE_UNLOAD_MINUTES : N분 유휴 시 모델 해제 (0=비활성, 기존 동작) - /ready 는 idle(미적재)에서도 200 — fastapi 의 depends_on service_healthy 가 - lazy 모드에서 영구 미기동으로 굳는 것 방지. 503 은 warmup_failed 한정. - -plan: ~/.claude/plans/piped-humming-crystal.md -""" -import base64 -import gc -import hashlib -import io -import logging -import os -import threading -import time -from pathlib import Path - -from fastapi import FastAPI, HTTPException, Response -from pydantic import BaseModel, Field - -from marker.converters.pdf import PdfConverter -from marker.models import create_model_dict -from marker.output import text_from_rendered -import marker as marker_module - -logger = logging.getLogger(__name__) -app = FastAPI() - -os.environ.setdefault("HF_HOME", "/models/huggingface") -os.environ.setdefault("TORCH_HOME", "/models/torch") - -_models = None -_converter = None -try: - import importlib.metadata - _engine_version = importlib.metadata.version("marker-pdf") -except Exception: - _engine_version = "unknown" -_warmup_done = False -_warmup_error: str | None = None -_warmup_lock = threading.Lock() - -# D-1 idle-unload 상태 — 전이는 전부 _warmup_lock 아래 -_PRELOAD = os.getenv("MARKER_PRELOAD", "1") != "0" -_IDLE_UNLOAD_MINUTES = int(os.getenv("MARKER_IDLE_UNLOAD_MINUTES", "0")) -_inflight = 0 -_last_used = time.monotonic() - -# 이미지 응답 cap. base64 응답 크기 폭주 방지. 사용자 PDF 풀 측정 (Phase 1D) 시 -# 가장 이미지 많은 문서가 ~30건 수준 → 200 은 안전 마진. 초과 시 truncate flag 응답. -MAX_IMAGES_PER_DOC = int(os.getenv("MARKER_MAX_IMAGES_PER_DOC", "200")) -# per-image 최대 raw bytes (base64 전). 그래픽이 많은 풀페이지 스캔 회피. -MAX_BYTES_PER_IMAGE = int(os.getenv("MARKER_MAX_BYTES_PER_IMAGE", str(10 * 1024 * 1024))) - - -def _ensure_warmup() -> None: - """첫 /convert 또는 startup hook 시 모델 로드. HF cache volume 활용.""" - global _models, _converter, _warmup_done, _warmup_error - if _warmup_done: - return - with _warmup_lock: - if _warmup_done: - return - try: - logger.info("[marker-service] warmup start") - _models = create_model_dict() - _converter = PdfConverter(artifact_dict=_models) - _warmup_done = True - _warmup_error = None - logger.info(f"[marker-service] warmup done engine_version={_engine_version}") - except Exception as exc: - _warmup_error = f"{type(exc).__name__}: {exc}" - logger.exception("[marker-service] warmup failed") - raise - - -def _acquire_models(): - """warmup 보장 + inflight 진입을 원자적으로 — ensure 직후 reaper 가 해제하는 경합 차단.""" - global _inflight - while True: - _ensure_warmup() - with _warmup_lock: - if _warmup_done: - _inflight += 1 - return - # ensure 와 lock 재진입 사이에 unload 가 끼어든 희귀 경합 — 재시도 - - -def _release_models(): - global _inflight, _last_used - with _warmup_lock: - _inflight -= 1 - _last_used = time.monotonic() - - -def _maybe_unload() -> None: - """유휴 시 모델 해제. 변환 중(inflight>0)이면 절대 해제하지 않는다. - - split 변환의 배치 사이 간격은 초 단위 — N>=1분 임계면 배치 사이 해제 없음. - """ - global _models, _converter, _warmup_done - with _warmup_lock: - if not _warmup_done or _inflight > 0: - return - if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60: - return - _models = None - _converter = None - _warmup_done = False - gc.collect() - try: - import torch - torch.cuda.empty_cache() - except Exception: - pass - logger.info(f"[marker-service] idle-unload: 모델 해제 (유휴 {_IDLE_UNLOAD_MINUTES}분 초과)") - - -async def _idle_reaper(): - import asyncio - while True: - await asyncio.sleep(60) - try: - _maybe_unload() - except Exception: - logger.exception("[marker-service] idle reaper 오류") - - -@app.on_event("startup") -async def startup(): - """startup hook — warmup 은 MARKER_PRELOAD 게이트 (D-1: lazy 기본 전환은 compose 가).""" - import asyncio - if _PRELOAD: - asyncio.create_task(asyncio.to_thread(_ensure_warmup)) - if _IDLE_UNLOAD_MINUTES > 0: - asyncio.create_task(_idle_reaper()) - logger.info(f"[marker-service] idle-unload 활성: {_IDLE_UNLOAD_MINUTES}분") - - -class ConvertRequest(BaseModel): - file_path: str - max_pages: int | None = None - # page range (1-based inclusive) — LargeDoc split 변환용. marker 내부 0-based 변환은 - # convert() 에 격리 (page numbering invariant: DB/API=1-based, marker=0-based). - start_page: int | None = None - end_page: int | None = None - - -class ConvertImage(BaseModel): - """marker 추출 이미지 1건. fastapi 가 NAS 에 쓰고 docimg:img_NNN 으로 ref 정규화.""" - slug: str # marker 원본 slug (예: '_page_0_Picture_3.jpeg') - format: str # 'png' | 'jpeg' | 'webp' | 'gif' - width: int | None = None - height: int | None = None - bytes_b64: str # base64-encoded raw bytes - - -class ConvertResponse(BaseModel): - md_content: str - md_content_hash: str - engine: str - engine_version: str - elapsed_ms: int - raw_metrics: dict - images: list[ConvertImage] = Field(default_factory=list) - images_truncated: bool = False - - -@app.get("/health") -def health(): - return {"status": "ok", "service": "marker-service"} - - -@app.get("/ready") -async def ready(response: Response): - """Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출. - - D-1: idle(미적재) = 200. 503 은 warmup_failed 한정 — lazy 모드에서 fastapi - depends_on service_healthy 가 영구 미기동으로 굳지 않게. 배포 검증에서 - 'status=ready' 단언하던 runbook 은 강제 warm 호출(/convert 1건)로 대체. - """ - if _warmup_error: - response.status_code = 503 - return { - "status": "warmup_failed", - "engine": "marker", - "engine_version": _engine_version, - "error": _warmup_error, - } - if not _warmup_done: - return { - "status": "warming_up" if _PRELOAD else "idle", - "engine": "marker", - "engine_version": _engine_version, - "models_loaded": False, - "idle_unload_minutes": _IDLE_UNLOAD_MINUTES, - } - return { - "status": "ready", - "engine": "marker", - "engine_version": _engine_version, - "models_loaded": True, - "inflight": _inflight, - "idle_unload_minutes": _IDLE_UNLOAD_MINUTES, - } - - -@app.post("/convert", response_model=ConvertResponse) -async def convert(req: ConvertRequest): - p = Path(req.file_path) - if not p.is_file(): - raise HTTPException(404, detail={"code": "file_not_found", "message": str(p)}) - if req.start_page is not None and req.end_page is not None: - if req.start_page < 1 or req.end_page < req.start_page: - raise HTTPException( - 422, - detail={ - "code": "bad_page_range", - "message": f"start_page={req.start_page} end_page={req.end_page}", - }, - ) - - # D-1: warmup 보장 + inflight 진입 원자화 — 변환 중 reaper 해제 차단. 해제는 finally. - _acquire_models() - try: - start = time.monotonic() - # page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음). - # invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환. - converter = _converter - if req.start_page is not None and req.end_page is not None: - page_range = list(range(req.start_page - 1, req.end_page)) # 0-based inclusive - converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range}) - try: - rendered = converter(str(p)) - except Exception as exc: - logger.exception(f"[marker-service] conversion failed path={p}: {exc}") - raise HTTPException( - status_code=422, - detail={ - "code": "conversion_failed", - "message": f"{type(exc).__name__}: {exc}", - }, - ) from exc - - md_text, _meta, raw_images = text_from_rendered(rendered) - elapsed_ms = int((time.monotonic() - start) * 1000) - finally: - _release_models() - - images_payload, truncated = _serialize_images(raw_images, str(p)) - - return ConvertResponse( - md_content=md_text, - md_content_hash=hashlib.sha256(md_text.encode("utf-8")).hexdigest(), - engine="marker", - engine_version=_engine_version, - elapsed_ms=elapsed_ms, - raw_metrics={ - "page_count": getattr(rendered, "page_count", None), - "image_count_extracted": len(raw_images) if raw_images else 0, - "image_count_returned": len(images_payload), - }, - images=images_payload, - images_truncated=truncated, - ) - - -def _serialize_images(raw_images, src_path: str) -> tuple[list[ConvertImage], bool]: - """marker 의 `_images` (dict[slug, PIL.Image]) → base64 ConvertImage 리스트. - - 가드: - - MAX_IMAGES_PER_DOC 초과 시 head 만 반환 + truncated=True - - per-image 직렬화 실패 시 해당 이미지만 skip + warn (전체 fail 안 함) - - per-image 결과 byte 크기가 MAX_BYTES_PER_IMAGE 초과 시 skip + warn - """ - if not raw_images: - return [], False - - items = list(raw_images.items()) - truncated = len(items) > MAX_IMAGES_PER_DOC - if truncated: - logger.warning( - f"[marker-service] images truncated path={src_path} " - f"total={len(items)} cap={MAX_IMAGES_PER_DOC}" - ) - items = items[:MAX_IMAGES_PER_DOC] - - out: list[ConvertImage] = [] - for slug, pil_img in items: - try: - fmt_raw = (pil_img.format or "PNG").upper() - # WebP/GIF 도 marker 가 emit 가능하지만 본 1B.5 기준은 PNG/JPEG 우선. - # 알 수 없는 포맷이면 PNG 로 강제 (lossless re-encode). - fmt = fmt_raw if fmt_raw in {"PNG", "JPEG", "WEBP", "GIF"} else "PNG" - buf = io.BytesIO() - pil_img.save(buf, format=fmt) - raw_bytes = buf.getvalue() - if len(raw_bytes) > MAX_BYTES_PER_IMAGE: - logger.warning( - f"[marker-service] image too large skipped path={src_path} " - f"slug={slug} bytes={len(raw_bytes)} cap={MAX_BYTES_PER_IMAGE}" - ) - continue - out.append( - ConvertImage( - slug=slug, - format=fmt.lower(), - width=pil_img.width, - height=pil_img.height, - bytes_b64=base64.b64encode(raw_bytes).decode("ascii"), - ) - ) - except Exception as exc: - logger.warning( - f"[marker-service] image serialize failed path={src_path} " - f"slug={slug}: {type(exc).__name__}: {exc}" - ) - continue - return out, truncated -- 2.52.0