feat: add /pipeline/ingest_file endpoint for .txt/.pdf upload

2025-08-13 08:48:17 +09:00
parent 6e7cf8eafa
commit 6346635ac1
2 changed files with 61 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -233,6 +233,18 @@ curl -s -X POST http://localhost:26000/pipeline/ingest \
 - 번역 켜짐(`translate=true`): 번역본이 `outputs/html/<doc_id>.html`로 생성되고, 번역문이 인덱스에 추가됩니다.
 - 번역 꺼짐(`translate=false`): 원문으로 HTML만 생성되고, 원문 텍스트가 인덱스에 추가됩니다.

+파일 업로드 버전(`/pipeline/ingest_file`): `.txt`/`.pdf` 지원
+
+```bash
+curl -s -X POST http://localhost:26000/pipeline/ingest_file \
+  -H 'X-API-Key: <키>' \
+  -F 'file=@/path/to/file.pdf' \
+  -F 'doc_id=doc-001' \
+  -F 'generate_html=true' \
+  -F 'translate=false' \
+  -F 'target_language=ko'
+```
+

 Paperless에서 다수 문서를 일괄 인덱싱합니다.

--- a/server/main.py
+++ b/server/main.py
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from fastapi import FastAPI, HTTPException, Depends
+from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import List, Dict, Any
@@ -175,6 +175,54 @@ def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_ke
    return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}


+@app.post("/pipeline/ingest_file")
+async def pipeline_ingest_file(
+    _: None = Depends(require_api_key),
+    file: UploadFile = File(...),
+    doc_id: str = Form(...),
+    generate_html: bool = Form(True),
+    translate: bool = Form(True),
+    target_language: str = Form("ko"),
+) -> Dict[str, Any]:
+    """Extract text from an uploaded ``.txt``/``.pdf`` file and run the pipeline on it.
+
+    The file type is chosen from the declared content type or, failing that,
+    from the filename extension (compared case-insensitively).
+
+    Args:
+        file: Uploaded multipart file part.
+        doc_id: Identifier forwarded to ``pipeline.process``.
+        generate_html: Forwarded to ``pipeline.process``.
+        translate: Forwarded to ``pipeline.process``.
+        target_language: Forwarded to ``pipeline.process``.
+
+    Returns:
+        A dict with ``status``, ``doc_id``, ``added``, ``chunks`` and ``html_path``.
+
+    Raises:
+        HTTPException: 400 when the file type is unsupported, PDF extraction
+            fails, or no non-whitespace text could be extracted.
+    """
+    content_type = (file.content_type or "").lower()
+    # UploadFile.filename is optional; normalise it so a missing name or an
+    # upper-case extension (e.g. "REPORT.PDF") cannot crash or be rejected.
+    filename = (file.filename or "").lower()
+    # NOTE(review): the whole upload is read into memory; no size limit is enforced here.
+    raw = await file.read()
+    text = ""
+    if "text/plain" in content_type or filename.endswith(".txt"):
+        try:
+            # utf-8-sig also decodes plain UTF-8, and drops a leading BOM so it
+            # does not end up in the text handed to the pipeline.
+            text = raw.decode("utf-8-sig")
+        except UnicodeDecodeError:
+            # latin-1 maps every byte value, so this fallback always succeeds.
+            text = raw.decode("latin-1", errors="ignore")
+    elif "pdf" in content_type or filename.endswith(".pdf"):
+        try:
+            from pypdf import PdfReader
+            from io import BytesIO
+            reader = PdfReader(BytesIO(raw))
+            parts: List[str] = []
+            for p in reader.pages:
+                # Best effort per page: a page that fails to extract contributes
+                # an empty string instead of aborting the whole document.
+                try:
+                    parts.append(p.extract_text() or "")
+                except Exception:
+                    parts.append("")
+            text = "\n\n".join(parts)
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"pdf_extract_error: {e}") from e
+    else:
+        raise HTTPException(status_code=400, detail="unsupported_file_type (only .txt/.pdf)")
+
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="empty_text_after_extraction")
+
+    # NOTE(review): pipeline.process is called synchronously inside this async
+    # handler; if it is long-running it will block the event loop — confirm and
+    # consider offloading it to a worker thread.
+    result = pipeline.process(
+        doc_id=doc_id,
+        text=text,
+        index=index,
+        generate_html=generate_html,
+        translate=translate,
+        target_language=target_language,
+    )
+    return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
+
+
 # Paperless webhook placeholder (to be wired with user-provided details)
 class PaperlessHook(BaseModel):
    document_id: int