feat: add /pipeline/ingest_file endpoint for .txt/.pdf upload

This commit is contained in:
hyungi
2025-08-13 08:48:17 +09:00
parent 6e7cf8eafa
commit 6346635ac1
2 changed files with 61 additions and 1 deletions

View File

@@ -233,6 +233,18 @@ curl -s -X POST http://localhost:26000/pipeline/ingest \
- 번역 켜짐(`translate=true`): 번역본이 `outputs/html/<doc_id>.html`로 생성되고, 번역문이 인덱스에 추가됩니다.
- 번역 꺼짐(`translate=false`): 원문으로 HTML만 생성되고, 원문 텍스트가 인덱스에 추가됩니다.
파일 업로드 버전(`/pipeline/ingest_file`): `.txt`/`.pdf` 지원
```bash
curl -s -X POST http://localhost:26000/pipeline/ingest_file \
-H 'X-API-Key: <키>' \
-F 'file=@/path/to/file.pdf' \
-F 'doc_id=doc-001' \
-F 'generate_html=true' \
-F 'translate=false' \
-F 'target_language=ko'
```
Paperless에서 다수 문서를 일괄 인덱싱합니다.

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from fastapi import FastAPI, HTTPException, Depends
from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict, Any
@@ -175,6 +175,54 @@ def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_ke
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
@app.post("/pipeline/ingest_file")
async def pipeline_ingest_file(
_: None = Depends(require_api_key),
file: UploadFile = File(...),
doc_id: str = Form(...),
generate_html: bool = Form(True),
translate: bool = Form(True),
target_language: str = Form("ko"),
) -> Dict[str, Any]:
content_type = (file.content_type or "").lower()
raw = await file.read()
text = ""
if "text/plain" in content_type or file.filename.endswith(".txt"):
try:
text = raw.decode("utf-8")
except Exception:
text = raw.decode("latin-1", errors="ignore")
elif "pdf" in content_type or file.filename.endswith(".pdf"):
try:
from pypdf import PdfReader
from io import BytesIO
reader = PdfReader(BytesIO(raw))
parts: List[str] = []
for p in reader.pages:
try:
parts.append(p.extract_text() or "")
except Exception:
parts.append("")
text = "\n\n".join(parts)
except Exception as e:
raise HTTPException(status_code=400, detail=f"pdf_extract_error: {e}")
else:
raise HTTPException(status_code=400, detail="unsupported_file_type (only .txt/.pdf)")
if not text.strip():
raise HTTPException(status_code=400, detail="empty_text_after_extraction")
result = pipeline.process(
doc_id=doc_id,
text=text,
index=index,
generate_html=generate_html,
translate=translate,
target_language=target_language,
)
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
# Paperless webhook placeholder (to be wired with user-provided details)
class PaperlessHook(BaseModel):
document_id: int