feat: add /pipeline/ingest_file endpoint for .txt/.pdf upload
This commit is contained in:
12
README.md
12
README.md
@@ -233,6 +233,18 @@ curl -s -X POST http://localhost:26000/pipeline/ingest \
|
||||
- 번역 켜짐(`translate=true`): 번역본이 `outputs/html/<doc_id>.html`로 생성되고, 번역문이 인덱스에 추가됩니다.
|
||||
- 번역 꺼짐(`translate=false`): 원문으로 HTML만 생성되고, 원문 텍스트가 인덱스에 추가됩니다.
|
||||
|
||||
파일 업로드 버전(`/pipeline/ingest_file`): `.txt`/`.pdf` 지원
|
||||
|
||||
```bash
|
||||
curl -s -X POST http://localhost:26000/pipeline/ingest_file \
|
||||
-H 'X-API-Key: <키>' \
|
||||
-F 'file=@/path/to/file.pdf' \
|
||||
-F 'doc_id=doc-001' \
|
||||
-F 'generate_html=true' \
|
||||
-F 'translate=false' \
|
||||
-F 'target_language=ko'
|
||||
```
|
||||
|
||||
|
||||
Paperless에서 다수 문서를 일괄 인덱싱합니다.
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Depends
|
||||
from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Any
|
||||
@@ -175,6 +175,54 @@ def pipeline_ingest(req: PipelineIngestRequest, _: None = Depends(require_api_ke
|
||||
return {"status": "ok", "doc_id": result.doc_id, "added": result.added_chunks, "chunks": result.chunks, "html_path": result.html_path}
|
||||
|
||||
|
||||
@app.post("/pipeline/ingest_file")
async def pipeline_ingest_file(
    _: None = Depends(require_api_key),
    file: UploadFile = File(...),
    doc_id: str = Form(...),
    generate_html: bool = Form(True),
    translate: bool = Form(True),
    target_language: str = Form("ko"),
) -> Dict[str, Any]:
    """Extract text from an uploaded .txt/.pdf file and run it through the pipeline.

    The file type is decided from the declared content type or, failing that,
    from the (case-insensitive) filename extension.

    Args:
        file: Uploaded multipart file; only plain text and PDF are accepted.
        doc_id: Identifier forwarded to ``pipeline.process``.
        generate_html: Forwarded to ``pipeline.process``.
        translate: Forwarded to ``pipeline.process``.
        target_language: Forwarded to ``pipeline.process``.

    Returns:
        A dict with ``status``, ``doc_id``, ``added``, ``chunks`` and
        ``html_path`` taken from the pipeline result.

    Raises:
        HTTPException: 400 when the file type is unsupported, PDF extraction
            fails, or no non-whitespace text could be extracted.
    """
    content_type = (file.content_type or "").lower()
    # The filename is optional on an upload part; normalise it so a missing
    # name cannot raise AttributeError and so ".PDF"/".TXT" are recognised.
    filename = (file.filename or "").lower()
    raw = await file.read()

    if "text/plain" in content_type or filename.endswith(".txt"):
        try:
            text = raw.decode("utf-8")
        except UnicodeDecodeError:
            # Best-effort fallback: latin-1 maps every byte, so this always succeeds.
            text = raw.decode("latin-1", errors="ignore")
    elif "pdf" in content_type or filename.endswith(".pdf"):
        try:
            # Imported lazily so pypdf is only needed when a PDF is uploaded.
            # NOTE(review): a missing pypdf install therefore surfaces as a
            # 400 pdf_extract_error, same as before — confirm that is intended.
            from pypdf import PdfReader
            from io import BytesIO

            reader = PdfReader(BytesIO(raw))
            parts: List[str] = []
            for page in reader.pages:
                try:
                    parts.append(page.extract_text() or "")
                except Exception:
                    # Deliberate best-effort: one unreadable page should not
                    # abort the whole document; it contributes an empty part.
                    parts.append("")
            text = "\n\n".join(parts)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"pdf_extract_error: {e}") from e
    else:
        raise HTTPException(status_code=400, detail="unsupported_file_type (only .txt/.pdf)")

    if not text.strip():
        raise HTTPException(status_code=400, detail="empty_text_after_extraction")

    result = pipeline.process(
        doc_id=doc_id,
        text=text,
        index=index,
        generate_html=generate_html,
        translate=translate,
        target_language=target_language,
    )
    return {
        "status": "ok",
        "doc_id": result.doc_id,
        "added": result.added_chunks,
        "chunks": result.chunks,
        "html_path": result.html_path,
    }
|
||||
|
||||
|
||||
# Paperless webhook placeholder (to be wired with user-provided details)
|
||||
class PaperlessHook(BaseModel):
|
||||
document_id: int
|
||||
|
||||
Reference in New Issue
Block a user