feat: Paperless integration (content endpoint, list+sync). Add /paperless/sync and docs
This commit is contained in:
17
README.md
17
README.md
@@ -212,6 +212,22 @@ curl -s -X POST http://localhost:26000/paperless/hook \
|
|||||||
|
|
||||||
해당 훅은 문서 도착을 통지받는 용도로 제공됩니다. 실제 본문 텍스트는 Paperless API로 조회해 `/index/upsert`로 추가하세요.
|
해당 훅은 문서 도착을 통지받는 용도로 제공됩니다. 실제 본문 텍스트는 Paperless API로 조회해 `/index/upsert`로 추가하세요.
|
||||||
|
|
||||||
|
### Paperless 배치 동기화(`/paperless/sync`)
|
||||||
|
|
||||||
|
Paperless에서 다수 문서를 일괄 인덱싱합니다.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s -X POST http://localhost:26000/paperless/sync \
|
||||||
|
-H 'Content-Type: application/json' -H 'X-API-Key: <키>' \
|
||||||
|
-d '{
|
||||||
|
"page_size": 50,
|
||||||
|
"ordering": "-created",
|
||||||
|
"tags": null,
|
||||||
|
"query": null,
|
||||||
|
"limit": 200
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
## 시놀로지 메일/오피스 연동 가이드(요약)
|
## 시놀로지 메일/오피스 연동 가이드(요약)
|
||||||
|
|
||||||
- **검색/QA 호출 엔드포인트**: `http://<AI서버IP>:26000/search`, `http://<AI서버IP>:26000/chat`
|
- **검색/QA 호출 엔드포인트**: `http://<AI서버IP>:26000/search`, `http://<AI서버IP>:26000/chat`
|
||||||
@@ -229,6 +245,7 @@ curl -s -X POST http://localhost:26000/paperless/hook \
|
|||||||
- `BASE_MODEL`(기본 `qwen2.5:7b-instruct`)
|
- `BASE_MODEL`(기본 `qwen2.5:7b-instruct`)
|
||||||
- `BOOST_MODEL`(기본 `qwen2.5:14b-instruct`)
|
- `BOOST_MODEL`(기본 `qwen2.5:14b-instruct`)
|
||||||
- `ENGLISH_MODEL`(기본 `llama3:8b-instruct`): 영어 감지 시 라우팅 대상
|
- `ENGLISH_MODEL`(기본 `llama3:8b-instruct`): 영어 감지 시 라우팅 대상
|
||||||
|
- `ENGLISH_RATIO_THRESHOLD`(기본 `0.65`): 영어 비율 임계값(초과 시 영어 모델)
|
||||||
- `EMBEDDING_MODEL`(기본 `nomic-embed-text`)
|
- `EMBEDDING_MODEL`(기본 `nomic-embed-text`)
|
||||||
- `INDEX_PATH`(기본 `data/index.jsonl`)
|
- `INDEX_PATH`(기본 `data/index.jsonl`)
|
||||||
- `PAPERLESS_BASE_URL`, `PAPERLESS_TOKEN`(선택): Paperless API 연동 시 사용
|
- `PAPERLESS_BASE_URL`, `PAPERLESS_TOKEN`(선택): Paperless API 연동 시 사용
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ class Settings:
|
|||||||
base_model: str = os.getenv("BASE_MODEL", "qwen2.5:7b-instruct")
|
base_model: str = os.getenv("BASE_MODEL", "qwen2.5:7b-instruct")
|
||||||
boost_model: str = os.getenv("BOOST_MODEL", "qwen2.5:14b-instruct")
|
boost_model: str = os.getenv("BOOST_MODEL", "qwen2.5:14b-instruct")
|
||||||
english_model: str = os.getenv("ENGLISH_MODEL", "llama3:8b-instruct")
|
english_model: str = os.getenv("ENGLISH_MODEL", "llama3:8b-instruct")
|
||||||
|
english_ratio_threshold: float = float(os.getenv("ENGLISH_RATIO_THRESHOLD", "0.65"))
|
||||||
embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
|
embedding_model: str = os.getenv("EMBEDDING_MODEL", "nomic-embed-text")
|
||||||
index_path: str = os.getenv("INDEX_PATH", "data/index.jsonl")
|
index_path: str = os.getenv("INDEX_PATH", "data/index.jsonl")
|
||||||
|
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ def chat(req: ChatRequest) -> Dict[str, Any]:
|
|||||||
non_ascii_letters = sum((not ch.isascii()) and ch.isalpha() for ch in user_text)
|
non_ascii_letters = sum((not ch.isascii()) and ch.isalpha() for ch in user_text)
|
||||||
english_ratio = ascii_letters / max(ascii_letters + non_ascii_letters, 1)
|
english_ratio = ascii_letters / max(ascii_letters + non_ascii_letters, 1)
|
||||||
total_chars = len(user_text)
|
total_chars = len(user_text)
|
||||||
if english_ratio > 0.8:
|
if english_ratio > settings.english_ratio_threshold:
|
||||||
model = settings.english_model
|
model = settings.english_model
|
||||||
else:
|
else:
|
||||||
model = settings.boost_model if (req.force_boost or total_chars > 2000) else settings.base_model
|
model = settings.boost_model if (req.force_boost or total_chars > 2000) else settings.base_model
|
||||||
@@ -175,6 +175,55 @@ def paperless_hook(hook: PaperlessHook, _: None = Depends(require_api_key)) -> D
|
|||||||
return {"status": "indexed", "document_id": hook.document_id, "chunks": added}
|
return {"status": "indexed", "document_id": hook.document_id, "chunks": added}
|
||||||
|
|
||||||
|
|
||||||
|
class PaperlessSyncRequest(BaseModel):
    # Request body accepted by POST /paperless/sync.

    # Page size forwarded to the Paperless document listing call.
    page_size: int = 50

    # Ordering expression forwarded to the listing call.
    ordering: str = "-created"

    # Optional tag ids forwarded as a listing filter; None sends no tag filter.
    tags: List[int] | None = None

    # Optional search query forwarded to the listing call; None sends no query.
    query: str | None = None

    # The sync stops requesting further pages once this many listed documents
    # have been fetched.
    limit: int = 200
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/paperless/sync")
def paperless_sync(req: PaperlessSyncRequest, _: None = Depends(require_api_key)) -> Dict[str, Any]:
    """Index documents listed by Paperless in bulk.

    Pages through the Paperless document listing, fetches the text of each
    listed document, splits it with ``chunk_text``, embeds every chunk and
    appends the resulting rows to the index, one ``index.append`` call per
    page.

    Args:
        req: Listing options (page size, ordering, tag/query filters) plus
            ``limit``, the maximum number of listed documents to process.
            A non-positive ``limit`` processes nothing.
        _: Placeholder for the API-key dependency; its value is unused.

    Returns:
        ``{"status": "synced", "added": n}`` where ``n`` is the sum of the
        values returned by ``index.append``.

    Raises:
        requests.HTTPError: If a follow-up page request answers with an
            error status. Errors raised by the client, the embedding call
            or the index propagate unchanged.
    """
    # Imported once here rather than on every loop iteration.
    import requests as _rq

    from .index_store import IndexRow

    client = PaperlessClient(settings.paperless_base_url, settings.paperless_token)
    added_total = 0
    fetched = 0
    next_url: str | None = None

    while fetched < req.limit:
        if next_url:
            # Follow the pagination link returned with the previous page.
            resp = _rq.get(next_url, headers=client._headers(), timeout=60)
            resp.raise_for_status()
            data = resp.json()
        else:
            data = client.list_documents(
                page_size=req.page_size,
                ordering=req.ordering,
                tags=req.tags,
                query=req.query,
            )

        # Trim the page so the total never exceeds `limit`; without this a
        # page larger than the remaining budget was indexed in full.
        results = data.get("results", [])[: req.limit - fetched]

        to_append: List[IndexRow] = []
        for doc in results:
            doc_id = doc.get("id")
            if not doc_id:
                continue
            text = client.get_document_text(int(doc_id))
            if not text:
                continue
            for i, part in enumerate(chunk_text(text)):
                vec = ollama.embeddings(settings.embedding_model, part)
                to_append.append(
                    IndexRow(id=f"paperless:{doc_id}:{i}", text=part, vector=vec, source="paperless")
                )

        if to_append:
            added_total += index.append(to_append)

        fetched += len(results)
        next_url = data.get("next")
        if not next_url:
            break

    return {"status": "synced", "added": added_total}
|
||||||
|
|
||||||
|
|
||||||
# OpenAI-compatible chat completions (minimal)
|
# OpenAI-compatible chat completions (minimal)
|
||||||
class ChatCompletionsRequest(BaseModel):
|
class ChatCompletionsRequest(BaseModel):
|
||||||
model: str | None = None
|
model: str | None = None
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict, List, Optional
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
||||||
@@ -16,15 +16,47 @@ class PaperlessClient:
|
|||||||
headers["Authorization"] = f"Token {self.token}"
|
headers["Authorization"] = f"Token {self.token}"
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
def get_document(self, doc_id: int) -> Dict[str, Any]:
    """Fetch the JSON representation of a single document.

    Args:
        doc_id: Paperless document id used to build the request URL.

    Returns:
        The decoded JSON body of ``/api/documents/{doc_id}/``.

    Raises:
        RuntimeError: If no base URL is configured.
        requests.HTTPError: If the server answers with an error status.
    """
    if not self.base_url:
        raise RuntimeError("PAPERLESS_BASE_URL not configured")
    url = f"{self.base_url}/api/documents/{doc_id}/"
    resp = requests.get(url, headers=self._headers(), timeout=60)
    resp.raise_for_status()
    return resp.json()

def _try_fetch_text(self, url: str) -> str:
    """Return the body of ``url`` when it is a usable text response, else ""."""
    import logging

    try:
        r = requests.get(url, headers=self._headers(), timeout=60)
    except requests.RequestException as exc:
        # Best effort: a failing endpoint only means the next source is tried.
        # Only transport errors are absorbed; programming errors still surface.
        logging.getLogger(__name__).debug("Paperless text fetch failed for %s: %s", url, exc)
        return ""
    if r.status_code != 200:
        return ""
    # A 200 answer is not necessarily text: an endpoint may ignore unknown
    # query parameters and serve a binary file (e.g. a PDF). Accept only
    # text/* bodies, or bodies with no declared content type.
    content_type = r.headers.get("Content-Type", "").lower()
    if content_type and not content_type.startswith("text/"):
        return ""
    return r.text or ""

def get_document_text(self, doc_id: int) -> str:
    """Return the best available text for a document.

    Sources are tried in order: the ``content/`` endpoint, the
    ``download/?format=txt`` endpoint, then the ``content``, ``notes`` and
    ``title`` fields of the document JSON.

    Args:
        doc_id: Paperless document id.

    Returns:
        The first non-empty text found, or an empty string when the document
        JSON carries none of the fallback fields.

    Raises:
        RuntimeError: If no base URL is configured.
        requests.HTTPError: If the final metadata request fails.
    """
    if not self.base_url:
        raise RuntimeError("PAPERLESS_BASE_URL not configured")

    candidate_urls = (
        f"{self.base_url}/api/documents/{doc_id}/content/",
        f"{self.base_url}/api/documents/{doc_id}/download/?format=txt",
    )
    for url in candidate_urls:
        text = self._try_fetch_text(url)
        if text:
            return text

    # Fallback to metadata fields
    data = self.get_document(doc_id)
    return data.get("content", "") or data.get("notes", "") or data.get("title", "")
|
||||||
|
|
||||||
|
def list_documents(self, page_size: int = 50, ordering: str = "-created", tags: Optional[List[int]] = None, query: Optional[str] = None) -> Dict[str, Any]:
    """Request one page of the Paperless document listing.

    Args:
        page_size: Sent as the ``page_size`` query parameter.
        ordering: Sent as the ``ordering`` query parameter.
        tags: When non-empty, sent comma-joined as ``tags__id__in``.
        query: When non-empty, sent as the ``query`` parameter.

    Returns:
        The decoded JSON body of ``/api/documents/``.

    Raises:
        RuntimeError: If no base URL is configured.
        requests.HTTPError: If the server answers with an error status.
    """
    if not self.base_url:
        raise RuntimeError("PAPERLESS_BASE_URL not configured")

    # Optional filters are only sent when the caller supplied a value.
    filters: Dict[str, Any] = {}
    if tags:
        filters["tags__id__in"] = ",".join(map(str, tags))
    if query:
        filters["query"] = query
    request_params: Dict[str, Any] = {"page_size": page_size, "ordering": ordering, **filters}

    response = requests.get(
        f"{self.base_url}/api/documents/",
        headers=self._headers(),
        params=request_params,
        timeout=60,
    )
    response.raise_for_status()
    return response.json()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user