1. [critical] config.yaml → settings 객체에서 taxonomy 로드 (import crash 방지) 2. [high] ODF 변환: file_path 유지, derived_path 별도 필드 (무한 중복 방지) 3. [high] 법령 분할: 첫 장 이전 조문을 "서문"으로 보존 4. [high] Inbox: review_status 필드 분리 (pending/approved/rejected) 5. [high] 삭제: soft-delete (deleted_at) + worker 방어 + active_documents 뷰 - 모든 조회에 deleted_at IS NULL 일관 적용 - queue_consumer: row 없으면 gracefully skip Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
400 lines
12 KiB
Python
400 lines
12 KiB
Python
"""문서 CRUD API"""
|
|
|
|
import shutil
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Annotated
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile, status
|
|
from fastapi.responses import FileResponse
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import func, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from core.config import settings
|
|
from core.database import get_session
|
|
from core.utils import file_hash
|
|
from models.document import Document
|
|
from models.queue import ProcessingQueue
|
|
from models.user import User
|
|
|
|
# Router on which the document endpoints declared in this module are registered.
router = APIRouter()
|
|
|
|
|
|
# ─── 스키마 ───
|
|
|
|
|
|
class DocumentResponse(BaseModel):
    """Response schema for a single document.

    Instances are built from ORM ``Document`` objects with
    ``DocumentResponse.model_validate(doc)``, which is why attribute-based
    loading (``from_attributes``) is enabled below.
    """

    # Pydantic v2 configuration. The nested ``class Config`` form is deprecated
    # in v2 (it emits a deprecation warning); ``model_config`` is the supported
    # spelling and accepts a plain dict.
    model_config = {"from_attributes": True}

    # Identity and stored-file attributes.
    id: int
    file_path: str
    file_format: str
    file_size: int | None
    file_type: str
    title: str | None

    # Classification fields (``ai_*``); several are user-editable via
    # ``DocumentUpdate``.
    ai_domain: str | None
    ai_sub_group: str | None
    ai_tags: list | None
    ai_summary: str | None
    document_type: str | None
    importance: str | None
    ai_confidence: float | None
    user_note: str | None

    # Conversion / review / preview state.
    derived_path: str | None
    original_format: str | None
    conversion_status: str | None
    review_status: str | None
    edit_url: str | None
    preview_status: str | None

    # Provenance.
    source_channel: str | None
    data_origin: str | None

    # Timestamps.
    extracted_at: datetime | None
    ai_processed_at: datetime | None
    embedded_at: datetime | None
    created_at: datetime
    updated_at: datetime
|
|
|
|
|
|
class DocumentListResponse(BaseModel):
    """Paginated list of documents returned by the list endpoint."""

    # Documents on the requested page.
    items: list[DocumentResponse]
    # Number of documents matching the filters, across all pages.
    total: int
    # Page number that was requested.
    page: int
    # Page size that was requested.
    page_size: int
|
|
|
|
|
|
class DocumentUpdate(BaseModel):
    """Request body for a partial metadata update.

    Every field defaults to ``None`` so a client can send only the fields it
    wants to change; the update endpoint applies just the fields that were
    explicitly set in the request.
    """

    title: str | None = None

    # Manual overrides of the classification fields.
    ai_domain: str | None = None
    ai_sub_group: str | None = None
    ai_tags: list | None = None

    user_note: str | None = None
    edit_url: str | None = None

    # Provenance fields.
    source_channel: str | None = None
    data_origin: str | None = None
|
|
|
|
|
|
# ─── 스키마 (트리) ───
|
|
|
|
|
|
class TreeNode(BaseModel):
    """One node of the domain tree; nodes nest recursively via ``children``."""

    # Name of this path segment.
    name: str
    # Slash-joined path from the root down to this node.
    path: str
    # Document count attributed to this node.
    count: int
    # Child nodes (self-referencing, hence the forward reference).
    children: list["TreeNode"]
|
|
|
|
|
|
# ─── 엔드포인트 ───
|
|
|
|
|
|
@router.get("/tree")
async def get_document_tree(
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return the domain tree with document counts (used by the sidebar).

    Each non-empty ``ai_domain`` value of a non-deleted document is split on
    ``/``; every segment becomes a node, and a document group's count is added
    to every node along its path, so a parent's count includes its descendants.

    Returns:
        A list of root nodes, each a dict with ``name``, ``path``, ``count``
        and ``children`` (same shape, recursively), sorted by name per level.
    """
    from sqlalchemy import text as sql_text

    domain_counts = await session.execute(
        sql_text("""
            SELECT ai_domain, COUNT(*)
            FROM documents
            WHERE ai_domain IS NOT NULL AND ai_domain != ''
            AND deleted_at IS NULL
            GROUP BY ai_domain
            ORDER BY ai_domain
        """)
    )

    # Fold the flat "a/b/c" paths into nested dicts keyed by segment name.
    tree: dict = {}
    for domain_path, count in domain_counts:
        level = tree
        for segment in domain_path.split("/"):
            entry = level.setdefault(segment, {"_count": 0, "_children": {}})
            entry["_count"] += count
            level = entry["_children"]

    def to_nodes(level: dict, parent_path: str = "") -> list[dict]:
        """Convert one level of the nested dict into a sorted list of nodes."""
        nodes = []
        for name in sorted(level):
            entry = level[name]
            full_path = name if not parent_path else f"{parent_path}/{name}"
            nodes.append({
                "name": name,
                "path": full_path,
                "count": entry["_count"],
                "children": to_nodes(entry["_children"], full_path),
            })
        return nodes

    return to_nodes(tree)
|
|
|
|
|
|
@router.get("/", response_model=DocumentListResponse)
async def list_documents(
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    domain: str | None = None,
    sub_group: str | None = None,
    source: str | None = None,
    format: str | None = None,
):
    """List documents with pagination and optional filters.

    Soft-deleted documents (``deleted_at`` set) are always excluded.

    Args:
        page: 1-based page number.
        page_size: Items per page (1-100).
        domain: Prefix filter on ``ai_domain``; selecting a parent domain
            includes everything beneath it.
        sub_group: Exact-match filter on ``ai_sub_group``.
        source: Exact-match filter on ``source_channel``.
        format: Exact-match filter on ``file_format``. (The name shadows the
            builtin but is part of the public query-string interface.)

    Returns:
        A ``DocumentListResponse`` with the page of items, newest first, and
        the total number of matching documents.
    """
    query = select(Document).where(Document.deleted_at.is_(None))

    if domain:
        # autoescape: domain names contain "_" (e.g. Industrial_Safety), which
        # is a single-character wildcard in LIKE and would otherwise match
        # unrelated values.
        query = query.where(Document.ai_domain.startswith(domain, autoescape=True))
    if sub_group:
        # This parameter was previously accepted but silently ignored.
        query = query.where(Document.ai_sub_group == sub_group)
    if source:
        query = query.where(Document.source_channel == source)
    if format:
        query = query.where(Document.file_format == format)

    # Total count is taken over the filtered query, before pagination.
    count_query = select(func.count()).select_from(query.subquery())
    total = (await session.execute(count_query)).scalar() or 0

    # Pagination
    query = query.order_by(Document.created_at.desc())
    query = query.offset((page - 1) * page_size).limit(page_size)
    result = await session.execute(query)
    items = result.scalars().all()

    return DocumentListResponse(
        items=[DocumentResponse.model_validate(doc) for doc in items],
        total=total,
        page=page,
        page_size=page_size,
    )
|
|
|
|
|
|
@router.get("/{doc_id}", response_model=DocumentResponse)
async def get_document(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Fetch a single document by id.

    Raises:
        HTTPException: 404 when the row does not exist or is soft-deleted.
    """
    found = await session.get(Document, doc_id)
    # Only rows that exist and have not been soft-deleted are visible.
    if found and found.deleted_at is None:
        return DocumentResponse.model_validate(found)
    raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
|
|
|
|
|
|
# Extension -> Content-Type used when serving original files inline.
_FILE_MEDIA_TYPES = {
    ".pdf": "application/pdf",
    ".jpg": "image/jpeg", ".jpeg": "image/jpeg",
    ".png": "image/png", ".gif": "image/gif",
    ".bmp": "image/bmp", ".tiff": "image/tiff",
    ".svg": "image/svg+xml",
    ".txt": "text/plain", ".md": "text/plain",
    ".html": "text/html", ".csv": "text/csv",
    ".json": "application/json", ".xml": "application/xml",
}


@router.get("/{doc_id}/file")
async def get_document_file(
    doc_id: int,
    session: Annotated[AsyncSession, Depends(get_session)],
    token: str | None = Query(None, description="Bearer token (iframe용)"),
    user: User | None = Depends(lambda: None),
):
    """Serve a document's original file inline.

    Authentication is via the ``?token=`` query parameter only (so the URL can
    be used as an iframe source); a request without it is rejected. ``user`` is
    a placeholder dependency that always resolves to ``None`` and is unused.

    Raises:
        HTTPException: 401 when the token is missing or is not a valid access
            token; 404 when the document does not exist, is soft-deleted, or
            its file is not present on the NAS.
    """
    from core.auth import decode_token

    if not token:
        raise HTTPException(status_code=401, detail="토큰이 필요합니다")
    payload = decode_token(token)
    if not payload or payload.get("type") != "access":
        raise HTTPException(status_code=401, detail="유효하지 않은 토큰")

    doc = await session.get(Document, doc_id)
    # Soft-deleted documents must not be served, consistent with get_document.
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    file_path = Path(settings.nas_mount_path) / doc.file_path
    # is_file() rather than exists(): a directory would pass exists() and then
    # fail inside FileResponse with a server error.
    if not file_path.is_file():
        raise HTTPException(status_code=404, detail="파일을 찾을 수 없습니다")

    media_type = _FILE_MEDIA_TYPES.get(
        file_path.suffix.lower(), "application/octet-stream"
    )

    # NOTE(review): uploaded .html/.svg files are served inline with an
    # executable content type from the API origin; confirm that is intended.
    return FileResponse(
        path=str(file_path),
        media_type=media_type,
        headers={"Content-Disposition": "inline"},
    )
|
|
|
|
|
|
@router.post("/", response_model=DocumentResponse, status_code=201)
async def upload_document(
    file: UploadFile,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Upload a file: store it in the Inbox, register it, and queue processing.

    The file is written under ``<nas_mount_path>/PKM/Inbox``. If the name is
    already taken, ``_1``, ``_2``, ... is appended to the stem. A ``Document``
    row and a pending ``extract`` entry in the processing queue are then
    created in one transaction.

    Raises:
        HTTPException: 400 when the filename is missing, hidden (starts with
            "."), or resolves outside the Inbox directory.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="파일명이 필요합니다")

    # Keep only the final path component to prevent directory traversal.
    safe_name = Path(file.filename).name
    if not safe_name or safe_name.startswith("."):
        raise HTTPException(status_code=400, detail="유효하지 않은 파일명")

    # Resolve the NAS root once so every path below is in the same canonical
    # form; mixing a resolved target with an unresolved root makes
    # relative_to() raise when the mount path contains a symlink.
    nas_root = Path(settings.nas_mount_path).resolve()
    inbox_dir = nas_root / "PKM" / "Inbox"
    inbox_dir.mkdir(parents=True, exist_ok=True)
    inbox_dir = inbox_dir.resolve()
    target = (inbox_dir / safe_name).resolve()

    # Component-wise containment check; a plain string prefix test would also
    # accept sibling directories such as ".../Inbox_other".
    if not target.is_relative_to(inbox_dir):
        raise HTTPException(status_code=400, detail="잘못된 파일 경로")

    content = await file.read()

    # Create the file exclusively ("x" mode) so that two concurrent uploads of
    # the same name can never overwrite each other; on collision move on to
    # the next numbered name.
    stem, suffix = target.stem, target.suffix
    counter = 1
    while True:
        try:
            with target.open("xb") as out:
                out.write(content)
            break
        except FileExistsError:
            target = inbox_dir / f"{stem}_{counter}{suffix}"
            counter += 1

    try:
        # Path relative to the NAS root is what gets stored in the DB.
        rel_path = str(target.relative_to(nas_root))
        fhash = file_hash(target)
        ext = target.suffix.lstrip(".").lower() or "unknown"

        doc = Document(
            file_path=rel_path,
            file_hash=fhash,
            file_format=ext,
            file_size=len(content),
            file_type="immutable",
            title=target.stem,
            source_channel="manual",
        )
        session.add(doc)
        # Flush so doc.id is assigned before the queue row references it.
        await session.flush()

        session.add(ProcessingQueue(
            document_id=doc.id,
            stage="extract",
            status="pending",
        ))
        await session.commit()
    except Exception:
        # Registration failed: remove the file just written so the Inbox does
        # not accumulate files with no DB row, then propagate the error.
        target.unlink(missing_ok=True)
        raise

    return DocumentResponse.model_validate(doc)
|
|
|
|
|
|
@router.patch("/{doc_id}", response_model=DocumentResponse)
async def update_document(
    doc_id: int,
    body: DocumentUpdate,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Update document metadata (manual override).

    Only fields explicitly present in the request body are applied, so an
    explicit ``null`` clears a field while an omitted field is left untouched.

    Raises:
        HTTPException: 404 when the document does not exist or is soft-deleted.
    """
    doc = await session.get(Document, doc_id)
    # Soft-deleted documents are treated as absent, consistent with get_document.
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    # exclude_unset: distinguish "not sent" from "sent as null".
    for field, value in body.model_dump(exclude_unset=True).items():
        setattr(doc, field, value)
    doc.updated_at = datetime.now(timezone.utc)
    await session.commit()

    return DocumentResponse.model_validate(doc)
|
|
|
|
|
|
@router.put("/{doc_id}/content")
async def save_document_content(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    body: dict = None,
):
    """Overwrite a Markdown/text document's file and refresh its metadata.

    The request body is a JSON object whose ``content`` key holds the new text
    (missing body or key means empty content). The file is rewritten as UTF-8,
    then ``file_size``, ``file_hash``, ``extracted_text`` (first 15000
    characters) and ``updated_at`` are updated.

    Raises:
        HTTPException: 404 when the document does not exist or is
            soft-deleted; 400 when the format is not ``md``/``txt`` or when
            ``content`` is not a string.
    """
    doc = await session.get(Document, doc_id)
    # Soft-deleted documents are treated as absent, consistent with get_document.
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    if doc.file_format not in ("md", "txt"):
        raise HTTPException(status_code=400, detail="편집 가능한 포맷이 아닙니다 (md, txt만 가능)")

    content = body.get("content", "") if body else ""
    # The body is an untyped dict, so reject non-string content up front
    # instead of failing with a TypeError (HTTP 500) inside write_text().
    if not isinstance(content, str):
        raise HTTPException(status_code=400, detail="content must be a string")

    file_path = Path(settings.nas_mount_path) / doc.file_path
    file_path.write_text(content, encoding="utf-8")

    # Refresh metadata to match what was just written.
    doc.file_size = len(content.encode("utf-8"))
    doc.file_hash = file_hash(file_path)
    doc.extracted_text = content[:15000]
    doc.updated_at = datetime.now(timezone.utc)
    await session.commit()

    return DocumentResponse.model_validate(doc)
|
|
|
|
|
|
@router.get("/{doc_id}/preview")
async def get_document_preview(
    doc_id: int,
    session: Annotated[AsyncSession, Depends(get_session)],
    token: str | None = Query(None, description="Bearer token (iframe용)"),
):
    """Serve the cached PDF preview of a document inline.

    Authentication is via the ``?token=`` query parameter (usable as an iframe
    source). The preview is read from
    ``<nas_mount_path>/PKM/.preview/<doc_id>.pdf``.

    Raises:
        HTTPException: 401 when the token is missing or is not a valid access
            token; 404 when the document does not exist, is soft-deleted, or
            no preview file is present.
    """
    from core.auth import decode_token

    if not token:
        raise HTTPException(status_code=401, detail="토큰이 필요합니다")
    payload = decode_token(token)
    if not payload or payload.get("type") != "access":
        raise HTTPException(status_code=401, detail="유효하지 않은 토큰")

    doc = await session.get(Document, doc_id)
    # Soft-deleted documents must not be served, consistent with get_document.
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    preview_path = Path(settings.nas_mount_path) / "PKM" / ".preview" / f"{doc_id}.pdf"
    if not preview_path.exists():
        raise HTTPException(status_code=404, detail="미리보기가 아직 생성되지 않았습니다")

    return FileResponse(
        path=str(preview_path),
        media_type="application/pdf",
        headers={"Content-Disposition": "inline"},
    )
|
|
|
|
|
|
@router.delete("/{doc_id}")
async def delete_document(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    delete_file: bool = Query(False, description="NAS 파일도 함께 삭제"),
):
    """Soft-delete a document by setting ``deleted_at``.

    The row and the file on the NAS are both kept. ``delete_file`` is accepted
    for interface compatibility but is currently not acted on by this
    endpoint.

    Raises:
        HTTPException: 404 when the document does not exist or has already
            been soft-deleted.
    """
    doc = await session.get(Document, doc_id)
    # Treat an already-deleted document as absent so a repeated DELETE does
    # not overwrite the original deletion timestamp.
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    # Soft-delete only; per the original note, physical files are cleaned up
    # later by a separate cleanup job.
    doc.deleted_at = datetime.now(timezone.utc)
    await session.commit()

    return {"message": f"문서 {doc_id} soft-delete 완료"}
|