diff --git a/app/api/documents.py b/app/api/documents.py index 6346223..02af95c 100644 --- a/app/api/documents.py +++ b/app/api/documents.py @@ -36,6 +36,9 @@ class DocumentResponse(BaseModel): ai_sub_group: str | None ai_tags: list | None ai_summary: str | None + document_type: str | None + importance: str | None + ai_confidence: float | None user_note: str | None original_path: str | None original_format: str | None diff --git a/app/models/document.py b/app/models/document.py index fd69c46..c4868b9 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -38,6 +38,9 @@ class Document(Base): ai_sub_group: Mapped[str | None] = mapped_column(String(100)) ai_model_version: Mapped[str | None] = mapped_column(String(50)) ai_processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True)) + document_type: Mapped[str | None] = mapped_column(String(50)) + importance: Mapped[str | None] = mapped_column(String(20), default="medium") + ai_confidence: Mapped[float | None] = mapped_column() # 3계층: 벡터 임베딩 embedding = mapped_column(Vector(768), nullable=True) diff --git a/app/prompts/classify.txt b/app/prompts/classify.txt index 232cde9..b3d072d 100644 --- a/app/prompts/classify.txt +++ b/app/prompts/classify.txt @@ -1,51 +1,93 @@ -당신은 문서 분류 AI입니다. 아래 문서를 분석하고 반드시 JSON 형식으로만 응답하세요. 다른 텍스트는 출력하지 마세요. +You are a document classification AI. Analyze the document below and respond ONLY in JSON format. No other text. 
-## 응답 형식 +## Response Format { - "tags": ["태그1", "태그2", "태그3"], - "domain": "도메인경로", - "sub_group": "하위그룹", - "sourceChannel": "유입경로", - "dataOrigin": "work 또는 external" + "domain": "Level1/Level2/Level3", + "document_type": "one of document_types", + "confidence": 0.85, + "tags": ["tag1", "tag2"], + "importance": "medium", + "sourceChannel": "inbox_route", + "dataOrigin": "work or external" } -## 도메인 선택지 (NAS 폴더 경로) -- Knowledge/Philosophy — 철학, 사상, 인문학 -- Knowledge/Language — 어학, 번역, 언어학 -- Knowledge/Engineering — 공학 전반 기술 문서 -- Knowledge/Industrial_Safety — 산업안전, 규정, 인증 -- Knowledge/Programming — 개발, 코드, IT 기술 -- Knowledge/General — 일반 도서, 독서 노트, 메모 -- Reference — 도면, 참고자료, 규격표 +## Domain Taxonomy (select the most specific leaf node) -## 하위 그룹 예시 (도메인별) -- Knowledge/Industrial_Safety: Legislation, Standards, Cases -- Knowledge/Programming: Language, Framework, DevOps, AI_ML -- Knowledge/Engineering: Mechanical, Electrical, Network -- 잘 모르겠으면: (비워둠) +Philosophy/ + Ethics, Metaphysics, Epistemology, Logic, Aesthetics, Eastern_Philosophy, Western_Philosophy -## 태그 체계 -태그는 최대 5개, 한글 사용. 
아래 계층 구조 중에서 선택: -- @상태/: 처리중, 검토필요, 완료, 아카이브 -- #주제/기술/: 서버관리, 네트워크, AI-ML -- #주제/산업안전/: 법령, 위험성평가, 순회점검, 안전교육, 사고사례, 신고보고, 안전관리자, 보건관리자 -- #주제/업무/: 프로젝트, 회의, 보고서 -- $유형/: 논문, 법령, 기사, 메모, 이메일, 채팅로그, 도면, 체크리스트 -- !우선순위/: 긴급, 중요, 참고 +Language/ + Korean, English, Japanese, Translation, Linguistics -## sourceChannel 값 -- tksafety: TKSafety API 업무 실적 -- devonagent: 자동 수집 뉴스 -- law_monitor: 법령 API 법령 변경 -- inbox_route: Inbox AI 분류 (이 프롬프트에 의한 분류) -- email: MailPlus 이메일 -- web_clip: Web Clipper 스크랩 -- manual: 직접 추가 -- drive_sync: Synology Drive 동기화 +Engineering/ + Mechanical/ Piping, HVAC, Equipment + Electrical/ Power, Instrumentation + Chemical/ Process, Material + Civil + Network/ Server, Security, Infrastructure -## dataOrigin 값 -- work: 자사 업무 관련 (TK, 테크니컬코리아, 공장, 생산, 사내) -- external: 외부 참고 자료 (뉴스, 논문, 법령, 일반 정보) +Industrial_Safety/ + Legislation/ Act, Decree, Foreign_Law, Korea_Law_Archive, Enforcement_Rule, Public_Notice, SAPA + Theory/ Industrial_Safety_General, Safety_Health_Fundamentals + Academic_Papers/ Safety_General, Risk_Assessment_Research + Cases/ Domestic, International + Practice/ Checklist, Contractor_Management, Safety_Education, Emergency_Plan, Patrol_Inspection, Permit_to_Work, PPE, Safety_Plan + Risk_Assessment/ KRAS, JSA, Checklist_Method + Safety_Manager/ Appointment, Duty_Record, Improvement, Inspection, Meeting + Health_Manager/ Appointment, Duty_Record, Ergonomics, Health_Checkup, Mental_Health, MSDS, Work_Environment -## 분류 대상 문서 +Programming/ + Programming_Language/ Python, JavaScript, Go, Rust + Framework/ FastAPI, SvelteKit, React + DevOps/ Docker, CI_CD, Linux_Administration + AI_ML/ Large_Language_Model, Computer_Vision, Data_Science + Database + Software_Architecture + +General/ + Reading_Notes, Self_Development, Business, Science, History + +## Classification Rules +- domain MUST be the most specific leaf node (e.g., Industrial_Safety/Practice/Patrol_Inspection, NOT Industrial_Safety/Practice) +- domain MUST be exactly ONE path +- If 
content spans multiple domains, choose by PRIMARY purpose +- If safety content is >30%, prefer Industrial_Safety +- If code is included, prefer Programming +- 2-level paths allowed ONLY when no leaf exists (e.g., Engineering/Civil) + +## Document Types (select exactly ONE) +Reference, Standard, Manual, Drawing, Template, Note, Academic_Paper, Law_Document, Report, Memo, Checklist, Meeting_Minutes, Specification + +### Document Type Detection Rules +- Step-by-step instructions → Manual +- Legal clauses/regulations → Law_Document +- Technical requirements → Specification +- Meeting discussion → Meeting_Minutes +- Checklist format → Checklist +- Academic/research format → Academic_Paper +- Technical drawings → Drawing +- If unclear → Note + +## Confidence (0.0 ~ 1.0) +- How confident are you in the domain classification? +- 0.85+ = high confidence, 0.6~0.85 = moderate, <0.6 = uncertain + +## Tags +- Free-form tags (Korean or English) +- Include: person names, technology names, concepts, project names +- Maximum 5 tags + +## Importance +- high: urgent or critical documents +- medium: normal working documents +- low: reference or archive material + +## sourceChannel +- inbox_route (this classification) + +## dataOrigin +- work: company-related (TK, Technicalkorea, factory, production) +- external: external reference (news, papers, laws, general info) + +## Document to classify {document_text} diff --git a/app/workers/classify_worker.py b/app/workers/classify_worker.py index 2609c28..04134ea 100644 --- a/app/workers/classify_worker.py +++ b/app/workers/classify_worker.py @@ -1,30 +1,67 @@ -"""AI 분류 워커 — Qwen3.5로 도메인/태그/요약 생성 + Inbox→Knowledge 이동""" +"""AI 분류 워커 — taxonomy 기반 도메인/문서타입/태그/요약 생성""" +import yaml from datetime import datetime, timezone from pathlib import Path from sqlalchemy.ext.asyncio import AsyncSession -from ai.client import AIClient, parse_json_response +from ai.client import AIClient, parse_json_response, strip_thinking from core.config import settings 
def _get_taxonomy_leaf_paths(taxonomy: dict, prefix: str = "") -> set[str]:
    """Collect every domain path the classifier is allowed to return.

    The taxonomy is a nested mapping (as loaded from YAML) whose values are
    either further mappings, lists of leaf names, or empty/scalar values.

    Despite the name, the result is not limited to leaves: a node whose
    value is a non-empty list contributes each ``node/leaf`` path *and*
    the node itself, so that a partially correct answer can fall back to
    its parent category. A node whose value is a non-empty mapping is only
    descended into and is not added on its own.

    Args:
        taxonomy: Nested taxonomy mapping. ``None`` or an empty mapping
            (e.g. a ``taxonomy:`` key with no content) yields an empty set.
        prefix: Slash-joined path of the ancestors of ``taxonomy``; empty
            at the top level.

    Returns:
        Set of slash-joined domain paths.
    """
    paths: set[str] = set()
    if not taxonomy:
        # A missing/empty taxonomy section must not crash module import.
        return paths

    for key, value in taxonomy.items():
        current = f"{prefix}/{key}" if prefix else key
        if isinstance(value, dict) and value:
            paths.update(_get_taxonomy_leaf_paths(value, current))
        elif isinstance(value, list) and value:
            paths.update(f"{current}/{leaf}" for leaf in value)
            # The parent stays valid too; it is the fallback target when
            # the model returns an unknown leaf under a known category.
            paths.add(current)
        else:
            # Empty dict, empty list, None or scalar: the node is a leaf.
            paths.add(current)
    return paths


def _validate_domain(domain: str) -> str:
    """Map a model-supplied domain onto a path that exists in the taxonomy.

    The value is normalized first (surrounding whitespace and empty path
    segments such as leading/trailing slashes are dropped). If the
    normalized path is known it is returned as is; otherwise trailing
    segments are removed one at a time until a known path is found.

    Args:
        domain: The ``domain`` value from the parsed model response.
            Non-string values (e.g. JSON ``null``) are treated as empty.

    Returns:
        A path contained in ``VALID_DOMAIN_PATHS``, or
        ``"General/Reading_Notes"`` when nothing matches.
    """
    # The model output is untrusted: "domain" may be null or another type.
    raw = domain if isinstance(domain, str) else ""
    parts = [segment.strip() for segment in raw.split("/")]
    parts = [segment for segment in parts if segment]

    normalized = "/".join(parts)
    if normalized in VALID_DOMAIN_PATHS:
        return normalized

    # Try progressively shorter prefixes; the full path was already checked.
    for depth in range(len(parts) - 1, 0, -1):
        partial = "/".join(parts[:depth])
        if partial in VALID_DOMAIN_PATHS:
            logger.warning(f"[분류] domain '{domain}' → '{partial}' (부분 매칭)")
            return partial

    logger.warning(f"[분류] domain '{domain}' taxonomy에 없음, General/Reading_Notes로 대체")
    return "General/Reading_Notes"
{raw_response[:200]}") - # 유효성 검증 + DB 업데이트 - domain = parsed.get("domain", "") - if domain not in VALID_DOMAINS: - logger.warning(f"[분류] document_id={document_id}: 알 수 없는 도메인 '{domain}', Knowledge/General로 대체") - domain = "Knowledge/General" - + # domain 검증 + domain = _validate_domain(parsed.get("domain", "")) doc.ai_domain = domain - doc.ai_sub_group = parsed.get("sub_group", "") - doc.ai_tags = parsed.get("tags", []) + # sub_group은 domain 경로에서 추출 (호환성) + parts = domain.split("/") + doc.ai_sub_group = parts[1] if len(parts) > 1 else "" + + # document_type 검증 + doc_type = parsed.get("document_type", "") + doc.document_type = doc_type if doc_type in DOCUMENT_TYPES else "Note" + + # confidence + confidence = parsed.get("confidence", 0.5) + doc.ai_confidence = max(0.0, min(1.0, float(confidence))) + + # importance + importance = parsed.get("importance", "medium") + doc.importance = importance if importance in ("high", "medium", "low") else "medium" + + # tags + doc.ai_tags = parsed.get("tags", [])[:5] + + # source/origin if parsed.get("sourceChannel") and not doc.source_channel: doc.source_channel = parsed["sourceChannel"] if parsed.get("dataOrigin") and not doc.data_origin: doc.data_origin = parsed["dataOrigin"] # ─── 요약 ─── - from ai.client import strip_thinking summary = await client.summarize(doc.extracted_text[:15000]) doc.ai_summary = strip_thinking(summary) @@ -70,15 +120,13 @@ async def process(document_id: int, session: AsyncSession) -> None: doc.ai_model_version = "qwen3.5-35b-a3b" doc.ai_processed_at = datetime.now(timezone.utc) - # 파일은 원본 위치 유지 (물리 이동 없음, DB 메타데이터만 관리) - logger.info( f"[분류] document_id={document_id}: " - f"domain={domain}, tags={doc.ai_tags}, summary={len(summary)}자" + f"domain={domain}, type={doc.document_type}, " + f"confidence={doc.ai_confidence:.2f}, tags={doc.ai_tags}" ) finally: await client.close() - # _move_to_knowledge 제거됨 — 파일은 원본 위치 유지, 분류는 DB 메타데이터만 diff --git a/config.yaml b/config.yaml index 7d32c9f..417e1e0 100644 --- 
a/config.yaml +++ b/config.yaml @@ -40,6 +40,66 @@ nas: mount_path: "/documents" pkm_root: "/documents/PKM" +# ─── 문서 분류 체계 ─── +taxonomy: + Philosophy: + Ethics: [] + Metaphysics: [] + Epistemology: [] + Logic: [] + Aesthetics: [] + Eastern_Philosophy: [] + Western_Philosophy: [] + Language: + Korean: [] + English: [] + Japanese: [] + Translation: [] + Linguistics: [] + Engineering: + Mechanical: [Piping, HVAC, Equipment] + Electrical: [Power, Instrumentation] + Chemical: [Process, Material] + Civil: [] + Network: [Server, Security, Infrastructure] + Industrial_Safety: + Legislation: [Act, Decree, Foreign_Law, Korea_Law_Archive, Enforcement_Rule, Public_Notice, SAPA] + Theory: [Industrial_Safety_General, Safety_Health_Fundamentals] + Academic_Papers: [Safety_General, Risk_Assessment_Research] + Cases: [Domestic, International] + Practice: [Checklist, Contractor_Management, Safety_Education, Emergency_Plan, Patrol_Inspection, Permit_to_Work, PPE, Safety_Plan] + Risk_Assessment: [KRAS, JSA, Checklist_Method] + Safety_Manager: [Appointment, Duty_Record, Improvement, Inspection, Meeting] + Health_Manager: [Appointment, Duty_Record, Ergonomics, Health_Checkup, Mental_Health, MSDS, Work_Environment] + Programming: + Programming_Language: [Python, JavaScript, Go, Rust] + Framework: [FastAPI, SvelteKit, React] + DevOps: [Docker, CI_CD, Linux_Administration] + AI_ML: [Large_Language_Model, Computer_Vision, Data_Science] + Database: [] + Software_Architecture: [] + General: + Reading_Notes: [] + Self_Development: [] + Business: [] + Science: [] + History: [] + +document_types: + - Reference + - Standard + - Manual + - Drawing + - Template + - Note + - Academic_Paper + - Law_Document + - Report + - Memo + - Checklist + - Meeting_Minutes + - Specification + schedule: law_monitor: "07:00" mailplus_archive: ["07:00", "18:00"] diff --git a/migrations/008_classify_fields.sql b/migrations/008_classify_fields.sql new file mode 100644 index 0000000..2d2e615 --- /dev/null +++ 
b/migrations/008_classify_fields.sql @@ -0,0 +1,4 @@ +-- 분류 체계 확장 필드 +ALTER TABLE documents ADD COLUMN IF NOT EXISTS document_type VARCHAR(50); +ALTER TABLE documents ADD COLUMN IF NOT EXISTS importance VARCHAR(20) DEFAULT 'medium'; +ALTER TABLE documents ADD COLUMN IF NOT EXISTS ai_confidence FLOAT;