From 0cbba0ceeb63d5c7a228fbfe3feb95771efbc88f Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Fri, 15 May 2026 21:17:08 +0900 Subject: [PATCH] =?UTF-8?q?feat(ingest):=20devonagent=20=ED=8A=B8=EB=9E=99?= =?UTF-8?q?=20Phase=201=20ingest=20=ED=99=9C=EC=84=B1=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DEVONagent/DEVONthink 가 발견한 웹페이지를 NAS Web/ drop → file_watcher ingest → extract 4-tier fallback (trafilatura/sibling-md/readability/bs4) → embed + chunk 까지. classify/preview/markdown SKIP. - source_channel='devonagent' (migration 001 dormant 활성화) - file_watcher: SCAN_TARGETS 통합 + Web/ rglob + canonical_url dedup + sidecar 누락 정책 (skip 안 함, web_meta.sidecar_missing=true flag) - extract_worker: HTML+devonagent 분기 + md_extraction_engine 4-tier 구분 (trafilatura → sibling .md ≥200char → readability+markdownify → bs4_text) - queue_consumer: enqueue_next_stage 의 extract stage 만 source_channel- aware override (devonagent → [embed, chunk]) - classify_worker: devonagent safety skip (law_monitor 패턴 mirror, ai_domain='Web', ai_tags=['Web/{host}']) - requirements: trafilatura/readability-lxml/markdownify 추가 - docs: devonthink-web-bridge.md 설치 가이드 + first-wins 정책 명시 Phase 1 closure 기준 = 재료 품질 (검색 가능 + 노이즈율 + dedup + 엔진 분포). 활용처(ai_tldr/digest/PKM 회고)는 1-2주 OR 30-50건 관찰 후 별 PR 에서 결정. 
Plan: ~/.claude/plans/db-snuggly-petal.md Co-Authored-By: Claude Opus 4.7 (1M context) --- app/requirements.txt | 4 + app/workers/classify_worker.py | 16 ++ app/workers/extract_worker.py | 147 ++++++++++++++++- app/workers/file_watcher.py | 138 +++++++++++++++- app/workers/queue_consumer.py | 25 ++- docs/devonthink-web-bridge.md | 279 +++++++++++++++++++++++++++++++++ 6 files changed, 601 insertions(+), 8 deletions(-) create mode 100644 docs/devonthink-web-bridge.md diff --git a/app/requirements.txt b/app/requirements.txt index 9111787..da03960 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -17,3 +17,7 @@ python-multipart>=0.0.9 jinja2>=3.1.0 feedparser>=6.0.0 pymupdf>=1.24.0 +# Web/Blog ingest (devonagent 트랙) — HTML 본문 정화 4-tier fallback +trafilatura>=1.12.0 +readability-lxml>=0.8.1 +markdownify>=0.13.1 diff --git a/app/workers/classify_worker.py b/app/workers/classify_worker.py index 6a3ea78..d6b9f5e 100644 --- a/app/workers/classify_worker.py +++ b/app/workers/classify_worker.py @@ -373,6 +373,22 @@ async def process(document_id: int, session: AsyncSession) -> None: logger.info(f"doc {document_id}: law_monitor → classify skip") return + # Web/Blog ingest (devonagent 트랙) — plan db-snuggly-petal.md + # queue_consumer override 가 classify 를 skip 시키지만, 우회 경로 (예: 수동 enqueue) + # 로 들어왔을 때 안전망. ai_tldr/ai_bullets 같은 LLM 가공은 별 PR (Mac mini derived-worker). 
+ if doc.source_channel == "devonagent": + from urllib.parse import urlparse + if not doc.ai_domain: + doc.ai_domain = "Web" + if not doc.ai_tags: + host = (urlparse(doc.edit_url or "").hostname or "web").lower() + doc.ai_tags = [f"Web/{host}"] + if not doc.importance: + doc.importance = "medium" + await session.commit() + logger.info(f"doc {document_id}: devonagent → classify skip") + return + if not doc.extracted_text: raise ValueError(f"문서 ID {document_id}: extracted_text가 비어있음") diff --git a/app/workers/extract_worker.py b/app/workers/extract_worker.py index 8ebbb7e..4155019 100644 --- a/app/workers/extract_worker.py +++ b/app/workers/extract_worker.py @@ -1,5 +1,6 @@ -"""텍스트 추출 워커 — kordoc / PyMuPDF / Surya OCR / LibreOffice / 직접 읽기""" +"""텍스트 추출 워커 — kordoc / PyMuPDF / Surya OCR / LibreOffice / 직접 읽기 / 웹 HTML""" +import hashlib import re import subprocess from datetime import datetime, timezone @@ -101,6 +102,137 @@ async def _call_ocr(file_path: Path, is_image: bool, max_pages: int = 200) -> st return None +# ─── Web/Blog ingest (devonagent 트랙) — HTML → markdown 4-tier ──────────── + +_WEB_MIN_BODY_LEN = 200 # 4-tier fallback 전환 임계 + + +def _extract_web_with_trafilatura(html: str) -> tuple[str, str | None]: + """trafilatura 로 본문 markdown 추출. (body, engine_version) 반환. 
실패 시 ("", None).""" + try: + import trafilatura + except ImportError: + logger.warning("[web] trafilatura 미설치 — 다음 fallback 시도") + return "", None + try: + body = trafilatura.extract( + html, + output_format="markdown", + include_comments=False, + include_tables=True, + with_metadata=True, + deduplicate=True, + favor_precision=True, + ) + return (body or "", getattr(trafilatura, "__version__", "unknown")) + except Exception as e: + logger.warning(f"[web] trafilatura 실패: {e}") + return "", None + + +def _extract_web_with_readability(html: str) -> tuple[str, str | None]: + """readability-lxml 로 본문 추출 + markdownify 로 markdown 변환.""" + try: + from readability import Document as ReadabilityDocument + from markdownify import markdownify + except ImportError: + logger.warning("[web] readability/markdownify 미설치 — 다음 fallback 시도") + return "", None + try: + rd = ReadabilityDocument(html) + body_html = rd.summary() or "" + if not body_html: + return "", None + body_md = markdownify(body_html, heading_style="ATX") + return (body_md or "", "readability+markdownify") + except Exception as e: + logger.warning(f"[web] readability 실패: {e}") + return "", None + + +def _extract_web_with_bs4(html: str) -> tuple[str, str | None]: + """최종 fallback — BeautifulSoup 으로 script/style 제거 후 get_text.""" + try: + from bs4 import BeautifulSoup + except ImportError: + logger.warning("[web] beautifulsoup4 미설치 — 빈 본문 반환") + return "", None + try: + soup = BeautifulSoup(html, "lxml") + for tag in soup(["script", "style", "noscript", "nav", "footer", "aside"]): + tag.decompose() + text = soup.get_text(" ", strip=True) + return (text or "", "bs4_text") + except Exception as e: + logger.warning(f"[web] bs4 실패: {e}") + return "", None + + +async def _extract_web_html(doc: Document, html_path: Path) -> None: + """devonagent HTML → markdown 4-tier fallback. 
md_* 컬럼 전체 채움.""" + html_bytes = html_path.read_bytes() + html_text = html_bytes.decode("utf-8", errors="replace") + src_hash = hashlib.sha256(html_bytes).hexdigest() + + # 1) trafilatura + body, engine_ver = _extract_web_with_trafilatura(html_text) + engine = "trafilatura" if body and len(body) >= _WEB_MIN_BODY_LEN else None + + # 2) sibling .md (DEVONthink rendered) + if not engine: + md_path = html_path.with_suffix(".md") + if md_path.is_file(): + try: + md_body = md_path.read_text(encoding="utf-8", errors="replace") + if md_body and len(md_body) >= _WEB_MIN_BODY_LEN: + body = md_body + engine = "devonthink_export" + engine_ver = "smart_rule" + except Exception as e: + logger.warning(f"[web] sibling .md 읽기 실패 {md_path}: {e}") + + # 3) readability + markdownify + if not engine: + body2, ver2 = _extract_web_with_readability(html_text) + if body2 and len(body2) >= _WEB_MIN_BODY_LEN: + body = body2 + engine = "readability" + engine_ver = ver2 + + # 4) bs4 get_text (최종 fallback) + if not engine: + body3, ver3 = _extract_web_with_bs4(html_text) + if body3: + body = body3 + engine = "bs4_text" + engine_ver = ver3 + else: + body = "" + engine = "empty" + engine_ver = None + + clean_body = (body or "").replace("\x00", "") + now = datetime.now(timezone.utc) + + doc.extracted_text = clean_body + doc.extracted_at = now + doc.extractor_version = f"web@{engine}" + doc.md_content = clean_body + doc.md_status = "ready" if clean_body else "failed" + doc.md_extraction_engine = engine + doc.md_extraction_engine_version = engine_ver + doc.md_format_version = "1.0" + doc.md_generated_at = now + doc.md_source_hash = src_hash + doc.md_content_hash = hashlib.sha256(clean_body.encode("utf-8")).hexdigest() + doc.content_origin = "extracted" + + # extract_meta 의 web_meta 는 file_watcher 가 박은 그대로 유지 (sidecar 출처) + logger.info( + f"[web/{engine}] {doc.file_path} ({len(clean_body)}자, engine_ver={engine_ver})" + ) + + # ─── 메인 처리 ─── async def process(document_id: int, session: AsyncSession) 
-> None: @@ -112,6 +244,19 @@ async def process(document_id: int, session: AsyncSession) -> None: fmt = doc.file_format.lower() full_path = Path(settings.nas_mount_path) / doc.file_path + # ─── Web/Blog ingest (devonagent 트랙) — HTML 본문 정화 4-tier fallback ─── + # plan: ~/.claude/plans/db-snuggly-petal.md + # 1) trafilatura (markdown body) + # 2) sibling .md (DEVONthink rendered, >= 200 char) + # 3) readability-lxml + markdownify + # 4) BeautifulSoup get_text + # md_extraction_engine 으로 어느 경로로 추출됐는지 기록 → 품질 모니터링용 + if fmt == "html" and doc.source_channel == "devonagent": + if not full_path.exists(): + raise FileNotFoundError(f"파일 없음: {full_path}") + await _extract_web_html(doc, full_path) + return + # ─── 텍스트 파일 — 직접 읽기 ─── if fmt in TEXT_FORMATS: if not full_path.exists(): diff --git a/app/workers/file_watcher.py b/app/workers/file_watcher.py index 0c65f60..64e0116 100644 --- a/app/workers/file_watcher.py +++ b/app/workers/file_watcher.py @@ -1,4 +1,4 @@ -"""파일 감시 워커 — Inbox/Recordings/Videos 스캔, 새/변경 파일 자동 등록. +"""파일 감시 워커 — PKM(Inbox/Recordings/Videos) + Web(devonagent) 스캔, 자동 등록. 
§3 확장: - 스캔 대상: PKM/Inbox (문서) + PKM/Recordings (오디오) + PKM/Videos (비디오) @@ -8,9 +8,19 @@ - Roon 음원 경로(prefix match) skip — settings.roon_library_path - 파이프 분기: audio → stage='stt', video direct-play → stage='thumbnail', video quarantine → stage 없음 (처리 안 함, UI 에서 재생 불가 안내) + +Web/Blog ingest (devonagent 트랙, plan db-snuggly-petal.md): + - 스캔 대상: NAS/Web/{domain}/{YYYY-MM-DD}/{slug}.{html,md,json} + - DEVONthink Smart Rule 이 3종 export → 여기서 .html 만 진입 (sidecar 는 메타 소스) + - source_channel='devonagent', dedup = file_hash = sha256(canonical_url) + - first-wins 정책: 같은 canonical_url 재저장은 ingest 안 함 + - sidecar (.json) 누락 시: skip 안 하고 ingest, web_meta.sidecar_missing=true """ +import hashlib +import json from pathlib import Path +from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse from sqlalchemy import select @@ -34,7 +44,14 @@ VIDEO_QUARANTINE_EXTS = {".mov", ".mkv", ".avi"} # 변환 필요, 보관만 # library (외부 작성 학습 자료) 폴더 — md/pdf/docx 등 문서 확장자만 수락 LIBRARY_DOC_EXTS = {".md", ".pdf", ".docx", ".doc", ".txt", ".rtf", ".html", ".odt"} -# 스캔 대상: (하위경로, 예상 category) — None 은 문서함(카테고리 미지정) +# Web ingest — canonical URL 정규화 시 strip 할 추적 파라미터 +TRACKING_PARAMS = { + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "fbclid", "gclid", "msclkid", "ref", "ref_src", "ref_url", "mc_cid", "mc_eid", +} + +# 스캔 대상: (PKM 상대 하위경로, 예상 category) — None 은 문서함(카테고리 미지정) +# 모든 PKM 스캔은 source_channel='drive_sync'. Web 트랙은 별도 처리 (watch_inbox 안). SCAN_TARGETS: list[tuple[str, str | None]] = [ ("Inbox", None), ("Recordings", "audio"), @@ -95,10 +112,109 @@ def _route_media(path: Path, expected_category: str | None) -> tuple[str | None, return (None, False, "extract") +# ─── Web/Blog ingest (devonagent 트랙) 헬퍼 ────────────────────────────────── + +def _canonicalize_url(url: str) -> str: + """URL 정규화 — UTM/fbclid/fragment/trailing-slash 제거. dedup 의 진짜 기준. + + 같은 글의 utm 변형 (`?utm_source=foo`) 과 fragment 변형 (`#section`) 을 + 한 row 로 수렴시키기 위해 file_hash 산출 전 반드시 거친다. 
+ """ + if not url: + return "" + try: + p = urlparse(url.strip()) + clean_qs = [ + (k, v) for k, v in parse_qsl(p.query, keep_blank_values=True) + if k.lower() not in TRACKING_PARAMS + ] + clean_qs.sort() + path = p.path.rstrip("/") or "/" + netloc = p.netloc.lower() + return urlunparse((p.scheme.lower(), netloc, path, "", urlencode(clean_qs), "")) + except Exception: + return url.strip() + + +def _load_web_sidecar(html_path: Path) -> dict | None: + """sibling .json sidecar 읽기. 부재/파싱실패 시 None.""" + json_path = html_path.with_suffix(".json") + if not json_path.is_file(): + return None + try: + return json.loads(json_path.read_text(encoding="utf-8", errors="replace")) + except Exception as e: + logger.warning(f"[devonagent] sidecar parse 실패 {json_path}: {e}") + return None + + +async def _ingest_web_file(session, file_path: Path, rel_path: str) -> tuple[int, int]: + """Register one devonagent-track .html file as a documents row and enqueue its extract stage. + + - .md/.json are sidecars; the caller only passes .html paths here + - sidecar (.json) is a JSON object with a url: file_hash = sha256(canonical_url), web_meta filled from it + - sidecar missing, unparsable, not an object, or without url: still ingested, web_meta.sidecar_missing=true + - first-wins: an existing row with the same file_path or file_hash is left untouched; returns (0, 0), else (1, 0) + """ + sidecar = _load_web_sidecar(file_path) + if isinstance(sidecar, dict) and sidecar.get("url"): + raw_url = str(sidecar["url"]) + canonical_url = _canonicalize_url(raw_url) + fhash = hashlib.sha256(canonical_url.encode("utf-8")).hexdigest() + title = str(sidecar.get("title") or file_path.stem) + web_meta = { + "raw_url": raw_url, + "devonthink_uuid": sidecar.get("devonthink_uuid"), + "pub_date": sidecar.get("pub_date"), + "author": sidecar.get("author"), + "source_agent": sidecar.get("source_agent"), + } + edit_url = canonical_url + else: + canonical_url = None + fhash = hashlib.sha256(f"NO_URL:{rel_path}".encode("utf-8")).hexdigest() + title = file_path.stem + web_meta = {"sidecar_missing": True} + edit_url = None + + # devonagent dedup: match on file_path OR file_hash (URL identity first; absorbs a path re-slug) + result = await
session.execute( + select(Document).where( + (Document.file_path == rel_path) | (Document.file_hash == fhash) + ) + ) + existing = result.scalars().first()  # the OR filter can match two rows (one by path, one by hash); any hit means skip + if existing is not None: + # first-wins: changes are never re-ingested (Phase 1 policy; updates are a separate PR) + return (0, 0) + + doc = Document( + file_path=rel_path, + file_hash=fhash, + file_format="html", + file_size=file_path.stat().st_size, + file_type="immutable", + title=title, + source_channel="devonagent", + category="document", + data_origin="external", + import_source="devonthink", + edit_url=edit_url, + extract_meta={"web_meta": web_meta}, + ) + session.add(doc) + await session.flush() + await enqueue_stage(session, doc.id, "extract") + return (1, 0) + + async def watch_inbox(): - """PKM 하위 디렉토리를 스캔하여 새/변경 파일을 DB 등록 + 파이프 투입.""" - pkm_root = Path(settings.nas_mount_path) / "PKM" - if not pkm_root.exists(): + """PKM 하위 디렉토리 + Web/ 를 스캔하여 새/변경 파일을 DB 등록 + 파이프 투입.""" + nas_root = Path(settings.nas_mount_path) + pkm_root = nas_root / "PKM" + web_root = nas_root / "Web" + + if not pkm_root.exists() and not web_root.exists(): return new_count = 0 @@ -111,6 +227,16 @@ async def watch_inbox(): targets.append((extra_path, "library")) async with async_session() as session: + # ─── Web/ 트랙 (devonagent) — DEVONthink Smart Rule 이 떨군 .html 만 진입 ─── + if web_root.exists(): + for file_path in web_root.rglob("*.html"): + if not file_path.is_file() or should_skip(file_path): + continue + rel_path = str(file_path.relative_to(nas_root)) + added, _ = await _ingest_web_file(session, file_path, rel_path) + new_count += added + + # ─── PKM 트랙 (기존 drive_sync) ───────────────────────────────────────── for sub, expected_category in targets: scan_root = pkm_root / sub if not scan_root.exists(): @@ -129,7 +255,7 @@ async def watch_inbox(): if category is None and next_stage is None: continue - rel_path = str(file_path.relative_to(Path(settings.nas_mount_path))) + rel_path = str(file_path.relative_to(nas_root)) fhash = file_hash(file_path) result = 
await session.execute( diff --git a/app/workers/queue_consumer.py b/app/workers/queue_consumer.py index 0aa130e..8212d04 100644 --- a/app/workers/queue_consumer.py +++ b/app/workers/queue_consumer.py @@ -103,13 +103,36 @@ async def enqueue_next_stage(document_id: int, current_stage: str): §3 추가: stt → [classify] (audio 는 extract 건너뛰고 stt 가 extracted_text 를 채움) thumbnail → [] (video 는 leaf — classify/embed 없음) + + Web/Blog ingest (devonagent 트랙) — plan db-snuggly-petal.md: + source_channel='devonagent' 인 doc 의 extract 완료 시 + classify/preview/markdown 전부 SKIP → [embed, chunk] 만 enqueue. + AI 가공 (ai_tldr/ai_bullets 등) 은 별 PR (Mac mini derived-worker). """ + # source_channel-aware override (extract stage 만). source_channel 누락 시 _default. + extract_override_by_channel = { + "devonagent": ["embed", "chunk"], + } + next_stages = { "extract": ["classify", "preview"], "classify": ["embed", "chunk", "markdown"], "stt": ["classify"], } - stages = next_stages.get(current_stage, []) + + # extract 의 경우만 doc.source_channel 을 lookup 해서 override 적용 + if current_stage == "extract": + from models.document import Document + async with async_session() as lookup_session: + doc = await lookup_session.get(Document, document_id) + sc = doc.source_channel if doc else None + if sc in extract_override_by_channel: + stages = extract_override_by_channel[sc] + else: + stages = next_stages.get(current_stage, []) + else: + stages = next_stages.get(current_stage, []) + if not stages: return diff --git a/docs/devonthink-web-bridge.md b/docs/devonthink-web-bridge.md new file mode 100644 index 0000000..7949cae --- /dev/null +++ b/docs/devonthink-web-bridge.md @@ -0,0 +1,279 @@ +# DEVONthink → Document Server Web Bridge (devonagent 트랙) + +DEVONagent / DEVONthink 가 발견·저장한 웹페이지를 Document Server 의 검색 가능한 재료로 +보내기 위한 수동 설치 가이드. Plan: `~/.claude/plans/db-snuggly-petal.md`. 
+ +## 흐름 + +``` +DEVONagent (smart agent — 사용자 운영) + ↓ +DEVONthink Inbox / tagged group (web/ingest) + ↓ Smart Rule (AppleScript) +NAS /volume4/Document_Server/Web/{domain}/{YYYY-MM-DD}/{slug}.{html,md,json} + ↓ NFS → GPU file_watcher (5분 간격) +documents row (source_channel='devonagent') + extract → embed → chunk + ↓ +/api/search + bge-reranker-v2-m3 검색 가능 상태 +``` + +## 정책 (Phase 1) + +- **첫 ingest 만 유지 (first-wins)**: 같은 `canonical_url` 은 한 번만 documents row 생성. + DEVONthink 에서 같은 글을 다시 저장해도 **내용이 갱신되지 않는다**. UTM 파라미터 변형 + (`?utm_source=foo`) 과 fragment (`#section`) 도 정규화되어 한 row 로 수렴. + 업데이트 버전 관리는 추후 별 PR (`PR-Web-Update-Policy`) 에서 다룬다. +- **AI 가공 미적용**: 이 단계는 "검색 가능한 재료" 까지만. ai_tldr / ai_bullets / 카테고리 + 자동 태깅은 별 PR (Mac mini derived-worker) 에서 결정. +- **Sidecar (.json) 누락 시**: skip 안 하고 ingest. `extract_meta.web_meta.sidecar_missing=true` + 로 표시. URL 정보가 없어 검색 evidence 가치는 줄지만 침묵 누락보다 낫다. + +## NAS 경로 규칙 + +``` +/volume4/Document_Server/Web/ + ├── example.com/ + │ ├── 2026-05-15/ + │ │ ├── sample-post.html # 본문 HTML + │ │ ├── sample-post.md # DEVONthink rendered markdown (fallback 용) + │ │ └── sample-post.json # 메타 sidecar + │ └── 2026-05-14/ + │ └── another-post.html + └── ... +``` + +- 도메인: `urlparse(url).hostname` 의 lowercase +- 날짜: `creation date` 의 `YYYY-MM-DD` (KST 또는 UTC, 일관 유지) +- slug: 파일명 안전한 형태로 변환 (영숫자/하이픈/언더스코어만) + +## Sidecar JSON 스키마 + +```json +{ + "title": "Sample Blog Post Title", + "url": "https://example.com/sample-post?utm_source=newsletter#main", + "author": "Author Name", + "pub_date": "2026-05-15T09:00:00Z", + "devonthink_uuid": "DEADBEEF-1234-5678-90AB-CDEF12345678", + "source_agent": "web-ingest" +} +``` + +- `url` **필수** (없거나 비어 있으면 `title` 이 있어도 sidecar_missing 처리). `title` 은 권장 — 없으면 파일명 stem 으로 대체 +- `pub_date` 는 ISO 8601 UTC 권장 (한국 시간이면 명시적 +09:00) +- `source_agent` 는 어떤 smart agent 가 수집했는지 (분석용 메타, 옵션) + +## DEVONthink Smart Rule 설치 + +### 1. Smart Rule 생성 + +DEVONthink 3 메뉴 → `Tools` → `Smart Rules` → `+` (새 규칙). 
+ +- **Name**: `Web → NAS for GPU ingest` +- **Trigger**: + - `On Adding Item to` (Inbox) — Inbox 자동 처리 + - 또는 `On Tagging Item` — `web/ingest` 태그 붙으면 발동 (수동 큐레이션 선호 시) +- **Conditions** (옵션): + - `Kind` is `WebArchive` or `HTML` or `Markdown` + - `URL` is not empty + +### 2. Action: `Execute Script` + +다음 AppleScript 본문을 `Action Scripts` 영역에 붙여넣는다. NAS 경로 +`/Volumes/Document_Server` 는 macOS 가 마운트한 SMB/AFP volume 이라고 가정한다. +(다른 mount point 면 `kBaseDir` 만 수정.) + +```applescript +-- DEVONthink Smart Rule: Web → NAS for GPU ingest +-- Plan: ~/.claude/plans/db-snuggly-petal.md + +property kBaseDir : "/Volumes/Document_Server/Web" + +on slugify(theText) + set theResult to "" + repeat with c in theText + set ch to c as string + set asciiVal to (id of ch) + if (asciiVal ≥ 48 and asciiVal ≤ 57) or ¬ + (asciiVal ≥ 65 and asciiVal ≤ 90) or ¬ + (asciiVal ≥ 97 and asciiVal ≤ 122) or ¬ + ch is "-" or ch is "_" then + set theResult to theResult & ch + else if ch is " " or ch is "." or ch is "/" then + set theResult to theResult & "-" + end if + end repeat + if theResult is "" then set theResult to "untitled" + if (length of theResult) > 80 then ¬ + set theResult to text 1 thru 80 of theResult + return theResult +end slugify + +on hostnameFromURL(theURL) + try + set delim to "://" + set AppleScript's text item delimiters to delim + set tail to text item 2 of theURL + set AppleScript's text item delimiters to "/" + set host to text item 1 of tail + set AppleScript's text item delimiters to "" + -- strip port + 소문자 + set AppleScript's text item delimiters to ":" + set host to text item 1 of host + set AppleScript's text item delimiters to "" + return do shell script "echo " & quoted form of host & " | tr 'A-Z' 'a-z'" + on error + return "unknown" + end try +end hostnameFromURL + +on isoDate(theDate) + set y to year of theDate as string + set m to month of theDate as integer + set d to day of theDate as integer + if m < 10 then set m to "0" & m + if d < 10 then set d to "0" & d + 
return y & "-" & m & "-" & d +end isoDate + +on performSmartRule(theRecords) + tell application id "DNtp" + repeat with theRecord in theRecords + try + set theURL to URL of theRecord + if theURL is missing value or theURL is "" then + log message "Web→NAS: URL 없음, skip — " & (name of theRecord) + -- continue + else + set theName to name of theRecord + set theUUID to uuid of theRecord + set theAuthor to "" + try + set theAuthor to (meta data of theRecord)'s |author| + end try + set theDate to (creation date of theRecord) + set dateStr to my isoDate(theDate) + set host to my hostnameFromURL(theURL) + set slug to my slugify(theName) + + set targetDir to kBaseDir & "/" & host & "/" & dateStr + do shell script "mkdir -p " & quoted form of targetDir + + set htmlPath to targetDir & "/" & slug & ".html" + set mdPath to targetDir & "/" & slug & ".md" + set jsonPath to targetDir & "/" & slug & ".json" + + -- 1) HTML export + try + export record theRecord to htmlPath as HTML + on error errMsg + log message "Web→NAS HTML export 실패 (" & theName & "): " & errMsg + end try + + -- 2) Markdown export (DEVONthink rendered, trafilatura fallback) + try + export record theRecord to mdPath as markdown + end try + + -- 3) JSON sidecar + set pubISO to do shell script ¬ + "date -u +%Y-%m-%dT%H:%M:%SZ -r " & ¬ + (do shell script "stat -f %m " & quoted form of htmlPath) + set jsonText to "{" & ¬ + "\"title\":" & my jsonEsc(theName) & "," & ¬ + "\"url\":" & my jsonEsc(theURL) & "," & ¬ + "\"author\":" & my jsonEsc(theAuthor) & "," & ¬ + "\"pub_date\":\"" & pubISO & "\"," & ¬ + "\"devonthink_uuid\":\"" & theUUID & "\"," & ¬ + "\"source_agent\":\"smart-rule:web-ingest\"" & ¬ + "}" + do shell script "cat > " & quoted form of jsonPath & ¬ + " <<'EOF'" & linefeed & jsonText & linefeed & "EOF" + + log message "Web→NAS: " & theName & " → " & host & "/" & dateStr + end if + on error errMsg + log message "Web→NAS 처리 실패: " & errMsg + end try + end repeat + end tell +end performSmartRule + +on 
jsonEsc(theText) + if theText is missing value then return "\"\"" + set s to theText as string + -- 최소 escape: backslash 와 따옴표 + set AppleScript's text item delimiters to "\\" + set parts to text items of s + set AppleScript's text item delimiters to "\\\\" + set s to parts as string + set AppleScript's text item delimiters to "\"" + set parts to text items of s + set AppleScript's text item delimiters to "\\\"" + set s to parts as string + set AppleScript's text item delimiters to "" + return "\"" & s & "\"" +end jsonEsc +``` + +**참고**: 위 스크립트는 시작점이다. 실제 사용 시 다음을 점검하라. + +- `kBaseDir` 경로가 실제 NAS mount 와 일치하는지 +- `creation date` 가 글의 실제 발행일이 아닐 수 있음 (DEVONthink 가 저장한 시점) — + 필요하면 `meta data → date` 사용 +- JSON escape 가 한국어/특수문자에서 깨지는지 → `do shell script "python3 -c ..."` 로 + 대체하는 게 안전 + +### 3. 동작 확인 + +1. DEVONthink 에서 웹페이지를 Inbox 에 저장 (단축키 `^⌥⌘)` 또는 Clip to DEVONthink) +2. Smart Rule 이 자동 발동 (혹은 우클릭 → `Apply Rule`) +3. `/Volumes/Document_Server/Web/{host}/{date}/{slug}.{html,md,json}` 3종 생성 확인 +4. 최대 5분 내 GPU file_watcher 가 ingest. 
SQL 확인: + ```sql + SELECT id, title, edit_url, md_extraction_engine, md_status + FROM documents WHERE source_channel='devonagent' + ORDER BY created_at DESC LIMIT 5; + ``` + +## file_watcher 동작 요약 + +- `nas_mount_path / "Web"` 하위를 5분 간격 rglob 으로 `.html` 만 수집 +- 각 `.html` 마다 sibling `.json` 읽어 canonical URL 산출 +- `file_hash = sha256(canonical_url)` → URL identity dedup +- documents row 생성 + `processing_queue.stage='extract'` 등록 +- extract_worker 의 4-tier fallback 으로 md_content 채움 +- `source_channel='devonagent'` 인 doc 은 `classify`/`preview`/`markdown` SKIP → + `embed` + `chunk` 만 enqueue + +## 검증 (운영 후) + +```sql +-- 도메인 분포 (어느 사이트가 많이 들어오는지) +SELECT split_part(edit_url, '/', 3) host, count(*) cnt +FROM documents WHERE source_channel='devonagent' AND edit_url IS NOT NULL +GROUP BY host ORDER BY cnt DESC; + +-- 추출 엔진 분포 (bs4_text 비율 모니터링) +SELECT md_extraction_engine, count(*) cnt, + ROUND(100.0 * count(*) / sum(count(*)) OVER (), 1) pct +FROM documents WHERE source_channel='devonagent' +GROUP BY md_extraction_engine ORDER BY cnt DESC; + +-- Sidecar 누락 분 (조용한 누락 가시화) +SELECT id, title, file_path +FROM documents +WHERE source_channel='devonagent' + AND extract_meta->'web_meta'->>'sidecar_missing' = 'true'; +``` + +## 알려진 한계 (Phase 1) + +- **JS-rendered 페이지**: SPA / React / Vue 로 본문이 client-side 렌더되는 사이트는 + HTML 안에 본문 텍스트가 없어 trafilatura 가 빈 결과를 낸다. DEVONthink WebArchive + export 가 렌더 결과를 잡아주면 OK, 아니면 bs4_text fallback 도 빈약하다. + Playwright 컨테이너는 별 PR. +- **로그인/페이월 콘텐츠**: DEVONthink 가 로그인 세션으로 capture 한 경우만 본문 보유. +- **canonical_url 정책**: 같은 글의 reprint (Medium → 본인 블로그) 는 다른 row 로 ingest 됨. + URL identity 만 dedup 기준이다. +- **첫 ingest 만 유지**: 글이 후속 편집되어도 갱신 안 됨. 별 PR 에서 정책 결정.