Compare commits
25 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a410f5b65c | |||
| 7031439364 | |||
| 468804494d | |||
| 01db4816fd | |||
| e7c7a2091f | |||
| 88e5893041 | |||
| 9fb3de6e0a | |||
| cd06ef0403 | |||
| d3aa640f65 | |||
| e10ccc9169 | |||
| 321d997123 | |||
| b75307b89b | |||
| f3530e382d | |||
| 8583465c58 | |||
| f4e5db9723 | |||
| 69db9bcb94 | |||
| 61e5a416d0 | |||
| cdf4ee0ef6 | |||
| 251a5392ef | |||
| 1842f27d89 | |||
| 53a30449e2 | |||
| ab668d7990 | |||
| dcf99b377e | |||
| 3df0ca53ab | |||
| 7cd8cfde0a |
@@ -9,7 +9,23 @@
|
||||
}
|
||||
|
||||
http://document.hyungi.net {
|
||||
encode gzip
|
||||
# 명시 Content-Type match — 기본 match 의 text/* 는 text/event-stream 까지 포함해
|
||||
# SSE(/api/eid/chat)의 첫 ~512B 를 gzip 버퍼링함. SSE 제외, 기존 압축 대상은 보존.
|
||||
# (응답 매처는 header <필드> <값> 한 쌍씩 — 여러 줄 = OR. 한 줄 다중 값은 파싱 에러)
|
||||
encode {
|
||||
gzip
|
||||
match {
|
||||
header Content-Type text/html*
|
||||
header Content-Type text/css*
|
||||
header Content-Type text/plain*
|
||||
header Content-Type text/xml*
|
||||
header Content-Type text/javascript*
|
||||
header Content-Type application/json*
|
||||
header Content-Type application/javascript*
|
||||
header Content-Type application/xml*
|
||||
header Content-Type image/svg+xml*
|
||||
}
|
||||
}
|
||||
|
||||
# API + 문서 → FastAPI
|
||||
handle /api/* {
|
||||
|
||||
@@ -134,6 +134,38 @@ def _fix_json_string_escapes(s: str) -> str:
|
||||
i += 1
|
||||
return "".join(out)
|
||||
|
||||
def is_deferrable_error(exc: Exception) -> bool:
    """Decide whether a failed deep (MacBook) call should be deferred (ds-macbook-offload-1).

    Deferral means "the MacBook is temporarily unavailable". Per the original
    design notes, the signals are:

    - HTTP 503: router upstream_cold / editor_busy / warming
      (no-silent-fallback contract).
    - HTTP 502 / 504: the router converts upstream connect failures and
      mid-generation disconnects into 502, which is how a MacBook sleep cut
      surfaces when going through the router.
    - Any ``httpx.TransportError`` (connect/read/protocol errors and their
      timeout variants): the router itself is unreachable or the link to it
      was cut.

    Everything else (400/500, parse or validation errors, ...) is not a
    deferral and stays on the caller's existing failure path.

    Args:
        exc: The exception raised by the deep call.

    Returns:
        True when the failure should be treated as a deferral.
    """
    if not isinstance(exc, httpx.HTTPStatusError):
        # No HTTP status available: only transport-level failures qualify.
        return isinstance(exc, httpx.TransportError)
    status = exc.response.status_code
    return status == 502 or status == 503 or status == 504
|
||||
|
||||
|
||||
async def call_deep_or_defer(client: "AIClient", prompt: str, system: str | None = None) -> str:
    """Run ``client.call_deep`` and turn deferrable failures into ``StageDeferred``.

    Shared by deep_summary_worker and summarize_worker (drain) according to
    the original notes; the queue consumer/drain is expected to handle
    ``StageDeferred`` without consuming attempts and with a ``deferred_until``
    backoff (sleep-safety invariant).

    Args:
        client: AI client exposing ``call_deep``.
        prompt: User prompt forwarded unchanged.
        system: Optional system prompt forwarded unchanged.

    Returns:
        The text returned by ``client.call_deep``.

    Raises:
        StageDeferred: When ``is_deferrable_error`` classifies the failure as
            "MacBook unavailable"; the original exception is chained.
        Exception: Any non-deferrable failure is re-raised untouched.
    """
    # Imported lazily, as in the original block.
    from models.queue import StageDeferred

    try:
        answer = await client.call_deep(prompt, system=system)
    except Exception as exc:
        if not is_deferrable_error(exc):
            raise
        raise StageDeferred(f"macbook_unavailable:{type(exc).__name__}") from exc
    return answer
|
||||
|
||||
|
||||
# 프롬프트 로딩
|
||||
PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
|
||||
|
||||
@@ -185,6 +217,18 @@ class AIClient:
|
||||
"""triage/primary 실패 시 최후 방어선. Claude Sonnet 4 API (config.yaml ai.models.fallback) — PR #20 이후 swap 완료."""
|
||||
return await self._request(self.ai.fallback, prompt)
|
||||
|
||||
async def call_deep(self, prompt: str, system: str | None = None) -> str:
|
||||
"""심층 전용 — 맥북 M5 Max Qwen3.6-27B (config.yaml ai.models.deep, ds-macbook-offload-1).
|
||||
|
||||
llm-router :8890 경유(model=qwen-macbook alias) — 라우터의 wake preflight(~24s)·
|
||||
editor_busy 가드를 재사용한다. 맥미니 mlx gate 와 무관(게이트는 맥미니 보호 목적)이라
|
||||
gate 없이 호출. 자동 cloud/맥미니 폴백 없음 — 실패는 그대로 전파하고 보류 판단은
|
||||
호출자가 is_deferrable_error() 로 한다. 슬롯 부재 시 primary 로 처리(방어적 —
|
||||
호출자가 보통 슬롯 유무를 먼저 분기).
|
||||
"""
|
||||
cfg = self.ai.deep or self.ai.primary
|
||||
return await self._request(cfg, prompt, system=system)
|
||||
|
||||
# ─── Legacy API (classify_worker 교체 시 제거 예정) ───────────────────
|
||||
|
||||
async def classify(self, text: str) -> dict:
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
"""이드 채팅 표면 — POST /api/eid/chat (eid-chat 트랙).
|
||||
|
||||
확정 결정:
|
||||
- D-1 경로 = /api/eid/chat (main.py prefix=/api/eid + 본 라우터 POST /chat)
|
||||
- D-2 mode 닫힌 어휘: daily(mac-mini-default) / deep(qwen-macbook). 클라는 mode 만 보냄 —
|
||||
claude-cloud / auto 금지 (Literal 로 422 차단). 심층(deep) 모드 무게이트.
|
||||
- D-3 독립 /chat 라우트 (frontend) — 본 모듈은 백엔드 API 만.
|
||||
- D-5 LLM 호출 = EidAIClient.call_stream 한 곳 (이드 egress 봉쇄 불변식 #5,
|
||||
RouterBackend 직접 호출 금지).
|
||||
- D-6 rules.md 부재 = 503 substrate_degraded fail-closed — 다른 표면의 degraded 배너
|
||||
컨벤션(compose._rules)과 달리 채팅은 진행 자체를 거부.
|
||||
|
||||
응답 = router SSE 라인 단위 중계 (text/event-stream — call_stream 이 model 필드를 mode
|
||||
어휘로 치환·usage 제거, 프레이밍 보존. 본 모듈은 무변형 relay). 스트림 시작 전
|
||||
backend 실패는 /api/search/ask 와 동일 shape 의 503 + error_reason 매핑(자동 fallback 0).
|
||||
로그는 메타 1줄(mode·턴수·status)만 — 대화 본문 로깅 0.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Annotated, Literal
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, Depends
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from pydantic import BaseModel, Field, field_validator, model_validator
|
||||
|
||||
from core.auth import get_current_user
|
||||
from core.utils import setup_logger
|
||||
from eid import compose as eid_compose
|
||||
from eid.ai import EidAIClient
|
||||
from models.user import User
|
||||
from services.llm.backends import BackendUnavailable
|
||||
|
||||
logger = setup_logger("eid_chat")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class ChatMessage(BaseModel):
    """A single chat turn.

    ``role="system"`` is outside the Literal and is therefore rejected (422);
    the composed system prompt is injected by the server only.
    """

    # Only client-visible roles are accepted.
    role: Literal["user", "assistant"]
    # Per-message size limit: 1..8000 characters.
    content: str = Field(min_length=1, max_length=8000)
|
||||
|
||||
|
||||
# 대화 총량 cap (전 메시지 content 합) — per-message 8000·40턴 제한과 별도의 총량 상한
|
||||
_TOTAL_CONTENT_CAP = 32000
|
||||
|
||||
|
||||
class ChatRequest(BaseModel):
    """Body of POST /api/eid/chat.

    ``mode`` is a closed vocabulary (D-2); ``messages`` holds 1 to 40 turns
    and is additionally capped by ``_TOTAL_CONTENT_CAP`` characters in total.
    """

    mode: Literal["daily", "deep"]
    messages: list[ChatMessage] = Field(min_length=1, max_length=40)

    @field_validator("messages")
    @classmethod
    def _last_turn_is_user(cls, v: list[ChatMessage]) -> list[ChatMessage]:
        """Require the final turn to come from the user."""
        if not v:
            return v
        if v[-1].role == "user":
            return v
        raise ValueError("마지막 메시지는 role=user 여야 합니다")

    @model_validator(mode="after")
    def _total_content_cap(self) -> "ChatRequest":
        """Reject conversations whose summed content length exceeds the cap."""
        total_chars = 0
        for message in self.messages:
            total_chars += len(message.content)
        if total_chars <= _TOTAL_CONTENT_CAP:
            return self
        raise ValueError(
            "대화 총량 초과 — 새 대화로 시작하거나 입력을 줄여주세요 "
            f"(전체 메시지 합 {_TOTAL_CONTENT_CAP}자 제한)"
        )
|
||||
|
||||
|
||||
@router.post("/chat")
async def eid_chat(
    body: ChatRequest,
    user: Annotated[User, Depends(get_current_user)],
):
    """Eid chat endpoint: relays the router's SSE stream to the client.

    Two 503 paths exist, and neither falls back automatically:

    - ``substrate_degraded``: rules.md is missing (D-6 fail-closed; the chat
      is refused).
    - ``backend_unavailable``: the backend failed before the stream started
      (same response shape as the ask convention).

    Args:
        body: Validated chat request (mode + messages).
        user: Authenticated user; only used to enforce authentication here.

    Returns:
        A ``JSONResponse`` with status 503 on either failure path, otherwise
        a ``StreamingResponse`` with media type ``text/event-stream``.
    """
    # D-6: missing rules = fail-closed. Chat never proceeds without the
    # safety/policy guard (no degraded banner here).
    if not eid_compose.rules_present():
        logger.error(
            "eid_chat substrate_degraded mode=%s turns=%d status=503 — rules.md 부재, 채팅 거부",
            body.mode, len(body.messages),
        )
        return JSONResponse(
            status_code=503,
            content={
                "detail": (
                    "이드 substrate 가 degraded 상태입니다 (운영 규칙 rules.md 부재). "
                    "복구 전까지 채팅을 진행하지 않습니다."
                ),
                "error_reason": "substrate_degraded",
            },
        )

    system = eid_compose.compose("eid_chat", task="")
    client = EidAIClient()
    stream = client.call_stream(
        body.mode, [m.model_dump() for m in body.messages], system,
    )

    # An async generator only sends the real request on its first __anext__.
    # Pull the first chunk here so that failures before the stream starts
    # (connect / 4xx / 5xx) can still be mapped to a 503 response.
    try:
        first = await anext(stream, None)
    except BackendUnavailable as exc:
        logger.warning(
            "eid_chat backend_unavailable mode=%s turns=%d status=503 reason=%s",
            body.mode, len(body.messages), exc.reason,
        )
        await client.close()
        return JSONResponse(
            status_code=503,
            content={
                "error": "backend_unavailable",
                "error_reason": exc.reason,
                "backend_requested": exc.backend_name,
                "detail": (
                    "선택한 모드의 backend 가 일시적으로 응답할 수 없습니다. "
                    "잠시 후 다시 시도하거나 mode 를 바꿔 호출하세요."
                ),
            },
        )
    except BaseException:
        # Any other failure (including cancellation): release the client,
        # then let the exception propagate unchanged.
        await client.close()
        raise

    # One metadata log line only; conversation content is never logged.
    logger.info(
        "eid_chat stream mode=%s turns=%d status=200", body.mode, len(body.messages)
    )

    async def _passthrough():
        # Relay call_stream output unmodified (sanitising happens per line
        # inside call_stream). On cancel/disconnect the finally block closes
        # the generator so the upstream resources get released.
        try:
            try:
                if first is not None:
                    yield first
                async for chunk in stream:
                    yield chunk
            except (BackendUnavailable, httpx.HTTPError) as exc:
                # Cut after the stream started: status 200 was already sent
                # and cannot be remapped. Log one metadata line and end
                # quietly (no traceback); the frontend detects the missing
                # [DONE] marker.
                logger.warning(
                    "eid_chat stream aborted mode=%s turns=%d reason=%s",
                    body.mode, len(body.messages),
                    getattr(exc, "reason", type(exc).__name__),
                )
                return
        finally:
            # Nested finally: client.close() still runs even if
            # stream.aclose() raises.
            try:
                await stream.aclose()
            finally:
                await client.close()

    return StreamingResponse(
        _passthrough(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-store", "X-Accel-Buffering": "no"},
    )
|
||||
@@ -0,0 +1,90 @@
|
||||
"""처리 머신 보드 API — GET /api/queue/overview (plan ds-processing-ui-6an).
|
||||
|
||||
홈 stage 평면 테이블을 "머신 관점 보드(누가 일하나)"로 — 집계 로직은
|
||||
services/queue_overview.py (순수 판정부 분리). 응답 스키마는 FE 와 계약 고정.
|
||||
응답에 raw 모델명 노출 금지 — 머신 label 만.
|
||||
"""
|
||||
|
||||
from typing import Annotated, Literal
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from core.auth import get_current_user
|
||||
from core.database import get_session
|
||||
from models.user import User
|
||||
from services.queue_overview import build_overview
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class CurrentItem(BaseModel):
    """A document a machine is processing right now (at most 2 per machine, per the original note)."""

    document_id: int
    title: str
    stage: str
|
||||
|
||||
|
||||
class MachineCard(BaseModel):
    """Machine card: counts summed over the stages attributed to the machine,
    completion figures (summarize is pooled separately) and the machine state.
    """

    # Closed set of machine identifiers.
    key: Literal["gpu", "macmini", "macbook"]
    # Display label for the machine.
    label: str
    state: Literal["active", "deferred", "idle"]
    stages: list[str]
    pending: int
    processing: int
    failed: int
    done_1h: int
    done_today: int
    deferred_pending: int
    current: list[CurrentItem]
|
||||
|
||||
|
||||
class SummarizeEta(BaseModel):
    """ETA for the summarize pool; ``eta_minutes`` is only produced when done > inflow."""

    pending: int
    done_rate_1h: int
    inflow_rate_1h: int
    # None when no ETA is available.
    eta_minutes: int | None
|
||||
|
||||
|
||||
class TrendBucket(BaseModel):
    """One bucket of the summarize 24h trend; ``hour`` is a KST "HH:00" label."""

    hour: str
    inflow: int
    done: int
|
||||
|
||||
|
||||
class Totals(BaseModel):
    """Totals across all stages."""

    pending: int
    processing: int
    failed: int
|
||||
|
||||
|
||||
class StageRow(BaseModel):
    """Per-stage status row for the "stage detail" panel (makes completions visible)."""

    stage: str
    pending: int
    processing: int
    failed: int
    done_today: int
    # Age in seconds of the oldest pending item; None when not available.
    oldest_pending_age_sec: int | None
|
||||
|
||||
|
||||
class QueueOverviewResponse(BaseModel):
    """Response body of GET /overview: machine cards, stage rows, summarize ETA, 24h trend and totals."""

    machines: list[MachineCard]
    stages: list[StageRow]
    summarize_eta: SummarizeEta
    trend_24h: list[TrendBucket]
    totals: Totals
|
||||
|
||||
|
||||
@router.get("/overview", response_model=QueueOverviewResponse)
async def get_queue_overview(
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return the machine-centric processing board plus the summarize ETA.

    The aggregate is computed live by ``build_overview`` and validated
    against ``QueueOverviewResponse`` (no new tables, per the original note).

    Args:
        user: Authenticated user; only used to enforce authentication here.
        session: Database session handed to ``build_overview``.
    """
    overview = await build_overview(session)
    return QueueOverviewResponse.model_validate(overview)
|
||||
@@ -98,6 +98,10 @@ class AIConfig(BaseModel):
|
||||
classifier: AIModelConfig | None = None
|
||||
# Phase 3.5b: semantic verifier (optional — 없으면 grounding-only). PR #20 이후 Mac mini 26B MLX endpoint (initial = exaone3.5).
|
||||
verifier: AIModelConfig | None = None
|
||||
# ds-macbook-offload-1: 심층 전용 슬롯 (optional). 맥북 M5 Max Qwen3.6-27B — llm-router :8890
|
||||
# 경유(model=qwen-macbook alias, wake preflight 재사용). 부재 시 deep_summary 는 기존
|
||||
# primary(맥미니 26B) 경로 그대로 = 기능 미활성. 명시 opt-in — silent fallback 없음.
|
||||
deep: AIModelConfig | None = None
|
||||
# Legacy: vision 슬롯 (현재 사용처 0 — Document Server 는 OCR/STT 별도 서비스).
|
||||
# 제거 진행 중이므로 optional 로 관대한 로딩 유지.
|
||||
vision: AIModelConfig | None = None
|
||||
@@ -218,6 +222,7 @@ def load_settings() -> Settings:
|
||||
verifier=(
|
||||
AIModelConfig(**models["verifier"]) if "verifier" in models else None
|
||||
),
|
||||
deep=(AIModelConfig(**models["deep"]) if "deep" in models else None),
|
||||
deep_summary_backlog=DeepSummaryBacklogConfig(
|
||||
**ai_raw.get("deep_summary_backlog", {})
|
||||
),
|
||||
|
||||
@@ -0,0 +1,346 @@
|
||||
"""크롤링 politeness 코어 (A-4, plan crawl-24x7-1)
|
||||
|
||||
개인 아카이빙 권장치를 그대로 박은 공용 fetch 계층:
|
||||
- per-domain 동시성 1 (asyncio.Lock) + 같은 도메인 연속 요청 5–15초 지연 + jitter
|
||||
- robots.txt 존중 (urllib.robotparser, 24h 캐시) — 비로그인 공개 크롤링 한정.
|
||||
로그인 세션 fetch (B-3) 는 사용자 행위 성격이라 robots 대신 사람 속도가 기준.
|
||||
- 정직 식별 UA + 연락처 (익명 크롤링 트랙. 로그인 세션은 브라우저 UA 유지 — B-3)
|
||||
- 429 = Retry-After 존중 / 5xx = 재시도 가능 / 403 = 차단 신호 (호출측 circuit 연동)
|
||||
|
||||
도메인별 마지막 요청 시각 등 rate 상태는 in-process (영속 워터마크는 DB — news_sources).
|
||||
SSRF 차단은 core.url_validator.validate_feed_url 재사용 (redirect target 재검증 포함).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import random
|
||||
import time
|
||||
import urllib.robotparser
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from core.url_validator import validate_feed_url
|
||||
from core.utils import setup_logger
|
||||
|
||||
# bare getLogger 는 root(WARNING) 상속이라 INFO 대기/차단 로그가 드랍됨 — 타 워커와 동일 설정
|
||||
logger = setup_logger("crawl_politeness")
|
||||
|
||||
# 정직 식별 UA + 연락처 — 차단 전 연락 통로 (A-4)
|
||||
CRAWL_UA = "HyungiPKM-Archiver/1.0 (personal archive; +mailto:hyun49196@gmail.com)"
|
||||
|
||||
# 같은 도메인 연속 요청 간격 (초) — 권장치 5–15s + jitter
|
||||
_DOMAIN_DELAY_MIN = 5.0
|
||||
_DOMAIN_DELAY_MAX = 15.0
|
||||
|
||||
# 구독 세션(브라우저) fetch 간격 — 사람 속도 (B-3 ④: 기사 간 수십 초)
|
||||
_AUTH_DELAY_MIN = 30.0
|
||||
_AUTH_DELAY_MAX = 60.0
|
||||
|
||||
# B-3 Playwright 격리 컨테이너 (internal-only, compose DNS)
|
||||
_FETCHER_URL = "http://playwright-fetcher:3400"
|
||||
_FETCHER_TIMEOUT = 120.0 # 브라우저 기동 + 네비게이션 + settle 포함
|
||||
|
||||
# 안티봇 챌린지 페이지 식별 마커 (DataDome/Cloudflare 등) — 좁게 유지(오탐 회피).
|
||||
# 실측: 르몽드 기사 = DataDome "Client Challenge" + "Entrez les caractères" CAPTCHA.
|
||||
_CHALLENGE_MARKERS = (
|
||||
"Client Challenge",
|
||||
"Entrez les caractères affichés",
|
||||
"Checking your browser before",
|
||||
"captcha-delivery.com",
|
||||
"geo.captcha-delivery",
|
||||
# CF JS 챌린지 인터스티셜의 스크립트 도메인 (aiche.org 실측 2026-06-11) —
|
||||
# fetcher 의 챌린지 대기를 끝까지 통과 못 한 최종 HTML 만 여기 걸린다.
|
||||
"challenges.cloudflare.com",
|
||||
)
|
||||
|
||||
_ROBOTS_CACHE_TTL = 24 * 3600 # 24h
|
||||
_MAX_PAGE_BYTES = 5 * 1024 * 1024 # 피드 fetch 와 동일 5MB cap
|
||||
_PAGE_TIMEOUT = 20.0
|
||||
_MAX_REDIRECTS = 3
|
||||
|
||||
_HTML_CONTENT_TYPES = ("text/html", "application/xhtml+xml")
|
||||
|
||||
|
||||
class CrawlFetchError(Exception):
    """Transient failure (5xx / timeout / network): a candidate for queue retry."""
|
||||
|
||||
|
||||
class CrawlBlocked(Exception):
    """Blocking signal (403 / 429 / robots disallow): back off or trip the circuit rather than retry."""
|
||||
|
||||
|
||||
class CrawlSkip(Exception):
    """Permanently out of scope (non-HTML / too large / SSRF-blocked / 4xx): handled by degrading."""
|
||||
|
||||
|
||||
# 도메인별 직렬화 상태 (in-process)
|
||||
_domain_locks: dict[str, asyncio.Lock] = {}
|
||||
_domain_last_request: dict[str, float] = {}
|
||||
# host → (cached_at, RobotFileParser | None). None = robots 없음/4xx (전부 허용)
|
||||
_robots_cache: dict[str, tuple[float, urllib.robotparser.RobotFileParser | None]] = {}
|
||||
|
||||
|
||||
def _domain_of(url: str) -> str:
    """Return the lower-cased hostname of *url*, or "" when it has none."""
    host = urlparse(url).hostname
    if not host:
        return ""
    return host.lower()
|
||||
|
||||
|
||||
def _get_lock(domain: str) -> asyncio.Lock:
    """Return the lock for *domain*, creating and registering it on first use."""
    try:
        return _domain_locks[domain]
    except KeyError:
        lock = _domain_locks[domain] = asyncio.Lock()
        return lock
|
||||
|
||||
|
||||
async def _respect_domain_rate(
    domain: str,
    delay_min: float = _DOMAIN_DELAY_MIN,
    delay_max: float = _DOMAIN_DELAY_MAX,
) -> None:
    """Sleep until a jittered delay has elapsed since the domain's last request.

    Args:
        domain: Key into the in-process last-request table.
        delay_min: Lower bound of the random delay, in seconds.
        delay_max: Upper bound of the random delay, in seconds.
    """
    previous = _domain_last_request.get(domain)
    if previous is None:
        # First request to this domain in this process: nothing to wait for.
        return
    earliest_next = previous + random.uniform(delay_min, delay_max)
    remaining = earliest_next - time.monotonic()
    if not remaining > 0:
        return
    # Never sleep silently: the wait is logged so politeness can be observed.
    logger.info("[politeness] %s %.1fs 대기", domain, remaining)
    await asyncio.sleep(remaining)
|
||||
|
||||
|
||||
async def _fetch_robots(client: httpx.AsyncClient, scheme: str, host: str):
    """Fetch and parse ``{scheme}://{host}/robots.txt``.

    Args:
        client: HTTP client used for the request.
        scheme: URL scheme to use.
        host: Hostname whose robots.txt is fetched.

    Returns:
        A parsed ``RobotFileParser``, or ``None`` when the server answers
        4xx (no robots.txt = everything allowed).

    Raises:
        CrawlFetchError: On any httpx error or a 5xx answer; treated
            conservatively as "blocked for this cycle".
    """
    # NOTE(review): a 3xx answer falls through to parsing the redirect body;
    # confirm whether robots.txt redirects should be followed instead.
    robots_url = f"{scheme}://{host}/robots.txt"
    try:
        resp = await client.get(robots_url, headers={"User-Agent": CRAWL_UA})
    except httpx.HTTPError as e:
        raise CrawlFetchError(f"robots.txt 조회 실패: {host}: {e}") from e

    status = resp.status_code
    if status >= 500:
        # 5xx says nothing about intent, so this cycle is treated as blocked.
        raise CrawlFetchError(f"robots.txt 5xx: {host}: {resp.status_code}")
    if status >= 400:
        return None

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(resp.text.splitlines())
    return parser
|
||||
|
||||
|
||||
async def _robots_allows(client: httpx.AsyncClient, url: str) -> bool:
    """Tell whether robots.txt permits fetching *url* with ``CRAWL_UA``.

    The parsed robots.txt is cached per host for ``_ROBOTS_CACHE_TTL``
    seconds; a cached ``None`` means "no robots.txt, everything allowed".
    Errors from ``_fetch_robots`` propagate to the caller.
    """
    parts = urlparse(url)
    host = (parts.hostname or "").lower()

    entry = _robots_cache.get(host)
    stale = entry is None or time.monotonic() - entry[0] > _ROBOTS_CACHE_TTL
    if stale:
        parser = await _fetch_robots(client, parts.scheme or "https", host)
        # Timestamp is taken after the fetch completes.
        entry = (time.monotonic(), parser)
        _robots_cache[host] = entry

    parser = entry[1]
    return True if parser is None else parser.can_fetch(CRAWL_UA, url)
|
||||
|
||||
|
||||
async def fetch_page(
    url: str, *, check_robots: bool = True,
    content_types: tuple[str, ...] = _HTML_CONTENT_TYPES,
) -> tuple[str, str]:
    """Politely fetch one public page and return ``(html_text, final_url)``.

    - SSRF validation on the URL and on every redirect target.
    - Per-domain concurrency of 1 (domain lock) plus a jittered delay.
    - 429: logs Retry-After, then ``CrawlBlocked``; 403: ``CrawlBlocked``;
      other 4xx: ``CrawlSkip``.
    - 5xx / timeout / network error: ``CrawlFetchError`` (queue retry).
    - Disallowed content type or body above ``_MAX_PAGE_BYTES``: ``CrawlSkip``.

    Args:
        url: Page to fetch.
        check_robots: When true, robots.txt is consulted first.
        content_types: Substrings accepted in the response Content-Type.

    Raises:
        CrawlSkip, CrawlBlocked, CrawlFetchError: See the list above.
    """
    try:
        validate_feed_url(url)
    except ValueError as e:
        raise CrawlSkip(f"URL 검증 실패: {e}") from e

    domain = _domain_of(url)
    async with _get_lock(domain):
        await _respect_domain_rate(domain)
        try:
            async with httpx.AsyncClient(
                timeout=_PAGE_TIMEOUT, follow_redirects=False,
                headers={"User-Agent": CRAWL_UA},
            ) as client:
                if check_robots and not await _robots_allows(client, url):
                    raise CrawlBlocked(f"robots.txt disallow: {url}")

                resp = await client.get(url)
                redirects = 0
                # has_redirect_location is true only for real redirects that
                # carry a Location header (is_redirect covers every 3xx and
                # would misread e.g. 304 as a redirect).
                while resp.has_redirect_location and redirects < _MAX_REDIRECTS:
                    location = urljoin(str(resp.request.url), resp.headers["location"])
                    try:
                        validate_feed_url(location)
                    except ValueError as e:
                        raise CrawlSkip(f"redirect target 차단: {e}") from e
                    # The redirect is followed immediately, without an extra
                    # delay, while the domain lock is still held.
                    resp = await client.get(location)
                    redirects += 1
                if resp.has_redirect_location:
                    raise CrawlSkip(f"redirect {_MAX_REDIRECTS}회 초과: {url}")
        except httpx.TimeoutException as e:
            raise CrawlFetchError(f"timeout: {url}") from e
        except httpx.HTTPError as e:
            raise CrawlFetchError(f"네트워크 오류: {url}: {e}") from e
        finally:
            # Record the request time on success and on failure alike.
            _domain_last_request[domain] = time.monotonic()

    if resp.status_code == 429:
        retry_after = resp.headers.get("retry-after", "")
        logger.warning("[politeness] 429 %s (Retry-After=%s)", domain, retry_after or "-")
        raise CrawlBlocked(f"429 rate limited: {url} (Retry-After={retry_after or '-'})")
    if resp.status_code == 403:
        raise CrawlBlocked(f"403 forbidden: {url}")
    if resp.status_code >= 500:
        raise CrawlFetchError(f"{resp.status_code}: {url}")
    if resp.status_code >= 400:
        raise CrawlSkip(f"{resp.status_code}: {url}")

    # A missing Content-Type header is accepted; a present one must match.
    ct = resp.headers.get("content-type", "").lower()
    if ct and not any(t in ct for t in content_types):
        raise CrawlSkip(f"비허용 content-type: {ct}: {url}")
    # The size check runs on the already-downloaded body.
    if len(resp.content) > _MAX_PAGE_BYTES:
        raise CrawlSkip(f"크기 초과: {len(resp.content)} bytes: {url}")

    return resp.text, str(resp.request.url)
|
||||
|
||||
|
||||
# ── B-3 구독 세션 fetch (Playwright 격리 컨테이너 경유) ──────────────────────
|
||||
|
||||
async def fetch_page_via_browser(url: str, profile: str | None) -> tuple[str, str]:
    """Fetch one page through the playwright-fetcher service at human pace.

    ``profile=None`` requests an anonymous browser context (used, per the
    original notes, for public sites whose WAF rejects plain httpx);
    a value selects a subscription session profile. robots.txt is not
    consulted on this path. The exception vocabulary matches ``fetch_page``
    so callers can reuse their branching.

    Args:
        url: Page to fetch.
        profile: Session profile name, or ``None`` for an anonymous context.

    Returns:
        ``(html_text, final_url)``; ``final_url`` falls back to *url* when
        the fetcher does not report one.

    Raises:
        CrawlSkip: URL failed validation, or the HTML exceeds ``_MAX_PAGE_BYTES``.
        CrawlBlocked: Fetcher answered 503 (session profile missing) or the
            HTML contains an anti-bot challenge marker.
        CrawlFetchError: Timeout, connection error, non-200 fetcher status,
            or a fetcher response that is not the expected JSON object.
    """
    try:
        validate_feed_url(url)
    except ValueError as e:
        raise CrawlSkip(f"URL 검증 실패: {e}") from e

    payload = {"url": url}
    if profile:
        payload["profile"] = profile

    domain = _domain_of(url)
    async with _get_lock(domain):
        await _respect_domain_rate(domain, _AUTH_DELAY_MIN, _AUTH_DELAY_MAX)
        try:
            async with httpx.AsyncClient(timeout=_FETCHER_TIMEOUT) as client:
                resp = await client.post(f"{_FETCHER_URL}/fetch", json=payload)
        except httpx.TimeoutException as e:
            raise CrawlFetchError(f"browser fetch timeout: {url}") from e
        except httpx.HTTPError as e:
            raise CrawlFetchError(f"playwright-fetcher 연결 오류: {e}") from e
        finally:
            _domain_last_request[domain] = time.monotonic()

    if resp.status_code == 503:
        # storage_state missing: wait for a manual session capture
        # (caller degrades; no retry loop).
        raise CrawlBlocked(f"세션 프로필 부재: {profile}")
    if resp.status_code != 200:
        raise CrawlFetchError(f"playwright-fetcher {resp.status_code}: {url}")

    # A malformed fetcher body must stay inside the Crawl* vocabulary instead
    # of leaking ValueError / AttributeError to callers.
    try:
        data = resp.json()
    except ValueError as e:
        raise CrawlFetchError(f"playwright-fetcher 응답 파싱 실패: {url}") from e
    if not isinstance(data, dict):
        raise CrawlFetchError(f"playwright-fetcher 응답 형식 오류: {url}")

    html_text = data.get("html") or ""
    if not isinstance(html_text, str):
        raise CrawlFetchError(f"playwright-fetcher 응답 형식 오류: {url}")
    if len(html_text.encode("utf-8", errors="replace")) > _MAX_PAGE_BYTES:
        raise CrawlSkip(f"크기 초과 (browser): {url}")

    # Detect anti-bot challenge pages so a short challenge HTML is never
    # promoted to article content. Retrying is pointless against headless
    # detection, hence CrawlBlocked (= degrade). Markers are kept narrow.
    if any(m in html_text for m in _CHALLENGE_MARKERS):
        raise CrawlBlocked(f"안티봇 챌린지 페이지(headless 차단): {url}")
    return html_text, data.get("final_url") or url
|
||||
|
||||
|
||||
_MAX_DOWNLOAD_BYTES = 60 * 1024 * 1024 # fetcher MAX_DOWNLOAD_BYTES 와 동률
|
||||
|
||||
|
||||
async def download_via_browser(
    url: str, *, referer: str | None = None, profile: str | None = None
) -> tuple[bytes, str]:
    """Download one binary (e.g. a PDF) through the fetcher's ``/download`` endpoint.

    Args:
        url: Resource to download.
        referer: Optional listing page the fetcher visits first (used, per
            the original notes, to obtain WAF challenge cookies).
        profile: Optional session profile name.

    Returns:
        ``(content, content_type)``.

    Raises:
        CrawlSkip: URL failed validation, inner status is another 4xx, or the
            content exceeds ``_MAX_DOWNLOAD_BYTES``.
        CrawlBlocked: Fetcher answered 503, or the inner status is 403 / 429.
        CrawlFetchError: Timeout, connection error, non-200 fetcher status,
            any other inner status, or a malformed fetcher response
            (bad JSON, non-numeric status, invalid base64).
    """
    try:
        validate_feed_url(url)
    except ValueError as e:
        raise CrawlSkip(f"URL 검증 실패: {e}") from e

    payload: dict = {"url": url}
    if referer:
        payload["referer"] = referer
    if profile:
        payload["profile"] = profile

    domain = _domain_of(url)
    async with _get_lock(domain):
        await _respect_domain_rate(domain, _AUTH_DELAY_MIN, _AUTH_DELAY_MAX)
        try:
            async with httpx.AsyncClient(timeout=_FETCHER_TIMEOUT) as client:
                resp = await client.post(f"{_FETCHER_URL}/download", json=payload)
        except httpx.TimeoutException as e:
            raise CrawlFetchError(f"browser download timeout: {url}") from e
        except httpx.HTTPError as e:
            raise CrawlFetchError(f"playwright-fetcher 연결 오류: {e}") from e
        finally:
            _domain_last_request[domain] = time.monotonic()

    if resp.status_code == 503:
        raise CrawlBlocked(f"세션 프로필 부재: {profile}")
    if resp.status_code != 200:
        raise CrawlFetchError(f"playwright-fetcher {resp.status_code}: {url}")

    # Keep malformed fetcher bodies inside the Crawl* vocabulary.
    try:
        data = resp.json()
    except ValueError as e:
        raise CrawlFetchError(f"playwright-fetcher 응답 파싱 실패: {url}") from e
    if not isinstance(data, dict):
        raise CrawlFetchError(f"playwright-fetcher 응답 형식 오류: {url}")
    try:
        inner = int(data.get("status", 0))
    except (TypeError, ValueError) as e:
        raise CrawlFetchError(f"playwright-fetcher 응답 형식 오류: {url}") from e

    if inner in (403, 429):
        raise CrawlBlocked(f"{inner} (browser download): {url}")
    if 400 <= inner < 500:
        raise CrawlSkip(f"{inner} (browser download): {url}")
    if inner != 200:
        raise CrawlFetchError(f"{inner} (browser download): {url}")

    try:
        # binascii.Error (invalid base64) is a ValueError subclass.
        content = base64.b64decode(data.get("body_b64") or "")
    except (TypeError, ValueError) as e:
        raise CrawlFetchError(f"playwright-fetcher 응답 형식 오류: {url}") from e
    if len(content) > _MAX_DOWNLOAD_BYTES:
        raise CrawlSkip(f"크기 초과 (browser download): {url}")
    return content, data.get("content_type") or ""
|
||||
|
||||
|
||||
async def probe_session(
    profile: str, probe_url: str, min_body_chars: int, paywall_markers: list[str]
) -> dict:
    """Content-based session probe (B-3) via the fetcher's ``/probe`` endpoint.

    Failures are reported as a value, never as an exception, so the caller
    can record them and branch to degrade. The probe is a real publisher
    fetch, so the same domain lock and human-pace delay apply.

    Args:
        profile: Session profile name to probe.
        probe_url: URL the fetcher loads for the probe.
        min_body_chars: Forwarded to the fetcher unchanged.
        paywall_markers: Forwarded to the fetcher unchanged.

    Returns:
        On success the fetcher's JSON object as-is (expected shape:
        ``{'ok': bool, 'reason': str | None, 'body_chars': int}``). On any
        failure ``{'ok': False, 'reason': <text>, 'body_chars': 0}``.
    """
    # NOTE(review): unlike the sibling fetch helpers, probe_url is not passed
    # through validate_feed_url here — confirm that is intentional.
    domain = _domain_of(probe_url)
    async with _get_lock(domain):
        await _respect_domain_rate(domain, _AUTH_DELAY_MIN, _AUTH_DELAY_MAX)
        try:
            async with httpx.AsyncClient(timeout=_FETCHER_TIMEOUT) as client:
                resp = await client.post(
                    f"{_FETCHER_URL}/probe",
                    json={
                        "profile": profile,
                        "probe_url": probe_url,
                        "min_body_chars": min_body_chars,
                        "paywall_markers": paywall_markers,
                    },
                )
        except httpx.HTTPError as e:
            return {"ok": False, "reason": f"fetcher 연결 오류: {e}", "body_chars": 0}
        finally:
            _domain_last_request[domain] = time.monotonic()

    if resp.status_code == 503:
        return {"ok": False, "reason": f"세션 프로필 부재: {profile}", "body_chars": 0}
    if resp.status_code != 200:
        return {"ok": False, "reason": f"fetcher {resp.status_code}", "body_chars": 0}

    # Honour the "failure as a value" contract for malformed bodies too.
    try:
        result = resp.json()
    except ValueError:
        return {"ok": False, "reason": "fetcher 응답 파싱 실패", "body_chars": 0}
    if not isinstance(result, dict):
        return {"ok": False, "reason": "fetcher 응답 형식 오류", "body_chars": 0}
    return result
|
||||
+193
@@ -11,11 +11,116 @@ endpoint 를 못 부른다(silent fallback 0, rules no-silent-fallback).
|
||||
- _request() → endpoint 에 anthropic.com 있으면 raise(primary 오결선 방어, 이중보증)
|
||||
call_primary / call_triage / embed / rerank 는 그대로(내부 inference·임베딩 허용).
|
||||
egress 워커·시스템 경로는 기존 AIClient 유지 — fallback 은 시스템만, 이드만 박탈(분리).
|
||||
|
||||
eid-chat (D-5): 이드 채팅 SSE 스트리밍도 이 클래스의 call_stream() 한 곳 — RouterBackend
|
||||
직접 호출 금지, mode 어휘는 _CHAT_ALIAS 닫힌 매핑(daily/deep)만, 미지 mode = EidEgressBlocked.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from collections.abc import AsyncIterator
|
||||
from contextlib import AsyncExitStack
|
||||
|
||||
import httpx
|
||||
|
||||
from ai.client import AIClient
|
||||
from services.llm.backends import (
|
||||
MAC_MINI_DEFAULT,
|
||||
QWEN_MACBOOK,
|
||||
BackendUnavailable,
|
||||
_router_url, # router URL 단일 출처 재사용 (settings → env LLM_ROUTER_URL → MVP default)
|
||||
)
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
|
||||
# 이드 채팅 mode → router alias 닫힌 매핑 (D-2). 클라는 mode 만 보냄 — claude-cloud/auto 금지.
|
||||
_CHAT_ALIAS: dict[str, str] = {
|
||||
"daily": MAC_MINI_DEFAULT, # router tier_b → Mac mini :8801 gemma-4-26b
|
||||
"deep": QWEN_MACBOOK, # router named upstream → M5 Max Qwen3.6-27B (무게이트, D-2)
|
||||
}
|
||||
|
||||
# read 는 per-chunk 적용이라 MacBook wake(24s)+토큰 생성 간격 커버. connect 는 내부 router 라 짧게.
|
||||
_STREAM_TIMEOUT = httpx.Timeout(connect=5.0, read=120.0, write=30.0, pool=5.0)
|
||||
|
||||
# 스트림 중계 전체(업스트림 진입~종료) wall-clock 상한. per-chunk read timeout 만으로는
|
||||
# 토큰이 계속 흐르는 한 무한 점유 가능 → daily 는 mlx gate 를 물고 있어 deadline 필수.
|
||||
# deep 도 동일 적용(단순·일관). 정상 스트림(max_tokens 2048, ~90tps ≈ 23s)은 여유 통과.
|
||||
_STREAM_DEADLINE_S = 300.0
|
||||
|
||||
# error_reason allowlist — 이 밖(대문자/공백/JSON 직렬화 파편)은 일반화해 비노출
|
||||
_REASON_ALLOWED = re.compile(r"[a-z0-9_]{1,64}")
|
||||
|
||||
# 스트림 시작 전 transport 계열 실패 → BackendUnavailable 매핑 대상 (RouterBackend._post 와 동일 목록)
|
||||
_TRANSPORT_ERRORS = (
|
||||
httpx.ConnectError,
|
||||
httpx.ConnectTimeout,
|
||||
httpx.ReadTimeout,
|
||||
httpx.PoolTimeout,
|
||||
httpx.WriteTimeout,
|
||||
httpx.RemoteProtocolError,
|
||||
)
|
||||
|
||||
|
||||
def _stream_error_reason(status_code: int, body: bytes) -> str:
    """Derive an ``error_reason`` from a pre-stream 4xx/5xx response body.

    Lookup order: ``error.type``, then ``error.error_reason``, then the
    top-level ``error_reason``; when none is present a status-based value is
    used (``upstream_502`` / ``router_503`` / ``router_http_<status>``).

    The result must fully match ``_REASON_ALLOWED``; anything else is
    generalised to ``upstream_502`` (for 502) or ``router_error`` so raw
    upstream text is never exposed.

    Args:
        status_code: HTTP status of the failed response.
        body: Raw response body.

    Returns:
        The sanitised reason string.
    """
    try:
        parsed = json.loads(body.decode("utf-8", errors="replace"))
    except Exception:
        parsed = {}
    payload = parsed if isinstance(parsed, dict) else {}

    candidate = None
    err = payload.get("error", {})
    if isinstance(err, dict):
        candidate = err.get("type") or err.get("error_reason")
    if not candidate:
        candidate = payload.get("error_reason")

    if candidate:
        reason = str(candidate)
    else:
        by_status = {502: "upstream_502", 503: "router_503"}
        reason = by_status.get(status_code, f"router_http_{status_code}")

    if _REASON_ALLOWED.fullmatch(reason):
        return reason
    if status_code == 502:
        return "upstream_502"
    return "router_error"
|
||||
|
||||
|
||||
def _rewrite_sse_line(line: bytes, mode: str) -> bytes:
    """Sanitise one SSE line: set ``model`` to the mode word and drop ``usage``.

    Per the original notes, upstream chunks can carry a filesystem path in
    ``model`` and machine telemetry in ``usage``; both must stay hidden from
    the chat surface, so each ``data`` line carrying a JSON object is
    rewritten. The SSE format allows the single space after ``data:`` to be
    omitted, so both ``data: {...}`` and ``data:{...}`` are sanitised.

    ``[DONE]``, non-data lines (including empty lines), payloads that fail
    to parse and non-object JSON are returned unchanged, which preserves
    the SSE framing.

    Args:
        line: One SSE line.
        mode: Chat mode word written into the ``model`` field.

    Returns:
        The rewritten line (always emitted with a ``data: `` prefix), or the
        original line when it is not a rewritable data line.
    """
    if not line.startswith(b"data:"):
        return line
    # Strip the field name plus the optional single space after the colon.
    prefix_len = len(b"data: ") if line.startswith(b"data: ") else len(b"data:")
    payload = line[prefix_len:]
    if payload.strip() == b"[DONE]":
        return line
    try:
        obj = json.loads(payload)
    except (ValueError, RecursionError):
        # Defensive: relay unparseable payloads untouched.
        return line
    if not isinstance(obj, dict):
        return line
    obj["model"] = mode
    obj.pop("usage", None)
    return b"data: " + json.dumps(obj, ensure_ascii=False).encode("utf-8")
|
||||
|
||||
|
||||
class EidEgressBlocked(RuntimeError):
|
||||
@@ -39,3 +144,91 @@ class EidAIClient(AIClient):
|
||||
if "anthropic.com" in endpoint:
|
||||
raise EidEgressBlocked(f"이드: 외부 endpoint 차단 ({endpoint}). 내부 inference 만.")
|
||||
return await super()._request(model_config, prompt, system=system)
|
||||
|
||||
async def call_stream(
    self, mode: str, messages: list[dict], system: str
) -> AsyncIterator[bytes]:
    """Relay an Eid chat SSE stream from the router, line by line (D-5).

    POSTs to ``<router>/v1/chat/completions`` with ``stream=True`` and yields
    the response as newline-terminated SSE lines, each passed through
    ``_rewrite_sse_line`` (model replaced by *mode*, usage removed).

    Args:
        mode: "daily" | "deep" — key into the closed ``_CHAT_ALIAS`` mapping.
        messages: User/assistant turns (no system role — the system prompt is
            supplied only through *system*).
        system: Composed system prompt; inserted at the front of *messages*
            with the system role.

    Yields:
        Sanitized SSE lines as bytes. A trailing fragment without a newline is
        flushed at end of stream without adding one.

    Raises:
        EidEgressBlocked: Unknown *mode*, or the router URL contains
            "anthropic.com" (external egress guard).
        ValueError: The router answered 400 — treated as an alias-drift code
            bug in the closed mapping, so it fails loudly rather than being
            reported as temporary unavailability.
        BackendUnavailable: Transport error before or during the stream, any
            other status >= 400 (reason from ``_stream_error_reason``), or
            the wall-clock deadline being exceeded
            ("stream_deadline_exceeded").

    When the alias equals ``MAC_MINI_DEFAULT`` the whole relay runs inside
    ``acquire_mlx_gate(Priority.FOREGROUND)``; other aliases run ungated.
    The ``asyncio.timeout(_STREAM_DEADLINE_S)`` deadline is entered only after
    the gate has been acquired, so time spent waiting for the gate does not
    count against it. The ``AsyncExitStack`` closes the response and client on
    exit, including cancellation.
    """
    alias = _CHAT_ALIAS.get(mode)
    if alias is None:
        raise EidEgressBlocked(
            f"이드: 미지 chat mode {mode!r} — 닫힌 매핑(daily/deep) 외 호출 차단."
        )
    router_url = _router_url()
    if "anthropic.com" in router_url:
        # Mirrors the existing _request guard — defends against external egress
        # if the router URL is mis-wired (belt and braces).
        raise EidEgressBlocked(f"이드: 외부 endpoint 차단 ({router_url}). 내부 router 만.")
    url = f"{router_url.rstrip('/')}/v1/chat/completions"
    payload = {
        "model": alias,
        "messages": [{"role": "system", "content": system}] + messages,
        "stream": True,
        "max_tokens": 2048,
        "temperature": 0.4,
    }
    async with AsyncExitStack() as stack:
        if alias == MAC_MINI_DEFAULT:
            await stack.enter_async_context(acquire_mlx_gate(Priority.FOREGROUND))
        client = await stack.enter_async_context(httpx.AsyncClient(timeout=_STREAM_TIMEOUT))
        try:
            # Wall-clock deadline — entered *after* the gate is acquired, so
            # the timeout sits inside the gate.
            async with asyncio.timeout(_STREAM_DEADLINE_S):
                try:
                    resp = await stack.enter_async_context(
                        client.stream("POST", url, json=payload)
                    )
                except _TRANSPORT_ERRORS as exc:
                    # Connection-class failure before the stream started; the
                    # reason uses the router_<ExceptionName> vocabulary.
                    raise BackendUnavailable(alias, f"router_{type(exc).__name__}") from exc
                if resp.status_code == 400:
                    # 400 from a closed mapping = alias-drift code bug, not
                    # temporary unavailability → fail loud.
                    body = await resp.aread()
                    try:
                        data = json.loads(body.decode("utf-8", errors="replace"))
                    except Exception:
                        data = {}
                    raise ValueError(f"router rejected alias={alias!r} body={data!r}")
                if resp.status_code >= 400:
                    body = await resp.aread()
                    raise BackendUnavailable(
                        alias, _stream_error_reason(resp.status_code, body)
                    )
                buf = b""
                try:
                    async for chunk in resp.aiter_bytes():
                        buf += chunk
                        # Line buffering — split on b"\n" across chunk
                        # boundaries and keep the unterminated remainder.
                        while (nl := buf.find(b"\n")) != -1:
                            line, buf = buf[:nl], buf[nl + 1:]
                            yield _rewrite_sse_line(line, mode) + b"\n"
                except _TRANSPORT_ERRORS as exc:
                    # Interrupted after start — chunks already yielded have been
                    # delivered; converge on the typed exception so the caller
                    # can terminate and clean up.
                    raise BackendUnavailable(alias, f"router_{type(exc).__name__}") from exc
                if buf:
                    # Flush the end-of-stream remainder (last line without a
                    # newline — do not add a "\n" the upstream never sent).
                    yield _rewrite_sse_line(buf, mode)
        except TimeoutError as exc:
            # asyncio.timeout expired — prevents holding the gate indefinitely;
            # converge on the typed exception.
            raise BackendUnavailable(alias, "stream_deadline_exceeded") from exc
|
||||
|
||||
@@ -50,6 +50,8 @@ _ROUTE: dict[str, dict] = {
|
||||
"react_ask": {"overlay": None, "variant": "full"},
|
||||
"study_subject_note": {"overlay": None, "variant": "full"},
|
||||
"study_question_explanation": {"overlay": None, "variant": "full"},
|
||||
# 이드 채팅 표면 (D-1 /api/eid/chat) — 자유-prose(base), persona ON (불변식 #3)
|
||||
"eid_chat": {"overlay": None, "variant": "full"},
|
||||
# 미래 active eid 표면 — 기능 overlay (W3+ 에서 호출 배선)
|
||||
"study_diagnosis": {"overlay": "study", "variant": "full"},
|
||||
"document_brief": {"overlay": "document", "variant": "full"},
|
||||
@@ -113,6 +115,17 @@ def is_composed_surface(surface: str) -> bool:
|
||||
return surface in _ROUTE
|
||||
|
||||
|
||||
def rules_present() -> bool:
    """Report whether ``rules.md`` currently exists in the substrate directory.

    Input for the chat surface's fail-closed decision (D-6). This function only
    provides the yes/no answer; refusing to proceed is the caller's job.
    The check goes to the filesystem on every call instead of through a cached
    reader, so a missing or restored ``rules.md`` is reflected immediately.
    """
    rules_file = _SUBSTRATE_DIR / "rules.md"
    return rules_file.is_file()
|
||||
|
||||
|
||||
def compose(surface: str, task: str, *, variant: str | None = None,
|
||||
budget_chars: int | None = None) -> str:
|
||||
"""persona → rules → overlay → task 단일 system 문자열 합성.
|
||||
|
||||
+22
@@ -17,10 +17,12 @@ from api.digest import router as digest_router
|
||||
from api.document_notes import router as document_notes_router
|
||||
from api.document_reads import router as document_reads_router
|
||||
from api.documents import router as documents_router
|
||||
from api.eid_chat import router as eid_chat_router
|
||||
from api.events import router as events_router
|
||||
from api.library import router as library_router
|
||||
from api.memos import router as memos_router
|
||||
from api.news import router as news_router
|
||||
from api.queue_overview import router as queue_overview_router
|
||||
from api.search import router as search_router
|
||||
from api.setup import router as setup_router
|
||||
from api.study_question_progress import router as study_question_progress_router
|
||||
@@ -54,6 +56,11 @@ async def lifespan(app: FastAPI):
|
||||
from workers.law_monitor import run as law_monitor_run
|
||||
from workers.mailplus_archive import run as mailplus_run
|
||||
from workers.news_collector import run as news_collector_run
|
||||
from workers.fulltext_worker import reconcile_unresolved as fulltext_reconcile_run
|
||||
from workers.kosha_collector import run as kosha_collector_run
|
||||
from workers.csb_collector import run as csb_collector_run
|
||||
from workers.api_standards_collector import run as api_standards_run
|
||||
from workers.ccps_collector import run as ccps_collector_run
|
||||
from workers.queue_consumer import consume_queue, consume_markdown_queue
|
||||
from workers.study_queue_consumer import consume_study_queue
|
||||
from workers.study_session_queue_consumer import consume_study_session_queue
|
||||
@@ -121,9 +128,20 @@ async def lifespan(app: FastAPI):
|
||||
# 이드 W3-2: 공부중 토픽 약점 derived 스냅샷 (nightly 04:30 KST, LLM 0). study_diagnosis 표면 source.
|
||||
scheduler.add_job(study_weakness_run, CronTrigger(hour=4, minute=30, timezone=KST), id="study_weakness")
|
||||
scheduler.add_job(news_collector_run, "interval", hours=6, id="news_collector")
|
||||
# crawl-24x7 A-2 안전망: fulltext 영구 실패(3회 소진) 문서를 RSS 요약 기준으로
|
||||
# 후속 enqueue (silent skip 누적 방지). 03:40 = dedup_reconcile(03:30) 직후 비충돌 슬롯.
|
||||
scheduler.add_job(fulltext_reconcile_run, CronTrigger(hour=3, minute=40, timezone=KST), id="fulltext_reconcile")
|
||||
# plan ds-s1-backend-1 B-4: dedup 컬럼(duplicate_of/duplicate_count) 야간 절대 재계산.
|
||||
# soft-delete 잔여 드리프트 정리(멱등, 드리프트 없으면 no-op). cron 03:30 (다른 잡과 비충돌).
|
||||
scheduler.add_job(dedup_reconcile_run, CronTrigger(hour=3, minute=30, timezone=KST), id="dedup_reconcile")
|
||||
# crawl-24x7 C-2: KOSHA 재해사례 diff + GUIDE 점진 백필 (daily, 새벽 잡들과 비충돌 슬롯).
|
||||
scheduler.add_job(kosha_collector_run, CronTrigger(hour=6, minute=40, timezone=KST), id="kosha_collector")
|
||||
# 사이클 3 C-2 잔여: CSB sitemap lastmod diff (weekly 월, cap 40 + 워터마크 점진 백필).
|
||||
scheduler.add_job(csb_collector_run, CronTrigger(day_of_week="mon", hour=6, minute=50, timezone=KST), id="csb_collector")
|
||||
# 사이클 3 C-4: API 표준 공지 목록 diff (monthly — 월 1~2건 공지 페이스).
|
||||
scheduler.add_job(api_standards_run, CronTrigger(day=5, hour=7, minute=5, timezone=KST), id="api_standards_collector")
|
||||
# 사이클 3 C-2 잔여: CCPS Beacon 월간 PDF (playwright 익명 경유 — WAF 차단 시 health 로 가시화).
|
||||
scheduler.add_job(ccps_collector_run, CronTrigger(day=5, hour=7, minute=20, timezone=KST), id="ccps_collector")
|
||||
scheduler.start()
|
||||
|
||||
# Phase 2.1 (async 구조): QueryAnalyzer prewarm.
|
||||
@@ -158,12 +176,16 @@ app.include_router(documents_router, prefix="/api/documents", tags=["documents"]
|
||||
app.include_router(document_reads_router, prefix="/api/documents", tags=["document-reads"])
|
||||
app.include_router(document_notes_router, prefix="/api/documents", tags=["document-notes"])
|
||||
app.include_router(search_router, prefix="/api/search", tags=["search"])
|
||||
# 이드 채팅 표면 (D-1) — POST /api/eid/chat. SSE 스트리밍, EidAIClient.call_stream 봉쇄 경유.
|
||||
app.include_router(eid_chat_router, prefix="/api/eid", tags=["eid-chat"])
|
||||
|
||||
app.include_router(memos_router, prefix="/api/memos", tags=["memos"])
|
||||
app.include_router(events_router, prefix="/api/events", tags=["events"])
|
||||
app.include_router(dashboard_router, prefix="/api/dashboard", tags=["dashboard"])
|
||||
app.include_router(library_router, prefix="/api/library", tags=["library"])
|
||||
app.include_router(news_router, prefix="/api/news", tags=["news"])
|
||||
# 처리 머신 보드 (plan ds-processing-ui-6an) — GET /api/queue/overview
|
||||
app.include_router(queue_overview_router, prefix="/api/queue", tags=["queue"])
|
||||
app.include_router(digest_router, prefix="/api/digest", tags=["digest"])
|
||||
app.include_router(briefing_router, prefix="/api/briefing", tags=["briefing"])
|
||||
app.include_router(audio_router, prefix="/api/audio", tags=["audio"])
|
||||
|
||||
@@ -118,7 +118,7 @@ class Document(Base):
|
||||
source_channel: Mapped[str | None] = mapped_column(
|
||||
Enum("law_monitor", "devonagent", "email", "web_clip",
|
||||
"tksafety", "inbox_route", "manual", "drive_sync", "news", "memo",
|
||||
"voice", "hermes",
|
||||
"voice", "hermes", "crawl",
|
||||
name="source_channel")
|
||||
)
|
||||
# 외부 채널 (Hermes Discord 등) 의 channel/user/message_id/timestamp 메타.
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import Boolean, DateTime, String, Text
|
||||
from sqlalchemy import Boolean, DateTime, Enum, Integer, String, Text
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from core.database import Base
|
||||
@@ -23,3 +24,32 @@ class NewsSource(Base):
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime(timezone=True), default=datetime.now
|
||||
)
|
||||
|
||||
# ── A-3 (plan crawl-24x7-1) 레지스트리 증축 — migration 319 ──
|
||||
# fetch_method: rss / rss+page / sitemap+page / page / api / signal-only
|
||||
fetch_method: Mapped[str] = mapped_column(String(20), default="rss")
|
||||
# fulltext_policy: none(현행) / page(기사 페이지 fetch 후 4-tier 승격) / feed-full(피드 본문이 전문)
|
||||
fulltext_policy: Mapped[str] = mapped_column(String(20), default="none")
|
||||
# NULL=공개, 값=구독 세션 키 (B-3 Playwright 어댑터 슬롯)
|
||||
auth_profile: Mapped[str | None] = mapped_column(String(50))
|
||||
# 소스별 차등 폴링 (NULL=전역 6h 사이클)
|
||||
poll_interval_minutes: Mapped[int | None] = mapped_column(Integer)
|
||||
# 조건부 GET 워터마크 — 서버가 준 값 그대로 저장·재전송 (A-1)
|
||||
etag: Mapped[str | None] = mapped_column(Text)
|
||||
last_modified: Mapped[str | None] = mapped_column(Text)
|
||||
# CDN ETag 회전 대비 콘텐츠 해시 변경감지 병행 (A-1)
|
||||
feed_content_hash: Mapped[str | None] = mapped_column(String(64))
|
||||
# 추출 실패 잦은 소스의 site-specific CSS selector (A-2)
|
||||
selector_override: Mapped[dict | None] = mapped_column(JSONB)
|
||||
# rdf / table-strip / gn-redirect / skip-video 등 파서 특이 케이스 (B-5)
|
||||
parser_quirk: Mapped[str | None] = mapped_column(String(30))
|
||||
# 채널 — 'news'(다이제스트/브리핑 대상) / 'crawl'(도메인 재료, 0-5 (a)) — migration 324.
|
||||
# documents.source_channel 로 전파, crawl 채널은 embed/chunk 30일 게이트 미적용.
|
||||
# documents 와 동일 PG enum 재사용 (Document 모델과 값 목록 동기 유지).
|
||||
source_channel: Mapped[str] = mapped_column(
|
||||
Enum("law_monitor", "devonagent", "email", "web_clip",
|
||||
"tksafety", "inbox_route", "manual", "drive_sync", "news", "memo",
|
||||
"voice", "hermes", "crawl",
|
||||
name="source_channel"),
|
||||
default="news",
|
||||
)
|
||||
|
||||
+30
-2
@@ -2,14 +2,41 @@
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import BigInteger, DateTime, Enum, ForeignKey, SmallInteger, Text, text
|
||||
from sqlalchemy import BigInteger, DateTime, Enum, ForeignKey, SmallInteger, Text, func, or_, text
|
||||
from sqlalchemy.dialects.postgresql import JSONB, insert as pg_insert
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.types import TIMESTAMP
|
||||
|
||||
from core.database import Base
|
||||
|
||||
|
||||
class StageDeferred(Exception):
    """Signal that a worker cannot process an item right now and wants it deferred.

    Per the original design note (ds-macbook-offload-1): raised on the MacBook
    deep-slot path for 503 responses, connection failures and mid-generation
    disconnects. The queue consumers are expected to return the row to pending
    without consuming an attempt and to record a back-off in
    ``payload.deferred_until``.

    Attributes:
        retry_after_minutes: Suggested back-off before the item is retried.
    """

    def __init__(self, reason: str, retry_after_minutes: int = 30):
        # The reason becomes the exception message (args[0]).
        self.retry_after_minutes = retry_after_minutes
        super().__init__(reason)
|
||||
|
||||
|
||||
def not_deferred_condition():
    """Build the claim filter that excludes rows whose back-off is still in the future.

    The back-off lives in ``payload.deferred_until`` as text. Rows with no such
    value pass; otherwise the value is cast to a timezone-aware timestamp and
    must be at or before the database's current time.

    Returns:
        A SQLAlchemy OR expression for use in a WHERE clause.
    """
    deferred_until = ProcessingQueue.payload["deferred_until"].astext
    never_deferred = deferred_until.is_(None)
    backoff_elapsed = deferred_until.cast(TIMESTAMP(timezone=True)) <= func.now()
    return or_(never_deferred, backoff_elapsed)
|
||||
|
||||
|
||||
class ProcessingQueue(Base):
|
||||
__tablename__ = "processing_queue"
|
||||
|
||||
@@ -18,10 +45,11 @@ class ProcessingQueue(Base):
|
||||
stage: Mapped[str] = mapped_column(
|
||||
# 'stt' (audio): migration 150 / 'thumbnail' (video): queue_consumer 가 enqueue.
|
||||
# 'deep_summary' (PR-B B-1): classify_worker 가 에스컬레이션 시 enqueue.
|
||||
# 'fulltext' (crawl-24x7 A-2): migration 321 — 기사 페이지 fetch 후 본문 승격.
|
||||
# DB enum 변경은 마이그레이션이 처리하므로 create_type=False.
|
||||
Enum(
|
||||
"extract", "classify", "summarize", "embed", "chunk", "preview",
|
||||
"stt", "thumbnail", "deep_summary", "markdown",
|
||||
"stt", "thumbnail", "deep_summary", "markdown", "fulltext",
|
||||
name="process_stage",
|
||||
create_type=False,
|
||||
),
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
"""source_health 테이블 ORM (A-5, plan crawl-24x7-1)
|
||||
|
||||
news_sources 와 1:1. 소스별 fetch 성공/실패 기록 + circuit breaker 상태.
|
||||
silent skip 누적 방지의 가시성 기반 — A-8 헬스 패널이 읽는다.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import BigInteger, Boolean, DateTime, ForeignKey, Integer, String, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from core.database import Base
|
||||
|
||||
|
||||
class SourceHealth(Base):
    """ORM row for ``source_health``: per-source fetch outcome counters and circuit state.

    Each row references one ``news_sources`` row via ``source_id`` and is
    removed with it (``ondelete="CASCADE"``).
    """

    __tablename__ = "source_health"

    id: Mapped[int] = mapped_column(primary_key=True)
    source_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("news_sources.id", ondelete="CASCADE"), nullable=False
    )
    consecutive_failures: Mapped[int] = mapped_column(Integer, default=0)
    total_fetches: Mapped[int] = mapped_column(BigInteger, default=0)
    total_failures: Mapped[int] = mapped_column(BigInteger, default=0)
    last_success_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    last_error: Mapped[str | None] = mapped_column(Text)
    last_error_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    last_fetch_items: Mapped[int | None] = mapped_column(Integer)
    # Number of consecutive fetches that returned 200 with zero entries
    # (304 / identical-hash responses are not counted — feed-rot signal only).
    empty_streak: Mapped[int] = mapped_column(Integer, default=0)
    # closed (healthy) / open (consecutive failures → exponential back-off) /
    # disabled (threshold exceeded, needs manual recovery)
    circuit_state: Mapped[str] = mapped_column(String(10), default="closed")
    circuit_opened_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    # NOTE(review): datetime.now() without a tz argument returns a naive local
    # timestamp while the column is timezone-aware — confirm this is intended.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=datetime.now
    )

    # ── B-3 subscription-session state contract — migration 325 ──
    # Single write flag: the A-8 button only records it, the adapter consumes
    # it (manual half-open). It is consumed before the open-skip branch so the
    # button cannot become a dead control.
    relogin_requested: Mapped[bool] = mapped_column(Boolean, default=False)
    # Result of the content-based probe (time-based expiry checks are
    # deliberately not used, to avoid silently ingesting paywall notices).
    last_probe_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    last_probe_ok: Mapped[bool | None] = mapped_column(Boolean)
|
||||
@@ -17,10 +17,13 @@ python-multipart>=0.0.9
|
||||
jinja2>=3.1.0
|
||||
feedparser>=6.0.0
|
||||
pymupdf>=1.24.0
|
||||
# Web/Blog ingest (devonagent 트랙) — HTML 본문 정화 4-tier fallback
|
||||
trafilatura>=1.12.0
|
||||
# Web/Blog ingest (devonagent 트랙) + 뉴스 fulltext 승격 (crawl-24x7 A-2) — 4-tier fallback.
|
||||
# trafilatura 는 단일 메인테이너 리스크로 exact pin (A-2 결정).
|
||||
trafilatura==2.1.0
|
||||
readability-lxml>=0.8.1
|
||||
markdownify>=0.13.1
|
||||
# tier-4 (bs4) 가 직접 import — 전이 의존 가정 제거 (crawl-24x7 A-2)
|
||||
beautifulsoup4>=4.12.0
|
||||
# office OOXML(docx/xlsx/pptx) → md (plan ds-s1-backend-1 C-1).
|
||||
# 정확한 핀은 E-1 markitdown OOXML PoC(devsbx/버전핀 컨텍스트)에서 확정.
|
||||
markitdown[docx,xlsx,pptx]>=0.1.0
|
||||
|
||||
@@ -0,0 +1,410 @@
|
||||
"""처리 머신 보드 + ETA 집계 (plan ds-processing-ui-6an, 안2+안5/6).
|
||||
|
||||
GET /api/queue/overview 의 집계 로직. 모든 수치는 기존 processing_queue /
|
||||
documents 컬럼에서 라이브 계산 — 신규 테이블/마이그레이션 0 (HARD 제약).
|
||||
|
||||
구조: SQL 수집부(build_overview 내부 5쿼리)와 판정부(순수 함수)를 분리.
|
||||
판정부(rows_to_* / build_machines / build_summarize_eta / build_trend /
|
||||
build_totals / compute_eta_minutes)는 DB 없이 단위테스트 가능.
|
||||
|
||||
귀속 규칙 (단일 진실):
|
||||
- stage→machine 정적 맵: gpu = extract/embed/chunk/markdown/preview/thumbnail/
|
||||
fulltext/stt · macmini = classify/summarize · macbook = deep_summary
|
||||
(단, settings.ai.deep 부재 시 deep_summary 도 macmini 귀속).
|
||||
- summarize 는 풀(pool): pending/processing/failed 는 macmini 귀속이되, 완료
|
||||
실적(done_*)은 documents.ai_model_version 조인으로 분리 — 'qwen-macbook'
|
||||
이면 macbook 실적, 아니면 macmini 실적.
|
||||
- deferred_pending(payload.deferred_until 미래)은 macbook 카드 귀속
|
||||
(보류 = 맥북 불가 신호).
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from posixpath import basename
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from core.config import settings
|
||||
|
||||
KST = ZoneInfo("Asia/Seoul")
|
||||
|
||||
# 내부 판별용 alias — 응답에 raw 모델명 노출 금지, 머신 label 만 노출.
|
||||
_MACBOOK_MODEL_ALIAS = "qwen-macbook"
|
||||
|
||||
# stage→machine 정적 맵 재료 (선언 순서 = 카드 stages 표시 순서)
|
||||
_GPU_STAGES = (
|
||||
"extract", "embed", "chunk", "markdown",
|
||||
"preview", "thumbnail", "fulltext", "stt",
|
||||
)
|
||||
_MACMINI_STAGES = ("classify", "summarize")
|
||||
_MACBOOK_STAGES = ("deep_summary",)
|
||||
_STAGE_ORDER = _GPU_STAGES + _MACMINI_STAGES + _MACBOOK_STAGES
|
||||
|
||||
_MACHINE_KEYS = ("gpu", "macmini", "macbook")
|
||||
_MACHINE_LABELS = {
|
||||
"gpu": "GPU 서버",
|
||||
"macmini": "맥미니",
|
||||
"macbook": "맥북 M5 Max",
|
||||
}
|
||||
|
||||
# 머신 카드당 current 표시 상한
|
||||
_CURRENT_LIMIT = 2
|
||||
|
||||
|
||||
def stage_machine_map(deep_enabled: bool) -> dict[str, str]:
    """Return the stage → machine-key mapping.

    GPU stages map to "gpu" and Mac mini stages to "macmini". MacBook stages
    map to "macbook" only when *deep_enabled* is true; otherwise they are
    attributed to "macmini".
    """
    deep_owner = "macbook" if deep_enabled else "macmini"
    return {
        **dict.fromkeys(_GPU_STAGES, "gpu"),
        **dict.fromkeys(_MACMINI_STAGES, "macmini"),
        **dict.fromkeys(_MACBOOK_STAGES, deep_owner),
    }
|
||||
|
||||
|
||||
def _zero_stage() -> dict:
|
||||
return {
|
||||
"pending": 0, "processing": 0, "failed": 0,
|
||||
"done_1h": 0, "done_today": 0, "done_15m": 0,
|
||||
"deferred_pending": 0, "created_1h": 0, "oldest_pending_at": None,
|
||||
}
|
||||
|
||||
|
||||
def rows_to_stage_stats(rows) -> dict[str, dict]:
    """Convert stage×status aggregate rows into ``{stage: stats}``.

    Each row is positional: column 0 is the stage name, columns 1-8 are the
    counters in the order listed below (NULL counts become 0), and an optional
    column 9 is the oldest pending timestamp (``None`` when absent).
    """
    counter_columns = (
        "pending", "processing", "failed",
        "done_1h", "done_today", "done_15m",
        "deferred_pending", "created_1h",
    )
    stats: dict[str, dict] = {}
    for row in rows:
        record = {
            name: int(row[position] or 0)
            for position, name in enumerate(counter_columns, start=1)
        }
        record["oldest_pending_at"] = row[9] if len(row) > 9 else None
        stats[row[0]] = record
    return stats
|
||||
|
||||
|
||||
def rows_to_summarize_split(rows) -> dict[str, dict]:
    """Fold summarize-completion rows into per-machine done counters.

    Each row is ``(is_macbook, done_1h, done_today, done_15m)``. A truthy first
    column credits "macbook", anything else credits "macmini"; NULL counts are
    treated as 0 and multiple rows for the same machine accumulate.
    """
    windows = ("done_1h", "done_today", "done_15m")
    split = {machine: dict.fromkeys(windows, 0) for machine in ("macbook", "macmini")}
    for row in rows:
        bucket = split["macbook" if row[0] else "macmini"]
        for position, window in enumerate(windows, start=1):
            bucket[window] += int(row[position] or 0)
    return split
|
||||
|
||||
|
||||
def display_title(row: dict) -> str:
    """Pick a human-readable title for a document row.

    Preference order: ``title`` > ``original_filename`` > basename of
    ``file_path`` > a generated "문서 #<document_id>" label. Empty or missing
    values are skipped at every step.

    Args:
        row: Mapping with ``document_id`` and optionally ``title``,
            ``original_filename`` and ``file_path``.

    Returns:
        The first non-empty candidate, never an empty string.

    Raises:
        KeyError: If every candidate is empty and ``document_id`` is missing.
    """
    for field in ("title", "original_filename"):
        if row.get(field):
            return row[field]
    if row.get("file_path"):
        name = basename(row["file_path"].rstrip("/"))
        # A path made only of slashes ("/") has an empty basename; fall through
        # to the id label rather than returning a blank title.
        if name:
            return name
    return f"문서 #{row['document_id']}"
|
||||
|
||||
|
||||
def build_machines(
    stage_stats: dict[str, dict],
    summarize_split: dict[str, dict],
    current_rows: list[dict],
    *,
    deep_enabled: bool,
) -> list[dict]:
    """Assemble the machine cards (one per key in ``_MACHINE_KEYS``).

    Args:
        stage_stats: Per-stage counters, as produced by ``rows_to_stage_stats``.
        summarize_split: Per-machine summarize completion counters.
        current_rows: Rows currently processing (stage, document_id, title
            material), already ordered by the caller.
        deep_enabled: Whether the deep slot exists; controls where
            deep_summary is attributed.

    Returns:
        One dict per machine with state, stage list, counters and up to
        ``_CURRENT_LIMIT`` "current" items.
    """
    smap = stage_machine_map(deep_enabled)

    def total(stage_names, field: str) -> int:
        # Missing stages / fields count as 0.
        return sum(stage_stats.get(name, {}).get(field, 0) for name in stage_names)

    # Attribute processing rows to machines, capped per card.
    current_by_machine: dict[str, list[dict]] = {key: [] for key in _MACHINE_KEYS}
    for row in current_rows:
        owner = smap.get(row["stage"])
        if not owner:
            continue
        slots = current_by_machine[owner]
        if len(slots) >= _CURRENT_LIMIT:
            continue
        slots.append({
            "document_id": row["document_id"],
            "title": display_title(row),
            "stage": row["stage"],
        })

    # Deferred back-off counts from summarize and deep_summary are shown on the
    # macbook card only, regardless of whether the deep slot is enabled.
    deferred_total = total(("summarize", "deep_summary"), "deferred_pending")

    cards = []
    for key in _MACHINE_KEYS:
        stages = [stage for stage in _STAGE_ORDER if smap[stage] == key]

        # Completion counters: summarize is a shared pool, so it is excluded
        # from the per-stage sum and credited through summarize_split instead.
        non_pool = [stage for stage in stages if stage != "summarize"]
        done = {
            window: total(non_pool, window)
            for window in ("done_1h", "done_today", "done_15m")
        }
        if key in summarize_split:
            for window in done:
                done[window] += summarize_split[key][window]

        processing = total(stages, "processing")
        deferred_pending = deferred_total if key == "macbook" else 0

        # State priority: active > deferred > idle. Work in progress or
        # completed in the last 15 minutes wins over outstanding back-off.
        if processing > 0 or done["done_15m"] > 0:
            state = "active"
        elif key == "macbook" and deferred_pending > 0:
            state = "deferred"
        else:
            state = "idle"

        cards.append({
            "key": key,
            "label": _MACHINE_LABELS[key],
            "state": state,
            "stages": stages,
            "pending": total(stages, "pending"),
            "processing": processing,
            "failed": total(stages, "failed"),
            "done_1h": done["done_1h"],
            "done_today": done["done_today"],
            "deferred_pending": deferred_pending,
            "current": current_by_machine[key],
        })
    return cards
|
||||
|
||||
|
||||
def compute_eta_minutes(pending: int, done_1h: int, inflow_1h: int) -> int | None:
|
||||
"""ETA(분) = 순소화율 기반. done > inflow 일 때만 산출, 아니면 None (소화 불가)."""
|
||||
if done_1h > inflow_1h:
|
||||
return round(pending / (done_1h - inflow_1h) * 60)
|
||||
return None
|
||||
|
||||
|
||||
def build_summarize_eta(stage_stats: dict[str, dict]) -> dict:
    """Build the ETA summary for the summarize pool.

    Uses the summarize stage counters (all-zero when the stage is absent):
    total pending, completions in the last hour and rows created in the last
    hour, plus the ETA derived from them by ``compute_eta_minutes``.
    """
    summarize = stage_stats.get("summarize", _zero_stage())
    backlog = summarize["pending"]
    completed_last_hour = summarize["done_1h"]
    created_last_hour = summarize["created_1h"]
    eta = compute_eta_minutes(backlog, completed_last_hour, created_last_hour)
    return {
        "pending": backlog,
        "done_rate_1h": completed_last_hour,
        "inflow_rate_1h": created_last_hour,
        "eta_minutes": eta,
    }
|
||||
|
||||
|
||||
def build_trend(
    inflow_buckets: dict[str, int],
    done_buckets: dict[str, int],
    now_kst: datetime,
) -> list[dict]:
    """Build the 24 hourly trend points ending at the current hour, oldest first.

    Bucket keys are formatted "YYYY-MM-DD HH:00"; hours missing from either
    mapping are reported as 0. Each point carries the hour label ("HH:00"),
    the inflow count and the done count.
    """
    current_hour = now_kst.replace(minute=0, second=0, microsecond=0)
    hours = (current_hour - timedelta(hours=back) for back in range(23, -1, -1))
    return [
        {
            "hour": hour.strftime("%H:00"),
            "inflow": inflow_buckets.get(hour.strftime("%Y-%m-%d %H:00"), 0),
            "done": done_buckets.get(hour.strftime("%Y-%m-%d %H:00"), 0),
        }
        for hour in hours
    ]
|
||||
|
||||
|
||||
def build_stages(stage_stats: dict[str, dict], now=None) -> list[dict]:
    """Build one status row per stage for the stage-detail panel.

    Known stages come first in ``_STAGE_ORDER`` order (all-zero when they have
    no stats), followed by any stages present in *stage_stats* but not in that
    order. ``oldest_pending_age_sec`` is the non-negative age in seconds of the
    oldest pending row, or ``None`` when there is none; a naive timestamp is
    interpreted as UTC.

    Args:
        stage_stats: Per-stage counters.
        now: Reference time; defaults to the current UTC time.
    """
    from datetime import datetime, timezone

    if not now:
        now = datetime.now(timezone.utc)
    unknown_stages = [name for name in stage_stats if name not in _STAGE_ORDER]
    result = []
    for stage in (*_STAGE_ORDER, *unknown_stages):
        counters = stage_stats.get(stage) or _zero_stage()
        oldest = counters.get("oldest_pending_at")
        if oldest is None:
            age = None
        else:
            if oldest.tzinfo is None:
                oldest = oldest.replace(tzinfo=timezone.utc)
            # Clamp to 0 in case the timestamp is ahead of `now`.
            age = max(0, int((now - oldest).total_seconds()))
        result.append({
            "stage": stage,
            "pending": counters["pending"],
            "processing": counters["processing"],
            "failed": counters["failed"],
            "done_today": counters["done_today"],
            "oldest_pending_age_sec": age,
        })
    return result
|
||||
|
||||
|
||||
def build_totals(stage_stats: dict[str, dict]) -> dict:
    """Sum pending / processing / failed across every stage."""
    totals = {"pending": 0, "processing": 0, "failed": 0}
    for counters in stage_stats.values():
        for field in totals:
            totals[field] += counters[field]
    return totals
|
||||
|
||||
|
||||
def compose_overview(
    stage_stats: dict[str, dict],
    summarize_split: dict[str, dict],
    inflow_buckets: dict[str, int],
    done_buckets: dict[str, int],
    current_rows: list[dict],
    *,
    deep_enabled: bool,
    now_kst: datetime,
) -> dict:
    """Turn collected statistics into the response dict; performs no database access.

    The result has the keys ``machines``, ``stages``, ``summarize_eta``,
    ``trend_24h`` and ``totals``, each produced by the matching ``build_*``
    helper.
    """
    machines = build_machines(
        stage_stats, summarize_split, current_rows, deep_enabled=deep_enabled
    )
    stages = build_stages(stage_stats)
    summarize_eta = build_summarize_eta(stage_stats)
    trend = build_trend(inflow_buckets, done_buckets, now_kst)
    totals = build_totals(stage_stats)
    return {
        "machines": machines,
        "stages": stages,
        "summarize_eta": summarize_eta,
        "trend_24h": trend,
        "totals": totals,
    }
|
||||
|
||||
|
||||
# ─── SQL 수집부 (총 5쿼리) ────────────────────────────────────────────────────
|
||||
|
||||
# 1) stage×status 집계 + 시간창 완료/유입 + 보류 (1방)
|
||||
_STAGE_STATS_SQL = """
|
||||
SELECT
|
||||
stage,
|
||||
COUNT(*) FILTER (WHERE status = 'pending') AS pending,
|
||||
COUNT(*) FILTER (WHERE status = 'processing') AS processing,
|
||||
COUNT(*) FILTER (WHERE status = 'failed') AS failed,
|
||||
COUNT(*) FILTER (WHERE status = 'completed'
|
||||
AND completed_at > NOW() - INTERVAL '1 hour') AS done_1h,
|
||||
COUNT(*) FILTER (WHERE status = 'completed'
|
||||
AND completed_at > :kst_midnight) AS done_today,
|
||||
COUNT(*) FILTER (WHERE status = 'completed'
|
||||
AND completed_at > NOW() - INTERVAL '15 minutes') AS done_15m,
|
||||
COUNT(*) FILTER (WHERE status = 'pending'
|
||||
AND payload ->> 'deferred_until' IS NOT NULL
|
||||
AND (payload ->> 'deferred_until')::timestamptz > NOW())
|
||||
AS deferred_pending,
|
||||
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '1 hour') AS created_1h,
|
||||
MIN(created_at) FILTER (WHERE status = 'pending') AS oldest_pending_at
|
||||
FROM processing_queue
|
||||
GROUP BY stage
|
||||
"""
|
||||
|
||||
# 2) summarize 풀 완료 실적 분리 (documents.ai_model_version 조인, 1방)
|
||||
# 스캔 하한 = 오늘 0시(KST)와 1h 전 중 더 이른 시각 (자정 직후 1h 창 보전).
|
||||
_SUMMARIZE_SPLIT_SQL = """
|
||||
SELECT
|
||||
COALESCE(d.ai_model_version = :macbook_alias, false) AS is_macbook,
|
||||
COUNT(*) FILTER (WHERE q.completed_at > NOW() - INTERVAL '1 hour') AS done_1h,
|
||||
COUNT(*) FILTER (WHERE q.completed_at > :kst_midnight) AS done_today,
|
||||
COUNT(*) FILTER (WHERE q.completed_at > NOW() - INTERVAL '15 minutes') AS done_15m
|
||||
FROM processing_queue q
|
||||
JOIN documents d ON d.id = q.document_id
|
||||
WHERE q.stage = 'summarize'
|
||||
AND q.status = 'completed'
|
||||
AND q.completed_at > LEAST(:kst_midnight, NOW() - INTERVAL '1 hour')
|
||||
GROUP BY 1
|
||||
"""
|
||||
|
||||
# 3/4) summarize 24h 추이 — KST 시간 버킷 (inflow/done 각 1방)
|
||||
_TREND_INFLOW_SQL = """
|
||||
SELECT to_char(date_trunc('hour', created_at AT TIME ZONE 'Asia/Seoul'),
|
||||
'YYYY-MM-DD HH24:00') AS bucket,
|
||||
COUNT(*) AS n
|
||||
FROM processing_queue
|
||||
WHERE stage = 'summarize'
|
||||
AND created_at > NOW() - INTERVAL '24 hours'
|
||||
GROUP BY 1
|
||||
"""
|
||||
|
||||
_TREND_DONE_SQL = """
|
||||
SELECT to_char(date_trunc('hour', completed_at AT TIME ZONE 'Asia/Seoul'),
|
||||
'YYYY-MM-DD HH24:00') AS bucket,
|
||||
COUNT(*) AS n
|
||||
FROM processing_queue
|
||||
WHERE stage = 'summarize'
|
||||
AND status = 'completed'
|
||||
AND completed_at > NOW() - INTERVAL '24 hours'
|
||||
GROUP BY 1
|
||||
"""
|
||||
|
||||
# 5) processing 행 + 표시용 제목 재료 (1방 — 머신별 2건 슬라이스는 판정부에서)
|
||||
_CURRENT_SQL = """
|
||||
SELECT q.stage, q.document_id, d.title, d.original_filename, d.file_path
|
||||
FROM processing_queue q
|
||||
JOIN documents d ON d.id = q.document_id
|
||||
WHERE q.status = 'processing'
|
||||
ORDER BY q.started_at DESC NULLS LAST
|
||||
LIMIT 50
|
||||
"""
|
||||
|
||||
|
||||
async def build_overview(session: AsyncSession) -> dict:
    """Run the five overview queries and pass their rows to ``compose_overview``.

    Args:
        session: Async SQLAlchemy session used for all five queries.

    Returns:
        Whatever ``compose_overview`` returns for the collected rows.
    """
    now_kst = datetime.now(KST)
    kst_midnight = now_kst.replace(hour=0, minute=0, second=0, microsecond=0)
    deep_enabled = not (settings.ai is None or settings.ai.deep is None)

    stage_result = await session.execute(
        text(_STAGE_STATS_SQL), {"kst_midnight": kst_midnight}
    )
    stage_rows = stage_result.all()

    split_result = await session.execute(
        text(_SUMMARIZE_SPLIT_SQL),
        {"kst_midnight": kst_midnight, "macbook_alias": _MACBOOK_MODEL_ALIAS},
    )
    split_rows = split_result.all()

    inflow_rows = (await session.execute(text(_TREND_INFLOW_SQL))).all()
    done_rows = (await session.execute(text(_TREND_DONE_SQL))).all()
    current_result = (await session.execute(text(_CURRENT_SQL))).all()

    # Column order matches the SELECT list of _CURRENT_SQL.
    current_rows = []
    for row in current_result:
        current_rows.append(
            {
                "stage": row[0],
                "document_id": row[1],
                "title": row[2],
                "original_filename": row[3],
                "file_path": row[4],
            }
        )

    inflow_by_bucket = {row[0]: int(row[1]) for row in inflow_rows}
    done_by_bucket = {row[0]: int(row[1]) for row in done_rows}

    return compose_overview(
        rows_to_stage_stats(stage_rows),
        rows_to_summarize_split(split_rows),
        inflow_by_bucket,
        done_by_bucket,
        current_rows,
        deep_enabled=deep_enabled,
        now_kst=now_kst,
    )
|
||||
@@ -0,0 +1,250 @@
|
||||
"""C-4 ① API 표준 공지(Important Standards Announcements) 수집 워커 (사이클 3).
|
||||
|
||||
RSS 없음. 실측(2026-06-11) 결과 '페이지 diff' 가 아니라 공지별 상세 URL 이 있는
|
||||
목록 페이지(10건/페이지, ?page=N&pageSize=10 페이지네이션 ~12+) — 목록 링크 파싱
|
||||
→ 신규 상세 페이지만 ingest 가 정확하고 dedup 도 자연스럽다 (rss+page 패턴의 HTML 판).
|
||||
510/570/653 개정 공지가 업무 직결 — 표준 본문은 유료라 공지만 수집 (카드 C-4).
|
||||
|
||||
스케줄 = monthly (main.py 5일 07:05 KST) — 최근 2페이지 diff (월 1~2건 공지 페이스).
|
||||
초기 일괄: docker exec hyungi_document_server-fastapi-1 \
|
||||
python -m workers.api_standards_collector --bulk # 전 페이지 (~120건, politeness ~30분)
|
||||
|
||||
멱등: edit_url(정규화)+file_hash dedup — 재실행 = 신규분만.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.crawl_politeness import (
|
||||
CrawlBlocked,
|
||||
CrawlFetchError,
|
||||
CrawlSkip,
|
||||
fetch_page,
|
||||
)
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import enqueue_stage
|
||||
from workers.fulltext_worker import (
|
||||
_WEB_MIN_BODY_LEN,
|
||||
_extract_body,
|
||||
_raw_html_path,
|
||||
_save_raw_html,
|
||||
_strip_article_footer,
|
||||
)
|
||||
from workers.news_collector import (
|
||||
_get_or_create_health,
|
||||
_normalize_url,
|
||||
_record_failure,
|
||||
_record_success,
|
||||
)
|
||||
from workers.static_corpus_ingest import _page_title
|
||||
|
||||
logger = setup_logger("api_standards")
|
||||
|
||||
_BASE = "https://www.api.org"
|
||||
_LISTING_PATH = "/products-and-services/standards/important-standards-announcements"
|
||||
_LISTING_URL = f"{_BASE}{_LISTING_PATH}"
|
||||
_SOURCE_NAME = "API 표준 공지"
|
||||
|
||||
_SCHEDULED_PAGES = 2 # monthly diff 범위 (20건 — 월 1~2건 페이스에 충분한 겹침)
|
||||
_BULK_MAX_PAGES = 15 # 실측 12페이지 + 여유. 빈 페이지에서 조기 종료.
|
||||
|
||||
_DETAIL_RE = re.compile(
|
||||
r'href="(' + re.escape(_LISTING_PATH) + r'/[^"?#]+)"'
|
||||
)
|
||||
_DATE_RE = re.compile(
|
||||
r"(January|February|March|April|May|June|July|August|September|October"
|
||||
r"|November|December)\s+(\d{1,2}),?\s+(\d{4})"
|
||||
)
|
||||
_MONTHS = {m: i for i, m in enumerate(
|
||||
["January", "February", "March", "April", "May", "June", "July",
|
||||
"August", "September", "October", "November", "December"], start=1)}
|
||||
|
||||
|
||||
def _parse_listing(html_text: str) -> list[str]:
    """Return absolute detail-page URLs found in a listing page.

    URLs keep their first-seen order and duplicates are dropped. Which links
    count as detail pages is decided entirely by ``_DETAIL_RE``.
    """
    absolute_urls = (
        f"{_BASE}{match.group(1)}" for match in _DETAIL_RE.finditer(html_text)
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(absolute_urls))
|
||||
|
||||
|
||||
def _parse_pub_date(text: str) -> datetime | None:
    """Parse the first ``Month DD, YYYY`` date in *text* as a UTC datetime.

    Returns None when no such date is present or when the matched numbers do
    not form a valid calendar date.
    """
    found = _DATE_RE.search(text)
    if found is None:
        return None
    month_name, day, year = found.groups()
    try:
        return datetime(
            int(year), _MONTHS[month_name], int(day), tzinfo=timezone.utc
        )
    except ValueError:
        # e.g. "February 31, 2026"
        return None
|
||||
|
||||
|
||||
async def _get_or_create_source(session) -> NewsSource:
    """Return the NewsSource row named ``_SOURCE_NAME``, inserting it if absent.

    A newly created row is added and flushed on *session*; committing is left
    to the caller.
    """
    lookup = select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    found = (await session.execute(lookup)).scalars().first()
    if found is not None:
        return found

    created = NewsSource(
        name=_SOURCE_NAME,
        feed_url=_LISTING_URL,
        feed_type="rss",
        fetch_method="page",
        fulltext_policy="none",
        source_channel="crawl",
        category="Engineering",
        language="en",
        country="US",
        # Kept out of the 6-hourly news cycle; this worker polls monthly instead.
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
|
||||
|
||||
|
||||
async def _ingest_detail(session, source: NewsSource, url: str) -> str:
    """Ingest a single announcement detail page.

    Returns:
        ``"dup"`` if a Document with the same hash or URL already exists,
        ``"skip"`` if the fetch fails or the extracted body is too short,
        ``"ok"`` once a new Document is flushed and its stages are enqueued.
    """
    normalized_url = _normalize_url(url)
    ann_hash = hashlib.sha256(f"api-ann|{normalized_url}".encode()).hexdigest()[:32]

    dup_query = (
        select(Document)
        .where(
            (Document.file_hash == ann_hash)
            | (Document.edit_url.in_([normalized_url, url]))
        )
        .limit(1)
    )
    already_there = (await session.execute(dup_query)).scalars().first()
    if already_there:
        return "dup"

    try:
        html_text, final_url = await fetch_page(url)
    except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
        logger.warning(f"[api-std] fetch 실패 skip: {url} — {type(e).__name__}: {e}")
        return "skip"

    body, engine, engine_ver = _extract_body(html_text)
    if not engine:
        logger.warning(f"[api-std] 추출 실패 skip (< {_WEB_MIN_BODY_LEN}자): {url}")
        return "skip"
    clean_body = _strip_article_footer(body.replace("\x00", ""))
    if len(clean_body) < _WEB_MIN_BODY_LEN:
        return "skip"

    now = datetime.now(timezone.utc)
    raw_path = _raw_html_path(source.id, ann_hash, now)
    try:
        _save_raw_html(raw_path, html_text)
    except OSError as e:
        # Losing the raw HTML copy is logged but does not block the ingest.
        raw_saved = False
        logger.error(f"[api-std] 원본 보존 실패 (ingest 는 진행): {e}")
    else:
        raw_saved = True

    pub_dt = _parse_pub_date(clean_body)
    title = _page_title(html_text, fallback=url.rsplit("/", 1)[-1][:90])
    # Drop a trailing "| API" site suffix unless that would leave nothing.
    title = re.sub(r"\s*\|\s*API\s*$", "", title).strip() or title

    fulltext_meta = {
        "status": "api_announcement",
        "engine": engine,
        "final_url": final_url,
        "raw_html_path": str(raw_path) if raw_saved else None,
        "body_chars": len(clean_body),
        "resolved_at": now.isoformat(),
    }
    doc = Document(
        file_path=f"crawl/{_SOURCE_NAME}/{ann_hash}",
        file_hash=ann_hash,
        file_format="article",
        file_size=0,
        file_type="note",
        title=title,
        extracted_text=f"{title}\n\n{clean_body}",
        extracted_at=now,
        extractor_version=f"listing+page@{engine}",
        md_content=clean_body,
        md_status="success",
        md_extraction_engine=engine,
        md_extraction_engine_version=engine_ver,
        md_format_version="1.0",
        md_generated_at=now,
        md_source_hash=hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest(),
        md_content_hash=hashlib.sha256(clean_body.encode("utf-8")).hexdigest(),
        content_origin="extracted",
        source_channel="crawl",
        data_origin="external",
        edit_url=normalized_url,
        review_status="approved",
        ai_domain="Engineering",
        ai_sub_group=_SOURCE_NAME,
        ai_tags=["Engineering/API 표준 공지"],
        extract_meta={
            "source_id": source.id,
            "source_name": _SOURCE_NAME,
            "published_at": pub_dt.isoformat() if pub_dt else None,
            "fulltext": fulltext_meta,
        },
    )
    # Size is the UTF-8 byte length of the stored text (title + body).
    doc.file_size = len(doc.extracted_text.encode())
    session.add(doc)
    await session.flush()
    for stage in ("summarize", "embed", "chunk"):
        await enqueue_stage(session, doc.id, stage)
    logger.info(f"[api-std] ingest {len(clean_body)}자 ({engine}): {title[:60]}")
    return "ok"
|
||||
|
||||
|
||||
async def run(bulk: bool = False) -> None:
    """Collect announcements from the listing pages.

    Args:
        bulk: When true, walk up to ``_BULK_MAX_PAGES`` listing pages instead
            of the ``_SCHEDULED_PAGES`` used by the scheduled run.

    A crawl error on a listing page records a health failure and ends the run;
    otherwise a health success and ``last_fetched_at`` are recorded.
    """
    now = datetime.now(timezone.utc)
    async with async_session() as session:
        source = await _get_or_create_source(session)
        await session.commit()
        source_id = source.id

    if bulk:
        max_pages = _BULK_MAX_PAGES
    else:
        max_pages = _SCHEDULED_PAGES
    counts = {"ok": 0, "dup": 0, "skip": 0}

    try:
        for page in range(1, max_pages + 1):
            if page == 1:
                listing_url = _LISTING_URL
            else:
                listing_url = f"{_LISTING_URL}?page={page}&pageSize=10"
            html_text, _ = await fetch_page(listing_url)
            detail_urls = _parse_listing(html_text)
            if not detail_urls:
                # An empty listing page marks the end (early exit for bulk runs).
                break
            for url in detail_urls:
                # One session and one commit per announcement.
                async with async_session() as session:
                    src = await session.get(NewsSource, source_id)
                    status = await _ingest_detail(session, src, url)
                    await session.commit()
                counts[status] += 1
            logger.info(f"[api-std] 목록 p{page}: 누적 {counts}")
    except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
        logger.error(f"[api-std] 목록 수집 실패: {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e) or repr(e), now)
            await session.commit()
        return

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        _record_success(health, counts["ok"], False, now)
        src = await session.get(NewsSource, source_id)
        src.last_fetched_at = now
        await session.commit()
    logger.info(f"[api-std] 완료: {counts}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: --bulk walks every listing page (initial backfill).
    cli = argparse.ArgumentParser(description="API 표준 공지 수집")
    cli.add_argument("--bulk", action="store_true", help="전 페이지 일괄 (초기 백필)")
    options = cli.parse_args()
    asyncio.run(run(bulk=options.bulk))
|
||||
@@ -0,0 +1,185 @@
|
||||
"""C-2 잔여 ② CCPS Process Safety Beacon 수집 워커 (사이클 3).
|
||||
|
||||
월간 1페이지 PDF + 한국어 번역판 — RAG 청크로 이상적 크기 (카드 C-2).
|
||||
aiche.org 는 평문 httpx 를 UA 무관 403 (2026-06-11 실측: Archiver UA·브라우저 UA 모두)
|
||||
→ playwright-fetcher 익명 컨텍스트 경유 (B-3 인프라 재사용):
|
||||
목록 페이지 브라우저 fetch → beacon PDF 링크 파싱 → referer 쿠키 승계 다운로드.
|
||||
|
||||
알려진 리스크: WAF 가 헤드리스 자체를 차단하면 _CHALLENGE_MARKERS → CrawlBlocked
|
||||
→ health 실패 기록 후 종료 (르몽드 B-3 PARK 선례 — 그 경우 대안 = 이메일 구독
|
||||
.eml 트랙 결합, [[feedback_antibot_headless_subscription_wall]]).
|
||||
|
||||
스케줄 = monthly (main.py 5일 07:20 KST). 월간 1건 페이스라 diff 는 file_path dedup 으로 충분.
|
||||
수동: docker exec hyungi_document_server-fastapi-1 python -m workers.ccps_collector
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.config import settings
|
||||
from core.crawl_politeness import (
|
||||
CrawlBlocked,
|
||||
CrawlFetchError,
|
||||
CrawlSkip,
|
||||
download_via_browser,
|
||||
fetch_page_via_browser,
|
||||
)
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import enqueue_stage
|
||||
from workers.kosha_collector import _safe_filename
|
||||
from workers.news_collector import (
|
||||
_get_or_create_health,
|
||||
_record_failure,
|
||||
_record_success,
|
||||
)
|
||||
|
||||
logger = setup_logger("ccps_collector")
|
||||
|
||||
_BEACON_URL = "https://www.aiche.org/ccps/resources/process-safety-beacon"
|
||||
_SOURCE_NAME = "CCPS Process Safety Beacon"
|
||||
_MAX_PDFS_PER_RUN = 10 # 월간 1~2건(영/한) 페이스 — 페이지 구조 오판 시 폭주 방지
|
||||
|
||||
|
||||
def _beacon_pdf_links(html_text: str, base_url: str) -> list[str]:
|
||||
"""beacon 관련 PDF 링크 — href/앵커텍스트에 'beacon' 포함만 (보수적).
|
||||
|
||||
필터에 안 걸린 PDF 가 있으면 호출측이 로그로 가시화 (첫 실측에서 패턴 보정용).
|
||||
"""
|
||||
seen: set[str] = set()
|
||||
out: list[str] = []
|
||||
for m in re.finditer(
|
||||
r'<a\s+[^>]*href="([^"]+\.pdf(?:\?[^"]*)?)"[^>]*>(.*?)</a>',
|
||||
html_text, re.I | re.S,
|
||||
):
|
||||
href, text = m.group(1), re.sub(r"<[^>]+>", " ", m.group(2))
|
||||
if "beacon" not in href.lower() and "beacon" not in text.lower():
|
||||
continue
|
||||
absolute = urljoin(base_url, href)
|
||||
path = urlparse(absolute).path
|
||||
if path not in seen:
|
||||
seen.add(path)
|
||||
out.append(absolute)
|
||||
return out
|
||||
|
||||
|
||||
def _all_pdf_hrefs(html_text: str) -> list[str]:
|
||||
return sorted({m.group(1) for m in re.finditer(r'href="([^"]+\.pdf(?:\?[^"]*)?)"', html_text, re.I)})
|
||||
|
||||
|
||||
async def _get_or_create_source(session) -> NewsSource:
    """Return the NewsSource row named ``_SOURCE_NAME``, inserting it if absent.

    A newly created row is added and flushed on *session*; committing is left
    to the caller.
    """
    lookup = select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    found = (await session.execute(lookup)).scalars().first()
    if found is not None:
        return found

    created = NewsSource(
        name=_SOURCE_NAME,
        feed_url=_BEACON_URL,
        feed_type="rss",
        fetch_method="page",
        fulltext_policy="none",
        source_channel="crawl",
        category="Safety",
        language="en",
        country="US",
        # Kept out of the 6-hourly news cycle; this worker polls monthly instead.
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
|
||||
|
||||
|
||||
async def _ingest_pdf(session, pdf_url: str) -> bool:
    """Download one Beacon PDF, store it, and register it for extraction.

    Returns:
        False when a Document with the same ``file_path`` already exists
        (nothing is downloaded); True after a new Document is flushed and the
        ``extract`` stage is enqueued.

    Raises:
        CrawlSkip: The response neither declares a PDF content type nor
            starts with the ``%PDF`` magic bytes.
    """
    fname = _safe_filename(Path(urlparse(pdf_url).path).name)
    rel_path = f"crawl_raw/ccps_beacon/{fname}"

    dup_query = select(Document).where(Document.file_path == rel_path).limit(1)
    if (await session.execute(dup_query)).scalars().first():
        return False

    content, content_type = await download_via_browser(pdf_url, referer=_BEACON_URL)
    looks_like_pdf = "pdf" in content_type.lower() or content.startswith(b"%PDF")
    if not looks_like_pdf:
        raise CrawlSkip(f"PDF 아님 (content-type={content_type[:60]}): {pdf_url}")

    dest = Path(settings.nas_mount_path) / rel_path
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(content)

    # Title = file stem with underscores and hyphens turned into spaces.
    stem = fname.rsplit(".", 1)[0]
    doc = Document(
        file_path=rel_path,
        file_hash=hashlib.sha256(content).hexdigest(),
        file_format="pdf",
        file_size=len(content),
        file_type="immutable",
        title=stem.replace("_", " ").replace("-", " "),
        source_channel="crawl",
        data_origin="external",
        import_source="ccps_beacon",
        edit_url=pdf_url,
        ai_tags=["Safety/CCPS Beacon"],
        extract_meta={"ccps": {"kind": "beacon_pdf"}},
    )
    session.add(doc)
    await session.flush()
    await enqueue_stage(session, doc.id, "extract")
    logger.info(f"[ccps] Beacon ingest: {rel_path} ({len(content)} bytes)")
    return True
|
||||
|
||||
|
||||
async def run() -> None:
    """Fetch the Beacon page and ingest the PDFs it links to.

    Crawl errors raised while fetching the page, or when no beacon PDF link is
    found, are recorded as a health failure. Crawl errors on an individual PDF
    are logged and that PDF is skipped.
    """
    now = datetime.now(timezone.utc)
    async with async_session() as session:
        source = await _get_or_create_source(session)
        await session.commit()
        source_id = source.id

    try:
        html_text, final_url = await fetch_page_via_browser(_BEACON_URL, profile=None)
        links = _beacon_pdf_links(html_text, final_url)
        if not links:
            # Zero matches may mean the page structure or naming changed;
            # surface whatever PDFs were found as a hint for fixing the filter.
            others = _all_pdf_hrefs(html_text)
            raise CrawlFetchError(
                f"beacon PDF 0건 (전체 PDF {len(others)}건: {others[:5]})"
            )

        new_count = 0
        batch = links[:_MAX_PDFS_PER_RUN]
        for pdf_url in batch:
            async with async_session() as session:
                try:
                    is_new = await _ingest_pdf(session, pdf_url)
                    if is_new:
                        new_count += 1
                    await session.commit()
                except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
                    await session.rollback()
                    logger.warning(f"[ccps] PDF 실패 skip ({pdf_url}): {e}")

        if len(links) > _MAX_PDFS_PER_RUN:
            logger.warning(
                f"[ccps] PDF {len(links)}건 중 {_MAX_PDFS_PER_RUN}건만 처리 "
                f"(월간 1~2건 가정 초과 — 페이지 구조 확인 필요)"
            )

        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_success(health, new_count, False, now)
            src = await session.get(NewsSource, source_id)
            src.last_fetched_at = now
            await session.commit()
        logger.info(f"[ccps] 완료: 신규 {new_count}건 (링크 {len(links)}건)")
    except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
        logger.error(f"[ccps] 수집 실패: {type(e).__name__}: {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e) or repr(e), now)
            await session.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual invocation runs one collection pass.
    _entry = run()
    asyncio.run(_entry)
|
||||
@@ -0,0 +1,390 @@
|
||||
"""C-2 잔여 ① US CSB sitemap diff 수집 워커 (plan crawl-24x7-1, 사이클 3).
|
||||
|
||||
RSS 폐지 → sitemap.xml lastmod diff 폴링이 정석 (정부 사이트라 lastmod 양호 —
|
||||
2026-06-11 실측 1,307 URL, 조사 보고서 페이지는 루트 슬러그). 페이지 본문(4-tier
|
||||
≥200자 게이트) + 보고서 PDF(/assets/, recommendation 상태요약 제외) →
|
||||
기존 extract 파이프라인(marker/kordoc) 재사용.
|
||||
|
||||
스케줄 = weekly (main.py 월 06:50 KST):
|
||||
워터마크(selector_override.sitemap_watermark — B-3 probe 설정과 같은 JSONB 슬롯)
|
||||
이후 lastmod 만, 오래된 것부터 cap(40페이지/회). 워터마크는 처리분까지만 전진
|
||||
= 잔량 자동 점진 백필 (KOSHA GUIDE cap 패턴). cap 미처리 잔량은 매회 로그
|
||||
(silent cap 금지). diff 건수 > sanity(300) = sitemap 부패/lastmod 남발 의심 가시 경고.
|
||||
|
||||
초기 일괄 (cap 해제, politeness 로 수 시간 — docker exec -d, 진행 중 같은 서비스
|
||||
재배포 금지 [[feedback_docker_exec_orphan_kill]] 자매 함정):
|
||||
docker exec hyungi_document_server-fastapi-1 \
|
||||
python -m workers.csb_collector --limit 3 # 검증용
|
||||
docker exec -d hyungi_document_server-fastapi-1 \
|
||||
python -m workers.csb_collector --bulk # 전체
|
||||
|
||||
멱등: 페이지 = edit_url(정규화)+file_hash dedup (first-wins — lastmod 갱신 페이지의
|
||||
본문 재적재는 안 함, 갱신의 실체인 신규 PDF 는 개별 dedup 으로 적재됨).
|
||||
PDF = file_path dedup. 워터마크 경계는 >= 재조회 — 경계 페이지 1회 재fetch 후
|
||||
dedup 이 잡는다 (lastmod 실측 distinct 라 누적 재fetch 없음).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import random
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.config import settings
|
||||
from core.crawl_politeness import (
|
||||
CRAWL_UA,
|
||||
CrawlBlocked,
|
||||
CrawlFetchError,
|
||||
CrawlSkip,
|
||||
fetch_page,
|
||||
)
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import enqueue_stage
|
||||
from workers.fulltext_worker import (
|
||||
_WEB_MIN_BODY_LEN,
|
||||
_extract_body,
|
||||
_raw_html_path,
|
||||
_save_raw_html,
|
||||
_strip_article_footer,
|
||||
)
|
||||
from workers.kosha_collector import _safe_filename
|
||||
from workers.news_collector import (
|
||||
FeedError,
|
||||
_get_or_create_health,
|
||||
_normalize_url,
|
||||
_record_failure,
|
||||
_record_success,
|
||||
)
|
||||
from workers.static_corpus_ingest import _page_title
|
||||
|
||||
logger = setup_logger("csb_collector")
|
||||
|
||||
_SITEMAP_URL = "https://www.csb.gov/sitemap.xml"
|
||||
_SOURCE_NAME = "US CSB 사고조사보고서"
|
||||
|
||||
_RUN_PAGE_CAP = 40 # weekly 1회 처리 상한 — 잔량은 워터마크 미전진으로 자동 이월
|
||||
_DIFF_SANITY = 300 # 주간 diff 가 이를 넘으면 sitemap lastmod 남발/부패 의심 (카드 C-2)
|
||||
_MAX_PDF_BYTES = 50 * 1024 * 1024
|
||||
_PDF_DELAY = (2.0, 5.0) # 같은 도메인 연속 PDF 다운로드 간격 (kosha _DOWNLOAD_DELAY 동률)
|
||||
|
||||
# 텍스트 코퍼스 무가치/관리성 섹션 — 첫 path segment 기준 (조사 보고서·뉴스 릴리스는
|
||||
# 루트 슬러그라 영향 없음. /news/·/investigations/ 는 목록 페이지뿐이라 제외).
|
||||
_SKIP_FIRST_SEGMENT = {
|
||||
"videos", "photos", "events", "members", "disclaimers", "media-room",
|
||||
"about-the-csb", "about-us", "foia", "news", "investigations",
|
||||
"site-map", "subscribe", "unsubscribe", "optout", "test",
|
||||
"privacy-policy", "vulnerability-disclosure-policy", "en-espanol",
|
||||
"newsletter", "recom-stats", "500.aspx", "documents", "records-details",
|
||||
}
|
||||
|
||||
|
||||
def _parse_sitemap(xml_text: str) -> list[tuple[str, datetime]]:
|
||||
"""(url, lastmod) 목록 — lastmod 없는/파싱불가 항목은 제외 (diff 축이 없음)."""
|
||||
out: list[tuple[str, datetime]] = []
|
||||
for m in re.finditer(
|
||||
r"<url>\s*<loc>([^<]+)</loc>\s*<lastmod>([^<]+)</lastmod>", xml_text
|
||||
):
|
||||
try:
|
||||
lastmod = datetime.fromisoformat(m.group(2).strip())
|
||||
except ValueError:
|
||||
continue
|
||||
if lastmod.tzinfo is None:
|
||||
lastmod = lastmod.replace(tzinfo=timezone.utc)
|
||||
out.append((m.group(1).strip(), lastmod))
|
||||
return out
|
||||
|
||||
|
||||
def _should_skip(url: str) -> bool:
    """Tell whether *url* is excluded from ingestion.

    The site root is always skipped; any other URL is skipped when its first
    path segment (lower-cased) is listed in ``_SKIP_FIRST_SEGMENT``.
    """
    trimmed = urlparse(url).path.strip("/")
    if trimmed == "":
        return True  # home page
    first_segment, _, _ = trimmed.partition("/")
    return first_segment.lower() in _SKIP_FIRST_SEGMENT
|
||||
|
||||
|
||||
def _pdf_links(html_text: str, base_url: str) -> list[str]:
|
||||
"""페이지 내 보고서 PDF — /assets/recommendation/(상태변경 요약 다수)은 제외.
|
||||
|
||||
cache-buster 쿼리(?17346)는 다운로드 URL 에는 유지, dedup/파일명은 path 기준.
|
||||
"""
|
||||
seen: set[str] = set()
|
||||
out: list[str] = []
|
||||
for m in re.finditer(r'href="([^"]+\.pdf(?:\?[^"]*)?)"', html_text, re.I):
|
||||
absolute = urljoin(base_url, m.group(1))
|
||||
path = urlparse(absolute).path
|
||||
if "/assets/recommendation/" in path.lower():
|
||||
continue
|
||||
if (urlparse(absolute).hostname or "").lower() != "www.csb.gov":
|
||||
continue
|
||||
if path not in seen:
|
||||
seen.add(path)
|
||||
out.append(absolute)
|
||||
return out
|
||||
|
||||
|
||||
async def _download_pdf(url: str, dest: Path) -> int:
    """Download *url* to *dest* and return the number of bytes written.

    Sleeps a random interval from ``_PDF_DELAY`` first. The body is streamed
    so the ``_MAX_PDF_BYTES`` cap is enforced while downloading rather than
    after the whole response has been buffered in memory.

    Args:
        url: PDF URL to fetch.
        dest: Target file path; parent directories are created as needed.

    Raises:
        FeedError: Non-200 status, body larger than ``_MAX_PDF_BYTES``, or any
            httpx request failure (connect/read errors, timeouts, too many
            redirects). Mapping httpx errors to FeedError lets callers that
            skip a PDF on FeedError also survive network hiccups instead of
            aborting the whole run.
    """
    await asyncio.sleep(random.uniform(*_PDF_DELAY))
    payload = bytearray()
    try:
        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            async with client.stream(
                "GET", url, headers={"User-Agent": CRAWL_UA}
            ) as resp:
                if resp.status_code != 200:
                    raise FeedError(f"PDF 다운로드 {resp.status_code}: {url}")
                async for chunk in resp.aiter_bytes():
                    payload += chunk
                    if len(payload) > _MAX_PDF_BYTES:
                        # Stop reading as soon as the cap is crossed.
                        raise FeedError(
                            f"PDF 크기 초과 (> {_MAX_PDF_BYTES} bytes): {url}"
                        )
    except httpx.HTTPError as e:
        raise FeedError(f"PDF 다운로드 실패 ({type(e).__name__}): {url}") from e

    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(payload)
    return len(payload)
|
||||
|
||||
|
||||
async def _get_or_create_source(session) -> NewsSource:
    """Return the NewsSource row named ``_SOURCE_NAME``, inserting it if absent.

    A newly created row is added and flushed on *session*; committing is left
    to the caller.
    """
    lookup = select(NewsSource).where(NewsSource.name == _SOURCE_NAME)
    found = (await session.execute(lookup)).scalars().first()
    if found is not None:
        return found

    created = NewsSource(
        name=_SOURCE_NAME,
        feed_url=_SITEMAP_URL,
        feed_type="rss",
        fetch_method="sitemap+page",
        fulltext_policy="none",
        source_channel="crawl",
        category="Safety",
        language="en",
        country="US",
        # Kept out of the 6-hourly news cycle; this worker polls weekly instead.
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
|
||||
|
||||
|
||||
def _watermark(source: NewsSource) -> datetime | None:
    """Read the sitemap watermark stored in ``source.selector_override``.

    Returns None when the key is missing or empty, or when the stored value is
    not a valid ISO-8601 string.
    """
    stored = (source.selector_override or {}).get("sitemap_watermark")
    try:
        return datetime.fromisoformat(stored) if stored else None
    except ValueError:
        return None
|
||||
|
||||
|
||||
def _set_watermark(source: NewsSource, value: datetime) -> None:
    """Store *value* (ISO-8601) as the sitemap watermark on *source*.

    A fresh dict is assigned rather than mutating the existing one in place,
    so that the change to the JSON column is detected.
    """
    source.selector_override = {
        **(source.selector_override or {}),
        "sitemap_watermark": value.isoformat(),
    }
|
||||
|
||||
|
||||
async def _ingest_pdf(session, page_slug: str, pdf_url: str) -> bool:
    """Download one report PDF, store it, and register it for extraction.

    Returns:
        False when a Document with the same ``file_path`` already exists
        (nothing is downloaded); True after a new Document is flushed and the
        ``extract`` stage is enqueued.
    """
    fname = _safe_filename(Path(urlparse(pdf_url).path).name)
    rel_path = f"crawl_raw/csb/{page_slug}/{fname}"

    dup_query = select(Document).where(Document.file_path == rel_path).limit(1)
    if (await session.execute(dup_query)).scalars().first():
        return False

    dest = Path(settings.nas_mount_path) / rel_path
    size = await _download_pdf(pdf_url, dest)
    # Hash is taken from the bytes as stored on disk.
    digest = hashlib.sha256(dest.read_bytes()).hexdigest()
    stem = fname.rsplit(".", 1)[0]

    doc = Document(
        file_path=rel_path,
        file_hash=digest,
        file_format="pdf",
        file_size=size,
        file_type="immutable",
        title=stem.replace("_", " "),
        source_channel="crawl",
        data_origin="external",
        import_source="csb_sitemap",
        edit_url=pdf_url,
        ai_tags=["Safety/CSB/보고서"],
        extract_meta={"csb": {"page_slug": page_slug, "kind": "report_pdf"}},
    )
    session.add(doc)
    await session.flush()
    await enqueue_stage(session, doc.id, "extract")
    logger.info(f"[csb] PDF ingest: {rel_path} ({size} bytes)")
    return True
|
||||
|
||||
|
||||
async def _ingest_url(session, source: NewsSource, url: str, lastmod: datetime) -> dict:
    """Process one changed sitemap URL.

    The page is fetched, every PDF it links to is ingested (each with its own
    dedup), and the page body is stored only if no Document for this page
    exists yet. The PDF scan runs even for pages that were ingested before.

    Returns:
        Counts for this URL under the keys ``"page"``, ``"pdf"`` and ``"skip"``.
    """
    counts = {"page": 0, "pdf": 0, "skip": 0}
    try:
        html_text, final_url = await fetch_page(url)
    except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
        logger.warning(f"[csb] fetch 실패 skip: {url} — {type(e).__name__}: {e}")
        counts["skip"] = 1
        return counts

    last_segment = urlparse(url).path.strip("/").rpartition("/")[2]
    page_slug = _safe_filename(last_segment or "root")

    for pdf_url in _pdf_links(html_text, final_url):
        try:
            added = await _ingest_pdf(session, page_slug, pdf_url)
        except FeedError as e:
            logger.warning(f"[csb] PDF 실패 skip ({pdf_url}): {e}")
            continue
        if added:
            counts["pdf"] += 1

    # Page body is first-wins: an existing Document is never overwritten.
    normalized_url = _normalize_url(url)
    page_hash = hashlib.sha256(f"csb-page|{normalized_url}".encode()).hexdigest()[:32]
    dup_query = (
        select(Document)
        .where(
            (Document.file_hash == page_hash)
            | (Document.edit_url.in_([normalized_url, url]))
        )
        .limit(1)
    )
    if (await session.execute(dup_query)).scalars().first():
        return counts

    body, engine, engine_ver = _extract_body(html_text)
    if not engine:
        logger.info(f"[csb] 본문 부족 — 페이지 비적재 (PDF 만): {url}")
        return counts
    clean_body = _strip_article_footer(body.replace("\x00", ""))
    if len(clean_body) < _WEB_MIN_BODY_LEN:
        return counts

    now = datetime.now(timezone.utc)
    raw_path = _raw_html_path(source.id, page_hash, now)
    try:
        _save_raw_html(raw_path, html_text)
    except OSError as e:
        # Losing the raw HTML copy is logged but does not block the ingest.
        raw_saved = False
        logger.error(f"[csb] 원본 보존 실패 (ingest 는 진행): {e}")
    else:
        raw_saved = True

    title = _page_title(html_text, fallback=page_slug.replace("-", " ")[:90])
    fulltext_meta = {
        "status": "csb_sitemap",
        "engine": engine,
        "final_url": final_url,
        "raw_html_path": str(raw_path) if raw_saved else None,
        "body_chars": len(clean_body),
        "resolved_at": now.isoformat(),
    }
    doc = Document(
        file_path=f"crawl/{_SOURCE_NAME}/{page_hash}",
        file_hash=page_hash,
        file_format="article",
        file_size=0,
        file_type="note",
        title=title,
        extracted_text=f"{title}\n\n{clean_body}",
        extracted_at=now,
        extractor_version=f"sitemap+page@{engine}",
        md_content=clean_body,
        md_status="success",
        md_extraction_engine=engine,
        md_extraction_engine_version=engine_ver,
        md_format_version="1.0",
        md_generated_at=now,
        md_source_hash=hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest(),
        md_content_hash=hashlib.sha256(clean_body.encode("utf-8")).hexdigest(),
        content_origin="extracted",
        source_channel="crawl",
        data_origin="external",
        edit_url=normalized_url,
        review_status="approved",
        ai_domain="Safety",
        ai_sub_group=_SOURCE_NAME,
        ai_tags=["Safety/CSB"],
        extract_meta={
            "source_id": source.id,
            "source_name": _SOURCE_NAME,
            "published_at": lastmod.isoformat(),
            "fulltext": fulltext_meta,
        },
    )
    # Size is the UTF-8 byte length of the stored text (title + body).
    doc.file_size = len(doc.extracted_text.encode())
    session.add(doc)
    await session.flush()
    for stage in ("summarize", "embed", "chunk"):
        await enqueue_stage(session, doc.id, stage)
    counts["page"] = 1
    logger.info(f"[csb] page ingest {len(clean_body)}자 ({engine}): {title[:60]}")
    return counts
|
||||
|
||||
|
||||
async def run(bulk: bool = False, limit: int = 0) -> None:
    """Poll the sitemap and ingest URLs changed since the stored watermark.

    Args:
        bulk: When true, the per-run page cap is lifted.
        limit: When non-zero, an additional upper bound on processed URLs.

    Changed URLs are handled oldest ``lastmod`` first and the watermark is
    moved to each URL's ``lastmod`` as it is committed, so anything beyond the
    cap is picked up again by a later run. A sitemap fetch or parse failure
    records a health failure and ends the run.
    """
    now = datetime.now(timezone.utc)
    async with async_session() as session:
        source = await _get_or_create_source(session)
        await session.commit()
        source_id = source.id
        watermark = _watermark(source)

    try:
        xml_text, _ = await fetch_page(
            _SITEMAP_URL, content_types=("text/xml", "application/xml", "text/html")
        )
        entries = _parse_sitemap(xml_text)
        if not entries:
            raise FeedError("sitemap 파싱 0건 — 포맷 변경/부패 의심")
    except (CrawlBlocked, CrawlSkip, CrawlFetchError, FeedError) as e:
        logger.error(f"[csb] sitemap 수집 실패: {e}")
        async with async_session() as session:
            health = await _get_or_create_health(session, source_id)
            _record_failure(health, str(e) or repr(e), now)
            await session.commit()
        return

    # ">=" deliberately re-includes the boundary entry; dedup absorbs it.
    changed = [
        (url, lastmod)
        for url, lastmod in entries
        if not _should_skip(url) and (watermark is None or lastmod >= watermark)
    ]
    changed.sort(key=lambda entry: entry[1])

    if watermark is not None and len(changed) > _DIFF_SANITY:
        logger.error(
            f"[csb] diff {len(changed)}건 > sanity {_DIFF_SANITY} — "
            f"sitemap lastmod 남발/부패 의심 (cap 처리는 계속, 관찰 필요)"
        )

    cap = len(changed) if bulk else _RUN_PAGE_CAP
    if limit:
        cap = min(cap, limit)
    todo = changed[:cap]
    deferred = max(len(changed) - cap, 0)
    backlog_note = (
        f" (잔여 {deferred}건 — 워터마크 미전진으로 자동 이월)" if deferred else ""
    )
    logger.info(
        f"[csb] sitemap {len(entries)}건 중 변경 {len(changed)}건, 처리 {len(todo)}건"
        + backlog_note
    )

    totals = {"page": 0, "pdf": 0, "skip": 0}
    for i, (url, lastmod) in enumerate(todo, 1):
        # One session and one commit per URL, watermark included.
        async with async_session() as session:
            src = await session.get(NewsSource, source_id)
            counts = await _ingest_url(session, src, url, lastmod)
            _set_watermark(src, lastmod)
            await session.commit()
        for k in totals:
            totals[k] += counts[k]
        if i % 10 == 0:
            logger.info(f"[csb] 진행 {i}/{len(todo)} {totals}")

    async with async_session() as session:
        health = await _get_or_create_health(session, source_id)
        _record_success(health, totals["page"] + totals["pdf"], False, now)
        src = await session.get(NewsSource, source_id)
        src.last_fetched_at = now
        await session.commit()
    logger.info(f"[csb] 완료: {totals} (변경 {len(changed)}건 중 {len(todo)}건 처리)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point. --bulk lifts the per-run page cap; --limit puts an
    # upper bound on the number of pages processed (0 = no extra bound).
    cli = argparse.ArgumentParser(description="CSB sitemap diff 수집")
    cli.add_argument("--bulk", action="store_true", help="cap 해제 — 초기 일괄")
    cli.add_argument("--limit", type=int, default=0, help="처리 상한 (검증용)")
    options = cli.parse_args()
    asyncio.run(run(bulk=options.bulk, limit=options.limit))
|
||||
@@ -20,12 +20,12 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
import json
|
||||
import re
|
||||
from ai.client import AIClient, parse_json_response, strip_thinking
|
||||
from ai.client import AIClient, call_deep_or_defer, parse_json_response, strip_thinking
|
||||
from ai.envelope import EscalationEnvelope
|
||||
from core.config import settings
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.queue import ProcessingQueue
|
||||
from models.queue import ProcessingQueue, StageDeferred
|
||||
from policy.prompt_render import render_26b, policy_version as compute_policy_version
|
||||
from services.document_telemetry import record_analyze_event
|
||||
from services.search.llm_gate import Priority, acquire_mlx_gate
|
||||
@@ -101,17 +101,30 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
)
|
||||
|
||||
client = AIClient()
|
||||
# ds-macbook-offload-1: deep 슬롯 구성 시 맥북 M5 Max 경유(라우터). 부재 시 기존 경로 그대로.
|
||||
deep_cfg = client.ai.deep
|
||||
used_cfg = deep_cfg or settings.ai.primary
|
||||
latency_ms = 0
|
||||
parse_error: str | None = None
|
||||
deep_out = DeepSummaryOutput()
|
||||
|
||||
try:
|
||||
start = time.perf_counter()
|
||||
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1: classify-escalate worker
|
||||
raw = await client.call_primary(prompt)
|
||||
if deep_cfg is not None:
|
||||
# 맥북 경유 — 맥미니 mlx gate 미점유(게이트는 맥미니 보호 목적). 맥북 불가
|
||||
# (503/연결/생성 중 sleep 절단)는 StageDeferred = 보류, 맥미니 강등 없음.
|
||||
# doc 쓰기는 완주+파싱 후에만 일어나므로 어느 시점에 끊겨도 부분 쓰기 0.
|
||||
raw = await call_deep_or_defer(client, prompt)
|
||||
else:
|
||||
async with acquire_mlx_gate(Priority.BACKGROUND): # 2026-05-17 B-1: classify-escalate worker
|
||||
raw = await client.call_primary(prompt)
|
||||
latency_ms = int((time.perf_counter() - start) * 1000)
|
||||
except StageDeferred:
|
||||
# 보류는 실패가 아님 — analyze_event 미기록(가짜 완료 방지), consumer 가 백오프 기록.
|
||||
logger.info(f"[deep] id={document_id} 맥북 일시 불가 — 보류 (deferred)")
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.warning(f"[deep] 26B 호출 실패 id={document_id}: {exc}")
|
||||
logger.warning(f"[deep] 호출 실패 id={document_id} model={used_cfg.model}: {exc}")
|
||||
parse_error = "call_failed"
|
||||
raw = ""
|
||||
finally:
|
||||
@@ -147,12 +160,13 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
doc_id=document_id,
|
||||
user_id=None,
|
||||
mode="summary_deep",
|
||||
text_limit=settings.ai.primary.context_char_limit or 260000,
|
||||
text_limit=used_cfg.context_char_limit or 260000,
|
||||
truncated=False,
|
||||
layers_returned=["detail_summary", "inconsistencies"] if not parse_error else [],
|
||||
cached=False,
|
||||
latency_ms=latency_ms,
|
||||
model_name=settings.ai.primary.model,
|
||||
# deep 슬롯 사용 시 실처리 모델(qwen-macbook alias) 기록 — 어느 머신이 처리했는지 추적
|
||||
model_name=used_cfg.model,
|
||||
prompt_version=(f"{DEEP_SUMMARY_TASK}@{pv}" if pv else DEEP_SUMMARY_TASK),
|
||||
error_code=parse_error,
|
||||
source="document_server",
|
||||
|
||||
@@ -0,0 +1,320 @@
|
||||
"""fulltext 승격 워커 (A-2 + A-7, plan crawl-24x7-1)
|
||||
|
||||
news_collector 가 fulltext_policy='page' 소스의 기사에 enqueue 한 'fulltext' stage 를 소비:
|
||||
기사 페이지 politeness fetch (A-4) → 원본 HTML NAS gzip 보존 (A-7)
|
||||
→ extract_worker 4-tier 재사용 (tier 2 sibling .md 는 디스크 원본이 없어 비적용)
|
||||
→ extracted_text/md_content 승격 → summarize + (30일 게이트) embed/chunk enqueue.
|
||||
|
||||
실패 처리 (큐 어휘 = DB enum, 분기만 워커):
|
||||
- 일시 오류 (5xx/timeout) : raise → 큐 재시도 (max_attempts 3)
|
||||
- 차단/비대상 (403/429/robots/비HTML/추출부족): RSS 요약으로 격하(degrade) 후 완료
|
||||
→ summarize/embed/chunk enqueue 보장 (기사 유실 0). 격하 사유는 extract_meta.fulltext 에 기록.
|
||||
- 영구 실패 (3회 소진) : 야간 reconcile_unresolved() 가 summarize 안전망 enqueue
|
||||
([[feedback_silent_skip_accumulation]] — 조건부 skip 이 영구 침묵으로 누적되지 않게).
|
||||
|
||||
승격 게이트: 전 tier 공통 본문 >= 200자 (devonagent 와 달리 tier 4 도 게이트 적용 —
|
||||
페이월/오류 페이지의 nav 찌꺼기를 본문으로 승격하느니 RSS 요약 격하가 낫다).
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import exists, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import aliased
|
||||
|
||||
from core.config import settings
|
||||
from core.crawl_politeness import (
|
||||
CrawlBlocked,
|
||||
CrawlFetchError,
|
||||
CrawlSkip,
|
||||
fetch_page,
|
||||
fetch_page_via_browser,
|
||||
probe_session,
|
||||
)
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import ProcessingQueue, enqueue_stage
|
||||
from workers.extract_worker import (
|
||||
_WEB_MIN_BODY_LEN,
|
||||
_extract_web_with_bs4,
|
||||
_extract_web_with_readability,
|
||||
_extract_web_with_trafilatura,
|
||||
)
|
||||
|
||||
logger = setup_logger("fulltext_worker")
|
||||
|
||||
# 한국 기사 푸터 1층 후처리 (A-2) — 보수적으로 라인 단위만 제거
|
||||
_FOOTER_PATTERNS = [
|
||||
re.compile(r"^.{0,120}(무단\s*전재|무단\s*복제|재배포\s*금지|저작권자\s*[ⓒ©(]).*$", re.M),
|
||||
re.compile(r"^[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}\s*$", re.M), # 단독 이메일 라인
|
||||
re.compile(r"^\s*\S{2,4}\s*기자\s*$", re.M), # 단독 '◯◯◯ 기자' 라인
|
||||
]
|
||||
|
||||
|
||||
def _strip_article_footer(body: str) -> str:
|
||||
for pat in _FOOTER_PATTERNS:
|
||||
body = pat.sub("", body)
|
||||
return re.sub(r"\n{3,}", "\n\n", body).strip()
|
||||
|
||||
|
||||
def _extract_body(html_text: str) -> tuple[str, str | None, str | None]:
    """Run the extractor tiers in order and return the first acceptable body.

    Tiers are tried as trafilatura, then readability, then bs4. A tier is
    accepted only when it yields a non-empty body of at least
    ``_WEB_MIN_BODY_LEN`` characters.

    Returns:
        ``(body, engine, engine_version)`` for the first accepted tier, or
        ``("", None, None)`` when every tier falls short.
    """
    tiers = (
        ("trafilatura", _extract_web_with_trafilatura),
        ("readability", _extract_web_with_readability),
        ("bs4_text", _extract_web_with_bs4),
    )
    for engine_name, extractor in tiers:
        text, version = extractor(html_text)
        if text and len(text) >= _WEB_MIN_BODY_LEN:
            return text, engine_name, version
    return "", None, None
|
||||
|
||||
|
||||
def _raw_html_path(source_id: int | None, file_hash: str, now: datetime) -> Path:
    """Build the NAS path under which a page's raw HTML is archived (A-7).

    Layout: ``<nas_mount_path>/crawl_raw/src_<source_id>/<YYYY-MM>/<hash>.html.gz``,
    with ``src_unknown`` standing in when ``source_id`` is None. The numeric
    source id is used for the directory name to sidestep NFC/NFD asymmetry of
    Korean directory names.

    ``file_hash`` is stripped because the DB column is character(64): a
    32-character hash comes back space-padded, and leaving the padding in
    would put 32 spaces into the NAS file name (a shell/rsync trap).
    """
    bucket = "src_unknown" if source_id is None else f"src_{source_id}"
    month_dir = now.strftime("%Y-%m")
    filename = f"{file_hash.strip()}.html.gz"
    return Path(settings.nas_mount_path) / "crawl_raw" / bucket / month_dir / filename
|
||||
|
||||
|
||||
def _save_raw_html(path: Path, html_text: str) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with gzip.open(path, "wb") as f:
|
||||
f.write(html_text.encode("utf-8", errors="replace"))
|
||||
|
||||
|
||||
async def _enqueue_downstream(session: AsyncSession, doc: Document) -> None:
    """Enqueue the follow-up stages shared by the promote and degrade paths.

    ``summarize`` is always enqueued. ``embed`` and ``chunk`` are enqueued
    unconditionally for ``source_channel == "crawl"`` documents (domain
    material corpus — indexed regardless of publication date) and, for every
    other channel, only when the article is at most 30 days old according to
    ``extract_meta["published_at"]``.

    A missing or unusable ``published_at`` counts as age 0 (treated as new).
    A timestamp without timezone information is interpreted as UTC; without
    that, subtracting it from an aware "now" raises TypeError, which the
    previous ``except ValueError`` did not catch.
    """
    await enqueue_stage(session, doc.id, "summarize")

    if doc.source_channel == "crawl":
        # The 30-day gate applies to news only.
        await enqueue_stage(session, doc.id, "embed")
        await enqueue_stage(session, doc.id, "chunk")
        return

    days_old = 0
    published_raw = (doc.extract_meta or {}).get("published_at")
    if published_raw:
        try:
            pub_dt = datetime.fromisoformat(published_raw)
            if pub_dt.tzinfo is None:
                pub_dt = pub_dt.replace(tzinfo=timezone.utc)
            days_old = (datetime.now(timezone.utc) - pub_dt).days
        except (TypeError, ValueError):
            # Unparseable (or non-string) value = treat as new, same as the
            # default applied at collection time.
            days_old = 0

    if days_old <= 30:
        await enqueue_stage(session, doc.id, "embed")
        await enqueue_stage(session, doc.id, "chunk")
|
||||
|
||||
|
||||
def _set_fulltext_meta(doc: Document, **fields) -> None:
|
||||
"""extract_meta.fulltext 갱신 — JSONB 변경 감지를 위해 dict 재할당."""
|
||||
meta = dict(doc.extract_meta or {})
|
||||
meta["fulltext"] = {**meta.get("fulltext", {}), **fields}
|
||||
doc.extract_meta = meta
|
||||
|
||||
|
||||
_PROBE_TTL_SECONDS = 6 * 3600 # probe 유효 시간 — 만료 시 배치 경계에서 재검증
|
||||
|
||||
|
||||
async def _auth_session_ready(session: AsyncSession, source: NewsSource) -> tuple[bool, str]:
    """Content-based probe gate for subscription sources, plus consumption of
    the ``relogin_requested`` flag (manual half-open) — B-3 (2).

    The flag is consumed *before* the "unavailable, skip" branch so that it is
    reached on every adapter tick (guards against the r5 "dead button" trap).
    While the probe is in a failed state no authenticated fetch is made at all
    (an automatic retry loop leads straight to account lock-out — B-3 (3)).
    Recovery path: refresh storage_state, then set ``relogin_requested``
    manually.

    Probe settings are read from the ``source.selector_override`` JSONB:
    ``probe_url`` / ``min_body_chars`` / ``paywall_markers``.

    Returns:
        ``(True, "")`` when the session may be used, otherwise
        ``(False, reason)``.
    """
    # Function-scope import — NOTE(review): presumably avoids an import cycle
    # with news_collector; confirm.
    from workers.news_collector import _get_or_create_health

    health = await _get_or_create_health(session, source.id)
    now = datetime.now(timezone.utc)
    cfg = source.selector_override or {}
    probe_url = cfg.get("probe_url")

    force = False
    if health.relogin_requested:
        health.relogin_requested = False  # consumed = one half-open attempt
        health.updated_at = now
        force = True
        logger.info(f"[fulltext/auth] {source.name} relogin_requested 소비 — half-open probe")

    if not force:
        # Explicit False only: None (never probed) falls through to a probe.
        if health.last_probe_ok is False:
            return False, "probe 실패 상태 (storage_state 갱신 + relogin_requested 대기)"
        # A successful probe younger than the TTL is reused without re-probing.
        if (
            health.last_probe_ok
            and health.last_probe_at
            and (now - health.last_probe_at).total_seconds() < _PROBE_TTL_SECONDS
        ):
            return True, ""

    if not probe_url:
        return False, "selector_override.probe_url 미설정"

    result = await probe_session(
        source.auth_profile,
        probe_url,
        int(cfg.get("min_body_chars", 800)),
        list(cfg.get("paywall_markers", [])),
    )
    # Record the probe outcome on the health row; this function does not
    # commit the session itself.
    health.last_probe_at = now
    health.last_probe_ok = bool(result.get("ok"))
    health.updated_at = now
    if not health.last_probe_ok:
        logger.warning(f"[fulltext/auth] {source.name} probe 실패: {result.get('reason')}")
        return False, str(result.get("reason"))
    logger.info(f"[fulltext/auth] {source.name} probe OK ({result.get('body_chars')}자)")
    return True, ""
|
||||
|
||||
|
||||
async def _degrade(session: AsyncSession, doc: Document, reason: str) -> None:
    """Give up on full-text promotion and continue with the RSS summary.

    Records ``status="degraded"``, the reason (truncated to 300 characters)
    and a UTC ``resolved_at`` timestamp in ``extract_meta.fulltext``, then
    enqueues the downstream stages so the article is not lost.
    """
    resolved = datetime.now(timezone.utc).isoformat()
    _set_fulltext_meta(doc, status="degraded", reason=reason[:300], resolved_at=resolved)
    await _enqueue_downstream(session, doc)
    logger.warning(f"[fulltext] doc={doc.id} 격하(RSS 요약 유지): {reason}")
|
||||
|
||||
|
||||
async def process(document_id: int, session: AsyncSession) -> None:
    """Promote one article to full text.

    Uses the queue_consumer convention signature; the commit is left to the
    consumer (this function never commits).

    Flow: fetch the article page (browser session for sources with an
    ``auth_profile``, plain fetch otherwise) → archive the raw HTML → extract
    the body → strip footer lines → write the promoted text onto the document
    → enqueue downstream stages. Blocked/skipped fetches, failed extraction,
    too-short bodies and paywall-marker hits degrade to the RSS summary via
    ``_degrade`` instead of failing.

    Raises:
        ValueError: the document does not exist.
        CrawlFetchError: transient fetch error, re-raised so the queue retries.
    """
    doc = await session.get(Document, document_id)
    if not doc:
        raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
    if not doc.edit_url:
        await _degrade(session, doc, "edit_url 없음")
        return

    meta = doc.extract_meta or {}
    source_id = meta.get("source_id")

    # B-3: subscription sources (auth_profile) are fetched through a
    # Playwright session — the probe gate runs first.
    source = await session.get(NewsSource, source_id) if source_id else None
    auth_profile = source.auth_profile if source is not None else None

    if auth_profile:
        ready, why = await _auth_session_ready(session, source)
        if not ready:
            await _degrade(session, doc, f"구독 세션 불가용: {why}")
            return

    try:
        if auth_profile:
            html_text, final_url = await fetch_page_via_browser(doc.edit_url, auth_profile)
        else:
            html_text, final_url = await fetch_page(doc.edit_url)
    except (CrawlBlocked, CrawlSkip) as e:
        await _degrade(session, doc, f"{type(e).__name__}: {e}")
        return
    except CrawlFetchError:
        raise  # transient error — let the queue retry

    now = datetime.now(timezone.utc)

    # A-7: keep the raw HTML (so a full re-extraction stays possible when the
    # extractor is replaced).
    raw_path = _raw_html_path(source_id, doc.file_hash, now)
    try:
        _save_raw_html(raw_path, html_text)
        raw_saved = True
    except OSError as e:
        # On a transient NAS failure only the archive copy is lost; promotion
        # still proceeds and the cause is logged (no silent omission).
        raw_saved = False
        logger.error(f"[fulltext] doc={doc.id} 원본 보존 실패 (승격은 진행): {e}")

    body, engine, engine_ver = _extract_body(html_text)
    if not engine:
        await _degrade(session, doc, f"추출 실패 (전 tier < {_WEB_MIN_BODY_LEN}자)")
        return

    # NUL characters are removed before footer stripping.
    clean_body = _strip_article_footer(body.replace("\x00", ""))
    if len(clean_body) < _WEB_MIN_BODY_LEN:
        await _degrade(session, doc, "푸터 제거 후 본문 부족")
        return

    # B-3: the extracted text is gated by paywall markers as well — blocks
    # promotion of a "paywall notice" body (silent corruption) from a session
    # that expired after passing the probe, and demotes the probe state at once.
    if auth_profile:
        from workers.news_collector import _get_or_create_health

        markers = (source.selector_override or {}).get("paywall_markers", [])
        # Case-insensitive substring match; empty markers are ignored.
        hit = next((m for m in markers if m and m.lower() in clean_body.lower()), None)
        if hit:
            health = await _get_or_create_health(session, source.id)
            health.last_probe_ok = False
            health.updated_at = datetime.now(timezone.utc)
            await _degrade(session, doc, f"본문 페이월 마커 검출({hit}) — 세션 손상 의심")
            return

    title = doc.title or ""
    doc.extracted_text = f"{title}\n\n{clean_body}" if title else clean_body
    doc.extracted_at = now
    doc.extractor_version = f"rss+page@{engine}"
    doc.md_content = clean_body
    doc.md_status = "success"
    doc.md_extraction_engine = engine
    doc.md_extraction_engine_version = engine_ver
    doc.md_format_version = "1.0"
    doc.md_generated_at = now
    # Source hash covers the fetched HTML; content hash covers the cleaned body.
    doc.md_source_hash = hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest()
    doc.md_content_hash = hashlib.sha256(clean_body.encode("utf-8")).hexdigest()
    doc.md_extraction_error = None  # clear the "not a conversion target" marker set at collection time
    doc.content_origin = "extracted"
    doc.file_size = len(doc.extracted_text.encode())
    _set_fulltext_meta(
        doc, status="promoted", engine=engine,
        raw_html_path=str(raw_path) if raw_saved else None,
        final_url=final_url, body_chars=len(clean_body),
        resolved_at=now.isoformat(),
    )

    await _enqueue_downstream(session, doc)
    logger.info(
        f"[fulltext/{engine}] doc={doc.id} {len(clean_body)}자 승격 "
        f"(raw={'saved' if raw_saved else 'MISSING'})"
    )
|
||||
|
||||
|
||||
async def reconcile_unresolved() -> None:
    """Safety net (run once nightly): for news documents whose ``fulltext``
    stage failed permanently (3 attempts exhausted) and which therefore never
    got a ``summarize`` row, enqueue the downstream stages based on the RSS
    summary.

    Idempotent — once the stages are enqueued the selection condition no
    longer matches. At most 200 queue rows are considered per call.
    """
    async with async_session() as session:
        # The outer query already has ProcessingQueue in its FROM clause;
        # without an alias, auto-correlation would strip every FROM from the
        # subquery → InvalidRequestError (same pattern as
        # queue_consumer.reset_stale_items).
        pq = aliased(ProcessingQueue)
        summarize_q = (
            select(pq.id)
            .where(
                pq.document_id == Document.id,
                pq.stage == "summarize",
            )
        )
        result = await session.execute(
            select(Document)
            .join(ProcessingQueue, ProcessingQueue.document_id == Document.id)
            .where(
                ProcessingQueue.stage == "fulltext",
                ProcessingQueue.status == "failed",
                Document.source_channel == "news",
                ~exists(summarize_q),
            )
            .limit(200)
        )
        # unique(): the join can yield the same Document more than once.
        docs = result.scalars().unique().all()
        for doc in docs:
            _set_fulltext_meta(doc, status="failed_reconciled")
            await _enqueue_downstream(session, doc)
        if docs:
            await session.commit()
            logger.warning(f"[fulltext] reconcile: 영구 실패 {len(docs)}건 RSS 요약으로 후속 enqueue")
|
||||
@@ -0,0 +1,351 @@
|
||||
"""C-2 KOSHA Open API 수집 워커 (plan crawl-24x7-1).
|
||||
|
||||
3 API (2026-06-10 실키 live 검증 + fixture 박제 — tests/fixtures/kosha_*_response.json):
|
||||
재해사례 게시판: GET /B552468/disaster_api02/getdisaster_api02 callApiId=1060
|
||||
재해사례 첨부: GET /B552468/disaster_attach_api02/Disaster_attach_api02 callApiId=1070
|
||||
KOSHA GUIDE: GET /B552468/koshaguide/getKoshaGuide callApiId=1050
|
||||
|
||||
daily 스케줄 1회 (main.py):
|
||||
재해사례 = 최근 페이지만 diff (boardno dedup) — 사례 본문 Document(텍스트 네이티브)
|
||||
+ 첨부 PDF/HWP 다운로드 → /documents/crawl_raw/kosha/{boardno}/ 저장
|
||||
→ 파일 Document + extract enqueue (kordoc HWP/PDF 기존 파이프라인 재사용).
|
||||
GUIDE = 전체 레지스트리 메타 diff (1039건, 100/page = 11 call) → 신규/개정만,
|
||||
일일 ingest cap(기본 25) = backlog 자동 점진 백필(~6주) + 부하 평탄화.
|
||||
cap 으로 미처리 잔량은 매회 로그 (silent cap 금지).
|
||||
|
||||
키: KOSHA_API_KEY (credentials.env) — 공공데이터포털 '인코딩' 키를 그대로 저장.
|
||||
httpx params= 로 넘기면 % 가 재인코딩되므로 반드시 URL 문자열에 직접 결합.
|
||||
개정 감지: GUIDE dedup 키 = 규정번호+공표일자 — 같은 번호의 새 공표일자 = 신규 문서로 적재.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.config import settings
|
||||
from core.crawl_politeness import CRAWL_UA
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import enqueue_stage
|
||||
from workers.news_collector import (
|
||||
FeedError,
|
||||
_get_or_create_health,
|
||||
_record_failure,
|
||||
_record_success,
|
||||
)
|
||||
|
||||
logger = setup_logger("kosha_collector")
|
||||
|
||||
_BASE = "https://apis.data.go.kr/B552468"
|
||||
_BOARD_EP = f"{_BASE}/disaster_api02/getdisaster_api02"
|
||||
_ATTACH_EP = f"{_BASE}/disaster_attach_api02/Disaster_attach_api02"
|
||||
_GUIDE_EP = f"{_BASE}/koshaguide/getKoshaGuide"
|
||||
|
||||
_CASE_SOURCE = "KOSHA 재해사례"
|
||||
_GUIDE_SOURCE = "KOSHA GUIDE"
|
||||
|
||||
_CASE_PAGES = 2 # daily diff 범위 (30×2 = 최근 60건 — 등록일 역순 API)
|
||||
_CASE_ROWS = 30
|
||||
_GUIDE_ROWS = 100
|
||||
_GUIDE_DAILY_CAP = int(os.getenv("KOSHA_GUIDE_DAILY_CAP", "25"))
|
||||
_MAX_FILE_BYTES = 50 * 1024 * 1024
|
||||
_DOWNLOAD_DELAY = (2.0, 5.0) # portal.kosha.or.kr 파일서버 — 연속 다운로드 간격
|
||||
|
||||
|
||||
def _api_key() -> str:
|
||||
key = os.getenv("KOSHA_API_KEY", "")
|
||||
if not key:
|
||||
raise FeedError("KOSHA_API_KEY 미설정 — KOSHA 수집 불가")
|
||||
return key
|
||||
|
||||
|
||||
async def _api_get(url: str) -> dict:
    """GET a KOSHA Open API URL and return the decoded JSON payload.

    Both error layers are checked: the HTTP status / non-JSON body (gateway
    errors such as SERVICE_KEY_IS_NOT_REGISTERED arrive as XML or plain text)
    and the provider's own ``header.resultCode``.

    Raises:
        FeedError: non-200 status, non-JSON body, or ``resultCode != "00"``.
            The status message contains only the part of the URL before ``?``.
    """
    request_headers = {"User-Agent": CRAWL_UA}
    async with httpx.AsyncClient(timeout=25) as http:
        response = await http.get(url, headers=request_headers)

    status = response.status_code
    if status != 200:
        endpoint = url.split('?')[0]
        raise FeedError(f"KOSHA API {status} @ {endpoint}")

    try:
        data = response.json()
    except ValueError as exc:
        raise FeedError(f"KOSHA API 비-JSON 응답: {response.text[:120]}") from exc

    header = data.get("header") or {}
    result_code = header.get("resultCode")
    if result_code != "00":
        raise FeedError(f"KOSHA API resultCode={result_code}: {header.get('resultMsg')}")
    return data
|
||||
|
||||
|
||||
def _items(payload: dict) -> list[dict]:
|
||||
"""body.items.item — 단건이면 dict 로 오는 data.go.kr 관행 방어."""
|
||||
item = ((payload.get("body") or {}).get("items") or {}).get("item")
|
||||
if item is None:
|
||||
return []
|
||||
return [item] if isinstance(item, dict) else list(item)
|
||||
|
||||
|
||||
def _safe_filename(name: str) -> str:
|
||||
"""NAS 파일명 정화 — 경로분리자/제어문자/공백연쇄 제거 (쉘 함정 회피)."""
|
||||
name = re.sub(r"[/\\\x00-\x1f]", "_", name).strip()
|
||||
name = re.sub(r"\s+", " ", name)
|
||||
return name[:140] or "unnamed"
|
||||
|
||||
|
||||
async def _download(url: str, dest: Path) -> int:
    """Download an attachment / guide file to ``dest`` and return its size.

    Sleeps for a random interval from ``_DOWNLOAD_DELAY`` first, to space out
    consecutive downloads. The body is streamed and the transfer is aborted
    as soon as more than ``_MAX_FILE_BYTES`` have arrived, so an oversized
    file is never held in memory in full. Nothing is written to ``dest``
    unless the whole download succeeded; parent directories are created on
    demand.

    Raises:
        FeedError: non-200 status, size cap exceeded, or any httpx request
            error (timeout, connection failure, ...). Request errors are
            converted so that the callers' per-file ``except FeedError`` skip
            also covers them instead of aborting the whole collection run.
    """
    await asyncio.sleep(random.uniform(*_DOWNLOAD_DELAY))
    received = bytearray()
    try:
        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
            async with client.stream("GET", url, headers={"User-Agent": CRAWL_UA}) as resp:
                if resp.status_code != 200:
                    raise FeedError(f"파일 다운로드 {resp.status_code}: {url}")
                async for chunk in resp.aiter_bytes():
                    received += chunk
                    if len(received) > _MAX_FILE_BYTES:
                        raise FeedError(f"파일 크기 초과 ({len(received)} bytes): {url}")
    except httpx.HTTPError as e:
        raise FeedError(f"파일 다운로드 오류 ({type(e).__name__}): {url}") from e
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(received)
    return len(received)
|
||||
|
||||
|
||||
async def _get_or_create_source(session, name: str, feed_url: str) -> NewsSource:
    """Return the ``NewsSource`` row named ``name``, creating it if absent.

    A newly created row is added and flushed (not committed). It is created
    with ``enabled=False``: it is not part of the 6h news cycle — this worker
    polls it daily.
    """
    lookup = select(NewsSource).where(NewsSource.name == name)
    found = (await session.execute(lookup)).scalars().first()
    if found is not None:
        return found

    created = NewsSource(
        name=name, feed_url=feed_url, feed_type="rss", fetch_method="api",
        fulltext_policy="none", source_channel="crawl", category="Safety",
        language="ko", country="KR",
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
|
||||
|
||||
|
||||
async def _ingest_attachment(session, boardno: str, filenm: str, filepath: str) -> bool:
    """Ingest one case attachment: save it to the NAS, create its file
    ``Document`` and enqueue the ``extract`` stage.

    Deduplication is by the relative path
    ``crawl_raw/kosha/<boardno>/<sanitized name>``.

    Returns:
        True when a new document was created, False when one already exists
        for that path (nothing is downloaded in that case).
    """
    safe_name = _safe_filename(filenm)
    rel_path = f"crawl_raw/kosha/{boardno}/{safe_name}"

    dup_query = select(Document).where(Document.file_path == rel_path).limit(1)
    if (await session.execute(dup_query)).scalars().first():
        return False

    target = Path(settings.nas_mount_path) / rel_path
    size = await _download(filepath, target)

    # Split on the last dot: extension (lower-cased, max 10 chars, "bin" when
    # there is none) becomes the file format, the rest becomes the title.
    stem, dot, suffix = safe_name.rpartition(".")
    ext = (suffix.lower() if dot else "bin")[:10]
    title = stem if dot else safe_name
    digest = hashlib.sha256(target.read_bytes()).hexdigest()

    doc = Document(
        file_path=rel_path,
        file_hash=digest,
        file_format=ext,
        file_size=size,
        file_type="immutable",
        title=title,
        source_channel="crawl",
        data_origin="external",
        import_source="kosha_api",
        edit_url=filepath,
        ai_tags=["Safety/KOSHA재해사례/첨부"],
        extract_meta={"kosha": {"boardno": boardno, "kind": "case_attachment"}},
    )
    session.add(doc)
    await session.flush()
    # extract → (crawl override) classify → embed/chunk: reuses the existing
    # file pipeline.
    await enqueue_stage(session, doc.id, "extract")
    logger.info(f"[kosha] 첨부 ingest: {rel_path} ({size} bytes)")
    return True
|
||||
|
||||
|
||||
async def collect_disaster_cases(session) -> int:
    """Daily diff of KOSHA disaster cases.

    Reads the most recent ``_CASE_PAGES`` pages of the board API and creates a
    text-native ``Document`` for every case not seen before (dedup key: a
    hash of ``boardno``), enqueuing summarize/embed/chunk. For cases that
    report attachments, the attachment API is queried and each file is
    ingested via ``_ingest_attachment``.

    Nothing is committed here; rows are only added and flushed.

    Returns:
        Number of newly created case documents (attachments not counted).
    """
    key = _api_key()
    source = await _get_or_create_source(session, _CASE_SOURCE, _BOARD_EP)
    new_count = 0

    for page in range(1, _CASE_PAGES + 1):
        # The key is concatenated into the URL string directly (not passed as
        # query params).
        payload = await _api_get(
            f"{_BOARD_EP}?serviceKey={key}&callApiId=1060&pageNo={page}&numOfRows={_CASE_ROWS}"
        )
        items = _items(payload)
        if not items:
            break
        page_all_dup = True
        for item in items:
            boardno = str(item.get("boardno") or "").strip()
            title = (item.get("keyword") or "").strip()
            # Rows without a board number or title are skipped; they do not
            # affect page_all_dup.
            if not boardno or not title:
                continue
            fhash = hashlib.sha256(f"kosha-case|{boardno}".encode()).hexdigest()[:32]
            existing = await session.execute(
                select(Document).where(Document.file_hash == fhash).limit(1)
            )
            if existing.scalars().first():
                continue
            page_all_dup = False

            contents = (item.get("contents") or "").strip()
            business = (item.get("business") or "").strip()
            now = datetime.now(timezone.utc)
            doc = Document(
                file_path=f"crawl/{_CASE_SOURCE}/{boardno}",
                file_hash=fhash,
                file_format="article",
                file_size=len(contents.encode()),
                file_type="note",
                title=title,
                extracted_text=f"{title}\n\n[{business}]\n{contents}",
                extracted_at=now,
                extractor_version="kosha_api",
                md_status="skipped",
                md_extraction_error="kosha case: 텍스트 네이티브, markdown 변환 비대상",
                source_channel="crawl",
                data_origin="external",
                review_status="approved",
                ai_domain="Safety",
                ai_sub_group=_CASE_SOURCE,
                ai_tags=[f"Safety/KOSHA재해사례/{business or '기타'}"],
                extract_meta={
                    "source_id": source.id,
                    "source_name": _CASE_SOURCE,
                    "published_at": None,
                    "kosha": {"boardno": boardno, "business": business,
                              "atcflcnt": item.get("atcflcnt")},
                },
            )
            session.add(doc)
            await session.flush()
            await enqueue_stage(session, doc.id, "summarize")
            await enqueue_stage(session, doc.id, "embed")
            await enqueue_stage(session, doc.id, "chunk")
            new_count += 1

            # Attachments (PDF/HWP) — the formal case report, richer than the
            # board text.
            if int(item.get("atcflcnt") or 0) > 0:
                attach = await _api_get(
                    f"{_ATTACH_EP}?serviceKey={key}&callApiId=1070"
                    f"&pageNo=1&numOfRows=10&boardno={boardno}"
                )
                for att in _items(attach):
                    filenm = (att.get("filenm") or "").strip()
                    filepath = (att.get("filepath") or "").strip()
                    # Only named files with an https:// URL are downloaded.
                    if not filenm or not filepath.startswith("https://"):
                        continue
                    try:
                        await _ingest_attachment(session, boardno, filenm, filepath)
                    except FeedError as e:
                        # A failed attachment is skipped; the case itself stays.
                        logger.warning(f"[kosha] 첨부 실패 skip ({boardno}/{filenm}): {e}")
        if page_all_dup:
            break  # newest-first order — if a whole page is known, later pages are too

    logger.info(f"[kosha] 재해사례 신규 {new_count}건")
    return new_count
|
||||
|
||||
|
||||
async def collect_kosha_guide(session, cap: int = _GUIDE_DAILY_CAP) -> int:
    """Diff the whole KOSHA GUIDE registry metadata and download only new or
    revised guides, limited to ``cap`` per call (gradual daily backfill).

    The dedup key is a hash of guide number + publication date, so a new
    publication date for an existing number is ingested as a new document.
    Nothing is committed here; rows are only added and flushed.

    Args:
        session: DB session used for lookups and inserts.
        cap: maximum number of guides to attempt in this call.

    Returns:
        Number of guide documents actually ingested (failed downloads are
        skipped and not counted).
    """
    key = _api_key()
    await _get_or_create_source(session, _GUIDE_SOURCE, _GUIDE_EP)
    new_specs: list[dict] = []
    page, total = 1, None

    # Pass 1: page through the registry and collect entries with no Document yet.
    while True:
        payload = await _api_get(
            f"{_GUIDE_EP}?serviceKey={key}&callApiId=1050&pageNo={page}&numOfRows={_GUIDE_ROWS}"
        )
        if total is None:
            # totalCount is taken from the first page only.
            total = int((payload.get("body") or {}).get("totalCount") or 0)
        items = _items(payload)
        if not items:
            break
        for item in items:
            no = (item.get("techGdlnNo") or "").strip()
            ymd = (item.get("techGdlnOfancYmd") or "").strip()
            url = (item.get("fileDownloadUrl") or "").strip()
            # Entries without a number or an https:// download URL are ignored.
            if not no or not url.startswith("https://"):
                continue
            fhash = hashlib.sha256(f"kosha-guide|{no}|{ymd}".encode()).hexdigest()[:32]
            existing = await session.execute(
                select(Document).where(Document.file_hash == fhash).limit(1)
            )
            if not existing.scalars().first():
                new_specs.append({"no": no, "ymd": ymd, "url": url,
                                  "name": (item.get("techGdlnNm") or no).strip(),
                                  "fhash": fhash})
        if page * _GUIDE_ROWS >= total:
            break
        page += 1

    # Pass 2: download and register at most `cap` of the pending entries.
    todo, deferred = new_specs[:cap], len(new_specs) - min(len(new_specs), cap)
    ingested = 0
    for spec in todo:
        safe_no = _safe_filename(spec["no"])
        rel_path = f"crawl_raw/kosha_guide/{safe_no}-{spec['ymd'] or 'nodate'}.pdf"
        dest = Path(settings.nas_mount_path) / rel_path
        try:
            size = await _download(spec["url"], dest)
        except FeedError as e:
            logger.warning(f"[kosha] GUIDE 다운로드 실패 skip ({spec['no']}): {e}")
            continue
        doc = Document(
            file_path=rel_path,
            file_hash=spec["fhash"],
            file_format="pdf",
            file_size=size,
            file_type="immutable",
            title=f"{spec['name']} ({spec['no']})",
            source_channel="crawl",
            data_origin="external",
            import_source="kosha_api",
            edit_url=spec["url"],
            ai_tags=["Safety/KOSHA GUIDE"],
            extract_meta={"kosha": {"kind": "guide", "techGdlnNo": spec["no"],
                                    "ofancYmd": spec["ymd"]}},
        )
        session.add(doc)
        await session.flush()
        await enqueue_stage(session, doc.id, "extract")
        ingested += 1

    # No silent cap — make the remainder visible (gradual backfill: the next
    # run digests another `cap` entries).
    logger.info(f"[kosha] GUIDE 신규/개정 {len(new_specs)}건 중 {ingested}건 ingest"
                + (f" (cap {cap}, 잔여 {deferred}건 — 일일 점진 백필)" if deferred > 0 else ""))
    return ingested
|
||||
|
||||
|
||||
async def run() -> None:
    """Daily entry point — runs each KOSHA collector in its own session so a
    failure in one source (e.g. disaster cases) does not block the other
    (GUIDE).

    On success the collector's work and a success health record are committed
    together. On failure the partial work is rolled back and only a failure
    health record is committed — and only for a source row that already
    existed before this run (a row created inside the failed transaction is
    rolled back with it).
    """
    now = datetime.now(timezone.utc)
    for name, collector in ((_CASE_SOURCE, collect_disaster_cases),
                            (_GUIDE_SOURCE, collect_kosha_guide)):
        async with async_session() as session:
            result = await session.execute(select(NewsSource).where(NewsSource.name == name))
            source = result.scalars().first()
            # Capture the primary key up front: rollback() expires every ORM
            # instance in the session, and reading an expired attribute under
            # AsyncSession would trigger implicit IO and fail, so the failure
            # path must not touch `source` again.
            known_source_id = source.id if source is not None else None
            try:
                count = await collector(session)
                source_id = known_source_id
                if source_id is None:  # first run: the collector created the row
                    result = await session.execute(
                        select(NewsSource).where(NewsSource.name == name))
                    source_id = result.scalars().first().id
                health = await _get_or_create_health(session, source_id)
                _record_success(health, count, False, now)
                await session.commit()
            except Exception as e:
                logger.error(f"[kosha] {name} 수집 실패: {e}")
                await session.rollback()  # discard partial ingest, then record health only
                if known_source_id is not None:
                    health = await _get_or_create_health(session, known_source_id)
                    _record_failure(health, str(e) or repr(e), now)
                    await session.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run())
|
||||
+482
-62
@@ -1,8 +1,16 @@
|
||||
"""뉴스 수집 워커 — RSS/API에서 기사 수집, documents에 저장"""
|
||||
"""뉴스 수집 워커 — RSS/API에서 기사 수집, documents에 저장
|
||||
|
||||
plan crawl-24x7-1 A그룹 (2026-06-10):
|
||||
A-1 조건부 GET(ETag/Last-Modified 그대로 재전송) + 콘텐츠 해시 변경감지
|
||||
A-2 fulltext_policy='page' 소스는 'fulltext' stage 로 본문 승격 위임
|
||||
A-5 source_health 기록 + circuit breaker (소스별 실패 격리)
|
||||
A-6 first-wins + 포털 전재 2차 dedup (제목+최근 3일, 12자 이상 제목 한정)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from html import unescape
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
||||
|
||||
@@ -10,11 +18,13 @@ import feedparser
|
||||
import httpx
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.crawl_politeness import CRAWL_UA
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import enqueue_stage
|
||||
from models.source_health import SourceHealth
|
||||
|
||||
logger = setup_logger("news_collector")
|
||||
|
||||
@@ -26,6 +36,7 @@ CATEGORY_MAP = {
|
||||
"환경": "Environment", "기술": "Technology",
|
||||
# 영어
|
||||
"World": "International", "International": "International",
|
||||
"World news": "International", # Guardian sectionName (B-2)
|
||||
"Technology": "Technology", "Tech": "Technology", "Sci-Tech": "Technology",
|
||||
"Arts": "Culture", "Culture": "Culture",
|
||||
"Climate": "Environment", "Environment": "Environment",
|
||||
@@ -35,21 +46,30 @@ CATEGORY_MAP = {
|
||||
"Kultur": "Culture", "Wissenschaft": "Technology",
|
||||
# 프랑스어
|
||||
"Environnement": "Environment",
|
||||
# 도메인 채널 (source_channel='crawl', 0-5 (a)) — 양쪽 공통 맵
|
||||
"안전": "Safety", "Safety": "Safety",
|
||||
"공학": "Engineering", "Engineering": "Engineering",
|
||||
"철학": "Philosophy", "Philosophy": "Philosophy",
|
||||
}
|
||||
|
||||
|
||||
class FeedError(Exception):
    """Source-level fetch/parse failure.

    Raised by the per-source fetchers; run() records it as a source_health
    failure for that source.
    """
|
||||
|
||||
|
||||
def _normalize_category(raw: str) -> str:
    """Map a raw category label onto the canonical names in CATEGORY_MAP.

    The label is looked up verbatim first, then with surrounding whitespace
    stripped; "Other" is returned when neither form is registered.
    """
    trimmed = raw.strip()
    if raw in CATEGORY_MAP:
        return CATEGORY_MAP[raw]
    return CATEGORY_MAP.get(trimmed, "Other")
|
||||
|
||||
|
||||
def _clean_html(text: str) -> str:
|
||||
"""HTML 태그 제거 + 정제"""
|
||||
def _clean_html(text: str, max_len: int | None = 1000) -> str:
|
||||
"""HTML 태그 제거 + 정제. max_len=None 이면 절단 없음 (feed-full 전문용)."""
|
||||
if not text:
|
||||
return ""
|
||||
text = re.sub(r"<[^>]+>", "", text)
|
||||
text = unescape(text)
|
||||
return text.strip()[:1000]
|
||||
text = text.strip()
|
||||
return text if max_len is None else text[:max_len]
|
||||
|
||||
|
||||
# tracking 파라미터 판별 — prefix(utm_/at_=BBC/ns_=BBC/mc_=mailchimp) + 단독 키
|
||||
@@ -87,8 +107,104 @@ def _normalize_to_utc(dt) -> datetime:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
# ── A-5: circuit breaker 정책 ──
|
||||
# 연속 실패 >= OPEN 임계 → open (재시도 간격 지수 확대, 6h × 2^n, cap 48h)
|
||||
# 연속 실패 > DISABLE 임계 → disabled (수집 제외 + 가시 로그, 수동 복구 대상)
|
||||
# news_sources.enabled 는 건드리지 않는다 — 사용자 의도(enabled)와 자동 상태(circuit) 분리.
|
||||
_CIRCUIT_OPEN_AFTER = 3
|
||||
_CIRCUIT_DISABLE_AFTER = 10
|
||||
_BACKOFF_BASE_HOURS = 6
|
||||
_BACKOFF_CAP_HOURS = 48
|
||||
_EMPTY_STREAK_ALERT = 8 # 6h 사이클 × 8 = 약 2일 연속 빈 피드 → 가시 경고
|
||||
|
||||
|
||||
def _should_attempt(health: SourceHealth, now: datetime) -> bool:
    """Decide from the circuit state whether the source is fetched this cycle.

    Returns False for a disabled circuit, and for an open circuit whose backoff
    window (measured from ``last_error_at``) has not elapsed yet. The window is
    ``_BACKOFF_BASE_HOURS`` doubled for every consecutive failure beyond
    ``_CIRCUIT_OPEN_AFTER``, capped at ``_BACKOFF_CAP_HOURS``. Every other case
    returns True.

    Note (B-3 contract ②, r5): if consumption of a relogin_requested flag is
    added later, it must be placed *before* this open-skip branch — once "open"
    turns into exclusion from the schedule, the batch boundary never arrives
    and the flag is never consumed (a dead half-open button).
    """
    state = health.circuit_state
    if state == "disabled":
        return False
    if state != "open" or health.last_error_at is None:
        return True

    # Exponential backoff: only the failures beyond the open threshold grow the window.
    extra_failures = max(health.consecutive_failures - _CIRCUIT_OPEN_AFTER, 0)
    wait_hours = min(_BACKOFF_BASE_HOURS * (2 ** extra_failures), _BACKOFF_CAP_HOURS)
    return now - health.last_error_at >= timedelta(hours=wait_hours)
|
||||
|
||||
|
||||
def _record_success(health: SourceHealth, items: int, not_modified: bool, now: datetime) -> None:
    """Record a successful fetch on the health row and close the circuit.

    Args:
        health: health row of the source; mutated in place.
        items: number of items reported for this fetch.
        not_modified: True when the fetch ended as "not modified"; such fetches
            leave the empty-feed streak untouched.
        now: timestamp stored as last_success_at / updated_at.
    """
    health.consecutive_failures = 0
    health.total_fetches += 1
    health.last_success_at = now
    health.last_fetch_items = items

    previous_state = health.circuit_state
    if previous_state != "closed":
        logger.info(f"[health] source={health.source_id} circuit {previous_state}→closed")
    health.circuit_state = "closed"
    health.circuit_opened_at = None

    # Empty-feed streak: 304 / identical hash is a healthy signal and is not counted;
    # only "200 with zero entries" is counted (feed-rot monitoring).
    if not not_modified:
        if items == 0:
            health.empty_streak += 1
            if health.empty_streak >= _EMPTY_STREAK_ALERT:
                logger.error(
                    f"[health] source={health.source_id} 빈 피드 {health.empty_streak}회 연속 "
                    f"— 피드 부패 의심 (RSSHub 류 라우트 깨짐 패턴)"
                )
        else:
            health.empty_streak = 0

    health.updated_at = now
|
||||
|
||||
|
||||
def _record_failure(health: SourceHealth, error: str, now: datetime) -> None:
    """Record a failed fetch on the health row and advance the circuit state.

    Args:
        health: health row of the source; mutated in place.
        error: failure description; stored truncated to 500 characters.
        now: timestamp stored as last_error_at / updated_at (and as
            circuit_opened_at when the circuit opens).

    State transitions: more than ``_CIRCUIT_DISABLE_AFTER`` consecutive
    failures → "disabled"; otherwise at least ``_CIRCUIT_OPEN_AFTER`` failures
    on a closed circuit → "open".
    """
    health.total_fetches += 1
    health.total_failures += 1
    health.consecutive_failures += 1
    health.last_error = error[:500]
    health.last_error_at = now
    health.updated_at = now

    streak = health.consecutive_failures
    if streak > _CIRCUIT_DISABLE_AFTER and health.circuit_state != "disabled":
        health.circuit_state = "disabled"
        logger.error(
            f"[health] source={health.source_id} 연속 실패 {streak}회 — circuit DISABLED "
            f"(수집 제외, A-8 패널에서 수동 복구 필요)"
        )
        return

    if streak >= _CIRCUIT_OPEN_AFTER and health.circuit_state == "closed":
        health.circuit_state = "open"
        health.circuit_opened_at = now
        logger.warning(f"[health] source={health.source_id} 연속 실패 {streak}회 — circuit open")
|
||||
|
||||
|
||||
async def _get_or_create_health(session, source_id: int) -> SourceHealth:
    """Return the SourceHealth row for ``source_id``, creating it when absent.

    A newly created row is added to the session and flushed before it is
    returned; committing is left to the caller.
    """
    lookup = select(SourceHealth).where(SourceHealth.source_id == source_id)
    existing = (await session.execute(lookup)).scalars().first()
    if existing is not None:
        return existing

    created = SourceHealth(source_id=source_id)
    session.add(created)
    await session.flush()
    return created
|
||||
|
||||
|
||||
# 수동 POST /api/news/collect 와 6h 스케줄 사이클의 동시 실행 차단 (단일 프로세스·단일
|
||||
# 이벤트루프). 동시 진입 시 _get_or_create_health 가 같은 source_id 를 양쪽에서 INSERT
|
||||
# → uq_source_health_source_id 위반 IntegrityError 로 사이클 전체가 죽는 경합의 원천 봉쇄.
|
||||
_run_lock = asyncio.Lock()
|
||||
|
||||
|
||||
async def run():
    """Run one news collection cycle.

    The cycle body lives in _run_locked(); it is executed while holding
    _run_lock, so overlapping invocations run one after another instead of
    concurrently.
    """
    async with _run_lock:
        await _run_locked()
|
||||
|
||||
|
||||
async def _run_locked():
|
||||
now = datetime.now(timezone.utc)
|
||||
async with async_session() as session:
|
||||
result = await session.execute(
|
||||
select(NewsSource).where(NewsSource.enabled == True)
|
||||
@@ -101,17 +217,24 @@ async def run():
|
||||
|
||||
total = 0
|
||||
for source in sources:
|
||||
health = await _get_or_create_health(session, source.id)
|
||||
if not _should_attempt(health, now):
|
||||
logger.info(f"[{source.name}] circuit {health.circuit_state} — 이번 사이클 skip")
|
||||
continue
|
||||
try:
|
||||
if source.feed_type == "api":
|
||||
count = await _fetch_api(session, source)
|
||||
count, status = await _fetch_api(session, source)
|
||||
else:
|
||||
count = await _fetch_rss(session, source)
|
||||
count, status = await _fetch_rss(session, source)
|
||||
|
||||
source.last_fetched_at = datetime.now(timezone.utc)
|
||||
_record_success(health, count, status == "not_modified", now)
|
||||
total += count
|
||||
except Exception as e:
|
||||
logger.error(f"[{source.name}] 수집 실패: {e}")
|
||||
# str 이 빈 예외(httpx.ConnectError('')) 대비 — health 기록과 동일 규칙
|
||||
logger.error(f"[{source.name}] 수집 실패: {str(e) or repr(e)}")
|
||||
source.last_fetched_at = datetime.now(timezone.utc)
|
||||
_record_failure(health, str(e) or repr(e), now)
|
||||
|
||||
await session.commit()
|
||||
logger.info(f"뉴스 수집 완료: {total}건 신규")
|
||||
@@ -122,8 +245,135 @@ ALLOWED_CONTENT_TYPES = ("application/rss+xml", "application/atom+xml",
|
||||
"application/xml", "text/xml")
|
||||
|
||||
|
||||
async def _fetch_rss(session, source: NewsSource) -> int:
|
||||
"""RSS 피드 수집 — redirect 재검증 + 크기/content-type 제한"""
|
||||
# 연결 재시도 간격 — MOEL 추가 실측(2026-06-11): 드랍이 연결 단위 랜덤이라
|
||||
# 1.5s 후 재시도도 연속으로 걸리는 케이스 발생(직후 다른 연결은 즉시 성공) → 2회로 보강.
|
||||
_CONNECT_RETRY_DELAYS = (2.0, 5.0)
|
||||
|
||||
|
||||
async def _get_with_connect_retry(client, url: str):
    """GET ``url``, retrying only connection-layer (TCP/TLS) errors.

    One retry is made per entry of ``_CONNECT_RETRY_DELAYS`` (sleeping that
    long before it); once the delays are used up the connection error
    propagates. HTTP status errors are not retried — the caller's own
    branching on the response is preserved.

    MOEL measurement (2026-06-11): a government site's security appliance
    intermittently dropped TLS handshakes per connection (curl rc=35, an
    immediate retry succeeded), so feed collection with a single fetch per
    cycle piled up ConnectError('') failures and opened the circuit.
    Persistent outages are still left to the circuit breaker.
    """
    pauses = iter(_CONNECT_RETRY_DELAYS)
    while True:
        try:
            return await client.get(url)
        except (httpx.ConnectError, httpx.ConnectTimeout) as exc:
            pause = next(pauses, None)
            if pause is None:
                # Retry budget exhausted: surface the last connection error.
                raise
            # Log only up to the path — the query string is dropped.
            logger.info(f"연결 오류 {pause}s 후 재시도 ({url.split('?')[0]}): {exc!r}")
            await asyncio.sleep(pause)
|
||||
|
||||
|
||||
async def _is_portal_duplicate(session, title: str) -> bool:
    """A-6 second-pass dedup: portal reprint vs. original stored under different URLs.

    Secondary key = title + last 3 days (a different source / different URL is
    not caught by the primary key). To avoid false positives on generic
    titles, titles shorter than 12 characters are never treated as duplicates.
    Callers log every skip (no silent drops).
    """
    if len(title) < 12:
        return False

    window_start = datetime.now(timezone.utc) - timedelta(days=3)
    lookup = (
        select(Document.id)
        .where(
            Document.title == title,
            Document.source_channel == "news",
            Document.file_format == "article",
            Document.extracted_at >= window_start,
        )
        .limit(1)
    )
    rows = await session.execute(lookup)
    return rows.scalars().first() is not None
|
||||
|
||||
|
||||
async def _enqueue_processing(session, doc: Document, source: NewsSource, pub_dt: datetime) -> None:
    """Enqueue the follow-up stages for a freshly stored document.

    Sources with fulltext_policy='page' get only the 'fulltext' stage —
    summarize/embed/chunk are enqueued by fulltext_worker once promotion (or
    demotion) is settled. This avoids the ordering trap where an early summary
    of the RSS excerpt makes summarize_worker skip ("summary already exists")
    when the full text arrives.
    """

    def _index_now() -> bool:
        # crawl channel = domain material corpus, indexed regardless of publication date;
        # the 30-day gate applies to news only. The age is computed lazily so crawl
        # sources never evaluate it.
        return (
            source.source_channel == "crawl"
            or (datetime.now(timezone.utc) - pub_dt).days <= 30
        )

    async def _enqueue_index() -> None:
        await enqueue_stage(session, doc.id, "embed")
        await enqueue_stage(session, doc.id, "chunk")

    if source.fetch_method == "signal-only":
        # B-4: a signal is search-index only (embed/chunk). fulltext/summarize are never
        # enqueued — even if the registry wrongly sets fulltext_policy='page', zero page
        # fetches happen (defence first). Skipping the summary LLM = zero Mac mini load.
        # Digest/briefing exclude documents with ai_summary IS NULL from the start
        # (services/digest/loader.py), so signal documents drop out naturally.
        if _index_now():
            await _enqueue_index()
        return

    if source.fulltext_policy == "page" and doc.edit_url:
        await enqueue_stage(session, doc.id, "fulltext")
        return

    await enqueue_stage(session, doc.id, "summarize")
    if _index_now():
        await _enqueue_index()
|
||||
|
||||
|
||||
def _entry_body(source: NewsSource, entry, summary: str) -> tuple[str, str]:
    """Return (body, extractor_version) — per-policy body selection, a pure function.

    signal-only: the feed summary is the body and is not truncated (keeps
        arXiv abstracts of 1.3–1.6K characters whose tail a 1000-character cap
        would lose). No page fetch ever happens for these sources (B-4).
    feed-full: only for sources whose feed body is the full text (truncation
        and ad insertion are common, so the summary/content:encoded of
        ordinary sources must not be stored as full text — A-6). Used only
        when it is longer than ``summary``.
    Otherwise ``summary`` is returned with extractor version "rss".
    """
    if source.fetch_method == "signal-only":
        raw_summary = entry.get("summary", "") or entry.get("description", "")
        abstract = _clean_html(raw_summary, max_len=None)
        return (abstract or summary), "rss-signal"

    if source.fulltext_policy != "feed-full":
        return summary, "rss"

    contents = entry.get("content") or []
    raw_body = contents[0].get("value", "") if contents else ""
    full_text = _clean_html(raw_body or entry.get("summary", ""), max_len=None)
    if len(full_text) > len(summary):
        return full_text, "rss-feed-full"
    return summary, "rss"
|
||||
|
||||
|
||||
def _build_extract_meta(source: NewsSource, pub_dt: datetime) -> dict:
|
||||
"""fulltext_worker / 패널이 쓰는 출처 메타 (documents 에 source FK 가 없어 여기 기록)."""
|
||||
return {
|
||||
"source_id": source.id,
|
||||
"source_name": source.name,
|
||||
"published_at": pub_dt.isoformat(),
|
||||
}
|
||||
|
||||
|
||||
def _doc_identity(source: NewsSource, source_short: str, category: str) -> dict:
|
||||
"""채널별 문서 정체성 — news 채널은 기존 값 그대로(무회귀), crawl 채널은 도메인 정체성.
|
||||
|
||||
file_path 접두사가 곧 채널 디렉토리. ai_domain 은 다이제스트/검색 필터의 분기 축이라
|
||||
crawl 채널이 'News' 를 오염시키지 않게 분리 (0-5 채널 레벨 분리 사상).
|
||||
"""
|
||||
if source.source_channel == "crawl":
|
||||
domain = category if category and category != "Other" else "Domain"
|
||||
return {
|
||||
"path_prefix": "crawl",
|
||||
"ai_domain": domain,
|
||||
"ai_tags": [f"{domain}/{source_short}"],
|
||||
}
|
||||
return {
|
||||
"path_prefix": "news",
|
||||
"ai_domain": "News",
|
||||
"ai_tags": [f"News/{source_short}/{category}"],
|
||||
}
|
||||
|
||||
|
||||
async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]:
|
||||
"""RSS 피드 수집 — redirect 재검증 + 크기/content-type 제한 + 조건부 GET (A-1).
|
||||
|
||||
반환 (신규 건수, 상태). 상태 'not_modified' = 304 또는 콘텐츠 해시 동일.
|
||||
소스 단위 실패는 FeedError raise — run() 이 health 실패로 기록.
|
||||
"""
|
||||
from urllib.parse import urljoin
|
||||
from core.url_validator import validate_feed_url, HTTP_EXCEPTION_DOMAINS
|
||||
|
||||
@@ -134,51 +384,79 @@ async def _fetch_rss(session, source: NewsSource) -> int:
|
||||
|
||||
# 순수 HTTP 소스인데 allowlist에 없으면 차단
|
||||
if source.feed_url.startswith("http://") and not http_allowed:
|
||||
logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {source_hostname}")
|
||||
return 0
|
||||
raise FeedError(f"HTTP 차단 (allowlist 미등록): {source_hostname}")
|
||||
|
||||
# fetch 전 URL 재검증 (등록 이후 DNS 변경 대비)
|
||||
try:
|
||||
validate_feed_url(source.feed_url, allow_http=http_allowed)
|
||||
except ValueError as e:
|
||||
logger.error(f"[{source.name}] URL 검증 실패: {e}")
|
||||
return 0
|
||||
raise FeedError(f"URL 검증 실패: {e}") from e
|
||||
|
||||
async with httpx.AsyncClient(timeout=10, follow_redirects=False) as client:
|
||||
resp = await client.get(source.feed_url)
|
||||
# A-1: 정직 UA + 조건부 GET — 서버가 준 워터마크를 받은 그대로 재전송
|
||||
headers = {"User-Agent": CRAWL_UA}
|
||||
if source.etag:
|
||||
headers["If-None-Match"] = source.etag
|
||||
if source.last_modified:
|
||||
headers["If-Modified-Since"] = source.last_modified
|
||||
|
||||
# redirect 수동 처리 (최대 3회, 각 target 재검증)
|
||||
async with httpx.AsyncClient(
|
||||
timeout=10, follow_redirects=False, headers=headers
|
||||
) as client:
|
||||
resp = await _get_with_connect_retry(client, source.feed_url)
|
||||
|
||||
# 304 는 redirect 처리보다 먼저 — httpx 의 is_redirect 는 3xx 전체(304 포함)에
|
||||
# True 라, 304 를 redirect 로 오인하면 location 없는 같은 URL 을 재요청해
|
||||
# "redirect 3회 초과" 로 오류 처리됨(조건부 GET 안정 피드 전멸 버그).
|
||||
if resp.status_code == 304:
|
||||
logger.info(f"[{source.name}] 304 Not Modified — 본문 미전송")
|
||||
return 0, "not_modified"
|
||||
|
||||
# redirect 수동 처리 (최대 3회, 각 target 재검증) — location 있는 진짜 redirect 만.
|
||||
# allowlist 도메인이면 redirect target의 HTTP도 허용
|
||||
redirects = 0
|
||||
while resp.is_redirect and redirects < 3:
|
||||
location = resp.headers.get("location", "")
|
||||
location = urljoin(str(resp.request.url), location)
|
||||
while resp.has_redirect_location and redirects < 3:
|
||||
location = urljoin(str(resp.request.url), resp.headers["location"])
|
||||
try:
|
||||
validate_feed_url(location, allow_http=http_allowed)
|
||||
except ValueError as e:
|
||||
logger.error(f"[{source.name}] redirect target 차단: {e}")
|
||||
return 0
|
||||
raise FeedError(f"redirect target 차단: {e}") from e
|
||||
resp = await client.get(location)
|
||||
if resp.status_code == 304:
|
||||
logger.info(f"[{source.name}] 304 Not Modified (redirect 후) — 본문 미전송")
|
||||
return 0, "not_modified"
|
||||
redirects += 1
|
||||
if resp.is_redirect:
|
||||
logger.error(f"[{source.name}] redirect 3회 초과")
|
||||
return 0
|
||||
if resp.has_redirect_location:
|
||||
raise FeedError("redirect 3회 초과")
|
||||
|
||||
resp.raise_for_status()
|
||||
|
||||
if len(resp.content) > MAX_RESPONSE_SIZE:
|
||||
logger.warning(f"[{source.name}] 응답 크기 초과: {len(resp.content)} bytes")
|
||||
return 0
|
||||
raise FeedError(f"응답 크기 초과: {len(resp.content)} bytes")
|
||||
|
||||
ct = resp.headers.get("content-type", "").lower()
|
||||
if not any(t in ct for t in ALLOWED_CONTENT_TYPES):
|
||||
logger.warning(f"[{source.name}] 비정상 content-type: {ct}")
|
||||
return 0
|
||||
raise FeedError(f"비정상 content-type: {ct}")
|
||||
|
||||
# A-1: 콘텐츠 해시 변경감지 (CDN 의 ETag 회전 대비 병행) — 저장된 해시는 항상
|
||||
# 파싱 검증을 통과한 응답의 것이므로 동일성 비교는 파싱 전에 안전
|
||||
new_etag = resp.headers.get("etag")
|
||||
new_last_modified = resp.headers.get("last-modified")
|
||||
content_hash = hashlib.sha256(resp.content).hexdigest()
|
||||
if source.feed_content_hash == content_hash:
|
||||
logger.info(f"[{source.name}] 콘텐츠 해시 동일 — 파싱 skip")
|
||||
return 0, "not_modified"
|
||||
|
||||
feed = feedparser.parse(resp.text)
|
||||
if feed.bozo and not feed.entries:
|
||||
logger.warning(f"[{source.name}] RSS 파싱 실패: {feed.bozo_exception}")
|
||||
return 0
|
||||
raise FeedError(f"RSS 파싱 실패: {feed.bozo_exception}")
|
||||
|
||||
# A-1: 워터마크 영속은 파싱 검증 통과 후에만 — 부패(bozo) 응답의 ETag 를 저장하면
|
||||
# 이후 304 로 영구 skip 되는 silent corruption 차단
|
||||
if new_etag:
|
||||
source.etag = new_etag
|
||||
if new_last_modified:
|
||||
source.last_modified = new_last_modified
|
||||
source.feed_content_hash = content_hash
|
||||
count = 0
|
||||
|
||||
for entry in feed.entries:
|
||||
@@ -190,7 +468,15 @@ async def _fetch_rss(session, source: NewsSource) -> int:
|
||||
if not summary:
|
||||
summary = title
|
||||
|
||||
# 정책별 본문 선택 — signal-only(무절단 요약) / feed-full(피드 전문) / 기본(요약)
|
||||
body, extractor_version = _entry_body(source, entry, summary)
|
||||
|
||||
link = entry.get("link", "")
|
||||
|
||||
# B-5 quirk: 비디오 항목 필터 (Aeon/Psyche — 텍스트 코퍼스에 비디오 페이지 무가치)
|
||||
if source.parser_quirk == "skip-video" and re.search(r"/videos?/", link):
|
||||
continue
|
||||
|
||||
published = entry.get("published_parsed") or entry.get("updated_parsed")
|
||||
pub_dt = datetime(*published[:6], tzinfo=timezone.utc) if published else datetime.now(timezone.utc)
|
||||
|
||||
@@ -209,56 +495,190 @@ async def _fetch_rss(session, source: NewsSource) -> int:
|
||||
if existing.scalars().first():
|
||||
continue
|
||||
|
||||
# A-6 2차: 포털 전재 dedup (first-wins — 먼저 적재된 쪽이 정본)
|
||||
if await _is_portal_duplicate(session, title):
|
||||
logger.info(f"[{source.name}] portal-dup skip: {title[:60]}")
|
||||
continue
|
||||
|
||||
category = _normalize_category(source.category or "")
|
||||
source_short = source.name.split(" ")[0] # "경향신문 문화" → "경향신문"
|
||||
ident = _doc_identity(source, source_short, category)
|
||||
|
||||
doc = Document(
|
||||
file_path=f"news/{source.name}/{article_id}",
|
||||
file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
|
||||
file_hash=article_id,
|
||||
file_format="article",
|
||||
file_size=len(summary.encode()),
|
||||
file_size=len(body.encode()),
|
||||
file_type="note",
|
||||
title=title,
|
||||
extracted_text=f"{title}\n\n{summary}",
|
||||
extracted_text=f"{title}\n\n{body}",
|
||||
extracted_at=datetime.now(timezone.utc),
|
||||
extractor_version="rss",
|
||||
extractor_version=extractor_version,
|
||||
# article = 텍스트 네이티브(본문=extracted_text). markdown 단계 미enqueue 라
|
||||
# 기본값 'pending' 이면 영구 비수렴 → backlog 지표 오염 + md_status_pending partial
|
||||
# 인덱스 비대. 생성 시점에 terminal 'skipped' 로 명시(변환 비대상).
|
||||
# fulltext_policy='page' 소스는 fulltext_worker 가 승격 시 success 로 갱신.
|
||||
md_status="skipped",
|
||||
md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
|
||||
source_channel="news",
|
||||
source_channel=source.source_channel,
|
||||
data_origin="external",
|
||||
# 조회와 동일하게 정규화해 저장 — raw(tracking param 포함) 저장 시 URL dedup 무력화
|
||||
edit_url=normalized_url,
|
||||
review_status="approved",
|
||||
ai_domain="News",
|
||||
ai_domain=ident["ai_domain"],
|
||||
ai_sub_group=source_short,
|
||||
ai_tags=[f"News/{source_short}/{category}"],
|
||||
ai_tags=ident["ai_tags"],
|
||||
extract_meta=_build_extract_meta(source, pub_dt),
|
||||
)
|
||||
session.add(doc)
|
||||
await session.flush()
|
||||
|
||||
# summarize + embed + chunk 등록 (classify 불필요)
|
||||
await enqueue_stage(session, doc.id, "summarize")
|
||||
days_old = (datetime.now(timezone.utc) - pub_dt).days
|
||||
if days_old <= 30:
|
||||
await enqueue_stage(session, doc.id, "embed")
|
||||
await enqueue_stage(session, doc.id, "chunk")
|
||||
# summarize + embed + chunk 등록 (classify 불필요).
|
||||
# page 정책 소스는 fulltext 만 — 후속은 fulltext_worker 가 확정 후 enqueue.
|
||||
await _enqueue_processing(session, doc, source, pub_dt)
|
||||
|
||||
count += 1
|
||||
|
||||
logger.info(f"[{source.name}] RSS → {count}건 수집")
|
||||
return count
|
||||
return count, "ok"
|
||||
|
||||
|
||||
async def _fetch_api(session, source: NewsSource) -> int:
|
||||
async def _fetch_api(session, source: NewsSource) -> tuple[int, str]:
    """Dispatch an API source to its provider fetcher by the feed_url host (B-2).

    Legacy NYT rows (feed_url=api.nytimes.com) keep their existing path; a new
    provider is added as another host branch. An unknown host is never handed
    to the NYT path — it fails explicitly (no silent fallback).

    A host matches a provider only when it IS the provider domain or a
    subdomain of it. A bare suffix check would also accept look-alike hosts
    such as "evilguardianapis.com", and the provider fetcher would then send
    the provider API key to that host.

    Returns:
        (new document count, status) as returned by the provider fetcher.

    Raises:
        FeedError: the host belongs to no registered provider.
    """
    host = (urlparse(source.feed_url).hostname or "").lower()

    def _in_domain(domain: str) -> bool:
        # Exact domain or a dot-delimited subdomain — never a mere string suffix.
        return host == domain or host.endswith("." + domain)

    if _in_domain("guardianapis.com"):
        return await _fetch_api_guardian(session, source)
    if _in_domain("nytimes.com"):
        return await _fetch_api_nyt(session, source)
    raise FeedError(f"API 제공자 미등록 호스트: {host} — 디스패치 분기 추가 필요")
|
||||
|
||||
|
||||
def _guardian_request(feed_url: str, api_key: str) -> tuple[str, dict]:
|
||||
"""Guardian 호출 형태 단일 source-of-truth — fixture 회귀 테스트 대상
|
||||
(tests/fixtures/guardian_open_platform_search_response.json 박제 시 호출과 동일해야 함)."""
|
||||
parsed = urlparse(feed_url)
|
||||
params = {
|
||||
**dict(parse_qsl(parsed.query)),
|
||||
"show-fields": "bodyText,trailText",
|
||||
"page-size": "20",
|
||||
"order-by": "newest",
|
||||
"api-key": api_key,
|
||||
}
|
||||
return f"{parsed.scheme}://{parsed.netloc}{parsed.path}", params
|
||||
|
||||
|
||||
async def _fetch_api_guardian(session, source: NewsSource) -> tuple[int, str]:
    """Collect from the Guardian Open Platform (B-2) — show-fields=bodyText yields full-text JSON.

    The source is registered with the section query embedded in feed_url
    (e.g. https://content.guardianapis.com/search?section=world). The full
    text arrives through the API, so no fulltext stage is needed.

    Returns:
        (number of newly stored documents, "ok").

    Raises:
        FeedError: GUARDIAN_API_KEY is not set, the HTTP call fails, or the API
            reports a status other than "ok". There is no silent fallback —
            the failure is recorded on the source's health.
    """
    import os

    api_key = os.getenv("GUARDIAN_API_KEY", "")
    if not api_key:
        raise FeedError("GUARDIAN_API_KEY 미설정 — Guardian 수집 불가")

    endpoint, params = _guardian_request(source.feed_url, api_key)

    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.get(endpoint, params=params)
            resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        # Strip the query string (it contains api-key) — log the path only (same rule as NYT).
        safe_url = str(e.request.url).split("?")[0]
        raise FeedError(f"Guardian API 실패: {e.response.status_code} @ {safe_url}") from e
    except httpx.RequestError as e:
        safe_url = str(e.request.url).split("?")[0] if e.request else "unknown"
        raise FeedError(f"Guardian API 연결 실패: {safe_url}") from e

    payload = resp.json().get("response", {})
    if payload.get("status") != "ok":
        raise FeedError(f"Guardian API status={payload.get('status')}")

    count = 0
    for item in payload.get("results", []):
        title = (item.get("webTitle") or "").strip()
        if not title:
            continue

        fields = item.get("fields") or {}
        body_text = (fields.get("bodyText") or "").strip()
        trail = _clean_html(fields.get("trailText") or "")
        # bodyText is plain-text full text (no HTML cleaning needed). When it is short
        # (live-blog leftovers etc.) fall back to the trail text, then to the title.
        is_full = len(body_text) >= 200
        body = body_text if is_full else (trail or title)

        link = item.get("webUrl", "")
        pub_str = item.get("webPublicationDate", "")
        try:
            pub_dt = datetime.fromisoformat(pub_str.replace("Z", "+00:00"))
        except (ValueError, AttributeError):
            pub_dt = datetime.now(timezone.utc)
        if pub_dt.tzinfo is None:
            # A timestamp without an offset parses to a naive datetime; the age check in
            # _enqueue_processing subtracts it from an aware "now" and would raise
            # TypeError. Treat such values as UTC.
            pub_dt = pub_dt.replace(tzinfo=timezone.utc)

        article_id = _article_hash(title, pub_dt.strftime("%Y%m%d"), source.name)
        normalized_url = _normalize_url(link)

        # Same as the RSS collector: tolerate legacy raw URLs and cross-posted
        # multi-matches (first match wins).
        existing = await session.execute(
            select(Document).where(
                (Document.file_hash == article_id) |
                (Document.edit_url.in_([normalized_url, link]))
            ).limit(1)
        )
        if existing.scalars().first():
            continue

        if await _is_portal_duplicate(session, title):
            logger.info(f"[{source.name}] portal-dup skip: {title[:60]}")
            continue

        # `or` instead of a .get() default: a present-but-null/empty sectionName falls
        # back to the source category instead of reaching _normalize_category as None
        # (which calls raw.strip()).
        category = _normalize_category(item.get("sectionName") or source.category or "")
        source_short = source.name.split(" ")[0]
        ident = _doc_identity(source, source_short, category)

        doc = Document(
            file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
            file_hash=article_id,
            file_format="article",
            file_size=len(body.encode()),
            file_type="note",
            title=title,
            extracted_text=f"{title}\n\n{body}",
            extracted_at=datetime.now(timezone.utc),
            extractor_version="guardian_api_full" if is_full else "guardian_api",
            md_status="skipped",
            md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
            source_channel=source.source_channel,
            data_origin="external",
            edit_url=normalized_url,
            review_status="approved",
            ai_domain=ident["ai_domain"],
            ai_sub_group=source_short,
            ai_tags=ident["ai_tags"],
            extract_meta=_build_extract_meta(source, pub_dt),
        )
        session.add(doc)
        await session.flush()  # assigns doc.id for the enqueue calls below

        await _enqueue_processing(session, doc, source, pub_dt)
        count += 1

    logger.info(f"[{source.name}] API → {count}건 수집")
    return count, "ok"
|
||||
|
||||
|
||||
async def _fetch_api_nyt(session, source: NewsSource) -> tuple[int, str]:
|
||||
"""NYT API 수집 — 키 마스킹 + health degradation"""
|
||||
import os
|
||||
nyt_key = os.getenv("NYT_API_KEY", "")
|
||||
if not nyt_key:
|
||||
logger.error("NYT_API_KEY 미설정 — US 뉴스 수집 불가")
|
||||
return 0
|
||||
raise FeedError("NYT_API_KEY 미설정 — US 뉴스 수집 불가")
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10) as client:
|
||||
@@ -270,12 +690,10 @@ async def _fetch_api(session, source: NewsSource) -> int:
|
||||
except httpx.HTTPStatusError as e:
|
||||
# 쿼리스트링(api-key 포함) 제거 — path까지만 로깅
|
||||
safe_url = str(e.request.url).split("?")[0]
|
||||
logger.error(f"NYT API 실패: {e.response.status_code} @ {safe_url}")
|
||||
return 0
|
||||
raise FeedError(f"NYT API 실패: {e.response.status_code} @ {safe_url}") from e
|
||||
except httpx.RequestError as e:
|
||||
safe_url = str(e.request.url).split("?")[0] if e.request else "unknown"
|
||||
logger.error(f"NYT API 연결 실패: {safe_url}")
|
||||
return 0
|
||||
raise FeedError(f"NYT API 연결 실패: {safe_url}") from e
|
||||
|
||||
data = resp.json()
|
||||
count = 0
|
||||
@@ -309,11 +727,16 @@ async def _fetch_api(session, source: NewsSource) -> int:
|
||||
if existing.scalars().first():
|
||||
continue
|
||||
|
||||
if await _is_portal_duplicate(session, title):
|
||||
logger.info(f"[{source.name}] portal-dup skip: {title[:60]}")
|
||||
continue
|
||||
|
||||
category = _normalize_category(article.get("section", source.category or ""))
|
||||
source_short = source.name.split(" ")[0]
|
||||
|
||||
ident = _doc_identity(source, source_short, category)
|
||||
doc = Document(
|
||||
file_path=f"news/{source.name}/{article_id}",
|
||||
file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
|
||||
file_hash=article_id,
|
||||
file_format="article",
|
||||
file_size=len(summary.encode()),
|
||||
@@ -327,24 +750,21 @@ async def _fetch_api(session, source: NewsSource) -> int:
|
||||
# 인덱스 비대. 생성 시점에 terminal 'skipped' 로 명시(변환 비대상).
|
||||
md_status="skipped",
|
||||
md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
|
||||
source_channel="news",
|
||||
source_channel=source.source_channel,
|
||||
data_origin="external",
|
||||
edit_url=normalized_url,
|
||||
review_status="approved",
|
||||
ai_domain="News",
|
||||
ai_domain=ident["ai_domain"],
|
||||
ai_sub_group=source_short,
|
||||
ai_tags=[f"News/{source_short}/{category}"],
|
||||
ai_tags=ident["ai_tags"],
|
||||
extract_meta=_build_extract_meta(source, pub_dt),
|
||||
)
|
||||
session.add(doc)
|
||||
await session.flush()
|
||||
|
||||
await enqueue_stage(session, doc.id, "summarize")
|
||||
days_old = (datetime.now(timezone.utc) - pub_dt).days
|
||||
if days_old <= 30:
|
||||
await enqueue_stage(session, doc.id, "embed")
|
||||
await enqueue_stage(session, doc.id, "chunk")
|
||||
await _enqueue_processing(session, doc, source, pub_dt)
|
||||
|
||||
count += 1
|
||||
|
||||
logger.info(f"[{source.name}] API → {count}건 수집")
|
||||
return count
|
||||
return count, "ok"
|
||||
|
||||
@@ -15,15 +15,18 @@ from sqlalchemy.orm import aliased
|
||||
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.queue import ProcessingQueue, enqueue_stage
|
||||
from models.queue import ProcessingQueue, StageDeferred, enqueue_stage, not_deferred_condition
|
||||
|
||||
logger = setup_logger("queue_consumer")
|
||||
|
||||
# stage별 배치 크기
|
||||
# stt 는 GPU 단일 점유 + 회의 30분짜리도 가능 → 배치 1. thumbnail 은 ffmpeg subprocess 로 가벼움.
|
||||
# deep_summary (PR-B B-1) 는 MLX 26B 단일 Semaphore(1) 경유 → 배치 1.
|
||||
# fulltext 는 politeness 지연(같은 도메인 5–15s)이 배치 내 직렬로 걸린다 — 배치 3 이면
|
||||
# 같은 도메인 최악 ~45s/사이클, 메인 큐 1m 간격(max_instances=1, coalesce)이 흡수.
|
||||
BATCH_SIZE = {"extract": 5, "classify": 3, "summarize": 3, "embed": 1, "chunk": 1,
|
||||
"preview": 2, "stt": 1, "thumbnail": 3, "deep_summary": 1, "markdown": 1}
|
||||
"preview": 2, "stt": 1, "thumbnail": 3, "deep_summary": 1, "markdown": 1,
|
||||
"fulltext": 3}
|
||||
STALE_THRESHOLD_MINUTES = 10
|
||||
# markdown 대형 split 변환은 한 doc 이 수십 분(5210 ≈ 40분) 동안 processing 상태로 머문다.
|
||||
# marker_worker 는 queue 행에 heartbeat 를 찍지 않으므로(started_at 고정), main 의 10분
|
||||
@@ -35,7 +38,7 @@ MARKDOWN_STALE_THRESHOLD_MINUTES = int(os.getenv("MARKDOWN_STALE_MINUTES", "120"
|
||||
# STT 도 장기 작업 가능성이 있으나 본 PR 범위 밖 — main 에 유지(follow-up).
|
||||
MAIN_QUEUE_STAGES = [
|
||||
"extract", "classify", "summarize", "embed", "chunk",
|
||||
"preview", "stt", "thumbnail", "deep_summary",
|
||||
"preview", "stt", "thumbnail", "deep_summary", "fulltext",
|
||||
]
|
||||
MARKDOWN_QUEUE_STAGES = ["markdown"]
|
||||
|
||||
@@ -137,6 +140,9 @@ async def enqueue_next_stage(document_id: int, current_stage: str):
|
||||
# source_channel-aware override (extract stage 만). source_channel 누락 시 _default.
|
||||
extract_override_by_channel = {
|
||||
"devonagent": ["embed", "chunk"],
|
||||
# crawl 채널 파일형 (KOSHA 첨부/GUIDE PDF 등): preview 사전 캐시 스킵 —
|
||||
# 재료 코퍼스 대량 백필이 preview 큐를 점령하지 않게. classify → embed/chunk/markdown 유지.
|
||||
"crawl": ["classify"],
|
||||
}
|
||||
|
||||
next_stages = {
|
||||
@@ -179,6 +185,7 @@ def _load_workers():
|
||||
from workers.summarize_worker import process as summarize_process
|
||||
from workers.thumbnail_worker import process as thumbnail_process
|
||||
from workers.marker_worker import process as marker_process
|
||||
from workers.fulltext_worker import process as fulltext_process
|
||||
|
||||
return {
|
||||
"extract": extract_process,
|
||||
@@ -195,6 +202,9 @@ def _load_workers():
|
||||
# Phase 1B: classify 완료 후 enqueue. PDF→markdown 변환 (leaf, embed/chunk 와 독립).
|
||||
# consume_markdown_queue 가 전담 (대형 split 변환이 메인 파이프라인을 막지 않도록).
|
||||
"markdown": marker_process,
|
||||
# crawl-24x7 A-2: 기사 페이지 fetch → 4-tier 본문 승격. 후속(summarize/embed/chunk)은
|
||||
# 워커가 직접 enqueue — next_stages dict 미등록 (enqueue_next_stage no-op).
|
||||
"fulltext": fulltext_process,
|
||||
}
|
||||
|
||||
|
||||
@@ -206,13 +216,14 @@ async def _process_stage(stage, worker_fn):
|
||||
"""
|
||||
batch_size = BATCH_SIZE.get(stage, 3)
|
||||
|
||||
# pending 항목 조회
|
||||
# pending 항목 조회 (보류 백오프 deferred_until 미래 항목 제외 — ds-macbook-offload-1)
|
||||
async with async_session() as session:
|
||||
result = await session.execute(
|
||||
select(ProcessingQueue.id, ProcessingQueue.document_id)
|
||||
.where(
|
||||
ProcessingQueue.stage == stage,
|
||||
ProcessingQueue.status == "pending",
|
||||
not_deferred_condition(),
|
||||
)
|
||||
.order_by(ProcessingQueue.created_at)
|
||||
.limit(batch_size)
|
||||
@@ -266,6 +277,26 @@ async def _process_stage(stage, worker_fn):
|
||||
await enqueue_next_stage(document_id, stage)
|
||||
logger.info(f"[{stage}] document_id={document_id} 완료")
|
||||
|
||||
except StageDeferred as defer:
|
||||
# 보류 (ds-macbook-offload-1): 맥북 일시 불가(sleep/cold/editor_busy) — 실패 아님.
|
||||
# attempts 는 claim 시 선증가분을 반환(미소모)하고 deferred_until 백오프 후 자연 재개.
|
||||
# 워커는 완주 전 doc 쓰기를 하지 않으므로 이 시점의 데이터 변경 = 0 (sleep-안전).
|
||||
async with async_session() as session:
|
||||
item = await session.get(ProcessingQueue, queue_id)
|
||||
if not item:
|
||||
logger.warning(f"[{stage}] queue_id={queue_id} 없음 (삭제됨?), skip")
|
||||
continue
|
||||
item.status = "pending"
|
||||
item.started_at = None
|
||||
item.attempts = max(0, item.attempts - 1)
|
||||
until = datetime.now(timezone.utc) + timedelta(minutes=defer.retry_after_minutes)
|
||||
item.payload = {**(item.payload or {}), "deferred_until": until.isoformat()}
|
||||
await session.commit()
|
||||
logger.info(
|
||||
f"[{stage}] document_id={document_id} 보류({defer}) — "
|
||||
f"{defer.retry_after_minutes}분 후 재개"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# 실패 처리
|
||||
async with async_session() as session:
|
||||
|
||||
@@ -0,0 +1,183 @@
|
||||
"""수동 burst-drain CLI — 맥미니 백로그를 사용자가 의도적으로 맥북(M5 Max)으로 소화.
|
||||
|
||||
ds-macbook-offload-1 P2-3. 운영 패턴 = csb_collector --bulk 와 동일 (컨테이너 내 실행,
|
||||
장기 배치 중 fastapi 재생성 = in-flight 절단이지만 멱등 재실행으로 무손실).
|
||||
|
||||
docker compose exec fastapi python -m workers.queue_drain --stage summarize --limit 200
|
||||
|
||||
설계 원칙:
|
||||
- deep 슬롯(config.yaml ai.models.deep) 필수 — 부재 시 명시 종료 (silent 강등 금지)
|
||||
- claim = FOR UPDATE SKIP LOCKED 단건 전이 → consumer(1분 주기)와 이중처리 0
|
||||
- per-item 커밋 = sleep-안전: 중단돼도 완료분 무손상, 진행 중 1건만 stale recovery
|
||||
(10분) 로 pending 복귀. 재실행 멱등 (summarize 는 ai_summary 존재 시 skip)
|
||||
- 보류(StageDeferred = 맥북 sleep/cold/editor_busy/네트워크 플랩): attempts 반환 +
|
||||
deferred_until 백오프 기록. 연속 보류 --defer-retries(기본 5)회까지 --defer-wait
|
||||
(기본 120s) 간격 재시도(분 단위 플랩 흡수), 한도 도달 = sleep 판정으로 run 종료 —
|
||||
불가 상태의 맥북을 계속 두드리지 않는다
|
||||
- 폴백 0: 맥미니/cloud 강등 없음
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.config import settings
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.queue import ProcessingQueue, StageDeferred, not_deferred_condition
|
||||
|
||||
logger = setup_logger("queue_drain")
|
||||
|
||||
# summarize = 맥미니 백로그 본체 / deep_summary = 심층 (consumer 도 deep 슬롯 시 맥북 경유).
|
||||
# classify 는 triage 경량 호출이라 맥미니 적합 — 대상에서 제외 (plan Q-4).
|
||||
DRAIN_STAGES = ("summarize", "deep_summary")
|
||||
|
||||
|
||||
async def _claim_one(stage: str) -> tuple[int, int] | None:
    """Claim the oldest eligible pending row of ``stage`` for processing.

    The row is selected with ``FOR UPDATE SKIP LOCKED`` and flipped to
    ``processing`` in the same transaction, with ``attempts`` incremented up
    front. Rows excluded by ``not_deferred_condition()`` are not considered.

    Returns:
        ``(queue_id, document_id)`` of the claimed row, or ``None`` when no
        eligible pending row exists.
    """
    async with async_session() as session:
        oldest_pending = (
            select(ProcessingQueue)
            .where(
                ProcessingQueue.stage == stage,
                ProcessingQueue.status == "pending",
                not_deferred_condition(),
            )
            .order_by(ProcessingQueue.created_at)
            .limit(1)
            .with_for_update(skip_locked=True)
        )
        row = (await session.execute(oldest_pending)).scalar_one_or_none()
        if row is None:
            return None

        row.status = "processing"
        row.started_at = datetime.now(timezone.utc)
        row.attempts += 1
        # Copy the identifiers out before committing so the return value does
        # not depend on ORM attribute state after the transaction ends.
        queue_id, document_id = row.id, row.document_id
        await session.commit()
        return (queue_id, document_id)
|
||||
|
||||
|
||||
async def _mark_completed(queue_id: int) -> None:
    """Set the queue row to ``completed`` and stamp ``completed_at`` (UTC).

    A row that can no longer be found is silently ignored.
    """
    async with async_session() as session:
        row = await session.get(ProcessingQueue, queue_id)
        if not row:
            return
        row.status = "completed"
        row.completed_at = datetime.now(timezone.utc)
        await session.commit()
|
||||
|
||||
|
||||
async def _mark_deferred(queue_id: int, defer: StageDeferred) -> None:
    """Put a claimed row back to ``pending`` with a back-off timestamp.

    The attempt counted at claim time is handed back (never below zero), and
    ``payload["deferred_until"]`` is set to now + ``defer.retry_after_minutes``
    as an ISO-8601 string; other payload keys are preserved. A missing row is
    ignored.
    """
    async with async_session() as session:
        row = await session.get(ProcessingQueue, queue_id)
        if not row:
            return

        row.status = "pending"
        row.started_at = None
        # A deferral is not a failure: refund the attempt taken by the claim.
        row.attempts = max(0, row.attempts - 1)

        backoff = timedelta(minutes=defer.retry_after_minutes)
        resume_at = datetime.now(timezone.utc) + backoff
        previous_payload = row.payload or {}
        row.payload = {**previous_payload, "deferred_until": resume_at.isoformat()}
        await session.commit()
|
||||
|
||||
|
||||
async def _mark_failed(queue_id: int, exc: Exception) -> None:
    """Record a failure on the queue row and decide whether it may retry.

    The error text (first 500 characters) is stored in ``error_message``. When
    ``attempts`` has reached ``max_attempts`` the row becomes ``failed``;
    otherwise it returns to ``pending`` with ``started_at`` cleared. A missing
    row is ignored.
    """
    async with async_session() as session:
        row = await session.get(ProcessingQueue, queue_id)
        if not row:
            return

        # Some exceptions stringify to "", so fall back to repr / type name.
        message = str(exc) or repr(exc) or type(exc).__name__
        row.error_message = message[:500]

        retries_exhausted = row.attempts >= row.max_attempts
        if not retries_exhausted:
            row.status = "pending"
            row.started_at = None
        else:
            row.status = "failed"
        await session.commit()
|
||||
|
||||
|
||||
async def drain(stage: str, limit: int, defer_retries: int = 5, defer_wait: int = 120) -> None:
    """Manually drain up to ``limit`` pending rows of ``stage`` via the deep slot.

    Rows are claimed one at a time with ``_claim_one`` and each outcome is
    committed per item, so an interrupted run leaves finished items intact.

    Args:
        stage: One of ``DRAIN_STAGES``.
        limit: Maximum number of items that may finish (completed + failed) in
            this run. Deferred items do not count towards it.
        defer_retries: Number of consecutive ``StageDeferred`` outcomes after
            which the run stops.
        defer_wait: Seconds to sleep after a deferral before trying again.

    Raises:
        SystemExit: If ``stage`` is not drainable or ``settings.ai.deep`` is
            not configured (no silent downgrade to another model slot).
    """
    if stage not in DRAIN_STAGES:
        raise SystemExit(f"--stage 는 {DRAIN_STAGES} 만 허용 (classify 등은 맥미니 적합 — plan Q-4)")
    if settings.ai.deep is None:
        raise SystemExit(
            "config.yaml ai.models.deep 슬롯 미구성 — drain 은 맥북 분담 전용 레버라 진행하지 않음"
            " (맥미니로의 silent 강등 금지)"
        )

    # Deferred imports: only load the workers once the preconditions pass.
    from sqlalchemy import func as sa_func

    from workers.deep_summary_worker import process as deep_summary_process
    from workers.summarize_worker import process as summarize_process

    done = failed = 0
    deferred = False
    consecutive_defers = 0
    while done + failed < limit:
        claimed = await _claim_one(stage)
        if claimed is None:
            logger.info(f"[drain:{stage}] pending 소진 — 종료")
            break
        queue_id, document_id = claimed
        try:
            async with async_session() as worker_session:
                if stage == "summarize":
                    await summarize_process(document_id, worker_session, use_deep=True)
                else:
                    # deep_summary is called without use_deep; per the original
                    # note the worker routes to the deep slot by itself.
                    await deep_summary_process(document_id, worker_session)
                await worker_session.commit()
            await _mark_completed(queue_id)
            done += 1
            consecutive_defers = 0
            logger.info(f"[drain:{stage}] {done}/{limit} doc={document_id} 완료")
        except StageDeferred as defer:
            # A deferral can mean a long outage or a short network flap.
            # Wait and retry up to the consecutive limit to absorb flaps;
            # reaching the limit is treated as a real outage and ends the run.
            await _mark_deferred(queue_id, defer)
            consecutive_defers += 1
            if consecutive_defers >= defer_retries:
                deferred = True
                logger.warning(
                    f"[drain:{stage}] doc={document_id} 맥북 불가({defer}) — 연속 보류 "
                    f"{consecutive_defers}회 한도 도달, run 종료. 맥북 깨운 뒤(또는 "
                    f"{defer.retry_after_minutes}분 후) 재실행"
                )
                break
            logger.warning(
                f"[drain:{stage}] doc={document_id} 맥북 일시 불가({defer}) — "
                f"{defer_wait}s 대기 후 재시도 ({consecutive_defers}/{defer_retries})"
            )
            await asyncio.sleep(defer_wait)
        except Exception as exc:
            await _mark_failed(queue_id, exc)
            failed += 1
            # Same fallback as _mark_failed: exceptions whose str() is empty
            # would otherwise leave the log line without any reason.
            reason = str(exc) or repr(exc) or type(exc).__name__
            logger.error(f"[drain:{stage}] doc={document_id} 실패: {reason}")

    # Final summary; "remaining" counts every pending row of the stage at this
    # moment, including rows that are currently deferred.
    async with async_session() as session:
        remaining = (await session.execute(
            select(sa_func.count()).select_from(ProcessingQueue).where(
                ProcessingQueue.stage == stage, ProcessingQueue.status == "pending",
            )
        )).scalar_one()
    logger.info(
        f"[drain:{stage}] 요약 — 완료 {done} · 실패 {failed} · "
        f"보류종료 {'예' if deferred else '아니오'} · 잔여 pending {remaining}"
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command-line options and run ``drain`` to completion."""
    cli = argparse.ArgumentParser(description="맥북(M5 Max) burst-drain — 수동 백로그 분담 레버")
    cli.add_argument("--stage", required=True, choices=DRAIN_STAGES)
    cli.add_argument(
        "--limit",
        type=int,
        default=50,
        help="이번 run 최대 처리 건수 (기본 50)",
    )
    cli.add_argument(
        "--defer-retries",
        type=int,
        default=5,
        help="연속 보류 허용 횟수 — 네트워크 플랩 흡수 (기본 5, 한도 도달 시 종료)",
    )
    cli.add_argument(
        "--defer-wait",
        type=int,
        default=120,
        help="보류 재시도 간 대기 초 (기본 120)",
    )
    opts = cli.parse_args()

    job = drain(opts.stage, opts.limit, opts.defer_retries, opts.defer_wait)
    asyncio.run(job)
|
||||
|
||||
|
||||
# Script entry point (the module docstring runs it as `python -m workers.queue_drain`).
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,265 @@
|
||||
"""C-3 공학 정적 코퍼스 1회 일괄 ingest (plan crawl-24x7-1).
|
||||
|
||||
National Board 기술 아티클(~86, ASP.NET 구식 — 기사 앵커가 싱글쿼트 href) +
|
||||
TWI Job Knowledge(~153, sitemap 기반). 지속 크롤링이 아니라 아카이브 일괄 +
|
||||
저빈도 증분 유형 — 스케줄러 미등록, 수동 CLI:
|
||||
|
||||
docker exec hyungi_document_server-fastapi-1 \
|
||||
python -m workers.static_corpus_ingest --corpus all --limit 3 # 검증용
|
||||
docker exec -d hyungi_document_server-fastapi-1 \
|
||||
python -m workers.static_corpus_ingest --corpus all # 전체 (~45분)
|
||||
|
||||
※ -d 백그라운드 실행 시 중단은 host pkill 이 아니라 컨테이너 내부 PID kill
|
||||
([[feedback_docker_exec_orphan_kill]]).
|
||||
|
||||
멱등: edit_url(정규화)+file_hash dedup — 재실행 = 신규분만 (그대로 monthly 증분 절차).
|
||||
politeness: fetch_page 재사용 (per-domain 1 + 5~15s jitter + robots).
|
||||
원본 보존·승격 필드: fulltext_worker 와 동일 규약 (재추출 가능 상태 유지).
|
||||
실패는 degrade 없이 skip + 말미 목록 출력 (정적 코퍼스 — RSS 요약 같은 격하 대상 부재).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from core.crawl_politeness import CrawlBlocked, CrawlFetchError, CrawlSkip, fetch_page
|
||||
from core.database import async_session
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
from models.news_source import NewsSource
|
||||
from models.queue import enqueue_stage
|
||||
from workers.fulltext_worker import (
|
||||
_WEB_MIN_BODY_LEN,
|
||||
_extract_body,
|
||||
_raw_html_path,
|
||||
_save_raw_html,
|
||||
_strip_article_footer,
|
||||
)
|
||||
from workers.news_collector import _article_hash, _normalize_url
|
||||
|
||||
logger = setup_logger("static_corpus")
|
||||
|
||||
_NB_LISTING = "https://www.nationalboard.org/Index.aspx?pageID=164"
|
||||
_TWI_SITEMAP = "https://www.twi-global.com/sitemap.xml"
|
||||
|
||||
|
||||
async def _discover_national_board() -> list[str]:
    """Collect article URLs from the National Board listing page.

    Article anchors are matched with either quote style around ``href`` and
    with ``&`` or ``&amp;`` before ``ID``. The numeric IDs are de-duplicated
    and returned as canonical URLs in ascending ID order.
    """
    listing_html, _ = await fetch_page(_NB_LISTING)
    raw_ids = re.findall(
        r"href=['\"]/?Index\.aspx\?pageID=164&(?:amp;)?ID=(\d+)['\"]", listing_html)
    unique_ids = set()
    for raw_id in raw_ids:
        unique_ids.add(int(raw_id))

    urls = []
    for i in sorted(unique_ids):
        urls.append(f"https://www.nationalboard.org/Index.aspx?pageID=164&ID={i}")
    return urls
|
||||
|
||||
|
||||
async def _discover_twi() -> list[str]:
    """Collect TWI job-knowledge article URLs from the site's sitemap.

    Only ``<loc>`` entries under ``/technical-knowledge/job-knowledge/`` are
    kept; the section index itself (a URL ending in ``job-knowledge``) is
    dropped. The result is de-duplicated and sorted.
    """
    sitemap_xml, _ = await fetch_page(
        _TWI_SITEMAP,
        content_types=("text/xml", "application/xml", "text/html"),
    )
    locations = re.findall(
        r"<loc>(https://www\.twi-global\.com/technical-knowledge/job-knowledge/[^<]+)</loc>",
        sitemap_xml,
    )
    articles = set()
    for loc in locations:
        if loc.rstrip("/").endswith("job-knowledge"):
            continue  # the section landing page, not an article
        articles.add(loc)
    return sorted(articles)
|
||||
|
||||
|
||||
# Registry of the static corpora this CLI can ingest, keyed by the --corpus value.
# Per entry:
#   source_name  - NewsSource.name used to look up / create the registry row
#   listing_url  - stored as NewsSource.feed_url when the row is created
#   discover     - coroutine function returning the list of article URLs
#   fetch_method - stored as NewsSource.fetch_method when the row is created
CORPORA = {
    "national-board": {
        "source_name": "National Board 기술 아티클",
        "listing_url": _NB_LISTING,
        "discover": _discover_national_board,
        "fetch_method": "page",
    },
    "twi": {
        "source_name": "TWI Job Knowledge",
        "listing_url": _TWI_SITEMAP,
        "discover": _discover_twi,
        "fetch_method": "sitemap+page",
    },
}
|
||||
|
||||
|
||||
async def _get_or_create_source(session, spec: dict) -> NewsSource:
    """Return the NewsSource row for a corpus, creating it on first use.

    The row is looked up by ``spec["source_name"]``. A newly created row is
    added and flushed (not committed) and is created with ``enabled=False``;
    per the original note this keeps the static corpus out of the periodic
    news cycle, with increments handled by re-running this CLI.
    """
    name = spec["source_name"]
    lookup = select(NewsSource).where(NewsSource.name == name)
    found = (await session.execute(lookup)).scalars().first()
    if found is not None:
        return found

    # Only two corpora exist: the National Board one is US, the other is GB.
    if "national" in name.lower():
        country = "US"
    else:
        country = "GB"

    created = NewsSource(
        name=name,
        feed_url=spec["listing_url"],
        feed_type="rss",
        fetch_method=spec["fetch_method"],
        fulltext_policy="none",
        source_channel="crawl",
        category="Engineering",
        language="en",
        country=country,
        enabled=False,
    )
    session.add(created)
    await session.flush()
    return created
|
||||
|
||||
|
||||
def _page_title(html_text: str, fallback: str) -> str:
|
||||
m = re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html_text)
|
||||
if not m:
|
||||
m = re.search(r"<title[^>]*>([^<]+)</title>", html_text, re.I)
|
||||
title = unescape(m.group(1)).strip() if m else ""
|
||||
# 사이트 접미 잡음 제거 (TWI 는 ' - TWI', NB 는 'National Board ...' 꼬리표)
|
||||
title = re.sub(r"\s*[-|·]\s*(TWI|National Board[^-|]*)\s*$", "", title).strip()
|
||||
return title or fallback
|
||||
|
||||
|
||||
async def _ingest_one(session, source: NewsSource, url: str) -> str:
    """Ingest a single article page into ``Document`` and enqueue follow-up stages.

    Returns:
        ``"dup"`` if a document with the same (normalized or raw) ``edit_url``
        or the same ``file_hash`` already exists, ``"skip"`` if the fetch was
        blocked/failed or the extracted body is too short, ``"ok"`` after the
        document has been added, flushed and its stages enqueued. The caller
        is responsible for committing the session.
    """
    normalized_url = _normalize_url(url)
    # First dedup gate: URL (either form) already stored.
    existing = await session.execute(
        select(Document).where(Document.edit_url.in_([normalized_url, url])).limit(1)
    )
    if existing.scalars().first():
        return "dup"

    try:
        html_text, final_url = await fetch_page(url)
    except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
        logger.warning(f"[{source.name}] fetch 실패 skip: {url} — {type(e).__name__}: {e}")
        return "skip"

    body, engine, engine_ver = _extract_body(html_text)
    # A falsy engine is treated as an extraction failure.
    if not engine:
        logger.warning(f"[{source.name}] 추출 실패 skip (< {_WEB_MIN_BODY_LEN}자): {url}")
        return "skip"
    # Drop NUL characters, then strip the article footer before the length check.
    clean_body = _strip_article_footer(body.replace("\x00", ""))
    if len(clean_body) < _WEB_MIN_BODY_LEN:
        logger.warning(f"[{source.name}] 푸터 제거 후 본문 부족 skip: {url}")
        return "skip"

    # Fallback title = last URL path segment, capped at 90 characters.
    title = _page_title(html_text, fallback=url.rsplit("/", 1)[-1][:90])
    article_id = _article_hash(title, "static", source.name)
    # Second dedup gate: same hash (derived from title + source name) already stored.
    dup2 = await session.execute(
        select(Document).where(Document.file_hash == article_id).limit(1)
    )
    if dup2.scalars().first():
        return "dup"

    now = datetime.now(timezone.utc)
    raw_path = _raw_html_path(source.id, article_id, now)
    raw_saved = True
    try:
        _save_raw_html(raw_path, html_text)
    except OSError as e:
        # Failing to keep the raw HTML is logged but does not abort the ingest;
        # raw_html_path is then recorded as None below.
        raw_saved = False
        logger.error(f"[{source.name}] 원본 보존 실패 (ingest 는 진행): {e}")

    doc = Document(
        file_path=f"crawl/{source.name}/{article_id}",
        file_hash=article_id,
        file_format="article",
        file_size=0,  # recomputed below once extracted_text is final
        file_type="note",
        title=title,
        extracted_text=f"{title}\n\n{clean_body}",
        extracted_at=now,
        extractor_version=f"static+page@{engine}",
        md_content=clean_body,
        md_status="success",
        md_extraction_engine=engine,
        md_extraction_engine_version=engine_ver,
        md_format_version="1.0",
        md_generated_at=now,
        md_source_hash=hashlib.sha256(html_text.encode("utf-8", errors="replace")).hexdigest(),
        md_content_hash=hashlib.sha256(clean_body.encode("utf-8")).hexdigest(),
        content_origin="extracted",
        source_channel="crawl",
        data_origin="external",
        edit_url=normalized_url,
        review_status="approved",
        ai_domain="Engineering",
        ai_sub_group=source.name,
        ai_tags=[f"Engineering/{source.name}"],
        extract_meta={
            "source_id": source.id,
            "source_name": source.name,
            "published_at": None,  # static corpus — page publish dates are not trusted; indexing is unconditional via the channel gate
            "fulltext": {
                "status": "static_corpus",
                "engine": engine,
                "final_url": final_url,
                "raw_html_path": str(raw_path) if raw_saved else None,
                "body_chars": len(clean_body),
                "resolved_at": now.isoformat(),
            },
        },
    )
    # Size in bytes of the encoded extracted text.
    doc.file_size = len(doc.extracted_text.encode())
    session.add(doc)
    # Flush so doc.id is available for the enqueue calls.
    await session.flush()

    # crawl channel = index everything regardless of publish date
    # (original note: summarize goes to the Mac mini queue — D-4 lag is being watched)
    await enqueue_stage(session, doc.id, "summarize")
    await enqueue_stage(session, doc.id, "embed")
    await enqueue_stage(session, doc.id, "chunk")
    logger.info(f"[{source.name}] ingest {len(clean_body)}자 ({engine}): {title[:60]}")
    return "ok"
|
||||
|
||||
|
||||
async def run(corpus: str = "all", limit: int = 0) -> None:
    """Ingest one corpus, or all of them, article by article.

    For each corpus the NewsSource row is ensured, article URLs are
    discovered, and every URL is ingested in its own session and transaction.
    Skipped URLs are listed at the end so they can be retried by re-running.

    Args:
        corpus: A key of ``CORPORA`` or ``"all"``.
        limit: Per-corpus cap on the number of URLs processed; ``0`` means no cap.
    """
    targets = list(CORPORA) if corpus == "all" else [corpus]
    for key in targets:
        spec = CORPORA[key]
        async with async_session() as session:
            source = await _get_or_create_source(session, spec)
            # Read the primary key BEFORE commit: with expire_on_commit enabled
            # a post-commit attribute access would trigger implicit IO, which
            # an AsyncSession cannot perform. The id is already populated here
            # (loaded for an existing row, flushed for a new one).
            source_id = source.id
            await session.commit()

        try:
            urls = await spec["discover"]()
        except (CrawlBlocked, CrawlSkip, CrawlFetchError) as e:
            logger.error(f"[{spec['source_name']}] 목록 수집 실패 — corpus 건너뜀: {e}")
            continue
        if limit:
            urls = urls[:limit]
        logger.info(f"[{spec['source_name']}] 대상 {len(urls)}건 (limit={limit or '없음'})")

        counts = {"ok": 0, "dup": 0, "skip": 0}
        failed: list[str] = []
        for i, url in enumerate(urls, 1):
            # One session and one commit per article, so an interrupted batch
            # keeps everything ingested so far. Progress is logged every 10 items.
            async with async_session() as session:
                src = await session.get(NewsSource, source_id)
                status = await _ingest_one(session, src, url)
                await session.commit()
            counts[status] += 1
            if status == "skip":
                failed.append(url)
            if i % 10 == 0:
                logger.info(f"[{spec['source_name']}] 진행 {i}/{len(urls)} {counts}")

        logger.info(f"[{spec['source_name']}] 완료: {counts}")
        if failed:
            logger.warning(
                f"[{spec['source_name']}] skip {len(failed)}건 — 재시도는 CLI 재실행(멱등):\n "
                + "\n ".join(failed)
            )
|
||||
|
||||
|
||||
# CLI entry point: --corpus picks one registry key or "all" (default);
# --limit caps the number of URLs per corpus (0 = no cap).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="C-3 정적 코퍼스 일괄 ingest")
    parser.add_argument("--corpus", choices=[*CORPORA, "all"], default="all")
    parser.add_argument("--limit", type=int, default=0, help="corpus 당 상한 (0=전체)")
    args = parser.parse_args()
    asyncio.run(run(args.corpus, args.limit))
|
||||
@@ -2,27 +2,37 @@
|
||||
|
||||
P3 of family-adaptive-bengio (2026-05-23): 50k 초과 input 은 sliding window
|
||||
(cumulative carry-over) 로 분할 처리. 50k 이하 input 은 기존 동작 유지.
|
||||
|
||||
ds-macbook-offload-1: use_deep=True (queue_drain 전용) 시 맥북 M5 Max deep 슬롯으로
|
||||
호출 — 맥미니 백로그를 사용자가 의도적으로 분담시키는 수동 레버. 기본(consumer) 경로는
|
||||
use_deep=False 로 기존 동작 그대로. 맥북 불가 시 StageDeferred (강등 0, 부분 쓰기 0).
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ai.client import AIClient, strip_thinking
|
||||
from ai.client import AIClient, call_deep_or_defer, strip_thinking
|
||||
from core.utils import setup_logger
|
||||
from models.document import Document
|
||||
|
||||
logger = setup_logger("summarize_worker")
|
||||
|
||||
CHUNK_SIZE = 50000
|
||||
# client.summarize() 의 단일 프롬프트와 동일 문구 — deep 경로가 같은 과업을 수행하도록 고정
|
||||
SUMMARY_PROMPT_SINGLE = "다음 문서를 500자 이내로 요약해주세요:\n\n{text}"
|
||||
SUMMARY_PROMPT_CONTINUATION = (
|
||||
"이전 부분 요약:\n{prior}\n\n다음 부분:\n{text}\n\n"
|
||||
"위 두 정보를 합쳐 전체 문서를 500자 이내로 요약해주세요."
|
||||
)
|
||||
|
||||
|
||||
async def process(document_id: int, session: AsyncSession) -> None:
|
||||
"""문서 AI 요약 생성 (분류 없이 요약만)"""
|
||||
async def process(document_id: int, session: AsyncSession, *, use_deep: bool = False) -> None:
|
||||
"""문서 AI 요약 생성 (분류 없이 요약만).
|
||||
|
||||
use_deep: queue_drain 전용 — deep 슬롯(맥북) 경유. 슬롯 미구성 시 명시 에러
|
||||
(silent 강등 금지). consumer 기본 경로는 False (기존 동작 무변경).
|
||||
"""
|
||||
doc = await session.get(Document, document_id)
|
||||
if not doc:
|
||||
raise ValueError(f"문서 ID {document_id}를 찾을 수 없음")
|
||||
@@ -35,13 +45,29 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
return
|
||||
|
||||
client = AIClient()
|
||||
if use_deep and client.ai.deep is None:
|
||||
await client.close()
|
||||
raise ValueError("use_deep=True 인데 config.yaml ai.models.deep 슬롯 미구성 — silent 강등 금지")
|
||||
used_cfg = client.ai.deep if use_deep else client.ai.primary
|
||||
|
||||
async def _summarize_first(text_part: str) -> str:
|
||||
if use_deep:
|
||||
return await call_deep_or_defer(client, SUMMARY_PROMPT_SINGLE.format(text=text_part))
|
||||
return await client.summarize(text_part)
|
||||
|
||||
async def _summarize_continuation(prompt: str) -> str:
|
||||
if use_deep:
|
||||
return await call_deep_or_defer(client, prompt)
|
||||
return await client.call_primary(prompt)
|
||||
|
||||
try:
|
||||
text = doc.extracted_text
|
||||
total_chars = len(text)
|
||||
if total_chars <= CHUNK_SIZE:
|
||||
summary = await client.summarize(text)
|
||||
summary = await _summarize_first(text)
|
||||
logger.info(
|
||||
f"[요약] document_id={document_id}: single chunk ({total_chars}자)"
|
||||
+ (" via deep(맥북)" if use_deep else "")
|
||||
)
|
||||
else:
|
||||
chunks = [text[i:i + CHUNK_SIZE] for i in range(0, total_chars, CHUNK_SIZE)]
|
||||
@@ -52,10 +78,10 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
carry = ""
|
||||
for idx, chunk in enumerate(chunks):
|
||||
if idx == 0:
|
||||
partial = await client.summarize(chunk)
|
||||
partial = await _summarize_first(chunk)
|
||||
else:
|
||||
prompt = SUMMARY_PROMPT_CONTINUATION.format(prior=carry, text=chunk)
|
||||
partial = await client.call_primary(prompt)
|
||||
partial = await _summarize_continuation(prompt)
|
||||
carry = strip_thinking(partial)
|
||||
logger.info(
|
||||
f"[요약] document_id={document_id}: chunk {idx + 1}/{len(chunks)} done "
|
||||
@@ -63,8 +89,10 @@ async def process(document_id: int, session: AsyncSession) -> None:
|
||||
)
|
||||
summary = carry
|
||||
|
||||
# sleep-안전 불변식: 쓰기는 전체 완주 후에만 — 중간 절단은 StageDeferred 로 빠져
|
||||
# 이 지점에 도달하지 않는다 (carry 는 로컬 변수, doc 무변경).
|
||||
doc.ai_summary = strip_thinking(summary)
|
||||
doc.ai_model_version = client.ai.primary.model
|
||||
doc.ai_model_version = used_cfg.model
|
||||
doc.ai_processed_at = datetime.now(timezone.utc)
|
||||
logger.info(
|
||||
f"[요약] document_id={document_id}: {len(doc.ai_summary)}자 final"
|
||||
|
||||
+38
-3
@@ -64,6 +64,11 @@ services:
|
||||
environment:
|
||||
- HF_HOME=/models/huggingface
|
||||
- TORCH_HOME=/models/torch
|
||||
# D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~3.5GB) 해제가 90% 봉투의 전제.
|
||||
# /ready 는 idle 에서도 200 (fastapi depends_on service_healthy 유지).
|
||||
# 롤백 = MARKER_PRELOAD=1 + MARKER_IDLE_UNLOAD_MINUTES=0.
|
||||
- MARKER_PRELOAD=0
|
||||
- MARKER_IDLE_UNLOAD_MINUTES=${MARKER_IDLE_UNLOAD_MINUTES:-30}
|
||||
volumes:
|
||||
- ${NAS_NFS_PATH:-/mnt/nas/Document_Server}:/documents:ro
|
||||
- marker_models:/models
|
||||
@@ -97,6 +102,11 @@ services:
|
||||
- WHISPER_MODEL=${WHISPER_MODEL:-large-v3}
|
||||
- WHISPER_DEVICE=${WHISPER_DEVICE:-cuda}
|
||||
- WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE:-float16}
|
||||
# D-1 (crawl-24x7): idle-unload 전환 — 영구 점유(~4GB) 해제가 90% 봉투의 전제.
|
||||
# 콜드로드 수초~수십 초는 배치 작업이라 무방 (stt_worker read=1800s 가 흡수).
|
||||
# 롤백 = STT_PRELOAD=1 + STT_IDLE_UNLOAD_MINUTES=0.
|
||||
- STT_PRELOAD=0
|
||||
- STT_IDLE_UNLOAD_MINUTES=${STT_IDLE_UNLOAD_MINUTES:-30}
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
@@ -105,9 +115,9 @@ services:
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
healthcheck:
|
||||
# /ready: CUDA 디바이스 + 모델 적재 둘 다 확인. ready=true 만 healthy 처리.
|
||||
# /health 는 단순 liveness 라 모델 미적재 상태도 healthy 로 잡혀 운영 신호로 부적합.
|
||||
test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('ready') else 1)"]
|
||||
# D-1: idle-unload 도입으로 '모델 적재' 는 더 이상 상시 상태가 아님 — cuda 가용성만
|
||||
# healthy 기준. 모델 적재 여부는 /ready 의 models_loaded 필드로 관측(정보성).
|
||||
test: ["CMD", "python3", "-c", "import json,urllib.request,sys; r=urllib.request.urlopen('http://localhost:3300/ready'); sys.exit(0 if json.load(r).get('cuda') else 1)"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -229,6 +239,31 @@ services:
|
||||
- fastapi
|
||||
restart: unless-stopped
|
||||
|
||||
# crawl-24x7 A-8 1차: 전 소스 헬스 패널 — 내부 전용 (읽기 전용 SELECT 만).
|
||||
# '내부 전용' 성립 구현 = 별도 바인딩뿐 (r4 결정): Tailscale 인터페이스에만 publish.
|
||||
# 기존 SvelteKit 라우트(vhost=Host 헤더 검사=앱 가드 환원)나 프록시 경로 차단(경로 가드
|
||||
# 회귀)으로 옮기지 말 것. caddy/home-caddy 라우트 추가 금지. fastapi/postgres 바인딩 선례.
|
||||
crawl-health:
|
||||
build: ./services/crawl-health
|
||||
ports:
|
||||
- "100.110.63.63:8765:8765"
|
||||
environment:
|
||||
- CRAWL_HEALTH_DSN=postgresql://pkm:${POSTGRES_PASSWORD}@postgres:5432/pkm
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
|
||||
# crawl-24x7 B-3: 구독 세션 Playwright fetch 격리 — internal-only (host 포트·caddy 라우트 금지).
|
||||
# 브라우저 hang/크래시가 fastapi APScheduler 를 잠식하지 않게 별도 컨테이너 + mem cap.
|
||||
# 세션 파일(쿠키=credential 등가물)은 repo 밖 호스트 경로 ro mount (600, gitignore 무관 영역).
|
||||
playwright-fetcher:
|
||||
build: ./services/playwright-fetcher
|
||||
volumes:
|
||||
- /home/hyungi/.local/share/crawl-auth:/auth:ro
|
||||
mem_limit: 2g
|
||||
restart: unless-stopped
|
||||
|
||||
caddy:
|
||||
image: caddy:2
|
||||
ports:
|
||||
|
||||
@@ -172,6 +172,61 @@ export async function api<T = unknown>(
|
||||
return res.json();
|
||||
}
|
||||
|
||||
/**
 * Raw fetch helper for endpoints where parsing the whole body as JSON is not
 * appropriate (SSE / streaming).
 *
 * Shares the request policy of api<T>():
 *   - attaches the access token when one is set
 *   - on 401, attempts one token refresh and retries the request once
 *   - sets Content-Type to application/json for any non-FormData body
 *
 * Difference: the Response is returned as-is; checking the status and
 * consuming the body are the caller's responsibility.
 *
 * Caller headers may be given in any HeadersInit form (plain record, tuple
 * array, or Headers instance); they are normalized through `Headers`, so
 * header names are matched case-insensitively.
 *
 * PR-Eid-Chat: the `/api/eid/chat` SSE stream is the first consumer. Additive
 * export only; api()/uploadFile() are not changed.
 */
export async function apiFetchRaw(
  path: string,
  options: RequestInit = {},
): Promise<Response> {
  // `new Headers(init)` accepts every HeadersInit shape. Spreading
  // `options.headers` as a record would silently drop a Headers instance.
  const headers = new Headers(options.headers);

  if (accessToken) {
    headers.set('Authorization', `Bearer ${accessToken}`);
  }
  if (options.body && !(options.body instanceof FormData)) {
    headers.set('Content-Type', 'application/json');
  }

  const res = await fetch(`${API_BASE}${path}`, {
    ...options,
    headers,
    credentials: 'include',
  });

  // 401 -> try one refresh (same policy as api(); auth endpoints excluded)
  const isAuthEndpoint = path.startsWith('/auth/login') || path.startsWith('/auth/refresh');
  if (res.status === 401 && accessToken && !isAuthEndpoint) {
    try {
      await handleTokenRefresh();
    } catch {
      // Refresh failed. Return the original 401 Response untouched so the
      // caller does not mistake it for a network error (its body has not
      // been consumed, so it is still usable).
      return res;
    }
    // Retry once with the token as it stands after the refresh.
    headers.set('Authorization', `Bearer ${accessToken}`);
    return fetch(`${API_BASE}${path}`, {
      ...options,
      headers,
      credentials: 'include',
    });
  }

  return res;
}
|
||||
|
||||
/**
|
||||
* 업로드 전용 헬퍼 — XMLHttpRequest 기반.
|
||||
*
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
<script lang="ts">
  // Processing-status drawer (light variant of design option 6): opens from the
  // right when the status strip shown on every page is clicked.
  // Condensed view: three machine mini-cards, one ETA line, the failure total and
  // a link to the home page. The home board carries the details.
  // Data comes from the shared queueOverview store (per the original note it polls
  // every 60s and becomes null on failure, in which case a notice is shown instead).
  // Open state lives in uiState's single drawer slot ('queue'); per the original
  // note this prevents it from being open together with the sidebar drawer.
  import { X } from 'lucide-svelte';
  import { ui } from '$lib/stores/uiState.svelte';
  import { queueOverview } from '$lib/stores/queueOverview';
  import {
    MACHINE_STATE_LABEL, machineChipClass, machineDotClass, formatRate, etaPhrase,
  } from '$lib/utils/queueDisplay';
  import IconButton from '$lib/components/ui/IconButton.svelte';

  let open = $derived(ui.isDrawerOpen('queue'));
  let data = $derived($queueOverview);

  function close() {
    ui.closeDrawer();
  }

  // Close on ESC. Per the original note, overlapping with the layout-wide handler
  // (ui.handleEscape) is harmless because closing is idempotent.
  // When the modal stack is non-empty the modal takes precedence (same as the
  // global priority), so the drawer stays open.
  function onWindowKeydown(e: KeyboardEvent) {
    if (e.key === 'Escape' && open && ui.modalStack.length === 0) close();
  }
</script>
|
||||
|
||||
<svelte:window onkeydown={onWindowKeydown} />
|
||||
|
||||
{#if open}
|
||||
<div class="fixed inset-0 z-drawer">
|
||||
<!-- 스크림 — 클릭 시 닫기 -->
|
||||
<button
|
||||
type="button"
|
||||
onclick={close}
|
||||
class="absolute inset-0 bg-scrim transition-opacity"
|
||||
aria-label="드로어 닫기"
|
||||
></button>
|
||||
|
||||
<!-- 패널 — div + role="dialog" (aside 는 interactive role 불가, a11y 경고) -->
|
||||
<div
|
||||
role="dialog"
|
||||
aria-modal="true"
|
||||
aria-label="처리 현황"
|
||||
class="absolute right-0 top-0 bottom-0 w-rail max-w-full bg-sidebar shadow-xl overflow-y-auto"
|
||||
>
|
||||
<div class="flex items-center justify-between px-4 h-12 border-b border-default">
|
||||
<span class="text-sm font-bold text-text">처리 현황</span>
|
||||
<IconButton icon={X} size="sm" aria-label="닫기" onclick={close} />
|
||||
</div>
|
||||
|
||||
<div class="p-4 space-y-3">
|
||||
{#if data}
|
||||
<!-- 머신 미니카드 3 -->
|
||||
{#each data.machines as m (m.key)}
|
||||
<div class="bg-surface border border-default rounded-lg px-3.5 py-2.5">
|
||||
<div class="flex items-center justify-between gap-2">
|
||||
<span class="flex items-center gap-2 text-[13px] font-semibold text-text min-w-0">
|
||||
<span class="w-2 h-2 rounded-full shrink-0 {machineDotClass(m.state)}"></span>
|
||||
<span class="truncate">{m.label}</span>
|
||||
</span>
|
||||
<span class="text-[10px] font-bold rounded-full px-2 py-0.5 shrink-0 {machineChipClass(m.state)}">
|
||||
{MACHINE_STATE_LABEL[m.state]}
|
||||
</span>
|
||||
</div>
|
||||
<div class="text-[11px] text-dim mt-1 tabular-nums">
|
||||
대기 <strong class="text-text">{m.pending.toLocaleString()}</strong>
|
||||
· 오늘 <strong class="text-text">{m.done_today.toLocaleString()}</strong>건 처리
|
||||
</div>
|
||||
</div>
|
||||
{/each}
|
||||
|
||||
<!-- ETA 한 줄 (안5 라이트 — 추정치) -->
|
||||
<div
|
||||
class="text-[11px] text-dim leading-relaxed tabular-nums"
|
||||
title="현재 페이스 기반 추정치 — 유입 변동 시 달라질 수 있습니다"
|
||||
>
|
||||
요약 대기 <strong class="text-text">{data.summarize_eta.pending.toLocaleString()}건</strong>
|
||||
— 소화 {formatRate(data.summarize_eta.done_rate_1h)}/h
|
||||
· 유입 {formatRate(data.summarize_eta.inflow_rate_1h)}/h
|
||||
{#if data.summarize_eta.eta_minutes != null}
|
||||
· <span class="text-accent font-semibold">{etaPhrase(data.summarize_eta.eta_minutes)}</span>
|
||||
{:else}
|
||||
· 유입 우세(백필 중)
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
<!-- 실패 합계 -->
|
||||
{#if data.totals.failed > 0}
|
||||
<div class="text-[11px] font-semibold text-error bg-error/10 rounded-md px-2.5 py-1.5 tabular-nums">
|
||||
실패 {data.totals.failed.toLocaleString()}건 — 확인 필요
|
||||
</div>
|
||||
{/if}
|
||||
{:else}
|
||||
<p class="text-xs text-dim">처리 현황을 불러오지 못했습니다.</p>
|
||||
{/if}
|
||||
|
||||
<a
|
||||
href="/"
|
||||
onclick={close}
|
||||
class="block text-xs text-accent font-semibold hover:underline pt-1"
|
||||
>홈에서 자세히 →</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
@@ -2,7 +2,7 @@
|
||||
import { page } from '$app/stores';
|
||||
import { goto } from '$app/navigation';
|
||||
import { api } from '$lib/api';
|
||||
import { ChevronRight, ChevronDown, FolderOpen, FolderTree, Inbox, Clock, Mail, Scale, StickyNote, GraduationCap, CalendarCheck } from 'lucide-svelte';
|
||||
import { ChevronRight, ChevronDown, FolderOpen, FolderTree, Inbox, Clock, Mail, Scale, StickyNote, GraduationCap, CalendarCheck, MessageCircle } from 'lucide-svelte';
|
||||
|
||||
let tree = $state([]);
|
||||
let loading = $state(true);
|
||||
@@ -229,6 +229,16 @@
|
||||
공부
|
||||
</span>
|
||||
</a>
|
||||
<a
|
||||
href="/chat"
|
||||
class="flex items-center justify-between px-3 py-2 rounded-md text-sm transition-colors
|
||||
{$page.url.pathname.startsWith('/chat') ? 'bg-accent/15 text-accent' : 'text-text hover:bg-surface'}"
|
||||
>
|
||||
<span class="flex items-center gap-2">
|
||||
<MessageCircle size={16} />
|
||||
이드
|
||||
</span>
|
||||
</a>
|
||||
<a
|
||||
href="/inbox"
|
||||
class="flex items-center justify-between px-3 py-2 rounded-md text-sm text-text hover:bg-surface transition-colors"
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
/**
|
||||
* 이드 채팅(/chat) 공유 상수 (PR-Eid-Chat).
|
||||
*
|
||||
* localStorage 이력 키 — 채팅 표면(routes/chat/+page.svelte)의 저장/복원과
|
||||
* logout(stores/auth.ts)의 이력 제거가 같은 키를 보도록 단일 상수로 공유한다
|
||||
* (본문 무로깅 posture 정합: 로그아웃 시 브라우저에 대화 본문을 남기지 않음).
|
||||
*/
|
||||
export const EID_CHAT_STORAGE_KEY = 'eid_chat:v1';
|
||||
@@ -1,5 +1,6 @@
|
||||
import { writable } from 'svelte/store';
|
||||
import { api, setAccessToken } from '$lib/api';
|
||||
import { EID_CHAT_STORAGE_KEY } from '$lib/eidChat';
|
||||
|
||||
interface User {
|
||||
id: number;
|
||||
@@ -39,6 +40,14 @@ export async function logout() {
|
||||
setAccessToken(null);
|
||||
user.set(null);
|
||||
isAuthenticated.set(false);
|
||||
// 본문 무로깅 posture 정합 — 로그아웃 시 이드 대화 이력도 브라우저에서 제거
|
||||
if (typeof window !== 'undefined') {
|
||||
try {
|
||||
window.localStorage.removeItem(EID_CHAT_STORAGE_KEY);
|
||||
} catch {
|
||||
// 이력 제거 실패가 logout 자체를 막지는 않음
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export async function tryRefresh() {
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
// 처리 큐 overview store — GET /api/queue/overview 를 60초 주기로 폴링.
|
||||
// system.ts 의 dashboardSummary 와 같은 구독 기반 패턴 (첫 subscribe 시 시작).
|
||||
//
|
||||
// 의도적으로 api() 헬퍼를 쓰지 않는다 — 폴링 경로의 401 이 refresh 실패 →
|
||||
// window.location='/login' 강제 logout 부수효과를 일으키면 안 됨 (eid 리뷰
|
||||
// finding 재발 방지). 백엔드 미배포(404)/401/네트워크 실패 전부 silent 하게
|
||||
// null 로 수렴하고, 소비자(스트립/보드/드로어)는 null 이면 스스로 숨는다.
|
||||
|
||||
import { writable } from 'svelte/store';
|
||||
import { browser } from '$app/environment';
|
||||
import { getAccessToken } from '$lib/api';
|
||||
import type { QueueOverview } from '$lib/types/queue';
|
||||
|
||||
const POLL_INTERVAL_MS = 60_000;
|
||||
|
||||
let pollHandle: ReturnType<typeof setInterval> | null = null;
|
||||
let subscriberCount = 0;
|
||||
let inFlight: Promise<void> | null = null;
|
||||
|
||||
const internal = writable<QueueOverview | null>(null, (_set) => {
|
||||
subscriberCount += 1;
|
||||
if (subscriberCount === 1 && browser) {
|
||||
void refreshQueueOverview();
|
||||
pollHandle = setInterval(() => void refreshQueueOverview(), POLL_INTERVAL_MS);
|
||||
}
|
||||
return () => {
|
||||
subscriberCount -= 1;
|
||||
if (subscriberCount === 0 && pollHandle) {
|
||||
clearInterval(pollHandle);
|
||||
pollHandle = null;
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
export const queueOverview = { subscribe: internal.subscribe };
|
||||
|
||||
/**
 * Lightweight fetch of `GET /api/queue/overview`.
 *
 * Every failure — a non-2xx status or a thrown error (network, JSON parse) —
 * resolves to `null` rather than rejecting, so polling stays silent and
 * non-blocking and never triggers a forced-logout path.
 *
 * @returns the parsed overview, or `null` when the request did not succeed.
 */
async function fetchOverview(): Promise<QueueOverview | null> {
  try {
    const accessToken = getAccessToken();
    // Attach the bearer token only when one is currently held.
    const headers: Record<string, string> = accessToken
      ? { Authorization: `Bearer ${accessToken}` }
      : {};
    const response = await fetch('/api/queue/overview', { headers, credentials: 'include' });
    return response.ok ? ((await response.json()) as QueueOverview) : null;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Manual / additional refresh entry point. The home page calls this on its
 * own 30s interval on top of the store's polling.
 *
 * Concurrent calls are coalesced: while a fetch is in flight every caller
 * receives the same promise, so at most one request is outstanding.
 * Does nothing outside the browser.
 */
export async function refreshQueueOverview(): Promise<void> {
  if (!browser) return;

  if (inFlight === null) {
    const run = async (): Promise<void> => {
      try {
        const overview = await fetchOverview();
        internal.set(overview);
      } finally {
        // Clear the slot whether or not the update succeeded so the next call starts fresh.
        inFlight = null;
      }
    };
    inFlight = run();
  }

  return inFlight;
}
|
||||
@@ -3,7 +3,9 @@
|
||||
// (toast는 별도 store. drawer가 persistent inline panel(예: xl+ meta rail)일 때는
|
||||
// 여기 시스템 밖이다 — 그저 레이아웃의 일부.)
|
||||
|
||||
type Drawer = { id: 'sidebar' | 'meta' } | null;
|
||||
// 'queue' = 처리 현황 드로어 (상태 스트립 클릭 시 우측) — 단일 slot 규칙 동일
|
||||
export type DrawerId = 'sidebar' | 'meta' | 'queue';
|
||||
type Drawer = { id: DrawerId } | null;
|
||||
type Modal = { id: string };
|
||||
|
||||
class UIState {
|
||||
@@ -11,14 +13,14 @@ class UIState {
|
||||
modalStack = $state<Modal[]>([]);
|
||||
|
||||
// ── Drawer (단일 slot) ──────────────────────────────
|
||||
openDrawer(id: 'sidebar' | 'meta') {
|
||||
openDrawer(id: DrawerId) {
|
||||
// 새 drawer 열면 이전 drawer는 자동으로 사라진다 (단일 slot)
|
||||
this.drawer = { id };
|
||||
}
|
||||
closeDrawer() {
|
||||
this.drawer = null;
|
||||
}
|
||||
isDrawerOpen(id: 'sidebar' | 'meta') {
|
||||
isDrawerOpen(id: DrawerId) {
|
||||
return this.drawer?.id === id;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* GET /api/queue/overview 응답 타입.
|
||||
*
|
||||
* Backend 는 병렬 트랙에서 구현 중 — 계약 고정 (feat/ds-processing-board).
|
||||
* 필드 변경 시 양쪽 동시 수정 필수.
|
||||
*/
|
||||
|
||||
export type MachineKey = 'gpu' | 'macmini' | 'macbook';
|
||||
|
||||
/** 머신 상태 — active(가동) / deferred(보류) / idle(대기) */
|
||||
export type MachineState = 'active' | 'deferred' | 'idle';
|
||||
|
||||
/** 머신이 지금 처리 중인 문서 1건 */
|
||||
export interface MachineCurrentItem {
|
||||
document_id: number;
|
||||
title: string;
|
||||
stage: string;
|
||||
}
|
||||
|
||||
export interface MachineOverview {
|
||||
key: MachineKey;
|
||||
label: string;
|
||||
state: MachineState;
|
||||
/** 담당 단계 키 목록 (extract/classify/... — 홈 STAGE_LABEL 로 한글화) */
|
||||
stages: string[];
|
||||
pending: number;
|
||||
processing: number;
|
||||
failed: number;
|
||||
/** 최근 1시간 완료 건수 (처리율 N/h 표기) */
|
||||
done_1h: number;
|
||||
done_today: number;
|
||||
/** 보류 건수 — 맥북 sleep 등으로 자동 재개 대기 중 */
|
||||
deferred_pending: number;
|
||||
current: MachineCurrentItem[];
|
||||
}
|
||||
|
||||
/** 요약 백로그 ETA (안5 라이트) — 추정치, 유입 변동 시 오차 */
|
||||
export interface SummarizeEta {
|
||||
pending: number;
|
||||
done_rate_1h: number;
|
||||
inflow_rate_1h: number;
|
||||
/** null = 유입이 소화를 앞섬 (백필 중) — 소진 예상 불가 */
|
||||
eta_minutes: number | null;
|
||||
}
|
||||
|
||||
/** 시간당 유입 vs 소화 (이번 트랙 미렌더 — 후속 추세 위젯 슬롯) */
|
||||
export interface TrendPoint {
|
||||
hour: string;
|
||||
inflow: number;
|
||||
done: number;
|
||||
}
|
||||
|
||||
export interface QueueTotals {
|
||||
pending: number;
|
||||
processing: number;
|
||||
failed: number;
|
||||
}
|
||||
|
||||
export interface QueueStageRow {
|
||||
stage: string;
|
||||
pending: number;
|
||||
processing: number;
|
||||
failed: number;
|
||||
done_today: number;
|
||||
oldest_pending_age_sec: number | null;
|
||||
}
|
||||
|
||||
export interface QueueOverview {
|
||||
machines: MachineOverview[];
|
||||
summarize_eta: SummarizeEta;
|
||||
trend_24h: TrendPoint[];
|
||||
stages: QueueStageRow[];
|
||||
totals: QueueTotals;
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
// 처리 머신 보드 / 상태 스트립 / 드로어 공용 표시 헬퍼.
|
||||
// 상태 표현은 dot + 칩 (이모지 금지 원칙) — 토큰 클래스만 사용.
|
||||
|
||||
import type { MachineState } from '$lib/types/queue';
|
||||
|
||||
/** 머신 상태 한글 라벨 */
|
||||
export const MACHINE_STATE_LABEL: Record<MachineState, string> = {
|
||||
active: '가동',
|
||||
deferred: '보류',
|
||||
idle: '대기',
|
||||
};
|
||||
|
||||
/**
 * Background class for the machine status dot:
 * active → success, deferred → warning, anything else → faint.
 */
export function machineDotClass(state: MachineState): string {
  switch (state) {
    case 'active':
      return 'bg-success';
    case 'deferred':
      return 'bg-warning';
    default:
      return 'bg-faint';
  }
}
|
||||
|
||||
/**
 * Tone classes for the machine status chip:
 * active → accent, deferred → warning, anything else → dimmed surface.
 */
export function machineChipClass(state: MachineState): string {
  switch (state) {
    case 'active':
      return 'bg-accent/10 text-accent';
    case 'deferred':
      return 'bg-warning/10 text-warning';
    default:
      return 'bg-surface-hover text-faint';
  }
}
|
||||
|
||||
/**
 * Format a throughput figure for display.
 *
 * Integers are rendered as-is; non-integers are rendered with exactly one
 * fraction digit. Both paths go through `toLocaleString`, so grouping and the
 * decimal separator stay consistent (previously 1234 rendered with grouping
 * while 1234.5 rendered without it, because the fractional path used `toFixed`).
 *
 * @param n rate value (e.g. items per hour)
 * @returns locale-formatted string
 */
export function formatRate(n: number): string {
  if (Number.isInteger(n)) return n.toLocaleString();
  return n.toLocaleString(undefined, { minimumFractionDigits: 1, maximumFractionDigits: 1 });
}
|
||||
|
||||
/**
 * Turn an ETA in minutes into a short phrase ("약 N분/N시간 후 소진 예상").
 *
 * The value is an estimate; surfacing that caveat (e.g. via a `title`) is the
 * caller's responsibility.
 *
 * The minutes/hours branch is chosen from the *rounded* minute count, so
 * values in [59.5, 60) read as "약 1시간" instead of the awkward "약 60분".
 * Sub-minute values are floored to 1 minute. Hours below 10 keep one decimal;
 * 10 hours and above are rounded to a whole number.
 *
 * @param minutes estimated minutes until the backlog drains
 */
export function etaPhrase(minutes: number): string {
  const wholeMinutes = Math.round(minutes);
  if (wholeMinutes < 60) return `약 ${Math.max(1, wholeMinutes)}분 후 소진 예상`;
  const hours = minutes / 60;
  const text = hours >= 10 ? String(Math.round(hours)) : String(Math.round(hours * 10) / 10);
  return `약 ${text}시간 후 소진 예상`;
}
|
||||
@@ -3,13 +3,16 @@
|
||||
import { browser } from '$app/environment';
|
||||
import { page } from '$app/stores';
|
||||
import { goto } from '$app/navigation';
|
||||
import { Menu, EllipsisVertical, ChevronDown, FileText, Newspaper, HelpCircle, StickyNote, Inbox, PanelLeft } from 'lucide-svelte';
|
||||
import { Menu, EllipsisVertical, ChevronDown, FileText, Newspaper, HelpCircle, StickyNote, Inbox, PanelLeft, MessageCircle } from 'lucide-svelte';
|
||||
import { isAuthenticated, user, tryRefresh, logout } from '$lib/stores/auth';
|
||||
import { toasts, removeToast } from '$lib/stores/toast';
|
||||
import { refresh as refreshPublicConfig } from '$lib/stores/config';
|
||||
import { ui } from '$lib/stores/uiState.svelte';
|
||||
import { queueOverview } from '$lib/stores/queueOverview';
|
||||
import { MACHINE_STATE_LABEL, machineChipClass } from '$lib/utils/queueDisplay';
|
||||
import Sidebar from '$lib/components/Sidebar.svelte';
|
||||
import SystemStatusDot from '$lib/components/SystemStatusDot.svelte';
|
||||
import QueueDrawer from '$lib/components/QueueDrawer.svelte';
|
||||
import QuickMemoButton from '$lib/components/QuickMemoButton.svelte';
|
||||
import IconButton from '$lib/components/ui/IconButton.svelte';
|
||||
import Drawer from '$lib/components/ui/Drawer.svelte';
|
||||
@@ -65,6 +68,15 @@
|
||||
let showChrome = $derived($isAuthenticated && !NO_CHROME_PATHS.some(p => $page.url.pathname.startsWith(p)));
|
||||
let showSidebar = $derived(showChrome && !NO_SIDEBAR_PATHS.some(p => $page.url.pathname.startsWith(p)));
|
||||
|
||||
// 처리 현황 스트립 (안6 라이트) — 60s 폴링 store 공유. fetch 실패/401 시
|
||||
// store 가 null → 스트립 자체를 숨김 (silent 비차단, 로그인 페이지 동일).
|
||||
let queue = $derived($queueOverview);
|
||||
let queueMacbook = $derived(queue?.machines?.find((m) => m.key === 'macbook') ?? null);
|
||||
function toggleQueueDrawer() {
|
||||
if (ui.isDrawerOpen('queue')) ui.closeDrawer();
|
||||
else ui.openDrawer('queue');
|
||||
}
|
||||
|
||||
function handleKeydown(e) {
|
||||
if (e.key === '/' && !['INPUT', 'TEXTAREA'].includes(document.activeElement?.tagName)) {
|
||||
e.preventDefault();
|
||||
@@ -140,6 +152,7 @@
|
||||
</div>
|
||||
|
||||
<a href="/ask" class="px-3 py-1.5 rounded-md text-sm font-semibold transition-colors {isActive('/ask') ? 'text-accent bg-accent/12' : 'text-dim hover:text-text hover:bg-surface'}">질문</a>
|
||||
<a href="/chat" class="px-3 py-1.5 rounded-md text-sm font-semibold transition-colors {isActive('/chat') ? 'text-accent bg-accent/12' : 'text-dim hover:text-text hover:bg-surface'}">이드</a>
|
||||
<SystemStatusDot />
|
||||
</div>
|
||||
|
||||
@@ -161,6 +174,28 @@
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<!-- 전 페이지 상태 스트립 (안6 라이트) — 클릭 시 우측 처리 현황 드로어 토글 -->
|
||||
{#if queue}
|
||||
<button
|
||||
type="button"
|
||||
onclick={toggleQueueDrawer}
|
||||
aria-expanded={ui.isDrawerOpen('queue')}
|
||||
aria-label="처리 현황 자세히 보기"
|
||||
class="flex items-center gap-3 px-4 py-1.5 border-b border-default bg-surface text-[11px] text-dim shrink-0 text-left hover:bg-surface-hover transition-colors overflow-x-auto"
|
||||
>
|
||||
<span class="flex items-center gap-1.5 shrink-0">
|
||||
<span class="w-2 h-2 rounded-full {queue.totals.processing > 0 ? 'bg-success' : 'bg-faint'}"></span>
|
||||
<strong class="text-text font-semibold tabular-nums">처리 중 {queue.totals.processing.toLocaleString()}</strong>
|
||||
</span>
|
||||
<span class="tabular-nums shrink-0">대기 <strong class="text-text">{queue.totals.pending.toLocaleString()}</strong></span>
|
||||
<span class="tabular-nums shrink-0 {queue.totals.failed > 0 ? 'text-error font-semibold' : ''}">실패 <strong class={queue.totals.failed > 0 ? '' : 'text-text'}>{queue.totals.failed.toLocaleString()}</strong></span>
|
||||
{#if queueMacbook}
|
||||
<span class="text-[10px] font-bold rounded-full px-2 py-0.5 shrink-0 {machineChipClass(queueMacbook.state)}">맥북 {MACHINE_STATE_LABEL[queueMacbook.state]}</span>
|
||||
{/if}
|
||||
<span class="ml-auto flex items-center gap-0.5 text-faint shrink-0">자세히 <ChevronDown size={11} /></span>
|
||||
</button>
|
||||
{/if}
|
||||
|
||||
<!-- 메인: 데스크탑 상시 사이드바 + 콘텐츠 -->
|
||||
<div class="flex-1 min-h-0 flex">
|
||||
{#if showSidebar}
|
||||
@@ -178,6 +213,7 @@
|
||||
<a href="/documents" aria-current={docsActive ? 'page' : undefined} class="flex-1 flex flex-col items-center justify-center gap-1 py-2 text-[10px] font-semibold transition-colors {docsActive ? 'text-accent' : 'text-dim'}"><FileText size={18} strokeWidth={1.9} /> 문서</a>
|
||||
<a href="/news" aria-current={newsActive ? 'page' : undefined} class="flex-1 flex flex-col items-center justify-center gap-1 py-2 text-[10px] font-semibold transition-colors {newsActive ? 'text-accent' : 'text-dim'}"><Newspaper size={18} strokeWidth={1.9} /> 뉴스</a>
|
||||
<a href="/ask" aria-current={isActive('/ask') ? 'page' : undefined} class="flex-1 flex flex-col items-center justify-center gap-1 py-2 text-[10px] font-semibold transition-colors {isActive('/ask') ? 'text-accent' : 'text-dim'}"><HelpCircle size={18} strokeWidth={1.9} /> 질문</a>
|
||||
<a href="/chat" aria-current={isActive('/chat') ? 'page' : undefined} class="flex-1 flex flex-col items-center justify-center gap-1 py-2 text-[10px] font-semibold transition-colors {isActive('/chat') ? 'text-accent' : 'text-dim'}"><MessageCircle size={18} strokeWidth={1.9} /> 이드</a>
|
||||
<a href="/memos" aria-current={isActive('/memos') ? 'page' : undefined} class="flex-1 flex flex-col items-center justify-center gap-1 py-2 text-[10px] font-semibold transition-colors {isActive('/memos') ? 'text-accent' : 'text-dim'}"><StickyNote size={18} strokeWidth={1.9} /> 메모</a>
|
||||
<button onclick={() => ui.openDrawer('sidebar')} class="flex-1 flex flex-col items-center justify-center gap-1 py-2 text-[10px] font-semibold text-dim"><Menu size={18} strokeWidth={1.9} /> 더보기</button>
|
||||
</nav>
|
||||
@@ -189,6 +225,9 @@
|
||||
</Drawer>
|
||||
</div>
|
||||
|
||||
<!-- 처리 현황 드로어 (안6 라이트, 스트립 클릭 시 우측) -->
|
||||
<QueueDrawer />
|
||||
|
||||
<!-- 빠른 메모 FAB -->
|
||||
<QuickMemoButton />
|
||||
</div>
|
||||
|
||||
@@ -13,6 +13,11 @@
|
||||
import { domainBgClass, domainLabel } from '$lib/utils/domainSlug';
|
||||
import { user } from '$lib/stores/auth';
|
||||
import { api } from '$lib/api';
|
||||
import { queueOverview, refreshQueueOverview } from '$lib/stores/queueOverview';
|
||||
import {
|
||||
MACHINE_STATE_LABEL, machineChipClass, machineDotClass, formatRate, etaPhrase,
|
||||
} from '$lib/utils/queueDisplay';
|
||||
import type { QueueOverview } from '$lib/types/queue';
|
||||
import EmptyState from '$lib/components/ui/EmptyState.svelte';
|
||||
import Skeleton from '$lib/components/ui/Skeleton.svelte';
|
||||
import {
|
||||
@@ -125,6 +130,28 @@
|
||||
preview: '미리보기', thumbnail: '썸네일',
|
||||
};
|
||||
|
||||
// ─── 처리 머신 보드 (안2) + ETA (안5 라이트) — GET /api/queue/overview ───
|
||||
// 홈은 30s 폴링 (store 기본 60s 위에 추가 — inFlight 합치기로 중복 호출 0).
|
||||
// 백엔드 미배포/실패 시 store=null → 보드 자체가 조용히 생략 (silent 비차단).
|
||||
let queue = $derived<QueueOverview | null>($queueOverview);
|
||||
|
||||
// 머신 담당 단계 라벨 — STAGE_LABEL 재사용 + overview 전용 단계 보강
|
||||
// (backend services/queue_overview.py _STAGE_ORDER 와 동기), 미지 키는 raw
|
||||
const QUEUE_STAGE_LABEL: Record<string, string> = {
|
||||
...STAGE_LABEL,
|
||||
summarize: '요약', chunk: '청크', markdown: '마크다운',
|
||||
fulltext: '전문', deep_summary: '심층분석',
|
||||
};
|
||||
function queueStageLabel(stage: string): string {
|
||||
return QUEUE_STAGE_LABEL[stage] ?? stage;
|
||||
}
|
||||
|
||||
onMount(() => {
|
||||
void refreshQueueOverview();
|
||||
const handle = setInterval(() => void refreshQueueOverview(), 30_000);
|
||||
return () => clearInterval(handle);
|
||||
});
|
||||
|
||||
interface PipelineRow {
|
||||
stage: string; label: string;
|
||||
pending: number; processing: number; failed: number; total: number;
|
||||
@@ -172,7 +199,20 @@
|
||||
let totalProcessing = $derived(pipelineRows.reduce((s, r) => s + r.processing, 0));
|
||||
|
||||
let pipelineManualClosed = $state(false);
|
||||
let pipelineOpen = $derived(pipelineManualClosed ? false : totalFailed > 0);
|
||||
let pipelineOpen = $derived(
|
||||
pipelineManualClosed ? false : (queue?.totals.failed ?? totalFailed) > 0
|
||||
);
|
||||
|
||||
// 단계별 현황 (2026-06-11 피드백 재설계: 완료가 보여야 한다 — overview.stages 단일 소스)
|
||||
// active = 오늘 움직임이 있는 단계만, idle = 전부 0 인 단계는 한 줄로 숨김.
|
||||
let stageRows = $derived(queue?.stages ?? []);
|
||||
let activeStageRows = $derived(
|
||||
stageRows.filter((r) => r.pending + r.processing + r.failed + r.done_today > 0)
|
||||
);
|
||||
let idleStageRows = $derived(
|
||||
stageRows.filter((r) => r.pending + r.processing + r.failed + r.done_today === 0)
|
||||
);
|
||||
let stageDoneToday = $derived(stageRows.reduce((s, r) => s + r.done_today, 0));
|
||||
|
||||
function formatAge(sec: number | null): string {
|
||||
if (sec == null || sec <= 0) return '';
|
||||
@@ -420,7 +460,68 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- ═══ 파이프라인 상세 (실패 있을 때 자동 펼침) ═══ -->
|
||||
<!-- ═══ 처리 머신 보드 (안2) + ETA 라인 (안5 라이트) ═══ -->
|
||||
{#if queue}
|
||||
<div class="mt-5">
|
||||
<div class="text-[11px] font-bold text-dim uppercase tracking-wider mb-3">처리 머신</div>
|
||||
<div class="grid grid-cols-1 md:grid-cols-3 gap-3">
|
||||
{#each queue.machines as m (m.key)}
|
||||
<div class="bg-surface border border-default rounded-card p-4">
|
||||
<!-- 헤더: 상태 dot + 라벨 + state 칩 -->
|
||||
<div class="flex items-center justify-between gap-2 mb-2">
|
||||
<span class="flex items-center gap-2 text-[13px] font-bold text-text min-w-0">
|
||||
<span class="w-2 h-2 rounded-full shrink-0 {machineDotClass(m.state)}"></span>
|
||||
<span class="truncate">{m.label}</span>
|
||||
</span>
|
||||
<span class="text-[10px] font-bold rounded-full px-2 py-0.5 shrink-0 {machineChipClass(m.state)}">
|
||||
{MACHINE_STATE_LABEL[m.state]}
|
||||
</span>
|
||||
</div>
|
||||
<!-- 담당 단계 칩 -->
|
||||
{#if m.stages.length > 0}
|
||||
<div class="flex flex-wrap gap-1 mb-2.5">
|
||||
{#each m.stages as s (s)}
|
||||
<span class="text-[10px] font-semibold rounded-full px-2 py-0.5 bg-surface-hover text-dim">{queueStageLabel(s)}</span>
|
||||
{/each}
|
||||
</div>
|
||||
{/if}
|
||||
<!-- 대기 · 처리율 · 오늘 -->
|
||||
<div class="text-xs text-dim tabular-nums">
|
||||
대기 <strong class="text-text">{m.pending.toLocaleString()}</strong>
|
||||
· 처리율 <strong class="text-text">{formatRate(m.done_1h)}/h</strong>
|
||||
· 오늘 <strong class="text-text">{m.done_today.toLocaleString()}</strong>건
|
||||
</div>
|
||||
<!-- 맥북 보류 (sleep 등 자동 재개 대기) -->
|
||||
{#if m.key === 'macbook' && m.deferred_pending > 0}
|
||||
<div class="text-[11px] font-semibold text-warning mt-1.5 tabular-nums">보류 {m.deferred_pending.toLocaleString()}건 — 자동 재개 대기</div>
|
||||
{/if}
|
||||
<!-- 지금 처리 중인 문서 -->
|
||||
{#if m.current.length > 0}
|
||||
<div class="text-[11px] text-dim border-t border-dashed border-default mt-2.5 pt-2 truncate"
|
||||
title={m.current.map((c) => `${c.title} (${queueStageLabel(c.stage)})`).join(' · ')}>
|
||||
지금: {m.current[0].title} ({queueStageLabel(m.current[0].stage)}){m.current.length > 1 ? ` 외 ${m.current.length - 1}건` : ''}
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
{/each}
|
||||
</div>
|
||||
|
||||
<!-- ETA 한 줄 (안5 라이트 — 추정치) -->
|
||||
<div class="text-xs text-dim mt-2.5 px-1 tabular-nums"
|
||||
title="현재 페이스 기반 추정치 — 유입 변동 시 달라질 수 있습니다">
|
||||
요약 대기 <strong class="text-text">{queue.summarize_eta.pending.toLocaleString()}건</strong>
|
||||
— 소화 {formatRate(queue.summarize_eta.done_rate_1h)}/h
|
||||
· 유입 {formatRate(queue.summarize_eta.inflow_rate_1h)}/h
|
||||
{#if queue.summarize_eta.eta_minutes != null}
|
||||
· <span class="text-accent font-semibold">{etaPhrase(queue.summarize_eta.eta_minutes)}</span>
|
||||
{:else}
|
||||
· 유입 우세(백필 중)
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<!-- ═══ 단계 상세 (기존 stage 테이블 — 접힘 강등, 실패 있을 때 자동 펼침) ═══ -->
|
||||
<details
|
||||
class="mt-5"
|
||||
open={pipelineOpen}
|
||||
@@ -429,44 +530,67 @@
|
||||
<summary class="flex items-center justify-between px-5 py-3.5 bg-surface border border-default rounded-card cursor-pointer hover:bg-surface-hover transition-colors select-none list-none">
|
||||
<span class="text-sm font-semibold text-text flex items-center gap-2">
|
||||
<ChevronRight size={14} class="transition-transform details-chevron" />
|
||||
파이프라인 상세
|
||||
단계별 현황
|
||||
</span>
|
||||
<span class="text-xs text-dim flex items-center gap-2.5">
|
||||
{#if totalFailed > 0}<span class="text-error font-medium">실패 {totalFailed}</span>{/if}
|
||||
{#if totalPending > 0}<span>대기 {totalPending}</span>{/if}
|
||||
{#if totalFailed === 0 && totalPending === 0}<span>처리 완료</span>{/if}
|
||||
{#if queue}
|
||||
{#if stageDoneToday > 0}<span class="text-success">오늘 {stageDoneToday.toLocaleString()} 완료</span>{/if}
|
||||
{#if queue.totals.failed > 0}<span class="text-error font-medium">실패 {queue.totals.failed}</span>{/if}
|
||||
{#if queue.totals.pending > 0}<span>대기 {queue.totals.pending.toLocaleString()}</span>{/if}
|
||||
{#if stageDoneToday === 0 && queue.totals.failed === 0 && queue.totals.pending === 0}<span>모든 단계 한가함</span>{/if}
|
||||
{:else}
|
||||
{#if totalFailed > 0}<span class="text-error font-medium">실패 {totalFailed}</span>{/if}
|
||||
{#if totalPending > 0}<span>대기 {totalPending}</span>{/if}
|
||||
{/if}
|
||||
</span>
|
||||
</summary>
|
||||
|
||||
<div class="mt-2 px-5 py-4 bg-surface border border-default rounded-card">
|
||||
<p class="text-xs text-dim mb-3">최근 24시간</p>
|
||||
{#if pipelineRows.length > 0}
|
||||
<div class="space-y-3">
|
||||
{#each pipelineRows as row (row.stage)}
|
||||
<div>
|
||||
<div class="flex items-center justify-between text-xs mb-1.5">
|
||||
<span class="text-dim">
|
||||
{row.label}
|
||||
{#if row.oldestPendingAgeSec && row.oldestPendingAgeSec > 600}
|
||||
<span class="ml-1 text-warning" title="가장 오래된 pending 의 경과 시간">({formatAge(row.oldestPendingAgeSec)})</span>
|
||||
{/if}
|
||||
</span>
|
||||
<span class="text-dim tabular-nums">
|
||||
대기 <span class="text-text">{row.pending}</span> ·
|
||||
처리 <span class="text-text">{row.processing}</span> ·
|
||||
실패 <span class={row.failed > 0 ? 'text-error font-medium' : ''}>{row.failed}</span>
|
||||
</span>
|
||||
{#if queue}
|
||||
{#if activeStageRows.length > 0}
|
||||
<div class="space-y-3.5">
|
||||
{#each activeStageRows as row (row.stage)}
|
||||
{@const total = row.done_today + row.pending + row.processing}
|
||||
{@const donePct = total > 0 ? (row.done_today / total) * 100 : 0}
|
||||
{@const procPct = total > 0 ? (row.processing / total) * 100 : 0}
|
||||
<div>
|
||||
<div class="flex items-baseline justify-between text-xs mb-1.5 gap-2">
|
||||
<span class="font-medium text-text flex items-center gap-1.5 whitespace-nowrap">
|
||||
{queueStageLabel(row.stage)}
|
||||
{#if row.processing > 0}
|
||||
<span class="inline-block w-1.5 h-1.5 rounded-full bg-accent animate-pulse"></span>
|
||||
<span class="text-accent font-normal">처리 중 {row.processing}</span>
|
||||
{/if}
|
||||
</span>
|
||||
<span class="text-dim tabular-nums flex items-center gap-2.5 whitespace-nowrap">
|
||||
{#if row.done_today > 0}<span class="text-success">오늘 {row.done_today.toLocaleString()} 완료</span>{/if}
|
||||
{#if row.pending > 0}<span>대기 {row.pending.toLocaleString()}</span>{/if}
|
||||
{#if row.failed > 0}<span class="text-error font-medium">실패 {row.failed}</span>{/if}
|
||||
</span>
|
||||
</div>
|
||||
<!-- 게이지 = 이 단계의 오늘 진척 (완료 / 완료+대기) — 가득 찬 초록 = 다 끝남 -->
|
||||
<div class="flex h-1.5 w-full overflow-hidden rounded-sm bg-bg" title="오늘 완료 {row.done_today.toLocaleString()} / 잔여 {row.pending.toLocaleString()}">
|
||||
{#if donePct > 0}<div class="bg-success/70 h-full" style="width: {donePct}%"></div>{/if}
|
||||
{#if procPct > 0}<div class="bg-accent h-full" style="width: {Math.max(procPct, 1)}%"></div>{/if}
|
||||
</div>
|
||||
{#if row.pending > 0 && row.oldest_pending_age_sec && row.oldest_pending_age_sec > 600}
|
||||
<p class="text-[10px] mt-1 tabular-nums {row.oldest_pending_age_sec > 21600 ? 'text-warning' : 'text-faint'}">
|
||||
가장 오래 기다린 항목 {formatAge(row.oldest_pending_age_sec)}
|
||||
</p>
|
||||
{/if}
|
||||
</div>
|
||||
<div class="flex h-1.5 w-full overflow-hidden rounded-sm bg-bg">
|
||||
{#if row.pending > 0}<div class="bg-warning h-full" style="width: {(row.pending / pipelineMax) * 100}%"></div>{/if}
|
||||
{#if row.processing > 0}<div class="bg-accent h-full" style="width: {(row.processing / pipelineMax) * 100}%"></div>{/if}
|
||||
{#if row.failed > 0}<div class="bg-error h-full" style="width: {(row.failed / pipelineMax) * 100}%"></div>{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/each}
|
||||
</div>
|
||||
{/each}
|
||||
</div>
|
||||
{:else}
|
||||
<p class="text-xs text-dim text-center py-3">대기·처리·실패 없음 — 모든 단계가 한가합니다</p>
|
||||
{/if}
|
||||
{#if idleStageRows.length > 0}
|
||||
<p class="text-[11px] text-faint mt-4 pt-3 border-t border-default">
|
||||
비어 있음: {idleStageRows.map((r) => queueStageLabel(r.stage)).join(' · ')}
|
||||
</p>
|
||||
{/if}
|
||||
{:else}
|
||||
<p class="text-xs text-dim text-center py-3">처리 작업 없음</p>
|
||||
<p class="text-xs text-dim text-center py-3">현황을 불러오지 못했습니다</p>
|
||||
{/if}
|
||||
</div>
|
||||
</details>
|
||||
|
||||
@@ -0,0 +1,567 @@
|
||||
<!--
|
||||
/chat — 이드 채팅 표면 (PR-Eid-Chat).
|
||||
|
||||
표면 문법: 페이지 정체성 = "이드". 모델명·머신명·alias 비노출
|
||||
(persona model-agnostic 원칙 — 프로토콜 레이어도 동일: SSE payload 의
|
||||
model 필드는 서버에서 mode 값으로 치환되고 usage 는 제거됨).
|
||||
클라이언트는 mode('daily'|'deep') 만 보내고 alias 매핑은 서버(/api/eid/chat) 책임.
|
||||
|
||||
- 모드: 일상(daily) / 심층(deep) segmented 토글. 심층 = 장문·무거운 질문,
|
||||
잠들어 있으면 자동 기동(처음 최대 ~1분) — 기계중립 표현만 사용.
|
||||
- 스트리밍: POST /api/eid/chat → SSE. api<T>() 는 JSON 전용이라 raw fetch
|
||||
(apiFetchRaw, 토큰 첨부 + 401 refresh 1회 공유) 사용. 라인 버퍼로 청크
|
||||
경계 분리, "data:" 라인만, [DONE] 종료, choices[0].delta.content 누적
|
||||
(fixture 2종 — 26B tool_calls 배열 / 27B reasoning·logprobs null — 모두
|
||||
content 만 읽으면 동일 처리).
|
||||
- 에러: error_reason 매핑 (warming / editor_busy / upstream_cold /
|
||||
macbook_unavailable / substrate_degraded / 기타 detail). 자동 fallback
|
||||
금지 — 다른 모드로 자동 전환하지 않는다. 스트림 도중 중단 = 받은 부분
|
||||
유지 + 표시.
|
||||
- 이력: localStorage `eid_chat:v1` (키 상수는 $lib/eidChat — logout 시 제거와 공유).
|
||||
전송 payload 는 마지막 20턴(40 messages) cap.
|
||||
- 입력 한도: 메시지당 8,000자 클라 선차단(서버 422 검증과 동일 한도).
|
||||
422 수신 시 detail 을 한 줄로 정규화 + 방금 push 한 user 턴 pop 으로
|
||||
payload 오염 고리 차단.
|
||||
-->
|
||||
<script lang="ts">
|
||||
import { onMount, onDestroy } from 'svelte';
|
||||
import { apiFetchRaw } from '$lib/api';
|
||||
import { EID_CHAT_STORAGE_KEY } from '$lib/eidChat';
|
||||
import Button from '$lib/components/ui/Button.svelte';
|
||||
import EmptyState from '$lib/components/ui/EmptyState.svelte';
|
||||
import { MessageCircle, SendHorizontal, RotateCcw, AlertCircle } from 'lucide-svelte';
|
||||
|
||||
type ChatMode = 'daily' | 'deep';
|
||||
type ChatMessage = { role: 'user' | 'assistant'; content: string };
|
||||
type Notice = { kind: 'warn' | 'error'; message: string; retryable: boolean };
|
||||
|
||||
// 이력 키 — logout(stores/auth.ts) 의 이력 제거와 단일 상수 공유
|
||||
const STORAGE_KEY = EID_CHAT_STORAGE_KEY;
|
||||
// 전송 payload cap: 마지막 20턴(40 messages)
|
||||
const MAX_PAYLOAD_MESSAGES = 40;
|
||||
// localStorage 보존 cap (payload cap 과 별개 — 화면 표시용 이력)
|
||||
const MAX_STORED_MESSAGES = 200;
|
||||
// 메시지당 입력 한도 — 서버(eid_chat.py) 422 검증과 동일 한도, 클라에서 선차단
|
||||
const MAX_MESSAGE_CHARS = 8000;
|
||||
// 한도 근접 카운터 노출 시작점
|
||||
const COUNTER_THRESHOLD = 7500;
|
||||
|
||||
const DEEP_CAPTION =
|
||||
'장문·무거운 질문에 적합 — 잠들어 있으면 자동 기동 (처음 응답까지 최대 ~1분)';
|
||||
|
||||
// 프리셋 칩: 입력창 prefix 채움
|
||||
const PRESETS: Array<{ label: string; prefix: string }> = [
|
||||
{ label: '번역 한→영', prefix: '다음을 영어로 번역해줘.\n\n' },
|
||||
{ label: '번역 영→한', prefix: '다음을 한국어로 번역해줘.\n\n' },
|
||||
{ label: '요약', prefix: '다음 내용을 핵심만 간결히 요약해줘.\n\n' },
|
||||
{ label: '글 다듬기', prefix: '다음 글을 뜻은 유지하면서 자연스럽게 다듬어줘.\n\n' },
|
||||
];
|
||||
|
||||
// ── state ───────────────────────────────────────────
|
||||
let mode = $state<ChatMode>('daily');
|
||||
let messages = $state<ChatMessage[]>([]);
|
||||
let input = $state('');
|
||||
let streaming = $state(false);
|
||||
let streamingText = $state('');
|
||||
let notice = $state<Notice | null>(null);
|
||||
|
||||
let scrollEl: HTMLDivElement | undefined = $state();
|
||||
let textareaEl: HTMLTextAreaElement | undefined = $state();
|
||||
let abortCtrl: AbortController | null = null;
|
||||
|
||||
// ── localStorage 이력 ───────────────────────────────
|
||||
// Save the current mode and the most recent MAX_STORED_MESSAGES messages to
// localStorage. Does nothing when `window` is unavailable.
function persist() {
  if (typeof window === 'undefined') return;
  try {
    const snapshot = { mode, messages: messages.slice(-MAX_STORED_MESSAGES) };
    window.localStorage.setItem(STORAGE_KEY, JSON.stringify(snapshot));
  } catch {
    // Quota exceeded and similar — failing to save history is not fatal.
  }
}
|
||||
|
||||
// Load mode and message history back from localStorage, normalising the data
// on the way in. Does nothing when `window` is unavailable or nothing is stored.
function restore() {
  if (typeof window === 'undefined') return;
  try {
    const serialized = window.localStorage.getItem(STORAGE_KEY);
    if (!serialized) return;

    const saved = JSON.parse(serialized) as { mode?: unknown; messages?: unknown };
    if (saved.mode === 'daily' || saved.mode === 'deep') mode = saved.mode;
    if (!Array.isArray(saved.messages)) return;

    // Accept only well-formed entries: an object with a known role and string content.
    const isChatMessage = (m: unknown): m is ChatMessage => {
      if (!m || typeof m !== 'object') return false;
      const candidate = m as ChatMessage;
      const roleOk = candidate.role === 'user' || candidate.role === 'assistant';
      return roleOk && typeof candidate.content === 'string';
    };

    // Array-size guard plus per-message length clamp — history that was
    // corrupted or bloated from outside must not pollute the request payload.
    const recent = saved.messages.filter(isChatMessage).slice(-MAX_STORED_MESSAGES);
    messages = recent.map(({ role, content }) => ({
      role,
      content: content.slice(0, MAX_MESSAGE_CHARS),
    }));
  } catch {
    // Corrupted history is ignored (start with a fresh conversation).
  }
}
|
||||
|
||||
onMount(() => restore());
|
||||
onDestroy(() => abortCtrl?.abort());
|
||||
|
||||
// ── 자동 스크롤 (새 메시지 / 스트림 청크마다 하단 고정) ──
|
||||
$effect(() => {
|
||||
void messages.length;
|
||||
void streamingText;
|
||||
if (scrollEl) scrollEl.scrollTop = scrollEl.scrollHeight;
|
||||
});
|
||||
|
||||
// ── 입력 textarea auto-grow ─────────────────────────
|
||||
$effect(() => {
|
||||
void input;
|
||||
if (!textareaEl) return;
|
||||
textareaEl.style.height = 'auto';
|
||||
textareaEl.style.height = Math.min(textareaEl.scrollHeight, 160) + 'px';
|
||||
});
|
||||
|
||||
// Prepend a preset prefix to the input (unless the input already starts with
// it), then focus the textarea.
function applyPreset(prefix: string) {
  const alreadyApplied = input.startsWith(prefix);
  if (!alreadyApplied) {
    input = `${prefix}${input}`;
  }
  textareaEl?.focus();
}
|
||||
|
||||
// Start over: abort any in-flight request, reset streaming state, notice and
// history, persist the now-empty history, and focus the input.
function newConversation() {
  abortCtrl?.abort();

  streaming = false;
  streamingText = '';
  notice = null;
  messages = [];

  persist();
  textareaEl?.focus();
}
|
||||
|
||||
// ── error_reason → 안내 메시지 매핑 ──────────────────
|
||||
// 자동 fallback 금지 ([[feedback_no_silent_fallback_explicit_opt_in]]):
|
||||
// 어떤 사유든 다른 모드로 자동 전환하지 않고 명시 표시만 한다.
|
||||
/**
 * Translate a backend `error_reason` into the notice shown to the user.
 *
 * Known reasons map to fixed messages; anything else (including `undefined`)
 * becomes a retryable error that shows `detail`, or a generic failure text
 * when `detail` is empty. This function only builds a notice — it never
 * switches the response mode.
 */
function mapErrorReason(reason: string | undefined, detail: string): Notice {
  // Small factory for the "warn" notices, which differ only in text/retryability.
  const warn = (message: string, retryable: boolean): Notice => ({
    kind: 'warn',
    message,
    retryable,
  });

  if (reason === 'warming') {
    return warn('심층 엔진 기동 중입니다 — 잠시 후 다시 시도하세요.', true);
  }
  if (reason === 'editor_busy') {
    return warn('편집 작업 보호로 잠시 사용할 수 없습니다.', false);
  }
  if (reason === 'upstream_cold' || reason === 'macbook_unavailable') {
    return warn('심층 엔진이 잠들어 있습니다 — 다시 시도하면 기동을 시작합니다.', true);
  }
  if (reason === 'substrate_degraded') {
    return {
      kind: 'error',
      message: '운영 규칙이 적재되지 않았습니다 — 관리자 확인이 필요합니다.',
      retryable: false,
    };
  }
  // Unknown or missing reason: surface the server detail when there is one.
  return { kind: 'error', message: detail || '응답 생성에 실패했습니다.', retryable: true };
}
|
||||
|
||||
// 비-200 응답 body 파싱: {detail, error_reason} — detail 은 string 또는
|
||||
// {message} 객체 가능 (api.ts parseDetail 과 같은 정규화 규칙의 축소판).
|
||||
/**
 * Build a notice from a non-OK response body of the form
 * `{detail, error_reason}`.
 *
 * `detail` may be a plain string or an object carrying `message` (and
 * optionally a nested `error_reason`). Every field is type-checked at runtime
 * before use, so a malformed body can never put a non-string value into the
 * notice text or the reason lookup. When no usable detail is present the
 * response's `statusText` is used instead.
 *
 * @param res Non-OK response whose body has not been consumed yet.
 * @returns The notice produced by `mapErrorReason`.
 */
async function parseErrorBody(res: Response): Promise<Notice> {
  // A body that is missing or not JSON is treated as "no information".
  const body = (await res.json().catch(() => null)) as
    | { detail?: unknown; error_reason?: unknown }
    | null;

  let reason = typeof body?.error_reason === 'string' ? body.error_reason : undefined;
  let detail = '';

  const rawDetail = body?.detail;
  if (typeof rawDetail === 'string') {
    detail = rawDetail;
  } else if (rawDetail && typeof rawDetail === 'object') {
    const nested = rawDetail as { message?: unknown; error_reason?: unknown };
    if (typeof nested.message === 'string') detail = nested.message;
    // The top-level error_reason wins; the nested one is only a fallback.
    if (reason === undefined && typeof nested.error_reason === 'string') {
      reason = nested.error_reason;
    }
  }

  return mapErrorReason(reason, detail || res.statusText);
}
|
||||
|
||||
// 422: FastAPI validation detail(배열 shape — [{loc, msg, type}, ...]) 을
|
||||
// 사람이 읽을 한 줄로 정규화. 길이 한도 위반(메시지당 8,000자 / 총량 cap)
|
||||
// 은 친화 메시지로 치환. pydantic v2 의 "Value error, " prefix 는 제거.
|
||||
/**
 * Collapse a 422 validation `detail` into one human-readable line.
 *
 * Only the first entry of an array-shaped detail is inspected. A leading
 * "Value error, " prefix is stripped from its `msg`. Messages that look like
 * a length-limit violation are replaced by a friendly "too long" hint; other
 * messages are shown with a "request format error" prefix; anything without
 * a usable `msg` yields a generic hint.
 */
function normalizeValidationDetail(detail: unknown): string {
  let msg = '';
  if (Array.isArray(detail)) {
    const head = detail[0] as { msg?: unknown } | undefined;
    if (typeof head?.msg === 'string') {
      msg = head.msg.replace(/^Value error,\s*/i, '');
    }
  }

  const looksLikeLengthLimit = /at most|too.?long|초과|깁니다/i.test(msg);
  if (looksLikeLengthLimit) {
    return '입력이 너무 깁니다 — 메시지는 8,000자 이내로 줄이거나, 대화가 길면 새 대화로 시작하세요.';
  }

  return msg
    ? `요청 형식 오류: ${msg}`
    : '요청 형식이 올바르지 않습니다 — 입력을 줄이거나 새 대화로 시작하세요.';
}
|
||||
|
||||
// ── 전송 / 재시도 ───────────────────────────────────
|
||||
/**
 * Send the current draft as a new user turn.
 *
 * Does nothing while a stream is running or when the trimmed draft is empty.
 * A draft longer than MAX_MESSAGE_CHARS is rejected client-side with an
 * error notice, so an over-limit payload never reaches the server. Otherwise
 * the turn is appended, the input cleared, the history persisted and the
 * stream started.
 */
function sendMessage() {
  if (streaming) return;

  const draft = input.trim();
  if (draft === '') return;

  const tooLong = draft.length > MAX_MESSAGE_CHARS;
  if (tooLong) {
    notice = {
      kind: 'error',
      message: '입력이 너무 깁니다 — 8,000자 이내로 줄여주세요.',
      retryable: false,
    };
    return;
  }

  messages.push({ role: 'user', content: draft });
  input = '';
  persist();
  void runStream();
}
|
||||
|
||||
// 재시도: 이력 끝의 user 메시지를 그대로 재전송 (user 턴 중복 추가 X)
|
||||
/**
 * Re-send the conversation as-is when its last turn is a user message.
 * No new user turn is appended; the call is ignored while streaming or when
 * the history is empty / ends with an assistant turn.
 */
function retry() {
  if (streaming) return;

  const lastTurn = messages.length > 0 ? messages[messages.length - 1] : undefined;
  if (lastTurn === undefined || lastTurn.role !== 'user') return;

  void runStream();
}
|
||||
|
||||
/**
 * POST the recent history to `/eid/chat` and consume the SSE response.
 *
 * Flow:
 *  - marks the UI as streaming and installs a fresh AbortController;
 *  - on a non-OK response, shows a notice (422 additionally removes the
 *    just-sent user turn and restores its text to an empty input);
 *  - on success, parses `data:` lines, accumulating `choices[0].delta.content`
 *    into `streamingText` until `[DONE]` or end of stream;
 *  - in `finally`, appends the accumulated assistant text (unless aborted),
 *    clears the streaming flags if this call still owns them, and persists.
 *
 * Every state write that follows an await on an error body is guarded by
 * `ctrl.signal.aborted`: `res.json().catch(() => null)` swallows the
 * AbortError, so without the guard a cancelled request could still set a
 * notice — or pop a user turn — on a conversation started after the abort.
 */
async function runStream() {
  notice = null;
  streaming = true;
  streamingText = '';
  const ctrl = new AbortController();
  abortCtrl = ctrl;

  const payload = {
    mode,
    messages: messages
      .slice(-MAX_PAYLOAD_MESSAGES)
      .map((m) => ({ role: m.role, content: m.content })),
  };

  let acc = '';
  let sawDone = false;

  try {
    const res = await apiFetchRaw('/eid/chat', {
      method: 'POST',
      body: JSON.stringify(payload),
      signal: ctrl.signal,
    });

    if (!res.ok) {
      if (res.status === 422) {
        // Validation rejection: normalize the detail and pop the user turn
        // that was just pushed. Leaving an over-limit turn in the history
        // would make every later payload fail with 422 as well, so the loop
        // is broken here (and the history re-persisted).
        const body = (await res.json().catch(() => null)) as { detail?: unknown } | null;
        // Aborted while reading the body: this request no longer owns the state.
        if (ctrl.signal.aborted) return;
        notice = {
          kind: 'error',
          message: normalizeValidationDetail(body?.detail),
          retryable: false,
        };
        if (messages.length > 0 && messages[messages.length - 1].role === 'user') {
          const popped = messages.pop();
          // Give the text back when the input is empty so it can be shortened and re-sent.
          if (popped && !input) input = popped.content;
          persist();
        }
        return;
      }
      const failure = await parseErrorBody(res);
      // Same abort guard as above: do not surface a stale error after cancel.
      if (ctrl.signal.aborted) return;
      notice = failure;
      return;
    }
    if (!res.body) {
      notice = { kind: 'error', message: '스트림을 열 수 없습니다.', retryable: true };
      return;
    }

    // SSE line-buffer parsing: a chunk boundary can split a line, so the
    // trailing incomplete line stays in `buf` and is joined with the next chunk.
    const reader = res.body.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    // Handle one line; returns true for `data: [DONE]`. Content pieces are
    // accumulated into `acc` / `streamingText` through the closure.
    const processLine = (rawLine: string): boolean => {
      const line = rawLine.trim();
      if (!line.startsWith('data:')) return false;
      const data = line.slice(5).trim();
      if (data === '[DONE]') return true;
      try {
        const obj = JSON.parse(data) as {
          choices?: Array<{ delta?: { content?: unknown } }>;
        };
        const piece = obj?.choices?.[0]?.delta?.content;
        if (typeof piece === 'string' && piece) {
          acc += piece;
          streamingText = acc;
        }
      } catch {
        // Incomplete or non-JSON data line: ignore.
      }
      return false;
    };

    while (true) {
      const { value, done } = await reader.read();
      if (done) {
        // Final flush: process the decoder's leftover bytes plus a last line
        // that ended without a newline. Otherwise a final data:/[DONE] line
        // without "\n" would stay in `buf` and raise a false "interrupted" notice.
        buf += decoder.decode();
        for (const rawLine of buf.split('\n')) {
          if (processLine(rawLine)) {
            sawDone = true;
            break;
          }
        }
        break;
      }
      buf += decoder.decode(value, { stream: true });
      const lines = buf.split('\n');
      buf = lines.pop() ?? '';
      for (const rawLine of lines) {
        if (processLine(rawLine)) {
          sawDone = true;
          break;
        }
      }
      if (sawDone) {
        // [DONE] received: release the rest of the stream (failure is harmless).
        void reader.cancel().catch(() => {});
        break;
      }
    }

    // Stream ended without [DONE]: keep what was received and say so.
    if (!sawDone) {
      notice = acc
        ? {
            kind: 'warn',
            message: '응답이 중단되었습니다 — 받은 부분까지 표시합니다.',
            retryable: false,
          }
        : { kind: 'error', message: '응답을 받지 못했습니다 — 다시 시도하세요.', retryable: true };
    }
  } catch (err) {
    if ((err as Error)?.name === 'AbortError') {
      // Intentional cancel (e.g. new conversation): no notice needed.
      return;
    }
    // Network error mid-stream: keep what was received and say so.
    notice = acc
      ? {
          kind: 'warn',
          message: '연결이 끊겼습니다 — 받은 부분까지 표시합니다.',
          retryable: false,
        }
      : { kind: 'error', message: '요청에 실패했습니다 — 네트워크를 확인하세요.', retryable: true };
  } finally {
    // Do not push after an abort (new conversation / page leave): prevents
    // leftovers of the old stream from leaking into the emptied history.
    if (acc && !ctrl.signal.aborted) {
      messages.push({ role: 'assistant', content: acc });
    }
    // Only the call that still owns the controller may reset the flags.
    if (abortCtrl === ctrl) {
      streaming = false;
      streamingText = '';
      abortCtrl = null;
    }
    persist();
  }
}
|
||||
|
||||
/**
 * Textarea key handler: Enter sends, Shift+Enter inserts a newline.
 *
 * Key events that belong to an IME composition are ignored. `isComposing`
 * alone is not sufficient: some browsers (notably Safari) deliver the Enter
 * that commits a composition with `isComposing === false` but
 * `keyCode === 229`, which would otherwise send a half-composed message.
 */
function handleKeydown(e: KeyboardEvent) {
  // keyCode is deprecated, but 229 is the documented IME-processing marker.
  if (e.isComposing || e.keyCode === 229) return;

  if (e.key === 'Enter' && !e.shiftKey) {
    e.preventDefault();
    sendMessage();
  }
}
|
||||
|
||||
// 마지막 메시지가 user 턴이고 스트리밍 중이 아니면 재시도 가능 상태
|
||||
let canRetry = $derived(
|
||||
!streaming && messages.length > 0 && messages[messages.length - 1].role === 'user'
|
||||
);
|
||||
|
||||
// 입력 길이(전송 기준 = trim 후) — 7,500자부터 카운터 노출, 8,000자 초과 차단
|
||||
let inputLength = $derived(input.trim().length);
|
||||
let overLimit = $derived(inputLength > MAX_MESSAGE_CHARS);
|
||||
</script>
|
||||
|
||||
<svelte:head>
|
||||
<title>이드 - PKM</title>
|
||||
</svelte:head>
|
||||
|
||||
<div class="h-full flex flex-col">
|
||||
<!-- 헤더: 정체성 + 모드 토글 + 새 대화 -->
|
||||
<div class="shrink-0 border-b border-default bg-sidebar px-4 py-2.5">
|
||||
<div class="max-w-3xl mx-auto flex items-center gap-2 flex-wrap">
|
||||
<h1 class="flex items-center gap-2 text-sm font-extrabold tracking-tight shrink-0">
|
||||
<MessageCircle size={16} class="text-accent" />
|
||||
이드
|
||||
</h1>
|
||||
|
||||
<!-- 모드 segmented 토글: 일상 / 심층 -->
|
||||
<div class="flex rounded-md border border-default overflow-hidden" role="group" aria-label="응답 모드">
|
||||
<button
|
||||
type="button"
|
||||
aria-pressed={mode === 'daily'}
|
||||
onclick={() => (mode = 'daily')}
|
||||
disabled={streaming}
|
||||
title="짧은 질문·일상 대화에 적합"
|
||||
class="px-3 py-1.5 text-xs font-semibold transition-colors disabled:opacity-50
|
||||
{mode === 'daily' ? 'bg-accent text-white' : 'bg-surface text-dim hover:text-text hover:bg-surface-hover'}"
|
||||
>
|
||||
일상
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
aria-pressed={mode === 'deep'}
|
||||
onclick={() => (mode = 'deep')}
|
||||
disabled={streaming}
|
||||
title={DEEP_CAPTION}
|
||||
class="px-3 py-1.5 text-xs font-semibold border-l border-default transition-colors disabled:opacity-50
|
||||
{mode === 'deep' ? 'bg-accent text-white' : 'bg-surface text-dim hover:text-text hover:bg-surface-hover'}"
|
||||
>
|
||||
심층
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div class="flex-1"></div>
|
||||
|
||||
<Button variant="ghost" size="sm" icon={RotateCcw} onclick={newConversation}>
|
||||
새 대화
|
||||
</Button>
|
||||
</div>
|
||||
{#if mode === 'deep'}
|
||||
<div class="max-w-3xl mx-auto mt-1.5">
|
||||
<p class="text-[11px] text-dim">{DEEP_CAPTION}</p>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
<!-- 메시지 리스트 -->
|
||||
<div bind:this={scrollEl} class="flex-1 min-h-0 overflow-y-auto px-4 py-4">
|
||||
<div class="max-w-3xl mx-auto flex flex-col gap-3" role="log" aria-live="polite">
|
||||
{#if messages.length === 0 && !streaming}
|
||||
<div class="py-10">
|
||||
<EmptyState
|
||||
icon={MessageCircle}
|
||||
title="이드와 대화를 시작하세요"
|
||||
description="일상 질문은 바로, 장문·무거운 질문은 심층 모드로 물어보세요. 아래 프리셋 칩으로 번역·요약·글 다듬기를 빠르게 시작할 수 있습니다."
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
{#each messages as msg, i (i)}
|
||||
{#if msg.role === 'user'}
|
||||
<div class="flex justify-end">
|
||||
<div class="max-w-[85%] sm:max-w-[75%] px-3.5 py-2.5 rounded-lg rounded-br-sm bg-accent text-white text-sm whitespace-pre-wrap break-words">
|
||||
{msg.content}
|
||||
</div>
|
||||
</div>
|
||||
{:else}
|
||||
<div class="flex justify-start">
|
||||
<div class="max-w-[85%] sm:max-w-[75%] px-3.5 py-2.5 rounded-lg rounded-bl-sm bg-surface border border-default text-text text-sm whitespace-pre-wrap break-words">
|
||||
{msg.content}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
{/each}
|
||||
|
||||
<!-- 스트리밍 중 assistant 부분 응답 -->
|
||||
{#if streaming}
|
||||
<div class="flex justify-start">
|
||||
<div class="max-w-[85%] sm:max-w-[75%] px-3.5 py-2.5 rounded-lg rounded-bl-sm bg-surface border border-default text-text text-sm whitespace-pre-wrap break-words">
|
||||
{#if streamingText}
|
||||
{streamingText}<span class="inline-block w-1.5 h-3.5 ml-0.5 align-middle bg-accent animate-pulse rounded-sm"></span>
|
||||
{:else}
|
||||
<span class="text-dim animate-pulse">응답 준비 중...</span>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
|
||||
<!-- 에러/안내 카드: 자동 fallback 없이 명시 표시만 -->
|
||||
{#if notice}
|
||||
<div
|
||||
class="flex items-start gap-2 px-3.5 py-3 rounded-lg border text-sm
|
||||
{notice.kind === 'warn'
|
||||
? 'border-warning/30 bg-warning/10 text-warning'
|
||||
: 'border-error/30 bg-error/10 text-error'}"
|
||||
>
|
||||
<AlertCircle size={15} class="mt-0.5 shrink-0" />
|
||||
<div class="flex-1 min-w-0">
|
||||
<p>{notice.message}</p>
|
||||
{#if notice.retryable && canRetry}
|
||||
<Button variant="secondary" size="sm" class="mt-2" onclick={retry}>
|
||||
다시 시도
|
||||
</Button>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 입력 바 (하단 고정 — 모바일에서도 flex 컬럼 하단에 붙음) -->
|
||||
<div class="shrink-0 border-t border-default bg-sidebar px-4 pt-2 pb-3">
|
||||
<div class="max-w-3xl mx-auto">
|
||||
<!-- 프리셋 칩 -->
|
||||
<div class="flex gap-1.5 overflow-x-auto pb-2">
|
||||
{#each PRESETS as preset (preset.label)}
|
||||
<button
|
||||
type="button"
|
||||
onclick={() => applyPreset(preset.prefix)}
|
||||
class="shrink-0 px-2.5 py-1 rounded-full border border-default bg-surface text-xs text-dim hover:text-text hover:border-accent transition-colors"
|
||||
>
|
||||
{preset.label}
|
||||
</button>
|
||||
{/each}
|
||||
</div>
|
||||
|
||||
<div class="flex items-end gap-2">
|
||||
<textarea
|
||||
bind:this={textareaEl}
|
||||
bind:value={input}
|
||||
onkeydown={handleKeydown}
|
||||
rows="1"
|
||||
placeholder="이드에게 메시지 보내기 (Enter 전송, Shift+Enter 줄바꿈)"
|
||||
class="flex-1 min-w-0 px-3 py-2 rounded-lg text-sm bg-bg text-text placeholder:text-faint border border-default focus:border-accent focus:ring-2 focus:ring-accent-ring outline-none resize-none overflow-y-auto transition-colors"
|
||||
></textarea>
|
||||
<Button
|
||||
variant="primary"
|
||||
size="md"
|
||||
icon={SendHorizontal}
|
||||
loading={streaming}
|
||||
disabled={!input.trim() || overLimit}
|
||||
onclick={sendMessage}
|
||||
aria-label="전송"
|
||||
>
|
||||
<span class="hidden sm:inline">전송</span>
|
||||
</Button>
|
||||
</div>
|
||||
|
||||
<!-- 글자수 카운터: 한도(8,000자) 근접 시에만 노출, 초과 시 인라인 안내 -->
|
||||
{#if inputLength >= COUNTER_THRESHOLD}
|
||||
<p class="mt-1 text-right text-[11px] {overLimit ? 'text-error' : 'text-dim'}" aria-live="polite">
|
||||
{inputLength.toLocaleString()} / {MAX_MESSAGE_CHARS.toLocaleString()}자{overLimit
|
||||
? ' — 입력이 너무 깁니다 (8,000자 이내)'
|
||||
: ''}
|
||||
</p>
|
||||
{/if}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -0,0 +1,19 @@
|
||||
-- A-3 (plan crawl-24x7-1): 소스 레지스트리 증축 — additive only.
|
||||
-- fetch_method : rss / rss+page / sitemap+page / page / api / signal-only
|
||||
-- fulltext_policy : none(현행 유지) / page(기사 페이지 fetch 후 4-tier 승격) / feed-full(피드 본문이 전문)
|
||||
-- auth_profile : NULL=공개, 값=구독 세션 키 (B-3 Playwright 어댑터용 슬롯)
|
||||
-- poll_interval_minutes : 소스별 차등 폴링 (NULL=전역 6h 사이클)
|
||||
-- etag / last_modified : 조건부 GET 워터마크 — 받은 그대로 저장·재전송 (상태는 전부 DB, APScheduler in-process)
|
||||
-- feed_content_hash : CDN ETag 회전 대비 콘텐츠 해시 변경감지 병행
|
||||
-- selector_override : 추출 실패 잦은 소스의 site-specific CSS selector (JSONB)
|
||||
-- parser_quirk : rdf / table-strip / gn-redirect 등 파서 특이 케이스
|
||||
ALTER TABLE news_sources
|
||||
ADD COLUMN IF NOT EXISTS fetch_method VARCHAR(20) NOT NULL DEFAULT 'rss',
|
||||
ADD COLUMN IF NOT EXISTS fulltext_policy VARCHAR(20) NOT NULL DEFAULT 'none',
|
||||
ADD COLUMN IF NOT EXISTS auth_profile VARCHAR(50),
|
||||
ADD COLUMN IF NOT EXISTS poll_interval_minutes INTEGER,
|
||||
ADD COLUMN IF NOT EXISTS etag TEXT,
|
||||
ADD COLUMN IF NOT EXISTS last_modified TEXT,
|
||||
ADD COLUMN IF NOT EXISTS feed_content_hash VARCHAR(64),
|
||||
ADD COLUMN IF NOT EXISTS selector_override JSONB,
|
||||
ADD COLUMN IF NOT EXISTS parser_quirk VARCHAR(30);
|
||||
@@ -0,0 +1,3 @@
|
||||
-- 0-5 (a) 확정 (plan crawl-24x7-1): 도메인 자료(안전/공학/철학) 채널 신설 — news 와 분리.
|
||||
-- 신규 값은 같은 트랜잭션 내 사용 금지 (PG 제약) — 본 배치의 다른 마이그레이션은 'crawl' 미사용.
|
||||
ALTER TYPE source_channel ADD VALUE IF NOT EXISTS 'crawl';
|
||||
@@ -0,0 +1,3 @@
|
||||
-- A-2 (plan crawl-24x7-1): RSS 요약 → 기사 페이지 fetch → 4-tier 본문 승격 stage.
|
||||
-- fulltext_policy='page' 소스의 기사에만 news_collector 가 enqueue.
|
||||
ALTER TYPE process_stage ADD VALUE IF NOT EXISTS 'fulltext';
|
||||
@@ -0,0 +1,19 @@
|
||||
-- A-5 (plan crawl-24x7-1): 소스 건강 — 소스별 실패 격리 기록 + circuit breaker.
|
||||
-- 한 소스가 죽어도 나머지 영향 0. silent skip 누적 방지의 가시성 기반 (A-8 패널이 읽음).
|
||||
-- circuit_state: closed(정상) / open(연속 실패로 지수 backoff 중) / disabled(M회 초과, 수동 복구 대상)
|
||||
-- empty_streak : 200 인데 entries 0 인 연속 fetch 횟수 (피드 부패 감시 — 304/해시동일은 미집계)
|
||||
-- Per-source fetch health: failure bookkeeping plus circuit-breaker state,
-- kept per news_sources row so one failing source is isolated from the rest.
CREATE TABLE IF NOT EXISTS source_health (
    id SERIAL PRIMARY KEY,
    -- Owning source; the health row is deleted together with it (CASCADE).
    source_id INTEGER NOT NULL REFERENCES news_sources(id) ON DELETE CASCADE,
    -- Failure / fetch counters (all start at 0).
    consecutive_failures INTEGER NOT NULL DEFAULT 0,
    total_fetches BIGINT NOT NULL DEFAULT 0,
    total_failures BIGINT NOT NULL DEFAULT 0,
    -- Most recent success / error details; NULL until the first occurrence.
    last_success_at TIMESTAMPTZ,
    last_error TEXT,
    last_error_at TIMESTAMPTZ,
    last_fetch_items INTEGER,
    -- Consecutive fetches that returned 200 with zero entries (feed-rot watch;
    -- 304 / unchanged-hash responses are not counted).
    empty_streak INTEGER NOT NULL DEFAULT 0,
    -- 'closed' (healthy) / 'open' (exponential backoff after consecutive
    -- failures) / 'disabled' (failure limit exceeded, needs manual recovery).
    circuit_state VARCHAR(10) NOT NULL DEFAULT 'closed',
    circuit_opened_at TIMESTAMPTZ,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
|
||||
@@ -0,0 +1,2 @@
|
||||
-- A-5: source_health 는 news_sources 와 1:1 — upsert 기준 키.
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS uq_source_health_source_id ON source_health (source_id);
|
||||
@@ -0,0 +1,5 @@
|
||||
-- B/C 그룹 (plan crawl-24x7-1, 0-5 확정): 레지스트리에 채널 컬럼 — additive only.
|
||||
-- documents.source_channel 과 동일 enum 재사용 ('crawl' 값은 320 에서 별도 트랜잭션으로 추가 완료).
|
||||
-- 기존 행 전부 'news' 기본값 = 무회귀. crawl 채널 소스의 문서 생성/색인 게이트 분기 기준.
|
||||
ALTER TABLE news_sources
|
||||
ADD COLUMN IF NOT EXISTS source_channel source_channel NOT NULL DEFAULT 'news';
|
||||
@@ -0,0 +1,8 @@
|
||||
-- B-3 (plan crawl-24x7-1): 구독 세션 상태 노출 계약 — additive only.
|
||||
-- relogin_requested: 쓰기 1종 플래그 (A-8 버튼이 기록, 어댑터가 소비 = 수동 half-open).
|
||||
-- 소비 위치 함정(r5 고정): open-스킵 분기보다 앞 — 어댑터 틱마다 확인.
|
||||
-- last_probe_at/ok: 내용 기반 probe 결과 (시간 기반 만료 판정 금지 — silent corruption 차단).
|
||||
ALTER TABLE source_health
|
||||
ADD COLUMN IF NOT EXISTS relogin_requested BOOLEAN NOT NULL DEFAULT FALSE,
|
||||
ADD COLUMN IF NOT EXISTS last_probe_at TIMESTAMPTZ,
|
||||
ADD COLUMN IF NOT EXISTS last_probe_ok BOOLEAN;
|
||||
@@ -0,0 +1,33 @@
|
||||
-- crawl-24x7 사이클 2 소스 seed (B-2 + C-1 안전 + C-5 철학) — 2026-06-10 전 URL live 검증.
|
||||
-- 262 선례: WHERE NOT EXISTS idempotent, 기존 행 보존, 신규만 insert (단일 statement).
|
||||
-- 채널: news = 다이제스트/브리핑 대상 / crawl = 도메인 재료 (0-5 분리).
|
||||
-- 정책: feed-full = 피드 본문이 전문 (UK HSE content:encoded 실측) / page = 기사 페이지 4-tier 승격.
|
||||
-- EU-OSHA 는 후보 등재만 (enabled=false — 카드 C-1 '우선순위 낮음').
|
||||
-- 르몽드 B-3 활성화는 seed 아님 — 세션 박제 후 runtime UPDATE (auth_profile/selector_override).
|
||||
INSERT INTO news_sources
|
||||
(name, country, language, feed_type, feed_url, category, enabled,
|
||||
fetch_method, fulltext_policy, source_channel, parser_quirk)
|
||||
SELECT v.name, v.country, v.language, v.feed_type, v.feed_url, v.category, v.enabled,
|
||||
v.fetch_method, v.fulltext_policy, v.source_channel::source_channel, v.parser_quirk
|
||||
FROM (VALUES
|
||||
-- B-2: Guardian Open Platform (전문 JSON — 스크래핑 불요, GUARDIAN_API_KEY 필요)
|
||||
('Guardian World', 'GB', 'en', 'api', 'https://content.guardianapis.com/search?section=world', 'International', true, 'api', 'none', 'news', NULL),
|
||||
-- C-1 안전 (Safety)
|
||||
('UK HSE Press', 'GB', 'en', 'rss', 'https://press.hse.gov.uk/feed/', 'Safety', true, 'rss', 'feed-full', 'crawl', NULL),
|
||||
('안전신문', 'KR', 'ko', 'rss', 'https://www.safetynews.co.kr/rss/allArticle.xml', 'Safety', true, 'rss', 'page', 'crawl', NULL),
|
||||
('고용노동부 공지', 'KR', 'ko', 'rss', 'https://www.moel.go.kr/rss/notice.do', 'Safety', true, 'rss', 'page', 'crawl', NULL),
|
||||
('고용노동부 정책', 'KR', 'ko', 'rss', 'https://www.moel.go.kr/rss/policy.do', 'Safety', true, 'rss', 'page', 'crawl', NULL),
|
||||
('고용노동부 입법행정예고', 'KR', 'ko', 'rss', 'https://www.moel.go.kr/rss/lawinfo.do', 'Safety', true, 'rss', 'page', 'crawl', NULL),
|
||||
('OSHA QuickTakes', 'US', 'en', 'rss', 'https://www.osha.gov/sites/default/files/quicktakes.xml', 'Safety', true, 'rss', 'page', 'crawl', NULL),
|
||||
('EU-OSHA News', 'EU', 'en', 'rss', 'https://osha.europa.eu/en/rss-feeds/latest/news.xml', 'Safety', false, 'rss', 'page', 'crawl', NULL),
|
||||
-- C-5 철학 (Philosophy)
|
||||
('SEP 신규·개정', 'US', 'en', 'rss', 'https://plato.stanford.edu/rss/sep.xml', 'Philosophy', true, 'rss', 'page', 'crawl', NULL),
|
||||
('1000-Word Philosophy', 'US', 'en', 'rss', 'https://1000wordphilosophy.com/feed/', 'Philosophy', true, 'rss', 'feed-full', 'crawl', NULL),
|
||||
('Doing Philosophy', 'KR', 'ko', 'rss', 'https://doingphilosophy.kr/feed', 'Philosophy', true, 'rss', 'page', 'crawl', NULL),
|
||||
('Aeon', 'GB', 'en', 'rss', 'https://aeon.co/feed.rss', 'Philosophy', true, 'rss', 'page', 'crawl', 'skip-video'),
|
||||
('Psyche', 'GB', 'en', 'rss', 'https://psyche.co/feed.rss', 'Philosophy', true, 'rss', 'page', 'crawl', 'skip-video')
|
||||
) AS v(name, country, language, feed_type, feed_url, category, enabled,
|
||||
fetch_method, fulltext_policy, source_channel, parser_quirk)
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM news_sources ns WHERE ns.name = v.name
|
||||
);
|
||||
@@ -0,0 +1,32 @@
|
||||
-- crawl-24x7 사이클 3 소스 seed (B-4 시그널 + C-4 공학 지속수집) — 2026-06-11 전 URL live 검증.
|
||||
-- 326 선례: WHERE NOT EXISTS idempotent, 기존 행 보존, 신규만 insert (단일 statement).
|
||||
-- fetch_method='signal-only' (B-4): 헤드라인+요약만 인제스트, 페이지 fetch 0,
|
||||
-- summarize 스킵(검색 색인만 — embed/chunk). 다이제스트는 ai_summary NULL 제외라 자연 배제.
|
||||
-- Bloomberg = anti-bot 최강이라 본문 수집 비권고 → 시그널 전용. 피드에 비디오 혼재 실측 → skip-video.
|
||||
-- Economist = 실측 200 (Archiver UA 는 feed-reader 로 취급됨 — 브라우저 UA 만 403). 구독 없음 = 시그널.
|
||||
-- Nikkei Asia = RSS 1.0(RDF) 실측 — feedparser 가 네이티브 정규화 (title/link 만, 요약·날짜 없음
|
||||
-- = 제목 시그널). 코드 분기 불요 (tests/test_crawl_cycle3_shapes.py fixture 회귀로 박제).
|
||||
-- arXiv/ASME = 초록이 곧 본문 (C-4 2단: 초록 색인 먼저, 선별 전문은 Phase 3) → signal-only 재사용.
|
||||
-- IEEE Spectrum = 피드 description 이 전문 (7.9~14K자 실측) → feed-full. 카테고리 필터 = topic 피드.
|
||||
INSERT INTO news_sources
|
||||
(name, country, language, feed_type, feed_url, category, enabled,
|
||||
fetch_method, fulltext_policy, source_channel, parser_quirk)
|
||||
SELECT v.name, v.country, v.language, v.feed_type, v.feed_url, v.category, v.enabled,
|
||||
v.fetch_method, v.fulltext_policy, v.source_channel::source_channel, v.parser_quirk
|
||||
FROM (VALUES
|
||||
-- B-4: 시그널 전용 (news 채널 — 헤드라인 시그널)
|
||||
('Bloomberg Markets', 'US', 'en', 'rss', 'https://feeds.bloomberg.com/markets/news.rss', 'Economy', true, 'signal-only', 'none', 'news', 'skip-video'),
|
||||
('Bloomberg Technology', 'US', 'en', 'rss', 'https://feeds.bloomberg.com/technology/news.rss', 'Technology', true, 'signal-only', 'none', 'news', 'skip-video'),
|
||||
('Economist Latest', 'GB', 'en', 'rss', 'https://www.economist.com/latest/rss.xml', 'International', true, 'signal-only', 'none', 'news', NULL),
|
||||
('Nikkei Asia', 'JP', 'en', 'rss', 'https://asia.nikkei.com/rss/feed/nar', 'International', true, 'signal-only', 'none', 'news', NULL),
|
||||
-- C-4: 공학 지속수집 (crawl 채널 — 도메인 재료. API 공지/CSB/CCPS 는 전용 워커가 runtime 등록)
|
||||
('ASME J. Pressure Vessel Technology', 'US', 'en', 'rss', 'https://asmedigitalcollection.asme.org/rss/site_1000037/LatestOpenIssueArticles_1000020.xml', 'Engineering', true, 'signal-only', 'none', 'crawl', NULL),
|
||||
('arXiv cond-mat.mtrl-sci', 'US', 'en', 'rss', 'https://rss.arxiv.org/rss/cond-mat.mtrl-sci', 'Engineering', true, 'signal-only', 'none', 'crawl', NULL),
|
||||
('arXiv physics.app-ph', 'US', 'en', 'rss', 'https://rss.arxiv.org/rss/physics.app-ph', 'Engineering', true, 'signal-only', 'none', 'crawl', NULL),
|
||||
('IEEE Spectrum Energy', 'US', 'en', 'rss', 'https://spectrum.ieee.org/feeds/topic/energy.rss', 'Engineering', true, 'rss', 'feed-full', 'crawl', NULL),
|
||||
('IEEE Spectrum Robotics', 'US', 'en', 'rss', 'https://spectrum.ieee.org/feeds/topic/robotics.rss', 'Engineering', true, 'rss', 'feed-full', 'crawl', NULL)
|
||||
) AS v(name, country, language, feed_type, feed_url, category, enabled,
|
||||
fetch_method, fulltext_policy, source_channel, parser_quirk)
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM news_sources ns WHERE ns.name = v.name
|
||||
);
|
||||
@@ -0,0 +1,59 @@
|
||||
"""B-3 구독 세션 1회 수동 박제 (MacBook 등 GUI 머신에서 실행).
|
||||
|
||||
르몽드 = Google OAuth — 자동화 브라우저 로그인은 Google 이 차단하므로
|
||||
로그인 자체는 항상 사람이 headed 브라우저에서 수행하고, 본 스크립트는
|
||||
그 결과(쿠키+localStorage = storage_state JSON)만 박제한다.
|
||||
|
||||
사용 (MacBook):
|
||||
pip install playwright && playwright install chromium
|
||||
python scripts/capture_subscription_session.py --profile lemonde --url https://www.lemonde.fr
|
||||
1) 떠오른 브라우저에서 직접 로그인 (Google OAuth 포함)
|
||||
2) 로그인 완료 확인 후 터미널에서 Enter
|
||||
3) ~/.local/share/crawl-auth/lemonde.json 저장 (600)
|
||||
|
||||
GPU 반영:
|
||||
ssh gpu 'mkdir -p ~/.local/share/crawl-auth && chmod 700 ~/.local/share/crawl-auth'
|
||||
scp ~/.local/share/crawl-auth/lemonde.json gpu:.local/share/crawl-auth/
|
||||
ssh gpu 'chmod 600 ~/.local/share/crawl-auth/lemonde.json'
|
||||
|
||||
세션 만료 후 재로그인도 동일 절차 + source_health.relogin_requested 플래그 set
|
||||
(어댑터가 다음 틱에 half-open probe 로 소비).
|
||||
|
||||
주의: storage_state = credential 등가물. repo 안·백업 대상 경로에 두지 말 것.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
AUTH_DIR = Path.home() / ".local" / "share" / "crawl-auth"
|
||||
|
||||
|
||||
def main() -> None:
    """Capture a manually logged-in browser session as a storage_state file.

    Opens a headed Chromium window at ``--url``, waits until the operator
    confirms the login with Enter, then writes the context's storage state
    (cookies + localStorage) to ``AUTH_DIR/<profile>.json``.

    The file is credential-equivalent, so it is protected for its whole
    lifetime rather than only after being written:

    * the process umask is restricted to owner-only before anything is
      created, so a new file is never group/world readable, even briefly;
    * an existing file is tightened to 0600 before it is overwritten;
    * ``--profile`` must be a bare file name, so the output cannot be
      redirected outside ``AUTH_DIR`` (e.g. with ``../``).

    Exits with status 2 (argparse convention) on an invalid ``--profile``.
    """
    import os  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(description="B-3 구독 세션 storage_state 박제")
    parser.add_argument("--profile", required=True, help="예: lemonde")
    parser.add_argument("--url", required=True, help="로그인 시작 페이지")
    args = parser.parse_args()

    # The profile becomes a file name; refuse anything with a path component.
    profile = args.profile
    if not profile or profile in {".", ".."} or Path(profile).name != profile:
        parser.error("--profile must be a plain name without path separators")

    # Owner-only permissions for everything created from here on.
    os.umask(0o077)

    AUTH_DIR.mkdir(parents=True, exist_ok=True)
    AUTH_DIR.chmod(0o700)
    out = AUTH_DIR / f"{profile}.json"
    if out.exists():
        # Overwriting keeps the old mode, so tighten it before new secrets land.
        out.chmod(0o600)

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=False)
        try:
            context = browser.new_context(viewport={"width": 1366, "height": 900})
            page = context.new_page()
            page.goto(args.url)
            print("\n브라우저에서 로그인을 완료한 뒤 이 터미널에서 Enter 를 누르세요.")
            input("로그인 완료 후 Enter > ")
            context.storage_state(path=str(out))
        finally:
            # Close the window even if navigation or the prompt is interrupted.
            browser.close()

    # Defensive: enforce the final mode regardless of how the file was written.
    out.chmod(0o600)
    print(f"저장: {out} (600)")
    print("다음: scp 로 GPU ~/.local/share/crawl-auth/ 반영 + chmod 600")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -51,14 +51,10 @@ _FENCE_ANY = re.compile(r"(?m)^\s{0,3}(```|~~~)")
|
||||
|
||||
|
||||
def _looks_junk(title: str | None) -> bool:
|
||||
if not title:
|
||||
return False
|
||||
if _JUNK_ORG.search(title):
|
||||
return True
|
||||
letters = [c for c in title if c.isalpha()]
|
||||
if len(letters) >= 6 and sum(1 for c in letters if c.isupper()) / len(letters) >= 0.85:
|
||||
return True
|
||||
return False
|
||||
"""cover/TOC org-이름 junk. ★전부-대문자 휴리스틱은 폐기(2026-06-09): 기술문서의 정상
|
||||
all-caps heading('GENERAL REQUIREMENTS'/'WELDING')을 130건 과탐해 windowed/clean doc 을 거짓
|
||||
A_better 강등시켰음. 명시적 회사-접미사만 junk 로 본다(그것도 호출측이 cover 위치+미stored 로 게이트)."""
|
||||
return bool(title and _JUNK_ORG.search(title))
|
||||
|
||||
|
||||
def _make_engine():
|
||||
@@ -102,10 +98,14 @@ async def _measure_doc(session, doc_id):
|
||||
res["hash_stable_99"] = False
|
||||
|
||||
stored_titles = {s["section_title"] for s in stored if s["section_title"]}
|
||||
res["junk_b"] = any(_looks_junk(n.section_title) and n.section_title not in stored_titles for n in nodes)
|
||||
# junk = cover 영역(앞쪽 노드)의 신규 org-이름 heading 만 (positional). 본문 전반의 정상 heading 무관.
|
||||
res["junk_b"] = any(_looks_junk(n.section_title) and n.section_title not in stored_titles for n in nodes[:4])
|
||||
|
||||
# verdict 휴리스틱 (high-recall junk 보호 + absent-structure → A_better).
|
||||
# MEASURE2 가 canonical 분포를 이미 박제 — 이 verdict 는 재현/감사용. 애매(notes:ambiguous)는 PASS 미차단.
|
||||
# verdict 휴리스틱 = coarse 스크린(재현/감사용). ★2026-06-09 실집행의 authoritative 결정은 이게 아니라:
|
||||
# (a) 결정적 partition: pure_benefit(n_a<=2)/comparable(0.85<=ratio<=2) = 자동 INCLUDE,
|
||||
# overseg(ratio>2)/absent(ratio<0.85) = 적대 검증 대상.
|
||||
# (b) 적대 워크플로(judge+refute)가 위험 후보를 stored vs build 제목으로 per-doc INCLUDE/EXCLUDE 확정.
|
||||
# 이 휴리스틱 단독으로 destructive re-decompose 리스트를 만들지 말 것(junk 과탐·threshold 과적합 이력).
|
||||
# ★ apples-to-apples: 양쪽 모두 JUMP-TARGET 수로 비교(stored leaf 전수 X — window-child 가 n_a 를 부풀려
|
||||
# windowed doc 을 거짓 A_better 로 떨구는 bias 제거). stored jump-target = (비-window leaf OR %_split) + 제목.
|
||||
def _stored_is_jt(s):
|
||||
|
||||
@@ -317,7 +317,8 @@ async def cmd_run(args):
|
||||
{"ids": doc_ids, "pv": PROMPT_VERSION})).mappings().all()
|
||||
if pending:
|
||||
tot = sum(r["unanalyzed"] for r in pending)
|
||||
_log(f" [sweep] 미분석 leaf 잔여: {tot} (doc {len(pending)}) — 다음 실행이 이어서 분석(멱등). "
|
||||
_log(f" [sweep] 미분석 leaf 잔여: {tot} (doc {len(pending)}) — char_start 마커는 이들을 재선별 안 함; "
|
||||
f"`analyze` 커맨드로 수렴(`analyze --deadline HH:MM`, 멱등). "
|
||||
f"상위: {[(r['doc_id'], r['unanalyzed']) for r in pending[:5]]}")
|
||||
else:
|
||||
_log(" [sweep] 미분석 leaf 잔여 0 — 분석 수렴.")
|
||||
@@ -397,6 +398,70 @@ async def cmd_update_char_start(args):
|
||||
print("DEMOTE_DOC_IDS=" + ",".join(str(x) for x in demoted), flush=True)
|
||||
|
||||
|
||||
# 미분석 hier leaf 보유 doc 선별 (재분해 마커와 독립 — analyze 추적 별도 축, g3-t3).
|
||||
def _analyze_candidate_sql(doc_ids=None):
    """Build the query selecting documents that still have unanalyzed leaves.

    A leaf counts as unanalyzed when no ``chunk_section_analysis`` row exists
    for it with the bound prompt version (``:pv``) and the leaf's current
    content hash. When ``doc_ids`` is truthy the query is additionally
    restricted to ``:ids``; the caller must bind that parameter.

    Returns a SQLAlchemy text clause yielding distinct ``doc_id`` /
    ``ai_domain`` rows ordered by ``doc_id``.
    """
    if doc_ids:
        doc_filter = "AND dc.doc_id = ANY(:ids)"
    else:
        doc_filter = ""
    # Only a fixed fragment is interpolated; all values stay bound parameters.
    return text(f"""
        SELECT DISTINCT dc.doc_id AS doc_id, d.ai_domain AS ai_domain
        FROM document_chunks dc JOIN documents d ON d.id = dc.doc_id
        WHERE dc.source_type = 'hier_section' AND dc.is_leaf = true {doc_filter}
        AND NOT EXISTS (SELECT 1 FROM chunk_section_analysis a
        WHERE a.chunk_id = dc.id AND a.prompt_version = :pv
        AND a.source_content_hash = dc.chunk_content_hash)
        ORDER BY dc.doc_id
    """)
|
||||
|
||||
|
||||
async def cmd_analyze(args):
    """[g3-t3 self-heal] Analyze only the hier leaves that lack an analysis row.

    Candidate documents are chosen by "has an unanalyzed leaf"
    (``_analyze_candidate_sql``), independently of the re-decompose /
    char_start completion marker, so documents whose analysis was cut short
    (container recreate, deadline) are picked up again. Re-running is safe:
    leaves that already have a matching analysis row are not selected.

    ``--doc`` restricts the run to the given ids (default: every document with
    unanalyzed leaves). Processing stops once the deadline minus BUFFER_MIN
    minutes is reached, either between documents or when a document reports
    ``aborted``. A failure on one document is logged and skipped.
    """
    doc_ids = _parse_doc_ids(args)
    deadline = _compute_deadline(args.deadline)
    stop_at = (deadline - timedelta(minutes=BUFFER_MIN)).timestamp()
    _log(f"[analyze] deadline={deadline:%m-%d %H:%M} (stop_at={datetime.fromtimestamp(stop_at):%H:%M}) "
         f"{'doc-list='+str(len(doc_ids)) if doc_ids else 'all'} 미분석 leaf 보유 doc 선별")

    engine = _make_engine()
    sm = async_sessionmaker(engine, expire_on_commit=False)
    client = AIClient()
    model_name = settings.ai.triage.model

    # Bind parameters for the candidate query; ":ids" only when scoped.
    params = {"pv": PROMPT_VERSION}
    if doc_ids:
        params["ids"] = doc_ids

    # Running totals across all processed documents.
    n_docs = 0
    n_ok = 0
    n_fail = 0
    n_skip = 0
    try:
        async with sm() as session:
            result = await session.execute(_analyze_candidate_sql(doc_ids), params)
            cands = result.mappings().all()
        _log(f"[analyze] 후보 doc {len(cands)} (미분석 leaf 보유). 시작.")

        for cand in cands:
            if time.time() >= stop_at:
                _log(f"⏰ deadline 버퍼 도달 — 중단 (처리 {n_docs} doc)")
                break

            doc_id = cand["doc_id"]
            doc_domain = cand["ai_domain"] or "general"
            try:
                # One session per document keeps a failure local to that document.
                async with sm() as session:
                    st = await _analyze_doc_leaves(session, client, doc_id, doc_domain, model_name, stop_at)
            except Exception as exc:
                # Deliberate best-effort: log and move on to the next document.
                _log(f" ✗ doc={doc_id} 분석 실패(건너뜀): {type(exc).__name__}: {repr(exc)[:160]}")
                continue

            n_docs += 1
            n_ok += st["ok"]
            n_fail += st["fail"]
            n_skip += st["skip"]
            _log(f" ✓ doc={doc_id} ok={st['ok']} fail={st['fail']} skip={st['skip']} leaves={st['leaves']}"
                 f"{' [ABORT]' if st['aborted'] else ''} | 누적 {n_docs}doc {n_ok}ok")
            if st["aborted"]:
                _log("⏰ leaf 분석 중 deadline 도달 — 중단")
                break
    finally:
        await client.close()
        await engine.dispose()
    _log(f"=== [analyze] 종료: {n_docs} doc, ok={n_ok} fail={n_fail} skip={n_skip} ===")
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(description="오버나이트 hier 분해+절 분석 backfill (additive)")
|
||||
sub = ap.add_subparsers(dest="cmd", required=True)
|
||||
@@ -415,8 +480,13 @@ def main():
|
||||
p_upd = sub.add_parser("update-char-start",
|
||||
help="[g3-tU] hash_stable doc 비파괴 char_start UPDATE (100% VERIFY, --doc 필수)")
|
||||
p_upd.add_argument("--doc", default=None, help="comma-sep doc id (gm-t1 hash_stable 32)")
|
||||
p_an = sub.add_parser("analyze",
|
||||
help="[g3-t3] 미분석 hier leaf 만 분석(재분해 무관, 멱등) — recreate/deadline 으로 잘린 절분석 수렴")
|
||||
p_an.add_argument("--deadline", default="07:00", help="HH:MM (컨테이너 UTC, 07:00 KST=22:00 UTC)")
|
||||
p_an.add_argument("--doc", default=None, help="comma-sep doc id (미지정=미분석 leaf 보유 전체)")
|
||||
args = ap.parse_args()
|
||||
fn = {"dry-run": cmd_dry_run, "run": cmd_run, "update-char-start": cmd_update_char_start}[args.cmd]
|
||||
fn = {"dry-run": cmd_dry_run, "run": cmd_run, "update-char-start": cmd_update_char_start,
|
||||
"analyze": cmd_analyze}[args.cmd]
|
||||
asyncio.run(fn(args))
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /srv
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
COPY server.py .
|
||||
|
||||
EXPOSE 8765
|
||||
HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
|
||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8765/health')"
|
||||
|
||||
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8765"]
|
||||
@@ -0,0 +1,3 @@
|
||||
fastapi>=0.111.0
|
||||
uvicorn>=0.30.0
|
||||
asyncpg>=0.29.0
|
||||
@@ -0,0 +1,202 @@
|
||||
"""crawl-health — 전 소스 헬스 패널 1차 (A-8, plan crawl-24x7-1)
|
||||
|
||||
읽기 전용 내부 운영 패널. 의존 = 기존 수집 상태(news_sources/source_health/documents/
|
||||
processing_queue SELECT 만) — 쓰기 0.
|
||||
|
||||
[1차] 소스별 last success / 수집 건수 추이(24h/7d) / 연속 실패 / circuit 상태 /
|
||||
빈 피드 streak + fulltext 승격/격하 통계 + 큐 백로그. 비-RSS 소스(C-2 sitemap 등)도
|
||||
같은 표면이 수용 (fetch_method 컬럼 표시 — '구독 소스 패널' 로 좁히지 않는 전 소스 일반화).
|
||||
[2차 범위 외] B-3 상태 계약 도착 시 세션 열 + [재로그인 시도] 버튼(enqueue 방식).
|
||||
|
||||
노출: 별도 바인딩만 — compose 가 Tailscale 인터페이스(100.110.63.63)에만 publish.
|
||||
vhost/경로 가드 방식 금지 (r4: 둘 다 '덜 깨짐' 속성 상실). 앱 레벨 인증 없음 =
|
||||
Tailscale 도달성만이 경계 (fab-server 선례).
|
||||
"""
|
||||
|
||||
import html
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import asyncpg
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
|
||||
logger = logging.getLogger("crawl_health")
|
||||
|
||||
DSN = os.environ.get("CRAWL_HEALTH_DSN", "")
|
||||
|
||||
_pool: asyncpg.Pool | None = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Hold the shared asyncpg pool open for the lifetime of the app.

    Creates the module-level ``_pool`` from ``DSN`` (1 to 3 connections) before the
    app starts serving and closes it on shutdown.

    Args:
        _app: The FastAPI application (unused).
    """
    global _pool
    _pool = await asyncpg.create_pool(DSN, min_size=1, max_size=3)
    try:
        yield
    finally:
        # try/finally: the pool is closed even when an exception or cancellation
        # is thrown into the generator at the yield point.
        await _pool.close()
|
||||
|
||||
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
async def _collect_data() -> dict:
    """Run the panel's four SELECT queries and return the results as plain dicts.

    Returns:
        A dict with three lists:
        - ``"sources"``: one dict per ``news_sources`` row (left-joined with
          ``source_health``) plus ``items_24h`` / ``items_7d`` document counts.
        - ``"queue"``: ``processing_queue`` counts per stage/status, limited to the
          fulltext/summarize/embed/chunk stages and pending/processing/failed statuses.
        - ``"fulltext"``: counts of news documents per ``extract_meta.fulltext.status``.
    """
    sources_sql = """
            SELECT s.id, s.name, s.country, s.enabled, s.feed_type, s.fetch_method,
            s.fulltext_policy, s.last_fetched_at,
            h.circuit_state, h.consecutive_failures, h.last_success_at,
            h.last_error, h.last_error_at, h.last_fetch_items, h.empty_streak,
            h.total_fetches, h.total_failures
            FROM news_sources s
            LEFT JOIN source_health h ON h.source_id = s.id
            ORDER BY s.enabled DESC, s.name
            """
    # Documents are matched to a source by the 'news/<source name>/' file_path prefix;
    # the join itself is limited to the last 7 days, so count(d.id) is the 7-day total.
    counts_sql = """
            SELECT s.id,
            count(d.id) FILTER (WHERE d.extracted_at > now() - interval '24 hours') AS items_24h,
            count(d.id) AS items_7d
            FROM news_sources s
            LEFT JOIN documents d
            ON d.source_channel = 'news'
            AND d.extracted_at > now() - interval '7 days'
            AND d.file_path LIKE 'news/' || s.name || '/%'
            GROUP BY s.id
            """
    queue_sql = """
            SELECT stage::text AS stage, status::text AS status, count(*) AS n,
            min(created_at) FILTER (WHERE status = 'pending') AS oldest_pending
            FROM processing_queue
            WHERE stage IN ('fulltext', 'summarize', 'embed', 'chunk')
            AND status IN ('pending', 'processing', 'failed')
            GROUP BY 1, 2
            ORDER BY 1, 2
            """
    fulltext_sql = """
            SELECT extract_meta -> 'fulltext' ->> 'status' AS status, count(*) AS n
            FROM documents
            WHERE source_channel = 'news' AND extract_meta ? 'fulltext'
            GROUP BY 1
            """
    async with _pool.acquire() as conn:
        sources = await conn.fetch(sources_sql)
        counts = await conn.fetch(counts_sql)
        queue = await conn.fetch(queue_sql)
        fulltext = await conn.fetch(fulltext_sql)

    counts_by_id = {row["id"]: row for row in counts}
    source_rows = []
    for row in sources:
        # A source without a counts row falls back to zero for both windows.
        per_source = counts_by_id.get(row["id"], {})
        merged = dict(row)
        merged["items_24h"] = per_source.get("items_24h", 0)
        merged["items_7d"] = per_source.get("items_7d", 0)
        source_rows.append(merged)

    return {
        "sources": source_rows,
        "queue": [dict(row) for row in queue],
        "fulltext": [dict(row) for row in fulltext],
    }
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe for the Docker healthcheck — returns a constant payload, no DB access."""
    payload = {"status": "ok", "service": "crawl-health"}
    return payload
|
||||
|
||||
|
||||
@app.get("/api/health.json")
async def api_health():
    """Return the collected panel data as JSON.

    Every value exposing ``isoformat()`` (e.g. the datetimes in asyncpg rows) is
    converted to its ISO string; all other values pass through unchanged.
    """
    data = await _collect_data()

    def _ser(v):
        # datetime values from asyncpg rows -> ISO-format strings
        if hasattr(v, "isoformat"):
            return v.isoformat()
        return v

    payload = {}
    for section, rows in data.items():
        payload[section] = [{col: _ser(val) for col, val in row.items()} for row in rows]
    return JSONResponse(payload)
|
||||
|
||||
|
||||
def _chip(state: str | None, enabled: bool) -> str:
|
||||
if not enabled:
|
||||
return '<span class="chip off">OFF</span>'
|
||||
if state == "disabled":
|
||||
return '<span class="chip err">DISABLED</span>'
|
||||
if state == "open":
|
||||
return '<span class="chip warn">OPEN</span>'
|
||||
return '<span class="chip ok">OK</span>'
|
||||
|
||||
|
||||
def _fmt_ts(v) -> str:
|
||||
return v.strftime("%m-%d %H:%M") if v else "-"
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
async def index():
    """Render the health panel as one HTML page.

    Builds three tables from ``_collect_data()``: per-source health, the
    processing-queue backlog, and fulltext status counts. Every database-derived
    string is passed through ``html.escape`` before being embedded.
    """
    data = await _collect_data()
    rows = []
    for s in data["sources"]:
        # Last error is truncated to 80 characters, then escaped.
        err = html.escape((s.get("last_error") or "")[:80])
        warn_cls = ""
        # Highlight enabled sources with >= 3 consecutive failures or an empty-feed streak >= 8.
        if s["enabled"] and (s.get("consecutive_failures") or 0) >= 3:
            warn_cls = ' class="row-warn"'
        elif s["enabled"] and (s.get("empty_streak") or 0) >= 8:
            warn_cls = ' class="row-warn"'
        rows.append(
            f"<tr{warn_cls}>"
            f"<td>{html.escape(s['name'])}</td>"
            f"<td>{_chip(s.get('circuit_state'), s['enabled'])}</td>"
            f"<td>{html.escape(s.get('fetch_method') or 'rss')}</td>"
            f"<td>{html.escape(s.get('fulltext_policy') or 'none')}</td>"
            f"<td class='num'>{s['items_24h']}</td>"
            f"<td class='num'>{s['items_7d']}</td>"
            f"<td class='num'>{s.get('consecutive_failures') or 0}</td>"
            f"<td class='num'>{s.get('empty_streak') or 0}</td>"
            f"<td>{_fmt_ts(s.get('last_success_at'))}</td>"
            f"<td>{_fmt_ts(s.get('last_fetched_at'))}</td>"
            f"<td class='err-text'>{err}</td>"
            f"</tr>"
        )
    # Queue backlog rows: stage, status, count, oldest pending timestamp.
    qrows = [
        f"<tr><td>{html.escape(q['stage'])}</td><td>{html.escape(q['status'])}</td>"
        f"<td class='num'>{q['n']}</td><td>{_fmt_ts(q.get('oldest_pending'))}</td></tr>"
        for q in data["queue"]
    ]
    # Fulltext status rows; a NULL status renders as '-'.
    frows = [
        f"<tr><td>{html.escape(f['status'] or '-')}</td><td class='num'>{f['n']}</td></tr>"
        for f in data["fulltext"]
    ]
    # Doubled braces in the CSS are literal braces inside the f-string; the queue and
    # fulltext tables fall back to a placeholder row when they have no rows.
    body = f"""<!DOCTYPE html>
<html lang="ko"><head><meta charset="utf-8">
<title>crawl-health — 전 소스 헬스 패널</title>
<style>
body {{ font-family: -apple-system, 'Apple SD Gothic Neo', sans-serif; background: #f5f1e8;
color: #3d3a33; margin: 0; padding: 28px; }}
h1 {{ font-size: 19px; margin: 0 0 4px; }} h2 {{ font-size: 14px; margin: 26px 0 8px; }}
.sub {{ color: #8a8474; font-size: 12px; margin-bottom: 18px; }}
table {{ border-collapse: collapse; width: 100%; background: #fffdf8; font-size: 12.5px; }}
th, td {{ border: 1px solid #e3ddcd; padding: 5px 9px; text-align: left; }}
th {{ background: #ece6d6; font-weight: 600; white-space: nowrap; }}
td.num {{ text-align: right; font-variant-numeric: tabular-nums; }}
td.err-text {{ color: #9a4a3a; font-size: 11.5px; max-width: 320px; }}
tr.row-warn td {{ background: #fbf0e4; }}
.chip {{ display: inline-block; padding: 1px 8px; border-radius: 9px; font-size: 11px; font-weight: 600; }}
.chip.ok {{ background: #dce8d4; color: #3c5a2e; }}
.chip.warn {{ background: #f3e0b8; color: #7a5a14; }}
.chip.err {{ background: #eecfc6; color: #8a2f1d; }}
.chip.off {{ background: #e3ddcd; color: #6e6859; }}
</style></head><body>
<h1>crawl-health — 전 소스 헬스 패널</h1>
<div class="sub">A-8 1차 (피드 수집 헬스) · 내부 전용 (Tailscale 바인딩) · 새로고침 = 실시간 조회</div>
<h2>소스 ({len(rows)})</h2>
<table><tr><th>소스</th><th>circuit</th><th>fetch</th><th>fulltext</th><th>24h</th><th>7d</th>
<th>연속실패</th><th>빈피드</th><th>last success</th><th>last fetch</th><th>last error</th></tr>
{''.join(rows)}</table>
<h2>처리 큐 (fulltext / summarize / embed / chunk)</h2>
<table><tr><th>stage</th><th>status</th><th>건수</th><th>oldest pending</th></tr>
{''.join(qrows) or '<tr><td colspan="4">백로그 없음</td></tr>'}</table>
<h2>fulltext 승격 누적</h2>
<table><tr><th>status</th><th>건수</th></tr>
{''.join(frows) or '<tr><td colspan="2">기록 없음 (파일럿 전환 전)</td></tr>'}</table>
</body></html>"""
    return HTMLResponse(body)
|
||||
+109
-28
@@ -1,12 +1,18 @@
|
||||
"""marker-service — POST /convert: PDF → markdown + 추출 이미지 base64.
|
||||
|
||||
Phase 1B (2026-05-01) — 텍스트만 응답, 이미지 폐기.
|
||||
Phase 1B.5 (본 변경) — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이
|
||||
Phase 1B.5 — `_images` 직렬화해서 base64 응답에 포함. NAS write 권한이
|
||||
없는 stateless 변환기 유지 (fastapi 가 NAS persist 담당).
|
||||
D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환:
|
||||
MARKER_PRELOAD=0 : startup warmup 끔 (첫 /convert 시 lazy load)
|
||||
MARKER_IDLE_UNLOAD_MINUTES : N분 유휴 시 모델 해제 (0=비활성, 기존 동작)
|
||||
/ready 는 idle(미적재)에서도 200 — fastapi 의 depends_on service_healthy 가
|
||||
lazy 모드에서 영구 미기동으로 굳는 것 방지. 503 은 warmup_failed 한정.
|
||||
|
||||
plan: ~/.claude/plans/piped-humming-crystal.md
|
||||
"""
|
||||
import base64
|
||||
import gc
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
@@ -40,6 +46,12 @@ _warmup_done = False
|
||||
_warmup_error: str | None = None
|
||||
_warmup_lock = threading.Lock()
|
||||
|
||||
# D-1 idle-unload 상태 — 전이는 전부 _warmup_lock 아래
|
||||
_PRELOAD = os.getenv("MARKER_PRELOAD", "1") != "0"
|
||||
_IDLE_UNLOAD_MINUTES = int(os.getenv("MARKER_IDLE_UNLOAD_MINUTES", "0"))
|
||||
_inflight = 0
|
||||
_last_used = time.monotonic()
|
||||
|
||||
# 이미지 응답 cap. base64 응답 크기 폭주 방지. 사용자 PDF 풀 측정 (Phase 1D) 시
|
||||
# 가장 이미지 많은 문서가 ~30건 수준 → 200 은 안전 마진. 초과 시 truncate flag 응답.
|
||||
MAX_IMAGES_PER_DOC = int(os.getenv("MARKER_MAX_IMAGES_PER_DOC", "200"))
|
||||
@@ -68,11 +80,67 @@ def _ensure_warmup() -> None:
|
||||
raise
|
||||
|
||||
|
||||
def _acquire_models():
    """Guarantee warmup and register an in-flight request as one atomic step.

    Closes the race where the idle reaper unloads the models right after
    ``_ensure_warmup()`` has returned: ``_inflight`` is only incremented while
    ``_warmup_lock`` is held and ``_warmup_done`` is still true.
    """
    global _inflight
    while True:
        _ensure_warmup()
        with _warmup_lock:
            if _warmup_done:
                _inflight += 1
                return
        # Rare race: an unload slipped in between ensure and re-taking the lock — retry.
|
||||
|
||||
|
||||
def _release_models():
    """Leave the in-flight set and stamp the last-use time, both under ``_warmup_lock``."""
    global _inflight, _last_used
    with _warmup_lock:
        _last_used = time.monotonic()
        _inflight -= 1
|
||||
|
||||
|
||||
def _maybe_unload() -> None:
    """Unload the models when idle; never unload while a conversion is in flight.

    Author's note: gaps between the batches of a split conversion are on the order
    of seconds, so with a threshold of N >= 1 minute no unload happens between batches.
    """
    global _models, _converter, _warmup_done
    with _warmup_lock:
        # Nothing to do when models are not loaded or a request is still running.
        if not _warmup_done or _inflight > 0:
            return
        # Still within the idle window since the last release.
        if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60:
            return
        # Drop the references and mark warmup as not done so the next request reloads.
        _models = None
        _converter = None
        _warmup_done = False
        gc.collect()
        try:
            import torch
            torch.cuda.empty_cache()
        except Exception:
            # Best-effort: a failure importing torch or clearing the CUDA cache is ignored.
            pass
        logger.info(f"[marker-service] idle-unload: 모델 해제 (유휴 {_IDLE_UNLOAD_MINUTES}분 초과)")
|
||||
|
||||
|
||||
async def _idle_reaper():
    """Background loop: every 60 seconds try an idle unload; errors are logged, never raised."""
    import asyncio

    poll_seconds = 60
    while True:
        await asyncio.sleep(poll_seconds)
        try:
            _maybe_unload()
        except Exception:
            logger.exception("[marker-service] idle reaper 오류")
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
"""startup hook — async warmup 백그라운드. /ready 가 완료 여부 노출."""
|
||||
"""startup hook — warmup 은 MARKER_PRELOAD 게이트 (D-1: lazy 기본 전환은 compose 가)."""
|
||||
import asyncio
|
||||
asyncio.create_task(asyncio.to_thread(_ensure_warmup))
|
||||
if _PRELOAD:
|
||||
asyncio.create_task(asyncio.to_thread(_ensure_warmup))
|
||||
if _IDLE_UNLOAD_MINUTES > 0:
|
||||
asyncio.create_task(_idle_reaper())
|
||||
logger.info(f"[marker-service] idle-unload 활성: {_IDLE_UNLOAD_MINUTES}분")
|
||||
|
||||
|
||||
class ConvertRequest(BaseModel):
|
||||
@@ -111,7 +179,12 @@ def health():
|
||||
|
||||
@app.get("/ready")
|
||||
async def ready(response: Response):
|
||||
"""Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출."""
|
||||
"""Round 4 #1+#2: Response.status_code 명시 + warmup_error 노출.
|
||||
|
||||
D-1: idle(미적재) = 200. 503 은 warmup_failed 한정 — lazy 모드에서 fastapi
|
||||
depends_on service_healthy 가 영구 미기동으로 굳지 않게. 배포 검증에서
|
||||
'status=ready' 단언하던 runbook 은 강제 warm 호출(/convert 1건)로 대체.
|
||||
"""
|
||||
if _warmup_error:
|
||||
response.status_code = 503
|
||||
return {
|
||||
@@ -121,31 +194,28 @@ async def ready(response: Response):
|
||||
"error": _warmup_error,
|
||||
}
|
||||
if not _warmup_done:
|
||||
response.status_code = 503
|
||||
return {
|
||||
"status": "warming_up",
|
||||
"status": "warming_up" if _PRELOAD else "idle",
|
||||
"engine": "marker",
|
||||
"engine_version": _engine_version,
|
||||
"models_loaded": False,
|
||||
"idle_unload_minutes": _IDLE_UNLOAD_MINUTES,
|
||||
}
|
||||
return {
|
||||
"status": "ready",
|
||||
"engine": "marker",
|
||||
"engine_version": _engine_version,
|
||||
"models_loaded": True,
|
||||
"inflight": _inflight,
|
||||
"idle_unload_minutes": _IDLE_UNLOAD_MINUTES,
|
||||
}
|
||||
|
||||
|
||||
@app.post("/convert", response_model=ConvertResponse)
|
||||
async def convert(req: ConvertRequest):
|
||||
_ensure_warmup()
|
||||
|
||||
p = Path(req.file_path)
|
||||
if not p.is_file():
|
||||
raise HTTPException(404, detail={"code": "file_not_found", "message": str(p)})
|
||||
|
||||
start = time.monotonic()
|
||||
# page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음).
|
||||
# invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환.
|
||||
converter = _converter
|
||||
if req.start_page is not None and req.end_page is not None:
|
||||
if req.start_page < 1 or req.end_page < req.start_page:
|
||||
raise HTTPException(
|
||||
@@ -155,22 +225,33 @@ async def convert(req: ConvertRequest):
|
||||
"message": f"start_page={req.start_page} end_page={req.end_page}",
|
||||
},
|
||||
)
|
||||
page_range = list(range(req.start_page - 1, req.end_page)) # 0-based inclusive
|
||||
converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range})
|
||||
try:
|
||||
rendered = converter(str(p))
|
||||
except Exception as exc:
|
||||
logger.exception(f"[marker-service] conversion failed path={p}: {exc}")
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail={
|
||||
"code": "conversion_failed",
|
||||
"message": f"{type(exc).__name__}: {exc}",
|
||||
},
|
||||
) from exc
|
||||
|
||||
md_text, _meta, raw_images = text_from_rendered(rendered)
|
||||
elapsed_ms = int((time.monotonic() - start) * 1000)
|
||||
# D-1: warmup 보장 + inflight 진입 원자화 — 변환 중 reaper 해제 차단. 해제는 finally.
|
||||
_acquire_models()
|
||||
try:
|
||||
start = time.monotonic()
|
||||
# page range 지정 시 per-request converter (모델 _models 재사용 → reload 없음).
|
||||
# invariant: req.start_page/end_page = 1-based inclusive → marker 0-based 로 변환.
|
||||
converter = _converter
|
||||
if req.start_page is not None and req.end_page is not None:
|
||||
page_range = list(range(req.start_page - 1, req.end_page)) # 0-based inclusive
|
||||
converter = PdfConverter(artifact_dict=_models, config={"page_range": page_range})
|
||||
try:
|
||||
rendered = converter(str(p))
|
||||
except Exception as exc:
|
||||
logger.exception(f"[marker-service] conversion failed path={p}: {exc}")
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail={
|
||||
"code": "conversion_failed",
|
||||
"message": f"{type(exc).__name__}: {exc}",
|
||||
},
|
||||
) from exc
|
||||
|
||||
md_text, _meta, raw_images = text_from_rendered(rendered)
|
||||
elapsed_ms = int((time.monotonic() - start) * 1000)
|
||||
finally:
|
||||
_release_models()
|
||||
|
||||
images_payload, truncated = _serialize_images(raw_images, str(p))
|
||||
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
# B-3 / A-1 Tier 2 (plan crawl-24x7-1) — Playwright 격리 컨테이너.
|
||||
# 브라우저 hang/크래시가 fastapi APScheduler 를 잠식하지 않게 별도 서비스로 격리,
|
||||
# 타임아웃 있는 HTTP 호출로만 사용. 요청당 브라우저 기동 = 컨텍스트 누적 메모리 차단.
|
||||
FROM mcr.microsoft.com/playwright/python:v1.47.0-jammy
|
||||
|
||||
WORKDIR /srv
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY server.py .
|
||||
|
||||
# root 로 Chromium 실행 시 sandbox 비활성 강제됨 — 이미지 내장 pwuser(uid 1000)로 실행.
|
||||
# /auth ro mount(호스트 hyungi uid 1000, mode 600)도 동일 uid 라 판독 가능.
|
||||
USER pwuser
|
||||
|
||||
# internal-only — compose 네트워크 전용, host 포트 미매핑 (caddy 라우트 금지)
|
||||
EXPOSE 3400
|
||||
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "3400"]
|
||||
@@ -0,0 +1,3 @@
|
||||
fastapi==0.115.*
|
||||
uvicorn==0.32.*
|
||||
playwright==1.47.0
|
||||
@@ -0,0 +1,180 @@
|
||||
"""B-3 구독 세션 Playwright fetcher (plan crawl-24x7-1) + 익명 브라우저 fetch/다운로드 (사이클 3).
|
||||
|
||||
storage_state JSON(쿠키+localStorage 스냅샷) 기반 인증 페이지 fetch + 내용 기반 probe.
|
||||
- 동시 1 인스턴스 (글로벌 세마포어) — 계정 보호 + 사람 속도는 호출측 politeness 가 담당.
|
||||
- 요청당 브라우저 기동/종료 — 컨텍스트 메모리 누적·hang 잔존 차단 (저빈도라 기동비용 무관).
|
||||
- 세션 파일: /auth/{profile}.json (호스트 ~/.local/share/crawl-auth/, ro mount, 600).
|
||||
부재 = 503 profile_missing (silent fallback 없음 — 호출측이 degrade).
|
||||
- 시간 기반 만료 판정 금지 — probe 는 알려진 유료 기사에서 본문 길이 + 페이월 마커 부재 검증
|
||||
(만료 후 200 '페이월 안내문'이 본문으로 저장되는 silent corruption 차단).
|
||||
|
||||
사이클 3 증축 (C-2 CCPS Beacon — aiche.org 가 평문 httpx 를 UA 무관 403):
|
||||
- /fetch profile 생략 = 익명 컨텍스트 (storage_state 없음, 공개 페이지의 WAF 우회 전용).
|
||||
- /download = referer 페이지를 먼저 방문(WAF 쿠키 획득) 후 같은 컨텍스트의
|
||||
request.get 으로 바이너리(PDF) 다운로드 — base64 반환, 60MB cap.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from playwright.async_api import async_playwright, Error as PlaywrightError
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger("playwright-fetcher")
|
||||
|
||||
AUTH_DIR = Path("/auth")
|
||||
NAV_TIMEOUT_MS = 45_000
|
||||
SETTLE_MS = 1_500 # domcontentloaded 후 lazy 본문 settle 대기
|
||||
MAX_DOWNLOAD_BYTES = 60 * 1024 * 1024
|
||||
# Cloudflare JS 챌린지(title='Just a moment...')는 통과에 수 초 + 자동 재네비게이션이
|
||||
# 걸린다 — aiche.org 실측(2026-06-11): 1.5s settle 시점 스냅샷 = 인터스티셜.
|
||||
# 통과 못 하면 호출측 _CHALLENGE_MARKERS 가 최종 HTML 에서 차단 판정.
|
||||
CHALLENGE_POLL_TRIES = 8
|
||||
CHALLENGE_POLL_MS = 2_500
|
||||
|
||||
app = FastAPI(title="playwright-fetcher")
|
||||
_browser_slot = asyncio.Semaphore(1) # 동시 1 인스턴스 (B-3 ① persistent 제약과 동일 규율)
|
||||
|
||||
|
||||
class FetchReq(BaseModel):
    """Request body of POST /fetch."""
    url: str
    # None = anonymous context (WAF bypass for public pages — CCPS). A value = B-3 subscription session.
    # The pattern limits the name to 1-50 chars of [a-z0-9_-].
    profile: str | None = Field(default=None, pattern=r"^[a-z0-9_-]{1,50}$")
|
||||
|
||||
|
||||
class ProbeReq(BaseModel):
    """Request body of POST /probe (content-based session check)."""
    # Session profile name (required): 1-50 chars of [a-z0-9_-].
    profile: str = Field(pattern=r"^[a-z0-9_-]{1,50}$")
    # Page to load with the profile's session.
    probe_url: str
    # Minimum visible-text length for the probe to pass.
    min_body_chars: int = 800
    # Case-insensitive substrings whose presence in the visible text fails the probe.
    paywall_markers: list[str] = []
|
||||
|
||||
|
||||
class DownloadReq(BaseModel):
    """Request body of POST /download."""
    url: str
    # The referer page is visited first so the WAF challenge cookies are loaded into
    # the context before the download.
    referer: str | None = None
    # Optional session profile: 1-50 chars of [a-z0-9_-]; None = anonymous context.
    profile: str | None = Field(default=None, pattern=r"^[a-z0-9_-]{1,50}$")
|
||||
|
||||
|
||||
def _state_path(profile: str) -> Path:
    """Resolve a profile name to its storage-state file under ``AUTH_DIR``.

    Raises:
        HTTPException: 503 with ``error_reason="profile_missing"`` when
            ``<AUTH_DIR>/<profile>.json`` is not an existing file.
    """
    state_file = AUTH_DIR / f"{profile}.json"
    if state_file.is_file():
        return state_file
    raise HTTPException(503, detail={"error_reason": "profile_missing", "profile": profile})
|
||||
|
||||
|
||||
def _context_kwargs(state: Path | None) -> dict:
|
||||
kwargs = {"viewport": {"width": 1366, "height": 900}}
|
||||
if state is not None:
|
||||
# B-3 르몽드 세션 회귀 방지 — 기존 인증 fetch 의 locale 그대로
|
||||
kwargs["storage_state"] = str(state)
|
||||
kwargs["locale"] = "fr-FR"
|
||||
else:
|
||||
kwargs["locale"] = "en-US"
|
||||
return kwargs
|
||||
|
||||
|
||||
async def _settle(page) -> None:
    """Wait for the page to settle, then wait out a Cloudflare JS challenge if one is shown.

    After the base ``SETTLE_MS`` wait, the page title is polled up to
    ``CHALLENGE_POLL_TRIES`` times, ``CHALLENGE_POLL_MS`` apart. Returns as soon as
    the title no longer contains "just a moment"; if the challenge never clears, the
    interstitial is left in place for the caller.
    """
    await page.wait_for_timeout(SETTLE_MS)
    remaining = CHALLENGE_POLL_TRIES
    while remaining > 0:
        current_title = await page.title()
        if "just a moment" not in current_title.lower():
            return
        await page.wait_for_timeout(CHALLENGE_POLL_MS)
        remaining -= 1
|
||||
|
||||
|
||||
async def _browse(url: str, state: Path | None) -> tuple[str, str, str]:
    """Load ``url`` in a fresh Chromium instance and return ``(html, final_url, visible_text)``.

    One browser per request; the ``finally`` guarantees it is closed.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(**_context_kwargs(state))
            tab = await ctx.new_page()
            await tab.goto(url, wait_until="domcontentloaded", timeout=NAV_TIMEOUT_MS)
            await _settle(tab)
            markup = await tab.content()
            landed_url = tab.url
            # innerText of <body>, or an empty string when the document has no body.
            visible_text = await tab.evaluate("document.body ? document.body.innerText : ''")
        finally:
            await browser.close()
        return markup, landed_url, visible_text
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Report liveness plus the sorted profile names (``*.json`` stems) found in ``AUTH_DIR``."""
    if AUTH_DIR.is_dir():
        profiles = sorted(p.stem for p in AUTH_DIR.glob("*.json"))
    else:
        profiles = []
    return {"status": "ok", "profiles": profiles}
|
||||
|
||||
|
||||
@app.post("/fetch")
async def fetch(req: FetchReq):
    """Fetch a page through the browser and return its HTML and final URL.

    With ``req.profile`` the profile's storage state is used (503 when the file is
    missing); without it the context is anonymous. Playwright errors become
    502 ``browse_failed``.
    """
    state = None
    if req.profile:
        state = _state_path(req.profile)
    # Only one browser runs at a time.
    async with _browser_slot:
        try:
            page_html, final_url, _text = await _browse(req.url, state)
        except PlaywrightError as e:
            logger.warning("fetch 실패 %s: %s", req.url, e)
            raise HTTPException(502, detail={"error_reason": "browse_failed", "message": str(e)[:300]})
    logger.info("fetch ok profile=%s %s (%d bytes)", req.profile or "-", req.url, len(page_html))
    return {"html": page_html, "final_url": final_url}
|
||||
|
||||
|
||||
@app.post("/download")
async def download(req: DownloadReq):
    """Download a binary (PDF etc.): visit the referer to obtain WAF cookies, then GET in the same context.

    Judging the response's status/content_type is the caller's (crawl_politeness)
    responsibility. Here Playwright errors are mapped to 502 ``download_failed`` and
    bodies larger than ``MAX_DOWNLOAD_BYTES`` to 502 ``too_large`` (no silent fallback).

    Returns:
        A dict with the upstream ``status``, ``content_type`` and the base64-encoded
        body as ``body_b64``.
    """
    state = _state_path(req.profile) if req.profile else None
    # Only one browser runs at a time.
    async with _browser_slot:
        try:
            async with async_playwright() as pw:
                browser = await pw.chromium.launch(headless=True)
                try:
                    context = await browser.new_context(**_context_kwargs(state))
                    if req.referer:
                        page = await context.new_page()
                        await page.goto(req.referer, wait_until="domcontentloaded",
                                        timeout=NAV_TIMEOUT_MS)
                        await _settle(page)  # loads the CF challenge-pass cookies into the context
                    # The context's request API is used so the GET shares the context's cookies.
                    resp = await context.request.get(req.url, timeout=NAV_TIMEOUT_MS)
                    body = await resp.body()
                finally:
                    await browser.close()
        except PlaywrightError as e:
            logger.warning("download 실패 %s: %s", req.url, e)
            raise HTTPException(502, detail={"error_reason": "download_failed", "message": str(e)[:300]})
    # The size cap is enforced after the full body has been read.
    if len(body) > MAX_DOWNLOAD_BYTES:
        raise HTTPException(502, detail={"error_reason": "too_large", "bytes": len(body)})
    logger.info("download status=%d %s (%d bytes)", resp.status, req.url, len(body))
    return {
        "status": resp.status,
        "content_type": resp.headers.get("content-type", ""),
        "body_b64": base64.b64encode(body).decode(),
    }
|
||||
|
||||
|
||||
@app.post("/probe")
async def probe(req: ProbeReq):
    """Content-based session probe; the reason for ``ok=False`` is returned explicitly.

    Loads ``req.probe_url`` with the profile's session and checks the visible text:
    any paywall marker present (case-insensitive) or fewer than
    ``req.min_body_chars`` characters fails the probe.

    Returns:
        ``{"ok": bool, "reason": str | None, "body_chars": int}``. A Playwright
        error yields ``ok=False`` with a ``browse_failed`` reason, not an HTTP error.

    Raises:
        HTTPException: 503 ``profile_missing`` (from ``_state_path``) when the
            profile file does not exist.
    """
    state = _state_path(req.profile)
    # Only one browser runs at a time.
    async with _browser_slot:
        try:
            _, final_url, text = await _browse(req.probe_url, state)
        except PlaywrightError as e:
            return {"ok": False, "reason": f"browse_failed: {str(e)[:200]}", "body_chars": 0}
    body_chars = len(text.strip())
    # Lower-case the page text once instead of once per marker.
    text_folded = text.lower()
    # Empty markers are skipped; the first marker found in the text wins.
    hit = next((m for m in req.paywall_markers if m and m.lower() in text_folded), None)
    if hit:
        return {"ok": False, "reason": f"paywall_marker: {hit}", "body_chars": body_chars}
    if body_chars < req.min_body_chars:
        return {"ok": False, "reason": f"body_too_short: {body_chars} < {req.min_body_chars}",
                "body_chars": body_chars}
    logger.info("probe ok profile=%s (%d chars, final=%s)", req.profile, body_chars, final_url)
    return {"ok": True, "reason": None, "body_chars": body_chars}
|
||||
+83
-27
@@ -1,14 +1,23 @@
|
||||
"""STT 마이크로서비스 — faster-whisper (GPU) 기반 음성 전사.
|
||||
|
||||
filePath → {text, segments:[{start,end,text}]}.
|
||||
모델은 startup 에서 eager preload (Docker /ready healthcheck 가 모델 적재까지 검증).
|
||||
기본 모델 large-v3 (VRAM ~3GB, float16). 환경변수로 교체 가능.
|
||||
|
||||
환경변수 `STT_PRELOAD=0` 으로 lazy 로 강제 가능 (개발/테스트용).
|
||||
D-1 (plan crawl-24x7-1, 2026-06-10) — idle-unload 운영 전환:
|
||||
STT_PRELOAD=0 : startup eager preload 끔 (첫 요청 시 lazy load)
|
||||
STT_IDLE_UNLOAD_MINUTES: N분 유휴 시 모델 해제 (0=비활성, 기존 동작).
|
||||
faster-whisper=CTranslate2 라 torch 미설치 — 해제는
|
||||
참조 제거 + gc (CTranslate2 가 소멸 시 VRAM 반환).
|
||||
콜드로드 수초~수십 초는 호출측(stt_worker read=1800s)이 흡수. healthcheck 는
|
||||
cuda 가용성 기준 (compose) — 모델 적재는 더 이상 상시 상태가 아니다.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
import unicodedata
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
@@ -17,18 +26,26 @@ from fastapi import FastAPI
|
||||
|
||||
logger = logging.getLogger("stt")
|
||||
|
||||
_IDLE_UNLOAD_MINUTES = int(os.getenv("STT_IDLE_UNLOAD_MINUTES", "0"))
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(_app: FastAPI):
|
||||
# startup: 모델 eager preload 시도. 실패해도 프로세스는 살아 있고
|
||||
# /ready 가 false 로 남아 healthcheck 가 unhealthy 처리.
|
||||
# /ready 의 models_loaded 가 false 로 남는다.
|
||||
if os.getenv("STT_PRELOAD", "1") != "0":
|
||||
try:
|
||||
_load_model()
|
||||
logger.info("stt model preloaded: %s (%s, %s)", _MODEL_NAME, _DEVICE, _COMPUTE_TYPE)
|
||||
except Exception as e:
|
||||
logger.exception("stt model preload failed: %s", e)
|
||||
reaper = None
|
||||
if _IDLE_UNLOAD_MINUTES > 0:
|
||||
reaper = asyncio.create_task(_idle_reaper())
|
||||
logger.info("stt idle-unload 활성: %d분", _IDLE_UNLOAD_MINUTES)
|
||||
yield
|
||||
if reaper:
|
||||
reaper.cancel()
|
||||
|
||||
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
@@ -38,6 +55,11 @@ _MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
|
||||
_DEVICE = os.getenv("WHISPER_DEVICE", "cuda")
|
||||
_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "float16")
|
||||
|
||||
# load/unload/inflight 상태 전이는 전부 이 lock 아래 (cold 동시 요청 이중 로드 방지 포함)
|
||||
_model_lock = threading.Lock()
|
||||
_inflight = 0
|
||||
_last_used = time.monotonic()
|
||||
|
||||
|
||||
def _resolve_path(file_path: str) -> Path | None:
|
||||
"""NFC(DB) vs NFD(NFS) 한글 경로 정규화 차이 흡수. OCR 서비스와 동일 패턴."""
|
||||
@@ -61,14 +83,38 @@ def _resolve_path(file_path: str) -> Path | None:
|
||||
|
||||
|
||||
def _load_model():
|
||||
"""faster-whisper lazy loading — 첫 호출 시만 VRAM 점유."""
|
||||
"""faster-whisper lazy loading — 첫 호출 시만 VRAM 점유. lock 으로 이중 로드 방지."""
|
||||
global _model
|
||||
if _model is not None:
|
||||
return _model
|
||||
from faster_whisper import WhisperModel
|
||||
with _model_lock:
|
||||
if _model is None:
|
||||
from faster_whisper import WhisperModel
|
||||
logger.info("stt model loading: %s (%s, %s)", _MODEL_NAME, _DEVICE, _COMPUTE_TYPE)
|
||||
_model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE)
|
||||
return _model
|
||||
|
||||
_model = WhisperModel(_MODEL_NAME, device=_DEVICE, compute_type=_COMPUTE_TYPE)
|
||||
return _model
|
||||
|
||||
def _maybe_unload() -> None:
    """Unload the whisper model when idle; never unload while a request is in flight."""
    global _model
    with _model_lock:
        # Nothing to do when no model is loaded or a transcription is still running.
        if _model is None or _inflight > 0:
            return
        # Still within the idle window since the last use.
        if time.monotonic() - _last_used < _IDLE_UNLOAD_MINUTES * 60:
            return
        # Drop the reference and collect so the model object can be destroyed.
        _model = None
        gc.collect()
        logger.info("stt idle-unload: whisper 모델 해제 (유휴 %d분 초과)", _IDLE_UNLOAD_MINUTES)
|
||||
|
||||
|
||||
async def _idle_reaper():
    """Background loop: every 60 seconds try an idle unload; errors are logged, never raised."""
    poll_seconds = 60
    while True:
        await asyncio.sleep(poll_seconds)
        try:
            _maybe_unload()
        except Exception:
            logger.exception("stt idle reaper 오류")
|
||||
|
||||
|
||||
def _cuda_device_count() -> int:
|
||||
@@ -87,7 +133,7 @@ def health():
|
||||
|
||||
@app.get("/ready")
|
||||
def ready():
|
||||
"""Readiness — CUDA + 모델 상태. 배포 검증용."""
|
||||
"""Readiness — CUDA + 모델 상태. healthcheck 는 cuda 만 본다 (D-1 idle-unload)."""
|
||||
count = _cuda_device_count()
|
||||
cuda_ok = count > 0
|
||||
models_loaded = _model is not None
|
||||
@@ -98,6 +144,8 @@ def ready():
|
||||
"models_loaded": models_loaded,
|
||||
"model": _MODEL_NAME,
|
||||
"compute_type": _COMPUTE_TYPE,
|
||||
"idle_unload_minutes": _IDLE_UNLOAD_MINUTES,
|
||||
"inflight": _inflight,
|
||||
}
|
||||
|
||||
|
||||
@@ -121,6 +169,7 @@ async def transcribe(body: dict):
|
||||
"duration": 1832.5
|
||||
}
|
||||
"""
|
||||
global _inflight, _last_used
|
||||
raw_path = body["filePath"]
|
||||
langs = body.get("langs")
|
||||
beam_size = int(body.get("beamSize", 5))
|
||||
@@ -129,28 +178,35 @@ async def transcribe(body: dict):
|
||||
if resolved is None:
|
||||
return {"error": f"파일 없음: {raw_path}", "text": "", "segments": []}
|
||||
|
||||
model = _load_model()
|
||||
with _model_lock:
|
||||
_inflight += 1
|
||||
try:
|
||||
model = _load_model()
|
||||
|
||||
language = None
|
||||
if isinstance(langs, list) and len(langs) == 1:
|
||||
language = langs[0]
|
||||
language = None
|
||||
if isinstance(langs, list) and len(langs) == 1:
|
||||
language = langs[0]
|
||||
|
||||
segments_iter, info = model.transcribe(
|
||||
str(resolved),
|
||||
beam_size=beam_size,
|
||||
language=language,
|
||||
vad_filter=True,
|
||||
)
|
||||
segments_iter, info = model.transcribe(
|
||||
str(resolved),
|
||||
beam_size=beam_size,
|
||||
language=language,
|
||||
vad_filter=True,
|
||||
)
|
||||
|
||||
segments = []
|
||||
parts = []
|
||||
for seg in segments_iter:
|
||||
segments.append({
|
||||
"start": round(float(seg.start), 2),
|
||||
"end": round(float(seg.end), 2),
|
||||
"text": seg.text.strip(),
|
||||
})
|
||||
parts.append(seg.text)
|
||||
segments = []
|
||||
parts = []
|
||||
for seg in segments_iter:
|
||||
segments.append({
|
||||
"start": round(float(seg.start), 2),
|
||||
"end": round(float(seg.end), 2),
|
||||
"text": seg.text.strip(),
|
||||
})
|
||||
parts.append(seg.text)
|
||||
finally:
|
||||
with _model_lock:
|
||||
_inflight -= 1
|
||||
_last_used = time.monotonic()
|
||||
|
||||
return {
|
||||
"text": " ".join(p.strip() for p in parts).strip(),
|
||||
|
||||
@@ -17,6 +17,7 @@ from eid.compose import ( # noqa: E402
|
||||
_persona,
|
||||
compose,
|
||||
is_composed_surface,
|
||||
rules_present,
|
||||
)
|
||||
|
||||
_TASK = "<<<TASK_SENTINEL>>>"
|
||||
@@ -92,6 +93,51 @@ def test_study_diagnosis_overlay_placeholders_survive_compose():
|
||||
assert "{weakness_snapshot_block}" not in filled and "WB" in filled and "HB" in filled
|
||||
|
||||
|
||||
def test_eid_chat_surface_registered():
    """eid-chat D-1: chat surface = free prose (base), persona on, no feature overlay (invariant #3)."""
    surface = "eid_chat"
    assert is_composed_surface(surface), "eid_chat ROUTE_MAP 미등록"

    prompt = compose(surface, "")
    persona_marker = "이드"
    rules_marker = "보수적"
    assert persona_marker in prompt, "persona 미주입"
    assert rules_marker in prompt, "rules 미주입"

    # Persona text must come before the rules text.
    persona_at = prompt.index(persona_marker)
    rules_at = prompt.index(rules_marker)
    assert persona_at < rules_at, "persona→rules 순서 위반"
    assert "학습 진단 코치" not in prompt, "채팅 base 표면에 기능 overlay 누출"
|
||||
|
||||
|
||||
def test_rules_present_true_then_false():
    """D-6 fail-closed input: True while the vendored rules.md exists, False when it is absent.

    The degraded-banner behaviour of ``_rules()`` (other surfaces) is kept
    independently of this helper.
    """
    import eid.compose as compose_mod

    assert rules_present() is True, "vendored rules.md 가 있는데 False"

    saved_dir = compose_mod._SUBSTRATE_DIR
    missing_dir = Path("/nonexistent-substrate-dir-for-test")
    try:
        compose_mod._SUBSTRATE_DIR = missing_dir
        assert compose_mod.rules_present() is False, "rules.md 부재인데 True — fail-closed 판정 불가"
    finally:
        # Restore the module global even when the assertion fails.
        compose_mod._SUBSTRATE_DIR = saved_dir
|
||||
|
||||
|
||||
def test_rules_present_live_judgment():
    """D-6 gate must be a live check — guards against an ``lru_cache(_read)`` freeze regression.

    At one and the same path, creating rules.md must flip the result to True
    and deleting it must flip it back to False immediately.
    """
    import tempfile

    import eid.compose as compose_mod

    saved_dir = compose_mod._SUBSTRATE_DIR
    try:
        with tempfile.TemporaryDirectory() as tmp:
            compose_mod._SUBSTRATE_DIR = Path(tmp)
            rules_file = Path(tmp) / "rules.md"

            # Empty directory: nothing to find yet.
            assert compose_mod.rules_present() is False

            rules_file.write_text("rule", encoding="utf-8")
            assert compose_mod.rules_present() is True, "생성이 반영 안 됨 — 캐시 동결"

            rules_file.unlink()
            assert compose_mod.rules_present() is False, "삭제가 반영 안 됨 — 캐시 동결"
    finally:
        compose_mod._SUBSTRATE_DIR = saved_dir
|
||||
|
||||
|
||||
def _run():
|
||||
fns = [v for k, v in sorted(globals().items()) if k.startswith("test_")]
|
||||
fails = 0
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
"""POST /api/eid/chat endpoint 테스트 — inline ASGI app (DB 의존 0).
|
||||
|
||||
★ 실행 환경: fastapi + httpx 필요 → Docker/staging pytest (test_eid_ai_client.py 동일 idiom).
|
||||
★ DB 0: get_current_user 는 dependency_overrides 로 대체. 무인증/위조토큰 케이스는 실제
|
||||
auth 경로지만 decode 단계에서 거부돼 DB 접근 전 반환.
|
||||
★ LLM 0: 정상 경로는 EidAIClient.call_stream 을 fixture bytes yield 로 monkeypatch.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from fastapi import FastAPI
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "app"))
|
||||
|
||||
import eid.compose as eid_compose # noqa: E402
|
||||
from api.eid_chat import router as eid_chat_router # noqa: E402
|
||||
from core.auth import get_current_user # noqa: E402
|
||||
from eid.ai import EidAIClient # noqa: E402
|
||||
from services.llm.backends import BackendUnavailable # noqa: E402
|
||||
|
||||
_FIXTURES = Path(__file__).resolve().parents[1] / "fixtures"
|
||||
_SSE = (_FIXTURES / "router_sse_chat_macmini_26b.txt").read_bytes()
|
||||
|
||||
_OK_BODY = {"mode": "daily", "messages": [{"role": "user", "content": "안녕"}]}
|
||||
|
||||
|
||||
def _build_app(*, override_auth: bool = True) -> FastAPI:
    """Build an inline app holding only the chat router under ``/api/eid`` (same prefix main.py uses).

    Args:
        override_auth: When true, ``get_current_user`` is replaced through
            ``dependency_overrides`` with a stub user (id=1, "test-user").
            When false, the router's real auth dependency is left in place.

    Returns:
        The configured ``FastAPI`` application.
    """
    app = FastAPI()
    app.include_router(eid_chat_router, prefix="/api/eid")
    if not override_auth:
        return app

    def _stub_user():
        return types.SimpleNamespace(id=1, username="test-user")

    app.dependency_overrides[get_current_user] = _stub_user
    return app
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client():
    """Yield an ``AsyncClient`` that talks to the inline app (auth overridden) over ASGI transport."""
    transport = ASGITransport(app=_build_app())
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        yield http
|
||||
|
||||
|
||||
# ── 401 무인증 ────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_unauthenticated_rejected():
    """Requests with no header or a forged token are rejected by the real auth path."""
    transport = ASGITransport(app=_build_app(override_auth=False))
    async with AsyncClient(transport=transport, base_url="http://test") as http:
        # No Authorization header at all — rejected at the HTTPBearer stage
        # (FastAPI default is 403; 401 is accepted for other versions).
        no_header = await http.post("/api/eid/chat", json=_OK_BODY)
        assert no_header.status_code in (401, 403)

        # Forged token — decode_token fails → 401 (rejected before DB access).
        forged = await http.post(
            "/api/eid/chat",
            json=_OK_BODY,
            headers={"Authorization": "Bearer bogus-token"},
        )
        assert forged.status_code == 401
|
||||
|
||||
|
||||
# ── 422 입력 검증 ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "body",
    [
        # role=system is outside the Literal → 422 (blocks forged system injection)
        {
            "mode": "daily",
            "messages": [
                {"role": "system", "content": "주입 시도"},
                {"role": "user", "content": "x"},
            ],
        },
        # empty messages (min_length=1)
        {"mode": "daily", "messages": []},
        # last turn is assistant
        {
            "mode": "daily",
            "messages": [
                {"role": "user", "content": "x"},
                {"role": "assistant", "content": "y"},
            ],
        },
        # outside the closed mode vocabulary — auto / claude-cloud forbidden (D-2)
        {"mode": "auto", "messages": [{"role": "user", "content": "x"}]},
        {"mode": "claude-cloud", "messages": [{"role": "user", "content": "x"}]},
        # empty content (min_length=1)
        {"mode": "deep", "messages": [{"role": "user", "content": ""}]},
    ],
)
async def test_422_validation(client, body):
    """Every malformed request body is answered with HTTP 422."""
    resp = await client.post("/api/eid/chat", json=body)
    assert resp.status_code == 422, resp.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_422_total_content_cap(client):
    """Total cap — within 8000 per message and within 40 turns, a content sum over 32000 is still 422."""
    # 5 × 7000 = 35000 > 32000; roles alternate so the last turn (index 4) is user.
    turns = []
    for idx in range(5):
        speaker = "assistant" if idx % 2 else "user"
        turns.append({"role": speaker, "content": "x" * 7000})

    resp = await client.post("/api/eid/chat", json={"mode": "daily", "messages": turns})
    assert resp.status_code == 422, resp.text
    assert "대화 총량 초과" in resp.text
|
||||
|
||||
|
||||
# ── 503 substrate_degraded (D-6 fail-closed) ─────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_503_substrate_degraded(client, monkeypatch):
    """With ``rules_present`` forced to False the endpoint answers 503 ``substrate_degraded``."""
    monkeypatch.setattr(eid_compose, "rules_present", lambda: False)

    resp = await client.post("/api/eid/chat", json=_OK_BODY)
    assert resp.status_code == 503

    payload = resp.json()
    assert payload["error_reason"] == "substrate_degraded"
    assert "detail" in payload
|
||||
|
||||
|
||||
# ── 503 backend_unavailable (스트림 시작 전, ask 컨벤션 shape) ────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_503_backend_unavailable_prestream(client, monkeypatch):
    """``BackendUnavailable`` raised before the first chunk becomes a 503 JSON error body."""

    async def _raising_stream(self, mode, messages, system):
        raise BackendUnavailable("qwen-macbook", "macbook_unavailable")
        yield b""  # pragma: no cover — unreachable; keeps this an async generator

    monkeypatch.setattr(EidAIClient, "call_stream", _raising_stream)

    request_body = {"mode": "deep", "messages": [{"role": "user", "content": "x"}]}
    resp = await client.post("/api/eid/chat", json=request_body)
    assert resp.status_code == 503

    payload = resp.json()
    assert payload["error"] == "backend_unavailable"
    assert payload["error_reason"] == "macbook_unavailable"
    assert payload["backend_requested"] == "qwen-macbook"
|
||||
|
||||
|
||||
# ── 정상 경로 — SSE raw pass-through ──────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_200_stream_passthrough(client, monkeypatch):
    """Happy path: SSE bytes yielded by ``call_stream`` reach the client unchanged."""
    seen: dict = {}
    slice_size = 256

    async def _fixture_stream(self, mode, messages, system):
        seen.update(mode=mode, messages=messages, system=system)
        # Yield in slices — checks that the endpoint relays chunks untouched.
        offset = 0
        while offset < len(_SSE):
            yield _SSE[offset : offset + slice_size]
            offset += slice_size

    monkeypatch.setattr(EidAIClient, "call_stream", _fixture_stream)

    resp = await client.post("/api/eid/chat", json=_OK_BODY)
    assert resp.status_code == 200, resp.text

    headers = resp.headers
    assert headers["content-type"].startswith("text/event-stream")
    assert headers["cache-control"] == "no-store"
    assert headers["x-accel-buffering"] == "no"

    # The fixture's data: lines arrive unmodified (raw pass-through).
    body = resp.content
    assert body == _SSE
    assert b'data: {"id"' in body
    assert b"data: [DONE]" in body

    # call_stream input: mode as given + the user turn + composed system prompt (persona included).
    assert seen["mode"] == "daily"
    assert seen["messages"] == [{"role": "user", "content": "안녕"}]
    assert "이드" in seen["system"], "system 에 compose 합본(persona) 미주입"
    assert "보수적" in seen["system"], "system 에 rules 미주입"
|
||||
|
||||
|
||||
# ── 스트림 시작 후 절단 — traceback 전파 0, 조용히 종료 ──────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_200_midstream_abort_quiet(client, monkeypatch):
    """``BackendUnavailable`` mid-stream: only the partial body is sent and no exception propagates.

    (The front end treats the missing ``data: [DONE]`` as a truncation.)
    """
    first_event = b'data: {"x": 1}\n\n'

    async def _truncated_stream(self, mode, messages, system):
        yield first_event
        raise BackendUnavailable("qwen-macbook", "stream_deadline_exceeded")

    monkeypatch.setattr(EidAIClient, "call_stream", _truncated_stream)

    request_body = {"mode": "deep", "messages": [{"role": "user", "content": "x"}]}
    resp = await client.post("/api/eid/chat", json=request_body)

    assert resp.status_code == 200
    assert resp.content == first_event
    assert b"data: [DONE]" not in resp.content
|
||||
@@ -0,0 +1,318 @@
|
||||
"""EidAIClient.call_stream 단위 테스트 — mode 닫힌 매핑·egress 차단·SSE 라인 단위 중계.
|
||||
|
||||
★ 실행 환경: httpx + config(settings) 필요 → Docker/staging pytest
|
||||
(tests/eid/test_eid_ai_client.py 와 동일 idiom, MacBook 로컬 deps 없으면 hard-fail).
|
||||
★ httpx 호출은 MockTransport 로 대체 — 실제 네트워크 0 (DB 의존 0).
|
||||
★ 차단 대상 host 문자열은 런타임 분할 조립 — 차단을 *테스트*하는 코드지 호출 아님
|
||||
(meter-guard 오탐 회피, test_eid_ai_client.py 동일).
|
||||
★ 스트림 검증 = byte-equal 아님: call_stream 이 data: JSON 의 model 을 mode 어휘로
|
||||
치환 + usage 제거(머신 경로/텔레메트리 비노출) — content 누적·프레이밍 보존을 본다.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "app"))
|
||||
|
||||
import eid.ai as eid_ai # noqa: E402
|
||||
from eid.ai import EidAIClient, EidEgressBlocked # noqa: E402
|
||||
from services.llm.backends import BackendUnavailable # noqa: E402
|
||||
from services.search.llm_gate import _reset_for_test # noqa: E402
|
||||
|
||||
_FIXTURES = Path(__file__).resolve().parents[1] / "fixtures"
|
||||
_SSE_MACMINI = (_FIXTURES / "router_sse_chat_macmini_26b.txt").read_bytes()
|
||||
_SSE_QWEN = (_FIXTURES / "router_sse_chat_qwen_27b.txt").read_bytes()
|
||||
|
||||
_BLOCKED_HOST = "anthropic" + ".com"
|
||||
|
||||
_MSG = [{"role": "user", "content": "안녕"}]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_gate():
    """Reset the LLM gate before and after every test.

    The daily (mac-mini-default) path acquires the mlx gate, so it is reset
    for each fresh event loop.
    """
    _reset_for_test()  # setup
    yield
    _reset_for_test()  # teardown
|
||||
|
||||
|
||||
def _patch_transport(monkeypatch, handler):
    """Make every ``httpx.AsyncClient`` built inside ``eid.ai`` use a ``MockTransport``.

    All other constructor arguments are passed through unchanged; only the
    ``transport`` keyword is set to a mock transport wrapping ``handler``.
    """
    original_client = httpx.AsyncClient

    def _make_client(*args, **kwargs):
        patched_kwargs = {**kwargs, "transport": httpx.MockTransport(handler)}
        return original_client(*args, **patched_kwargs)

    monkeypatch.setattr(eid_ai.httpx, "AsyncClient", _make_client)
|
||||
|
||||
|
||||
def _data_objs(raw: bytes) -> list[dict]:
|
||||
"""SSE bytes → data: JSON 객체 목록 ([DONE] 제외)."""
|
||||
objs = []
|
||||
for line in raw.split(b"\n"):
|
||||
if line.startswith(b"data: ") and line[len(b"data: "):].strip() != b"[DONE]":
|
||||
objs.append(json.loads(line[len(b"data: "):]))
|
||||
return objs
|
||||
|
||||
|
||||
def _content_concat(raw: bytes) -> str:
    """Concatenate every ``delta.content`` in the stream — used to verify the body is lossless.

    A missing or falsy ``content`` contributes an empty string.
    """
    pieces = []
    for obj in _data_objs(raw):
        delta = obj["choices"][0]["delta"]
        pieces.append(delta.get("content") or "")
    return "".join(pieces)
|
||||
|
||||
|
||||
# ── mode 닫힌 매핑 / egress 차단 ──────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("bad_mode", ["auto", "claude-cloud", "mac-mini-default", "bogus"])
async def test_unknown_mode_blocked(bad_mode):
    """Unknown mode = ``EidEgressBlocked`` — anything outside the closed mapping (daily/deep) is blocked, direct alias names included."""
    ai = EidAIClient()
    try:
        events = ai.call_stream(bad_mode, _MSG, "sys")
        with pytest.raises(EidEgressBlocked):
            await anext(events)
    finally:
        await ai.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_anthropic_router_url_blocked(monkeypatch):
    """``call_stream`` blocks the call even if the router URL is mis-wired to an external host (mirrors the existing ``_request`` pattern)."""
    external_url = "https://api." + _BLOCKED_HOST
    monkeypatch.setattr(eid_ai, "_router_url", lambda: external_url)

    ai = EidAIClient()
    try:
        events = ai.call_stream("deep", _MSG, "sys")
        with pytest.raises(EidEgressBlocked):
            await anext(events)
    finally:
        await ai.close()
|
||||
|
||||
|
||||
# ── alias 매핑 + payload shape + 라인 단위 중계(model 치환·usage 제거) ────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_deep_mode_alias_and_sse_line_rewrite(monkeypatch):
    """deep → qwen-macbook alias; system prompt injected once as messages[0]; line-by-line sanitised relay."""
    recorded: dict = {}

    def handler(request: httpx.Request) -> httpx.Response:
        recorded["url"] = str(request.url)
        recorded["json"] = json.loads(request.content)
        sse_headers = {"content-type": "text/event-stream"}
        return httpx.Response(200, content=_SSE_QWEN, headers=sse_headers)

    _patch_transport(monkeypatch, handler)
    ai = EidAIClient()
    try:
        pieces = [chunk async for chunk in ai.call_stream("deep", _MSG, "SYS_SENTINEL")]
    finally:
        await ai.close()

    relayed = b"".join(pieces)

    # (a) accumulated content equals the fixture's (delta bodies are lossless)
    assert _content_concat(relayed) == _content_concat(_SSE_QWEN) != ""
    relayed_objs = _data_objs(relayed)
    assert len(relayed_objs) == len(_data_objs(_SSE_QWEN))

    # (b) model field replaced by the mode word — no MacBook filesystem path / real model name exposed
    assert all(obj["model"] == "deep" for obj in relayed_objs)
    assert b"mlx-models" not in relayed and b"Qwen" not in relayed

    # (c) usage (machine telemetry) is absent
    assert all("usage" not in obj for obj in relayed_objs)
    assert b"peak_memory" not in relayed

    # (d) data: [DONE] is preserved
    assert b"data: [DONE]" in relayed

    # (e) blank-line framing preserved — line count and blank-line positions match the fixture
    relayed_shape = [bool(line) for line in relayed.split(b"\n")]
    fixture_shape = [bool(line) for line in _SSE_QWEN.split(b"\n")]
    assert relayed_shape == fixture_shape

    # Outgoing request: endpoint and payload shape.
    assert recorded["url"].endswith("/v1/chat/completions")
    sent = recorded["json"]
    assert sent["model"] == "qwen-macbook"
    assert sent["stream"] is True
    assert sent["max_tokens"] == 2048
    assert sent["temperature"] == 0.4
    assert sent["messages"][0] == {"role": "system", "content": "SYS_SENTINEL"}
    assert sent["messages"][1:] == _MSG
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_daily_mode_alias_macmini(monkeypatch):
    """daily → mac-mini-default alias (through the mlx gate) plus line-by-line sanitised relay."""

    class _TinyChunks(httpx.AsyncByteStream):
        """Emit the fixture 7 bytes at a time so chunk boundaries fall mid-line / mid-JSON — exercises line buffering."""

        async def __aiter__(self):
            step = 7
            start = 0
            while start < len(_SSE_MACMINI):
                yield _SSE_MACMINI[start : start + step]
                start += step

        async def aclose(self):
            return None

    def handler(request: httpx.Request) -> httpx.Response:
        sent = json.loads(request.content)
        assert sent["model"] == "mac-mini-default"
        sse_headers = {"content-type": "text/event-stream"}
        return httpx.Response(200, stream=_TinyChunks(), headers=sse_headers)

    _patch_transport(monkeypatch, handler)
    ai = EidAIClient()
    try:
        pieces = [chunk async for chunk in ai.call_stream("daily", _MSG, "sys")]
    finally:
        await ai.close()

    relayed = b"".join(pieces)

    # (a) same accumulated content / (b) model replaced / (c) no usage / (d) [DONE] / (e) framing
    assert _content_concat(relayed) == _content_concat(_SSE_MACMINI) != ""
    relayed_objs = _data_objs(relayed)
    assert all(obj["model"] == "daily" for obj in relayed_objs)
    assert b"gemma" not in relayed
    assert all("usage" not in obj for obj in relayed_objs)
    assert b"data: [DONE]" in relayed
    relayed_shape = [bool(line) for line in relayed.split(b"\n")]
    fixture_shape = [bool(line) for line in _SSE_MACMINI.split(b"\n")]
    assert relayed_shape == fixture_shape
|
||||
|
||||
|
||||
# ── 스트림 시작 전 에러 → BackendUnavailable (ask 어휘 일치) ──────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_prestream_503_maps_reason(monkeypatch):
    """``error.type`` from the router's 503 body becomes the error reason (same vocabulary as ask)."""

    def handler(request: httpx.Request) -> httpx.Response:
        error_body = {"error": {"type": "macbook_unavailable"}}
        return httpx.Response(503, json=error_body)

    _patch_transport(monkeypatch, handler)
    ai = EidAIClient()
    try:
        events = ai.call_stream("deep", _MSG, "sys")
        with pytest.raises(BackendUnavailable) as caught:
            await anext(events)
        failure = caught.value
        assert failure.reason == "macbook_unavailable"
        assert failure.backend_name == "qwen-macbook"
    finally:
        await ai.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_prestream_503_no_body_falls_back_router_503(monkeypatch):
    """A 503 whose body is not JSON is reported with the generic ``router_503`` reason."""

    def handler(request: httpx.Request) -> httpx.Response:
        return httpx.Response(503, content=b"oops not json")

    _patch_transport(monkeypatch, handler)
    ai = EidAIClient()
    try:
        events = ai.call_stream("deep", _MSG, "sys")
        with pytest.raises(BackendUnavailable) as caught:
            await anext(events)
        assert caught.value.reason == "router_503"
    finally:
        await ai.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_prestream_connect_error_maps_router_prefix(monkeypatch):
    """Connection failure → ``router_<exception name>`` (matches the ``RouterBackend._post`` vocabulary)."""

    def handler(request: httpx.Request) -> httpx.Response:
        raise httpx.ConnectError("connection refused")

    _patch_transport(monkeypatch, handler)
    ai = EidAIClient()
    try:
        events = ai.call_stream("deep", _MSG, "sys")
        with pytest.raises(BackendUnavailable) as caught:
            await anext(events)
        assert caught.value.reason == "router_ConnectError"
    finally:
        await ai.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_prestream_400_raises_valueerror_failloud(monkeypatch):
    """Router 400 = alias drift in the closed mapping, i.e. a code bug.

    It must fail loud as ``ValueError`` rather than ``BackendUnavailable``
    (mirrors the ``RouterBackend._post`` convention).
    """

    def handler(request: httpx.Request) -> httpx.Response:
        return httpx.Response(400, json={"error": "unknown_alias"})

    _patch_transport(monkeypatch, handler)
    ai = EidAIClient()
    try:
        events = ai.call_stream("deep", _MSG, "sys")
        with pytest.raises(ValueError, match="router rejected alias='qwen-macbook'"):
            await anext(events)
    finally:
        await ai.close()
|
||||
|
||||
|
||||
# ── wall-clock deadline (게이트 점유 무한화 차단) ─────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stream_deadline_exceeded(monkeypatch):
    """Exceeding the upstream entry-to-end deadline → ``BackendUnavailable(stream_deadline_exceeded)``."""

    class _StallStream(httpx.AsyncByteStream):
        """Stall after the first chunk — mimics a pattern a per-chunk read timeout does not catch."""

        async def __aiter__(self):
            yield b'data: {"choices": []}\n\n'
            await asyncio.sleep(30)

        async def aclose(self):
            return None

    def handler(request: httpx.Request) -> httpx.Response:
        sse_headers = {"content-type": "text/event-stream"}
        return httpx.Response(200, stream=_StallStream(), headers=sse_headers)

    _patch_transport(monkeypatch, handler)
    # Shrink the wall-clock deadline so the stalled stream trips it quickly.
    monkeypatch.setattr(eid_ai, "_STREAM_DEADLINE_S", 0.05)

    ai = EidAIClient()
    try:
        events = ai.call_stream("deep", _MSG, "sys")
        with pytest.raises(BackendUnavailable) as caught:
            async for _ in events:
                pass
        failure = caught.value
        assert failure.reason == "stream_deadline_exceeded"
        assert failure.backend_name == "qwen-macbook"
    finally:
        await ai.close()
|
||||
|
||||
|
||||
# ── error_reason allowlist sanitize ──────────────────────────────────────────
|
||||
|
||||
|
||||
def test_stream_error_reason_sanitized():
    """The final reason must match the ``[a-z0-9_]{1,64}`` allowlist.

    Anything else (uppercase, spaces, dict fragments) is generalised to
    ``upstream_502`` for 502 or ``router_error`` otherwise, so serialised dict
    fragments are never exposed.
    """
    from eid.ai import _stream_error_reason

    cases = [
        # known vocabulary passes through unchanged (same as ask)
        (503, b'{"error": {"type": "macbook_unavailable"}}', "macbook_unavailable"),
        (503, b"oops not json", "router_503"),
        (418, b"{}", "router_http_418"),
        # 502 + extraction failure → upstream_502 (no upstream_502_{dict...} fragment)
        (502, b'{"error": {"detail": "x"}}', "upstream_502"),
        # outside the allowlist (uppercase / spaces / special chars / too long) → generalised
        (502, b'{"error": {"type": "Bad Gateway!"}}', "upstream_502"),
        (503, b'{"error": {"type": "Weird Reason"}}', "router_error"),
        (503, b'{"error": {"type": "' + b"a" * 80 + b'"}}', "router_error"),
    ]
    for status, body, expected in cases:
        assert _stream_error_reason(status, body) == expected
|
||||
@@ -0,0 +1,848 @@
|
||||
<!DOCTYPE html>
|
||||
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<title>American Petroleum Institute | API | Standards News Highlights</title>
|
||||
|
||||
<link rel="apple-touch-icon" sizes="180x180" href="/library/APIWeb/favicon/apple-touch-icon.png">
|
||||
<link rel="icon" type="image/png" sizes="32x32" href="/library/APIWeb/favicon/favicon-32x32.png">
|
||||
<link rel="icon" type="image/png" sizes="16x16" href="/library/APIWeb/favicon/favicon-16x16.png">
|
||||
<link rel="manifest" href="/library/APIWeb/favicon/site.webmanifest">
|
||||
|
||||
|
||||
|
||||
<meta charset="UTF-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
|
||||
<meta name="keywords" content="api, american petroleum institute, what is energy, monogram, ICP, api standards, fossil fuels facts, api 510, api.org, api certification, api 653, api 570, api 610, api 650, api icp, keystone xl, LNG exports, pipeline, refinery, hydraulic fracturing, api 6a, petroleum, shale gas, api q1, api oil, api training, gasoline, natural gas, oil sands, groundwater, ethanol, E15," />
|
||||
|
||||
<meta name="description" content="Stay informed with the latest API standards updates, addenda, and global adoption trends. Explore how these standards support industry safety and innovation.
|
||||
" />
|
||||
|
||||
<link rel="canonical" href="https://www.api.org/products-and-services/standards/important-standards-announcements" />
|
||||
|
||||
<meta property="fb:admins" content="rachidmrad" />
|
||||
|
||||
<meta property="fb:app_id" content="880866755363034" />
|
||||
|
||||
<meta property="og:url" content="https://www.api.org/products-and-services/standards/important-standards-announcements" />
|
||||
|
||||
<meta property="og:type" content="website" />
|
||||
|
||||
<meta property="og:title" content="Standards News Highlights" />
|
||||
|
||||
<meta property="og:description" content="Stay informed with the latest API standards updates, addenda, and global adoption trends. Explore how these standards support industry safety and innovation.
|
||||
" />
|
||||
|
||||
<meta property="og:image" content="https://www.api.org/-/media/APIWebsite/news-policies-and-issues/liveblog/APILogo-liveblog-primary-debate-06262019.jpg" />
|
||||
|
||||
<meta property="og:site_name" content="American Petroleum Institute" />
|
||||
|
||||
<meta name="twitter:card" content="summary_large_image" />
|
||||
|
||||
<meta name="twitter:site" content="@APIenergy" />
|
||||
|
||||
<meta name="twitter:title" content="Standards News Highlights" />
|
||||
|
||||
<meta name="twitter:description" content="Stay informed with the latest API standards updates, addenda, and global adoption trends. Explore how these standards support industry safety and innovation.
|
||||
" />
|
||||
|
||||
<meta name="twitter:image" content="https://www.api.org/-/media/APIWebsite/news-policies-and-issues/liveblog/APILogo-liveblog-primary-debate-06262019.jpg" />
|
||||
|
||||
|
||||
|
||||
<style>
|
||||
.carousel {
|
||||
position: relative;
|
||||
z-index: 2;
|
||||
}
|
||||
.carousel-header {
|
||||
position: relative;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-size: cover;
|
||||
background-repeat: no-repeat;
|
||||
background-position: top;
|
||||
overflow:hidden;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: flex-start;
|
||||
|
||||
}
|
||||
.carousel-mobile {
|
||||
display: block;
|
||||
width: 120%;
|
||||
height: auto;
|
||||
}
|
||||
.carousel-title {
|
||||
position: absolute;
|
||||
inset: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: center;
|
||||
line-height: 1.25;
|
||||
margin:0 auto 0 4rem;
|
||||
width: 70%;
|
||||
}
|
||||
.carousel-heading,
|
||||
.carousel-subtext {
|
||||
margin: 0;
|
||||
color: #fff;
|
||||
text-transform: uppercase;
|
||||
width: 80%;
|
||||
}
|
||||
@media screen and (min-width: 992px) {
|
||||
.carousel-mobile {
|
||||
display: none;
|
||||
}
|
||||
.carousel-title h1, .carousel-title h2 {
|
||||
font-size: 3.3rem;
|
||||
}
|
||||
.carousel-top {
|
||||
height: 600px;
|
||||
}
|
||||
}
|
||||
@media screen and (max-width:991px) {
|
||||
.carousel-top > div:first-child {
|
||||
background-image: none !important;
|
||||
}
|
||||
.carousel-top {
|
||||
height: 700px;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<!-- Redesign CSS/Fonts -->
|
||||
<link rel="stylesheet" href="/library/APIWeb/css/style.css" />
|
||||
<link rel="stylesheet" href="https://use.typekit.net/wvd1mgg.css" />
|
||||
<script src="https://kit.fontawesome.com/a190867a63.js" crossorigin="anonymous"></script>
|
||||
<!--BEGIN COVEO NOINDEX-->
|
||||
<!-- Google Tag Manager -->
|
||||
<script>
|
||||
(function (w, d, s, l, i) {
|
||||
w[l] = w[l] || []; w[l].push({
|
||||
'gtm.start': new Date().getTime(), event: 'gtm.js'
|
||||
});
|
||||
var f = d.getElementsByTagName(s)[0],
|
||||
j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
|
||||
j.async = true; j.src = 'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
|
||||
f.parentNode.insertBefore(j, f);
|
||||
})(window, document, 'script', 'dataLayer', 'GTM-PR8RWJW');
|
||||
</script>
|
||||
<!-- End Google Tag Manager -->
|
||||
<!--END COVEO NOINDEX-->
|
||||
<script src="https://code.jquery.com/jquery-1.11.0.min.js"></script>
|
||||
<!-- reCAPTCHA -->
|
||||
<script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
|
||||
<script type="text/javascript">
|
||||
var your_site_key = '6LdAV_ciAAAAAGP_PxfR-AzZcVbyQQXyJc9OV461';
|
||||
var recaptchaRenderAttempted = false;
|
||||
var recaptchaCheckCount = 0;
|
||||
var maxChecks = 20; // ~10 seconds max
|
||||
|
||||
function renderRecaptcha() {
|
||||
// Stop retrying if container doesn't exist
|
||||
var container = document.getElementById('ReCaptchContainer');
|
||||
if (!container) {
|
||||
if (recaptchaCheckCount === 0) {
|
||||
console.log('No reCAPTCHA container found. Skipping checks for this page.');
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Already rendered?
|
||||
if (recaptchaRenderAttempted) return;
|
||||
|
||||
// Max retries
|
||||
if (recaptchaCheckCount >= maxChecks) {
|
||||
console.warn('reCAPTCHA load timeout after ' + (maxChecks * 500 / 1000) + 's');
|
||||
return;
|
||||
}
|
||||
|
||||
recaptchaCheckCount++;
|
||||
|
||||
if (typeof grecaptcha !== 'undefined' && typeof grecaptcha.render === 'function') {
|
||||
grecaptcha.render('ReCaptchContainer', {
|
||||
'sitekey': your_site_key,
|
||||
'theme': 'light',
|
||||
'type': 'image',
|
||||
'size': 'normal'
|
||||
});
|
||||
recaptchaRenderAttempted = true;
|
||||
console.log('reCAPTCHA rendered successfully');
|
||||
} else {
|
||||
// Only log first few waits to avoid console spam
|
||||
if (recaptchaCheckCount <= 5) {
|
||||
console.log('Waiting for grecaptcha (' + recaptchaCheckCount + ')...');
|
||||
}
|
||||
setTimeout(renderRecaptcha, 500);
|
||||
}
|
||||
}
|
||||
|
||||
// Start after DOM ready
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
renderRecaptcha();
|
||||
});
|
||||
|
||||
// Fallback for late load
|
||||
window.addEventListener('load', function () {
|
||||
setTimeout(renderRecaptcha, 1000);
|
||||
});
|
||||
</script>
|
||||
<!-- reCAPTCHA End -->
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body class="">
|
||||
|
||||
|
||||
|
||||
<!-- Google Tag Manager (noscript) -->
|
||||
<noscript>
|
||||
<iframe src="https://www.googletagmanager.com/ns.html?id=GTM-PR8RWJW"
|
||||
height="0" width="0" style="display:none;visibility:hidden"></iframe>
|
||||
</noscript>
|
||||
<!-- End Google Tag Manager (noscript) -->
|
||||
|
||||
|
||||
|
||||
<header>
|
||||
<div class="header">
|
||||
<div class="second-nav">
|
||||
<div class="logo-container">
|
||||
<a href="/">
|
||||
<img id="apiLogo" src="/library/APIWeb/img/apiLogoPrimary.svg" alt="API Logo" />
|
||||
</a>
|
||||
<a href="/">
|
||||
<img id="apiMobileLogo" src="/library/APIWeb/img/apiMobileLogo.svg" alt="API Mobile Logo" />
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<ul class="utilities">
|
||||
|
||||
|
||||
<li>
|
||||
<a href="/about">About</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/membership">Membership</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://events.api.org/" target="_blank" rel="noopener noreferrer" class="external-url">
|
||||
Events
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://www.apiwebstore.org/?utm_campaign=apitowebstore&utm_source=navigation&utm_medium=web" target="_blank" rel="noopener noreferrer" class="external-url">
|
||||
Webstore
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://myportal.api.org/" target="_blank" rel="noopener noreferrer" class="external-url">
|
||||
API Portal
|
||||
</a>
|
||||
</li>
|
||||
|
||||
<li><a href="/contact" class="btn-red">Contact</a></li>
|
||||
</ul>
|
||||
|
||||
<div class="mobile-nav">
|
||||
<div class="nav-trigger menu-toggle">
|
||||
<img class="menu-icon"
|
||||
src="/library/APIWeb/img/icons/bars-solid-full.svg"
|
||||
data-cross-src="/library/APIWeb/img/icons/xmark-solid-full.svg"
|
||||
alt="Mobile menu trigger">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="main-nav">
|
||||
|
||||
|
||||
<div class="parent-nav">
|
||||
<div class="main-item"
|
||||
id="parent-606c005fb35d407da123d1b7f735cfc6"
|
||||
data-haschildren="true"
|
||||
tabindex="0"
|
||||
role="button"
|
||||
aria-haspopup="true"
|
||||
aria-expanded="false"
|
||||
aria-label="Issues menu toggle">
|
||||
Issues
|
||||
</div>
|
||||
<div class="main-item"
|
||||
id="parent-5f629feb57834e2691684a3f5d4b24e6"
|
||||
data-haschildren="true"
|
||||
tabindex="0"
|
||||
role="button"
|
||||
aria-haspopup="true"
|
||||
aria-expanded="false"
|
||||
aria-label="Energy Insights menu toggle">
|
||||
Energy Insights
|
||||
</div>
|
||||
<div class="main-item" id="parent-95ad0674d8244aeb92db777cc4a9c2cf">
|
||||
<a href="/news-policy-and-issues/news" aria-label="News link">News</a>
|
||||
</div>
|
||||
<div class="main-item"
|
||||
id="parent-c374e7ac945b4221917ed410add09145"
|
||||
data-haschildren="true"
|
||||
tabindex="0"
|
||||
role="button"
|
||||
aria-haspopup="true"
|
||||
aria-expanded="false"
|
||||
aria-label="Products + Services menu toggle">
|
||||
Products + Services
|
||||
</div>
|
||||
|
||||
<div class="main-item search-container">
|
||||
<form>
|
||||
<div>
|
||||
|
||||
|
||||
|
||||
<div id="_051E6CD8-A1A3-4CF3-97EA-E3C6386C2F6B"
|
||||
data-search-interface-id='coveo728075ff'
|
||||
>
|
||||
|
||||
<div class='coveo-search-section'>
|
||||
|
||||
|
||||
|
||||
|
||||
<script>
|
||||
document.addEventListener("CoveoSearchEndpointInitialized", function() {
|
||||
var searchboxElement = document.getElementById("_185F67BF-1D73-4AEE-9564-FDE5B0EB24D9");
|
||||
searchboxElement.addEventListener("CoveoComponentInitialized", function() {
|
||||
CoveoForSitecore.initSearchboxIfStandalone(searchboxElement, "/searchresults");
|
||||
});
|
||||
})
|
||||
</script> <div id="_185F67BF-1D73-4AEE-9564-FDE5B0EB24D9_container" class="coveo-for-sitecore-search-box-container"
|
||||
data-prebind-maximum-age='currentMaximumAge'
|
||||
data-pipeline='API Site Search'
|
||||
>
|
||||
<div id="_185F67BF-1D73-4AEE-9564-FDE5B0EB24D9"
|
||||
class="CoveoSearchbox"
|
||||
data-enable-lowercase-operators='true'
|
||||
data-enable-omnibox='true'
|
||||
data-enable-partial-match='true'
|
||||
data-enable-query-suggest-addon='true'
|
||||
data-partial-match-keywords='5'
|
||||
data-partial-match-threshold='75%'
|
||||
data-prebind-maximum-age='currentMaximumAge'
|
||||
data-pipeline='API Site Search'
|
||||
data-placeholder='Find Something'
|
||||
data-query-suggest-character-threshold='0'
|
||||
data-clear-filters-on-new-query='true'
|
||||
>
|
||||
|
||||
|
||||
<script type="text/javascript">
|
||||
// Once the Coveo search endpoint is initialized, either announce this
// searchbox component via a "CoveoComponentInitialized" event, or flag it as
// invalid when the Coveo resources are not present on the page.
document.addEventListener("CoveoSearchEndpointInitialized", function () {
    var searchbox = document.getElementById("_185F67BF-1D73-4AEE-9564-FDE5B0EB24D9");

    // Without the global Coveo object there is nothing to initialize.
    if (typeof (Coveo) === "undefined") {
        searchbox.classList.add("invalid");
        console.error("The Coveo Resources component must be included in this page.");
        return;
    }

    // Non-bubbling, cancelable event, dispatched on the next tick.
    var readyEvent = document.createEvent("CustomEvent");
    readyEvent.initEvent("CoveoComponentInitialized", false, true);

    setTimeout(function () {
        searchbox.dispatchEvent(readyEvent);
    }, 0);
});
|
||||
</script>
|
||||
<div class="CoveoForSitecoreBindWithUserContext"></div>
|
||||
<div class="CoveoForSitecoreExpressions"></div>
|
||||
<div class="CoveoForSitecoreConfigureSearchHub" data-sc-search-hub="searchresults"></div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script type="text/javascript">
|
||||
document.addEventListener("CoveoSearchEndpointInitialized", function() {
|
||||
var externalComponentsSection = document.getElementById("_051E6CD8-A1A3-4CF3-97EA-E3C6386C2F6B");
|
||||
CoveoForSitecore.initExternalComponentsSection(externalComponentsSection);
|
||||
});
|
||||
</script>
|
||||
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="dropdown">
|
||||
<div class="dropdown-panel" for="parent-606c005fb35d407da123d1b7f735cfc6">
|
||||
<div class="dropdown-desc">
|
||||
<p class="h5">Issues</p>
|
||||
<p>Learn how we engage with policy makers to ensure safe, reliable, and affordable energy for the future as demand continues to grow.</p>
|
||||
</div>
|
||||
|
||||
<div class="dropdown-items">
|
||||
<a href="/news-policy-and-issues/access" >Access & Production</a>
|
||||
<a href="/news-policy-and-issues/trade-global-markets" >Trade & Global Markets</a>
|
||||
<a href="/news-policy-and-issues/fuels-refining" >Fuels & Refining</a>
|
||||
<a href="/news-policy-and-issues/infrastructure-permitting" >Infrastructure & Permitting</a>
|
||||
<a href="/news-policy-and-issues/tax" >Tax Policy</a>
|
||||
<a href="/news-policy-and-issues/climate" >Climate Policy</a>
|
||||
<a href="/news-policy-and-issues/safety" >Industry Safety</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="dropdown-panel" for="parent-5f629feb57834e2691684a3f5d4b24e6">
|
||||
<div class="dropdown-desc">
|
||||
<p class="h5">Energy Insights</p>
|
||||
<p>API's Energy Insights Hub provides updated statistics, data visualizations, timely analysis, and in-depth reports on all aspects of the oil and natural gas industry.</p>
|
||||
</div>
|
||||
|
||||
<div class="dropdown-items">
|
||||
<a href="/energy-insights/charts-analysis" >Charts & Analysis</a>
|
||||
<a href="/energy-insights/industry-explained" >Industry Explained </a>
|
||||
<a href="/energy-insights/studies" >Studies</a>
|
||||
<a href="/energy-insights/statistics" >Statistics</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="dropdown-panel" for="parent-c374e7ac945b4221917ed410add09145">
|
||||
<div class="dropdown-desc">
|
||||
<p class="h5">Global Industry Services</p>
|
||||
<p>API’s Global Industry Services drives safety and efficiency within the oil and gas industry through standards, certifications, assessments, training and more.</p>
|
||||
<p class="add-on">
|
||||
<a href="/products-and-services/get-a-quote" >Request a Quote</a>
|
||||
</p>
|
||||
<p class="add-on">
|
||||
<a href="/products-and-services/certifications-directories" >Certifications Directories</a>
|
||||
</p>
|
||||
<p class="add-on">
|
||||
<a href="/products-and-services/worldwide-representatives" >Worldwide Representatives</a>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="dropdown-items">
|
||||
<a href="/products-and-services/standards" >Standards</a>
|
||||
<a href="/products-and-services/site-safety" >Process Safety Site Assessment Program</a>
|
||||
<a href="/products-and-services/pipeline-sms-assessment-program" >Pipeline SMS Assessment Program</a>
|
||||
<a href="/products-and-services/contractor-safety" >Pipeline SMS Contractor Assessment Program</a>
|
||||
<a href="/products-and-services/offshore-safety" >Center for Offshore Safety (COS)</a>
|
||||
<a href="/products-and-services/individual-certification-programs" >Individual Certification Programs (ICP)</a>
|
||||
<a href="/products-and-services/training" >Training</a>
|
||||
<a href="/products-and-services/api-monogram-and-apiqr" >API Monogram and APIQR</a>
|
||||
<a href="/products-and-services/api-monogram-and-apiqr#tab-repair-and-remanufacture">API Repair and Remanufacture Program</a>
|
||||
<a href="/products-and-services/witnessing-programs" >19B Perforator Program</a>
|
||||
<a href="/products-and-services/engine-oil" >Engine Oil (EOLCS)</a>
|
||||
<a href="/products-and-services/diesel-exhaust-fluid" >Diesel Exhaust Fluid (DEF)</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
|
||||
|
||||
<!-- ===== Page content wrapper from redesign ===== -->
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<div class="page-wrapper">
|
||||
<div class="large-banner" style="background-image:url('/-/media/APIWebsite/Banners/Large/gis-lg-v2.jpg');">
|
||||
<div class="title">
|
||||
<h1>Standards News Highlights</h1>
|
||||
</div>
|
||||
</div>
|
||||
<div class="page-wrapper-inner">
|
||||
<div class="breadcrumbs">
|
||||
|
||||
|
||||
<span class="breadcrumbs-parent"><a href="/products-and-services">Products + Services</a></span>
|
||||
/
|
||||
|
||||
|
||||
<span class="breadcrumbs-child">
|
||||
<a href="/products-and-services/standards">Standards</a>
|
||||
|
||||
</span>
|
||||
/
|
||||
|
||||
|
||||
<span class="breadcrumbs-child">Standards News Highlights</span>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="two-columns">
|
||||
<div class="left">
|
||||
|
||||
|
||||
|
||||
<div class="pagination-wrapper">
|
||||
<div class="row">
|
||||
<div class="col-lg-8">
|
||||
<div class="pagination-container"><ul class="pagination"><li class="page-item active"><a>1</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=2&pageSize=10">2</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=3&pageSize=10">3</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=4&pageSize=10">4</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=5&pageSize=10">5</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=6&pageSize=10">6</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=7&pageSize=10">7</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=8&pageSize=10">8</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=9&pageSize=10">9</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=10&pageSize=10">10</a></li><li class="page-item disabled PagedList-ellipses"><a>…</a></li><li class="page-item PagedList-skipToNext"><a href="/products-and-services/standards/important-standards-announcements?page=2&pageSize=10" rel="next">»</a></li><li class="page-item PagedList-skipToLast"><a href="/products-and-services/standards/important-standards-announcements?page=12&pageSize=10">»»</a></li></ul></div>
|
||||
</div>
|
||||
<div class="col-lg-4">
|
||||
<form action="/products-and-services/standards/important-standards-announcements" method="post"> <div class="form-group">
|
||||
<select class="form-control" id="pageSize" name="pageSize" onchange="this.form.submit();"><option value="0">Show All</option>
|
||||
<option value="50">Show 50</option>
|
||||
<option value="20">Show 20</option>
|
||||
<option selected="selected" value="10">Show 10</option>
|
||||
</select>
|
||||
</div>
|
||||
</form> </div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/api-announces-47th-edition-of-foundational-line-pipe-standard-5l">API Announces 47th Edition of Foundational Line Pipe Standard</a></h3>
|
||||
|
||||
<p style="margin-bottom: 0in; line-height: normal;">WASHINGTON, June 2, 2026 — The American Petroleum Institute (API) today announced the publication of the <a href="https://www.apiwebstore.org/standards/5L?utm_campaign=43765133-Standards-5L&utm_source=pr" style="color: blue; margin: 0px; padding: 0px; border: 0px; line-height: inherit;"><span style="text-decoration: underline;">47th edition of API Specification 5L (API Spec 5L), </span><em><span style="text-decoration: underline;">Line Pipe</span></em></a>. Originally published in 1924 as API’s first standard, API Spec 5L has supported the safe and reliable manufacture of steel line pipe used to transport oil and gas for more than a century. The 47th edition includes important new requirements across more than 15 topic areas, including high-frequency weld (HFW) pipe quality and pipe used in CO2 transport.</p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/api-announces-47th-edition-of-foundational-line-pipe-standard-5l">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/api-spec-5l-47th">API Specification 5L, Line Pipe</a></h3>
|
||||
|
||||
<p>API has published the 47th edition of American Petroleum Institute <a href="https://www.apiwebstore.org/standards/5L">Specification 5L (API Spec 5L), Line Pipe</a>. Originally introduced as API’s first standard in 1924, the updated edition includes new requirements across more than 15 topic areas to support the safe and reliable manufacture of steel line pipe used in energy transportation, including CO2 transport. </p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/api-spec-5l-47th">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/api-rp-1192">API Recommended Practice 1192 (RP 1192), Transportation of Carbon Dioxide by Pipeline</a></h3>
|
||||
|
||||
<p>API has published <span style="line-height: 18.4px;"><a href="https://www.apiwebstore.org/standards/1192?utm_campaign=32314451-rp-1192&utm_source=pub-announcement">Recommended Practice 1192 (RP 1192), <em>Transportation of Carbon Dioxide by Pipeline</em></a></span><span style="line-height: 18.4px;">. This first edition standard provides performance requirements for the safe and reliable transport of carbon dioxide (CO<sub>2</sub>) by pipeline. It also addresses the design, construction, operation, and management of CO<sub>2</sub> </span><span style="line-height: 18.4px;">pipelines.</span></p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/api-rp-1192">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/api-strengthens-requirements-for-steel-casing-and-tubing">API Strengthens Requirements for Steel Casing and Tubing</a></h3>
|
||||
|
||||
<p style="margin-bottom: 0in; line-height: normal;"><span>WASHINGTON, May 5, 2025 — The American Petroleum Institute (API) is pleased to announce the publication of an </span><a href="https://www.apiwebstore.org/standards/5CT?utm_campaign=Spec%205ct&utm_source=standardshighlights&utm_medium=PubNotice&__hstc=23321061.e37da81b94fb192a0eca1fd2b60ae651.1745351530524.1745592885893.1745605800922.3&__hssc=23321061.3.1745605800922&__hsfp=509228229">Addendum to the 11th edition of the API 5CT, Casing and Tubing</a><span>. The update strengthens the requirements for the manufacture of steel casing and tubing used in oil and gas drilling and production operations, enhancing safety, environmental protection and operational integrity.</span></p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/api-strengthens-requirements-for-steel-casing-and-tubing">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/addendum-1-rp-1183">Addendum to API RP 1183 for Improved Dent Screening</a></h3>
|
||||
|
||||
<p><span style="color: black;">In December 2020, American Petroleum Institute (API) published Recommended Practice 1183, First<sup> </sup>Edition (RP 1183), <em>Assessment and Management of Dents in Pipelines</em>. Since being issued, RP 1183 has been applied by pipeline operating companies and engineering consultants providing services to the energy pipeline industry to evaluate dents and deformations on pipeline systems and to support mitigation and repair decisions. RP 1183 includes various screening tools to estimate the remaining fatigue life of a dent in a pipeline. </span></p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/addendum-1-rp-1183">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/2025-international-standards-report-announcement">New API Report Highlights Broader Global Adoption of API Standards </a></h3>
|
||||
|
||||
<p>February 4, 2025 – The American Petroleum Institute (API) today released a new report, <em><a href="/-/media/APIWebsite/products-and-services/2025_Intnl-Usage_Report_web-final.pdf">2025 API Standards International Usage Report</a></em>, detailing the growing international influence of API standards. The report identifies where governments and standards bodies reference API standards in policies, national and international standards, and technical regulations, highlighting the paramount role of API standards in advancing safety, sustainability, and efficiency across the global natural gas and oil industry.</p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/2025-international-standards-report-announcement">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/20s-3d-printing-update">API Enhances 3D Printing Guidelines with Updated Additive Manufacturing Standard</a></h3>
|
||||
|
||||
<p style="margin-bottom: 0in; line-height: normal;"><span style="color: black;">The American Petroleum Institute (API) is pleased to announce the release of the second edition of API Standard 20S, <em>Qualification of Metal Additive Manufacturing Processes and Components Production Control for Use in the Petroleum and Natural Gas Industries</em>. This update strengthens the industry’s ability to effectively deploy additive manufacturing (AM), or 3D printing, improving efficiency, supply chain resilience and sustainability across oil and natural gas operations.</span></p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/20s-3d-printing-update">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/addendum-1-to-api-rp-2001-10th-ed">Addendum 1 to API RP 2001, 10th Edition: Fire Protection in Refineries</a></h3>
|
||||
|
||||
<p>API has published Addendum 1 to API Recommended Practice 2001, 10th Edition - "Fire Protection in Refineries."<br />
|
||||
<br />
|
||||
This addendum strengthens existing fire safety measures by introducing new protocols for pre-planning and incident response in refineries.</p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/addendum-1-to-api-rp-2001-10th-ed">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/api-5ct-casing-and-tubing-addendum-1-11th">Addendum to the 11th edition of the API 5CT, Casing and Tubing</a></h3>
|
||||
|
||||
<p><span style="color: black;">The American Petroleum Institute (API) is pleased to announce the publication of an Addendum to the 11th edition of the API 5CT, <em>Casing and Tubing</em>. The update strengthens the requirements for the manufacture of steel casing and tubing used in oil and gas drilling and production operations, enhancing safety, environmental protection and operational integrity.</span></p>
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/api-5ct-casing-and-tubing-addendum-1-11th">More »</a></p>
|
||||
</div>
|
||||
<div class="media-item">
|
||||
<h3><a href="/products-and-services/standards/important-standards-announcements/api-updates-fire-protection-standard-for-refineries">API Updates Fire Protection Standard for Refineries</a></h3>
|
||||
|
||||
<p>October 24, 2024 – The American Petroleum Institute (API) today announced the publication of Addendum 1 to API RP 2001, 10th Edition, “Fire Protection in Refineries.” This addendum strengthens existing fire safety measures by introducing new protocols for pre-planning and incident response in refineries.</p>
|
||||
<br class="t-last-br" />
|
||||
<p><a class="more" href="/products-and-services/standards/important-standards-announcements/api-updates-fire-protection-standard-for-refineries">More »</a></p>
|
||||
</div>
|
||||
<div class="pagination-wrapper">
|
||||
<div class="row">
|
||||
<div class="col-lg-8">
|
||||
<div class="pagination-container"><ul class="pagination"><li class="page-item active"><a>1</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=2&pageSize=10">2</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=3&pageSize=10">3</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=4&pageSize=10">4</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=5&pageSize=10">5</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=6&pageSize=10">6</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=7&pageSize=10">7</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=8&pageSize=10">8</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=9&pageSize=10">9</a></li><li class="page-item"><a href="/products-and-services/standards/important-standards-announcements?page=10&pageSize=10">10</a></li><li class="page-item disabled PagedList-ellipses"><a>…</a></li><li class="page-item PagedList-skipToNext"><a href="/products-and-services/standards/important-standards-announcements?page=2&pageSize=10" rel="next">»</a></li><li class="page-item PagedList-skipToLast"><a href="/products-and-services/standards/important-standards-announcements?page=12&pageSize=10">»»</a></li></ul></div>
|
||||
</div>
|
||||
<div class="col-lg-4">
|
||||
<form action="/products-and-services/standards/important-standards-announcements" method="post"> <div class="form-group">
|
||||
<select class="form-control" id="pageSize" name="pageSize" onchange="this.form.submit();"><option value="0">Show All</option>
|
||||
<option value="50">Show 50</option>
|
||||
<option value="20">Show 20</option>
|
||||
<option selected="selected" value="10">Show 10</option>
|
||||
</select>
|
||||
</div>
|
||||
</form> </div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
<div class="right">
|
||||
|
||||
|
||||
|
||||
<div class="sidebar">
|
||||
<ul>
|
||||
<li>
|
||||
<a href="/products-and-services/standards">Overview</a>
|
||||
</li>
|
||||
<li>
|
||||
<a class="active" href="/products-and-services/standards/important-standards-announcements">Standards News Highlights</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/purchase">Purchase</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/committees">Committees</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/global-standards">Global Standards</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/get-involved">Get Involved</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/standards-plan">Standards Plan</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/standards-inquiries">Requests for Interpretation</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/products-and-services/standards/rights-and-usage-policy">Copyright Information</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<link rel="stylesheet" href="/library/APIWeb/css/subpage.css" />
|
||||
|
||||
|
||||
|
||||
|
||||
<!-- ===== Footer placeholder (Footer.cshtml renders here) ===== -->
|
||||
<!--BEGIN COVEO NOINDEX-->
|
||||
<footer>
|
||||
|
||||
|
||||
|
||||
<div class="related-products">
|
||||
<div class="wrap">
|
||||
<div class="left">
|
||||
<div class="hubspot-form">
|
||||
<span>Sign-Up for Updates</span>
|
||||
<script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/embed/v2.js"></script>
|
||||
<script>
|
||||
hbspt.forms.create({
|
||||
portalId: "20801443",
|
||||
formId: "3fbd74a7-aee1-41a5-9832-d92fee519bad",
|
||||
region: "na1"
|
||||
});
|
||||
</script>
|
||||
</div>
|
||||
</div>
|
||||
<div class="right">
|
||||
<h4>Certification Directories</h4>
|
||||
<p>Free directories listing all of the participants in our certification-related programs.</p>
|
||||
<a href="https://www.api.org/products-and-services/certifications-directories" class="btn-blue">View All</a>
|
||||
<hr />
|
||||
<h4>Request a Quote</h4>
|
||||
<p>Request a quotation for programs like API Monogram, APIQR, API Standards Subscription, PSSAP®, and more.</p>
|
||||
<a href="https://www.api.org/products-and-services/get-a-quote" class="btn-blue">Get a Quote</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="footer-wrap">
|
||||
<div class="top-footer">
|
||||
<div class="left-footer">
|
||||
<div class="logo-container">
|
||||
<img src="/-/media/APIWebsite/Thumbs/api-logo-stacked.png" alt="API" />
|
||||
</div>
|
||||
<div class="footer-nav">
|
||||
<a href="/about">About</a>
|
||||
<a href="/about/careers">Careers</a>
|
||||
<a href="/contact">Contact</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="right-footer">
|
||||
<h4>Stay Connected</h4>
|
||||
<h5>API Energy</h5>
|
||||
<ul class="social-nav">
|
||||
<li>
|
||||
<a class="btn-social" href="https://www.facebook.com/TheAmericanPetroleumInstitute/">
|
||||
<i class="fab fa-facebook-f" aria-hidden="true"></i>
|
||||
<span class="visually-hidden">Follow us on Facebook</span>
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a class="btn-social" href="https://www.instagram.com/americanpetroleum/">
|
||||
<i class="fab fa-instagram" aria-hidden="true"></i>
|
||||
<span class="visually-hidden">Follow us on Instagram</span>
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a class="btn-social" href="https://twitter.com/APIenergy">
|
||||
<i class="fab fa-x-twitter" aria-hidden="true"></i>
|
||||
<span class="visually-hidden">Follow us on X</span>
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a class="btn-social" href="https://www.linkedin.com/company/americanpetroleuminstitute/">
|
||||
<i class="fab fa-linkedin-in" aria-hidden="true"></i>
|
||||
<span class="visually-hidden">
|
||||
Follow us on LinkedIn
|
||||
</span>
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a class="btn-social" href="https://www.youtube.com/energy">
|
||||
<i class="fab fa-youtube" aria-hidden="true"></i>
|
||||
<span class="visually-hidden">Follow us on YouTube</span>
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h5>API Global</h5>
|
||||
<ul class="social-nav">
|
||||
<li><a class="btn-social" href="https://www.facebook.com/OfficialAPIGlobal"><i class="fab fa-facebook-f" aria-hidden="true"></i></a></li>
|
||||
<li><a class="btn-social" href="https://twitter.com/apiglobal"><i class="fab fa-x-twitter" aria-hidden="true"></i></a></li>
|
||||
<li><a class="btn-social" href="https://www.linkedin.com/company/api-global-official-/"><i class="fab fa-linkedin-in" aria-hidden="true"></i></a></li>
|
||||
<li><a class="btn-social" href="https://www.youtube.com/channel/UCitegkCxi2r-GGJRabGpRKg"><i class="fab fa-youtube" aria-hidden="true"></i></a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="bottom-footer">
|
||||
<p class="small">
|
||||
© Copyright 2026 - API. All Rights Reserved. | <a href="/terms-and-conditions">Terms & Conditions</a> | <a href="/privacy">Privacy</a></p>
|
||||
<div class="group">
|
||||
<p><a href="/privacy">Privacy & Cookies Notice</a></p>
|
||||
<p><a href="/terms-and-conditions">Terms & Conditions</a></p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script src="/library/APIWeb/js/menu.js"></script>
|
||||
<script src="/library/APIWeb/js/mobile-menu.js"></script>
|
||||
<script src="https://kit.fontawesome.com/a2be981ec3.js" crossorigin="anonymous"></script>
|
||||
</footer>
|
||||
<!--END COVEO NOINDEX-->
|
||||
|
||||
|
||||
<!--BEGIN COVEO NOINDEX-->
|
||||
<!-- Back to Top Button -->
|
||||
<a href="#" id="backToTop" class="back-to-top" aria-label="Back to Top"></a>
|
||||
|
||||
|
||||
<script>
|
||||
// Picture element HTML5 shiv for older browsers
|
||||
document.createElement("picture");
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
<!-- Coveo Search Resources -->
|
||||
<link rel="stylesheet" href="https://static.cloud.coveo.com/searchui/v2.10116/css/CoveoFullSearch.css" />
|
||||
<link rel="stylesheet" href="https://static.cloud.coveo.com/coveoforsitecore/ui/v0.64.7/css/CoveoForSitecore.css" />
|
||||
<script class="coveo-script" type="text/javascript" src='https://static.cloud.coveo.com/searchui/v2.10116/js/CoveoJsSearch.Lazy.min.js'></script>
|
||||
<script class="coveo-for-sitecore-script" type="text/javascript" src='https://static.cloud.coveo.com/coveoforsitecore/ui/v0.64.7/js/CoveoForSitecore.Lazy.min.js'></script>
|
||||
<script type="text/javascript" src="https://static.cloud.coveo.com/searchui/v2.10116/js/cultures/en.js"></script>
|
||||
|
||||
<div>
|
||||
|
||||
|
||||
|
||||
|
||||
<!-- Resources -->
|
||||
<div id="SearchResourcesProperties_a3a158ca24074042a057852064d1ead7"
|
||||
class="CoveoForSitecoreContext"
|
||||
data-sc-should-have-analytics-component='true'
|
||||
data-sc-analytics-enabled='true'
|
||||
data-sc-current-language='en'
|
||||
data-prebind-sc-language-field-name='fieldTranslator'
|
||||
data-sc-language-field-name='_language'
|
||||
data-sc-labels='{"Created" : "Created" , "Created By" : "Created by" , "Creation Time" : "Creation time." , "Language" : "Language" , "Last Time Modified" : "Last time modified." , "Template" : "Template" , "Uniform resource identifier" : "URI" , "Updated By" : "Updated by" , "If the problem persists contact the administrator." : "If the problem persists contact the administrator." , "Search is currently unavailable" : "Oops! Something went wrong on the server." , "Ascending" : "Ascending" , "Descending" : "Descending"}'
|
||||
data-sc-maximum-age='900000'
|
||||
data-sc-page-name='important-standards-announcements'
|
||||
data-sc-page-name-full-path='/sitecore/content/Sites/API2/Home/products-and-services/standards/important-standards-announcements'
|
||||
data-sc-index-source-name='Coveo_web_index - Prod104'
|
||||
data-sc-is-in-experience-editor='false'
|
||||
data-sc-is-user-anonymous='true'
|
||||
data-sc-item-uri='sitecore://web/{1BA7D892-F03B-45B8-90A2-9F074C53FA6A}?lang=en&ver=1'
|
||||
data-sc-item-id='1ba7d892-f03b-45b8-90a2-9f074c53fa6a'
|
||||
data-prebind-sc-latest-version-field-name='fieldTranslator'
|
||||
data-sc-latest-version-field-name='_latestversion'
|
||||
data-sc-rest-endpoint-uri='/coveo/rest'
|
||||
data-sc-analytics-endpoint-uri='/coveo/rest/ua'
|
||||
data-sc-site-name='api2'
|
||||
data-sc-field-prefix='f'
|
||||
data-sc-field-suffix='7509'
|
||||
data-sc-prefer-source-specific-fields='false'
|
||||
data-sc-external-fields='[{"fieldName":"permanentid","shouldEscape":false}]'
|
||||
data-sc-source-specific-fields='[{"fieldName":"attachmentparentid"},{"fieldName":"author"},{"fieldName":"clickableuri"},{"fieldName":"collection"},{"fieldName":"concepts"},{"fieldName":"date"},{"fieldName":"filetype"},{"fieldName":"indexeddate"},{"fieldName":"isattachment"},{"fieldName":"language"},{"fieldName":"printableuri"},{"fieldName":"rowid"},{"fieldName":"size"},{"fieldName":"source"},{"fieldName":"title"},{"fieldName":"topparent"},{"fieldName":"topparentid"},{"fieldName":"transactionid"},{"fieldName":"uri"},{"fieldName":"urihash"}]'
|
||||
>
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var endpointConfiguration = {
|
||||
itemUri: "sitecore://web/{1BA7D892-F03B-45B8-90A2-9F074C53FA6A}?lang=en&ver=1",
|
||||
siteName: "api2",
|
||||
restEndpointUri: "/coveo/rest"
|
||||
};
|
||||
if (typeof (CoveoForSitecore) !== "undefined") {
|
||||
CoveoForSitecore.SearchEndpoint.configureSitecoreEndpoint(endpointConfiguration);
|
||||
CoveoForSitecore.version = "5.0.1368.1";
|
||||
var context = document.getElementById("SearchResourcesProperties_a3a158ca24074042a057852064d1ead7");
|
||||
if (!!context) {
|
||||
CoveoForSitecore.Context.configureContext(context);
|
||||
}
|
||||
}
|
||||
</script>
|
||||
</div>
|
||||
<!--END COVEO NOINDEX-->
|
||||
<!-- news-policy-and-issues/blog -->
|
||||
<script>
|
||||
function formatDateElementsWithCoveo() {
|
||||
Coveo.$$(document).on('newResultDisplayed', function (event, args) {
|
||||
var dateElements = args.item.getElementsByClassName('posted-date');
|
||||
Array.prototype.forEach.call(dateElements, function (elem) {
|
||||
var timestamp = parseInt(elem.textContent.replace('Posted: ', '').trim(), 10);
|
||||
var date = new Date(timestamp);
|
||||
var options = { year: 'numeric', month: 'long', day: 'numeric' };
|
||||
var formattedDate = date.toLocaleDateString("en-US", options);
|
||||
|
||||
if (isNaN(date.getTime())) {
|
||||
elem.textContent = '';
|
||||
} else {
|
||||
elem.textContent = 'Posted: ' + formattedDate;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
function waitForCoveoReady() {
|
||||
if (typeof Coveo !== 'undefined' && Coveo.$) {
|
||||
formatDateElementsWithCoveo();
|
||||
} else {
|
||||
setTimeout(waitForCoveoReady, 100);
|
||||
}
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', waitForCoveoReady);
|
||||
</script>
|
||||
<!--END news-policy-and-issues/blog-->
|
||||
</body>
|
||||
</html>
|
||||
Vendored
+196
@@ -0,0 +1,196 @@
|
||||
<?xml version='1.0' encoding='UTF-8'?>
|
||||
<rss xmlns:arxiv="http://arxiv.org/schemas/atom" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
|
||||
<channel>
|
||||
<title>physics.app-ph updates on arXiv.org</title>
|
||||
<link>http://rss.arxiv.org/rss/physics.app-ph</link>
|
||||
<description>physics.app-ph updates on the arXiv.org e-print archive.</description>
|
||||
<atom:link href="http://rss.arxiv.org/rss/physics.app-ph" rel="self" type="application/rss+xml"/>
|
||||
<docs>http://www.rssboard.org/rss-specification</docs>
|
||||
<language>en-us</language>
|
||||
<lastBuildDate>Wed, 10 Jun 2026 04:00:28 +0000</lastBuildDate>
|
||||
<managingEditor>rss-help@arxiv.org</managingEditor>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<skipDays>
|
||||
<day>Sunday</day>
|
||||
<day>Saturday</day>
|
||||
</skipDays>
|
||||
<item>
|
||||
<title>Limits of Trap-assisted Photomultiplication Gain</title>
|
||||
<link>https://arxiv.org/abs/2606.10236</link>
|
||||
<description>arXiv:2606.10236v1 Announce Type: new
|
||||
Abstract: Photodiodes based on trap-assisted current injection can exhibit internal photomultiplication with apparent quantum efficiencies far exceeding unity, raising the question of whether such gain fundamentally enhances detector sensitivity. We employ a minimal analytical framework based on a single gain-active trapped state coupling photogenerated carriers to contact injection. The gain is intrinsically self-limiting: the injection process that amplifies the current simultaneously accelerates relaxation of the gain-enabling state, producing an inherently nonlinear, operating-point-dependent response. The form of this nonlinearity is not universal -- once the trap level is generalized to an energetic distribution and recombination is allowed to be bimolecular, the same mechanism yields superlinear, linear, or strongly sublinear responses. A single chord gain is therefore not a meaningful device descriptor, and chord-gain comparisons across the literature conflate devices in different regimes. Treating trap occupancy and injection as coupled stochastic processes, we show that internal gain introduces a strictly non-negative fluctuation penalty from the dissipative dynamics that sustain the gain state. A local, small-signal detectivity exhibits a finite optimum yet cannot exceed the intrinsic thermodynamic limit of the underlying unity-gain photodiode. Gain is thus equivalent to driven stochastic amplification: it can suppress downstream readout noise, but cannot reduce the fundamental noise floor set by the primary photodetection process.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10236v1</guid>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>new</arxiv:announce_type>
|
||||
<dc:rights>http://creativecommons.org/licenses/by/4.0/</dc:rights>
|
||||
<dc:creator>Ardalan Armin</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Filamentary Transport and Thermoelectric Effects in Mushroom Phase Change Memory Cells</title>
|
||||
<link>https://arxiv.org/abs/2606.10262</link>
|
||||
<description>arXiv:2606.10262v1 Announce Type: new
|
||||
Abstract: We performed a 2D finite-element electrothermal computational study of thermoelectric effects and filamentary electronic transport in Ge$_2$Sb$_2$Te$_5$ mushroom phase change memory cells during Reset and Set operations, accounting for spatial activation energy variations in amorphous Ge$_2$Sb$_2$Te$_5$ and phase-change dynamics. Reset operations with current going from the top electrode to the narrow 4 nm bottom electrode require $\sim$3x less energy and power, and $\sim$2x lower current to achieve the same Reset resistance, compared to the opposite polarity, due to thermoelectric effects. Filamentary conduction, electrical breakdown, thermal runaway, and local crystallization of amorphous Ge$_2$Sb$_2$Te$_5$ depend on current polarity and thermal boundary conditions, and determine the location, shape, and volume of the programming region, which may be significantly smaller than the semi-cylindrical mushroom region. The programming volume does not scale with contact dimensions larger than 10 nm. Larger contact areas introduce increased device-to-device and cycle-to-cycle variability due to filamentary conduction but are expected to lead to higher reliability and endurance.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10262v1</guid>
|
||||
<category>physics.app-ph</category>
|
||||
<category>cond-mat.mtrl-sci</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>new</arxiv:announce_type>
|
||||
<dc:rights>http://creativecommons.org/licenses/by/4.0/</dc:rights>
|
||||
<dc:creator>Md Samzid Bin Hafiz, Helena Silva, Ali Gokirmak</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Fast-Neutron Irradiation Effect in Heteroepitaxial $\beta$-Ga$_2$O$_3$ Schottky Diodes Fabricated on Low-Cost Sapphire Substrates</title>
|
||||
<link>https://arxiv.org/abs/2606.10269</link>
|
||||
<description>arXiv:2606.10269v1 Announce Type: new
|
||||
Abstract: In this work, we investigate the response of Ni/$\beta$-Ga$_2$O$_3$ Schottky barrier diodes fabricated on c-plane sapphire to fast-neutron irradiation up to a fluence of $1\times10^{15}$ n$\cdot$cm$^{-2}$. The LPCVD-grown heteroepitaxial structure consists of an unintentionally doped buffer, an n$^{+}$ contact layer, and an n-type drift layer, with mesa isolation realized by plasma-free Ga-assisted LPCVD etching. Prior to irradiation, the devices exhibit a turn-on voltage of 1.20 V, specific on-resistance of 8.43 m$\Omega\cdot$cm$^2$, ideality factor of 1.32, and Schottky barrier height of 1.29 eV. Following irradiation, the devices remain operational, although the forward current decreases, the turn-on voltage increases to 2.40 V, and the barrier height increases to 1.34 eV. Capacitance-voltage measurements reveal a $\sim$50% reduction in net donor concentration, corresponding to a carrier-removal rate of $\sim$105 cm$^{-1}$. Temperature-dependent measurements from 25 to 250 $^\circ$C confirm that thermionic emission remains the dominant transport mechanism and show significant suppression of reverse leakage current after irradiation. The breakdown voltage increases from 101 to 135 V, consistent with neutron-induced donor compensation. TCAD simulations show a more uniform electric-field distribution and reduced field crowding at the Schottky edge after irradiation. These results provide insight into neutron-induced donor compensation in heteroepitaxial $\beta$-Ga$_2$O$_3$ and demonstrate the ability of LPCVD-grown $\beta$-Ga$_2$O$_3$ Schottky diodes on sapphire to maintain stable operation under high-fluence neutron environments relevant to space and nuclear electronics.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10269v1</guid>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>new</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<dc:creator>Saleh Ahmed Khan, Ahmed Ibreljic, Sourav Sarker, Stephen Margiotta, Anhar Bhuiyan</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Virtual-Array Operational Modal Analysis of Rolling Tires Using a Single Tire Cavity Accelerometer</title>
|
||||
<link>https://arxiv.org/abs/2606.10437</link>
|
||||
<description>arXiv:2606.10437v1 Announce Type: new
|
||||
Abstract: The dynamics of rolling tires significantly influence the low-frequency (0-500 Hz) structure-borne noise within vehicles. Accurately characterizing these dynamics under realistic operating conditions remains challenging. Current state-of-the-art methods, primarily relying on Laser Doppler Vibrometers (LDV), are complex to implement, time-intensive, and generally limited to smooth tires in laboratory environments due to issues with speckle formation on treaded surfaces. This study introduces an innovative strategy for Operational Modal Analysis (OMA) of a rolling tire using a single wireless Tire Cavity Accelerometer (TCA) together with two optical sensors. The methodology leverages the non-integer ratio between the tire and drum diameters in a test rig to create a virtual sensor array. By utilizing optical sensors to time-stamp the cleat impact (on the drum) precisely and the TCA position (on the tire), the vibration responses from multiple revolutions are clustered according to the TCA's circumferential position at the moment of impact. This effectively synthesizes responses from an array of virtual sensors distributed around the tire circumference using data from a single test run. The clustered signals are conditioned using order tracking to remove periodic components arising from contact patch deformation. Both Frequency Domain Decomposition (FDD) and Covariance-based Stochastic Subspace Identification (SSI-Cov) were employed for modal identification. The SSI-Cov method proved more robust, successfully identifying 11 circumferential modes up to 240 Hz. The proposed approach offers a significantly more efficient, cost-effective method for characterizing rolling tire dynamics, which is readily applicable to treaded tires and adaptable for on-road testing.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10437v1</guid>
|
||||
<category>physics.app-ph</category>
|
||||
<category>physics.data-an</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>new</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<dc:creator>Pradosh Pritam Dash, Ricardo Burdisso, Pablo A Tarazaga</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Finite-temperature Fe K-edge X-ray absorption simulations reveal local structural dynamics of an iron(II) photosensitizer in solution and the crystalline phase</title>
|
||||
<link>https://arxiv.org/abs/2606.10221</link>
|
||||
<description>arXiv:2606.10221v1 Announce Type: cross
|
||||
Abstract: Interpreting metal K-edge spectra of flexible photosensitizers requires a structural model that separates electronic signatures from thermal motion, solvent disorder, and crystal-packing effects. We combine Fe K-edge X-ray absorption measurements with second-generation Car--Parrinello ab initio molecular dynamics and all-electron Gaussian and augmented-plane-wave simulations for an iron(II) N-heterocyclic carbene photosensitizer in acetonitrile solution and in the crystalline phase. Ensemble-averaged spectra reproduce the main near-edge features in both environments and preserve the experimentally observed similarity of the first Fe coordination shell upon dissolution. Comparison with radial distributions extracted from extended fine-structure measurements validates the Fe--N and Fe--C coordination shells sampled by the trajectories, while element-resolved pair distributions explain why higher-shell experimental contrast is rapidly lost. The same dynamical ensembles reveal a broad out-of-plane distribution of the terpyridine nitrogen atom and a nearly octahedral distribution of the Fe-centered coordination planes. The results show that finite-temperature X-ray absorption simulations can provide a compact structural-dynamics picture of molecular transition metal photosensitizers by linking local spectra, solvent-phase ligand motion, and medium-range structural disorder within one trajectory-based description.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10221v1</guid>
|
||||
<category>cond-mat.mtrl-sci</category>
|
||||
<category>physics.app-ph</category>
|
||||
<category>physics.chem-ph</category>
|
||||
<category>physics.comp-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>cross</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<dc:creator>Patrick M\"uller, Lorena Fritsch, Matthias Bauer, Thomas D. K\"uhne</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Multi-channel Optical Vision Model</title>
|
||||
<link>https://arxiv.org/abs/2606.10253</link>
|
||||
<description>arXiv:2606.10253v1 Announce Type: cross
|
||||
Abstract: Spatial multiplexing is one of the natural strengths of optics, yet in optical neural networks, it is often used mainly as parallel throughput. Here, we show that spatial multiplexing in an optical neural network can be used not only to process multiple inputs in parallel, but also to define a trainable representational coordinate of the model. In three implemented scenarios, parallel-input processing, class-code readout and channel-mixed feature interaction, spatial channels act as independent learners, structured code dimensions, and interacting feature groups. The programmable free-space optical processor is trained through an online physical-forward/surrogate-backward scheme, where measured optical outputs define the forward pass while a differentiable surrogate estimates gradients and is continually fine-tuned during training from newly acquired optical data. We demonstrate these channel roles in image classification and regression tasks using multi-layer architectures with more than one million trainable optical phase parameters. We further implement a hybrid optical-electronic vision-language model, in which the optical neural network provides visual tokens to a digital transformer decoder for controlled image-captioning tasks. These results establish spatially multiplexed optical channels as a programmable feature and readout space for hybrid optical vision models.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10253v1</guid>
|
||||
<category>physics.optics</category>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>cross</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<dc:creator>Ali Momeni, Guillaume Noetinger, Tim Tuuva, Romain Fleury</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Spontaneous translation of charged droplets during evaporation on dry surfaces</title>
|
||||
<link>https://arxiv.org/abs/2606.10755</link>
|
||||
<description>arXiv:2606.10755v1 Announce Type: cross
|
||||
Abstract: Evaporating sessile droplets are usually treated as capillary objects, but droplets generated by routine handling can carry tens to hundreds of picocoulombs of electric charge. Here we combine Faraday-cup charge measurements with optical imaging to determine how such charge evolves as water droplets evaporate on dry polymer substrates. A zero-time protocol shows that a reproducible initial charge is preserved on poly(methylpentene) (PMP), whereas PDMS, SOCAL-coated surfaces, and polystyrene either exchange, dissipate, or inject charge on contact. On PMP, ensemble-resolved measurements reveal two regimes: the charge remains nearly constant during early evaporation and then decreases abruptly once the droplet reaches a small-volume state. This charge collapse coincides with spontaneous lateral translation rather than jetting or breakup. A Rayleigh-normalized analysis, including a spherical-cap stress correction and measured contact-angle retention scale, shows that motion occurs only after evaporation drives the droplet into a high electro-pinning state. High-speed imaging and kinematic analysis support a picture in which the subsequent motion is governed by repeated contact-line depinning and re-pinning: the total distance traveled is strongly affected by dry-surface pinning, whereas the peak translational velocity serves as a more robust indicator of the discharge strength. These results identify a dry-substrate mode of evaporation-driven electrostatic relaxation, distinct from Coulomb fission on lubricated surfaces, in which substrate electrostatic passivity enables charge retention, droplet geometry selects the instability onset, and whole-droplet translation provides the charge-release pathway.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2606.10755v1</guid>
|
||||
<category>cond-mat.soft</category>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>cross</arxiv:announce_type>
|
||||
<dc:rights>http://creativecommons.org/licenses/by-nc-nd/4.0/</dc:rights>
|
||||
<dc:creator>Riming Xu, Yanbo Li, Jiawen Zhang, Jin Wang, Yikai Li</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Programmable Integrated Magnonic Meshes</title>
|
||||
<link>https://arxiv.org/abs/2605.00290</link>
|
||||
<description>arXiv:2605.00290v2 Announce Type: replace
|
||||
Abstract: Integrated circuits are a cornerstone of modern information technology, and analog wave-based architectures could enable fast and efficient processing beyond conventional charge electronics. In magnonics, spin waves provide a highly tunable, compact and energy-efficient medium for on-chip microwave signal transport and processing. However, progress has been limited to isolated elements or short devices, severely limiting the overall functional complexity and scalability. Here we realize the key elements of universal magnonic circuitry, using a single-step direct laser writing process in yttrium iron garnet, and monolithically cascade them in multi-stage programmable devices and networks. Using magneto-optical Kerr effect microscopy, we show efficient spin-wave propagation and preserved phase coherence in waveguide structures for hundreds of wavelengths. In coupled waveguides, we observe complete and periodic power transfer over several coupling lengths, and in phase shifters we achieve arbitrary, tunable phase delays. By cascading these elements, we realize programmable splitters, frequency demultiplexers, and phase-controlled 2x2 routers, where output power and relative phase can be programmed on demand via external fields. Finally, we realize programmable magnonic interferometric meshes for on-chip radio-frequency signal routing, with up to six magnonic inputs and outputs and seven cascaded stages, without the need for intermediate amplification. These direct-write cascaded networks bridge a long-standing gap in magnonic scalability, offering a viable pathway toward integrated, large-scale architectures for both classical and quantum processing.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2605.00290v2</guid>
|
||||
<category>physics.app-ph</category>
|
||||
<category>cond-mat.mtrl-sci</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>replace</arxiv:announce_type>
|
||||
<dc:rights>http://creativecommons.org/licenses/by/4.0/</dc:rights>
|
||||
<dc:creator>Piero Florio, Matteo Vitali, Valerio Levati, Rasheed M. Ishola, Luca Ciaccarini Mavilla, Nora Lecis, Carsten Dubs, Riccardo Bertacco, Marco Madami, Silvia Tacchi, Daniela Petti, Edoardo Albisetti</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Interpretable deep convolutional model for nonlinear multivariate time series in complex systems</title>
|
||||
<link>https://arxiv.org/abs/2501.04339</link>
|
||||
<description>arXiv:2501.04339v2 Announce Type: replace-cross
|
||||
Abstract: We introduce the Deep Convolutional Interpreter for Time Series (DCIts), a deep-learning architecture for nonlinear multivariate time series that provides sample-specific, locally interpretable descriptions of the underlying interaction structure. Unlike standard black-box forecasters, DCIts learns a time- and lag-dependent transition tensor explicitly factorized into two components: a Focuser, which selects relevant source series and time lags via a sparse masking mechanism, and a Modeler, which assigns signed coefficients to these selected interactions. This decomposition yields a local lag-adjacency structure and signed source-lag contributions for every forecast instance, enabling direct inspection of effective connectivity; when higher-order branches are activated, the same framework yields order-resolved elementwise polynomial contributions. Architecturally, DCIts uses a diverse bank of convolutional filters to capture temporal and cross-variable dependencies, which are mapped through a bottleneck network to the transition tensor. On controlled benchmark datasets with a known interaction structure, we demonstrate that DCIts achieves competitive forecasting error relative to a strong interpretable baseline while recovering stable, signed, lag-resolved interaction patterns. The framework thus prioritizes intrinsic interpretability, using forecasting accuracy as a faithfulness constraint rather than the sole objective.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2501.04339v2</guid>
|
||||
<category>stat.ML</category>
|
||||
<category>cs.LG</category>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>replace-cross</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<arxiv:DOI>10.1063/5.0325209</arxiv:DOI>
|
||||
<arxiv:journal_reference>Chaos 36, 063116 (2026)</arxiv:journal_reference>
|
||||
<dc:creator>Domjan Baric, Davor Horvatic</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Probing laser-driven surface and subsurface dynamics via grazing-incidence XFEL scattering and diffraction</title>
|
||||
<link>https://arxiv.org/abs/2509.12015</link>
|
||||
<description>arXiv:2509.12015v2 Announce Type: replace-cross
|
||||
Abstract: We demonstrate a grazing-incidence x-ray platform that simultaneously records time-resolved grazing-incidence small-angle x-ray scattering (GISAXS) and grazing-incidence x-ray diffraction (GID) from a femtosecond laser-irradiated gold film above the melting threshold, with picosecond resolution at an x-ray free-electron laser (XFEL). By tuning the x-ray incidence angle, the probe depth is set to tens of nanometers, enabling depth-selective sensitivity to near-surface dynamics. GISAXS resolves ultrafast changes in surface nanomorphology (correlation length, roughness), while GID quantifies subsurface lattice compression, grain orientation, melting, and recrystallization. The approach overcomes photon-flux limitations of synchrotron grazing-incidence geometries and provides stringent, time-resolved benchmarks for complex theoretical models of ultrafast laser-matter interaction and warm dense matter. Looking ahead, the same depth-selective methodology is well suited to inertial confinement fusion (ICF): it can visualize buried-interface perturbations and interfacial thermal resistance on micron to sub-micron scales that affect instability seeding and burn propagation.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2509.12015v2</guid>
|
||||
<category>physics.optics</category>
|
||||
<category>physics.app-ph</category>
|
||||
<category>physics.ins-det</category>
|
||||
<category>physics.plasm-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>replace-cross</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<arxiv:DOI>10.1107/S2052252526001727</arxiv:DOI>
|
||||
<arxiv:journal_reference>IUCrJ Vol.13, Pages 249-259 (2026)</arxiv:journal_reference>
|
||||
<dc:creator>Lisa Randolph, \"Ozg\"ul \"Ozt\"urk, Dmitriy Ksenzov, Lingen Huang, Thomas Kluge, S. V. Rahul, Victorien Bouffetier, Carsten Baehtz, Mohammadreza Banjafar, Erik Brambrink, Fabien Brieuc, Byoung Ick Cho, Sebastian G\"ode, Tobias Held, Hauke H\"oppner, Gerhard Jakob, Mathias Kl\"aui, Zuzana Kon\^opkov\'a, Changhoo Lee, Gyusang Lee, Mikako Makita, Mikhail Mishchenko, Mianzhen Mo, Pascal D. Ndione, Michael Paulus, Alexander Pelka, Franziska Paschke-Bruehl, Thomas R. Preston, Baerbel Rethfeld, Christian R\"odel, Michal \v{S}m\'id, Ling Wang, Sebastian T. Weber, Lennart Wollenweber, Jan-Patrick Schwinkendorf, Christian Gutt, Motoaki Nakatsutsumi</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Real-space imaging reveals symmetry-selected nonlinear energy routing in a mechanical resonator</title>
|
||||
<link>https://arxiv.org/abs/2605.01469</link>
|
||||
<description>arXiv:2605.01469v2 Announce Type: replace-cross
|
||||
Abstract: Nonlinear energy exchange between vibrational modes underlies phenomena ranging from internal resonance and wave mixing to frequency-comb generation, yet modal interactions are typically inferred from spectra rather than directly observed in space. Here, we image nonlinear modal energy routing in a nearly mirror-symmetric microelectromechanical resonator using phase-locked multi-harmonic stroboscopic interferometry. By reconstructing the spatial eigenmode content of individual harmonics, we show that harmonics generated by a driven mode can be carried by distinct spatial eigenmodes, directly resolving spatial pathways of nonlinear energy transfer. Our measurements further reveal that this modal routing persists away from integer frequency matching: in the off-resonant regime, generated harmonic components are dominated by eigenmodes sharing the driven mode's mirror parity, whereas spectrally closer opposite-parity modes remain strongly suppressed. A nonlinear modal framework based on geometric nonlinearity shows that the relevant cubic coupling coefficients factorize into symmetry-dependent modal-overlap integrals, identifying mirror parity as the selection rule for nonlinear modal interaction. This work identifies spatial symmetry as a design parameter for nonlinear energy routing and provides a route to symmetry-engineered control of energy flow in multimode nonlinear wave systems.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2605.01469v2</guid>
|
||||
<category>physics.optics</category>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>replace-cross</arxiv:announce_type>
|
||||
<dc:rights>http://creativecommons.org/licenses/by-nc-nd/4.0/</dc:rights>
|
||||
<dc:creator>Ya Zhang, Yuko Terasawa, Qian Liu, Shumpei Takenaka, Hua Li, Yutao Xu, Xueyong Wei, Kazuhiko Hirakawa</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Designing single-layer PDMS devices for micron to millimeter-scale deformations</title>
|
||||
<link>https://arxiv.org/abs/2605.17402</link>
|
||||
<description>arXiv:2605.17402v2 Announce Type: replace-cross
|
||||
Abstract: The elasticity of PDMS has played a central role in advancing important microfluidic technologies, ranging from early valves to sophisticated organ-on-a-chip systems. However, most deformable microfluidic devices are based on geometries that require complex multi-layer PDMS architectures and include thin membranes, leading to difficult microfabrication and poor stability. Recently, Jain, Belkadi et al. (Biofabrication 16.3 (2024): 035010) introduced a single-layer PDMS device in which a wide and long microfluidic channel was deformed by pressurizing two adjacent air chambers. While they demonstrated how the channel ceiling deformation can be leveraged to compress biological materials, it remains unknown how the device geometry influences this deformation. Here, a systematic numerical study is performed on 14,336 variants of this device, through which the height of the PDMS layer is identified as the main feature that determines the ceiling deformation. Three modes of channel deformation are identified as the geometry are varied: a U shape with a central minimum, a W shape with two minima and a central maximum, or an inverse U shape with an upward-bulging single maximum. The numerical results are validated in experiments that reproduce the three modes for the predicted geometries and demonstrate vertical ceiling deformations ranging from a few microns to the millimeter scale. The generality of this approach is demonstrated for two example applications: A fully closing single-layer microfluidic valve and an optical lens of controllable anisotropic magnification. This work leverages the rapid prototyping enabled by 3D printing or micro-milling to open new perspectives in microfluidic actuation.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2605.17402v2</guid>
|
||||
<category>physics.flu-dyn</category>
|
||||
<category>physics.app-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>replace-cross</arxiv:announce_type>
|
||||
<dc:rights>http://creativecommons.org/licenses/by/4.0/</dc:rights>
|
||||
<dc:creator>Leon V. Gebhard, Alexandre S. Avaro, Gabriel Amselem, Charles N. Baroud</dc:creator>
|
||||
</item>
|
||||
<item>
|
||||
<title>Metasurfaces for neutral-atom trapping</title>
|
||||
<link>https://arxiv.org/abs/2605.30498</link>
|
||||
<description>arXiv:2605.30498v2 Announce Type: replace-cross
|
||||
Abstract: Trapped neutral atoms are one of the leading platforms for quantum information technologies, in particular for quantum computing, but scaling them to array sizes needed for utility-scale quantum computing is a major engineering challenge. Here we review optical metasurfaces as an enabling technology that provides fine control over the phase, amplitude, and polarization of light, with pixel counts far exceeding what is available with spatial light modulators (SLMs) and other active devices. The large pixel counts have recently led to demonstrations of arrays of optical tweezers with hundreds of thousands of sites and arrays of optical bottle-beams with complex three-dimensional trapping profiles. The flexibility and scalability of optical metasurfaces provides a route towards miniaturized, integrated, and highly scalable atomic experiments and instruments.</description>
|
||||
<guid isPermaLink="false">oai:arXiv.org:2605.30498v2</guid>
|
||||
<category>physics.optics</category>
|
||||
<category>physics.app-ph</category>
|
||||
<category>physics.atom-ph</category>
|
||||
<category>quant-ph</category>
|
||||
<pubDate>Wed, 10 Jun 2026 00:00:00 -0400</pubDate>
|
||||
<arxiv:announce_type>replace-cross</arxiv:announce_type>
|
||||
<dc:rights>http://arxiv.org/licenses/nonexclusive-distrib/1.0/</dc:rights>
|
||||
<dc:creator>Chengyu Fang, Minjeong Kim, Mark Saffman, Jennifer T. Choy, Mikhail Kats</dc:creator>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
+37
@@ -0,0 +1,37 @@
|
||||
<?xml version="1.0"?>
|
||||
<rss version="2.0" xmlns:prism="http://purl.org/rss/1.0/modules/prism/">
|
||||
<channel>
|
||||
<title>Journal of Pressure Vessel Technology Open Issues</title>
|
||||
<link>https://asmedigitalcollection.asme.org/pressurevesseltech</link>
|
||||
<description>
|
||||
</description>
|
||||
<language>en-us</language>
|
||||
<pubDate>Mon, 11 May 2026 00:00:00 GMT</pubDate>
|
||||
<lastBuildDate>Tue, 12 May 2026 00:00:37 GMT</lastBuildDate>
|
||||
<generator>Silverchair</generator>
|
||||
<managingEditor>ASMEDigitalCollection@asme.org</managingEditor>
|
||||
<webMaster>ASMEDigitalCollection@asme.org</webMaster>
|
||||
<item>
|
||||
<title>Research on Low-Temperature Mechanical Properties and Fracture Behavior of 09MnNiDR Steel Based on Small Punch Test</title>
|
||||
<link>https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/5/051504/1232699/Research-on-Low-Temperature-Mechanical-Properties</link>
|
||||
<pubDate>Mon, 11 May 2026 00:00:00 GMT</pubDate>
|
||||
<description><span class="paragraphSection"><div class="boxTitle">Abstract</div>To develop a microdamage evaluation method applicable to in-service equipment under low-temperature conditions, this study systematically investigates the mechanical properties and fracture behavior of 09MnNiDR cryogenic steel over a broad temperature range from room temperature to −196 °C. The small punch test (SPT) technique is employed, supplemented by electron backscatter diffraction (EBSD) and scanning electron microscopy (SEM) for micromechanism analysis. Results indicate that under cryogenic conditions, dislocation slip is suppressed, leading to a more uniform distribution of plastic strain. Concurrently, the deformation process at low temperatures refines the grains within the plastic zone through mechanisms such as mechanical subdivision. As temperature decreases, the material strength increases linearly, exhibiting a significant cryogenic strengthening effect. The fracture mode transitions from ductile to brittle, with a ductile-to-brittle transition zone identified near −150 °C. An empirical formula based on SPT deformation energy is proposed to predict yield and true tensile strength, with prediction errors below 6%. By introducing a normalized energy parameter, an empirical correlation model is established between the SPT ductile-to-brittle transition temperature (DBTT) and the standard Charpy impact transition temperature. This study presents a viable methodology for safety assessment of in-service cryogenic pressure vessels through minimally invasive testing and performance prediction.</span></description>
|
||||
<prism:volume xmlns:prism="prism">148</prism:volume>
|
||||
<prism:number xmlns:prism="prism">5</prism:number>
|
||||
<prism:startingPage xmlns:prism="prism">051504</prism:startingPage>
|
||||
<prism:doi xmlns:prism="prism">10.1115/1.4071740</prism:doi>
|
||||
<guid>https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/5/051504/1232699/Research-on-Low-Temperature-Mechanical-Properties</guid>
|
||||
</item>
|
||||
<item>
|
||||
<title>Improved Oxidation, Carburization Resistance and Creep Strength of Ethylene Pyrolysis Furnace Tubes at 1100 °C Through Aluminum and Tungsten Alloying</title>
|
||||
<link>https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/4/041701/1232556/Improved-Oxidation-Carburization-Resistance-and</link>
|
||||
<pubDate>Mon, 11 May 2026 00:00:00 GMT</pubDate>
|
||||
<description><span class="paragraphSection"><div class="boxTitle">Abstract</div>The oxidation resistance, carburization resistance, and mechanical properties of ethylene pyrolysis furnace tube alloys modified by Al/Al-W alloying were comparatively investigated with conventional alloys using various microstructural characterization techniques and mechanical property testing methods. The Al-alloyed 29Cr44Ni4AlNb+microalloy (MA) exhibits superior oxidation and carburization resistance compared to conventional 25Cr35NiNb+MA and 35Cr45NiNb+MA alloys; however, its creep rupture life was significantly reduced. Further addition of W enhanced the solid solution strengthening effect, thereby improving high-temperature tensile properties and mitigating the detrimental impact of Al on creep performance. The creep rupture life of the Al/W-modified 27Cr44Ni5W3Al+MA alloy reached levels comparable to those of conventional alloys while retaining the beneficial effects of Al in improving oxidation and carburization resistance. Through alloying strategies, this study successfully achieved a balance between corrosion resistance and mechanical properties in ethylene pyrolysis furnace tube alloys, enabling them to withstand their harsh service conditions effectively.</span></description>
|
||||
<prism:volume xmlns:prism="prism">148</prism:volume>
|
||||
<prism:number xmlns:prism="prism">4</prism:number>
|
||||
<prism:startingPage xmlns:prism="prism">041701</prism:startingPage>
|
||||
<prism:doi xmlns:prism="prism">10.1115/1.4071682</prism:doi>
|
||||
<guid>https://asmedigitalcollection.asme.org/pressurevesseltech/article/148/4/041701/1232556/Improved-Oxidation-Carburization-Resistance-and</guid>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
+8
File diff suppressed because one or more lines are too long
@@ -0,0 +1,11 @@
|
||||
<html><head><title>Givaudan Sense Colour Explosion</title></head><body>
|
||||
<!-- 실측 발췌 (2026-06-11, csb.gov givaudan-sense-colour-explosion-) — PDF 앵커 원형 보존:
|
||||
보고서/부록/업데이트 + recommendation 상태요약 혼재 페이지 -->
|
||||
<a href="/assets/1/20/Appendix_C_Reactivity_Testing_Results_Publication.pdf?17347" id="CT_InvestigationDetails_8_rptDocumentsCategory_ctl06_rptDocuments_ctl01_lnkDocument" class="bold" target="_blank">Appendix C – Reactivity Testing Results and Analysis</a>
|
||||
<a href="/assets/1/20/Givaudan_Investigation_Report_Publication.pdf?17346" id="CT_InvestigationDetails_8_rptDocumentsCategory_ctl29_rptDocuments_ctl01_lnkDocument" class="bold" target="_blank">Fatal Runaway Reaction and Explosion at Givaudan Sense Colour / D.D. Williamson</a>
|
||||
<a href="/assets/1/6/Givaudan_Investigation_Update_-_final.pdf?17132" id="CT_InvestigationDetails_8_rptDocumentsCategory_ctl35_rptDocuments_ctl01_lnkDocument" class="bold" target="_blank">Givaudan Explosion Investigation Update</a>
|
||||
<a target="_blank" href="/assets/recommendation/Status_Change_Summary_CRA_(Givaudan_R12).pdf" > Recommendation Status Change Summary</a>
|
||||
<a target="_blank" href="/assets/recommendation/Status_Change_Summary_Givaudan_Corp_(Givaudan_R8).pdf" > Recommendation Status Change Summary</a>
|
||||
<a target="_blank" href="/assets/recommendation/Status_Change_Summary_Givaudan_(Givaudan_R1).pdf" > Recommendation Status Change Summary</a>
|
||||
<a target="_blank" href="/assets/recommendation/Status_Change_Summary_Givaudan_(Givaudan_R2).pdf" > Recommendation Status Change Summary</a>
|
||||
</body></html>
|
||||
+1
@@ -0,0 +1 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><urlset xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://www.sitemaps.org/schemas/sitemap/0.9 https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns="https://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>https://www.csb.gov/recommendations/preventive-maintenance/</loc><lastmod>2022-06-02T17:17:27-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/site-map/</loc><lastmod>2017-05-05T23:59:28-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/recommendations/preventive-maintenance-investigations/</loc><lastmod>2018-04-27T14:32:25-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/investigations/data-quality-/</loc><lastmod>2025-07-28T13:37:44-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/recommendations/preventive-maintenances/</loc><lastmod>2022-06-02T17:19:06-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/videos/video-feedback-form/</loc><lastmod>2017-05-04T18:17:43-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/investigations/</loc><lastmod>2017-05-08T16:06:42-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/investigations/completed-investigations/</loc><lastmod>2017-05-30T19:02:58-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/investigations/current-investigations/</loc><lastmod>2020-10-19T15:06:55-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/videos/</loc><lastmod>2017-03-09T13:38:53-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https
://www.csb.gov/videos/take-more-action-to-prevent-dust-explosions/</loc><lastmod>2013-05-17T16:46:08-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.csb.gov/videos/protect-public-employees-from-workplace-accidents/</loc><lastmod>2013-05-17T16:46:34-06:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url></urlset>
|
||||
+71
@@ -0,0 +1,71 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:atom="http://www.w3.org/2005/Atom">
|
||||
<channel>
|
||||
<title>
|
||||
<![CDATA[Latest Updates]]>
|
||||
</title>
|
||||
<description>
|
||||
<![CDATA[The most recent blogs and online articles from The Economist]]>
|
||||
</description>
|
||||
<link>https://www.economist.com/latest</link>
|
||||
<pubDate>Wed, 10 Jun 2026 21:11:56 +0000</pubDate>
|
||||
<lastBuildDate>Wed, 10 Jun 2026 21:11:56 +0000</lastBuildDate>
|
||||
<atom:link href="https://www.economist.com/latest/rss.xml" rel="self" type="application/rss+xml"/>
|
||||
<item>
|
||||
<title>
|
||||
<![CDATA[Syria is an unexpected beneficiary of the Gulf war]]>
|
||||
</title>
|
||||
<description>
|
||||
<![CDATA[The revival of an old oil-export route from Iraq to the Mediterranean helps Syria’s new regime]]>
|
||||
</description>
|
||||
<link>https://www.economist.com/middle-east-and-africa/2026/06/10/syria-is-an-unexpected-beneficiary-of-the-gulf-war</link>
|
||||
<guid isPermaLink="false">5737613e-c6cd-4cf0-b7da-fbfb52872f63</guid>
|
||||
<pubDate>Wed, 10 Jun 2026 19:26:42 +0000</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>
|
||||
<![CDATA[How to win the World Cup]]>
|
||||
</title>
|
||||
<description>
|
||||
<![CDATA[Being rich helps, but being open to immigration works best of all]]>
|
||||
</description>
|
||||
<link>https://www.economist.com/international/2026/06/10/how-to-win-the-world-cup</link>
|
||||
<guid isPermaLink="false">1019df1e-5c1e-4784-ae0c-31741c176e41</guid>
|
||||
<pubDate>Wed, 10 Jun 2026 19:07:01 +0000</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>
|
||||
<![CDATA[American capitalism is run by millionaires, not billionaires]]>
|
||||
</title>
|
||||
<description>
|
||||
<![CDATA[They hide in plain sight—and wield enormous power]]>
|
||||
</description>
|
||||
<link>https://www.economist.com/business/2026/06/10/american-capitalism-is-run-by-millionaires-not-billionaires</link>
|
||||
<guid isPermaLink="false">dbbcb101-a7de-472b-a62c-d969ab033b90</guid>
|
||||
<pubDate>Wed, 10 Jun 2026 19:01:31 +0000</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>
|
||||
<![CDATA[New techniques can predict and prevent lung cancer ]]>
|
||||
</title>
|
||||
<description>
|
||||
<![CDATA[A molecular signature can identify those most at risk]]>
|
||||
</description>
|
||||
<link>https://www.economist.com/science-and-technology/2026/06/10/new-techniques-can-predict-and-prevent-lung-cancer</link>
|
||||
<guid isPermaLink="false">dbc7231c-6c7c-42fb-8930-bb099e1d3015</guid>
|
||||
<pubDate>Wed, 10 Jun 2026 18:48:35 +0000</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>
|
||||
<![CDATA[The World Cup has always been beset by scandal and strife]]>
|
||||
</title>
|
||||
<description>
|
||||
<![CDATA[So has FIFA, the outfit that administers it]]>
|
||||
</description>
|
||||
<link>https://www.economist.com/international/2026/06/10/the-world-cup-has-always-been-beset-by-scandal-and-strife</link>
|
||||
<guid isPermaLink="false">f2213e72-3531-4894-a33f-47bce2fea4e9</guid>
|
||||
<pubDate>Wed, 10 Jun 2026 18:25:19 +0000</pubDate>
|
||||
</item>
|
||||
|
||||
</channel>
|
||||
</rss>
|
||||
File diff suppressed because one or more lines are too long
+4
File diff suppressed because one or more lines are too long
+1
@@ -0,0 +1 @@
|
||||
{"body":{"pageNo":1,"totalCount":1,"numOfRows":5,"items":{"item":[{"filenm":"컨베이어에 끼임.pdf","filepath":"https://portal.kosha.or.kr/openapi/v1/file/down/stdboard/B2025022104002/202605281621537G75H2/D0801000010001","boardno":"202605281621537G75H2"}]}},"header":{"resultCode":"00","resultMsg":"NORMAL_CODE"}}
|
||||
+1
@@ -0,0 +1 @@
|
||||
{"body":{"pageNo":1,"totalCount":6334,"numOfRows":3,"items":{"item":[{"business":"제조업","contents":"2026.01.00(월) 07:30경, 경기도 소재 OOOO(주)에서 재해자가 골재 이송 컨베이어 상부의 이물질을 제거하던 중,다리가 컨베이어 벨트와 테일 풀리 (Tail Pulley)* 사이에 끼임 *컨베이어의 아래쪽 끝단에서 회전하며 벨트를 순환시키는 원통형 기계장치","atcflcnt":1,"keyword":"컨베이어에 끼임","boardno":"202605281621537G75H2"},{"business":"건설업","contents":"2025. 8. 00. (금) 11:12 경 경기도 소재 OOO 신축공 사현장에서 데크플레이트 설치 중 밟고 있던 미고정 데크플레이트가 탈락하며 약 7m 높이에서 추락함","atcflcnt":1,"keyword":"데크플레이트 설치 작업 중 추락","boardno":"20260528162031VZLE93"},{"business":"건설업","contents":"2025. 06. 00.(금) 12:35경, 경북 봉화군 소재 (주)OOOO 침전저류지 현장에서 타워크레인 전도 후 매립된 케이크*(오염토)를 굴착 및 운반 작업 중, 사면의 토사와 타워크레인 기초구조물이 무너지며 하단에서 작업 중이던 굴착기가 매몰됨 * 분말 상태의 원료에서 아연을 채취한 후 남은 중금속 부산물(산화칼슘, 납, 산화철, 황산 등)을 장기간 매립하여 만들어지는 고체 형태의 오염 토양 덩어리","atcflcnt":1,"keyword":"사면 굴착 작업 중 매몰","boardno":"20260527153100O7QX25"}]}},"header":{"resultCode":"00","resultMsg":"NORMAL_CODE"}}
|
||||
+1
@@ -0,0 +1 @@
|
||||
{"body":{"pageNo":1,"totalCount":1039,"numOfRows":3,"items":{"item":[{"techGdlnNm":"구리에 대한 작업환경측정,분석 기술지침","techGdlnNo":"A-1-2018","techGdlnOfancYmd":"2018-11-27","fileDownloadUrl":"https://portal.kosha.or.kr/openapi/v1/file/down/FL00015883045/7"},{"techGdlnNm":"마그네슘에 대한 작업환경측정,분석 기술지침","techGdlnNo":"A-4-2018","techGdlnOfancYmd":"2018-11-27","fileDownloadUrl":"https://portal.kosha.or.kr/openapi/v1/file/down/FL00015883165/3"},{"techGdlnNm":"백금에 대한 작업환경측정,분석 기술지침","techGdlnNo":"A-6-2018","techGdlnOfancYmd":"2018-11-27","fileDownloadUrl":"https://portal.kosha.or.kr/openapi/v1/file/down/FL00015883187/3"}]}},"header":{"resultCode":"00","resultMsg":"NORMAL_CODE"}}
|
||||
+262
@@ -0,0 +1,262 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<channel rdf:about="https://asia.nikkei.com/rss/feed/nar" xml:lang="en-GB">
|
||||
<title>Nikkei Asia</title>
|
||||
<link>https://asia.nikkei.com/</link>
|
||||
<description/>
|
||||
<items>
|
||||
<rdf:Seq>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/media-entertainment/tokyo-disneyland-magic-in-doubt-as-operator-s-stock-falls"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/tech/semiconductors/sk-hynix-to-triple-wafer-capacity-by-2034-chairman-chey"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/editor-s-picks/china-up-close/analysis-kim-jong-un-emerges-as-winner-in-summit-with-xi-jinping"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/ntt-sets-sights-on-nvidia-ai-race-with-500m-optical-network-fund"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/politics/japan-to-help-content-industry-sue-over-copyright-infringement-abroad"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/artificial-intelligence/anthropic-plugs-claude-ai-in-japan-for-automated-software-development"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/supply-chain/us-tungsten-scrap-exports-to-japan-soar-on-chinese-curbs"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/business-deals/tdk-to-buy-us-maker-of-ai-data-center-cooling-components-for-up-to-400m"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/insurance/nippon-life-s-private-credit-assets-reach-4.6bn"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/business-trends/us-firms-see-china-as-essential-despite-rising-economic-and-political-risks"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/economy/fires/manslaughter-other-charges-filed-over-hong-kong-s-wang-fuk-court-fire"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/adb-and-peers-need-to-anchor-international-order-president-kanda-says"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/asia-faces-risks-of-economic-spillover-from-iran-and-ai-disinformation"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/companies/swire-dangles-cathay-shares-in-600m-convertible-bond-issuance"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/materials/shin-etsu-to-set-up-rare-earth-smelter-in-japan-to-ease-reliance-on-china"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/editor-s-picks/interview/setting-sea-border-with-japan-vital-philippine-foreign-secretary"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/markets/strong-dollar-rally-weighs-heavier-on-struggling-asian-countries"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/fisheries/thailand-s-shrimp-industry-hit-by-malaysia-s-import-ban"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/applied-materials-opens-500m-manufacturing-campus-in-singapore"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/malaysia-s-anwar-warns-against-global-powers-weaponizing-trade"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/energy/apple-and-nvidia-supplier-foxconn-invests-in-vietnam-solar-wind-power"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/sports/world-cup-tests-asia-s-appetite-for-costly-broadcast-rights"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/finance/brookfield-bets-on-its-japan-business-to-top-hong-kong-and-singapore"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/tencent-raises-4.6bn-in-dual-dollar-yuan-bond-issuances"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/automobiles/electric-vehicles/ferrari-luce-ev-highlights-european-struggle-to-lure-back-china-s-superrich"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/markets/commodities/indian-families-scale-back-on-gold-for-weddings-as-prices-hover-near-highs"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/politics/defense/japan-s-new-defense-document-to-name-china-the-biggest-concern"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/food-beverage/japan-to-bolster-ip-protections-for-prized-new-fruit-vegetable-varieties"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/environment/climate-change/japan-s-jgc-bets-on-carbon-feeding-bacteria-to-create-bioplastics"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/travel-leisure/rural-japan-hopes-to-charm-domestic-travelers-priced-out-of-overseas-trips"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/energy/gm-partners-with-peak-energy-for-sodium-ion-battery-storage"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/tech-asia/japan-seeks-bigger-role-in-asia-s-subsea-cables-as-ai-rewires-demand"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/comment/why-japan-s-takaichi-has-stepped-back-from-boj-rate-hike-debate"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/transportation/japan-flying-car-startup-skydrive-aims-for-the-skies-in-2028"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/hanwha-qcells-kicks-off-first-fully-onshore-us-solar-supply-chain"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/energy/japan-s-mitsubishi-hc-canada-s-brookfield-to-buy-european-wind-solar-farms-in-ai-play2"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/technology/g7-plans-first-joint-statement-for-protecting-minors-on-social-media"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/automobiles/toyota-backs-japan-self-driving-startup-tier-iv-in-development-push"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/softbank/jpmorgan-chase-emerges-as-softbank-group-s-top-lender-surpassing-mizuho"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/energy/malaysia-to-promise-japan-maximum-possible-lng-naphtha"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/photos/in-focus-mindanao-reels-from-another-deadly-earthquake"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/politics/international-relations/us-china-tensions/pentagon-blacklists-alibaba-byd-and-baidu-over-alleged-military-ties"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/economy/bank-of-japan/bank-of-japan-set-to-hike-key-interest-rate-to-1"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/semiconductor-advances-a-must-for-data-centers-says-tokyo-electron-boss"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/politics/international-relations/xi-shores-up-china-s-sway-in-pyongyang-wary-of-north-korea-russia-ties"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/opinion/south-korea-election-yoon-s-legacy-partially-survives-progressive-victory"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/automobiles/electric-vehicles/chinese-entrepreneur-s-e-truck-startup-windrose-faces-unpaid-wage-claims"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/economy/bank-indonesia-raises-rates-0.25-at-emergency-meeting-to-defend-rupiah"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/japan-ex-pm-kishida-calls-for-deeper-energy-ties-with-south-korea"/>
|
||||
<rdf:li resource="https://asia.nikkei.com/business/markets/equities/binance-eyes-asian-stock-trading-as-bitcoin-slumps"/>
|
||||
</rdf:Seq>
|
||||
</items>
|
||||
</channel>
|
||||
<item rdf:about="https://asia.nikkei.com/business/media-entertainment/tokyo-disneyland-magic-in-doubt-as-operator-s-stock-falls">
|
||||
<title><![CDATA[Tokyo Disneyland 'magic' in doubt as operator's stock falls]]></title>
|
||||
<link>https://asia.nikkei.com/business/media-entertainment/tokyo-disneyland-magic-in-doubt-as-operator-s-stock-falls</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/tech/semiconductors/sk-hynix-to-triple-wafer-capacity-by-2034-chairman-chey">
|
||||
<title><![CDATA[SK Hynix to triple wafer capacity by 2034: Chairman Chey]]></title>
|
||||
<link>https://asia.nikkei.com/business/tech/semiconductors/sk-hynix-to-triple-wafer-capacity-by-2034-chairman-chey</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/editor-s-picks/china-up-close/analysis-kim-jong-un-emerges-as-winner-in-summit-with-xi-jinping">
|
||||
<title><![CDATA[Analysis: Kim Jong Un emerges as winner in summit with Xi Jinping]]></title>
|
||||
<link>https://asia.nikkei.com/editor-s-picks/china-up-close/analysis-kim-jong-un-emerges-as-winner-in-summit-with-xi-jinping</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/ntt-sets-sights-on-nvidia-ai-race-with-500m-optical-network-fund">
|
||||
<title><![CDATA[NTT sets sights on Nvidia, AI race with $500m optical network fund]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/ntt-sets-sights-on-nvidia-ai-race-with-500m-optical-network-fund</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/politics/japan-to-help-content-industry-sue-over-copyright-infringement-abroad">
|
||||
<title><![CDATA[Japan to help content industry sue over copyright infringement abroad]]></title>
|
||||
<link>https://asia.nikkei.com/politics/japan-to-help-content-industry-sue-over-copyright-infringement-abroad</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/artificial-intelligence/anthropic-plugs-claude-ai-in-japan-for-automated-software-development">
|
||||
<title><![CDATA[Anthropic plugs Claude AI in Japan for automated software development]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/artificial-intelligence/anthropic-plugs-claude-ai-in-japan-for-automated-software-development</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/supply-chain/us-tungsten-scrap-exports-to-japan-soar-on-chinese-curbs">
|
||||
<title><![CDATA[US tungsten scrap exports to Japan soar on Chinese curbs]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/supply-chain/us-tungsten-scrap-exports-to-japan-soar-on-chinese-curbs</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/business-deals/tdk-to-buy-us-maker-of-ai-data-center-cooling-components-for-up-to-400m">
|
||||
<title><![CDATA[TDK to buy US maker of AI data center cooling components for up to $400m]]></title>
|
||||
<link>https://asia.nikkei.com/business/business-deals/tdk-to-buy-us-maker-of-ai-data-center-cooling-components-for-up-to-400m</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/insurance/nippon-life-s-private-credit-assets-reach-4.6bn">
|
||||
<title><![CDATA[Nippon Life's private credit assets reach $4.6bn]]></title>
|
||||
<link>https://asia.nikkei.com/business/insurance/nippon-life-s-private-credit-assets-reach-4.6bn</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/business-trends/us-firms-see-china-as-essential-despite-rising-economic-and-political-risks">
|
||||
<title><![CDATA[US firms see China as essential despite rising economic and political risks]]></title>
|
||||
<link>https://asia.nikkei.com/business/business-trends/us-firms-see-china-as-essential-despite-rising-economic-and-political-risks</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/economy/fires/manslaughter-other-charges-filed-over-hong-kong-s-wang-fuk-court-fire">
|
||||
<title><![CDATA[Manslaughter, other charges filed over Hong Kong's Wang Fuk Court fire]]></title>
|
||||
<link>https://asia.nikkei.com/economy/fires/manslaughter-other-charges-filed-over-hong-kong-s-wang-fuk-court-fire</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/adb-and-peers-need-to-anchor-international-order-president-kanda-says">
|
||||
<title><![CDATA[ADB and peers need to 'anchor' international order: President Kanda says]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/adb-and-peers-need-to-anchor-international-order-president-kanda-says</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/asia-faces-risks-of-economic-spillover-from-iran-and-ai-disinformation">
|
||||
<title><![CDATA[Asia faces risks of economic spillover from Iran and AI disinformation]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/asia-faces-risks-of-economic-spillover-from-iran-and-ai-disinformation</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/companies/swire-dangles-cathay-shares-in-600m-convertible-bond-issuance">
|
||||
<title><![CDATA[Swire dangles Cathay shares in $600m convertible bond issuance]]></title>
|
||||
<link>https://asia.nikkei.com/business/companies/swire-dangles-cathay-shares-in-600m-convertible-bond-issuance</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/materials/shin-etsu-to-set-up-rare-earth-smelter-in-japan-to-ease-reliance-on-china">
|
||||
<title><![CDATA[Shin-Etsu to set up rare-earth smelter in Japan to ease reliance on China]]></title>
|
||||
<link>https://asia.nikkei.com/business/materials/shin-etsu-to-set-up-rare-earth-smelter-in-japan-to-ease-reliance-on-china</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/editor-s-picks/interview/setting-sea-border-with-japan-vital-philippine-foreign-secretary">
|
||||
<title><![CDATA[Setting sea border with Japan vital: Philippine foreign secretary]]></title>
|
||||
<link>https://asia.nikkei.com/editor-s-picks/interview/setting-sea-border-with-japan-vital-philippine-foreign-secretary</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/markets/strong-dollar-rally-weighs-heavier-on-struggling-asian-countries">
|
||||
<title><![CDATA[Strong dollar rally weighs heavier on struggling Asian countries]]></title>
|
||||
<link>https://asia.nikkei.com/business/markets/strong-dollar-rally-weighs-heavier-on-struggling-asian-countries</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/fisheries/thailand-s-shrimp-industry-hit-by-malaysia-s-import-ban">
|
||||
<title><![CDATA[Thailand's shrimp industry hit by Malaysia's import ban]]></title>
|
||||
<link>https://asia.nikkei.com/business/fisheries/thailand-s-shrimp-industry-hit-by-malaysia-s-import-ban</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/applied-materials-opens-500m-manufacturing-campus-in-singapore">
|
||||
<title><![CDATA[Applied Materials opens $500m manufacturing campus in Singapore]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/applied-materials-opens-500m-manufacturing-campus-in-singapore</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/malaysia-s-anwar-warns-against-global-powers-weaponizing-trade">
|
||||
<title><![CDATA[Malaysia's Anwar warns against global powers weaponizing trade]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/malaysia-s-anwar-warns-against-global-powers-weaponizing-trade</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/energy/apple-and-nvidia-supplier-foxconn-invests-in-vietnam-solar-wind-power">
|
||||
<title><![CDATA[Apple and Nvidia supplier Foxconn invests in Vietnam solar, wind power]]></title>
|
||||
<link>https://asia.nikkei.com/business/energy/apple-and-nvidia-supplier-foxconn-invests-in-vietnam-solar-wind-power</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/sports/world-cup-tests-asia-s-appetite-for-costly-broadcast-rights">
|
||||
<title><![CDATA[World Cup tests Asia's appetite for costly broadcast rights]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/sports/world-cup-tests-asia-s-appetite-for-costly-broadcast-rights</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/finance/brookfield-bets-on-its-japan-business-to-top-hong-kong-and-singapore">
|
||||
<title><![CDATA[Brookfield bets on its Japan business to top Hong Kong and Singapore]]></title>
|
||||
<link>https://asia.nikkei.com/business/finance/brookfield-bets-on-its-japan-business-to-top-hong-kong-and-singapore</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/tencent-raises-4.6bn-in-dual-dollar-yuan-bond-issuances">
|
||||
<title><![CDATA[Tencent raises $4.6bn in dual dollar, yuan bond issuances]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/tencent-raises-4.6bn-in-dual-dollar-yuan-bond-issuances</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/automobiles/electric-vehicles/ferrari-luce-ev-highlights-european-struggle-to-lure-back-china-s-superrich">
|
||||
<title><![CDATA[Ferrari Luce EV highlights European struggle to lure back China's superrich]]></title>
|
||||
<link>https://asia.nikkei.com/business/automobiles/electric-vehicles/ferrari-luce-ev-highlights-european-struggle-to-lure-back-china-s-superrich</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/markets/commodities/indian-families-scale-back-on-gold-for-weddings-as-prices-hover-near-highs">
|
||||
<title><![CDATA[Indian families scale back on gold for weddings as prices hover near highs]]></title>
|
||||
<link>https://asia.nikkei.com/business/markets/commodities/indian-families-scale-back-on-gold-for-weddings-as-prices-hover-near-highs</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/politics/defense/japan-s-new-defense-document-to-name-china-the-biggest-concern">
|
||||
<title><![CDATA[Japan's new defense document to name China the biggest concern]]></title>
|
||||
<link>https://asia.nikkei.com/politics/defense/japan-s-new-defense-document-to-name-china-the-biggest-concern</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/food-beverage/japan-to-bolster-ip-protections-for-prized-new-fruit-vegetable-varieties">
|
||||
<title><![CDATA[Japan to bolster IP protections for prized new fruit, vegetable varieties]]></title>
|
||||
<link>https://asia.nikkei.com/business/food-beverage/japan-to-bolster-ip-protections-for-prized-new-fruit-vegetable-varieties</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/environment/climate-change/japan-s-jgc-bets-on-carbon-feeding-bacteria-to-create-bioplastics">
|
||||
<title><![CDATA[Japan's JGC bets on carbon-feeding bacteria to create bioplastics]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/environment/climate-change/japan-s-jgc-bets-on-carbon-feeding-bacteria-to-create-bioplastics</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/travel-leisure/rural-japan-hopes-to-charm-domestic-travelers-priced-out-of-overseas-trips">
|
||||
<title><![CDATA[Rural Japan hopes to charm domestic travelers priced out of overseas trips]]></title>
|
||||
<link>https://asia.nikkei.com/business/travel-leisure/rural-japan-hopes-to-charm-domestic-travelers-priced-out-of-overseas-trips</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/energy/gm-partners-with-peak-energy-for-sodium-ion-battery-storage">
|
||||
<title><![CDATA[GM partners with Peak Energy for sodium-ion battery storage]]></title>
|
||||
<link>https://asia.nikkei.com/business/energy/gm-partners-with-peak-energy-for-sodium-ion-battery-storage</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/tech-asia/japan-seeks-bigger-role-in-asia-s-subsea-cables-as-ai-rewires-demand">
|
||||
<title><![CDATA[Japan seeks bigger role in Asia's subsea cables as AI rewires demand]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/tech-asia/japan-seeks-bigger-role-in-asia-s-subsea-cables-as-ai-rewires-demand</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/comment/why-japan-s-takaichi-has-stepped-back-from-boj-rate-hike-debate">
|
||||
<title><![CDATA[Why Japan's Takaichi has stepped back from BOJ rate hike debate]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/comment/why-japan-s-takaichi-has-stepped-back-from-boj-rate-hike-debate</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/transportation/japan-flying-car-startup-skydrive-aims-for-the-skies-in-2028">
|
||||
<title><![CDATA[Japan flying car startup SkyDrive aims for the skies in 2028]]></title>
|
||||
<link>https://asia.nikkei.com/business/transportation/japan-flying-car-startup-skydrive-aims-for-the-skies-in-2028</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/hanwha-qcells-kicks-off-first-fully-onshore-us-solar-supply-chain">
|
||||
<title><![CDATA[Hanwha Qcells kicks off first fully onshore US solar supply chain]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/hanwha-qcells-kicks-off-first-fully-onshore-us-solar-supply-chain</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/energy/japan-s-mitsubishi-hc-canada-s-brookfield-to-buy-european-wind-solar-farms-in-ai-play2">
|
||||
<title><![CDATA[Japan's Mitsubishi HC, Canada's Brookfield to buy European wind, solar farms in AI play]]></title>
|
||||
<link>https://asia.nikkei.com/business/energy/japan-s-mitsubishi-hc-canada-s-brookfield-to-buy-european-wind-solar-farms-in-ai-play2</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/technology/g7-plans-first-joint-statement-for-protecting-minors-on-social-media">
|
||||
<title><![CDATA[G7 plans first joint statement for protecting minors on social media]]></title>
|
||||
<link>https://asia.nikkei.com/business/technology/g7-plans-first-joint-statement-for-protecting-minors-on-social-media</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/automobiles/toyota-backs-japan-self-driving-startup-tier-iv-in-development-push">
|
||||
<title><![CDATA[Toyota backs Japan self-driving startup Tier IV in development push]]></title>
|
||||
<link>https://asia.nikkei.com/business/automobiles/toyota-backs-japan-self-driving-startup-tier-iv-in-development-push</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/softbank/jpmorgan-chase-emerges-as-softbank-group-s-top-lender-surpassing-mizuho">
|
||||
<title><![CDATA[JPMorgan Chase emerges as SoftBank Group's top lender, surpassing Mizuho]]></title>
|
||||
<link>https://asia.nikkei.com/business/softbank/jpmorgan-chase-emerges-as-softbank-group-s-top-lender-surpassing-mizuho</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/energy/malaysia-to-promise-japan-maximum-possible-lng-naphtha">
|
||||
<title><![CDATA[Malaysia to promise Japan maximum possible LNG, naphtha]]></title>
|
||||
<link>https://asia.nikkei.com/business/energy/malaysia-to-promise-japan-maximum-possible-lng-naphtha</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/photos/in-focus-mindanao-reels-from-another-deadly-earthquake">
|
||||
<title><![CDATA[In Focus: Mindanao reels from another deadly earthquake]]></title>
|
||||
<link>https://asia.nikkei.com/photos/in-focus-mindanao-reels-from-another-deadly-earthquake</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/politics/international-relations/us-china-tensions/pentagon-blacklists-alibaba-byd-and-baidu-over-alleged-military-ties">
|
||||
<title><![CDATA[Pentagon blacklists Alibaba, BYD and Baidu over alleged military ties]]></title>
|
||||
<link>https://asia.nikkei.com/politics/international-relations/us-china-tensions/pentagon-blacklists-alibaba-byd-and-baidu-over-alleged-military-ties</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/economy/bank-of-japan/bank-of-japan-set-to-hike-key-interest-rate-to-1">
|
||||
<title><![CDATA[Bank of Japan set to hike key interest rate to 1%]]></title>
|
||||
<link>https://asia.nikkei.com/economy/bank-of-japan/bank-of-japan-set-to-hike-key-interest-rate-to-1</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/semiconductor-advances-a-must-for-data-centers-says-tokyo-electron-boss">
|
||||
<title><![CDATA[Semiconductor advances a 'must' for data centers, says Tokyo Electron boss]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/semiconductor-advances-a-must-for-data-centers-says-tokyo-electron-boss</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/politics/international-relations/xi-shores-up-china-s-sway-in-pyongyang-wary-of-north-korea-russia-ties">
|
||||
<title><![CDATA[Xi shores up China's sway in Pyongyang, wary of North Korea-Russia ties]]></title>
|
||||
<link>https://asia.nikkei.com/politics/international-relations/xi-shores-up-china-s-sway-in-pyongyang-wary-of-north-korea-russia-ties</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/opinion/south-korea-election-yoon-s-legacy-partially-survives-progressive-victory">
|
||||
<title><![CDATA[South Korea election: Yoon's legacy partially survives progressive victory]]></title>
|
||||
<link>https://asia.nikkei.com/opinion/south-korea-election-yoon-s-legacy-partially-survives-progressive-victory</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/automobiles/electric-vehicles/chinese-entrepreneur-s-e-truck-startup-windrose-faces-unpaid-wage-claims">
|
||||
<title><![CDATA[Chinese entrepreneur's e-truck startup Windrose faces unpaid wage claims]]></title>
|
||||
<link>https://asia.nikkei.com/business/automobiles/electric-vehicles/chinese-entrepreneur-s-e-truck-startup-windrose-faces-unpaid-wage-claims</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/economy/bank-indonesia-raises-rates-0.25-at-emergency-meeting-to-defend-rupiah">
|
||||
<title><![CDATA[Bank Indonesia raises rates 0.25% at emergency meeting to defend rupiah]]></title>
|
||||
<link>https://asia.nikkei.com/economy/bank-indonesia-raises-rates-0.25-at-emergency-meeting-to-defend-rupiah</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/japan-ex-pm-kishida-calls-for-deeper-energy-ties-with-south-korea">
|
||||
<title><![CDATA[Japan ex-PM Kishida calls for deeper energy ties with South Korea]]></title>
|
||||
<link>https://asia.nikkei.com/spotlight/the-future-of-asia/future-of-asia-2026/japan-ex-pm-kishida-calls-for-deeper-energy-ties-with-south-korea</link>
|
||||
</item>
|
||||
<item rdf:about="https://asia.nikkei.com/business/markets/equities/binance-eyes-asian-stock-trading-as-bitcoin-slumps">
|
||||
<title><![CDATA[Binance eyes Asian stock trading as Bitcoin slumps]]></title>
|
||||
<link>https://asia.nikkei.com/business/markets/equities/binance-eyes-asian-stock-trading-as-bitcoin-slumps</link>
|
||||
</item>
|
||||
</rdf:RDF>
|
||||
+32
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"id": "chatcmpl-80cd8ddc-7788-4605-b40e-3975fe7e1326",
|
||||
"object": "chat.completion",
|
||||
"created": 1781149952,
|
||||
"model": "/Users/hyungi/mlx-models/Qwen3.6-27B-8bit",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"finish_reason": "stop",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
"content": "\uc81c\uacf5\ub41c \ubb38\uc11c\ub294 \uc555\ub825\uc6a9\uae30 \uac80\uc0ac\uc758 \uae30\uc900\uc774 \ub418\ub294 \uaddc\uc815\uc744 \uba85\uc2dc\ud558\uace0 \uc788\uc2b5\ub2c8\ub2e4. \ud575\uc2ec \ub0b4\uc6a9\uc740 \uc555\ub825\uc6a9\uae30\uc5d0 \ub300\ud55c \ubaa8\ub4e0 \uac80\uc0ac \uc808\ucc28\uc640 \uae30\uc900\uc774 'ASME Section VIII Div 1'\uc774\ub77c\ub294 \uad6d\uc81c\uc801\uc73c\ub85c \uc778\uc815\ubc1b\ub294 \uc555\ub825\uc6a9\uae30 \uc124\uacc4 \ubc0f \uc81c\uc791 \uaddc\uc815\uc5d0 \ub530\ub77c \uc5c4\uaca9\ud558\uac8c \uc218\ud589\ub418\uc5b4\uc57c \ud55c\ub2e4\ub294 \uac83\uc785\ub2c8\ub2e4. \uc774\ub294 \uc548\uc804\uc131\uacfc \uc2e0\ub8b0\uc131\uc744 \ubcf4\uc7a5\ud558\uae30 \uc704\ud55c \ud544\uc218\uc801\uc778 \uc694\uad6c\uc0ac\ud56d\uc73c\ub85c, \ud574\ub2f9 \uaddc\uc815\uc744 \uc900\uc218\ud568\uc73c\ub85c\uc368 \uc555\ub825\uc6a9\uae30\uc758 \uad6c\uc870\uc801 \ubb34\uacb0\uc131\uacfc \uc6b4\uc601 \uc548\uc804\uc131\uc744 \ud655\ubcf4\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \ub530\ub77c\uc11c \uad00\ub828 \uc5c5\ubb34 \uc218\ud589 \uc2dc \ubc18\ub4dc\uc2dc \uc774 \uaddc\uc815\uc744 \ucc38\uc870\ud558\uc5ec \uac80\uc0ac\ub97c \uc9c4\ud589\ud574\uc57c \ud569\ub2c8\ub2e4.",
|
||||
"reasoning": null,
|
||||
"tool_calls": null,
|
||||
"tool_call_id": null,
|
||||
"name": null
|
||||
},
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 44,
|
||||
"completion_tokens": 118,
|
||||
"total_tokens": 162,
|
||||
"prompt_tokens_details": {
|
||||
"cached_tokens": 0
|
||||
},
|
||||
"prompt_tps": 0.0,
|
||||
"generation_tps": 0.0,
|
||||
"peak_memory": 29.804702642
|
||||
}
|
||||
}
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139860, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": "", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 1, "total_tokens": 29, "prompt_tps": 183.51595345126498, "generation_tps": 140349.23521338476, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": "", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 2, "total_tokens": 30, "prompt_tps": 183.51595345126498, "generation_tps": 93.60885515563795, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": "안녕하세요,", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 3, "total_tokens": 31, "prompt_tps": 183.51595345126498, "generation_tps": 70.37263329290622, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": "", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 4, "total_tokens": 32, "prompt_tps": 183.51595345126498, "generation_tps": 62.61454940315543, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": " 만나서", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 5, "total_tokens": 33, "prompt_tps": 183.51595345126498, "generation_tps": 58.7098801868211, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": "", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 6, "total_tokens": 34, "prompt_tps": 183.51595345126498, "generation_tps": 56.35974757228211, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": null, "delta": {"role": "assistant", "content": " 반갑습니다!", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 7, "total_tokens": 35, "prompt_tps": 183.51595345126498, "generation_tps": 54.81880127112613, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: {"id": "chatcmpl-4e188b8b-8617-4054-be82-25fece7b56f1", "object": "chat.completion.chunk", "created": 1781139861, "model": "mlx-community/gemma-4-26b-a4b-it-8bit", "choices": [{"index": 0, "finish_reason": "stop", "delta": {"role": "assistant", "content": "", "tool_calls": []}}], "usage": {"input_tokens": 28, "output_tokens": 7, "total_tokens": 35, "prompt_tps": 183.51595345126498, "generation_tps": 54.81880127112613, "peak_memory": 34.66827434}}
|
||||
|
||||
|
||||
data: [DONE]
|
||||
|
||||
+12
@@ -0,0 +1,12 @@
|
||||
data: {"id":"chatcmpl-96ee9a0d-2f66-4357-876f-951c80c23bb2","object":"chat.completion.chunk","created":1781139880,"model":"/Users/hyungi/mlx-models/Qwen3.6-27B-8bit","choices":[{"index":0,"finish_reason":null,"delta":{"role":"assistant","content":"","reasoning":null,"tool_calls":null,"tool_call_id":null,"name":null},"logprobs":null}],"usage":{"prompt_tokens":25,"completion_tokens":1,"total_tokens":26,"prompt_tokens_details":{"cached_tokens":0},"prompt_tps":0.0,"generation_tps":0.0,"peak_memory":0.0}}
|
||||
|
||||
data: {"id":"chatcmpl-96ee9a0d-2f66-4357-876f-951c80c23bb2","object":"chat.completion.chunk","created":1781139880,"model":"/Users/hyungi/mlx-models/Qwen3.6-27B-8bit","choices":[{"index":0,"finish_reason":null,"delta":{"role":"assistant","content":"","reasoning":null,"tool_calls":null,"tool_call_id":null,"name":null},"logprobs":null}],"usage":{"prompt_tokens":25,"completion_tokens":2,"total_tokens":27,"prompt_tokens_details":{"cached_tokens":0},"prompt_tps":0.0,"generation_tps":0.0,"peak_memory":0.0}}
|
||||
|
||||
data: {"id":"chatcmpl-96ee9a0d-2f66-4357-876f-951c80c23bb2","object":"chat.completion.chunk","created":1781139880,"model":"/Users/hyungi/mlx-models/Qwen3.6-27B-8bit","choices":[{"index":0,"finish_reason":null,"delta":{"role":"assistant","content":"","reasoning":null,"tool_calls":null,"tool_call_id":null,"name":null},"logprobs":null}],"usage":{"prompt_tokens":25,"completion_tokens":3,"total_tokens":28,"prompt_tokens_details":{"cached_tokens":0},"prompt_tps":0.0,"generation_tps":0.0,"peak_memory":0.0}}
|
||||
|
||||
data: {"id":"chatcmpl-96ee9a0d-2f66-4357-876f-951c80c23bb2","object":"chat.completion.chunk","created":1781139881,"model":"/Users/hyungi/mlx-models/Qwen3.6-27B-8bit","choices":[{"index":0,"finish_reason":null,"delta":{"role":"assistant","content":"","reasoning":null,"tool_calls":null,"tool_call_id":null,"name":null},"logprobs":null}],"usage":{"prompt_tokens":25,"completion_tokens":4,"total_tokens":29,"prompt_tokens_details":{"cached_tokens":0},"prompt_tps":0.0,"generation_tps":0.0,"peak_memory":0.0}}
|
||||
|
||||
data: {"id":"chatcmpl-96ee9a0d-2f66-4357-876f-951c80c23bb2","object":"chat.completion.chunk","created":1781139881,"model":"/Users/hyungi/mlx-models/Qwen3.6-27B-8bit","choices":[{"index":0,"finish_reason":"stop","delta":{"role":"assistant","content":"안녕하세요!","reasoning":null,"tool_calls":null,"tool_call_id":null,"name":null},"logprobs":null}],"usage":{"prompt_tokens":25,"completion_tokens":5,"total_tokens":30,"prompt_tokens_details":{"cached_tokens":0},"prompt_tps":0.0,"generation_tps":0.0,"peak_memory":0.0}}
|
||||
|
||||
data: [DONE]
|
||||
|
||||
@@ -0,0 +1,139 @@
|
||||
"""crawl-24x7 사이클 2 — 순수 함수/형태 회귀 테스트 (DB 불요).
|
||||
|
||||
Guardian 호출 형태 + fixture 응답 파싱 + 채널 정체성 + B-5 quirk.
|
||||
fixture = tests/fixtures/guardian_open_platform_search_response.json
|
||||
(2026-06-10 실키 live 박제, api-key 응답 본문 미포함 확인 — [[feedback_external_api_fixture_first]]).
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from workers.news_collector import (
|
||||
_article_hash,
|
||||
_doc_identity,
|
||||
_guardian_request,
|
||||
_normalize_category,
|
||||
)
|
||||
|
||||
FIXTURE = Path(__file__).parent / "fixtures" / "guardian_open_platform_search_response.json"
|
||||
|
||||
|
||||
def _make_source(**kw):
|
||||
"""ORM 인스턴스 없이 속성만 흉내 (식별성 함수는 속성 접근만 사용)."""
|
||||
class S:
|
||||
pass
|
||||
s = S()
|
||||
s.source_channel = kw.get("source_channel", "news")
|
||||
s.parser_quirk = kw.get("parser_quirk")
|
||||
return s
|
||||
|
||||
|
||||
class TestGuardianCallShape:
    """Pins the (endpoint, params) pair produced by ``_guardian_request``."""

    def test_request_shape_matches_fixture_recipe(self):
        """The request must match the call used when the fixture was recorded
        ([[feedback_fixture_first_call_shape]])."""
        feed_url = "https://content.guardianapis.com/search?section=world"
        endpoint, params = _guardian_request(feed_url, "KEY")

        assert endpoint == "https://content.guardianapis.com/search"
        expected_params = {
            "section": "world",
            "show-fields": "bodyText,trailText",
            "order-by": "newest",
            "api-key": "KEY",
        }
        for name, value in expected_params.items():
            assert params[name] == value

    def test_feed_url_query_overridden_by_fixed_fields(self):
        # Even when feed_url wrongly carries its own show-fields value, the
        # fixed field list must win (original note: dict merge order).
        feed_url = (
            "https://content.guardianapis.com/search?section=world&show-fields=headline"
        )
        params = _guardian_request(feed_url, "K")[1]
        assert params["show-fields"] == "bodyText,trailText"
|
||||
|
||||
|
||||
class TestGuardianFixtureParsing:
    """Shape checks on the recorded Guardian search response fixture.

    The fixture is always decoded as UTF-8 so these tests do not depend on
    the platform's default locale encoding.
    """

    def test_fixture_response_shape(self):
        """Every result carries the fields the collector reads."""
        payload = json.loads(FIXTURE.read_text(encoding="utf-8"))["response"]
        assert payload["status"] == "ok"
        assert payload["results"], "fixture 에 결과 0건"
        for item in payload["results"]:
            assert item["webTitle"].strip()
            assert item["webUrl"].startswith("https://")
            assert "webPublicationDate" in item
            assert "sectionName" in item
            fields = item.get("fields") or {}
            assert "bodyText" in fields and "trailText" in fields

    def test_fixture_bodytext_is_fulltext_grade(self):
        payload = json.loads(FIXTURE.read_text(encoding="utf-8"))["response"]
        # At least one item must clear the full-text gate (200 chars);
        # otherwise the adapter's is_full path is never exercised.
        assert any(len(i["fields"]["bodyText"]) >= 200 for i in payload["results"])

    def test_fixture_contains_no_api_key(self):
        """The recorded response must not contain the ``api-key`` token."""
        assert "api-key" not in FIXTURE.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
class TestChannelIdentity:
    """Document identity (path prefix / domain / tags) per source channel."""

    def test_news_channel_unchanged(self):
        news_source = _make_source(source_channel="news")
        expected = {
            "path_prefix": "news",
            "ai_domain": "News",
            "ai_tags": ["News/경향신문/Society"],
        }
        assert _doc_identity(news_source, "경향신문", "Society") == expected

    def test_crawl_channel_domain_identity(self):
        crawl_source = _make_source(source_channel="crawl")
        identity = _doc_identity(crawl_source, "TWI", "Engineering")
        assert identity["path_prefix"] == "crawl"
        assert identity["ai_domain"] == "Engineering"
        assert identity["ai_tags"] == ["Engineering/TWI"]

    def test_crawl_channel_unknown_category_falls_back(self):
        crawl_source = _make_source(source_channel="crawl")
        identity = _doc_identity(crawl_source, "X", "Other")
        assert identity["ai_domain"] == "Domain"

    def test_category_map_has_domain_axes(self):
        expectations = (
            ("안전", "Safety"),
            ("Engineering", "Engineering"),
            ("철학", "Philosophy"),
        )
        for raw, normalized in expectations:
            assert _normalize_category(raw) == normalized
|
||||
|
||||
|
||||
class TestSkipVideoQuirk:
    """Checks the URL pattern used to recognise video items."""

    PATTERN = re.compile(r"/videos?/")

    def test_video_urls_match(self):
        video_urls = (
            "https://psyche.co/videos/some-film",
            "https://aeon.co/video/another",
        )
        for url in video_urls:
            assert self.PATTERN.search(url)

    def test_article_urls_pass(self):
        assert self.PATTERN.search("https://psyche.co/ideas/how-to-think") is None
|
||||
|
||||
|
||||
class TestRedirect304Distinction:
    """httpx reports ``is_redirect`` as True for 304 as well, so a 304 was
    mistaken for a redirect and stable conditional-GET feeds all failed with
    "more than 3 redirects". ``has_redirect_location`` tells the two apart."""

    def test_304_is_not_a_redirect_location(self):
        import httpx

        request = httpx.Request("GET", "https://x/")
        not_modified = httpx.Response(304, request=request)
        # httpx pitfall: 304 also counts as is_redirect.
        assert not_modified.is_redirect is True
        # This is the precise check the collector has to use.
        assert not_modified.has_redirect_location is False

    def test_real_redirect_has_location(self):
        import httpx

        request = httpx.Request("GET", "https://x/")
        moved = httpx.Response(301, headers={"location": "https://y/"}, request=request)
        assert moved.has_redirect_location is True

    def test_collector_uses_has_redirect_location(self):
        import inspect
        from workers import news_collector

        fetch_source = inspect.getsource(news_collector._fetch_rss)
        assert "has_redirect_location" in fetch_source
        # The old buggy loop condition must be gone.
        assert "while resp.is_redirect" not in fetch_source
|
||||
|
||||
|
||||
class TestArticleHashStability:
    """``_article_hash`` must be deterministic for identical inputs."""

    def test_static_corpus_hash_deterministic(self):
        args = ("Creep and Creep Failures", "static", "National Board 기술 아티클")
        first = _article_hash(*args)
        second = _article_hash(*args)
        assert first == second
        assert len(first) == 32
|
||||
@@ -0,0 +1,327 @@
|
||||
"""crawl-24x7 사이클 3 — 순수 함수/형태 회귀 테스트 (DB 불요).
|
||||
|
||||
B-4 signal-only(본문 무절단 + enqueue 가드) + C-4 피드 shape + CSB sitemap diff 파서
|
||||
+ API 공지 목록 파서 + CCPS beacon 링크 파서 + B-5 (Nikkei RDF = feedparser 네이티브,
|
||||
코드 분기 불요 박제).
|
||||
|
||||
fixture = 2026-06-11 live 박제 (tests/fixtures/, [[feedback_external_api_fixture_first]]).
|
||||
economist/ieee 는 repo 크기 사유로 item 수만 trim (헤더/푸터/item 구조 byte-faithful).
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
import feedparser
|
||||
import pytest
|
||||
|
||||
from workers import news_collector
|
||||
from workers.api_standards_collector import _parse_listing, _parse_pub_date
|
||||
from workers.ccps_collector import _beacon_pdf_links
|
||||
from workers.csb_collector import _parse_sitemap, _pdf_links, _should_skip
|
||||
from workers.news_collector import _clean_html, _entry_body
|
||||
|
||||
FIXTURES = Path(__file__).parent / "fixtures"
|
||||
|
||||
|
||||
def _feed(name: str):
    """Parse the fixture file *name* (read as UTF-8) with feedparser."""
    fixture_path = FIXTURES / name
    raw_xml = fixture_path.read_text(encoding="utf-8")
    return feedparser.parse(raw_xml)
|
||||
|
||||
|
||||
def _source(**kw):
|
||||
return SimpleNamespace(
|
||||
fetch_method=kw.get("fetch_method", "rss"),
|
||||
fulltext_policy=kw.get("fulltext_policy", "none"),
|
||||
source_channel=kw.get("source_channel", "news"),
|
||||
)
|
||||
|
||||
|
||||
# ── B-4: 본문 선택 정책 ───────────────────────────────────────────────────────
|
||||
|
||||
class TestEntryBodyPolicy:
    """Body selection policy of ``_entry_body`` for each source configuration."""

    def test_signal_only_preserves_full_abstract(self):
        """The arXiv abstract is about 1.6K chars; the default 1000-char cap
        would drop its tail."""
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        # Default path: the summary is cut at 1000 chars.
        capped = _clean_html(first_entry.get("summary", ""))
        signal_source = _source(fetch_method="signal-only")
        body, version = _entry_body(signal_source, first_entry, capped)
        assert version == "rss-signal"
        assert len(body) > 1000
        assert len(capped) <= 1000
        assert "Abstract" in body

    def test_feed_full_promotes_ieee_description(self):
        first_entry = _feed("ieee_spectrum_energy_rss.xml").entries[0]
        capped = _clean_html(first_entry.get("summary", ""))
        feed_full_source = _source(fulltext_policy="feed-full")
        body, version = _entry_body(feed_full_source, first_entry, capped)
        assert version == "rss-feed-full"
        assert len(body) > 1000

    def test_default_source_keeps_capped_summary(self):
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        capped = _clean_html(first_entry.get("summary", ""))
        body, version = _entry_body(_source(), first_entry, capped)
        assert version == "rss"
        assert body == capped

    def test_signal_only_title_fallback_when_feed_has_no_summary(self):
        """The Nikkei RDF feed has no description, so the body degrades to the
        summary argument (here the title fallback)."""
        first_entry = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        title = first_entry.get("title", "")
        body, version = _entry_body(
            _source(fetch_method="signal-only"), first_entry, title
        )
        assert version == "rss-signal"
        assert body == first_entry.get("title", "")
        assert first_entry.get("title", "") != ""
|
||||
|
||||
|
||||
# ── B-4: enqueue 가드 (signal-only = fulltext/summarize 절대 금지) ────────────
|
||||
|
||||
class TestSignalOnlyEnqueueGuard:
    """signal-only sources must never enqueue fulltext/summarize stages."""

    @staticmethod
    def _patch(monkeypatch):
        """Replace ``news_collector.enqueue_stage`` with a recorder and return
        the list that collects the enqueued stage names."""
        recorded = []

        async def fake_enqueue(session, doc_id, stage):
            recorded.append(stage)

        monkeypatch.setattr(news_collector, "enqueue_stage", fake_enqueue)
        return recorded

    @pytest.mark.asyncio
    async def test_signal_only_overrides_misconfigured_page_policy(self, monkeypatch):
        """Even if the registry wrongly sets fulltext_policy='page', no page
        fetch is enqueued (defensive)."""
        recorded = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        source = _source(fetch_method="signal-only", fulltext_policy="page")
        now = datetime.now(timezone.utc)
        await news_collector._enqueue_processing(None, document, source, now)
        # No fulltext/summarize stage.
        assert recorded == ["embed", "chunk"]

    @pytest.mark.asyncio
    async def test_signal_only_news_respects_30day_gate(self, monkeypatch):
        recorded = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        forty_days_ago = datetime.now(timezone.utc) - timedelta(days=40)
        source = _source(fetch_method="signal-only")
        await news_collector._enqueue_processing(None, document, source, forty_days_ago)
        assert recorded == []

    @pytest.mark.asyncio
    async def test_signal_only_crawl_channel_indexes_regardless_of_age(self, monkeypatch):
        recorded = self._patch(monkeypatch)
        document = SimpleNamespace(id=1, edit_url="https://x/a")
        long_ago = datetime.now(timezone.utc) - timedelta(days=400)
        source = _source(fetch_method="signal-only", source_channel="crawl")
        await news_collector._enqueue_processing(None, document, source, long_ago)
        assert recorded == ["embed", "chunk"]
|
||||
|
||||
|
||||
# ── Connection-layer retry, at most 2 retries (observed: MOEL intermittently drops the first TLS handshake) ──
|
||||
|
||||
class TestConnectRetry:
    """Retry behaviour of ``news_collector._get_with_connect_retry``."""

    class _Client:
        """Fake client: each ``get`` raises the next queued error, then
        returns ``"OK"`` once the queue is empty."""

        def __init__(self, errors: list):
            self.errors = errors
            self.calls = 0

        async def get(self, url):
            self.calls += 1
            if not self.errors:
                return "OK"
            raise self.errors.pop(0)

    @pytest.mark.asyncio
    async def test_single_connect_error_retried_once(self):
        import httpx

        fake = self._Client([httpx.ConnectError("")])
        result = await news_collector._get_with_connect_retry(fake, "https://x/feed")
        assert result == "OK"
        assert fake.calls == 2

    @pytest.mark.asyncio
    async def test_second_retry_absorbs_consecutive_drop(self):
        """Drops are random per connection, so a single retry can be hit
        again back-to-back (observed on MOEL lawinfo)."""
        import httpx

        fake = self._Client([httpx.ConnectError(""), httpx.ConnectError("")])
        result = await news_collector._get_with_connect_retry(fake, "https://x/feed")
        assert result == "OK"
        assert fake.calls == 3

    @pytest.mark.asyncio
    async def test_persistent_connect_error_propagates(self):
        import httpx

        fake = self._Client([httpx.ConnectError("")] * 3)
        with pytest.raises(httpx.ConnectError):
            await news_collector._get_with_connect_retry(fake, "https://x/feed")
        # At most 2 retries; persistent outages are the circuit's job.
        assert fake.calls == 3

    @pytest.mark.asyncio
    async def test_non_connect_errors_not_retried(self):
        import httpx

        fake = self._Client([httpx.ReadTimeout("")])
        with pytest.raises(httpx.ReadTimeout):
            await news_collector._get_with_connect_retry(fake, "https://x/feed")
        assert fake.calls == 1
|
||||
|
||||
|
||||
# ── C-4 / B-4 피드 shape (시드 전 live 박제) ─────────────────────────────────
|
||||
|
||||
class TestNikkeiRdfNativeParsing:
    """Records that the B-5 'rdf' quirk needs no code branch: feedparser
    normalises RSS 1.0 on its own."""

    def test_rss10_entries_have_title_and_link(self):
        parsed = _feed("nikkei_asia_nar_rdf.xml")
        assert parsed.version == "rss10"
        assert not parsed.bozo
        assert len(parsed.entries) >= 10
        for entry in parsed.entries:
            assert entry.get("title", "").strip()
            assert entry.get("link", "").startswith("https://asia.nikkei.com/")

    def test_no_summary_no_dates_means_title_signal(self):
        first_entry = _feed("nikkei_asia_nar_rdf.xml").entries[0]
        assert not first_entry.get("summary", "")
        assert not first_entry.get("published_parsed")
        assert not first_entry.get("updated_parsed")
|
||||
|
||||
|
||||
class TestBloombergFixture:
    """Shape of the recorded Bloomberg markets feed."""

    def test_video_items_mixed_in_feed(self):
        """Video items are mixed into the feed, which is the rationale for
        seeding parser_quirk='skip-video'."""
        entries = _feed("bloomberg_markets_rss.xml").entries
        links = [entry.get("link", "") for entry in entries]
        # Per the original note: same pattern as news_collector's skip-video.
        video_pat = re.compile(r"/videos?/")
        has_video = any(video_pat.search(url) for url in links)
        has_article = any(
            "/news/articles/" in url and not video_pat.search(url) for url in links
        )
        assert has_video
        assert has_article

    def test_articles_have_signal_grade_summary(self):
        parsed = _feed("bloomberg_markets_rss.xml")
        summary_lengths = (len(entry.get("summary", "")) for entry in parsed.entries)
        assert any(length >= 100 for length in summary_lengths)
|
||||
|
||||
|
||||
class TestAsmeJpvtFixture:
    """Shape of the recorded ASME JPVT open-issues feed."""

    def test_journal_identity_and_abstract(self):
        parsed = _feed("asme_jpvt_openissues_rss.xml")
        feed_title = parsed.feed.get("title", "")
        assert "Pressure Vessel Technology" in feed_title
        assert parsed.entries
        for entry in parsed.entries:
            # The abstract serves as the body here.
            assert len(entry.get("summary", "")) >= 200
|
||||
|
||||
|
||||
class TestArxivFixture:
    """Shape of the recorded arXiv feed."""

    def test_abs_links_are_stable_dedup_keys(self):
        """replace/cross re-announcements keep the same /abs/ URL, so
        edit_url dedup blocks them naturally."""
        parsed = _feed("arxiv_appph_rss.xml")
        assert parsed.entries
        abs_link = re.compile(r"https://arxiv\.org/abs/\d")
        for entry in parsed.entries:
            assert abs_link.match(entry.get("link", ""))

    def test_announce_type_in_summary(self):
        first_entry = _feed("arxiv_appph_rss.xml").entries[0]
        assert "Announce Type:" in first_entry.get("summary", "")
|
||||
|
||||
|
||||
class TestEconomistFixture:
    """Shape of the recorded Economist latest feed."""

    def test_oneline_signal_summaries(self):
        parsed = _feed("economist_latest_rss.xml")
        assert parsed.entries
        for entry in parsed.entries:
            title = entry.get("title", "")
            link = entry.get("link", "")
            assert title.strip()
            assert link.startswith("https://www.economist.com/")
|
||||
|
||||
|
||||
# ── CSB sitemap diff 파서 ────────────────────────────────────────────────────
|
||||
|
||||
class TestCsbSitemapParsing:
    """CSB sitemap parsing, skip rules and watermark diff ordering."""

    def test_parse_pairs_with_tz_aware_lastmod(self):
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        pairs = _parse_sitemap(sitemap_path.read_text(encoding="utf-8"))
        assert pairs
        for url, lastmod in pairs:
            assert url.startswith("https://www.csb.gov/")
            assert lastmod.tzinfo is not None

    def test_skip_sections_vs_root_slugs(self):
        skipped = (
            "https://www.csb.gov/videos/some-video/",
            "https://www.csb.gov/investigations/completed-investigations/",
            "https://www.csb.gov/site-map/",
            "https://www.csb.gov/",  # home page
        )
        for url in skipped:
            assert _should_skip(url)
        # Investigation reports / news releases are root slugs and are
        # collection targets.
        collected = (
            "https://www.csb.gov/givaudan-sense-colour-explosion-/",
            "https://www.csb.gov/recommendations/preventive-maintenance/",
        )
        for url in collected:
            assert not _should_skip(url)

    def test_watermark_diff_orders_oldest_first(self):
        sitemap_path = FIXTURES / "csb_sitemap_sample.xml"
        parsed = _parse_sitemap(sitemap_path.read_text(encoding="utf-8"))
        pairs = [pair for pair in parsed if not _should_skip(pair[0])]
        watermark = min(lastmod for _, lastmod in pairs)
        at_or_after = ((url, lastmod) for url, lastmod in pairs if lastmod >= watermark)
        changed = sorted(at_or_after, key=lambda pair: pair[1])
        assert changed == sorted(changed, key=lambda pair: pair[1])
        # ">=" keeps the boundary entry itself.
        assert len(changed) == len(pairs)
|
||||
|
||||
|
||||
class TestCsbPdfLinks:
    """PDF link extraction from a CSB investigation page."""

    HTML = (FIXTURES / "csb_investigation_page_excerpt.html").read_text(encoding="utf-8")
    BASE = "https://www.csb.gov/givaudan-sense-colour-explosion-/"

    def test_report_pdfs_kept_with_cachebuster_query(self):
        found = _pdf_links(self.HTML, self.BASE)
        assert any("Givaudan_Investigation_Report_Publication.pdf" in url for url in found)
        # The cache-buster query stays on the download URL (per the original
        # note, normalisation happens only on the filename/dedup axis).
        assert any("?" in url for url in found)
        for url in found:
            assert url.startswith("https://www.csb.gov/")

    def test_recommendation_status_summaries_excluded(self):
        found = _pdf_links(self.HTML, self.BASE)
        assert found
        assert not any("/assets/recommendation/" in url for url in found)

    def test_dedup_by_path(self):
        anchors = (
            '<a href="/assets/1/6/r.pdf?100">a</a>',
            '<a href="/assets/1/6/r.pdf?200">b</a>',
            '<a href="https://evil.example.com/x.pdf">c</a>',
        )
        found = _pdf_links("".join(anchors), "https://www.csb.gov/page/")
        # Same path counted once, external host excluded.
        assert len(found) == 1
        assert found[0].startswith("https://www.csb.gov/assets/1/6/r.pdf")
|
||||
|
||||
|
||||
# ── API 표준 공지 목록 파서 ──────────────────────────────────────────────────
|
||||
|
||||
class TestApiListingParsing:
    """Parsing of the API standards announcement listing and its dates."""

    HTML = (FIXTURES / "api_standards_announcements_listing.html").read_text(
        encoding="utf-8", errors="replace"
    )

    def test_ten_unique_detail_links_per_page(self):
        detail_prefix = (
            "https://www.api.org/products-and-services/standards/"
            "important-standards-announcements/"
        )
        urls = _parse_listing(self.HTML)
        assert len(urls) == 10
        assert len(set(urls)) == 10
        for url in urls:
            assert url.startswith(detail_prefix)
            # Pagination links (?page=) must not leak in.
            assert "?" not in url

    def test_pub_date_parse(self):
        parsed = _parse_pub_date("Published June 4, 2026 — API announces ...")
        assert parsed == datetime(2026, 6, 4, tzinfo=timezone.utc)
        assert _parse_pub_date("no date here") is None
        # A date that does not exist on the calendar yields None.
        assert _parse_pub_date("February 31, 2026") is None
|
||||
|
||||
|
||||
# ── CCPS beacon 링크 파서 ────────────────────────────────────────────────────
|
||||
|
||||
class TestCcpsBeaconLinks:
    """Beacon PDF link filtering and relative URL resolution."""

    def test_beacon_filter_and_relative_resolve(self):
        anchors = (
            '<a href="/sites/default/files/2026-06/Beacon-June-2026.pdf">June</a>',
            '<a href="/sites/default/files/beacon_korean_2026_06.pdf"><b>Korean</b></a>',
            '<a href="/sites/default/files/other-brochure.pdf">brochure</a>',
            '<a href="/sites/default/files/monthly.pdf">Process Safety Beacon June</a>',
        )
        page_url = "https://www.aiche.org/ccps/resources/process-safety-beacon"
        found = _beacon_pdf_links("".join(anchors), page_url)
        assert "https://www.aiche.org/sites/default/files/2026-06/Beacon-June-2026.pdf" in found
        assert any("beacon_korean" in url for url in found)
        # Matched through the anchor text rather than the file name.
        assert any(url.endswith("/monthly.pdf") for url in found)
        assert not any("other-brochure" in url for url in found)
|
||||
@@ -0,0 +1,159 @@
|
||||
"""ds-macbook-offload-1 P2-4 — deep 슬롯 라우팅 / 보류(StageDeferred) / drain 가드 테스트.
|
||||
|
||||
DB 불요(unit) — AIClient 는 __new__ 로 settings 우회, drain 가드는 settings monkeypatch.
|
||||
통합(보류 백오프 DB 기록, claim 경합)은 P3-2 E2E 게이트에서 라이브 실측.
|
||||
fixture = tests/fixtures/qwen_router_chat_completion.json (2026-06-11 라이브 박제 —
|
||||
라우터 :8890 경유 model=qwen-macbook, production 호출 형상과 동일 body, 13.2s 실측).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from ai.client import AIClient, call_deep_or_defer, is_deferrable_error
|
||||
from models.queue import StageDeferred
|
||||
|
||||
FIXTURE = Path(__file__).parent / "fixtures" / "qwen_router_chat_completion.json"
|
||||
|
||||
|
||||
def _client(deep_cfg, primary_cfg):
    """Create an ``AIClient`` without running ``__init__`` and inject only
    its ``ai`` slots (``deep`` / ``primary``)."""
    instance = AIClient.__new__(AIClient)
    slots = SimpleNamespace(deep=deep_cfg, primary=primary_cfg)
    instance.ai = slots
    return instance
|
||||
|
||||
|
||||
def _http_status_error(status: int) -> httpx.HTTPStatusError:
    """Build an ``httpx.HTTPStatusError`` wrapping a response with *status*."""
    request = httpx.Request("POST", "http://router:8890/v1/chat/completions")
    response = httpx.Response(status, request=request)
    message = f"status {status}"
    return httpx.HTTPStatusError(message, request=request, response=response)
|
||||
|
||||
|
||||
# ─── is_deferrable_error 분류 ──────────────────────────────────────────────
|
||||
|
||||
@pytest.mark.parametrize("exc", [
    _http_status_error(503),   # router: upstream_cold / editor_busy / warming
    _http_status_error(502),   # router: upstream connect failure / cut mid-generation
    _http_status_error(504),
    httpx.ConnectError("connection refused"),   # router itself unavailable
    httpx.ConnectTimeout("connect timeout"),
    httpx.ReadTimeout("read timeout"),          # cut between DS and the router
    httpx.ReadError("connection reset"),
    httpx.RemoteProtocolError("server disconnected"),
])
def test_deferrable_errors(exc):
    """Each of these failures must be classified as deferrable."""
    verdict = is_deferrable_error(exc)
    assert verdict is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("exc", [
    _http_status_error(400),   # e.g. unknown alias: a config error is not deferred
    _http_status_error(500),
    ValueError("parse"),
    RuntimeError("boom"),
])
def test_non_deferrable_errors(exc):
    """Each of these failures must not be classified as deferrable."""
    verdict = is_deferrable_error(exc)
    assert verdict is False
|
||||
|
||||
|
||||
# ─── call_deep 슬롯 선택 ───────────────────────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
async def test_call_deep_uses_deep_slot():
    """``call_deep`` must hand the deep slot config to ``_request``."""
    deep_cfg = SimpleNamespace(model="qwen-macbook")
    primary_cfg = SimpleNamespace(model="gemma-26b")
    client = _client(deep_cfg, primary_cfg)
    seen = {}

    async def fake_request(cfg, prompt, system=None):
        seen["cfg"] = cfg
        return "ok"

    client._request = fake_request
    answer = await client.call_deep("p")
    assert answer == "ok"
    assert seen["cfg"] is deep_cfg
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_call_deep_falls_back_to_primary_when_slot_absent():
    """A missing deep slot means the feature is off: the primary slot is used
    defensively (the existing path as-is, not a silent downgrade)."""
    primary_cfg = SimpleNamespace(model="gemma-26b")
    client = _client(None, primary_cfg)
    seen = {}

    async def fake_request(cfg, prompt, system=None):
        seen["cfg"] = cfg
        return "ok"

    client._request = fake_request
    await client.call_deep("p")
    assert seen["cfg"] is primary_cfg
|
||||
|
||||
|
||||
# ─── call_deep_or_defer 보류 변환 ──────────────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("exc", [
    _http_status_error(503),
    httpx.ConnectError("refused"),
    httpx.ReadTimeout("cut mid-generation"),
])
async def test_defer_conversion(exc):
    """Deferrable failures from ``_request`` surface as ``StageDeferred``."""
    deep_cfg = SimpleNamespace(model="qwen-macbook")
    client = _client(deep_cfg, None)

    async def fail_request(cfg, prompt, system=None):
        raise exc

    client._request = fail_request
    with pytest.raises(StageDeferred):
        await call_deep_or_defer(client, "p")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_non_deferrable_propagates():
    """A 400 / ordinary error is not StageDeferred; it propagates to the
    caller's existing failure path."""
    deep_cfg = SimpleNamespace(model="qwen-macbook")
    client = _client(deep_cfg, None)

    async def fail_request(cfg, prompt, system=None):
        raise _http_status_error(400)

    client._request = fail_request
    with pytest.raises(httpx.HTTPStatusError):
        await call_deep_or_defer(client, "p")
|
||||
|
||||
|
||||
def test_stage_deferred_carries_backoff():
    """A ``StageDeferred`` built with only a reason reports a 30-minute retry."""
    deferred = StageDeferred("macbook_unavailable:ConnectError")
    assert deferred.retry_after_minutes == 30
|
||||
|
||||
|
||||
def test_router_fixture_shape():
    """Pin that the ``choices[0].message.content`` parsing path matches the
    shape of the router's recorded response.

    The fixture is decoded as UTF-8 explicitly so the test does not depend on
    the platform's default locale encoding.
    """
    data = json.loads(FIXTURE.read_text(encoding="utf-8"))
    message = data["choices"][0]["message"]
    content = message["content"]
    assert isinstance(content, str) and len(content) > 0
    assert message["role"] == "assistant"
    # Per the original note, the router replaces the alias with the upstream
    # local path in its response, so the model that actually ran is traceable.
    assert "Qwen3.6-27B-8bit" in data["model"]
|
||||
|
||||
|
||||
# ─── drain 가드 (silent 강등 금지) ─────────────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
async def test_drain_requires_deep_slot(monkeypatch):
    """With no deep slot configured, ``drain`` must exit instead of running."""
    import workers.queue_drain as qd

    settings_without_deep = SimpleNamespace(ai=SimpleNamespace(deep=None))
    monkeypatch.setattr(qd, "settings", settings_without_deep)
    with pytest.raises(SystemExit):
        await qd.drain("summarize", 1)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_drain_rejects_non_drain_stage(monkeypatch):
    """Even with a deep slot present, ``drain("classify", ...)`` must exit."""
    import workers.queue_drain as qd

    settings_with_deep = SimpleNamespace(ai=SimpleNamespace(deep=object()))
    monkeypatch.setattr(qd, "settings", settings_with_deep)
    with pytest.raises(SystemExit):
        await qd.drain("classify", 1)
|
||||
@@ -0,0 +1,383 @@
|
||||
"""GET /api/queue/overview 판정부 단위테스트 — DB 불요 (plan ds-processing-ui-6an).
|
||||
|
||||
services/queue_overview 의 SQL 수집부와 분리된 순수 판정 함수
|
||||
(stage_machine_map / build_machines / build_summarize_eta / build_trend /
|
||||
build_totals / compute_eta_minutes / rows_to_* / display_title) 를
|
||||
mock 행으로 검증한다. 통합(실 SQL)은 배포 후 라이브 smoke 로 확인.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from services.queue_overview import (
|
||||
build_machines,
|
||||
build_summarize_eta,
|
||||
build_totals,
|
||||
build_trend,
|
||||
compose_overview,
|
||||
compute_eta_minutes,
|
||||
display_title,
|
||||
rows_to_stage_stats,
|
||||
rows_to_summarize_split,
|
||||
stage_machine_map,
|
||||
)
|
||||
|
||||
KST = ZoneInfo("Asia/Seoul")
|
||||
|
||||
|
||||
def _stage(**kw) -> dict:
|
||||
"""stage 통계 1건 — 미지정 필드 0."""
|
||||
base = {
|
||||
"pending": 0, "processing": 0, "failed": 0,
|
||||
"done_1h": 0, "done_today": 0, "done_15m": 0,
|
||||
"deferred_pending": 0, "created_1h": 0,
|
||||
}
|
||||
base.update(kw)
|
||||
return base
|
||||
|
||||
|
||||
def _split(macbook: dict | None = None, macmini: dict | None = None) -> dict:
|
||||
"""summarize 풀 완료 실적 split — 미지정 0."""
|
||||
zero = {"done_1h": 0, "done_today": 0, "done_15m": 0}
|
||||
return {
|
||||
"macbook": {**zero, **(macbook or {})},
|
||||
"macmini": {**zero, **(macmini or {})},
|
||||
}
|
||||
|
||||
|
||||
def _machine(machines: list[dict], key: str) -> dict:
|
||||
return next(m for m in machines if m["key"] == key)
|
||||
|
||||
|
||||
# ─── stage→machine 귀속 맵 ────────────────────────────────────────────────────
|
||||
|
||||
def test_stage_machine_map_deep_enabled():
    """With the deep slot on: 8 stages on gpu, classify/summarize on macmini,
    deep_summary on macbook."""
    mapping = stage_machine_map(deep_enabled=True)
    gpu_stages = (
        "extract", "embed", "chunk", "markdown",
        "preview", "thumbnail", "fulltext", "stt",
    )
    for stage in gpu_stages:
        assert mapping[stage] == "gpu"
    assert mapping["classify"] == "macmini"
    assert mapping["summarize"] == "macmini"
    assert mapping["deep_summary"] == "macbook"
|
||||
|
||||
|
||||
def test_stage_machine_map_deep_disabled():
    """Without a deep slot, deep_summary is attributed to macmini as well."""
    mapping = stage_machine_map(deep_enabled=False)
    assert mapping["deep_summary"] == "macmini"
|
||||
|
||||
|
||||
# ─── 머신 카드 귀속 합산 ──────────────────────────────────────────────────────
|
||||
|
||||
def test_gpu_stage_counts_attribution():
    """Counters of gpu-attributed stages are summed onto the gpu card."""
    stats = {
        "extract": _stage(pending=3, processing=1, done_1h=5, done_today=9, done_15m=1),
        "stt": _stage(failed=2, done_1h=1, done_today=2),
    }
    cards = build_machines(stats, _split(), [], deep_enabled=True)
    gpu = _machine(cards, "gpu")

    assert gpu["pending"] == 3
    assert gpu["processing"] == 1
    assert gpu["failed"] == 2
    assert gpu["done_1h"] == 6
    assert gpu["done_today"] == 11
    # The gpu card lists all 8 static stages, even those with zero counts.
    expected_stages = [
        "extract", "embed", "chunk", "markdown",
        "preview", "thumbnail", "fulltext", "stt",
    ]
    assert gpu["stages"] == expected_stages
|
||||
|
||||
|
||||
def test_summarize_pool_split_attribution():
    """summarize pending/failed go to macmini while completions come from the
    split; the stage-level summarize done figures are not double-counted on
    the cards."""
    stats = {
        "classify": _stage(done_1h=2, done_today=3),
        "summarize": _stage(pending=7, failed=1, done_1h=10, done_today=20),
    }
    split = _split(
        macbook={"done_1h": 4, "done_today": 8},
        macmini={"done_1h": 6, "done_today": 12},
    )
    cards = build_machines(stats, split, [], deep_enabled=True)
    macmini = _machine(cards, "macmini")
    macbook = _machine(cards, "macbook")

    assert macmini["pending"] == 7
    assert macmini["failed"] == 1
    # classify + macmini's share of the split (not the stage-level 10).
    assert macmini["done_1h"] == 2 + 6
    assert macmini["done_today"] == 3 + 12
    assert macbook["done_1h"] == 4
    assert macbook["done_today"] == 8
    # Pool pending is attributed to macmini only.
    assert macbook["pending"] == 0
|
||||
|
||||
|
||||
def test_deep_disabled_deep_summary_counts_to_macmini():
    """With deep disabled, deep_summary counters are expected on the macmini card."""
    deep_stats = _stage(pending=2, processing=1, done_1h=3, done_today=4)
    cards = build_machines({"deep_summary": deep_stats}, _split(), [], deep_enabled=False)
    mini_card = _machine(cards, "macmini")
    book_card = _machine(cards, "macbook")

    expected_on_mini = (
        ("pending", 2),
        ("processing", 1),
        ("done_1h", 3),
        ("done_today", 4),
    )
    for field, value in expected_on_mini:
        assert mini_card[field] == value

    assert book_card["stages"] == []
    assert book_card["pending"] == 0
    assert _machine(cards, "macmini")["stages"] == ["classify", "summarize", "deep_summary"]
|
||||
|
||||
|
||||
def test_deferred_pending_always_on_macbook_card():
    """Deferred jobs (deferred_until in the future) land on the macbook card.

    The count is the summarize + deep_summary total, regardless of whether the
    deep slot exists (deferred = signal that the macbook is unavailable).
    """
    stage_stats = {
        "summarize": _stage(pending=5, deferred_pending=2),
        "deep_summary": _stage(pending=1, deferred_pending=1),
    }
    expected_by_machine = {"macbook": 3, "gpu": 0, "macmini": 0}

    for flag in (True, False):
        cards = build_machines(stage_stats, _split(), [], deep_enabled=flag)
        for machine_key, expected in expected_by_machine.items():
            assert _machine(cards, machine_key)["deferred_pending"] == expected
|
||||
|
||||
|
||||
# ─── state 판정 ───────────────────────────────────────────────────────────────
|
||||
|
||||
def test_macbook_state_active_wins_over_deferred_while_working():
    """Active beats deferred (user feedback 2026-06-11).

    While the machine is working it reports 'active' even if backoff remains.
    The deferred count travels separately in the deferred_pending field, which
    the card line displays.
    """
    stage_stats = {"summarize": _stage(pending=1, deferred_pending=1)}
    pool_split = _split(macbook={"done_15m": 3})

    cards = build_machines(stage_stats, pool_split, [], deep_enabled=True)
    book_card = _machine(cards, "macbook")

    assert book_card["state"] == "active"
    assert book_card["deferred_pending"] == 1
|
||||
|
||||
|
||||
def test_macbook_state_deferred_only_when_not_working():
    """'deferred' only when work is stalled (0 processing, 0 recent done) and just backoff piles up."""
    stage_stats = {"summarize": _stage(pending=1, deferred_pending=1)}
    cards = build_machines(stage_stats, _split(), [], deep_enabled=True)
    book_card = _machine(cards, "macbook")
    assert book_card["state"] == "deferred"
|
||||
|
||||
|
||||
def test_macbook_state_active_on_recent_qwen_done():
    """A macbook done_15m count in the split is expected to mark the macbook card active."""
    pool_split = _split(macbook={"done_15m": 1})
    cards = build_machines({}, pool_split, [], deep_enabled=True)
    book_card = _machine(cards, "macbook")
    assert book_card["state"] == "active"
|
||||
|
||||
|
||||
def test_macbook_state_idle():
    """With empty stats and a default split the macbook card is expected to be idle."""
    cards = build_machines({}, _split(), [], deep_enabled=True)
    book_card = _machine(cards, "macbook")
    assert book_card["state"] == "idle"
|
||||
|
||||
|
||||
def test_gpu_state_active_on_processing():
    """A processing extract job is expected to make the gpu card active."""
    stage_stats = {"extract": _stage(processing=1)}
    cards = build_machines(stage_stats, _split(), [], deep_enabled=True)
    gpu_card = _machine(cards, "gpu")
    assert gpu_card["state"] == "active"
|
||||
|
||||
|
||||
def test_gpu_state_active_on_recent_done():
    """A done_15m count on embed is expected to make the gpu card active."""
    stage_stats = {"embed": _stage(done_15m=2)}
    cards = build_machines(stage_stats, _split(), [], deep_enabled=True)
    gpu_card = _machine(cards, "gpu")
    assert gpu_card["state"] == "active"
|
||||
|
||||
|
||||
def test_gpu_state_idle_when_old_done_only():
    """Only older completions (nothing within 15 minutes) leave the gpu card idle."""
    # done_1h / done_today are set, but there is no completion within 15 minutes
    stage_stats = {"embed": _stage(done_1h=5, done_today=9)}
    cards = build_machines(stage_stats, _split(), [], deep_enabled=True)
    gpu_card = _machine(cards, "gpu")
    assert gpu_card["state"] == "idle"
|
||||
|
||||
|
||||
def test_macmini_state_not_active_on_macbook_pool_done():
    """If every summarize-pool completion belongs to macbook, macmini is not active (attribution basis)."""
    stage_stats = {"summarize": _stage(done_15m=1)}
    pool_split = _split(macbook={"done_15m": 1})
    cards = build_machines(stage_stats, pool_split, [], deep_enabled=True)
    mini_card = _machine(cards, "macmini")
    assert mini_card["state"] == "idle"
|
||||
|
||||
|
||||
def test_macmini_state_active_on_summarize_processing():
    """A processing summarize job is expected to make the macmini card active."""
    stage_stats = {"summarize": _stage(processing=1)}
    cards = build_machines(stage_stats, _split(), [], deep_enabled=True)
    mini_card = _machine(cards, "macmini")
    assert mini_card["state"] == "active"
|
||||
|
||||
|
||||
# ─── current 귀속 ─────────────────────────────────────────────────────────────
|
||||
|
||||
def test_current_summarize_to_macmini_max_two():
    """summarize rows appear under macmini's current list, capped at two entries."""
    row_specs = [
        ("summarize", 1, "문서A"),
        ("summarize", 2, "문서B"),
        ("summarize", 3, "문서C"),
        ("extract", 4, "문서D"),
    ]
    current_rows = [
        {
            "stage": stage,
            "document_id": doc_id,
            "title": title,
            "original_filename": None,
            "file_path": None,
        }
        for stage, doc_id, title in row_specs
    ]

    cards = build_machines({}, _split(), current_rows, deep_enabled=True)
    mini_card = _machine(cards, "macmini")
    gpu_card = _machine(cards, "gpu")

    mini_ids = [entry["document_id"] for entry in mini_card["current"]]
    assert mini_ids == [1, 2]  # at most 2 entries
    assert mini_card["current"][0] == {"document_id": 1, "title": "문서A", "stage": "summarize"}
    gpu_ids = [entry["document_id"] for entry in gpu_card["current"]]
    assert gpu_ids == [4]
    assert _machine(cards, "macbook")["current"] == []
|
||||
|
||||
|
||||
def test_current_deep_summary_follows_deep_slot():
    """A deep_summary row shows on macbook when deep is enabled, on macmini otherwise."""
    deep_row = {
        "stage": "deep_summary",
        "document_id": 9,
        "title": "심층",
        "original_filename": None,
        "file_path": None,
    }
    current_rows = [deep_row]

    with_deep = build_machines({}, _split(), current_rows, deep_enabled=True)
    without_deep = build_machines({}, _split(), current_rows, deep_enabled=False)

    first_on_macbook = _machine(with_deep, "macbook")["current"][0]
    assert first_on_macbook["document_id"] == 9
    first_on_macmini = _machine(without_deep, "macmini")["current"][0]
    assert first_on_macmini["document_id"] == 9
|
||||
|
||||
|
||||
def test_display_title_fallback_chain():
    """display_title falls back title -> original_filename -> file_path basename -> document number."""
    cases = [
        ({"document_id": 1, "title": "제목"}, "제목"),
        ({"document_id": 1, "title": None, "original_filename": "a.pdf"}, "a.pdf"),
        (
            {
                "document_id": 1,
                "title": None,
                "original_filename": None,
                "file_path": "/documents/PKM/Inbox/b.hwp",
            },
            "b.hwp",
        ),
        (
            {"document_id": 7, "title": None, "original_filename": None, "file_path": None},
            "문서 #7",
        ),
    ]
    for row, expected in cases:
        assert display_title(row) == expected
|
||||
|
||||
|
||||
# ─── summarize ETA ────────────────────────────────────────────────────────────
|
||||
|
||||
def test_eta_minutes_positive_drain():
    """Net drain of 6 per hour with 30 remaining yields 300 minutes."""
    eta = compute_eta_minutes(30, 10, 4)
    assert eta == 300
|
||||
|
||||
|
||||
def test_eta_minutes_null_when_not_draining():
    """No ETA (None) is expected whenever the done rate does not exceed the inflow rate."""
    not_draining = [
        (30, 4, 10),  # inflow > drain
        (30, 5, 5),   # a tie is also None
        (30, 0, 0),
    ]
    for pending, done_rate, inflow_rate in not_draining:
        assert compute_eta_minutes(pending, done_rate, inflow_rate) is None
|
||||
|
||||
|
||||
def test_eta_minutes_zero_pending():
    """Zero pending with a positive net drain is expected to give an ETA of 0."""
    eta = compute_eta_minutes(0, 10, 4)
    assert eta == 0
|
||||
|
||||
|
||||
def test_build_summarize_eta_pending_includes_deferred():
    """The ETA payload reports the pending total, which already contains the deferred jobs."""
    summarize_stats = _stage(pending=12, deferred_pending=5, done_1h=8, created_1h=2)
    result = build_summarize_eta({"summarize": summarize_stats})

    expected = {
        "pending": 12,  # total including deferred (pending itself includes deferred)
        "done_rate_1h": 8,
        "inflow_rate_1h": 2,
        "eta_minutes": round(12 / 6 * 60),
    }
    assert result == expected
|
||||
|
||||
|
||||
def test_build_summarize_eta_empty_stats():
    """Empty stats are expected to give zeroed rates and no ETA."""
    expected = {
        "pending": 0,
        "done_rate_1h": 0,
        "inflow_rate_1h": 0,
        "eta_minutes": None,
    }
    assert build_summarize_eta({}) == expected
|
||||
|
||||
|
||||
# ─── trend 24h ────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_trend_24_buckets_oldest_first_with_gaps():
    """The trend has 24 hourly buckets, oldest first, with missing hours filled by 0."""
    reference = datetime(2026, 6, 11, 14, 30, tzinfo=KST)
    inflow_by_hour = {
        "2026-06-11 13:00": 3,
        "2026-06-10 15:00": 1,  # 15:00 yesterday = the oldest bucket
    }
    done_by_hour = {"2026-06-11 14:00": 2}

    buckets = build_trend(inflow_by_hour, done_by_hour, reference)

    assert len(buckets) == 24
    oldest, previous, latest = buckets[0], buckets[-2], buckets[-1]
    assert oldest == {"hour": "15:00", "inflow": 1, "done": 0}  # oldest comes first
    assert latest == {"hour": "14:00", "inflow": 0, "done": 2}  # current-hour bucket
    assert previous == {"hour": "13:00", "inflow": 3, "done": 0}
    # Empty buckets count as 0.
    total_inflow = sum(bucket["inflow"] for bucket in buckets)
    total_done = sum(bucket["done"] for bucket in buckets)
    assert total_inflow == 4
    assert total_done == 2
|
||||
|
||||
|
||||
def test_trend_ignores_out_of_window_bucket():
    """Keys outside the window (not among the 24 buckets) are ignored.

    This keeps the result safe even if a cutoff-boundary row slips in.
    """
    reference = datetime(2026, 6, 11, 14, 30, tzinfo=KST)
    # 14:00 yesterday — earlier than the window's oldest bucket (15:00 yesterday)
    stale_inflow = {"2026-06-10 14:00": 99}

    buckets = build_trend(stale_inflow, {}, reference)

    total_inflow = sum(bucket["inflow"] for bucket in buckets)
    assert total_inflow == 0
|
||||
|
||||
|
||||
def test_trend_kst_midnight_crossing_labels():
    """Hour labels stay HH:00 when the 24-hour window crosses KST midnight."""
    reference = datetime(2026, 6, 11, 2, 5, tzinfo=KST)
    buckets = build_trend({}, {}, reference)
    labels = [bucket["hour"] for bucket in buckets]

    assert buckets[-1]["hour"] == "02:00"
    assert buckets[0]["hour"] == "03:00"  # 03:00 of the previous day (label is HH:00 only)
    assert labels[-3:] == ["00:00", "01:00", "02:00"]
|
||||
|
||||
|
||||
# ─── totals / row 변환 / 전체 조립 ───────────────────────────────────────────
|
||||
|
||||
def test_totals_sum_all_stages():
    """Totals add pending/processing/failed across every stage."""
    extract_stats = _stage(pending=1, processing=2, failed=3)
    summarize_stats = _stage(pending=4, failed=1)
    deep_stats = _stage(pending=2)
    stage_stats = {
        "extract": extract_stats,
        "summarize": summarize_stats,
        "deep_summary": deep_stats,
    }

    totals = build_totals(stage_stats)

    assert totals == {"pending": 7, "processing": 2, "failed": 4}
|
||||
|
||||
|
||||
def test_rows_to_stage_stats_conversion():
    """Row tuples become per-stage stat dicts; a None cell is expected to read back as 0."""
    extract_row = ("extract", 3, 1, 0, 5, 9, 1, 0, 2)
    summarize_row = ("summarize", 7, None, 1, 10, 20, 0, 2, 4)  # None guard
    converted = rows_to_stage_stats([extract_row, summarize_row])

    extract_stats = converted["extract"]
    assert extract_stats["pending"] == 3
    assert extract_stats["created_1h"] == 2

    summarize_stats = converted["summarize"]
    assert summarize_stats["processing"] == 0
    assert summarize_stats["deferred_pending"] == 2
|
||||
|
||||
|
||||
def test_rows_to_summarize_split_conversion():
    """Rows flagged by is_macbook are split into macbook and macmini completion counts."""
    macbook_row = (True, 4, 8, 1)  # first column: is_macbook
    macmini_row = (False, 6, 12, 0)

    pool_split = rows_to_summarize_split([macbook_row, macmini_row])

    expected_macbook = {"done_1h": 4, "done_today": 8, "done_15m": 1}
    expected_macmini = {"done_1h": 6, "done_today": 12, "done_15m": 0}
    assert pool_split["macbook"] == expected_macbook
    assert pool_split["macmini"] == expected_macmini
|
||||
|
||||
|
||||
def test_rows_to_summarize_split_empty():
    """No rows still yields both machine entries with zero counts."""
    pool_split = rows_to_summarize_split([])
    assert pool_split["macbook"]["done_1h"] == 0
    assert pool_split["macmini"]["done_today"] == 0
|
||||
|
||||
|
||||
def test_compose_overview_contract_shape():
    """Pin the response dict keys so they match the FE contract shape exactly."""
    reference = datetime(2026, 6, 11, 14, 30, tzinfo=KST)
    stage_stats = {"summarize": _stage(pending=1)}
    overview = compose_overview(
        stage_stats,
        _split(),
        {},
        {},
        [],
        deep_enabled=True,
        now_kst=reference,
    )

    top_level_keys = {"machines", "stages", "summarize_eta", "trend_24h", "totals"}
    assert set(overview.keys()) == top_level_keys

    machine_cards = overview["machines"]
    assert [card["key"] for card in machine_cards] == ["gpu", "macmini", "macbook"]

    machine_fields = {
        "key", "label", "state", "stages", "pending", "processing", "failed",
        "done_1h", "done_today", "deferred_pending", "current",
    }
    allowed_states = ("active", "deferred", "idle")
    for card in machine_cards:
        assert set(card.keys()) == machine_fields
        assert card["state"] in allowed_states

    eta_fields = {"pending", "done_rate_1h", "inflow_rate_1h", "eta_minutes"}
    assert set(overview["summarize_eta"].keys()) == eta_fields

    trend = overview["trend_24h"]
    assert len(trend) == 24
    assert set(trend[0].keys()) == {"hour", "inflow", "done"}

    assert set(overview["totals"].keys()) == {"pending", "processing", "failed"}

    # Machine labels are fixed (never expose raw model names — labels only).
    labels = [card["label"] for card in overview["machines"]]
    assert labels == ["GPU 서버", "맥미니", "맥북 M5 Max"]
|
||||
|
||||
|
||||
# ─── build_stages (단계별 현황 — 2026-06-11 사용자 피드백: 완료 가시화) ──────
|
||||
|
||||
def test_build_stages_order_fields_and_age():
    """build_stages rows follow pipeline order and expose counts plus oldest-pending age."""
    from datetime import timedelta, timezone
    from services.queue_overview import build_stages

    reference = datetime(2026, 6, 11, 14, 0, tzinfo=timezone.utc)

    summarize_stats = dict(_stage(pending=5, done_today=12))
    summarize_stats["oldest_pending_at"] = reference - timedelta(hours=4)
    # summarize is inserted first on purpose; the result must still start with extract.
    stage_stats = {
        "summarize": summarize_stats,
        "extract": _stage(failed=2),
    }

    stage_rows = build_stages(stage_stats, now=reference)
    by_stage = {row["stage"]: row for row in stage_rows}

    # Pipeline order: extract comes before summarize.
    assert stage_rows[0]["stage"] == "extract"

    summarize_row = by_stage["summarize"]
    assert summarize_row["pending"] == 5
    assert summarize_row["done_today"] == 12
    assert summarize_row["oldest_pending_age_sec"] == 4 * 3600

    extract_row = by_stage["extract"]
    assert extract_row["failed"] == 2
    assert extract_row["oldest_pending_age_sec"] is None

    # Rows exist for every stage (hiding empty stages is the FE's job).
    expected_fields = {
        "stage", "pending", "processing", "failed", "done_today",
        "oldest_pending_age_sec",
    }
    assert expected_fields == set(stage_rows[0].keys())
|
||||
Reference in New Issue
Block a user