fix(news): SSRF validation + admin auth + API key masking + collect lock + XML safety

- 신규 url_validator.py: SSRF 차단 (private IP/loopback/link-local/reserved/multicast/CGNAT 블록, HTTPS only)
- require_admin dependency 추가 — 소스 CRUD, /collect, /digest/regenerate에 적용
- User.is_admin 컬럼 + migration 104
- NYT API key 로그 마스킹 (쿼리스트링 제거)
- RSS fetch: redirect 수동 처리(3회, target 재검증), 5MB 크기 제한, content-type 허용목록, feed.bozo 체크
- /collect 재진입 차단 (asyncio.Lock, 단일 인스턴스 한정)
- HTTP feed allowlist (코드 레벨 상수, API 미노출)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hyungi Ahn
2026-04-13 14:32:55 +09:00
parent e405ed3414
commit 5038007998
7 changed files with 188 additions and 22 deletions
+3 -3
View File
@@ -20,7 +20,7 @@ from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from core.auth import get_current_user
from core.auth import get_current_user, require_admin
from core.database import get_session
from models.digest import DigestTopic, GlobalDigest
from models.user import User
@@ -155,9 +155,9 @@ async def get_digest(
@router.post("/regenerate")
async def regenerate(
user: Annotated[User, Depends(get_current_user)],
user: Annotated[User, Depends(require_admin)],
):
"""디버그용 수동 트리거 — 백그라운드 태스크로 워커 실행 (auth 필요)."""
"""수동 트리거 — 백그라운드 태스크로 워커 실행 (admin 필요)."""
from workers.digest_worker import run
asyncio.create_task(run())
+35 -8
View File
@@ -8,7 +8,7 @@ from pydantic import BaseModel
from sqlalchemy import String, select
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from core.auth import get_current_user, require_admin
from core.database import get_session
from models.news_source import NewsSource
from models.user import User
@@ -60,9 +60,14 @@ async def list_sources(
@router.post("/sources")
async def create_source(
body: NewsSourceCreate,
user: Annotated[User, Depends(get_current_user)],
user: Annotated[User, Depends(require_admin)],
session: Annotated[AsyncSession, Depends(get_session)],
):
from core.url_validator import validate_feed_url
try:
validate_feed_url(body.feed_url)
except ValueError as e:
raise HTTPException(status_code=422, detail=f"feed_url 검증 실패: {e}")
source = NewsSource(**body.model_dump())
session.add(source)
await session.commit()
@@ -73,12 +78,18 @@ async def create_source(
async def update_source(
source_id: int,
body: NewsSourceUpdate,
user: Annotated[User, Depends(get_current_user)],
user: Annotated[User, Depends(require_admin)],
session: Annotated[AsyncSession, Depends(get_session)],
):
source = await session.get(NewsSource, source_id)
if not source:
raise HTTPException(status_code=404)
if body.feed_url is not None:
from core.url_validator import validate_feed_url
try:
validate_feed_url(body.feed_url)
except ValueError as e:
raise HTTPException(status_code=422, detail=f"feed_url 검증 실패: {e}")
for field, value in body.model_dump(exclude_unset=True).items():
setattr(source, field, value)
await session.commit()
@@ -88,7 +99,7 @@ async def update_source(
@router.delete("/sources/{source_id}")
async def delete_source(
source_id: int,
user: Annotated[User, Depends(get_current_user)],
user: Annotated[User, Depends(require_admin)],
session: Annotated[AsyncSession, Depends(get_session)],
):
source = await session.get(NewsSource, source_id)
@@ -162,12 +173,28 @@ async def mark_all_read(
return {"marked": result.rowcount}
import asyncio
_collect_lock = asyncio.Lock()
@router.post("/collect")
async def trigger_collect(
user: Annotated[User, Depends(get_current_user)],
user: Annotated[User, Depends(require_admin)],
):
"""수동 수집 트리거"""
"""수동 수집 트리거 (admin 전용).
asyncio.Lock은 단일 프로세스/이벤트루프 기준.
현재 FastAPI 단일 인스턴스 운영이므로 유효하지만,
scale-out 시 DB advisory lock으로 교체 필요.
"""
if _collect_lock.locked():
raise HTTPException(status_code=429, detail="수집이 이미 진행 중입니다")
async def _run_with_lock():
async with _collect_lock:
from workers.news_collector import run
import asyncio
asyncio.create_task(run())
await run()
asyncio.create_task(_run_with_lock())
return {"message": "뉴스 수집 시작됨"}
+14
View File
@@ -83,3 +83,17 @@ async def get_current_user(
detail="유저를 찾을 수 없음",
)
return user
async def require_admin(
    credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Dependency that only lets administrators through.

    Resolves the caller via ``get_current_user`` (so any error that function
    raises propagates unchanged) and then checks the ``is_admin`` flag.
    Used for news source CRUD, the collect trigger and digest regeneration.

    Returns:
        The authenticated user when ``is_admin`` is truthy.

    Raises:
        HTTPException: 403 when the authenticated user is not an admin.
    """
    current = await get_current_user(credentials, session)
    if current.is_admin:
        return current
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="관리자 권한 필요",
    )
+61
View File
@@ -0,0 +1,61 @@
"""외부 피드 URL 검증 — SSRF 차단 + redirect target 재검증
등록 시 validate_feed_url()로 1차 검증, fetch 시 redirect target마다
동일 함수로 재검증. 완전한 TOCTOU 방어는 httpx transport 레벨 후킹이
필요하므로 이 이중 검증이 현재 현실적 상한선.
"""
import ipaddress
import socket
from urllib.parse import urlparse
# URL schemes accepted by default.
ALLOWED_SCHEMES = {"https"}
# HTTP exception domains — hosts not listed here are HTTPS-only.
# When adding an entry, record the reason, approval date and review date in a comment.
HTTP_EXCEPTION_DOMAINS: set[str] = set()
# e.g. {"www.chinadaily.com.cn"}  # approved 2026-04-14, confirmed no HTTPS support, review 2026-07
def _is_blocked_ip(ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
"""ipaddress 내장 속성으로 넓게 차단 (단순 대역 비교보다 안전)"""
return (
ip.is_private
or ip.is_loopback
or ip.is_link_local
or ip.is_reserved
or ip.is_multicast
or ip.is_unspecified
# Tailscale CGNAT 대역 (is_private에 포함 안 됨)
or ip in ipaddress.ip_network("100.64.0.0/10")
)
def validate_feed_url(url: str, allow_http: bool = False) -> str:
    """Validate an external feed URL against SSRF and return it unchanged.

    Checks, in order: the scheme, the presence of a hostname, and that every
    address the hostname resolves to is outside the blocked ranges.

    Plain ``http`` is accepted only when the caller opts in with
    ``allow_http=True`` AND the host is listed in ``HTTP_EXCEPTION_DOMAINS``.
    Enforcing the allowlist here (not only at the call site) keeps redirect
    targets from downgrading to HTTP on an arbitrary host. ``allow_http``
    must never be exposed as an API parameter.

    Args:
        url: The feed URL to validate.
        allow_http: Permit ``http`` for allowlisted hosts.

    Returns:
        The original ``url`` when all checks pass.

    Raises:
        ValueError: On a disallowed scheme, a missing hostname, a DNS
            resolution failure, or a hostname resolving to a blocked address.
    """
    parsed = urlparse(url)
    host = parsed.hostname

    http_exception = (
        allow_http
        and parsed.scheme == "http"
        and host in HTTP_EXCEPTION_DOMAINS
    )
    if parsed.scheme not in ALLOWED_SCHEMES and not http_exception:
        raise ValueError(f"허용되지 않은 스킴: {parsed.scheme}")
    if not host:
        raise ValueError("호스트명 누락")

    # Resolve, then reject if ANY returned address is blocked.
    # NOTE(review): getaddrinfo is a blocking call; callers on an event loop
    # may want to run this validator in a thread.
    try:
        addrs = socket.getaddrinfo(host, None)
    except (socket.gaierror, UnicodeError) as exc:
        # UnicodeError: the hostname could not be IDNA-encoded.
        raise ValueError(f"DNS 해석 실패: {host}") from exc

    for *_, sockaddr in addrs:
        if _is_blocked_ip(ipaddress.ip_address(sockaddr[0])):
            # Do not expose the resolved IP in the error — hostname only.
            raise ValueError(f"차단된 네트워크: {host}")
    return url
+1
View File
@@ -16,6 +16,7 @@ class User(Base):
password_hash: Mapped[str] = mapped_column(Text, nullable=False)
totp_secret: Mapped[str | None] = mapped_column(String(64))
is_active: Mapped[bool] = mapped_column(Boolean, default=True)
is_admin: Mapped[bool] = mapped_column(Boolean, default=False, server_default="false")
created_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), default=datetime.now
)
+65 -4
View File
@@ -103,13 +103,64 @@ async def run():
logger.info(f"뉴스 수집 완료: {total}건 신규")
# Upper bound, in bytes, on an RSS response body accepted by the fetcher.
MAX_RESPONSE_SIZE = 5 * 1024 * 1024 # 5MB
# Media types accepted for a feed response; each is matched as a substring
# of the lower-cased Content-Type header.
ALLOWED_CONTENT_TYPES = ("application/rss+xml", "application/atom+xml",
"application/xml", "text/xml")
async def _fetch_rss(session, source: NewsSource) -> int:
"""RSS 피드 수집"""
async with httpx.AsyncClient(timeout=10) as client:
"""RSS 피드 수집 — redirect 재검증 + 크기/content-type 제한"""
from urllib.parse import urljoin
from core.url_validator import validate_feed_url, HTTP_EXCEPTION_DOMAINS
# HTTP allowlist 체크
if source.feed_url.startswith("http://"):
hostname = urlparse(source.feed_url).hostname
if hostname not in HTTP_EXCEPTION_DOMAINS:
logger.error(f"[{source.name}] HTTP 차단 (allowlist 미등록): {hostname}")
return 0
# fetch 전 URL 재검증 (등록 이후 DNS 변경 대비)
try:
validate_feed_url(source.feed_url, allow_http=source.feed_url.startswith("http://"))
except ValueError as e:
logger.error(f"[{source.name}] URL 검증 실패: {e}")
return 0
async with httpx.AsyncClient(timeout=10, follow_redirects=False) as client:
resp = await client.get(source.feed_url)
# redirect 수동 처리 (최대 3회, 각 target 재검증)
redirects = 0
while resp.is_redirect and redirects < 3:
location = resp.headers.get("location", "")
location = urljoin(str(resp.request.url), location)
try:
validate_feed_url(location, allow_http=source.feed_url.startswith("http://"))
except ValueError as e:
logger.error(f"[{source.name}] redirect target 차단: {e}")
return 0
resp = await client.get(location)
redirects += 1
if resp.is_redirect:
logger.error(f"[{source.name}] redirect 3회 초과")
return 0
resp.raise_for_status()
if len(resp.content) > MAX_RESPONSE_SIZE:
logger.warning(f"[{source.name}] 응답 크기 초과: {len(resp.content)} bytes")
return 0
ct = resp.headers.get("content-type", "").lower()
if not any(t in ct for t in ALLOWED_CONTENT_TYPES):
logger.warning(f"[{source.name}] 비정상 content-type: {ct}")
return 0
feed = feedparser.parse(resp.text)
if feed.bozo and not feed.entries:
logger.warning(f"[{source.name}] RSS 파싱 실패: {feed.bozo_exception}")
return 0
count = 0
for entry in feed.entries:
@@ -175,19 +226,29 @@ async def _fetch_rss(session, source: NewsSource) -> int:
async def _fetch_api(session, source: NewsSource) -> int:
"""NYT API 수집"""
"""NYT API 수집 — 키 마스킹 + health degradation"""
import os
nyt_key = os.getenv("NYT_API_KEY", "")
if not nyt_key:
logger.warning("NYT_API_KEY 미설정")
logger.error("NYT_API_KEY 미설정 — US 뉴스 수집 불가")
return 0
try:
async with httpx.AsyncClient(timeout=10) as client:
resp = await client.get(
f"https://api.nytimes.com/svc/topstories/v2/{source.category or 'world'}.json",
params={"api-key": nyt_key},
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
# 쿼리스트링(api-key 포함) 제거 — path까지만 로깅
safe_url = str(e.request.url).split("?")[0]
logger.error(f"NYT API 실패: {e.response.status_code} @ {safe_url}")
return 0
except httpx.RequestError as e:
safe_url = str(e.request.url).split("?")[0] if e.request else "unknown"
logger.error(f"NYT API 연결 실패: {safe_url}")
return 0
data = resp.json()
count = 0
+2
View File
@@ -0,0 +1,2 @@
-- Add the admin-privilege column (news source CRUD, collect trigger, digest regeneration, etc.)
ALTER TABLE users ADD COLUMN IF NOT EXISTS is_admin BOOLEAN NOT NULL DEFAULT false;