refactor(news): 3 fetch Document 빌드 _build_news_doc 헬퍼 통합 (R11c)

_fetch_rss/_fetch_api_guardian/_fetch_api_nyt 의 22필드 Document 빌드가 정적 동일 (필드키 22개 동순서 실측) — 채널별 차이는 body(NYT=summary)·extractor_version·ident(category 계산)뿐이라 인자화. _build_news_doc 헬퍼로 통합 = 동작 보존(정적 검증). news_collector god-file 중복 30줄×3 → 1 헬퍼. 검증: py_compile 통과, doc=Document( 직접빌드 0건. ★채널별 ingest smoke(staging)로 3 경로 동등 확인 권장. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-16 14:39:27 +09:00
parent 205a7bf3d5
commit 523c509954
1 changed files with 43 additions and 80 deletions
@@ -412,6 +412,40 @@ async def _already_ingested(session, article_id: str, normalized_url: str, link:
    return existing.scalars().first() is not None


+def _build_news_doc(source, ident, source_short, article_id, title, body,
+                    extractor_version, normalized_url, pub_dt) -> Document:
+    """Build the news Document shared by the 3 fetchers (R11c). Channels differ only via arguments:
+    body (NYT passes its summary), extractor_version, ident (caller-computed category); the 22 fields
+    are otherwise identical. edit_url is stored normalized like lookups (raw would defeat URL dedup)."""
+    return Document(
+        file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
+        file_hash=article_id,
+        file_format="article",
+        file_size=len(body.encode()),
+        file_type="note",
+        title=title,
+        extracted_text=f"{title}\n\n{body}",
+        extracted_at=datetime.now(timezone.utc),
+        extractor_version=extractor_version,
+        # Articles are text-native: set terminal 'skipped' at creation (no markdown conversion; the
+        # default 'pending' never converges, polluting backlog metrics). Page policy: fulltext_worker promotes.
+        md_status="skipped",
+        md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
+        source_channel=source.source_channel,
+        data_origin="external",
+        edit_url=normalized_url,
+        review_status="approved",
+        ai_domain=ident["ai_domain"],
+        ai_sub_group=source_short,
+        ai_tags=ident["ai_tags"],
+        # Safety archive A-2 — deterministic from the registry (classify-skip path, so set at ingest time)
+        material_type=ident["material_type"],
+        jurisdiction=ident["jurisdiction"],
+        published_date=pub_dt.date() if pub_dt else None,
+        extract_meta=_build_extract_meta(source, pub_dt),
+    )
+
+
 async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]:
    """RSS 피드 수집 — redirect 재검증 + 크기/content-type 제한 + 조건부 GET (A-1).

@@ -542,35 +576,9 @@ async def _fetch_rss(session, source: NewsSource) -> tuple[int, str]:
        source_short = source.name.split(" ")[0]  # "경향신문 문화" → "경향신문"
        ident = _doc_identity(source, source_short, category)

-        doc = Document(
-            file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
-            file_hash=article_id,
-            file_format="article",
-            file_size=len(body.encode()),
-            file_type="note",
-            title=title,
-            extracted_text=f"{title}\n\n{body}",
-            extracted_at=datetime.now(timezone.utc),
-            extractor_version=extractor_version,
-            # article = 텍스트 네이티브(본문=extracted_text). markdown 단계 미enqueue 라
-            # 기본값 'pending' 이면 영구 비수렴 → backlog 지표 오염 + md_status_pending partial
-            # 인덱스 비대. 생성 시점에 terminal 'skipped' 로 명시(변환 비대상).
-            # fulltext_policy='page' 소스는 fulltext_worker 가 승격 시 success 로 갱신.
-            md_status="skipped",
-            md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
-            source_channel=source.source_channel,
-            data_origin="external",
-            # 조회와 동일하게 정규화해 저장 — raw(tracking param 포함) 저장 시 URL dedup 무력화
-            edit_url=normalized_url,
-            review_status="approved",
-            ai_domain=ident["ai_domain"],
-            ai_sub_group=source_short,
-            ai_tags=ident["ai_tags"],
-            # 안전 자료실 A-2 — 레지스트리 deterministic (classify-skip 경로라 ingest 시점 필수)
-            material_type=ident["material_type"],
-            jurisdiction=ident["jurisdiction"],
-            published_date=pub_dt.date() if pub_dt else None,
-            extract_meta=_build_extract_meta(source, pub_dt),
+        doc = _build_news_doc(
+            source, ident, source_short, article_id, title, body,
+            extractor_version, normalized_url, pub_dt,
        )
        session.add(doc)
        await session.flush()
@@ -678,30 +686,9 @@ async def _fetch_api_guardian(session, source: NewsSource) -> tuple[int, str]:
        source_short = source.name.split(" ")[0]
        ident = _doc_identity(source, source_short, category)

-        doc = Document(
-            file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
-            file_hash=article_id,
-            file_format="article",
-            file_size=len(body.encode()),
-            file_type="note",
-            title=title,
-            extracted_text=f"{title}\n\n{body}",
-            extracted_at=datetime.now(timezone.utc),
-            extractor_version="guardian_api_full" if is_full else "guardian_api",
-            md_status="skipped",
-            md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
-            source_channel=source.source_channel,
-            data_origin="external",
-            edit_url=normalized_url,
-            review_status="approved",
-            ai_domain=ident["ai_domain"],
-            ai_sub_group=source_short,
-            ai_tags=ident["ai_tags"],
-            # 안전 자료실 A-2 — 레지스트리 deterministic (classify-skip 경로라 ingest 시점 필수)
-            material_type=ident["material_type"],
-            jurisdiction=ident["jurisdiction"],
-            published_date=pub_dt.date() if pub_dt else None,
-            extract_meta=_build_extract_meta(source, pub_dt),
+        doc = _build_news_doc(
+            source, ident, source_short, article_id, title, body,
+            "guardian_api_full" if is_full else "guardian_api", normalized_url, pub_dt,
        )
        session.add(doc)
        await session.flush()
@@ -769,33 +756,9 @@ async def _fetch_api_nyt(session, source: NewsSource) -> tuple[int, str]:
        source_short = source.name.split(" ")[0]

        ident = _doc_identity(source, source_short, category)
-        doc = Document(
-            file_path=f"{ident['path_prefix']}/{source.name}/{article_id}",
-            file_hash=article_id,
-            file_format="article",
-            file_size=len(summary.encode()),
-            file_type="note",
-            title=title,
-            extracted_text=f"{title}\n\n{summary}",
-            extracted_at=datetime.now(timezone.utc),
-            extractor_version="nyt_api",
-            # article = 텍스트 네이티브(본문=extracted_text). markdown 단계 미enqueue 라
-            # 기본값 'pending' 이면 영구 비수렴 → backlog 지표 오염 + md_status_pending partial
-            # 인덱스 비대. 생성 시점에 terminal 'skipped' 로 명시(변환 비대상).
-            md_status="skipped",
-            md_extraction_error="news article: 텍스트 네이티브, markdown 변환 비대상",
-            source_channel=source.source_channel,
-            data_origin="external",
-            edit_url=normalized_url,
-            review_status="approved",
-            ai_domain=ident["ai_domain"],
-            ai_sub_group=source_short,
-            ai_tags=ident["ai_tags"],
-            # 안전 자료실 A-2 — 레지스트리 deterministic (classify-skip 경로라 ingest 시점 필수)
-            material_type=ident["material_type"],
-            jurisdiction=ident["jurisdiction"],
-            published_date=pub_dt.date() if pub_dt else None,
-            extract_meta=_build_extract_meta(source, pub_dt),
+        doc = _build_news_doc(
+            source, ident, source_short, article_id, title, summary,
+            "nyt_api", normalized_url, pub_dt,
        )
        session.add(doc)
        await session.flush()