From 95bcdb851b89dc57561a165c62708767c92a34b0 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Mon, 27 Apr 2026 08:25:12 +0900 Subject: [PATCH] =?UTF-8?q?fix(ops):=20backfill=20=EC=BF=BC=EB=A6=AC?= =?UTF-8?q?=EC=97=90=20=EB=B9=88=20extracted=5Ftext=20=EC=A0=9C=EC=99=B8?= =?UTF-8?q?=20=E2=80=94=20=EB=AC=B4=ED=95=9C=20retry=20=EB=B0=A9=EC=A7=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3일 운영 결과 doc 4811, 5181 가 extracted_text='' (빈 문자열) 인데 IS NOT NULL 만 걸려 enqueue → classify_worker 의 not doc.extracted_text truthy 체크에서 ValueError → max_attempts(3) 도달 → status=failed. 다음 backfill 사이클에서 다시 enqueue 되어 12회 반복, failed 24건 누적. 수정: tier_backfill.py + backfill_tier.py 양쪽 SQL 에 LENGTH(extracted_text) > 0 추가. 빈 문자열 문서는 enqueue 자체에서 제외. 기존 failed 24건 정리 SQL (사용자가 수동 실행): DELETE FROM processing_queue WHERE stage='classify' AND status='failed' AND error_message LIKE '%extracted_text%'; Co-Authored-By: Claude Opus 4.7 (1M context) --- app/workers/tier_backfill.py | 7 ++++++- scripts/backfill_tier.py | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/app/workers/tier_backfill.py b/app/workers/tier_backfill.py index 3da1493..f2f8ec0 100644 --- a/app/workers/tier_backfill.py +++ b/app/workers/tier_backfill.py @@ -61,13 +61,18 @@ async def _classify_pending(session: AsyncSession) -> int: async def _enqueue_domain(session: AsyncSession, filter_clause: str, limit: int) -> int: - """도메인 조건 + NULL tier 문서 limit 건 classify 큐에 enqueue. 반환 = 실제 enqueue 수.""" + """도메인 조건 + NULL tier 문서 limit 건 classify 큐에 enqueue. 반환 = 실제 enqueue 수. + + extracted_text 빈 문자열 (LENGTH=0) 도 제외 — classify_worker 는 not doc.extracted_text + truthy 체크라 빈 문자열에서 ValueError raise. 무한 retry 루프 방지. + """ sql = text(f""" INSERT INTO processing_queue (document_id, stage, status, attempts, max_attempts) SELECT id, 'classify', 'pending', 0, 3 FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter_clause} ORDER BY created_at DESC diff --git a/scripts/backfill_tier.py b/scripts/backfill_tier.py index ece998d..e19ad85 100644 --- a/scripts/backfill_tier.py +++ b/scripts/backfill_tier.py @@ -59,6 +59,7 @@ SELECT COUNT(*) FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter} """ @@ -69,6 +70,7 @@ SELECT id, LEFT(title, 60) AS title, ai_domain, source_channel, FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter} ORDER BY created_at DESC @@ -81,6 +83,7 @@ SELECT id, 'classify', 'pending', 0, 3 FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter} ORDER BY created_at DESC