diff --git a/app/workers/tier_backfill.py b/app/workers/tier_backfill.py index 3da1493..f2f8ec0 100644 --- a/app/workers/tier_backfill.py +++ b/app/workers/tier_backfill.py @@ -61,13 +61,18 @@ async def _classify_pending(session: AsyncSession) -> int: async def _enqueue_domain(session: AsyncSession, filter_clause: str, limit: int) -> int: - """도메인 조건 + NULL tier 문서 limit 건 classify 큐에 enqueue. 반환 = 실제 enqueue 수.""" + """도메인 조건 + NULL tier 문서 limit 건 classify 큐에 enqueue. 반환 = 실제 enqueue 수. + + extracted_text 빈 문자열 (LENGTH=0) 도 제외 — classify_worker 는 not doc.extracted_text + truthy 체크라 빈 문자열에서 ValueError raise. 무한 retry 루프 방지. + """ sql = text(f""" INSERT INTO processing_queue (document_id, stage, status, attempts, max_attempts) SELECT id, 'classify', 'pending', 0, 3 FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter_clause} ORDER BY created_at DESC diff --git a/scripts/backfill_tier.py b/scripts/backfill_tier.py index ece998d..e19ad85 100644 --- a/scripts/backfill_tier.py +++ b/scripts/backfill_tier.py @@ -59,6 +59,7 @@ SELECT COUNT(*) FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter} """ @@ -69,6 +70,7 @@ SELECT id, LEFT(title, 60) AS title, ai_domain, source_channel, FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter} ORDER BY created_at DESC @@ -81,6 +83,7 @@ SELECT id, 'classify', 'pending', 0, 3 FROM documents WHERE deleted_at IS NULL AND extracted_text IS NOT NULL + AND LENGTH(extracted_text) > 0 AND ai_analysis_tier IS NULL AND {filter} ORDER BY created_at DESC