From 0e8d5cccafbda591f6327f7be6abc4fb90105ab9 Mon Sep 17 00:00:00 2001
From: hyungi
Date: Sat, 23 May 2026 07:08:23 +0000
Subject: [PATCH] =?UTF-8?q?feat(worker):=20summarize=20sliding=20window=20?=
 =?UTF-8?q?=E2=80=94=2050k=20chunk=20+=20cumulative=20carry-over?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P3 of family-adaptive-bengio (Mac mini 4-lever bundle).

50k 초과 input 은 CHUNK_SIZE=50000 단위로 N 분할 + cumulative carry-over
(prev chunk summary 를 다음 chunk prompt 에 prefix).
50k 이하 input = 기존 동작 (변동 0).
첫 chunk = client.summarize() legacy / 후속 chunk = call_primary +
SUMMARY_PROMPT_CONTINUATION.
log trace: single vs sliding chunk N/M done.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
Review notes (commentary only; not part of the commit message or the diff):
  * Formatting: this patch had been flattened onto two physical lines. Line
    breaks and Python indentation are restored here. The restored hunks
    match the hunk headers (1,4 -> 1,8 / 10,6 -> 14,12 / 26,10 -> 36,38) and
    the diffstat (41 insertions, 3 deletions). Blank context lines,
    indentation depth and commit-message line breaks are inferred, not
    copied; verify against the target file before applying.
  * The "From:" header and the "Co-Authored-By:" trailer carry no e-mail
    address in this copy; presumably stripped in transit. Restore them
    before feeding this file to git am.
  * Chunking uses fixed, non-overlapping character slices of CHUNK_SIZE, so a
    slice boundary can fall in the middle of a sentence.
  * For chunks 2..N the prompt is the previous summary + the whole chunk +
    the instruction text, so the string passed to call_primary() is longer
    than CHUNK_SIZE characters.
  * Inputs longer than 50000 characters used to be truncated to the first
    50000. They now take one sequential model call per chunk, and the
    number of chunks has no upper bound.
  * strip_thinking() runs on every partial summary and once more on the
    final value.
  * len(text) requires doc.extracted_text to be a non-None value; presumably
    guarded in the part of process() that lies outside these hunks. Confirm.
  * AIClient.call_primary() is not shown in this diff; its signature and
    return type are assumed to fit this call. Confirm.

 app/workers/summarize_worker.py | 44 ++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/app/workers/summarize_worker.py b/app/workers/summarize_worker.py
index 28f6c9e..6f51ed2 100644
--- a/app/workers/summarize_worker.py
+++ b/app/workers/summarize_worker.py
@@ -1,4 +1,8 @@
-"""요약 전용 워커 — 뉴스 등 classify 불필요한 문서의 AI 요약만 생성"""
+"""요약 전용 워커 — 뉴스 등 classify 불필요한 문서의 AI 요약만 생성.
+
+P3 of family-adaptive-bengio (2026-05-23): 50k 초과 input 은 sliding window
+(cumulative carry-over) 로 분할 처리. 50k 이하 input 은 기존 동작 유지.
+"""
 
 from datetime import datetime, timezone
 
@@ -10,6 +14,12 @@ from models.document import Document
 
 logger = setup_logger("summarize_worker")
 
+CHUNK_SIZE = 50000
+SUMMARY_PROMPT_CONTINUATION = (
+    "이전 부분 요약:\n{prior}\n\n다음 부분:\n{text}\n\n"
+    "위 두 정보를 합쳐 전체 문서를 500자 이내로 요약해주세요."
+)
+
 
 async def process(document_id: int, session: AsyncSession) -> None:
     """문서 AI 요약 생성 (분류 없이 요약만)"""
@@ -26,10 +36,38 @@ async def process(document_id: int, session: AsyncSession) -> None:
 
     client = AIClient()
     try:
-        summary = await client.summarize(doc.extracted_text[:50000])
+        text = doc.extracted_text
+        total_chars = len(text)
+        if total_chars <= CHUNK_SIZE:
+            summary = await client.summarize(text)
+            logger.info(
+                f"[요약] document_id={document_id}: single chunk ({total_chars}자)"
+            )
+        else:
+            chunks = [text[i:i + CHUNK_SIZE] for i in range(0, total_chars, CHUNK_SIZE)]
+            logger.info(
+                f"[요약] document_id={document_id}: sliding window {len(chunks)} chunk "
+                f"(total {total_chars}자, chunk_size={CHUNK_SIZE})"
+            )
+            carry = ""
+            for idx, chunk in enumerate(chunks):
+                if idx == 0:
+                    partial = await client.summarize(chunk)
+                else:
+                    prompt = SUMMARY_PROMPT_CONTINUATION.format(prior=carry, text=chunk)
+                    partial = await client.call_primary(prompt)
+                carry = strip_thinking(partial)
+                logger.info(
+                    f"[요약] document_id={document_id}: chunk {idx + 1}/{len(chunks)} done "
+                    f"(in={len(chunk)}자, carry={len(carry)}자)"
+                )
+            summary = carry
+
         doc.ai_summary = strip_thinking(summary)
         doc.ai_model_version = "qwen3.5-35b-a3b"
         doc.ai_processed_at = datetime.now(timezone.utc)
-        logger.info(f"[요약] document_id={document_id}: {len(doc.ai_summary)}자")
+        logger.info(
+            f"[요약] document_id={document_id}: {len(doc.ai_summary)}자 final"
+        )
     finally:
         await client.close()