diff --git a/app/workers/deep_summary_worker.py b/app/workers/deep_summary_worker.py
index c644609..5eddc4b 100644
--- a/app/workers/deep_summary_worker.py
+++ b/app/workers/deep_summary_worker.py
@@ -18,7 +18,9 @@
 from pydantic import BaseModel, Field, ValidationError
 from sqlalchemy import desc, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from ai.client import AIClient, parse_json_response
+import json
+import re
+from ai.client import AIClient, parse_json_response, strip_thinking
 from ai.envelope import EscalationEnvelope
 from core.config import settings
 from core.utils import setup_logger
@@ -117,7 +119,9 @@ async def process(document_id: int, session: AsyncSession) -> None:
 
     if raw:
         try:
-            parsed = parse_json_response(raw) or {}
+            # parse_json_response can mistake a nested object (entities_confirmed) for the
+            # outermost one (common in deep_summary responses), so try the dedicated helper first.
+            parsed = _parse_outermost_json(raw) or parse_json_response(raw) or {}
             deep_out = DeepSummaryOutput.model_validate(parsed)
         except (ValidationError, ValueError, TypeError) as exc:
             parse_error = f"parse:{type(exc).__name__}"
@@ -190,6 +194,83 @@ def _build_text_slices(text: str, pointers: dict) -> str:
     return "\n\n".join(parts)
 
 
+def _parse_outermost_json(raw: str) -> dict | None:
+    """Extract and parse the outermost JSON object from a model response.
+
+    Scans from the first ``{`` and tracks open ``{``/``[`` containers,
+    skipping brackets inside string literals, until the top-level object
+    closes. This works around ``parse_json_response``, whose regex matches
+    only one level of nesting and so returns an inner object (e.g.
+    ``entities_confirmed``) instead of the outermost one for deep_summary
+    responses.
+
+    If the response was cut off before the object closed, a best-effort
+    repair is attempted: an unterminated string is closed, a trailing comma
+    is dropped, and the missing ``]``/``}`` closers are appended innermost
+    first.
+
+    Args:
+        raw: Raw model output; passed through ``strip_thinking`` first.
+
+    Returns:
+        The parsed object, or ``None`` if there is no ``{`` or neither the
+        balanced span nor the repaired text is valid JSON.
+    """
+    cleaned = strip_thinking(raw)
+    # When a code fence is present, scan from the first `{` that follows
+    # the opening fence instead of from the start of the whole response.
+    code_match = re.search(r"```(?:json)?\s*(\{.*)", cleaned, re.DOTALL)
+    if code_match:
+        cleaned = code_match.group(1)
+    start = cleaned.find("{")
+    if start < 0:
+        return None
+    text = cleaned[start:]
+
+    closer_for = {"{": "}", "[": "]"}
+    pending: list[str] = []  # closers still owed, innermost last
+    in_str = False
+    esc = False
+    end = -1
+    for i, ch in enumerate(text):
+        if in_str:
+            # Backslash escapes only have meaning inside a string literal.
+            if esc:
+                esc = False
+            elif ch == "\\":
+                esc = True
+            elif ch == '"':
+                in_str = False
+        elif ch == '"':
+            in_str = True
+        elif ch in closer_for:
+            pending.append(closer_for[ch])
+        elif ch in "}]":
+            if pending.pop() != ch:
+                return None  # mismatched closer: cannot be valid JSON
+            if not pending:
+                end = i + 1
+                break
+
+    if end > 0:
+        try:
+            return json.loads(text[:end])
+        except json.JSONDecodeError:
+            return None
+
+    # Truncated response: the scan ended with containers still open.
+    if in_str:
+        # Drop a dangling backslash, then terminate the cut-off string.
+        candidate = (text[:-1] if esc else text) + '"'
+    else:
+        candidate = text.rstrip().rstrip(",")
+    candidate += "".join(reversed(pending))
+    try:
+        return json.loads(candidate)
+    except json.JSONDecodeError:
+        return None
+
+
 def _filter_inconsistencies(items: list) -> list[dict]:
     """허용 kind 목록 (safety/news 도메인 한정) 만 통과시킨다.