diff --git a/app/workers/deep_summary_worker.py b/app/workers/deep_summary_worker.py index 5eddc4b..08ff909 100644 --- a/app/workers/deep_summary_worker.py +++ b/app/workers/deep_summary_worker.py @@ -121,8 +121,13 @@ async def process(document_id: int, session: AsyncSession) -> None: try: # parse_json_response 는 중첩 JSON (entities_confirmed) 을 최외곽으로 오인하는 # 케이스가 있어 — deep_summary 응답에서 자주 발생 — 최외곽 추출 전용 helper 사용. - parsed = _parse_outermost_json(raw) or parse_json_response(raw) or {} - deep_out = DeepSummaryOutput.model_validate(parsed) + parsed = _parse_outermost_json(raw) or parse_json_response(raw) + if not parsed: + # 잘린 응답 fallback — field-level regex 로 detail/tldr/inconsistencies 추출 + parsed = _regex_extract_fields(raw) + deep_out = DeepSummaryOutput.model_validate(parsed or {}) + if not deep_out.detail and parsed and parsed.get("_fallback"): + logger.info(f"[deep] id={document_id} regex fallback parsed keys={list(parsed.keys())}") except (ValidationError, ValueError, TypeError) as exc: parse_error = f"parse:{type(exc).__name__}" logger.warning(f"[deep] JSON 파싱/검증 실패 id={document_id}: {exc}") @@ -248,6 +253,51 @@ def _parse_outermost_json(raw: str) -> dict | None: return None +def _regex_extract_fields(raw: str) -> dict: + """JSON parse 실패 시 field-level regex 로 detail/tldr/mode/inconsistencies 추출. + + 응답이 잘렸거나 중간에 문자열이 끊긴 경우에도 앞쪽에 완결된 필드는 건진다. + `"detail": "…"` 처럼 key-value 쌍을 개별 매칭. + """ + def _str_field(key: str) -> str | None: + m = re.search(rf'"{key}"\s*:\s*"((?:[^"\\]|\\.)*)"', raw, re.DOTALL) + if not m: + return None + try: + # JSON string escape 복원 (\n, \\, \" 등) + return json.loads('"' + m.group(1) + '"') + except json.JSONDecodeError: + return m.group(1) + + def _arr_field(key: str) -> list | None: + # 단순 문자열 배열만 지원 — bullets / inconsistencies_desc 등 + m = re.search(rf'"{key}"\s*:\s*(\[[^\]]*\])', raw, re.DOTALL) + if not m: + return None + try: + return json.loads(m.group(1)) + except json.JSONDecodeError: + return None + + out: dict = {"_fallback": True} + mode = _str_field("mode") + if mode: + out["mode"] = mode + tldr = _str_field("tldr") + if tldr: + out["tldr"] = tldr + detail = _str_field("detail") + if detail: + out["detail"] = detail + bullets = _arr_field("bullets") + if bullets is not None: + out["bullets"] = bullets + inc = _arr_field("inconsistencies") + if inc is not None: + out["inconsistencies"] = inc + return out + + def _filter_inconsistencies(items: list) -> list[dict]: """허용 kind 목록 (safety/news 도메인 한정) 만 통과시킨다.