118f32f9b1
PR #20 (2026-05-14, GPU LLM 제거 + Mac mini 26B MLX 흡수) 의 swap 이 backends.json + 코드 주석/docstring 까지 따라가지 못한 표현 잔재 정리. - app/ai/client.py: AIClient docstring 및 call_triage / call_fallback docstring 의 "4B Ollama" → "Mac mini 26B MLX" / "현재는 triage 와 동일 엔드포인트" → "Claude Sonnet 4 API (PR #20 swap 완료)" - app/core/config.py: triage/primary/fallback 주석 통합 + Phase 3.5 classifier/verifier 주석에 PR #20 endpoint 명시 (history 보존) - app/services/search/{llm_gate,classifier_service,verifier_service, evidence_service}.py: "fallback(Ollama)" / "Ollama concurrent OK" / "triage(4B Ollama)" 표현을 Mac mini 26B MLX endpoint 기준으로 정정 + concurrent 안전성 별 검토 마커 추가 - app/services/digest/summarizer.py: "MLX hang/Ollama stall 방어" → "MLX hang / fallback Claude API stall 방어" - app/services/prompt_versions.py: SUMMARY_TRIAGE_TASK + ASK_PROMPT_VERSION 주석의 "4B Ollama" / "4B gemma Ollama" → Mac mini 26B MLX - app/workers/classify_worker.py: B-1 tier triage docstring 정정 코드 동작 변경 0 (주석/docstring 만). embed_worker / study_question_embed_worker 의 "Ollama bge-m3" 표현은 사실 정확이라 유지. 검증: - ollama list → bge-m3:latest 잔존 (embedding owner) - /api/embeddings probe → 1024-dim 200 OK - fastapi embed/ollama error 0 (last 10min) - document.hyungi.net 200 plan: ~/.claude/plans/4-stateless-dongarra.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
124 lines
4.1 KiB
Python
124 lines
4.1 KiB
Python
"""Cluster-level LLM 호출 + JSON 파싱 + timeout + drop금지 fallback.
|
|
|
|
Key decisions:
- Call AIClient._call_chat directly (avoids modifying client.py, reuses its fallback logic)
- Semaphore(1) to avoid overloading MLX
- Per-call timeout of 25 seconds (asyncio.wait_for) — guards against an MLX hang / fallback Claude API stall
- JSON parse failure → retry once → if it still fails, return a minimal fallback (never drop the cluster)
- fallback: topic_label="주요 뉴스 묶음", summary = top member ai_summary[:200]
|
|
"""
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from ai.client import parse_json_response
|
|
from core.utils import setup_logger
|
|
|
|
# Module-level logger for the digest summarizer.
logger = setup_logger("digest_summarizer")

# Per-call LLM timeout in seconds (original note: MLX averages ~5s, the rest is
# tail-latency margin).
LLM_CALL_TIMEOUT = 25
# Maximum number of characters of the top member's text kept in a fallback summary.
FALLBACK_SUMMARY_LIMIT = 200

# Single-slot semaphore: at most one LLM call from this module is in flight.
_llm_sem = asyncio.Semaphore(value=1)

# Prompt template file: three directory levels up from this file, then
# prompts/digest_topic.txt.
_PROMPT_PATH = Path(__file__).resolve().parent.parent.parent.joinpath(
    "prompts", "digest_topic.txt"
)
# Cache for the template text; stays None until _load_prompt() first reads it.
_PROMPT_TEMPLATE: str | None = None
|
|
|
|
|
|
def _load_prompt() -> str:
    """Return the digest-topic prompt template, reading the file at most once.

    On the first call the text is read from ``_PROMPT_PATH`` as UTF-8 and stored
    in the module-level ``_PROMPT_TEMPLATE``; later calls return that cached text.
    """
    global _PROMPT_TEMPLATE
    cached = _PROMPT_TEMPLATE
    if cached is not None:
        return cached
    cached = _PROMPT_PATH.read_text(encoding="utf-8")
    _PROMPT_TEMPLATE = cached
    return cached
|
|
|
|
|
|
def build_prompt(selected: list[dict]) -> str:
    """Fill the digest_topic.txt template with the selected articles' text.

    Each article becomes one line of the form ``[i] text`` (numbered from 1).
    The text is the first truthy value among ``ai_summary_truncated``,
    ``ai_summary`` and ``title``, stripped of surrounding whitespace, or an
    empty string when none of them is set. The joined lines replace the
    ``{articles_block}`` placeholder in the template.
    """

    def _article_text(article: dict) -> str:
        # Same preference order as the placeholder description above.
        for field in ("ai_summary_truncated", "ai_summary", "title"):
            value = article.get(field)
            if value:
                return value.strip()
        return ""

    prompt_template = _load_prompt()
    numbered_lines = (
        f"[{position}] {_article_text(article)}"
        for position, article in enumerate(selected, start=1)
    )
    return prompt_template.replace("{articles_block}", "\n".join(numbered_lines))
|
|
|
|
|
|
async def _try_call_llm(client: Any, prompt: str) -> str:
    """Run a single chat call under the module semaphore with a per-call timeout.

    ``asyncio.wait_for`` applies ``LLM_CALL_TIMEOUT`` seconds to the call itself;
    it starts only after the semaphore is acquired, so time spent waiting for
    the semaphore is not counted against the timeout.
    """
    async with _llm_sem:
        # Build the coroutine only once the slot is held.
        pending_call = client._call_chat(client.ai.primary, prompt)
        reply = await asyncio.wait_for(pending_call, timeout=LLM_CALL_TIMEOUT)
    return reply
|
|
|
|
|
|
def _make_fallback(cluster: dict) -> dict:
|
|
"""cluster 의 top member 데이터로 minimal fallback 생성 — 정보 손실 회피."""
|
|
members = cluster["members"]
|
|
if not members:
|
|
return {
|
|
"topic_label": "주요 뉴스 묶음",
|
|
"summary": "",
|
|
"llm_fallback_used": True,
|
|
}
|
|
top = max(members, key=lambda m: m.get("_rel", m.get("weight", 0.0)))
|
|
text = (top.get("ai_summary") or top.get("title") or "").strip()
|
|
return {
|
|
"topic_label": "주요 뉴스 묶음",
|
|
"summary": text[:FALLBACK_SUMMARY_LIMIT],
|
|
"llm_fallback_used": True,
|
|
}
|
|
|
|
|
|
async def summarize_cluster_with_fallback(
    client: Any,
    cluster: dict,
    selected: list[dict],
) -> dict:
    """Summarize one cluster via the LLM, falling back to a minimal result.

    The prompt is built from ``selected`` and sent up to two times (one retry).
    An attempt fails when the call times out, raises, or its parsed output is
    not a dict with non-blank string ``topic_label`` and ``summary``. When both
    attempts fail, ``_make_fallback(cluster)`` is returned instead.

    Args:
        client: AI client object passed through to ``_try_call_llm``.
        cluster: Cluster dict; ``members`` is used for log context and for the
            fallback result.
        selected: Articles whose text is injected into the prompt.

    Returns:
        ``{topic_label, summary, llm_fallback_used}``; ``llm_fallback_used`` is
        False only when the LLM output was accepted.
    """
    prompt = build_prompt(selected)
    # Computed once for log context; tolerate a missing/None "members" so that
    # logging a failure can never be what prevents the fallback.
    cluster_size = len(cluster.get("members") or [])

    for attempt in range(2):  # initial attempt + one retry
        try:
            raw = await _try_call_llm(client, prompt)
        except asyncio.TimeoutError:
            logger.warning(
                f"LLM 호출 timeout {LLM_CALL_TIMEOUT}s "
                f"(attempt={attempt + 1}, cluster size={cluster_size})"
            )
            continue
        except Exception as e:
            logger.warning(
                f"LLM 호출 실패 attempt={attempt + 1} "
                f"(cluster size={cluster_size}): {e}"
            )
            continue

        parsed = parse_json_response(raw)
        # NOTE(review): parse_json_response is not visible here; presumably it
        # returns a dict or a falsy value. The isinstance guard keeps any other
        # payload (e.g. a JSON array) from raising on .get() outside the
        # try block, which would skip both the retry and the fallback.
        if isinstance(parsed, dict):
            topic_label = parsed.get("topic_label")
            summary = parsed.get("summary")
            if (
                isinstance(topic_label, str)
                and isinstance(summary, str)
                and topic_label.strip()
                and summary.strip()
            ):
                return {
                    "topic_label": topic_label.strip(),
                    "summary": summary.strip(),
                    "llm_fallback_used": False,
                }

        logger.warning(
            f"JSON 파싱 실패 attempt={attempt + 1} "
            f"(cluster size={cluster_size}, raw_len={len(raw) if raw else 0})"
        )

    return _make_fallback(cluster)
|