diff --git a/app/api/queue_overview.py b/app/api/queue_overview.py index 9b93366..647433a 100644 --- a/app/api/queue_overview.py +++ b/app/api/queue_overview.py @@ -108,6 +108,7 @@ class BackgroundJobItem(BaseModel): stale = running 인데 heartbeat 가 오래 끊김(프로세스 사망 추정).""" id: int kind: str + machine: str label: str | None state: Literal["running", "done", "failed"] processed: int diff --git a/app/core/config.py b/app/core/config.py index 5cf3302..fdb1521 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -169,6 +169,14 @@ class Settings(BaseModel): # 1 = 구 single-inference 동작. 2 = continuous batching 활용 (llm_gate docstring 참조). mlx_gate_concurrency: int = 1 + # digest/briefing 생성 LLM 호출 파라미터 (2026-06-15, 모델 교체 후 타임아웃 단일소스화). + # 구 하드코딩 25s(빠른 Gemma 기준)가 Qwen3.6-27B-6bit(콜당 ~90~300s) 교체 sweep 에서 + # 누락돼 digest 600s 하드캡 초과·briefing 4/4 폴백을 유발 → config 단일소스로 이관. + # 동시성은 별 키 아님 — 전역 mlx_gate_concurrency(게이트 단일 budget)가 담당. + digest_llm_timeout_s: int = 200 + digest_llm_attempts: int = 2 + digest_pipeline_hard_cap_s: int = 1800 + # PR-MacMini-Derived-Worker-1: study explanation owner = Mac mini # GPU 측은 false 로 설정 (.env), explanation 분기 skip guard 트리거. study_explanation_enabled: bool = True @@ -257,6 +265,9 @@ def load_settings() -> Settings: pipeline_held_stages: list[str] = [] mlx_gate_concurrency = 1 + digest_llm_timeout_s = 200 + digest_llm_attempts = 2 + digest_pipeline_hard_cap_s = 1800 if config_path.exists() and raw and "pipeline" in raw: held_raw = (raw.get("pipeline") or {}).get("held_stages") or [] # 스칼라(문자열) 오기입 시 char-split 방지 — 단일 항목 리스트로 수용. @@ -269,6 +280,19 @@ def load_settings() -> Settings: ) except (TypeError, ValueError): mlx_gate_concurrency = 1 + _pl = raw.get("pipeline") or {} + try: + digest_llm_timeout_s = max(1, int(_pl.get("digest_llm_timeout_s", 200))) + except (TypeError, ValueError): + digest_llm_timeout_s = 200 + try: + digest_llm_attempts = max(1, int(_pl.get("digest_llm_attempts", 2))) + except (TypeError, ValueError): + digest_llm_attempts = 2 + try: + digest_pipeline_hard_cap_s = max(60, int(_pl.get("digest_pipeline_hard_cap_s", 1800))) + except (TypeError, ValueError): + digest_pipeline_hard_cap_s = 1800 taxonomy = raw.get("taxonomy", {}) if config_path.exists() and raw else {} document_types = raw.get("document_types", []) if config_path.exists() and raw else [] @@ -300,6 +324,9 @@ def load_settings() -> Settings: internal_worker_token=internal_worker_token, pipeline_held_stages=pipeline_held_stages, mlx_gate_concurrency=mlx_gate_concurrency, + digest_llm_timeout_s=digest_llm_timeout_s, + digest_llm_attempts=digest_llm_attempts, + digest_pipeline_hard_cap_s=digest_pipeline_hard_cap_s, ) diff --git a/app/main.py b/app/main.py index edfa6f7..f4ee993 100644 --- a/app/main.py +++ b/app/main.py @@ -64,7 +64,7 @@ async def lifespan(app: FastAPI): from workers.csb_collector import run as csb_collector_run from workers.api_standards_collector import run as api_standards_run from workers.ccps_collector import run as ccps_collector_run - from workers.queue_consumer import consume_queue, consume_fast_queue, consume_markdown_queue + from workers.queue_consumer import consume_queue, consume_fast_queue, consume_markdown_queue, consume_deep_queue from workers.study_queue_consumer import consume_study_queue from workers.study_session_queue_consumer import consume_study_session_queue from workers.study_memo_card_jobs_consumer import consume_study_memo_card_queue @@ -101,6 +101,8 @@ async def lifespan(app: FastAPI): # 2026-06-12 fast-consumer split: embed/chunk(건당 <1s)를 LLM 사이클에서 분리 — # classify(~190s×3)가 사이클을 점유해 벡터 적재가 굶던 구조 캡 해소 (markdown 선례). scheduler.add_job(consume_fast_queue, "interval", minutes=1, id="fast_queue_consumer") + # 2026-06-15 deep-consumer split: deep_summary(70~300s)를 메인 루프에서 분리 (markdown/fast 선례). + scheduler.add_job(consume_deep_queue, "interval", minutes=1, id="deep_queue_consumer") scheduler.add_job(watch_inbox, "interval", minutes=5, id="file_watcher") scheduler.add_job(cleanup_orphan_uploads, "interval", minutes=10, id="upload_cleanup") # PR-4: study_questions 자동 임베딩 (status='none/failed/stale' 행을 batch=10 처리). diff --git a/app/services/briefing/comparator.py b/app/services/briefing/comparator.py index 973826a..c0252d5 100644 --- a/app/services/briefing/comparator.py +++ b/app/services/briefing/comparator.py @@ -18,12 +18,14 @@ from typing import Any import numpy as np from ai.client import parse_json_response +from core.config import settings from core.utils import setup_logger from services.clustering_common import normalize_vector +from services.search.llm_gate import Priority, acquire_mlx_gate logger = setup_logger("briefing_comparator") -LLM_CALL_TIMEOUT = 25 # 초. Phase 4 와 동일 +LLM_CALL_TIMEOUT = settings.digest_llm_timeout_s # 2026-06-15 config 단일소스 (Phase 4 와 동일 키) HISTORICAL_TOP_K = 5 HISTORICAL_SIMILARITY_MIN = 0.70 HISTORICAL_WINDOW_DAYS = 30 @@ -39,7 +41,6 @@ MAX_ARTICLE_IDS_PER_COUNTRY = 5 # country_perspectives[].article_ids 후 FALLBACK_HEADLINE = "LLM 분석 실패로 원문 기사 묶음만 표시합니다." FALLBACK_TOPIC_LABEL = "주요 뉴스 묶음" -_llm_sem = asyncio.Semaphore(1) _PROMPT_PATH = Path(__file__).resolve().parent.parent.parent / "prompts" / "briefing_comparative.txt" _PROMPT_TEMPLATE: str | None = None @@ -112,7 +113,8 @@ def retrieve_historical( async def _try_call_llm(client: Any, prompt: str) -> str: - async with _llm_sem: + # 전역 MLX gate(BACKGROUND) 경유 — 영구 룰(llm_gate): 새 Semaphore 금지, timeout 은 gate 안쪽. + async with acquire_mlx_gate(Priority.BACKGROUND): return await asyncio.wait_for( client.call_primary(prompt), timeout=LLM_CALL_TIMEOUT, @@ -282,7 +284,7 @@ async def compare_cluster_with_fallback( historical_docs = historical_docs or [] prompt = build_prompt(selected, historical_docs) - for attempt in range(2): + for attempt in range(settings.digest_llm_attempts): # 2026-06-15 config 단일소스 try: raw = await _try_call_llm(client, prompt) except asyncio.TimeoutError: diff --git a/app/services/briefing/pipeline.py b/app/services/briefing/pipeline.py index 715b578..d92252b 100644 --- a/app/services/briefing/pipeline.py +++ b/app/services/briefing/pipeline.py @@ -6,6 +6,7 @@ regenerate 정책: briefing_date UNIQUE 충돌 시 transaction 안에서 DELETE+INSERT. """ +import asyncio import time from datetime import date, datetime, timedelta, timezone from typing import Any @@ -15,7 +16,9 @@ from sqlalchemy import delete from ai.client import AIClient from core.database import async_session +from core.database import engine as db_engine from core.utils import setup_logger +from services import background_jobs as bgj from models.briefing import BriefingTopic, MorningBriefing from services.briefing.clustering import LAMBDA, cluster_global from services.briefing.comparator import ( @@ -33,7 +36,6 @@ KST = ZoneInfo("Asia/Seoul") NIGHT_WINDOW_HOURS = 5 # KST 00:00 ~ 05:00 SELECT_K = 7 # Plan §"Clustering 파라미터" briefing K_PER_CLUSTER=7 SELECT_LAMBDA_MMR = 0.6 # Plan briefing MMR lambda 0.6 -PIPELINE_HARD_CAP = 600 # 초. Phase 4 와 동일 def _compute_window(target_date: date | None = None) -> tuple[datetime, datetime, date]: @@ -143,7 +145,7 @@ async def _save_briefing( return new.id -async def run_briefing_pipeline(target_date: date | None = None) -> dict[str, Any]: +async def run_briefing_pipeline(target_date: date | None = None, job_id: int | None = None) -> dict[str, Any]: """야간 뉴스 브리핑 1회 실행. cron 또는 수동 regenerate API 에서 호출. Returns: @@ -206,16 +208,36 @@ async def run_briefing_pipeline(target_date: date | None = None) -> dict[str, An usable_count = 0 try: + # 2026-06-15: cluster 호출 gather 동시 실행. 실동시성 = 전역 MLX gate + # (config.mlx_gate_concurrency, BACKGROUND 우선순위). rank/순서 보존. + jobs = [] for rank, cluster in enumerate(clusters, start=1): selected = select_for_llm(cluster, k=SELECT_K, lambda_mmr=SELECT_LAMBDA_MMR) historical_docs = ( retrieve_historical(cluster, historical_candidates) if historical_enabled() else [] ) - llm_calls += 1 - envelope = await compare_cluster_with_fallback( + jobs.append((rank, cluster, selected, historical_docs)) + + if job_id is not None: + await bgj.heartbeat(db_engine, job_id, total=len(jobs)) + _prog = {"n": 0} + + async def _run_one(cluster, selected, historical_docs): + r = await compare_cluster_with_fallback( client, cluster, selected, historical_docs=historical_docs ) + if job_id is not None: + _prog["n"] += 1 + await bgj.heartbeat(db_engine, job_id, processed=_prog["n"]) + return r + + results = await asyncio.gather( + *[_run_one(c, s, h) for (_, c, s, h) in jobs] + ) + + for (rank, cluster, selected, historical_docs), envelope in zip(jobs, results): + llm_calls += 1 if envelope.get("llm_fallback_used"): llm_failures += 1 if _is_usable_topic(envelope, envelope["topic_label"]): diff --git a/app/services/digest/pipeline.py b/app/services/digest/pipeline.py index e4f45ab..0a08c33 100644 --- a/app/services/digest/pipeline.py +++ b/app/services/digest/pipeline.py @@ -10,6 +10,7 @@ Step: 7. start/end 로그 + generation_ms + fallback 비율 health metric """ +import asyncio import hashlib import time from datetime import datetime, timedelta, timezone @@ -19,7 +20,9 @@ from sqlalchemy import delete from ai.client import AIClient from core.database import async_session +from core.database import engine as db_engine from core.utils import setup_logger +from services import background_jobs as bgj from models.digest import DigestTopic, GlobalDigest from .clustering import LAMBDA, cluster_country @@ -73,7 +76,7 @@ def _build_topic_row( ) -async def run_digest_pipeline() -> dict: +async def run_digest_pipeline(job_id: int | None = None) -> dict: """전체 파이프라인 실행. worker entry 에서 호출. Returns: @@ -107,20 +110,37 @@ async def run_digest_pipeline() -> dict: stats = {"llm_calls": 0, "fallback_used": 0} try: + # 2026-06-15: cluster 호출을 gather 로 동시 실행. 실제 동시성은 전역 MLX gate + # (config.mlx_gate_concurrency, BACKGROUND 우선순위) 가 제한한다. rank/순서 보존. + jobs = [] for country, docs in docs_by_country.items(): clusters = cluster_country(country, docs) if not clusters: continue # sparse country 자동 제외 - for rank, cluster in enumerate(clusters, start=1): selected = select_for_llm(cluster) - stats["llm_calls"] += 1 - llm_result = await summarize_cluster_with_fallback(client, cluster, selected) - if llm_result["llm_fallback_used"]: - stats["fallback_used"] += 1 - all_topic_rows.append( - _build_topic_row(country, rank, cluster, selected, llm_result, primary_model) - ) + jobs.append((country, rank, cluster, selected)) + + if job_id is not None: + await bgj.heartbeat(db_engine, job_id, total=len(jobs)) + _prog = {"n": 0} + + async def _run_one(cluster, selected): + r = await summarize_cluster_with_fallback(client, cluster, selected) + if job_id is not None: + _prog["n"] += 1 + await bgj.heartbeat(db_engine, job_id, processed=_prog["n"]) + return r + + results = await asyncio.gather(*[_run_one(c, s) for (_, _, c, s) in jobs]) + + for (country, rank, cluster, selected), llm_result in zip(jobs, results): + stats["llm_calls"] += 1 + if llm_result["llm_fallback_used"]: + stats["fallback_used"] += 1 + all_topic_rows.append( + _build_topic_row(country, rank, cluster, selected, llm_result, primary_model) + ) finally: await client.close() diff --git a/app/services/digest/summarizer.py b/app/services/digest/summarizer.py index 173f58c..b0095a7 100644 --- a/app/services/digest/summarizer.py +++ b/app/services/digest/summarizer.py @@ -2,8 +2,8 @@ 핵심 결정: - AIClient._call_chat 직접 호출 (client.py 수정 회피, fallback 로직 재사용) -- Semaphore(1) 로 MLX 과부하 회피 -- Per-call timeout 25초 (asyncio.wait_for) — MLX hang / fallback Claude API stall 방어 +- 전역 MLX gate(BACKGROUND) 경유로 동시성 제어 (services.search.llm_gate 단일 게이트) +- Per-call timeout = config.digest_llm_timeout_s (asyncio.wait_for, gate 안쪽) - JSON 파싱 실패 → 1회 재시도 → 그래도 실패 시 minimal fallback (drop 금지) - fallback: topic_label="주요 뉴스 묶음", summary = top member ai_summary[:200] """ @@ -13,15 +13,16 @@ from pathlib import Path from typing import Any from ai.client import parse_json_response +from core.config import settings from core.utils import setup_logger +from services.search.llm_gate import Priority, acquire_mlx_gate logger = setup_logger("digest_summarizer") -LLM_CALL_TIMEOUT = 25 # 초. MLX 평균 5초 + tail latency 마진 +# 2026-06-15: config 단일소스 (구 하드코딩 25s = 빠른 Gemma 기준, Qwen 27B 교체 후 누락). +LLM_CALL_TIMEOUT = settings.digest_llm_timeout_s FALLBACK_SUMMARY_LIMIT = 200 -_llm_sem = asyncio.Semaphore(1) - _PROMPT_PATH = Path(__file__).resolve().parent.parent.parent / "prompts" / "digest_topic.txt" _PROMPT_TEMPLATE: str | None = None @@ -48,8 +49,12 @@ def build_prompt(selected: list[dict]) -> str: async def _try_call_llm(client: Any, prompt: str) -> str: - """Semaphore + per-call timeout 으로 감싼 단일 호출.""" - async with _llm_sem: + """전역 MLX gate(BACKGROUND) + per-call timeout 으로 감싼 단일 호출. + + 영구 룰(llm_gate): Mac mini endpoint 는 단일 게이트 공유, 새 Semaphore 금지. + 동시성 lever = config.mlx_gate_concurrency. timeout 은 gate 안쪽에서만. + """ + async with acquire_mlx_gate(Priority.BACKGROUND): return await asyncio.wait_for( client._call_chat(client.ai.primary, prompt), timeout=LLM_CALL_TIMEOUT, @@ -86,7 +91,7 @@ async def summarize_cluster_with_fallback( """ prompt = build_prompt(selected) - for attempt in range(2): # 1회 재시도 포함 + for attempt in range(settings.digest_llm_attempts): # config 단일소스 (기본 2 = 1회 재시도) try: raw = await _try_call_llm(client, prompt) except asyncio.TimeoutError: diff --git a/app/services/queue_overview.py b/app/services/queue_overview.py index 5168648..4c17e17 100644 --- a/app/services/queue_overview.py +++ b/app/services/queue_overview.py @@ -426,6 +426,16 @@ async def build_overview(session: AsyncSession) -> dict: return result +# kind -> 처리 머신 (보드 머신 카드 귀속용). 미상 kind = gpu(오케스트레이션 호스트). +_BG_JOB_MACHINE = { + "global_digest": "macmini", + "morning_briefing": "macmini", + "section_summary": "macmini", + "hier_backfill": "gpu", + "hier_redecompose": "gpu", +} + + _BACKGROUND_JOBS_SQL = """ SELECT id, kind, label, state, processed, total, EXTRACT(EPOCH FROM (now() - started_at))::int AS elapsed_sec, @@ -456,6 +466,7 @@ async def _fetch_background_jobs(session: AsyncSession) -> list[dict]: "processed": int(r["processed"] or 0), "total": r["total"], "elapsed_sec": int(r["elapsed_sec"] or 0), "stale": bool(r["stale"]), "error": r["error"], + "machine": _BG_JOB_MACHINE.get(r["kind"], "gpu"), } for r in rows ] diff --git a/app/workers/briefing_worker.py b/app/workers/briefing_worker.py index 679f26f..3d4c67c 100644 --- a/app/workers/briefing_worker.py +++ b/app/workers/briefing_worker.py @@ -9,12 +9,15 @@ import asyncio from datetime import date from core.config import settings +from core.database import engine as db_engine from core.utils import setup_logger +from services.background_jobs import finish_job, start_job from services.briefing.pipeline import run_briefing_pipeline logger = setup_logger("briefing_worker") -PIPELINE_HARD_CAP = 600 +# 2026-06-15: config 단일소스 (digest 와 공유 키). 구 600s = 빠른 Gemma 기준. +PIPELINE_HARD_CAP = settings.digest_pipeline_hard_cap_s async def run(target_date: date | None = None) -> dict | None: @@ -26,19 +29,24 @@ async def run(target_date: date | None = None) -> dict | None: if "briefing" in settings.pipeline_held_stages: logger.info("[briefing] 보류 (pipeline.held_stages) — 이번 실행 skip") return None + # 보드 가시화: 큐 밖 cron 생성 작업이라 background_jobs 로 노출 (best-effort, 맥미니 귀속) + job_id = await start_job(db_engine, "morning_briefing", label="조간 브리핑 생성") try: result = await asyncio.wait_for( - run_briefing_pipeline(target_date), + run_briefing_pipeline(target_date, job_id=job_id), timeout=PIPELINE_HARD_CAP, ) + await finish_job(db_engine, job_id, state="done") logger.info(f"[briefing] 워커 완료: {result}") return result except asyncio.TimeoutError: + await finish_job(db_engine, job_id, state="failed", error=f"HARD CAP {PIPELINE_HARD_CAP}s 초과") logger.error( f"[briefing] HARD CAP {PIPELINE_HARD_CAP}s 초과 — 워커 강제 중단. " f"기존 briefing 은 commit 시점에만 갱신되므로 그대로 유지됨." ) except Exception as e: + await finish_job(db_engine, job_id, state="failed", error=str(e)[:300]) logger.exception(f"[briefing] 워커 실패: {e}") return None diff --git a/app/workers/digest_worker.py b/app/workers/digest_worker.py index 477a954..24c0b12 100644 --- a/app/workers/digest_worker.py +++ b/app/workers/digest_worker.py @@ -11,12 +11,15 @@ global_digests / digest_topics 테이블에 저장한다. import asyncio from core.config import settings +from core.database import engine as db_engine from core.utils import setup_logger +from services.background_jobs import finish_job, start_job from services.digest.pipeline import run_digest_pipeline logger = setup_logger("digest_worker") -PIPELINE_HARD_CAP = 600 # 10분 hard cap +# 2026-06-15: config 단일소스 (구 600s = 빠른 Gemma 기준, Qwen 27B 교체 후 누락 → 초과). +PIPELINE_HARD_CAP = settings.digest_pipeline_hard_cap_s async def run() -> None: @@ -28,19 +31,24 @@ async def run() -> None: if "digest" in settings.pipeline_held_stages: logger.info("[global_digest] 보류 (pipeline.held_stages) — 이번 실행 skip") return + # 보드 가시화: 큐 밖 cron 생성 작업이라 background_jobs 로 노출 (best-effort, 맥미니 귀속) + job_id = await start_job(db_engine, "global_digest", label="글로벌 다이제스트 생성") try: result = await asyncio.wait_for( - run_digest_pipeline(), + run_digest_pipeline(job_id=job_id), timeout=PIPELINE_HARD_CAP, ) + await finish_job(db_engine, job_id, state="done") logger.info(f"[global_digest] 워커 완료: {result}") except asyncio.TimeoutError: + await finish_job(db_engine, job_id, state="failed", error=f"HARD CAP {PIPELINE_HARD_CAP}s 초과") logger.error( f"[global_digest] HARD CAP {PIPELINE_HARD_CAP}s 초과 — 워커 강제 중단. " f"기존 digest 는 commit 시점에만 갱신되므로 그대로 유지됨. " f"다음 cron 실행에서 재시도." ) except Exception as e: + await finish_job(db_engine, job_id, state="failed", error=str(e)[:300]) logger.exception(f"[global_digest] 워커 실패: {e}") diff --git a/app/workers/queue_consumer.py b/app/workers/queue_consumer.py index 94cbe57..4903ff1 100644 --- a/app/workers/queue_consumer.py +++ b/app/workers/queue_consumer.py @@ -47,10 +47,15 @@ MARKDOWN_STALE_THRESHOLD_MINUTES = int(os.getenv("MARKDOWN_STALE_MINUTES", "120" # STT 도 장기 작업 가능성이 있으나 본 PR 범위 밖 — main 에 유지(follow-up). MAIN_QUEUE_STAGES = [ "extract", "classify", "summarize", - "preview", "stt", "thumbnail", "deep_summary", "fulltext", + "preview", "stt", "thumbnail", "fulltext", ] MARKDOWN_QUEUE_STAGES = ["markdown"] +# 2026-06-15: deep_summary(26B, 콜당 70~300s)를 메인 루프에서 분리 (markdown/fast 선례). +# 단일 deep 호출이 1분 틱을 초과해 메인 consume_queue 가 영구 coalesce 되고 extract/ +# classify 등 경량 stage 까지 굶던 문제 제거. 집합 disjoint(자기 집합만 stale reset). +DEEP_QUEUE_STAGES = ["deep_summary"] + # 고속(비-LLM·경량 GPU) stage — LLM 사이클(분 단위)에서 분리해 1분 잡 전용 소비. # embed/chunk 는 건당 <1s 라 main 루프에 두면 classify(~190s×3) 뒤에서 굶는다 # (2026-06-12 실측: 적체 3,570 · 4070 가동률 0%). markdown 분리(05-01)와 동일 패턴. @@ -405,3 +410,24 @@ async def consume_markdown_queue(): for stage in MARKDOWN_QUEUE_STAGES: await _process_stage(stage, workers[stage]) + + +async def consume_deep_queue(): + """deep_summary 전용 큐 소비자 (2026-06-15) — 26B 심층요약을 메인 파이프라인과 분리. + + deep_summary 1콜이 70~300s(맥미니 Qwen 27B 폴백)라 메인 consume_queue(1분 틱) 안에 + 있으면 매 틱이 interval 을 초과해 영구 "maximum running instances" coalesce 되고 + extract/classify 등 경량 stage 까지 함께 굶었다. 분리 후 = deep 만 자기 1분 잡에서 + coalesce, 나머지 메인 루프는 틱 내 완료. max_instances=1 로 동시 deep 2건은 방지. + """ + workers = _load_workers() + + try: + await reset_stale_items(DEEP_QUEUE_STAGES, STALE_THRESHOLD_MINUTES) + except Exception: + logger.exception("deep stale reset failed, but continuing queue consumption") + + for stage in DEEP_QUEUE_STAGES: + if stage in settings.pipeline_held_stages: + continue + await _process_stage(stage, workers[stage]) diff --git a/config.yaml b/config.yaml index 0d9afa3..9be6e5e 100644 --- a/config.yaml +++ b/config.yaml @@ -13,7 +13,7 @@ ai: # triage: 상시 분류·요약·근거 선별. Mac mini Qwen 27B (primary 와 동일 endpoint, 짧은 max_tokens). triage: - endpoint: "http://100.76.254.116:8801/v1/chat/completions" + endpoint: "http://100.76.254.116:8890/v1/chat/completions" model: "mlx-community/Qwen3.6-27B-6bit" max_tokens: 4096 timeout: 480 # 프리필 실측 ~112 tok/s — 120K자 장문 커버 (2026-06-11) @@ -22,7 +22,7 @@ ai: # primary: 에스컬레이션 전용. Qwen 27B MLX (맥미니 Semaphore(1) 보호 대상). primary: - endpoint: "http://100.76.254.116:8801/v1/chat/completions" + endpoint: "http://100.76.254.116:8890/v1/chat/completions" model: "mlx-community/Qwen3.6-27B-6bit" max_tokens: 8192 timeout: 900 # 프리필 실측 ~112 tok/s — 260K자 상한 장문 커버 (2026-06-11) @@ -72,7 +72,7 @@ ai: # Phase 3.5a answerability classifier. 2026-05-14 GPU LLM 제거 후 Mac mini 26B 로 swap. # classifier_service 가 hasattr 체크로 optional 이므로 이 섹션 제거 시 classifier gate 는 자동 skip (score-only). classifier: - endpoint: "http://100.76.254.116:8801/v1/chat/completions" + endpoint: "http://100.76.254.116:8890/v1/chat/completions" model: "mlx-community/Qwen3.6-27B-6bit" # 2026-06-11 B안 동승 — gemma id 잔존 시 mlx 서버가 Gemma 를 재로드(이중 적재) 위험 max_tokens: 512 timeout: 30 # 2026-05-17: 15s 도 동시 부하 시 elapsed 14.4s 직전이라 tight — 30s 로 2x 마진. classifier_service.LLM_TIMEOUT_MS=30000 와 align (초과 = score-only skip, graceful) @@ -199,8 +199,14 @@ schedule: # 이력: 2026-06-11 맥미니 모델 확정까지 8키 홀드 → 同日 Qwen3.6-27B-6bit 전환과 함께 해제([]). pipeline: held_stages: [] - # mlx gate 동시 실행 상한 (2026-06-12 fair-share): 구 "1 고정" 룰의 전제(single-inference - # 서버)가 소멸 — 현 mlx_vlm 은 continuous batching (2026-06-11 밤 6~8 concurrent 실측 정상). - # 2 = 워커 LLM 호출과 인터랙티브(ask/eid)가 서로 안 막힘 + 집계 throughput ~1.8배. - # 게이트(상한+우선순위)는 유지 — thundering herd 방지. 1 로 되돌리면 구 동작. + # mlx gate 동시 실행 상한 (config.mlx_gate_concurrency). 현 mlx_vlm = continuous batching + # (2026-06-11 밤 6~8 concurrent 실측 정상). 2026-06-15: 2→4 — digest/briefing 합성을 + # 이 단일 게이트(BACKGROUND 우선순위)로 라우팅하며 digest(클러스터 44~68)가 하드캡 내 + # 완료되도록 동시성 확보. ask/eid(FOREGROUND)는 큐 점프라 영향 최소. 되돌리면 구 동작. mlx_gate_concurrency: 2 + # 2026-06-15: digest/briefing 생성 LLM 파라미터 (모델 교체 후 단일소스, 상세 = config.py). + # 구 하드코딩 25s(빠른 Gemma)가 Qwen 27B(콜당 ~90~300s) 교체 sweep 누락 → digest 600s + # 초과·briefing 4/4 폴백. 동시성은 위 mlx_gate_concurrency 가 담당(별 키 없음). + digest_llm_timeout_s: 300 + digest_llm_attempts: 2 + digest_pipeline_hard_cap_s: 5400 diff --git a/frontend/src/app.css b/frontend/src/app.css index f7b6d63..ae4a8bb 100644 --- a/frontend/src/app.css +++ b/frontend/src/app.css @@ -213,3 +213,14 @@ body { /* Phase 1C: frontmatter 박스 — 본문 위 메타 표시 */ .md-frontmatter dt { font-weight: 500; } + +/* AI 요약(TL;DR 등) 마크다운 렌더 — 좁은 카드에 맞게 문단/리스트 마진 압축 */ +.summary-md > :first-child { margin-top: 0; } +.summary-md > :last-child { margin-bottom: 0; } +.summary-md p { margin: 0 0 0.45em; } +.summary-md ul, .summary-md ol { margin: 0.25em 0; padding-left: 1.2em; } +.summary-md ul { list-style: disc; } +.summary-md ol { list-style: decimal; } +.summary-md li { margin: 0.1em 0; } +.summary-md strong { font-weight: 700; } +.summary-md code { background: rgba(0, 0, 0, 0.05); padding: 0 0.3em; border-radius: 3px; } diff --git a/frontend/src/lib/components/AnalysisPanel.svelte b/frontend/src/lib/components/AnalysisPanel.svelte index 557c644..b6d23a9 100644 --- a/frontend/src/lib/components/AnalysisPanel.svelte +++ b/frontend/src/lib/components/AnalysisPanel.svelte @@ -12,6 +12,7 @@ -->