From d58565ef38e81242d3c3dcf69a90827b4e5ca87b Mon Sep 17 00:00:00 2001 From: hyungi Date: Tue, 16 Jun 2026 13:56:42 +0900 Subject: [PATCH] =?UTF-8?q?refactor(search):=20Phase=202A=20cand=20?= =?UTF-8?q?=EC=8A=AC=EB=9F=AC=EA=B7=B8=C2=B7=ED=85=8C=EC=9D=B4=EB=B8=94=20?= =?UTF-8?q?=EC=A0=9C=EA=B1=B0=20(R13)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2A 임베딩 후보(me5_large_inst·snowflake_l_v2·qwen06·qwen4·qwen4m) no-go 종결 (2026-06-12, 후보 전부 -0.03~-0.04) + phase2a_cand_backfill 워커 dormant(미스케줄·미import). - retrieval_service.CANDIDATE_BACKEND_MAP: 5 cand 엔트리 제거(baseline 만 잔존) — read-path 슬러그를 먼저 빼야 embedding_backend=cand_X /search 가 dropped 테이블 읽어 500 안 남. - api.search allowed 하드코딩 리스트 → ["baseline"] (R12 search-error-allowed dangling 동반 제거). - phase2a_cand_backfill.py 삭제(dead code, 드롭될 테이블 참조 — R12 config-bypass 동반 해소). - 마이그 360: cand 10테이블 DROP TABLE IF EXISTS(멱등, 환경별 존재차 흡수). 검증: py_compile 통과, 슬러그 잔존 참조 0. migration txn 제어문 없음. Co-Authored-By: Claude Opus 4.8 (1M context) --- app/api/search.py | 2 +- app/services/search/retrieval_service.py | 40 +----- app/workers/phase2a_cand_backfill.py | 142 -------------------- migrations/360_drop_phase2a_cand_tables.sql | 14 ++ 4 files changed, 19 insertions(+), 179 deletions(-) delete mode 100644 app/workers/phase2a_cand_backfill.py create mode 100644 migrations/360_drop_phase2a_cand_tables.sql diff --git a/app/api/search.py b/app/api/search.py index d70c5af..b781986 100644 --- a/app/api/search.py +++ b/app/api/search.py @@ -291,7 +291,7 @@ async def search( content={ "error_reason": "unknown_embedding_backend", "backend_requested": embedding_backend, - "allowed": ["baseline", "cand_me5_large_inst", "cand_snowflake_l_v2"], + "allowed": ["baseline"], "detail": msg, }, ) diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py index f1539ef..4f22eec 100644 --- a/app/services/search/retrieval_service.py +++ b/app/services/search/retrieval_service.py @@ -54,42 +54,10 @@ QUERY_EMBED_MAXSIZE = 500 # server-side allowlist map. query parameter 가 raw table name 받지 않음. CANDIDATE_BACKEND_MAP: dict[str, dict[str, str] | None] = { "baseline": None, - "cand_me5_large_inst": { - "docs_table": "documents_cand_me5_large_inst", - "chunks_table": "document_chunks_cand_me5_large_inst", - "embed_endpoint": "http://embedding-cand-me5-inst:80/embed", - }, - "cand_snowflake_l_v2": { - "docs_table": "documents_cand_snowflake_l_v2", - "chunks_table": "document_chunks_cand_snowflake_l_v2", - "embed_endpoint": "http://embedding-cand-snowflake-l-v2:80/embed", - }, - # ─── Phase 2A (embedding-phase2a-1, 2026-06-12): Qwen3-Embedding 후보 3종 ─── - # embed_kind="ollama" = /api/embed 호출 + 쿼리측 instruct prefix (비대칭 사용, - # G-1 fixture 실측: prefix 가 관련쌍 cos +0.016). 문서측은 backfill 이 plain 으로 적재. - # qwen4m = 4B 의 MRL 1024d (dimensions 옵션 — Ollama 가 truncate+재정규화 수행, G-1 실측). - "cand_qwen06": { - "docs_table": "documents_cand_qwen06", - "chunks_table": "document_chunks_cand_qwen06", - "embed_endpoint": "http://ollama:11434/api/embed", - "embed_kind": "ollama", - "embed_model": "qwen3-embedding:0.6b", - }, - "cand_qwen4": { - "docs_table": "documents_cand_qwen4", - "chunks_table": "document_chunks_cand_qwen4", - "embed_endpoint": "http://ollama:11434/api/embed", - "embed_kind": "ollama", - "embed_model": "qwen3-embedding:4b", - }, - "cand_qwen4m": { - "docs_table": "documents_cand_qwen4m", - "chunks_table": "document_chunks_cand_qwen4m", - "embed_endpoint": "http://ollama:11434/api/embed", - "embed_kind": "ollama", - "embed_model": "qwen3-embedding:4b", - "embed_dimensions": 1024, - }, + # Phase 2A 임베딩 후보(me5_large_inst·snowflake_l_v2·qwen06·qwen4·qwen4m) 전량 no-go + # 종결(2026-06-12, 후보 전부 -0.03~-0.04) → cand 슬러그·테이블 제거 (R13, 마이그 360 + # DROP). read-path 슬러그를 먼저 빼야 embedding_backend=cand_X /search 가 dropped 테이블을 + # 읽어 500 나지 않는다. baseline(production)만 잔존. } # G-1 핀 고정 instruct 문자열 (inventory 2026-06-12-c 기록과 동일해야 함 — diff --git a/app/workers/phase2a_cand_backfill.py b/app/workers/phase2a_cand_backfill.py deleted file mode 100644 index 4c734d2..0000000 --- a/app/workers/phase2a_cand_backfill.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Phase 2A 후보 임베딩 백필 CLI (embedding-phase2a-1 E-1). - - docker compose exec -T fastapi python -m workers.phase2a_cand_backfill \ - --target qwen06 --doc-id-max 41944 --chunk-id-max 104140 [--batch 32] - -설계 원칙 (plan r3): - - resumable/idempotent: 대상 = NOT EXISTS(후보 테이블) — 중단/재실행 시 이어서. - 배치 단위 커밋. C-1 백필 게이트 = "후보 카운트 == 동결셋 카운트". - - 동결셋: id <= *_id_max AND 베이스라인 embedding IS NOT NULL (AND docs.deleted_at IS NULL). - cand 테이블은 동결 범위로만 INSERT (retrieval cand path 가 snapshot filter 를 안 타는 전제). - - 문서/청크 입력 = production 경로와 동일 구성(embed_worker._build_embed_input / - chunk_worker 의 [제목][섹션][본문]) + plain (instruct prefix 는 쿼리 측 전용 — G-1 불변식). - - 임베딩 = Ollama /api/embed 배치 호출 (G-1 fixture: 정규화 출력). - - qwen4m 은 본 CLI 대상이 아님 — qwen4 적재 후 SQL 파생(subvector+l2_normalize), plan E-1. -""" - -import argparse -import asyncio -import hashlib -import time - -import httpx -from sqlalchemy import text - -from core.database import async_session -from core.utils import setup_logger -from models.document import Document -from workers.embed_worker import _build_embed_input - -logger = setup_logger("phase2a_cand_backfill") - -OLLAMA_EMBED = "http://ollama:11434/api/embed" - -TARGETS = { - "qwen06": { - "model": "qwen3-embedding:0.6b", "dim": 1024, - "docs": "documents_cand_qwen06", "chunks": "document_chunks_cand_qwen06", - }, - "qwen4": { - "model": "qwen3-embedding:4b", "dim": 2560, - "docs": "documents_cand_qwen4", "chunks": "document_chunks_cand_qwen4", - }, -} - - -async def _embed_batch(client: httpx.AsyncClient, model: str, texts: list[str]) -> list[list[float]]: - r = await client.post(OLLAMA_EMBED, json={"model": model, "input": texts}, timeout=600) - r.raise_for_status() - embs = r.json()["embeddings"] - if len(embs) != len(texts): - raise RuntimeError(f"embed count mismatch: {len(embs)} != {len(texts)}") - return embs - - -async def backfill_docs(target: dict, doc_id_max: int, batch: int, http: httpx.AsyncClient) -> int: - total = 0 - while True: - async with async_session() as session: - rows = (await session.execute(text(f""" - SELECT d.id FROM documents d - WHERE d.id <= :m AND d.embedding IS NOT NULL AND d.deleted_at IS NULL - AND NOT EXISTS (SELECT 1 FROM {target['docs']} c WHERE c.doc_id = d.id) - ORDER BY d.id LIMIT :b - """), {"m": doc_id_max, "b": batch})).scalars().all() - if not rows: - break - docs = [(await session.get(Document, i)) for i in rows] - inputs = [_build_embed_input(d) for d in docs] - embs = await _embed_batch(http, target["model"], inputs) - for d, inp, e in zip(docs, inputs, embs): - await session.execute(text(f""" - INSERT INTO {target['docs']} (doc_id, embed_input_hash, embedding) - VALUES (:i, :h, cast(:e AS vector)) - ON CONFLICT (doc_id) DO NOTHING - """), {"i": d.id, "h": hashlib.sha256(inp.encode()).hexdigest()[:16], "e": str(e)}) - await session.commit() - total += len(rows) - if total % (batch * 10) < batch: - logger.info(f"[{target['docs']}] +{total} (last id={rows[-1]})") - return total - - -async def backfill_chunks(target: dict, chunk_id_max: int, batch: int, http: httpx.AsyncClient) -> int: - total = 0 - while True: - async with async_session() as session: - rows = (await session.execute(text(f""" - SELECT c.id, c.doc_id, c.chunk_index, c.section_title, c.text, d.title - FROM corpus_chunks c JOIN documents d ON d.id = c.doc_id - WHERE c.id <= :m AND c.embedding IS NOT NULL AND d.deleted_at IS NULL - AND NOT EXISTS (SELECT 1 FROM {target['chunks']} k WHERE k.id = c.id) - ORDER BY c.id LIMIT :b - """), {"m": chunk_id_max, "b": batch})).all() - if not rows: - break - inputs = [ - f"[제목] {r.title or ''}\n[섹션] {r.section_title or ''}\n[본문] {r.text}" - for r in rows - ] - embs = await _embed_batch(http, target["model"], inputs) - for r, e in zip(rows, embs): - await session.execute(text(f""" - INSERT INTO {target['chunks']} (id, doc_id, chunk_index, section_title, text, embedding) - VALUES (:i, :d, :x, :s, :t, cast(:e AS vector)) - ON CONFLICT (id) DO NOTHING - """), {"i": r.id, "d": r.doc_id, "x": r.chunk_index, - "s": r.section_title, "t": r.text, "e": str(e)}) - await session.commit() - total += len(rows) - if total % (batch * 10) < batch: - logger.info(f"[{target['chunks']}] +{total} (last id={rows[-1]})") - return total - - -async def run(target_key: str, doc_id_max: int, chunk_id_max: int, batch: int) -> None: - target = TARGETS[target_key] - start = time.monotonic() - async with httpx.AsyncClient() as http: - nd = await backfill_docs(target, doc_id_max, batch, http) - nc = await backfill_chunks(target, chunk_id_max, batch, http) - mins = (time.monotonic() - start) / 60 - async with async_session() as session: - cd = (await session.execute(text(f"SELECT count(*) FROM {target['docs']}"))).scalar_one() - cc = (await session.execute(text(f"SELECT count(*) FROM {target['chunks']}"))).scalar_one() - logger.info( - f"[{target_key}] 완료 — 이번 run docs +{nd} chunks +{nc} ({mins:.1f}분) · " - f"누적 docs {cd} / chunks {cc} (동결 게이트 = 베이스라인 동결셋 카운트와 일치 확인)" - ) - - -def main() -> None: - p = argparse.ArgumentParser(description="Phase 2A 후보 임베딩 백필 (resumable)") - p.add_argument("--target", required=True, choices=sorted(TARGETS)) - p.add_argument("--doc-id-max", type=int, required=True) - p.add_argument("--chunk-id-max", type=int, required=True) - p.add_argument("--batch", type=int, default=32) - a = p.parse_args() - asyncio.run(run(a.target, a.doc_id_max, a.chunk_id_max, a.batch)) - - -if __name__ == "__main__": - main() diff --git a/migrations/360_drop_phase2a_cand_tables.sql b/migrations/360_drop_phase2a_cand_tables.sql new file mode 100644 index 0000000..8345f55 --- /dev/null +++ b/migrations/360_drop_phase2a_cand_tables.sql @@ -0,0 +1,14 @@ +-- 360: Phase 2A 임베딩 후보 cand 섀도 테이블 제거 (R13). +-- Phase 2A no-go 종결(2026-06-12, 후보 전부 -0.03~-0.04) + phase2a_cand_backfill 워커 +-- dormant. retrieval_service.CANDIDATE_BACKEND_MAP / api.search allowed 슬러그 선제거 후 DROP. +-- IF EXISTS — me5/snowflake 는 ad-hoc 생성분이라 환경별 존재 여부 다를 수 있음(멱등). +DROP TABLE IF EXISTS document_chunks_cand_me5_large_inst; +DROP TABLE IF EXISTS documents_cand_me5_large_inst; +DROP TABLE IF EXISTS document_chunks_cand_snowflake_l_v2; +DROP TABLE IF EXISTS documents_cand_snowflake_l_v2; +DROP TABLE IF EXISTS document_chunks_cand_qwen06; +DROP TABLE IF EXISTS documents_cand_qwen06; +DROP TABLE IF EXISTS document_chunks_cand_qwen4; +DROP TABLE IF EXISTS documents_cand_qwen4; +DROP TABLE IF EXISTS document_chunks_cand_qwen4m; +DROP TABLE IF EXISTS documents_cand_qwen4m;