a67df0a10b
phase-2a-embedding-diagnose.md v4 § 7 Phase 2 산출. 페어 invariant (R2-2): documents_cand + document_chunks_cand 동기 swap, 부분 swap 금지. - snapshot 박제 (R2-D): v0_2_phase2a_snapshot_2026-05-23.json - SNAPSHOT_DOC_ID_MAX=25180 / SNAPSHOT_CHUNK_ID_MAX=56526 - documents_n=21365 (embedded, active) / chunks_n=30605 - production ingest 정지 0, 모든 candidate reindex + baseline rebaseline 측정이 id<=snapshot 한정 - reindex_candidate.py 신규 (R2-5): - reindex_documents(): production _build_embed_input() import 재사용 - reindex_chunks(): document_chunks.text 그대로 (재 chunking 0) - TEI batch=8 (1.7 internal queue overflow 회피) + truncate=true (mE5 512 context) - retry-8 exponential backoff (10/20/40/80/90s) — TEI SIGSEGV 자동 복구 - idempotent ON CONFLICT DO NOTHING (cancellation/resume 안전) - docker-compose.override.cand.yml: restart=unless-stopped (TEI 1.7 panic 자동 복구) DB 산출물 (4 테이블): - documents_cand_me5_large_inst : 21365 rows (dim 1024) + ivfflat lists=100 - document_chunks_cand_me5_large_inst : 30605 rows (dim 1024) + ivfflat lists=100 - documents_cand_snowflake_l_v2 : 21365 rows (dim 1024) + ivfflat lists=100 - document_chunks_cand_snowflake_l_v2 : 30605 rows (dim 1024) + ivfflat lists=100 - ivfflat.probes=20 (production 동일) 보존 - smoke retrieval (nearest neighbor SQL) PASS 후보 2종 production 영향: - documents / document_chunks 컬럼/row 변경 0 - config.yaml 변경 0 (ollama bge-m3 unchanged) - production fastapi/postgres/reranker 변경 0 (profile embed-cand 격리) 다음 단계: Phase 3 (DS API + retrieval_service slug-based dispatcher 추가, baseline rebaseline + 2 후보 51 case 측정). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
207 lines
8.9 KiB
Python
207 lines
8.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Phase 2A — Candidate embedding reindex (documents + chunks 페어).
|
|
|
|
plan: phase-2a-embedding-diagnose.md v4 § 7 Phase 2
|
|
|
|
Usage (DS fastapi 컨테이너 내부 실행):
|
|
docker exec hyungi_document_server-fastapi-1 python -m tests.search_eval.reindex_candidate \
|
|
--slug me5_large_inst \
|
|
--endpoint http://embedding-cand-me5-inst:80/embed \
|
|
--snapshot-doc-id-max 25180 \
|
|
--snapshot-chunk-id-max 56526
|
|
|
|
idempotent: LEFT JOIN 으로 이미 처리된 row 건너뜀, ON CONFLICT DO NOTHING.
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import hashlib
|
|
import sys
|
|
import time
|
|
import unicodedata
|
|
from types import SimpleNamespace
|
|
|
|
import httpx
|
|
from sqlalchemy import text
|
|
|
|
# /app is the working dir inside fastapi container; ensures app.* importable
|
|
sys.path.insert(0, "/app")
|
|
from core.database import async_session
|
|
from workers.embed_worker import _build_embed_input
|
|
|
|
|
|
def canonical_hash(s: str) -> str:
|
|
normalized = unicodedata.normalize("NFKC", s.strip())
|
|
return hashlib.sha256(normalized.encode("utf-8")).hexdigest()
|
|
|
|
|
|
async def tei_embed_batch(endpoint: str, texts: list[str], retries: int = 8) -> list[list[float]]:
|
|
last_err = None
|
|
for attempt in range(retries):
|
|
try:
|
|
async with httpx.AsyncClient(timeout=180.0) as client:
|
|
r = await client.post(endpoint, json={"inputs": texts, "truncate": True})
|
|
r.raise_for_status()
|
|
data = r.json()
|
|
if not isinstance(data, list) or not all(isinstance(v, list) for v in data):
|
|
raise ValueError(f"Unexpected TEI response shape: {type(data).__name__}")
|
|
if len(data) != len(texts):
|
|
raise ValueError(f"TEI batch size mismatch: sent {len(texts)} got {len(data)}")
|
|
return data
|
|
except (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError, httpx.HTTPStatusError) as e:
|
|
last_err = e
|
|
wait = min(10.0 * (2 ** attempt), 90.0)
|
|
print(f'[reindex-cand] TEI error attempt={attempt+1}/{retries} err={type(e).__name__} sleeping={wait}s', flush=True)
|
|
await asyncio.sleep(wait)
|
|
raise RuntimeError(f'TEI exhausted retries: {last_err}')
|
|
|
|
|
|
async def reindex_documents(slug: str, endpoint: str, snapshot_doc_id_max: int, batch_size: int) -> None:
|
|
table = f"documents_cand_{slug}"
|
|
async with async_session() as session:
|
|
already_done = (await session.execute(text(f"SELECT count(*) FROM {table}"))).scalar() or 0
|
|
total = (await session.execute(text(
|
|
"SELECT count(*) FROM documents "
|
|
"WHERE id <= :max AND deleted_at IS NULL AND embedding IS NOT NULL"
|
|
), {"max": snapshot_doc_id_max})).scalar() or 0
|
|
|
|
print(f"[reindex-cand-docs] slug={slug} total={total} already_done={already_done}", flush=True)
|
|
start = time.time()
|
|
processed = already_done
|
|
|
|
async with async_session() as session:
|
|
stmt = text(f"""
|
|
SELECT d.id, d.title, d.ai_summary, d.ai_tags, d.extracted_text
|
|
FROM documents d
|
|
LEFT JOIN {table} c ON c.doc_id = d.id
|
|
WHERE d.id <= :max
|
|
AND d.deleted_at IS NULL
|
|
AND d.embedding IS NOT NULL
|
|
AND c.doc_id IS NULL
|
|
ORDER BY d.id
|
|
""")
|
|
result = await session.execute(stmt, {"max": snapshot_doc_id_max})
|
|
rows = result.fetchall()
|
|
|
|
for i in range(0, len(rows), batch_size):
|
|
batch = rows[i:i + batch_size]
|
|
doc_objs = [
|
|
SimpleNamespace(title=r[1], ai_summary=r[2], ai_tags=r[3], extracted_text=r[4])
|
|
for r in batch
|
|
]
|
|
texts_built = [_build_embed_input(d) for d in doc_objs]
|
|
valid = [(idx, t) for idx, t in enumerate(texts_built) if t]
|
|
if not valid:
|
|
processed += len(batch)
|
|
continue
|
|
valid_texts = [t for _, t in valid]
|
|
embeddings = await tei_embed_batch(endpoint, valid_texts)
|
|
|
|
insert_rows = [{
|
|
"doc_id": batch[idx][0],
|
|
"embed_input": t,
|
|
"embed_input_hash": canonical_hash(t),
|
|
"embedding": str(emb),
|
|
} for (idx, t), emb in zip(valid, embeddings)]
|
|
|
|
await session.execute(text(f"""
|
|
INSERT INTO {table} (doc_id, embed_input, embed_input_hash, embedding)
|
|
VALUES (:doc_id, :embed_input, :embed_input_hash, CAST(:embedding AS vector))
|
|
ON CONFLICT (doc_id) DO NOTHING
|
|
"""), insert_rows)
|
|
await session.commit()
|
|
|
|
processed += len(batch)
|
|
elapsed = time.time() - start
|
|
rate = (processed - already_done) / elapsed if elapsed > 0 else 0
|
|
print(f"[reindex-cand-docs] slug={slug} done={processed}/{total} rate={rate:.1f}/sec", flush=True)
|
|
|
|
print(f"[reindex-cand-docs] slug={slug} COMPLETE total={total} elapsed={time.time()-start:.1f}s", flush=True)
|
|
|
|
|
|
async def reindex_chunks(slug: str, endpoint: str, snapshot_chunk_id_max: int, batch_size: int) -> None:
|
|
table = f"document_chunks_cand_{slug}"
|
|
async with async_session() as session:
|
|
already_done = (await session.execute(text(f"SELECT count(*) FROM {table}"))).scalar() or 0
|
|
total = (await session.execute(text(
|
|
"SELECT count(*) FROM document_chunks WHERE id <= :max"
|
|
), {"max": snapshot_chunk_id_max})).scalar() or 0
|
|
|
|
print(f"[reindex-cand-chunks] slug={slug} total={total} already_done={already_done}", flush=True)
|
|
start = time.time()
|
|
processed = already_done
|
|
|
|
async with async_session() as session:
|
|
stmt = text(f"""
|
|
SELECT c.id, c.doc_id, c.chunk_index, c.chunk_type, c.section_title, c.heading_path, c.page,
|
|
c.language, c.country, c.source, c.domain_category, c.text
|
|
FROM document_chunks c
|
|
LEFT JOIN {table} cc ON cc.doc_id = c.doc_id AND cc.chunk_index = c.chunk_index
|
|
WHERE c.id <= :max AND cc.id IS NULL
|
|
ORDER BY c.id
|
|
""")
|
|
result = await session.execute(stmt, {"max": snapshot_chunk_id_max})
|
|
rows = result.fetchall()
|
|
|
|
for i in range(0, len(rows), batch_size):
|
|
batch = rows[i:i + batch_size]
|
|
chunk_texts = [r[11] for r in batch]
|
|
valid = [(idx, t) for idx, t in enumerate(chunk_texts) if t and t.strip()]
|
|
if not valid:
|
|
processed += len(batch)
|
|
continue
|
|
valid_texts = [t for _, t in valid]
|
|
embeddings = await tei_embed_batch(endpoint, valid_texts)
|
|
|
|
insert_rows = []
|
|
for (idx, t), emb in zip(valid, embeddings):
|
|
r = batch[idx]
|
|
insert_rows.append({
|
|
"doc_id": r[1], "chunk_index": r[2], "chunk_type": r[3],
|
|
"section_title": r[4], "heading_path": r[5], "page": r[6],
|
|
"language": r[7], "country": r[8], "source": r[9],
|
|
"domain_category": r[10], "text": t,
|
|
"text_canonical_hash": canonical_hash(t),
|
|
"embedding": str(emb),
|
|
})
|
|
|
|
await session.execute(text(f"""
|
|
INSERT INTO {table}
|
|
(doc_id, chunk_index, chunk_type, section_title, heading_path, page,
|
|
language, country, source, domain_category, text, text_canonical_hash, embedding)
|
|
VALUES
|
|
(:doc_id, :chunk_index, :chunk_type, :section_title, :heading_path, :page,
|
|
:language, :country, :source, :domain_category, :text, :text_canonical_hash,
|
|
CAST(:embedding AS vector))
|
|
ON CONFLICT (doc_id, chunk_index) DO NOTHING
|
|
"""), insert_rows)
|
|
await session.commit()
|
|
|
|
processed += len(batch)
|
|
elapsed = time.time() - start
|
|
rate = (processed - already_done) / elapsed if elapsed > 0 else 0
|
|
print(f"[reindex-cand-chunks] slug={slug} done={processed}/{total} rate={rate:.1f}/sec", flush=True)
|
|
|
|
print(f"[reindex-cand-chunks] slug={slug} COMPLETE total={total} elapsed={time.time()-start:.1f}s", flush=True)
|
|
|
|
|
|
async def main() -> None:
|
|
ap = argparse.ArgumentParser()
|
|
ap.add_argument("--slug", required=True)
|
|
ap.add_argument("--endpoint", required=True)
|
|
ap.add_argument("--snapshot-doc-id-max", type=int, required=True)
|
|
ap.add_argument("--snapshot-chunk-id-max", type=int, required=True)
|
|
ap.add_argument("--batch-size", type=int, default=8)
|
|
ap.add_argument("--documents-only", action="store_true")
|
|
ap.add_argument("--chunks-only", action="store_true")
|
|
args = ap.parse_args()
|
|
|
|
if not args.chunks_only:
|
|
await reindex_documents(args.slug, args.endpoint, args.snapshot_doc_id_max, args.batch_size)
|
|
if not args.documents_only:
|
|
await reindex_chunks(args.slug, args.endpoint, args.snapshot_chunk_id_max, args.batch_size)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|