From 2ca67daceab7c8ed8059dcbc832867e5ef35d62e Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Wed, 8 Apr 2026 13:02:23 +0900 Subject: [PATCH 1/4] feat(search): Phase 1.2-G hybrid retrieval (doc + chunks) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1.2-C 평가셋: chunks-only Recall 0.788 → 0.660 catastrophic. ivfflat probes 1 → 10 → 20 진단 결과 잔여 차이는 chunks vs docs embedding의 본질적 차이 (segment 매칭 vs 전체 본문 평균). 해결: doc + chunks hybrid retrieval (정석). 신규 구조: - search_vector(): 두 SQL을 asyncio.gather로 병렬 호출 - _search_vector_docs(): documents.embedding cosine top N (recall robust) - _search_vector_chunks(): document_chunks.embedding window partition (doc당 top 2 chunks, ivfflat top inner_k 후 ROW_NUMBER PARTITION) - _merge_doc_and_chunk_vectors(): 가중치 + dedup - chunk score * 1.2 (segment 매칭 더 정확) - doc score * 1.0 (recall 보완) - doc_id 기준 dedup, chunks 우선 데이터 흐름: 1. query embedding 1번 (bge-m3) 2. asyncio.gather([_docs_call(), _chunks_call()]) 3. _merge_doc_and_chunk_vectors → list[SearchResult] 4. compress_chunks_to_docs (그대로 사용) 5. fusion (그대로) 6. (Phase 1.3) chunks_by_doc 회수 → reranker 검증 게이트 (회복 목표): - Recall@10 ≥ 0.75 (baseline 0.788 - 0.04 이내) - unique_docs per query ≥ 8 - natural_language_ko Recall ≥ 0.65 - latency p95 < 250ms --- app/services/search/retrieval_service.py | 157 +++++++++++++++++++---- 1 file changed, 134 insertions(+), 23 deletions(-) diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py index e690ef6..8097a00 100644 --- a/app/services/search/retrieval_service.py +++ b/app/services/search/retrieval_service.py @@ -1,26 +1,37 @@ """검색 후보 수집 서비스 (Phase 1.2). -text(documents FTS + trigram) + vector(documents.embedding → chunks) 후보를 +text(documents FTS + trigram) + vector(documents.embedding + chunks.embedding hybrid) 후보를 SearchResult 리스트로 반환. Phase 1.1a: search.py의 _search_text/_search_vector를 이전 (ILIKE 그대로). Phase 1.2-B: ILIKE → trigram `%` + `similarity()`. ILIKE 풀 스캔 제거. -Phase 1.2-B 이후: vector retrieval을 document_chunks 테이블 기반으로 전환. +Phase 1.2-C: vector retrieval을 document_chunks 테이블로 전환 → catastrophic recall 손실. +Phase 1.2-G: doc + chunks hybrid retrieval 보강. + - documents.embedding (recall robust, 자연어 매칭 강함) + - document_chunks.embedding (precision, segment 매칭) + - 두 SQL 동시 호출 후 doc_id 기준 merge (chunk 가중치 1.2, doc 1.0) """ from __future__ import annotations +import asyncio from typing import TYPE_CHECKING from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker from ai.client import AIClient +from core.database import engine if TYPE_CHECKING: from api.search import SearchResult +# Hybrid merge 가중치 (1.2-G) +DOC_VECTOR_WEIGHT = 1.0 +CHUNK_VECTOR_WEIGHT = 1.2 + + async def search_text( session: AsyncSession, query: str, limit: int ) -> list["SearchResult"]: @@ -121,27 +132,27 @@ async def search_text( async def search_vector( session: AsyncSession, query: str, limit: int ) -> list["SearchResult"]: - """벡터 유사도 검색 — chunk-level + doc 다양성 보장 (Phase 1.2-C). + """Hybrid 벡터 검색 — doc + chunks 동시 retrieval (Phase 1.2-G). Phase 1.2-C 진단: - 단순 chunk top-N 가져오면 같은 doc의 여러 chunks가 상위에 몰려 - unique doc 다양성 붕괴 → recall 0.788 → 0.531 (catastrophic). + chunks-only는 segment 의미 손실로 자연어 query에서 catastrophic recall. + doc embedding은 전체 본문 평균 → recall robust. + → 두 retrieval 동시 사용이 정석. - 해결 (사용자 추천 C 방식): - Window function으로 doc_id 기준 PARTITION → 각 doc의 top 2 chunks만 반환. - raw_chunks(chunks_by_doc 보존)와 doc-level 압축 둘 다 만족. - - SQL 흐름: - 1. inner CTE: ivfflat 인덱스로 top-K chunks 빠르게 추출 - 2. ranked CTE: doc_id PARTITION 후 score 내림차순 ROW_NUMBER - 3. outer: rn <= 2 (doc당 max 2 chunks) + JOIN documents + 데이터 흐름: + 1. query embedding 1번 (bge-m3) + 2. asyncio.gather로 두 SQL 동시 호출: + - _search_vector_docs: documents.embedding cosine top N + - _search_vector_chunks: document_chunks.embedding window partition (doc당 top 2) + 3. _merge_doc_and_chunk_vectors로 가중치 + dedup: + - chunk score * 1.2 (precision) + - doc score * 1.0 (recall) + - doc_id 기준 dedup, chunks 우선 Returns: - list[SearchResult] — chunk-level, 각 doc 최대 2개. compress_chunks_to_docs로 - doc-level 압축 + chunks_by_doc 보존. + list[SearchResult] — doc_id 중복 제거됨. compress_chunks_to_docs는 그대로 동작. + chunks_by_doc은 search.py에서 group_by_doc으로 보존. """ - from api.search import SearchResult # 순환 import 회피 - try: client = AIClient() query_embedding = await client.embed(query) @@ -149,9 +160,71 @@ async def search_vector( except Exception: return [] - # ivfflat 인덱스로 top-K chunks 추출 후 doc 단위 partition - # inner_k = limit * 10 정도로 충분 unique doc 확보 (~30~50 docs) - inner_k = max(limit * 10, 200) + embedding_str = str(query_embedding) + + # 두 SQL 병렬 호출 — 각각 별도 session 사용 (asyncpg connection은 statement 단위 직렬) + Session = async_sessionmaker(engine) + + async def _docs_call() -> list["SearchResult"]: + async with Session() as s: + return await _search_vector_docs(s, embedding_str, limit * 4) + + async def _chunks_call() -> list["SearchResult"]: + async with Session() as s: + return await _search_vector_chunks(s, embedding_str, limit * 4) + + doc_results, chunk_results = await asyncio.gather(_docs_call(), _chunks_call()) + + return _merge_doc_and_chunk_vectors(doc_results, chunk_results) + + +async def _search_vector_docs( + session: AsyncSession, embedding_str: str, limit: int +) -> list["SearchResult"]: + """documents.embedding 직접 검색 — recall robust (자연어 매칭). + + chunks가 없는 doc도 매칭 가능. score는 cosine similarity (1 - distance). + chunk_id/chunk_index/section_title은 None. + """ + from api.search import SearchResult # 순환 import 회피 + + result = await session.execute( + text(""" + SELECT + id, + title, + ai_domain, + ai_summary, + file_format, + (1 - (embedding <=> cast(:embedding AS vector))) AS score, + left(extracted_text, 200) AS snippet, + 'vector_doc' AS match_reason, + NULL::bigint AS chunk_id, + NULL::integer AS chunk_index, + NULL::text AS section_title + FROM documents + WHERE embedding IS NOT NULL AND deleted_at IS NULL + ORDER BY embedding <=> cast(:embedding AS vector) + LIMIT :limit + """), + {"embedding": embedding_str, "limit": limit}, + ) + return [SearchResult(**row._mapping) for row in result] + + +async def _search_vector_chunks( + session: AsyncSession, embedding_str: str, limit: int +) -> list["SearchResult"]: + """document_chunks.embedding 검색 + window partition (doc당 top 2 chunks). + + SQL 흐름: + 1. inner CTE topk: ivfflat 인덱스로 top-K chunks 추출 + 2. ranked CTE: doc_id PARTITION + ROW_NUMBER (score 내림차순) + 3. outer: rn <= 2 (doc당 max 2 chunks) + JOIN documents + """ + from api.search import SearchResult # 순환 import 회피 + + inner_k = max(limit * 5, 500) result = await session.execute( text(""" WITH topk AS ( @@ -181,7 +254,7 @@ async def search_vector( d.file_format AS file_format, (1 - r.dist) AS score, left(r.text, 200) AS snippet, - 'vector' AS match_reason, + 'vector_chunk' AS match_reason, r.chunk_id AS chunk_id, r.chunk_index AS chunk_index, r.section_title AS section_title @@ -191,11 +264,49 @@ async def search_vector( ORDER BY r.dist LIMIT :limit """), - {"embedding": str(query_embedding), "inner_k": inner_k, "limit": limit * 4}, + {"embedding": embedding_str, "inner_k": inner_k, "limit": limit}, ) return [SearchResult(**row._mapping) for row in result] +def _merge_doc_and_chunk_vectors( + doc_results: list["SearchResult"], + chunk_results: list["SearchResult"], +) -> list["SearchResult"]: + """doc + chunks vector 결과 merge (Phase 1.2-G). + + 가중치: + - chunk score * 1.2 (segment 매칭이 더 정확) + - doc score * 1.0 (전체 본문 평균, recall 보완) + + Dedup: + - doc_id 기준 + - chunks가 있으면 chunks 우선 (segment 정보 + chunk_id 보존) + - chunks에 없는 doc은 doc-wrap으로 추가 + + Returns: + score 내림차순 정렬된 SearchResult 리스트. + chunk_id가 None이면 doc-wrap 결과(text-only 매치 doc 처리에 사용). + """ + by_doc_id: dict[int, "SearchResult"] = {} + + # chunks 먼저 (가중치 적용 + chunk_id 보존) + for c in chunk_results: + c.score = c.score * CHUNK_VECTOR_WEIGHT + prev = by_doc_id.get(c.id) + if prev is None or c.score > prev.score: + by_doc_id[c.id] = c + + # doc 매치는 chunks에 없는 doc만 추가 (chunks 우선 원칙) + for d in doc_results: + d.score = d.score * DOC_VECTOR_WEIGHT + if d.id not in by_doc_id: + by_doc_id[d.id] = d + + # score 내림차순 정렬 + return sorted(by_doc_id.values(), key=lambda r: r.score, reverse=True) + + def compress_chunks_to_docs( chunks: list["SearchResult"], limit: int ) -> tuple[list["SearchResult"], dict[int, list["SearchResult"]]]: From 25ef3996ecce8b6f4bac5de06ba028f83e2f476a Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Wed, 8 Apr 2026 13:08:23 +0900 Subject: [PATCH 2/4] =?UTF-8?q?feat(chunk):=20Phase=201.2-G=20embedding=20?= =?UTF-8?q?=EC=9E=85=EB=A0=A5=20=EA=B0=95=ED=99=94=20(title=20+=20section?= =?UTF-8?q?=20+=20text)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1.2-G hybrid retrieval 측정 결과 Recall 0.66 정체 + 진단: 직접 nl 쿼리 시도 결과 일부 정답 doc(3854, 3981, 3982, 3920, 3921)이 top-100에도 못 들어옴. doc은 corpus + chunks + embedding 모두 정상. 진짜 원인: 자연어 query ↔ 법령 조항 의미 거리 + 짧은 본문 embedding signal 약함. - query: '유해화학물질을 다루는 회사가 지켜야 할 안전 의무' - 본문: '화학물질관리법 제4장 유해화학물질 영업자' - bge-m3 입장: chunk text만으로는 같은 의미인지 못 알아봄 해결: chunks embedding 입력에 doc.title + section_title 포함. - before: embed(c['text']) - after: embed('[제목] {title}\n[섹션] {section}\n[본문] {text}') 기대 효과: - 짧은 조항 문서 매칭 회복 (3920/3921 등 300자대) - 자연어 query → 법령 조항 의미 매칭 개선 - Recall 0.66 → 0.72~0.78 영향: chunks embedding 차원/구조 변경 X — 입력 텍스트 prefix만 다름. 재인덱싱 1회로 모든 chunks 재생성 필요. --- app/workers/chunk_worker.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/app/workers/chunk_worker.py b/app/workers/chunk_worker.py index a9f1baf..f6ff953 100644 --- a/app/workers/chunk_worker.py +++ b/app/workers/chunk_worker.py @@ -313,8 +313,16 @@ async def process(document_id: int, session: AsyncSession) -> None: client = AIClient() try: for idx, c in enumerate(chunk_dicts): + # Phase 1.2-G: embedding 입력 강화 (자연어 query ↔ 법령 조항 의미 매칭 개선) + # 짧은 본문이나 segment-only chunk는 임베딩 signal이 약함 → title/section 포함. + section = c.get("section_title") or "" + embed_input = ( + f"[제목] {doc.title or ''}\n" + f"[섹션] {section}\n" + f"[본문] {c['text']}" + ) try: - embedding = await client.embed(c["text"]) + embedding = await client.embed(embed_input) except Exception as e: logger.warning(f"[chunk] document_id={document_id} chunk {idx} 임베딩 실패: {e}") embedding = None From e0f928f429565a011ea44cf1e694490ca7c3a964 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Wed, 8 Apr 2026 13:16:37 +0900 Subject: [PATCH 3/4] =?UTF-8?q?feat(deploy):=20Phase=201.3=20reranker=20(T?= =?UTF-8?q?EI=20bge-reranker-v2-m3)=20=EC=84=9C=EB=B9=84=EC=8A=A4=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit docker-compose.yml에 reranker 서비스 추가: - image: ghcr.io/huggingface/text-embeddings-inference:1.5 - MODEL_ID=BAAI/bge-reranker-v2-m3 - MAX_BATCH_TOKENS=8192, MAX_CONCURRENT_REQUESTS=4 - GPU 1개 할당 (RTX 4070 Ti Super, CUDA 13.0) - expose 80만 (host 노출 X, internal network 전용) - reranker_cache volume으로 모델 영속화 - fastapi가 depends_on 안 함 → 단독 시작 가능, reranker 없어도 fastapi 동작 (rerank_service가 RRF fallback) 다음 단계: - GPU에서 docker pull로 호환성 확인 - docker compose up -d reranker → warmup - config.yaml의 rerank.endpoint를 http://reranker:80/rerank로 갱신 (GPU 직접) - fastapi rebuild + 평가셋 측정 (rerank=true) --- docker-compose.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index b55599a..cf4e1f3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,6 +45,28 @@ services: - "127.0.0.1:11434:11434" restart: unless-stopped + # Phase 1.3: bge-reranker-v2-m3 (TEI) — internal only, fastapi에서 reranker:80으로 호출 + # fastapi가 depends_on 안 함 → 단독 시작 가능, 없어도 fastapi 동작 (rerank=false fallback) + reranker: + image: ghcr.io/huggingface/text-embeddings-inference:1.5 + container_name: hyungi_document_server-reranker-1 + expose: + - "80" + environment: + - MODEL_ID=BAAI/bge-reranker-v2-m3 + - MAX_BATCH_TOKENS=8192 + - MAX_CONCURRENT_REQUESTS=4 + volumes: + - reranker_cache:/data + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + restart: unless-stopped + ai-gateway: build: ./gpu-server/services/ai-gateway ports: @@ -103,3 +125,4 @@ volumes: pgdata: caddy_data: ollama_data: + reranker_cache: From 3bf619333775ce7620d29347546281e8e0df5e2b Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Wed, 8 Apr 2026 13:18:37 +0900 Subject: [PATCH 4/4] =?UTF-8?q?fix(deploy):=20TEI=201.5=20=E2=86=92=201.7?= =?UTF-8?q?=20(1.5=EB=8A=94=20reranker=20=EB=AA=A8=EB=8D=B8=20=EB=8B=A4?= =?UTF-8?q?=EC=9A=B4=EB=A1=9C=EB=93=9C=20=EB=B2=84=EA=B7=B8)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TEI 1.5 첫 시도 시 'builder error: relative URL without a base' 에러로 BAAI/bge-reranker-v2-m3 metadata 다운로드 실패. TEI 1.5의 알려진 버그. 해결: TEI 1.7로 업그레이드 (sequence-classification reranker 모델 지원 개선). --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index cf4e1f3..99deb58 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -48,7 +48,7 @@ services: # Phase 1.3: bge-reranker-v2-m3 (TEI) — internal only, fastapi에서 reranker:80으로 호출 # fastapi가 depends_on 안 함 → 단독 시작 가능, 없어도 fastapi 동작 (rerank=false fallback) reranker: - image: ghcr.io/huggingface/text-embeddings-inference:1.5 + image: ghcr.io/huggingface/text-embeddings-inference:1.7 container_name: hyungi_document_server-reranker-1 expose: - "80"