From 51a6f7c9af325967f772591a46042f097d139661 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Mon, 20 Apr 2026 15:04:39 +0900 Subject: [PATCH] =?UTF-8?q?feat(eval):=20=EB=B0=9C=EC=A3=BC=EA=B1=B4=20?= =?UTF-8?q?=EB=8B=A8=EC=9C=84=20baseline=20=ED=8F=89=EA=B0=80=20=EA=B2=BD?= =?UTF-8?q?=EB=A1=9C=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - run_eval.py: --queries-order / --order-groups / --output-order / --debug 옵션 추가. 기존 legacy CSV 스키마/값 불변 (출력 소비자 보호). - Tier 1A/1B/2 지표 구현: cross_format_link_success (top-10 공식 + top-5 보조, eligible/success 분수), top_5_document_match (guardrail + 절대 건수), manual_refind_flag (v0 heuristic), chunk_idx_stddev, range/page_citation_available capability flags. - order_groups.yaml: 발주건 3건 매핑 (TKP-26-0114/0132/0112, 10 docs). - queries_order_baseline.yaml: 12개 질문 (A:4 B:4 C:3 D:1). plan: ~/.claude/plans/merry-yawning-owl.md --- tests/search_eval/order_groups.yaml | 64 ++ tests/search_eval/queries_order_baseline.yaml | 266 +++++++ tests/search_eval/run_eval.py | 737 +++++++++++++++++- 3 files changed, 1027 insertions(+), 40 deletions(-) create mode 100644 tests/search_eval/order_groups.yaml create mode 100644 tests/search_eval/queries_order_baseline.yaml diff --git a/tests/search_eval/order_groups.yaml b/tests/search_eval/order_groups.yaml new file mode 100644 index 0000000..a0076aa --- /dev/null +++ b/tests/search_eval/order_groups.yaml @@ -0,0 +1,64 @@ +version: "0.1" +created_at: "2026-04-20" +notes: | + Phase 0 발주건 단위 baseline 전용 그룹 매핑. DB 스키마 건드리지 않고 가설적 grouping. + Phase 1A에서 구조화 추출 결과 본 뒤에야 정식 domain 모델 결정. + + role enum: + - order_xlsx: 발주서 원본 xlsx + - order_pdf: 발주서 PDF 변환본 (xlsx에서 출력) + - invoice: 전자세금계산서 PDF (세무 증빙) + - statement: 거래명세표/명세서 PDF (출고·배송 증빙, 선택) + + 한 발주건에 invoice/statement 둘 다 있거나, 하나만 있거나, 둘 다 없을 수 있음. 
+ + 샘플 소스: ~/Desktop/3월지급 → NAS /volume4/Document_Server/PKM/Inbox/발주/2026-03/ + (2026-04-20 ingest 완료, 9 파일 / 3 발주건) + + 제외된 파일: + - 신양철강 74,290원.pdf (TKP-26-0132 invoice 중복 사본, 같은 세금계산서의 공급받는자 보관용) + - 260220-테크니컬코리아 - 복사본.jpg (이미지, OCR 필요 — Phase 0 범위 밖) + - 면취날 연마.pdf (DCS엔지 별도 PO의 invoice, TKP-26-0112와 무관) + + 2026-04-20 추가: TKP-26-0112의 invoice(TrusBill print.pdf, doc_id 8944) 누락 발견 후 ingest. + 초기엔 "invoice 없는 케이스"로 잡았으나 파일명이 invoice를 암시 안 해서 놓침 (실제로는 존재). + + doc_id 조회 쿼리: + docker exec hyungi_document_server-postgres-1 psql -U pkm -d pkm -c \ + "SELECT id, file_path, length(extracted_text) AS tlen, + (SELECT count(*) FROM document_chunks WHERE doc_id = d.id) AS chunks + FROM documents d + WHERE file_path LIKE '%Inbox/발주/2026-03/%' ORDER BY id;" + +groups: + - order_group_id: TKP-26-0114 + description: "대연기업 845,000원 (2026-02, 발주 + 세금계산서)" + docs: + - doc_id: 8853 + role: order_xlsx + - doc_id: 8854 + role: order_pdf + - doc_id: 8855 + role: invoice # "2월 26일.pdf" (전자세금계산서) + + - order_group_id: TKP-26-0132 + description: "신양철강 74,290원 (2026-03, 발주 + 세금계산서 + 거래명세표, 가장 풍성한 세트)" + docs: + - doc_id: 8856 + role: order_xlsx + - doc_id: 8857 + role: order_pdf + - doc_id: 8858 + role: invoice # "전자세금계산서.pdf" + - doc_id: 8859 + role: statement # "테크니컬 3-3일자 명세서(출고완료건).pdf" + + - order_group_id: TKP-26-0112 + description: "신양철강 650,000원 (2026-02, plate 절단 1건, statement 없음)" + docs: + - doc_id: 8851 + role: order_xlsx + - doc_id: 8852 + role: order_pdf + - doc_id: 8944 + role: invoice # "TrusBill print.pdf" (전자세금계산서, 공급가액 650,000) diff --git a/tests/search_eval/queries_order_baseline.yaml b/tests/search_eval/queries_order_baseline.yaml new file mode 100644 index 0000000..0e911f3 --- /dev/null +++ b/tests/search_eval/queries_order_baseline.yaml @@ -0,0 +1,266 @@ +version: "0.1" +created_at: "2026-04-20" +notes: | + Phase 0 발주건 단위 baseline 질문셋. 12개, 분포 A:B:C:D = 4:4:3:1. + + 카테고리: + A. 발주서 내부 찾기 (품목명/수량·단가·금액/납기/공급처) + B. 발주서 PDF 변환본 대응 (xlsx↔PDF 일치, PDF 페이지 위치) + C. 
세금계산서/명세표 연결 (대응 invoice/statement 존재, 금액 일치, 발주건 식별) + D. 포맷 간 일치성 (핵심 필드 일치 or PDF만으로 발주건 식별) + + expected_locations 스키마: + - doc_id : order_groups.yaml의 doc_id + - role : order_xlsx | order_pdf | invoice | statement + - location_type : sheet_range | page | document_only + - location_value : xlsx는 A1 notation 예) "발주서!F9", PDF는 "p1" + document_only일 때는 null + - is_primary : 해당 role이 이 질문의 1차 정답 포맷인가 + + document_only 허용 규칙: + - 원칙: 가능한 경우 sheet_range 또는 page로 라벨링 + - 예외: 근거 위치를 안정적으로 특정할 수 없을 때 또는 여러 문서 간 "일치성" 자체가 질문일 때 + - 규칙: document_only 사용 시 notes 필드에 이유 필수 기재 + + 2026-04-20 초안 (Claude 자동 draft, 사용자 검수 대기): + - 발주서 xlsx의 핵심 셀 매핑 (전 발주건 공통): + D5 ORDER DATE, D6 PO NO, F9 공급처, F11 주소, F13 전화, W9 납기일, + W12 프로젝트, W13 담당자, B17~ 품목 행, V17~ 수량, X17~ 단가, AB17~ 금액, AB23 TOTAL + - PDF는 모두 1페이지 분량 → p1 고정 + - 실제 오라벨 가능성 있으니 사용자 확인 필요 (sheet range 특히) + +questions: + # ───────────────────────────── + # A. 발주서 내부 찾기 (4) + # ───────────────────────────── + - id: Q-A-001 + query: "TKP-26-0114 발주의 공급처는 어디인가?" + category: A + order_group_id: TKP-26-0114 + intent: fact_lookup + expected_locations: + - doc_id: 8853 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!F9" + is_primary: true + - doc_id: 8854 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: (주)대연기업. xlsx F9." + + - id: Q-A-002 + query: "TKP-26-0132 발주의 납기일은?" + category: A + order_group_id: TKP-26-0132 + intent: fact_lookup + expected_locations: + - doc_id: 8856 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!W9" + is_primary: true + - doc_id: 8857 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: 2026-02-23. xlsx W9." + + - id: Q-A-003 + query: "TKP-26-0112 plate 절단 단가는 얼마인가?" 
+ category: A + order_group_id: TKP-26-0112 + intent: fact_lookup + expected_locations: + - doc_id: 8851 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!X17" + is_primary: true + - doc_id: 8852 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: 650,000원 (1개 항목). xlsx X17." + + - id: Q-A-004 + query: "TKP-26-0114 발주의 총 금액은?" + category: A + order_group_id: TKP-26-0114 + intent: fact_lookup + expected_locations: + - doc_id: 8853 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!AB23" + is_primary: true + - doc_id: 8854 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: 845,000원 (부가세 별도). xlsx AB23." + + # ───────────────────────────── + # B. 발주서 PDF 변환본 대응 (4) + # ───────────────────────────── + - id: Q-B-001 + query: "TKP-26-0114에서 2:1 HEAD SA516-70 품목의 수량은?" + category: B + order_group_id: TKP-26-0114 + intent: fact_lookup + expected_locations: + - doc_id: 8853 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!V17" + is_primary: true + - doc_id: 8854 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: 2 EA. xlsx V17 / PDF p1 품목표. B 카테고리 = xlsx↔PDF 대응 확인." + + - id: Q-B-002 + query: "TKP-26-0132 발주서 총액은 얼마이고 PDF 변환본에서도 확인 가능한가?" + category: B + order_group_id: TKP-26-0132 + intent: fact_lookup + expected_locations: + - doc_id: 8856 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!AB20" + is_primary: true + - doc_id: 8857 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: 74,290원. xlsx AB20 / PDF p1 TOTAL. 두 포맷 모두 일치해야 함." 
+ + - id: Q-B-003 + query: "TKP-26-0112 PO 번호를 PDF 변환본에서 확인" + category: B + order_group_id: TKP-26-0112 + intent: fact_lookup + expected_locations: + - doc_id: 8852 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: true + - doc_id: 8851 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!D6" + is_primary: false + notes: "정답: TKP-26-0112. PDF만으로도 식별 가능한지 확인 (primary=order_pdf)." + + - id: Q-B-004 + query: "TKP-26-0114 발주서 담당자는 누구인가?" + category: B + order_group_id: TKP-26-0114 + intent: fact_lookup + expected_locations: + - doc_id: 8853 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!W13" + is_primary: true + - doc_id: 8854 + role: order_pdf + location_type: page + location_value: "p1" + is_primary: false + notes: "정답: 안현기(Hyunki,Ahn). xlsx W13 (PREPAIRED BY) / PDF p1." + + # ───────────────────────────── + # C. 세금계산서/명세표 연결 (3) + # ───────────────────────────── + - id: Q-C-001 + query: "TKP-26-0132 세금계산서의 공급가액은?" + category: C + order_group_id: TKP-26-0132 + intent: fact_lookup + expected_locations: + - doc_id: 8858 + role: invoice + location_type: page + location_value: "p1" + is_primary: true + notes: "정답: 74,290원. invoice p1 공급가액 칸. 발주서 총액과 일치 확인용." + + - id: Q-C-002 + query: "TKP-26-0114 발주금액과 세금계산서 공급가액이 일치하는가?" + category: C + order_group_id: TKP-26-0114 + intent: comparison + expected_locations: + - doc_id: 8855 + role: invoice + location_type: page + location_value: "p1" + is_primary: true + - doc_id: 8853 + role: order_xlsx + location_type: sheet_range + location_value: "발주서!AB23" + is_primary: false + notes: | + 정답: 일치 (양쪽 845,000원). invoice 공급가액 + xlsx TOTAL 비교 필요 — + cross-format retrieval 특성 강조. C 카테고리지만 order_xlsx도 근거로 필요. + + - id: Q-C-003 + query: "TKP-26-0132 거래명세표에 기재된 품목은 무엇인가?" 
+ category: C + order_group_id: TKP-26-0132 + intent: fact_lookup + expected_locations: + - doc_id: 8859 + role: statement + location_type: page + location_value: "p1" + is_primary: true + notes: | + 정답: "레이져 A516-70 가공비 12t x 1197 x 1197" 외 1건. + statement p1 품목 테이블. + + # ───────────────────────────── + # D. 포맷 간 일치성 (1) + # ───────────────────────────── + - id: Q-D-001 + query: "TKP-26-0132 발주번호가 발주서·PDF·세금계산서·거래명세표 4개 문서에 모두 나오는가?" + category: D + order_group_id: TKP-26-0132 + intent: comparison + expected_locations: + - doc_id: 8856 + role: order_xlsx + location_type: document_only + location_value: null + is_primary: true + - doc_id: 8857 + role: order_pdf + location_type: document_only + location_value: null + is_primary: false + - doc_id: 8858 + role: invoice + location_type: document_only + location_value: null + is_primary: false + - doc_id: 8859 + role: statement + location_type: document_only + location_value: null + is_primary: false + notes: | + 정답: 모두 나옴 — order_xlsx(D6), order_pdf(PO NO), invoice(<안현기님-TKP-26-0132>), + statement(<안현기님-TKP-26-0132>). D 카테고리는 "여러 문서 간 일치성" 자체가 + 질문이라 document_only 사용 — 위치보다 "같은 발주건에 속하는가"가 본질. diff --git a/tests/search_eval/run_eval.py b/tests/search_eval/run_eval.py index e506248..3e80cb4 100644 --- a/tests/search_eval/run_eval.py +++ b/tests/search_eval/run_eval.py @@ -7,6 +7,11 @@ Recall@10, MRR@10, NDCG@10, Top3 hit-rate, Latency p50/p95를 계산한다. A/B 비교 모드: --baseline-url, --candidate-url 를 각각 지정하면 두 엔드포인트에 동일 쿼리셋을 던지고 결과를 비교한다. +발주건 단위 baseline 모드 (Phase 0 / plan: merry-yawning-owl): + --queries-order + --order-groups + --output-order 로 xlsx/PDF 구조화 추출 + gap 측정용 Tier 1A/1B/2 지표를 계산한다. 기존 --queries 경로와 CSV + 스키마는 변경되지 않는다 (출력 소비자 보호). 
@dataclass
class OrderGroupDoc:
    """One document that belongs to an order group.

    Instances are built from the ``docs`` entries of the order-groups YAML.
    """

    # Document id; compared against the ``id`` of each search result.
    doc_id: int
    # Format/purpose of the document inside the order group. The values used
    # by order_groups.yaml are: order_xlsx | order_pdf | invoice | statement.
    role: str
def cross_format_link_success(
    returned_ids: "list[int]",
    expected_locations: "list[ExpectedLocation]",
    group: "OrderGroup",
    k: int,
) -> "tuple[bool, bool]":
    """Official Tier 1A metric. Returns ``(success, eligible)``.

    Eligible: the order group holds at least two distinct roles, i.e. there
    is a cross-format link worth measuring at all.

    Success requires both of the following within the top-k results:
      1. a document of this group whose role is one of the ``is_primary``
         expected roles, and
      2. a document of this group with any other role.
    Only documents listed in ``group`` are considered, so both hits always
    belong to the same order group. Merely checking for "two doc ids with
    different roles" would produce false positives and is deliberately
    not used.

    When no expected location is marked primary the query stays eligible
    but can never succeed: ``(False, True)``.
    """
    group_roles = group.roles_set()
    if len(group_roles) < 2:
        return False, False  # ineligible

    primary_roles = {loc.role for loc in expected_locations if loc.is_primary}
    if not primary_roles:
        return False, True

    # Roles of this group's documents that actually appear in the top-k window.
    window = set(returned_ids[:k])
    roles_hit = {doc.role for doc in group.docs if doc.doc_id in window}

    primary_hit = not roles_hit.isdisjoint(primary_roles)
    other_hit = bool(roles_hit - primary_roles)
    return (primary_hit and other_hit), True
+ """ + if not returned_results: + return True + top_1 = returned_results[0] + score = top_1.get("score", 0.0) + if score is None: + score = 0.0 + if score >= score_threshold: + return False + snippet = (top_1.get("snippet") or "").lower() + title = (top_1.get("title") or "").lower() + haystack = f"{title} {snippet}" + tokens = _tokenize_query(query_text) + if not tokens: + return False + has_any_token = any(t in haystack for t in tokens) + return not has_any_token + + +def _chunk_idx_stddev_top10(returned_results: list[dict]) -> float | None: + """Top-10의 chunk_index 분산 (낮을수록 한 섹션에 몰림). Observational only.""" + idxs = [r.get("chunk_index") for r in returned_results[:10]] + vals = [i for i in idxs if isinstance(i, int)] + if len(vals) < 2: + return None + return statistics.stdev(vals) + + +def _matched_location_value( + returned_results: list[dict], + expected_locations: list[ExpectedLocation], +) -> str | None: + """Tier 2 matched_location: 현재 API는 location 필드를 노출하지 않으므로 + baseline에선 항상 None. Phase 1A/1B 구현 이후 값이 채워진다. + """ + # 현재 API 응답에 location 정보 없음 → 항상 None + # Phase 1A/1B 구현 후 r.get("cell_range") / r.get("page") 체크로 확장 + return None + + +# ───────────────────────────────────────────────────────── +# 발주건 API 호출 (full result dict 반환) +# ───────────────────────────────────────────────────────── + + +async def call_search_full( + client: httpx.AsyncClient, + base_url: str, + token: str, + query: str, + mode: str = "hybrid", + limit: int = 20, + fusion: str | None = None, + rerank: str | None = None, + analyze: str | None = None, + debug: bool = False, +) -> tuple[list[dict], float]: + """call_search와 동일 로직. 
def _failed_order_result(query: "OrderQuery", error: str) -> "OrderQueryResult":
    """Build the all-negative placeholder row for a query that could not be evaluated."""
    return OrderQueryResult(
        query=query,
        returned_results=[],
        latency_ms=0.0,
        doc_match_top5=False,
        cross_format_eligible=False,
        cross_format_link_success_top10=False,
        cross_format_link_success_top5=False,
        range_citation_available=False,
        page_citation_available=False,
        matched_location_value=None,
        manual_refind_flag=True,
        chunk_idx_stddev_top10=None,
        error=error,
    )


async def evaluate_orders(
    queries: "list[OrderQuery]",
    groups: "dict[str, OrderGroup]",
    base_url: str,
    token: str,
    mode: str = "hybrid",
    fusion: str | None = None,
    rerank: str | None = None,
    analyze: str | None = None,
    debug: bool = False,
) -> "list[OrderQueryResult]":
    """Evaluate the order-unit query set against one search endpoint.

    Queries are sent one at a time over a single shared HTTP client. For each
    query the Tier 1A / 1B / 2 metrics are computed from the returned results.

    Args:
        queries: Order-unit queries to run.
        groups: Order groups keyed by ``order_group_id``.
        base_url: Base URL of the search service.
        token: Bearer token passed to the search call.
        mode, fusion, rerank, analyze, debug: Forwarded unchanged to
            ``call_search_full``.

    Returns:
        One ``OrderQueryResult`` per input query, in input order. A query
        whose ``order_group_id`` is unknown, or whose evaluation raised, is
        recorded as an all-negative row with ``error`` set instead of
        aborting the run.
    """
    results = []
    async with httpx.AsyncClient() as client:
        for q in queries:
            group = groups.get(q.order_group_id)
            if group is None:
                results.append(
                    _failed_order_result(q, f"unknown order_group_id={q.order_group_id}")
                )
                continue

            try:
                returned, latency_ms = await call_search_full(
                    client, base_url, token, q.query,
                    mode=mode, fusion=fusion, rerank=rerank, analyze=analyze, debug=debug,
                )
                returned_ids = [r["id"] for r in returned]
                expected_ids = _collect_expected_doc_ids(q.expected_locations)

                # Eligibility depends only on the group, not on k, so the
                # top-10 flag is reused for the top-5 variant.
                cf10, eligible = cross_format_link_success(
                    returned_ids, q.expected_locations, group, 10
                )
                cf5, _ = cross_format_link_success(
                    returned_ids, q.expected_locations, group, 5
                )

                result = OrderQueryResult(
                    query=q,
                    returned_results=returned,
                    latency_ms=latency_ms,
                    doc_match_top5=doc_match_at_k(returned_ids, expected_ids, 5),
                    cross_format_eligible=eligible,
                    cross_format_link_success_top10=cf10 if eligible else False,
                    cross_format_link_success_top5=cf5 if eligible else False,
                    range_citation_available=range_citation_available(returned),
                    page_citation_available=page_citation_available(returned),
                    matched_location_value=_matched_location_value(returned, q.expected_locations),
                    manual_refind_flag=manual_refind_flag_v0(returned, q.query),
                    chunk_idx_stddev_top10=_chunk_idx_stddev_top10(returned),
                )
            except Exception as exc:
                # Deliberate per-query boundary: one failing request or a
                # malformed response must not abort the whole evaluation.
                # The failure is kept in the row's ``error`` field.
                result = _failed_order_result(q, str(exc))
            results.append(result)
    return results
집계 dict 반환.""" + n = len(results) + if n == 0: + return {} + + # Tier 1A + doc_match_count = sum(1 for r in results if r.doc_match_top5) + eligible_results = [r for r in results if r.cross_format_eligible] + cf10_success = sum(1 for r in eligible_results if r.cross_format_link_success_top10) + cf5_success = sum(1 for r in eligible_results if r.cross_format_link_success_top5) + + # Tier 1B + refind_flag_count = sum(1 for r in results if r.manual_refind_flag) + stddev_values = [r.chunk_idx_stddev_top10 for r in results if r.chunk_idx_stddev_top10 is not None] + avg_stddev = statistics.mean(stddev_values) if stddev_values else None + + # Tier 2 + range_avail_count = sum(1 for r in results if r.range_citation_available) + page_avail_count = sum(1 for r in results if r.page_citation_available) + + # Latency + latencies = [r.latency_ms for r in results if r.latency_ms > 0] + p50 = percentile(latencies, 0.50) + p95 = percentile(latencies, 0.95) + + print(f"\n=== Order-unit baseline (n={n}) ===") + print(" Tier 1A (gate 후보 / guardrail):") + print( + f" top_5_document_match_rate : {doc_match_count}/{n}" + f" ({doc_match_count / n:.1%}) — Guardrail, 비악화 강제" + ) + if eligible_results: + print( + f" cross_format_link top-10 : {cf10_success}/{len(eligible_results)}" + f" ({cf10_success / len(eligible_results):.1%}) [공식 gate 후보]" + ) + print( + f" cross_format_link top-5 : {cf5_success}/{len(eligible_results)}" + f" ({cf5_success / len(eligible_results):.1%}) [보조 관찰]" + ) + else: + print(" cross_format_link : no eligible queries (group roles<2)") + + print(" Tier 1B (관찰용):") + print( + f" manual_refind_flag (v0) : {refind_flag_count}/{n}" + f" ({refind_flag_count / n:.1%}) — heuristic, 수동 교차검증 필수" + ) + if avg_stddev is not None: + print(f" chunk_idx_stddev_top10 (mean) : {avg_stddev:.2f}") + + print(" Tier 2 (auto-eval 기준, 현재 시스템 baseline = 0):") + print( + f" range_citation_available : {range_avail_count}/{n}" + f" ({range_avail_count / n:.1%})" + ) + print( + f" 
page_citation_available : {page_avail_count}/{n}" + f" ({page_avail_count / n:.1%})" + ) + + print(f" Latency p50 / p95 : {p50:.0f} / {p95:.0f} ms") + + # 카테고리별 rollup + by_cat: dict[str, list[OrderQueryResult]] = {} + for r in results: + by_cat.setdefault(r.query.category, []).append(r) + print(" by category (A/B/C/D):") + for cat in sorted(by_cat.keys()): + items = by_cat[cat] + cat_doc = sum(1 for r in items if r.doc_match_top5) + cat_cf_eligible = [r for r in items if r.cross_format_eligible] + cat_cf10 = sum(1 for r in cat_cf_eligible if r.cross_format_link_success_top10) + cf_str = ( + f"cf10 {cat_cf10}/{len(cat_cf_eligible)}" + if cat_cf_eligible else "cf10 n/a" + ) + print(f" {cat} n={len(items):>2} doc_match {cat_doc}/{len(items)} {cf_str}") + + # 발주건별 rollup + by_group: dict[str, list[OrderQueryResult]] = {} + for r in results: + by_group.setdefault(r.query.order_group_id, []).append(r) + print(" by order_group:") + for gid in sorted(by_group.keys()): + items = by_group[gid] + g_doc = sum(1 for r in items if r.doc_match_top5) + print(f" {gid} n={len(items):>2} doc_match {g_doc}/{len(items)}") + + # 에러 + errors = [r for r in results if r.error] + if errors: + print(f" ERRORS ({len(errors)}):") + for r in errors: + print(f" [{r.query.id}] {r.error}") + + return { + "n": n, + "doc_match_top5": (doc_match_count, n), + "cross_format_link_top10": (cf10_success, len(eligible_results)), + "cross_format_link_top5": (cf5_success, len(eligible_results)), + "manual_refind_flag": (refind_flag_count, n), + "range_citation_available": (range_avail_count, n), + "page_citation_available": (page_avail_count, n), + "latency_p50": p50, + "latency_p95": p95, + } + + +def write_order_csv(results: list[OrderQueryResult], output_path: Path) -> None: + """발주건 baseline 전용 CSV. 
기존 write_csv와 분리 — 스키마 간섭 없음.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + columns = [ + "id", + "query", + "category_abcd", + "order_group_id", + "intent", + "expected_doc_ids", + "expected_roles", + "expected_location_type", + "expected_location_value", + "returned_ids_top10", + "latency_ms", + "doc_match_top5", + "cross_format_eligible", + "cross_format_link_success_top10", + "cross_format_link_success_top5", + "range_citation_available", + "page_citation_available", + "matched_location_value", + "manual_refind_flag", + "chunk_idx_stddev_top10", + "notes", + "error", + ] + with output_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(columns) + for r in results: + returned_ids = [item["id"] for item in r.returned_results[:10]] + # primary 우선으로 location_type/value 선택 + primary_locs = [loc for loc in r.query.expected_locations if loc.is_primary] + repr_loc = primary_locs[0] if primary_locs else ( + r.query.expected_locations[0] if r.query.expected_locations else None + ) + writer.writerow( + [ + r.query.id, + r.query.query, + r.query.category, + r.query.order_group_id, + r.query.intent, + ";".join(str(loc.doc_id) for loc in r.query.expected_locations), + ";".join(loc.role for loc in r.query.expected_locations), + repr_loc.location_type if repr_loc else "", + repr_loc.location_value if repr_loc and repr_loc.location_value else "", + ";".join(map(str, returned_ids)), + f"{r.latency_ms:.1f}", + "1" if r.doc_match_top5 else "0", + "1" if r.cross_format_eligible else "0", + "1" if r.cross_format_link_success_top10 else "0", + "1" if r.cross_format_link_success_top5 else "0", + "1" if r.range_citation_available else "0", + "1" if r.page_citation_available else "0", + r.matched_location_value or "", + "1" if r.manual_refind_flag else "0", + f"{r.chunk_idx_stddev_top10:.2f}" if r.chunk_idx_stddev_top10 is not None else "", + r.query.notes, + r.error or "", + ] + ) + print(f"\nOrder baseline CSV written: 
def load_order_groups(yaml_path: Path) -> "dict[str, OrderGroup]":
    """Load the order-group mapping YAML into ``OrderGroup`` objects.

    Args:
        yaml_path: Path to the order-groups YAML file (read as UTF-8).

    Returns:
        Groups keyed by ``order_group_id``. An empty file, or a missing or
        null ``groups`` / ``docs`` key, yields an empty mapping / an empty
        document list instead of raising. If an ``order_group_id`` is
        repeated, the last entry wins.

    Raises:
        KeyError: A group has no ``order_group_id``, or a doc entry has no
            ``doc_id`` / ``role``.
        ValueError, TypeError: A ``doc_id`` cannot be converted to ``int``.
    """
    with yaml_path.open(encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    groups = {}
    # ``or []`` also covers keys that are present but null (e.g. ``docs:``),
    # matching how load_order_queries treats ``expected_locations``.
    for g in data.get("groups") or []:
        docs = [
            OrderGroupDoc(doc_id=int(d["doc_id"]), role=d["role"])
            for d in g.get("docs") or []
        ]
        groups[g["order_group_id"]] = OrderGroup(
            order_group_id=g["order_group_id"],
            description=g.get("description", "") or "",
            docs=docs,
        )
    return groups
baseline 전용 CSV 출력 경로 (legacy --output과 분리)", + ) + parser.add_argument( + "--debug", + action="store_true", + help="검색 API debug=true 요청 (발주건 모드에서 응답 검증용)", + ) args = parser.parse_args() if not args.token: @@ -451,8 +1059,32 @@ def main() -> int: ) return 2 - queries = load_queries(args.queries) - print(f"Loaded {len(queries)} queries from {args.queries}") + # 발주건 단위 baseline 모드 (Phase 0 / plan: merry-yawning-owl) + run_order_mode = args.queries_order is not None + # Legacy 경로 실행 조건: order-only 실행이 아닐 때 (= --queries-order + --output-order만 단독으로 + # 준 경우는 skip). --output / --baseline-url / --candidate-url 중 하나라도 있으면 legacy도 실행. + run_legacy_mode = ( + not run_order_mode + or args.output is not None + or args.baseline_url is not None + or args.candidate_url is not None + ) + + if run_order_mode: + if args.order_groups is None: + print("ERROR: --queries-order 사용 시 --order-groups 필수", file=sys.stderr) + return 2 + if not args.base_url: + print("ERROR: --queries-order 모드는 --base-url만 지원 (A/B 미지원)", file=sys.stderr) + return 2 + + if not run_legacy_mode and not run_order_mode: + print("ERROR: 실행할 평가 경로가 없음", file=sys.stderr) + return 2 + + queries = load_queries(args.queries) if run_legacy_mode else [] + if run_legacy_mode: + print(f"Loaded {len(queries)} queries from {args.queries}") print(f"Mode: {args.mode}", end="") if args.fusion: print(f" / fusion: {args.fusion}", end="") @@ -462,47 +1094,72 @@ def main() -> int: all_results: list[QueryResult] = [] - if args.base_url: - print(f"\n>>> evaluating: {args.base_url}") - results = asyncio.run( - evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze) - ) - print_summary("single", results) - all_results.extend(results) - else: - print(f"\n>>> baseline: {args.baseline_url}") - baseline_results = asyncio.run( - evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, 
analyze=args.analyze) - ) - baseline_summary = print_summary("baseline", baseline_results) + if run_legacy_mode: + if args.base_url: + print(f"\n>>> evaluating: {args.base_url}") + results = asyncio.run( + evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze) + ) + print_summary("single", results) + all_results.extend(results) + else: + print(f"\n>>> baseline: {args.baseline_url}") + baseline_results = asyncio.run( + evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze) + ) + baseline_summary = print_summary("baseline", baseline_results) - print(f"\n>>> candidate: {args.candidate_url}") - candidate_results = asyncio.run( - evaluate( - queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze + print(f"\n>>> candidate: {args.candidate_url}") + candidate_results = asyncio.run( + evaluate( + queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze + ) + ) + candidate_summary = print_summary("candidate", candidate_results) + + # 델타 + print("\n=== Δ (candidate - baseline) ===") + for k in ( + "recall_at_10", + "mrr_at_10", + "ndcg_at_10", + "top3_hit_rate", + "latency_p50", + "latency_p95", + ): + delta = candidate_summary[k] - baseline_summary[k] + sign = "+" if delta >= 0 else "" + print(f" {k:<16}: {sign}{delta:.3f}") + + all_results.extend(baseline_results) + all_results.extend(candidate_results) + + if args.output: + write_csv(all_results, args.output) + + # 발주건 단위 baseline (Phase 0) + if run_order_mode: + order_queries = load_order_queries(args.queries_order) + order_groups = load_order_groups(args.order_groups) + print( + f"\nLoaded {len(order_queries)} order queries from {args.queries_order}" + f" / {len(order_groups)} groups from {args.order_groups}" + ) + 
order_results = asyncio.run( + evaluate_orders( + order_queries, order_groups, args.base_url, args.token, + mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze, + debug=args.debug, ) ) - candidate_summary = print_summary("candidate", candidate_results) - - # 델타 - print("\n=== Δ (candidate - baseline) ===") - for k in ( - "recall_at_10", - "mrr_at_10", - "ndcg_at_10", - "top3_hit_rate", - "latency_p50", - "latency_p95", - ): - delta = candidate_summary[k] - baseline_summary[k] - sign = "+" if delta >= 0 else "" - print(f" {k:<16}: {sign}{delta:.3f}") - - all_results.extend(baseline_results) - all_results.extend(candidate_results) - - if args.output: - write_csv(all_results, args.output) + print_order_summary(order_results) + if args.output_order: + write_order_csv(order_results, args.output_order) + elif not args.output: + print( + "\nNOTE: --output-order 미지정 — CSV 저장 skip. 결과는 stdout 요약만.", + file=sys.stderr, + ) return 0