From 51a6f7c9af325967f772591a46042f097d139661 Mon Sep 17 00:00:00 2001
From: Hyungi Ahn <hyungiahn@Hyungiui-MacBookPro.local>
Date: Mon, 20 Apr 2026 15:04:39 +0900
Subject: [PATCH] =?UTF-8?q?feat(eval):=20=EB=B0=9C=EC=A3=BC=EA=B1=B4=20?=
 =?UTF-8?q?=EB=8B=A8=EC=9C=84=20baseline=20=ED=8F=89=EA=B0=80=20=EA=B2=BD?=
 =?UTF-8?q?=EB=A1=9C=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- run_eval.py: --queries-order / --order-groups / --output-order / --debug
  옵션 추가. 기존 legacy CSV 스키마/값 불변 (출력 소비자 보호).
- Tier 1A/1B/2 지표 구현: cross_format_link_success (top-10 공식 +
  top-5 보조, eligible/success 분수), top_5_document_match (guardrail +
  절대 건수), manual_refind_flag (v0 heuristic), chunk_idx_stddev,
  range/page_citation_available capability flags.
- order_groups.yaml: 발주건 3건 매핑 (TKP-26-0114/0132/0112, 10 docs).
- queries_order_baseline.yaml: 12개 질문 (A:4 B:4 C:3 D:1).

plan: ~/.claude/plans/merry-yawning-owl.md
---
 tests/search_eval/order_groups.yaml           |  64 ++
 tests/search_eval/queries_order_baseline.yaml | 266 +++++++
 tests/search_eval/run_eval.py                 | 737 +++++++++++++++++-
 3 files changed, 1027 insertions(+), 40 deletions(-)
 create mode 100644 tests/search_eval/order_groups.yaml
 create mode 100644 tests/search_eval/queries_order_baseline.yaml

diff --git a/tests/search_eval/order_groups.yaml b/tests/search_eval/order_groups.yaml
new file mode 100644
index 0000000..a0076aa
--- /dev/null
+++ b/tests/search_eval/order_groups.yaml
@@ -0,0 +1,64 @@
+version: "0.1"
+created_at: "2026-04-20"
+notes: |
+  Phase 0 발주건 단위 baseline 전용 그룹 매핑. DB 스키마 건드리지 않고 가설적 grouping.
+  Phase 1A에서 구조화 추출 결과 본 뒤에야 정식 domain 모델 결정.
+
+  role enum:
+    - order_xlsx: 발주서 원본 xlsx
+    - order_pdf:  발주서 PDF 변환본 (xlsx에서 출력)
+    - invoice:    전자세금계산서 PDF (세무 증빙)
+    - statement:  거래명세표/명세서 PDF (출고·배송 증빙, 선택)
+
+  한 발주건에 invoice/statement 둘 다 있거나, 하나만 있거나, 둘 다 없을 수 있음.
+
+  샘플 소스: ~/Desktop/3월지급 → NAS /volume4/Document_Server/PKM/Inbox/발주/2026-03/
+  (2026-04-20 ingest 완료, 9 파일 / 3 발주건)
+
+  제외된 파일:
+    - 신양철강 74,290원.pdf (TKP-26-0132 invoice 중복 사본, 같은 세금계산서의 공급받는자 보관용)
+    - 260220-테크니컬코리아 - 복사본.jpg (이미지, OCR 필요 — Phase 0 범위 밖)
+    - 면취날 연마.pdf (DCS엔지 별도 PO의 invoice, TKP-26-0112와 무관)
+
+  2026-04-20 추가: TKP-26-0112의 invoice(TrusBill print.pdf, doc_id 8944) 누락 발견 후 ingest.
+  초기엔 "invoice 없는 케이스"로 잡았으나 파일명이 invoice를 암시 안 해서 놓침 (실제로는 존재).
+
+  doc_id 조회 쿼리:
+    docker exec hyungi_document_server-postgres-1 psql -U pkm -d pkm -c \
+      "SELECT id, file_path, length(extracted_text) AS tlen,
+              (SELECT count(*) FROM document_chunks WHERE doc_id = d.id) AS chunks
+       FROM documents d
+       WHERE file_path LIKE '%Inbox/발주/2026-03/%' ORDER BY id;"
+
+groups:
+  - order_group_id: TKP-26-0114
+    description: "대연기업 845,000원 (2026-02, 발주 + 세금계산서)"
+    docs:
+      - doc_id: 8853
+        role: order_xlsx
+      - doc_id: 8854
+        role: order_pdf
+      - doc_id: 8855
+        role: invoice     # "2월 26일.pdf" (전자세금계산서)
+
+  - order_group_id: TKP-26-0132
+    description: "신양철강 74,290원 (2026-03, 발주 + 세금계산서 + 거래명세표, 가장 풍성한 세트)"
+    docs:
+      - doc_id: 8856
+        role: order_xlsx
+      - doc_id: 8857
+        role: order_pdf
+      - doc_id: 8858
+        role: invoice     # "전자세금계산서.pdf"
+      - doc_id: 8859
+        role: statement   # "테크니컬 3-3일자 명세서(출고완료건).pdf"
+
+  - order_group_id: TKP-26-0112
+    description: "신양철강 650,000원 (2026-02, plate 절단 1건, statement 없음)"
+    docs:
+      - doc_id: 8851
+        role: order_xlsx
+      - doc_id: 8852
+        role: order_pdf
+      - doc_id: 8944
+        role: invoice     # "TrusBill print.pdf" (전자세금계산서, 공급가액 650,000)
diff --git a/tests/search_eval/queries_order_baseline.yaml b/tests/search_eval/queries_order_baseline.yaml
new file mode 100644
index 0000000..0e911f3
--- /dev/null
+++ b/tests/search_eval/queries_order_baseline.yaml
@@ -0,0 +1,266 @@
+version: "0.1"
+created_at: "2026-04-20"
+notes: |
+  Phase 0 발주건 단위 baseline 질문셋. 12개, 분포 A:B:C:D = 4:4:3:1.
+
+  카테고리:
+    A. 발주서 내부 찾기           (품목명/수량·단가·금액/납기/공급처)
+    B. 발주서 PDF 변환본 대응      (xlsx↔PDF 일치, PDF 페이지 위치)
+    C. 세금계산서/명세표 연결      (대응 invoice/statement 존재, 금액 일치, 발주건 식별)
+    D. 포맷 간 일치성              (핵심 필드 일치 or PDF만으로 발주건 식별)
+
+  expected_locations 스키마:
+    - doc_id         : order_groups.yaml의 doc_id
+    - role           : order_xlsx | order_pdf | invoice | statement
+    - location_type  : sheet_range | page | document_only
+    - location_value : xlsx는 A1 notation 예) "발주서!F9", PDF는 "p1"
+                       document_only일 때는 null
+    - is_primary     : 해당 role이 이 질문의 1차 정답 포맷인가
+
+  document_only 허용 규칙:
+    - 원칙: 가능한 경우 sheet_range 또는 page로 라벨링
+    - 예외: 근거 위치를 안정적으로 특정할 수 없을 때 또는 여러 문서 간 "일치성" 자체가 질문일 때
+    - 규칙: document_only 사용 시 notes 필드에 이유 필수 기재
+
+  2026-04-20 초안 (Claude 자동 draft, 사용자 검수 대기):
+    - 발주서 xlsx의 핵심 셀 매핑 (전 발주건 공통):
+        D5 ORDER DATE, D6 PO NO, F9 공급처, F11 주소, F13 전화, W9 납기일,
+        W12 프로젝트, W13 담당자, B17~ 품목 행, V17~ 수량, X17~ 단가, AB17~ 금액, AB23 TOTAL
+    - PDF는 모두 1페이지 분량 → p1 고정
+    - 실제 오라벨 가능성 있으니 사용자 확인 필요 (sheet range 특히)
+
+questions:
+  # ─────────────────────────────
+  # A. 발주서 내부 찾기 (4)
+  # ─────────────────────────────
+  - id: Q-A-001
+    query: "TKP-26-0114 발주의 공급처는 어디인가?"
+    category: A
+    order_group_id: TKP-26-0114
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8853
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!F9"
+        is_primary: true
+      - doc_id: 8854
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: （주）대연기업. xlsx F9."
+
+  - id: Q-A-002
+    query: "TKP-26-0132 발주의 납기일은?"
+    category: A
+    order_group_id: TKP-26-0132
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8856
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!W9"
+        is_primary: true
+      - doc_id: 8857
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: 2026-02-23. xlsx W9."
+
+  - id: Q-A-003
+    query: "TKP-26-0112 plate 절단 단가는 얼마인가?"
+    category: A
+    order_group_id: TKP-26-0112
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8851
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!X17"
+        is_primary: true
+      - doc_id: 8852
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: 650,000원 (1개 항목). xlsx X17."
+
+  - id: Q-A-004
+    query: "TKP-26-0114 발주의 총 금액은?"
+    category: A
+    order_group_id: TKP-26-0114
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8853
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!AB23"
+        is_primary: true
+      - doc_id: 8854
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: 845,000원 (부가세 별도). xlsx AB23."
+
+  # ─────────────────────────────
+  # B. 발주서 PDF 변환본 대응 (4)
+  # ─────────────────────────────
+  - id: Q-B-001
+    query: "TKP-26-0114에서 2:1 HEAD SA516-70 품목의 수량은?"
+    category: B
+    order_group_id: TKP-26-0114
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8853
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!V17"
+        is_primary: true
+      - doc_id: 8854
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: 2 EA. xlsx V17 / PDF p1 품목표. B 카테고리 = xlsx↔PDF 대응 확인."
+
+  - id: Q-B-002
+    query: "TKP-26-0132 발주서 총액은 얼마이고 PDF 변환본에서도 확인 가능한가?"
+    category: B
+    order_group_id: TKP-26-0132
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8856
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!AB20"  # NOTE(review): the file-level notes list AB23 as the TOTAL cell for all orders — confirm AB20 for this sheet
+        is_primary: true
+      - doc_id: 8857
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: 74,290원. xlsx AB20 / PDF p1 TOTAL. 두 포맷 모두 일치해야 함."
+
+  - id: Q-B-003
+    query: "TKP-26-0112 PO 번호를 PDF 변환본에서 확인"
+    category: B
+    order_group_id: TKP-26-0112
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8852
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: true
+      - doc_id: 8851
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!D6"
+        is_primary: false
+    notes: "정답: TKP-26-0112. PDF만으로도 식별 가능한지 확인 (primary=order_pdf)."
+
+  - id: Q-B-004
+    query: "TKP-26-0114 발주서 담당자는 누구인가?"
+    category: B
+    order_group_id: TKP-26-0114
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8853
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!W13"
+        is_primary: true
+      - doc_id: 8854
+        role: order_pdf
+        location_type: page
+        location_value: "p1"
+        is_primary: false
+    notes: "정답: 안현기(Hyunki,Ahn). xlsx W13 (PREPAIRED BY) / PDF p1."
+
+  # ─────────────────────────────
+  # C. 세금계산서/명세표 연결 (3)
+  # ─────────────────────────────
+  - id: Q-C-001
+    query: "TKP-26-0132 세금계산서의 공급가액은?"
+    category: C
+    order_group_id: TKP-26-0132
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8858
+        role: invoice
+        location_type: page
+        location_value: "p1"
+        is_primary: true
+    notes: "정답: 74,290원. invoice p1 공급가액 칸. 발주서 총액과 일치 확인용."
+
+  - id: Q-C-002
+    query: "TKP-26-0114 발주금액과 세금계산서 공급가액이 일치하는가?"
+    category: C
+    order_group_id: TKP-26-0114
+    intent: comparison
+    expected_locations:
+      - doc_id: 8855
+        role: invoice
+        location_type: page
+        location_value: "p1"
+        is_primary: true
+      - doc_id: 8853
+        role: order_xlsx
+        location_type: sheet_range
+        location_value: "발주서!AB23"
+        is_primary: false
+    notes: |
+      정답: 일치 (양쪽 845,000원). invoice 공급가액 + xlsx TOTAL 비교 필요 —
+      cross-format retrieval 특성 강조. C 카테고리지만 order_xlsx도 근거로 필요.
+
+  - id: Q-C-003
+    query: "TKP-26-0132 거래명세표에 기재된 품목은 무엇인가?"
+    category: C
+    order_group_id: TKP-26-0132
+    intent: fact_lookup
+    expected_locations:
+      - doc_id: 8859
+        role: statement
+        location_type: page
+        location_value: "p1"
+        is_primary: true
+    notes: |
+      정답: "레이져 A516-70 가공비 12t x 1197 x 1197" 외 1건.
+      statement p1 품목 테이블.
+
+  # ─────────────────────────────
+  # D. 포맷 간 일치성 (1)
+  # ─────────────────────────────
+  - id: Q-D-001
+    query: "TKP-26-0132 발주번호가 발주서·PDF·세금계산서·거래명세표 4개 문서에 모두 나오는가?"
+    category: D
+    order_group_id: TKP-26-0132
+    intent: comparison
+    expected_locations:
+      - doc_id: 8856
+        role: order_xlsx
+        location_type: document_only
+        location_value: null
+        is_primary: true
+      - doc_id: 8857
+        role: order_pdf
+        location_type: document_only
+        location_value: null
+        is_primary: false
+      - doc_id: 8858
+        role: invoice
+        location_type: document_only
+        location_value: null
+        is_primary: false
+      - doc_id: 8859
+        role: statement
+        location_type: document_only
+        location_value: null
+        is_primary: false
+    notes: |
+      정답: 모두 나옴 — order_xlsx(D6), order_pdf(PO NO), invoice(<안현기님-TKP-26-0132>),
+      statement(<안현기님-TKP-26-0132>). D 카테고리는 "여러 문서 간 일치성" 자체가
+      질문이라 document_only 사용 — 위치보다 "같은 발주건에 속하는가"가 본질.
diff --git a/tests/search_eval/run_eval.py b/tests/search_eval/run_eval.py
index e506248..3e80cb4 100644
--- a/tests/search_eval/run_eval.py
+++ b/tests/search_eval/run_eval.py
@@ -7,6 +7,11 @@ Recall@10, MRR@10, NDCG@10, Top3 hit-rate, Latency p50/p95를 계산한다.
 A/B 비교 모드: --baseline-url, --candidate-url 를 각각 지정하면
 두 엔드포인트에 동일 쿼리셋을 던지고 결과를 비교한다.
 
+발주건 단위 baseline 모드 (Phase 0 / plan: merry-yawning-owl):
+  --queries-order + --order-groups + --output-order 로 xlsx/PDF 구조화 추출
+  gap 측정용 Tier 1A/1B/2 지표를 계산한다. 기존 --queries 경로와 CSV
+  스키마는 변경되지 않는다 (출력 소비자 보호).
+
 사용 예:
 
     # 단일 평가
@@ -21,6 +26,13 @@ A/B 비교 모드: --baseline-url, --candidate-url 를 각각 지정하면
         --candidate-url http://localhost:8000 \
         --output reports/phase1_vs_baseline.csv
 
+    # 발주건 단위 baseline
+    python tests/search_eval/run_eval.py \
+        --base-url http://localhost:8000 \
+        --queries-order tests/search_eval/queries_order_baseline.yaml \
+        --order-groups tests/search_eval/order_groups.yaml \
+        --output-order reports/baseline_order_unit_2026-04-20.csv
+
 토큰은 env DOCSRV_TOKEN 또는 --token 플래그로 전달.
 """
 
@@ -367,6 +379,578 @@ def load_queries(yaml_path: Path) -> list[Query]:
     return queries
 
 
+# ═════════════════════════════════════════════════════════════════
+# 발주건 단위 baseline (Phase 0 / plan: merry-yawning-owl)
+# ─────────────────────────────────────────────────────────────────
+# 아래 섹션은 "구조화 추출 gap 측정" 전용 코드 경로. 기존 legacy
+# 쿼리 평가는 위 섹션 그대로 — 스키마/값 불변.
+# ═════════════════════════════════════════════════════════════════
+
+
+# ─────────────────────────────────────────────────────────
+# 발주건 데이터 모델
+# ─────────────────────────────────────────────────────────
+
+
+@dataclass
+class OrderGroupDoc:  # one document that belongs to an order group
+    doc_id: int  # document id as listed under a group's `docs` in order_groups.yaml
+    role: str  # order_xlsx | order_pdf | invoice | statement
+
+
+@dataclass
+class OrderGroup:  # one purchase order and the documents mapped to it
+    order_group_id: str  # e.g. "TKP-26-0114"; used as the lookup key for queries
+    description: str  # free-text description; empty string when missing in the YAML
+    docs: list[OrderGroupDoc]
+
+    def role_of(self, doc_id: int) -> str | None:  # role of the first doc with this id, None if absent
+        for d in self.docs:
+            if d.doc_id == doc_id:
+                return d.role
+        return None
+
+    def roles_set(self) -> set[str]:  # distinct roles present in this group
+        return {d.role for d in self.docs}
+
+
+@dataclass
+class ExpectedLocation:  # one labelled answer location for a query
+    doc_id: int  # doc_id from order_groups.yaml
+    role: str  # order_xlsx | order_pdf | invoice | statement
+    location_type: str  # sheet_range | page | document_only
+    location_value: str | None  # A1 notation for xlsx, "p1"-style page for PDF, None for document_only
+    is_primary: bool  # whether this role is the primary answer format for the question
+
+
+@dataclass
+class OrderQuery:  # one question of the order-unit baseline set
+    id: str  # e.g. "Q-A-001"
+    query: str  # text sent to the search API as `q`
+    category: str  # A | B | C | D
+    order_group_id: str  # must match a key of the loaded order groups
+    intent: str  # values seen in queries_order_baseline.yaml: fact_lookup, comparison
+    expected_locations: list[ExpectedLocation]
+    notes: str = ""  # labeller's free-text note; copied to the CSV as-is
+
+
+@dataclass
+class OrderQueryResult:  # per-query metrics row produced by evaluate_orders
+    query: OrderQuery
+    returned_results: list[dict]  # raw result dicts from the search API; [] on error rows
+    latency_ms: float  # 0.0 on error rows
+    doc_match_top5: bool  # Tier 1A guardrail: any expected doc_id within the top 5
+    cross_format_eligible: bool  # denominator flag for the cross-format metrics
+    cross_format_link_success_top10: bool  # Tier 1A official metric (k=10)
+    cross_format_link_success_top5: bool  # auxiliary observation (k=5)
+    range_citation_available: bool  # Tier 2: some result has sheet_name or cell_range
+    page_citation_available: bool  # Tier 2: some result has a non-empty page
+    matched_location_value: str | None  # always None at baseline
+    manual_refind_flag: bool  # Tier 1B v0 heuristic; True on error rows
+    chunk_idx_stddev_top10: float | None  # None when fewer than two int chunk_index values
+    error: str | None = None  # set only when the query could not be evaluated
+
+
+# ─────────────────────────────────────────────────────────
+# 발주건 지표 (Tier 1A / 1B / 2)
+# ─────────────────────────────────────────────────────────
+
+
+def _collect_expected_doc_ids(locs: list[ExpectedLocation]) -> set[int]:  # distinct doc_ids across all expected locations
+    return {loc.doc_id for loc in locs}
+
+
+def doc_match_at_k(returned_ids: list[int], expected_doc_ids: set[int], k: int = 5) -> bool:
+    """Whether at least one expected doc_id appears in the top-k returned ids (Tier 1A guardrail)."""
+    if not expected_doc_ids:  # a query without labelled docs can never match
+        return False
+    return any(doc_id in expected_doc_ids for doc_id in returned_ids[:k])
+
+
+def cross_format_link_success(
+    returned_ids: list[int],
+    expected_locations: list[ExpectedLocation],
+    group: OrderGroup,
+    k: int,
+) -> tuple[bool, bool]:
+    """Official Tier 1A metric. Returns the tuple (success, eligible).
+
+    Eligible: the order group holds at least two distinct roles (i.e. a group
+    for which measuring a cross-format link is meaningful).
+
+    Success (all three conditions at once):
+      ① at least one group doc whose role is an is_primary=true expected role is in the top-k
+      ② at least one group doc with any other role is in the top-k
+      ③ both docs belong to the same order_group (candidates come only from group.docs)
+
+    Do not use a bare "two doc_ids with different roles exist" check — it yields false positives.
+    """
+    if len(group.roles_set()) < 2:
+        return False, False  # ineligible
+
+    primary_roles = {loc.role for loc in expected_locations if loc.is_primary}
+    if not primary_roles:
+        # eligible, but without a primary label success cannot be judged
+        return False, True
+
+    top_k_set = set(returned_ids[:k])
+    group_doc_ids_by_role: dict[str, list[int]] = {}  # role → doc_ids of this group
+    for d in group.docs:
+        group_doc_ids_by_role.setdefault(d.role, []).append(d.doc_id)
+
+    has_primary = any(
+        doc_id in top_k_set
+        for role in primary_roles
+        for doc_id in group_doc_ids_by_role.get(role, [])
+    )
+    other_roles = group.roles_set() - primary_roles  # roles in the group that are not primary for this query
+    has_other = any(
+        doc_id in top_k_set
+        for role in other_roles
+        for doc_id in group_doc_ids_by_role.get(role, [])
+    )
+    return (has_primary and has_other), True
+
+
+def range_citation_available(returned_results: list[dict]) -> bool:
+    """Tier 2: whether any result carries a truthy sheet_name or cell_range field.
+
+    Per the original author, the current API (`app/api/search.py`) has no such fields → baseline = False (0%).
+    """
+    for r in returned_results:
+        if r.get("sheet_name") or r.get("cell_range"):
+            return True
+    return False
+
+
+def page_citation_available(returned_results: list[dict]) -> bool:
+    """Tier 2: whether any result carries a page field that is neither None nor "".
+
+    Per the original author, chunk.page is currently always null → baseline = False (0%).
+    """
+    for r in returned_results:
+        page = r.get("page")
+        if page is not None and page != "":  # unlike a truthiness test, page 0 counts as present
+            return True
+    return False
+
+
+def _tokenize_query(q: str) -> list[str]:
+    """Simple tokenization: lowercase, split on whitespace, keep tokens of two or more characters."""
+    return [t for t in q.lower().split() if len(t) >= 2]
+
+
+def manual_refind_flag_v0(
+    returned_results: list[dict],
+    query_text: str,
+    score_threshold: float = 0.5,
+) -> bool:
+    """Tier 1B v0 heuristic: top-1 score < threshold AND no query token in its title/snippet.
+
+    Caution: v0. The 0.5 score threshold is a **provisional value** — if search score calibration
+    changes, comparisons across baselines can drift. Do not treat it as absolute. Reports must
+    also state the result of a manual "heuristic vs. human judgement" cross-check.
+    """
+    if not returned_results:
+        return True  # an empty result list is always flagged
+    top_1 = returned_results[0]
+    score = top_1.get("score", 0.0)
+    if score is None:  # explicit null score is treated like a missing one
+        score = 0.0
+    if score >= score_threshold:
+        return False
+    snippet = (top_1.get("snippet") or "").lower()
+    title = (top_1.get("title") or "").lower()
+    haystack = f"{title} {snippet}"
+    tokens = _tokenize_query(query_text)
+    if not tokens:
+        return False  # nothing to match against → not flagged
+    has_any_token = any(t in haystack for t in tokens)  # plain substring match
+    return not has_any_token
+
+
+def _chunk_idx_stddev_top10(returned_results: list[dict]) -> float | None:
+    """Sample standard deviation of chunk_index over the top 10 (author's reading: lower = clustered in one section). Observational only."""
+    idxs = [r.get("chunk_index") for r in returned_results[:10]]
+    vals = [i for i in idxs if isinstance(i, int)]  # NOTE(review): bool is a subclass of int and would pass this filter
+    if len(vals) < 2:  # statistics.stdev requires at least two data points
+        return None
+    return statistics.stdev(vals)
+
+
+def _matched_location_value(
+    returned_results: list[dict],
+    expected_locations: list[ExpectedLocation],
+) -> str | None:
+    """Tier 2 matched_location: the current API does not expose a location field, so
+    this is always None at baseline. Values get filled in once Phase 1A/1B is implemented.
+    """
+    # No location information in the current API response → always None
+    # After Phase 1A/1B, extend this by checking r.get("cell_range") / r.get("page")
+    return None
+
+
+# ─────────────────────────────────────────────────────────
+# 발주건 API 호출 (full result dict 반환)
+# ─────────────────────────────────────────────────────────
+
+
+async def call_search_full(
+    client: httpx.AsyncClient,
+    base_url: str,
+    token: str,
+    query: str,
+    mode: str = "hybrid",
+    limit: int = 20,
+    fusion: str | None = None,
+    rerank: str | None = None,
+    analyze: str | None = None,
+    debug: bool = False,
+) -> tuple[list[dict], float]:
+    """Same request logic as call_search (per the original author), but returns the full result dicts plus latency in ms."""
+    url = f"{base_url.rstrip('/')}/api/search/"
+    headers = {"Authorization": f"Bearer {token}"}
+    params: dict[str, str | int] = {"q": query, "mode": mode, "limit": limit}
+    if fusion:  # truthiness check: an empty string is omitted, unlike rerank/analyze below
+        params["fusion"] = fusion
+    if rerank is not None:
+        params["rerank"] = rerank
+    if analyze is not None:
+        params["analyze"] = analyze
+    if debug:
+        params["debug"] = "true"
+
+    import time
+
+    start = time.perf_counter()
+    response = await client.get(url, headers=headers, params=params, timeout=30.0)
+    latency_ms = (time.perf_counter() - start) * 1000  # covers the GET only, not JSON decoding
+    response.raise_for_status()  # 4xx/5xx responses raise here
+    data = response.json()
+    return data.get("results", []), latency_ms  # a missing "results" key yields an empty list
+
+
+# ─────────────────────────────────────────────────────────
+# 발주건 평가 실행
+# ─────────────────────────────────────────────────────────
+
+
+async def evaluate_orders(
+    queries: list[OrderQuery],
+    groups: dict[str, OrderGroup],
+    base_url: str,
+    token: str,
+    mode: str = "hybrid",
+    fusion: str | None = None,
+    rerank: str | None = None,
+    analyze: str | None = None,
+    debug: bool = False,
+) -> list[OrderQueryResult]:
+    """Evaluate the order-unit query set one query at a time; returns one OrderQueryResult per query, in order."""
+    results: list[OrderQueryResult] = []
+    async with httpx.AsyncClient() as client:
+        for q in queries:
+            group = groups.get(q.order_group_id)
+            if group is None:  # unknown group: record an error row without calling the API
+                results.append(
+                    OrderQueryResult(
+                        query=q,
+                        returned_results=[],
+                        latency_ms=0.0,
+                        doc_match_top5=False,
+                        cross_format_eligible=False,
+                        cross_format_link_success_top10=False,
+                        cross_format_link_success_top5=False,
+                        range_citation_available=False,
+                        page_citation_available=False,
+                        matched_location_value=None,
+                        manual_refind_flag=True,
+                        chunk_idx_stddev_top10=None,
+                        error=f"unknown order_group_id={q.order_group_id}",
+                    )
+                )
+                continue
+            try:
+                returned, latency_ms = await call_search_full(  # limit is not passed, so its default (20) applies
+                    client, base_url, token, q.query,
+                    mode=mode, fusion=fusion, rerank=rerank, analyze=analyze, debug=debug,
+                )
+                returned_ids = [r["id"] for r in returned]  # a result without "id" raises KeyError → error row below
+                expected_ids = _collect_expected_doc_ids(q.expected_locations)
+
+                cf10, eligible10 = cross_format_link_success(returned_ids, q.expected_locations, group, 10)
+                cf5, _eligible5 = cross_format_link_success(returned_ids, q.expected_locations, group, 5)  # eligibility does not depend on k
+
+                results.append(
+                    OrderQueryResult(
+                        query=q,
+                        returned_results=returned,
+                        latency_ms=latency_ms,
+                        doc_match_top5=doc_match_at_k(returned_ids, expected_ids, 5),
+                        cross_format_eligible=eligible10,
+                        cross_format_link_success_top10=cf10 if eligible10 else False,
+                        cross_format_link_success_top5=cf5 if eligible10 else False,
+                        range_citation_available=range_citation_available(returned),
+                        page_citation_available=page_citation_available(returned),
+                        matched_location_value=_matched_location_value(returned, q.expected_locations),
+                        manual_refind_flag=manual_refind_flag_v0(returned, q.query),
+                        chunk_idx_stddev_top10=_chunk_idx_stddev_top10(returned),
+                    )
+                )
+            except Exception as exc:  # any failure (HTTP, JSON, missing keys) becomes an error row; the run continues
+                results.append(
+                    OrderQueryResult(
+                        query=q,
+                        returned_results=[],
+                        latency_ms=0.0,
+                        doc_match_top5=False,
+                        cross_format_eligible=False,
+                        cross_format_link_success_top10=False,
+                        cross_format_link_success_top5=False,
+                        range_citation_available=False,
+                        page_citation_available=False,
+                        matched_location_value=None,
+                        manual_refind_flag=True,
+                        chunk_idx_stddev_top10=None,
+                        error=str(exc),
+                    )
+                )
+    return results
+
+
+# ─────────────────────────────────────────────────────────
+# 발주건 결과 집계 / 출력
+# ─────────────────────────────────────────────────────────
+
+
+def print_order_summary(results: list[OrderQueryResult]) -> dict[str, Any]:
+    """Print a Tier 1A/1B/2 metric summary with absolute counts alongside rates; return the aggregates as a dict."""
+    n = len(results)
+    if n == 0:  # empty input: print nothing, return an empty dict
+        return {}
+
+    # Tier 1A
+    doc_match_count = sum(1 for r in results if r.doc_match_top5)
+    eligible_results = [r for r in results if r.cross_format_eligible]
+    cf10_success = sum(1 for r in eligible_results if r.cross_format_link_success_top10)
+    cf5_success = sum(1 for r in eligible_results if r.cross_format_link_success_top5)
+
+    # Tier 1B
+    refind_flag_count = sum(1 for r in results if r.manual_refind_flag)
+    stddev_values = [r.chunk_idx_stddev_top10 for r in results if r.chunk_idx_stddev_top10 is not None]
+    avg_stddev = statistics.mean(stddev_values) if stddev_values else None
+
+    # Tier 2
+    range_avail_count = sum(1 for r in results if r.range_citation_available)
+    page_avail_count = sum(1 for r in results if r.page_citation_available)
+
+    # Latency
+    latencies = [r.latency_ms for r in results if r.latency_ms > 0]  # error rows carry 0.0 and are excluded
+    p50 = percentile(latencies, 0.50)  # NOTE(review): percentile is defined outside this view — confirm it accepts an empty list
+    p95 = percentile(latencies, 0.95)
+
+    print(f"\n=== Order-unit baseline (n={n}) ===")
+    print("  Tier 1A (gate 후보 / guardrail):")
+    print(
+        f"    top_5_document_match_rate     : {doc_match_count}/{n}"
+        f" ({doc_match_count / n:.1%}) — Guardrail, 비악화 강제"
+    )
+    if eligible_results:  # guards the divisions by len(eligible_results) below
+        print(
+            f"    cross_format_link top-10      : {cf10_success}/{len(eligible_results)}"
+            f" ({cf10_success / len(eligible_results):.1%}) [공식 gate 후보]"
+        )
+        print(
+            f"    cross_format_link top-5       : {cf5_success}/{len(eligible_results)}"
+            f" ({cf5_success / len(eligible_results):.1%}) [보조 관찰]"
+        )
+    else:
+        print("    cross_format_link             : no eligible queries (group roles<2)")
+
+    print("  Tier 1B (관찰용):")
+    print(
+        f"    manual_refind_flag (v0)       : {refind_flag_count}/{n}"
+        f" ({refind_flag_count / n:.1%}) — heuristic, 수동 교차검증 필수"
+    )
+    if avg_stddev is not None:
+        print(f"    chunk_idx_stddev_top10 (mean) : {avg_stddev:.2f}")
+
+    print("  Tier 2 (auto-eval 기준, 현재 시스템 baseline = 0):")
+    print(
+        f"    range_citation_available      : {range_avail_count}/{n}"
+        f" ({range_avail_count / n:.1%})"
+    )
+    print(
+        f"    page_citation_available       : {page_avail_count}/{n}"
+        f" ({page_avail_count / n:.1%})"
+    )
+
+    print(f"  Latency p50 / p95             : {p50:.0f} / {p95:.0f} ms")
+
+    # per-category rollup
+    by_cat: dict[str, list[OrderQueryResult]] = {}
+    for r in results:
+        by_cat.setdefault(r.query.category, []).append(r)
+    print("  by category (A/B/C/D):")
+    for cat in sorted(by_cat.keys()):
+        items = by_cat[cat]
+        cat_doc = sum(1 for r in items if r.doc_match_top5)
+        cat_cf_eligible = [r for r in items if r.cross_format_eligible]
+        cat_cf10 = sum(1 for r in cat_cf_eligible if r.cross_format_link_success_top10)
+        cf_str = (
+            f"cf10 {cat_cf10}/{len(cat_cf_eligible)}"
+            if cat_cf_eligible else "cf10 n/a"
+        )
+        print(f"    {cat}  n={len(items):>2}  doc_match {cat_doc}/{len(items)}  {cf_str}")
+
+    # per-order-group rollup
+    by_group: dict[str, list[OrderQueryResult]] = {}
+    for r in results:
+        by_group.setdefault(r.query.order_group_id, []).append(r)
+    print("  by order_group:")
+    for gid in sorted(by_group.keys()):
+        items = by_group[gid]
+        g_doc = sum(1 for r in items if r.doc_match_top5)
+        print(f"    {gid}  n={len(items):>2}  doc_match {g_doc}/{len(items)}")
+
+    # errors
+    errors = [r for r in results if r.error]
+    if errors:
+        print(f"  ERRORS ({len(errors)}):")
+        for r in errors:
+            print(f"    [{r.query.id}] {r.error}")
+
+    return {  # ratio metrics are (numerator, denominator) tuples
+        "n": n,
+        "doc_match_top5": (doc_match_count, n),
+        "cross_format_link_top10": (cf10_success, len(eligible_results)),
+        "cross_format_link_top5": (cf5_success, len(eligible_results)),
+        "manual_refind_flag": (refind_flag_count, n),
+        "range_citation_available": (range_avail_count, n),
+        "page_citation_available": (page_avail_count, n),
+        "latency_p50": p50,
+        "latency_p95": p95,
+    }
+
+
+def write_order_csv(results: list[OrderQueryResult], output_path: Path) -> None:
+    """Write the order-baseline CSV. Kept separate from the legacy write_csv so the two schemas do not interfere."""
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    columns = [
+        "id",
+        "query",
+        "category_abcd",
+        "order_group_id",
+        "intent",
+        "expected_doc_ids",
+        "expected_roles",
+        "expected_location_type",
+        "expected_location_value",
+        "returned_ids_top10",
+        "latency_ms",
+        "doc_match_top5",
+        "cross_format_eligible",
+        "cross_format_link_success_top10",
+        "cross_format_link_success_top5",
+        "range_citation_available",
+        "page_citation_available",
+        "matched_location_value",
+        "manual_refind_flag",
+        "chunk_idx_stddev_top10",
+        "notes",
+        "error",
+    ]
+    with output_path.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow(columns)
+        for r in results:
+            returned_ids = [item["id"] for item in r.returned_results[:10]]
+            # pick location_type/value from the first primary location, else the first location
+            primary_locs = [loc for loc in r.query.expected_locations if loc.is_primary]
+            repr_loc = primary_locs[0] if primary_locs else (
+                r.query.expected_locations[0] if r.query.expected_locations else None
+            )
+            writer.writerow(
+                [
+                    r.query.id,
+                    r.query.query,
+                    r.query.category,
+                    r.query.order_group_id,
+                    r.query.intent,
+                    ";".join(str(loc.doc_id) for loc in r.query.expected_locations),  # all expected docs, not just the representative one
+                    ";".join(loc.role for loc in r.query.expected_locations),
+                    repr_loc.location_type if repr_loc else "",
+                    repr_loc.location_value if repr_loc and repr_loc.location_value else "",
+                    ";".join(map(str, returned_ids)),
+                    f"{r.latency_ms:.1f}",
+                    "1" if r.doc_match_top5 else "0",  # booleans are serialized as "1"/"0"
+                    "1" if r.cross_format_eligible else "0",
+                    "1" if r.cross_format_link_success_top10 else "0",
+                    "1" if r.cross_format_link_success_top5 else "0",
+                    "1" if r.range_citation_available else "0",
+                    "1" if r.page_citation_available else "0",
+                    r.matched_location_value or "",
+                    "1" if r.manual_refind_flag else "0",
+                    f"{r.chunk_idx_stddev_top10:.2f}" if r.chunk_idx_stddev_top10 is not None else "",
+                    r.query.notes,
+                    r.error or "",
+                ]
+            )
+    print(f"\nOrder baseline CSV written: {output_path}")
+
+
+# ─────────────────────────────────────────────────────────
+# 발주건 YAML 로딩
+# ─────────────────────────────────────────────────────────
+
+
+def load_order_groups(yaml_path: Path) -> dict[str, OrderGroup]:
+    """Load order_groups.yaml into {order_group_id: OrderGroup}; an empty file or bare keys yield no groups/docs."""
+    with yaml_path.open(encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}  # safe_load returns None for an empty document
+    groups: dict[str, OrderGroup] = {}
+    for g in data.get("groups") or []:  # a bare `groups:` key parses as None
+        docs = [
+            OrderGroupDoc(doc_id=int(d["doc_id"]), role=d["role"]) for d in g.get("docs") or []
+        ]
+        groups[g["order_group_id"]] = OrderGroup(  # a repeated order_group_id overwrites the earlier entry
+            order_group_id=g["order_group_id"],
+            description=g.get("description", "") or "",
+            docs=docs,
+        )
+    return groups
+
+
+def load_order_queries(yaml_path: Path) -> list[OrderQuery]:
+    """Load queries_order_baseline.yaml into OrderQuery objects; an empty file or bare keys yield []."""
+    with yaml_path.open(encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}  # safe_load returns None for an empty document
+    queries: list[OrderQuery] = []
+    for q in data.get("questions") or []:  # a bare `questions:` key parses as None
+        locs = [
+            ExpectedLocation(
+                doc_id=int(loc["doc_id"]),
+                role=loc["role"],
+                location_type=loc["location_type"],
+                location_value=loc.get("location_value"),  # None for document_only entries
+                is_primary=bool(loc.get("is_primary", False)),  # NOTE(review): a quoted "false" would become True
+            )
+            for loc in q.get("expected_locations") or []
+        ]
+        queries.append(
+            OrderQuery(
+                id=q["id"],
+                query=q["query"],
+                category=q["category"],
+                order_group_id=q["order_group_id"],
+                intent=q.get("intent", "") or "",
+                expected_locations=locs,
+                notes=q.get("notes", "") or "",
+            )
+        )
+    return queries
+
+
 # ─────────────────────────────────────────────────────────
 # CLI
 # ─────────────────────────────────────────────────────────
@@ -438,6 +1022,30 @@ def main() -> int:
         default=None,
         help="CSV 출력 경로 (지정하면 raw 결과 저장)",
     )
+    # 발주건 단위 baseline (Phase 0 / plan: merry-yawning-owl)
+    parser.add_argument(
+        "--queries-order",
+        type=Path,
+        default=None,
+        help="발주건 쿼리 YAML (queries_order_baseline.yaml)",
+    )
+    parser.add_argument(
+        "--order-groups",
+        type=Path,
+        default=None,
+        help="발주건 그룹 매핑 YAML (order_groups.yaml)",
+    )
+    parser.add_argument(
+        "--output-order",
+        type=Path,
+        default=None,
+        help="발주건 baseline 전용 CSV 출력 경로 (legacy --output과 분리)",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="검색 API debug=true 요청 (발주건 모드에서 응답 검증용)",
+    )
     args = parser.parse_args()
 
     if not args.token:
@@ -451,8 +1059,32 @@ def main() -> int:
         )
         return 2
 
-    queries = load_queries(args.queries)
-    print(f"Loaded {len(queries)} queries from {args.queries}")
+    # 발주건 단위 baseline 모드 (Phase 0 / plan: merry-yawning-owl)
+    run_order_mode = args.queries_order is not None
+    # Legacy path is skipped only for an order-only run, i.e. --queries-order given with none of
+    # --output / --baseline-url / --candidate-url (--output-order is not consulted here).
+    run_legacy_mode = (
+        not run_order_mode
+        or args.output is not None
+        or args.baseline_url is not None
+        or args.candidate_url is not None
+    )
+
+    if run_order_mode:
+        if args.order_groups is None:
+            print("ERROR: --queries-order 사용 시 --order-groups 필수", file=sys.stderr)
+            return 2
+        if not args.base_url:
+            print("ERROR: --queries-order 모드는 --base-url만 지원 (A/B 미지원)", file=sys.stderr)
+            return 2
+
+    if not run_legacy_mode and not run_order_mode:
+        print("ERROR: 실행할 평가 경로가 없음", file=sys.stderr)
+        return 2
+
+    queries = load_queries(args.queries) if run_legacy_mode else []
+    if run_legacy_mode:
+        print(f"Loaded {len(queries)} queries from {args.queries}")
     print(f"Mode: {args.mode}", end="")
     if args.fusion:
         print(f" / fusion: {args.fusion}", end="")
@@ -462,47 +1094,72 @@ def main() -> int:
 
     all_results: list[QueryResult] = []
 
-    if args.base_url:
-        print(f"\n>>> evaluating: {args.base_url}")
-        results = asyncio.run(
-            evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze)
-        )
-        print_summary("single", results)
-        all_results.extend(results)
-    else:
-        print(f"\n>>> baseline: {args.baseline_url}")
-        baseline_results = asyncio.run(
-            evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze)
-        )
-        baseline_summary = print_summary("baseline", baseline_results)
+    if run_legacy_mode:
+        if args.base_url:
+            print(f"\n>>> evaluating: {args.base_url}")
+            results = asyncio.run(
+                evaluate(queries, args.base_url, args.token, "single", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze)
+            )
+            print_summary("single", results)
+            all_results.extend(results)
+        else:
+            print(f"\n>>> baseline: {args.baseline_url}")
+            baseline_results = asyncio.run(
+                evaluate(queries, args.baseline_url, args.token, "baseline", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze)
+            )
+            baseline_summary = print_summary("baseline", baseline_results)
 
-        print(f"\n>>> candidate: {args.candidate_url}")
-        candidate_results = asyncio.run(
-            evaluate(
-                queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze
+            print(f"\n>>> candidate: {args.candidate_url}")
+            candidate_results = asyncio.run(
+                evaluate(
+                    queries, args.candidate_url, args.token, "candidate", mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze
+                )
+            )
+            candidate_summary = print_summary("candidate", candidate_results)
+
+            # 델타
+            print("\n=== Δ (candidate - baseline) ===")
+            for k in (
+                "recall_at_10",
+                "mrr_at_10",
+                "ndcg_at_10",
+                "top3_hit_rate",
+                "latency_p50",
+                "latency_p95",
+            ):
+                delta = candidate_summary[k] - baseline_summary[k]
+                sign = "+" if delta >= 0 else ""
+                print(f"  {k:<16}: {sign}{delta:.3f}")
+
+            all_results.extend(baseline_results)
+            all_results.extend(candidate_results)
+
+        if args.output:
+            write_csv(all_results, args.output)
+
+    # 발주건 단위 baseline (Phase 0)
+    if run_order_mode:
+        order_queries = load_order_queries(args.queries_order)
+        order_groups = load_order_groups(args.order_groups)
+        print(
+            f"\nLoaded {len(order_queries)} order queries from {args.queries_order}"
+            f" / {len(order_groups)} groups from {args.order_groups}"
+        )
+        order_results = asyncio.run(
+            evaluate_orders(
+                order_queries, order_groups, args.base_url, args.token,
+                mode=args.mode, fusion=args.fusion, rerank=args.rerank, analyze=args.analyze,
+                debug=args.debug,
             )
         )
-        candidate_summary = print_summary("candidate", candidate_results)
-
-        # 델타
-        print("\n=== Δ (candidate - baseline) ===")
-        for k in (
-            "recall_at_10",
-            "mrr_at_10",
-            "ndcg_at_10",
-            "top3_hit_rate",
-            "latency_p50",
-            "latency_p95",
-        ):
-            delta = candidate_summary[k] - baseline_summary[k]
-            sign = "+" if delta >= 0 else ""
-            print(f"  {k:<16}: {sign}{delta:.3f}")
-
-        all_results.extend(baseline_results)
-        all_results.extend(candidate_results)
-
-    if args.output:
-        write_csv(all_results, args.output)
+        print_order_summary(order_results)
+        if args.output_order:
+            write_order_csv(order_results, args.output_order)
+        elif not args.output:
+            print(
+                "\nNOTE: --output-order 미지정 — CSV 저장 skip. 결과는 stdout 요약만.",
+                file=sys.stderr,
+            )
 
     return 0