From 9dad5e62898c69e4d70ece0ad87bfa879c8643f7 Mon Sep 17 00:00:00 2001 From: hyungi Date: Sun, 24 May 2026 04:35:33 +0000 Subject: [PATCH] =?UTF-8?q?chore(eval):=20graded=20NDCG=20dedup=20+=20warn?= =?UTF-8?q?ing=20+=20audit=20stats=20(Phase=202Q=20inflation=20=EC=A0=95?= =?UTF-8?q?=EC=A0=95)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR-Eval-GradedNDCG-Dedup. [[feedback_graded_ndcg_dedup_invariant]] cleanup. plan pr-eval-graded-ndcg-dedup-stormy-tide.md. 변경: - tests/search_eval/run_eval.py: · _dedup_returned_ids() helper — returned[:k] 첫 등장 순서 보존 dedup + count 반환 · count_dedup() wrapper (audit 용) · ndcg_at_k + graded_ndcg_at_k 진입 시 dedup (NDCG > 1.0 invariant 강제) · QueryResult.dedup_count 필드 + csv schema 신규 column · evaluate() 에서 dedup_count > 0 시 stderr WARNING · print_summary 에 dedup audit stats (cases/total chunks + 정상/⚠️ flag) - tests/search_eval/test_eval_graded_ndcg_dedup.py 신규 — 13 test: · _dedup_returned_ids 6 (empty / no-dup / dup-first / k-limit / count helper / Phase 2Q kw_001) · graded_ndcg invariant 5 (baseline 회귀 0 / dup 차단 / all-dup / exam_001 regression / empty grades) · ndcg_at_k binary dedup 1 + graded_recall set 변환 1 51/51 test PASS (13 신규 + 38 기존 회귀 0). 🚨 CRITICAL 측정 발견: dedup audit baseline = 0/51 정상 (single-query path 의 retrieval 가 doc unique 박제) dedup audit gemma = 42/51 (totaling 81 chunks dedup) ⚠️ → _rrf_fuse_variants 의 representative 보존 logic 이 같은 doc_id 의 여러 SearchResult 를 unique 가정. chunk_id dedup (Rerank-Fix) 이후에도 doc_id 중복 잔재. 
정정값 (이번이 가장 정확): baseline NDCG 0.644 (이전 0.659 와 noise level diff) gemma NDCG 0.641 → Δ vs baseline = -0.003 (사실상 동일, multi-query 실제 net 효과 ≈ 0) latency p50 +1005ms (+266%) — 회귀 Recall t≥3 -0.033 (회귀) 이전 박제값 (모두 inflation): Phase 3 (a41adb6) NDCG 0.927 — chunk_id 중복 Rerank-Fix (b734fc5) NDCG 0.876 — doc_id 중복 잔재 Category-Analysis (b00d9f5) NDCG 0.876 정정 박제 — 위와 동일 산출물: reports/v0_2_phase2q_eval_dedup_baseline_2026-05-24.csv (baseline 회귀 verify) reports/v0_2_phase2q_eval_dedup_gemma_2026-05-24.csv (실제 효과 측정) tests/search_eval/baselines/v0_2_phase2q_eval_dedup_2026-05-24.json (요약 + critical 권고) 권고 (사용자 결정 필요): 1. Apply rollback 검토 — multi-query 의 실제 net 효과 ≈ 0 + latency 4x 회귀 2. 또는 PR-2Q-Search-Result-Dedup 진입 (real fix _rrf_fuse_variants representative) 후 재측정 → 실제 multi-query 효과 측정 후 Apply 결정 Co-Authored-By: Claude Opus 4.7 (1M context) --- ...phase2q_eval_dedup_baseline_2026-05-24.csv | 52 +++++++ ..._2_phase2q_eval_dedup_gemma_2026-05-24.csv | 52 +++++++ .../v0_2_phase2q_eval_dedup_2026-05-24.json | 91 +++++++++++ tests/search_eval/run_eval.py | 62 +++++++- .../test_eval_graded_ndcg_dedup.py | 142 ++++++++++++++++++ 5 files changed, 396 insertions(+), 3 deletions(-) create mode 100644 reports/v0_2_phase2q_eval_dedup_baseline_2026-05-24.csv create mode 100644 reports/v0_2_phase2q_eval_dedup_gemma_2026-05-24.csv create mode 100644 tests/search_eval/baselines/v0_2_phase2q_eval_dedup_2026-05-24.json create mode 100644 tests/search_eval/test_eval_graded_ndcg_dedup.py diff --git a/reports/v0_2_phase2q_eval_dedup_baseline_2026-05-24.csv b/reports/v0_2_phase2q_eval_dedup_baseline_2026-05-24.csv new file mode 100644 index 0000000..b26fdc0 --- /dev/null +++ b/reports/v0_2_phase2q_eval_dedup_baseline_2026-05-24.csv @@ -0,0 +1,52 @@ 
+label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,dedup_count,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,3879;3868;3890;3863;3856;3908;3851;4041;10573;3895,474.5,1.000,1.000,0.947,0,0.731,1.000,1.000,0, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3874;3922;3917;3918;10573;3854;10571;3877;3920,860.9,1.000,1.000,0.877,0,0.676,1.000,1.000,0, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3981;3985;3980;3857;3984;3880;3993;3903;3988;3869,344.3,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,4041;3858;3852;3851;3881;3905;3913;3915;3877;3903,327.5,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,3910;3888;3905;3890;3885;3913;3895;3908;3894;3898,351.9,1.000,0.500,0.631,1,0.631,1.000,1.000,0, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,5249;3897;5244;3868;5253;3851;3895;3856;3874;3879,431.5,0.750,0.250,0.404,0,0.404,1.000,1.000,0, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 일,3855;3867;3878,3855:3;3867:2;3878:2,3855;3867;3878;5227;10571;5244;3854;10573;3896;5249,351.3,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 
의무,3980;3981;3982,3980:2;3981:2;3982:2,3980;5253;3985;3760;3917;5227;3757;5238;3904;3903,290.6,0.333,1.000,0.469,1,0.469,0.333,0.000,0, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,3917;3854;3918;10572;3916;3877;5227;5226;3759;3859,291.8,0.500,1.000,0.541,1,0.698,0.500,1.000,0, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,3853;3876;5249;3811;11677;3778;3810;10573;6675;3757,489.8,0.500,1.000,0.613,1,0.787,0.500,1.000,0, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,20893;22048;21276;22054;15793;16081;18088;15922;16526;21273,373.7,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,16532;21706;15922;17123;21890;22049;4346;9022;4767;6067,247.4,0.000,0.000,0.000,0,0.000,0.000,0.000,0, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4071;4064;4065;4066;4068;5105;4058;11481;4067,321.8,1.000,1.000,0.877,1,0.932,1.000,1.000,0, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4062;4059;4070;4060;4061;4066;4068;4058;4063;5113,292.4,1.000,1.000,0.853,1,0.853,1.000,0.000,0, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,4775;23446;4776;4202;4679;24382;21155;4668;4199;21855,216.9,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 원리,3770;3856,3770:3;3856:2,5239;3817;3791;3770;4540;3758;4548;3787;3789;5249,399.0,0.500,0.250,0.264,0,0.339,0.500,1.000,0, 
+single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5249;5230;3817;10573;3755;3787;3815;3802;5235;3774,324.7,0.250,0.200,0.151,1,0.151,0.250,0.000,0, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,3790;3772;5260;3897;5248;3771;3769;11671;13936;3755,959.2,1.000,1.000,1.000,1,1.000,1.000,0.000,0, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,22342;19576;17069;15924;16935;23149;16019;16462;16010;4776,299.2,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,16761;21275;16927;20893;16771;17242;4329;20886;4457;4307,482.1,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,23732;3774;5262;4547;5161;19373;20758;5174;24155;20032,236.1,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,16289;5089;5092;5250;22202;20507;5070;5118;5173;23605,258.1,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,20022;20470;4634;15361;16059;9102;23336;18286;16218;5738,214.7,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 통로,3886;3887,3886:3;3887:2,5236;3886;3900;3895;3887;3881;3894;3908;3889;3912,377.8,1.000,0.500,0.624,1,0.627,1.000,1.000,0, +single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 
방지,3896;3766,3766:1;3896:3,3896;3895;3903;13930;11562;13931;13929;3900;3886;3902,353.0,0.500,1.000,0.613,1,0.917,1.000,1.000,0, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11644;11579;4025;4026;11645;13750;11676;13299;13749;13766,419.8,1.000,0.333,0.571,1,0.539,1.000,1.000,0, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13311;13306;13312;13302;13304;13309;13299;13313;13918,387.5,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;11689;13657;13655;13656;13649;13651;13752;13659;13650,278.8,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,3897;3895;3902;3896;3887;13935;13938;3877;3900;3899,411.5,0.500,1.000,0.613,1,0.917,1.000,1.000,0, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5144;5136;5180;5193;5140;5137;5149;5178;5207;5148,2266.0,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5204;5180;5182;5208;5206;5210;5141;5137;5145;5183,2217.5,1.000,1.000,0.850,1,0.918,1.000,1.000,0, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5178;5214;5205;5186;5145;5204;5148;5180;5192;5190,1960.3,1.000,0.333,0.511,1,0.502,1.000,1.000,0, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of vessel supports,5149,5149:3,5149;5140;5136;5137;5141;5186;5145;5182;5190;5185,1399.1,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME 
code,5178,5178:3,5178;5210;5180;5182;5143;5204;5211;5207;5185;5186,1482.0,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5180;5178;5186;5187;5182;5209;5224;5188;5204;4835,1336.9,0.500,0.167,0.218,0,0.327,1.000,1.000,0, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5208;5189;5192;5180;5187;5186;5212;5188;5182;5137,1902.0,0.500,1.000,0.613,1,0.787,0.500,1.000,0, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3763;3759;3774;3755;3818;3812;3778;3756;3761;3771,1402.9,1.000,1.000,0.877,1,0.974,1.000,1.000,0, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5207;5204;5225;5206;5208;5210;5137;5182;5145,961.2,0.750,1.000,0.767,1,0.686,0.750,1.000,0, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5222;5225;5209;5180;5204;5210;5205;5178;5143,781.8,1.000,1.000,0.877,1,0.932,1.000,1.000,0, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,4026;5145;5182;5143;5210;13749;5204;5186;13760;13671,895.6,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,11647;13760;13674;13669;13774;13773;13675;13755;13924;13772,362.2,0.250,1.000,0.390,1,0.647,0.333,1.000,0, +single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen safety,5178;5169,5169:2;5178:3,10575;11671;11649;11648;13915;5241;11563;5173;5177;11653,608.8,0.000,0.000,0.000,0,0.000,0.000,0.000,0, 
+single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11504;11533;5081;11509;11476;11486;5064;3788;5134;5075,496.4,0.500,1.000,0.613,1,0.787,0.500,1.000,0, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11495;11501;5139;5090;5178;11515;5210;11493;11719,293.0,0.667,1.000,0.765,1,0.856,0.667,1.000,0, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;11475;5090;5084;11531;11476;11473;5093;11479;5124,546.2,0.500,1.000,0.613,1,0.787,0.500,1.000,0, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,11591;11664;13948;13660;5177;13652;11665;13917;11660;13752,301.3,0.333,1.000,0.469,1,0.674,0.333,1.000,0, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11627;11658;11600;11625;11692;13918;13751;5177;13653;13753,308.9,0.667,1.000,0.671,1,0.883,1.000,1.000,0, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,11595;11616;13669;11617;11649;11655;11690;11658;11653;11689,254.7,0.333,0.250,0.202,0,0.321,0.500,1.000,0, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11711;11712;11503;11500;11713;11714;13930;11717;11701;11502,309.1,1.000,1.000,1.000,1,0.858,1.000,1.000,0, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;11693;11692;13665;13661;13664;13666;13670;13773;13934,309.6,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 안전 관리법 시행규칙,,,4026;5236;3977;3971;3966;4018;3972;3973;3974;3895,382.5,0.000,0.000,0.000,1,0.000,0.000,0.000,0, diff --git a/reports/v0_2_phase2q_eval_dedup_gemma_2026-05-24.csv 
b/reports/v0_2_phase2q_eval_dedup_gemma_2026-05-24.csv new file mode 100644 index 0000000..e8c3bfe --- /dev/null +++ b/reports/v0_2_phase2q_eval_dedup_gemma_2026-05-24.csv @@ -0,0 +1,52 @@ +label,id,category,legacy_category,intent,domain_hint,language,ocr_derived,failure_expected,query,relevant_ids,graded_relevance,returned_ids_top10,latency_ms,recall_at_10,mrr_at_10,ndcg_at_10,top3_hit,graded_ndcg_at_10,graded_recall_at_10_t2,graded_recall_at_10_t3,dedup_count,error +single,kw_001,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건법 제6장,3856;3868;3879,3856:3;3868:2;3879:2,3879;3868;3890;3863;3856;3908;3851;4041;10573;3895,1690.9,1.000,1.000,0.947,0,0.731,1.000,1.000,0, +single,kw_002,standards,exact_keyword,fact_lookup,document,ko,0,0,중대재해 처벌 등에 관한 법률 제2장 중대산업재해,3917;3921,3917:3;3921:2,3921;3917;3917;10573;3923;3919;3916;3919;3918;10573,1277.5,1.000,1.000,1.000,1,0.834,1.000,1.000,3, +single,kw_003,standards,exact_keyword,fact_lookup,document,ko,0,0,화학물질관리법 유해화학물질 영업자,3981,3981:3,3981;3981;3985;3980;3984;3869;3984;3993;3857;3978,1045.5,1.000,1.000,1.000,1,1.000,1.000,1.000,2, +single,kw_004,standards,exact_keyword,fact_lookup,document,ko,0,0,근로기준법 안전과 보건,4041,4041:3,4041;3852;3851;3851;3915;3775;3905;3904;3777;3903,2418.1,1.000,1.000,1.000,1,1.000,1.000,1.000,1, +single,kw_005,standards,exact_keyword,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 보호구,3888,3888:3,10570;3888;3888;3911;3905;3889;3890;3910;3902;3893,1742.0,1.000,0.500,0.631,1,0.631,1.000,1.000,1, +single,nl_001,korean_only,natural_language_ko,semantic_search,document,ko,0,0,기계로 인한 산업재해 관련 법령,3856;3868;3879;3854,3854:1;3856:3;3868:2;3879:2,5244;3878;5249;3855;3897;3863;3867;3868;3874;5253,1178.5,0.250,0.125,0.123,0,0.087,0.333,0.000,0, +single,nl_002,korean_only,natural_language_ko,semantic_search,document,ko,0,0,사업주가 도급을 줄 때 산업재해를 예방하기 위해 해야 할 일,3855;3867;3878,3855:3;3867:2;3878:2,3855;5227;3867;3855;5236;3878;3917;3854;3851;5244,1025.7,1.000,1.000,0.885,1,0.930,1.000,1.000,1, 
+single,nl_003,korean_only,natural_language_ko,semantic_search,document,ko,0,0,유해화학물질을 다루는 회사가 지켜야 할 안전 의무,3980;3981;3982,3980:2;3981:2;3982:2,3980;3980;3903;3904;3896;3903;3909;3985;3981;3904,1109.9,0.667,1.000,0.626,1,0.626,0.667,0.000,3, +single,nl_004,korean_only,natural_language_ko,semantic_search,document,ko,0,0,중대재해가 발생했을 때 경영책임자가 처벌받는 기준,3916;3917;3920;3921,3916:2;3917:3;3920:2;3921:2,10572;10573;3917;3916;3917;3923;3921;3918;3923;3919,1134.2,0.750,0.333,0.502,1,0.502,0.750,1.000,2, +single,nl_005,korean_only,natural_language_ko,semantic_search,document,ko,0,0,안전보건교육은 누가 받아야 하고 어떤 내용을 다루는가,3853;3865,3853:3;3865:2,3853;3876;5249;5234;4025;11677;6675;10573;4842;11677,1792.1,0.500,1.000,0.613,1,0.787,0.500,1.000,1, +single,news_001,korean_only,news_ko,semantic_search,news,ko,0,0,이란과 미국의 군사 충돌,4303;4304;4307;4316;4322;4323;4327;4335,4303:2;4304:2;4307:2;4316:2;4322:2;4323:2;4327:2;4335:2,14813;15924;15924;15976;16378;16081;18077;22048;12213;16019,759.8,0.000,0.000,0.000,1,0.000,0.000,0.000,1, +single,news_002,korean_only,news_ko,semantic_search,news,ko,0,0,호르무즈 해협 봉쇄,4316;4320;4322;4327,4316:3;4320:2;4322:2;4327:2,22049;17123;9022;11945;5391;6396;6829;9105;6774;6314,541.3,0.000,0.000,0.000,0,0.000,0.000,0.000,0, +single,misc_001,korean_only,other_domain,fact_lookup,document,ko,0,0,강체의 평면 운동학,4063;4065,4063:3;4065:2,4063;4065;4064;4066;4065;4066;4063;4071;4071;4068,898.0,1.000,1.000,1.000,1,1.000,1.000,1.000,4, +single,misc_002,korean_only,other_domain,semantic_search,document,ko,0,0,질점의 운동역학,4060;4061;4062,4060:2;4061:2;4062:2,4070;4060;4062;4059;4059;4061;4064;4062;4058;4065,1382.8,1.000,0.500,0.712,1,0.712,1.000,0.000,2, +single,news_003,english_only,news_en,semantic_search,news,en,0,0,Trump Iran ultimatum,4258;4260;4262,4258:2;4260:2;4262:2,21186;4775;4202;4776;4679;4199;4519;4668;4515;22069,711.9,0.000,0.000,0.000,1,0.000,0.000,0.000,0, +single,cl_001,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,기계 안전 가드 설계 
원리,3770;3856,3770:3;3856:2,5239;3758;3770;3791;3770;3817;3763;4540;4540;3787,1546.4,0.500,0.333,0.307,1,0.394,0.500,1.000,2, +single,cl_002,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,산업 안전 입문서,3755;3775;3776;3777,3755:2;3775:2;3776:2;3777:2,5244;5249;5249;5229;3774;3755;3755;3767;3756;3758,1706.2,0.250,0.167,0.151,1,0.151,0.250,0.000,2, +single,cl_003,mixed,crosslingual_ko_en,semantic_search,document,mixed,0,0,전기 안전 위험,3772;3790,3772:2;3790:2,3772;3790;5260;3897;3897;3772;3755;10574;13936;13937,2118.3,1.000,1.000,1.000,1,1.000,1.000,0.000,2, +single,news_004,mixed,news_fr,semantic_search,news,mixed,0,0,guerre en Iran,4199;4202;4210;4361;4363;4507;4519;4521,4199:2;4202:2;4210:2;4361:2;4363:2;4507:2;4519:2;4521:2,5840;16010;16457;6945;5398;4199;6996;23149;4776;17069,793.2,0.125,0.167,0.090,1,0.090,0.125,0.000,0, +single,news_005,mixed,news_crosslingual,semantic_search,news,mixed,0,0,이란 미국 전쟁 글로벌 반응,4202;4258;4262;4536;4303;4304;4316,4202:2;4258:2;4262:2;4303:2;4304:2;4316:2;4536:2,21848;8381;16823;7473;21275;4262;9545;16927;16378;15924,1547.6,0.143,0.167,0.098,1,0.098,0.143,0.000,0, +single,fail_001,failure_expected,failure_expected,semantic_search,document,mixed,0,1,Rust async runtime tokio scheduler 내부 구조,,,5161;5070;5262;23732;5262;4546;24155;4546;5092;20758,748.5,0.000,0.000,0.000,1,0.000,0.000,0.000,2, +single,fail_002,failure_expected,failure_expected,semantic_search,document,ko,0,1,양자컴퓨터 큐비트 디코히어런스,,,5057;5090;5090;5068;5063;5103;5066;5066;5076;24955,743.1,0.000,0.000,0.000,1,0.000,0.000,0.000,2, +single,fail_003,failure_expected,failure_expected,semantic_search,news,ko,0,1,재즈 보컬리스트 빌리 홀리데이,,,18567;18567;20022;20022;20470;20470;4634;20066;15361;15984,439.8,0.000,0.000,0.000,1,0.000,0.000,0.000,3, +single,kw_006,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준에 관한 규칙 작업장 통로,3886;3887,3886:3;3887:2,3886;3887;3895;3902;3887;3895;3894;3889;3892;3890,1747.6,1.000,1.000,1.000,1,1.000,1.000,1.000,2, 
+single,kw_007,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 폭발 화재 위험물 누출 방지,3896;3766,3766:1;3896:3,3896;3896;3895;3903;13930;3897;3772;3766;3766;13931,1222.0,1.000,1.000,0.818,1,0.961,1.000,1.000,2, +single,kw_008,standards,standards,fact_lookup,document,ko,0,0,고압가스 안전관리법 전문,4025;4026,4025:3;4026:2,11644;11644;11579;11579;4025;4025;4026;11693;4026;13299,2097.6,1.000,0.200,0.571,0,0.539,1.000,1.000,4, +single,kw_009,standards,standards,fact_lookup,document,ko,0,0,KGS FP111 가스설비 배관설비 기준,13305,13305:3,13305;13311;13306;13312;13302;13304;13299;13313;13310;13303,893.7,1.000,1.000,1.000,1,1.000,1.000,1.000,0, +single,kw_010,standards,standards,fact_lookup,document,ko,0,0,KGS FU551 가스설비 압력조정기 가스계량기,13652,13652:3,13652;11689;13657;13655;13651;13656;13649;13651;13752;13658,560.5,1.000,1.000,1.000,1,1.000,1.000,1.000,1, +single,kw_011,standards,standards,fact_lookup,document,ko,0,0,산업안전보건기준 전기로 인한 위험 방지,3897;3772,3772:1;3897:3,3897;3897;3895;3902;3758;3886;3755;3896;3887;13935,2015.1,0.500,1.000,0.613,1,0.917,1.000,1.000,1, +single,en_001,english_only,english_only,semantic_search,document,en,0,0,pressure vessel flange design,5144;5136,5136:2;5144:3,5144;5136;5178;5180;5136;5207;5140;5137;5140;5149,3164.7,1.000,1.000,1.000,1,1.000,1.000,1.000,2, +single,en_002,english_only,english_only,semantic_search,document,en,0,0,ASME Section VIII introduction,5204;5206,5204:3;5206:2,5204;5204;5180;5208;5210;5143;5206;5137;5207;5182,3527.0,1.000,1.000,0.832,1,0.907,1.000,1.000,1, +single,en_003,english_only,english_only,semantic_search,document,en,0,0,impact test requirements ASME,5205;5148,5148:1;5205:3,5205;5204;5178;5214;5214;5224;5210;5148;5145;5186,4178.2,1.000,1.000,0.818,1,0.961,1.000,1.000,1, +single,en_004,english_only,english_only,semantic_search,document,en,0,0,design of vessel supports,5149,5149:3,5149;5149;5141;5136;5137;5186;5139;5136;5140;5186,3640.1,1.000,1.000,1.000,1,1.000,1.000,1.000,3, 
+single,en_005,english_only,english_only,semantic_search,document,en,0,0,hydrogen piping ASME code,5178,5178:3,5178;5178;5139;5180;5210;5179;5180;5210;5143;5182,3293.5,1.000,1.000,1.000,1,1.000,1.000,1.000,3, +single,en_006,english_only,english_only,semantic_search,document,en,0,0,ASME welding qualification requirements,5209;3771,3771:1;5209:3,5204;5224;5208;5209;5205;5178;5180;5178;5225;5208,3715.8,0.500,0.250,0.264,0,0.395,1.000,1.000,2, +single,en_007,english_only,english_only,semantic_search,document,en,0,0,pressure vessel fabrication and inspection,5208;5145,5145:2;5208:3,5139;5135;5208;5210;5187;5210;5133;5204;4026;5204,3376.9,0.500,0.333,0.307,1,0.394,0.500,1.000,2, +single,en_008,english_only,english_only,semantic_search,document,en,0,0,Industrial Safety and Health Management ergonomics,3763;3755,3755:1;3763:3,3763;3812;3755;3760;5253;5230;5249;3812;3858;10573,1391.2,1.000,1.000,0.920,1,0.983,1.000,1.000,1, +single,cl_004,mixed,mixed,semantic_search,document,mixed,0,0,ASME 압력용기 설계 실무,5207;5210;5139;5135,5135:2;5139:2;5207:3;5210:3,5139;5210;5145;5137;5180;5204;5140;5206;5145;5206,2204.9,0.500,1.000,0.637,0,0.522,0.500,0.500,2, +single,cl_005,mixed,mixed,semantic_search,document,mixed,0,0,ASME 용접 코드 해설,5224;5209,5209:2;5224:3,5224;5225;5225;5224;5210;5180;5204;5209;5209;5178,1763.9,1.000,1.000,0.832,1,0.907,1.000,1.000,3, +single,cl_006,mixed,mixed,semantic_search,document,mixed,0,0,pressure vessel Korean industrial safety regulation,4025;3856;5136,3856:2;4025:2;5136:1,5210;5210;5186;13913;5143;13760;13749;5145;5137;5140,1494.0,0.000,0.000,0.000,1,0.000,0.000,0.000,1, +single,cl_007,mixed,mixed,semantic_search,document,mixed,0,0,KGS 코드 LPG safety standard,11647;11689;11645;4025,4025:1;11645:2;11647:3;11689:2,11647;13760;13674;13669;13774;13773;13675;11688;13757;11689,602.1,0.500,1.000,0.503,1,0.727,0.667,1.000,0, +single,cl_008,mixed,mixed,semantic_search,document,mixed,0,0,수소 가스 안전 기준 hydrogen 
safety,5178;5169,5169:2;5178:3,10575;5177;10572;11671;11653;11649;5173;5177;11653;13946,1940.6,0.000,0.000,0.000,0,0.000,0.000,0.000,2, +single,exam_001,exam,exam,fact_lookup,document,ko,0,0,레이놀즈수 정의 공식,11504;11505,11504:3;11505:2,11504;11533;11504;5090;4544;5081;11509;5140;5089;11476,1233.9,0.500,1.000,0.613,1,0.787,0.500,1.000,1, +single,exam_002,exam,exam,fact_lookup,document,ko,0,0,탱크 바닥 구멍 유체 유속 토리첼리,11500;11495;11496,11495:2;11496:2;11500:3,11500;11495;11501;3788;5071;5090;5139;11486;5106;5090,701.2,0.667,1.000,0.765,1,0.856,0.667,1.000,1, +single,exam_003,exam,exam,fact_lookup,document,ko,0,0,이상기체 음속 마하수,11514;11515,11514:3;11515:2,11514;11479;11516;11475;5090;5084;11515;11531;11476;11514,1587.7,1.000,1.000,0.818,1,0.900,1.000,1.000,1, +single,exam_004,exam,exam,fact_lookup,document,ko,0,0,고압가스 용기 내압시험 영구증가량,11591;11644;11691,11591:3;11644:2;11691:2,11665;11664;11591;11591;13660;11664;13948;13660;11665;13942,1795.3,0.333,0.333,0.235,1,0.337,0.333,1.000,4, +single,exam_005,exam,exam,fact_lookup,document,ko,0,0,도시가스 배관 매설 이격거리,11627;11625;11646,11625:2;11627:3;11646:1,11658;11627;13753;11625;13918;13752;13653;13918;13751;13915,1122.2,0.667,0.500,0.498,1,0.608,1.000,1.000,1, +single,exam_006,exam,exam,fact_lookup,document,ko,0,0,LPG 저장탱크 안전거리 분말소화기,11617;11669;11620,11617:3;11620:1;11669:2,13916;13752;11595;13669;11616;11690;11649;11617;11658;11655,602.1,0.333,0.125,0.148,0,0.235,0.500,1.000,0, +single,exam_007,exam,exam,fact_lookup,document,ko,0,0,오리피스 차압식 유량계,11712;11711;11503,11503:2;11711:2;11712:3,11712;11711;11503;11500;11711;11701;11502;11713;13930;11717,517.9,1.000,1.000,1.000,1,1.000,1.000,1.000,1, +single,fail_004,failure_expected,failure_expected,fact_lookup,document,ko,0,1,KGS AC999 임의 가스 코드,,,11691;11693;5210;13936;5210;13665;13664;11691;13673;11693,364.7,0.000,0.000,0.000,1,0.000,0.000,0.000,3, +single,fail_005,failure_expected,failure_expected,fact_lookup,document,ko,0,1,초전도 안전 관리법 
시행규칙,,,3895;4026;3875;3966;5210;5210;3961;3971;4025;4026,1106.7,0.000,0.000,0.000,1,0.000,0.000,0.000,2, diff --git a/tests/search_eval/baselines/v0_2_phase2q_eval_dedup_2026-05-24.json b/tests/search_eval/baselines/v0_2_phase2q_eval_dedup_2026-05-24.json new file mode 100644 index 0000000..b127a32 --- /dev/null +++ b/tests/search_eval/baselines/v0_2_phase2q_eval_dedup_2026-05-24.json @@ -0,0 +1,91 @@ +{ + "version": "v0.2-phase2q-eval-dedup", + "label": "phase_2q_eval_graded_ndcg_dedup_invariant_recovery", + "date": "2026-05-24", + "plan": "pr-eval-graded-ndcg-dedup-stormy-tide.md", + "main_head_pre": "b00d9f5", + "critical_finding": { + "summary": "Phase 2Q multi-query 의 실제 net 효과 = 거의 0 (NDCG -0.003 vs baseline). 모든 박제 측정 (0.927/0.876/+0.217) 은 inflation 결과.", + "root_cause": "_rrf_fuse_variants 의 representative 보존 logic 이 같은 doc_id 의 여러 SearchResult 를 unique 가정 — 실제로는 multi-query path 에서 doc_id 중복 박제. chunk_id dedup (Rerank-Fix) 으로 chunk-level inflation 만 해결, doc-level inflation 잔재.", + "evidence": "dedup audit = baseline 0/51 정상 vs gemma 42/51 cases with 81 chunks dedup applied", + "rollout_impact": "Apply opt-in 1주 관찰 결정 재검토 필요. multi-query 의 net 개선 sub-noise level + latency 4x 회귀." 
+ }, + "snapshot": { + "doc_id_max": 25180, + "chunk_id_max": 56526 + }, + "eval_set": {"total_cases": 51, "scored_cases": 46}, + "measurements": { + "baseline_rewrite_null": { + "graded_ndcg_at_10": 0.644, + "graded_recall_at_10_t2": 0.699, + "graded_recall_at_10_t3": 0.761, + "latency_p50_ms": 378, + "latency_p95_ms": 1931, + "dedup_audit": { + "cases_with_dedup": "0/51", + "total_dedup_chunks": 0, + "status": "✓ 정상 — single-query path 의 retrieval 가 doc unique 박제" + }, + "by_category": { + "english_only": {"recall": 0.78, "gndcg": 0.72}, + "exam": {"recall": 0.57, "gndcg": 0.74}, + "korean_only": {"recall": 0.56, "gndcg": 0.57}, + "mixed": {"recall": 0.38, "gndcg": 0.38}, + "standards": {"recall": 0.91, "gndcg": 0.82} + }, + "csv": "reports/v0_2_phase2q_eval_dedup_baseline_2026-05-24.csv" + }, + "cand_multi_query_macmini": { + "graded_ndcg_at_10": 0.641, + "graded_recall_at_10_t2": 0.716, + "graded_recall_at_10_t3": 0.728, + "latency_p50_ms": 1383, + "latency_p95_ms": 3584, + "dedup_audit": { + "cases_with_dedup": "42/51", + "total_dedup_chunks": 81, + "status": "⚠️ inflation — _rrf_fuse_variants representative 의 doc_id 중복 박제 (chunk_id dedup 후 잔재)" + }, + "by_category": { + "english_only": {"recall": 0.78, "gndcg": 0.74}, + "exam": {"recall": 0.64, "gndcg": 0.67}, + "korean_only": {"recall": 0.57, "gndcg": 0.52}, + "mixed": {"recall": 0.40, "gndcg": 0.39}, + "standards": {"recall": 0.95, "gndcg": 0.87} + }, + "csv": "reports/v0_2_phase2q_eval_dedup_gemma_2026-05-24.csv" + }, + "delta_vs_baseline": { + "overall_ndcg": "-0.003 (사실상 동일, noise level)", + "recall_t2": "+0.017", + "recall_t3": "-0.033 (회귀)", + "english_only": "+0.02", + "exam": "-0.07", + "korean_only": "-0.05", + "mixed": "+0.01", + "standards": "+0.05", + "latency_p50": "+1005ms (+266%)", + "latency_p95": "+1653ms (+86%)" + } + }, + "previous_inflated_records": [ + {"source": "Phase 3 commit a41adb6", "ndcg": 0.927, "inflation_source": "chunk_id 중복 (chunks_per_doc cap 만, dedup 0)"}, + {"source": 
"Rerank-Fix commit b734fc5", "ndcg": 0.876, "inflation_source": "chunk_id dedup 적용 단 doc_id 중복 잔재"}, + {"source": "Category-Analysis commit b00d9f5", "ndcg": 0.876, "note": "Rerank-Fix 측정값 재사용 — 본 정정 후 0.641"} + ], + "recommendations": { + "immediate_user_decision": [ + "Apply rollback 검토 — multi-query 의 실제 net 효과 ≈ 0 + latency 4x 회귀 + LLM endpoint 의존", + "또는 PR-2Q-Search-Result-Dedup 진입 (real fix) 후 재측정 → 실제 multi-query 효과 측정 후 Apply 결정" + ], + "next_pr": "PR-2Q-Search-Result-Dedup — _rrf_fuse_variants representative 가 doc_id unique 가정 invariant 강제. dedup audit 가 42/51 → 0/51 회복 + NDCG 실제 효과 측정 가치" + }, + "changes_summary": { + "files_changed": [ + "tests/search_eval/run_eval.py — _dedup_returned_ids helper + count_dedup wrapper + ndcg_at_k/graded_ndcg_at_k 진입 시 dedup + print_summary dedup audit stats + QueryResult.dedup_count + csv schema column", + "tests/search_eval/test_eval_graded_ndcg_dedup.py — 13 신규 test (dedup helper + invariant + Phase 2Q 실측 case regression)" + ], + "test_results": "13 신규 PASS + 38 기존 PASS = 51/51, retrieval path 영향 0" + } +} diff --git a/tests/search_eval/run_eval.py b/tests/search_eval/run_eval.py index 91df4b1..5c9f57a 100644 --- a/tests/search_eval/run_eval.py +++ b/tests/search_eval/run_eval.py @@ -90,6 +90,8 @@ class QueryResult: graded_ndcg_at_10: float = 0.0 graded_recall_at_10_t2: float = 0.0 graded_recall_at_10_t3: float = 0.0 + # PR-Eval-GradedNDCG-Dedup: returned[:k] 의 중복 doc 수 박제. inflation 검출 audit. + dedup_count: int = 0 error: str | None = None @@ -98,6 +100,34 @@ class QueryResult: # ───────────────────────────────────────────────────────── +def _dedup_returned_ids(returned: list[int], k: int) -> tuple[list[int], int]: + """returned[:k] 의 첫 등장 순서 보존 dedup. + + PR-Eval-GradedNDCG-Dedup ([[feedback_graded_ndcg_dedup_invariant]]). graded NDCG / + binary NDCG 계산은 top-N 에 unique doc 가정 — retrieval path 가 중복 doc 박제 + 가능 시 actual DCG > ideal DCG → NDCG > 1.0 invariant 위반. Phase 2Q Phase 3 + NDCG 0.927 inflation origin. 
+ + Returns: (deduped_top_k, dedup_count) — dedup_count = top-k 영역에서 제거된 중복 entry 수. + """ + seen: set[int] = set() + deduped: list[int] = [] + raw_top_k = returned[:k] + for doc_id in raw_top_k: + if doc_id in seen: + continue + seen.add(doc_id) + deduped.append(doc_id) + dedup_count = len(raw_top_k) - len(deduped) + return deduped, dedup_count + + +def count_dedup(returned: list[int], k: int = 10) -> int: + """returned[:k] 의 중복 doc 수 (audit 용).""" + _, dedup_count = _dedup_returned_ids(returned, k) + return dedup_count + + def recall_at_k(returned: list[int], relevant: list[int], k: int = 10) -> float: """top-k 안에 들어간 정답 비율. 정답 0개면 1.0(빈 케이스는 별도 fail metric).""" if not relevant: @@ -119,12 +149,16 @@ def mrr_at_k(returned: list[int], relevant: list[int], k: int = 10) -> float: def ndcg_at_k(returned: list[int], relevant: list[int], k: int = 10) -> float: - """binary relevance 기반 NDCG@k. top3_ids 같은 가중치는 v0.1에선 무시.""" + """binary relevance 기반 NDCG@k. top3_ids 같은 가중치는 v0.1에선 무시. + + PR-Eval-GradedNDCG-Dedup: returned[:k] 진입 직전 dedup (중복 doc inflation 방지). + """ if not relevant: return 0.0 + deduped, _ = _dedup_returned_ids(returned, k) relevant_set = set(relevant) dcg = 0.0 - for rank, doc_id in enumerate(returned[:k], start=1): + for rank, doc_id in enumerate(deduped, start=1): if doc_id in relevant_set: # binary gain = 1, DCG = 1 / log2(rank+1) dcg += 1.0 / math.log2(rank + 1) @@ -140,11 +174,16 @@ def graded_ndcg_at_k(returned: list[int], grades: dict[int, int], k: int = 10) - gain = 2^grade - 1 (grade=0 → gain=0, grade=3 → gain=7). ideal DCG = grades 를 grade 내림차순으로 top-k 채운 경우. grades 비어 있으면 0.0 (failure_expected 케이스는 별도 처리). + + PR-Eval-GradedNDCG-Dedup: returned[:k] 진입 직전 dedup. Phase 2Q Phase 3 NDCG 0.927 + inflation (top-N doc 중복 박제) 같은 invariant 위반 회피. + [[feedback_graded_ndcg_dedup_invariant]]. 
""" if not grades: return 0.0 + deduped, _ = _dedup_returned_ids(returned, k) dcg = 0.0 - for rank, doc_id in enumerate(returned[:k], start=1): + for rank, doc_id in enumerate(deduped, start=1): grade = grades.get(doc_id, 0) if grade > 0: dcg += (2 ** grade - 1) / math.log2(rank + 1) @@ -272,6 +311,12 @@ async def evaluate( reranker_backend=reranker_backend, rewrite_backend=rewrite_backend, ) + dedup_count = count_dedup(returned_ids, 10) + if dedup_count > 0: + print( + f" [dedup] {q.id}: top-10 에 중복 doc {dedup_count}개 (inflation 회피)", + file=sys.stderr, + ) results.append( QueryResult( query=q, @@ -289,6 +334,7 @@ async def evaluate( graded_recall_at_10_t3=graded_recall_at_k( returned_ids, q.graded_relevance, threshold=3, k=10 ), + dedup_count=dedup_count, ) ) except Exception as exc: @@ -403,6 +449,14 @@ def print_summary( f" Failure-case precision: {failure_correct}/{len(failure_cases)}" f" ({failure_precision:.2f}) — empty result expected" ) + # PR-Eval-GradedNDCG-Dedup: dedup audit stats (inflation 검출). + dedup_cases = [r for r in results if r.dedup_count > 0] + dedup_total = sum(r.dedup_count for r in dedup_cases) + print( + f" Dedup audit: {len(dedup_cases)}/{len(results)} cases with dedup applied" + f" (totaling {dedup_total} chunks). 
" + + ("⚠️ inflation 의심 — retrieval path 검증" if dedup_cases else "✓ 정상 (top-N unique doc invariant)") + ) # 카테고리별 by_cat: dict[str, list[QueryResult]] = {} @@ -539,6 +593,7 @@ def write_csv(results: list[QueryResult], output_path: Path) -> None: "graded_ndcg_at_10", "graded_recall_at_10_t2", "graded_recall_at_10_t3", + "dedup_count", "error", ] ) @@ -569,6 +624,7 @@ def write_csv(results: list[QueryResult], output_path: Path) -> None: f"{r.graded_ndcg_at_10:.3f}", f"{r.graded_recall_at_10_t2:.3f}", f"{r.graded_recall_at_10_t3:.3f}", + str(r.dedup_count), r.error or "", ] ) diff --git a/tests/search_eval/test_eval_graded_ndcg_dedup.py b/tests/search_eval/test_eval_graded_ndcg_dedup.py new file mode 100644 index 0000000..9160a06 --- /dev/null +++ b/tests/search_eval/test_eval_graded_ndcg_dedup.py @@ -0,0 +1,142 @@ +"""PR-Eval-GradedNDCG-Dedup — run_eval.py 의 graded NDCG dedup invariant 테스트. + +[[feedback_graded_ndcg_dedup_invariant]] regression. graded NDCG 는 top-N unique doc 가정 +— retrieval path 가 doc 중복 박제 시 inflation (NDCG > 1.0). dedup helper + 함수 진입 +시 dedup 으로 invariant 복원. +""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +import pytest + +# tests/search_eval/ → 프로젝트 루트 +THIS_DIR = Path(__file__).resolve().parent +sys.path.insert(0, str(THIS_DIR)) # run_eval 직접 import 위해 + +from run_eval import ( + _dedup_returned_ids, + count_dedup, + graded_ndcg_at_k, + graded_recall_at_k, + ndcg_at_k, +) + + +# ─── 1. 
# _dedup_returned_ids helper ─────────────────────────


def test_dedup_empty_returns_empty():
    """An empty ranking dedups to an empty list with nothing removed."""
    unique_ids, removed = _dedup_returned_ids([], 10)
    assert (unique_ids, removed) == ([], 0)


def test_dedup_no_duplicates_passthrough():
    """A ranking without repeats comes back unchanged and reports zero removals."""
    ranking = [100, 200, 300]
    unique_ids, removed = _dedup_returned_ids(ranking, 10)
    assert unique_ids == [100, 200, 300]
    assert removed == 0


def test_dedup_with_duplicates_first_only():
    """Doc 3868 appears twice: only the first occurrence is kept, order preserved."""
    ranking = [3868, 3879, 3856, 3851, 3868, 4041, 3890]
    unique_ids, removed = _dedup_returned_ids(ranking, 10)
    assert unique_ids == [3868, 3879, 3856, 3851, 4041, 3890]
    assert removed == 1


def test_dedup_k_limit_applied_before_dedup():
    """Only returned[:k] is deduplicated; entries beyond k are ignored."""
    # With k=3 the window is [1, 2, 1] -> [1, 2]; one duplicate removed, 3 never seen.
    unique_ids, removed = _dedup_returned_ids([1, 2, 1, 3], 3)
    assert unique_ids == [1, 2]
    assert removed == 1


def test_dedup_count_helper():
    """The count_dedup wrapper reports the same duplicate count."""
    with_repeat = [10, 10, 20, 30]
    without_repeat = [10, 20, 30]
    assert count_dedup(with_repeat, 10) == 1
    assert count_dedup(without_repeat, 10) == 0


def test_dedup_phase2q_kw_001_case():
    """Measured kw_001 case from Phase 2Q Phase 3: 3868 is duplicated, so one entry is removed."""
    # Phase 3 returned: [3868, 3879, 3856, 3851, 3868, ...]
    phase3_ranking = [3868, 3879, 3856, 3851, 3868, 3858, 3878, 3859, 3850, 3863]
    unique_ids, removed = _dedup_returned_ids(phase3_ranking, 10)
    assert removed == 1
    # The first 3868 is kept; the second one is dropped, so 3858 moves up to index 4.
    assert unique_ids[:5] == [3868, 3879, 3856, 3851, 3858]


# ─── 2.
# graded_ndcg_at_k: zero regression (dedup must not affect normal cases) ─────


def test_graded_ndcg_baseline_no_duplicates_unchanged():
    """With only unique docs, dedup is a no-op and the score stays at the baseline value."""
    ranking = [3868, 3879, 3856, 4041, 3851, 3890, 3917, 3863, 3908, 3855]
    grade_map = {3856: 3, 3868: 2, 3879: 2}
    score = graded_ndcg_at_k(ranking, grade_map, 10)
    # 0.808 is the baseline measurement for kw_001.
    assert 0.80 <= score <= 0.82


def test_graded_ndcg_with_duplicates_no_longer_inflated():
    """A ranking with a repeated doc must keep NDCG <= 1.0 (it could exceed 1.0 before)."""
    # 3856 (grade 3) appears twice; without dedup its gain was counted twice.
    returned = [3856, 3856, 3868, 3879, 3851, 3890, 3917, 3863, 3908, 3855]
    grades = {3856: 3, 3868: 2, 3879: 2}
    out = graded_ndcg_at_k(returned, grades, 10)
    assert out <= 1.0, f"NDCG > 1.0 = invariant 위반: {out}"


def test_graded_ndcg_all_duplicates_invariant():
    """A top-N made of one doc repeated is scored as a single entry at rank 1."""
    ranking = [11504] * 10
    grade_map = {11504: 3, 11505: 2}
    score = graded_ndcg_at_k(ranking, grade_map, 10)
    # Only rank 1 hits (grade 3): DCG = (2^3 - 1) / log2(2) = 7
    # iDCG = (2^3 - 1) / log2(2) + (2^2 - 1) / log2(3) = 7 + 3 / log2(3) ≈ 8.893
    # NDCG ≈ 7 / 8.893 ≈ 0.787
    assert 0.78 <= score <= 0.80


def test_graded_ndcg_phase2q_exam_001_inflation_fix():
    """Measured exam_001 case from Phase 2Q Phase 3: 11504 is duplicated; dedup normalises the score."""
    # Phase 3 returned: [11504, 11504, 11533, 5106, 11533, ...] (11504 and 11533 repeated)
    returned = [11504, 11504, 11533, 5106, 11533, 11504, 11479, 11475, 11533, 5090]
    grades = {11504: 3, 11505: 2}
    out = graded_ndcg_at_k(returned, grades, 10)
    # After dedup only 11504 at rank 1 hits, so NDCG ≈ 0.787
    # (same value as test_graded_ndcg_all_duplicates_invariant).
    assert out <= 1.0, f"inflation 잔재: {out}"
    assert 0.78 <= out <= 0.80, f"예상 0.787 ± noise, 실제 {out}"


def test_graded_ndcg_empty_grades_returns_zero():
    """An empty grade mapping yields a score of 0.0."""
    score = graded_ndcg_at_k([1, 2, 3], {}, 10)
    assert score == 0.0


# ─── 3.
# ndcg_at_k (binary) also applies dedup ──────────────────


def test_ndcg_binary_with_duplicates_invariant():
    """Binary NDCG must satisfy the same <= 1.0 invariant when a doc repeats."""
    returned = [100, 100, 200, 300]
    relevant = [100, 200]
    out = ndcg_at_k(returned, relevant, 10)
    assert out <= 1.0, f"binary NDCG > 1.0 = invariant 위반: {out}"


# ─── 4. graded_recall_at_k is unaffected (set-conversion invariant) ────


def test_graded_recall_unaffected_by_duplicates():
    """Graded recall gives the same result with or without repeated docs."""
    grade_map = {100: 3, 200: 2}
    # Ranking that contains a duplicate.
    recall_with_dups = graded_recall_at_k([100, 100, 200, 300], grade_map, threshold=2, k=10)
    # The same ranking, already deduplicated.
    recall_without_dups = graded_recall_at_k([100, 200, 300], grade_map, threshold=2, k=10)
    # Both 100 and 200 are hits in either ranking.
    assert recall_with_dups == recall_without_dups == 1.0