diff --git a/app/services/hier_decomp/builder.py b/app/services/hier_decomp/builder.py
index 3718e54..416ef4d 100644
--- a/app/services/hier_decomp/builder.py
+++ b/app/services/hier_decomp/builder.py
@@ -26,7 +26,16 @@ _ATX = re.compile(r'^(#{1,6})\s+(?P\S.*?)\s*#*\s*$')
 _KO_JANG = re.compile(r'^\s*(?P<title>제\s*\d+\s*장\b.*)$')
 _KO_JEOL = re.compile(r'^\s*(?P<title>제\s*\d+\s*절\b.*)$')
 _KO_JO = re.compile(r'^\s*(?P<title>제\s*\d+\s*조\b.*)$')
-_ENG = re.compile(r'^\s*(?P<title>(?:Chapter|Section|Article|Part|PART)\s+[\dIVXLA-Z]+\b.*)$')
+# _ENG: English structural headings (for docs without ATX). ASME parts usually match ATX (`# PART PG`), so _ENG matters little.
+# D1: a lowercase sentence continuation after the identifier ("Part III to demonstrate to the satisfaction…") is body text, so it is
+# not detected — blocks false sections. An optional title must start with uppercase/"("/digit to count (lowercase start = sentence).
+# The identifier is a number/PG/3.31/UHX/A-1 etc. ("." and "-" extensions for decimals/hyphens are allowed).
+_ENG = re.compile(
+    r'^\s*(?P<title>(?:Chapter|Section|Article|Part|PART)\s+'
+    r'[\dIVXLA-Z]+(?:[.\-][\dA-Za-z]+)*'
+    r'(?:\s+[A-Z(\d][^\n]*)?'
+    r')\s*$'
+)
 
 # 코드펜스 경계 (FE outlineAnchors.ts:60 `/^\s{0,3}(```|~~~)/` 와 동일). 펜스 내부 라인은
 # heading 미탐지 — 코드블록 안 '# foo' 가 가짜 절을 만들지 않게(O3).
diff --git a/tests/hier_decomp/test_eng_matcher.py b/tests/hier_decomp/test_eng_matcher.py
new file mode 100644
index 0000000..7705bb0
--- /dev/null
+++ b/tests/hier_decomp/test_eng_matcher.py
@@ -0,0 +1,106 @@
+"""Unit tests for noise suppression in the _ENG matcher (asme-item-decomp-1 D1).
+
+Core invariants: the English structural-heading matcher (_ENG)
+  - (negative) does not treat a lowercase sentence continuation in body text, such as 'Part III to demonstrate…', as a false section,
+  - (positive) still detects real English structural headings (PART PG / Part 1 / Section 3.31 / Part UHX …), and
+  - (ATX preserved) narrowing _ENG does not drop ATX parts (`# PART PG`) or items (`#### PG-1`) (ATX takes priority).
+
+Supports both pytest and standalone execution:
+    PYTHONPATH=. python3 tests/hier_decomp/test_eng_matcher.py
+"""
+from __future__ import annotations
+
+try:  # pytest path (app package)
+    from app.services.hier_decomp.builder import _detect_heading, build_hier_tree
+except Exception:  # standalone run (load builder.py directly without app deps — stdlib only)
+    import importlib.util
+    import pathlib
+    import sys
+
+    _bp = pathlib.Path(__file__).resolve().parents[2] / "app/services/hier_decomp/builder.py"
+    _spec = importlib.util.spec_from_file_location("_hier_builder_t", _bp)
+    _m = importlib.util.module_from_spec(_spec)
+    sys.modules[_spec.name] = _m  # resolves dataclass __module__
+    _spec.loader.exec_module(_m)
+    _detect_heading, build_hier_tree = _m._detect_heading, _m.build_hier_tree
+
+
+# ── Negative: body sentences are not headings (blocks false sections — the core of the D1 regression) ──
+NEG = [
+    "Part III to demonstrate to the satisfaction of the represen-",
+    "Section V of the agreement applies to all parties",
+    "Part IV is hereby amended as follows",
+    "Article II shall be interpreted broadly",
+    "Chapter 3 describes the general method used here",
+]
+
+# ── Positive: real English structural headings ──
+POS = [
+    "PART PG GENERAL REQUIREMENTS FOR ALL METHODS OF CONSTRUCTION",
+    "Part 1",
+    "Part PFH",
+    "Part UHX (TUBESHEET CALCULATION)",
+    "Section 3.31",
+    "Chapter 1 Introduction",
+    "Article 5 Definitions",
+]
+
+
+def test_eng_negatives_not_detected():
+    for line in NEG:
+        assert _detect_heading(line) is None, f"가짜 절로 잡힘: {line!r}"
+
+
+def test_eng_positives_detected_as_chapter():
+    for line in POS:
+        r = _detect_heading(line)
+        assert r is not None, f"진짜 헤딩 미탐지: {line!r}"
+        _lvl, _title, nt = r
+        assert nt == "chapter", f"{line!r} node_type={nt}"
+
+
+def test_atx_part_and_item_still_detected():
+    # Narrowing _ENG does not drop real ATX parts/items (ATX is detected first)
+    r = _detect_heading("# PART PG GENERAL REQUIREMENTS FOR ALL METHODS OF CONSTRUCTION")
+    assert r is not None
+    lvl, title, nt = r
+    assert lvl == 1 and nt is None, r  # ATX = level (number of #), node_type None
+    assert title.startswith("PART PG")
+    r2 = _detect_heading("#### PG-1 SCOPE")
+    assert r2 is not None and r2[0] == 4 and r2[2] is None, r2
+
+
+def test_build_hier_tree_drops_false_part_section():
+    # No false section appears even when 'Part III to demonstrate…' is mixed into the body
+    md = (
+        "# PART PG GENERAL REQUIREMENTS\n"
+        "#### PG-1 SCOPE\n"
+        "The rules cover power boilers.\n"
+        "Part III to demonstrate to the satisfaction of the representative\n"
+        "that the requirements are met, the manufacturer shall proceed...\n"
+        "#### PG-2 SERVICE LIMITATIONS\n"
+        "body of pg-2 here.\n"
+    )
+    titles = [n.section_title for n in build_hier_tree(md) if n.section_title]
+    assert any(t.startswith("PART PG") for t in titles), titles
+    assert any(t.startswith("PG-1") for t in titles), titles
+    assert any(t.startswith("PG-2") for t in titles), titles
+    assert not any("demonstrate" in (t or "") for t in titles), f"가짜 절 누출: {titles}"
+
+
+if __name__ == "__main__":
+    import sys
+    import traceback
+
+    fns = [(k, v) for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)]
+    failed = 0
+    for name, fn in fns:
+        try:
+            fn()
+            print(f"PASS {name}")
+        except Exception as e:
+            failed += 1
+            print(f"FAIL {name}: {e}")
+            traceback.print_exc()
+    print(f"\n{len(fns) - failed}/{len(fns)} passed")
+    sys.exit(1 if failed else 0)