From 06da098eabcb4ef902e0cfa6aed04d73040ef2c9 Mon Sep 17 00:00:00 2001 From: Hyungi Ahn Date: Fri, 3 Apr 2026 15:05:48 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20=EB=B2=95=EB=A0=B9=20=EB=B6=84=ED=95=A0?= =?UTF-8?q?=20=E2=80=94=20=EC=A1=B0=EB=AC=B8=ED=82=A4=20000=20=EA=B8=B0?= =?UTF-8?q?=EB=B0=98=20=EC=9E=A5(=E7=AB=A0)=20=EB=8B=A8=EC=9C=84=20?= =?UTF-8?q?=EB=B6=84=ED=95=A0=EB=A1=9C=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 국가법령 XML은 <편>/<장> 태그가 아닌 <조문단위 조문키="xxxx000">에 "제X장 ..." 형태로 장 구분자가 포함됨. 이를 파싱하여 분할. Co-Authored-By: Claude Opus 4.6 (1M context) --- app/workers/law_monitor.py | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/app/workers/law_monitor.py b/app/workers/law_monitor.py index b9ec8f1..545697e 100644 --- a/app/workers/law_monitor.py +++ b/app/workers/law_monitor.py @@ -191,29 +191,50 @@ async def _save_law_split( session, xml_text: str, law_name: str, proclamation_date: str, revision_type: str, prev_date: str, ) -> int: - """법령 XML → 편/장 단위 Markdown 분할 저장""" + """법령 XML → 장(章) 단위 Markdown 분할 저장""" root = ET.fromstring(xml_text) + + # 조문단위에서 장 구분자 찾기 (조문키가 000으로 끝나는 조문) + units = root.findall(".//조문단위") + chapters = [] # [(장제목, [조문들])] + current_chapter = None + current_articles = [] + + for unit in units: + key = unit.attrib.get("조문키", "") + content = (unit.findtext("조문내용", "") or "").strip() + + # 장 구분자: 키가 000으로 끝나고 내용에 "제X장" 포함 + if key.endswith("000") and re.search(r"제\d+장", content): + # 이전 장 저장 + if current_chapter and current_articles: + chapters.append((current_chapter, current_articles)) + chapter_match = re.search(r"(제\d+장\s*.+)", content) + current_chapter = chapter_match.group(1).strip() if chapter_match else content.strip() + current_articles = [] + else: + current_articles.append(unit) + + # 마지막 장 저장 + if current_chapter and current_articles: + chapters.append((current_chapter, current_articles)) + + # 장 분할 성공 sections = [] - - # 편(編) 단위 분할 시도 - for part in root.findall(".//*편"): - title = part.attrib.get("제목", part.findtext("편제목", "")) - number = part.attrib.get("번호", "") - content = _xml_section_to_markdown(part) - if content.strip(): - sections.append((f"제{number}편_{_safe_name(title)}", content)) - - # 편이 없으면 장(章) 단위 시도 - if not sections: - for chapter in root.findall(".//*장"): - title = chapter.attrib.get("제목", chapter.findtext("장제목", "")) - number = chapter.attrib.get("번호", "") - content = _xml_section_to_markdown(chapter) - if content.strip(): - sections.append((f"제{number}장_{_safe_name(title)}", content)) - - # 편/장 둘 다 없으면 전체 1파일 - if not sections: + if chapters: + for chapter_title, articles in chapters: + md_lines = [f"# {law_name}\n", f"## {chapter_title}\n"] + for article in articles: + title = article.findtext("조문제목", "") + content = article.findtext("조문내용", "") + if title: + md_lines.append(f"\n### {title}\n") + if content: + md_lines.append(content.strip()) + section_name = _safe_name(chapter_title) + sections.append((section_name, "\n".join(md_lines))) + else: + # 장 분할 실패 → 전체 1파일 full_md = _law_xml_to_markdown(xml_text, law_name) sections.append(("전문", full_md))