feat(study): 가스기사 import 스크립트 — 보기 형식 다양화 + subject 슬래시 정규화

운영 중 발견한 패턴 추가:
- 보기 형식: "1번:" + "1." + "1)" 모두 매칭 (2022년 회차에서 "1." 사용 발견).
- subject 정규화: 괄호 형태(연소공학 (열역학))뿐 아니라 슬래시 형태 (가스안전관리 / 가스설비) 도 head + scope 분리.

운영 결과 (6회분 = 600문항 추가):
- 2020년 3회 / 2021년 1·2·3회 / 2022년 1·2회 모두 등록 완료.
- 이미지 27건 자동 첨부 (1+4+7+6+5+4).
- audit: HC 0건, LC-5 2건 (2022년 2회 q41/q90 표 구분자 누락) 자동 fix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:56:17 +09:00
parent cb07ffa4ce
commit 1d73986fd6
7 changed files with 3713 additions and 8 deletions
@@ -68,17 +68,31 @@ CANONICAL_SUBJECTS = {
 def normalize_subject(
    raw_subject: str, raw_scope: Optional[str]
 ) -> tuple[str, Optional[str]]:
-    """('연소공학 (열역학)', 'X') → ('연소공학', '열역학 · X')."""
+    """subject 정규화. 두 패턴 처리:
+       (a) '연소공학 (열역학)' → ('연소공학', '열역학 · scope')
+       (b) '가스안전관리 / 가스설비' → ('가스안전관리', '가스설비 · scope')
+    head 가 5과목 중 하나일 때만 분리. 외엔 그대로.
+    """
    s = (raw_subject or "").strip()
    if not s or s in CANONICAL_SUBJECTS:
        return s, raw_scope
+
+    head: Optional[str] = None
+    detail: Optional[str] = None
+
+    # (a) 괄호 형태
    m = re.match(r"^(.+?)\s*\(\s*([^()]+?)\s*\)\s*$", s)
-    if not m:
-        return s, raw_scope
-    head = m.group(1).strip()
-    detail = m.group(2).strip()
+    if m:
+        head = m.group(1).strip()
+        detail = m.group(2).strip()
+    else:
+        # (b) 슬래시 형태 — 첫 항목 = subject, 나머지 = scope.
+        m = re.match(r"^(.+?)\s*/\s*(.+)$", s)
+        if m:
+            head = m.group(1).strip()
+            detail = m.group(2).strip()
+
    if head not in CANONICAL_SUBJECTS:
-        # 정규 5과목 외면 정규화 안 함 (사용자 검토 대상)
        return s, raw_scope
    if raw_scope:
        new_scope = f"{detail} · {raw_scope}"
@@ -194,13 +208,14 @@ def _extract_meta(md: str) -> dict[str, str]:
 def _parse_choices(raw: str) -> tuple[str, str, str, str]:
    """보기 fenced block → 4개 텍스트.

-    한 줄당 "{N}번: ..." 형태. 멀티라인 보기는 다음 N번 보기 직전까지 합침.
+    한 줄당 "{N}번:", "{N}.", "{N})" 형태. 멀티라인 보기는 다음 항목 직전까지 합침.
    """
    lines = raw.splitlines()
    buckets: dict[int, list[str]] = {1: [], 2: [], 3: [], 4: []}
    current: Optional[int] = None
    for line in lines:
-        m = re.match(r"^\s*(\d)번\s*:\s*(.*)$", line)
+        # "1번:" / "1." / "1)" 모두 인정. 점/괄호 다음 공백 허용.
+        m = re.match(r"^\s*(\d)\s*[번.):]\s*(.*)$", line)
        if m:
            current = int(m.group(1))
            if 1 <= current <= 4: