# hyungi_document_server/app/services/study/concept_parser.py

"""concept_parser — 개념노트 markdown 구조 파서 + 관련개념 백링크 해소 (이론 리더용).

정찰 실측 불변식(273/273): 개념노트는 고정 골격을 100% 따름 —
    # {H1 제목}                     (첫 줄, DB title 과 다른 표시용 제목)
    > **한 줄 요약**: {요약}          (blockquote, 라벨 고정)
    ## {본문 라벨}  ...              (BODY, 자유 라벨 H2 0~N, 트레일 ★ 가능)
    ## 빈출 포인트                    (항상, 관련개념 직전)
    ## 관련 개념                      (항상, 문서 최종 섹션)

코드펜스(``` ASCII 도식) 내부의 ##/- 는 무시. 헤딩 트레일 ★ 는 스트립(라벨 정규화).
'빈출 포인트'/'관련 개념' 앵커만 이름으로 잡고 나머지 BODY 는 순서·위치로 처리(라벨 화이트리스트 금지).
순수 함수 · LLM 0.
"""

from __future__ import annotations

import re

# Line that opens or closes a ``` code fence (leading indentation allowed).
_FENCE = re.compile(r"^\s*```")
# Level-1 heading; group 1 is the heading text without trailing whitespace.
_H1 = re.compile(r"^#\s+(.+?)\s*$")
# Level-2 heading; group 1 is the raw label (may still carry trailing stars).
_H2 = re.compile(r"^##\s+(.+?)\s*$")  # never matches "###": whitespace is required right after "##"
# Summary blockquote with its fixed bold label; group 1 is the summary text.
_SUMMARY = re.compile(r"^>\s*\*\*한 줄 요약\*\*:\s*(.+)$")
# Trailing star run including the whitespace around it; removed to get the clean H2 label.
_STAR_SUFFIX = re.compile(r"\s*★+\s*$")
# Trailing star run only; its length is the star count of an H2 section.
_TRAIL_STARS = re.compile(r"★+\s*$")
# Bullet in the frequent-points section; group 1 = leading stars (tier), group 2 = item text.
_BINCHEOL_ITEM = re.compile(r"^\s*-\s+(★*)\s*(.+)$")
# Bullet in the related-concepts section; group 1 = item text.
_RELATED_ITEM = re.compile(r"^\s*-\s+(.+)$")
_PAREN = re.compile(r"\s*\(.*$")  # from the first "(" to end of line (cuts off the clarifier hint)
# Leading "<digits>_" prefix, removed before titles/phrases are compared.
_NUM_PREFIX = re.compile(r"^\d+_")
# Whitespace and separator symbols, removed before titles/phrases are compared.
_STRIP_SYM = re.compile(r"[\s_·,./()\-]")

# The only two H2 labels recognised by name; every other H2 is a free-label body section.
_ANCHOR_BINCHEOL = "빈출 포인트"
_ANCHOR_RELATED = "관련 개념"


def parse_concept(md: str) -> dict:
    """Parse a concept-note markdown document into its structural parts.

    The note is expected to follow a fixed skeleton: an H1 title, a
    ``> **한 줄 요약**: ...`` blockquote, any number of free-label H2 body
    sections, then the two anchor sections ``빈출 포인트`` and ``관련 개념``.
    Only the two anchors are recognised by name; every other H2 opens a body
    section, kept in document order.

    Lines inside a ``` code fence are never interpreted as headings or
    bullets. Inside a body section they are kept verbatim as part of that
    section's markdown; inside the two anchor sections (and before the first
    H2) they are dropped, so a fenced ``- ...`` line cannot turn into a
    frequent-point or related-concept item.

    Args:
        md: Markdown text of the note. ``None`` or ``""`` is treated as empty.

    Returns:
        A dict with these keys:

        * ``title``: text of the first H1 found outside a fence, else ``None``.
        * ``summary``: text of the first summary blockquote, else ``None``.
        * ``body``: list of ``{"label", "stars", "md"}``. ``stars`` is the
          number of trailing ★ on the heading (removed from ``label``);
          sections with neither a label nor any text are omitted.
        * ``bincheol``: list of ``{"tier", "text"}``, one per ``- `` bullet of
          the frequent-points section; ``tier`` is the number of leading ★.
        * ``related``: list of ``{"raw", "phrase", "hint"}``, one per bullet of
          the related-concepts section; ``phrase`` is the bullet text before
          the first ``(`` and ``hint`` is the remainder (parentheses included).
    """
    title: str | None = None
    summary: str | None = None
    sections: list[tuple[str, int, list[str]]] = []  # (label, stars, lines)
    bincheol_lines: list[str] = []
    related_lines: list[str] = []

    in_fence = False
    # At most one of these is set: the line buffer of the open body section,
    # or the bullet buffer of the open anchor section. Both are None before
    # the first H2, where content is discarded.
    body_lines: list[str] | None = None
    anchor_lines: list[str] | None = None

    for line in (md or "").split("\n"):
        is_fence_marker = _FENCE.match(line) is not None
        if is_fence_marker:
            in_fence = not in_fence
        if is_fence_marker or in_fence:
            # Fenced content is opaque: preserve it for body markdown only,
            # and keep it away from the bullet parsers of the anchor sections.
            if body_lines is not None:
                body_lines.append(line)
            continue

        if title is None:
            h1 = _H1.match(line)
            if h1:
                title = h1.group(1).strip()
                continue
        if summary is None:
            quoted = _SUMMARY.match(line)
            if quoted:
                summary = quoted.group(1).strip()
                continue

        h2 = _H2.match(line)
        if h2:
            raw_label = h2.group(1).strip()
            trailing = _TRAIL_STARS.search(raw_label)
            stars = len(trailing.group(0).strip()) if trailing else 0
            label = _STAR_SUFFIX.sub("", raw_label).strip()
            if label == _ANCHOR_BINCHEOL:
                body_lines, anchor_lines = None, bincheol_lines
            elif label == _ANCHOR_RELATED:
                body_lines, anchor_lines = None, related_lines
            else:
                body_lines, anchor_lines = [], None
                sections.append((label, stars, body_lines))
            continue

        if body_lines is not None:
            body_lines.append(line)
        elif anchor_lines is not None:
            anchor_lines.append(line)

    body = []
    for label, stars, buffered in sections:
        text = "\n".join(buffered).strip()
        # A heading that is nothing but stars and has no content carries no information.
        if text or label:
            body.append({"label": label, "stars": stars, "md": text})

    bincheol = [
        {"tier": len(item.group(1)), "text": item.group(2).strip()}
        for item in map(_BINCHEOL_ITEM.match, bincheol_lines)
        if item
    ]

    related = []
    for bullet in map(_RELATED_ITEM.match, related_lines):
        if not bullet:
            continue
        raw = bullet.group(1).strip()
        phrase = _PAREN.sub("", raw).strip()
        if not phrase:
            # The bullet starts with "(": there is nothing to resolve.
            continue
        # phrase is a prefix of raw, so the hint is whatever follows it.
        related.append({"raw": raw, "phrase": phrase, "hint": raw[len(phrase):].strip()})

    return {
        "title": title,
        "summary": summary,
        "body": body,
        "bincheol": bincheol,
        "related": related,
    }


def _normalize(s: str) -> str:
    """Build the comparison key used when resolving related-concept phrases.

    Steps, in order: drop a leading ``NN_`` numeric prefix, lowercase the
    text, then delete whitespace and separator symbols. ``None`` or an empty
    string yields ``""``.
    """
    without_prefix = _NUM_PREFIX.sub("", s if s else "")
    return _STRIP_SYM.sub("", without_prefix.lower())


def resolve_related(related: list[dict], title_index: list[tuple]) -> list[dict]:
    """Resolve related-concept phrases to concept documents.

    Resolution is a two-step fallback on normalized text (see ``_normalize``);
    phrases whose normalized form is shorter than 2 characters are never
    resolved:

    1. Exact match of the normalized phrase against a normalized title. If
       several titles normalize to the same key, the first one in
       ``title_index`` wins.
    2. One-directional substring match: a normalized title of at least 2
       characters that is contained in the normalized phrase. The reverse
       direction is deliberately not tried, so a short phrase cannot be
       swallowed by a longer title (e.g. '염산' must not resolve to
       '염산나트륨'). Among several candidates the smallest length difference
       (most specific title) wins, ties broken by the smaller ``doc_id`` so
       the result does not depend on the order of ``title_index``.

    Args:
        related: Items shaped like the ``"related"`` entries returned by
            ``parse_concept``. ``"phrase"`` is required; a missing ``"hint"``
            is treated as ``""``.
        title_index: ``(doc_id, title, subject)`` tuples. ``subject`` is not
            used here; titles that normalize to ``""`` are skipped.

    Returns:
        One dict per input item, in input order, with keys ``phrase``,
        ``hint``, ``doc_id`` and ``title``. Unresolved (dangling) items have
        ``doc_id`` and ``title`` set to ``None``.
    """
    # normalized title -> (doc_id, title). Storing the title here means an
    # exact hit reports the title that actually matched, with no rescan of
    # title_index.
    exact: dict[str, tuple] = {}
    # (normalized title, doc_id, title) rows for the substring fallback.
    indexed: list[tuple] = []
    for doc_id, doc_title, _subject in title_index:
        key = _normalize(doc_title)
        if not key:
            continue
        exact.setdefault(key, (doc_id, doc_title))
        indexed.append((key, doc_id, doc_title))

    resolved = []
    for item in related:
        phrase = item["phrase"]
        probe = _normalize(phrase)
        match_id = None
        match_title = None
        if len(probe) >= 2:
            if probe in exact:
                match_id, match_title = exact[probe]
            else:
                hits = [
                    (len(probe) - len(key), doc_id, doc_title)
                    for key, doc_id, doc_title in indexed
                    if len(key) >= 2 and key in probe
                ]
                if hits:
                    # Smallest length gap first, then smallest doc_id.
                    _, match_id, match_title = min(hits, key=lambda h: (h[0], h[1]))
        resolved.append(
            {
                "phrase": phrase,
                "hint": item.get("hint", ""),
                "doc_id": match_id,
                "title": match_title,
            }
        )
    return resolved