From 51a7c96b5604353c38f267e6dab4f0c3c99ea6a0 Mon Sep 17 00:00:00 2001
From: hyungi <hyun49196@gmail.com>
Date: Mon, 29 Jun 2026 23:20:16 +0000
Subject: [PATCH] =?UTF-8?q?feat(clause-kb):=20over-CAP=20=EC=A0=88=20?=
 =?UTF-8?q?=EB=B3=B8=EB=AC=B8=20=ED=8E=98=EC=9D=B4=EC=A7=80=EB=84=A4?=
 =?UTF-8?q?=EC=9D=B4=EC=85=98(~11K=20tok/page)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/asme_clause_persist.py | 70 ++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 28 deletions(-)
diff --git a/scripts/asme_clause_persist.py b/scripts/asme_clause_persist.py
index 8f7b6a0..5a54751 100644
--- a/scripts/asme_clause_persist.py
+++ b/scripts/asme_clause_persist.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python3
-"""ASME clause-KB persist: split a parent standard into per-clause documents (A-granularity).
-Idempotent per parent. Clause docs: doc_kind='clause', embedding NULL (search-excluded via
-doc_kind filter), parent_id=<parent>. Also writes Part tags. Run inside fastapi container.
+"""ASME clause-KB persist (v2: over-CAP pagination). Split a parent standard into per-clause
+documents (A-granularity); over-CAP clause bodies are paginated into readable page-docs.
+Idempotent per parent. doc_kind='clause', embedding NULL (search-excluded), parent_id=<parent>.
 Usage: python3 asme_clause_persist.py <parent_id> [--commit]
 """
 import asyncio, os, re, sys, hashlib, statistics
 
-CAP = 12000
+CAP = 12000; PAGE_TOK = 11000
 EN, KO = 0.217, 0.529
 LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$')
 MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
-EXACT_TOP = re.compile(r'^[A-Z]{2,4}-\d+$')   # top-level clause code (no dotted suffix)
+EXACT_TOP = re.compile(r'^[A-Z]{2,4}-\d+$')
 TITLE_AFTER = re.compile(r'^[\s.]*[A-Z(]')
 REF_LEAD = re.compile(r'^[\s.]*(and|or|to|of|in|on|the|as|is|are|shall|through|per|see|with|'
                       r'for|by|that|which|such|또는|및|등|의|은|는|에|을|를|과|와)\b', re.I)
@@ -19,12 +19,10 @@ def tok(s):
     ko = sum(1 for c in s if '가' <= c <= '힣'); return int((len(s)-ko)*EN + ko*KO)
 
 def clean_title(rest):
-    t = rest
-    t = re.sub(r'<sup>ð</sup>\s*\**\d*\**\s*<sup>Þ</sup>', '', t)   # revision bar (sup form)
-    t = re.sub(r'ð\**\d*\**Þ', '', t)                              # revision bar (plain)
+    t = re.sub(r'<sup>ð</sup>\s*\**\d*\**\s*<sup>Þ</sup>', '', rest)
+    t = re.sub(r'ð\**\d*\**Þ', '', t)
     t = t.replace('**', '').replace('#', '')
-    t = re.sub(r'\s+', ' ', t).strip(' *:—-')
-    return t
+    return re.sub(r'\s+', ' ', t).strip(' *:—-')
 
 def is_header(markup, rest):
     if '#' in markup or '*' in markup: return True
@@ -36,12 +34,22 @@ def is_header(markup, rest):
     if rs[0].islower(): return False
     return bool(TITLE_AFTER.match(rs))
 
+def paginate(body):
+    """Split an over-CAP body into line-aligned pages of ~PAGE_TOK tokens (page count is not capped)."""
+    pages, cur, ct = [], [], 0
+    for ln in body.split('\n'):
+        lt = tok(ln) + 1
+        if ct + lt > PAGE_TOK and cur:
+            pages.append('\n'.join(cur)); cur, ct = [ln], lt
+        else:
+            cur.append(ln); ct += lt
+    if cur: pages.append('\n'.join(cur))
+    return pages
+
 def build_clauses(text):
     lines = text.split('\n'); off = []; a = 0
     for ln in lines: off.append(a); a += len(ln) + 1
-    # exact-top-level HEADER boundaries, first-seen only (fixes dup + sub-fragment noise)
-    bounds = []   # (pos, code, title)
-    seen = set()
+    bounds = []; seen = set()
     for i, ln in enumerate(lines):
         m = LINE_RE.match(ln)
         if not m: continue
@@ -50,37 +58,44 @@ def build_clauses(text):
         if not is_header(markup, rest): continue
         if code in seen: continue
         seen.add(code); bounds.append((off[i], code, clean_title(rest)))
-    clauses = []
+    raw = []
     for idx, (start, code, title) in enumerate(bounds):
         end = bounds[idx+1][0] if idx+1 < len(bounds) else len(text)
         body = text[start:end]
         part = re.match(r'^[A-Z]{2,4}', code).group(0)
         links = sorted(set(re.match(r'^[A-Z]{1,4}-\d+', mm).group(0)
                            for mm in MENTION_RE.findall(body)) - {code})
-        clauses.append(dict(code=code, part=part, order=idx, title=(code + (' ' + title if title else '')),
-                            body=body, tok=tok(body), links=links))
-    return clauses
+        raw.append(dict(code=code, part=part, title=(code + (' ' + title if title else '')),
+                        body=body, tok=tok(body), links=links))
+    # expand over-CAP into pages; assign running clause_order
+    final, order = [], 0
+    for c in raw:
+        if c['tok'] <= CAP:
+            final.append({**c, 'order': order}); order += 1; continue
+        pages = paginate(c['body'])
+        for pi, pb in enumerate(pages):
+            code = c['code'] if pi == 0 else f"{c['code']}·p{pi+1}"
+            title = c['title'] if pi == 0 else f"{c['title']} (페이지 {pi+1}/{len(pages)})"
+            final.append(dict(code=code, part=c['part'], order=order, title=title,
+                              body=pb, tok=tok(pb), links=c['links'] if pi == 0 else []))
+            order += 1
+    return final
 
 async def main():
-    parent = int(sys.argv[1])
-    commit = '--commit' in sys.argv
+    parent = int(sys.argv[1]); commit = '--commit' in sys.argv
     import asyncpg
-    dsn = os.environ['DATABASE_URL'].replace('+asyncpg', '')
-    conn = await asyncpg.connect(dsn)
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
     row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent)
     if not row: print(f"parent {parent} not found"); return
     clauses = build_clauses(row['md_content'])
     toks = [c['tok'] for c in clauses]
     over = [c for c in clauses if c['tok'] > CAP]
     print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} "
-          f"max_tok={max(toks)} over_cap={len(over)} total_backlinks={sum(len(c['links']) for c in clauses)}")
-    print("sample:", [f"{c['code']}:{c['tok']}t" for c in clauses[:8]])
-    if over: print("over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])
+          f"max_tok={max(toks)} over_cap_remaining={len(over)}")
+    if over: print("still over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])
     if not commit:
-        print("DRY-RUN (no write). pass --commit to persist."); await conn.close(); return
-
+        print("DRY-RUN. pass --commit to persist."); await conn.close(); return
     async with conn.transaction():
-        # idempotent: remove prior clause docs of this parent (cascades clause_links/document_tags)
         deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
         print("deleted prior:", deld)
         for c in clauses:
@@ -94,7 +109,6 @@ async def main():
                 RETURNING id
             """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'],
                  row['ai_domain'], row['data_origin'] or 'external')
-            # Part tag
             await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') "
                                "ON CONFLICT DO NOTHING", cid, c['part'])
         n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)