#!/usr/bin/env python3 """ASME clause-KB persist (v2: over-CAP pagination). Split a parent standard into per-clause documents (A-granularity); over-CAP clause bodies are paginated into readable page-docs. Idempotent per parent. doc_kind='clause', embedding NULL (search-excluded), parent_id=. Usage: python3 asme_clause_persist.py [--commit] """ import asyncio, os, re, sys, hashlib, statistics CAP = 12000; PAGE_TOK = 11000 EN, KO = 0.217, 0.529 LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$') MENTION_RE = re.compile(r'(?ð\s*\**\d*\**\s*Þ', '', rest) t = re.sub(r'ð\**\d*\**Þ', '', t) t = t.replace('**', '').replace('#', '') return re.sub(r'\s+', ' ', t).strip(' *:—-') def is_header(markup, rest): if '#' in markup or '*' in markup: return True rs = rest.strip() if rs == '': return True if REF_LEAD.match(rest): return False if rs[0] in ',;.)': return False if '가' <= rs[0] <= '힣': return False if rs[0].islower(): return False return bool(TITLE_AFTER.match(rs)) def paginate(body): """split an over-CAP body into <=MAX_PAGES line-aligned pages of ~PAGE_TOK tokens.""" pages, cur, ct = [], [], 0 for ln in body.split('\n'): lt = tok(ln) + 1 if ct + lt > PAGE_TOK and cur: pages.append('\n'.join(cur)); cur, ct = [ln], lt else: cur.append(ln); ct += lt if cur: pages.append('\n'.join(cur)) return pages def build_clauses(text): lines = text.split('\n'); off = []; a = 0 for ln in lines: off.append(a); a += len(ln) + 1 bounds = []; seen = set() for i, ln in enumerate(lines): m = LINE_RE.match(ln) if not m: continue markup, code, rest = m.group(1), m.group(2), m.group(3) if not EXACT_TOP.match(code): continue if not is_header(markup, rest): continue if code in seen: continue seen.add(code); bounds.append((off[i], code, clean_title(rest))) raw = [] for idx, (start, code, title) in enumerate(bounds): end = bounds[idx+1][0] if idx+1 < len(bounds) else len(text) body = text[start:end] part = re.match(r'^[A-Z]{2,4}', code).group(0) links = sorted(set(re.match(r'^[A-Z]{1,4}-\d+', mm).group(0) for mm in MENTION_RE.findall(body)) - {code}) raw.append(dict(code=code, part=part, title=(code + (' ' + title if title else '')), body=body, tok=tok(body), links=links)) # expand over-CAP into pages; assign running clause_order final, order = [], 0 for c in raw: if c['tok'] <= CAP: final.append({**c, 'order': order}); order += 1; continue pages = paginate(c['body']) for pi, pb in enumerate(pages): code = c['code'] if pi == 0 else f"{c['code']}·p{pi+1}" title = c['title'] if pi == 0 else f"{c['title']} (페이지 {pi+1}/{len(pages)})" final.append(dict(code=code, part=c['part'], order=order, title=title, body=pb, tok=tok(pb), links=c['links'] if pi == 0 else [])) order += 1 return final async def main(): parent = int(sys.argv[1]); commit = '--commit' in sys.argv import asyncpg conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', '')) row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent) if not row: print(f"parent {parent} not found"); return clauses = build_clauses(row['md_content']) toks = [c['tok'] for c in clauses] over = [c for c in clauses if c['tok'] > CAP] print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} " f"max_tok={max(toks)} over_cap_remaining={len(over)}") if over: print("still over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over]) if not commit: print("DRY-RUN. pass --commit to persist."); await conn.close(); return async with conn.transaction(): deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent) print("deleted prior:", deld) for c in clauses: fh = hashlib.sha256(f"{parent}:{c['code']}:{c['body']}".encode()).hexdigest() cid = await conn.fetchval(""" INSERT INTO documents (file_format, file_hash, title, md_content, parent_id, doc_kind, clause_code, clause_part, clause_order, ai_domain, data_origin, md_status, review_status, conversion_status, preview_status) VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'], row['ai_domain'], row['data_origin'] or 'external') await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') " "ON CONFLICT DO NOTHING", cid, c['part']) n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent) print(f"COMMITTED: {n} clause docs for parent {parent}") await conn.close() asyncio.run(main())