From 51a7c96b5604353c38f267e6dab4f0c3c99ea6a0 Mon Sep 17 00:00:00 2001 From: hyungi Date: Mon, 29 Jun 2026 23:20:16 +0000 Subject: [PATCH] =?UTF-8?q?feat(clause-kb):=20over-CAP=20=EC=A0=88=20?= =?UTF-8?q?=EB=B3=B8=EB=AC=B8=20=ED=8E=98=EC=9D=B4=EC=A7=80=EB=84=A4?= =?UTF-8?q?=EC=9D=B4=EC=85=98(~11K=20tok/page)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/asme_clause_persist.py | 70 ++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/scripts/asme_clause_persist.py b/scripts/asme_clause_persist.py index 8f7b6a0..5a54751 100644 --- a/scripts/asme_clause_persist.py +++ b/scripts/asme_clause_persist.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 -"""ASME clause-KB persist: split a parent standard into per-clause documents (A-granularity). -Idempotent per parent. Clause docs: doc_kind='clause', embedding NULL (search-excluded via -doc_kind filter), parent_id=. Also writes Part tags. Run inside fastapi container. +"""ASME clause-KB persist (v2: over-CAP pagination). Split a parent standard into per-clause +documents (A-granularity); over-CAP clause bodies are paginated into readable page-docs. +Idempotent per parent. doc_kind='clause', embedding NULL (search-excluded), parent_id=. Usage: python3 asme_clause_persist.py [--commit] """ import asyncio, os, re, sys, hashlib, statistics -CAP = 12000 +CAP = 12000; PAGE_TOK = 11000 EN, KO = 0.217, 0.529 LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$') MENTION_RE = re.compile(r'(?ð\s*\**\d*\**\s*Þ', '', t) # revision bar (sup form) - t = re.sub(r'ð\**\d*\**Þ', '', t) # revision bar (plain) + t = re.sub(r'ð\s*\**\d*\**\s*Þ', '', rest) + t = re.sub(r'ð\**\d*\**Þ', '', t) t = t.replace('**', '').replace('#', '') - t = re.sub(r'\s+', ' ', t).strip(' *:—-') - return t + return re.sub(r'\s+', ' ', t).strip(' *:—-') def is_header(markup, rest): if '#' in markup or '*' in markup: return True @@ -36,12 +34,22 @@ def is_header(markup, rest): if rs[0].islower(): return False return bool(TITLE_AFTER.match(rs)) +def paginate(body): + """split an over-CAP body into <=MAX_PAGES line-aligned pages of ~PAGE_TOK tokens.""" + pages, cur, ct = [], [], 0 + for ln in body.split('\n'): + lt = tok(ln) + 1 + if ct + lt > PAGE_TOK and cur: + pages.append('\n'.join(cur)); cur, ct = [ln], lt + else: + cur.append(ln); ct += lt + if cur: pages.append('\n'.join(cur)) + return pages + def build_clauses(text): lines = text.split('\n'); off = []; a = 0 for ln in lines: off.append(a); a += len(ln) + 1 - # exact-top-level HEADER boundaries, first-seen only (fixes dup + sub-fragment noise) - bounds = [] # (pos, code, title) - seen = set() + bounds = []; seen = set() for i, ln in enumerate(lines): m = LINE_RE.match(ln) if not m: continue @@ -50,37 +58,44 @@ def build_clauses(text): if not is_header(markup, rest): continue if code in seen: continue seen.add(code); bounds.append((off[i], code, clean_title(rest))) - clauses = [] + raw = [] for idx, (start, code, title) in enumerate(bounds): end = bounds[idx+1][0] if idx+1 < len(bounds) else len(text) body = text[start:end] part = re.match(r'^[A-Z]{2,4}', code).group(0) links = sorted(set(re.match(r'^[A-Z]{1,4}-\d+', mm).group(0) for mm in MENTION_RE.findall(body)) - {code}) - clauses.append(dict(code=code, part=part, order=idx, title=(code + (' ' + title if title else '')), - body=body, tok=tok(body), links=links)) - return clauses + raw.append(dict(code=code, part=part, title=(code + (' ' + title if title else '')), + body=body, tok=tok(body), links=links)) + # expand over-CAP into pages; assign running clause_order + final, order = [], 0 + for c in raw: + if c['tok'] <= CAP: + final.append({**c, 'order': order}); order += 1; continue + pages = paginate(c['body']) + for pi, pb in enumerate(pages): + code = c['code'] if pi == 0 else f"{c['code']}·p{pi+1}" + title = c['title'] if pi == 0 else f"{c['title']} (페이지 {pi+1}/{len(pages)})" + final.append(dict(code=code, part=c['part'], order=order, title=title, + body=pb, tok=tok(pb), links=c['links'] if pi == 0 else [])) + order += 1 + return final async def main(): - parent = int(sys.argv[1]) - commit = '--commit' in sys.argv + parent = int(sys.argv[1]); commit = '--commit' in sys.argv import asyncpg - dsn = os.environ['DATABASE_URL'].replace('+asyncpg', '') - conn = await asyncpg.connect(dsn) + conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', '')) row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent) if not row: print(f"parent {parent} not found"); return clauses = build_clauses(row['md_content']) toks = [c['tok'] for c in clauses] over = [c for c in clauses if c['tok'] > CAP] print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} " - f"max_tok={max(toks)} over_cap={len(over)} total_backlinks={sum(len(c['links']) for c in clauses)}") - print("sample:", [f"{c['code']}:{c['tok']}t" for c in clauses[:8]]) - if over: print("over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over]) + f"max_tok={max(toks)} over_cap_remaining={len(over)}") + if over: print("still over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over]) if not commit: - print("DRY-RUN (no write). pass --commit to persist."); await conn.close(); return - + print("DRY-RUN. pass --commit to persist."); await conn.close(); return async with conn.transaction(): - # idempotent: remove prior clause docs of this parent (cascades clause_links/document_tags) deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent) print("deleted prior:", deld) for c in clauses: @@ -94,7 +109,6 @@ async def main(): RETURNING id """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'], row['ai_domain'], row['data_origin'] or 'external') - # Part tag await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') " "ON CONFLICT DO NOTHING", cid, c['part']) n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)