119 lines
5.7 KiB
Python
119 lines
5.7 KiB
Python
#!/usr/bin/env python3
"""ASME clause-KB persist (v2: over-CAP pagination). Split a parent standard into per-clause
documents (A-granularity); over-CAP clause bodies are paginated into readable page-docs.
Idempotent per parent. doc_kind='clause', embedding NULL (search-excluded), parent_id=<parent>.
Usage: python3 asme_clause_persist.py <parent_id> [--commit]
"""
|
|
import asyncio, os, re, sys, hashlib, statistics
|
|
|
|
# tok() score above which build_clauses() splits a clause body into pages.
CAP = 12000
# Per-page tok() budget that paginate() accumulates lines against.
PAGE_TOK = 11000

# Per-character weights applied by tok(): KO for Hangul syllables (U+AC00..U+D7A3),
# EN for every other character.
EN = 0.217
KO = 0.529

# A line that opens with up to 8 markup characters (space, tab, '#', '>', '*'), then a
# clause code (2-4 capitals, '-', digits, optional dotted sub-numbers, optional one letter),
# then the remainder of the line. Groups: (markup, code, rest).
LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$')

# A clause-code mention anywhere in running text (1-4 capitals), not glued to other
# alphanumerics on either side.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')

# A code with no dotted sub-number and no letter suffix, e.g. "UG-16" but not "UG-16.1".
EXACT_TOP = re.compile(r'^[A-Z]{2,4}-\d+$')

# Remainder that begins (after optional whitespace / dots) with an ASCII capital or '('.
TITLE_AFTER = re.compile(r'^[\s.]*[A-Z(]')

# Remainder that begins with a connective or particle (English, case-insensitive, or Korean);
# is_header() treats such a line as an in-sentence reference rather than a header.
REF_LEAD = re.compile(
    r'^[\s.]*(and|or|to|of|in|on|the|as|is|are|shall|through|per|see|with|'
    r'for|by|that|which|such|또는|및|등|의|은|는|에|을|를|과|와)\b',
    re.I,
)
|
|
|
|
def tok(s):
    """Return the weighted character count of ``s`` as an int.

    Hangul syllables (U+AC00..U+D7A3) are weighted by ``KO``; every other character is
    weighted by ``EN``. The weighted sum is truncated toward zero.
    """
    hangul = 0
    for ch in s:
        if '가' <= ch <= '힣':
            hangul += 1
    other = len(s) - hangul
    return int(other * EN + hangul * KO)
|
|
|
|
def clean_title(rest):
    """Normalize the text that follows a clause code into a plain title string.

    Removes ``ð…Þ`` markers (both the ``<sup>``-wrapped and the bare form, with optional
    ``*`` and digits between them), drops ``**`` and ``#`` markup, collapses whitespace
    runs to single spaces, and trims spaces, ``*``, ``:``, em dashes and hyphens from
    both ends.
    """
    marker_patterns = (
        r'<sup>ð</sup>\s*\**\d*\**\s*<sup>Þ</sup>',  # superscript-wrapped marker
        r'ð\**\d*\**Þ',                              # bare marker
    )
    cleaned = rest
    for pattern in marker_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    for markup in ('**', '#'):
        cleaned = cleaned.replace(markup, '')

    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip(' *:—-')
|
|
|
|
def is_header(markup, rest):
    """Decide whether a line that starts with a clause code is a header line.

    ``markup`` is the leading markup captured before the code and ``rest`` is the text
    after it. The line is accepted when the markup contains ``#`` or ``*``, or when
    nothing but whitespace follows the code. It is rejected when ``rest`` matches
    ``REF_LEAD``, or when its first non-blank character is one of ``,;.)``, a Hangul
    syllable, or a lowercase letter. Otherwise the result is whether the stripped
    remainder matches ``TITLE_AFTER``.
    """
    if any(mark in markup for mark in '#*'):
        return True

    stripped = rest.strip()
    if not stripped:
        return True

    if REF_LEAD.match(rest):
        return False

    first = stripped[0]
    if first in ',;.)' or '가' <= first <= '힣' or first.islower():
        return False

    return TITLE_AFTER.match(stripped) is not None
|
|
|
|
def paginate(body):
    """Split ``body`` into line-aligned pages that each stay within ``PAGE_TOK``.

    Lines are accumulated in order; each line costs ``tok(line) + 1``. When adding a
    line would push a non-empty page past ``PAGE_TOK``, that page is closed and the line
    starts the next one. Lines are never split, so a single line whose cost exceeds
    ``PAGE_TOK`` becomes an oversize page of its own. The number of pages is not capped.

    Returns the list of page strings; joining them with a newline reproduces ``body``.
    """
    pages = []
    pending = []
    used = 0
    for line in body.split('\n'):
        cost = tok(line) + 1
        if pending and used + cost > PAGE_TOK:
            # Close the current page; this line opens a fresh one.
            pages.append('\n'.join(pending))
            pending = []
            used = 0
        pending.append(line)
        used += cost
    if pending:
        pages.append('\n'.join(pending))
    return pages
|
|
|
|
def build_clauses(text):
    """Split ``text`` into per-clause records, paginating bodies whose tok() exceeds CAP.

    A clause starts at the first line (per code) that matches ``LINE_RE``, carries a
    top-level code (``EXACT_TOP``) and passes ``is_header``; it runs up to the next such
    line or the end of ``text``. Text before the first header is not emitted.

    Returns a list of dicts with keys ``code``, ``part``, ``title``, ``body``, ``tok``,
    ``links`` and ``order`` (running index over the emitted records). For a paginated
    clause the first page keeps the clause code, title and links; later pages get a
    ``·pN`` code suffix, a page marker in the title and an empty ``links`` list.
    """
    # Pass 1: find header lines and the character offset at which each one starts.
    headers = []
    known_codes = set()
    cursor = 0
    for line in text.split('\n'):
        line_start = cursor
        cursor += len(line) + 1  # +1 for the newline removed by split
        hit = LINE_RE.match(line)
        if hit is None:
            continue
        markup, code, rest = hit.groups()
        if not EXACT_TOP.match(code) or not is_header(markup, rest) or code in known_codes:
            continue
        known_codes.add(code)
        headers.append((line_start, code, clean_title(rest)))

    # Pass 2: slice each clause body and collect the top-level codes it mentions.
    starts = [header[0] for header in headers]
    ends = starts[1:] + [len(text)]
    raw = []
    for (start, code, title), end in zip(headers, ends):
        body = text[start:end]
        mentioned = {
            re.match(r'^[A-Z]{1,4}-\d+', ref).group(0)
            for ref in MENTION_RE.findall(body)
        }
        mentioned.discard(code)  # a clause does not link to itself
        raw.append({
            'code': code,
            'part': re.match(r'^[A-Z]{2,4}', code).group(0),
            'title': f"{code} {title}" if title else code,
            'body': body,
            'tok': tok(body),
            'links': sorted(mentioned),
        })

    # Pass 3: expand over-CAP clauses into pages; 'order' is the index in the final list.
    final = []
    for clause in raw:
        if clause['tok'] <= CAP:
            final.append({**clause, 'order': len(final)})
            continue
        pages = paginate(clause['body'])
        total = len(pages)
        for page_no, page_body in enumerate(pages, start=1):
            is_first = page_no == 1
            if is_first:
                page_code = clause['code']
                page_title = clause['title']
                page_links = clause['links']
            else:
                page_code = f"{clause['code']}·p{page_no}"
                page_title = f"{clause['title']} (페이지 {page_no}/{total})"
                page_links = []
            final.append({
                'code': page_code,
                'part': clause['part'],
                'order': len(final),
                'title': page_title,
                'body': page_body,
                'tok': tok(page_body),
                'links': page_links,
            })
    return final
|
|
|
|
async def main():
    """Split one parent document into clause documents and optionally persist them.

    Command line: ``<parent_id> [--commit]``. Without ``--commit`` only a summary is
    printed (dry run). With ``--commit`` the parent's existing ``doc_kind='clause'`` rows
    are deleted and the freshly built ones inserted inside a single transaction, each
    with a ``part`` tag in ``document_tags``.

    Requires the ``DATABASE_URL`` environment variable; a ``+asyncpg`` driver suffix in
    it is stripped before connecting.

    Raises:
        SystemExit: when ``<parent_id>`` is missing or is not an integer.
        KeyError: when ``DATABASE_URL`` is not set.
    """
    try:
        parent = int(sys.argv[1])
    except (IndexError, ValueError):
        raise SystemExit("Usage: python3 asme_clause_persist.py <parent_id> [--commit]") from None
    commit = '--commit' in sys.argv

    import asyncpg
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        row = await conn.fetchrow(
            "SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent)
        if not row:
            print(f"parent {parent} not found")
            return

        # md_content may be NULL; treat that the same as an empty document.
        clauses = build_clauses(row['md_content'] or '')
        if not clauses:
            # statistics.median()/max() raise on empty input. Stop here, and before the
            # DELETE below, so an unparseable parent never wipes its existing clause rows.
            print(f"parent={parent} clause_docs=0 (no clause headers found); nothing to persist.")
            return

        toks = [c['tok'] for c in clauses]
        over = [c for c in clauses if c['tok'] > CAP]
        print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} "
              f"max_tok={max(toks)} over_cap_remaining={len(over)}")
        if over:
            print("still over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])

        if not commit:
            print("DRY-RUN. pass --commit to persist.")
            return

        # Delete-then-insert in one transaction keeps the operation idempotent per parent.
        async with conn.transaction():
            deld = await conn.execute(
                "DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
            print("deleted prior:", deld)
            for c in clauses:
                fh = hashlib.sha256(f"{parent}:{c['code']}:{c['body']}".encode()).hexdigest()
                cid = await conn.fetchval("""
                    INSERT INTO documents
                    (file_format, file_hash, title, md_content, parent_id, doc_kind,
                    clause_code, clause_part, clause_order, ai_domain, data_origin,
                    md_status, review_status, conversion_status, preview_status)
                    VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
                    RETURNING id
                    """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'],
                    row['ai_domain'], row['data_origin'] or 'external')
                await conn.execute(
                    "INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') "
                    "ON CONFLICT DO NOTHING", cid, c['part'])

        n = await conn.fetchval(
            "SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
        print(f"COMMITTED: {n} clause docs for parent {parent}")
    finally:
        # Always release the connection, including on early returns and errors.
        await conn.close()
|
|
|
|
if __name__ == "__main__":
    # Run only when executed as a script, so importing this module has no side effects.
    asyncio.run(main())
|