54 lines
2.8 KiB
Python
54 lines
2.8 KiB
Python
#!/usr/bin/env python3
"""ASME clause-KB backlinks: resolve clause-id mentions in each clause doc into clause_links rows.

Each mention is resolved to the clause doc of the same parent whose clause code is the
mention's top-level code; a mention of a sub-code is additionally given an anchor.
Mentions that cannot be resolved (cross-standard references, or material specs that were
not split into clause docs) are stored as dangling links (dst_doc_id NULL).

Idempotent per parent. Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]
"""
|
|
import asyncio, os, re, sys
|
|
|
|
# Matches one clause-id mention, e.g. "UG-27", "UW-11.2" or "UG-84a":
# 1-4 uppercase letters, a hyphen, digits, optional dotted numeric sub-levels,
# and an optional single trailing letter. The lookbehind/lookahead reject a
# match that is embedded in a longer alphanumeric token. Group 1 is the code.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
|
|
def top(code):
    """Return the top-level prefix of a clause code.

    The prefix is the leading 1-4 uppercase letters, the hyphen and the first
    run of digits, so ``'UG-27.1'`` yields ``'UG-27'``.

    Raises:
        AttributeError: if *code* does not start with such a prefix (the
            regex match is ``None``).
    """
    head = re.match(r'^[A-Z]{1,4}-\d+', code)
    return head.group(0)
|
|
|
|
async def main():
    """Scan the clause docs of one parent for clause-id mentions and persist them.

    The parent id is read from ``sys.argv[1]``; ``--commit`` anywhere on the
    command line enables writing. Without it only the summary is printed
    (dry run). With it, every ``clause_links`` row whose ``src_doc_id`` is one
    of this parent's clause docs is deleted and the freshly computed rows are
    inserted in the same transaction, so re-running for a parent replaces its
    links instead of duplicating them.

    Raises:
        SystemExit: if no parent id was given on the command line.
        ValueError: if the parent id is not an integer.
        KeyError: if the ``DATABASE_URL`` environment variable is not set.
    """
    if len(sys.argv) < 2:
        raise SystemExit("usage: python3 asme_backlinks_persist.py <parent_id> [--commit]")
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv

    # Function-scope imports, as in the original script.
    import asyncpg
    from collections import Counter

    # Drop a '+asyncpg' driver suffix (presumably from a SQLAlchemy-style URL)
    # before handing the DSN to asyncpg.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        docs = await conn.fetch(
            "SELECT id, clause_code, md_content FROM documents "
            "WHERE parent_id=$1 AND doc_kind='clause' ORDER BY clause_order", parent)
        code2id = {d['clause_code']: d['id'] for d in docs}

        edges = []  # (src_id, dst_code, dst_doc_id, anchor, ctx, char_off)
        resolved = dangling = 0
        for d in docs:
            # NOTE(review): assumes md_content may be NULL; treat it as empty.
            body = d['md_content'] or ''
            src_top = d['clause_code']
            seen = set()  # codes already linked from this document
            for m in MENTION_RE.finditer(body):
                code = m.group(1)
                t = top(code)
                if t == src_top:
                    continue  # self-reference
                if code in seen:
                    continue  # keep only the first mention per (src, dst_code)
                seen.add(code)

                dst_id = code2id.get(t)  # resolve to same-parent clause doc
                # Only sub-code mentions get an anchor, e.g. 'UG-27.1' -> 'ug-27-1'.
                anchor = code.lower().replace('.', '-') if code != t else None
                off = m.start()
                # Up to 50 characters either side of the mention start,
                # with whitespace runs collapsed to single spaces.
                ctx = re.sub(r'\s+', ' ', body[max(0, off-50):off+50]).strip()
                edges.append((d['id'], code, dst_id, anchor, ctx, off))
                if dst_id is not None:
                    resolved += 1
                else:
                    dangling += 1

        print(f"parent={parent} clause_docs={len(docs)} edges={len(edges)} resolved={resolved} dangling={dangling}")

        # top referenced clauses (resolved links only)
        tgt = Counter(top(e[1]) for e in edges if e[2] is not None)
        print("most-referenced:", tgt.most_common(8))

        if not commit:
            print("DRY-RUN. pass --commit to persist.")
            return

        ids = [d['id'] for d in docs]
        # Delete and re-insert atomically so a failure cannot leave the
        # parent with its old links removed and no new ones written.
        async with conn.transaction():
            await conn.execute("DELETE FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
            if edges:  # nothing to insert when no mention was found
                await conn.executemany(
                    "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) "
                    "VALUES ($1,$2,$3,$4,$5,$6)", edges)

        # Count and report only after the transaction block has exited.
        n = await conn.fetchval("SELECT count(*) FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
        print(f"COMMITTED: {n} clause_links for parent {parent}")
    finally:
        # Always release the connection, including on query or scan errors.
        await conn.close()
|
|
|
|
# Run only when executed as a script, so that importing this module (for
# example to reuse MENTION_RE or top) does not parse sys.argv or open a
# database connection.
if __name__ == "__main__":
    asyncio.run(main())
|