Files
hyungi_document_server/scripts/asme_backlinks_persist.py
T

54 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""ASME clause-KB backlinks: resolve clause-id mentions in each clause doc -> clause_links.
dst resolved to the clause doc of the same parent (top-level code); sub-code mention -> anchor;
unresolved (cross-standard / material spec not split) -> dangling (dst_doc_id NULL).
Idempotent per parent. Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]
"""
import asyncio, os, re, sys
# Clause-id mention pattern: 1-4 uppercase letters, a hyphen, digits, any number
# of dotted numeric sub-parts, and at most one trailing letter
# (e.g. "UG-27", "UW-11.2", "UW-11.2a"). The lookbehind/lookahead reject
# candidates that are directly attached to other ASCII letters or digits.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
def top(code):
    """Return the top-level part of a clause code: leading letters, '-', digits.

    Dotted sub-parts and any trailing letter are dropped, e.g.
    ``"UW-11.2a" -> "UW-11"``. Raises ``AttributeError`` when *code* does not
    start with such a prefix (``re.match`` returns ``None``).
    """
    head = re.match(r'^[A-Z]{1,4}-\d+', code)
    return head.group(0)
def _collect_edges(docs):
    """Build clause_links rows from the clause-id mentions found in *docs*.

    *docs* is a sequence of mappings with keys ``id``, ``clause_code`` and
    ``md_content``. Returns ``(edges, resolved, dangling)`` where each edge is
    ``(src_id, dst_code, dst_doc_id, anchor, ctx, char_off)`` and the two
    counters say how many edges did / did not resolve to a doc in *docs*.
    """
    code2id = {d['clause_code']: d['id'] for d in docs}
    edges = []
    resolved = dangling = 0
    for d in docs:
        # NOTE(review): md_content is assumed to be possibly NULL; an empty
        # body simply yields no mentions instead of crashing finditer().
        body = d['md_content'] or ''
        src_top = d['clause_code']
        seen = set()  # mention codes already recorded for this source doc
        for m in MENTION_RE.finditer(body):
            code = m.group(1)
            t = top(code)
            if t == src_top:
                continue  # self-reference
            if code in seen:
                continue  # keep only the first occurrence per (src, dst_code)
            seen.add(code)
            dst_id = code2id.get(t)  # resolve to same-parent clause doc
            # A sub-code mention (e.g. "UW-11.2") links to its top-level doc
            # with an anchor; a plain top-level mention has no anchor.
            anchor = code.lower().replace('.', '-') if code != t else None
            off = m.start()
            # Up to 50 chars either side of the match start, whitespace-collapsed.
            ctx = re.sub(r'\s+', ' ', body[max(0, off - 50):off + 50]).strip()
            edges.append((d['id'], code, dst_id, anchor, ctx, off))
            # Compare against None so a legitimate id of 0 is not counted as dangling.
            if dst_id is not None:
                resolved += 1
            else:
                dangling += 1
    return edges, resolved, dangling


async def main():
    """Resolve clause mentions for one parent document and optionally persist them.

    Command line: ``<parent_id> [--commit]``. Without ``--commit`` only the
    statistics are printed (dry run). With it, the existing ``clause_links``
    rows whose source is one of the parent's clause docs are replaced inside a
    single transaction.

    Reads the connection string from the ``DATABASE_URL`` environment variable
    (``KeyError`` if unset), with any ``+asyncpg`` fragment removed before it is
    passed to ``asyncpg.connect``.

    Raises:
        SystemExit: if ``<parent_id>`` is missing or not an integer.
    """
    try:
        parent = int(sys.argv[1])
    except (IndexError, ValueError):
        raise SystemExit(
            "Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]"
        ) from None
    commit = '--commit' in sys.argv

    import asyncpg
    from collections import Counter

    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        docs = await conn.fetch(
            "SELECT id, clause_code, md_content FROM documents "
            "WHERE parent_id=$1 AND doc_kind='clause' ORDER BY clause_order", parent)
        edges, resolved, dangling = _collect_edges(docs)
        print(f"parent={parent} clause_docs={len(docs)} edges={len(edges)} resolved={resolved} dangling={dangling}")

        # top referenced clauses (resolved edges only)
        tgt = Counter(top(e[1]) for e in edges if e[2] is not None)
        print("most-referenced:", tgt.most_common(8))

        if not commit:
            print("DRY-RUN. pass --commit to persist.")
            return

        ids = [d['id'] for d in docs]
        # Delete + insert in one transaction so a failed insert cannot leave
        # the parent's links wiped.
        async with conn.transaction():
            await conn.execute("DELETE FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
            if edges:  # nothing to insert when no mentions were found
                await conn.executemany(
                    "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) "
                    "VALUES ($1,$2,$3,$4,$5,$6)", edges)
            n = await conn.fetchval("SELECT count(*) FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
        print(f"COMMITTED: {n} clause_links for parent {parent}")
    finally:
        # Always release the connection, including on errors and the dry-run return.
        await conn.close()
if __name__ == "__main__":
    # Run only when executed as a script, so importing this module
    # does not start the event loop.
    asyncio.run(main())