feat(clause-kb): 책 API(절 목차/백링크) + /book/[id] 유기적 책 리더 + persist 스크립트
This commit is contained in:
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python3
|
||||
"""ASME clause-KB backlinks: resolve clause-id mentions in each clause doc -> clause_links.
|
||||
dst resolved to the clause doc of the same parent (top-level code); sub-code mention -> anchor;
|
||||
unresolved (cross-standard / material spec not split) -> dangling (dst_doc_id NULL).
|
||||
Idempotent per parent. Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]
|
||||
"""
|
||||
import asyncio, os, re, sys
|
||||
|
||||
# Clause-code mention: 1-4 capital letters, '-', digits, optional dotted numeric
# parts and at most one trailing letter. The lookbehind/lookahead reject matches
# that are glued to other ASCII letters or digits.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
|
||||
def top(code):
    """Return the top-level part of a clause code (letters, '-', first number).

    The dotted suffix and any trailing letter are dropped. A ``code`` that does
    not start with 1-4 capital letters, '-', and a digit makes ``re.match``
    return None, so the ``.group`` call raises AttributeError.
    """
    prefix = re.match(r'^[A-Z]{1,4}-\d+', code)
    return prefix.group(0)
|
||||
|
||||
async def main():
    """Resolve clause-code mentions in one parent's clause docs into clause_links rows.

    Command line: ``<parent_id> [--commit]``. Without ``--commit`` only the
    summary is printed (dry run). With it, every existing ``clause_links`` row
    whose source is one of this parent's clause docs is deleted and the freshly
    computed edges are inserted inside a single transaction.

    Each edge is ``(src_id, dst_code, dst_doc_id, anchor, ctx, char_off)``:
    ``dst_doc_id`` is the same-parent clause doc whose ``clause_code`` equals the
    mention's top-level code, or None when no such doc exists (dangling);
    ``anchor`` is set only when the mention is more specific than its top-level
    code.

    Raises:
        SystemExit: when no parent id argument is given.
        ValueError: when the parent id argument is not an integer.
        KeyError: when ``DATABASE_URL`` is not set in the environment.
    """
    if len(sys.argv) < 2:
        raise SystemExit("Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]")
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv

    import asyncpg
    from collections import Counter

    # The URL may carry a '+asyncpg' driver suffix; it is removed before connecting.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        docs = await conn.fetch(
            "SELECT id, clause_code, md_content FROM documents "
            "WHERE parent_id=$1 AND doc_kind='clause' ORDER BY clause_order", parent)
        code2id = {d['clause_code']: d['id'] for d in docs}

        edges = []  # (src_id, dst_code, dst_doc_id, anchor, ctx, char_off)
        resolved = dangling = 0
        for d in docs:
            # A NULL body simply yields no mentions instead of crashing finditer().
            body = d['md_content'] or ''
            src_top = d['clause_code']
            seen = set()  # mention codes already recorded for this source doc
            for m in MENTION_RE.finditer(body):
                code = m.group(1)
                t = top(code)
                if t == src_top:
                    continue  # self-reference
                if code in seen:
                    continue  # keep only the first occurrence per (src, dst_code)
                seen.add(code)
                dst_id = code2id.get(t)  # resolve to same-parent clause doc
                anchor = code.lower().replace('.', '-') if code != t else None
                off = m.start()
                # Up to 50 characters either side of the match start, whitespace collapsed.
                ctx = re.sub(r'\s+', ' ', body[max(0, off - 50):off + 50]).strip()
                edges.append((d['id'], code, dst_id, anchor, ctx, off))
                # Compare against None so a legitimate id of 0 still counts as resolved.
                if dst_id is not None:
                    resolved += 1
                else:
                    dangling += 1

        print(f"parent={parent} clause_docs={len(docs)} edges={len(edges)} resolved={resolved} dangling={dangling}")
        # top referenced clauses (resolved edges only)
        tgt = Counter(top(e[1]) for e in edges if e[2] is not None)
        print("most-referenced:", tgt.most_common(8))

        if not commit:
            print("DRY-RUN. pass --commit to persist.")
            return

        async with conn.transaction():
            ids = [d['id'] for d in docs]
            # Delete-then-insert inside one transaction keeps a re-run idempotent.
            await conn.execute("DELETE FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
            if edges:
                await conn.executemany(
                    "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) "
                    "VALUES ($1,$2,$3,$4,$5,$6)", edges)
            n = await conn.fetchval("SELECT count(*) FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
        print(f"COMMITTED: {n} clause_links for parent {parent}")
    finally:
        # Always release the connection, including on errors and the dry-run return.
        await conn.close()
|
||||
|
||||
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""ASME clause-KB persist: split a parent standard into per-clause documents (A-granularity).
|
||||
Idempotent per parent. Clause docs: doc_kind='clause', embedding NULL (search-excluded via
|
||||
doc_kind filter), parent_id=<parent>. Also writes Part tags. Run inside fastapi container.
|
||||
Usage: python3 asme_clause_persist.py <parent_id> [--commit]
|
||||
"""
|
||||
import asyncio, os, re, sys, hashlib, statistics
|
||||
|
||||
# Clauses whose tok() value exceeds this are reported as "over-CAP" by main().
CAP = 12000

# Per-character weights used by tok(): EN for every non-Hangul character,
# KO for Hangul syllables.
EN = 0.217
KO = 0.529

# A line that starts with up to 8 markup characters (space, tab, '#', '>', '*')
# followed by a clause code; groups are (markup, code, rest-of-line).
LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$')

# A clause code mentioned anywhere in running text, not glued to other alphanumerics.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')

# Top-level clause code (no dotted suffix).
EXACT_TOP = re.compile(r'^[A-Z]{2,4}-\d+$')

# Text after the code that begins (after optional whitespace/dots) with a capital or '('.
TITLE_AFTER = re.compile(r'^[\s.]*[A-Z(]')

# Text after the code that begins with an English or Korean connective/particle,
# which is_header() treats as a sign of an in-sentence reference.
REF_LEAD = re.compile(
    r'^[\s.]*(and|or|to|of|in|on|the|as|is|are|shall|through|per|see|with|'
    r'for|by|that|which|such|또는|및|등|의|은|는|에|을|를|과|와)\b',
    re.I,
)
|
||||
|
||||
def tok(s):
    """Return a weighted character count of ``s``, truncated to an int.

    Hangul syllables ('가'..'힣') are weighted by KO and every other character
    by EN. Presumably this serves as a token-count estimate (it is compared
    against CAP elsewhere) — confirm the weights against the tokenizer in use.
    """
    hangul = len([ch for ch in s if '가' <= ch <= '힣'])
    other = len(s) - hangul
    return int(other * EN + hangul * KO)
|
||||
|
||||
def clean_title(rest):
    """Turn the text that follows a clause code on a header line into a plain title.

    Removes ``ð…Þ`` revision-bar markers (both the ``<sup>`` form and the plain
    form), drops ``**`` and ``#`` markup, collapses whitespace runs to a single
    space, and trims spaces, ``*``, ``:``, em dashes and hyphens from both ends.
    """
    revision_bar_patterns = (
        r'<sup>ð</sup>\s*\**\d*\**\s*<sup>Þ</sup>',  # revision bar (sup form)
        r'ð\**\d*\**Þ',                              # revision bar (plain)
    )
    title = rest
    for pattern in revision_bar_patterns:
        title = re.sub(pattern, '', title)
    for markup in ('**', '#'):
        title = title.replace(markup, '')
    collapsed = re.sub(r'\s+', ' ', title)
    return collapsed.strip(' *:—-')
|
||||
|
||||
def is_header(markup, rest):
    """Decide whether a line that starts with a clause code is a clause header.

    Args:
        markup: leading markup characters captured before the code.
        rest: everything on the line after the code.

    Returns:
        True when the markup contains '#' or '*', or nothing but whitespace
        follows the code. False when the remainder opens with a connective
        matched by REF_LEAD, with one of ``, ; . )``, with a Hangul syllable,
        or with a lowercase letter. Otherwise True only if the remainder
        matches TITLE_AFTER.
    """
    # Explicit heading/emphasis markup settles it immediately.
    if any(mark in markup for mark in '#*'):
        return True

    stripped = rest.strip()
    if not stripped:
        # A bare code on its own line counts as a header.
        return True

    if REF_LEAD.match(rest):
        return False

    lead = stripped[0]
    reads_like_sentence = lead in ',;.)' or '가' <= lead <= '힣' or lead.islower()
    if reads_like_sentence:
        return False

    return TITLE_AFTER.match(stripped) is not None
|
||||
|
||||
def build_clauses(text):
    """Split ``text`` into per-clause records at top-level clause header lines.

    A boundary is a line that matches LINE_RE, whose code matches EXACT_TOP,
    that is_header() accepts, and whose code has not been seen earlier in the
    text (first occurrence wins). Each clause body runs from its header line to
    the next boundary, or to the end of the text. Text before the first
    boundary is not part of any clause.

    Returns:
        A list of dicts in document order with keys ``code``, ``part`` (the
        letter prefix of the code), ``order`` (0-based index), ``title`` (code
        plus cleaned header text), ``body``, ``tok`` (tok(body)) and ``links``
        (sorted top-level codes mentioned in the body, excluding the clause's
        own code).
    """
    # exact-top-level HEADER boundaries, first-seen only
    boundaries = []  # (char offset, code, title)
    known_codes = set()
    cursor = 0
    for line in text.split('\n'):
        line_start = cursor
        cursor += len(line) + 1  # +1 accounts for the '\n' removed by split
        hit = LINE_RE.match(line)
        if hit is None:
            continue
        markup, code, rest = hit.groups()
        if not EXACT_TOP.match(code) or not is_header(markup, rest) or code in known_codes:
            continue
        known_codes.add(code)
        boundaries.append((line_start, code, clean_title(rest)))

    # Each clause ends where the next one starts; the last runs to end of text.
    stops = [pos for pos, _, _ in boundaries[1:]] + [len(text)]

    clauses = []
    for order, ((start, code, title), stop) in enumerate(zip(boundaries, stops)):
        body = text[start:stop]
        mentioned = {re.match(r'^[A-Z]{1,4}-\d+', ref).group(0)
                     for ref in MENTION_RE.findall(body)}
        mentioned.discard(code)  # a clause does not link to itself
        clauses.append({
            'code': code,
            'part': re.match(r'^[A-Z]{2,4}', code).group(0),
            'order': order,
            'title': code + ' ' + title if title else code,
            'body': body,
            'tok': tok(body),
            'links': sorted(mentioned),
        })
    return clauses
|
||||
|
||||
async def main():
    """Split one parent document into per-clause documents and optionally persist them.

    Command line: ``<parent_id> [--commit]``. The parent's ``md_content`` is
    split with build_clauses() and a summary is printed. Without ``--commit``
    nothing is written (dry run). With it, inside a single transaction, the
    parent's existing ``doc_kind='clause'`` documents are deleted and one
    document plus one ``part`` tag is inserted per clause.

    When the parent is missing, or no top-level clause headers are detected,
    a message is printed and nothing is written.

    Raises:
        SystemExit: when no parent id argument is given.
        ValueError: when the parent id argument is not an integer.
        KeyError: when ``DATABASE_URL`` is not set in the environment.
    """
    if len(sys.argv) < 2:
        raise SystemExit("Usage: python3 asme_clause_persist.py <parent_id> [--commit]")
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv

    import asyncpg

    # The URL may carry a '+asyncpg' driver suffix; it is removed before connecting.
    dsn = os.environ['DATABASE_URL'].replace('+asyncpg', '')
    conn = await asyncpg.connect(dsn)
    try:
        row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent)
        if not row:
            print(f"parent {parent} not found")
            return

        # A NULL body is treated as empty text rather than crashing the splitter.
        clauses = build_clauses(row['md_content'] or '')
        if not clauses:
            # median()/max() raise on an empty list, and committing here would only
            # delete existing clause docs, so stop before touching anything.
            print(f"parent={parent} clause_docs=0 (no top-level clause headers found); nothing to persist")
            return

        toks = [c['tok'] for c in clauses]
        over = [c for c in clauses if c['tok'] > CAP]
        print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} "
              f"max_tok={max(toks)} over_cap={len(over)} total_backlinks={sum(len(c['links']) for c in clauses)}")
        print("sample:", [f"{c['code']}:{c['tok']}t" for c in clauses[:8]])
        if over:
            print("over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])

        if not commit:
            print("DRY-RUN (no write). pass --commit to persist.")
            return

        async with conn.transaction():
            # idempotent: remove prior clause docs of this parent.
            # NOTE(review): the original comment says this cascades to
            # clause_links/document_tags — confirm against the schema's FKs.
            deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
            print("deleted prior:", deld)
            for c in clauses:
                # Hash over parent id, clause code and body.
                fh = hashlib.sha256(f"{parent}:{c['code']}:{c['body']}".encode()).hexdigest()
                cid = await conn.fetchval("""
                    INSERT INTO documents
                      (file_format, file_hash, title, md_content, parent_id, doc_kind,
                       clause_code, clause_part, clause_order, ai_domain, data_origin,
                       md_status, review_status, conversion_status, preview_status)
                    VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
                    RETURNING id
                """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'],
                    row['ai_domain'], row['data_origin'] or 'external')
                # Part tag
                await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') "
                                   "ON CONFLICT DO NOTHING", cid, c['part'])
            n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
        print(f"COMMITTED: {n} clause docs for parent {parent}")
    finally:
        # Always release the connection, including on early returns and errors.
        await conn.close()
|
||||
|
||||
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user