feat(clause-kb): 책 API(절 목차/백링크) + /book/[id] 유기적 책 리더 + persist 스크립트
This commit is contained in:
@@ -1821,3 +1821,153 @@ async def analyze_document(
|
|||||||
error_code=error_code,
|
error_code=error_code,
|
||||||
source=source,
|
source=source,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── ASME 절-지식베이스: 유기적 책 네비 (clause-KB, doc_kind='clause' 자식 문서 기반) ───
|
||||||
|
# One table-of-contents entry for a clause document. The fields mirror the
# columns the clause queries in this module select from the documents table.
class ClauseTocItem(BaseModel):
    id: int  # primary key of the clause document row
    clause_code: str | None = None  # documents.clause_code of the clause
    clause_part: str | None = None  # documents.clause_part (grouping label)
    clause_order: int | None = None  # documents.clause_order; the TOC is sorted by it
    title: str | None = None  # documents.title
|
||||||
|
|
||||||
|
|
||||||
|
# Response body of GET /{doc_id}/clauses: the requested parent document plus
# the table of contents built from its clause child documents.
class ClauseBookResponse(BaseModel):
    parent_id: int  # id of the parent document that was requested
    parent_title: str | None = None  # title of that parent document
    clauses: list[ClauseTocItem]  # child clause documents, ordered by clause_order
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{doc_id}/clauses", response_model=ClauseBookResponse)
async def get_document_clauses(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return the clause table of contents of a parent standard document.

    Children with doc_kind='clause' are returned ordered by clause_order.

    Clause documents are in_corpus=false + doc_kind='clause' (excluded from
    search), so they do not show up in the general listing/search; this
    in-book navigation is the dedicated path from the parent standard into
    its child clauses (ASME 2025 edition = a single book).

    Args:
        doc_id: id of the parent document.
        user: resolved by the get_current_user dependency; not read here.
        session: async database session.

    Raises:
        HTTPException: 404 when the parent row is missing or soft-deleted.
    """
    from sqlalchemy import text as sql_text

    parent = await session.get(Document, doc_id)
    if not parent or parent.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    toc_query = sql_text(
        """
        SELECT id, clause_code, clause_part, clause_order, title
        FROM documents
        WHERE parent_id = :pid AND doc_kind = 'clause' AND deleted_at IS NULL
        ORDER BY clause_order
        """
    ).bindparams(pid=doc_id)
    result = await session.execute(toc_query)
    toc_items = [ClauseTocItem(**dict(row)) for row in result.mappings().all()]

    return ClauseBookResponse(
        parent_id=doc_id,
        parent_title=parent.title,
        clauses=toc_items,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# One cross-reference edge between clause documents, as returned by the
# backlinks endpoint (used for both the forward and the back lists).
class BacklinkRef(BaseModel):
    code: str  # clause code at the other end of the link
    doc_id: int | None = None  # resolved clause document (same parent) — None when dangling
    title: str | None = None  # title of the linked document, when one was joined
    anchor: str | None = None  # clause_links.anchor; only the forward query selects it
    ctx: str | None = None  # clause_links.ctx stored for the mention
|
||||||
|
|
||||||
|
|
||||||
|
# Response body of GET /{doc_id}/backlinks: neighbours of a clause document in
# reading order plus its cross-references in both directions.
class BacklinksResponse(BaseModel):
    doc_id: int  # id of the document that was requested
    clause_code: str | None = None  # clause_code of that document
    parent_id: int | None = None  # parent_id of that document
    prev: ClauseTocItem | None = None  # sibling clause with the next-lower clause_order, if any
    next: ClauseTocItem | None = None  # sibling clause with the next-higher clause_order, if any
    forward: list[BacklinkRef]  # clauses that this clause references
    back: list[BacklinkRef]  # clauses that reference this clause
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{doc_id}/backlinks", response_model=BacklinksResponse)
async def get_document_backlinks(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return both link directions of a clause document plus its prev/next siblings.

    ``forward`` lists the clause_links rows whose source is this document
    (ordered by char_off); ``back`` lists rows whose resolved target is this
    document (ordered by the source's clause_order). Each list is capped at
    300 rows. ``prev``/``next`` are the sibling clause documents (same
    parent_id, doc_kind='clause', not soft-deleted) with the nearest lower /
    higher clause_order; both stay None when the document has no parent_id or
    no clause_order.

    Soft-deleted documents are excluded from both link lists, consistent with
    the TOC and prev/next queries: a forward link whose target is soft-deleted
    is reported as dangling (doc_id and title are None), and links coming from
    soft-deleted sources are omitted.

    Args:
        doc_id: id of the clause document.
        user: resolved by the get_current_user dependency; not read here.
        session: async database session.

    Raises:
        HTTPException: 404 when the document is missing or soft-deleted.
    """
    from sqlalchemy import text as sql_text

    doc = await session.get(Document, doc_id)
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    forward = (
        await session.execute(
            sql_text(
                """
                SELECT cl.dst_code AS code, d.id AS doc_id, cl.anchor, cl.ctx, d.title
                FROM clause_links cl
                LEFT JOIN documents d
                  ON d.id = cl.dst_doc_id AND d.deleted_at IS NULL
                WHERE cl.src_doc_id = :id
                ORDER BY cl.char_off NULLS LAST
                LIMIT 300
                """
            ).bindparams(id=doc_id)
        )
    ).mappings().all()

    back = (
        await session.execute(
            sql_text(
                """
                SELECT s.clause_code AS code, cl.src_doc_id AS doc_id, s.title, cl.ctx
                FROM clause_links cl
                JOIN documents s ON s.id = cl.src_doc_id
                WHERE cl.dst_doc_id = :id AND s.deleted_at IS NULL
                ORDER BY s.clause_order NULLS LAST
                LIMIT 300
                """
            ).bindparams(id=doc_id)
        )
    ).mappings().all()

    prev = nxt = None
    if doc.parent_id is not None and doc.clause_order is not None:
        # Nearest sibling before this clause in reading order.
        prow = (
            await session.execute(
                sql_text(
                    """
                    SELECT id, clause_code, clause_part, clause_order, title FROM documents
                    WHERE parent_id = :pid AND doc_kind='clause' AND deleted_at IS NULL
                      AND clause_order < :ord
                    ORDER BY clause_order DESC LIMIT 1
                    """
                ).bindparams(pid=doc.parent_id, ord=doc.clause_order)
            )
        ).mappings().first()
        # Nearest sibling after this clause in reading order.
        nrow = (
            await session.execute(
                sql_text(
                    """
                    SELECT id, clause_code, clause_part, clause_order, title FROM documents
                    WHERE parent_id = :pid AND doc_kind='clause' AND deleted_at IS NULL
                      AND clause_order > :ord
                    ORDER BY clause_order ASC LIMIT 1
                    """
                ).bindparams(pid=doc.parent_id, ord=doc.clause_order)
            )
        ).mappings().first()
        prev = ClauseTocItem(**dict(prow)) if prow else None
        nxt = ClauseTocItem(**dict(nrow)) if nrow else None

    return BacklinksResponse(
        doc_id=doc_id,
        clause_code=doc.clause_code,
        parent_id=doc.parent_id,
        prev=prev,
        next=nxt,
        forward=[BacklinkRef(**dict(r)) for r in forward],
        back=[BacklinkRef(**dict(r)) for r in back],
    )
|
||||||
|
|||||||
@@ -0,0 +1,201 @@
|
|||||||
|
<script>
|
||||||
|
// ASME 절-지식베이스: 유기적 단일-책 리더. parent 표준의 절-문서들을 한 권의 책처럼 탐색.
|
||||||
|
// 좌: Part-그룹 TOC / 우: 선택 절 본문 + breadcrumb + 이전/다음 + 양방향 백링크.
|
||||||
|
import { onMount } from 'svelte';
|
||||||
|
import { page } from '$app/stores';
|
||||||
|
import { goto } from '$app/navigation';
|
||||||
|
import { api } from '$lib/api';
|
||||||
|
import MarkdownDoc from '$lib/components/MarkdownDoc.svelte';
|
||||||
|
|
||||||
|
// Reactive page state ($state runes).
let parentId = $state(null); // numeric id of the parent document; set from the route param in onMount
let parentTitle = $state(''); // parent_title returned by /documents/{id}/clauses
let clauses = $state([]); // TOC entries returned by /documents/{id}/clauses
let selectedId = $state(null); // id of the clause currently selected in the TOC
let clauseDoc = $state(null); // response of /documents/{id} for the selected clause
let links = $state(null); // response of /documents/{id}/backlinks for the selected clause
let expanded = $state({}); // part label -> whether that TOC group is open
let loading = $state(false); // true while loadClause is fetching
|
||||||
|
|
||||||
|
// TOC groups derived from `clauses`: one { part, items } entry per distinct
// clause_part, in first-seen order ('·' stands in for a missing part).
// A Map is used for the lookup so that part labels which happen to equal an
// Object.prototype key (e.g. 'constructor') cannot be mistaken for an
// existing group, which the `in` operator on a plain object would do.
let parts = $derived.by(() => {
  const groups = new Map();
  for (const clause of clauses) {
    const label = clause.clause_part || '·';
    let group = groups.get(label);
    if (!group) {
      group = { part: label, items: [] };
      groups.set(label, group);
    }
    group.items.push(clause);
  }
  // Map iteration follows insertion order, i.e. the order parts first appear.
  return [...groups.values()];
});
|
||||||
|
// TOC entry of the currently selected clause, or null when it is not in the loaded list.
let selMeta = $derived(clauses.find((c) => c.id === selectedId) || null);
// Removes the first occurrence of `code` from `title` and trims the result;
// a missing title or code is treated as an empty string.
const strip = (title, code) => (title || '').replace(code || '', '').trim();
|
||||||
|
|
||||||
|
// Fetches the parent's clause TOC, stores title and entries, and opens every part group.
async function loadBook() {
  const book = await api(`/documents/${parentId}/clauses`);
  parentTitle = book?.parent_title ?? '';
  clauses = book?.clauses ?? [];
  // Every part that occurs in the TOC starts expanded.
  expanded = Object.fromEntries(clauses.map((entry) => [entry.clause_part || '·', true]));
}
|
||||||
|
// Selects clause `id`: fetches its document and backlinks in parallel, opens
// its TOC group, mirrors the selection into the URL (?c=<id>) and scrolls to
// the top. Does nothing for a falsy id.
//
// Calls can overlap when the user clicks quickly. Only the most recent
// selection is allowed to publish its result or clear the loading flag, so a
// slow response for an earlier click cannot overwrite the newer clause.
async function loadClause(id) {
  if (!id) return;
  loading = true;
  selectedId = id;
  try {
    const [doc, backlinks] = await Promise.all([
      api(`/documents/${id}`),
      api(`/documents/${id}/backlinks`)
    ]);
    // A newer selection was made while these requests were in flight: drop this result.
    if (selectedId !== id) return;
    clauseDoc = doc;
    links = backlinks;
    const sel = clauses.find((c) => c.id === id);
    if (sel) expanded = { ...expanded, [sel.clause_part || '·']: true };
    goto(`/book/${parentId}?c=${id}`, { replaceState: true, keepFocus: true, noScroll: true });
    window.scrollTo({ top: 0 });
  } finally {
    // Leave the flag alone when a newer request is still in charge of it.
    if (selectedId === id) loading = false;
  }
}
|
||||||
|
|
||||||
|
// On mount: read the parent id from the route, load the TOC, then open the
// clause named by ?c=<id> when it belongs to this book, else the first clause.
onMount(async () => {
  parentId = Number($page.params.id);
  await loadBook();
  const requested = Number($page.url.searchParams.get('c'));
  const isKnown = Boolean(requested) && clauses.some((entry) => entry.id === requested);
  const initialId = isKnown ? requested : clauses[0]?.id;
  await loadClause(initialId);
});
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div class="book">
|
||||||
|
<aside class="toc">
|
||||||
|
<a class="btitle" href={`/documents/${parentId}`}>{parentTitle || 'ASME 표준'}</a>
|
||||||
|
<div class="hint">절 {clauses.length}개 · 한 권의 책처럼 탐색</div>
|
||||||
|
{#each parts as g (g.part)}
|
||||||
|
<div class="part">
|
||||||
|
<button class="phead" onclick={() => (expanded = { ...expanded, [g.part]: !expanded[g.part] })}>
|
||||||
|
<span class="caret">{expanded[g.part] ? '▾' : '▸'}</span>
|
||||||
|
<span class="pname">{g.part}</span>
|
||||||
|
<span class="cnt">{g.items.length}</span>
|
||||||
|
</button>
|
||||||
|
{#if expanded[g.part]}
|
||||||
|
<ul>
|
||||||
|
{#each g.items as c (c.id)}
|
||||||
|
<li>
|
||||||
|
<button class="citem" class:active={c.id === selectedId} onclick={() => loadClause(c.id)}>
|
||||||
|
<b>{c.clause_code}</b><span class="ct">{strip(c.title, c.clause_code)}</span>
|
||||||
|
</button>
|
||||||
|
</li>
|
||||||
|
{/each}
|
||||||
|
</ul>
|
||||||
|
{/if}
|
||||||
|
</div>
|
||||||
|
{/each}
|
||||||
|
</aside>
|
||||||
|
|
||||||
|
<main class="reader">
|
||||||
|
{#if clauseDoc}
|
||||||
|
<nav class="crumb">
|
||||||
|
<a href={`/documents/${parentId}`}>{parentTitle}</a>
|
||||||
|
<span class="sep">›</span><span>{selMeta?.clause_part}</span>
|
||||||
|
<span class="sep">›</span><b>{links?.clause_code ?? selMeta?.clause_code}</b>
|
||||||
|
</nav>
|
||||||
|
<div class="flow">
|
||||||
|
<button disabled={!links?.prev} onclick={() => loadClause(links?.prev?.id)}>
|
||||||
|
← {links?.prev?.clause_code ?? ''}
|
||||||
|
</button>
|
||||||
|
<span class="flowmid">{selMeta?.clause_part}</span>
|
||||||
|
<button disabled={!links?.next} onclick={() => loadClause(links?.next?.id)}>
|
||||||
|
{links?.next?.clause_code ?? ''} →
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<h1 class="ctitle">{clauseDoc.title}</h1>
|
||||||
|
{#key clauseDoc.id}
|
||||||
|
<MarkdownDoc
|
||||||
|
documentId={clauseDoc.id}
|
||||||
|
mdContent={clauseDoc.md_content ?? clauseDoc.extracted_text}
|
||||||
|
mdStatus={null}
|
||||||
|
class="prose prose-base max-w-none text-text"
|
||||||
|
/>
|
||||||
|
{/key}
|
||||||
|
|
||||||
|
{#if links && (links.forward.length || links.back.length)}
|
||||||
|
<section class="xlinks">
|
||||||
|
{#if links.forward.length}
|
||||||
|
<div class="xcol">
|
||||||
|
<h3>이 절이 참조 <span>{links.forward.length}</span></h3>
|
||||||
|
<ul>
|
||||||
|
{#each links.forward as f}
|
||||||
|
<li>
|
||||||
|
{#if f.doc_id}
|
||||||
|
<button class="xref" onclick={() => loadClause(f.doc_id)}>{f.code}</button>
|
||||||
|
{:else}
|
||||||
|
<span class="xref dangling" title="외부/미분해 참조">{f.code}</span>
|
||||||
|
{/if}
|
||||||
|
{#if f.title}<span class="xt">{strip(f.title, f.code)}</span>{/if}
|
||||||
|
</li>
|
||||||
|
{/each}
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
{/if}
|
||||||
|
{#if links.back.length}
|
||||||
|
<div class="xcol">
|
||||||
|
<h3>이 절을 참조 <span>{links.back.length}</span></h3>
|
||||||
|
<ul>
|
||||||
|
{#each links.back as b}
|
||||||
|
<li>
|
||||||
|
<button class="xref" onclick={() => loadClause(b.doc_id)}>{b.code}</button>
|
||||||
|
{#if b.title}<span class="xt">{strip(b.title, b.code)}</span>{/if}
|
||||||
|
</li>
|
||||||
|
{/each}
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
{/if}
|
||||||
|
</section>
|
||||||
|
{/if}
|
||||||
|
{:else}
|
||||||
|
<p class="empty">{loading ? '로딩…' : '왼쪽에서 절을 선택하세요'}</p>
|
||||||
|
{/if}
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.book { display: grid; grid-template-columns: 300px 1fr; gap: 0; min-height: 100vh; align-items: start; }
|
||||||
|
.toc { position: sticky; top: 0; max-height: 100vh; overflow-y: auto; border-right: 1px solid #e5e7eb; padding: 16px 12px; background: #fafafa; }
|
||||||
|
.btitle { display: block; font-weight: 700; font-size: 15px; color: #111827; text-decoration: none; line-height: 1.35; }
|
||||||
|
.btitle:hover { text-decoration: underline; }
|
||||||
|
.hint { font-size: 11.5px; color: #6b7280; margin: 4px 0 14px; }
|
||||||
|
.part { margin-bottom: 2px; }
|
||||||
|
.phead { display: flex; align-items: center; gap: 7px; width: 100%; background: none; border: 0; cursor: pointer; padding: 5px 6px; font-size: 13px; font-weight: 600; color: #374151; border-radius: 6px; }
|
||||||
|
.phead:hover { background: #f0f0f0; }
|
||||||
|
.caret { color: #9ca3af; width: 10px; }
|
||||||
|
.pname { flex: 1; text-align: left; }
|
||||||
|
.cnt { font-size: 11px; color: #9ca3af; }
|
||||||
|
.toc ul { list-style: none; margin: 0 0 4px; padding: 0 0 0 16px; }
|
||||||
|
.citem { display: block; width: 100%; text-align: left; background: none; border: 0; cursor: pointer; padding: 3px 7px; font-size: 12.5px; color: #4b5563; border-radius: 5px; line-height: 1.4; }
|
||||||
|
.citem:hover { background: #eef2ff; }
|
||||||
|
.citem.active { background: #e0e7ff; color: #1e3a8a; }
|
||||||
|
.citem b { color: #1d4ed8; margin-right: 5px; }
|
||||||
|
.citem.active b { color: #1e3a8a; }
|
||||||
|
.ct { color: #6b7280; }
|
||||||
|
.citem.active .ct { color: #334155; }
|
||||||
|
.reader { padding: 26px 34px 80px; max-width: 880px; }
|
||||||
|
.crumb { font-size: 12.5px; color: #6b7280; margin-bottom: 12px; }
|
||||||
|
.crumb a { color: #2563eb; text-decoration: none; }
|
||||||
|
.crumb a:hover { text-decoration: underline; }
|
||||||
|
.crumb .sep { margin: 0 6px; color: #cbd5e1; }
|
||||||
|
.flow { display: flex; align-items: center; justify-content: space-between; gap: 10px; margin-bottom: 18px; }
|
||||||
|
.flow button { background: #f3f4f6; border: 1px solid #e5e7eb; border-radius: 7px; padding: 6px 12px; font-size: 12.5px; color: #374151; cursor: pointer; }
|
||||||
|
.flow button:hover:not(:disabled) { background: #e5e7eb; }
|
||||||
|
.flow button:disabled { opacity: 0.4; cursor: default; }
|
||||||
|
.flowmid { font-size: 11.5px; color: #9ca3af; }
|
||||||
|
.ctitle { font-size: 22px; font-weight: 700; color: #111827; margin: 0 0 18px; letter-spacing: -0.2px; }
|
||||||
|
.xlinks { display: grid; grid-template-columns: 1fr 1fr; gap: 18px; margin-top: 40px; padding-top: 20px; border-top: 1px solid #e5e7eb; }
|
||||||
|
@media (max-width: 700px) { .book { grid-template-columns: 1fr; } .toc { position: static; max-height: none; } .xlinks { grid-template-columns: 1fr; } }
|
||||||
|
.xcol h3 { font-size: 13px; color: #374151; margin: 0 0 8px; }
|
||||||
|
.xcol h3 span { color: #9ca3af; font-weight: 400; }
|
||||||
|
.xcol ul { list-style: none; margin: 0; padding: 0; }
|
||||||
|
.xcol li { display: flex; align-items: baseline; gap: 7px; padding: 2px 0; font-size: 12.5px; }
|
||||||
|
.xref { background: #eff6ff; border: 1px solid #dbeafe; color: #1d4ed8; border-radius: 5px; padding: 1px 7px; font-size: 12px; font-weight: 600; cursor: pointer; white-space: nowrap; }
|
||||||
|
.xref:hover { background: #dbeafe; }
|
||||||
|
.xref.dangling { background: #f9fafb; border-color: #e5e7eb; color: #9ca3af; cursor: default; }
|
||||||
|
.xt { color: #6b7280; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
||||||
|
.empty { color: #9ca3af; padding: 60px 0; text-align: center; }
|
||||||
|
</style>
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""ASME clause-KB backlinks: resolve clause-id mentions in each clause doc -> clause_links.
|
||||||
|
dst resolved to the clause doc of the same parent (top-level code); sub-code mention -> anchor;
|
||||||
|
unresolved (cross-standard / material spec not split) -> dangling (dst_doc_id NULL).
|
||||||
|
Idempotent per parent. Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]
|
||||||
|
"""
|
||||||
|
import asyncio, os, re, sys
|
||||||
|
|
||||||
|
# Clause-code mention: 1-4 capital letters, '-', a number, optional dotted
# sub-numbers and one optional trailing letter; the lookarounds reject matches
# glued to surrounding letters or digits. Group 1 is the full code.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
|
||||||
|
def top(code):
    """Return the top-level part of a clause code: letters, dash and first number.

    Raises AttributeError when *code* does not start with such a prefix.
    """
    head = re.match(r'^[A-Z]{1,4}-\d+', code)
    return head.group(0)
|
||||||
|
|
||||||
|
async def main():
    """Rebuild the clause_links rows for all clause documents of one parent.

    Reads the parent id from ``sys.argv[1]`` and the optional ``--commit``
    flag from the command line. For each clause document it scans md_content
    for clause-code mentions and records one edge per distinct mentioned code
    (first occurrence), skipping mentions of the document's own clause code.
    Mentions whose top-level code is not a clause of the same parent become
    dangling edges (dst_doc_id NULL).

    Without ``--commit`` only the statistics are printed. With it, existing
    clause_links rows of these source documents are deleted and the new edges
    inserted inside one transaction.

    The connection is closed on every exit path, including errors, and a
    clause document with NULL md_content is treated as having no mentions.
    """
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv
    import asyncpg
    from collections import Counter

    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        docs = await conn.fetch(
            "SELECT id, clause_code, md_content FROM documents "
            "WHERE parent_id=$1 AND doc_kind='clause' ORDER BY clause_order", parent)
        code2id = {d['clause_code']: d['id'] for d in docs}
        edges = []  # (src_id, dst_code, dst_doc_id, anchor, ctx, char_off)
        resolved = dangling = 0
        for d in docs:
            body = d['md_content'] or ''  # NULL body: nothing to scan
            src_top = d['clause_code']
            seen = set()  # codes already recorded for this source document
            for m in MENTION_RE.finditer(body):
                code = m.group(1)
                t = top(code)
                if t == src_top:
                    continue  # self-reference
                if code in seen:
                    continue  # dedup per (src, dst_code)
                seen.add(code)
                dst_id = code2id.get(t)  # resolve to same-parent clause doc
                # A sub-code mention (e.g. X-1.2) links to its top-level doc with an anchor.
                anchor = code.lower().replace('.', '-') if code != t else None
                off = m.start()
                # Up to 50 characters either side of the mention start, whitespace-collapsed.
                ctx = re.sub(r'\s+', ' ', body[max(0, off - 50):off + 50]).strip()
                edges.append((d['id'], code, dst_id, anchor, ctx, off))
                if dst_id:
                    resolved += 1
                else:
                    dangling += 1
        print(f"parent={parent} clause_docs={len(docs)} edges={len(edges)} resolved={resolved} dangling={dangling}")
        # top referenced clauses
        tgt = Counter(top(e[1]) for e in edges if e[2])
        print("most-referenced:", tgt.most_common(8))
        if not commit:
            print("DRY-RUN. pass --commit to persist.")
            return
        async with conn.transaction():
            ids = [d['id'] for d in docs]
            await conn.execute("DELETE FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
            await conn.executemany(
                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) "
                "VALUES ($1,$2,$3,$4,$5,$6)", edges)
            n = await conn.fetchval("SELECT count(*) FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
        print(f"COMMITTED: {n} clause_links for parent {parent}")
    finally:
        await conn.close()
|
||||||
|
|
||||||
|
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""ASME clause-KB persist: split a parent standard into per-clause documents (A-granularity).
|
||||||
|
Idempotent per parent. Clause docs: doc_kind='clause', embedding NULL (search-excluded via
|
||||||
|
doc_kind filter), parent_id=<parent>. Also writes Part tags. Run inside fastapi container.
|
||||||
|
Usage: python3 asme_clause_persist.py <parent_id> [--commit]
|
||||||
|
"""
|
||||||
|
import asyncio, os, re, sys, hashlib, statistics
|
||||||
|
|
||||||
|
# Token budget per clause document; main() reports the clauses whose estimate exceeds it.
CAP = 12000
# Per-character weights used by tok(): EN for non-Hangul characters, KO for Hangul syllables.
# NOTE(review): presumably empirical tokens-per-character ratios — confirm their source.
EN, KO = 0.217, 0.529
# A line that begins (after up to 8 characters of space, tab, '#', '>' or '*') with a
# clause code. Groups: (leading markup, clause code, rest of the line).
LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$')
# Clause-code mention anywhere in text, not glued to surrounding letters or digits.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
EXACT_TOP = re.compile(r'^[A-Z]{2,4}-\d+$')   # top-level clause code (no dotted suffix)
# Text that starts, after optional whitespace/dots, with a capital letter or '('.
TITLE_AFTER = re.compile(r'^[\s.]*[A-Z(]')
# Text that starts with a connective word or particle (English or Korean, case-insensitive);
# is_header() treats a code followed by such text as a reference, not a heading.
REF_LEAD = re.compile(r'^[\s.]*(and|or|to|of|in|on|the|as|is|are|shall|through|per|see|with|'
                      r'for|by|that|which|such|또는|및|등|의|은|는|에|을|를|과|와)\b', re.I)
|
||||||
|
|
||||||
|
def tok(s):
    """Estimate a token count for *s* from per-character weights.

    Hangul syllables (U+AC00..U+D7A3) are weighted by KO, every other
    character by EN; the weighted sum is truncated to an int.
    """
    hangul = 0
    for ch in s:
        if '가' <= ch <= '힣':
            hangul += 1
    other = len(s) - hangul
    return int(other * EN + hangul * KO)
|
||||||
|
|
||||||
|
def clean_title(rest):
    """Turn the text following a clause code on a heading line into a plain title.

    Removes revision-bar markers (the ``<sup>ð</sup>NN<sup>Þ</sup>`` form and
    the plain ``ðNNÞ`` form), drops ``**`` and ``#`` markup, collapses runs of
    whitespace to one space and strips leading/trailing spaces, ``*``, ``:``,
    em dashes and hyphens.
    """
    revision_bars = (
        r'<sup>ð</sup>\s*\**\d*\**\s*<sup>Þ</sup>',  # revision bar (sup form)
        r'ð\**\d*\**Þ',                              # revision bar (plain)
    )
    cleaned = rest
    for pattern in revision_bars:
        cleaned = re.sub(pattern, '', cleaned)
    for markup in ('**', '#'):
        cleaned = cleaned.replace(markup, '')
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip(' *:—-')
|
||||||
|
|
||||||
|
def is_header(markup, rest):
    """Decide whether a line starting with a clause code is a heading.

    Args:
        markup: the characters that preceded the code on the line.
        rest: the text that followed the code on the line.

    Returns:
        True when the line carries '#' or '*' markup or nothing follows the
        code; False when the remainder starts like running text (a REF_LEAD
        word, one of ``,;.)``, a Hangul syllable or a lowercase letter);
        otherwise whether the remainder matches TITLE_AFTER.
    """
    if any(mark in markup for mark in '#*'):
        return True
    stripped = rest.strip()
    if not stripped:
        return True
    if REF_LEAD.match(rest):
        return False
    first = stripped[0]
    looks_like_prose = first in ',;.)' or '가' <= first <= '힣' or first.islower()
    if looks_like_prose:
        return False
    return TITLE_AFTER.match(stripped) is not None
|
||||||
|
|
||||||
|
def build_clauses(text):
    """Split *text* into top-level clauses at heading lines.

    A boundary is the first line for each exact top-level clause code that
    is_header() accepts; later lines with the same code are ignored. Each
    clause body runs from its boundary to the next boundary (or the end of
    the text).

    Returns:
        A list of dicts with keys ``code``, ``part`` (leading letters of the
        code), ``order`` (0-based position), ``title`` (code plus cleaned
        heading text), ``body``, ``tok`` (token estimate of the body) and
        ``links`` (sorted top-level codes mentioned in the body, excluding
        the clause's own code).
    """
    lines = text.split('\n')
    # Character offset of each line start ('+ 1' accounts for the newline).
    offsets = []
    cursor = 0
    for line in lines:
        offsets.append(cursor)
        cursor += len(line) + 1

    # exact-top-level HEADER boundaries, first-seen only
    bounds = []  # (pos, code, title)
    seen_codes = set()
    for pos, line in zip(offsets, lines):
        found = LINE_RE.match(line)
        if found is None:
            continue
        markup, code, rest = found.groups()
        if not (EXACT_TOP.match(code) and is_header(markup, rest)):
            continue
        if code in seen_codes:
            continue
        seen_codes.add(code)
        bounds.append((pos, code, clean_title(rest)))

    # Each clause ends where the next one starts; the last one ends with the text.
    ends = [nxt[0] for nxt in bounds[1:]] + [len(text)]
    clauses = []
    for order, ((start, code, title), end) in enumerate(zip(bounds, ends)):
        body = text[start:end]
        part = re.match(r'^[A-Z]{2,4}', code).group(0)
        mentioned = {re.match(r'^[A-Z]{1,4}-\d+', hit).group(0)
                     for hit in MENTION_RE.findall(body)}
        mentioned.discard(code)
        clauses.append({
            'code': code,
            'part': part,
            'order': order,
            'title': f"{code} {title}" if title else code,
            'body': body,
            'tok': tok(body),
            'links': sorted(mentioned),
        })
    return clauses
|
||||||
|
|
||||||
|
async def main():
    """Split one parent document into per-clause documents and persist them.

    Reads the parent id from ``sys.argv[1]`` and the optional ``--commit``
    flag from the command line, builds the clauses from the parent's
    md_content and prints size statistics. Without ``--commit`` nothing is
    written. With it, the parent's existing clause documents are deleted and
    one document row plus one 'part' tag per clause are inserted inside a
    single transaction.

    The connection is closed on every exit path, including errors. A parent
    with NULL md_content, or one in which no clause heading is found, is
    reported and leaves the database untouched (the statistics cannot be
    computed on an empty list).
    """
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv
    import asyncpg

    dsn = os.environ['DATABASE_URL'].replace('+asyncpg', '')
    conn = await asyncpg.connect(dsn)
    try:
        row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent)
        if not row:
            print(f"parent {parent} not found")
            return
        clauses = build_clauses(row['md_content'] or '')
        if not clauses:
            print(f"parent={parent} clause_docs=0 (no top-level clause headings found); nothing to persist.")
            return
        toks = [c['tok'] for c in clauses]
        over = [c for c in clauses if c['tok'] > CAP]
        print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} "
              f"max_tok={max(toks)} over_cap={len(over)} total_backlinks={sum(len(c['links']) for c in clauses)}")
        print("sample:", [f"{c['code']}:{c['tok']}t" for c in clauses[:8]])
        if over:
            print("over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])
        if not commit:
            print("DRY-RUN (no write). pass --commit to persist.")
            return

        async with conn.transaction():
            # idempotent: remove prior clause docs of this parent
            # (per the original note this cascades to clause_links/document_tags)
            deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
            print("deleted prior:", deld)
            for c in clauses:
                # Hash is derived from parent id, clause code and body text.
                fh = hashlib.sha256(f"{parent}:{c['code']}:{c['body']}".encode()).hexdigest()
                cid = await conn.fetchval("""
                    INSERT INTO documents
                      (file_format, file_hash, title, md_content, parent_id, doc_kind,
                       clause_code, clause_part, clause_order, ai_domain, data_origin,
                       md_status, review_status, conversion_status, preview_status)
                    VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
                    RETURNING id
                    """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'],
                    row['ai_domain'], row['data_origin'] or 'external')
                # Part tag
                await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') "
                                   "ON CONFLICT DO NOTHING", cid, c['part'])
            n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
        print(f"COMMITTED: {n} clause docs for parent {parent}")
    finally:
        await conn.close()
|
||||||
|
|
||||||
|
# Run only when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Reference in New Issue
Block a user