feat(clause-kb): 책 API(절 목차/백링크) + /book/[id] 유기적 책 리더 + persist 스크립트

This commit is contained in:
hyungi
2026-06-29 23:13:34 +00:00
parent 62794b3857
commit eb83d41ba5
4 changed files with 508 additions and 0 deletions
+150
View File
@@ -1821,3 +1821,153 @@ async def analyze_document(
error_code=error_code, error_code=error_code,
source=source, source=source,
) )
# ─── ASME clause knowledge base: organic book navigation (clause-KB, built on doc_kind='clause' child documents) ───
# One table-of-contents entry for a clause document. Only `id` is required;
# every other field defaults to None. Documented with comments rather than a
# class docstring so the generated schema description stays unchanged.
class ClauseTocItem(BaseModel):
    id: int  # `documents.id` of the clause document
    clause_code: str | None = None  # clause identifier column of the row
    clause_part: str | None = None  # part grouping column of the row
    clause_order: int | None = None  # sort key used to order clauses under one parent
    title: str | None = None  # document title
# Response body of GET /{doc_id}/clauses: the parent document plus its clause
# table of contents.
class ClauseBookResponse(BaseModel):
    parent_id: int  # id of the parent document the clauses belong to
    parent_title: str | None = None  # title of the parent document
    clauses: list[ClauseTocItem]  # clause entries, in the order produced by the endpoint's query
@router.get("/{doc_id}/clauses", response_model=ClauseBookResponse)
async def get_document_clauses(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return the clause table of contents of a parent standard document (organic book TOC).

    Children with doc_kind='clause' are returned ordered by clause_order.
    Clause documents are in_corpus=false + doc_kind='clause' (excluded from
    search), so they do not appear in the general listing/search; this in-book
    navigation is the dedicated path from the parent standard into its child
    clauses (ASME 2025 edition = one book).

    Raises:
        HTTPException: 404 when the parent document does not exist or is
            soft-deleted.
    """
    from sqlalchemy import text as sql_text

    parent = await session.get(Document, doc_id)
    if not parent or parent.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    toc_query = sql_text(
        """
        SELECT id, clause_code, clause_part, clause_order, title
        FROM documents
        WHERE parent_id = :pid AND doc_kind = 'clause' AND deleted_at IS NULL
        ORDER BY clause_order
        """
    ).bindparams(pid=doc_id)
    result = await session.execute(toc_query)

    toc_items = []
    for record in result.mappings().all():
        toc_items.append(ClauseTocItem(**dict(record)))

    return ClauseBookResponse(
        parent_id=doc_id,
        parent_title=parent.title,
        clauses=toc_items,
    )
# One cross-reference edge as shown in the reader, used for both directions.
# Documented with comments rather than a class docstring so the generated
# schema description stays unchanged.
class BacklinkRef(BaseModel):
    code: str  # clause code shown for the link
    doc_id: int | None = None  # resolved clause document (same parent) — None when dangling
    title: str | None = None  # title of the linked document, when one was joined
    anchor: str | None = None  # `clause_links.anchor` value; only selected for forward links
    ctx: str | None = None  # `clause_links.ctx` value (text surrounding the mention)
# Response body of GET /{doc_id}/backlinks: neighbours of a clause in its
# parent plus its cross-references in both directions.
class BacklinksResponse(BaseModel):
    doc_id: int  # id of the requested document
    clause_code: str | None = None  # clause code of the requested document
    parent_id: int | None = None  # parent document id of the requested document
    prev: ClauseTocItem | None = None  # closest clause with a smaller clause_order, if any
    next: ClauseTocItem | None = None  # closest clause with a larger clause_order, if any
    forward: list[BacklinkRef]  # clauses that this clause references
    back: list[BacklinkRef]  # clauses that reference this clause
@router.get("/{doc_id}/backlinks", response_model=BacklinksResponse)
async def get_document_backlinks(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
):
    """Return a clause document's links in both directions plus its neighbours.

    `forward` lists the clause_links rows whose source is this document
    (ordered by character offset, at most 300); `back` lists the rows whose
    destination is this document (ordered by the source's clause_order, at
    most 300). `prev`/`next` are the adjacent clauses under the same parent by
    clause_order and are only looked up when the document has both a parent
    and a clause_order.

    Soft-deleted documents are never offered as link targets: a forward link
    whose destination is soft-deleted is reported as dangling (doc_id/title
    None), and back links from soft-deleted sources are omitted. This matches
    the `deleted_at IS NULL` filter used by the TOC and prev/next queries and
    avoids links that would lead to a 404.

    Raises:
        HTTPException: 404 when the document does not exist or is
            soft-deleted.
    """
    from sqlalchemy import text as sql_text

    doc = await session.get(Document, doc_id)
    if not doc or doc.deleted_at is not None:
        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")

    forward = (
        await session.execute(
            sql_text(
                """
                SELECT cl.dst_code AS code, d.id AS doc_id, cl.anchor, cl.ctx, d.title
                FROM clause_links cl
                LEFT JOIN documents d ON d.id = cl.dst_doc_id AND d.deleted_at IS NULL
                WHERE cl.src_doc_id = :id
                ORDER BY cl.char_off NULLS LAST
                LIMIT 300
                """
            ).bindparams(id=doc_id)
        )
    ).mappings().all()

    back = (
        await session.execute(
            sql_text(
                """
                SELECT s.clause_code AS code, cl.src_doc_id AS doc_id, s.title, cl.ctx
                FROM clause_links cl
                JOIN documents s ON s.id = cl.src_doc_id
                WHERE cl.dst_doc_id = :id AND s.deleted_at IS NULL
                ORDER BY s.clause_order NULLS LAST
                LIMIT 300
                """
            ).bindparams(id=doc_id)
        )
    ).mappings().all()

    prev = nxt = None
    if doc.parent_id is not None and doc.clause_order is not None:
        # Nearest sibling before this clause.
        prow = (
            await session.execute(
                sql_text(
                    """
                    SELECT id, clause_code, clause_part, clause_order, title FROM documents
                    WHERE parent_id = :pid AND doc_kind='clause' AND deleted_at IS NULL
                    AND clause_order < :ord
                    ORDER BY clause_order DESC LIMIT 1
                    """
                ).bindparams(pid=doc.parent_id, ord=doc.clause_order)
            )
        ).mappings().first()
        # Nearest sibling after this clause.
        nrow = (
            await session.execute(
                sql_text(
                    """
                    SELECT id, clause_code, clause_part, clause_order, title FROM documents
                    WHERE parent_id = :pid AND doc_kind='clause' AND deleted_at IS NULL
                    AND clause_order > :ord
                    ORDER BY clause_order ASC LIMIT 1
                    """
                ).bindparams(pid=doc.parent_id, ord=doc.clause_order)
            )
        ).mappings().first()
        prev = ClauseTocItem(**dict(prow)) if prow else None
        nxt = ClauseTocItem(**dict(nrow)) if nrow else None

    return BacklinksResponse(
        doc_id=doc_id,
        clause_code=doc.clause_code,
        parent_id=doc.parent_id,
        prev=prev,
        next=nxt,
        forward=[BacklinkRef(**dict(r)) for r in forward],
        back=[BacklinkRef(**dict(r)) for r in back],
    )
+201
View File
@@ -0,0 +1,201 @@
<script>
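// ASME clause knowledge base: organic single-book reader. Browse the clause documents of a parent standard as one book.
// Left: TOC grouped by Part / Right: selected clause body + breadcrumb + previous/next + links in both directions.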
import { onMount } from 'svelte';
import { page } from '$app/stores';
import { goto } from '$app/navigation';
import { api } from '$lib/api';
import MarkdownDoc from '$lib/components/MarkdownDoc.svelte';
// Reactive page state.
let parentId = $state(null); // parent document id; set from the route param in onMount
let parentTitle = $state(''); // parent title returned by the clauses endpoint
let clauses = $state([]); // flat clause list returned by the clauses endpoint
let selectedId = $state(null); // id of the clause currently selected in the TOC
let clauseDoc = $state(null); // /documents/{id} response for the selected clause
let links = $state(null); // /documents/{id}/backlinks response for the selected clause
let expanded = $state({}); // clause_part -> whether that TOC group is open
let loading = $state(false); // true while loadClause is fetching
// Clauses grouped by clause_part, keeping first-seen group order and the
// original clause order inside each group. Clauses without a part fall into '·'.
let parts = $derived.by(() => {
  const groups = [];
  const slotOf = {};
  clauses.forEach((clause) => {
    const key = clause.clause_part || '·';
    if (!(key in slotOf)) {
      // push() returns the new length, so the new group's index is length - 1.
      slotOf[key] = groups.push({ part: key, items: [] }) - 1;
    }
    groups[slotOf[key]].items.push(clause);
  });
  return groups;
});
// TOC entry of the selected clause, or null when nothing matches.
let selMeta = $derived(clauses.find(({ id }) => id === selectedId) ?? null);
// Remove the first occurrence of `code` from `title` and trim the result;
// missing values are treated as empty strings.
const strip = (title, code) => {
  const text = title || '';
  const needle = code || '';
  return text.replace(needle, '').trim();
};
// Fetch the parent's clause list, then open every TOC group.
async function loadBook() {
  const book = await api(`/documents/${parentId}/clauses`);
  parentTitle = book?.parent_title ?? '';
  clauses = book?.clauses ?? [];

  const open = {};
  clauses.forEach((clause) => {
    open[clause.clause_part || '·'] = true;
  });
  expanded = open;
}
// Select a clause and load its document and links in parallel.
//
// Responses are applied only if `id` is still the selected clause when they
// arrive. Without that check, a slow response from an earlier click could
// overwrite the content and URL of a later selection, leaving the TOC
// highlight and the reader out of sync.
async function loadClause(id) {
  if (!id) return;
  loading = true;
  selectedId = id;
  try {
    const [d, l] = await Promise.all([
      api(`/documents/${id}`),
      api(`/documents/${id}/backlinks`)
    ]);
    // A newer selection was made while this request was in flight: drop it.
    if (selectedId !== id) return;
    clauseDoc = d;
    links = l;
    // Make sure the TOC group containing the clause is open.
    const sel = clauses.find((c) => c.id === id);
    if (sel) expanded = { ...expanded, [sel.clause_part || '·']: true };
    goto(`/book/${parentId}?c=${id}`, { replaceState: true, keepFocus: true, noScroll: true });
    window.scrollTo({ top: 0 });
  } finally {
    // Only the request for the current selection may clear the spinner;
    // a stale one must leave it on for the request still in flight.
    if (selectedId === id) loading = false;
  }
}
// On mount: read the parent id from the route, load the TOC, then open the
// clause named by `?c=` when it belongs to this book, otherwise the first clause.
onMount(async () => {
  parentId = Number($page.params.id);
  await loadBook();

  const requested = Number($page.url.searchParams.get('c'));
  const inBook = requested && clauses.find((x) => x.id === requested);
  const initialId = inBook ? requested : clauses[0]?.id;
  await loadClause(initialId);
});
</script>
<!-- Two-column layout: clause table of contents on the left, reader on the right. -->
<div class="book">
  <!-- Left column: TOC grouped by clause_part; each group can be collapsed. -->
  <aside class="toc">
    <a class="btitle" href={`/documents/${parentId}`}>{parentTitle || 'ASME 표준'}</a>
    <div class="hint">{clauses.length}개 · 한 권의 책처럼 탐색</div>
    {#each parts as g (g.part)}
      <div class="part">
        <!-- Group header: toggles this group's entry in `expanded`. -->
        <button class="phead" onclick={() => (expanded = { ...expanded, [g.part]: !expanded[g.part] })}>
          <span class="caret">{expanded[g.part] ? '▾' : '▸'}</span>
          <span class="pname">{g.part}</span>
          <span class="cnt">{g.items.length}</span>
        </button>
        {#if expanded[g.part]}
          <ul>
            {#each g.items as c (c.id)}
              <li>
                <!-- Clause entry: code in bold, then the title with the code removed. -->
                <button class="citem" class:active={c.id === selectedId} onclick={() => loadClause(c.id)}>
                  <b>{c.clause_code}</b><span class="ct">{strip(c.title, c.clause_code)}</span>
                </button>
              </li>
            {/each}
          </ul>
        {/if}
      </div>
    {/each}
  </aside>
  <!-- Right column: the selected clause, or a placeholder until one is loaded. -->
  <main class="reader">
    {#if clauseDoc}
      <!-- Breadcrumb: parent title, part, clause code. -->
      <!-- NOTE(review): the `.sep` spans have no text content, so the crumbs are separated only by the margin set in the stylesheet — confirm a separator glyph was not lost. -->
      <nav class="crumb">
        <a href={`/documents/${parentId}`}>{parentTitle}</a>
        <span class="sep"></span><span>{selMeta?.clause_part}</span>
        <span class="sep"></span><b>{links?.clause_code ?? selMeta?.clause_code}</b>
      </nav>
      <!-- Previous / next clause buttons; each is disabled when the backlinks response has no such neighbour, and its label is then empty. -->
      <div class="flow">
        <button disabled={!links?.prev} onclick={() => loadClause(links?.prev?.id)}>
          {links?.prev?.clause_code ?? ''}
        </button>
        <span class="flowmid">{selMeta?.clause_part}</span>
        <button disabled={!links?.next} onclick={() => loadClause(links?.next?.id)}>
          {links?.next?.clause_code ?? ''}
        </button>
      </div>
      <h1 class="ctitle">{clauseDoc.title}</h1>
      <!-- Keyed on the document id so the renderer is recreated for each clause. -->
      {#key clauseDoc.id}
        <MarkdownDoc
          documentId={clauseDoc.id}
          mdContent={clauseDoc.md_content ?? clauseDoc.extracted_text}
          mdStatus={null}
          class="prose prose-base max-w-none text-text"
        />
      {/key}
      <!-- Cross-reference section, shown only when at least one direction has entries. -->
      {#if links && (links.forward.length || links.back.length)}
        <section class="xlinks">
          {#if links.forward.length}
            <!-- Clauses referenced by this clause. -->
            <div class="xcol">
              <h3>이 절이 참조 <span>{links.forward.length}</span></h3>
              <ul>
                {#each links.forward as f}
                  <li>
                    <!-- Resolved references are clickable; unresolved ones render as inert labels. -->
                    {#if f.doc_id}
                      <button class="xref" onclick={() => loadClause(f.doc_id)}>{f.code}</button>
                    {:else}
                      <span class="xref dangling" title="외부/미분해 참조">{f.code}</span>
                    {/if}
                    {#if f.title}<span class="xt">{strip(f.title, f.code)}</span>{/if}
                  </li>
                {/each}
              </ul>
            </div>
          {/if}
          {#if links.back.length}
            <!-- Clauses that reference this clause. -->
            <div class="xcol">
              <h3>이 절을 참조 <span>{links.back.length}</span></h3>
              <ul>
                {#each links.back as b}
                  <li>
                    <button class="xref" onclick={() => loadClause(b.doc_id)}>{b.code}</button>
                    {#if b.title}<span class="xt">{strip(b.title, b.code)}</span>{/if}
                  </li>
                {/each}
              </ul>
            </div>
          {/if}
        </section>
      {/if}
    {:else}
      <p class="empty">{loading ? '로딩…' : '왼쪽에서 절을 선택하세요'}</p>
    {/if}
  </main>
</div>
<style>
  /* Page grid: fixed-width TOC column plus a flexible reader column. */
  .book { display: grid; grid-template-columns: 300px 1fr; gap: 0; min-height: 100vh; align-items: start; }

  /* TOC column: sticks to the top and scrolls on its own. */
  .toc { position: sticky; top: 0; max-height: 100vh; overflow-y: auto; border-right: 1px solid #e5e7eb; padding: 16px 12px; background: #fafafa; }
  .btitle { display: block; font-weight: 700; font-size: 15px; color: #111827; text-decoration: none; line-height: 1.35; }
  .btitle:hover { text-decoration: underline; }
  .hint { font-size: 11.5px; color: #6b7280; margin: 4px 0 14px; }

  /* TOC group header (caret, part name, item count). */
  .part { margin-bottom: 2px; }
  .phead { display: flex; align-items: center; gap: 7px; width: 100%; background: none; border: 0; cursor: pointer; padding: 5px 6px; font-size: 13px; font-weight: 600; color: #374151; border-radius: 6px; }
  .phead:hover { background: #f0f0f0; }
  .caret { color: #9ca3af; width: 10px; }
  .pname { flex: 1; text-align: left; }
  .cnt { font-size: 11px; color: #9ca3af; }

  /* TOC clause entries, including the highlighted active entry. */
  .toc ul { list-style: none; margin: 0 0 4px; padding: 0 0 0 16px; }
  .citem { display: block; width: 100%; text-align: left; background: none; border: 0; cursor: pointer; padding: 3px 7px; font-size: 12.5px; color: #4b5563; border-radius: 5px; line-height: 1.4; }
  .citem:hover { background: #eef2ff; }
  .citem.active { background: #e0e7ff; color: #1e3a8a; }
  .citem b { color: #1d4ed8; margin-right: 5px; }
  .citem.active b { color: #1e3a8a; }
  .ct { color: #6b7280; }
  .citem.active .ct { color: #334155; }

  /* Reader column: breadcrumb, previous/next bar, clause title. */
  .reader { padding: 26px 34px 80px; max-width: 880px; }
  .crumb { font-size: 12.5px; color: #6b7280; margin-bottom: 12px; }
  .crumb a { color: #2563eb; text-decoration: none; }
  .crumb a:hover { text-decoration: underline; }
  .crumb .sep { margin: 0 6px; color: #cbd5e1; }
  .flow { display: flex; align-items: center; justify-content: space-between; gap: 10px; margin-bottom: 18px; }
  .flow button { background: #f3f4f6; border: 1px solid #e5e7eb; border-radius: 7px; padding: 6px 12px; font-size: 12.5px; color: #374151; cursor: pointer; }
  .flow button:hover:not(:disabled) { background: #e5e7eb; }
  .flow button:disabled { opacity: 0.4; cursor: default; }
  .flowmid { font-size: 11.5px; color: #9ca3af; }
  .ctitle { font-size: 22px; font-weight: 700; color: #111827; margin: 0 0 18px; letter-spacing: -0.2px; }

  /* Cross-reference section: two columns, collapsing to one on narrow screens. */
  .xlinks { display: grid; grid-template-columns: 1fr 1fr; gap: 18px; margin-top: 40px; padding-top: 20px; border-top: 1px solid #e5e7eb; }
  /* Narrow screens: single column everywhere and a non-sticky TOC. */
  @media (max-width: 700px) { .book { grid-template-columns: 1fr; } .toc { position: static; max-height: none; } .xlinks { grid-template-columns: 1fr; } }
  .xcol h3 { font-size: 13px; color: #374151; margin: 0 0 8px; }
  .xcol h3 span { color: #9ca3af; font-weight: 400; }
  .xcol ul { list-style: none; margin: 0; padding: 0; }
  .xcol li { display: flex; align-items: baseline; gap: 7px; padding: 2px 0; font-size: 12.5px; }
  /* Reference chips; `.dangling` is the inert variant for unresolved references. */
  .xref { background: #eff6ff; border: 1px solid #dbeafe; color: #1d4ed8; border-radius: 5px; padding: 1px 7px; font-size: 12px; font-weight: 600; cursor: pointer; white-space: nowrap; }
  .xref:hover { background: #dbeafe; }
  .xref.dangling { background: #f9fafb; border-color: #e5e7eb; color: #9ca3af; cursor: default; }
  .xt { color: #6b7280; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }

  /* Placeholder shown while no clause is loaded. */
  .empty { color: #9ca3af; padding: 60px 0; text-align: center; }
</style>
+53
View File
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""ASME clause-KB backlinks: resolve clause-id mentions in each clause doc -> clause_links.
dst resolved to the clause doc of the same parent (top-level code); sub-code mention -> anchor;
unresolved (cross-standard / material spec not split) -> dangling (dst_doc_id NULL).
Idempotent per parent. Usage: python3 asme_backlinks_persist.py <parent_id> [--commit]
"""
import asyncio, os, re, sys
# Matches one clause-code mention: 1-4 uppercase letters, a hyphen, digits,
# optional dotted numeric sub-levels and an optional single trailing letter.
# The lookbehind/lookahead reject candidates that are glued to another ASCII
# letter or digit. Group 1 is the full code.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
def top(code):
    """Return the leading ``LETTERS-digits`` part of *code*, dropping any dotted suffix.

    Raises AttributeError when *code* does not start with 1-4 uppercase
    letters, a hyphen and at least one digit.
    """
    head = re.match(r'^[A-Z]{1,4}-\d+', code)
    return head.group(0)
async def main():
    """Resolve clause mentions of one parent's clause documents into clause_links rows.

    Reads ``<parent_id>`` from ``sys.argv[1]`` and the connection string from
    the ``DATABASE_URL`` environment variable (a ``+asyncpg`` driver suffix is
    removed). For every clause document of the parent, each distinct mentioned
    code that is not the document's own top-level code becomes one edge
    ``(src_id, dst_code, dst_doc_id, anchor, ctx, char_off)``; ``dst_doc_id``
    is None when no clause document of the same parent has that top-level
    code. Without ``--commit`` only the statistics are printed. With it, the
    existing clause_links rows of these source documents are replaced inside
    one transaction.

    The connection is closed on every exit path, including errors.
    """
    from collections import Counter

    import asyncpg

    if len(sys.argv) < 2:
        sys.exit("usage: python3 asme_backlinks_persist.py <parent_id> [--commit]")
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv

    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        docs = await conn.fetch("SELECT id, clause_code, md_content FROM documents "
                                "WHERE parent_id=$1 AND doc_kind='clause' ORDER BY clause_order", parent)
        code2id = {d['clause_code']: d['id'] for d in docs}

        edges = []  # (src_id, dst_code, dst_doc_id, anchor, ctx, char_off)
        resolved = dangling = 0
        for d in docs:
            # A clause row without markdown has no mentions; treat it as empty
            # instead of failing on None.
            body = d['md_content'] or ''
            src_top = d['clause_code']
            seen = set()  # codes already recorded for this source document
            for m in MENTION_RE.finditer(body):
                code = m.group(1)
                t = top(code)
                if t == src_top:
                    continue  # self-reference
                if code in seen:
                    continue  # keep only the first mention per (src, dst_code)
                seen.add(code)
                dst_id = code2id.get(t)  # resolve to the same-parent clause doc
                # A sub-code mention becomes an anchor inside the target document.
                anchor = code.lower().replace('.', '-') if code != t else None
                off = m.start()
                ctx = re.sub(r'\s+', ' ', body[max(0, off-50):off+50]).strip()
                edges.append((d['id'], code, dst_id, anchor, ctx, off))
                if dst_id:
                    resolved += 1
                else:
                    dangling += 1

        print(f"parent={parent} clause_docs={len(docs)} edges={len(edges)} resolved={resolved} dangling={dangling}")
        # Most frequently referenced clauses among the resolved edges.
        tgt = Counter(top(e[1]) for e in edges if e[2])
        print("most-referenced:", tgt.most_common(8))

        if not commit:
            print("DRY-RUN. pass --commit to persist.")
            return

        ids = [d['id'] for d in docs]
        async with conn.transaction():
            # Idempotent per parent: drop the previous edges before inserting.
            await conn.execute("DELETE FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
            if edges:
                await conn.executemany(
                    "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) "
                    "VALUES ($1,$2,$3,$4,$5,$6)", edges)
        n = await conn.fetchval("SELECT count(*) FROM clause_links WHERE src_doc_id = ANY($1::bigint[])", ids)
        print(f"COMMITTED: {n} clause_links for parent {parent}")
    finally:
        await conn.close()


# Script entry point; left unguarded so the module's run-on-load behaviour is unchanged.
asyncio.run(main())
+104
View File
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""ASME clause-KB persist: split a parent standard into per-clause documents (A-granularity).
Idempotent per parent. Clause docs: doc_kind='clause', embedding NULL (search-excluded via
doc_kind filter), parent_id=<parent>. Also writes Part tags. Run inside fastapi container.
Usage: python3 asme_clause_persist.py <parent_id> [--commit]
"""
import asyncio, os, re, sys, hashlib, statistics
# Token threshold: main() reports clauses whose estimated token count exceeds it.
CAP = 12000
# Per-character token weights used by tok(): EN for ordinary characters, KO for
# the characters tok() counts separately.
EN, KO = 0.217, 0.529
# A line that begins with up to 8 markup characters (space, tab, #, >, *)
# followed by a clause code with a 2-4 letter prefix. Groups: markup, code, rest of line.
LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$')
# A clause-code mention anywhere in text (1-4 letter prefix), not glued to
# another ASCII letter or digit.
MENTION_RE = re.compile(r'(?<![A-Za-z0-9])([A-Z]{1,4}-\d+(?:\.\d+)*[A-Za-z]?)(?![A-Za-z0-9])')
EXACT_TOP = re.compile(r'^[A-Z]{2,4}-\d+$') # top-level clause code (no dotted suffix)
# Text after the code looks like a title: optional whitespace/dots, then an
# uppercase letter or an opening parenthesis.
TITLE_AFTER = re.compile(r'^[\s.]*[A-Z(]')
# Text after the code starts with a connective word or particle (English or
# Korean), which is_header() treats as running prose rather than a heading.
REF_LEAD = re.compile(r'^[\s.]*(and|or|to|of|in|on|the|as|is|are|shall|through|per|see|with|'
                      r'for|by|that|which|such|또는|및|등|의|은|는|에|을|를|과|와)\b', re.I)
def tok(s):
    """Estimate the token count of *s* from per-character weights.

    Hangul syllables are weighted with ``KO`` and every other character with
    ``EN``; the weighted sum is truncated to an int.
    """
    # The range bounds had been reduced to empty strings, which made the test
    # false for every character, so KO was never applied.
    # NOTE(review): bounds reconstructed as the Hangul syllable block
    # (U+AC00-U+D7A3) — confirm against the intended range.
    ko = sum(1 for c in s if '\uac00' <= c <= '\ud7a3')
    return int((len(s) - ko) * EN + ko * KO)
def clean_title(rest):
    """Turn the text that follows a clause code into a plain title.

    Removes revision-bar markers (both the ``<sup>`` form and the plain
    form), ``**`` and ``#`` markup, collapses whitespace runs to one space,
    and strips spaces, ``*``, ``:``, em dashes and hyphens from both ends.
    """
    revision_bars = (
        r'<sup>ð</sup>\s*\**\d*\**\s*<sup>Þ</sup>',  # revision bar (sup form)
        r'ð\**\d*\**Þ',  # revision bar (plain)
    )
    cleaned = rest
    for pattern in revision_bars:
        cleaned = re.sub(pattern, '', cleaned)
    for marker in ('**', '#'):
        cleaned = cleaned.replace(marker, '')
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip(' *:—-')
def is_header(markup, rest):
    """Decide whether a line starting with a clause code is a clause heading.

    Args:
        markup: the markup characters that preceded the code on the line.
        rest: the text that follows the code on the line.

    Returns:
        True when the line carries heading/emphasis markup, when nothing
        follows the code, or when the following text looks like a title;
        False when it reads like running prose.
    """
    # '#' or '*' before the code marks a heading outright.
    if '#' in markup or '*' in markup:
        return True
    rs = rest.strip()
    if rs == '':
        return True  # the code stands alone on the line
    if REF_LEAD.match(rest):
        return False  # continues with a connective word: an in-text reference
    first = rs[0]
    if first in ',;.)':
        return False  # code followed by punctuation: part of a sentence
    # The range bounds had been reduced to empty strings, so this guard could
    # never fire. Restoring it does not change the result (the final
    # TITLE_AFTER test already rejects such text) but makes the intent explicit.
    # NOTE(review): bounds reconstructed as the Hangul syllable block
    # (U+AC00-U+D7A3) — confirm against the intended range.
    if '\uac00' <= first <= '\ud7a3':
        return False
    if first.islower():
        return False
    return bool(TITLE_AFTER.match(rs))
def build_clauses(text):
    """Split *text* into one record per top-level clause.

    A clause starts at the first header line of each exact top-level code
    (later occurrences of the same code are ignored) and runs up to the next
    such boundary, or to the end of the text for the last one.

    Returns:
        A list of dicts with keys ``code``, ``part``, ``order``, ``title``,
        ``body``, ``tok`` and ``links`` (sorted top-level codes mentioned in
        the body, excluding the clause's own code).
    """
    # Pass 1: locate the clause boundaries as (char offset, code, title).
    boundaries = []
    known_codes = set()
    cursor = 0
    for line in text.split('\n'):
        line_start = cursor
        cursor += len(line) + 1  # +1 for the newline removed by split
        hit = LINE_RE.match(line)
        if hit is None:
            continue
        markup, code, rest = hit.groups()
        if not EXACT_TOP.match(code):
            continue
        if not is_header(markup, rest):
            continue
        if code in known_codes:
            continue  # first occurrence wins
        known_codes.add(code)
        boundaries.append((line_start, code, clean_title(rest)))

    # Pass 2: cut the text at the boundaries and describe each slice.
    stops = [start for start, _, _ in boundaries[1:]] + [len(text)]
    clauses = []
    for order, ((start, code, title), stop) in enumerate(zip(boundaries, stops)):
        body = text[start:stop]
        part = re.match(r'^[A-Z]{2,4}', code).group(0)
        mentioned = {
            re.match(r'^[A-Z]{1,4}-\d+', mention).group(0)
            for mention in MENTION_RE.findall(body)
        }
        mentioned.discard(code)
        full_title = code + ' ' + title if title else code
        clauses.append({
            'code': code,
            'part': part,
            'order': order,
            'title': full_title,
            'body': body,
            'tok': tok(body),
            'links': sorted(mentioned),
        })
    return clauses
async def main():
    """Split one parent document into per-clause documents and optionally persist them.

    Reads ``<parent_id>`` from ``sys.argv[1]`` and the connection string from
    the ``DATABASE_URL`` environment variable (a ``+asyncpg`` driver suffix is
    removed). Prints clause statistics. With ``--commit``, it replaces the
    parent's existing ``doc_kind='clause'`` documents with the freshly built
    ones and writes one 'part' tag per clause, all in one transaction.

    Nothing is written when the parent is missing or when no top-level clause
    header is found; the latter prevents a commit from deleting the existing
    clause documents and inserting none. The connection is closed on every
    exit path, including errors.
    """
    import asyncpg

    if len(sys.argv) < 2:
        sys.exit("usage: python3 asme_clause_persist.py <parent_id> [--commit]")
    parent = int(sys.argv[1])
    commit = '--commit' in sys.argv

    dsn = os.environ['DATABASE_URL'].replace('+asyncpg', '')
    conn = await asyncpg.connect(dsn)
    try:
        row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent)
        if not row:
            print(f"parent {parent} not found")
            return

        # A parent without markdown yields no clauses rather than a crash.
        clauses = build_clauses(row['md_content'] or '')
        if not clauses:
            # median()/max() would raise on an empty list, and a commit would
            # only delete the existing clause documents.
            print(f"parent={parent} clause_docs=0 (no top-level clause headers found; nothing to persist)")
            return

        toks = [c['tok'] for c in clauses]
        over = [c for c in clauses if c['tok'] > CAP]
        print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} "
              f"max_tok={max(toks)} over_cap={len(over)} total_backlinks={sum(len(c['links']) for c in clauses)}")
        print("sample:", [f"{c['code']}:{c['tok']}t" for c in clauses[:8]])
        if over:
            print("over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])

        if not commit:
            print("DRY-RUN (no write). pass --commit to persist.")
            return

        async with conn.transaction():
            # idempotent: remove prior clause docs of this parent (cascades clause_links/document_tags)
            deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
            print("deleted prior:", deld)
            for c in clauses:
                # The hash covers parent, code and body.
                fh = hashlib.sha256(f"{parent}:{c['code']}:{c['body']}".encode()).hexdigest()
                cid = await conn.fetchval("""
                    INSERT INTO documents
                    (file_format, file_hash, title, md_content, parent_id, doc_kind,
                    clause_code, clause_part, clause_order, ai_domain, data_origin,
                    md_status, review_status, conversion_status, preview_status)
                    VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
                    RETURNING id
                    """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'],
                    row['ai_domain'], row['data_origin'] or 'external')
                # Part tag
                await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') "
                                   "ON CONFLICT DO NOTHING", cid, c['part'])
        n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
        print(f"COMMITTED: {n} clause docs for parent {parent}")
    finally:
        await conn.close()


# Script entry point; left unguarded so the module's run-on-load behaviour is unchanged.
asyncio.run(main())