diff --git a/app/api/documents.py b/app/api/documents.py
index 8160f32..94f2090 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -1821,3 +1821,179 @@ async def analyze_document(
         error_code=error_code,
         source=source,
     )
+
+
+# ─── ASME clause knowledge base: organic in-book navigation (clause-KB, built on doc_kind='clause' child documents) ───
+# One table-of-contents row for a clause document (same columns the TOC queries select).
+class ClauseTocItem(BaseModel):
+    id: int
+    clause_code: str | None = None
+    clause_part: str | None = None
+    clause_order: int | None = None
+    title: str | None = None
+
+
+# Payload of GET /{doc_id}/clauses: the parent document and its ordered clause TOC.
+class ClauseBookResponse(BaseModel):
+    parent_id: int
+    parent_title: str | None = None
+    clauses: list[ClauseTocItem]
+
+
+@router.get("/{doc_id}/clauses", response_model=ClauseBookResponse)
+async def get_document_clauses(
+    doc_id: int,
+    user: Annotated[User, Depends(get_current_user)],
+    session: Annotated[AsyncSession, Depends(get_session)],
+):
+    """Return the clause table of contents (in-book TOC) of a parent standard document.
+
+    Lists the doc_kind='clause' children of ``doc_id`` ordered by clause_order.
+    Clause documents are stored with in_corpus=false and doc_kind='clause' (excluded
+    from search), so they never appear in the regular list/search results; this in-book
+    navigation is the dedicated path from a parent standard into its child clauses
+    (ASME 2025 edition = one book).
+
+    Raises:
+        HTTPException: 404 when the parent document is missing or soft-deleted.
+    """
+    from sqlalchemy import text as sql_text
+
+    parent = await session.get(Document, doc_id)
+    if not parent or parent.deleted_at is not None:
+        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
+    rows = (
+        await session.execute(
+            sql_text(
+                """
+                SELECT id, clause_code, clause_part, clause_order, title
+                FROM documents
+                WHERE parent_id = :pid AND doc_kind = 'clause' AND deleted_at IS NULL
+                ORDER BY clause_order, id
+                """
+            ).bindparams(pid=doc_id)
+        )
+    ).mappings().all()
+    return ClauseBookResponse(
+        parent_id=doc_id,
+        parent_title=parent.title,
+        clauses=[ClauseTocItem(**dict(r)) for r in rows],
+    )
+
+
+# One cross-reference between clauses; the same shape serves both link directions.
+class BacklinkRef(BaseModel):
+    code: str
+    doc_id: int | None = None  # resolved clause document (same parent); None when dangling
+    title: str | None = None
+    anchor: str | None = None
+    ctx: str | None = None
+
+
+# Payload of GET /{doc_id}/backlinks.
+class BacklinksResponse(BaseModel):
+    doc_id: int
+    clause_code: str | None = None
+    parent_id: int | None = None
+    prev: ClauseTocItem | None = None
+    next: ClauseTocItem | None = None
+    forward: list[BacklinkRef]  # clauses this clause references
+    back: list[BacklinkRef]  # clauses that reference this clause
+
+
+# Nearest sibling clause before / after a given clause_order under the same parent.
+# The id tie-breaker keeps the pick deterministic when siblings share a clause_order.
+_PREV_CLAUSE_SQL = """
+    SELECT id, clause_code, clause_part, clause_order, title FROM documents
+    WHERE parent_id = :pid AND doc_kind='clause' AND deleted_at IS NULL
+      AND clause_order < :ord
+    ORDER BY clause_order DESC, id DESC LIMIT 1
+"""
+_NEXT_CLAUSE_SQL = """
+    SELECT id, clause_code, clause_part, clause_order, title FROM documents
+    WHERE parent_id = :pid AND doc_kind='clause' AND deleted_at IS NULL
+      AND clause_order > :ord
+    ORDER BY clause_order ASC, id ASC LIMIT 1
+"""
+
+
+async def _fetch_adjacent_clause(session, sql, parent_id, clause_order):
+    """Run one adjacent-clause query and return its row as a ClauseTocItem, or None."""
+    from sqlalchemy import text as sql_text
+
+    row = (
+        await session.execute(sql_text(sql).bindparams(pid=parent_id, ord=clause_order))
+    ).mappings().first()
+    return ClauseTocItem(**dict(row)) if row else None
+
+
+@router.get("/{doc_id}/backlinks", response_model=BacklinksResponse)
+async def get_document_backlinks(
+    doc_id: int,
+    user: Annotated[User, Depends(get_current_user)],
+    session: Annotated[AsyncSession, Depends(get_session)],
+):
+    """Return a clause document's links in both directions plus its previous/next sibling clause.
+
+    ``forward`` holds the clauses this document references (ordered by char_off, at most 300);
+    ``back`` holds the clauses that reference it (ordered by clause_order, at most 300).
+    Soft-deleted documents are hidden like in the TOC queries: a deleted link target is
+    reported as dangling (doc_id and title None) and a deleted source clause is omitted.
+
+    Raises:
+        HTTPException: 404 when the document is missing or soft-deleted.
+    """
+    from sqlalchemy import text as sql_text
+
+    doc = await session.get(Document, doc_id)
+    if not doc or doc.deleted_at is not None:
+        raise HTTPException(status_code=404, detail="문서를 찾을 수 없습니다")
+
+    forward = (
+        await session.execute(
+            sql_text(
+                """
+                SELECT cl.dst_code AS code, d.id AS doc_id, cl.anchor, cl.ctx, d.title
+                FROM clause_links cl
+                LEFT JOIN documents d ON d.id = cl.dst_doc_id AND d.deleted_at IS NULL
+                WHERE cl.src_doc_id = :id
+                ORDER BY cl.char_off NULLS LAST
+                LIMIT 300
+                """
+            ).bindparams(id=doc_id)
+        )
+    ).mappings().all()
+    back = (
+        await session.execute(
+            sql_text(
+                """
+                SELECT s.clause_code AS code, cl.src_doc_id AS doc_id, s.title, cl.ctx
+                FROM clause_links cl
+                JOIN documents s ON s.id = cl.src_doc_id
+                WHERE cl.dst_doc_id = :id AND s.deleted_at IS NULL
+                ORDER BY s.clause_order NULLS LAST
+                LIMIT 300
+                """
+            ).bindparams(id=doc_id)
+        )
+    ).mappings().all()
+
+    prev = nxt = None
+    # Sibling navigation only applies to documents that sit inside a clause sequence.
+    if doc.parent_id is not None and doc.clause_order is not None:
+        prev = await _fetch_adjacent_clause(
+            session, _PREV_CLAUSE_SQL, doc.parent_id, doc.clause_order
+        )
+        nxt = await _fetch_adjacent_clause(
+            session, _NEXT_CLAUSE_SQL, doc.parent_id, doc.clause_order
+        )
+
+    return BacklinksResponse(
+        doc_id=doc_id,
+        clause_code=doc.clause_code,
+        parent_id=doc.parent_id,
+        prev=prev,
+        next=nxt,
+        forward=[BacklinkRef(**dict(r)) for r in forward],
+        back=[BacklinkRef(**dict(r)) for r in back],
+    )
diff --git a/frontend/src/routes/book/[id]/+page.svelte b/frontend/src/routes/book/[id]/+page.svelte
new file mode 100644
index 0000000..25be284
--- /dev/null
+++ b/frontend/src/routes/book/[id]/+page.svelte
@@ -0,0 +1,201 @@
+
+
+
+ + +
+ {#if clauseDoc} + +
+ + {selMeta?.clause_part} + +
+

{clauseDoc.title}

+ {#key clauseDoc.id} + + {/key} + + {#if links && (links.forward.length || links.back.length)} + + {/if} + {:else} +

{loading ? '로딩…' : '왼쪽에서 절을 선택하세요'}

+ {/if} +
+
+
+
diff --git a/scripts/asme_backlinks_persist.py b/scripts/asme_backlinks_persist.py
new file mode 100644
index 0000000..b410a4c
--- /dev/null
+++ b/scripts/asme_backlinks_persist.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""ASME clause-KB backlinks: resolve clause-id mentions in each clause doc -> clause_links.
+dst resolved to the clause doc of the same parent (top-level code); sub-code mention -> anchor;
+unresolved (cross-standard / material spec not split) -> dangling (dst_doc_id NULL).
+Idempotent per parent. Usage: python3 asme_backlinks_persist.py [--commit]
+"""
+import asyncio, os, re, sys
+
+MENTION_RE = re.compile(r'(?. Also writes Part tags. Run inside fastapi container.
+Usage: python3 asme_clause_persist.py [--commit]
+"""
+import asyncio, os, re, sys, hashlib, statistics
+
+CAP = 12000
+EN, KO = 0.217, 0.529
+LINE_RE = re.compile(r'^([ \t#>*]{0,8})([A-Z]{2,4}-\d+(?:\.\d+)*[A-Za-z]?)(.*)$')
+MENTION_RE = re.compile(r'(?ð\s*\**\d*\**\s*Þ', '', t) # revision bar (sup form)
+    t = re.sub(r'ð\**\d*\**Þ', '', t) # revision bar (plain)
+    t = t.replace('**', '').replace('#', '')
+    t = re.sub(r'\s+', ' ', t).strip(' *:—-')
+    return t
+
+
+def is_header(markup, rest):
+    """Decide whether a line that starts with a clause code opens a clause or is running text.
+
+    Args:
+        markup: Leading markup captured before the code (LINE_RE group 1).
+        rest: Text that follows the code on the same line (LINE_RE group 3).
+
+    Returns:
+        True when the line should be treated as a clause header.
+    """
+    # Heading ('#') or emphasis ('*') markup before the code always counts as a header.
+    if '#' in markup or '*' in markup:
+        return True
+    rs = rest.strip()
+    # The code stands alone on its line.
+    if rs == '':
+        return True
+    # REF_LEAD is defined elsewhere in this script; a match rejects the line as a header.
+    if REF_LEAD.match(rest):
+        return False
+    # Punctuation, a Hangul syllable or a lowercase letter right after the code: not a title.
+    if rs[0] in ',;.)':
+        return False
+    if '가' <= rs[0] <= '힣':
+        return False
+    if rs[0].islower():
+        return False
+    # TITLE_AFTER is defined elsewhere in this script; it has the final say.
+    return bool(TITLE_AFTER.match(rs))
+
+
+def build_clauses(text):
+    """Split a parent document's markdown into clause records.
+
+    A clause starts at the first line whose code matches EXACT_TOP and that is_header()
+    accepts; later lines repeating an already-seen code are ignored. Each clause body
+    runs from its header line to the next accepted header (or the end of the text).
+
+    Args:
+        text: Full markdown content of the parent document.
+
+    Returns:
+        A list of dicts with keys code, part, order, title, body, tok and links, in
+        document order. ``links`` is the sorted set of codes mentioned in the body
+        (reduced to their ``LETTERS-digits`` prefix), excluding the clause's own code.
+    """
+    lines = text.split('\n')
+    # off[i] = character offset of line i inside text (+1 per line for the '\n').
+    off = []
+    a = 0
+    for ln in lines:
+        off.append(a)
+        a += len(ln) + 1
+    # exact-top-level HEADER boundaries, first-seen only (fixes dup + sub-fragment noise)
+    bounds = []  # (pos, code, title)
+    seen = set()
+    for i, ln in enumerate(lines):
+        m = LINE_RE.match(ln)
+        if not m:
+            continue
+        markup, code, rest = m.group(1), m.group(2), m.group(3)
+        if not EXACT_TOP.match(code):
+            continue
+        if not is_header(markup, rest):
+            continue
+        if code in seen:
+            continue
+        seen.add(code)
+        bounds.append((off[i], code, clean_title(rest)))
+    clauses = []
+    for idx, (start, code, title) in enumerate(bounds):
+        end = bounds[idx+1][0] if idx+1 < len(bounds) else len(text)
+        body = text[start:end]
+        part = re.match(r'^[A-Z]{2,4}', code).group(0)
+        links = sorted(set(re.match(r'^[A-Z]{1,4}-\d+', mm).group(0)
+                           for mm in MENTION_RE.findall(body)) - {code})
+        clauses.append(dict(code=code, part=part, order=idx, title=(code + (' ' + title if title else '')),
+                            body=body, tok=tok(body), links=links))
+    return clauses
+
+
+async def main():
+    """Split one parent document into clause documents and optionally persist them.
+
+    Reads the parent document id from ``sys.argv[1]``. Without ``--commit`` only a
+    dry-run summary is printed. With ``--commit`` the parent's existing clause
+    documents are deleted and re-inserted inside a single transaction.
+
+    The connection is closed on every exit path, and a parent without any detected
+    clause header is reported instead of crashing on empty statistics (nothing is
+    deleted or written in that case).
+
+    Raises:
+        SystemExit: when the parent id argument is missing.
+    """
+    if len(sys.argv) < 2:
+        raise SystemExit("usage: python3 asme_clause_persist.py PARENT_ID [--commit]")
+    parent = int(sys.argv[1])
+    commit = '--commit' in sys.argv
+    import asyncpg
+    # Presumably DATABASE_URL carries a SQLAlchemy-style "+asyncpg" driver suffix; it is stripped here.
+    dsn = os.environ['DATABASE_URL'].replace('+asyncpg', '')
+    conn = await asyncpg.connect(dsn)
+    try:
+        row = await conn.fetchrow("SELECT md_content, ai_domain, data_origin FROM documents WHERE id=$1", parent)
+        if not row:
+            print(f"parent {parent} not found")
+            return
+        # md_content may be NULL for a parent without converted markdown.
+        clauses = build_clauses(row['md_content'] or '')
+        if not clauses:
+            # statistics.median() / max() raise on empty input; keep prior clause docs untouched.
+            print(f"parent={parent} clause_docs=0 (no clause headers found); nothing to write")
+            return
+        toks = [c['tok'] for c in clauses]
+        over = [c for c in clauses if c['tok'] > CAP]
+        print(f"parent={parent} clause_docs={len(clauses)} median_tok={int(statistics.median(toks))} "
+              f"max_tok={max(toks)} over_cap={len(over)} total_backlinks={sum(len(c['links']) for c in clauses)}")
+        print("sample:", [f"{c['code']}:{c['tok']}t" for c in clauses[:8]])
+        if over:
+            print("over-CAP:", [f"{c['code']}:{c['tok']}t" for c in over])
+        if not commit:
+            print("DRY-RUN (no write). pass --commit to persist.")
+            return
+
+        async with conn.transaction():
+            # idempotent: remove prior clause docs of this parent (cascades clause_links/document_tags)
+            deld = await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
+            print("deleted prior:", deld)
+            for c in clauses:
+                fh = hashlib.sha256(f"{parent}:{c['code']}:{c['body']}".encode()).hexdigest()
+                cid = await conn.fetchval("""
+                    INSERT INTO documents
+                      (file_format, file_hash, title, md_content, parent_id, doc_kind,
+                       clause_code, clause_part, clause_order, ai_domain, data_origin,
+                       md_status, review_status, conversion_status, preview_status)
+                    VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
+                    RETURNING id
+                """, fh, c['title'], c['body'], parent, c['code'], c['part'], c['order'],
+                    row['ai_domain'], row['data_origin'] or 'external')
+                # Part tag
+                await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'part') "
+                                   "ON CONFLICT DO NOTHING", cid, c['part'])
+        n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", parent)
+        print(f"COMMITTED: {n} clause docs for parent {parent}")
+    finally:
+        await conn.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())