diff --git a/app/api/documents.py b/app/api/documents.py
index ca01a8b..0443fb5 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -1990,3 +1990,61 @@ async def get_document_backlinks(
forward=[BacklinkRef(**dict(r)) for r in forward],
back=[BacklinkRef(**dict(r)) for r in back],
)
+
+
+# ─── 관련 문서 (유사도, on-demand pgvector KNN — 저부하·무저장) ───
+class RelatedItem(BaseModel):
+ """One neighbour in the list returned by the ``/{doc_id}/related`` endpoint."""
+ # Every field except ``id`` is optional and defaults to None.
+ id: int
+ title: str | None = None
+ ai_domain: str | None = None
+ material_type: str | None = None
+ year: int | None = None  # filled from the ``facet_year`` column by the endpoint query
+ sim: float | None = None  # 1 - cosine distance to the source document, rounded to 3 decimals
+
+
+class RelatedResponse(BaseModel):
+ """Response body of the ``/{doc_id}/related`` endpoint."""
+ doc_id: int  # id of the document the neighbours were computed for
+ related: list[RelatedItem]  # neighbours in query order (smallest distance first)
+
+
+@router.get("/{doc_id}/related", response_model=RelatedResponse)
+async def get_related_documents(
+    doc_id: int,
+    user: Annotated[User, Depends(get_current_user)],
+    session: Annotated[AsyncSession, Depends(get_session)],
+    limit: int = 8,
+    same_type: bool = True,
+):
+    """Return the documents closest to ``doc_id`` by embedding cosine distance.
+
+    "Related documents" are the nearest neighbours of the document-level
+    embedding, computed on demand with a pgvector KNN query; nothing is
+    stored or pre-computed. It serves as an alternative linking layer for
+    corpora where a citation graph is unsuitable (e.g. industry technical
+    articles, which have no citation network).
+
+    Args:
+        doc_id: Id of the source document.
+        user: Authenticated user; only the dependency itself is used.
+        session: Database session the query runs on.
+        limit: Requested number of neighbours, clamped to the range 1..30.
+        same_type: When true, only documents with the same ``material_type``
+            as the source are candidates; when false the whole corpus is.
+
+    Returns:
+        A ``RelatedResponse`` whose ``related`` list is ordered nearest
+        first. The list is empty when the source row does not exist or has
+        no embedding. Only ``doc_kind = 'standard'`` rows that are not
+        soft-deleted are candidates, so clause documents are excluded.
+    """
+    from sqlalchemy import text as sql_text
+
+    # Out-of-range limits are clamped rather than rejected.
+    capped = min(limit, 30)
+    if capped < 1:
+        capped = 1
+
+    # The optional filter is one of two fixed literals, never user text.
+    # NULL material_type on the source never satisfies the equality.
+    if same_type:
+        type_clause = "AND d.material_type = src.material_type"
+    else:
+        type_clause = ""
+
+    statement = sql_text(
+        f"""
+ WITH src AS (
+ SELECT embedding, material_type FROM documents WHERE id = :id
+ )
+ SELECT d.id, d.title, d.ai_domain, d.material_type, d.facet_year AS year,
+ round((1 - (d.embedding <=> (SELECT embedding FROM src)))::numeric, 3) AS sim
+ FROM documents d, src
+ WHERE d.doc_kind = 'standard' AND d.deleted_at IS NULL
+ AND d.id <> :id AND d.embedding IS NOT NULL
+ AND (SELECT embedding FROM src) IS NOT NULL
+ {type_clause}
+ ORDER BY d.embedding <=> (SELECT embedding FROM src)
+ LIMIT :lim
+ """
+    ).bindparams(id=doc_id, lim=capped)
+    result = await session.execute(statement)
+
+    related = []
+    for rec in result.mappings().all():
+        raw_sim = rec["sim"]
+        related.append(
+            RelatedItem(
+                id=rec["id"],
+                title=rec["title"],
+                ai_domain=rec["ai_domain"],
+                material_type=rec["material_type"],
+                year=rec["year"],
+                # The query rounds through numeric; convert to a plain float.
+                sim=None if raw_sim is None else float(raw_sim),
+            )
+        )
+    return RelatedResponse(doc_id=doc_id, related=related)
diff --git a/frontend/src/lib/components/RelatedDocs.svelte b/frontend/src/lib/components/RelatedDocs.svelte
new file mode 100644
index 0000000..c650d01
--- /dev/null
+++ b/frontend/src/lib/components/RelatedDocs.svelte
@@ -0,0 +1,45 @@
+
+
+{#if items.length}
+
+{/if}
+
+
diff --git a/frontend/src/routes/documents/[id]/+page.svelte b/frontend/src/routes/documents/[id]/+page.svelte
index a105a7c..e68fa6b 100644
--- a/frontend/src/routes/documents/[id]/+page.svelte
+++ b/frontend/src/routes/documents/[id]/+page.svelte
@@ -16,6 +16,7 @@
import Skeleton from '$lib/components/ui/Skeleton.svelte';
import HandwriteCanvas from '$lib/components/HandwriteCanvas.svelte';
import MarkdownDoc from '$lib/components/MarkdownDoc.svelte';
+ import RelatedDocs from '$lib/components/RelatedDocs.svelte';
import { renderDocMarkdown } from '$lib/utils/docMarkdown';
import MarkdownStatusBadge from '$lib/components/MarkdownStatusBadge.svelte';
import NoteEditor from '$lib/components/editors/NoteEditor.svelte';
@@ -321,6 +322,7 @@
{#snippet rail()}
+
{#if doc.ai_tldr || doc.ai_summary}
TL;DR
diff --git a/scripts/guide_clause_persist.py b/scripts/guide_clause_persist.py
new file mode 100644
index 0000000..5b61092
--- /dev/null
+++ b/scripts/guide_clause_persist.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""기술지침(KOSHA guide) 절-KB persist: 번호섹션(# 1. 목적 / ## 4.1) 단위 분해 + 제본.
+ASME/법령과 동일 clause-KB 모델(doc_kind='clause', parent_id=지침, 검색제외, /book 리더 공용).
+Usage: python3 guide_clause_persist.py (guide_id|all) [--commit]
+"""
+import asyncio, os, re, sys, hashlib, statistics
+
+# Sections whose estimated token count exceeds CAP are split into pages;
+# PAGE_TOK is the per-page token budget used when splitting.
+CAP = 12000; PAGE_TOK = 11000
+# Estimated tokens per character: EN for non-Hangul characters, KO for Hangul syllables.
+EN, KO = 0.217, 0.529
+# Numbered section header: '# 1. 목 적', '## 4.1 누출...' (each number is 1-3 digits, which keeps 4-digit years out)
+ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')
+# A code made of a single 1-3 digit number (no dots), i.e. a top-level section.
+TOP_RE = re.compile(r'^\d{1,3}$')
+# External standard/regulation references (mostly dangling): ASME B16.5 · KS B 1501 · 규칙 제N조
+EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9.]+|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')
+
+def tok(s):
+    """Estimate the token count of ``s`` from its character mix.
+
+    Hangul syllables are weighted by ``KO`` and every other character by
+    ``EN``; the sum is truncated to an int.
+    """
+    hangul = 0
+    for ch in s:
+        if '가' <= ch <= '힣':
+            hangul += 1
+    other = len(s) - hangul
+    return int(other * EN + hangul * KO)
+
+def build_sections(text):
+    """Split a guide's markdown into top-level numbered sections.
+
+    A section starts at a heading matched by ``ART_RE`` whose code is a
+    single number (``TOP_RE``); the first occurrence of each code wins and
+    later repeats stay inside the preceding section. Text before the first
+    such heading is not part of any section. Sections above ``CAP``
+    estimated tokens are split on line boundaries into pages of about
+    ``PAGE_TOK`` tokens.
+
+    Returns:
+        A list of dicts with keys ``code``, ``part``, ``order``, ``title``,
+        ``body``, ``tok``, ``links`` and ``ext``; ``order`` is a running
+        index over the final list, pages included.
+    """
+    lines = text.split('\n')
+
+    # Character offset at which each line starts inside ``text``.
+    starts = []
+    pos = 0
+    for line in lines:
+        starts.append(pos)
+        pos += len(line) + 1
+
+    # Collect section boundaries as (char offset, code, name).
+    headers = []
+    used = set()
+    for lineno, line in enumerate(lines):
+        hit = ART_RE.match(line)
+        if hit is None:
+            continue
+        code = hit.group(1)
+        name = hit.group(2).strip()
+        if TOP_RE.match(code) and code not in used and name:
+            used.add(code)
+            headers.append((starts[lineno], code, name))
+
+    # Each section runs up to the next boundary (or the end of the text).
+    stops = [h[0] for h in headers[1:]] + [len(text)]
+    sections = []
+    for (begin, code, name), stop in zip(headers, stops):
+        body = text[begin:stop].strip()
+        sections.append({
+            'code': code,
+            'part': '본문',
+            'order': 0,
+            'title': f"{code}. {name}"[:120],
+            'body': body,
+            'tok': tok(body),
+            'links': [],
+            'ext': sorted(set(EXT_RE.findall(body)))[:8],
+        })
+
+    # Paginate oversized sections and assign the running order.
+    result = []
+    seq = 0
+    for sec in sections:
+        if sec['tok'] <= CAP:
+            result.append({**sec, 'order': seq})
+            seq += 1
+            continue
+        pages = []
+        chunk = []
+        used_tok = 0
+        for line in sec['body'].split('\n'):
+            cost = tok(line) + 1
+            if chunk and used_tok + cost > PAGE_TOK:
+                pages.append('\n'.join(chunk))
+                chunk = [line]
+                used_tok = cost
+            else:
+                chunk.append(line)
+                used_tok += cost
+        if chunk:
+            pages.append('\n'.join(chunk))
+        for page_no, page_body in enumerate(pages, start=1):
+            first = page_no == 1
+            result.append({
+                'code': sec['code'] if first else f"{sec['code']}·p{page_no}",
+                'part': '본문',
+                'order': seq,
+                'title': sec['title'] if first else f"{sec['title']} (p{page_no})",
+                'body': page_body,
+                'tok': tok(page_body),
+                'links': [],
+                'ext': [],
+            })
+            seq += 1
+    return result
+
+async def process_one(conn, gid, commit, verbose=True):
+    """Split one guide into section documents and, when committing, persist them.
+
+    Args:
+        conn: Database connection used for all queries.
+        gid: Id of the guide document to split.
+        commit: When false, only analyse (dry run) and write nothing.
+        verbose: When true, print a summary line and sample titles.
+
+    Returns:
+        ``(status, n_sections)`` where status is ``'notfound'``,
+        ``'nullmd'``, ``'few'`` (fewer than two sections, i.e. not a
+        numbered structure), ``'dry'`` or ``'committed'``.
+    """
+    guide = await conn.fetchrow("SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
+    if not guide:
+        return ('notfound', 0)
+    if not guide['md_content']:
+        return ('nullmd', 0)
+
+    sections = build_sections(guide['md_content'])
+    count = len(sections)
+    if count < 2:
+        return ('few', count)
+
+    if verbose:
+        sizes = [sec['tok'] for sec in sections]
+        print(f"guide={gid} «{(guide['title'] or '')[:40]}» 섹션={count} median={int(statistics.median(sizes))} max={max(sizes)}")
+        print(" 샘플:", [sec['title'][:26] for sec in sections[:7]])
+    if not commit:
+        return ('dry', count)
+
+    origin = guide['data_origin'] or 'external'
+    async with conn.transaction():
+        # Replace any previously persisted clause children of this guide.
+        await conn.execute("DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", gid)
+        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
+        for sec in sections:
+            digest = hashlib.sha256(f"{gid}:{sec['code']}:{sec['body']}".encode()).hexdigest()
+            new_id = await conn.fetchval("""
+ INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
+ clause_code,clause_part,clause_order,ai_domain,data_origin,
+ md_status,review_status,conversion_status,preview_status)
+ VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
+ """, digest, sec['title'], sec['body'], gid, sec['code'], sec['part'], sec['order'], guide['ai_domain'], origin)
+            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING", new_id)
+    stored = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
+    print(f" COMMITTED: {stored} 섹션 for guide {gid}")
+    return ('committed', count)
+
+async def main():
+    """Command-line entry point: process one guide id, or every guide with ``all``.
+
+    The target is ``sys.argv[1]`` (a document id or the word ``all``).
+    Changes are written only when ``--commit`` appears in the arguments;
+    otherwise the run is a dry run. The connection URL comes from
+    ``DATABASE_URL`` with any ``+asyncpg`` driver suffix removed.
+
+    Exits with a usage message when the target is missing or is neither
+    ``all`` nor an integer.
+    """
+    import asyncpg
+
+    usage = "Usage: python3 guide_clause_persist.py (guide_id|all) [--commit]"
+    if len(sys.argv) < 2:
+        sys.exit(usage)
+    arg = sys.argv[1]
+    commit = '--commit' in sys.argv
+
+    # Validate the id before opening a connection.
+    guide_id = None
+    if arg != 'all':
+        try:
+            guide_id = int(arg)
+        except ValueError:
+            sys.exit(usage)
+
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    try:
+        if guide_id is not None:
+            await process_one(conn, guide_id, commit, verbose=True)
+            return
+        gs = await conn.fetch("SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
+                              "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
+        agg = {}
+        tot = 0
+        for done, r in enumerate(gs, start=1):
+            st, n = await process_one(conn, r['id'], commit, verbose=False)
+            agg[st] = agg.get(st, 0) + 1
+            # Only guides that were actually split count toward the total.
+            if st in ('dry', 'committed'):
+                tot += n
+            if commit and done % 40 == 0:
+                print(f" …{done}/{len(gs)} (누적섹션 {tot})")
+        print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
+    finally:
+        # Close the connection even when a guide fails mid-batch.
+        await conn.close()
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/scripts/law_clause_persist.py b/scripts/law_clause_persist.py
new file mode 100644
index 0000000..a87fd6d
--- /dev/null
+++ b/scripts/law_clause_persist.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""법령 조-KB persist: 법령을 조(條) 단위 개별 문서로 분해 + 조↔조 백링크 + 장(章) 태그.
+ASME clause-KB와 동일 모델(doc_kind='clause', parent_id=법령, embedding NULL, 검색제외).
+법령 추출 노이즈(조 앞 ### 메타 반복) 트림. Usage: python3 law_clause_persist.py (law_id|all) [--commit]
+"""
+import asyncio, os, re, sys, hashlib, statistics
+
+# Articles whose estimated token count exceeds CAP are split into pages;
+# PAGE_TOK is the per-page token budget used when splitting.
+CAP = 12000; PAGE_TOK = 11000
+# Estimated tokens per character: EN for non-Hangul characters, KO for Hangul syllables.
+EN, KO = 0.217, 0.529
+# Article header: '### 제3조의2(가스안전관리...) body text'
+ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')
+CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$') # chapter (장) = part
+# Mentions of articles of the same statute (used for backlinks)
+MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')
+# References to other statutes: 「statute name」 ... 제N조
+EXTLAW_RE = re.compile(r'「([^」]+)」')
+
+def tok(s):
+    """Estimate the token count of ``s`` from its character mix.
+
+    Hangul syllables are weighted by ``KO`` and every other character by
+    ``EN``; the sum is truncated to an int.
+    """
+    hangul = 0
+    for ch in s:
+        if '가' <= ch <= '힣':
+            hangul += 1
+    other = len(s) - hangul
+    return int(other * EN + hangul * KO)
+def art_code(c):
+    """Return the article code unchanged (e.g. '제3조의2')."""
+    code = c
+    return code
+
+def _trim_trailing_meta(body_lines):
+    """Drop trailing blank lines and short metadata headings from an article body.
+
+    Working backwards, a line is removed when it is blank, or when it is a
+    markdown heading whose label does not start with '[' or '제' and is
+    either all digits or at most 30 characters long. The first line is
+    always kept. Returns a new list.
+    """
+    kept = list(body_lines)
+    while len(kept) > 1:
+        last = kept[-1].strip()
+        if last == '':
+            kept.pop()
+            continue
+        heading = re.match(r'^#{1,6}\s+(.*)$', last)
+        if heading:
+            label = heading.group(1).strip()
+            # The 30-character limit already covers the short markers
+            # ('조문', 'N', 8-digit dates); only long digit runs need the
+            # extra check.
+            if not label.startswith(('[', '제')) and (
+                    re.fullmatch(r'\d+', label) or len(label) <= 30):
+                kept.pop()
+                continue
+        break
+    return kept
+
+
+def build_articles(text):
+    """Split a statute's text into article-level records.
+
+    An article starts at a line matched by ``ART_RE`` and runs to the next
+    article header. The most recent chapter heading (``CHAP_RE``) seen
+    before an article becomes its ``part``; articles before any chapter get
+    '본칙'. Extraction noise repeated before the next article is trimmed
+    from the end of each body. Articles above ``CAP`` estimated tokens are
+    split on line boundaries into pages of about ``PAGE_TOK`` tokens.
+
+    Returns:
+        A list of dicts with keys ``code``, ``part``, ``order``, ``title``,
+        ``body``, ``tok``, ``links`` (other article codes mentioned in the
+        body) and ``ext`` (up to six quoted statute names). ``order`` is a
+        running index over the final list, pages included. Article codes
+        are not de-duplicated.
+    """
+    lines = text.split('\n')
+
+    # Pass 1: locate article headers, tracking the chapter in effect.
+    found = []  # (line index, code, name, chapter)
+    chapter = None
+    for lineno, line in enumerate(lines):
+        art = ART_RE.match(line)
+        if art:
+            found.append((lineno, art.group(1), art.group(2).strip(), chapter))
+            continue
+        chap = CHAP_RE.match(line)
+        if chap:
+            label = chap.group(2).strip()
+            chapter = (chap.group(1) + (' ' + label if label else '')).strip()
+
+    # Pass 2: slice each body and trim the metadata noise at its end.
+    records = []
+    for pos, (lineno, code, name, part) in enumerate(found):
+        stop = found[pos + 1][0] if pos + 1 < len(found) else len(lines)
+        body = '\n'.join(_trim_trailing_meta(lines[lineno:stop])).strip()
+        mentions = set(MENTION_RE.findall(body))
+        mentions.discard(code)  # an article does not link to itself
+        records.append(dict(code=code, part=part or '본칙', order=0,
+                            title=f"{code}({name})" if name else code,
+                            body=body, tok=tok(body), links=sorted(mentions),
+                            ext=sorted(set(EXTLAW_RE.findall(body)))[:6]))
+
+    # Pass 3: paginate oversized articles and assign the running order.
+    final = []
+    order = 0
+    for rec in records:
+        if rec['tok'] <= CAP:
+            final.append({**rec, 'order': order})
+            order += 1
+            continue
+        pages = []
+        chunk = []
+        used_tok = 0
+        for line in rec['body'].split('\n'):
+            cost = tok(line) + 1
+            if chunk and used_tok + cost > PAGE_TOK:
+                pages.append('\n'.join(chunk))
+                chunk = [line]
+                used_tok = cost
+            else:
+                chunk.append(line)
+                used_tok += cost
+        if chunk:
+            pages.append('\n'.join(chunk))
+        for page_no, page_body in enumerate(pages, start=1):
+            first = page_no == 1
+            final.append(dict(
+                code=rec['code'] if first else f"{rec['code']}·p{page_no}",
+                part=rec['part'], order=order,
+                title=rec['title'] if first else f"{rec['title']} (p{page_no}/{len(pages)})",
+                body=page_body, tok=tok(page_body),
+                # Only the first page carries the article's links.
+                links=rec['links'] if first else [], ext=[]))
+            order += 1
+    return final
+
+async def process_one(conn, law, commit, verbose=True):
+    """Split one statute into article documents and, when committing, persist them.
+
+    Args:
+        conn: Database connection used for all queries.
+        law: Id of the statute document to split.
+        commit: When false, only analyse (dry run) and write nothing.
+        verbose: When true, print a summary line and sample titles.
+
+    Returns:
+        ``(status, n_articles, n_links)`` where status is ``'notfound'``,
+        ``'nullmd'``, ``'noart'``, ``'dry'`` or ``'committed'``.
+    """
+    row = await conn.fetchrow("SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin FROM documents WHERE id=$1", law)
+    if not row:
+        return ('notfound', 0, 0)
+    if not row['md_content']:
+        return ('nullmd', 0, 0)
+    arts = build_articles(row['md_content'])
+    if not arts:
+        return ('noart', 0, 0)
+    nlink = sum(len(c['links']) for c in arts)
+    if verbose:
+        toks = [c['tok'] for c in arts]
+        parts = {}
+        for c in arts:
+            parts[c['part']] = parts.get(c['part'], 0) + 1
+        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
+              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
+        print(" 샘플:", [c['title'][:22] for c in arts[:6]])
+    if not commit:
+        return ('dry', len(arts), nlink)
+
+    origin = row['data_origin'] or 'external'
+    async with conn.transaction():
+        # Replace any previously persisted clause children of this statute.
+        await conn.execute(
+            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
+        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
+        code2id = {}   # article code -> id of its first occurrence
+        inserted = []  # (new document id, article record) in insert order
+        for c in arts:
+            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
+            cid = await conn.fetchval("""
+ INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
+ clause_code,clause_part,clause_order,ai_domain,data_origin,
+ md_status,review_status,conversion_status,preview_status)
+ VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
+ """, fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
+                                      row['ai_domain'], origin)
+            # Article codes can repeat within one text (e.g. numbering that
+            # restarts later in the document). Keep the first occurrence as
+            # the link target instead of letting a later duplicate overwrite it.
+            code2id.setdefault(c['code'], cid)
+            inserted.append((cid, c))
+            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING", cid, c['part'])
+        # Article-to-article backlinks inside the same statute; targets with
+        # no matching article stay dangling (dst_doc_id NULL).
+        edges = []
+        for cid, c in inserted:
+            # Use the id of this very row as the source, not a lookup by
+            # code, so duplicated codes do not share one source document.
+            for dst in c['links']:
+                edges.append((cid, dst, code2id.get(dst), None, None, None))
+        if edges:
+            await conn.executemany(
+                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) VALUES ($1,$2,$3,$4,$5,$6)", edges)
+    n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
+    print(f" COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
+    return ('committed', n, len(edges))
+
+
+async def main():
+    """Command-line entry point: process one statute id, or every current primary statute with ``all``.
+
+    The target is ``sys.argv[1]`` (a document id or the word ``all``).
+    Changes are written only when ``--commit`` appears in the arguments;
+    otherwise the run is a dry run. The connection URL comes from
+    ``DATABASE_URL`` with any ``+asyncpg`` driver suffix removed.
+
+    Exits with a usage message when the target is missing or is neither
+    ``all`` nor an integer.
+    """
+    import asyncpg
+
+    usage = "Usage: python3 law_clause_persist.py (law_id|all) [--commit]"
+    if len(sys.argv) < 2:
+        sys.exit(usage)
+    arg = sys.argv[1]
+    commit = '--commit' in sys.argv
+
+    # Validate the id before opening a connection.
+    law_id = None
+    if arg != 'all':
+        try:
+            law_id = int(arg)
+        except ValueError:
+            sys.exit(usage)
+
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    try:
+        if law_id is not None:
+            await process_one(conn, law_id, commit, verbose=True)
+            return
+        laws = await conn.fetch("SELECT lm.document_id AS id FROM legal_meta lm "
+                                "JOIN documents d ON d.id=lm.document_id "
+                                "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
+                                "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
+                                "ORDER BY lm.document_id")
+        agg = {}
+        tot_art = tot_link = 0
+        zero = []  # statutes in which no article header was found
+        for done, r in enumerate(laws, start=1):
+            st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
+            agg[st] = agg.get(st, 0) + 1
+            tot_art += na
+            tot_link += nl
+            if st == 'noart':
+                zero.append(r['id'])
+            if commit and done % 30 == 0:
+                print(f" …{done}/{len(laws)} (누적 조 {tot_art})")
+        print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
+        if zero:
+            print(f" 0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")
+    finally:
+        # Close the connection even when a statute fails mid-batch.
+        await conn.close()
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/scripts/paper_citation_analyze.py b/scripts/paper_citation_analyze.py
new file mode 100644
index 0000000..c843de1
--- /dev/null
+++ b/scripts/paper_citation_analyze.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""논문 인용그래프 가능성 측정(read-only) — 본문 DOI로 코퍼스내 인용 엣지 추정.
+own_doi = 헤더(앞 2500자) 첫 DOI / cited = References 이후(또는 전체) DOI. owner 맵 → 엣지.
+"""
+import asyncio, os, re, sys
+
+# DOI: '10.', a 4-9 digit prefix, '/', then a suffix that runs until
+# whitespace or one of the characters " < > ) ] } , ;
+DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
+# Case-insensitive marker for a reference list (English and Korean wording).
+REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)
+
+def norm(d):
+    """Normalise a DOI string: drop trailing dots, then lowercase."""
+    trimmed = d.rstrip('.')
+    return trimmed.lower()
+
+def _citation_edges(rows):
+    """Derive in-corpus citation edges from paper rows (``id`` and ``txt`` keys).
+
+    Returns ``(owner, edges, n_own, n_refsec)``: the DOI -> paper id map, the
+    list of ``(citing id, cited id, doi)`` edges, the number of papers with
+    a DOI in their first 2500 characters, and the number of papers in which
+    a reference-list marker was found.
+    """
+    owner = {}   # doi -> paper id (first DOI in the header is taken as the paper's own)
+    cited = {}   # paper id -> set of DOIs found from the reference marker onward
+    n_own = n_refsec = 0
+    for r in rows:
+        txt = r['txt']
+        hdois = [norm(d) for d in DOI_RE.findall(txt[:2500])]
+        if hdois:
+            # The first paper to claim a DOI keeps it.
+            owner.setdefault(hdois[0], r['id'])
+            n_own += 1
+        m = REF_RE.search(txt)
+        if m is None:
+            # NOTE(review): the module docstring mentions a whole-text
+            # fallback, but papers without a marker contribute no citations
+            # here - confirm which is intended.
+            continue
+        n_refsec += 1
+        cds = {norm(d) for d in DOI_RE.findall(txt[m.start():])}
+        if cds:
+            cited[r['id']] = cds
+    # Edge: citing paper -> owner of the cited DOI, self-citations excluded.
+    edges = [(pid, owner[d], d)
+             for pid, cds in cited.items()
+             for d in cds
+             if d in owner and owner[d] != pid]
+    return owner, edges, n_own, n_refsec
+
+
+async def main():
+    """Measure how many in-corpus citation edges can be recovered from DOIs (read-only).
+
+    Loads every non-deleted standard paper that has text, derives the
+    edges, and prints the totals plus the six most-cited papers. The
+    connection URL comes from ``DATABASE_URL`` with any ``+asyncpg`` driver
+    suffix removed.
+    """
+    from collections import Counter
+
+    import asyncpg
+
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    try:
+        rows = await conn.fetch("SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
+                                "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
+                                "AND coalesce(md_content, extracted_text) IS NOT NULL")
+    finally:
+        # The analysis needs only the fetched rows, so release the
+        # connection first; this also closes it if the query fails.
+        await conn.close()
+
+    owner, edges, n_own, n_refsec = _citation_edges(rows)
+    cited_papers = {e[0] for e in edges}
+    target_papers = {e[1] for e in edges}
+    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
+    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(cited_papers)} 피인용논문={len(target_papers)}")
+    # Most-cited papers
+    top = Counter(e[1] for e in edges).most_common(6)
+    if top:
+        idmap = {r['id']: r['title'] for r in rows}
+        print("피인용 top:")
+        for pid, c in top:
+            print(f" {c}회 ← {(idmap.get(pid) or '')[:48]}")
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/scripts/paper_openalex_test.py b/scripts/paper_openalex_test.py
new file mode 100644
index 0000000..c835391
--- /dev/null
+++ b/scripts/paper_openalex_test.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""OpenAlex 보강 타당성 테스트 — 소수 논문 제목으로 매칭/메타 확인 (외부 API)."""
+import asyncio, os, re
+
+async def main():
+    """Probe OpenAlex with the titles of a few papers and print the top match.
+
+    Takes the first six non-deleted standard papers (by id) whose title is
+    longer than 15 characters, searches OpenAlex for each title, and prints
+    the first result's id, title, year, citation count, reference count and
+    DOI. A failure for one title is printed and the loop continues.
+
+    The contact address sent as ``mailto`` can be overridden with the
+    ``OPENALEX_MAILTO`` environment variable. The connection URL comes from
+    ``DATABASE_URL`` with any ``+asyncpg`` driver suffix removed.
+    """
+    import asyncpg
+    import httpx
+
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    try:
+        rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
+                                "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
+                                "AND length(title) > 15 ORDER BY id LIMIT 6")
+    finally:
+        # Only the fetched rows are needed below; this also closes the
+        # connection if the query fails.
+        await conn.close()
+
+    mailto = os.environ.get("OPENALEX_MAILTO", "hyun49196@gmail.com")
+    async with httpx.AsyncClient(timeout=20) as client:
+        for r in rows:
+            title = re.sub(r'\s+', ' ', r['title']).strip()
+            # Deliberately broad: this is a best-effort probe, so any
+            # failure for one title is reported and the next one is tried.
+            try:
+                resp = await client.get("https://api.openalex.org/works",
+                                        params={"search": title[:200], "per_page": 1, "mailto": mailto})
+                # Without this, an error response (rate limit, server error)
+                # would be parsed and reported as "NO MATCH".
+                resp.raise_for_status()
+                js = resp.json()
+                res = (js.get("results") or [])
+                if not res:
+                    print(f"[{r['id']}] NO MATCH | {title[:50]}")
+                    continue
+                w = res[0]
+                oid = (w.get("id") or "").split("/")[-1]
+                print(f"[{r['id']}] {title[:46]}")
+                print(f" → OA {oid} | {(w.get('title') or '')[:46]} | {w.get('publication_year')} | "
+                      f"cited_by={w.get('cited_by_count')} | refs={len(w.get('referenced_works') or [])} | doi={w.get('doi')}")
+            except Exception as e:
+                print(f"[{r['id']}] ERROR {type(e).__name__}: {e}")
+
+# Run only when executed as a script, so importing the module has no side effects.
+if __name__ == "__main__":
+    asyncio.run(main())