From a22b2c76475475596f9bc4ca8a13ebc3dc1b82c8 Mon Sep 17 00:00:00 2001
From: hyungi <hyun49196@gmail.com>
Date: Tue, 30 Jun 2026 06:10:11 +0000
Subject: [PATCH] =?UTF-8?q?feat(docs):=20=EA=B4=80=EB=A0=A8=20=EB=AC=B8?=
 =?UTF-8?q?=EC=84=9C(=EC=9C=A0=EC=82=AC=EB=8F=84=20KNN)=20=EC=97=94?=
 =?UTF-8?q?=EB=93=9C=ED=8F=AC=EC=9D=B8=ED=8A=B8+=ED=8C=A8=EB=84=90=20+=20?=
 =?UTF-8?q?=EB=B2=95=EB=A0=B9/=EC=A7=80=EC=B9=A8=20splitter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/api/documents.py                          |  58 +++++++
 .../src/lib/components/RelatedDocs.svelte     |  45 ++++++
 .../src/routes/documents/[id]/+page.svelte    |   2 +
 scripts/guide_clause_persist.py               | 100 ++++++++++++
 scripts/law_clause_persist.py                 | 146 ++++++++++++++++++
 scripts/paper_citation_analyze.py             |  51 ++++++
 scripts/paper_openalex_test.py                |  30 ++++
 7 files changed, 432 insertions(+)
 create mode 100644 frontend/src/lib/components/RelatedDocs.svelte
 create mode 100644 scripts/guide_clause_persist.py
 create mode 100644 scripts/law_clause_persist.py
 create mode 100644 scripts/paper_citation_analyze.py
 create mode 100644 scripts/paper_openalex_test.py

diff --git a/app/api/documents.py b/app/api/documents.py
index ca01a8b..0443fb5 100644
--- a/app/api/documents.py
+++ b/app/api/documents.py
@@ -1990,3 +1990,61 @@ async def get_document_backlinks(
         forward=[BacklinkRef(**dict(r)) for r in forward],
         back=[BacklinkRef(**dict(r)) for r in back],
     )
+
+
+# ─── Related documents (similarity; on-demand pgvector KNN — low load, nothing persisted) ───
+class RelatedItem(BaseModel):  # one neighbour row returned by GET /{doc_id}/related
+    id: int
+    title: str | None = None
+    ai_domain: str | None = None
+    material_type: str | None = None
+    year: int | None = None  # filled from documents.facet_year
+    sim: float | None = None  # 1 - (embedding <=> source embedding), rounded to 3 decimals by the query
+
+
+class RelatedResponse(BaseModel):  # response body of the related-documents endpoint
+    doc_id: int  # the document id that was requested
+    related: list[RelatedItem]  # neighbours ordered by ascending embedding distance
+
+
+@router.get("/{doc_id}/related", response_model=RelatedResponse)
+async def get_related_documents(
+    doc_id: int,
+    user: Annotated[User, Depends(get_current_user)],  # not referenced in the body; presumably kept so the dependency enforces authentication — confirm
+    session: Annotated[AsyncSession, Depends(get_session)],
+    limit: int = 8,  # requested neighbour count; clamped to 1..30 below
+    same_type: bool = True,  # True adds the material_type equality filter against the source row
+):
+    """Return 'related documents': cosine nearest neighbours of the document-level embedding, computed on demand (nothing stored, no batch job).
+
+    An alternative linking layer for corpora where a citation graph does not fit (e.g. industry technical articles, which have no citation network).
+    same_type=true searches within the source's material_type, false searches the whole corpus. Only doc_kind='standard' rows are candidates, so doc_kind='clause' (clause documents) are excluded.
+    """
+    from sqlalchemy import text as sql_text
+
+    lim = max(1, min(limit, 30))  # clamp rather than reject out-of-range values
+    type_clause = "AND d.material_type = src.material_type" if same_type else ""  # fixed fragment chosen by a bool; no request text is interpolated into the SQL
+    rows = (
+        await session.execute(
+            sql_text(
+                f"""
+                WITH src AS (
+                    SELECT embedding, material_type FROM documents WHERE id = :id
+                )
+                SELECT d.id, d.title, d.ai_domain, d.material_type, d.facet_year AS year,
+                       round((1 - (d.embedding <=> (SELECT embedding FROM src)))::numeric, 3) AS sim
+                FROM documents d, src
+                WHERE d.doc_kind = 'standard' AND d.deleted_at IS NULL
+                  AND d.id <> :id AND d.embedding IS NOT NULL
+                  AND (SELECT embedding FROM src) IS NOT NULL
+                  {type_clause}
+                ORDER BY d.embedding <=> (SELECT embedding FROM src)
+                LIMIT :lim
+                """
+            ).bindparams(id=doc_id, lim=lim)
+        )
+    ).mappings().all()  # zero rows when the source id does not exist or its embedding is NULL
+    return RelatedResponse(
+        doc_id=doc_id,
+        related=[RelatedItem(**{k: r[k] for k in ("id", "title", "ai_domain", "material_type", "year")}, sim=float(r["sim"]) if r["sim"] is not None else None) for r in rows],  # sim arrives as a numeric; convert to float for the model
+    )
diff --git a/frontend/src/lib/components/RelatedDocs.svelte b/frontend/src/lib/components/RelatedDocs.svelte
new file mode 100644
index 0000000..c650d01
--- /dev/null
+++ b/frontend/src/lib/components/RelatedDocs.svelte
@@ -0,0 +1,45 @@
+<script>
+  // Related documents (similarity) — document-level embedding KNN. Self-contained: takes documentId and queries /related.
+  import { api } from '$lib/api';
+
+  let { documentId } = $props();
+  let items = $state([]);
+  let loaded = $state(false);
+
+  const KIND = { law: '법령', guide: '지침', paper: '논문', standard: '표준', incident: '사례' };
+  // Re-fetch whenever documentId changes: the links below stay on the same route, where the component is reused and onMount would not run again.
+  $effect(() => {
+    const id = documentId; let stale = false; items = []; loaded = false;
+    api(`/documents/${id}/related?limit=6`)
+      .then((r) => { if (!stale) items = r?.related ?? []; })
+      .catch(() => { /* best-effort panel: stay hidden on failure */ })
+      .finally(() => { if (!stale) loaded = true; });
+    return () => { stale = true; }; // teardown: ignore a response that arrives after the id changed or the panel unmounted
+  });
+</script>
+
+{#if items.length}
+  <div class="rel">
+    <div class="lab">관련 문서</div>
+    {#each items as it (it.id)}
+      <a class="ri" href={`/documents/${it.id}`}>
+        <span class="rt">{it.title}</span>
+        <span class="rm">
+          {#if it.material_type && KIND[it.material_type]}<span class="kind">{KIND[it.material_type]}</span>{/if}
+          <span class="rs">{Math.round((it.sim ?? 0) * 100)}</span>
+        </span>
+      </a>
+    {/each}
+  </div>
+{/if}
+
+<style>
+  .rel { background: var(--surface); border: 1px solid var(--border); border-radius: 14px; padding: 13px; }
+  .lab { font-size: 10.5px; font-weight: 700; color: var(--text-dim); letter-spacing: .4px; margin-bottom: 8px; }
+  .ri { display: flex; align-items: baseline; gap: 8px; padding: 5px 6px; border-radius: 7px; text-decoration: none; }
+  .ri:hover { background: var(--surface-hover, #ecf0e8); }
+  .rt { flex: 1; font-size: 12px; line-height: 1.4; color: var(--text); overflow: hidden; display: -webkit-box; -webkit-line-clamp: 2; -webkit-box-orient: vertical; }
+  .rm { flex-shrink: 0; display: flex; align-items: center; gap: 5px; }
+  .kind { font-size: 9px; font-weight: 700; color: var(--accent-hover, #3d7256); background: #e3efe2; border: 1px solid #cfe3cd; border-radius: 4px; padding: 0 4px; }
+  .rs { font-size: 10.5px; font-family: ui-monospace, Menlo, monospace; color: var(--faint, #9aa090); }
+</style>
diff --git a/frontend/src/routes/documents/[id]/+page.svelte b/frontend/src/routes/documents/[id]/+page.svelte
index a105a7c..e68fa6b 100644
--- a/frontend/src/routes/documents/[id]/+page.svelte
+++ b/frontend/src/routes/documents/[id]/+page.svelte
@@ -16,6 +16,7 @@
   import Skeleton from '$lib/components/ui/Skeleton.svelte';
   import HandwriteCanvas from '$lib/components/HandwriteCanvas.svelte';
   import MarkdownDoc from '$lib/components/MarkdownDoc.svelte';
+  import RelatedDocs from '$lib/components/RelatedDocs.svelte';
   import { renderDocMarkdown } from '$lib/utils/docMarkdown';
   import MarkdownStatusBadge from '$lib/components/MarkdownStatusBadge.svelte';
   import NoteEditor from '$lib/components/editors/NoteEditor.svelte';
@@ -321,6 +322,7 @@
 <!-- ════ 우 슬림 레일 (시안 카드 스타일) ════ -->
 {#snippet rail()}
   <div style="display:flex;flex-direction:column;gap:11px;font-size:14px;">
+    <RelatedDocs documentId={doc.id} />
     {#if doc.ai_tldr || doc.ai_summary}
       <div style="background:#f4f7f1;border:1px solid #dde3d6;border-radius:14px;padding:13px;">
         <div style="font-size:10.5px;font-weight:700;color:#697061;letter-spacing:.4px;margin-bottom:7px;">TL;DR</div>
diff --git a/scripts/guide_clause_persist.py b/scripts/guide_clause_persist.py
new file mode 100644
index 0000000..5b61092
--- /dev/null
+++ b/scripts/guide_clause_persist.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""Technical-guide (KOSHA guide) clause-KB persist: split a guide into numbered top-level sections ('# 1. 목적' / '## 4.1' style headings) and bind them under the guide.
+Same clause-KB model as the ASME / law splitters (doc_kind='clause', parent_id=guide id; per the author's note: excluded from search, shared /book reader).
+Usage: python3 guide_clause_persist.py <id|all> [--commit]
+"""
+import asyncio, os, re, sys, hashlib, statistics
+
+CAP = 12000; PAGE_TOK = 11000  # sections above CAP estimated tokens are paginated into pages of roughly PAGE_TOK tokens
+EN, KO = 0.217, 0.529  # estimated tokens per non-Hangul character / per Hangul syllable (see tok)
+# Numbered-section heading: '# 1. 목 적', '## 4.1 누출...'  (1-3 digits per number component, which rules out 4-digit years)
+ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')
+TOP_RE = re.compile(r'^\d{1,3}$')  # a code with no dotted sub-number, i.e. a top-level section
+# References to external standards / regulations (mostly dangling): ASME B16.5 · KS B 1501 · rule article '제N조'
+EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9.]+|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')
+
+def tok(s):  # approximate token count: Hangul syllables weigh KO, every other character weighs EN
+    hangul = len([ch for ch in s if '가' <= ch <= '힣']); rest = len(s) - hangul; return int(rest*EN + hangul*KO)
+
+def build_sections(text):  # split markdown `text` into top-level numbered sections; returns a list of section dicts in document order
+    lines = text.split('\n'); off = []; a = 0
+    for ln in lines: off.append(a); a += len(ln) + 1  # off[i] = character offset of line i in text (+1 for the '\n')
+    bounds = []; seen = set()
+    for i, ln in enumerate(lines):
+        m = ART_RE.match(ln)
+        if not m: continue
+        code, name = m.group(1), m.group(2).strip()
+        if not TOP_RE.match(code): continue       # only top-level numbered sections act as boundaries
+        if code in seen: continue  # a repeated top-level number stays inside the section already open
+        if len(name) < 1: continue
+        seen.add(code); bounds.append((off[i], code, name))
+    out = []
+    for idx, (start, code, name) in enumerate(bounds):
+        end = bounds[idx+1][0] if idx+1 < len(bounds) else len(text)  # a section runs up to the next boundary or the end of the text
+        body = text[start:end].strip()
+        ext = sorted(set(EXT_RE.findall(body)))[:8]  # at most 8 distinct external references
+        out.append(dict(code=code, part='본문', order=0, title=f"{code}. {name}"[:120],
+                        body=body, tok=tok(body), links=[], ext=ext))
+    # paginate sections whose estimated tokens exceed CAP, and assign the running order
+    final, order = [], 0
+    for c in out:
+        if c['tok'] <= CAP:
+            final.append({**c, 'order': order}); order += 1; continue
+        pages, cur, ct = [], [], 0
+        for ln in c['body'].split('\n'):
+            lt = tok(ln)+1
+            if ct+lt > PAGE_TOK and cur: pages.append('\n'.join(cur)); cur=[ln]; ct=lt  # page full: flush and start a new page with this line
+            else: cur.append(ln); ct+=lt  # lines are never split, so a single oversized line still becomes one page
+        if cur: pages.append('\n'.join(cur))
+        for pi, pb in enumerate(pages):
+            final.append(dict(code=c['code'] if pi==0 else f"{c['code']}·p{pi+1}", part='본문',
+                              order=order, title=c['title'] if pi==0 else f"{c['title']} (p{pi+1})",
+                              body=pb, tok=tok(pb), links=[], ext=[]))  # paginated entries carry no ext list, including the first page
+            order += 1
+    return final
+
+async def process_one(conn, gid, commit, verbose=True):  # returns (status, section_count); status is 'notfound' | 'nullmd' | 'few' | 'dry' | 'committed'
+    row = await conn.fetchrow("SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
+    if not row: return ('notfound', 0)
+    if not row['md_content']: return ('nullmd', 0)
+    secs = build_sections(row['md_content'])
+    if len(secs) < 2: return ('few', len(secs))     # fewer than 2 sections = not a numbered-section document
+    toks = [c['tok'] for c in secs]
+    if verbose:
+        print(f"guide={gid} «{(row['title'] or '')[:40]}» 섹션={len(secs)} median={int(statistics.median(toks))} max={max(toks)}")
+        print("  샘플:", [c['title'][:26] for c in secs[:7]])
+    if not commit: return ('dry', len(secs))  # dry run: report only, no writes
+    async with conn.transaction():  # replace the guide's clause rows in one transaction: delete old links and rows, then re-insert
+        await conn.execute("DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", gid)
+        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
+        for c in secs:
+            fh = hashlib.sha256(f"{gid}:{c['code']}:{c['body']}".encode()).hexdigest()  # file_hash derived from parent id, section code and body
+            cid = await conn.fetchval("""
+                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
+                  clause_code,clause_part,clause_order,ai_domain,data_origin,
+                  md_status,review_status,conversion_status,preview_status)
+                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
+            """, fh, c['title'], c['body'], gid, c['code'], c['part'], c['order'], row['ai_domain'], row['data_origin'] or 'external')  # ai_domain / data_origin are inherited from the parent guide
+            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING", cid)
+        n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
+        print(f"  COMMITTED: {n} 섹션 for guide {gid}")
+    return ('committed', len(secs))
+
+async def main():  # CLI entry point: argv[1] is a guide document id or 'all'; '--commit' anywhere in argv enables writes
+    import asyncpg
+    arg = sys.argv[1]; commit = '--commit' in sys.argv
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    try:  # close the connection on every exit path; it used to leak when process_one raised
+        if arg == 'all':
+            gs = await conn.fetch("SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
+                                  "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
+            agg = {}; tot = 0
+            for i, r in enumerate(gs):
+                st, n = await process_one(conn, r['id'], commit, verbose=False)
+                agg[st] = agg.get(st, 0)+1; tot += n if st in ('dry','committed') else 0
+                if commit and (i+1) % 40 == 0: print(f"  …{i+1}/{len(gs)} (누적섹션 {tot})")
+            print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
+        else: await process_one(conn, int(arg), commit, verbose=True)
+    finally: await conn.close()
+
+if __name__ == "__main__": asyncio.run(main())  # guard so importing the module does not start a run
diff --git a/scripts/law_clause_persist.py b/scripts/law_clause_persist.py
new file mode 100644
index 0000000..a87fd6d
--- /dev/null
+++ b/scripts/law_clause_persist.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""Law article-KB persist: split a law into one document per article (조) + article-to-article backlinks + chapter (장) tags.
+Same model as the ASME clause-KB (doc_kind='clause', parent_id=law id; per the author's note: embedding NULL, excluded from search).
+Trims law-extraction noise (repeated '###' metadata headings before an article). Usage: python3 law_clause_persist.py <law_id|all> [--commit]
+"""
+import asyncio, os, re, sys, hashlib, statistics
+
+CAP = 12000; PAGE_TOK = 11000  # articles above CAP estimated tokens are paginated into pages of roughly PAGE_TOK tokens
+EN, KO = 0.217, 0.529  # estimated tokens per non-Hangul character / per Hangul syllable (see tok)
+# Article heading: '### 제3조의2(가스안전관리...) body text' — the leading '#' marks are optional in this pattern
+ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')
+CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$')         # chapter (장) = part
+# Article mentions used as backlinks, presumed to refer to the same law
+MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')
+# References to other laws: 「law name」 ... 제N조
+EXTLAW_RE = re.compile(r'「([^」]+)」')
+
+def tok(s):  # approximate token count: Hangul syllables weigh KO, every other character weighs EN
+    hangul = len([ch for ch in s if '가' <= ch <= '힣']); rest = len(s) - hangul; return int(rest*EN + hangul*KO)
+def art_code(c): return c  # identity: the article code string (e.g. '제3조의2') is returned unchanged
+
+def build_articles(text):  # split law `text` into article dicts (code, part, title, body, tok, links, ext) in document order
+    lines = text.split('\n'); off = []; a = 0
+    for ln in lines: off.append(a); a += len(ln) + 1  # character offsets per line; articles below are sliced by line index instead
+    arts = []      # (line_idx, code, name, part)
+    cur_part = None  # most recent chapter heading seen; None until the first one
+    for i, ln in enumerate(lines):
+        ch = CHAP_RE.match(ln)
+        if ch and not ART_RE.match(ln):
+            cur_part = (ch.group(1) + (' ' + ch.group(2).strip() if ch.group(2).strip() else '')).strip()
+            continue
+        m = ART_RE.match(ln)
+        if m:
+            arts.append((i, m.group(1), m.group(2).strip(), cur_part))  # no de-duplication: a repeated article code yields a second entry
+    # slice each article body, then trim the metadata noise that precedes the next article
+    out = []
+    for idx, (li, code, name, part) in enumerate(arts):
+        end_li = arts[idx+1][0] if idx+1 < len(arts) else len(lines)
+        body_lines = lines[li:end_li]
+        # trim from the end: drop trailing '### {short meta}' heading lines (article number / '조문' / date / title), never ones starting with '[' or '제'
+        while len(body_lines) > 1:
+            last = body_lines[-1].strip()
+            if last == '':
+                body_lines.pop(); continue
+            mh = re.match(r'^#{1,6}\s+(.*)$', last)
+            if mh:
+                c = mh.group(1).strip()
+                if not c.startswith('[') and not c.startswith('제') and (
+                        c in ('조문', 'N') or re.fullmatch(r'\d+', c) or re.fullmatch(r'\d{8}', c) or len(c) <= 30):
+                    body_lines.pop(); continue
+            break  # first trailing line that is not blank / short-meta heading ends the trim
+        body = '\n'.join(body_lines).strip()
+        links = sorted(set(MENTION_RE.findall(body)) - {code})  # distinct article mentions, excluding the article's own code
+        ext = sorted(set(EXTLAW_RE.findall(body)))[:6]  # at most 6 distinct 「law name」 references
+        out.append(dict(code=code, part=part or '본칙', order=0,
+                        title=f"{code}({name})" if name else code,
+                        body=body, tok=tok(body), links=links, ext=ext))
+    # paginate over-CAP articles and assign the running order
+    final, order = [], 0
+    for c in out:
+        if c['tok'] <= CAP:
+            final.append({**c, 'order': order}); order += 1; continue
+        # split on line boundaries into pages of roughly PAGE_TOK (11K) estimated tokens
+        pages, cur, ct = [], [], 0
+        for ln in c['body'].split('\n'):
+            lt = tok(ln)+1
+            if ct+lt > PAGE_TOK and cur: pages.append('\n'.join(cur)); cur=[ln]; ct=lt
+            else: cur.append(ln); ct+=lt
+        if cur: pages.append('\n'.join(cur))
+        for pi, pb in enumerate(pages):
+            final.append(dict(code=c['code'] if pi==0 else f"{c['code']}·p{pi+1}", part=c['part'],
+                              order=order, title=c['title'] if pi==0 else f"{c['title']} (p{pi+1}/{len(pages)})",
+                              body=pb, tok=tok(pb), links=c['links'] if pi==0 else [], ext=[]))  # only the first page keeps the article's links
+            order += 1
+    return final
+
+async def process_one(conn, law, commit, verbose=True):
+    """Split law `law` into article rows plus backlinks; returns (status, article_count, link_count) with status 'notfound' | 'nullmd' | 'noart' | 'dry' | 'committed'."""
+    row = await conn.fetchrow("SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin FROM documents WHERE id=$1", law)
+    if not row: return ('notfound', 0, 0)
+    if not row['md_content']: return ('nullmd', 0, 0)
+    arts = build_articles(row['md_content'])
+    if not arts: return ('noart', 0, 0)
+    toks = [c['tok'] for c in arts]
+    nlink = sum(len(c['links']) for c in arts)
+    if verbose:
+        parts = {}
+        for c in arts: parts[c['part']] = parts.get(c['part'], 0)+1
+        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
+              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
+        print("  샘플:", [c['title'][:22] for c in arts[:6]])
+    if not commit: return ('dry', len(arts), nlink)  # dry run: report only, no writes
+    async with conn.transaction():  # replace the law's clause rows in one transaction: delete old links and rows, then re-insert
+        await conn.execute(
+            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
+        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
+        code2id = {}; ids = []  # ids[i] is the row id of arts[i]; code2id resolves a mentioned code to its target row
+        for c in arts:
+            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
+            cid = await conn.fetchval("""
+                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
+                  clause_code,clause_part,clause_order,ai_domain,data_origin,
+                  md_status,review_status,conversion_status,preview_status)
+                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
+            """, fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
+                 row['ai_domain'], row['data_origin'] or 'external')
+            code2id.setdefault(c['code'], cid); ids.append(cid)
+            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING", cid, c['part'])
+        # article-to-article backlinks (within the same law; references to other laws stay dangling)
+        # Article codes can repeat (e.g. addenda restarting at 제1조): each row links from its own id, and a mention resolves to the first row with that code.
+        edges = []
+        for c, src in zip(arts, ids):
+            for dst in c['links']:
+                edges.append((src, dst, code2id.get(dst), None, None, None))
+        if edges:
+            await conn.executemany(
+                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) VALUES ($1,$2,$3,$4,$5,$6)", edges)
+        n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
+        print(f"  COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
+    return ('committed', n, len(edges))
+
+
+async def main():  # CLI entry point: argv[1] is a law document id or 'all'; '--commit' anywhere in argv enables writes
+    import asyncpg
+    arg = sys.argv[1]; commit = '--commit' in sys.argv
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    try:  # close the connection on every exit path; it used to leak when process_one raised
+        if arg == 'all':
+            laws = await conn.fetch("SELECT lm.document_id AS id FROM legal_meta lm "
+                                    "JOIN documents d ON d.id=lm.document_id "
+                                    "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
+                                    "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
+                                    "ORDER BY lm.document_id")
+            agg = {}; tot_art = tot_link = 0; zero = []  # zero collects law ids in which no article heading was found
+            for i, r in enumerate(laws):
+                st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
+                agg[st] = agg.get(st, 0) + 1
+                tot_art += na; tot_link += nl
+                if st == 'noart': zero.append(r['id'])
+                if commit and (i + 1) % 30 == 0: print(f"  …{i+1}/{len(laws)} (누적 조 {tot_art})")
+            print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
+            if zero: print(f"  0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")
+        else: await process_one(conn, int(arg), commit, verbose=True)
+    finally: await conn.close()
+
+if __name__ == "__main__": asyncio.run(main())  # guard so importing the module does not start a run
diff --git a/scripts/paper_citation_analyze.py b/scripts/paper_citation_analyze.py
new file mode 100644
index 0000000..c843de1
--- /dev/null
+++ b/scripts/paper_citation_analyze.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Read-only feasibility check for a paper citation graph — estimates in-corpus citation edges from DOIs found in the text.
+own_doi = first DOI in the header (first 2500 chars) / cited = DOIs from the first References-like heading onward (none when no such heading is found). owner map -> edges.
+"""
+import asyncio, os, re, sys
+
+DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')  # DOI-shaped token; stops at whitespace and common closing punctuation
+REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)  # case-insensitive marker for the start of the reference list
+
+def norm(d): return d.rstrip('.').lower()  # normalise a DOI for matching: drop trailing dots, lowercase
+
+async def main():  # read-only: builds the owner / cited maps in memory and prints summary counts
+    import asyncpg
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    rows = await conn.fetch("SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
+                            "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
+                            "AND coalesce(md_content, extracted_text) IS NOT NULL")
+    owner = {}        # doi -> paper id (a header DOI is treated as owned by that paper)
+    cited = {}        # paper id -> set(cited doi)
+    n_own = n_refsec = 0
+    for r in rows:
+        txt = r['txt']
+        head = txt[:2500]
+        hdois = [norm(d) for d in DOI_RE.findall(head)]
+        if hdois:
+            owner.setdefault(hdois[0], r['id']); n_own += 1  # first paper to claim a DOI keeps it; n_own counts every paper with a header DOI
+        m = REF_RE.search(txt)  # first occurrence of a reference-list marker anywhere in the text
+        body = txt[m.start():] if m else ''  # no marker -> no cited DOIs for this paper
+        if m: n_refsec += 1
+        cds = set(norm(d) for d in DOI_RE.findall(body))
+        if cds: cited[r['id']] = cds
+    # edges: citing paper -> owner of the cited DOI
+    edges = []
+    for pid, cds in cited.items():
+        for d in cds:
+            o = owner.get(d)
+            if o and o != pid: edges.append((pid, o, d))  # skip DOIs nobody in the corpus owns, and self-citations
+    cited_papers = set(e[0] for e in edges)  # despite the name: papers that cite (edge sources)
+    target_papers = set(e[1] for e in edges)  # papers that are cited (edge targets)
+    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
+    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(cited_papers)} 피인용논문={len(target_papers)}")
+    # most-cited papers (top 6)
+    from collections import Counter
+    top = Counter(e[1] for e in edges).most_common(6)
+    if top:
+        idmap = {r['id']: r['title'] for r in rows}
+        print("피인용 top:")
+        for pid, c in top: print(f"  {c}회 ← {(idmap.get(pid) or '')[:48]}")
+    await conn.close()
+
+asyncio.run(main())  # runs at import time: this file is a script, not a module
diff --git a/scripts/paper_openalex_test.py b/scripts/paper_openalex_test.py
new file mode 100644
index 0000000..c835391
--- /dev/null
+++ b/scripts/paper_openalex_test.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""OpenAlex enrichment feasibility test — look up a handful of paper titles and inspect the match / metadata (external API)."""
+import asyncio, os, re
+
+async def main():
+    """Print the top OpenAlex search hit for up to 6 papers (lowest ids, title longer than 15 chars); request failures are printed as ERROR lines."""
+    import asyncpg, httpx
+    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
+    rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
+                            "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
+                            "AND length(title) > 15 ORDER BY id LIMIT 6")
+    async with httpx.AsyncClient(timeout=20) as client:
+        for r in rows:
+            title = re.sub(r'\s+', ' ', r['title']).strip()
+            try:
+                resp = await client.get("https://api.openalex.org/works",
+                    params={"search": title[:200], "per_page": 1, "mailto": "hyun49196@gmail.com"})
+                resp.raise_for_status()  # an error response has no "results"; without this it was misreported as NO MATCH
+                res = resp.json().get("results") or []
+                if not res:
+                    print(f"[{r['id']}] NO MATCH | {title[:50]}"); continue
+                w = res[0]; oid = (w.get("id") or "").split("/")[-1]  # OpenAlex id = last path segment of the work URL
+                print(f"[{r['id']}] {title[:46]}")
+                print(f"   → OA {oid} | {(w.get('title') or '')[:46]} | {w.get('publication_year')} | "
+                      f"cited_by={w.get('cited_by_count')} | refs={len(w.get('referenced_works') or [])} | doi={w.get('doi')}")
+            except Exception as e:  # best-effort probe: report the failure for this title and move on
+                print(f"[{r['id']}] ERROR {type(e).__name__}: {e}")
+    await conn.close()
+
+asyncio.run(main())