#!/usr/bin/env python3
"""Measure how feasible an in-corpus paper citation graph is (read-only).

Citation edges between papers in the corpus are estimated purely from DOIs
found in each paper's text:

* own DOI  = the first DOI found in the header (the first 2500 characters).
* cited    = every DOI found from the first References-style heading onward.
  Papers without such a heading contribute no cited DOIs.
  NOTE(review): a full-text fallback for papers that lack a References
  heading is not implemented -- confirm whether one is wanted.
* owner map (DOI -> paper id) joined against the cited DOIs gives the edges.

The script only issues a SELECT and prints summary statistics.
"""
import asyncio
import os
import re
import sys
from collections import Counter

# A DOI: "10.", a 4-9 digit registrant code, "/", then a suffix that runs up
# to whitespace or one of the delimiter characters excluded below.
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
# First occurrence of a bibliography-style heading word (case-insensitive).
REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)

# Number of leading characters treated as the paper's header.
HEAD_CHARS = 2500


def norm(d):
    """Return DOI *d* with trailing periods stripped and lower-cased."""
    return d.rstrip('.').lower()


def scan_paper(txt, head_chars=HEAD_CHARS):
    """Extract DOI information from the text of a single paper.

    Args:
        txt: Full text of the paper.
        head_chars: How many leading characters count as the header.

    Returns:
        A tuple ``(own_doi, has_refsec, cited)`` where ``own_doi`` is the
        first normalized DOI in the header (``None`` if there is none),
        ``has_refsec`` tells whether ``REF_RE`` matched anywhere in the text,
        and ``cited`` is the set of normalized DOIs found from that match
        onward (empty when there is no match).
    """
    first = DOI_RE.search(txt[:head_chars])
    own_doi = norm(first.group()) if first else None

    ref = REF_RE.search(txt)
    if ref is None:
        return own_doi, False, set()
    cited = {norm(d) for d in DOI_RE.findall(txt[ref.start():])}
    return own_doi, True, cited


def build_edges(owner, cited):
    """Join cited DOIs against the owner map to produce citation edges.

    Args:
        owner: Mapping of DOI -> id of the paper that owns it.
        cited: Mapping of paper id -> set of DOIs that paper cites.

    Returns:
        A list of ``(citing_id, cited_id, doi)`` tuples. DOIs with no owner
        in the corpus and self-citations are skipped.
    """
    edges = []
    for pid, dois in cited.items():
        for doi in dois:
            target = owner.get(doi)
            # Compare against None explicitly so a falsy id (e.g. 0) still
            # counts as a valid owner.
            if target is not None and target != pid:
                edges.append((pid, target, doi))
    return edges


async def main():
    """Fetch paper texts, estimate in-corpus citation edges, print a summary.

    Reads the connection string from the ``DATABASE_URL`` environment
    variable (raises ``KeyError`` if it is unset).
    """
    # Imported here so the helpers above can be imported without asyncpg.
    import asyncpg

    # The '+asyncpg' substring is removed before connecting -- presumably a
    # SQLAlchemy-style driver suffix that asyncpg itself does not accept.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch(
            "SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
            "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
            "AND coalesce(md_content, extracted_text) IS NOT NULL")
    finally:
        # Always release the connection, even if the query fails.
        await conn.close()

    owner = {}   # doi -> paper id (a header DOI is taken to belong to that paper)
    cited = {}   # paper id -> set of cited DOIs
    n_own = n_refsec = 0
    for r in rows:
        own_doi, has_refsec, dois = scan_paper(r['txt'])
        if own_doi is not None:
            # First paper to claim a DOI keeps it.
            owner.setdefault(own_doi, r['id'])
            n_own += 1
        if has_refsec:
            n_refsec += 1
        if dois:
            cited[r['id']] = dois

    # Edge: citing paper -> paper that owns the cited DOI.
    edges = build_edges(owner, cited)
    cited_papers = {e[0] for e in edges}
    target_papers = {e[1] for e in edges}
    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(cited_papers)} 피인용논문={len(target_papers)}")

    # Most-cited papers.
    top = Counter(e[1] for e in edges).most_common(6)
    if top:
        idmap = {r['id']: r['title'] for r in rows}
        print("피인용 top:")
        for pid, c in top:
            print(f" {c}회 ← {(idmap.get(pid) or '')[:48]}")


if __name__ == "__main__":
    asyncio.run(main())