Files

52 lines
2.2 KiB
Python

#!/usr/bin/env python3
"""논문 인용그래프 가능성 측정(read-only) — 본문 DOI로 코퍼스내 인용 엣지 추정.
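own_doi = first DOI in the header (first 2,500 characters) / cited = DOIs found from the first references keyword onward (a paper with no such keyword contributes no cited DOIs). owner map (DOI -> paper id) -> edges.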
"""
import asyncio, os, re, sys
# Matches a DOI-like token: "10.", a 4-9 digit prefix, "/", then a suffix that
# runs until whitespace or one of the characters " < > ) ] } , ;
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
# Case-insensitive match for a references-section keyword (English variants or
# the Korean "참고문헌"). It matches the keyword anywhere in the text, not only
# when it appears as a heading.
REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)
def norm(d):
    """Return DOI string *d* with trailing periods removed, lowercased."""
    without_trailing_dots = d.rstrip('.')
    return without_trailing_dots.lower()
def _scan_papers(rows):
    """Extract each paper's own DOI and the DOIs found in its references part.

    Args:
        rows: Iterable of mappings providing ``id`` and ``txt`` keys.

    Returns:
        A tuple ``(owner, cited, n_own, n_refsec)``:
        ``owner`` maps a normalised DOI to the id of the first paper whose
        header contained it; ``cited`` maps a paper id to the set of
        normalised DOIs found from the first references keyword onward;
        ``n_own`` counts papers with at least one header DOI; ``n_refsec``
        counts papers in which a references keyword was found.
    """
    owner = {}
    cited = {}
    n_own = n_refsec = 0
    for row in rows:
        txt = row['txt']
        # Only the first 2500 characters are treated as the header.
        header_dois = DOI_RE.findall(txt[:2500])
        if header_dois:
            # The first header DOI is taken as the paper's own DOI. When two
            # papers share one, the first paper seen keeps ownership, but
            # both are still counted in n_own.
            owner.setdefault(norm(header_dois[0]), row['id'])
            n_own += 1
        ref_match = REF_RE.search(txt)
        if ref_match is None:
            # No references keyword: this paper contributes no cited DOIs
            # (there is no whole-text fallback).
            continue
        n_refsec += 1
        cited_dois = {norm(d) for d in DOI_RE.findall(txt[ref_match.start():])}
        if cited_dois:
            cited[row['id']] = cited_dois
    return owner, cited, n_own, n_refsec


def _build_edges(owner, cited):
    """Return ``(citing_id, cited_id, doi)`` tuples for in-corpus citations."""
    edges = []
    for pid, dois in cited.items():
        for doi in dois:
            target = owner.get(doi)
            # Compare against None rather than truthiness so that a falsy id
            # (e.g. 0) is not dropped; self-citations are skipped.
            if target is not None and target != pid:
                edges.append((pid, target, doi))
    return edges


async def main():
    """Measure in-corpus citation edges from DOIs in paper texts and print stats.

    Reads paper rows from the database named by the ``DATABASE_URL``
    environment variable, derives a DOI -> paper ownership map and the cited
    DOIs per paper, and prints summary counts plus the six most-cited papers.
    Only a SELECT is issued.

    Raises:
        KeyError: If ``DATABASE_URL`` is not set.
    """
    import asyncpg
    from collections import Counter

    # Any '+asyncpg' substring is removed from the URL before connecting
    # (presumably the variable holds a SQLAlchemy-style URL - TODO confirm).
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch(
            "SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
            "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
            "AND coalesce(md_content, extracted_text) IS NOT NULL"
        )
    finally:
        # Close the connection even when the query fails; the fetched rows
        # are already in memory and are processed after closing.
        await conn.close()

    owner, cited, n_own, n_refsec = _scan_papers(rows)
    edges = _build_edges(owner, cited)

    citing_papers = {src for src, _, _ in edges}
    target_papers = {dst for _, dst, _ in edges}
    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(citing_papers)} 피인용논문={len(target_papers)}")

    # Most-cited papers within the corpus.
    top = Counter(dst for _, dst, _ in edges).most_common(6)
    if top:
        idmap = {r['id']: r['title'] for r in rows}
        print("피인용 top:")
        for pid, c in top:
            print(f" {c}회 ← {(idmap.get(pid) or '')[:48]}")
if __name__ == "__main__":
    # Run the measurement only when executed as a script, so importing this
    # module does not open a database connection.
    asyncio.run(main())