52 lines
2.2 KiB
Python
52 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""논문 인용그래프 가능성 측정(read-only) — 본문 DOI로 코퍼스내 인용 엣지 추정.
|
|
own_doi = first DOI in the header (first 2500 chars) / cited = DOIs from the first References-heading match onward (no match → no cited DOIs). owner map → edges.
|
|
"""
|
|
import asyncio, os, re, sys
|
|
|
|
# Matches a DOI-shaped token: "10.", then 4-9 digits, then "/", then one or
# more characters that are not whitespace or any of  " < > ) ] } , ;
# A trailing "." is not excluded by the character class, so a match may end
# with one.
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
|
|
# Finds a bibliography marker word, case-insensitively: "references",
# the Korean "참고문헌", "bibliography", or "reference list" (with any amount
# of whitespace, including none, between the two words).
# The pattern is unanchored, so it also matches these words in running text.
REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)
|
|
|
|
def norm(d):
    """Return DOI string *d* with trailing '.' characters removed, lower-cased."""
    without_trailing_dots = d.rstrip('.')
    return without_trailing_dots.lower()
|
|
|
|
async def main():
    """Estimate in-corpus citation edges from DOIs in paper text and print a summary.

    Fetches every ``documents`` row with ``material_type='paper'``,
    ``doc_kind='standard'`` and ``deleted_at IS NULL`` that has text
    (``md_content``, falling back to ``extracted_text``), then:

    1. takes the first DOI found in the first 2500 characters as the paper's
       own DOI and records it in an owner map (the first paper to claim a
       DOI keeps it);
    2. collects the DOIs found from the first ``REF_RE`` match to the end of
       the text as that paper's cited DOIs (no match means no cited DOIs);
    3. records an edge ``(citing id, cited id, doi)`` whenever a cited DOI is
       owned by a different paper.

    Only a SELECT is issued; the counts and the six most-cited papers are
    printed to stdout. Nothing is returned.

    Raises:
        KeyError: if the ``DATABASE_URL`` environment variable is not set.
    """
    import asyncpg
    from collections import Counter

    # The URL is passed to asyncpg with any '+asyncpg' marker removed
    # (presumably a SQLAlchemy-style 'postgresql+asyncpg://' DSN -- confirm).
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch(
            "SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
            "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
            "AND coalesce(md_content, extracted_text) IS NOT NULL")
    finally:
        # The connection is only needed for this one query; close it even
        # when the query raises so it is never leaked.
        await conn.close()

    owner = {}  # doi -> paper id (a header DOI is treated as owned by that paper)
    cited = {}  # paper id -> set of cited DOIs
    n_own = n_refsec = 0
    for r in rows:
        txt = r['txt']

        # Own DOI: first DOI in the header region.
        hdois = [norm(d) for d in DOI_RE.findall(txt[:2500])]
        if hdois:
            # setdefault: an earlier paper that already claimed this DOI keeps it.
            owner.setdefault(hdois[0], r['id'])
            n_own += 1

        # Cited DOIs: everything from the first references marker onward.
        m = REF_RE.search(txt)
        if m is None:
            continue
        n_refsec += 1
        cds = {norm(d) for d in DOI_RE.findall(txt[m.start():])}
        if cds:
            cited[r['id']] = cds

    # Edges: citing paper -> paper that owns the cited DOI.
    edges = []
    for pid, cds in cited.items():
        for d in cds:
            o = owner.get(d)
            # Compare against None rather than relying on truthiness so a
            # paper whose id is falsy (e.g. 0) can still be a target; skip
            # self-citations.
            if o is not None and o != pid:
                edges.append((pid, o, d))

    citing_papers = {e[0] for e in edges}
    target_papers = {e[1] for e in edges}
    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(citing_papers)} 피인용논문={len(target_papers)}")

    # Most-cited papers (top 6 by incoming edge count).
    top = Counter(e[1] for e in edges).most_common(6)
    if top:
        idmap = {r['id']: r['title'] for r in rows}
        print("피인용 top:")
        for pid, c in top:
            # A missing/None title prints as empty; titles are cut to 48 chars.
            print(f" {c}회 ← {(idmap.get(pid) or '')[:48]}")
|
|
|
|
# Run the measurement only when this file is executed as a script, so that
# importing the module does not open a database connection.
if __name__ == "__main__":
    asyncio.run(main())
|