chore(paper): OpenAlex 매치율 측정 스크립트(결론=인용보강 부적합)
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
|
||||
"""OpenAlex 고신뢰 매치율 측정 — References 보유 논문(학술 추정) 표본."""
|
||||
import asyncio, os, re
|
||||
|
||||
def toks(s):
    """Split *s* into a set of lower-cased alphanumeric word tokens.

    Any Unicode letter or digit counts as a token character, so titles written
    in non-Latin scripts (for example Korean) produce tokens instead of an
    empty set. Punctuation, whitespace and underscores act as separators.

    Args:
        s: Text to tokenize; ``None`` and the empty string are accepted.

    Returns:
        The set of distinct tokens, empty when *s* has no letters or digits.
    """
    # ``[^\W_]`` means "a word character other than underscore". For ``str``
    # patterns ``\w`` is Unicode-aware, unlike the ASCII-only ``[a-z0-9]``.
    return set(re.findall(r'[^\W_]+', (s or '').lower()))
|
||||
def sim(a, b):
    """Return the Jaccard similarity of the token sets of *a* and *b*.

    Both inputs are tokenized with ``toks``. The score is the number of
    tokens the two sets share divided by the number of distinct tokens across
    both, or ``0.0`` when either input yields no tokens at all.
    """
    left = toks(a)
    right = toks(b)
    if left and right:
        shared = left.intersection(right)
        combined = left.union(right)
        return len(shared) / len(combined)
    return 0.0
|
||||
|
||||
async def main():
    """Sample papers from the database and report OpenAlex title-match rates.

    Connects to the database named by the ``DATABASE_URL`` environment
    variable (with any ``+asyncpg`` marker removed), selects up to 40 paper
    documents whose text matches ``references|참고문헌``, and searches OpenAlex
    for each whitespace-normalized title. The top search hit is compared with
    the stored title via ``sim`` and tallied as high (>= 0.6), mid (>= 0.4)
    or low/no-match. A summary line and up to eight high-confidence samples
    are printed to stdout.

    A lookup that fails (transport error, non-success HTTP status, unexpected
    payload) is still tallied as low/no-match so the buckets add up to the
    sample size, but each failure is also reported on stderr so it is not
    mistaken for a genuine miss.

    Raises:
        KeyError: If ``DATABASE_URL`` is not set.
    """
    import sys

    import asyncpg
    import httpx

    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
                                "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
                                "AND coalesce(md_content,extracted_text) ~* 'references|참고문헌' "
                                "ORDER BY id LIMIT 40")
        hi = mid = lo = errors = 0
        hits = []
        async with httpx.AsyncClient(timeout=20) as client:
            for r in rows:
                title = re.sub(r'\s+', ' ', r['title']).strip()
                try:
                    resp = await client.get(
                        "https://api.openalex.org/works",
                        params={"search": title[:200], "per_page": 1, "mailto": "hyun49196@gmail.com"},
                    )
                    # Surface rate limiting / server errors as failures
                    # instead of parsing an error body as "no results".
                    resp.raise_for_status()
                    res = resp.json().get("results") or []
                    if not res:
                        lo += 1
                        continue
                    top = res[0]
                    s = sim(title, top.get("title"))
                    if s >= 0.6:
                        # Build and store the sample before bumping the
                        # counter, so a failure here cannot count the same
                        # document as both high and low.
                        hits.append((
                            s,
                            title[:40],
                            (top.get('title') or '')[:40],
                            top.get('cited_by_count'),
                            len(top.get('referenced_works') or []),
                        ))
                        hi += 1
                    elif s >= 0.4:
                        mid += 1
                    else:
                        lo += 1
                except Exception as exc:
                    # Deliberately best-effort: one bad lookup must not abort
                    # the measurement, but it should be visible.
                    errors += 1
                    lo += 1
                    print(f"lookup failed for document id={r['id']}: {exc!r}", file=sys.stderr)
        print(f"표본={len(rows)} 고신뢰(≥0.6)={hi} 중간(0.4~0.6)={mid} 저신뢰/무매치={lo}")
        print("고신뢰 매치 샘플:")
        for s, a, b, cb, rf in hits[:8]:
            print(f" sim={s:.2f} cited={cb} refs={rf} | {a} ≈ {b}")
        if errors:
            print(f"failed lookups counted as low/no-match: {errors}", file=sys.stderr)
    finally:
        # Release the connection even if the query or the HTTP loop raises.
        await conn.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the measurement only when executed as a script, so importing this
    # module (e.g. to reuse toks/sim) does not touch the database or network.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user