#!/usr/bin/env python3
"""Measure the OpenAlex high-confidence match rate.

The sample is up to 40 non-deleted ``paper`` / ``standard`` documents whose
text matches ``references`` or the Korean equivalent (i.e. papers that have a
References section and are therefore presumed academic). Each title is
searched on OpenAlex and compared with the top hit's title by token-set
Jaccard similarity.

Requires the ``DATABASE_URL`` environment variable and the third-party
``asyncpg`` and ``httpx`` packages (imported lazily inside ``main``).
"""
# Provenance: scripts/paper_openalex_match.py, added by commit
# 91ce54c1cd638b90b860b1462b337256551dde0d (hyungi, 2026-06-30).
# Commit subject (translated): "chore(paper): OpenAlex match-rate measurement
# script (conclusion: unsuitable for citation enrichment)".

import asyncio
import os
import re
import sys

# Compiled once; titles are lowercased before matching, so only [a-z0-9] runs
# count as tokens (non-ASCII text contributes no tokens).
_TOKEN_RE = re.compile(r'[a-z0-9]+')


def toks(s):
    """Return the set of lowercase ASCII alphanumeric tokens found in *s*.

    ``None`` and the empty string both yield an empty set.
    """
    return set(_TOKEN_RE.findall((s or '').lower()))


def sim(a, b):
    """Return the Jaccard similarity of the token sets of *a* and *b*.

    The result is in ``[0.0, 1.0]``; it is ``0.0`` when either side has no
    tokens (including ``None`` input).
    """
    ta, tb = toks(a), toks(b)
    if not ta or not tb:
        return 0.0
    return len(ta & tb) / len(ta | tb)


async def main():
    """Fetch the sample, query OpenAlex per title, and print match statistics.

    Buckets: similarity >= 0.6 is high confidence, >= 0.4 is medium, anything
    else (including empty results and failed lookups) is low / no match.

    Raises:
        KeyError: if ``DATABASE_URL`` is not set in the environment.
    """
    # Imported here so the pure helpers above stay importable without the
    # third-party packages installed.
    import asyncpg
    import httpx

    # asyncpg does not accept the SQLAlchemy-style "+asyncpg" driver suffix.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
                                "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
                                "AND coalesce(md_content,extracted_text) ~* 'references|참고문헌' "
                                "ORDER BY id LIMIT 40")
    finally:
        # Close even if the query fails, and do not hold the connection open
        # while the (slow) HTTP lookups run.
        await conn.close()

    hi = mid = lo = 0
    failures = 0
    hits = []
    async with httpx.AsyncClient(timeout=20) as client:
        for r in rows:
            title = re.sub(r'\s+', ' ', r['title']).strip()
            try:
                resp = await client.get("https://api.openalex.org/works",
                                        params={"search": title[:200], "per_page": 1, "mailto": "hyun49196@gmail.com"})
                # Surface 4xx/5xx (e.g. rate limiting) as a failure instead of
                # letting an error payload look like "no results".
                resp.raise_for_status()
                res = (resp.json().get("results") or [])
                top = res[0] if res else None
                s = sim(title, top.get("title")) if top is not None else 0.0
            except Exception as exc:
                # Best-effort measurement: one bad lookup must not abort the
                # run. It is still counted as low/no-match so the buckets sum
                # to the sample size, but it is reported rather than hidden.
                failures += 1
                lo += 1
                print(f"[warn] id={r['id']}: OpenAlex lookup failed: {exc!r}", file=sys.stderr)
                continue

            if top is None:
                lo += 1
            elif s >= 0.6:
                hi += 1
                hits.append((s, title[:40], (top.get('title') or '')[:40],
                             top.get('cited_by_count'), len(top.get('referenced_works') or [])))
            elif s >= 0.4:
                mid += 1
            else:
                lo += 1

    print(f"표본={len(rows)} 고신뢰(≥0.6)={hi} 중간(0.4~0.6)={mid} 저신뢰/무매치={lo}")
    print("고신뢰 매치 샘플:")
    for s, a, b, cb, rf in hits[:8]:
        print(f" sim={s:.2f} cited={cb} refs={rf} | {a} ≈ {b}")
    if failures:
        print(f"[warn] {failures} lookup(s) failed and were counted as low/no-match", file=sys.stderr)


if __name__ == "__main__":
    asyncio.run(main())