feat(docs): 관련 문서(유사도 KNN) 엔드포인트+패널 + 법령/지침 splitter

This commit is contained in:
hyungi
2026-06-30 06:10:11 +00:00
parent c44692fddc
commit a22b2c7647
7 changed files with 432 additions and 0 deletions
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""OpenAlex 보강 타당성 테스트 — 소수 논문 제목으로 매칭/메타 확인 (외부 API)."""
import asyncio, os, re
async def main():
    """Spot-check how well OpenAlex matches a handful of stored paper titles.

    Fetches up to six rows from ``documents`` (``material_type='paper'``,
    ``doc_kind='standard'``, not deleted, title longer than 15 characters),
    searches the OpenAlex ``/works`` endpoint with each whitespace-normalised
    title, and prints the top hit's OpenAlex id, title, publication year,
    citation count, number of referenced works and DOI.

    A failure for one title (network error, non-2xx status, bad JSON) is
    printed and the loop moves on to the next title.

    Raises:
        KeyError: if the ``DATABASE_URL`` environment variable is not set.
    """
    import asyncpg
    import httpx

    # asyncpg is given the URL with any "+asyncpg" driver suffix removed
    # (presumably DATABASE_URL is a SQLAlchemy-style URL — confirm).
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
                                "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
                                "AND length(title) > 15 ORDER BY id LIMIT 6")
        async with httpx.AsyncClient(timeout=20) as client:
            for r in rows:
                # Collapse runs of whitespace (including newlines) to single spaces.
                title = re.sub(r'\s+', ' ', r['title']).strip()
                try:
                    resp = await client.get(
                        "https://api.openalex.org/works",
                        params={"search": title[:200], "per_page": 1, "mailto": "hyun49196@gmail.com"},
                    )
                    # Without this, an error response carrying a JSON body but
                    # no "results" key would be misreported as "NO MATCH".
                    resp.raise_for_status()
                    js = resp.json()
                    res = js.get("results") or []
                    if not res:
                        print(f"[{r['id']}] NO MATCH | {title[:50]}")
                        continue
                    w = res[0]
                    # The work id is a URL; keep only its last path segment.
                    oid = (w.get("id") or "").split("/")[-1]
                    print(f"[{r['id']}] {title[:46]}")
                    print(f" → OA {oid} | {(w.get('title') or '')[:46]} | {w.get('publication_year')} | "
                          f"cited_by={w.get('cited_by_count')} | refs={len(w.get('referenced_works') or [])} | doi={w.get('doi')}")
                except Exception as e:
                    # Best-effort per title: report the failure and keep going.
                    print(f"[{r['id']}] ERROR {type(e).__name__}: {e}")
    finally:
        # Close the connection even if the query or the HTTP loop fails.
        await conn.close()
# Run the check only when executed as a script, so that importing this
# module does not open a database connection or call the external API.
if __name__ == "__main__":
    asyncio.run(main())