feat(docs): 관련 문서(유사도 KNN) 엔드포인트+패널 + 법령/지침 splitter
This commit is contained in:
@@ -1990,3 +1990,61 @@ async def get_document_backlinks(
|
|||||||
forward=[BacklinkRef(**dict(r)) for r in forward],
|
forward=[BacklinkRef(**dict(r)) for r in forward],
|
||||||
back=[BacklinkRef(**dict(r)) for r in back],
|
back=[BacklinkRef(**dict(r)) for r in back],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── 관련 문서 (유사도, on-demand pgvector KNN — 저부하·무저장) ───
|
||||||
|
class RelatedItem(BaseModel):
    """One entry of the "related documents" list returned by the /related endpoint.

    Every field except ``id`` is optional and defaults to ``None``.
    """

    # Primary key of the related document.
    id: int
    # Document title.
    title: str | None = None
    # Value of the document's ``ai_domain`` column.
    ai_domain: str | None = None
    # Value of the document's ``material_type`` column.
    material_type: str | None = None
    # Filled from the ``facet_year`` column by get_related_documents.
    year: int | None = None
    # Similarity to the source document (1 - pgvector ``<=>`` distance, rounded to 3 decimals).
    sim: float | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class RelatedResponse(BaseModel):
    """Response body of ``GET /{doc_id}/related``."""

    # Id of the document the neighbours were computed for.
    doc_id: int
    # Nearest neighbours, in the order returned by the query (closest first).
    related: list[RelatedItem]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{doc_id}/related", response_model=RelatedResponse)
async def get_related_documents(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    limit: int = 8,
    same_type: bool = True,
):
    """Return the documents whose document-level embedding is closest to *doc_id*.

    The neighbours are computed on demand with a pgvector KNN query; nothing is
    stored or batch-computed. This is the alternative linking layer for corpora
    where a citation graph does not fit (industry articles with no citation
    network).

    Args:
        doc_id: Source document id.
        user: Authenticated user (dependency; not otherwise used here).
        session: Async database session.
        limit: Requested number of neighbours; clamped to the range 1..30.
        same_type: When true, only documents with the same ``material_type`` as
            the source are considered; when false, the whole corpus is.

    Returns:
        RelatedResponse with up to ``limit`` neighbours. Clause documents
        (``doc_kind='clause'``), soft-deleted documents, documents without an
        embedding and the source itself are excluded. The list is empty when
        the source document does not exist or has no embedding.
    """
    from sqlalchemy import text as sql_text

    lim = min(max(limit, 1), 30)
    # Only a fixed literal is ever interpolated into the SQL text; user-supplied
    # values go through bind parameters.
    type_clause = ""
    if same_type:
        type_clause = "AND d.material_type = src.material_type"

    # The source embedding is read through a scalar subquery in both the
    # similarity expression and the ORDER BY.
    statement = sql_text(
        f"""
        WITH src AS (
        SELECT embedding, material_type FROM documents WHERE id = :id
        )
        SELECT d.id, d.title, d.ai_domain, d.material_type, d.facet_year AS year,
        round((1 - (d.embedding <=> (SELECT embedding FROM src)))::numeric, 3) AS sim
        FROM documents d, src
        WHERE d.doc_kind = 'standard' AND d.deleted_at IS NULL
        AND d.id <> :id AND d.embedding IS NOT NULL
        AND (SELECT embedding FROM src) IS NOT NULL
        {type_clause}
        ORDER BY d.embedding <=> (SELECT embedding FROM src)
        LIMIT :lim
        """
    ).bindparams(id=doc_id, lim=lim)

    result = await session.execute(statement)

    related = []
    for r in result.mappings().all():
        raw_sim = r["sim"]
        related.append(
            RelatedItem(
                id=r["id"],
                title=r["title"],
                ai_domain=r["ai_domain"],
                material_type=r["material_type"],
                year=r["year"],
                # The driver returns a NUMERIC; the response model wants a float.
                sim=None if raw_sim is None else float(raw_sim),
            )
        )
    return RelatedResponse(doc_id=doc_id, related=related)
|
||||||
|
|||||||
@@ -0,0 +1,45 @@
|
|||||||
|
<script>
  // Related documents (similarity) — document-level embedding KNN.
  // Self-contained: receives a document id and queries /documents/{id}/related.
  import { api } from '$lib/api';

  let { documentId } = $props();
  let items = $state([]);
  let loaded = $state(false);

  // material_type -> short badge label.
  const KIND = { law: '법령', guide: '지침', paper: '논문', standard: '표준', incident: '사례' };

  // Re-fetch whenever documentId changes. The links rendered below navigate to
  // another document on the same route, where the component instance can be
  // reused; a mount-only fetch would keep showing the previous document's list.
  $effect(() => {
    const id = documentId;
    let stale = false;

    items = [];
    loaded = false;
    if (id == null) {
      loaded = true;
      return;
    }

    api(`/documents/${id}/related?limit=6`)
      .then((r) => {
        // Ignore a response that arrives after documentId has changed again.
        if (!stale) items = r?.related ?? [];
      })
      .catch((e) => {
        // Best-effort panel: stay hidden on failure, but leave a trace for debugging.
        console.warn('RelatedDocs: failed to load related documents', e);
      })
      .finally(() => {
        if (!stale) loaded = true;
      });

    return () => {
      stale = true;
    };
  });
</script>

{#if items.length}
  <div class="rel">
    <div class="lab">관련 문서</div>
    {#each items as it (it.id)}
      <a class="ri" href={`/documents/${it.id}`}>
        <!-- title is nullable in the API response; fall back to the id so the row is never blank -->
        <span class="rt">{it.title || `#${it.id}`}</span>
        <span class="rm">
          {#if it.material_type && KIND[it.material_type]}<span class="kind">{KIND[it.material_type]}</span>{/if}
          <span class="rs">{Math.round((it.sim ?? 0) * 100)}</span>
        </span>
      </a>
    {/each}
  </div>
{/if}

<style>
  .rel { background: var(--surface); border: 1px solid var(--border); border-radius: 14px; padding: 13px; }
  .lab { font-size: 10.5px; font-weight: 700; color: var(--text-dim); letter-spacing: .4px; margin-bottom: 8px; }
  .ri { display: flex; align-items: baseline; gap: 8px; padding: 5px 6px; border-radius: 7px; text-decoration: none; }
  .ri:hover { background: var(--surface-hover, #ecf0e8); }
  .rt { flex: 1; font-size: 12px; line-height: 1.4; color: var(--text); overflow: hidden; display: -webkit-box; -webkit-line-clamp: 2; -webkit-box-orient: vertical; }
  .rm { flex-shrink: 0; display: flex; align-items: center; gap: 5px; }
  .kind { font-size: 9px; font-weight: 700; color: var(--accent-hover, #3d7256); background: #e3efe2; border: 1px solid #cfe3cd; border-radius: 4px; padding: 0 4px; }
  .rs { font-size: 10.5px; font-family: ui-monospace, Menlo, monospace; color: var(--faint, #9aa090); }
</style>
|
||||||
@@ -16,6 +16,7 @@
|
|||||||
import Skeleton from '$lib/components/ui/Skeleton.svelte';
|
import Skeleton from '$lib/components/ui/Skeleton.svelte';
|
||||||
import HandwriteCanvas from '$lib/components/HandwriteCanvas.svelte';
|
import HandwriteCanvas from '$lib/components/HandwriteCanvas.svelte';
|
||||||
import MarkdownDoc from '$lib/components/MarkdownDoc.svelte';
|
import MarkdownDoc from '$lib/components/MarkdownDoc.svelte';
|
||||||
|
import RelatedDocs from '$lib/components/RelatedDocs.svelte';
|
||||||
import { renderDocMarkdown } from '$lib/utils/docMarkdown';
|
import { renderDocMarkdown } from '$lib/utils/docMarkdown';
|
||||||
import MarkdownStatusBadge from '$lib/components/MarkdownStatusBadge.svelte';
|
import MarkdownStatusBadge from '$lib/components/MarkdownStatusBadge.svelte';
|
||||||
import NoteEditor from '$lib/components/editors/NoteEditor.svelte';
|
import NoteEditor from '$lib/components/editors/NoteEditor.svelte';
|
||||||
@@ -321,6 +322,7 @@
|
|||||||
<!-- ════ 우 슬림 레일 (시안 카드 스타일) ════ -->
|
<!-- ════ 우 슬림 레일 (시안 카드 스타일) ════ -->
|
||||||
{#snippet rail()}
|
{#snippet rail()}
|
||||||
<div style="display:flex;flex-direction:column;gap:11px;font-size:14px;">
|
<div style="display:flex;flex-direction:column;gap:11px;font-size:14px;">
|
||||||
|
<RelatedDocs documentId={doc.id} />
|
||||||
{#if doc.ai_tldr || doc.ai_summary}
|
{#if doc.ai_tldr || doc.ai_summary}
|
||||||
<div style="background:#f4f7f1;border:1px solid #dde3d6;border-radius:14px;padding:13px;">
|
<div style="background:#f4f7f1;border:1px solid #dde3d6;border-radius:14px;padding:13px;">
|
||||||
<div style="font-size:10.5px;font-weight:700;color:#697061;letter-spacing:.4px;margin-bottom:7px;">TL;DR</div>
|
<div style="font-size:10.5px;font-weight:700;color:#697061;letter-spacing:.4px;margin-bottom:7px;">TL;DR</div>
|
||||||
|
|||||||
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""기술지침(KOSHA guide) 절-KB persist: 번호섹션(# 1. 목적 / ## 4.1) 단위 분해 + 제본.
|
||||||
|
ASME/법령과 동일 clause-KB 모델(doc_kind='clause', parent_id=지침, 검색제외, /book 리더 공용).
|
||||||
|
Usage: python3 guide_clause_persist.py <id|all> [--commit]
|
||||||
|
"""
|
||||||
|
import asyncio, os, re, sys, hashlib, statistics
|
||||||
|
|
||||||
|
# A section whose estimated size exceeds CAP tokens is split into pages of
# roughly PAGE_TOK tokens each.
CAP = 12000
PAGE_TOK = 11000
# Estimated tokens per character: EN for non-Hangul characters, KO for Hangul syllables.
EN = 0.217
KO = 0.529
# Numbered section header: '# 1. 목 적', '## 4.1 누출...'. Each number component
# is 1-3 digits so that a 4-digit year is not taken for a section number.
ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')
# A code consisting of a single number component, i.e. a top-level section.
TOP_RE = re.compile(r'^\d{1,3}$')
# References to external standards / regulations (mostly dangling):
# ASME B16.5, KS B 1501, ISO 9001, 제N조.
EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9.]+|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')


def tok(s):
    """Estimate the token count of *s* from its character mix.

    Hangul syllables (U+AC00..U+D7A3) are weighted by ``KO`` and every other
    character by ``EN``; the sum is truncated to an int.
    """
    hangul = 0
    for ch in s:
        if '가' <= ch <= '힣':
            hangul += 1
    return int((len(s) - hangul) * EN + hangul * KO)
|
||||||
|
|
||||||
|
def build_sections(text):
    """Split guide markdown into top-level numbered sections, paginating oversized ones.

    A section starts at the first header line for each top-level number
    ('# 1. ...'); sub-section headers ('## 4.1 ...') and repeated numbers do not
    start a new section. Text before the first such header is not part of any
    section.

    Returns:
        A list of dicts with keys ``code``, ``part``, ``order``, ``title``,
        ``body``, ``tok``, ``links`` and ``ext``, in document order. A section
        whose estimated size exceeds ``CAP`` is emitted as several pages; pages
        after the first get a ``·pN`` code suffix and a ``(pN)`` title suffix.
    """
    lines = text.split('\n')

    # Character offset at which every line starts (each '\n' counts as one character).
    offsets = []
    cursor = 0
    for line in lines:
        offsets.append(cursor)
        cursor += len(line) + 1

    # Section boundaries: the first header seen for each top-level number.
    boundaries = []
    seen_codes = set()
    for line_no, line in enumerate(lines):
        header = ART_RE.match(line)
        if header is None:
            continue
        code = header.group(1)
        name = header.group(2).strip()
        if TOP_RE.match(code) is None or code in seen_codes or not name:
            continue
        seen_codes.add(code)
        boundaries.append((offsets[line_no], code, name))

    # Each section runs from its header to the next boundary (or the end of the text).
    stops = [start for start, _, _ in boundaries[1:]] + [len(text)]
    sections = []
    for (start, code, name), stop in zip(boundaries, stops):
        body = text[start:stop].strip()
        sections.append({
            'code': code,
            'part': '본문',
            'order': 0,
            'title': f"{code}. {name}"[:120],
            'body': body,
            'tok': tok(body),
            'links': [],
            'ext': sorted(set(EXT_RE.findall(body)))[:8],
        })

    # Assign running order numbers, paginating sections above CAP line by line.
    # The order number always equals the number of entries emitted so far.
    result = []
    for sec in sections:
        if sec['tok'] <= CAP:
            result.append({**sec, 'order': len(result)})
            continue

        pages, chunk, chunk_tok = [], [], 0
        for line in sec['body'].split('\n'):
            cost = tok(line) + 1
            if chunk and chunk_tok + cost > PAGE_TOK:
                pages.append('\n'.join(chunk))
                chunk, chunk_tok = [line], cost
            else:
                chunk.append(line)
                chunk_tok += cost
        if chunk:
            pages.append('\n'.join(chunk))

        for page_no, page in enumerate(pages, start=1):
            first = page_no == 1
            result.append({
                'code': sec['code'] if first else f"{sec['code']}·p{page_no}",
                'part': '본문',
                'order': len(result),
                'title': sec['title'] if first else f"{sec['title']} (p{page_no})",
                'body': page,
                'tok': tok(page),
                'links': [],
                'ext': [],
            })
    return result
|
||||||
|
|
||||||
|
async def process_one(conn, gid, commit, verbose=True):
    """Split one guide document into clause documents and optionally persist them.

    Args:
        conn: asyncpg connection.
        gid: Id of the guide document in ``documents``.
        commit: When false, only analyse (dry run); when true, replace the
            guide's existing clause documents inside one transaction.
        verbose: Print a per-guide summary and sample titles.

    Returns:
        ``(status, section_count)`` where status is 'notfound', 'nullmd',
        'few', 'dry' or 'committed'.
    """
    row = await conn.fetchrow("SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
    if not row:
        return ('notfound', 0)
    markdown = row['md_content']
    if not markdown:
        return ('nullmd', 0)

    secs = build_sections(markdown)
    total = len(secs)
    if total < 2:
        # Fewer than two sections: the document has no numbered structure.
        return ('few', total)

    if verbose:
        sizes = [sec['tok'] for sec in secs]
        heading = (row['title'] or '')[:40]
        print(f"guide={gid} «{heading}» 섹션={total} median={int(statistics.median(sizes))} max={max(sizes)}")
        print(" 샘플:", [sec['title'][:26] for sec in secs[:7]])

    if not commit:
        return ('dry', total)

    origin = row['data_origin'] or 'external'
    async with conn.transaction():
        # Rebuild from scratch: drop links of the old clause documents, then the documents.
        await conn.execute("DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", gid)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
        for sec in secs:
            digest = hashlib.sha256(f"{gid}:{sec['code']}:{sec['body']}".encode()).hexdigest()
            cid = await conn.fetchval(
                """
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                clause_code,clause_part,clause_order,ai_domain,data_origin,
                md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
                """,
                digest, sec['title'], sec['body'], gid, sec['code'], sec['part'], sec['order'],
                row['ai_domain'], origin,
            )
            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING", cid)
        n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
        print(f" COMMITTED: {n} 섹션 for guide {gid}")
    return ('committed', total)
|
||||||
|
|
||||||
|
async def main():
    """Command-line entry point: process one guide id, or every guide with ``all``.

    Reads the target from ``sys.argv[1]`` and the optional ``--commit`` flag
    from anywhere in ``sys.argv``; connects using the ``DATABASE_URL``
    environment variable.

    Raises:
        SystemExit: With a usage message when the target argument is missing
            or is neither ``all`` nor an integer.
    """
    usage = "Usage: python3 guide_clause_persist.py <id|all> [--commit]"
    if len(sys.argv) < 2:
        raise SystemExit(usage)
    arg = sys.argv[1]
    commit = '--commit' in sys.argv
    gid = None
    if arg != 'all':
        try:
            gid = int(arg)
        except ValueError:
            raise SystemExit(usage) from None

    # Imported only after argument validation so a usage error does not need the driver.
    import asyncpg

    # asyncpg does not accept the SQLAlchemy-style '+asyncpg' scheme suffix.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if gid is not None:
            await process_one(conn, gid, commit, verbose=True)
            return

        gs = await conn.fetch("SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
                              "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
        agg = {}
        tot = 0
        for i, r in enumerate(gs):
            st, n = await process_one(conn, r['id'], commit, verbose=False)
            agg[st] = agg.get(st, 0) + 1
            # Only guides that were actually split count towards the section total.
            tot += n if st in ('dry', 'committed') else 0
            if commit and (i + 1) % 40 == 0:
                print(f" …{i+1}/{len(gs)} (누적섹션 {tot})")
        print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
    finally:
        # Release the connection even when a guide fails mid-batch.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -0,0 +1,146 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""법령 조-KB persist: 법령을 조(條) 단위 개별 문서로 분해 + 조↔조 백링크 + 장(章) 태그.
|
||||||
|
ASME clause-KB와 동일 모델(doc_kind='clause', parent_id=법령, embedding NULL, 검색제외).
|
||||||
|
법령 추출 노이즈(조 앞 ### 메타 반복) 트림. Usage: python3 law_clause_persist.py <law_id|all> [--commit]
|
||||||
|
"""
|
||||||
|
import asyncio, os, re, sys, hashlib, statistics
|
||||||
|
|
||||||
|
# An article whose estimated size exceeds CAP tokens is split into pages of
# roughly PAGE_TOK tokens each.
CAP = 12000
PAGE_TOK = 11000
# Estimated tokens per character: EN for non-Hangul characters, KO for Hangul syllables.
EN = 0.217
KO = 0.529
# Article header: '### 제3조의2(가스안전관리...) 본문'. The leading '#' marks are optional.
ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')
# Chapter header ('## 제2장 ...'); the chapter becomes the article's "part".
CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$')
# Mention of an article of the same law (used for backlinks).
MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')
# Reference to another law: 「law name」 ... 제N조.
EXTLAW_RE = re.compile(r'「([^」]+)」')


def tok(s):
    """Estimate the token count of *s* from its character mix.

    Hangul syllables (U+AC00..U+D7A3) are weighted by ``KO`` and every other
    character by ``EN``; the sum is truncated to an int.
    """
    hangul = 0
    for ch in s:
        if '가' <= ch <= '힣':
            hangul += 1
    return int((len(s) - hangul) * EN + hangul * KO)


def art_code(c):
    """Return the article code unchanged (e.g. '제3조의2')."""
    return c
|
||||||
|
|
||||||
|
def build_articles(text):
    """Split statute text into one entry per article (조), paginating oversized ones.

    An article starts at every line matching ``ART_RE`` and runs to the next
    article header. Chapter headers (``CHAP_RE``) seen before an article set its
    ``part``; articles before any chapter get '본칙'. Trailing blank lines and
    short '### meta' header lines left over from extraction are trimmed from the
    end of each article.

    Returns:
        A list of dicts with keys ``code``, ``part``, ``order``, ``title``,
        ``body``, ``tok``, ``links`` (other articles mentioned in the body) and
        ``ext`` (names of other laws quoted in 「」), in document order.
    """
    lines = text.split('\n')

    # Pass 1: locate article headers and remember the chapter each falls under.
    headers = []  # (line index, article code, article name, chapter label or None)
    chapter = None
    for line_no, line in enumerate(lines):
        art = ART_RE.match(line)
        if art is None:
            chap = CHAP_RE.match(line)
            if chap is not None:
                rest = chap.group(2).strip()
                chapter = (chap.group(1) + (' ' + rest if rest else '')).strip()
            continue
        headers.append((line_no, art.group(1), art.group(2).strip(), chapter))

    def is_trailing_noise(stripped):
        """True for a blank line or a short markdown header that is extraction metadata."""
        if stripped == '':
            return True
        heading = re.match(r'^#{1,6}\s+(.*)$', stripped)
        if heading is None:
            return False
        label = heading.group(1).strip()
        # Headers starting with '[' (amendment notes) or '제' are kept.
        if label.startswith('[') or label.startswith('제'):
            return False
        return bool(
            label in ('조문', 'N')
            or re.fullmatch(r'\d+', label)
            or re.fullmatch(r'\d{8}', label)
            or len(label) <= 30
        )

    # Pass 2: slice each article body and trim noise from its end (the header line always stays).
    stops = [line_no for line_no, _, _, _ in headers[1:]] + [len(lines)]
    articles = []
    for (start, code, name, part), stop in zip(headers, stops):
        keep = stop - start
        while keep > 1 and is_trailing_noise(lines[start + keep - 1].strip()):
            keep -= 1
        body = '\n'.join(lines[start:start + keep]).strip()
        articles.append({
            'code': code,
            'part': part or '본칙',
            'order': 0,
            'title': f"{code}({name})" if name else code,
            'body': body,
            'tok': tok(body),
            'links': sorted(set(MENTION_RE.findall(body)) - {code}),
            'ext': sorted(set(EXTLAW_RE.findall(body)))[:6],
        })

    # Pass 3: assign running order numbers, splitting articles above CAP line by
    # line. The order number always equals the number of entries emitted so far.
    result = []
    for art in articles:
        if art['tok'] <= CAP:
            result.append({**art, 'order': len(result)})
            continue

        pages, chunk, chunk_tok = [], [], 0
        for line in art['body'].split('\n'):
            cost = tok(line) + 1
            if chunk and chunk_tok + cost > PAGE_TOK:
                pages.append('\n'.join(chunk))
                chunk, chunk_tok = [line], cost
            else:
                chunk.append(line)
                chunk_tok += cost
        if chunk:
            pages.append('\n'.join(chunk))

        for page_no, page in enumerate(pages, start=1):
            first = page_no == 1
            result.append({
                'code': art['code'] if first else f"{art['code']}·p{page_no}",
                'part': art['part'],
                'order': len(result),
                'title': art['title'] if first else f"{art['title']} (p{page_no}/{len(pages)})",
                'body': page,
                'tok': tok(page),
                # Backlinks are attached to the first page only.
                'links': art['links'] if first else [],
                'ext': [],
            })
    return result
|
||||||
|
|
||||||
|
async def process_one(conn, law, commit, verbose=True):
    """Split one law into article documents and optionally persist them with backlinks.

    Args:
        conn: asyncpg connection.
        law: Id of the law document in ``documents``.
        commit: When false, only analyse (dry run); when true, replace the
            law's existing clause documents and links inside one transaction.
        verbose: Print a per-law summary and sample titles.

    Returns:
        ``(status, article_count, link_count)`` where status is 'notfound',
        'nullmd', 'noart', 'dry' or 'committed'.
    """
    row = await conn.fetchrow("SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin FROM documents WHERE id=$1", law)
    if not row:
        return ('notfound', 0, 0)
    if not row['md_content']:
        return ('nullmd', 0, 0)
    arts = build_articles(row['md_content'])
    if not arts:
        return ('noart', 0, 0)

    toks = [c['tok'] for c in arts]
    nlink = sum(len(c['links']) for c in arts)
    if verbose:
        parts = {}
        for c in arts:
            parts[c['part']] = parts.get(c['part'], 0) + 1
        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
        print(" 샘플:", [c['title'][:22] for c in arts[:6]])
    if not commit:
        return ('dry', len(arts), nlink)

    async with conn.transaction():
        # Rebuild from scratch: drop links of the old clause documents, then the documents.
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)

        code2id = {}   # article code -> id of the FIRST article carrying that code
        inserted = []  # (new document id, article) in insertion order
        for c in arts:
            # NOTE(review): two articles with the same code and identical body
            # hash identically; confirm file_hash carries no unique constraint.
            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval(
                """
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                clause_code,clause_part,clause_order,ai_domain,data_origin,
                md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
                """,
                fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
                row['ai_domain'], row['data_origin'] or 'external')
            inserted.append((cid, c))
            # The same code can occur more than once (addenda restart at 제1조).
            # Keep the first occurrence as the link target instead of letting a
            # later duplicate overwrite it.
            code2id.setdefault(c['code'], cid)
            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING", cid, c['part'])

        # Article-to-article backlinks within the same law. The source is the
        # article's own new id, never a lookup by code, so duplicated codes
        # cannot attribute an edge to the wrong document. Mentions that resolve
        # to no article (e.g. of another law) are stored with dst_doc_id NULL.
        edges = [
            (cid, dst, code2id.get(dst), None, None, None)
            for cid, c in inserted
            for dst in c['links']
        ]
        if edges:
            await conn.executemany(
                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) VALUES ($1,$2,$3,$4,$5,$6)", edges)
        n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)

    # Reported only once the transaction block has exited without error.
    print(f" COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
    return ('committed', n, len(edges))
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Command-line entry point: process one law id, or every current primary law with ``all``.

    Reads the target from ``sys.argv[1]`` and the optional ``--commit`` flag
    from anywhere in ``sys.argv``; connects using the ``DATABASE_URL``
    environment variable.

    Raises:
        SystemExit: With a usage message when the target argument is missing
            or is neither ``all`` nor an integer.
    """
    usage = "Usage: python3 law_clause_persist.py <law_id|all> [--commit]"
    if len(sys.argv) < 2:
        raise SystemExit(usage)
    arg = sys.argv[1]
    commit = '--commit' in sys.argv
    law_id = None
    if arg != 'all':
        try:
            law_id = int(arg)
        except ValueError:
            raise SystemExit(usage) from None

    # Imported only after argument validation so a usage error does not need the driver.
    import asyncpg

    # asyncpg does not accept the SQLAlchemy-style '+asyncpg' scheme suffix.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if law_id is not None:
            await process_one(conn, law_id, commit, verbose=True)
            return

        laws = await conn.fetch("SELECT lm.document_id AS id FROM legal_meta lm "
                                "JOIN documents d ON d.id=lm.document_id "
                                "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
                                "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
                                "ORDER BY lm.document_id")
        agg = {}
        tot_art = tot_link = 0
        zero = []  # laws in which no article header was found
        for i, r in enumerate(laws):
            st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
            agg[st] = agg.get(st, 0) + 1
            tot_art += na
            tot_link += nl
            if st == 'noart':
                zero.append(r['id'])
            if commit and (i + 1) % 30 == 0:
                print(f" …{i+1}/{len(laws)} (누적 조 {tot_art})")
        print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
        if zero:
            print(f" 0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")
    finally:
        # Release the connection even when a law fails mid-batch.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -0,0 +1,51 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""논문 인용그래프 가능성 측정(read-only) — 본문 DOI로 코퍼스내 인용 엣지 추정.
|
||||||
|
own_doi = 헤더(앞 2500자) 첫 DOI / cited = References 이후(또는 전체) DOI. owner 맵 → 엣지.
|
||||||
|
"""
|
||||||
|
import asyncio, os, re, sys
|
||||||
|
|
||||||
|
# A DOI: '10.', a 4-9 digit registrant, '/', then everything up to whitespace
# or one of the listed closing / separator characters.
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
# Words that introduce a reference list (matched case-insensitively).
REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)


def norm(d):
    """Normalise a DOI for comparison: drop trailing periods and lower-case it."""
    without_trailing_dots = d.rstrip('.')
    return without_trailing_dots.lower()
|
||||||
|
|
||||||
|
async def main():
    """Estimate how many in-corpus citation edges can be recovered from DOIs (read-only).

    For every paper, the first DOI in the first 2500 characters is taken as the
    paper's own DOI, and every DOI from the first reference-list keyword onwards
    is taken as cited. An edge ``paper -> owner(cited DOI)`` is counted when the
    cited DOI belongs to another paper in the corpus. Results are printed only.
    """
    import asyncpg
    from collections import Counter

    # asyncpg does not accept the SQLAlchemy-style '+asyncpg' scheme suffix.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
                                "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
                                "AND coalesce(md_content, extracted_text) IS NOT NULL")
    finally:
        # Everything needed is in memory after this one query; release the
        # connection even if the fetch fails.
        await conn.close()

    owner = {}  # doi -> paper id (a header DOI is owned by that paper; first claim wins)
    cited = {}  # paper id -> set of cited DOIs
    n_own = n_refsec = 0
    for r in rows:
        txt = r['txt']
        hdois = [norm(d) for d in DOI_RE.findall(txt[:2500])]
        if hdois:
            owner.setdefault(hdois[0], r['id'])
            n_own += 1
        m = REF_RE.search(txt)
        # NOTE(review): a paper with no reference-list keyword contributes no
        # cited DOIs, although the module docstring mentions a whole-text
        # fallback — confirm which is intended.
        body = txt[m.start():] if m else ''
        if m:
            n_refsec += 1
        cds = {norm(d) for d in DOI_RE.findall(body)}
        if cds:
            cited[r['id']] = cds

    # Edges: paper -> owner of the cited DOI, skipping self-references.
    edges = []
    for pid, cds in cited.items():
        for d in cds:
            o = owner.get(d)
            # Explicit None test so a falsy id could never be mistaken for "no owner".
            if o is not None and o != pid:
                edges.append((pid, o, d))

    cited_papers = {e[0] for e in edges}
    target_papers = {e[1] for e in edges}
    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(cited_papers)} 피인용논문={len(target_papers)}")

    # Most-cited papers.
    top = Counter(e[1] for e in edges).most_common(6)
    if top:
        idmap = {r['id']: r['title'] for r in rows}
        print("피인용 top:")
        for pid, c in top:
            print(f" {c}회 ← {(idmap.get(pid) or '')[:48]}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""OpenAlex 보강 타당성 테스트 — 소수 논문 제목으로 매칭/메타 확인 (외부 API)."""
|
||||||
|
import asyncio, os, re
|
||||||
|
|
||||||
|
async def main():
    """Feasibility probe: look up a handful of paper titles in OpenAlex and print the match.

    Reads up to six paper titles from the database (``DATABASE_URL``), then
    queries the OpenAlex ``/works`` search endpoint once per title and prints
    the top hit's id, title, year, citation count, reference count and DOI.
    The contact address sent as ``mailto`` can be overridden with the
    ``OPENALEX_MAILTO`` environment variable.
    """
    import asyncpg, httpx

    # asyncpg does not accept the SQLAlchemy-style '+asyncpg' scheme suffix.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
                                "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
                                "AND length(title) > 15 ORDER BY id LIMIT 6")
    finally:
        # The database is not needed during the HTTP calls; release it first,
        # and also when the query itself fails.
        await conn.close()

    mailto = os.environ.get("OPENALEX_MAILTO", "hyun49196@gmail.com")
    async with httpx.AsyncClient(timeout=20) as client:
        for r in rows:
            title = re.sub(r'\s+', ' ', r['title']).strip()
            try:
                resp = await client.get("https://api.openalex.org/works",
                                        params={"search": title[:200], "per_page": 1, "mailto": mailto})
                # Report 4xx/5xx as an error rather than as a misleading "NO MATCH".
                resp.raise_for_status()
                js = resp.json()
                res = (js.get("results") or [])
                if not res:
                    print(f"[{r['id']}] NO MATCH | {title[:50]}")
                    continue
                w = res[0]
                # The work id is a URL; keep only its last path segment.
                oid = (w.get("id") or "").split("/")[-1]
                print(f"[{r['id']}] {title[:46]}")
                print(f" → OA {oid} | {(w.get('title') or '')[:46]} | {w.get('publication_year')} | "
                      f"cited_by={w.get('cited_by_count')} | refs={len(w.get('referenced_works') or [])} | doi={w.get('doi')}")
            except Exception as e:
                # Probe script: one failing lookup must not stop the remaining titles.
                print(f"[{r['id']}] ERROR {type(e).__name__}: {e}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Reference in New Issue
Block a user