feat(docs): 관련 문서(유사도 KNN) 엔드포인트+패널 + 법령/지침 splitter

This commit is contained in:
hyungi
2026-06-30 06:10:11 +00:00
parent c44692fddc
commit a22b2c7647
7 changed files with 432 additions and 0 deletions
+100
View File
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""기술지침(KOSHA guide) 절-KB persist: 번호섹션(# 1. 목적 / ## 4.1) 단위 분해 + 제본.
ASME/법령과 동일 clause-KB 모델(doc_kind='clause', parent_id=지침, 검색제외, /book 리더 공용).
Usage: python3 guide_clause_persist.py <id|all> [--commit]
"""
import asyncio, os, re, sys, hashlib, statistics
# Token budget: a section whose estimate exceeds CAP is split into pages that
# are filled line by line up to PAGE_TOK estimated tokens.
CAP = 12000
PAGE_TOK = 11000
# Estimated tokens per character: EN for non-Hangul text, KO for Hangul.
EN, KO = 0.217, 0.529
# Numbered section header: '# 1. 목 적', '## 4.1 누출...'. Each number is limited
# to 1-3 digits so that 4-digit years are not taken for section numbers.
ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')
TOP_RE = re.compile(r'^\d{1,3}$')
# References to external standards/regulations (mostly dangling):
# ASME B16.5, KS B 1501, "제N조" of a rule.
EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9.]+|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')


def tok(s):
    """Estimate the token count of *s* from per-character weights.

    Hangul syllables are weighted by ``KO`` and every other character by
    ``EN``; the weighted sum is truncated to an int.

    The range bounds used to be empty string literals, which made the Hangul
    count always zero so that ``KO`` was never applied. They are written as
    escapes here so they cannot be lost again.
    NOTE(review): U+AC00..U+D7A3 (Hangul Syllables) is assumed to be the
    intended range - confirm.
    """
    ko = sum(1 for c in s if '\uac00' <= c <= '\ud7a3')
    return int((len(s) - ko) * EN + ko * KO)
def build_sections(text):
    """Split guide markdown into top-level numbered sections.

    A section starts at the first header line (matched by ``ART_RE``) for each
    distinct top-level number (``TOP_RE``) and runs up to the next such header
    or the end of the text. Sub-numbered headers such as ``4.1`` do not open a
    new section. Sections whose token estimate exceeds ``CAP`` are split line
    by line into pages of roughly ``PAGE_TOK`` tokens.

    Args:
        text: Full markdown of the guide.

    Returns:
        List of dicts with keys ``code``, ``part``, ``order``, ``title``,
        ``body``, ``tok``, ``links`` and ``ext``, in document order with
        ``order`` numbered from 0. Continuation pages get the code suffix
        ``·pN``, a ``(pN)`` title suffix and an empty ``ext``.
    """
    # Pass 1: find section starts while tracking each line's character offset.
    starts = []
    known_codes = set()
    pos = 0
    for line in text.split('\n'):
        line_start = pos
        pos += len(line) + 1  # +1 for the newline removed by split
        hit = ART_RE.match(line)
        if hit is None:
            continue
        code = hit.group(1)
        name = hit.group(2).strip()
        if TOP_RE.match(code) is None or code in known_codes or not name:
            continue
        known_codes.add(code)
        starts.append((line_start, code, name))

    # Pass 2: slice each section body up to the next section start.
    stops = [entry[0] for entry in starts[1:]] + [len(text)]
    sections = []
    for (begin, code, name), stop in zip(starts, stops):
        body = text[begin:stop].strip()
        sections.append({
            'code': code,
            'part': '본문',
            'order': 0,  # placeholder; real numbering is assigned below
            'title': f"{code}. {name}"[:120],
            'body': body,
            'tok': tok(body),
            'links': [],
            'ext': sorted(set(EXT_RE.findall(body)))[:8],
        })

    # Pass 3: paginate oversized sections and assign sequential order numbers.
    result = []
    seq = 0
    for sec in sections:
        if sec['tok'] <= CAP:
            result.append({**sec, 'order': seq})
            seq += 1
            continue
        chunks = []
        buf = []
        used = 0
        for line in sec['body'].split('\n'):
            cost = tok(line) + 1
            if buf and used + cost > PAGE_TOK:
                chunks.append('\n'.join(buf))
                buf = [line]
                used = cost
            else:
                buf.append(line)
                used += cost
        if buf:
            chunks.append('\n'.join(buf))
        for page_no, page_body in enumerate(chunks, start=1):
            first = page_no == 1
            result.append({
                'code': sec['code'] if first else f"{sec['code']}·p{page_no}",
                'part': '본문',
                'order': seq,
                'title': sec['title'] if first else f"{sec['title']} (p{page_no})",
                'body': page_body,
                'tok': tok(page_body),
                'links': [],
                'ext': [],
            })
            seq += 1
    return result
async def process_one(conn, gid, commit, verbose=True):
    """Split one guide into section clauses and optionally persist them.

    Args:
        conn: Database connection providing ``fetchrow``, ``fetchval``,
            ``execute`` and ``transaction`` (created with asyncpg in ``main``).
        gid: ``documents.id`` of the guide.
        commit: False analyses only (dry run); True replaces the guide's
            existing clause rows and their ``clause_links`` in one transaction.
        verbose: Print a summary line and sample titles for the guide.

    Returns:
        ``(status, section_count)`` with status ``'notfound'``, ``'nullmd'``,
        ``'few'`` (fewer than two sections), ``'dry'`` or ``'committed'``.
    """
    guide = await conn.fetchrow(
        "SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
    if not guide:
        return ('notfound', 0)
    markdown = guide['md_content']
    if not markdown:
        return ('nullmd', 0)

    secs = build_sections(markdown)
    if len(secs) < 2:
        # Fewer than two sections: the document has no numbered structure.
        return ('few', len(secs))

    if verbose:
        toks = [c['tok'] for c in secs]
        print(f"guide={gid} «{(guide['title'] or '')[:40]}» 섹션={len(secs)} median={int(statistics.median(toks))} max={max(toks)}")
        print(" 샘플:", [c['title'][:26] for c in secs[:7]])
    if not commit:
        return ('dry', len(secs))

    origin = guide['data_origin'] or 'external'
    domain = guide['ai_domain']
    async with conn.transaction():
        # Replace semantics: remove earlier clause rows (and their links) first.
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')",
            gid)
        await conn.execute(
            "DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
        for c in secs:
            digest = hashlib.sha256(f"{gid}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                clause_code,clause_part,clause_order,ai_domain,data_origin,
                md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
                """, digest, c['title'], c['body'], gid, c['code'], c['part'], c['order'],
                domain, origin)
            await conn.execute(
                "INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING",
                cid)

    n = await conn.fetchval(
        "SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
    print(f" COMMITTED: {n} 섹션 for guide {gid}")
    return ('committed', len(secs))
async def main():
    """Command-line entry point: process one guide id, or every guide with ``all``.

    Reads the target from ``sys.argv[1]`` and the optional ``--commit`` flag,
    connects using ``DATABASE_URL`` (with any ``+asyncpg`` marker removed) and
    runs ``process_one`` for each target. Without ``--commit`` this is a dry run.

    Raises:
        SystemExit: If no target argument was given.
    """
    import asyncpg

    if len(sys.argv) < 2:
        raise SystemExit("Usage: python3 guide_clause_persist.py <id|all> [--commit]")
    arg = sys.argv[1]
    commit = '--commit' in sys.argv
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if arg == 'all':
            gs = await conn.fetch(
                "SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
                "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
            agg = {}
            tot = 0
            for i, r in enumerate(gs):
                st, n = await process_one(conn, r['id'], commit, verbose=False)
                agg[st] = agg.get(st, 0) + 1
                # Only guides that were actually split count towards the total.
                tot += n if st in ('dry', 'committed') else 0
                if commit and (i + 1) % 40 == 0:
                    print(f"{i+1}/{len(gs)} (누적섹션 {tot})")
            print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
        else:
            await process_one(conn, int(arg), commit, verbose=True)
    finally:
        # Close the connection even when processing raises.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())
+146
View File
@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""법령 조-KB persist: 법령을 조(條) 단위 개별 문서로 분해 + 조↔조 백링크 + 장(章) 태그.
ASME clause-KB와 동일 모델(doc_kind='clause', parent_id=법령, embedding NULL, 검색제외).
법령 추출 노이즈(조 앞 ### 메타 반복) 트림. Usage: python3 law_clause_persist.py <law_id|all> [--commit]
"""
import asyncio, os, re, sys, hashlib, statistics
# Token budget: an article whose estimate exceeds CAP is split into pages that
# are filled line by line up to PAGE_TOK estimated tokens.
CAP = 12000
PAGE_TOK = 11000
# Estimated tokens per character: EN for non-Hangul text, KO for Hangul.
EN, KO = 0.217, 0.529
# Article header: '### 제3조의2(가스안전관리...) 본문'
ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')
CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$')  # chapter = part
# Article mentions used for backlinks. The pattern keeps the leading '제' so a
# mention has the same form as the article codes captured by ART_RE; without it
# a mention ('3조') could never equal a code ('제3조'), so neither the self-link
# removal nor the code -> document lookup could match.
MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')
# References to other laws: 「law name」 ... 제N조
EXTLAW_RE = re.compile(r'「([^」]+)」')


def tok(s):
    """Estimate the token count of *s* from per-character weights.

    Hangul syllables are weighted by ``KO`` and every other character by
    ``EN``; the weighted sum is truncated to an int.

    The range bounds used to be empty string literals, which made the Hangul
    count always zero so that ``KO`` was never applied. They are written as
    escapes here so they cannot be lost again.
    NOTE(review): U+AC00..U+D7A3 (Hangul Syllables) is assumed to be the
    intended range - confirm.
    """
    ko = sum(1 for c in s if '\uac00' <= c <= '\ud7a3')
    return int((len(s) - ko) * EN + ko * KO)
def art_code(c):
    """Return the article code as given, e.g. '제3조의2' (currently an identity function)."""
    return c
_META_HEADING_RE = re.compile(r'^#{1,6}\s+(.*)$')


def _trim_trailing_meta(body_lines):
    """Remove trailing blank lines and short '#'-heading meta lines, in place.

    Headings whose text starts with '[' or '제' are kept, as is anything that
    is not a heading; the first line of the body is never removed.
    """
    while len(body_lines) > 1:
        last = body_lines[-1].strip()
        if last == '':
            body_lines.pop()
            continue
        mh = _META_HEADING_RE.match(last)
        if mh:
            c = mh.group(1).strip()
            # The second excluded prefix used to be an empty literal;
            # startswith('') is always True, so no heading was ever trimmed.
            # '제' follows the original comment's "not 제N조" rule.
            # NOTE(review): confirm '제' is the intended prefix.
            # The literal '조문'/'N' and 8-digit checks were dropped because
            # the digit and length tests below already cover them.
            if not c.startswith(('[', '제')) and (
                    re.fullmatch(r'\d+', c) or len(c) <= 30):
                body_lines.pop()
                continue
        break
    return body_lines


def build_articles(text):
    """Split law text into per-article records, paginating oversized ones.

    Lines matching ``ART_RE`` open an article; lines matching ``CHAP_RE`` set
    the chapter ("part") assigned to the articles that follow. Each article
    body runs to the line before the next article header, with trailing
    extraction noise (blank lines and short meta headings) trimmed. Articles
    whose token estimate exceeds ``CAP`` are split line by line into pages of
    roughly ``PAGE_TOK`` tokens.

    Args:
        text: Full text of the law.

    Returns:
        List of dicts with keys ``code``, ``part``, ``order``, ``title``,
        ``body``, ``tok``, ``links`` (article mentions found in the body, minus
        the article's own code) and ``ext`` (up to six names quoted in 「」).
        ``order`` is numbered from 0. Continuation pages get the code suffix
        ``·pN``, a ``(pN/total)`` title suffix and empty ``links``; every page
        of a split article has an empty ``ext``.
    """
    lines = text.split('\n')
    arts = []  # (line_idx, code, name, part)
    cur_part = None
    for i, ln in enumerate(lines):
        m = ART_RE.match(ln)
        if m:
            arts.append((i, m.group(1), m.group(2).strip(), cur_part))
            continue
        # A line cannot be both an article and a chapter header (the number is
        # followed by '조' in one pattern and '장' in the other), so matching
        # the article first gives the same classification with one match.
        ch = CHAP_RE.match(ln)
        if ch:
            cur_part = f"{ch.group(1)} {ch.group(2).strip()}".strip()

    # Slice bodies and trim the meta noise that precedes the next article.
    out = []
    for idx, (li, code, name, part) in enumerate(arts):
        end_li = arts[idx + 1][0] if idx + 1 < len(arts) else len(lines)
        body_lines = _trim_trailing_meta(lines[li:end_li])
        body = '\n'.join(body_lines).strip()
        links = sorted(set(MENTION_RE.findall(body)) - {code})
        ext = sorted(set(EXTLAW_RE.findall(body)))[:6]
        out.append(dict(code=code, part=part or '본칙', order=0,
                        title=f"{code}({name})" if name else code,
                        body=body, tok=tok(body), links=links, ext=ext))

    # Pagination of over-CAP articles plus sequential numbering.
    final, order = [], 0
    for c in out:
        if c['tok'] <= CAP:
            final.append({**c, 'order': order})
            order += 1
            continue
        pages, cur, ct = [], [], 0
        for ln in c['body'].split('\n'):
            lt = tok(ln) + 1
            if ct + lt > PAGE_TOK and cur:
                pages.append('\n'.join(cur))
                cur = [ln]
                ct = lt
            else:
                cur.append(ln)
                ct += lt
        if cur:
            pages.append('\n'.join(cur))
        for pi, pb in enumerate(pages):
            final.append(dict(
                code=c['code'] if pi == 0 else f"{c['code']}·p{pi+1}",
                part=c['part'], order=order,
                title=c['title'] if pi == 0 else f"{c['title']} (p{pi+1}/{len(pages)})",
                body=pb, tok=tok(pb),
                links=c['links'] if pi == 0 else [], ext=[]))
            order += 1
    return final
async def process_one(conn, law, commit, verbose=True):
    """Split one law into article clauses and optionally persist them with links.

    Args:
        conn: Database connection providing ``fetchrow``, ``fetchval``,
            ``execute``, ``executemany`` and ``transaction`` (created with
            asyncpg in ``main``).
        law: ``documents.id`` of the law.
        commit: False analyses only (dry run); True replaces the law's existing
            clause rows, chapter tags and ``clause_links`` in one transaction.
        verbose: Print a summary line and sample titles for the law.

    Returns:
        ``(status, article_count, link_count)`` with status ``'notfound'``,
        ``'nullmd'``, ``'noart'``, ``'dry'`` or ``'committed'``.
    """
    row = await conn.fetchrow(
        "SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin FROM documents WHERE id=$1",
        law)
    if not row:
        return ('notfound', 0, 0)
    if not row['md_content']:
        return ('nullmd', 0, 0)
    arts = build_articles(row['md_content'])
    if not arts:
        return ('noart', 0, 0)
    toks = [c['tok'] for c in arts]
    nlink = sum(len(c['links']) for c in arts)
    if verbose:
        parts = {}
        for c in arts:
            parts[c['part']] = parts.get(c['part'], 0) + 1
        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
        print(" 샘플:", [c['title'][:22] for c in arts[:6]])
    if not commit:
        return ('dry', len(arts), nlink)
    async with conn.transaction():
        # Replace semantics: remove earlier clause rows (and their links) first.
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
        code2id = {}
        inserted_ids = []  # parallel to arts: the row id of each inserted article
        for c in arts:
            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                clause_code,clause_part,clause_order,ai_domain,data_origin,
                md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
                """, fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
                row['ai_domain'], row['data_origin'] or 'external')
            inserted_ids.append(cid)
            # build_articles does not de-duplicate codes. Keep the first row
            # for each code as the link target.
            # NOTE(review): first-wins is a choice; confirm it is the desired
            # target when a code repeats later in the text.
            code2id.setdefault(c['code'], cid)
            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING", cid, c['part'])
        # Article-to-article backlinks within the same law. A mention with no
        # matching code is stored with a NULL dst_doc_id (dangling).
        # The source is the row just inserted for this article, not a lookup by
        # code: with repeated codes that lookup attached links to the wrong row.
        edges = [
            (src, dst, code2id.get(dst), None, None, None)
            for src, c in zip(inserted_ids, arts)
            for dst in c['links']
        ]
        if edges:
            await conn.executemany(
                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) VALUES ($1,$2,$3,$4,$5,$6)", edges)
    n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
    print(f" COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
    return ('committed', n, len(edges))
async def main():
    """Command-line entry point: process one law id, or every current primary law with ``all``.

    Reads the target from ``sys.argv[1]`` and the optional ``--commit`` flag,
    connects using ``DATABASE_URL`` (with any ``+asyncpg`` marker removed) and
    runs ``process_one`` for each target. Without ``--commit`` this is a dry run.

    Raises:
        SystemExit: If no target argument was given.
    """
    import asyncpg

    if len(sys.argv) < 2:
        raise SystemExit("Usage: python3 law_clause_persist.py <law_id|all> [--commit]")
    arg = sys.argv[1]
    commit = '--commit' in sys.argv
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if arg == 'all':
            laws = await conn.fetch(
                "SELECT lm.document_id AS id FROM legal_meta lm "
                "JOIN documents d ON d.id=lm.document_id "
                "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
                "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
                "ORDER BY lm.document_id")
            agg = {}
            tot_art = tot_link = 0
            zero = []  # ids of laws in which no article header was recognised
            for i, r in enumerate(laws):
                st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
                agg[st] = agg.get(st, 0) + 1
                tot_art += na
                tot_link += nl
                if st == 'noart':
                    zero.append(r['id'])
                if commit and (i + 1) % 30 == 0:
                    print(f"{i+1}/{len(laws)} (누적 조 {tot_art})")
            print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
            if zero:
                print(f" 0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")
        else:
            await process_one(conn, int(arg), commit, verbose=True)
    finally:
        # Close the connection even when processing raises.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())
+51
View File
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""논문 인용그래프 가능성 측정(read-only) — 본문 DOI로 코퍼스내 인용 엣지 추정.
own_doi = 헤더(앞 2500자) 첫 DOI / cited = References 이후(또는 전체) DOI. owner 맵 → 엣지.
"""
import asyncio, os, re, sys
# A DOI: '10.', a 4-9 digit registrant, '/', then a run of characters that
# stops at whitespace, quotes, angle brackets, closing brackets, ',' or ';'.
DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
# Heading words that mark the start of a reference list (case-insensitive).
REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.IGNORECASE)


def norm(d):
    """Normalise a DOI string: strip trailing periods, then lower-case it."""
    without_trailing_dots = d.rstrip('.')
    return without_trailing_dots.lower()
async def main():
    """Estimate in-corpus citation edges between papers from DOIs in their text (read-only).

    For each paper, the first DOI in the first 2500 characters is taken as the
    paper's own DOI, and every DOI from the first ``REF_RE`` match onwards is
    taken as a cited DOI. An edge ``paper -> owner`` is counted when a cited
    DOI is owned by a different paper in the corpus. Prints summary counts and
    the most-cited papers.
    """
    import asyncpg
    from collections import Counter

    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch(
            "SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
            "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
            "AND coalesce(md_content, extracted_text) IS NOT NULL")
    finally:
        # Everything after the fetch is in-memory work, so release the
        # connection immediately, including when the query fails.
        await conn.close()

    owner = {}  # doi -> paper id (the header DOI belongs to that paper)
    cited = {}  # paper id -> set of cited DOIs
    n_own = n_refsec = 0
    for r in rows:
        txt = r['txt']
        hdois = [norm(d) for d in DOI_RE.findall(txt[:2500])]
        if hdois:
            # The first paper to claim a DOI keeps it.
            owner.setdefault(hdois[0], r['id'])
            n_own += 1
        m = REF_RE.search(txt)
        if m is None:
            # NOTE(review): the module docstring mentions falling back to the
            # whole text; the code has always skipped papers with no reference
            # heading, and that behaviour is kept here.
            continue
        n_refsec += 1
        cds = {norm(d) for d in DOI_RE.findall(txt[m.start():])}
        if cds:
            cited[r['id']] = cds

    # Edges: paper -> owner of a cited DOI, ignoring self-citations.
    edges = []
    for pid, cds in cited.items():
        for d in cds:
            o = owner.get(d)
            # Compare against None so that a falsy id is not dropped.
            if o is not None and o != pid:
                edges.append((pid, o, d))
    cited_papers = {e[0] for e in edges}
    target_papers = {e[1] for e in edges}
    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(cited_papers)} 피인용논문={len(target_papers)}")
    # Most-cited papers.
    top = Counter(e[1] for e in edges).most_common(6)
    if top:
        idmap = {r['id']: r['title'] for r in rows}
        print("피인용 top:")
        for pid, c in top:
            print(f" {c}회 ← {(idmap.get(pid) or '')[:48]}")


if __name__ == "__main__":
    asyncio.run(main())
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""OpenAlex 보강 타당성 테스트 — 소수 논문 제목으로 매칭/메타 확인 (외부 API)."""
import asyncio, os, re
async def main():
    """Probe OpenAlex matching for a handful of paper titles (uses an external API).

    Loads up to six paper titles longer than 15 characters, searches the
    OpenAlex ``/works`` endpoint for each and prints the top hit's id, title,
    year, citation count, reference count and DOI. Failures for one title are
    printed and do not stop the remaining lookups.
    """
    import asyncpg
    import httpx

    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch(
            "SELECT id, title FROM documents WHERE material_type='paper' "
            "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
            "AND length(title) > 15 ORDER BY id LIMIT 6")
    finally:
        # The database is not needed during the HTTP calls; close it now,
        # including when the query fails.
        await conn.close()

    async with httpx.AsyncClient(timeout=20) as client:
        for r in rows:
            title = re.sub(r'\s+', ' ', r['title']).strip()
            try:
                resp = await client.get(
                    "https://api.openalex.org/works",
                    params={"search": title[:200], "per_page": 1, "mailto": "hyun49196@gmail.com"})
                # Report HTTP errors (rate limit, server error) as errors
                # instead of parsing the error body and printing "NO MATCH".
                resp.raise_for_status()
                js = resp.json()
                res = (js.get("results") or [])
                if not res:
                    print(f"[{r['id']}] NO MATCH | {title[:50]}")
                    continue
                w = res[0]
                oid = (w.get("id") or "").split("/")[-1]
                print(f"[{r['id']}] {title[:46]}")
                print(f" → OA {oid} | {(w.get('title') or '')[:46]} | {w.get('publication_year')} | "
                      f"cited_by={w.get('cited_by_count')} | refs={len(w.get('referenced_works') or [])} | doi={w.get('doi')}")
            except Exception as e:
                # Deliberate best-effort boundary for a feasibility probe:
                # report the failure and carry on with the next title.
                print(f"[{r['id']}] ERROR {type(e).__name__}: {e}")


if __name__ == "__main__":
    asyncio.run(main())