147 lines
7.6 KiB
Python
147 lines
7.6 KiB
Python
#!/usr/bin/env python3
|
|
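"""Law article-KB persist: split a law into one document per article (조) + article<->article backlinks + chapter (장) tags.

Same model as the ASME clause-KB (doc_kind='clause', parent_id=<law document>, embedding NULL, excluded from search).

Extraction noise (the '### meta' heading lines repeated in front of each article) is trimmed.

Usage: python3 law_clause_persist.py <law_id|all> [--commit]
Without --commit the script only reports what it would write (dry run).
"""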
|
|
import asyncio, os, re, sys, hashlib, statistics
|
|
|
|
# Token budgets: an article whose estimated size exceeds CAP is paginated,
# and each page is filled up to roughly PAGE_TOK estimated tokens.
CAP = 12000
PAGE_TOK = 11000

# Per-character weights used by tok(): EN for non-Hangul characters,
# KO for Hangul syllables.
EN = 0.217
KO = 0.529

# Article header, e.g. '### 제3조의2(가스안전관리...) body text'.
# Groups: (article code, parenthesised title, rest of the line).
ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')

# Chapter heading (장); stored as the article's "part".
CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$')

# Article mentions inside a body; used to build backlinks within the same law.
MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')

# References to other laws are written as 「law name」 ... 제N조.
EXTLAW_RE = re.compile(r'「([^」]+)」')
|
|
|
|
def tok(s):
    """Return a rough token estimate for the string *s*.

    Hangul syllables (U+AC00..U+D7A3) are weighted by KO and every other
    character by EN; the weighted sum is truncated to an int.
    """
    hangul = len([ch for ch in s if '가' <= ch <= '힣'])
    other = len(s) - hangul
    return int(other * EN + hangul * KO)
|
|
def art_code(c):
    """Return the article code unchanged (e.g. '제3조의2')."""
    return c
|
|
|
|
# Any markdown heading line: '#'..'######' followed by whitespace and text.
_HEADING_RE = re.compile(r'^#{1,6}\s+(.*)$')


def _trim_trailing_noise(body_lines):
    """Strip extraction noise from the END of an article's lines, in place.

    Removed from the tail, repeatedly:
      * blank lines;
      * chapter headings (CHAP_RE) - they introduce the NEXT article's part
        and are not part of this article's text;
      * short meta headings ('### <text>') that do not start with '[' or
        '제' and are either all digits or at most 30 characters long.

    The first line (the article header) is never removed. Returns the same
    list for convenience.
    """
    while len(body_lines) > 1:
        last = body_lines[-1].strip()
        if not last:
            body_lines.pop()
            continue
        if CHAP_RE.match(last):
            body_lines.pop()
            continue
        heading = _HEADING_RE.match(last)
        if heading:
            content = heading.group(1).strip()
            # '[...]' (amendment notes) and '제...' headings are kept.
            # The literal markers '조문' / 'N' and 8-digit dates are all
            # covered by the digit / length test below.
            if not content.startswith(('[', '제')) and (
                    re.fullmatch(r'\d+', content) or len(content) <= 30):
                body_lines.pop()
                continue
        break
    return body_lines


def _split_pages(body):
    """Split *body* on line boundaries into pages of about PAGE_TOK tokens.

    Each line costs tok(line) + 1. A single line larger than PAGE_TOK is
    kept whole on its own page (lines are never cut).
    """
    pages, cur, cur_tok = [], [], 0
    for ln in body.split('\n'):
        ln_tok = tok(ln) + 1
        if cur and cur_tok + ln_tok > PAGE_TOK:
            pages.append('\n'.join(cur))
            cur, cur_tok = [ln], ln_tok
        else:
            cur.append(ln)
            cur_tok += ln_tok
    if cur:
        pages.append('\n'.join(cur))
    return pages


def build_articles(text):
    """Split a law's markdown *text* into per-article records.

    Returns a list of dicts, in document order, with keys:
      code   - article code ('제3조의2'); continuation pages get the suffix
               '·p<N>'
      part   - most recent chapter heading seen before the article, or
               '본칙' when none was seen
      order  - 0-based running index over all emitted records
      title  - '<code>(<name>)', plus ' (p<N>/<total>)' on continuation pages
      body   - article text (header line included), trailing noise trimmed
      tok    - tok(body)
      links  - sorted codes of other articles mentioned in the body
               (first page only for paginated articles)
      ext    - up to 6 sorted 「...」 law names found in the body
               (first page only for paginated articles)

    An empty list is returned when no line matches ART_RE.
    """
    lines = text.split('\n')

    # Pass 1: locate article headers and remember the chapter in effect.
    arts = []  # (line_idx, code, name, part)
    cur_part = None
    for i, ln in enumerate(lines):
        ch = CHAP_RE.match(ln)
        if ch and not ART_RE.match(ln):
            cur_part = ' '.join(filter(None, (ch.group(1), ch.group(2).strip())))
            continue
        m = ART_RE.match(ln)
        if m:
            arts.append((i, m.group(1), m.group(2).strip(), cur_part))

    # Pass 2: slice each article's body up to the next header and trim noise.
    out = []
    for idx, (li, code, name, part) in enumerate(arts):
        end_li = arts[idx + 1][0] if idx + 1 < len(arts) else len(lines)
        body = '\n'.join(_trim_trailing_noise(lines[li:end_li])).strip()
        # NOTE(review): MENTION_RE also matches the article number that
        # follows a 「other law」 reference, so such mentions end up in
        # `links` as if they pointed at this law - confirm whether intended.
        links = sorted(set(MENTION_RE.findall(body)) - {code})
        ext = sorted(set(EXTLAW_RE.findall(body)))[:6]
        out.append(dict(code=code, part=part or '본칙', order=0,
                        title=f"{code}({name})" if name else code,
                        body=body, tok=tok(body), links=links, ext=ext))

    # Pass 3: paginate over-CAP articles and assign the running order.
    final, order = [], 0
    for c in out:
        if c['tok'] <= CAP:
            final.append({**c, 'order': order})
            order += 1
            continue
        pages = _split_pages(c['body'])
        for pi, pb in enumerate(pages):
            first = pi == 0
            final.append(dict(
                code=c['code'] if first else f"{c['code']}·p{pi+1}",
                part=c['part'],
                order=order,
                title=c['title'] if first else f"{c['title']} (p{pi+1}/{len(pages)})",
                body=pb, tok=tok(pb),
                # Whole-article references are carried by the first page only.
                links=c['links'] if first else [],
                ext=c['ext'] if first else []))
            order += 1
    return final
|
|
|
|
async def process_one(conn, law, commit, verbose=True):
    """Build (and optionally persist) the clause documents for one law.

    Args:
        conn: database connection exposing fetchrow / fetchval / execute /
            executemany / transaction (asyncpg is what main() passes in).
        law: id of the parent law row in `documents`.
        commit: when false, nothing is written (dry run).
        verbose: print per-law statistics and a sample of article titles.

    Returns:
        (status, n_articles, n_links) where status is one of
        'notfound' (no such document), 'nullmd' (no text), 'noart' (no
        article header found), 'dry' (not committed) or 'committed'.

    On commit, the law's existing clause rows and the clause_links that
    originate from them are deleted and re-created in one transaction.
    """
    row = await conn.fetchrow(
        "SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin FROM documents WHERE id=$1", law)
    if not row:
        return ('notfound', 0, 0)
    if not row['md_content']:
        return ('nullmd', 0, 0)
    arts = build_articles(row['md_content'])
    if not arts:
        return ('noart', 0, 0)

    toks = [c['tok'] for c in arts]
    nlink = sum(len(c['links']) for c in arts)
    if verbose:
        parts = {}
        for c in arts:
            parts[c['part']] = parts.get(c['part'], 0) + 1
        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
        print(" 샘플:", [c['title'][:22] for c in arts[:6]])
    if not commit:
        return ('dry', len(arts), nlink)

    async with conn.transaction():
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)

        code2id = {}   # article code -> doc id, used to resolve link targets
        inserted = []  # (doc id, links) per article, in insertion order
        for c in arts:
            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                                       clause_code,clause_part,clause_order,ai_domain,data_origin,
                                       md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
                """, fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
                row['ai_domain'], row['data_origin'] or 'external')
            # When a code occurs more than once, the last occurrence wins as
            # link TARGET (unchanged behaviour).
            code2id[c['code']] = cid
            inserted.append((cid, c['links']))
            await conn.execute(
                "INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING", cid, c['part'])

        # Article<->article backlinks. The SOURCE is the row this article was
        # actually inserted as - not a lookup by code, which would credit
        # every duplicate code's links to the last inserted duplicate.
        # Mentions with no matching article keep dst_doc_id NULL (dangling).
        edges = [(src, dst, code2id.get(dst), None, None, None)
                 for src, links in inserted for dst in links]
        if edges:
            await conn.executemany(
                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) VALUES ($1,$2,$3,$4,$5,$6)", edges)

    n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
    print(f" COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
    return ('committed', n, len(edges))
|
|
|
|
|
|
async def main():
    """Command-line entry point.

    Usage: <script> <law_id|all> [--commit]

    With a numeric law id, that single law is processed verbosely. With
    'all', every document joined to a legal_meta row with
    law_doc_kind='primary' and version_status='current' that has text is
    processed, followed by a summary. Nothing is written unless --commit
    is given.

    Raises:
        SystemExit: when the positional argument is missing or is neither
            'all' nor an integer.
        KeyError: when DATABASE_URL is not set in the environment.
    """
    commit = '--commit' in sys.argv
    # The first non-flag argument is the target, so '--commit' may come
    # before or after it.
    positional = [a for a in sys.argv[1:] if not a.startswith('--')]
    if not positional:
        raise SystemExit(f"Usage: python3 {sys.argv[0]} <law_id|all> [--commit]")
    arg = positional[0]
    law_id = None
    if arg != 'all':
        try:
            law_id = int(arg)
        except ValueError:
            raise SystemExit(f"law_id must be an integer or 'all', got {arg!r}") from None

    import asyncpg

    # The URL may carry a SQLAlchemy-style '+asyncpg' driver suffix, which
    # is removed before connecting.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if law_id is not None:
            await process_one(conn, law_id, commit, verbose=True)
            return

        laws = await conn.fetch("SELECT lm.document_id AS id FROM legal_meta lm "
                                "JOIN documents d ON d.id=lm.document_id "
                                "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
                                "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
                                "ORDER BY lm.document_id")
        agg = {}        # status -> number of laws
        tot_art = tot_link = 0
        zero = []       # ids of laws in which no article header was found
        for i, r in enumerate(laws):
            st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
            agg[st] = agg.get(st, 0) + 1
            tot_art += na
            tot_link += nl
            if st == 'noart':
                zero.append(r['id'])
            if commit and (i + 1) % 30 == 0:
                print(f" …{i+1}/{len(laws)} (누적 조 {tot_art})")
        print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
        if zero:
            print(f" 0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")
    finally:
        # Release the connection even when a law fails part-way through.
        await conn.close()
|
|
|
|
# Run only when executed as a script, so that importing this module (e.g. to
# reuse build_articles) does not parse argv or open a database connection.
if __name__ == "__main__":
    asyncio.run(main())
|