#!/usr/bin/env python3
"""Persist a statute as a per-article ("조") clause knowledge base.

A law document is split into one child document per article, tagged with the
chapter ("장") it appears under, and same-law article mentions are stored as
back-links between the generated clause documents.

Clause rows are inserted with doc_kind='clause' and parent_id set to the law's
document id.  According to the original author's note this mirrors the ASME
clause-KB model, where clause rows carry no embedding and are excluded from
search; this script itself simply never writes an embedding.

Extraction noise (short '###' metadata headings repeated ahead of the next
article) is trimmed from the tail of every article body.

Usage:
    python3 law_clause_persist.py <law_id|all> [--commit]

Without --commit the script performs a dry run and only reports statistics.
"""
import asyncio
import hashlib
import os
import re
import statistics
import sys

CAP = 12000       # articles whose estimated token count exceeds this are paginated
PAGE_TOK = 11000  # soft per-page token budget used when paginating
EN, KO = 0.217, 0.529  # estimated tokens per non-Hangul char / per Hangul syllable

# Article header, e.g. '### 제3조의2(가스안전관리...) body text'
ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')
# Chapter header (chapter == "part"), e.g. '## 제2장 허가'
CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$')
# Article mention inside a body (used for back-links)
MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')
# Reference to another law: 「law name」 ... 제N조
EXTLAW_RE = re.compile(r'「([^」]+)」')

# Any markdown heading; group 1 is the heading text.
_HEADING_RE = re.compile(r'^#{1,6}\s+(.*)$')
_DIGITS_RE = re.compile(r'\d+')

_USAGE = "Usage: python3 law_clause_persist.py <law_id|all> [--commit]"


def tok(s):
    """Return a rough token estimate for *s*.

    Hangul syllables (U+AC00..U+D7A3) are weighted by KO, every other
    character by EN; the weighted sum is truncated to an int.
    """
    ko = sum(1 for c in s if '가' <= c <= '힣')
    return int((len(s) - ko) * EN + ko * KO)


def art_code(c):
    """Return the article code unchanged (e.g. '제3조의2')."""
    return c


def _is_meta_heading(stripped):
    """Tell whether a stripped line is a short metadata heading to be trimmed.

    Headings whose text starts with '[' (amendment notes) or '제'
    (article/chapter references) are never treated as metadata.
    """
    mh = _HEADING_RE.match(stripped)
    if mh is None:
        return False
    c = mh.group(1).strip()
    if c.startswith(('[', '제')):
        return False
    # '\d+' also covers the 8-digit date form the original tested separately.
    return c in ('조문', 'N') or len(c) <= 30 or _DIGITS_RE.fullmatch(c) is not None


def _trim_trailing_noise(body_lines):
    """Return *body_lines* without trailing blanks, metadata and chapter headings.

    The first line (the article header) is always kept.  A trailing chapter
    heading belongs to the articles that follow it, so it is dropped from the
    preceding article's body instead of being left as dangling text.
    """
    kept = list(body_lines)
    while len(kept) > 1:
        raw = kept[-1]
        stripped = raw.strip()
        # CHAP_RE is applied to the raw line, exactly as the chapter scan does.
        if stripped == '' or _is_meta_heading(stripped) or CHAP_RE.match(raw):
            kept.pop()
            continue
        break
    return kept


def _paginate(body):
    """Split *body* on line boundaries into pages of roughly PAGE_TOK tokens.

    A page is closed before the line that would push it over the budget; a
    single line larger than the budget still becomes (part of) one page.
    """
    pages, cur, ct = [], [], 0
    for ln in body.split('\n'):
        lt = tok(ln) + 1  # +1 accounts for the newline
        if ct + lt > PAGE_TOK and cur:
            pages.append('\n'.join(cur))
            cur, ct = [ln], lt
        else:
            cur.append(ln)
            ct += lt
    if cur:
        pages.append('\n'.join(cur))
    return pages


def build_articles(text):
    """Split law markdown *text* into article records.

    Returns a list of dicts, in document order, with the keys:
      code   article code ('제3조의2'); continuation pages get a '·pN' suffix
      part   label of the enclosing chapter, or '본칙' when none was seen
      order  0-based running index over all emitted records (pages included)
      title  'code(name)', or just the code when the name is empty
      body   article text starting at its header line, trailing noise trimmed
      tok    estimated token count of body
      links  sorted distinct article codes mentioned in the body (self excluded);
             empty on continuation pages
      ext    up to six sorted distinct 「...」 law names found in the body;
             empty on every page of a paginated article
    """
    lines = text.split('\n')

    # Pass 1: locate article headers and remember the chapter in effect.
    heads = []  # (line_idx, code, name, part)
    cur_part = None
    for i, ln in enumerate(lines):
        ch = CHAP_RE.match(ln)
        if ch:  # a line cannot match both CHAP_RE (…장) and ART_RE (…조)
            rest = ch.group(2).strip()
            cur_part = f"{ch.group(1)} {rest}" if rest else ch.group(1)
            continue
        m = ART_RE.match(ln)
        if m:
            heads.append((i, m.group(1), m.group(2).strip(), cur_part))

    # Pass 2: slice each body up to the next header and trim trailing noise.
    out = []
    for idx, (li, code, name, part) in enumerate(heads):
        end_li = heads[idx + 1][0] if idx + 1 < len(heads) else len(lines)
        body = '\n'.join(_trim_trailing_noise(lines[li:end_li])).strip()
        out.append(dict(
            code=code,
            part=part or '본칙',
            order=0,
            title=f"{code}({name})" if name else code,
            body=body,
            tok=tok(body),
            links=sorted(set(MENTION_RE.findall(body)) - {code}),
            ext=sorted(set(EXTLAW_RE.findall(body)))[:6],
        ))

    # Pass 3: paginate over-CAP articles and assign the running order.
    final, order = [], 0
    for c in out:
        if c['tok'] <= CAP:
            final.append({**c, 'order': order})
            order += 1
            continue
        pages = _paginate(c['body'])
        for pi, pb in enumerate(pages):
            first = pi == 0
            final.append(dict(
                code=c['code'] if first else f"{c['code']}·p{pi+1}",
                part=c['part'],
                order=order,
                title=c['title'] if first else f"{c['title']} (p{pi+1}/{len(pages)})",
                body=pb,
                tok=tok(pb),
                links=c['links'] if first else [],
                ext=[],
            ))
            order += 1
    return final


async def process_one(conn, law, commit, verbose=True):
    """Rebuild the clause documents of one law.

    Args:
        conn: database connection exposing fetchrow/fetchval/execute/
            executemany/transaction (an asyncpg connection in main()).
        law: id of the parent law row in ``documents``.
        commit: when false, only compute statistics (dry run).
        verbose: print a per-law summary and sample titles.  The final
            "COMMITTED" line is printed regardless of this flag.

    Returns:
        (status, n_articles, n_links) where status is one of 'notfound',
        'nullmd', 'noart', 'dry' or 'committed'.
    """
    row = await conn.fetchrow(
        "SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin "
        "FROM documents WHERE id=$1", law)
    if not row:
        return ('notfound', 0, 0)
    if not row['md_content']:
        return ('nullmd', 0, 0)
    arts = build_articles(row['md_content'])
    if not arts:
        return ('noart', 0, 0)

    toks = [c['tok'] for c in arts]
    nlink = sum(len(c['links']) for c in arts)
    if verbose:
        parts = {}
        for c in arts:
            parts[c['part']] = parts.get(c['part'], 0) + 1
        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
        print(" 샘플:", [c['title'][:22] for c in arts[:6]])
    if not commit:
        return ('dry', len(arts), nlink)

    async with conn.transaction():
        # Replace any previously generated clauses (and their links) for this law.
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN "
            "(SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)

        # Article codes may repeat within one law (e.g. addenda restarting at
        # 제1조).  Keep one id per inserted row so every link is written for
        # the article that actually contains it, and resolve link targets to
        # the first occurrence of a code.
        clause_ids = []
        first_id_by_code = {}
        for c in arts:
            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents
                  (file_format,file_hash,title,md_content,parent_id,doc_kind,
                   clause_code,clause_part,clause_order,ai_domain,data_origin,
                   md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
                RETURNING id
            """, fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
                row['ai_domain'], row['data_origin'] or 'external')
            clause_ids.append(cid)
            first_id_by_code.setdefault(c['code'], cid)
            await conn.execute(
                "INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING",
                cid, c['part'])

        # Article-to-article back-links within the same law.  A mention whose
        # code has no clause row here is stored with dst_doc_id NULL (dangling).
        # NOTE(review): a "제N조" that follows a 「other law」 citation is still
        # resolved against this law's articles — confirm that is intended.
        edges = [
            (src, dst, first_id_by_code.get(dst), None, None, None)
            for src, c in zip(clause_ids, arts)
            for dst in c['links']
        ]
        if edges:
            await conn.executemany(
                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) "
                "VALUES ($1,$2,$3,$4,$5,$6)", edges)

    n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
    print(f" COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
    return ('committed', n, len(edges))


async def _run_batch(conn, commit):
    """Process every current primary law that has text and print totals."""
    laws = await conn.fetch(
        "SELECT lm.document_id AS id FROM legal_meta lm "
        "JOIN documents d ON d.id=lm.document_id "
        "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
        "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
        "ORDER BY lm.document_id")
    agg = {}
    tot_art = tot_link = 0
    zero = []  # laws in which no article header was recognised
    for i, r in enumerate(laws):
        st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
        agg[st] = agg.get(st, 0) + 1
        tot_art += na
        tot_link += nl
        if st == 'noart':
            zero.append(r['id'])
        if commit and (i + 1) % 30 == 0:
            print(f" …{i+1}/{len(laws)} (누적 조 {tot_art})")
    print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
    if zero:
        print(f" 0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")


async def main():
    """Command-line entry point: process one law id, or 'all' current laws.

    Reads the target (first non-flag argument) and the optional --commit flag
    from sys.argv, and the connection string from DATABASE_URL.

    Raises:
        SystemExit: when the target argument is missing or is neither 'all'
            nor an integer.
        KeyError: when DATABASE_URL is not set.
    """
    # Third-party driver, imported lazily so the parsing helpers stay
    # importable on machines without asyncpg.
    import asyncpg

    commit = '--commit' in sys.argv[1:]
    positional = [a for a in sys.argv[1:] if not a.startswith('--')]
    if not positional:
        raise SystemExit(_USAGE)
    arg = positional[0]
    law_id = None
    if arg != 'all':
        try:
            law_id = int(arg)
        except ValueError:
            raise SystemExit(f"invalid law id {arg!r}\n{_USAGE}") from None

    # Strip the SQLAlchemy-style '+asyncpg' driver suffix from the URL scheme.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if law_id is None:
            await _run_batch(conn, commit)
        else:
            await process_one(conn, law_id, commit, verbose=True)
    finally:
        # Close the connection even when processing fails.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())