#!/usr/bin/env python3
"""Persist KOSHA technical guides into the clause KB.

A guide's Markdown is split at its top-level numbered section headings
(e.g. ``# 1. 목적``; sub-headings such as ``## 4.1`` stay inside their parent
section) and every section is stored as a child row of the guide.

The rows follow the same clause-KB model as the ASME/statute documents
(doc_kind='clause', parent_id=<guide id>). Per the original author's note
these rows are excluded from search and served by the shared /book reader.

Usage: python3 guide_clause_persist.py <guide_id|all> [--commit]

Without ``--commit`` the script is a dry run: it only reports what it would
write.
"""
import asyncio
import hashlib
import os
import re
import statistics
import sys

# A section whose estimated token count exceeds CAP is split into pages that
# each target at most PAGE_TOK estimated tokens.
CAP = 12000
PAGE_TOK = 11000

# Estimated tokens per character: EN for every non-Hangul character,
# KO for every Hangul syllable (see tok()).
EN, KO = 0.217, 0.529

# Numbered section heading: '# 1. 목 적', '## 4.1 누출...'.
# Each number is 1-3 digits so that 4-digit years are not taken as sections.
ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')
TOP_RE = re.compile(r'^\d{1,3}$')

# References to external standards/regulations (mostly dangling):
# ASME B16.5 · KS B 1501 · 규칙 제N조
EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9.]+|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')

_USAGE = "Usage: python3 guide_clause_persist.py <guide_id|all> [--commit]"


def tok(s):
    """Return a rough token estimate for *s*.

    Hangul syllables (U+AC00..U+D7A3) are weighted by KO, every other
    character by EN; the weighted sum is truncated to an int.
    """
    ko = sum(1 for c in s if '가' <= c <= '힣')
    return int((len(s) - ko) * EN + ko * KO)


def _paginate(body):
    """Split *body* on line boundaries into pages of about PAGE_TOK tokens."""
    pages, current, used = [], [], 0
    for line in body.split('\n'):
        cost = tok(line) + 1  # +1 per line (presumably for the newline)
        # Never emit an empty page: a single over-long line stays on its own page.
        if current and used + cost > PAGE_TOK:
            pages.append('\n'.join(current))
            current, used = [line], cost
        else:
            current.append(line)
            used += cost
    if current:
        pages.append('\n'.join(current))
    return pages


def build_sections(text):
    """Split guide Markdown into top-level numbered sections.

    Only headings whose number has no dot (``1``, ``2``, ...) open a new
    section; a number that was already seen is not treated as a boundary, so
    its text stays in the section currently open. Text before the first
    numbered heading is not part of any section.

    Args:
        text: Markdown content of one guide.

    Returns:
        A list of dicts with keys ``code``, ``part``, ``order``, ``title``,
        ``body``, ``tok``, ``links`` and ``ext``, in document order. ``order``
        is a running index. Sections over CAP estimated tokens are split into
        pages; pages after the first get the code ``<code>·pN`` and the title
        suffix `` (pN)``, and paged entries carry an empty ``ext`` list.
    """
    # Pass 1: locate section boundaries as (char offset, code, name).
    bounds = []
    seen = set()
    offset = 0
    for line in text.split('\n'):
        match = ART_RE.match(line)
        if match:
            code, name = match.group(1), match.group(2).strip()
            # Only first occurrences of top-level numbers are boundaries.
            if TOP_RE.match(code) and code not in seen and name:
                seen.add(code)
                bounds.append((offset, code, name))
        offset += len(line) + 1

    # Pass 2: cut the text between consecutive boundaries.
    sections = []
    for idx, (start, code, name) in enumerate(bounds):
        end = bounds[idx + 1][0] if idx + 1 < len(bounds) else len(text)
        body = text[start:end].strip()
        ext = sorted(set(EXT_RE.findall(body)))[:8]  # keep at most 8 references
        sections.append(dict(code=code, part='본문', order=0,
                             title=f"{code}. {name}"[:120], body=body,
                             tok=tok(body), links=[], ext=ext))

    # Pass 3: paginate over-CAP sections and assign the running order.
    final = []
    order = 0
    for sec in sections:
        if sec['tok'] <= CAP:
            final.append({**sec, 'order': order})
            order += 1
            continue
        for page_idx, page_body in enumerate(_paginate(sec['body'])):
            first = page_idx == 0
            final.append(dict(
                code=sec['code'] if first else f"{sec['code']}·p{page_idx+1}",
                part='본문', order=order,
                title=sec['title'] if first else f"{sec['title']} (p{page_idx+1})",
                body=page_body, tok=tok(page_body), links=[], ext=[]))
            order += 1
    return final


async def process_one(conn, gid, commit, verbose=True):
    """Build the sections of one guide and, when *commit* is set, store them.

    On commit, the guide's existing clause rows (and the clause_links that
    originate from them) are deleted and re-inserted inside one transaction.

    Args:
        conn: Open database connection (``main`` passes an asyncpg connection).
        gid: ``documents.id`` of the guide.
        commit: When false, nothing is written (dry run).
        verbose: Print a per-guide summary and sample titles.

    Returns:
        ``(status, section_count)`` where status is one of ``'notfound'``,
        ``'nullmd'``, ``'few'``, ``'dry'`` or ``'committed'``.
    """
    row = await conn.fetchrow(
        "SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
    if not row:
        return ('notfound', 0)
    if not row['md_content']:
        return ('nullmd', 0)

    secs = build_sections(row['md_content'])
    if len(secs) < 2:
        # Fewer than 2 sections: the document has no numbered structure.
        return ('few', len(secs))

    toks = [c['tok'] for c in secs]
    if verbose:
        print(f"guide={gid} «{(row['title'] or '')[:40]}» 섹션={len(secs)} median={int(statistics.median(toks))} max={max(toks)}")
        print(" 샘플:", [c['title'][:26] for c in secs[:7]])
    if not commit:
        return ('dry', len(secs))

    async with conn.transaction():
        # Replace, not merge: drop the previous clause rows of this guide first.
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", gid)
        await conn.execute(
            "DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
        for c in secs:
            fh = hashlib.sha256(f"{gid}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents
                  (file_format,file_hash,title,md_content,parent_id,doc_kind,
                   clause_code,clause_part,clause_order,ai_domain,data_origin,
                   md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none')
                RETURNING id
                """, fh, c['title'], c['body'], gid, c['code'], c['part'], c['order'],
                row['ai_domain'], row['data_origin'] or 'external')
            await conn.execute(
                "INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING", cid)

    n = await conn.fetchval(
        "SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
    print(f" COMMITTED: {n} 섹션 for guide {gid}")
    return ('committed', len(secs))


async def main():
    """CLI entry point: process one guide id, or every guide with ``all``.

    The target is the first argument that is not a ``--flag``, so
    ``--commit`` may appear before or after it. Exits with the usage message
    when the target is missing or is neither ``all`` nor an integer.

    Raises:
        KeyError: If the ``DATABASE_URL`` environment variable is not set.
    """
    positional = [a for a in sys.argv[1:] if not a.startswith('--')]
    commit = '--commit' in sys.argv
    if not positional:
        sys.exit(_USAGE)
    target = positional[0]
    guide_id = None
    if target != 'all':
        # Validate before connecting so a typo never opens a DB connection.
        try:
            guide_id = int(target)
        except ValueError:
            sys.exit(f"invalid guide id: {target!r}\n{_USAGE}")

    # Imported lazily so the pure helpers above stay usable without asyncpg.
    import asyncpg

    # Strip the '+asyncpg' driver marker from the URL before connecting.
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if guide_id is None:
            gs = await conn.fetch(
                "SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
                "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
            agg = {}
            tot = 0
            for i, r in enumerate(gs):
                st, n = await process_one(conn, r['id'], commit, verbose=False)
                agg[st] = agg.get(st, 0) + 1
                if st in ('dry', 'committed'):
                    tot += n
                if commit and (i + 1) % 40 == 0:
                    print(f" …{i+1}/{len(gs)} (누적섹션 {tot})")
            print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
        else:
            await process_one(conn, guide_id, commit, verbose=True)
    finally:
        # Close the connection even when processing raises.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())