feat(docs): 관련 문서(유사도 KNN) 엔드포인트+패널 + 법령/지침 splitter
This commit is contained in:
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""기술지침(KOSHA guide) 절-KB persist: 번호섹션(# 1. 목적 / ## 4.1) 단위 분해 + 제본.
|
||||
ASME/법령과 동일 clause-KB 모델(doc_kind='clause', parent_id=지침, 검색제외, /book 리더 공용).
|
||||
Usage: python3 guide_clause_persist.py <id|all> [--commit]
|
||||
"""
|
||||
import asyncio, os, re, sys, hashlib, statistics
|
||||
|
||||
# Token budget: a section whose estimated token count exceeds CAP is split
# into pages, each filled up to roughly PAGE_TOK estimated tokens.
CAP = 12000
PAGE_TOK = 11000

# Estimated tokens per character: EN for non-Hangul characters, KO for
# Hangul syllables.
EN, KO = 0.217, 0.529

# Numbered-section heading, e.g. '# 1. 목 적' or '## 4.1 누출...'.
# Each number component is limited to 1-3 digits so that a 4-digit year
# at the start of a heading is not mistaken for a section number.
ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')

# A section code with no dotted sub-number, i.e. a top-level section.
TOP_RE = re.compile(r'^\d{1,3}$')

# References to external standards / regulations (mostly dangling):
# ASME B16.5, KS B 1501, ISO 9001, 제N조 (Article N).
# The ASME designation is matched as digits with optional dotted groups so a
# sentence-ending period ("ASME B16.5.") is not captured as part of the code.
EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9]+(?:\.[0-9]+)*|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')
|
||||
|
||||
def tok(s):
    """Return an estimated token count for the string ``s``.

    Hangul syllables ('가' through '힣') are weighted by KO and every other
    character by EN; the weighted sum is truncated to an int.
    """
    hangul = 0
    for ch in s:
        if '가' <= ch <= '힣':
            hangul += 1
    other = len(s) - hangul
    return int(other * EN + hangul * KO)
|
||||
|
||||
def _paginate(body):
    """Split ``body`` on line boundaries into pages of about PAGE_TOK estimated tokens.

    Lines are never broken, so a single line that alone exceeds PAGE_TOK
    still ends up as one (oversized) page.
    """
    pages, cur, cur_tok = [], [], 0
    for ln in body.split('\n'):
        ln_tok = tok(ln) + 1  # +1 accounts for the newline that joins the lines
        if cur and cur_tok + ln_tok > PAGE_TOK:
            pages.append('\n'.join(cur))
            cur, cur_tok = [ln], ln_tok
        else:
            cur.append(ln)
            cur_tok += ln_tok
    if cur:
        pages.append('\n'.join(cur))
    return pages


def build_sections(text):
    """Split guide markdown into top-level numbered sections, paginating oversized ones.

    A section starts at a heading matched by ART_RE whose code is top-level
    (TOP_RE, e.g. '4' but not '4.1') and runs up to the next such heading or
    the end of the text. Only the first heading carrying a given code is a
    boundary; a later heading that repeats the code stays inside the section
    it appears in. Text before the first boundary is not part of any section.

    Sections whose estimated token count exceeds CAP are split into pages;
    pages after the first get '·pN' appended to the code and ' (pN)' to the
    title.

    Args:
        text: Markdown content of the guide.

    Returns:
        A list of dicts in document order with keys ``code``, ``part``,
        ``order`` (0-based running index), ``title``, ``body``, ``tok``,
        ``links`` and ``ext`` (up to 8 sorted external references found in
        that entry's own body).
    """
    lines = text.split('\n')

    # Character offset at which each line starts (+1 for the '\n' removed by split).
    offsets = []
    pos = 0
    for ln in lines:
        offsets.append(pos)
        pos += len(ln) + 1

    # Collect section boundaries as (start offset, code, heading text).
    bounds = []
    seen = set()
    for i, ln in enumerate(lines):
        m = ART_RE.match(ln)
        if not m:
            continue
        code, name = m.group(1), m.group(2).strip()
        if not TOP_RE.match(code):
            continue  # only top-level numbered headings are boundaries
        if code in seen or not name:
            continue
        seen.add(code)
        bounds.append((offsets[i], code, name))

    final = []
    for idx, (start, code, name) in enumerate(bounds):
        end = bounds[idx + 1][0] if idx + 1 < len(bounds) else len(text)
        body = text[start:end].strip()
        title = f"{code}. {name}"[:120]

        pages = [body] if tok(body) <= CAP else _paginate(body)
        for pi, page in enumerate(pages):
            final.append(dict(
                code=code if pi == 0 else f"{code}·p{pi+1}",
                part='본문',
                order=len(final),
                title=title if pi == 0 else f"{title} (p{pi+1})",
                body=page,
                tok=tok(page),
                links=[],
                # Extract references per entry so paginated pages keep the
                # references they contain, just like unpaginated sections.
                ext=sorted(set(EXT_RE.findall(page)))[:8],
            ))
    return final
|
||||
|
||||
async def process_one(conn, gid, commit, verbose=True):
    """Split one guide document into clause rows and optionally persist them.

    Args:
        conn: Open database connection (used via fetchrow / fetchval /
            execute / transaction).
        gid: ``documents.id`` of the guide to process.
        commit: When false, only analyse (dry run); when true, replace the
            guide's existing clause rows inside one transaction.
        verbose: Print per-guide statistics and sample titles.

    Returns:
        ``(status, n)`` where status is one of 'notfound', 'nullmd', 'few'
        (fewer than two sections, treated as not number-structured), 'dry'
        or 'committed', and ``n`` is the number of sections found (0 for
        'notfound' / 'nullmd').
    """
    guide = await conn.fetchrow(
        "SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
    if not guide:
        return ('notfound', 0)
    if not guide['md_content']:
        return ('nullmd', 0)

    sections = build_sections(guide['md_content'])
    count = len(sections)
    if count < 2:
        # Fewer than two sections: the document has no numbered structure.
        return ('few', count)

    if verbose:
        toks = [sec['tok'] for sec in sections]
        print(f"guide={gid} «{(guide['title'] or '')[:40]}» 섹션={len(sections)} median={int(statistics.median(toks))} max={max(toks)}")
        print(" 샘플:", [sec['title'][:26] for sec in sections[:7]])

    if not commit:
        return ('dry', count)

    insert_sql = """
        INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                               clause_code,clause_part,clause_order,ai_domain,data_origin,
                               md_status,review_status,conversion_status,preview_status)
        VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
    """
    origin = guide['data_origin'] or 'external'

    async with conn.transaction():
        # Drop the guide's previous clause rows (and links that start from
        # them) before re-inserting, so a re-run replaces instead of duplicating.
        await conn.execute("DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", gid)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)

        for sec in sections:
            digest = hashlib.sha256(f"{gid}:{sec['code']}:{sec['body']}".encode()).hexdigest()
            clause_id = await conn.fetchval(
                insert_sql,
                digest, sec['title'], sec['body'], gid,
                sec['code'], sec['part'], sec['order'],
                guide['ai_domain'], origin,
            )
            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING", clause_id)

    stored = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
    print(f" COMMITTED: {stored} 섹션 for guide {gid}")
    return ('committed', count)
|
||||
|
||||
async def main():
    """Command-line entry point.

    Usage: python3 guide_clause_persist.py <id|all> [--commit]

    With ``all``, every non-deleted guide standard that has markdown content
    is processed and a per-status summary is printed; otherwise the single
    guide with the given integer id is processed verbosely. Without
    ``--commit`` nothing is written (dry run).

    Exits with a message (instead of a traceback) when the target argument
    is missing or not an integer / 'all', or when DATABASE_URL is unset.
    """
    import asyncpg  # imported here, as in the original, rather than at module level

    commit = '--commit' in sys.argv
    # The target is the first non-flag argument, so the flag may come first.
    positional = [a for a in sys.argv[1:] if not a.startswith('--')]
    if not positional:
        sys.exit("Usage: python3 guide_clause_persist.py <id|all> [--commit]")
    arg = positional[0]

    gid = None
    if arg != 'all':
        # Validate before opening a connection.
        try:
            gid = int(arg)
        except ValueError:
            sys.exit(f"invalid guide id {arg!r}: expected an integer or 'all'")

    dsn = os.environ.get('DATABASE_URL')
    if not dsn:
        sys.exit("DATABASE_URL is not set")

    # NOTE(review): presumably strips a SQLAlchemy-style '+asyncpg' driver
    # suffix so asyncpg accepts the URL — confirm against deployment config.
    conn = await asyncpg.connect(dsn.replace('+asyncpg', ''))
    try:
        if arg == 'all':
            gs = await conn.fetch("SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
                                  "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
            agg = {}
            tot = 0
            for i, r in enumerate(gs):
                st, n = await process_one(conn, r['id'], commit, verbose=False)
                agg[st] = agg.get(st, 0) + 1
                # Only guides that were actually split count toward the total.
                if st in ('dry', 'committed'):
                    tot += n
                if commit and (i + 1) % 40 == 0:
                    print(f" …{i+1}/{len(gs)} (누적섹션 {tot})")
            print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
        else:
            st, n = await process_one(conn, gid, commit, verbose=True)
            if st in ('notfound', 'nullmd', 'few'):
                # process_one prints nothing for these outcomes; report them
                # so a single-guide run is never silent.
                print(f"guide={gid} skipped: status={st} sections={n}")
    finally:
        # Always release the connection, even if processing raised.
        await conn.close()
|
||||
|
||||
# Run the CLI only when executed as a script, so importing this module
# (e.g. to reuse build_sections) does not start the event loop or touch the DB.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user