# ─── Related documents (similarity; on-demand pgvector KNN — low load, nothing stored) ───
class RelatedItem(BaseModel):
    # One neighbouring document plus its similarity to the source document.
    id: int
    title: str | None = None
    ai_domain: str | None = None
    material_type: str | None = None
    year: int | None = None
    # 1 - (embedding <=> source embedding), rounded to 3 decimals by the query.
    sim: float | None = None


class RelatedResponse(BaseModel):
    # Response envelope: the queried document id and its neighbours.
    doc_id: int
    related: list[RelatedItem]


@router.get("/{doc_id}/related", response_model=RelatedResponse)
async def get_related_documents(
    doc_id: int,
    user: Annotated[User, Depends(get_current_user)],
    session: Annotated[AsyncSession, Depends(get_session)],
    limit: int = 8,
    same_type: bool = True,
):
    """Return the nearest neighbours of a document by document-level embedding.

    Computed on demand (nothing is stored or batched). Intended as an
    alternative link layer for corpora where a citation graph does not fit
    (e.g. industry technical articles, which have no citation network).

    With ``same_type=True`` only documents sharing the source's
    ``material_type`` are considered; with ``False`` the whole corpus is
    searched. Rows with ``doc_kind='clause'`` are excluded because the query
    only selects ``doc_kind='standard'``. ``limit`` is clamped to 1..30.
    """
    from sqlalchemy import text as sql_text

    capped = min(max(limit, 1), 30)
    if same_type:
        type_clause = "AND d.material_type = src.material_type"
    else:
        type_clause = ""

    # type_clause is one of two fixed literals, so the f-string carries no user input.
    statement = sql_text(
        f"""
        WITH src AS (
            SELECT embedding, material_type FROM documents WHERE id = :id
        )
        SELECT d.id, d.title, d.ai_domain, d.material_type, d.facet_year AS year,
               round((1 - (d.embedding <=> (SELECT embedding FROM src)))::numeric, 3) AS sim
        FROM documents d, src
        WHERE d.doc_kind = 'standard' AND d.deleted_at IS NULL
          AND d.id <> :id AND d.embedding IS NOT NULL
          AND (SELECT embedding FROM src) IS NOT NULL
          {type_clause}
        ORDER BY d.embedding <=> (SELECT embedding FROM src)
        LIMIT :lim
        """
    ).bindparams(id=doc_id, lim=capped)

    result = await session.execute(statement)

    related = []
    for row in result.mappings().all():
        similarity = row["sim"]
        related.append(
            RelatedItem(
                id=row["id"],
                title=row["title"],
                ai_domain=row["ai_domain"],
                material_type=row["material_type"],
                year=row["year"],
                # The query yields a NUMERIC; convert it to a plain float for the model.
                sim=None if similarity is None else float(similarity),
            )
        )
    return RelatedResponse(doc_id=doc_id, related=related)
+
관련 문서
+ {#each items as it (it.id)} + + {it.title} + + {#if it.material_type && KIND[it.material_type]}{KIND[it.material_type]}{/if} + {Math.round((it.sim ?? 0) * 100)} + + + {/each} +
+{/if} + + diff --git a/frontend/src/routes/documents/[id]/+page.svelte b/frontend/src/routes/documents/[id]/+page.svelte index a105a7c..e68fa6b 100644 --- a/frontend/src/routes/documents/[id]/+page.svelte +++ b/frontend/src/routes/documents/[id]/+page.svelte @@ -16,6 +16,7 @@ import Skeleton from '$lib/components/ui/Skeleton.svelte'; import HandwriteCanvas from '$lib/components/HandwriteCanvas.svelte'; import MarkdownDoc from '$lib/components/MarkdownDoc.svelte'; + import RelatedDocs from '$lib/components/RelatedDocs.svelte'; import { renderDocMarkdown } from '$lib/utils/docMarkdown'; import MarkdownStatusBadge from '$lib/components/MarkdownStatusBadge.svelte'; import NoteEditor from '$lib/components/editors/NoteEditor.svelte'; @@ -321,6 +322,7 @@ {#snippet rail()}
+ {#if doc.ai_tldr || doc.ai_summary}
TL;DR
#!/usr/bin/env python3
"""Persist technical guides (KOSHA guide) as a clause KB.

Each guide is split on its top-level numbered headings ('# 1. ...'); nested
headings such as '## 4.1 ...' stay inside their parent section. Sections are
written as child rows (doc_kind='clause', parent_id=<guide id>), the same
clause-KB model the ASME/law scripts use.

Usage: python3 guide_clause_persist.py <guide_id|all> [--commit]
Without --commit nothing is written (dry run).
"""
import asyncio
import hashlib
import os
import re
import statistics
import sys

CAP = 12000       # sections whose token estimate exceeds this are paginated
PAGE_TOK = 11000  # token budget per page when paginating
EN, KO = 0.217, 0.529  # estimated tokens per non-Hangul / Hangul character
# Numbered heading: '# 1. 목 적', '## 4.1 누출...'. Each number is 1-3 digits so
# that 4-digit years are not mistaken for section numbers.
ART_RE = re.compile(r'^#{1,6}\s*(\d{1,3}(?:\.\d{1,3})*)\.?\s+(\S.*)$')
TOP_RE = re.compile(r'^\d{1,3}$')
# External standard / regulation references (mostly dangling):
# ASME B16.5, KS B 1501, ISO 1234, 제N조.
EXT_RE = re.compile(r'(ASME\s+[A-Z][0-9.]+|KS\s+[A-Z]\s*[0-9]+|ISO\s+[0-9]+|제\d+조)')

USAGE = "Usage: python3 guide_clause_persist.py <guide_id|all> [--commit]"


def tok(s):
    """Estimate the token count of *s* from its Hangul / non-Hangul character mix."""
    ko = sum(1 for c in s if '가' <= c <= '힣')
    return int((len(s) - ko) * EN + ko * KO)


def _paginate(body):
    """Split *body* on line boundaries into pages of roughly PAGE_TOK tokens.

    A single line is never split, so one very long line can still exceed the budget.
    """
    pages, cur, used = [], [], 0
    for ln in body.split('\n'):
        cost = tok(ln) + 1
        if used + cost > PAGE_TOK and cur:
            pages.append('\n'.join(cur))
            cur, used = [ln], cost
        else:
            cur.append(ln)
            used += cost
    if cur:
        pages.append('\n'.join(cur))
    return pages


def build_sections(text):
    """Split guide markdown into top-level numbered sections.

    Returns a list of dicts with keys code, part, order, title, body, tok,
    links and ext, in document order. Sections whose token estimate exceeds
    CAP are split into pages coded '<code>·p2', '<code>·p3', ...
    Returns [] when no top-level numbered heading is found.
    """
    lines = text.split('\n')
    offsets, pos = [], 0
    for ln in lines:
        offsets.append(pos)
        pos += len(ln) + 1  # +1 for the '\n' removed by split

    bounds, seen = [], set()
    for i, ln in enumerate(lines):
        m = ART_RE.match(ln)
        if not m:
            continue
        code, name = m.group(1), m.group(2).strip()
        # Only top-level numbers start a section; a repeated number keeps the first hit.
        if not TOP_RE.match(code) or code in seen:
            continue
        seen.add(code)
        bounds.append((offsets[i], code, name))

    sections = []
    for idx, (start, code, name) in enumerate(bounds):
        end = bounds[idx + 1][0] if idx + 1 < len(bounds) else len(text)
        body = text[start:end].strip()
        ext = sorted(set(EXT_RE.findall(body)))[:8]
        sections.append(dict(code=code, part='본문', order=0, title=f"{code}. {name}"[:120],
                             body=body, tok=tok(body), links=[], ext=ext))

    # Paginate over-CAP sections and assign the final running order.
    final, order = [], 0
    for sec in sections:
        if sec['tok'] <= CAP:
            final.append({**sec, 'order': order})
            order += 1
            continue
        for pi, page in enumerate(_paginate(sec['body'])):
            final.append(dict(code=sec['code'] if pi == 0 else f"{sec['code']}·p{pi+1}", part='본문',
                              order=order, title=sec['title'] if pi == 0 else f"{sec['title']} (p{pi+1})",
                              body=page, tok=tok(page), links=[], ext=[]))
            order += 1
    return final


async def process_one(conn, gid, commit, verbose=True):
    """Split guide *gid* and, when *commit* is true, replace its clause rows.

    Returns (status, section_count) where status is one of 'notfound',
    'nullmd', 'few', 'dry' or 'committed'.
    """
    row = await conn.fetchrow("SELECT title, md_content, ai_domain, data_origin FROM documents WHERE id=$1", gid)
    if not row:
        return ('notfound', 0)
    if not row['md_content']:
        return ('nullmd', 0)
    secs = build_sections(row['md_content'])
    if len(secs) < 2:
        # Fewer than two sections: the document has no numbered structure.
        return ('few', len(secs))
    toks = [c['tok'] for c in secs]
    if verbose:
        print(f"guide={gid} «{(row['title'] or '')[:40]}» 섹션={len(secs)} median={int(statistics.median(toks))} max={max(toks)}")
        print("  샘플:", [c['title'][:26] for c in secs[:7]])
    if not commit:
        return ('dry', len(secs))
    # Delete-then-insert inside one transaction, so a failure leaves the old rows in place.
    async with conn.transaction():
        await conn.execute("DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", gid)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
        for c in secs:
            fh = hashlib.sha256(f"{gid}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                    clause_code,clause_part,clause_order,ai_domain,data_origin,
                    md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
            """, fh, c['title'], c['body'], gid, c['code'], c['part'], c['order'], row['ai_domain'], row['data_origin'] or 'external')
            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,'기술지침','kind') ON CONFLICT DO NOTHING", cid)
    n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", gid)
    print(f"  COMMITTED: {n} 섹션 for guide {gid}")
    return ('committed', len(secs))


async def main():
    """CLI entry point: process one guide id, or every guide when the argument is 'all'."""
    # Validate the arguments before touching the database.
    if len(sys.argv) < 2:
        sys.exit(USAGE)
    arg = sys.argv[1]
    commit = '--commit' in sys.argv
    if arg != 'all':
        try:
            gid = int(arg)
        except ValueError:
            sys.exit(USAGE)

    import asyncpg
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if arg == 'all':
            gs = await conn.fetch("SELECT id FROM documents WHERE material_type='guide' AND doc_kind='standard' "
                                  "AND deleted_at IS NULL AND md_content IS NOT NULL ORDER BY id")
            agg = {}
            tot = 0
            for i, r in enumerate(gs):
                st, n = await process_one(conn, r['id'], commit, verbose=False)
                agg[st] = agg.get(st, 0) + 1
                tot += n if st in ('dry', 'committed') else 0
                if commit and (i + 1) % 40 == 0:
                    print(f"  …{i+1}/{len(gs)} (누적섹션 {tot})")
            print(f"BATCH {'COMMIT' if commit else 'DRY'} guides={len(gs)} status={agg} 총섹션={tot}")
        else:
            await process_one(conn, gid, commit, verbose=True)
    finally:
        # Close the connection even when processing raises.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""Persist statutes as an article KB.

Each law is split into one document per article (조), with article-to-article
backlinks and a chapter (장) tag. Rows use the same clause-KB model as the
ASME script (doc_kind='clause', parent_id=<law id>). Extraction noise (short
'###' metadata headings repeated before each article) is trimmed.

Usage: python3 law_clause_persist.py <law_id|all> [--commit]
Without --commit nothing is written (dry run).
"""
import asyncio
import hashlib
import os
import re
import statistics
import sys

CAP = 12000       # articles whose token estimate exceeds this are paginated
PAGE_TOK = 11000  # token budget per page when paginating
EN, KO = 0.217, 0.529  # estimated tokens per non-Hangul / Hangul character
# Article heading: '### 제3조의2(가스안전관리...) body'
ART_RE = re.compile(r'^#{0,6}\s*(제\d+조(?:의\d+)?)\s*\(([^)]*)\)\s*(.*)$')
CHAP_RE = re.compile(r'^#{1,6}\s*(제\d+장(?:의\d+)?)\s*(.*)$')  # chapter = part
# Article mentions (backlink candidates)
MENTION_RE = re.compile(r'제\d+조(?:의\d+)?')
# References to other laws: 「law name」 ... 제N조
EXTLAW_RE = re.compile(r'「([^」]+)」')
_HEADING_RE = re.compile(r'^#{1,6}\s+(.*)$')

USAGE = "Usage: python3 law_clause_persist.py <law_id|all> [--commit]"


def tok(s):
    """Estimate the token count of *s* from its Hangul / non-Hangul character mix."""
    ko = sum(1 for c in s if '가' <= c <= '힣')
    return int((len(s) - ko) * EN + ko * KO)


def art_code(c):
    """Return the article code unchanged (e.g. '제3조의2')."""
    return c


def _is_trailing_noise(line):
    """Tell whether a stripped line at the tail of an article body should be dropped."""
    if CHAP_RE.match(line) and not ART_RE.match(line):
        # The heading of the NEXT chapter: it is recorded as that chapter's
        # `part`, and does not belong to the article that precedes it.
        return True
    mh = _HEADING_RE.match(line)
    if not mh:
        return False
    c = mh.group(1).strip()
    # Keep '[개정]'-style notes and anything starting with '제'.
    if c.startswith('[') or c.startswith('제'):
        return False
    # Short headings are treated as metadata (article number, '조문', date, title);
    # purely numeric ones are dropped regardless of length.
    return len(c) <= 30 or re.fullmatch(r'\d+', c) is not None


def _paginate(body):
    """Split *body* on line boundaries into pages of roughly PAGE_TOK tokens."""
    pages, cur, used = [], [], 0
    for ln in body.split('\n'):
        cost = tok(ln) + 1
        if used + cost > PAGE_TOK and cur:
            pages.append('\n'.join(cur))
            cur, used = [ln], cost
        else:
            cur.append(ln)
            used += cost
    if cur:
        pages.append('\n'.join(cur))
    return pages


def build_articles(text):
    """Split law text into articles.

    Returns a list of dicts with keys code, part, order, title, body, tok,
    links (other article codes mentioned in the body) and ext (names quoted
    in 「」, at most 6), in document order. Articles above CAP tokens are
    paginated as '<code>·p2', ...; only the first page keeps the links.
    Returns [] when no article heading is found.
    """
    lines = text.split('\n')
    arts = []  # (line_idx, code, name, part)
    cur_part = None
    for i, ln in enumerate(lines):
        ch = CHAP_RE.match(ln)
        if ch and not ART_RE.match(ln):
            rest = ch.group(2).strip()
            cur_part = (ch.group(1) + (' ' + rest if rest else '')).strip()
            continue
        m = ART_RE.match(ln)
        if m:
            arts.append((i, m.group(1), m.group(2).strip(), cur_part))

    out = []
    for idx, (li, code, name, part) in enumerate(arts):
        end_li = arts[idx + 1][0] if idx + 1 < len(arts) else len(lines)
        body_lines = lines[li:end_li]
        # Strip blank lines and metadata headings from the tail; the article's
        # own first line is never removed.
        while len(body_lines) > 1:
            last = body_lines[-1].strip()
            if last == '' or _is_trailing_noise(last):
                body_lines.pop()
                continue
            break
        body = '\n'.join(body_lines).strip()
        links = sorted(set(MENTION_RE.findall(body)) - {code})
        ext = sorted(set(EXTLAW_RE.findall(body)))[:6]
        out.append(dict(code=code, part=part or '본칙', order=0,
                        title=f"{code}({name})" if name else code,
                        body=body, tok=tok(body), links=links, ext=ext))

    # Paginate over-CAP articles and assign the final running order.
    final, order = [], 0
    for art in out:
        if art['tok'] <= CAP:
            final.append({**art, 'order': order})
            order += 1
            continue
        pages = _paginate(art['body'])
        for pi, page in enumerate(pages):
            final.append(dict(code=art['code'] if pi == 0 else f"{art['code']}·p{pi+1}", part=art['part'],
                              order=order, title=art['title'] if pi == 0 else f"{art['title']} (p{pi+1}/{len(pages)})",
                              body=page, tok=tok(page), links=art['links'] if pi == 0 else [], ext=[]))
            order += 1
    return final


async def process_one(conn, law, commit, verbose=True):
    """Split law *law* and, when *commit* is true, replace its article rows and links.

    Returns (status, article_count, link_count) where status is one of
    'notfound', 'nullmd', 'noart', 'dry' or 'committed'.
    """
    row = await conn.fetchrow("SELECT title, coalesce(md_content, extracted_text) AS md_content, ai_domain, data_origin FROM documents WHERE id=$1", law)
    if not row:
        return ('notfound', 0, 0)
    if not row['md_content']:
        return ('nullmd', 0, 0)
    arts = build_articles(row['md_content'])
    if not arts:
        return ('noart', 0, 0)
    toks = [c['tok'] for c in arts]
    nlink = sum(len(c['links']) for c in arts)
    if verbose:
        parts = {}
        for c in arts:
            parts[c['part']] = parts.get(c['part'], 0) + 1
        print(f"law={law} «{(row['title'] or '')[:34]}» 조문={len(arts)} median={int(statistics.median(toks))} "
              f"max={max(toks)} 장={len(parts)} 백링크={nlink}")
        print("  샘플:", [c['title'][:22] for c in arts[:6]])
    if not commit:
        return ('dry', len(arts), nlink)
    # Delete-then-insert inside one transaction, so a failure leaves the old rows in place.
    async with conn.transaction():
        await conn.execute(
            "DELETE FROM clause_links WHERE src_doc_id IN (SELECT id FROM documents WHERE parent_id=$1 AND doc_kind='clause')", law)
        await conn.execute("DELETE FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
        code2id = {}
        inserted = []  # (new row id, article) in insertion order
        for c in arts:
            fh = hashlib.sha256(f"{law}:{c['code']}:{c['body']}".encode()).hexdigest()
            cid = await conn.fetchval("""
                INSERT INTO documents (file_format,file_hash,title,md_content,parent_id,doc_kind,
                    clause_code,clause_part,clause_order,ai_domain,data_origin,
                    md_status,review_status,conversion_status,preview_status)
                VALUES ('md',$1,$2,$3,$4,'clause',$5,$6,$7,$8,$9,'success','approved','none','none') RETURNING id
            """, fh, c['title'], c['body'], law, c['code'], c['part'], c['order'],
                row['ai_domain'], row['data_origin'] or 'external')
            # First occurrence wins when an article code repeats.
            # NOTE(review): assumes repeats come from addenda that restart
            # numbering after the main text — confirm against real law dumps.
            code2id.setdefault(c['code'], cid)
            inserted.append((cid, c))
            await conn.execute("INSERT INTO document_tags(doc_id,tag,tag_kind) VALUES ($1,$2,'chapter') ON CONFLICT DO NOTHING", cid, c['part'])
        # Article-to-article backlinks. Each edge starts at the article's own
        # row, not at whichever row last used the same code. Mentions with no
        # matching article in this law keep dst_doc_id NULL (dangling).
        edges = [(cid, dst, code2id.get(dst), None, None, None)
                 for cid, c in inserted for dst in c['links']]
        if edges:
            await conn.executemany(
                "INSERT INTO clause_links(src_doc_id,dst_code,dst_doc_id,anchor,ctx,char_off) VALUES ($1,$2,$3,$4,$5,$6)", edges)
    n = await conn.fetchval("SELECT count(*) FROM documents WHERE parent_id=$1 AND doc_kind='clause'", law)
    print(f"  COMMITTED: {n} 조문 + {len(edges)} 백링크 for law {law}")
    return ('committed', n, len(edges))


async def main():
    """CLI entry point: process one law id, or every current primary law when the argument is 'all'."""
    # Validate the arguments before touching the database.
    if len(sys.argv) < 2:
        sys.exit(USAGE)
    arg = sys.argv[1]
    commit = '--commit' in sys.argv
    if arg != 'all':
        try:
            law_id = int(arg)
        except ValueError:
            sys.exit(USAGE)

    import asyncpg
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        if arg == 'all':
            laws = await conn.fetch("SELECT lm.document_id AS id FROM legal_meta lm "
                                    "JOIN documents d ON d.id=lm.document_id "
                                    "WHERE lm.law_doc_kind='primary' AND lm.version_status='current' "
                                    "AND coalesce(d.md_content, d.extracted_text) IS NOT NULL "
                                    "ORDER BY lm.document_id")
            agg = {}
            tot_art = tot_link = 0
            zero = []
            for i, r in enumerate(laws):
                st, na, nl = await process_one(conn, r['id'], commit, verbose=False)
                agg[st] = agg.get(st, 0) + 1
                tot_art += na
                tot_link += nl
                if st == 'noart':
                    zero.append(r['id'])
                if commit and (i + 1) % 30 == 0:
                    print(f"  …{i+1}/{len(laws)} (누적 조 {tot_art})")
            print(f"BATCH {'COMMIT' if commit else 'DRY'} laws={len(laws)} status={agg} 총조문={tot_art} 총백링크={tot_link}")
            if zero:
                print(f"  0-조(추출구조 이질) {len(zero)}건: {zero[:20]}")
        else:
            await process_one(conn, law_id, commit, verbose=True)
    finally:
        # Close the connection even when processing raises.
        await conn.close()


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""Measure (read-only) how feasible an in-corpus paper citation graph is.

Citation edges are estimated from DOIs found in the paper text:
own DOI = first DOI in the header (first 2500 characters);
cited DOIs = DOIs from the first References-like heading onward, or from the
whole text when no such heading is found. An edge links a paper to the
corpus paper that owns a cited DOI.
"""
import asyncio
import os
import re
import sys
from collections import Counter

DOI_RE = re.compile(r'10\.\d{4,9}/[^\s"<>)\]\},;]+')
REF_RE = re.compile(r'(references|참고문헌|bibliography|reference\s*list)', re.I)
HEAD_CHARS = 2500  # size of the header window searched for the paper's own DOI


def norm(d):
    """Normalise a DOI: drop trailing dots and lower-case it."""
    return d.rstrip('.').lower()


def build_citation_graph(rows):
    """Derive DOI ownership and cited-DOI sets from paper rows.

    *rows* is an iterable of mappings with 'id' and 'txt' keys.
    Returns (owner, cited, n_own, n_refsec):
      owner    -- dict DOI -> id of the first paper whose header starts with that DOI
      cited    -- dict paper id -> set of DOIs found in its reference area
      n_own    -- number of papers with at least one DOI in the header
      n_refsec -- number of papers with a References-like heading
    """
    owner, cited = {}, {}
    n_own = n_refsec = 0
    for r in rows:
        txt = r['txt']
        header_dois = [norm(d) for d in DOI_RE.findall(txt[:HEAD_CHARS])]
        if header_dois:
            owner.setdefault(header_dois[0], r['id'])
            n_own += 1
        m = REF_RE.search(txt)
        if m:
            n_refsec += 1
        # Fall back to the whole text when there is no References heading, as
        # the module docstring promises. The paper's own DOI may then appear
        # in its cited set; citation_edges() filters such self-references out.
        ref_area = txt[m.start():] if m else txt
        found = {norm(d) for d in DOI_RE.findall(ref_area)}
        if found:
            cited[r['id']] = found
    return owner, cited, n_own, n_refsec


def citation_edges(owner, cited):
    """Return (citing_id, cited_id, doi) for each cited DOI owned by another corpus paper."""
    edges = []
    for pid, dois in cited.items():
        for d in dois:
            target = owner.get(d)
            # `is not None` rather than truthiness, so a paper with id 0 is not dropped.
            if target is not None and target != pid:
                edges.append((pid, target, d))
    return edges


async def main():
    """Load all papers, estimate in-corpus citation edges and print summary statistics."""
    import asyncpg
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title, coalesce(md_content, extracted_text) AS txt FROM documents "
                                "WHERE material_type='paper' AND doc_kind='standard' AND deleted_at IS NULL "
                                "AND coalesce(md_content, extracted_text) IS NOT NULL")
    finally:
        # Only the single fetch needs the connection; close it even if the query fails.
        await conn.close()

    owner, cited, n_own, n_refsec = build_citation_graph(rows)
    edges = citation_edges(owner, cited)
    cited_papers = {e[0] for e in edges}
    target_papers = {e[1] for e in edges}
    print(f"papers={len(rows)} 헤더DOI보유={n_own} References보유={n_refsec} owner_map={len(owner)}")
    print(f"인용엣지(코퍼스내)={len(edges)} 인용하는논문={len(cited_papers)} 피인용논문={len(target_papers)}")
    # Most-cited papers.
    top = Counter(e[1] for e in edges).most_common(6)
    if top:
        idmap = {r['id']: r['title'] for r in rows}
        print("피인용 top:")
        for pid, c in top:
            print(f"  {c}회 ← {(idmap.get(pid) or '')[:48]}")


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""OpenAlex enrichment feasibility test.

Looks up a handful of paper titles through the OpenAlex works search and
prints the best match with some of its metadata (external API call).
The contact address sent as `mailto` can be overridden with the
OPENALEX_MAILTO environment variable.
"""
import asyncio
import os
import re

OPENALEX_WORKS_URL = "https://api.openalex.org/works"
DEFAULT_MAILTO = "hyun49196@gmail.com"


def describe_work(w):
    """Format one OpenAlex work record (a dict) as the summary line printed per match."""
    oid = (w.get("id") or "").split("/")[-1]  # last path segment of the work's id URL
    return (f"    → OA {oid} | {(w.get('title') or '')[:46]} | {w.get('publication_year')} | "
            f"cited_by={w.get('cited_by_count')} | refs={len(w.get('referenced_works') or [])} | doi={w.get('doi')}")


async def main():
    """Query OpenAlex for the first six papers with a title longer than 15 characters."""
    import asyncpg
    import httpx

    mailto = os.environ.get("OPENALEX_MAILTO", DEFAULT_MAILTO)
    conn = await asyncpg.connect(os.environ['DATABASE_URL'].replace('+asyncpg', ''))
    try:
        rows = await conn.fetch("SELECT id, title FROM documents WHERE material_type='paper' "
                                "AND doc_kind='standard' AND deleted_at IS NULL AND title IS NOT NULL "
                                "AND length(title) > 15 ORDER BY id LIMIT 6")
    finally:
        # The database is only needed for this one query; close it even on failure.
        await conn.close()

    async with httpx.AsyncClient(timeout=20) as client:
        for r in rows:
            title = re.sub(r'\s+', ' ', r['title']).strip()
            # Deliberately broad: this is a best-effort probe, so one failed
            # lookup is reported and the remaining titles are still tried.
            try:
                resp = await client.get(OPENALEX_WORKS_URL,
                                        params={"search": title[:200], "per_page": 1, "mailto": mailto})
                # Report HTTP errors (rate limit, 5xx) as such instead of
                # failing later while decoding an error body as JSON.
                resp.raise_for_status()
                res = resp.json().get("results") or []
                if not res:
                    print(f"[{r['id']}] NO MATCH | {title[:50]}")
                    continue
                print(f"[{r['id']}] {title[:46]}")
                print(describe_work(res[0]))
            except Exception as e:
                print(f"[{r['id']}] ERROR {type(e).__name__}: {e}")


if __name__ == "__main__":
    asyncio.run(main())