Files
hyungi_document_server/scripts/hier_outline_quality_gate.py
hyungi 53999b2825 fix(documents): g-measure junk 검출 all-caps 과탐 제거 + verdict=coarse 스크린 명시
전부-대문자 휴리스틱이 기술문서 정상 heading(GENERAL REQUIREMENTS/WELDING) 130건 과탐 →
windowed/clean doc 거짓 A_better 강등. 회사-접미사(INC./LLC…)만, cover 영역(앞 4노드)+미stored 게이트.
verdict 는 coarse 스크린(감사용)이고 실집행 결정 = 결정적 partition + 적대 워크플로임을 docstring 박제.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 12:58:36 +09:00

204 lines
10 KiB
Python

"""hier 개요 keep-better 게이트 + g-measure 엔진 (플랜 ds-outline-anchor-b5 g6-t1 / gm-t1).
READ-ONLY dry-run. doc 별로:
(A) 현 저장 hier 절제목 (source_type='hier_section', char_start IS NULL = extracted_text 산)
(B) build_hier_tree(md_content) 절제목 (= 새 g2 builder: split('\n')+UTF-16+fence skip)
를 비교해 산출:
- verdict {B_better, A_better, equivalent} (+ junk-heading 검출 → A_better 보호)
- B_jumptarget_count (build 후 jump-target node 수) — B3 게이트 입력
- hash_stable 판정 — UPDATE-only(g3-tU) vs re-decompose(g3-t2) 라우팅:
* hash_stable_strict = build(md) 가 저장 hier hash 를 position-by-position 100% 재현
(= 런타임 g3-tU 가 UPDATE-only 로 처리할 정확한 집합; demote 안 함)
* hash_stable_99 = >=99% 재현 (원 MEASURE2 분류 기준 — 비교용)
- dup_title_count / has_fence (measure3 budget note: fence 보유 doc 은 새 builder 에서 hash_changed flip 가능)
- REFINED PASS = (verdict B>=A) AND (B_jumptarget>=1)
★ gm-t1 재확인(이 빌드의 유일 잔여 측정): g2 builder 코딩 후 1회 실행 → REFINED PASS 중
hash_changed(=re-decompose) count 가 ~230 인지 확인(코드펜스-skip 으로 32 중 ≤2 flip → 최대 ~232 수용).
실행 (GPU 서버, 컨테이너):
docker compose exec -T fastapi python /app/scripts/hier_outline_quality_gate.py run
docker compose exec -T fastapi python /app/scripts/hier_outline_quality_gate.py run --json /tmp/measure.json
docker compose exec -T fastapi python /app/scripts/hier_outline_quality_gate.py run --doc 5140,5209,5165 # 코어 spot-check
"""
import argparse
import asyncio
import json
import os
import re
import sys
from collections import Counter
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from sqlalchemy import text
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from services.hier_decomp.builder import build_hier_tree
def _is_jump_target(node) -> bool:
    """Return True when ``node`` is a titled jump target.

    A node is structural when it is a leaf whose ``node_type`` is not ``"window"``,
    or when its ``node_type`` ends with ``"_split"``. A structural node only counts
    as a jump target if it also carries a non-empty ``section_title``.

    The original note states that this matches resolveAnchorMap / _JUMP_TARGET_PRED;
    both are defined outside this file.
    """
    if node.is_leaf and node.node_type != "window":
        is_structural = True
    else:
        kind = node.node_type
        # node_type may be None/empty, so guard before calling endswith.
        is_structural = bool(kind and kind.endswith("_split"))
    return is_structural and bool(node.section_title)
# Cover/TOC organisation-name junk detection (g6-t1): matches explicit company-name
# suffixes only (INC, LLC, CORP, CO. LTD, ...), case-insensitively. The earlier
# "almost all caps" heuristic is no longer part of the check (see _looks_junk).
_JUNK_ORG = re.compile(r"\b(INC\.?|LLC|L\.L\.C|CORP\.?|CO\.,?\s*LTD|CONSULTING|COMPANY|LIMITED|LTD\.?)\b", re.I)
# Matches a ``` or ~~~ code-fence marker at the start of any line (multiline mode),
# allowing up to three leading whitespace characters.
_FENCE_ANY = re.compile(r"(?m)^\s{0,3}(```|~~~)")
def _looks_junk(title: str | None) -> bool:
    """Return True when ``title`` contains an explicit company-name suffix.

    Only the ``_JUNK_ORG`` pattern is consulted. The all-caps heuristic was retired
    (2026-06-09) because it over-flagged about 130 legitimate all-caps headings in
    technical documents (e.g. 'GENERAL REQUIREMENTS', 'WELDING') and wrongly demoted
    windowed/clean docs to A_better. Callers are expected to additionally gate on
    cover position and on the title not already being stored.

    ``None`` and the empty string are never junk.
    """
    if not title:
        return False
    return _JUNK_ORG.search(title) is not None
def _make_engine():
    """Build the async SQLAlchemy engine from the ``DATABASE_URL`` environment variable.

    Raises:
        KeyError: if ``DATABASE_URL`` is not set in the environment.
    """
    database_url = os.environ["DATABASE_URL"]
    # pool_pre_ping is enabled exactly as in the original call.
    return create_async_engine(database_url, pool_pre_ping=True)
def _row_is_jump_target(row) -> bool:
    """Jump-target predicate for a stored ``document_chunks`` row (mapping access).

    Same rule as ``_is_jump_target`` applies to built nodes: (non-window leaf OR a
    node type ending in ``_split``) AND a non-empty section title.
    """
    node_type = row["node_type"]
    structural = (row["is_leaf"] and node_type != "window") or bool(
        node_type and node_type.endswith("_split"))
    return bool(structural and row["section_title"])


def _hash_alignment(nodes, stored) -> dict:
    """Compare rebuilt vs stored chunk hashes position by position.

    Returns the three hash keys of the per-doc result. When the node counts differ
    (or there is nothing stored) no positional alignment exists, so the match
    fraction is reported as 0.0 and both stability flags are False.
    """
    if not stored or len(nodes) != len(stored):
        return {"hash_match_frac": 0.0, "hash_stable_strict": False, "hash_stable_99": False}
    mismatches = sum(1 for node, row in zip(nodes, stored)
                     if node.chunk_content_hash != row["chunk_content_hash"])
    frac = (len(stored) - mismatches) / len(stored)
    return {
        "hash_match_frac": round(frac, 4),
        "hash_stable_strict": mismatches == 0,
        "hash_stable_99": frac >= 0.99,
    }


def _coarse_verdict(n_a: int, n_b: int, junk_b: bool) -> tuple[str, str | None]:
    """Return ``(verdict, notes)`` for the coarse A-vs-B screen.

    ``n_a`` / ``n_b`` are the stored / rebuilt jump-target counts. ``notes`` is only
    set for the "structure lost" case and is None otherwise.
    """
    if n_b == 0:
        return "A_better", None  # B has no outline (no jump targets)
    if junk_b:
        return "A_better", None  # B introduces cover junk
    if n_b >= max(1, n_a * 0.7):
        return ("B_better" if n_b > n_a else "equivalent"), None
    # B lost structure (the original comment cites doc 5209 as the absent-class case).
    return "A_better", "absent_or_degraded"


async def _measure_doc(session, doc_id):
    """Measure one document: stored hier outline (A) vs a fresh ``build_hier_tree`` (B).

    Only SELECT statements are issued.

    Args:
        session: async SQLAlchemy session used for the two lookups.
        doc_id: id of the document to measure.

    Returns:
        ``None`` when the document has no stored ``hier_section`` chunks. Otherwise a
        dict with ``doc_id`` and ``n_stored`` plus either
        - the md-null shape (``md_null``, ``verdict``, ``b_jumptarget``,
          ``hash_stable_strict``, ``refined_pass``) when ``md_content`` is empty or
          whitespace-only, or
        - the full measurement (``n_build``, ``b_jumptarget``, ``dup_title``,
          ``has_fence``, ``len_md``, ``hash_match_frac``, ``hash_stable_strict``,
          ``hash_stable_99``, ``junk_b``, ``a_jumptarget``, ``verdict``, optional
          ``notes``, ``refined_pass``).
    """
    # Fetch the stored rows first: docs without hier chunks are skipped, so there is
    # no point pulling their (potentially large) md_content.
    # NOTE(review): char_start is selected but never read here, while the module
    # docstring describes the stored set as "char_start IS NULL" - confirm whether a
    # filter on char_start is intended.
    stored = (await session.execute(text("""
        SELECT chunk_index, chunk_content_hash, node_type, is_leaf, section_title, char_start
        FROM document_chunks WHERE doc_id=:d AND source_type='hier_section'
        ORDER BY chunk_index"""), {"d": doc_id})).mappings().all()
    if not stored:
        return None

    md = await session.scalar(text("SELECT md_content FROM documents WHERE id=:d"), {"d": doc_id})
    res = {"doc_id": doc_id, "n_stored": len(stored)}
    if not md or not md.strip():
        # Nothing to rebuild from: keep A and never count this doc as a pass.
        res.update({"md_null": True, "verdict": "A_better", "b_jumptarget": 0,
                    "hash_stable_strict": False, "refined_pass": False})
        return res

    nodes = build_hier_tree(md)
    jump_targets = [n for n in nodes if _is_jump_target(n)]
    titles = [n.section_title for n in jump_targets]
    res["n_build"] = len(nodes)
    res["b_jumptarget"] = len(jump_targets)
    res["dup_title"] = len(titles) - len(set(titles))
    res["has_fence"] = bool(_FENCE_ANY.search(md))
    res["len_md"] = len(md)

    # Hash comparison is position-aligned (per the original note: the runtime g3-tU basis).
    res.update(_hash_alignment(nodes, stored))

    stored_titles = {s["section_title"] for s in stored if s["section_title"]}
    # Junk = a NEW org-name heading within the cover area only (the first 4 built
    # nodes). Headings already stored, or further into the body, are not considered.
    res["junk_b"] = any(
        _looks_junk(n.section_title) and n.section_title not in stored_titles
        for n in nodes[:4])

    # The verdict heuristic is a coarse screen kept for reproducibility/audit.
    # Per the 2026-06-09 note, the authoritative decision for real execution is:
    #   (a) a deterministic partition: pure_benefit (n_a at most 2) and comparable
    #       (ratio between 0.85 and 2) are auto-INCLUDE; overseg (ratio above 2) and
    #       absent (ratio below 0.85) go to adversarial verification;
    #   (b) an adversarial workflow (judge + refute) that settles INCLUDE/EXCLUDE per
    #       doc by comparing stored vs built titles.
    # Do not derive a destructive re-decompose list from this heuristic alone (it has
    # a history of junk over-detection and threshold over-fitting).
    # Apples-to-apples: both sides are counted as JUMP TARGETS, not all stored leaves;
    # window children would inflate n_a and wrongly push windowed docs to A_better.
    n_a = sum(1 for s in stored if _row_is_jump_target(s))
    res["a_jumptarget"] = n_a
    n_b = res["b_jumptarget"]

    verdict, notes = _coarse_verdict(n_a, n_b, res["junk_b"])
    res["verdict"] = verdict
    if notes is not None:
        res["notes"] = notes
    res["refined_pass"] = verdict in ("B_better", "equivalent") and n_b >= 1
    return res
async def cmd_run(args):
    """Run the measurement over the selected documents and print the summary.

    Args:
        args: parsed CLI namespace. ``args.doc`` is an optional comma-separated list
            of document ids (unset = every doc that has ``hier_section`` chunks);
            ``args.json`` is an optional path for a per-doc JSON dump.

    Raises:
        ValueError: if an entry of ``args.doc`` is not an integer.

    The database is only queried with SELECT statements. The engine is always
    disposed, even when a measurement fails.
    """
    doc_ids = [int(x) for x in args.doc.split(",") if x.strip()] if args.doc else None
    engine = _make_engine()
    sm = async_sessionmaker(engine, expire_on_commit=False)
    results = []
    try:
        async with sm() as session:
            if doc_ids is None:
                doc_ids = [r[0] for r in (await session.execute(text(
                    "SELECT DISTINCT doc_id FROM document_chunks WHERE source_type='hier_section' ORDER BY doc_id"))).all()]
            for d in doc_ids:
                r = await _measure_doc(session, d)
                # None means the doc has no stored hier chunks; it is skipped.
                if r is not None:
                    results.append(r)
    finally:
        await engine.dispose()

    total = len(results)
    md_null = [r for r in results if r.get("md_null")]
    measured = [r for r in results if not r.get("md_null")]
    passes = [r for r in measured if r.get("refined_pass")]
    # Sanity bucket: a passing verdict with zero jump targets.
    pass_jt0 = [r for r in measured if r["verdict"] in ("B_better", "equivalent") and r["b_jumptarget"] == 0]
    # Routing split of the passing docs: strict hash reproduction vs everything else.
    hash_stable = [r for r in passes if r.get("hash_stable_strict")]
    hash_stable_99 = [r for r in passes if r.get("hash_stable_99")]
    hash_changed = [r for r in passes if not r.get("hash_stable_strict")]
    verdict_dist = Counter(r["verdict"] for r in measured)
    dup_among_stable = [r for r in hash_stable if r.get("dup_title", 0) > 0]
    fence_among_stable = [r for r in hash_stable if r.get("has_fence")]

    print("=" * 64)
    print(f"hier doc 측정: {total} (md_null {len(md_null)}, measured {len(measured)})")
    print(f"verdict 분포: {dict(verdict_dist)}")
    print(f"B_jumptarget==0 (PASS-verdict 이나 빈 jump-target, B3 HOLD): {len(pass_jt0)}")
    print("-" * 64)
    print(f"REFINED PASS = (verdict B>=A) AND (B_jumptarget>=1): {len(passes)}")
    print(f" ├─ hash_stable (strict 100% position 재현 = g3-tU UPDATE-only): {len(hash_stable)}")
    print(f" │ dup_title>0: {len(dup_among_stable)} / has_fence: {len(fence_among_stable)}")
    print(f" │ (참고) hash_stable_99(원 MEASURE2 기준): {len(hash_stable_99)}")
    print(f" └─ hash_changed (re-decompose 대상, g3-t2 --reprocess): {len(hash_changed)} ← ★ '230' 재확인 수치")
    print("-" * 64)
    print(f" re-decompose --doc(B_jumptarget>=1) = {','.join(str(r['doc_id']) for r in hash_changed) or '(없음)'}")
    print(f" UPDATE-only --doc(hash_stable) = {','.join(str(r['doc_id']) for r in hash_stable) or '(없음)'}")
    if md_null:
        print(f" md_null(suspect, V4): {[r['doc_id'] for r in md_null]}")
    print("=" * 64)
    print("NOTE: '230' 은 hash_changed PASS 수치. 코드펜스-skip 으로 hash_stable 32 중 fence 보유분(measure3=2)이 "
          "hash_changed 로 flip 가능 → 230~232 수용(NEW-3 budget-only, 정확성은 g3-tU 런타임 100% VERIFY 가 보증).")

    if args.json:
        # ensure_ascii=False emits raw non-ASCII characters, so the file encoding must
        # be pinned; the locale default (e.g. ASCII under a C/POSIX locale) could fail.
        with open(args.json, "w", encoding="utf-8") as f:
            json.dump({"summary": {
                "total": total, "measured": len(measured), "refined_pass": len(passes),
                "hash_stable": len(hash_stable), "hash_changed": len(hash_changed),
                "b_jumptarget_0": len(pass_jt0), "md_null": [r["doc_id"] for r in md_null],
                "hash_changed_doc_ids": [r["doc_id"] for r in hash_changed],
                "hash_stable_doc_ids": [r["doc_id"] for r in hash_stable],
            }, "docs": results}, f, ensure_ascii=False, indent=2)
        print(f"[json] {args.json} 기록 ({len(results)} doc)")
def main():
    """CLI entry point: parse the sub-command and run its async handler.

    The only sub-command is ``run`` with the optional ``--doc`` (comma-separated doc
    ids) and ``--json`` (per-doc JSON dump path) options. A sub-command is required.
    """
    parser = argparse.ArgumentParser(description="hier 개요 keep-better 게이트 + g-measure (read-only)")
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    run_parser = subparsers.add_parser("run", help="전체(또는 --doc) 측정 + 분포 출력")
    run_parser.add_argument("--doc", default=None, help="comma-sep doc id (미지정=전 hier doc)")
    run_parser.add_argument("--json", default=None, help="per-doc 결과 JSON 덤프 경로")

    args = parser.parse_args()
    # Dispatch table keyed by sub-command name.
    handlers = {"run": cmd_run}
    handler = handlers[args.cmd]
    asyncio.run(handler(args))


if __name__ == "__main__":
    main()