From a6db6c999b263fbd2fd02bda6070752ecc923618 Mon Sep 17 00:00:00 2001
From: hyungi
Date: Sat, 13 Jun 2026 14:51:05 +0900
Subject: [PATCH] =?UTF-8?q?fix(safety):=20B-4=20=EB=A6=AC=EB=B7=B0=20?=
 =?UTF-8?q?=EB=B0=98=EC=98=81=20=E2=80=94=20=EB=8B=A8=EC=9D=BC=20=EC=88=A0?=
 =?UTF-8?q?=EC=96=B4=20=EC=A4=91=EC=95=99=ED=99=94=20+=20study/briefing=20?=
 =?UTF-8?q?=EA=B2=BD=EB=A1=9C=20=EC=BB=A4=EB=B2=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

적대 리뷰(10에이전트) 확정 반영:
- license_filter.py 신설 — restricted_exclude_sql(raw)/restricted_exclude_orm(ORM) 단일 정의.
  retrieval _license_sql·digest·briefing·study 풀이가 공유(드리프트 방지).
- major: explanation_rag(study 문제 AI 풀이 RAG)에 술어 누락 → doc_meta 쿼리에 ORM
  적용(valid_doc_ids 경유로 청크도 차단). briefing/loader 2쿼리에 누락 → digest 와 동일
  술어 추가(news restricted 부재=방어적·경로 일관성).
- blocker(low-impact): file_watcher changed-doc 경로 material/license 보정(merge 주입·
  license 부재 시만 — extract_meta clobber 회피, pre-B-4 적재분 동기화).
- 테스트: 단일-source 검증 + ORM 구성 스모크 2건 추가.

Co-Authored-By: Claude Opus 4.8 (1M context) --- app/services/briefing/loader.py | 9 ++++++-- app/services/digest/loader.py | 7 +++--- app/services/search/license_filter.py | 28 ++++++++++++++++++++++++ app/services/search/retrieval_service.py | 5 +++-- app/services/study/explanation_rag.py | 4 ++++ app/workers/file_watcher.py | 9 ++++++++ tests/test_b4_license_watch.py | 23 +++++++++++++++++++ 7 files changed, 78 insertions(+), 7 deletions(-) create mode 100644 app/services/search/license_filter.py diff --git a/app/services/briefing/loader.py b/app/services/briefing/loader.py index 42a1e85..a84e6d3 100644 --- a/app/services/briefing/loader.py +++ b/app/services/briefing/loader.py @@ -15,11 +15,12 @@ from sqlalchemy import text from core.database import async_session from core.utils import setup_logger +from services.search.license_filter import restricted_exclude_sql logger = setup_logger("briefing_loader") -_NEWS_WINDOW_SQL = text(""" +_NEWS_WINDOW_SQL = text(f""" SELECT d.id, d.title, @@ -41,6 +42,8 @@ _NEWS_WINDOW_SQL = text(""" AND d.created_at < :window_end AND d.embedding IS NOT NULL AND d.ai_summary IS NOT NULL + -- 안전 자료실 B-4: licensed_restricted 발행 차단 (digest 와 동일 공유 술어, 경로 일관성) + AND {restricted_exclude_sql("d")} """) @@ -49,7 +52,7 @@ _SOURCE_COUNTRY_SQL = text(""" """) -_HISTORICAL_CANDIDATES_SQL = text(""" +_HISTORICAL_CANDIDATES_SQL = text(f""" SELECT d.id, d.title, @@ -63,6 +66,8 @@ _HISTORICAL_CANDIDATES_SQL = text(""" AND d.created_at < :hist_end AND d.embedding IS NOT NULL AND d.ai_summary IS NOT NULL + -- 안전 자료실 B-4: licensed_restricted 발행 차단 (공유 술어) + AND {restricted_exclude_sql("d")} """) diff --git a/app/services/digest/loader.py b/app/services/digest/loader.py index 703c39f..3bead5c 100644 --- a/app/services/digest/loader.py +++ b/app/services/digest/loader.py @@ -15,11 +15,12 @@ from sqlalchemy import text from core.database import async_session from core.utils import setup_logger +from services.search.license_filter import 
restricted_exclude_sql logger = setup_logger("digest_loader") -_NEWS_WINDOW_SQL = text(""" +_NEWS_WINDOW_SQL = text(f""" SELECT d.id, d.title, @@ -41,9 +42,9 @@ _NEWS_WINDOW_SQL = text(""" AND d.created_at < :window_end AND d.embedding IS NOT NULL AND d.ai_summary IS NOT NULL - -- 안전 자료실 B-4: licensed_restricted 발행 차단 (retrieval_service._license_sql 와 동일 술어). + -- 안전 자료실 B-4: licensed_restricted 발행 차단 (모든 경로 공유 술어 = license_filter). -- news 채널엔 현재 restricted 부재 = 방어적 게이트(미래 유료 news 소스 대비, 경로 누락 방지). - AND COALESCE(d.extract_meta -> 'license' ->> 'restricted', 'false') <> 'true' + AND {restricted_exclude_sql("d")} """) diff --git a/app/services/search/license_filter.py b/app/services/search/license_filter.py new file mode 100644 index 0000000..3c49148 --- /dev/null +++ b/app/services/search/license_filter.py @@ -0,0 +1,28 @@ +"""안전 자료실 B-4 — licensed_restricted 단일 술어 (a안 U-2①, 모든 경로 공유 정의). + +색인은 허용하되 restricted=true(구매 전자책·유료자료)의 verbatim span 이 RAG 증거·발행물 +(검색/ask·digest·morning_briefing·study 풀이)에 들어가는 모든 경로를 구조적으로 차단. +경로마다 술어를 복붙하지 않고 이 한 정의를 공유 — 가드 누락/드리프트 방지 +([[feedback_structural_integrity_over_path_discipline]]). +개인 파일 열람(GET /documents/{id}?download)은 a안상 허용 = 미적용. + +두 표현(raw SQL / ORM)은 의미 동일: restricted 부재·false·extract_meta NULL = COALESCE 로 +미제외(redistribute=false 여도 restricted 부재면 미제외 — redistribute≠restricted 가 핵심). +""" + + +def restricted_exclude_sql(alias: str = "") -> str: + """raw text() 쿼리용 bare 술어('AND' 미포함). 
alias='' = 컬럼 직접 참조.""" + p = (alias + ".") if alias else "" + return f"COALESCE({p}extract_meta -> 'license' ->> 'restricted', 'false') <> 'true'" + + +def restricted_exclude_orm(): + """SQLAlchemy ORM .where() 절 — restricted_exclude_sql 과 동일 의미(JSONB extract_meta).""" + from sqlalchemy import func + + from models.document import Document + + return func.coalesce( + Document.extract_meta["license"]["restricted"].astext, "false" + ) != "true" diff --git a/app/services/search/retrieval_service.py b/app/services/search/retrieval_service.py index 366d1ca..c14a2eb 100644 --- a/app/services/search/retrieval_service.py +++ b/app/services/search/retrieval_service.py @@ -150,9 +150,10 @@ def _license_sql(alias: str) -> str: axis 필터(조건부)와 달리 항상 적용. restricted 부재/false = COALESCE 로 미제외 → 기존 코퍼스(restricted=true 0건)에서 결과 불변. 반환 ' AND ...' (alias='' = 컬럼 직접). + 술어 정의 = license_filter.restricted_exclude_sql 공유(digest/briefing/study 풀이와 단일 source). """ - p = (alias + ".") if alias else "" - return f" AND COALESCE({p}extract_meta -> 'license' ->> 'restricted', 'false') <> 'true'" + from services.search.license_filter import restricted_exclude_sql + return " AND " + restricted_exclude_sql(alias) # 2단계 gate (R2-B1) — SQL string interpolation 직전 final allowlist. diff --git a/app/services/study/explanation_rag.py b/app/services/study/explanation_rag.py index a690d8c..dc088b1 100644 --- a/app/services/study/explanation_rag.py +++ b/app/services/study/explanation_rag.py @@ -24,6 +24,7 @@ from models.chunk import DocumentChunk from models.document import Document from models.study_question import StudyQuestion from models.study_topic import StudyTopicDocument +from services.search.license_filter import restricted_exclude_orm logger = logging.getLogger(__name__) @@ -124,11 +125,14 @@ async def _gather_document_evidence( return [] # 매핑된 documents 메타 (제목·요약 표기) + # B-4: licensed_restricted 제외 → valid_doc_ids 에서 빠지므로 아래 청크 쿼리(doc_id IN)도 + # 자동 차단. 
study 풀이 RAG 도 retrieval/digest 와 동일 단일 술어 공유(a안 U-2①). doc_meta_rows = ( await session.execute( select(Document.id, Document.title, Document.ai_summary).where( Document.id.in_(doc_ids), Document.deleted_at.is_(None), + restricted_exclude_orm(), ) ) ).all() diff --git a/app/workers/file_watcher.py b/app/workers/file_watcher.py index 78313fd..f1987cc 100644 --- a/app/workers/file_watcher.py +++ b/app/workers/file_watcher.py @@ -320,6 +320,15 @@ async def watch_inbox(): existing.category = category if needs_conversion and not getattr(existing, "needs_conversion", False): existing.needs_conversion = True + # B-4 — 축/license 보정(B-4 이전 적재분이 재변경 시): material 미설정 시 주입, + # license 부재 시에만 merge 주입(clobber 회피 — 기존 extract_meta 키 보존). + if existing.material_type is None and target_mt is not None: + existing.material_type = target_mt + existing.jurisdiction = target_jur + if target_license and not (existing.extract_meta or {}).get("license"): + meta = dict(existing.extract_meta or {}) + meta["license"] = dict(target_license) + existing.extract_meta = meta if next_stage: await enqueue_stage(session, existing.id, next_stage) diff --git a/tests/test_b4_license_watch.py b/tests/test_b4_license_watch.py index a38d267..a3bd4d6 100644 --- a/tests/test_b4_license_watch.py +++ b/tests/test_b4_license_watch.py @@ -10,10 +10,33 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent / "app")) +from services.search.license_filter import ( # noqa: E402 + restricted_exclude_orm, + restricted_exclude_sql, +) from services.search.retrieval_service import _license_sql # noqa: E402 from workers.file_watcher import _TARGET_AXIS # noqa: E402 +def test_shared_predicate_single_source(): + # retrieval/digest/briefing 가 같은 술어 정의를 공유 — drift 방지(단일 source 계약) + assert _license_sql("d") == " AND " + restricted_exclude_sql("d") + assert _license_sql("") == " AND " + restricted_exclude_sql("") + assert restricted_exclude_sql("d").startswith("COALESCE(d.extract_meta") + + +def 
test_restricted_exclude_orm_constructs(): + # study 풀이(explanation_rag)용 ORM 표현 — 컴파일 SQL 이 raw 술어와 동일 구조인지 + from sqlalchemy.dialects import postgresql + + clause = restricted_exclude_orm() + sql = str(clause.compile(dialect=postgresql.dialect(), + compile_kwargs={"literal_binds": True})) + assert "extract_meta" in sql + assert "'license'" in sql and "'restricted'" in sql # JSONB 경로 키 + assert "'false'" in sql and "'true'" in sql # COALESCE 기본 + 비교값 + + def test_license_sql_shape_with_alias(): sql = _license_sql("d") assert sql.startswith(" AND ") # 항상 ' AND ...' (WHERE 합성용)