feat: enhance revision logic with fuzzy matching, dynamic material loading, and schema automation

- Improved RevisionComparator with fuzzy matching (RapidFuzz) and dynamic DB material loading
- Enhanced regex patterns for better size/material extraction
- Initialized Alembic for schema migrations and created a baseline migration
- Added entrypoint.sh for automated migrations in Docker
- Fixed SyntaxError in fitting_classifier.py
- Updated the test suite with tests for the new functionality
2026-01-09 09:36:40 +09:00
parent afea8428b2
commit f16bc662ad
11 changed files with 1575 additions and 76 deletions
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -147,6 +147,7 @@ class Settings(BaseSettings):
        env_file = ".env"
        env_file_encoding = "utf-8"
        case_sensitive = False
+        extra = "ignore"
        
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
--- a/backend/app/services/fitting_classifier.py
+++ b/backend/app/services/fitting_classifier.py
@@ -253,17 +253,28 @@ def classify_fitting(dat_file: str, description: str, main_nom: str,
    is_instrument = any(kw in desc_upper for kw in instrument_keywords)
    
    if is_instrument:
-        fitting_type["category"] = "INSTRUMENT_FITTING"
-        if "SWAGELOK" in desc_upper: fitting_type["brand"] = "SWAGELOK"
+        fitting_type_result["category"] = "INSTRUMENT_FITTING"
+        if "SWAGELOK" in desc_upper: fitting_type_result["brand"] = "SWAGELOK"
        
        # Tube OD 추출 (예: 1/4", 6MM, 12MM)
        tube_match = re.search(r'(\d+(?:/\d+)?)\s*(?:\"|INCH|MM)\s*(?:OD|TUBE)', desc_upper)
        if tube_match:
-            fitting_type["tube_od"] = tube_match.group(0)
+            fitting_type_result["tube_od"] = tube_match.group(0)

    return {
        "category": "FITTING",
-        "fitting_type": fitting_type,
+        "fitting_type": fitting_type_result,
+        "connection_method": connection_result,
+        "pressure_rating": pressure_result,
+        "schedule": schedule_result,
+        "manufacturing": manufacturing_result,
+        "overall_confidence": calculate_fitting_confidence({
+            "material": material_result.get("confidence", 0),
+            "fitting_type": fitting_type_result.get("confidence", 0),
+            "connection": connection_result.get("confidence", 0),
+            "pressure": pressure_result.get("confidence", 0)
+        })
+    }


 def analyze_size_pattern_for_fitting_type(description: str, main_nom: str, red_nom: str = None) -> Dict:
--- a/backend/app/services/revision_comparator.py
+++ b/backend/app/services/revision_comparator.py
@@ -93,16 +93,11 @@ class RevisionComparator:
    def compare_materials(self, previous_confirmed: Dict, new_materials: List[Dict]) -> Dict:
        """
        기존 확정 자재와 신규 자재 비교
-        
-        Args:
-            previous_confirmed: 이전 확정 자재 정보
-            new_materials: 신규 업로드된 자재 목록
-            
-        Returns:
-            비교 결과 딕셔너리
        """
        try:
-            # 이전 확정 자재를 해시맵으로 변환 (빠른 검색을 위해)
+            from rapidfuzz import fuzz
+
+            # 이전 확정 자재 해시맵 생성
            confirmed_materials = {}
            for item in previous_confirmed["items"]:
                material_hash = self._generate_material_hash(
@@ -112,13 +107,19 @@ class RevisionComparator:
                )
                confirmed_materials[material_hash] = item
            
+            # 해시 역참조 맵 (유사도 비교용)
+            # 해시 -> 정규화된 설명 문자열 (비교 대상)
+            # 여기서는 specification 자체를 비교 대상으로 사용 (가장 정보량이 많음)
+            confirmed_specs = {
+                h: item["specification"] for h, item in confirmed_materials.items()
+            }
+            
            # 신규 자재 분석
-            unchanged_materials = []  # 변경 없음 (분류 불필요)
-            changed_materials = []    # 변경됨 (재분류 필요)
-            new_materials_list = []   # 신규 추가 (분류 필요)
+            unchanged_materials = [] 
+            changed_materials = []    
+            new_materials_list = []   
            
            for new_material in new_materials:
-                # 자재 해시 생성 (description 기반)
                description = new_material.get("description", "")
                size = self._extract_size_from_description(description)
                material = self._extract_material_from_description(description)
@@ -126,13 +127,13 @@ class RevisionComparator:
                material_hash = self._generate_material_hash(description, size, material)
                
                if material_hash in confirmed_materials:
+                    # 정확히 일치하는 자재 발견 (해시 일치)
                    confirmed_item = confirmed_materials[material_hash]
                    
-                    # 수량 비교
                    new_qty = float(new_material.get("quantity", 0))
                    confirmed_qty = float(confirmed_item["bom_quantity"])
                    
-                    if abs(new_qty - confirmed_qty) > 0.001:  # 수량 변경
+                    if abs(new_qty - confirmed_qty) > 0.001:
                        changed_materials.append({
                            **new_material,
                            "change_type": "QUANTITY_CHANGED",
@@ -140,27 +141,49 @@ class RevisionComparator:
                            "previous_item": confirmed_item
                        })
                    else:
-                        # 수량 동일 - 기존 분류 결과 재사용
                        unchanged_materials.append({
                            **new_material,
                            "reuse_classification": True,
                            "previous_item": confirmed_item
                        })
                else:
-                    # 신규 자재
-                    new_materials_list.append({
-                        **new_material,
-                        "change_type": "NEW_MATERIAL"
-                    })
+                    # 해시 불일치 - 유사도 검사 (Fuzzy Matching)
+                    # 신규 자재 설명과 기존 확정 자재들의 스펙 비교
+                    best_match_hash = None
+                    best_match_score = 0
+                    
+                    # 성능을 위해 간단한 필터링 후 정밀 비교 권장되나, 
+                    # 현재는 전체 비교 (데이터량이 많지 않다고 가정)
+                    for h, spec in confirmed_specs.items():
+                        score = fuzz.ratio(description.lower(), spec.lower())
+                        if score > 85: # 85점 이상이면 매우 유사
+                            if score > best_match_score:
+                                best_match_score = score
+                                best_match_hash = h
+                    
+                    if best_match_hash:
+                        # 유사한 자재 발견 (오타 또는 미세 변경 가능성)
+                        similar_item = confirmed_materials[best_match_hash]
+                        new_materials_list.append({
+                            **new_material,
+                            "change_type": "NEW_BUT_SIMILAR",
+                            "similarity_score": best_match_score,
+                            "similar_to": similar_item
+                        })
+                    else:
+                        # 완전히 새로운 자재
+                        new_materials_list.append({
+                            **new_material,
+                            "change_type": "NEW_MATERIAL"
+                        })
            
-            # 삭제된 자재 찾기 (이전에는 있었지만 현재는 없는 것)
+            # 삭제된 자재 찾기
            new_material_hashes = set()
            for material in new_materials:
-                description = material.get("description", "")
-                size = self._extract_size_from_description(description)
-                material_grade = self._extract_material_from_description(description)
-                hash_key = self._generate_material_hash(description, size, material_grade)
-                new_material_hashes.add(hash_key)
+                d = material.get("description", "")
+                s = self._extract_size_from_description(d)
+                m = self._extract_material_from_description(d)
+                new_material_hashes.add(self._generate_material_hash(d, s, m))
            
            removed_materials = []
            for hash_key, confirmed_item in confirmed_materials.items():
@@ -186,7 +209,7 @@ class RevisionComparator:
                "removed_materials": removed_materials
            }
            
-            logger.info(f"리비전 비교 완료: 변경없음 {len(unchanged_materials)}, "
+            logger.info(f"리비전 비교 완료 (Fuzzy 적용): 변경없음 {len(unchanged_materials)}, "
                       f"변경됨 {len(changed_materials)}, 신규 {len(new_materials_list)}, "
                       f"삭제됨 {len(removed_materials)}")
            
@@ -195,7 +218,7 @@ class RevisionComparator:
        except Exception as e:
            logger.error(f"자재 비교 실패: {str(e)}")
            raise
-    
+
    def _extract_revision_number(self, revision: str) -> int:
        """리비전 문자열에서 숫자 추출 (Rev.1 → 1)"""
        try:
@@ -206,37 +229,136 @@ class RevisionComparator:
            return 0
    
    def _generate_material_hash(self, description: str, size: str, material: str) -> str:
-        """자재 고유성 판단을 위한 해시 생성"""
-        # RULES.md의 코딩 컨벤션 준수
-        hash_input = f"{description}|{size}|{material}".lower().strip()
+        """
+        자재 고유성 판단을 위한 해시 생성
+        
+        Args:
+            description: 자재 설명
+            size: 자재 규격/크기
+            material: 자재 재질
+            
+        Returns:
+            MD5 해시 문자열
+        """
+        import re
+        
+        def normalize(s: Optional[str]) -> str:
+            if s is None:
+                return ""
+            # 다중 공백을 단일 공백으로 치환하고 앞뒤 공백 제거
+            s = re.sub(r'\s+', ' ', str(s))
+            return s.strip().lower()
+
+        # 각 컴포넌트 정규화
+        d_norm = normalize(description)
+        s_norm = normalize(size)
+        m_norm = normalize(material)
+
+        # RULES.md의 코딩 컨벤션 준수 (pipe separator 사용)
+        # 값이 없는 경우에도 구분자를 포함하여 구조 유지 (예: "desc||mat")
+        hash_input = f"{d_norm}|{s_norm}|{m_norm}"
+        
        return hashlib.md5(hash_input.encode()).hexdigest()
    
    def _extract_size_from_description(self, description: str) -> str:
-        """자재 설명에서 사이즈 정보 추출"""
-        # 간단한 사이즈 패턴 추출 (실제로는 더 정교한 로직 필요)
+        """
+        자재 설명에서 사이즈 정보 추출
+        
+        지원하는 패턴 (단어 경계 \b 추가하여 정확도 향상):
+        - 1/2" (인치)
+        - 100A (A단위)
+        - 50mm (밀리미터)
+        - 10x20 (가로x세로)
+        - DN100 (DN단위)
+        """
+        if not description:
+            return ""
+            
        import re
        size_patterns = [
-            r'(\d+(?:\.\d+)?)\s*(?:mm|MM|인치|inch|")',
-            r'(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)',
-            r'DN\s*(\d+)',
-            r'(\d+)\s*A'
+            # Inch pattern (fractions/decimals): 1/2", 1.5" -- NOTE: for a mixed fraction such as 1-1/2" only the trailing 1/2" is matched
+            r'\b(\d+(?:[-/.]\d+)?)\s*(?:inch|인치|")',
+            # 밀리미터 패턴: 100mm, 100.5 MM
+            r'\b(\d+(?:\.\d+)?)\s*(?:mm|MM)\b',
+            # A단위 패턴: 100A, 100 A
+            r'\b(\d+)\s*A\b',
+            # DN단위 패턴: DN100, DN 100
+            r'DN\s*(\d+)\b',
+            # 치수 패턴: 10x20, 10*20
+            r'\b(\d+(?:\.\d+)?)\s*[xX*]\s*(\d+(?:\.\d+)?)\b'
        ]
        
        for pattern in size_patterns:
            match = re.search(pattern, description, re.IGNORECASE)
            if match:
-                return match.group(0)
+                return match.group(0).strip()
        
        return ""
    
+    def _load_materials_from_db(self) -> List[str]:
+        """DB에서 자재 목록 동적 로딩 (캐싱 적용 고려 가능)"""
+        try:
+            # MaterialSpecification 및 SpecialMaterial 테이블에서 자재 코드 조회
+            query = text("""
+                SELECT spec_code FROM material_specifications 
+                WHERE is_active = TRUE
+                UNION 
+                SELECT grade_code FROM material_grades 
+                WHERE is_active = TRUE
+                UNION
+                SELECT material_name FROM special_materials
+                WHERE is_active = TRUE
+            """)
+            result = self.db.execute(query).fetchall()
+            db_materials = [row[0] for row in result]
+            
+            # 기본 하드코딩 리스트 (DB 조회 실패 시 또는 보완용)
+            default_materials = [
+                "SUS316L", "SUS316", "SUS304L", "SUS304",
+                "SS316L", "SS316", "SS304L", "SS304",
+                "A105N", "A105", 
+                "A234 WPB", "A234",
+                "A106 Gr.B", "A106",
+                "WCB", "CF8M", "CF8", 
+                "CS", "STS", "PVC", "PP", "PE"
+            ]
+            
+            # 합치고 중복 제거 후 길이 역순 정렬 (긴 단어 우선 매칭)
+            combined = list(set(db_materials + default_materials))
+            combined.sort(key=len, reverse=True)
+            
+            return combined
+            
+        except Exception as e:
+            logger.warning(f"DB 자재 로딩 실패 (기본값 사용): {str(e)}")
+            materials = [
+                "SUS316L", "SUS316", "SUS304L", "SUS304",
+                "SS316L", "SS316", "SS304L", "SS304",
+                "A105N", "A105", 
+                "A234 WPB", "A234",
+                "A106 Gr.B", "A106",
+                "WCB", "CF8M", "CF8", 
+                "CS", "STS", "PVC", "PP", "PE"
+            ]
+            return materials
+
    def _extract_material_from_description(self, description: str) -> str:
-        """자재 설명에서 재질 정보 추출"""
-        # 일반적인 재질 패턴
-        materials = ["SS304", "SS316", "SS316L", "A105", "WCB", "CF8M", "CF8", "CS"]
+        """
+        자재 설명에서 재질 정보 추출
+        우선순위에 따라 매칭 (구체적인 재질 먼저)
+        """
+        if not description:
+            return ""
+            
+        # 자재 목록 로딩 (메모리 캐싱을 위해 클래스 속성으로 저장 고려 가능하지만 여기선 매번 호출로 단순화)
+        # 성능이 중요하다면 __init__ 시점에 로드하거나 lru_cache 사용 권장
+        materials = self._load_materials_from_db()
        
        description_upper = description.upper()
+        
        for material in materials:
-            if material in description_upper:
+            # NOTE: plain case-insensitive substring test, no word-boundary check (e.g. "PE" also matches inside "PIPE")
+            if material.upper() in description_upper:
                return material
        
        return ""