feat: 초기 프로젝트 구조 설정 및 소스 코드 추가

2025-07-24 15:06:29 +09:00
parent 9003616737
commit 5db20e2943
17 changed files with 4840 additions and 0 deletions
--- a/src/test_summarizer_fixed.py
+++ b/src/test_summarizer_fixed.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+요약 모델 테스트 (토큰 오류 수정)
+"""
+
+import torch
+import time
+from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
+
+def test_summarizer_fixed():
+    """Load the KoBART summarizer, summarize one sample, return True on success.
+
+    Weights are float16 on MPS but float32 on CPU, where half precision is
+    slow or unsupported for some ops depending on the torch version. Any
+    exception is printed and reported as a False return value.
+    """
+    print("🧪 한국어 요약 모델 테스트 (수정된 버전)")
+    model_name = "gogamza/kobart-summarization"
+
+    try:
+        # Pick the device first so the weight dtype can match it.
+        use_mps = torch.backends.mps.is_available()
+        device = torch.device("mps" if use_mps else "cpu")
+
+        print("📥 모델 로딩 중...")
+        tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
+        model = BartForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if use_mps else torch.float32,
+            low_cpu_mem_usage=True,
+        )
+
+        if use_mps:
+            model = model.to(device)
+            print("🚀 Apple Silicon MPS 가속 사용")
+        else:
+            print("💻 CPU 모드 사용")
+
+        def summarize_text_fixed(text):
+            """Summarize ``text``, print the result and stats, return the summary."""
+            print("\n📝 요약 테스트:")
+            print(f"원문 ({len(text)}자):")
+            print(f"{text[:150]}...")
+            start_time = time.perf_counter()
+
+            # Tokenize without token_type_ids -- this is the key to the fix.
+            inputs = tokenizer(
+                text,
+                return_tensors="pt",
+                max_length=1024,
+                truncation=True,
+                padding=True,
+                return_token_type_ids=False,
+            ).to(device)
+
+            # Generate the summary with beam search; no gradients are needed.
+            with torch.no_grad():
+                summary_ids = model.generate(
+                    input_ids=inputs['input_ids'],
+                    attention_mask=inputs['attention_mask'],
+                    max_length=150,
+                    min_length=30,
+                    num_beams=4,
+                    early_stopping=True,
+                    no_repeat_ngram_size=2,
+                    length_penalty=1.2,
+                    do_sample=False,
+                )
+
+            # Decode the first returned sequence back to text.
+            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            process_time = time.perf_counter() - start_time
+            print(f"\n📋 요약 결과 ({len(summary)}자):")
+            print(summary)
+            print(f"⏱️  처리 시간: {process_time:.2f}초")
+            print(f"📊 압축률: {len(summary)/len(text)*100:.1f}%")
+            return summary
+
+        # Run the test on a fixed sample paragraph.
+        test_text = """
+        인공지능과 기계학습 기술이 급속도로 발전하면서 우리의 일상생활과 업무 환경에 혁신적인 변화를 가져오고 있습니다. 
+        특히 자연어 처리 분야에서는 번역, 요약, 대화형 AI 등의 기술이 실용적인 수준에 도달하여 다양한 서비스에 적용되고 있습니다. 
+        기계학습 알고리즘은 대량의 텍스트 데이터를 학습하여 언어의 패턴과 의미를 이해하고, 이를 바탕으로 인간과 유사한 수준의 
+        언어 처리 능력을 보여주고 있습니다. 딥러닝 기술의 발전으로 번역의 정확도가 크게 향상되었으며, 실시간 번역 서비스도 
+        일상적으로 사용할 수 있게 되었습니다.
+        """
+        summarize_text_fixed(test_text.strip())
+        print("\n✅ 요약 모델 테스트 성공!")
+        return True
+    except Exception as e:
+        print(f"❌ 테스트 실패: {e}")
+        return False
+
+if __name__ == "__main__":
+    # Print the banner, run the check, then report the suggested next step.
+    print("🚀 한국어 요약 모델 테스트 (수정)")
+    print("=" * 50)
+    if not test_summarizer_fixed():
+        print("\n❌ 여전히 문제 있음")
+        print("📝 요약 없이 번역만으로 진행 고려")
+    else:
+        print("\n🎉 요약 모델 정상 작동!")
+        print("📝 다음 단계: 통합 번역 시스템 구축")