Fix special token identification logic and processor loading code

lablup · juk1329 · Jul 30, 2025 · Jul 30, 2025 · Jul 30, 2025 · Aug 1, 2025
commit a21f106d0a5229509e1bb58ebf48ec79e2003011
diff --git a/pytorch/vlm-models/README.md b/pytorch/vlm-models/README.md
@@ -2,8 +2,32 @@
 
 이 프로젝트는 Vision-Language Model (VLM)에 대해 Hugging Face 데이터셋으로 파인튜닝하고 평가하는 파이프라인을 제공합니다.
 
+**주요 특징**:
+
+-   🤖 **자동 특수 토큰 감지**: 모델의 tokenizer에서 모든 특수 토큰을 자동으로 감지하고 설정
+-   🔄 **범용 VLM 지원**: 다양한 VLM 모델 아키텍처를 자동으로 지원
+-   🛡️ **안전한 설정**: 사용자의 수동 토큰 설정 오류를 방지하는 자동화 시스템
+-   📊 **포괄적 평가**: VLM 태스크에 특화된 평가 지표 제공
+
 **주요 변경사항**: 기존 Language Model 파이프라인을 VLM 파이프라인으로 전환하였습니다.
 
+## ✨ 새로운 자동 특수 토큰 감지 시스템
+
+**더 이상 특수 토큰을 수동으로 설정할 필요가 없습니다!**
+
+### 🔍 자동 감지 기능
+
+-   **4단계 자동 감지**: Core tokens → Additional tokens → Visual tokens → Template compatibility
+-   **모델별 적응**: 각 VLM 모델의 특수 토큰을 자동으로 파악
+-   **안전한 처리**: 잘못된 토큰 설정으로 인한 학습 오류 방지
+
+### 🎯 감지되는 토큰 유형
+
+-   **기본 토큰**: pad, eos, bos, unk, sep, cls, mask
+-   **VLM 토큰**: `<image>`, `<video>`, `<|vision_start|>`, `<|vision_end|>` 등
+-   **모델별 토큰**: Qwen2VL, LLaVA, PaliGemma 등의 모델 특화 토큰
+-   **템플릿 토큰**: apply_chat_template에서 사용되는 토큰들
+
 ## 🚀 Quick Start
 
 ### 환경 설정

diff --git a/pytorch/vlm-models/pipeline-code/configs/vlm_collator_config.yaml b/pytorch/vlm-models/pipeline-code/configs/vlm_collator_config.yaml
@@ -71,12 +71,28 @@ batch_processing:
     return_tensors: "pt" # 반환할 텐서 타입
     padding_strategy: "longest" # 패딩 전략: longest, max_length
 
-# 특수 토큰 설정 (모델별로 다를 수 있음)
+# 특수 토큰 설정 (자동 감지 사용, 수동 설정 불필요)
 special_tokens:
-    image_token: "<image>" # 이미지 토큰
-    video_token: "<video>" # 비디오 토큰 (필요시)
-    pad_token: null # 패드 토큰 (auto-detect)
-    eos_token: null # EOS 토큰 (auto-detect)
+    # ✨ 새로운 자동 감지 시스템 ✨
+    # 이제 tokenizer.special_tokens_map과 additional_special_tokens에서
+    # 모든 특수 토큰을 자동으로 감지하므로 수동 설정이 불필요합니다!
+
+    # 🔧 기본 동작:
+    # - 패드, EOS, BOS 등 core 토큰들 자동 감지
+    # - 이미지, 비디오 등 VLM 관련 토큰들 자동 감지
+    # - apply_chat_template과 호환성 자동 검증
+    # - 손실 계산 시 자동 마스킹 적용
+
+    # 📝 Manual override (선택사항 - 고급 사용자용)
+    # 자동 감지가 실패하는 경우에만 아래 섹션을 활성화하세요
+    manual_tokens:
+        enabled: false # true로 변경시 수동 토큰 설정 활성화
+        # 수동 지정 토큰 목록 (기본값: 빈 목록)
+        tokens: []
+            # 예시 (필요시에만 사용):
+            # - name: "custom_image_token"
+            #   token: "<custom_image>"
+            #   ignore_in_loss: true
 
 # 비디오 처리 설정
 video_processing:

diff --git a/pytorch/vlm-models/pipeline-code/configs/vlm_model_config.yaml b/pytorch/vlm-models/pipeline-code/configs/vlm_model_config.yaml
@@ -1,53 +1,58 @@
 # VLM Model Configuration
 # 다양한 VLM 모델에 대한 클래스 매핑 설정
+#
+# 📝 Processor 설정 방식:
+# - processor_class가 지정되지 않으면 기본적으로 AutoProcessor 사용 (권장)
+# - 특수한 processor가 필요한 모델만 processor_class 명시적 지정
+# - 대부분의 VLM 모델은 AutoProcessor로 충분히 호환됩니다
 
 # 모델별 클래스 매핑
 model_classes:
-    # Qwen2-VL 모델들
+    # Qwen2-VL 모델들 (특수 processor 필요)
     "Qwen/Qwen2-VL-2B-Instruct":
         model_class: "Qwen2VLForConditionalGeneration"
-        processor_class: "Qwen2VLProcessor"
+        processor_class: "Qwen2VLProcessor" # 특수 processor 명시적 지정
         import_path: "transformers"
 
     "Qwen/Qwen2-VL-7B-Instruct":
         model_class: "Qwen2VLForConditionalGeneration"
-        processor_class: "Qwen2VLProcessor"
+        processor_class: "Qwen2VLProcessor" # 특수 processor 명시적 지정
         import_path: "transformers"
 
-    # LLaVA 모델들
+    # LLaVA 모델들 (AutoProcessor 사용)
     "llava-hf/llava-1.5-7b-hf":
         model_class: "LlavaForConditionalGeneration"
-        processor_class: "LlavaProcessor"
         import_path: "transformers"
+        # processor_class 미지정 -> AutoProcessor 사용
 
     "llava-hf/llava-1.5-13b-hf":
         model_class: "LlavaForConditionalGeneration"
-        processor_class: "LlavaProcessor"
         import_path: "transformers"
+        # processor_class 미지정 -> AutoProcessor 사용
 
-    # InternVL 모델들
+    # InternVL 모델들 (AutoProcessor 사용)
     "OpenGVLab/InternVL2-2B":
         model_class: "InternVLChatModel"
-        processor_class: "InternVLChatProcessor"
         import_path: "transformers"
+        # processor_class 미지정 -> AutoProcessor 사용
 
-    # PaliGemma 모델들
+    # PaliGemma 모델들 (특수 processor 필요)
     "google/paligemma-3b-pt-448":
         model_class: "PaliGemmaForConditionalGeneration"
-        processor_class: "PaliGemmaProcessor"
+        processor_class: "PaliGemmaProcessor" # 특수 processor 명시적 지정
         import_path: "transformers"
 
-    # Phi-3-Vision 모델들
+    # Phi-3-Vision 모델들 (AutoProcessor 사용)
     "microsoft/Phi-3-vision-128k-instruct":
         model_class: "Phi3VForCausalLM"
-        processor_class: "Phi3VProcessor"
         import_path: "transformers"
+        # processor_class 미지정 -> AutoProcessor 사용
 
-# 기본 fallback 설정 (AutoModel 사용)
+# 기본 fallback 설정 (VLM용 AutoModelForVision2Seq 사용)
 default_fallback:
-    model_class: "AutoModelForCausalLM"
-    processor_class: "AutoProcessor"
+    model_class: "AutoModelForVision2Seq"
     import_path: "transformers"
+    # processor_class 미지정 -> AutoProcessor 사용
 
 # 공통 로딩 파라미터
 loading_params: