diff --git a/.gitignore b/.gitignore index f72c7e06..7246f210 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ __pycache__/ /doc/_apidoc/ /build +/hyperpod-pytorch-job-template/build /sagemaker-hyperpod/build /sagemaker-hyperpod/.coverage diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/hf/__init__.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/hf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/hf/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/hf/model.py new file mode 100644 index 00000000..cec37765 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/hf/model.py @@ -0,0 +1,360 @@ +from typing import Any, Dict, List, Optional, Union +from pydantic import BaseModel, ConfigDict + + +class RunConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + results_dir: Optional[str] = None + time_limit: Optional[str] = None + model_type: Optional[str] = None + dependency: Optional[str] = None + + +class TrainerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + devices: Optional[int] = None + num_nodes: Optional[int] = None + accelerator: Optional[str] = None + precision: Optional[str] = None + max_steps: Optional[int] = None + log_every_n_steps: Optional[int] = None + val_check_interval: Optional[Union[int, float]] = None + limit_val_batches: Optional[Union[int, float]] = None + logger: Optional[bool] = None + enable_checkpointing: Optional[bool] = None + use_distributed_sampler: Optional[bool] = None + max_epochs: Optional[int] = None + max_time: Optional[str] = None + limit_test_batches: Optional[int] = None + accumulate_grad_batches: Optional[int] = None + gradient_clip_val: Optional[float] = None + + +class CheckpointCallbackParamsConfig(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + save_top_k: Optional[int] = None + every_n_train_steps: Optional[int] = None + monitor: Optional[str] = None + mode: Optional[str] = None + save_last: Optional[bool] = None + + +class AutoCheckpointConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + enabled: Optional[bool] = None + + +class ExportFullModelConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + every_n_train_steps: Optional[int] = None + save_last: Optional[bool] = None + + +class TokenizerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + library: Optional[str] = None + type: Optional[str] = None + model: Optional[str] = None + delimiter: Optional[str] = None + vocab_file: Optional[str] = None + merge_file: Optional[str] = None + sentencepiece_legacy: Optional[bool] = None + + +class StepTimingKwargsConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + sync_cuda: Optional[bool] = None + buffer_size: Optional[int] = None + + +class ExpManagerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + exp_dir: Optional[str] = None + name: Optional[str] = None + create_tensorboard_logger: Optional[bool] = None + summary_writer_kwargs: Optional[Dict[str, Any]] = None + create_mlflow_logger: Optional[bool] = None + mlflow_logger_kwargs: Optional[Dict[str, Any]] = None + create_wandb_logger: Optional[bool] = None + wandb_logger_kwargs: Optional[Dict[str, Any]] = None + create_checkpoint_callback: Optional[bool] = None + checkpoint_callback_params: Optional[CheckpointCallbackParamsConfig] = None + checkpoint_dir: Optional[str] = None + resume_from_checkpoint: Optional[str] = None + auto_checkpoint: Optional[AutoCheckpointConfig] = None + export_full_model: Optional[ExportFullModelConfig] = None + explicit_log_dir: Optional[str] = None + resume_if_exists: Optional[bool] = None + resume_ignore_no_checkpoint: Optional[bool] = None + log_step_timing: Optional[bool] = None + step_timing_kwargs: 
Optional[StepTimingKwargsConfig] = None + + +class RopeScalingConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + rope_type: Optional[str] = None + factor: Optional[float] = None + high_freq_factor: Optional[float] = None + low_freq_factor: Optional[float] = None + original_max_position_embeddings: Optional[int] = None + + +class PeftConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + peft_type: Optional[str] = None + rank: Optional[int] = None + alpha: Optional[int] = None + dropout: Optional[float] = None + target_modules: Optional[List[str]] = None + + +class SchedulerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + warmup_steps: Optional[int] = None + constant_steps: Optional[int] = None + min_lr: Optional[float] = None + + +class OptimizerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + lr: Optional[float] = None + weight_decay: Optional[float] = None + betas: Optional[List[float]] = None + sched: Optional[SchedulerConfig] = None + bucket_cap_mb: Optional[int] = None + overlap_grad_sync: Optional[bool] = None + overlap_param_sync: Optional[bool] = None + contiguous_grad_buffer: Optional[bool] = None + contiguous_param_buffer: Optional[bool] = None + + +class DataConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + train_dir: Optional[str] = None + val_dir: Optional[str] = None + dataset_type: Optional[str] = None + use_synthetic_data: Optional[bool] = None + tokenizer_name: Optional[str] = None + zipped_data: Optional[bool] = None + data_impl: Optional[str] = None + splits_string: Optional[str] = None + seq_length: Optional[int] = None + skip_warmup: Optional[bool] = None + num_workers: Optional[int] = None + dataloader_type: Optional[str] = None + reset_position_ids: Optional[bool] = None + reset_attention_mask: Optional[bool] = None + eod_mask_loss: Optional[bool] = None + index_mapping_dir: Optional[str] = None + 
data_prefix: Optional[List[str]] = None + + +class VizTracerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + enabled: Optional[bool] = None + + +class DpoConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + enabled: Optional[bool] = None + beta: Optional[float] = None + label_smoothing: Optional[float] = None + + +class ModelConfig(BaseModel): + model_config = ConfigDict(extra="allow") + + model_type: Optional[str] = None + train_batch_size: Optional[int] = None + val_batch_size: Optional[int] = None + seed: Optional[int] = None + grad_clip: Optional[float] = None + log_reduced_training_loss: Optional[bool] = None + + # Additional model-specific fields found in recipes + max_window_layers: Optional[int] = None + rms_norm_eps: Optional[float] = None + tie_word_embeddings: Optional[bool] = None + use_sliding_window: Optional[bool] = None + + # Memory saving/distributed training configs + tensor_model_parallel_degree: Optional[int] = None + expert_model_parallel_degree: Optional[int] = None + context_parallel_degree: Optional[int] = None + moe: Optional[bool] = None + sliding_window: Optional[int] = None + num_experts_per_tok: Optional[int] = None + num_local_experts: Optional[int] = None + moe_load_balancing: Optional[str] = None + global_token_shuffle: Optional[bool] = None + moe_all_to_all_dispatcher: Optional[bool] = None + activation_checkpointing: Optional[bool] = None + activation_loading_horizon: Optional[int] = None + delayed_param: Optional[bool] = None + offload_activations: Optional[bool] = None + multi_modal: Optional[bool] = None + + # FSDP Configs + sharding_strategy: Optional[str] = None + forward_prefetch: Optional[bool] = None + shard_degree: Optional[int] = None + backward_fetch_policy: Optional[str] = None + auto_wrap_policy: Optional[str] = None + limit_all_gathers: Optional[bool] = None + use_orig_param: Optional[bool] = None + + # FP8 config + fp8: Optional[bool] = None + fp8_amax_history_len: Optional[int] = None 
+ fp8_amax_compute_algo: Optional[str] = None + + # Model architecture + max_context_width: Optional[int] = None + max_position_embeddings: Optional[int] = None + num_hidden_layers: Optional[int] = None + hidden_size: Optional[int] = None + num_attention_heads: Optional[int] = None + intermediate_size: Optional[int] = None + initializer_range: Optional[float] = None + layernorm_epsilon: Optional[float] = None + vocab_size: Optional[int] = None + num_key_value_heads: Optional[int] = None + use_flash_attention: Optional[bool] = None + rope_theta: Optional[float] = None + rope_scaling: Optional[RopeScalingConfig] = None + + # Finetuning config + do_finetune: Optional[bool] = None + hf_model_name_or_path: Optional[str] = None + hf_access_token: Optional[str] = None + peft: Optional[PeftConfig] = None + + precision: Optional[str] = None + lr_decay_iters: Optional[int] = None + optim: Optional[OptimizerConfig] = None + data: Optional[DataConfig] = None + viztracer: Optional[VizTracerConfig] = None + dpo: Optional[DpoConfig] = None + + # Megatron-specific fields + mcore_gpt: Optional[bool] = None + micro_batch_size: Optional[int] = None + global_batch_size: Optional[int] = None + rampup_batch_size: Optional[int] = None + pipeline_model_parallel_size: Optional[int] = None + virtual_pipeline_model_parallel_size: Optional[int] = None + encoder_seq_length: Optional[int] = None + ffn_hidden_size: Optional[int] = None + num_query_groups: Optional[int] = None + init_method_std: Optional[float] = None + use_scaled_init_method: Optional[bool] = None + kv_channels: Optional[int] = None + apply_query_key_layer_scaling: Optional[bool] = None + normalization: Optional[str] = None + do_layer_norm_weight_decay: Optional[bool] = None + make_vocab_size_divisible_by: Optional[int] = None + pre_process: Optional[bool] = None + post_process: Optional[bool] = None + persist_layer_norm: Optional[bool] = None + bias: Optional[bool] = None + activation: Optional[str] = None + headscale: 
Optional[bool] = None + transformer_block_type: Optional[str] = None + openai_gelu: Optional[bool] = None + normalize_attention_scores: Optional[bool] = None + position_embedding_type: Optional[str] = None + rotary_percentage: Optional[float] = None + apply_rope_fusion: Optional[bool] = None + cross_entropy_loss_fusion: Optional[bool] = None + attention_type: Optional[str] = None + share_embeddings_and_output_weights: Optional[bool] = None + scale_positional_embedding: Optional[bool] = None + tokenizer: Optional[TokenizerConfig] = None + native_amp_init_scale: Optional[int] = None + native_amp_growth_interval: Optional[int] = None + hysteresis: Optional[int] = None + fp32_residual_connection: Optional[bool] = None + fp16_lm_cross_entropy: Optional[bool] = None + megatron_amp_O2: Optional[bool] = None + grad_allreduce_chunk_size_mb: Optional[int] = None + grad_div_ar_fusion: Optional[bool] = None + gradient_accumulation_fusion: Optional[bool] = None + bias_activation_fusion: Optional[bool] = None + bias_dropout_add_fusion: Optional[bool] = None + masked_softmax_fusion: Optional[bool] = None + resume_from_checkpoint: Optional[str] = None + use_cpu_initialization: Optional[bool] = None + onnx_safe: Optional[bool] = None + apex_transformer_log_level: Optional[int] = None + gradient_as_bucket_view: Optional[bool] = None + sync_batch_comm: Optional[bool] = None + activations_checkpoint_granularity: Optional[str] = None + activations_checkpoint_method: Optional[str] = None + activations_checkpoint_num_layers: Optional[int] = None + num_micro_batches_with_partial_activation_checkpoints: Optional[int] = None + activations_checkpoint_layers_per_pipeline: Optional[int] = None + sequence_parallel: Optional[bool] = None + deterministic_mode: Optional[bool] = None + transformer_engine: Optional[bool] = None + fp8_e4m3: Optional[bool] = None + fp8_hybrid: Optional[bool] = None + fp8_margin: Optional[int] = None + fp8_interval: Optional[int] = None + use_emha: Optional[bool] = 
None + ub_tp_comm_overlap: Optional[bool] = None + + +class EvaluationConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + task: Optional[str] = None + strategy: Optional[str] = None + metric: Optional[str] = None + subtask: Optional[str] = None + + +class InferenceConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_new_tokens: Optional[int] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + temperature: Optional[float] = None + + +class HfRecipeSchema(BaseModel): + model_config = ConfigDict(extra="forbid") + + # Common configurations + run: RunConfig + + # Training and fine-tuning specific configurations + trainer: Optional[TrainerConfig] = None + exp_manager: Optional[ExpManagerConfig] = None + use_smp_model: Optional[bool] = None + distributed_backend: Optional[str] = None + model: Optional[ModelConfig] = None + + # Evaluation specific configurations + evaluation: Optional[EvaluationConfig] = None + + # Inference specific configurations + inference: Optional[InferenceConfig] = None diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/neuron_hf/__init__.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/neuron_hf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/neuron_hf/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/neuron_hf/model.py new file mode 100644 index 00000000..3e1a1d88 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/neuron_hf/model.py @@ -0,0 +1,182 @@ +from typing import Any, Dict, List, Optional, Union +from pydantic import BaseModel, ConfigDict + + +class RunConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + results_dir: Optional[str] = None + time_limit: Optional[str] = None + model_type: Optional[str] = None + 
compile: Optional[int] = None + + +class TrainerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + devices: Optional[int] = None + num_nodes: Optional[int] = None + max_epochs: Optional[int] = None + max_steps: Optional[int] = None + log_every_n_steps: Optional[int] = None + val_check_interval: Optional[int] = None + check_val_every_n_epoch: Optional[int] = None + num_sanity_val_steps: Optional[int] = None + limit_val_batches: Optional[float] = None + limit_test_batches: Optional[float] = None + gradient_clip_val: Optional[float] = None + + +class CheckpointCallbackParamsConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + monitor: Optional[str] = None + save_top_k: Optional[int] = None + mode: Optional[str] = None + save_last: Optional[bool] = None + filename: Optional[str] = None + model_parallel_size: Optional[int] = None + every_n_train_steps: Optional[int] = None + + +class ExpManagerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + log_local_rank_0_only: Optional[bool] = None + create_tensorboard_logger: Optional[bool] = None + summary_writer_kwargs: Optional[Dict[str, str]] = None + create_mlflow_logger: Optional[bool] = None + mlflow_logger_kwargs: Optional[Dict[str, str]] = None + create_wandb_logger: Optional[bool] = None + wandb_logger_kwargs: Optional[Dict[str, str]] = None + explicit_log_dir: Optional[str] = None + exp_dir: Optional[str] = None + name: Optional[str] = None + resume_if_exists: Optional[bool] = None + resume_ignore_no_checkpoint: Optional[bool] = None + create_checkpoint_callback: Optional[bool] = None + checkpoint_callback_params: Optional[CheckpointCallbackParamsConfig] = None + log_parameter_norm: Optional[bool] = None + log_gradient_norm: Optional[bool] = None + enable_recovery_time_instrumentation: Optional[bool] = None + save_xser: Optional[bool] = None + load_xser: Optional[bool] = None + save_bf16: Optional[bool] = None + async_checkpointing: Optional[bool] = None + 
resume_from_checkpoint: Optional[str] = None + + +class DistributedStrategyConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + tensor_model_parallel_size: Optional[int] = None + pipeline_model_parallel_size: Optional[int] = None + virtual_pipeline_model_parallel_size: Optional[int] = None + zero1: Optional[bool] = None + sequence_parallel: Optional[bool] = None + kv_replicator: Optional[int] = None + + +class TokenizerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + type: Optional[str] = None + + +class DataConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + micro_batch_size: Optional[int] = None + global_batch_size: Optional[int] = None + train_dir: Optional[str] = None + val_dir: Optional[str] = None + packing: Optional[bool] = None + use_sft_style_data_module: Optional[bool] = None + dev_choose_samples: Optional[int] = None + seq_length: Optional[int] = None + tokenizer: Optional[TokenizerConfig] = None + + +class SchedulerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + warmup_steps: Optional[int] = None + max_steps: Optional[int] = None + + +class OptimizerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + lr: Optional[float] = None + weight_decay: Optional[float] = None + capturable: Optional[bool] = None + betas: Optional[List[float]] = None + sched: Optional[SchedulerConfig] = None + + +class FusionsConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + softmax: Optional[bool] = None + flash_attention: Optional[bool] = None + + +class ModelConfig(BaseModel): + # Allow additional field as some recipes have "model_config" field and + # this name is not allowed in Pydantic + model_config = ConfigDict(extra="allow") + + encoder_seq_length: Optional[int] = None + max_position_embeddings: Optional[int] = None + num_layers: Optional[int] = None + hidden_size: Optional[int] = None + qkv_linear: Optional[bool] = None 
+ rope_theta: Optional[float] = None + use_cpu_initialization: Optional[bool] = None + weight_init_only: Optional[bool] = None + activations_checkpoint_granularity: Optional[str] = None + fusions: Optional[FusionsConfig] = None + do_layer_norm_weight_decay: Optional[bool] = None + optim: Optional[OptimizerConfig] = None + + +class PrecisionConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + type: Optional[str] = None + master_weights: Optional[bool] = None + fp32_grad_acc: Optional[bool] = None + xla_use_bf16: Optional[str] = None + xla_downcast_bf16: Optional[str] = None + neuron_rt_stochastic_rounding_en: Optional[str] = None + + +class NeuronHfRecipeSchema(BaseModel): + """Schema for neuron-hf SageMaker HyperPod recipes.""" + + model_config = ConfigDict(extra="forbid") + + # Common configurations + run: RunConfig + name: Optional[str] = None + model_source: Optional[str] = None + seed: Optional[int] = None + + # Training configurations + trainer: Optional[TrainerConfig] = None + exp_manager: Optional[ExpManagerConfig] = None + distributed_strategy: Optional[DistributedStrategyConfig] = None + data: Optional[DataConfig] = None + model: Optional[ModelConfig] = None + precision: Optional[PrecisionConfig] = None + + # Neuron-specific configurations + compiler_flags: Optional[str] = None + compiler_cache_url: Optional[str] = None + aync_exec_max_inflight_requests: Optional[int] = None + bucket_size_collectives: Optional[int] = None + neuron_rt_exec_timeout: Optional[int] = None + neuron_experimental_compress_rg: Optional[bool] = None diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova/__init__.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova/model.py 
b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova/model.py new file mode 100644 index 00000000..7c75cb79 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova/model.py @@ -0,0 +1,167 @@ +from typing import List, Optional +from pydantic import BaseModel, ConfigDict + + +class RunConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + model_type: Optional[str] = None + model_name_or_path: Optional[str] = None + replicas: Optional[int] = None + data_s3_path: Optional[str] = None + output_s3_path: Optional[str] = None + + # PPO-specific replica configurations + actor_train_replicas: Optional[int] = None + rm_replicas: Optional[int] = None + cm_replicas: Optional[int] = None + actor_generation_replicas: Optional[int] = None + am_replicas: Optional[int] = None + + +class TrainerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_epochs: Optional[int] = None + num_nodes: Optional[int] = None + + +class SchedulerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + warmup_steps: Optional[int] = None + constant_steps: Optional[int] = None + min_lr: Optional[float] = None + + +class OptimizerConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + lr: Optional[float] = None + adam_w_mode: Optional[bool] = None + eps: Optional[float] = None + weight_decay: Optional[float] = None + betas: Optional[List[float]] = None + sched: Optional[SchedulerConfig] = None + + +class DpoConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + beta: Optional[float] = None + + +class LoraTuningConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + loraplus_lr_ratio: Optional[float] = None + alpha: Optional[float] = None + adapter_dropout: Optional[float] = None + + +class PeftConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + peft_scheme: Optional[str] = None + lora_tuning: 
Optional[LoraTuningConfig] = None + + +class ModelConfig(BaseModel): + hidden_dropout: Optional[float] = None + attention_dropout: Optional[float] = None + ffn_dropout: Optional[float] = None + optim: Optional[OptimizerConfig] = None + dpo_cfg: Optional[DpoConfig] = None + peft: Optional[PeftConfig] = None + global_batch_size: Optional[int] = None + ent_coeff: Optional[float] = None + clip_ratio: Optional[float] = None + lam: Optional[float] = None + kl_loss_coeff: Optional[float] = None + kl_loss_type: Optional[str] = None + kl_reward_penalty_coeff: Optional[float] = None + + +class TrainingConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_length: Optional[int] = None + global_batch_size: Optional[int] = None + trainer: Optional[TrainerConfig] = None + model: Optional[ModelConfig] = None + + # Distillation-specific fields + distillation_data: Optional[str] = None + maxNumberOfPrompts: Optional[str] = None + maxResponseLength: Optional[str] = None + minNumberOfPrompts: Optional[str] = None + maxInputFileSizeInGB: Optional[str] = None + maxLineLengthInKB: Optional[str] = None + maxStudentModelFineTuningContextLengthInTokens: Optional[str] = None + teacherModelId: Optional[str] = None + temperature: Optional[str] = None + top_p: Optional[str] = None + customer_bucket: Optional[str] = None + kms_key: Optional[str] = None + + +class PpoRewardConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_length: Optional[int] = None + trainer: Optional[TrainerConfig] = None + model: Optional[ModelConfig] = None + + +class PpoCriticConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_length: Optional[int] = None + trainer: Optional[TrainerConfig] = None + model: Optional[ModelConfig] = None + + +class PpoAnchorConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_length: Optional[int] = None + trainer: Optional[TrainerConfig] = None + model: Optional[ModelConfig] = None + + +class PpoActorGenerationConfig(BaseModel): + 
model_config = ConfigDict(extra="forbid") + + actor_model_max_length: Optional[int] = None + trainer: Optional[TrainerConfig] = None + + +class PpoActorTrainConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + max_length: Optional[int] = None + max_steps: Optional[int] = None + actor_model_max_length: Optional[int] = None + reward_model_max_length: Optional[int] = None + trajectory_buffer_scale: Optional[int] = None + trainer: Optional[TrainerConfig] = None + model: Optional[ModelConfig] = None + + +class NovaRecipeSchema(BaseModel): + model_config = ConfigDict(extra="forbid") + + # Common configurations + run: RunConfig + + # Training and fine-tuning specific configurations + training_config: Optional[TrainingConfig] = None + + # PPO-specific configurations + ppo_reward: Optional[PpoRewardConfig] = None + ppo_critic: Optional[PpoCriticConfig] = None + ppo_anchor: Optional[PpoAnchorConfig] = None + ppo_actor_generation: Optional[PpoActorGenerationConfig] = None + ppo_actor_train: Optional[PpoActorTrainConfig] = None diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova_evaluation/__init__.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova_evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova_evaluation/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova_evaluation/model.py new file mode 100644 index 00000000..82dcf714 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/recipes/nova_evaluation/model.py @@ -0,0 +1,39 @@ +from typing import Optional +from pydantic import BaseModel, ConfigDict + + +class RunConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: Optional[str] = None + model_type: Optional[str] = None + model_name_or_path: Optional[str] = None + replicas: Optional[int] = None + 
data_s3_path: Optional[str] = None + output_s3_path: Optional[str] = None + + +class EvaluationConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + task: Optional[str] = None + strategy: Optional[str] = None + metric: Optional[str] = None + subtask: Optional[str] = None + + +class InferenceConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + top_k: Optional[int] = None + top_p: Optional[float] = None + temperature: Optional[float] = None + max_new_tokens: Optional[int] = None + + +class NovaEvaluationRecipeSchema(BaseModel): + model_config = ConfigDict(extra="forbid") + + run: RunConfig + evaluation: EvaluationConfig + inference: Optional[InferenceConfig] = None diff --git a/src/sagemaker/hyperpod/cli/recipe_utils/recipe_loader.py b/src/sagemaker/hyperpod/cli/recipe_utils/recipe_loader.py index 0f06d79b..e6feab8e 100644 --- a/src/sagemaker/hyperpod/cli/recipe_utils/recipe_loader.py +++ b/src/sagemaker/hyperpod/cli/recipe_utils/recipe_loader.py @@ -1,62 +1,109 @@ import os import yaml -from typing import Dict, Any, Optional +from hyperpod_pytorch_job_template.v1_0.recipes.hf.model import HfRecipeSchema +from hyperpod_pytorch_job_template.v1_0.recipes.neuron_hf.model import ( + NeuronHfRecipeSchema, +) +from hyperpod_pytorch_job_template.v1_0.recipes.nova.model import NovaRecipeSchema +from hyperpod_pytorch_job_template.v1_0.recipes.nova_evaluation.model import ( + NovaEvaluationRecipeSchema, +) -from sagemaker.hyperpod.cli.recipe_utils.recipe_schema import RecipeSchema - -# from recipe_schema import RecipeSchema - - -def load_recipe(recipe_path: str) -> RecipeSchema: +def load_recipe( + recipe_path: str, +) -> ( + HfRecipeSchema + | NeuronHfRecipeSchema + | NovaRecipeSchema + | NovaEvaluationRecipeSchema +): """ Load and validate a recipe YAML file using the RecipeSchema. 
- + Args: recipe_path: Path to the recipe YAML file - + Returns: RecipeSchema: Validated recipe object - + Raises: FileNotFoundError: If the recipe file doesn't exist ValueError: If the recipe file is invalid """ if not os.path.exists(recipe_path): raise FileNotFoundError(f"Recipe file not found: {recipe_path}") - + try: - with open(recipe_path, 'r') as f: + with open(recipe_path, "r") as f: recipe_data = yaml.safe_load(f) - - # Validate and return the recipe - return RecipeSchema(**recipe_data) + + if "run" in recipe_data and "model_type" in recipe_data["run"]: + model_type = recipe_data["run"]["model_type"] + + if model_type == "hf": + return HfRecipeSchema(**recipe_data) + elif model_type == "neuron-hf": + return NeuronHfRecipeSchema(**recipe_data) + elif "nova" in model_type and "evaluation" in recipe_data: + return NovaEvaluationRecipeSchema(**recipe_data) + elif "nova" in model_type: + return NovaRecipeSchema(**recipe_data) + else: + raise Exception(f"Invalid model_type {model_type}") + else: + # there are 3 yaml without model_type + try: + # recipes/training/llama/megatron_llama3_1_8b_nemo.yaml + return HfRecipeSchema(**recipe_data) + except Exception: + pass + + try: + # recipes/fine-tuning/nova/nova_premier_r5_cpu_distill.yaml + # recipes/fine-tuning/nova/nova_pro_r5_cpu_distill.yaml + return NovaRecipeSchema(**recipe_data) + except Exception: + pass + + raise Exception( + "Cannot validate recipe with existing templates. Check your recipe.yaml file." + ) + except yaml.YAMLError as e: raise ValueError(f"Invalid YAML in recipe file: {e}") except Exception as e: raise ValueError(f"Error validating recipe: {e}") -def save_recipe(recipe: RecipeSchema, output_path: str) -> None: +def save_recipe( + recipe: ( + HfRecipeSchema + | NeuronHfRecipeSchema + | NovaRecipeSchema + | NovaEvaluationRecipeSchema + ), + output_path: str, +) -> None: """ Save a recipe object to a YAML file. 
- + Args: - recipe: RecipeSchema object + recipe: schema object output_path: Path to save the YAML file - + Raises: ValueError: If the recipe can't be saved """ try: # Convert to dict, excluding None values recipe_dict = recipe.model_dump(exclude_none=True) - + # Create directory if it doesn't exist os.makedirs(os.path.dirname(output_path), exist_ok=True) - + # Save to YAML - with open(output_path, 'w') as f: + with open(output_path, "w") as f: yaml.dump(recipe_dict, f, default_flow_style=False) except Exception as e: - raise ValueError(f"Error saving recipe: {e}") \ No newline at end of file + raise ValueError(f"Error saving recipe: {e}")