# Note:
# This recipe is currently supported only on Amazon SageMaker training jobs.


## Run config
run:
  name: "my-lora-run" # A descriptive name for your training job
  model_type: "amazon.nova-micro-v1:0:128k" # Model variant specification, do not change
  model_name_or_path: "nova-micro/prod" # Base model path, do not change
  replicas: 1 # Number of compute instances for training, allowed value is 1
  data_s3_path: "" # Customer data path
  output_s3_path: "" # Output artifact path, SageMaker HyperPod job-specific configuration - not compatible with standard SageMaker training jobs

## Training specific configs
training_config:
  max_length: 8192 # Maximum context window size (tokens). Should be between [1024, 8192] and multiple of 1024.
  global_batch_size: 64 # Global batch size, allowed values are 16, 32, 64

  trainer:
    max_epochs: 2 # Number of training epochs

  model:
    hidden_dropout: 0.0 # Dropout for hidden states, must be between 0.0 and 1.0
    attention_dropout: 0.0 # Dropout for attention weights, must be between 0.0 and 1.0
    ffn_dropout: 0.0 # Dropout for feed-forward networks, must be between 0.0 and 1.0

    optim:
      lr: 1e-5 # Learning rate
      name: distributed_fused_adam # Optimizer algorithm, do not change
      adam_w_mode: true # Enable AdamW mode
      eps: 1e-06 # Epsilon for numerical stability
      weight_decay: 0.0 # L2 regularization strength, must be between 0.0 and 1.0
      betas: # Adam optimizer betas, must be between 0.0 and 1.0
        - 0.9
        - 0.999
      sched:
        warmup_steps: 10 # Learning rate warmup steps
        constant_steps: 0 # Steps at constant learning rate
        min_lr: 1e-6 # Minimum learning rate

    peft:
      peft_scheme: "lora" # Enable LoRA for parameter-efficient fine-tuning
      lora_tuning:
        loraplus_lr_ratio: 8.0 # LoRA+ learning rate scaling factor, must be between 0.0 and 100.0
        alpha: 32 # Scaling factor for LoRA weights. Allowed values are 32, 64, 96, 128, 160 and 192
        adapter_dropout: 0.01 # Regularization for LoRA parameters. Must be between 0.0 and 1.0