diff --git a/.gitignore b/.gitignore index 8a264a78..c6fd50da 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ doc/_build/ /sagemaker-hyperpod/build /sagemaker-hyperpod/.coverage /sagemaker-hyperpod/.coverage.* +/hyperpod-cluster-stack-template/build # Ignore all contents of result and results directories /result/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a914068..9f1c3b14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,23 @@ # Changelog -## v3.1.0 (2025-08-13) +## v3.2.0 (2025-08-25) ### Features + * Cluster management + * Creation of cluster stack + * Describing and listing a cluster stack + * Updating a cluster + * Init Experience + * Init, Validate, Create with local configurations + + +## v3.1.0 (2025-08-13) + +### Features * Task Governance feature for training jobs. + ## v3.0.2 (2025-07-31) ### Features @@ -34,3 +46,4 @@ ### Features - feature: Add support for SageMaker HyperPod CLI + diff --git a/README.md b/README.md index 7d017999..cf0bff56 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,13 @@ hyp create hyp-pytorch-job \ --queue-name "training-queue" \ --priority "high" \ --max-retry 3 \ + --accelerators 8 \ + --vcpu 96.0 \ + --memory 1152.0 \ + --accelerators-limit 8 \ + --vcpu-limit 96.0 \ + --memory-limit 1152.0 \ + --preferred-topology "topology.kubernetes.io/zone=us-west-2a" \ --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ --volume name=training-output,type=pvc,mount_path=/data2,claim_name=my-pvc,read_only=false ``` diff --git a/doc/_static/custom.css b/doc/_static/custom.css index b4bfb4cc..c37521b6 100644 --- a/doc/_static/custom.css +++ b/doc/_static/custom.css @@ -59,3 +59,126 @@ html[data-theme="dark"] .navbar-brand .title { html[data-theme="dark"] p { color: #d1d5db !important; } + +.current.active>a { + background-color: aliceblue !important; +} + +.bd-sidebar-primary li.has-children .caption, +.bd-sidebar-primary li.has-children>.reference { + margin-right: inherit; +} + +nav.bd-links li>a { + margin-right: inherit; +} + +.table tbody tr:hover { + background: none !important; +} + +.wy-table-responsive table td, +.wy-table-responsive table th { + white-space: normal; +} + +.wy-table-responsive { + margin-bottom: 24px; + max-width: 100%; + overflow: visible; +} + +.pagination { + display: inline-block; +} + +.pagination a { + color: black; + float: left; + padding: 8px 16px; + text-decoration: none; +} + +.pagination a.active { + background-color: #2a80b9; + color: white; +} + +.pagination a:hover:not(.active) { + background-color: #ddd; +} + + +dl.py.class.dt.sig.sig-object.py { + overflow: auto; + margin: 6px 0; + font-size: 90%; + line-height: normal; + background: #e7f2fa !important; + color: #2980b9 !important; + border-top: 3px solid #6ab0de !important; + padding: 6px; + position: relative; +} + +.bd-article { + overflow: auto; +} + +.sig-prename.descclassname { + color: #000; +} + +.field-list { + display: grid !important; + grid-template-columns: 0.5fr 2fr !important; +} + +.field-list dt { + background: transparent !important; + word-break: normal !important; +} + +.py.class dl { + margin: 1rem 0 !important; +} + +.page-toc.tocsection.onthispage svg { + margin-right: 0.5rem; +} + +.sidebar-secondary-items { + display: block !important; + padding: 0.5rem 0 !important; +} + +.table { + border-radius: 4px !important; + border: 1px solid #e1e5e9 !important; + border-collapse: separate !important; + border-spacing: 0 !important; + overflow: hidden !important; +} + +.table tbody tr { + background: none 
!important; +} + +.table tbody tr:hover { + background: none !important; +} + +.table td, +.table th { + border: none !important; + border-bottom: 1px solid #e1e5e9 !important; +} + +.table tr:last-child td { + border-bottom: none !important; +} + +.bd-toc code { + background: transparent !important; + border: none; +} \ No newline at end of file diff --git a/doc/cli/cli_index.rst b/doc/cli/cli_index.rst new file mode 100644 index 00000000..3d3885a3 --- /dev/null +++ b/doc/cli/cli_index.rst @@ -0,0 +1,38 @@ +CLI Reference +============= + +Complete reference for the SageMaker HyperPod Command Line Interface. + +.. toctree:: + :hidden: + :maxdepth: 2 + + cluster_management/cli_cluster_management + training/cli_training + inference/cli_inference + +.. container:: + + .. grid:: 1 1 3 3 + :gutter: 3 + + .. grid-item-card:: Cluster Management CLI + :link: cluster_management/cli_cluster_management + :link-type: doc + :class-card: sd-border-secondary + + Cluster stack management commands, options and parameters. + + .. grid-item-card:: Training CLI + :link: training/cli_training + :link-type: doc + :class-card: sd-border-secondary + + Training CLI commands, options and parameters. + + .. grid-item-card:: Inference CLI + :link: inference/cli_inference + :link-type: doc + :class-card: sd-border-secondary + + Inference CLI commands, options and parameters. \ No newline at end of file diff --git a/doc/cli_reference.md b/doc/cli/cli_reference.md similarity index 72% rename from doc/cli_reference.md rename to doc/cli/cli_reference.md index 744ab4ed..6ae3af58 100644 --- a/doc/cli_reference.md +++ b/doc/cli/cli_reference.md @@ -8,6 +8,7 @@ cli_training cli_inference +cli_cluster_management ``` Complete reference for the SageMaker HyperPod Command Line Interface. @@ -32,5 +33,13 @@ Training CLI commands, options and parameters. Inference CLI commands, options and parameters. ::: +:::{grid-item-card} Cluster Management CLI +:link: cli_cluster_management +:link-type: ref +:class-card: sd-border-secondary + +Cluster stack management commands, options and parameters. +::: + :::: :::: \ No newline at end of file diff --git a/doc/cli/cluster_management/cli_cluster_management.md b/doc/cli/cluster_management/cli_cluster_management.md new file mode 100644 index 00000000..e626d0a5 --- /dev/null +++ b/doc/cli/cluster_management/cli_cluster_management.md @@ -0,0 +1,367 @@ +(cli_cluster_management)= + +# Cluster Management + +Complete reference for SageMaker HyperPod cluster management parameters and configuration options. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +* [Initialize Configuration](#hyp-init) +* [Create Cluster Stack](#hyp-create) +* [Update Cluster](#hyp-update-hyp-cluster) +* [List Cluster Stacks](#hyp-list-hyp-cluster) +* [Describe Cluster Stack](#hyp-describe-hyp-cluster) +* [List HyperPod Clusters](#hyp-list-cluster) +* [Set Cluster Context](#hyp-set-cluster-context) +* [Get Cluster Context](#hyp-get-cluster-context) +* [Get Monitoring](#hyp-get-monitoring) + +* [Configure Parameters](#hyp-configure) +* [Validate Configuration](#hyp-validate) +* [Reset Configuration](#hyp-reset) + +## hyp init + +Initialize a template scaffold in the current directory. 
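
For example, to scaffold a cluster configuration in a fresh working directory (mirroring the workflow in the Examples section below; the directory name is illustrative):

```bash
mkdir my-hyperpod-cluster && cd my-hyperpod-cluster
hyp init hyp-cluster
```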
+ +#### Syntax + +```bash +hyp init TEMPLATE [DIRECTORY] [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `TEMPLATE` | CHOICE | Yes | Template type (hyp-cluster, hyp-pytorch-job, hyp-custom-endpoint, hyp-jumpstart-endpoint) | +| `DIRECTORY` | PATH | No | Target directory (default: current directory) | +| `--version` | TEXT | No | Schema version to use | + +```{important} +The `resource_name_prefix` parameter in the generated `config.yaml` file serves as the primary identifier for all AWS resources created during deployment. Each deployment must use a unique resource name prefix to avoid conflicts. This prefix is automatically appended with a unique identifier during cluster creation to ensure resource uniqueness. +``` + +## hyp create + +Create a new HyperPod cluster stack using the provided configuration. + +#### Syntax + +```bash +hyp create [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--region` | TEXT | No | AWS region where the cluster stack will be created | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp update hyp-cluster + +Update an existing HyperPod cluster configuration. + +#### Syntax + +```bash +hyp update hyp-cluster [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--cluster-name` | TEXT | Yes | Name of the cluster to update | +| `--instance-groups` | TEXT | No | JSON string of instance group configurations | +| `--instance-groups-to-delete` | TEXT | No | JSON string of instance groups to delete | +| `--region` | TEXT | No | AWS region of the cluster | +| `--node-recovery` | TEXT | No | Node recovery setting (Automatic or None) | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp list hyp-cluster + +List all HyperPod cluster stacks (CloudFormation stacks). + +#### Syntax + +```bash +hyp list hyp-cluster [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--region` | TEXT | No | AWS region to list stacks from | +| `--status` | TEXT | No | Filter by stack status. Format: "['CREATE_COMPLETE', 'UPDATE_COMPLETE']" | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp describe hyp-cluster + +Describe a specific HyperPod cluster stack. + +#### Syntax + +```bash +hyp describe hyp-cluster STACK-NAME [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `STACK-NAME` | TEXT | Yes | Name of the CloudFormation stack to describe | +| `--region` | TEXT | No | AWS region of the stack | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp list-cluster + +List SageMaker HyperPod clusters with capacity information. + +#### Syntax + +```bash +hyp list-cluster [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--region` | TEXT | No | AWS region to list clusters from | +| `--output` | TEXT | No | Output format ("table" or "json", default: "json") | +| `--clusters` | TEXT | No | Comma-separated list of specific cluster names | +| `--namespace` | TEXT | No | Namespace to check capacity for (can be used multiple times) | +| `--debug` | FLAG | No | Enable debug logging | + +## hyp set-cluster-context + +Connect to a HyperPod EKS cluster and set kubectl context. 

#### Syntax

```bash
hyp set-cluster-context [OPTIONS]
```

#### Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `--cluster-name` | TEXT | Yes | Name of the HyperPod cluster to connect to |
| `--region` | TEXT | No | AWS region of the cluster |
| `--namespace` | TEXT | No | Kubernetes namespace to connect to |
| `--debug` | FLAG | No | Enable debug logging |

## hyp get-cluster-context

Get context information for the currently connected cluster.

#### Syntax

```bash
hyp get-cluster-context [OPTIONS]
```

#### Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `--debug` | FLAG | No | Enable debug logging |

## hyp get-monitoring

Get monitoring configurations for the HyperPod cluster.

#### Syntax

```bash
hyp get-monitoring [OPTIONS]
```

#### Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `--grafana` | FLAG | No | Return Grafana dashboard URL |
| `--prometheus` | FLAG | No | Return Prometheus workspace URL |
| `--list` | FLAG | No | Return list of available metrics |

## hyp configure

Configure cluster parameters interactively or via command line.

#### Syntax

```bash
hyp configure [OPTIONS]
```

#### Parameters

This command dynamically supports all configuration parameters available in the current template's schema. Common parameters include:

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `--resource-name-prefix` | TEXT | No | Prefix for all AWS resources |
| `--stage` | TEXT | No | Deployment stage ("gamma" or "prod") |
| `--vpc-cidr` | TEXT | No | VPC CIDR block |
| `--kubernetes-version` | TEXT | No | Kubernetes version for EKS cluster |
| `--node-recovery` | TEXT | No | Node recovery setting ("Automatic" or "None") |
| `--env` | JSON | No | Environment variables as JSON object |
| `--args` | JSON | No | Command arguments as JSON array |
| `--command` | JSON | No | Command to run as JSON array |
| `--tags` | JSON | No | Resource tags as JSON object |

**Note:** The exact parameters available depend on your current template type and version. Run `hyp configure --help` to see all available options for your specific configuration.

## hyp validate

Validate the current directory's configuration file syntax and structure.

#### Syntax

```bash
hyp validate
```

#### Parameters

No parameters required.

```{note}
This command performs **syntactic validation only** of the `config.yaml` file against the appropriate schema. It checks:

- **YAML syntax**: Ensures the file is valid YAML
- **Required fields**: Verifies all mandatory fields are present
- **Data types**: Confirms field values match expected types (string, number, boolean, array)
- **Schema structure**: Validates against the template's defined structure

It does **not** verify the actual validity of values (e.g., whether AWS regions exist, instance types are available, or resources can be created). 

**Prerequisites**

- Must be run in a directory where `hyp init` has created configuration files
- A `config.yaml` file must exist in the current directory

**Output**

- **Success**: Displays a confirmation message if the syntax is valid
- **Errors**: Lists specific syntax errors with field names and descriptions
```

#### Examples

```bash
# Validate current configuration syntax
hyp validate

# Example output on success
✔️ config.yaml is valid!

# Example output with syntax errors
❌ Config validation errors:
  – kubernetes_version: Field is required
  – vpc_cidr: Expected string, got number
```

## hyp reset

Reset the current directory's config.yaml to default values.

#### Syntax

```bash
hyp reset
```

#### Parameters

No parameters required.

## Parameter Reference

### Common Parameters Across Commands

| Parameter | Type | Description | Default |
|-----------|------|-------------|---------|
| `--region` | TEXT | AWS region | Current AWS profile region |
| `--help` | FLAG | Show command help | - |
| `--verbose` | FLAG | Enable verbose output | false |

### Configuration File Parameters

The `config.yaml` file supports the following parameters:

| Parameter | Type | Description | Default |
|-----------|------|-------------|---------|
| `template` | TEXT | Template name | "hyp-cluster" |
| `namespace` | TEXT | Kubernetes namespace | "kube-system" |
| `stage` | TEXT | Deployment stage | "gamma" |
| `resource_name_prefix` | TEXT | Resource name prefix | "sagemaker-hyperpod-eks" |
| `vpc_cidr` | TEXT | VPC CIDR block | "10.192.0.0/16" |
| `kubernetes_version` | TEXT | Kubernetes version | "1.31" |
| `node_recovery` | TEXT | Node recovery setting | "Automatic" |
| `create_vpc_stack` | BOOLEAN | Create new VPC | true |
| `create_eks_cluster_stack` | BOOLEAN | Create new EKS cluster | true |
| `create_hyperpod_cluster_stack` | BOOLEAN | Create HyperPod cluster | true |

**Note:** The actual available configuration parameters depend on the specific template schema version. Use `hyp init hyp-cluster` to see all available parameters for your version. 
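
Taken together, a `config.yaml` generated with the defaults above might look like the following sketch (the exact fields depend on your template schema version):

```yaml
template: hyp-cluster
namespace: kube-system
stage: gamma
resource_name_prefix: sagemaker-hyperpod-eks
vpc_cidr: "10.192.0.0/16"
kubernetes_version: "1.31"
node_recovery: Automatic
create_vpc_stack: true
create_eks_cluster_stack: true
create_hyperpod_cluster_stack: true
```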
+ +## Examples + +### Basic Cluster Stack Creation + +```bash +# Start with a clean directory +mkdir my-hyperpod-cluster +cd my-hyperpod-cluster + +# Initialize cluster configuration +hyp init hyp-cluster + +# Configure basic parameters +hyp configure --resource-name-prefix my-cluster --stage prod + +# Validate configuration +hyp validate + +# Create cluster stack +hyp create --region us-west-2 +``` + +### Update Existing Cluster + +```bash +# Update instance groups +hyp update hyp-cluster \ + --cluster-name my-cluster \ + --instance-groups '[{"InstanceCount":2,"InstanceGroupName":"worker-nodes","InstanceType":"ml.m5.large"}]' \ + --region us-west-2 +``` + +### List and Describe + +```bash +# List all cluster stacks +hyp list hyp-cluster --region us-west-2 + +# Describe specific cluster stack +hyp describe hyp-cluster my-stack-name --region us-west-2 + +# List HyperPod clusters with capacity info +hyp list-cluster --region us-west-2 --output table + +# Connect to cluster +hyp set-cluster-context --cluster-name my-cluster --region us-west-2 + +# Get current context +hyp get-cluster-context +``` \ No newline at end of file diff --git a/doc/cli/cluster_management/cli_cluster_management_autogen.rst b/doc/cli/cluster_management/cli_cluster_management_autogen.rst new file mode 100644 index 00000000..63d3aa27 --- /dev/null +++ b/doc/cli/cluster_management/cli_cluster_management_autogen.rst @@ -0,0 +1,16 @@ +.. Just kept as placeholder for autodoc gen, this file is not referenced in the actual docs. + +.. Cluster Management +.. ======================================== + +.. .. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:create_cluster_stack +.. .. :prog: hyp create hyp-cluster + +.. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:describe_cluster_stack +.. :prog: hyp describe hyp-cluster + +.. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:list_cluster_stacks +.. :prog: hyp list hyp-cluster + +.. .. click:: sagemaker.hyperpod.cli.commands.cluster_stack:update_cluster +.. :prog: hyp update hyp-cluster \ No newline at end of file diff --git a/doc/cli/inference/cli_inference.md b/doc/cli/inference/cli_inference.md new file mode 100644 index 00000000..df108d76 --- /dev/null +++ b/doc/cli/inference/cli_inference.md @@ -0,0 +1,350 @@ +(cli_inference)= + +# Inference + +Complete reference for SageMaker HyperPod inference parameters and configuration options. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. 
+``` + +* [Create JumpStart Endpoint](#hyp-create-hyp-jumpstart-endpoint) +* [Create Custom Endpoint](#hyp-create-hyp-custom-endpoint) + +* [List JumpStart Endpoints](#hyp-list-hyp-jumpstart-endpoint) +* [List Custom Endpoints](#hyp-list-hyp-custom-endpoint) +* [Describe JumpStart Endpoint](#hyp-describe-hyp-jumpstart-endpoint) +* [Describe Custom Endpoint](#hyp-describe-hyp-custom-endpoint) +* [Invoke JumpStart Endpoint](#hyp-invoke-hyp-jumpstart-endpoint) +* [Invoke Custom Endpoint](#hyp-invoke-hyp-custom-endpoint) +* [Delete JumpStart Endpoint](#hyp-delete-hyp-jumpstart-endpoint) +* [Delete Custom Endpoint](#hyp-delete-hyp-custom-endpoint) + +* [List JumpStart Pods](#hyp-list-pods-hyp-jumpstart-endpoint) +* [List Custom Pods](#hyp-list-pods-hyp-custom-endpoint) +* [Get JumpStart Logs](#hyp-get-logs-hyp-jumpstart-endpoint) +* [Get Custom Logs](#hyp-get-logs-hyp-custom-endpoint) +* [Get JumpStart Operator Logs](#hyp-get-operator-logs-hyp-jumpstart-endpoint) +* [Get Custom Operator Logs](#hyp-get-operator-logs-hyp-custom-endpoint) + + + +## hyp create hyp-jumpstart-endpoint + +Deploy pre-trained models from SageMaker JumpStart. + +#### Syntax + +```bash +hyp create hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--model-id` | TEXT | Yes | JumpStart model identifier (1-63 characters, alphanumeric with hyphens) | +| `--instance-type` | TEXT | Yes | EC2 instance type for inference (must start with "ml.") | +| `--accept-eula` | BOOLEAN | No | Whether model terms of use have been accepted (default: false) | +| `--model-version` | TEXT | No | Semantic version of the model (e.g., "1.0.0", 5-14 characters) | +| `--endpoint-name` | TEXT | No | Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) | +| `--tls-certificate-output-s3-uri` | TEXT | No | S3 URI to write the TLS certificate (optional) | + +### hyp create hyp-custom-endpoint + +Deploy custom models with your own inference code. 
+ +#### Syntax + +```bash +hyp create hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--instance-type` | TEXT | Yes | EC2 instance type for inference (must start with "ml.") | +| `--model-name` | TEXT | Yes | Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens) | +| `--model-source-type` | TEXT | Yes | Model source type ("s3" or "fsx") | +| `--image-uri` | TEXT | Yes | Docker image URI for inference | +| `--container-port` | INTEGER | Yes | Port on which model server listens (1-65535) | +| `--model-volume-mount-name` | TEXT | Yes | Name of the model volume mount | +| `--endpoint-name` | TEXT | No | Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) | +| `--env` | OBJECT | No | Environment variables as key-value pairs | +| `--metrics-enabled` | BOOLEAN | No | Enable metrics collection (default: false) | +| `--model-version` | TEXT | No | Version of the model (semantic version format) | +| `--model-location` | TEXT | No | Specific model data location | +| `--prefetch-enabled` | BOOLEAN | No | Whether to pre-fetch model data (default: false) | +| `--tls-certificate-output-s3-uri` | TEXT | No | S3 URI for TLS certificate output | +| `--fsx-dns-name` | TEXT | No | FSx File System DNS Name | +| `--fsx-file-system-id` | TEXT | No | FSx File System ID | +| `--fsx-mount-name` | TEXT | No | FSx File System Mount Name | +| `--s3-bucket-name` | TEXT | No | S3 bucket location | +| `--s3-region` | TEXT | No | S3 bucket region | +| `--model-volume-mount-path` | TEXT | No | Path inside container for model volume (default: "/opt/ml/model") | +| `--resources-limits` | OBJECT | No | Resource limits for the worker | +| `--resources-requests` | OBJECT | No | Resource requests for the worker | +| `--dimensions` | OBJECT | No | CloudWatch Metric dimensions as key-value pairs | +| `--metric-collection-period` | INTEGER | No | Period for CloudWatch query (default: 300) | +| `--metric-collection-start-time` | INTEGER | No | StartTime for CloudWatch query (default: 300) | +| `--metric-name` | TEXT | No | Metric name to query for CloudWatch trigger | +| `--metric-stat` | TEXT | No | Statistics metric for CloudWatch (default: "Average") | +| `--metric-type` | TEXT | No | Type of metric for HPA ("Value" or "Average", default: "Average") | +| `--min-value` | NUMBER | No | Minimum metric value for empty CloudWatch response (default: 0) | +| `--cloud-watch-trigger-name` | TEXT | No | Name for the CloudWatch trigger | +| `--cloud-watch-trigger-namespace` | TEXT | No | AWS CloudWatch namespace for the metric | +| `--target-value` | NUMBER | No | Target value for the CloudWatch metric | +| `--use-cached-metrics` | BOOLEAN | No | Enable caching of metric values (default: true) | +| `--invocation-endpoint` | TEXT | No | Invocation endpoint path (default: "invocations") | + +## Inference Endpoint Management Commands + +Commands for managing inference endpoints. + +### hyp list hyp-jumpstart-endpoint + +List JumpStart model endpoints. + +#### Syntax + +```bash +hyp list hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list endpoints from (default: "default") | + +### hyp list hyp-custom-endpoint + +List custom model endpoints. 
+ +#### Syntax + +```bash +hyp list hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list endpoints from (default: "default") | + +### hyp describe hyp-jumpstart-endpoint + +Describe a JumpStart model endpoint. + +#### Syntax + +```bash +hyp describe hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to describe | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | +| `--full` | FLAG | No | Display full JSON output | + +### hyp describe hyp-custom-endpoint + +Describe a custom model endpoint. + +#### Syntax + +```bash +hyp describe hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to describe | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | +| `--full` | FLAG | No | Display full JSON output | + +### hyp invoke hyp-jumpstart-endpoint + +Invoke a JumpStart model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--endpoint-name` | TEXT | Yes | Name of the endpoint to invoke | +| `--body` | TEXT | Yes | Request body (JSON format) | +| `--content-type` | TEXT | No | Content type of the request (default: "application/json") | + +### hyp invoke hyp-custom-endpoint + +Invoke a custom model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--endpoint-name` | TEXT | Yes | Name of the endpoint to invoke | +| `--body` | TEXT | Yes | Request body (JSON format) | +| `--content-type` | TEXT | No | Content type of the request (default: "application/json") | + +### hyp delete hyp-jumpstart-endpoint + +Delete a JumpStart model endpoint. + +#### Syntax + +```bash +hyp delete hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to delete | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | + +### hyp delete hyp-custom-endpoint + +Delete a custom model endpoint. + +#### Syntax + +```bash +hyp delete hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--name` | TEXT | Yes | Name of the endpoint to delete | +| `--namespace` | TEXT | No | Namespace of the endpoint (default: "default") | + +### hyp list-pods hyp-jumpstart-endpoint + +List pods for JumpStart endpoints. + +#### Syntax + +```bash +hyp list-pods hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list pods from (default: "default") | + +### hyp list-pods hyp-custom-endpoint + +List pods for custom endpoints. 
+ +#### Syntax + +```bash +hyp list-pods hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace` | TEXT | No | Namespace to list pods from (default: "default") | + +### hyp get-logs hyp-jumpstart-endpoint + +Get logs from JumpStart endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--pod-name` | TEXT | Yes | Name of the pod to get logs from | +| `--container` | TEXT | No | Container name to get logs from | +| `--namespace` | TEXT | No | Namespace of the pod (default: "default") | + +### hyp get-logs hyp-custom-endpoint + +Get logs from custom endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--pod-name` | TEXT | Yes | Name of the pod to get logs from | +| `--container` | TEXT | No | Container name to get logs from | +| `--namespace` | TEXT | No | Namespace of the pod (default: "default") | + +### hyp get-operator-logs hyp-jumpstart-endpoint + +Get operator logs for JumpStart endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--since-hours` | FLOAT | Yes | Time frame to get logs for (in hours) | + +### hyp get-operator-logs hyp-custom-endpoint + +Get operator logs for custom endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--since-hours` | FLOAT | Yes | Time frame to get logs for (in hours) | + +## Parameter Reference + +### Common Parameters Across Commands + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `--namespace` | TEXT | Kubernetes namespace | Current context | +| `--help` | FLAG | Show command help | - | diff --git a/doc/cli/training/cli_training.md b/doc/cli/training/cli_training.md new file mode 100644 index 00000000..dc89d221 --- /dev/null +++ b/doc/cli/training/cli_training.md @@ -0,0 +1,182 @@ +(cli_training)= + + +# Training + +Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. + +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + +* [Create PyTorch Job](#hyp-create-hyp-pytorch-job) +* [List Jobs](#hyp-list-hyp-pytorch-job) +* [Describe Job](#hyp-describe-hyp-pytorch-job) +* [Delete Job](#hyp-delete-hyp-pytorch-job) +* [List Pods](#hyp-list-pods-hyp-pytorch-job) +* [Get Logs](#hyp-get-logs-hyp-pytorch-job) + + +## hyp create hyp-pytorch-job + +Create distributed PyTorch training jobs on SageMaker HyperPod clusters. 
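
For example, a two-node job with GPU resource requests might look like the following sketch (the job name, image URI, and resource values are illustrative; see the full parameter table below):

```bash
hyp create hyp-pytorch-job \
  --job-name my-training-job \
  --image 123456789012.dkr.ecr.us-west-2.amazonaws.com/my-training-image:latest \
  --node-count 2 \
  --tasks-per-node 8 \
  --queue-name "training-queue" \
  --priority "high" \
  --max-retry 3 \
  --accelerators 8 \
  --vcpu 96.0 \
  --memory 1152.0 \
  --volume name=model-data,type=hostPath,mount_path=/data,path=/data
```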

### Syntax

```bash
hyp create hyp-pytorch-job [OPTIONS]
```

### Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `--job-name` | TEXT | Yes | Unique name for the training job (1-63 characters, alphanumeric with hyphens) |
| `--image` | TEXT | Yes | Docker image URI containing your training code |
| `--namespace` | TEXT | No | Kubernetes namespace |
| `--command` | ARRAY | No | Command to run in the container (array of strings) |
| `--args` | ARRAY | No | Arguments for the entry script (array of strings) |
| `--environment` | OBJECT | No | Environment variables as key-value pairs |
| `--pull-policy` | TEXT | No | Image pull policy (Always, Never, IfNotPresent) |
| `--instance-type` | TEXT | No | Instance type for training |
| `--node-count` | INTEGER | No | Number of nodes (minimum: 1) |
| `--tasks-per-node` | INTEGER | No | Number of tasks per node (minimum: 1) |
| `--label-selector` | OBJECT | No | Node label selector as key-value pairs |
| `--deep-health-check-passed-nodes-only` | BOOLEAN | No | Schedule pods only on nodes that passed deep health check (default: false) |
| `--scheduler-type` | TEXT | No | Scheduler type |
| `--queue-name` | TEXT | No | Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) |
| `--priority` | TEXT | No | Priority class for job scheduling |
| `--max-retry` | INTEGER | No | Maximum number of job retries (minimum: 0) |
| `--volume` | ARRAY | No | List of volume configurations (see [Volume Configuration](#volume-configuration) for detailed parameter info) |
| `--service-account-name` | TEXT | No | Service account name |
| `--accelerators` | INTEGER | No | Number of accelerators (GPUs or Trainium chips) |
| `--vcpu` | FLOAT | No | Number of vCPUs |
| `--memory` | FLOAT | No | Amount of memory in GiB |
| `--accelerators-limit` | INTEGER | No | Limit for the number of accelerators (GPUs or Trainium chips) |
| `--vcpu-limit` | FLOAT | No | Limit for the number of vCPUs |
| `--memory-limit` | FLOAT | No | Limit for the amount of memory in GiB |
| `--preferred-topology` | TEXT | No | Preferred topology annotation for scheduling |
| `--required-topology` | TEXT | No | Required topology annotation for scheduling |
| `--debug` | FLAG | No | Enable debug mode (default: false) |

### Volume Configuration

The `--volume` parameter supports mounting different types of storage to your training containers.

### Volume Syntax

```bash
--volume name=<name>,type=<type>,mount_path=<mount_path>[,additional_options]
```

### Volume Types

**hostPath Volume**
```bash
--volume name=model-data,type=hostPath,mount_path=/data,path=/host/data
```

**Persistent Volume Claim (PVC)**
```bash
--volume name=training-output,type=pvc,mount_path=/output,claim_name=training-pvc,read_only=false
```

### Volume Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `name` | TEXT | Yes | Volume name |
| `type` | TEXT | Yes | Volume type (`hostPath` or `pvc`) |
| `mount_path` | TEXT | Yes | Mount path in container |
| `path` | TEXT | For hostPath | Host path for hostPath volumes |
| `claim_name` | TEXT | For pvc | PVC claim name for pvc volumes |
| `read_only` | BOOLEAN | No | Read-only flag for pvc volumes |

## Training Job Management Commands

Commands for managing PyTorch training jobs.

### hyp list hyp-pytorch-job

List all HyperPod PyTorch jobs in a namespace. 
+ +#### Syntax + +```bash +hyp list hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--namespace, -n` | TEXT | No | Namespace to list jobs from (default: "default") | + +### hyp describe hyp-pytorch-job + +Describe a specific HyperPod PyTorch job. + +#### Syntax + +```bash +hyp describe hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job to describe | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | + +### hyp delete hyp-pytorch-job + +Delete a HyperPod PyTorch job. + +#### Syntax + +```bash +hyp delete hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job to delete | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | + +### hyp list-pods hyp-pytorch-job + +List all pods associated with a PyTorch job. + +#### Syntax + +```bash +hyp list-pods hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job to list pods for | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | + +### hyp get-logs hyp-pytorch-job + +Get logs from a specific pod in a PyTorch job. + +#### Syntax + +```bash +hyp get-logs hyp-pytorch-job [OPTIONS] +``` + +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `--job-name` | TEXT | Yes | Name of the job | +| `--pod-name` | TEXT | Yes | Name of the pod to get logs from | +| `--namespace, -n` | TEXT | No | Namespace of the job (default: "default") | diff --git a/doc/cli_inference.md b/doc/cli_inference.md deleted file mode 100644 index 1c79a706..00000000 --- a/doc/cli_inference.md +++ /dev/null @@ -1,344 +0,0 @@ -(cli_inference)= - -# Inference - -Complete reference for SageMaker HyperPod inference parameters and configuration options. - -* [Create JumpStart Endpoint](#hyp-create-hyp-jumpstart-endpoint) -* [Create Custom Endpoint](#hyp-create-hyp-custom-endpoint) - -* [List JumpStart Endpoints](#hyp-list-hyp-jumpstart-endpoint) -* [List Custom Endpoints](#hyp-list-hyp-custom-endpoint) -* [Describe JumpStart Endpoint](#hyp-describe-hyp-jumpstart-endpoint) -* [Describe Custom Endpoint](#hyp-describe-hyp-custom-endpoint) -* [Invoke JumpStart Endpoint](#hyp-invoke-hyp-jumpstart-endpoint) -* [Invoke Custom Endpoint](#hyp-invoke-hyp-custom-endpoint) -* [Delete JumpStart Endpoint](#hyp-delete-hyp-jumpstart-endpoint) -* [Delete Custom Endpoint](#hyp-delete-hyp-custom-endpoint) - -* [List JumpStart Pods](#hyp-list-pods-hyp-jumpstart-endpoint) -* [List Custom Pods](#hyp-list-pods-hyp-custom-endpoint) -* [Get JumpStart Logs](#hyp-get-logs-hyp-jumpstart-endpoint) -* [Get Custom Logs](#hyp-get-logs-hyp-custom-endpoint) -* [Get JumpStart Operator Logs](#hyp-get-operator-logs-hyp-jumpstart-endpoint) -* [Get Custom Operator Logs](#hyp-get-operator-logs-hyp-custom-endpoint) - - - -## hyp create hyp-jumpstart-endpoint - -Deploy pre-trained models from SageMaker JumpStart. 
- -#### Syntax - -```bash -hyp create hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--model-id TEXT`: JumpStart model identifier (1-63 characters, alphanumeric with hyphens) -- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.") - -#### Optional Parameters - -- `--accept-eula BOOLEAN`: Whether model terms of use have been accepted (default: false) -- `--model-version TEXT`: Semantic version of the model (e.g., "1.0.0", 5-14 characters) -- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) -- `--tls-certificate-output-s3-uri TEXT`: S3 URI to write the TLS certificate (optional) - -### hyp create hyp-custom-endpoint - -Deploy custom models with your own inference code. - -#### Syntax - -```bash -hyp create hyp-custom-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.") -- `--model-name TEXT`: Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens) -- `--model-source-type TEXT`: Model source type ("s3" or "fsx") -- `--image-uri TEXT`: Docker image URI for inference -- `--container-port INTEGER`: Port on which model server listens (1-65535) -- `--model-volume-mount-name TEXT`: Name of the model volume mount - -#### Optional Parameters - -- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) -- `--env OBJECT`: Environment variables as key-value pairs -- `--metrics-enabled BOOLEAN`: Enable metrics collection (default: false) -- `--model-version TEXT`: Version of the model (semantic version format) -- `--model-location TEXT`: Specific model data location -- `--prefetch-enabled BOOLEAN`: Whether to pre-fetch model data (default: false) -- `--tls-certificate-output-s3-uri TEXT`: S3 URI for TLS certificate output -- `--fsx-dns-name TEXT`: FSx File System DNS Name -- `--fsx-file-system-id TEXT`: FSx File System ID -- `--fsx-mount-name TEXT`: FSx File System Mount Name -- `--s3-bucket-name TEXT`: S3 bucket location -- `--s3-region TEXT`: S3 bucket region -- `--model-volume-mount-path TEXT`: Path inside container for model volume (default: "/opt/ml/model") -- `--resources-limits OBJECT`: Resource limits for the worker -- `--resources-requests OBJECT`: Resource requests for the worker -- `--dimensions OBJECT`: CloudWatch Metric dimensions as key-value pairs -- `--metric-collection-period INTEGER`: Period for CloudWatch query (default: 300) -- `--metric-collection-start-time INTEGER`: StartTime for CloudWatch query (default: 300) -- `--metric-name TEXT`: Metric name to query for CloudWatch trigger -- `--metric-stat TEXT`: Statistics metric for CloudWatch (default: "Average") -- `--metric-type TEXT`: Type of metric for HPA ("Value" or "Average", default: "Average") -- `--min-value NUMBER`: Minimum metric value for empty CloudWatch response (default: 0) -- `--cloud-watch-trigger-name TEXT`: Name for the CloudWatch trigger -- `--cloud-watch-trigger-namespace TEXT`: AWS CloudWatch namespace for the metric -- `--target-value NUMBER`: Target value for the CloudWatch metric -- `--use-cached-metrics BOOLEAN`: Enable caching of metric values (default: true) -- `--invocation-endpoint TEXT`: Invocation endpoint path (default: "invocations") - -## Inference Endpoint Management Commands - -Commands for managing inference endpoints. - -### hyp list hyp-jumpstart-endpoint - -List JumpStart model endpoints. 
- -#### Syntax - -```bash -hyp list hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Optional Parameters - -- `--namespace TEXT`: Namespace to list endpoints from (default: "default") - -### hyp list hyp-custom-endpoint - -List custom model endpoints. - -#### Syntax - -```bash -hyp list hyp-custom-endpoint [OPTIONS] -``` - -#### Optional Parameters - -- `--namespace TEXT`: Namespace to list endpoints from (default: "default") - -### hyp describe hyp-jumpstart-endpoint - -Describe a JumpStart model endpoint. - -#### Syntax - -```bash -hyp describe hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--name TEXT`: Name of the endpoint to describe - -#### Optional Parameters - -- `--namespace TEXT`: Namespace of the endpoint (default: "default") -- `--full`: Display full JSON output - -### hyp describe hyp-custom-endpoint - -Describe a custom model endpoint. - -#### Syntax - -```bash -hyp describe hyp-custom-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--name TEXT`: Name of the endpoint to describe - -#### Optional Parameters - -- `--namespace TEXT`: Namespace of the endpoint (default: "default") -- `--full`: Display full JSON output - -### hyp invoke hyp-jumpstart-endpoint - -Invoke a JumpStart model endpoint. - -#### Syntax - -```bash -hyp invoke hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--endpoint-name TEXT`: Name of the endpoint to invoke -- `--body TEXT`: Request body (JSON format) - -#### Optional Parameters - -- `--content-type TEXT`: Content type of the request (default: "application/json") - -### hyp invoke hyp-custom-endpoint - -Invoke a custom model endpoint. - -#### Syntax - -```bash -hyp invoke hyp-custom-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--endpoint-name TEXT`: Name of the endpoint to invoke -- `--body TEXT`: Request body (JSON format) - -#### Optional Parameters - -- `--content-type TEXT`: Content type of the request (default: "application/json") - -### hyp delete hyp-jumpstart-endpoint - -Delete a JumpStart model endpoint. - -#### Syntax - -```bash -hyp delete hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--name TEXT`: Name of the endpoint to delete - -#### Optional Parameters - -- `--namespace TEXT`: Namespace of the endpoint (default: "default") - -### hyp delete hyp-custom-endpoint - -Delete a custom model endpoint. - -#### Syntax - -```bash -hyp delete hyp-custom-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--name TEXT`: Name of the endpoint to delete - -#### Optional Parameters - -- `--namespace TEXT`: Namespace of the endpoint (default: "default") - -### hyp list-pods hyp-jumpstart-endpoint - -List pods for JumpStart endpoints. - -#### Syntax - -```bash -hyp list-pods hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Optional Parameters - -- `--namespace TEXT`: Namespace to list pods from (default: "default") - -### hyp list-pods hyp-custom-endpoint - -List pods for custom endpoints. - -#### Syntax - -```bash -hyp list-pods hyp-custom-endpoint [OPTIONS] -``` - -#### Optional Parameters - -- `--namespace TEXT`: Namespace to list pods from (default: "default") - -### hyp get-logs hyp-jumpstart-endpoint - -Get logs from JumpStart endpoint pods. 
- -#### Syntax - -```bash -hyp get-logs hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--pod-name TEXT`: Name of the pod to get logs from - -#### Optional Parameters - -- `--container TEXT`: Container name to get logs from -- `--namespace TEXT`: Namespace of the pod (default: "default") - -### hyp get-logs hyp-custom-endpoint - -Get logs from custom endpoint pods. - -#### Syntax - -```bash -hyp get-logs hyp-custom-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--pod-name TEXT`: Name of the pod to get logs from - -#### Optional Parameters - -- `--container TEXT`: Container name to get logs from -- `--namespace TEXT`: Namespace of the pod (default: "default") - -### hyp get-operator-logs hyp-jumpstart-endpoint - -Get operator logs for JumpStart endpoints. - -#### Syntax - -```bash -hyp get-operator-logs hyp-jumpstart-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--since-hours FLOAT`: Time frame to get logs for (in hours) - -### hyp get-operator-logs hyp-custom-endpoint - -Get operator logs for custom endpoints. - -#### Syntax - -```bash -hyp get-operator-logs hyp-custom-endpoint [OPTIONS] -``` - -#### Required Parameters - -- `--since-hours FLOAT`: Time frame to get logs for (in hours) - -## Parameter Reference - -### Common Parameters Across Commands - -| Parameter | Type | Description | Default | -|-----------|------|-------------|---------| -| `--namespace` | TEXT | Kubernetes namespace | Current context | -| `--help` | FLAG | Show command help | - | diff --git a/doc/cli_training.md b/doc/cli_training.md deleted file mode 100644 index b483f7eb..00000000 --- a/doc/cli_training.md +++ /dev/null @@ -1,172 +0,0 @@ -(cli_training)= - - -# Training - -Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. - -* [Create PyTorch Job](#hyp-create-hyp-pytorch-job) -* [List Jobs](#hyp-list-hyp-pytorch-job) -* [Describe Job](#hyp-describe-hyp-pytorch-job) -* [Delete Job](#hyp-delete-hyp-pytorch-job) -* [List Pods](#hyp-list-pods-hyp-pytorch-job) -* [Get Logs](#hyp-get-logs-hyp-pytorch-job) - - -## hyp create hyp-pytorch-job - -Create distributed PyTorch training jobs on SageMaker HyperPod clusters. - -### Syntax - -```bash -hyp create hyp-pytorch-job [OPTIONS] -``` - -### Required Parameters - -- `--job-name TEXT`: Unique name for the training job (1-63 characters, alphanumeric with hyphens) -- `--image TEXT`: Docker image URI containing your training code - -### Optional Parameters - -- `--namespace TEXT`: Kubernetes namespace -- `--command ARRAY`: Command to run in the container (array of strings) -- `--args ARRAY`: Arguments for the entry script (array of strings) -- `--environment OBJECT`: Environment variables as key-value pairs -- `--pull-policy TEXT`: Image pull policy (Always, Never, IfNotPresent) -- `--instance-type TEXT`: Instance type for training -- `--node-count INTEGER`: Number of nodes (minimum: 1) -- `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1) -- `--label-selector OBJECT`: Node label selector as key-value pairs -- `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check (default: false) -- `--scheduler-type TEXT`: If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. 
-- `--queue-name TEXT`: Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) -- `--priority TEXT`: Priority class for job scheduling -- `--max-retry INTEGER`: Maximum number of job retries (minimum: 0) -- `--volume ARRAY`: List of volume configurations (Refer [Volume Configuration](#volume-configuration) for detailed parameter info) -- `--service-account-name TEXT`: Service account name - -### Volume Configuration - -The `--volume` parameter supports mounting different types of storage to your training containers. - -### Volume Syntax - -```bash ---volume name=,type=,mount_path=[,additional_options] -``` - -### Volume Types - -**hostPath Volume** -```bash ---volume name=model-data,type=hostPath,mount_path=/data,path=/host/data -``` - -**Persistent Volume Claim (PVC)** -```bash ---volume name=training-output,type=pvc,mount_path=/output,claim_name=training-pvc,read_only=false -``` - -### Volume Parameters - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `name` | TEXT | Yes | Volume name | -| `type` | TEXT | Yes | Volume type (`hostPath` or `pvc`) | -| `mount_path` | TEXT | Yes | Mount path in container | -| `path` | TEXT | For hostPath | Host path for hostPath volumes | -| `claim_name` | TEXT | For pvc | PVC claim name for pvc volumes | -| `read_only` | BOOLEAN | No | Read-only flag for pvc volumes | - -## Training Job Management Commands - -Commands for managing PyTorch training jobs. - -### hyp list hyp-pytorch-job - -List all HyperPod PyTorch jobs in a namespace. - -#### Syntax - -```bash -hyp list hyp-pytorch-job [OPTIONS] -``` - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace to list jobs from (default: "default") - -### hyp describe hyp-pytorch-job - -Describe a specific HyperPod PyTorch job. - -#### Syntax - -```bash -hyp describe hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job to describe - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - -### hyp delete hyp-pytorch-job - -Delete a HyperPod PyTorch job. - -#### Syntax - -```bash -hyp delete hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job to delete - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - -### hyp list-pods hyp-pytorch-job - -List all pods associated with a PyTorch job. - -#### Syntax - -```bash -hyp list-pods hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job to list pods for - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") - -### hyp get-logs hyp-pytorch-job - -Get logs from a specific pod in a PyTorch job. 
- -#### Syntax - -```bash -hyp get-logs hyp-pytorch-job [OPTIONS] -``` - -#### Required Parameters - -- `--job-name TEXT`: Name of the job -- `--pod-name TEXT`: Name of the pod to get logs from - -#### Optional Parameters - -- `--namespace, -n TEXT`: Namespace of the job (default: "default") diff --git a/doc/conf.py b/doc/conf.py index cf944cf8..3bcc39e0 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,12 +19,17 @@ import re import json from pathlib import Path -from typing import Dict, List, Any, Optional +from typing import Dict, List, Any, Optional, ClassVar +# Mock kubernetes.config before adding source path to prevent import errors +from unittest.mock import MagicMock +import types +kubernetes_config = types.ModuleType('kubernetes.config') +kubernetes_config.KUBE_CONFIG_DEFAULT_LOCATION = "~/.kube/config" +sys.modules['kubernetes.config'] = kubernetes_config - -def setup(app): - """Register our sphinx hooks.""" +# Add the source directory to Python path +sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) # Get version from setup.py @@ -71,10 +76,12 @@ def get_version(): "sphinx_copybutton", "sphinx.ext.autosummary", "sphinx.ext.autosectionlabel", + "sphinx_design", + "sphinx_click" ] -autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j"] +autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j", "boto3", "botocore", "kubernetes", "yaml", "sagemaker_core"] source_suffix = { '.rst': 'restructuredtext', @@ -82,8 +89,19 @@ def get_version(): '.md': 'myst-nb', } -autoclass_content = "both" -autodoc_default_flags = ["show-inheritance", "members", "undoc-members"] +autoclass_content = "class" +autodoc_class_signature = "mixed" +autodoc_default_options = { + "members": True, + "undoc-members": False, + "private-members": False, + "special-members": False, + "show-inheritance": False, +} + +# Don't document class attributes automatically +autodoc_typehints_format = "short" +autodoc_preserve_defaults = True autodoc_member_order = "bysource" default_role = "py:obj" @@ -103,9 +121,9 @@ def get_version(): "use_fullscreen_button": False, "use_download_button": False, "home_page_in_toc": True, - # Configuration to disable right-side table of contents - "secondary_sidebar_items": [], # Remove all content from right sidebar - "show_toc_level": 0, # Disable automatic TOC generation + "secondary_sidebar_items": ["edit-this-page", "page-toc"], + "toc_title": "Table of contents", + "show_toc_level": 3, } author = "Amazon Web Services" @@ -117,6 +135,14 @@ def get_version(): "search_accessories.css", ] napoleon_use_rtype = False +napoleon_use_param = False +napoleon_include_init_with_doc = False +napoleon_use_ivar = True +napoleon_parameter_style = "table" +napoleon_type_aliases = None +napoleon_custom_sections = [('Parameters', 'params_style')] + +viewcode_line_numbers = True # nbsphinx configuration nbsphinx_allow_errors = True @@ -135,6 +161,7 @@ def get_version(): "smartquotes", "substitution", "tasklist", + "attrs_inline", ] myst_heading_anchors = 3 nb_execution_mode = "off" @@ -146,11 +173,20 @@ def get_version(): # Automatically extract typehints when specified and place them in # descriptions of the relevant function/method. 

-autodoc_typehints = "description"
+autodoc_typehints = "signature"
+
+# Clean documentation without Pydantic boilerplate
+# Hide constructor signature and parameters
+autodoc_class_signature = "separated"
+autodoc_member_order = "bysource"
+
+def setup(app):
+    pass
+

# autosummary
autosummary_generate = True
+autosummary_ignore_module_all = False

# autosectionlabel
autosectionlabel_prefix_document = True
\ No newline at end of file
diff --git a/doc/getting_started.md b/doc/getting_started.md
index a7b34103..718ab168 100644
--- a/doc/getting_started.md
+++ b/doc/getting_started.md
@@ -6,13 +6,18 @@
:hidden:
:maxdepth: 1

-Training <training>
-Inference <inference>
+Cluster Management <getting_started/cluster_management>
+Training <getting_started/training>
+Inference <getting_started/inference>
```

This guide will help you get started with the SageMaker HyperPod CLI and SDK to perform basic operations.

+```{note}
+**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration.
+```
+
## List Available Clusters

List all available SageMaker HyperPod clusters in your account:
diff --git a/doc/getting_started/cluster_management.rst b/doc/getting_started/cluster_management.rst
new file mode 100644
index 00000000..ad4f3dea
--- /dev/null
+++ b/doc/getting_started/cluster_management.rst
@@ -0,0 +1,220 @@
Cluster Management
===============================================

This guide will help you create and manage your first HyperPod cluster using the CLI.

Prerequisites
-------------

Before you begin, ensure you have:

- An AWS account with appropriate permissions for SageMaker HyperPod
- AWS CLI configured with your credentials
- HyperPod CLI installed (``pip install sagemaker-hyperpod``)

.. note::
   **Region Configuration**: For commands that accept the ``--region`` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration.

Creating Your First Cluster
----------------------------

1. Start with a Clean Directory
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

It's recommended to start with a new, clean directory for each cluster configuration:

.. code-block:: bash

    mkdir my-hyperpod-cluster
    cd my-hyperpod-cluster

2. Initialize a New Cluster Configuration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. tab-set::

    .. tab-item:: CLI

        .. code-block:: bash

            hyp init hyp-cluster

This creates three files:

- ``config.yaml``: The main configuration file you'll use to customize your cluster
- ``cfn_params.jinja``: A reference template for CloudFormation parameters
- ``README.md``: Usage guide with instructions and examples

.. important::
   The ``resource_name_prefix`` parameter in the generated ``config.yaml`` file serves as the primary identifier for all AWS resources created during deployment. Each deployment must use a unique resource name prefix to avoid conflicts. This prefix is automatically appended with a unique identifier during cluster creation to ensure resource uniqueness.

3. Configure Your Cluster
~~~~~~~~~~~~~~~~~~~~~~~~~~

You can configure your cluster in two ways:

**Option 1: Edit config.yaml directly**

The config.yaml file contains key parameters like:

.. code-block:: yaml

    template: hyp-cluster
    namespace: kube-system
    stage: gamma
    resource_name_prefix: sagemaker-hyperpod-eks

**Option 2: Use CLI/SDK commands**

.. tab-set::

    .. tab-item:: CLI

        .. code-block:: bash

            hyp configure --resource-name-prefix your-resource-prefix

4. 
Create the Cluster
+~~~~~~~~~~~~~~~~~~~~~
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            hyp create --region your-region
+
+This will:
+
+- Validate your configuration
+- Create a timestamped folder in the ``run`` directory
+- Initialize the cluster creation process
+
+5. Monitor Your Cluster
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Check the status of your cluster:
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            hyp describe hyp-cluster your-cluster-name --region your-region
+
+    .. tab-item:: SDK
+
+        .. code-block:: python
+
+            from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack
+
+            # Describe a specific cluster stack
+            response = HpClusterStack.describe("your-cluster-name", region="your-region")
+            print(f"Stack Status: {response['Stacks'][0]['StackStatus']}")
+            print(f"Stack Name: {response['Stacks'][0]['StackName']}")
+
+
+List all clusters:
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            hyp list hyp-cluster --region your-region
+
+    .. tab-item:: SDK
+
+        .. code-block:: python
+
+            from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack
+
+            # List all CloudFormation stacks (including cluster stacks)
+            stacks = HpClusterStack.list(region="your-region")
+            for stack in stacks['StackSummaries']:
+                print(f"Stack: {stack['StackName']}, Status: {stack['StackStatus']}")
+
+
+Common Operations
+-----------------
+
+Update a Cluster
+~~~~~~~~~~~~~~~~~
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            hyp update hyp-cluster \
+                --cluster-name your-cluster-name \
+                --instance-groups "[]" \
+                --region your-region
+
+Reset Configuration
+~~~~~~~~~~~~~~~~~~~
+
+.. tab-set::
+
+    .. tab-item:: CLI
+
+        .. code-block:: bash
+
+            hyp reset
+
+
+Best Practices
+--------------
+
+- Always validate your configuration before submission:
+
+  .. tab-set::
+
+      .. tab-item:: CLI
+
+          .. code-block:: bash
+
+              hyp validate
+
+  .. note::
+     This command performs **syntactic validation only** of the ``config.yaml`` file against the appropriate schema. It checks:
+
+     - **YAML syntax**: Ensures the file is valid YAML
+     - **Required fields**: Verifies all mandatory fields are present
+     - **Data types**: Confirms field values match expected types (string, number, boolean, array)
+     - **Schema structure**: Validates against the template's defined structure
+
+     It does **not** verify the actual validity of values (e.g., whether AWS regions exist, instance types are available, or resources can be created).
+
+- Use meaningful resource prefixes to easily identify your clusters
+- Monitor cluster status regularly after creation
+- Keep your configuration files in version control for reproducibility
+
+Next Steps
+----------
+
+After creating your cluster, you can:
+
+- Connect to your cluster:
+
+  .. tab-set::
+
+      .. tab-item:: CLI
+
+          .. code-block:: bash
+
+              hyp set-cluster-context --cluster-name your-cluster-name
+
+- Start training jobs with PyTorch
+- Deploy inference endpoints
+- Monitor cluster resources and performance
+
+For more detailed information on specific commands, use the ``--help`` flag:
+
+.. 
code-block:: bash + + hyp --help \ No newline at end of file diff --git a/doc/inference.md b/doc/getting_started/inference.md similarity index 89% rename from doc/inference.md rename to doc/getting_started/inference.md index aa81a327..9b53139c 100644 --- a/doc/inference.md +++ b/doc/getting_started/inference.md @@ -15,6 +15,10 @@ SageMaker HyperPod inference endpoints allow you to: - Invoke endpoints for real-time predictions - Monitor endpoint performance +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. +``` + ## Creating Inference Endpoints You can create inference endpoints using either JumpStart models or custom models: @@ -130,18 +134,24 @@ custom_endpoint.create() When creating an inference endpoint, you'll need to specify: 1. **Parameters required for Jumpstart Endpoint** - - **endpoint-name**: Unique identifier for your endpoint - - **instance-type**: The EC2 instance type to use - - **model-id**: ID of the pre-trained JumpStart model + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| **endpoint-name** | TEXT | Yes | Unique identifier for your endpoint | +| **instance-type** | TEXT | Yes | The EC2 instance type to use | +| **model-id** | TEXT | Yes | ID of the pre-trained JumpStart model | 2. **Parameters required for Custom Endpoint** - - **endpoint-name**: Unique identifier for your endpoint - - **instance-type**: The EC2 instance type to use - - **image-uri**: Docker image containing your inference code - - **model-name**: Name of model to create on SageMaker - - **model-source-type**: Source type: fsx or s3 - - **model-volume-mount-name**: Name of the model volume mount - - **container-port**: Port on which the model server listens + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| **endpoint-name** | TEXT | Yes | Unique identifier for your endpoint | +| **instance-type** | TEXT | Yes | The EC2 instance type to use | +| **image-uri** | TEXT | Yes | Docker image containing your inference code | +| **model-name** | TEXT | Yes | Name of model to create on SageMaker | +| **model-source-type** | TEXT | Yes | Source type: fsx or s3 | +| **model-volume-mount-name** | TEXT | Yes | Name of the model volume mount | +| **container-port** | INTEGER | Yes | Port on which the model server listens | ## Managing Inference Endpoints diff --git a/doc/training.md b/doc/getting_started/training.md similarity index 80% rename from doc/training.md rename to doc/getting_started/training.md index 7d49ae57..cd26cf46 100644 --- a/doc/training.md +++ b/doc/getting_started/training.md @@ -24,6 +24,10 @@ SageMaker HyperPod training jobs allow you to: - Manage job scheduling and priorities - Mount volumes and persistent volume claims +```{note} +**Region Configuration**: For commands that accept the `--region` option, if no region is explicitly provided, the command will use the default region from your AWS credentials configuration. 
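+
+For example, `hyp list hyp-cluster --region us-west-2` explicitly targets us-west-2 rather than the credential default.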
+```
+
 ## Creating Training Jobs
 
 You can create training jobs using either the CLI or SDK approach:
@@ -90,8 +94,19 @@ pytorch_job.create()
 
 When creating a training job, you'll need to specify:
 
-- **job-name**: Unique identifier for your training job
-- **image**: Docker image containing your training environment
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| **job-name** | TEXT | Yes | Unique identifier for your training job |
+| **image** | TEXT | Yes | Docker image containing your training environment |
+| **accelerators** | INTEGER | No | Number of accelerators (GPUs or Trainium chips) |
+| **vcpu** | FLOAT | No | Number of vCPUs |
+| **memory** | FLOAT | No | Amount of memory in GiB |
+| **accelerators-limit** | INTEGER | No | Limit for the number of accelerators (GPUs or Trainium chips) |
+| **vcpu-limit** | FLOAT | No | Limit for the number of vCPUs |
+| **memory-limit** | FLOAT | No | Limit for the amount of memory in GiB |
+| **preferred-topology** | TEXT | No | Preferred topology annotation for scheduling |
+| **required-topology** | TEXT | No | Required topology annotation for scheduling |
+| **debug** | FLAG | No | Enable debug mode |
 
 ## Managing Training Jobs
 
diff --git a/doc/index.md b/doc/index.md
index 8551d445..39e697c6 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -17,8 +17,8 @@ keywords:
 
 Installation
 Getting Started
-CLI Reference
-SDK reference
+CLI Reference
+SDK Reference
 Advanced
 Resources
 ```
 
@@ -93,7 +93,7 @@ Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI
 :gutter: 3
 
 :::{grid-item-card} API reference
-:link: api/api_index.html
+:link: sdk/sdk_index.html
 :class-card: sd-border-primary
 
 **Explore APIs** - Checkout API Documentation
diff --git a/doc/requirements.txt b/doc/requirements.txt
index a9f4a087..98058a3c 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -8,3 +8,5 @@ linkify-it-py>=2.0.0
 sphinx-design>=0.5.0
 sphinx-tabs>=3.4.1
 sphinx-copybutton
+autodoc-pydantic>=2.0.0
+sphinx-click>=6.0.0
diff --git a/doc/sdk/cluster_management/hp_cluster_stack.rst b/doc/sdk/cluster_management/hp_cluster_stack.rst
new file mode 100644
index 00000000..f89de192
--- /dev/null
+++ b/doc/sdk/cluster_management/hp_cluster_stack.rst
@@ -0,0 +1,7 @@
+Cluster Management
+================================
+
+.. automodule:: sagemaker.hyperpod.cluster_management.hp_cluster_stack
+   :exclude-members: model_config
+   :no-undoc-members:
+   :no-show-inheritance:
\ No newline at end of file
diff --git a/doc/api/inference/hp_endpoint.rst b/doc/sdk/inference/hp_endpoint.rst
similarity index 50%
rename from doc/api/inference/hp_endpoint.rst
rename to doc/sdk/inference/hp_endpoint.rst
index 53afbad0..7fb1fb08 100644
--- a/doc/api/inference/hp_endpoint.rst
+++ b/doc/sdk/inference/hp_endpoint.rst
@@ -7,39 +7,19 @@ Inference
 
 * `HPEndpoint Configs`_
 
-HPEndpointBase
--------------------
-
 .. automodule:: sagemaker.hyperpod.inference.hp_endpoint_base
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-HPEndpoint
--------------------
-
+   :exclude-members: is_kubeconfig_loaded, get_logger, verify_kube_config
+   :no-undoc-members:
+   :no-show-inheritance:
+
 .. automodule:: sagemaker.hyperpod.inference.hp_endpoint
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-HPJumpStartEndpoint
---------------------
+    :no-undoc-members:
 
 .. 
automodule:: sagemaker.hyperpod.inference.hp_jumpstart_endpoint - :members: - :undoc-members: - :show-inheritance: - -HPEndpoint Configs -------------------- + :no-undoc-members: .. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config - :members: - :undoc-members: - :show-inheritance: + :no-undoc-members: .. automodule:: sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config - :members: - :undoc-members: - :show-inheritance: + :no-undoc-members: diff --git a/doc/sdk/metadata.rst b/doc/sdk/metadata.rst new file mode 100644 index 00000000..6ae5472d --- /dev/null +++ b/doc/sdk/metadata.rst @@ -0,0 +1,7 @@ +Metadata +------------ + +.. automodule:: sagemaker.hyperpod.common.config.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/api_index.rst b/doc/sdk/sdk_index.rst similarity index 70% rename from doc/api/api_index.rst rename to doc/sdk/sdk_index.rst index b5d37197..7bdad56b 100644 --- a/doc/api/api_index.rst +++ b/doc/sdk/sdk_index.rst @@ -6,6 +6,7 @@ SDK Reference :hidden: :maxdepth: 2 + cluster_management/hp_cluster_stack training/hyperpod_pytorch_job inference/hp_endpoint @@ -16,6 +17,13 @@ Complete reference for the SageMaker HyperPod SDK. .. grid:: 1 1 3 3 :gutter: 3 + .. grid-item-card:: Cluster Management SDK + :link: cluster_management/hp_cluster_stack + :link-type: doc + :class-card: sd-border-secondary + + Cluster Management SDK classes, methods and parameters. + .. grid-item-card:: Training SDK :link: training/hyperpod_pytorch_job :link-type: doc diff --git a/doc/api/training/hyperpod_pytorch_job.rst b/doc/sdk/training/hyperpod_pytorch_job.rst similarity index 57% rename from doc/api/training/hyperpod_pytorch_job.rst rename to doc/sdk/training/hyperpod_pytorch_job.rst index 6a33dddd..779bc85e 100644 --- a/doc/api/training/hyperpod_pytorch_job.rst +++ b/doc/sdk/training/hyperpod_pytorch_job.rst @@ -8,9 +8,8 @@ Training HyperPodPytorchJob ------------------- -.. automodule:: sagemaker.hyperpod.training.hyperpod_pytorch_job - :members: - :undoc-members: +.. autoclass:: sagemaker.hyperpod.training.hyperpod_pytorch_job.HyperPodPytorchJob + :exclude-members: is_kubeconfig_loaded, model_config, metadata, status, get_logger, verify_kube_config :show-inheritance: @@ -18,7 +17,5 @@ HyperPodPytorchJob Configs --------------------------- .. automodule:: sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config - :members: - :undoc-members: + :members: _HyperPodPytorchJob :show-inheritance: - diff --git a/examples/cluster_management/cluster_creation_init_experience.ipynb b/examples/cluster_management/cluster_creation_init_experience.ipynb new file mode 100644 index 00000000..db01dcc6 --- /dev/null +++ b/examples/cluster_management/cluster_creation_init_experience.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Cluster Creation - Init Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod cluster using the HyperPod CLI. The init experience provides a guided approach to cluster creation with validation and configuration management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- AWS CLI configured with appropriate permissions\n", + "- SageMaker HyperPod CLI installed (`pip install sagemaker-hyperpod`)\n", + "- Helm installed (required for cluster operations)\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. 
**Initialize** - Create initial cluster configuration\n",
+    "2. **Configure** - Customize cluster settings and tags\n",
+    "3. **Validate** - Verify configuration before deployment\n",
+    "4. **Create** - Deploy the cluster infrastructure\n",
+    "5. **Monitor** - Check cluster status and manage lifecycle\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1: Initialize Cluster Configuration\n",
+    "\n",
+    "The `hyp init cluster-stack` command creates a new cluster configuration template with default settings. This generates a `config.yaml` file that serves as the foundation for your cluster deployment.\n",
+    "\n",
+    "**What this does:**\n",
+    "- Creates a new `config.yaml` with default cluster settings\n",
+    "- Sets up basic infrastructure components (VPC, EKS, S3, etc.)\n",
+    "- Generates unique resource names to avoid conflicts\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Initialize a new cluster stack configuration\n",
+    "!hyp init cluster-stack"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Configure Cluster Settings\n",
+    "\n",
+    "The `hyp configure` command allows you to customize your cluster configuration. You can add tags for resource management, modify instance types, adjust networking settings, and more.\n",
+    "\n",
+    "**Key configuration options:**\n",
+    "- **Tags**: For resource organization and cost tracking\n",
+    "- **Instance Groups**: Define compute resources and their specifications\n",
+    "- **Networking**: VPC, subnets, and security group settings\n",
+    "- **Storage**: FSx and EBS volume configurations\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Configure cluster with custom tags for resource management\n",
+    "# Tags help with cost tracking, resource organization, and compliance\n",
+    "!hyp configure --tags '[{\"Key\": \"Environment\", \"Value\": \"Development\"}, {\"Key\": \"Project\", \"Value\": \"MLTraining\"}, {\"Key\": \"Owner\", \"Value\": \"DataScienceTeam\"}, {\"Key\": \"CostCenter\", \"Value\": \"ML-Research\"}]'"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Current Configuration\n",
+    "\n",
+    "Let's examine the generated configuration to understand what will be deployed:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Display the current configuration\n",
+    "!cat config.yaml | head -50"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3: Validate Configuration\n",
+    "\n",
+    "The `hyp validate` command checks your cluster configuration before deployment. This helps catch configuration errors early.\n",
+    "\n",
+    "As noted in the CLI reference, this is **syntactic validation** of `config.yaml` against the template schema. **Validation checks include:**\n",
+    "- YAML syntax\n",
+    "- Required fields\n",
+    "- Data types (string, number, boolean, array)\n",
+    "- Schema structure\n",
+    "\n",
+    "It does **not** verify AWS-side values, such as whether regions exist, instance types are available, or quotas are sufficient.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Validate the cluster configuration\n",
+    "# This checks the configuration syntax before deployment\n",
+    "!hyp validate"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4: Reset Configuration (Optional)\n",
+    "\n",
+    "The `hyp reset` command allows you to reset your configuration to defaults or clean up any partial deployments. This is useful when you want to start fresh or if validation reveals issues that require a clean slate.\n",
+    "\n",
+    "**Use cases for reset:**\n",
+    "- Starting over with a clean configuration\n",
+    "- Cleaning up after failed deployments\n",
+    "- Switching between different cluster configurations\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Reset configuration if needed (uncomment to use)\n",
+    "# !hyp reset\n",
+    "\n",
+    "print(\"Reset command available if configuration changes are needed\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 5: Create the Cluster\n",
+    "\n",
+    "The `hyp create` command deploys your HyperPod cluster infrastructure. This process creates all the necessary AWS resources including VPC, EKS cluster, IAM roles, S3 buckets, and the HyperPod cluster itself.\n",
+    "\n",
+    "**Deployment includes:**\n",
+    "- VPC and networking infrastructure\n",
+    "- EKS cluster with managed node groups\n",
+    "- SageMaker HyperPod cluster\n",
+    "- IAM roles and policies\n",
+    "- S3 buckets for artifacts\n",
+    "- FSx file system (if configured)\n",
+    "\n",
+    "**Note:** This process typically takes 15-30 minutes to complete.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Create the HyperPod cluster\n",
+    "# This will deploy all infrastructure components\n",
+    "!hyp create"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 6: Monitor Cluster Creation\n",
+    "\n",
+    "While the cluster is being created, you can monitor its progress using the describe and list commands. 
These provide real-time status updates on the deployment process.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Check cluster creation status\n", + "import time\n", + "\n", + "print(\"Monitoring cluster creation progress...\")\n", + "for i in range(5):\n", + " print(f\"\\n--- Status Check {i+1} ---\")\n", + " !hyp describe cluster-stack \n", + " time.sleep(30) # Wait 30 seconds between checks" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Describe Cluster Stack\n", + "\n", + "The `hyp describe cluster-stack` command provides detailed information about your deployed cluster, including resource IDs, endpoints, and current status.\n", + "\n", + "**Information provided:**\n", + "- Cluster status and health\n", + "- Resource ARNs and IDs\n", + "- Network configuration details\n", + "- Instance group information\n", + "- Storage configuration\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Get detailed information about the cluster stack\n", + "!hyp describe cluster-stack " + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: List All Cluster Stacks\n", + "\n", + "The `hyp list cluster-stack` command shows all HyperPod cluster stacks in your account. This is useful for managing multiple clusters and getting an overview of your infrastructure.\n", + "\n", + "**Displays:**\n", + "- All cluster stacks in the current region\n", + "- Stack names and creation timestamps\n", + "- Current status of each stack\n", + "- Resource counts and types\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# List all cluster stacks in your account\n", + "!hyp list cluster-stack" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 9: Update Cluster Configuration\n", + "\n", + "The `hyp update cluster` command allows you to modify your existing cluster configuration. 
You can add or remove instance groups, update tags, or modify other cluster settings.\n", + "\n", + "**Common update scenarios:**\n", + "- Scaling instance groups up or down\n", + "- Adding new instance types\n", + "- Updating cluster tags\n", + "- Modifying storage configurations\n", + "\n", + "**Note:** Some changes may require cluster restart or recreation.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Update cluster configuration (example: adding more tags)\n", + "# Uncomment and modify as needed\n", + "# !hyp update cluster --add-tags '[{\"Key\": \"UpdatedBy\", \"Value\": \"NotebookExample\"}]'\n", + "\n", + "print(\"Update command available for cluster modifications\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 10: Verify Cluster Connectivity\n", + "\n", + "Once your cluster is created, verify that you can connect to it and that all components are functioning properly.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Set cluster context for kubectl operations\n", + "# Replace 'your-cluster-name' with your actual cluster name\n", + "# !hyp set-cluster-context --cluster-name your-cluster-name\n", + "\n", + "# Get cluster context information\n", + "# !hyp get-cluster-context\n", + "\n", + "print(\"Cluster connectivity commands available after deployment\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod cluster, you can:\n", + "\n", + "1. **Submit Training Jobs**: Use `hyp create hyp-pytorch-job` to run distributed training\n", + "2. **Deploy Inference Endpoints**: Use `hyp create hyp-jumpstart-endpoint` for model serving\n", + "3. **Monitor Resources**: Check pod status with `hyp list-pods`\n", + "4. **Access Logs**: View training logs with `hyp get-logs`\n", + "5. 
**Scale Cluster**: Add or remove instance groups as needed\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during cluster creation:\n", + "\n", + "- Check AWS CloudFormation console for detailed error messages\n", + "- Verify AWS credentials and permissions\n", + "- Ensure resource quotas are sufficient\n", + "- Review the configuration file for syntax errors\n", + "- Use `hyp validate` to identify configuration issues\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your cluster when no longer needed:\n", + "\n", + "```bash\n", + "hyp delete cluster-stack --stack-name your-stack-name\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated the complete HyperPod cluster creation workflow:\n", + "\n", + "✅ **Initialized** cluster configuration with `hyp init cluster-stack` \n", + "✅ **Configured** cluster settings and tags with `hyp configure` \n", + "✅ **Validated** configuration with `hyp validate` \n", + "✅ **Created** cluster infrastructure with `hyp create` \n", + "✅ **Monitored** deployment with `hyp describe cluster-stack` \n", + "✅ **Listed** all clusters with `hyp list cluster-stack` \n", + "✅ **Updated** cluster configuration with `hyp update cluster` \n", + "\n", + "Your HyperPod cluster is now ready for distributed machine learning workloads!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/cluster_management/cluster_creation_sdk_experience.ipynb b/examples/cluster_management/cluster_creation_sdk_experience.ipynb new file mode 100644 index 00000000..ce176052 --- /dev/null +++ b/examples/cluster_management/cluster_creation_sdk_experience.ipynb @@ -0,0 +1,683 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SageMaker HyperPod Cluster Creation - SDK Experience\n", + "\n", + "This notebook demonstrates the complete end-to-end workflow for creating a SageMaker HyperPod cluster using the HyperPod SDK with the HpClusterStack class. The SDK provides programmatic control over cluster lifecycle management.\n", + "\n", + "## Prerequisites\n", + "\n", + "- AWS CLI configured with appropriate permissions\n", + "- SageMaker HyperPod SDK installed (`pip install sagemaker-hyperpod`)\n", + "- SageMaker Core SDK installed (`pip install sagemaker-core`)\n", + "- Python 3.8+ environment\n", + "\n", + "## Workflow Overview\n", + "\n", + "1. **Initialize** - Create HpClusterStack instance with configuration\n", + "2. **Configure** - Set cluster settings and tags programmatically\n", + "3. **Create** - Deploy the cluster infrastructure\n", + "4. **Monitor** - Check cluster status and manage lifecycle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Required Libraries and Initialize Configuration\n", + "\n", + "First, we'll import the necessary SDK components and create an HpClusterStack instance with default settings. 
This is equivalent to `hyp init cluster-stack` in the CLI.\n", + "\n", + "**What this does:**\n", + "- Imports HpClusterStack and related classes\n", + "- Creates cluster configuration with default settings\n", + "- Sets up basic infrastructure components (VPC, EKS, S3, etc.)\n", + "- Generates unique resource names to avoid conflicts" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import uuid\n", + "import time\n", + "from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack\n", + "from sagemaker_core.main.resources import Cluster\n", + "\n", + "# Generate unique resource prefix to avoid conflicts\n", + "resource_prefix = f\"hyperpod-sdk-{str(uuid.uuid4())[:8]}\"\n", + "\n", + "# Initialize cluster stack configuration (equivalent to hyp init cluster-stack)\n", + "cluster_stack = HpClusterStack(\n", + " stage=\"prod\",\n", + " resource_name_prefix=resource_prefix,\n", + " hyperpod_cluster_name=f\"{resource_prefix}-cluster\",\n", + " eks_cluster_name=f\"{resource_prefix}-eks\",\n", + " s3_bucket_name=f\"{resource_prefix}-s3-bucket\",\n", + " sagemaker_iam_role_name=f\"{resource_prefix}-iam-role\",\n", + " \n", + " # Infrastructure components to create\n", + " create_vpc_stack=True,\n", + " create_security_group_stack=True,\n", + " create_eks_cluster_stack=True,\n", + " create_s3_bucket_stack=True,\n", + " create_s3_endpoint_stack=True,\n", + " create_life_cycle_script_stack=True,\n", + " create_sagemaker_iam_role_stack=True,\n", + " create_helm_chart_stack=True,\n", + " create_hyperpod_cluster_stack=True,\n", + " create_fsx_stack=True,\n", + " \n", + " # Network configuration\n", + " vpc_cidr=\"10.192.0.0/16\",\n", + " availability_zone_ids=[\"use2-az1\", \"use2-az2\", \"use2-az3\"],\n", + " \n", + " # Kubernetes configuration\n", + " kubernetes_version=\"1.31\",\n", + " node_provisioning_mode=\"Continuous\",\n", + " \n", + " # Instance group configuration\n", + " instance_group_settings=[\n", + " {\n", + " \"InstanceCount\": 1,\n", + " \"InstanceGroupName\": \"controller-group\",\n", + " \"InstanceType\": \"ml.t3.medium\",\n", + " \"TargetAvailabilityZoneId\": \"use2-az2\",\n", + " \"ThreadsPerCore\": 1,\n", + " \"InstanceStorageConfigs\": [\n", + " {\"EbsVolumeConfig\": {\"VolumeSizeInGB\": 500}}\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(f\"Initialized cluster stack with prefix: {resource_prefix}\")\n", + "print(f\"Cluster name: {cluster_stack.hyperpod_cluster_name}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Cluster Settings and Tags\n", + "\n", + "Configure the cluster with custom tags and additional settings. 
This is equivalent to `hyp configure --tags []` in the CLI.\n", + "\n", + "**Key configuration options:**\n", + "- **Tags**: For resource organization and cost tracking\n", + "- **Instance Groups**: Define compute resources and their specifications\n", + "- **Networking**: VPC, subnets, and security group settings\n", + "- **Storage**: FSx and EBS volume configurations" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Configure cluster with custom tags (equivalent to hyp configure --tags)\n", + "cluster_tags = [\n", + " {\"Key\": \"Environment\", \"Value\": \"Development\"},\n", + " {\"Key\": \"Project\", \"Value\": \"MLTraining\"},\n", + " {\"Key\": \"Owner\", \"Value\": \"DataScienceTeam\"},\n", + " {\"Key\": \"CostCenter\", \"Value\": \"ML-Research\"},\n", + " {\"Key\": \"CreatedBy\", \"Value\": \"SDK-Example\"}\n", + "]\n", + "\n", + "# Update cluster stack with tags\n", + "cluster_stack.tags = cluster_tags\n", + "\n", + "# Additional configuration options\n", + "cluster_stack.node_recovery = \"Automatic\"\n", + "cluster_stack.fsx_availability_zone_id = \"use2-az2\"\n", + "cluster_stack.storage_capacity = 1200\n", + "cluster_stack.per_unit_storage_throughput = 250\n", + "\n", + "print(\"Configured cluster with custom tags:\")\n", + "for tag in cluster_tags:\n", + " print(f\" {tag['Key']}: {tag['Value']}\")\n", + "\n", + "print(f\"\\nNode recovery: {cluster_stack.node_recovery}\")\n", + "print(f\"FSx storage capacity: {cluster_stack.storage_capacity} GiB\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Current Configuration\n", + "\n", + "Let's examine the current configuration to understand what will be deployed:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Display current configuration details\n", + "print(\"=== Cluster Configuration ===\")\n", + "print(f\"Resource Prefix: {cluster_stack.resource_name_prefix}\")\n", + "print(f\"HyperPod Cluster: {cluster_stack.hyperpod_cluster_name}\")\n", + "print(f\"EKS Cluster: {cluster_stack.eks_cluster_name}\")\n", + "print(f\"S3 Bucket: {cluster_stack.s3_bucket_name}\")\n", + "print(f\"VPC CIDR: {cluster_stack.vpc_cidr}\")\n", + "print(f\"Kubernetes Version: {cluster_stack.kubernetes_version}\")\n", + "print(f\"\\nInstance Groups:\")\n", + "for ig in cluster_stack.instance_group_settings:\n", + " print(f\" - {ig['InstanceGroupName']}: {ig['InstanceCount']}x {ig['InstanceType']}\")\n", + "print(f\"\\nInfrastructure Components:\")\n", + "print(f\" VPC Stack: {cluster_stack.create_vpc_stack}\")\n", + "print(f\" EKS Stack: {cluster_stack.create_eks_cluster_stack}\")\n", + "print(f\" HyperPod Stack: {cluster_stack.create_hyperpod_cluster_stack}\")\n", + "print(f\" FSx Stack: {cluster_stack.create_fsx_stack}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Create the Cluster\n", + "\n", + "Deploy the HyperPod cluster infrastructure using the SDK. This is equivalent to `hyp create` in the CLI.\n", + "\n", + "**Deployment includes:**\n", + "- VPC and networking infrastructure\n", + "- EKS cluster with managed node groups\n", + "- SageMaker HyperPod cluster\n", + "- IAM roles and policies\n", + "- S3 buckets for artifacts\n", + "- FSx file system (if configured)\n", + "\n", + "**Note:** This process typically takes 15-30 minutes to complete." 
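+    ,
+    "\n",
+    "\n",
+    "As a rough sketch (assuming default AWS credentials and the `stack_name` captured after `create()` below), you can also block until deployment finishes with a standard CloudFormation waiter instead of polling manually:\n",
+    "\n",
+    "```python\n",
+    "import boto3\n",
+    "\n",
+    "cf = boto3.client(\"cloudformation\", region_name=\"us-east-2\")\n",
+    "# Blocks until the stack reaches CREATE_COMPLETE; raises if creation fails\n",
+    "cf.get_waiter(\"stack_create_complete\").wait(StackName=stack_name)\n",
+    "```"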
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Create the HyperPod cluster (equivalent to hyp create)\n", + "try:\n", + " print(\"Starting cluster creation...\")\n", + " print(f\"This will create cluster: {cluster_stack.hyperpod_cluster_name}\")\n", + " \n", + " # Deploy the cluster infrastructure\n", + " response = cluster_stack.create(region=\"us-east-2\")\n", + " \n", + " print(\"\\n✅ Cluster creation initiated successfully!\")\n", + " print(f\"Stack Name: {cluster_stack.stack_name}\")\n", + " print(f\"Stack ID: {cluster_stack.stack_id}\")\n", + " \n", + " # Store cluster information for later use\n", + " cluster_name = cluster_stack.hyperpod_cluster_name\n", + " stack_name = cluster_stack.stack_name\n", + " \n", + " print(f\"\\nCluster creation is in progress. This may take 15-30 minutes.\")\n", + " print(f\"Monitor progress in the next steps.\")\n", + " \n", + "except Exception as e:\n", + " print(f\"\\n❌ Cluster creation failed: {str(e)}\")\n", + " raise" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Monitor Cluster Creation\n", + "\n", + "Monitor the cluster creation progress using SDK methods. This provides real-time status updates on the deployment process." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Monitor cluster creation progress\n", + "def monitor_cluster_creation(stack_name, max_checks=30, interval=120):\n", + " \"\"\"Monitor cluster creation progress\"\"\"\n", + " print(f\"Monitoring cluster creation progress for stack: {stack_name}\")\n", + " \n", + " for i in range(max_checks):\n", + " try:\n", + " print(f\"\\n--- Status Check {i+1}/{max_checks} ---\")\n", + " \n", + " # Check stack status\n", + " status = HpClusterStack.check_status(stack_name, region=\"us-east-2\")\n", + " print(f\"Stack Status: {status}\")\n", + " \n", + " # Check if creation is complete\n", + " if status == \"CREATE_COMPLETE\":\n", + " print(\"\\n🎉 Cluster creation completed successfully!\")\n", + " break\n", + " elif status in [\"CREATE_FAILED\", \"ROLLBACK_COMPLETE\", \"DELETE_COMPLETE\"]:\n", + " print(f\"\\n❌ Cluster creation failed with status: {status}\")\n", + " break\n", + " elif status == \"CREATE_IN_PROGRESS\":\n", + " print(\"⏳ Cluster creation still in progress...\")\n", + " \n", + " if i < max_checks - 1: # Don't sleep on the last iteration\n", + " print(f\"Waiting {interval} seconds before next check...\")\n", + " time.sleep(interval)\n", + " \n", + " except Exception as e:\n", + " print(f\"Error checking status: {str(e)}\")\n", + " break\n", + " \n", + " return status\n", + "\n", + "# Start monitoring (uncomment when cluster creation is initiated)\n", + "# final_status = monitor_cluster_creation(stack_name, max_checks=5, interval=30)\n", + "print(\"Monitoring function ready. Uncomment to start monitoring after cluster creation.\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Describe Cluster Stack\n", + "\n", + "Get detailed information about the deployed cluster using SDK methods. 
This is equivalent to `hyp describe cluster-stack` in the CLI.\n", + "\n", + "**Information provided:**\n", + "- Cluster status and health\n", + "- Resource ARNs and IDs\n", + "- Network configuration details\n", + "- Instance group information\n", + "- Storage configuration" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Get detailed information about the cluster stack (equivalent to hyp describe cluster-stack)\n", + "def describe_cluster_stack(stack_name, region=\"us-east-2\"):\n", + " \"\"\"Describe cluster stack details\"\"\"\n", + " try:\n", + " print(f\"Describing cluster stack: {stack_name}\")\n", + " \n", + " # Get stack description\n", + " response = HpClusterStack.describe(stack_name, region=region)\n", + " \n", + " if response and 'Stacks' in response and len(response['Stacks']) > 0:\n", + " stack = response['Stacks'][0]\n", + " \n", + " print(\"\\n=== Stack Information ===\")\n", + " print(f\"Stack Name: {stack.get('StackName', 'N/A')}\")\n", + " print(f\"Stack Status: {stack.get('StackStatus', 'N/A')}\")\n", + " print(f\"Creation Time: {stack.get('CreationTime', 'N/A')}\")\n", + " print(f\"Stack ID: {stack.get('StackId', 'N/A')}\")\n", + " \n", + " # Display parameters\n", + " if 'Parameters' in stack:\n", + " print(\"\\n=== Parameters ===\")\n", + " for param in stack['Parameters'][:10]: # Show first 10 parameters\n", + " print(f\" {param['ParameterKey']}: {param['ParameterValue']}\")\n", + " \n", + " # Display outputs\n", + " if 'Outputs' in stack:\n", + " print(\"\\n=== Outputs ===\")\n", + " for output in stack['Outputs'][:10]: # Show first 10 outputs\n", + " print(f\" {output['OutputKey']}: {output['OutputValue']}\")\n", + " \n", + " # Display tags\n", + " if 'Tags' in stack:\n", + " print(\"\\n=== Tags ===\")\n", + " for tag in stack['Tags']:\n", + " print(f\" {tag['Key']}: {tag['Value']}\")\n", + " \n", + " return response\n", + " \n", + " except Exception as e:\n", + " print(f\"Error describing stack: {str(e)}\")\n", + " return None\n", + "\n", + "# Describe the cluster stack (uncomment when stack exists)\n", + "# describe_cluster_stack(stack_name)\n", + "print(\"Describe function ready. Use after cluster creation is complete.\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: List All Cluster Stacks\n", + "\n", + "List all HyperPod cluster stacks in your account using SDK methods. 
This is equivalent to `hyp list cluster-stack` in the CLI.\n", + "\n", + "**Displays:**\n", + "- All cluster stacks in the current region\n", + "- Stack names and creation timestamps\n", + "- Current status of each stack\n", + "- Resource counts and types" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# List all cluster stacks (equivalent to hyp list cluster-stack)\n", + "def list_cluster_stacks(region=\"us-east-2\"):\n", + " \"\"\"List all cluster stacks in the account\"\"\"\n", + " try:\n", + " print(f\"Listing cluster stacks in region: {region}\")\n", + " \n", + " # Get list of stacks\n", + " response = HpClusterStack.list(region=region)\n", + " \n", + " if response and 'StackSummaries' in response:\n", + " stacks = response['StackSummaries']\n", + " \n", + " print(f\"\\n=== Found {len(stacks)} Stack(s) ===\")\n", + " \n", + " if stacks:\n", + " print(f\"{'Stack Name':<40} {'Status':<25} {'Creation Time':<20}\")\n", + " print(\"-\" * 85)\n", + " \n", + " for stack in stacks:\n", + " name = stack.get('StackName', 'N/A')[:39]\n", + " status = stack.get('StackStatus', 'N/A')[:24]\n", + " created = str(stack.get('CreationTime', 'N/A'))[:19]\n", + " print(f\"{name:<40} {status:<25} {created:<20}\")\n", + " else:\n", + " print(\"No cluster stacks found.\")\n", + " \n", + " return response\n", + " \n", + " except Exception as e:\n", + " print(f\"Error listing stacks: {str(e)}\")\n", + " return None\n", + "\n", + "# List all cluster stacks\n", + "list_response = list_cluster_stacks()\n", + "\n", + "# Filter for HyperPod-related stacks\n", + "if list_response and 'StackSummaries' in list_response:\n", + " hyperpod_stacks = [\n", + " stack for stack in list_response['StackSummaries']\n", + " if 'hyperpod' in stack.get('StackName', '').lower()\n", + " ]\n", + " \n", + " if hyperpod_stacks:\n", + " print(f\"\\n=== HyperPod Stacks ({len(hyperpod_stacks)}) ===\")\n", + " for stack in hyperpod_stacks:\n", + " print(f\" - {stack['StackName']} ({stack['StackStatus']})\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Update Cluster Configuration\n", + "\n", + "Update the existing cluster configuration using sagemaker-core's Cluster class. This is equivalent to `hyp update cluster` in the CLI.\n", + "\n", + "**Common update scenarios:**\n", + "- Scaling instance groups up or down\n", + "- Adding new instance types\n", + "- Updating cluster tags\n", + "- Modifying storage configurations\n", + "\n", + "**Note:** Some changes may require cluster restart or recreation." 
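+    ,
+    "\n",
+    "\n",
+    "For reference, the CLI equivalent documented in the cluster management guide looks like this (the cluster name and instance-group JSON are placeholders):\n",
+    "\n",
+    "```bash\n",
+    "hyp update hyp-cluster \\\n",
+    "    --cluster-name your-cluster-name \\\n",
+    "    --instance-groups \"[]\" \\\n",
+    "    --region your-region\n",
+    "```"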
+ ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Update cluster configuration using sagemaker-core Cluster class\n", + "def update_cluster(cluster_name, region=\"us-east-2\"):\n", + " \"\"\"Update cluster configuration (equivalent to hyp update cluster)\"\"\"\n", + " try:\n", + " print(f\"Updating cluster: {cluster_name}\")\n", + " \n", + " # Get existing cluster using sagemaker-core\n", + " cluster = Cluster.get(cluster_name=cluster_name)\n", + " \n", + " print(f\"\\nCurrent cluster status: {cluster.cluster_status}\")\n", + " print(f\"Current instance groups: {len(cluster.instance_groups)}\")\n", + " \n", + " # Display current instance groups\n", + " print(\"\\n=== Current Instance Groups ===\")\n", + " for ig in cluster.instance_groups:\n", + " print(f\" - {ig.instance_group_name}: {ig.current_count}x {ig.instance_type}\")\n", + " \n", + " # Example: Update cluster tags\n", + " updated_tags = [\n", + " {\"Key\": \"Environment\", \"Value\": \"Development\"},\n", + " {\"Key\": \"Project\", \"Value\": \"MLTraining\"},\n", + " {\"Key\": \"Owner\", \"Value\": \"DataScienceTeam\"},\n", + " {\"Key\": \"CostCenter\", \"Value\": \"ML-Research\"},\n", + " {\"Key\": \"UpdatedBy\", \"Value\": \"SDK-Example\"},\n", + " {\"Key\": \"LastUpdated\", \"Value\": str(time.time())}\n", + " ]\n", + " \n", + " # Update cluster with new tags\n", + " cluster.update(tags=updated_tags)\n", + " \n", + " print(\"\\n✅ Cluster updated successfully!\")\n", + " print(\"Updated tags:\")\n", + " for tag in updated_tags:\n", + " print(f\" {tag['Key']}: {tag['Value']}\")\n", + " \n", + " return cluster\n", + " \n", + " except Exception as e:\n", + " print(f\"Error updating cluster: {str(e)}\")\n", + " return None\n", + "\n", + "# Example: Scale instance group\n", + "def scale_instance_group(cluster_name, instance_group_name, target_count, region=\"us-east-2\"):\n", + " \"\"\"Scale an instance group to target count\"\"\"\n", + " try:\n", + " print(f\"Scaling instance group '{instance_group_name}' to {target_count} instances\")\n", + " \n", + " # Get cluster\n", + " cluster = Cluster.get(cluster_name=cluster_name)\n", + " \n", + " # Find the instance group\n", + " target_ig = None\n", + " for ig in cluster.instance_groups:\n", + " if ig.instance_group_name == instance_group_name:\n", + " target_ig = ig\n", + " break\n", + " \n", + " if not target_ig:\n", + " print(f\"Instance group '{instance_group_name}' not found\")\n", + " return None\n", + " \n", + " print(f\"Current count: {target_ig.current_count}\")\n", + " print(f\"Target count: {target_count}\")\n", + " \n", + " # Update instance group count\n", + " target_ig.target_count = target_count\n", + " \n", + " # Apply the update\n", + " cluster.update(instance_groups=[target_ig])\n", + " \n", + " print(f\"\\n✅ Instance group scaling initiated!\")\n", + " \n", + " return cluster\n", + " \n", + " except Exception as e:\n", + " print(f\"Error scaling instance group: {str(e)}\")\n", + " return None\n", + "\n", + "# Update functions ready (uncomment when cluster exists)\n", + "# updated_cluster = update_cluster(cluster_name)\n", + "# scaled_cluster = scale_instance_group(cluster_name, \"controller-group\", 2)\n", + "\n", + "print(\"Update functions ready. 
Use after cluster creation is complete.\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: Verify Cluster Status and Health\n", + "\n", + "Verify that the cluster is healthy and ready for workloads using comprehensive status checks." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Comprehensive cluster health check\n", + "def check_cluster_health(cluster_name, region=\"us-east-2\"):\n", + " \"\"\"Perform comprehensive cluster health check\"\"\"\n", + " try:\n", + " print(f\"Checking health for cluster: {cluster_name}\")\n", + " \n", + " # Get cluster details\n", + " cluster = Cluster.get(cluster_name=cluster_name)\n", + " \n", + " print(\"\\n=== Cluster Health Summary ===\")\n", + " print(f\"Cluster Name: {cluster.cluster_name}\")\n", + " print(f\"Cluster Status: {cluster.cluster_status}\")\n", + " print(f\"Creation Time: {cluster.creation_time}\")\n", + " print(f\"Cluster ARN: {cluster.cluster_arn}\")\n", + " \n", + " # Check instance groups health\n", + " print(\"\\n=== Instance Groups Health ===\")\n", + " total_instances = 0\n", + " healthy_instances = 0\n", + " \n", + " for ig in cluster.instance_groups:\n", + " print(f\"\\nInstance Group: {ig.instance_group_name}\")\n", + " print(f\" Instance Type: {ig.instance_type}\")\n", + " print(f\" Current Count: {ig.current_count}\")\n", + " print(f\" Target Count: {getattr(ig, 'target_count', 'N/A')}\")\n", + " print(f\" Status: {getattr(ig, 'instance_group_status', 'N/A')}\")\n", + " \n", + " total_instances += ig.current_count\n", + " if getattr(ig, 'instance_group_status', '') == 'InService':\n", + " healthy_instances += ig.current_count\n", + " \n", + " print(f\"\\n=== Overall Health ===\")\n", + " print(f\"Total Instances: {total_instances}\")\n", + " print(f\"Healthy Instances: {healthy_instances}\")\n", + " health_percentage = (healthy_instances / total_instances * 100) if total_instances > 0 else 0\n", + " print(f\"Health Percentage: {health_percentage:.1f}%\")\n", + " \n", + " # Determine overall health status\n", + " if cluster.cluster_status == 'InService' and health_percentage >= 80:\n", + " print(\"\\n🟢 Cluster is HEALTHY and ready for workloads\")\n", + " elif cluster.cluster_status == 'Creating':\n", + " print(\"\\n🟡 Cluster is still CREATING\")\n", + " else:\n", + " print(\"\\n🔴 Cluster may have ISSUES - check individual components\")\n", + " \n", + " return cluster\n", + " \n", + " except Exception as e:\n", + " print(f\"Error checking cluster health: {str(e)}\")\n", + " return None\n", + "\n", + "# Health check function ready (uncomment when cluster exists)\n", + "# cluster_health = check_cluster_health(cluster_name)\n", + "\n", + "print(\"Health check function ready. Use after cluster creation is complete.\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "After successfully creating your HyperPod cluster using the SDK, you can:\n", + "\n", + "1. **Submit Training Jobs**: Use HyperPod SDK training classes for distributed training\n", + "2. **Deploy Inference Endpoints**: Use HyperPod SDK inference classes for model serving\n", + "3. **Monitor Resources**: Use SDK methods to check pod and job status\n", + "4. **Access Logs**: Retrieve training and system logs programmatically\n", + "5. 
**Scale Cluster**: Modify instance groups using the Cluster class\n", + "\n", + "## Troubleshooting\n", + "\n", + "If you encounter issues during cluster creation:\n", + "\n", + "- Check AWS CloudFormation console for detailed error messages\n", + "- Verify AWS credentials and permissions using `boto3.Session()`\n", + "- Ensure resource quotas are sufficient\n", + "- Review the cluster configuration parameters\n", + "\n", + "## Cleanup\n", + "\n", + "To avoid ongoing charges, remember to delete your cluster when no longer needed:\n", + "\n", + "```python\n", + "# Delete cluster using sagemaker-core\n", + "cluster = Cluster.get(cluster_name=cluster_name)\n", + "cluster.delete()\n", + "\n", + "# Or delete the entire stack\n", + "import boto3\n", + "cf_client = boto3.client('cloudformation', region_name='us-east-2')\n", + "cf_client.delete_stack(StackName=stack_name)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook demonstrated the complete HyperPod cluster creation workflow using the SDK:\n", + "\n", + "✅ **Initialized** cluster configuration with `HpClusterStack` class \n", + "✅ **Configured** cluster settings and tags programmatically \n", + "✅ **Created** cluster infrastructure with `cluster_stack.create()` \n", + "✅ **Monitored** deployment with `HpClusterStack.check_status()` \n", + "✅ **Listed** all clusters with `HpClusterStack.list()` \n", + "✅ **Updated** cluster configuration with `Cluster.update()` \n", + "✅ **Verified** cluster health with comprehensive checks \n", + "\n", + "Your HyperPod cluster is now ready for distributed machine learning workloads using the SDK!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/__init__.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml new file mode 100644 index 00000000..bd019b6c --- /dev/null +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml @@ -0,0 +1,1124 @@ +Description: Main Stack for EKS based HyperPod Cluster +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: General Settings + Parameters: + - ResourceNamePrefix + - Stage + - NodeRecovery + - Tags + - Label: + default: Networking + Parameters: + - CreateVPCStack + - VpcId + - VpcCIDR + - AvailabilityZoneIds + - CreateSecurityGroupStack + - SecurityGroupId + - SecurityGroupIds + - CreatePrivateSubnetStack + - PrivateSubnetIds + - EksPrivateSubnetIds + - NatGatewayIds + - PrivateRouteTableIds + - CreateS3EndpointStack + - Label: + default: Orchestration + Parameters: + - CreateEKSClusterStack + - EKSClusterName + - KubernetesVersion + - CreateHelmChartStack + - HelmRepoUrl + - HelmRepoPath + - HelmRelease + - Namespace + - HelmOperators + - Label: + default: Lifecycle Configuration + Parameters: + - CreateLifeCycleScriptStack 
+ - CreateS3BucketStack + - S3BucketName + - GithubRawUrl + - OnCreatePath + - Label: + default: Permissions + Parameters: + - CreateSageMakerIAMRoleStack + - SageMakerIAMRoleName + - Label: + default: Storage + Parameters: + - CreateFsxStack + - FsxFileSystemId + - FsxSubnetId + - FsxAvailabilityZone + - StorageCapacity + - PerUnitStorageThroughput + - DataCompressionType + - FileSystemTypeVersion + - Label: + default: HyperPod Cluster + Parameters: + - CreateHyperPodClusterStack + - HyperPodClusterName + - Label: + default: Instance Groups + Parameters: + - InstanceGroupSettings1 + - InstanceGroupSettings2 + - InstanceGroupSettings3 + - InstanceGroupSettings4 + - InstanceGroupSettings5 + - InstanceGroupSettings6 + - InstanceGroupSettings7 + - InstanceGroupSettings8 + - InstanceGroupSettings9 + - InstanceGroupSettings10 + - InstanceGroupSettings11 + - InstanceGroupSettings12 + - InstanceGroupSettings13 + - InstanceGroupSettings14 + - InstanceGroupSettings15 + - InstanceGroupSettings16 + - InstanceGroupSettings17 + - InstanceGroupSettings18 + - InstanceGroupSettings19 + - InstanceGroupSettings20 + - Label: + default: Restricted Instance Groups + Parameters: + - RigSettings1 + - RigSettings2 + - RigSettings3 + - RigSettings4 + - RigSettings5 + - RigSettings6 + - RigSettings7 + - RigSettings8 + - RigSettings9 + - RigSettings10 + - RigSettings11 + - RigSettings12 + - RigSettings13 + - RigSettings14 + - RigSettings15 + - RigSettings16 + - RigSettings17 + - RigSettings18 + - RigSettings19 + - RigSettings20 + ParameterLabels: + ResourceNamePrefix: + default: Resource Name Prefix + Stage: + default: Deployment Stage + NodeRecovery: + default: Instance Recovery + Tags: + default: Resource Tags + CreateVPCStack: + default: Create New VPC + VpcId: + default: Existing VPC ID + VpcCIDR: + default: VPC CIDR Range + AvailabilityZoneIds: + default: Availability Zone IDs + CreateSecurityGroupStack: + default: Create New Security Group + SecurityGroupId: + default: Existing Security Group ID + SecurityGroupIds: + default: Security Group IDs + CreatePrivateSubnetStack: + default: Create Private Subnets + PrivateSubnetIds: + default: Private Subnet IDs + EksPrivateSubnetIds: + default: EKS Private Subnet IDs + NatGatewayIds: + default: NAT Gateway IDs + PrivateRouteTableIds: + default: Private Route Table IDs + CreateS3EndpointStack: + default: Create S3 Endpoint + CreateEKSClusterStack: + default: Create New EKS Cluster + EKSClusterName: + default: EKS Cluster Name + KubernetesVersion: + default: Kubernetes Version + CreateHelmChartStack: + default: Install Helm Charts + HelmRepoUrl: + default: Helm Repository URL + HelmRepoPath: + default: Helm Chart Path + HelmRelease: + default: Helm Release Name + Namespace: + default: Kubernetes Namespace + HelmOperators: + default: Enabled Operators + CreateLifeCycleScriptStack: + default: Create Lifecycle Scripts + CreateS3BucketStack: + default: Create New S3 Bucket + S3BucketName: + default: S3 Bucket Name + GithubRawUrl: + default: GitHub Raw URL + OnCreatePath: + default: OnCreate Script Path + CreateSageMakerIAMRoleStack: + default: Create New IAM Role + SageMakerIAMRoleName: + default: IAM Role Name + CreateFsxStack: + default: Create New FSx for Lustre File System + FsxFileSystemId: + default: Existing FSx File System ID + FsxSubnetId: + default: FSx Subnet ID + FsxAvailabilityZone: + default: FSx Availability Zone + StorageCapacity: + default: Storage Capacity (GB) + PerUnitStorageThroughput: + default: Per-unit Storage Throughput (MB/s/TiB) + 
DataCompressionType:
+      default: Compression Type
+    FileSystemTypeVersion:
+      default: Lustre Version
+    CreateHyperPodClusterStack:
+      default: Create HyperPod Cluster
+    HyperPodClusterName:
+      default: HyperPod Cluster Name
+Parameters:
+  Stage:
+    Type: String
+    Default: prod
+    AllowedValues:
+      - gamma
+      - prod
+    Description: Deployment stage (gamma, prod)
+  EnableHPInferenceFeature:
+    Type: String
+    Default: 'false'
+    Description: Feature flag for enabling HP inference
+  CustomBucketName:
+    Type: String
+    Default: ''
+    Description: Custom S3 bucket name for templates
+  ResourceNamePrefix:
+    Type: String
+    Default: hyperpod-cli-integ-test
+    Description: Prefix to be used for all resources created by this template.
+  VpcCIDR:
+    Type: String
+    Default: 10.192.0.0/16
+    Description: The IP range (CIDR notation) for the VPC.
+  AvailabilityZoneIds:
+    Type: String
+    Default: use2-az1,use2-az2,use2-az3
+    Description: List of AZs to deploy subnets in (up to 5, comma-separated)
+  NodeProvisioningMode:
+    Type: String
+    Default: Continuous
+    Description: The node provisioning mode
+  VpcId:
+    Type: String
+    Default: ''
+    Description: The ID of the VPC you wish to use if you do not want to create a new VPC.
+  NatGatewayIds:
+    Type: String
+    Default: ''
+    Description: Comma-separated list of NAT Gateway IDs to route internet-bound traffic to from the newly created private subnets.
+  SecurityGroupId:
+    Type: String
+    Default: ''
+    Description: The ID of the security group associated with an existing EKS cluster.
+  KubernetesVersion:
+    Type: String
+    Default: '1.31'
+    Description: The Kubernetes version to use for the EKS cluster.
+  EKSClusterName:
+    Type: String
+    Default: eks
+    Description: The name of the newly created or preexisting EKS cluster you wish to use.
+  EksPrivateSubnetIds:
+    Type: String
+    Default: ''
+    Description: Comma-delimited list of private subnet IDs for the EKS cluster
+  SecurityGroupIds:
+    Type: String
+    Default: ''
+    Description: The IDs of your cluster security groups.
+  PrivateRouteTableIds:
+    Type: String
+    Default: ''
+    Description: Comma-separated list of private route table IDs.
+  S3BucketName:
+    Type: String
+    Default: s3-bucket
+    Description: The name of the S3 bucket used to store the cluster lifecycle scripts.
+  GithubRawUrl:
+    Type: String
+    Default: >-
+      https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh
+    Description: The raw GitHub URL for the lifecycle script.
+  HelmRepoUrl:
+    Type: String
+    Default: https://github.com/aws/sagemaker-hyperpod-cli.git
+    Description: The URL of the Helm repo containing the HyperPod Helm chart.
+  HelmRepoPath:
+    Type: String
+    Default: helm_chart/HyperPodHelmChart
+    Description: The path to the HyperPod Helm chart in the Helm repo.
+  HelmOperators:
+    Type: String
+    Default: 'mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true'
+    Description: The configuration of the HyperPod Helm chart
+  Namespace:
+    Type: String
+    Default: kube-system
+    Description: The namespace to deploy the HyperPod Helm chart into. 
+ HelmRelease: + Type: String + Default: dependencies + Description: The name of the Helm release. + HyperPodClusterName: + Type: String + Default: hyperpod-cluster-integ-test + Description: Name of SageMaker HyperPod Cluster. + NodeRecovery: + Type: String + Default: Automatic + AllowedValues: + - Automatic + - None + Description: Specifies whether to enable or disable the automatic node recovery feature (Automatic or None). + SageMakerIAMRoleName: + Type: String + Default: iam-role + Description: The name of the IAM role that SageMaker will use to access the AWS resources on your behalf. + PrivateSubnetIds: + Type: String + Default: '' + Description: Comma-separated list of private subnet IDs for EKS cluster. + OnCreatePath: + Type: String + Default: sagemaker-hyperpod-eks-bucket + Description: >- + The file name of lifecycle script for the general purpose instance group. This script runs during cluster + creation. + InstanceGroupSettings1: + Type: String + Default: >- + [{"InstanceCount":1,"InstanceGroupName":"controller-group","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}] + Description: JSON array string containing instance group configurations. + RigS3BucketName: + Type: String + Default: '' + Description: The name of the S3 bucket for RIG resources + RigSettings1: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings2: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings2: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings3: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings3: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings4: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings4: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings5: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings5: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings6: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings6: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings7: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings7: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings8: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings8: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings9: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. 
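Each `InstanceGroupSettings<N>` parameter carries a JSON array serialized into a single string, as the `InstanceGroupSettings1` default above shows. A short sketch of producing such a value, mirroring that default (the values themselves are examples only):

```python
import json

# Shape mirrors the InstanceGroupSettings1 default above.
controller_group = {
    "InstanceCount": 1,
    "InstanceGroupName": "controller-group",
    "InstanceType": "ml.t3.medium",
    "TargetAvailabilityZoneId": "use2-az2",
    "ThreadsPerCore": 1,
    "InstanceStorageConfigs": [{"EbsVolumeConfig": {"VolumeSizeInGB": 500}}],
}

# Each InstanceGroupSettings<N> parameter takes the JSON array as one string.
instance_group_settings_1 = json.dumps([controller_group])
```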
+ RigSettings9: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings10: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings10: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings11: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings11: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings12: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings12: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings13: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings13: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings14: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings14: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings15: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings15: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings16: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings16: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings17: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings17: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings18: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings18: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings19: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings19: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + InstanceGroupSettings20: + Type: String + Default: '[]' + Description: JSON array string containing instance group configurations. + RigSettings20: + Type: String + Default: '[]' + Description: JSON array string containing restricted instance group configurations. + Tags: + Type: String + Default: '[]' + Description: Custom tags for managing the SageMaker HyperPod cluster as an AWS resource. 
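`Tags` is likewise a JSON array string. Assuming the usual AWS Key/Value tag shape (an assumption; the template only documents it as custom tags), it can be produced the same way:

```python
import json

# Assumed Key/Value tag shape for SageMaker resources; values are examples.
tags = json.dumps([
    {"Key": "team", "Value": "ml-platform"},
    {"Key": "env", "Value": "dev"},
])
```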
+ FsxSubnetId: + Type: String + Default: '' + Description: The subnet id that will be used to create FSx + FsxAvailabilityZone: + Type: String + Default: use2-az2 + Description: The availability zone to get subnet id that will be used to create FSx + PerUnitStorageThroughput: + Type: Number + Default: 250 + Description: Per unit storage throughput for the FSx file system + DataCompressionType: + Type: String + Default: NONE + AllowedValues: + - NONE + - LZ4 + Description: Data compression type for the FSx file system (NONE, LZ4) + FileSystemTypeVersion: + Type: Number + Default: 2.15 + Description: File system type version for the FSx file system + StorageCapacity: + Type: Number + Default: 1200 + Description: Storage capacity for the FSx file system in GiB + FsxFileSystemId: + Type: String + Default: '' + Description: Existing FSx for Lustre file system + CreateVPCStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create VPC Stack + CreatePrivateSubnetStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Private Subnet Stack + CreateSecurityGroupStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Security Group Stack + CreateEKSClusterStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create EKS Cluster Stack + CreateS3BucketStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create S3 Bucket Stack + CreateS3EndpointStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create S3 Endpoint Stack + CreateLifeCycleScriptStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Life Cycle Script Stack + CreateSageMakerIAMRoleStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create SageMaker IAM Role Stack + CreateHelmChartStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create Helm Chart Stack + CreateHyperPodClusterStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create HyperPod Cluster Stack + CreateFsxStack: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Boolean to Create FSx for Lustre File System Stack +Conditions: + CreateVPCStackCondition: + Fn::Equals: + - Ref: CreateVPCStack + - 'true' + CreatePrivateSubnetStackCondition: + Fn::Equals: + - Ref: CreatePrivateSubnetStack + - 'true' + CreateSecurityGroupStackCondition: + Fn::Equals: + - Ref: CreateSecurityGroupStack + - 'true' + CreateEKSClusterStackCondition: + Fn::Equals: + - Ref: CreateEKSClusterStack + - 'true' + CreateS3BucketStackCondition: + Fn::Equals: + - Ref: CreateS3BucketStack + - 'true' + CreateS3EndpointStackCondition: + Fn::Equals: + - Ref: CreateS3EndpointStack + - 'true' + CreateLifeCycleScriptStackCondition: + Fn::Equals: + - Ref: CreateLifeCycleScriptStack + - 'true' + CreateSageMakerIAMRoleStackCondition: + Fn::Equals: + - Ref: CreateSageMakerIAMRoleStack + - 'true' + CreateHelmChartStackCondition: + Fn::Equals: + - Ref: CreateHelmChartStack + - 'true' + CreateHyperPodClusterStackCondition: + Fn::And: + - Fn::Equals: + - Ref: CreateHyperPodClusterStack + - 'true' + - Fn::Not: + - Fn::And: + - Fn::Equals: + - Ref: 
CreateEKSClusterStack + - 'true' + - Fn::Equals: + - Ref: CreateHelmChartStack + - 'false' + CreateFsxStackCondition: + Fn::Equals: + - Ref: CreateFsxStack + - 'true' +Resources: + VPCStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/vpc-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcCIDR: + Ref: VpcCIDR + AvailabilityZoneIds: + Fn::Join: + - ',' + - - Ref: AvailabilityZoneIds + - ',,,' + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/VPCStack + Condition: CreateVPCStackCondition + PrivateSubnetStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/private-subnet-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + VpcCidrBlock: + Ref: VpcCIDR + AvailabilityZoneIds: + Fn::Join: + - ',' + - - Ref: AvailabilityZoneIds + - ',,,' + NatGatewayIds: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.NatGatewayIds + - Ref: NatGatewayIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/PrivateSubnetStack + Condition: CreatePrivateSubnetStackCondition + SecurityGroupStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/security-group-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + SecurityGroupId: + Ref: SecurityGroupId + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/SecurityGroupStack + Condition: CreateSecurityGroupStackCondition + EKSClusterStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/eks-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + KubernetesVersion: + Ref: KubernetesVersion + EKSClusterName: + Ref: EKSClusterName + EksPrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.EksPrivateSubnetIds + - Ref: PrivateSubnetIds + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/EKSClusterStack + Condition: CreateEKSClusterStackCondition + S3BucketStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/s3-bucket-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/S3BucketStack + Condition: CreateS3BucketStackCondition + S3EndpointStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + 
https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/s3-endpoint-template.yaml + Parameters: + VpcId: + Fn::If: + - CreateVPCStackCondition + - Fn::GetAtt: + - VPCStack + - Outputs.VpcId + - Ref: VpcId + PrivateRouteTableIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateRouteTableIds + - Ref: PrivateRouteTableIds + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/S3EndpointStack + Condition: CreateS3EndpointStackCondition + LifeCycleScriptStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/lifecycle-script-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/LifeCycleScriptStack + Condition: CreateLifeCycleScriptStackCondition + SageMakerIAMRoleStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/sagemaker-iam-role-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/SageMakerIAMRoleStack + Condition: CreateSageMakerIAMRoleStackCondition + HelmChartStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/helm-chart-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmRepoUrl: + Ref: HelmRepoUrl + HelmRepoPath: + Ref: HelmRepoPath + Namespace: + Ref: Namespace + HelmRelease: + Ref: HelmRelease + HelmOperators: + Ref: HelmOperators + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/HelmChartStack + Condition: CreateHelmChartStackCondition + HyperPodClusterStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/hyperpod-cluster-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmChartStatus: + Fn::If: + - CreateHelmChartStackCondition + - Fn::GetAtt: + - HelmChartStack + - Outputs.HelmChartDeploymentComplete + - HelmChartNotRequired + HyperPodClusterName: + Ref: HyperPodClusterName + NodeRecovery: + Ref: NodeRecovery + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + PrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + - Ref: PrivateSubnetIds + 
CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + SageMakerIAMRoleName: + Fn::If: + - CreateSageMakerIAMRoleStackCondition + - Fn::GetAtt: + - SageMakerIAMRoleStack + - Outputs.SageMakerIAMRoleName + - Ref: SageMakerIAMRoleName + S3BucketName: + Fn::If: + - CreateS3BucketStackCondition + - Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + - Ref: S3BucketName + OnCreatePath: + Fn::If: + - CreateS3BucketStackCondition + - on_create.sh + - Ref: OnCreatePath + InstanceGroupSettings1: + Ref: InstanceGroupSettings1 + InstanceGroupSettings2: + Ref: InstanceGroupSettings2 + InstanceGroupSettings3: + Ref: InstanceGroupSettings3 + InstanceGroupSettings4: + Ref: InstanceGroupSettings4 + InstanceGroupSettings5: + Ref: InstanceGroupSettings5 + InstanceGroupSettings6: + Ref: InstanceGroupSettings6 + InstanceGroupSettings7: + Ref: InstanceGroupSettings7 + InstanceGroupSettings8: + Ref: InstanceGroupSettings8 + InstanceGroupSettings9: + Ref: InstanceGroupSettings9 + InstanceGroupSettings10: + Ref: InstanceGroupSettings10 + InstanceGroupSettings11: + Ref: InstanceGroupSettings11 + InstanceGroupSettings12: + Ref: InstanceGroupSettings12 + InstanceGroupSettings13: + Ref: InstanceGroupSettings13 + InstanceGroupSettings14: + Ref: InstanceGroupSettings14 + InstanceGroupSettings15: + Ref: InstanceGroupSettings15 + InstanceGroupSettings16: + Ref: InstanceGroupSettings16 + InstanceGroupSettings17: + Ref: InstanceGroupSettings17 + InstanceGroupSettings18: + Ref: InstanceGroupSettings18 + InstanceGroupSettings19: + Ref: InstanceGroupSettings19 + InstanceGroupSettings20: + Ref: InstanceGroupSettings20 + RigSettings1: + Ref: RigSettings1 + RigSettings2: + Ref: RigSettings2 + RigSettings3: + Ref: RigSettings3 + RigSettings4: + Ref: RigSettings4 + RigSettings5: + Ref: RigSettings5 + RigSettings6: + Ref: RigSettings6 + RigSettings7: + Ref: RigSettings7 + RigSettings8: + Ref: RigSettings8 + RigSettings9: + Ref: RigSettings9 + RigSettings10: + Ref: RigSettings10 + RigSettings11: + Ref: RigSettings11 + RigSettings12: + Ref: RigSettings12 + RigSettings13: + Ref: RigSettings13 + RigSettings14: + Ref: RigSettings14 + RigSettings15: + Ref: RigSettings15 + RigSettings16: + Ref: RigSettings16 + RigSettings17: + Ref: RigSettings17 + RigSettings18: + Ref: RigSettings18 + RigSettings19: + Ref: RigSettings19 + RigSettings20: + Ref: RigSettings20 + Tags: + Ref: Tags + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/HyperPodClusterStack + Condition: CreateHyperPodClusterStackCondition + FsxStack: + Type: AWS::CloudFormation::Stack + Properties: + TemplateURL: + Fn::Sub: >- + https://aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage}.s3.${AWS::Region}.amazonaws.com/templates/fsx-template.yaml + Parameters: + ResourceNamePrefix: + Ref: ResourceNamePrefix + HelmChartStatus: + Fn::If: + - CreateHelmChartStackCondition + - Fn::GetAtt: + - HelmChartStack + - Outputs.HelmChartDeploymentComplete + - HelmChartNotRequired + EKSClusterName: + Fn::If: + - CreateEKSClusterStackCondition + - Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + - Ref: EKSClusterName + CustomResourceS3Bucket: + Fn::Sub: aws-sagemaker-hyperpod-cluster-setup-${AWS::Region}-${Stage} + PrivateSubnetIds: + Fn::If: + - CreatePrivateSubnetStackCondition + - Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + - Ref: PrivateSubnetIds + FsxSubnetId: + Ref: FsxSubnetId + FsxAvailabilityZone: + Ref: FsxAvailabilityZone + SecurityGroupIds: + Fn::If: + - CreateSecurityGroupStackCondition + - 
Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + - Ref: SecurityGroupIds + PerUnitStorageThroughput: + Ref: PerUnitStorageThroughput + DataCompressionType: + Ref: DataCompressionType + FileSystemTypeVersion: + Ref: FileSystemTypeVersion + StorageCapacity: + Ref: StorageCapacity + FsxFileSystemId: + Ref: FsxFileSystemId + Metadata: + aws:cdk:path: MainEksBasedCfnTemplate/FsxStack + Condition: CreateFsxStackCondition +Outputs: + OutputVpcId: + Value: + Fn::GetAtt: + - VPCStack + - Outputs.VpcId + Condition: CreateVPCStackCondition + OutputPrivateSubnetIds: + Value: + Fn::GetAtt: + - PrivateSubnetStack + - Outputs.PrivateSubnetIds + Condition: CreatePrivateSubnetStackCondition + OutputSecurityGroupId: + Value: + Fn::GetAtt: + - SecurityGroupStack + - Outputs.SecurityGroupId + Condition: CreateSecurityGroupStackCondition + OutputEKSClusterArn: + Value: + Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterArn + Condition: CreateEKSClusterStackCondition + OutputEKSClusterName: + Value: + Fn::GetAtt: + - EKSClusterStack + - Outputs.EKSClusterName + Condition: CreateEKSClusterStackCondition + OutputSageMakerIAMRoleArn: + Value: + Fn::GetAtt: + - SageMakerIAMRoleStack + - Outputs.SageMakerIAMRoleArn + Condition: CreateSageMakerIAMRoleStackCondition + OutputS3BucketName: + Value: + Fn::GetAtt: + - S3BucketStack + - Outputs.S3BucketName + Condition: CreateS3BucketStackCondition + OutputHyperPodClusterName: + Value: + Fn::GetAtt: + - HyperPodClusterStack + - Outputs.HyperPodClusterName + Condition: CreateHyperPodClusterStackCondition + OutputHyperPodClusterArn: + Value: + Fn::GetAtt: + - HyperPodClusterStack + - Outputs.HyperPodClusterArn + Condition: CreateHyperPodClusterStackCondition diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/__init__.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py new file mode 100644 index 00000000..cd5d50a0 --- /dev/null +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -0,0 +1,53 @@ +from pydantic import BaseModel, Field +from typing import Optional, Literal, List, Any, Union + +class ClusterStackBase(BaseModel): + resource_name_prefix: Optional[str] = Field("hyp-eks-stack", description="Prefix to be used for all resources. 
A 4-digit UUID will be added to prefix during submission") + create_hyperpod_cluster_stack: Optional[bool] = Field(True, description="Boolean to Create HyperPod Cluster Stack") + hyperpod_cluster_name: Optional[str] = Field("hyperpod-cluster", description="Name of SageMaker HyperPod Cluster") + create_eks_cluster_stack: Optional[bool] = Field(True, description="Boolean to Create EKS Cluster Stack") + kubernetes_version: Optional[str] = Field("1.31", description="The Kubernetes version") + eks_cluster_name: Optional[str] = Field("eks-cluster", description="The name of the EKS cluster") + create_helm_chart_stack: Optional[bool] = Field(True, description="Boolean to Create Helm Chart Stack") + namespace: Optional[str] = Field("kube-system", description="The namespace to deploy the HyperPod Helm chart") + helm_repo_url: str = Field("https://github.com/aws/sagemaker-hyperpod-cli.git", description="The URL of the Helm repo containing the HyperPod Helm chart (fixed default)") + helm_repo_path: str = Field("helm_chart/HyperPodHelmChart", description="The path to the HyperPod Helm chart in the Helm repo (fixed default)") + helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart") + helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release") + node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty") + node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"") + instance_group_settings: Union[List[Any], None] = Field([{"InstanceCount":1,"InstanceGroupName":"controller-group","InstanceType":"ml.t3.medium","TargetAvailabilityZoneId":"use2-az2","ThreadsPerCore":1,"InstanceStorageConfigs":[{"EbsVolumeConfig":{"VolumeSizeInGB":500}}]}], description="List of string containing instance group configurations") + rig_settings: Union[List[Any], None] = Field(None, description="List of string containing restricted instance group configurations") + rig_s3_bucket_name: Optional[str] = Field(None, description="The name of the S3 bucket used to store the RIG resources") + tags: Union[List[Any], None] = Field(None, description="Custom tags for managing the SageMaker HyperPod cluster as an AWS resource") + create_vpc_stack: Optional[bool] = Field(True, description="Boolean to Create VPC Stack") + vpc_id: Optional[str] = Field(None, description="The ID of the VPC you wish to use if you do not want to create a new VPC") + vpc_cidr: Optional[str] = Field("10.192.0.0/16", description="The IP range (CIDR notation) for the VPC") + availability_zone_ids: Union[List[str], None] = Field(None, description="List of AZs in submission region to deploy subnets in. Must be provided in YAML format starting with \"-\" below. 
Example: - use2-az1 for us-east-2 region") + create_security_group_stack: Optional[bool] = Field(True, description="Boolean to Create Security Group Stack") + security_group_id: Optional[str] = Field(None, description="The ID of the security group you wish to use in SecurityGroup substack if you do not want to create a new one") + security_group_ids: Union[List[str], None] = Field(None, description="The security groups you wish to use for Hyperpod cluster if you do not want to create new ones") + private_subnet_ids: Union[List[str], None] = Field(None, description="List of private subnet IDs used for HyperPod cluster if you do not want to create VPC stack") + eks_private_subnet_ids: Union[List[str], None] = Field(None, description="List of private subnet IDs for the EKS cluster if you do not want to create VPC stack") + nat_gateway_ids: Union[List[str], None] = Field(None, description="List of NAT Gateway IDs to route internet bound traffic if you do not want to create VPC stack") + private_route_table_ids: Union[List[str], None] = Field(None, description="List of private route table IDs if you do not want to create VPC stack") + create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack") + enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster") + stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"") + custom_bucket_name: str = Field("sagemaker-hyperpod-cluster-stack-bucket", description="S3 bucket name for templates") + create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack") + create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack") + s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts") + github_raw_url: str = Field("https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh", description="The raw GitHub URL for the lifecycle script (fixed default)") + on_create_path: Optional[str] = Field("sagemaker-hyperpod-eks-bucket", description="The file name of lifecycle script") + create_sagemaker_iam_role_stack: Optional[bool] = Field(True, description="Boolean to Create SageMaker IAM Role Stack") + sagemaker_iam_role_name: Optional[str] = Field("create-cluster-role", description="The name of the IAM role that SageMaker will use during cluster creation to access the AWS resources on your behalf") + create_fsx_stack: Optional[bool] = Field(True, description="Boolean to Create FSx Stack") + fsx_subnet_id: Optional[str] = Field("", description="The subnet id that will be used to create FSx") + fsx_availability_zone_id: Optional[str] = Field("", description="The availability zone to get subnet id that will be used to create FSx") + per_unit_storage_throughput: Optional[int] = Field(250, description="Per unit storage throughput") + data_compression_type: Optional[str] = Field("NONE", description="Data compression type for the FSx file system. 
Valid values: \"NONE\", \"LZ4\"")
+    file_system_type_version: Optional[float] = Field(2.15, description="File system type version for the FSx file system")
+    storage_capacity: Optional[int] = Field(1200, description="Storage capacity for the FSx file system in GiB")
+    fsx_file_system_id: Optional[str] = Field("", description="Existing FSx file system ID")
+
diff --git a/hyperpod-cluster-stack-template/pyproject.toml b/hyperpod-cluster-stack-template/pyproject.toml
new file mode 100644
index 00000000..428acf18
--- /dev/null
+++ b/hyperpod-cluster-stack-template/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "hyperpod-cluster-stack-template"
+version = "1.0"
+readme = "README.md"
+authors = [{name = "Amazon Web Services"}]
+license = {text = "Apache-2.0"}
+description = "Versioned JSON-schema + Pydantic models for the HyperPod cluster stack template"
+requires-python = ">=3.8"
+dependencies = [
+    "pydantic",
+]
+
+[tool.setuptools.packages.find]
+# find all subpackages under hyperpod_cluster_stack_template
+where = ["."]
+include = ["hyperpod_cluster_stack_template*"]
+
+[tool.setuptools]
+# tells setuptools to include package_data entries below
+include-package-data = true
+
+[tool.setuptools.package-data]
+"*" = ["*.yaml"]
\ No newline at end of file
diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py
index 08e9cfc8..f8ee12ca 100644
--- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py
+++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py
@@ -243,7 +243,7 @@ class FlatHPEndpoint(BaseModel):
             "Please fill in the path after http://:/ specific to your model server.",
         )
     )
-    
+
     @model_validator(mode='after')
     def validate_model_source_config(self):
         """Validate that required fields are provided based on model_source_type"""
@@ -254,7 +254,7 @@ def validate_model_source_config(self):
             if not self.fsx_file_system_id:
                 raise ValueError("fsx_file_system_id is required when model_source_type is 'fsx'")
         return self
-    
+
     def to_domain(self) -> HPEndpoint:
         env_vars = None
         if self.env:
diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py
index 2dd257ed..4a427662 100644
--- a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py
+++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py
@@ -17,8 +17,7 @@
 from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import (
     Model,
     SageMakerEndpoint,
-    Server,
-    TlsConfig,
+    Server
 )
 from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint
@@ -69,11 +68,10 @@ class FlatHPJumpStartEndpoint(BaseModel):
         max_length=63,
         pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$",
     )
-
     tls_certificate_output_s3_uri: Optional[str] = Field(
         None,
         alias="tls_certificate_output_s3_uri",
-        description="S3 URI to write the TLS certificate (optional)",
+        description="S3 URI to write the TLS certificate",
         pattern=r"^s3://([^/]+)/?(.*)$",
     )
@@ -88,12 +86,8 @@ def to_domain(self) -> HPJumpStartEndpoint:
             instance_type=self.instance_type,
         )
         sage_ep = SageMakerEndpoint(name=self.endpoint_name)
-        tls = (
-            TlsConfig(tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri)
-        )
         return HPJumpStartEndpoint(
             model=model,
             server=server,
             sage_maker_endpoint=sage_ep,
-            tls_config=tls,
         )
diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py
index ffbeceda..530be835 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py
@@ -15,6 +15,8 @@


 class VolumeConfig(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
     name: str = Field(
         ...,
         description="Volume name",
@@ -36,7 +38,7 @@ class VolumeConfig(BaseModel):
         description="PVC claim name (required for pvc volumes)",
         min_length=1
     )
-    read_only: Optional[Literal['true', 'false']] = Field(None, description="Read-only flag for pvc volumes")
+    read_only: Optional[bool] = Field(None, description="Read-only flag for pvc volumes")

     @field_validator('mount_path', 'path')
     @classmethod
@@ -75,7 +77,7 @@ class PyTorchJobConfig(BaseModel):
         min_length=1
     )
     namespace: Optional[str] = Field(
-        default=None,
+        default="default",
         description="Kubernetes namespace",
         min_length=1
     )
@@ -101,16 +103,15 @@ class PyTorchJobConfig(BaseModel):
         min_length=1
     )
     node_count: Optional[int] = Field(
-        default=None,
+        default=1,
         alias="node_count",
         description="Number of nodes",
         ge=1
     )
-    tasks_per_node: Optional[int] = Field(
-        default=None,
+    tasks_per_node: Optional[str] = Field(
+        default="auto",
         alias="tasks_per_node",
-        description="Number of tasks per node",
-        ge=1
+        description="Number of workers per node; supported values: [auto, cpu, gpu, int]",
     )
     label_selector: Optional[Dict[str, str]] = Field(
         default=None,
@@ -281,7 +282,7 @@ def to_domain(self) -> Dict:
         elif vol.type == "pvc":
             pvc_config = PersistentVolumeClaim(
                 claim_name=vol.claim_name,
-                read_only=vol.read_only == "true" if vol.read_only else False
+                read_only=vol.read_only if vol.read_only is not None else False
             )
             volume_obj = Volumes(name=vol.name, persistent_volume_claim=pvc_config)
             volumes.append(volume_obj)
diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json
index a3a2c619..6cd80ff6 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json
@@ -54,11 +54,7 @@
         "read_only": {
           "anyOf": [
             {
-              "enum": [
-                "true",
-                "false"
-              ],
-              "type": "string"
+              "type": "boolean"
             },
             {
               "type": "null"
@@ -104,7 +100,7 @@
           "type": "null"
         }
       ],
-      "default": null,
+      "default": "default",
      "description": "Kubernetes namespace",
      "title": "Namespace"
     },
@@ -194,22 +190,21 @@
           "type": "null"
         }
       ],
-      "default": null,
+      "default": 1,
       "description": "Number of nodes",
       "title": "Node Count"
     },
     "tasks_per_node": {
       "anyOf": [
         {
-          "minimum": 1,
-          "type": "integer"
+          "type": "string"
         },
         {
           "type": "null"
         }
       ],
-      "default": null,
-      "description": "Number of tasks per node",
+      "default": "auto",
+      "description": "Number of workers per node; supported values: [auto, cpu, gpu, int]",
       "title": "Tasks Per Node"
     },
     "label_selector": {
diff --git a/pyproject.toml b/pyproject.toml
index 16fc720e..fa2f0d18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 dynamic = ["dependencies"]
 name = "sagemaker-hyperpod"
-version = "3.1.0"
+version = "3.2.0"
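Taken together, the job-template changes above make `read_only` a real boolean, forbid unknown keys on `VolumeConfig`, and default `namespace`, `node_count`, and `tasks_per_node` to `"default"`, `1`, and `"auto"`. A sketch of what this means for callers, assuming the models install and import as laid out in this diff:

```python
from hyperpod_pytorch_job_template.v1_0.model import VolumeConfig

# read_only is now a real boolean rather than the strings "true"/"false".
vol = VolumeConfig(
    name="model-data",
    type="pvc",
    mount_path="/data",
    claim_name="my-pvc",
    read_only=True,
)

# extra="forbid" means misspelled keys now fail fast instead of being ignored.
try:
    VolumeConfig(
        name="bad",
        type="pvc",
        mount_path="/data",
        claim_name="my-pvc",
        read_olny=True,  # deliberate typo, rejected by ConfigDict(extra="forbid")
    )
except Exception as err:  # pydantic.ValidationError
    print(err)
```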
description = "Amazon SageMaker HyperPod SDK and CLI" readme = "README.md" requires-python = ">=3.8" @@ -112,4 +112,4 @@ docstring-code-format = false # # This only has an effect when the `docstring-code-format` setting is # enabled. -docstring-code-line-length = "dynamic" \ No newline at end of file +docstring-code-line-length = "dynamic" diff --git a/setup.py b/setup.py index 35730729..4292d5a0 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ setup( data_files=sagemaker_hyperpod_recipes, name="sagemaker-hyperpod", - version="3.1.0", + version="3.2.0", description="Amazon SageMaker HyperPod SDK and CLI", long_description=open("README.md").read(), long_description_content_type="text/markdown", @@ -89,7 +89,9 @@ "pydantic>=2.10.6,<3.0.0", "hyperpod-pytorch-job-template>=1.0.0, <2.0.0", "hyperpod-custom-inference-template>=1.0.0, <2.0.0", - "hyperpod-jumpstart-inference-template>=1.0.0, <2.0.0" + "hyperpod-jumpstart-inference-template>=1.0.0, <2.0.0", + # To be enabled after launch + #"hyperpod-cluster-stack-template>=1.0.0, <2.0.0" ], entry_points={ "console_scripts": [ diff --git a/src/sagemaker/hyperpod/cli/commands/cluster.py b/src/sagemaker/hyperpod/cli/commands/cluster.py index 6921d989..cb19f24c 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster.py @@ -14,6 +14,7 @@ import subprocess import json import sys +import signal import botocore.config from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed @@ -252,6 +253,39 @@ def rate_limited_operation( namespace: Optional[List[str]], ) -> Optional[List[List[str]]]: try: + cluster_capacities = [] # Initialize at the beginning + + # Get cluster details to check instance count + cluster_response = sm_client.describe_cluster(ClusterName=cluster_name) + cluster_status = cluster_response.get('ClusterStatus', 'Unknown') + + # Check if cluster has zero instances + instance_groups = cluster_response.get('InstanceGroups', []) + total_instances = sum( + group.get('CurrentCount', 0) for group in instance_groups + ) + + # If cluster has 0 instances, add it with 0 nodes + if total_instances == 0: + logger.info(f"Adding cluster {cluster_name} with 0 instances (status: {cluster_status})") + zero_instance_row = [ + cluster_name, + "N/A", # InstanceType + 0, # TotalNodes + 0, # AcceleratorDevicesAvailable + 0, # NodeHealthStatus=Schedulable + "N/A", # DeepHealthCheckStatus=Passed + ] + + # Add namespace columns with 0 values + if namespace: + for ns in namespace: + zero_instance_row.extend([0, 0]) # Total and Available accelerator devices + + cluster_capacities.append(zero_instance_row) + return cluster_capacities + + # Proceed with EKS validation for clusters with instances eks_cluster_arn = validator.validate_cluster_and_get_eks_arn( cluster_name, sm_client ) @@ -259,7 +293,7 @@ def rate_limited_operation( logger.warning( f"Cannot find EKS cluster behind {cluster_name}, continue..." 
) - return + return None eks_cluster_name = get_name_from_arn(eks_cluster_arn) _update_kube_config(eks_cluster_name, region, temp_config_file) k8s_client = KubernetesClient(config_file=temp_config_file) @@ -267,31 +301,31 @@ def rate_limited_operation( temp_config_file, SAGEMAKER_HYPERPOD_NAME_LABEL ) nodes_info = _aggregate_nodes_info(nodes) - cluster_capacities = [] ns_nominal_quota = {} ns_quota_usage = {} - for ns in namespace: - sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns) - if sm_managed_namespace: - quota_allocation_id = sm_managed_namespace.metadata.labels[ - SAGEMAKER_QUOTA_ALLOCATION_LABEL - ] - cluster_queue_name = ( - HYPERPOD_NAMESPACE_PREFIX - + quota_allocation_id - + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX - ) + if namespace: + for ns in namespace: + sm_managed_namespace = k8s_client.get_sagemaker_managed_namespace(ns) + if sm_managed_namespace: + quota_allocation_id = sm_managed_namespace.metadata.labels[ + SAGEMAKER_QUOTA_ALLOCATION_LABEL + ] + cluster_queue_name = ( + HYPERPOD_NAMESPACE_PREFIX + + quota_allocation_id + + SAGEMAKER_MANAGED_CLUSTER_QUEUE_SUFFIX + ) - cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name) - nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue) - quota_usage = _get_cluster_queue_quota_usage(cluster_queue) - ns_nominal_quota[ns] = nominal_quota - ns_quota_usage[ns] = quota_usage - else: - ns_nominal_quota[ns] = {} - ns_quota_usage[ns] = {} + cluster_queue = k8s_client.get_cluster_queue(cluster_queue_name) + nominal_quota = _get_cluster_queue_nominal_quota(cluster_queue) + quota_usage = _get_cluster_queue_quota_usage(cluster_queue) + ns_nominal_quota[ns] = nominal_quota + ns_quota_usage[ns] = quota_usage + else: + ns_nominal_quota[ns] = {} + ns_quota_usage[ns] = {} for instance_type, nodes_summary in nodes_info.items(): capacities = [ @@ -302,20 +336,21 @@ def rate_limited_operation( nodes_summary["schedulable"], nodes_summary["deep_health_check_passed"], ] - for ns in namespace: - capacities.append( - ns_nominal_quota.get(ns) - .get(instance_type, {}) - .get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A") - ) - capacities.append( - _get_available_quota( - ns_nominal_quota.get(ns), - ns_quota_usage.get(ns), - instance_type, - NVIDIA_GPU_RESOURCE_LIMIT_KEY, + if namespace: + for ns in namespace: + capacities.append( + ns_nominal_quota.get(ns) + .get(instance_type, {}) + .get(NVIDIA_GPU_RESOURCE_LIMIT_KEY, "N/A") + ) + capacities.append( + _get_available_quota( + ns_nominal_quota.get(ns), + ns_quota_usage.get(ns), + instance_type, + NVIDIA_GPU_RESOURCE_LIMIT_KEY, + ) ) - ) cluster_capacities.append(capacities) return cluster_capacities except Exception as e: @@ -526,16 +561,26 @@ def set_cluster_context( """ if debug: set_logging_level(logger, logging.DEBUG) - validator = ClusterValidator() - botocore_config = botocore.config.Config( - user_agent_extra=get_user_agent_extra_suffix() - ) - session = boto3.Session(region_name=region) if region else boto3.Session() - if not validator.validate_aws_credential(session): - logger.error("Cannot connect to HyperPod cluster due to aws credentials error") - sys.exit(1) - + + timeout = 60 # 1 minute + + def timeout_handler(signum, frame): + raise TimeoutError(f"Operation timed out after {timeout} seconds") + + # Set up timeout + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(timeout) + try: + validator = ClusterValidator() + botocore_config = botocore.config.Config( + user_agent_extra=get_user_agent_extra_suffix() + ) + session = 
boto3.Session(region_name=region) if region else boto3.Session()
+        if not validator.validate_aws_credential(session):
+            logger.error("Cannot connect to HyperPod cluster due to aws credentials error")
+            sys.exit(1)
+
     sm_client = get_sagemaker_client(session, botocore_config)
     hp_cluster_details = sm_client.describe_cluster(ClusterName=cluster_name)
     logger.debug("Fetched hyperpod cluster details")
@@ -549,6 +594,14 @@ def set_cluster_context(
         _update_kube_config(eks_name, region, None)
         k8s_client = KubernetesClient()
         k8s_client.set_context(eks_cluster_arn, namespace)
+
+        # Cancel the alarm if operation completes successfully
+        signal.alarm(0)
+        logger.info(f"Successfully connected to cluster {cluster_name}")
+
+    except TimeoutError as e:
+        logger.error("Timed out - Please check credentials, setup configurations and try again")
+        sys.exit(1)
     except botocore.exceptions.NoRegionError:
         logger.error(
             f"Please ensure you configured AWS default region or use '--region' argument to specify the region"
         )
@@ -559,6 +612,9 @@
             f"Unexpected error happens when try to connect to cluster {cluster_name}. Error: {e}"
         )
         sys.exit(1)
+    finally:
+        # Ensure alarm is cancelled in all cases
+        signal.alarm(0)


 @click.command()
diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py
new file mode 100644
index 00000000..285ba1f7
--- /dev/null
+++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py
@@ -0,0 +1,379 @@
+"""
+Command module for HyperPod cluster stack operations.
+"""
+
+import ast
+import logging
+import click
+import json
+import os
+from typing import Optional
+
+from sagemaker_core.main.resources import Cluster
+from sagemaker_core.main.shapes import ClusterInstanceGroupSpecification
+
+from tabulate import tabulate
+from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack
+from sagemaker.hyperpod.common.telemetry import _hyperpod_telemetry_emitter
+from sagemaker.hyperpod.common.telemetry.constants import Feature
+from sagemaker.hyperpod.common.utils import setup_logging
+from sagemaker.hyperpod.cli.utils import convert_datetimes
+
+logger = logging.getLogger(__name__)
+
+
+def parse_status_list(ctx, param, value):
+    """Parse a status list from a string like "['CREATE_COMPLETE', 'UPDATE_COMPLETE']"."""
+    if not value:
+        return None
+
+    try:
+        # Handle both string representation and direct list
+        if isinstance(value, str):
+            # Parse string like "['item1', 'item2']"
+            parsed = ast.literal_eval(value)
+            if isinstance(parsed, list):
+                return parsed
+            else:
+                raise click.BadParameter(f"Expected list format, got: {type(parsed).__name__}")
+        return value
+    except (ValueError, SyntaxError) as e:
+        raise click.BadParameter(f"Invalid list format. Use: \"['STATUS1', 'STATUS2']\". Error: {e}")
+
+
+@click.command("cluster-stack")
+@click.argument("config-file", required=True)
+@click.argument("stack-name", required=True)
+@click.option("--region", help="AWS region")
+@click.option("--debug", is_flag=True, help="Enable debug logging")
+def create_cluster_stack(config_file, stack_name, region, debug):
+    """Create a new HyperPod cluster stack using the provided configuration.
+
+    Creates a CloudFormation stack for a HyperPod cluster using settings from a YAML configuration file.
+    The stack will provision all necessary AWS resources for the cluster.
+
+    .. dropdown:: Usage Examples
+        :open:
+
+        .. 
code-block:: bash + + # Create cluster stack with config file + hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 + + # Create with debug logging + hyp create hyp-cluster cluster-config.yaml my-stack-name --debug + """ + create_cluster_stack_helper(config_file, region, debug) + +def create_cluster_stack_helper(config_file: str, region: Optional[str] = None, debug: bool = False) -> None: + """Helper function to create a HyperPod cluster stack. + + **Parameters:** + + .. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Type + - Description + * - config_file + - str + - Path to the YAML configuration file containing cluster stack settings + * - region + - str, optional + - AWS region where the cluster stack will be created + * - debug + - bool + - Enable debug logging for detailed error information + + **Raises:** + + ClickException: When cluster stack creation fails or configuration is invalid + """ + try: + # Validate the config file path + if not os.path.exists(config_file): + logger.error(f"Config file not found: {config_file}") + return + + # Load the configuration from the YAML file + import yaml + import uuid + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + + # Filter out template and namespace fields + filtered_config = {} + for k, v in config_data.items(): + if k not in ('template', 'namespace') and v is not None: + # Append 4-digit UUID to resource_name_prefix + if k == 'resource_name_prefix' and v: + v = f"{v}-{str(uuid.uuid4())[:4]}" + filtered_config[k] = v + + # Create the HpClusterStack object + # Ensure fixed defaults are always set + if 'custom_bucket_name' not in filtered_config: + filtered_config['custom_bucket_name'] = 'sagemaker-hyperpod-cluster-stack-bucket' + if 'github_raw_url' not in filtered_config: + filtered_config['github_raw_url'] = 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh' + if 'helm_repo_url' not in filtered_config: + filtered_config['helm_repo_url'] = 'https://github.com/aws/sagemaker-hyperpod-cli.git' + if 'helm_repo_path' not in filtered_config: + filtered_config['helm_repo_path'] = 'helm_chart/HyperPodHelmChart' + + cluster_stack = HpClusterStack(**filtered_config) + + # Log the configuration + logger.info("Creating HyperPod cluster stack with the following configuration:") + for key, value in filtered_config.items(): + if value is not None: + logger.info(f" {key}: {value}") + + # Create the cluster stack + stack_id = cluster_stack.create(region) + + logger.info(f"Stack creation initiated successfully with ID: {stack_id}") + logger.info("You can monitor the stack creation in the AWS CloudFormation console.") + + except Exception as e: + logger.error(f"Failed to create cluster stack: {e}") + if debug: + logger.exception("Detailed error information:") + raise click.ClickException(str(e)) + +@click.command("cluster-stack") +@click.argument("stack-name", required=True) +@click.option("--region", help="AWS region") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "describe_cluster_stack_cli") +def describe_cluster_stack(stack_name: str, debug: bool, region: str) -> None: + """Describe the status of a HyperPod cluster stack. + + Shows detailed information about a CloudFormation stack including its current status, + resources, and configuration parameters. + + .. 
dropdown:: Usage Examples + :open: + + .. code-block:: bash + + # Describe a cluster stack + hyp describe hyp-cluster my-stack-name + + # Describe with specific region + hyp describe hyp-cluster my-stack-name --region us-west-2 + """ + logger = setup_logging(logging.getLogger(__name__), debug) + + try: + stack_info = HpClusterStack.describe(stack_name=stack_name, region=region) + + if not stack_info or 'Stacks' not in stack_info or not stack_info['Stacks']: + click.secho(f"❌ Stack '{stack_name}' not found", fg='red') + return + + stack = stack_info['Stacks'][0] + + logger.debug(f"Describing stack name: {stack_name}\ninfo: {json.dumps(stack_info, indent=2, default=str)}") + + click.echo(f"📋 Stack Details for: {stack_name}") + + # Highlight stack status + stack_status = stack.get('StackStatus', 'UNKNOWN') + click.echo(f"Status: ", nl=False) + click.secho(stack_status) + + table_data = [] + for key, value in stack.items(): + if isinstance(value, (dict, list)): + formatted_value = json.dumps(value, indent=2, default=str) + else: + formatted_value = str(value) + table_data.append([key, formatted_value]) + + # Calculate column widths + max_field_width = max(len(str(row[0])) for row in table_data) + max_value_width = max(len(str(row[1]).split('\n')[0]) for row in table_data) # First line only for width calc + + # Add headers with matching separators (presto format adds spaces around |) + field_header = "Field".ljust(max_field_width) + value_header = "Value".ljust(max_value_width) + click.echo(f" {field_header} | {value_header} ") + click.echo(f"-{'-' * max_field_width}-+-{'-' * max_value_width}-") + + click.echo(tabulate(table_data, tablefmt="presto")) + + except Exception as e: + logger.error(f"Failed to describe stack: {e}") + if debug: + logger.exception("Detailed error information:") + + if "does not exist" in str(e): + click.echo(f"❌ Stack '{stack_name}' not found") + elif "AccessDenied" in str(e): + click.echo("❌ Access denied. Check AWS permissions") + else: + click.echo(f"❌ Error describing stack: {e}") + + raise click.ClickException(str(e)) + +@click.command("cluster-stack") +@click.option("--region", help="AWS region") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@click.option("--status", + callback=parse_status_list, + help="Filter by stack status. Format: \"['CREATE_COMPLETE', 'UPDATE_COMPLETE']\"") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_cluster_stack_cli") +def list_cluster_stacks(region, debug, status): + """List all HyperPod cluster stacks. + + Displays a summary of all CloudFormation stacks related to HyperPod clusters + in the specified region or default region. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash + + # List all cluster stacks + hyp list hyp-cluster + + # List stacks in specific region + hyp list hyp-cluster --region us-east-1 + """ + logger = setup_logging(logging.getLogger(__name__), debug) + + try: + stacks_info = HpClusterStack.list(region=region, stack_status_filter=status) + + if not stacks_info or 'StackSummaries' not in stacks_info: + click.secho("No stacks found", fg='yellow') + return + + stack_summaries = stacks_info['StackSummaries'] + + # Convert datetimes for display + stack_summaries = [convert_datetimes(stack) for stack in stack_summaries] + + logger.debug(f"Listing stacks in region: {region or 'default'}") + + click.echo(f"📋 HyperPod Cluster Stacks ({len(stack_summaries)} found)") + + if stack_summaries: + for i, stack in enumerate(stack_summaries, 1): + try: + click.echo(f"\n[{i}] Stack Details:") + + table_data = [] + for key, value in stack.items(): + table_data.append([key, str(value)]) + + click.echo(tabulate(table_data, headers=["Field", "Value"], tablefmt="presto")) + except Exception as e: + logger.error(f"Error processing stack {i}: {e}") + click.echo(f"❌ Error processing stack {i}: {stack.get('StackName', 'Unknown')}") + continue + else: + click.echo("No stacks found") + + except Exception as e: + logger.error(f"Failed to list stacks: {e}") + if debug: + logger.exception("Detailed error information:") + + if "AccessDenied" in str(e) or "Insufficient permissions" in str(e): + click.secho("❌ Access denied. Check AWS permissions", fg='red') + else: + click.secho(f"❌ Error listing stacks: {e}", fg='red') + + raise click.ClickException(str(e)) + +@click.command("cluster-stack") +@click.argument("stack-name", required=True) +@click.option("--debug", is_flag=True, help="Enable debug logging") +def delete(stack_name: str, debug: bool) -> None: + """Delete a HyperPod cluster stack. + + Removes the specified CloudFormation stack and all associated AWS resources. + This operation cannot be undone. + + .. dropdown:: Usage Examples + :open: + + .. code-block:: bash + + # Delete a cluster stack + hyp delete hyp-cluster my-stack-name + """ + logger = setup_logging(logging.getLogger(__name__), debug) + + logger.info(f"Deleting stack: {stack_name}") + logger.info("This feature is not yet implemented.") + +@click.command("cluster") +@click.option("--cluster-name", required=True, help="The name of the cluster to update") +@click.option("--instance-groups", help="Instance Groups JSON string") +@click.option("--instance-groups-to-delete", help="Instance Groups to delete JSON string") +@click.option("--region", help="Region") +@click.option("--node-recovery", help="Node Recovery (Automatic or None)") +@click.option("--debug", is_flag=True, help="Enable debug logging") +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "update_cluster_cli") +def update_cluster( + cluster_name: str, + instance_groups: Optional[str], + instance_groups_to_delete: Optional[str], + region: Optional[str], + node_recovery: Optional[str], + debug: bool) -> None: + """Update an existing HyperPod cluster configuration. + + Modifies cluster settings such as instance groups and node recovery policies. + At least one update parameter must be provided. + + .. dropdown:: Usage Examples + :open: + + .. 
code-block:: bash
+
+            # Update cluster with new instance groups
+            hyp update hyp-cluster --cluster-name my-cluster --instance-groups '{"group1": {...}}'
+
+            # Update node recovery setting
+            hyp update hyp-cluster --cluster-name my-cluster --node-recovery Automatic
+    """
+    logger = setup_logging(logging.getLogger(__name__), debug)
+
+    # Validate that at least one parameter is provided
+    if not any([instance_groups, instance_groups_to_delete, node_recovery]):
+        raise click.ClickException("At least one of --instance-groups, --instance-groups-to-delete, or --node-recovery must be provided")
+
+    cluster = Cluster.get(cluster_name=cluster_name, region=region)
+
+    # Prepare update parameters
+    update_params = {}
+
+    # Convert instance_groups to list of ClusterInstanceGroupSpecification
+    if instance_groups:
+        if isinstance(instance_groups, str):
+            instance_groups = json.loads(instance_groups)
+        update_params['instance_groups'] = [ClusterInstanceGroupSpecification(**ig) for ig in instance_groups]
+
+    # Convert instance_groups_to_delete to list of strings
+    if instance_groups_to_delete:
+        if isinstance(instance_groups_to_delete, str):
+            instance_groups_to_delete = json.loads(instance_groups_to_delete)
+        update_params['instance_groups_to_delete'] = instance_groups_to_delete
+
+    # Add node_recovery if provided
+    if node_recovery:
+        update_params['node_recovery'] = node_recovery
+
+    click.secho(f"Update Params: {update_params}")
+    cluster.update(**update_params)
+
+    logger.info("Cluster has been updated")
+    click.secho(f"Cluster {cluster_name} has been updated")
+
diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py
index dfa19b70..410ba1d3 100644
--- a/src/sagemaker/hyperpod/cli/commands/inference.py
+++ b/src/sagemaker/hyperpod/cli/commands/inference.py
@@ -94,7 +94,7 @@ def custom_invoke(
     content_type: Optional[str]
 ):
     """
-    Invoke a model endpoint.
+    Invoke a custom model endpoint.
""" try: payload = json.dumps(json.loads(body)) diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py new file mode 100644 index 00000000..f209e99d --- /dev/null +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -0,0 +1,430 @@ +import click +import yaml +import sys +from pathlib import Path +from datetime import datetime +from jinja2 import Template +import shutil +from sagemaker.hyperpod.cli.constants.init_constants import ( + USAGE_GUIDE_TEXT_CFN, + USAGE_GUIDE_TEXT_CRD, + CFN, + CRD +) +from sagemaker.hyperpod.common.config import Metadata +from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob +from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack +from sagemaker.hyperpod.cli.init_utils import ( + generate_click_command, + save_config_yaml, + TEMPLATES, + load_config, + load_config_and_validate, + validate_config_against_model, + filter_validation_errors_for_user_input, + display_validation_results, + build_config_from_schema, + save_template, + get_default_version_for_template, + add_default_az_ids_to_config, +) +from sagemaker.hyperpod.common.utils import get_aws_default_region + +@click.command("init") +@click.argument("template", type=click.Choice(list(TEMPLATES.keys()))) +@click.argument("directory", type=click.Path(file_okay=False), default=".") +@click.option("--version", "-v", default=None, help="Schema version") +def init( + template: str, + directory: str, + version: str, +): + """ + Initialize a TEMPLATE scaffold in DIRECTORY. + + This command creates a complete project scaffold for the specified template type. + It performs the following steps: + + 1. Checks if the directory already contains a config.yaml and handles existing configurations + 2. Creates the target directory if it doesn't exist + 3. Generates a config.yaml file with schema-based default values and user-provided inputs + 4. Creates a template file (.jinja) for the specified template type + 5. Adds a README.md with usage instructions + + The generated files provide a starting point for configuring and submitting + jobs to SageMaker HyperPod clusters orchestrated by Amazon EKS. 
+ """ + dir_path = Path(directory).resolve() + config_file = dir_path / "config.yaml" + skip_readme = False + + # 1) Inspect existing config.yaml + try: + if config_file.is_file(): + try: + existing = yaml.safe_load(config_file.read_text()) or {} + existing_template = existing.get("template") + except Exception as e: + click.echo("Could not parse existing config.yaml: %s", e) + existing_template = None + + if existing_template == template: + click.echo(f"⚠️ config.yaml already initialized as '{template}'.") + if not click.confirm("Override?", default=False): + click.echo("Aborting init.") + return + click.echo("Overriding config.yaml...") + skip_readme = True + else: + click.echo(f"⚠️ Directory already initialized as '{existing_template}'.") + click.secho(f"⚠️ It is highly unrecommended to initiate this directory with a different template.", fg="red") + click.echo(f"⚠️ Recommended path is create a new folder and then init with '{template}'.") + if not click.confirm(f"Do you want to re-initialize this directory with {template}?", default=False): + click.echo("Aborting init.") + return + click.echo(f"Re-initializing {existing_template} → {template}…") + + else: + click.echo(f"Initializing new scaffold for '{template}'…") + except Exception as e: + click.secho("💥 Initialization aborted due to error: %s", e, fg="red") + sys.exit(1) + + # 2) Ensure directory exists + try: + dir_path.mkdir(parents=True, exist_ok=True) + except Exception as e: + click.secho(f"❌ Could not create directory {dir_path}: {e}", fg="red") + sys.exit(1) + + # 3) Build config dict + comment map, then write config.yaml + try: + # Determine version: use user-provided version or default to latest + if version is None: + version = get_default_version_for_template(template) + + # Use the common function to build config from schema + full_cfg, comment_map = build_config_from_schema(template, version) + + save_config_yaml( + prefill=full_cfg, + comment_map=comment_map, + directory=str(dir_path), + ) + + except Exception as e: + click.secho(f"💥 Could not write config.yaml: {e}", fg="red") + sys.exit(1) + + # 4) Generate template + if not save_template(template, dir_path): + click.secho("⚠️ Template generation failed", fg="yellow") + + # 5) Write README.md + if not skip_readme: + try: + readme_path = dir_path / "README.md" + with open(readme_path, "w") as f: + if TEMPLATES[template]["schema_type"] == CFN: + f.write(USAGE_GUIDE_TEXT_CFN) + else: + f.write(USAGE_GUIDE_TEXT_CRD) + except Exception as e: + click.secho("⚠️ README.md generation failed: %s", e, fg="yellow") + + click.secho( + f"✔️ {template} for schema version={version!r} is initialized in {dir_path}", + fg="green", + ) + click.echo( + click.style( + "🚀 Welcome!\n" + f"📘 See {dir_path}/README.md for usage.\n", + fg="green", + ) + ) + + +@click.command("reset") +def reset(): + """ + Reset the current directory's config.yaml to an "empty" scaffold: + all schema keys set to default values (but keeping the template and version). 
+ """ + dir_path = Path(".").resolve() + + # 1) Load and validate config + data, template, version = load_config(dir_path) + + # 2) Build config with default values from schema + full_cfg, comment_map = build_config_from_schema(template, version) + # 3) Overwrite config.yaml + try: + save_config_yaml( + prefill=full_cfg, + comment_map=comment_map, + directory=str(dir_path), + ) + click.secho("✔️ config.yaml reset: all fields set to default values.", fg="green") + except Exception as e: + click.secho(f"💥 Could not reset config.yaml: {e}", fg="red") + sys.exit(1) + + # 4) Regenerate the k8s Jinja template + if save_template(template, dir_path): + click.secho(f"✔️ {template} is regenerated.", fg="green") + + +@click.command("configure") +@generate_click_command() +@click.pass_context +def configure(ctx, model_config): + """ + Update any subset of fields in ./config.yaml by passing -- flags. + + This command allows you to modify specific configuration fields without having + to regenerate the entire config or fix unrelated validation issues. Only the + fields you explicitly provide will be validated, making it easy to update + configurations incrementally. + + Examples: + + # Update a single field + hyp configure --hyperpod-cluster-name my-new-cluster + + # Update multiple fields at once + hyp configure --stack-name my-stack --create-fsx-stack: False + + # Update complex fields with JSON object + hyp configure --availability-zone-ids '["id1", "id2"]' + + """ + # 1) Load existing config without validation + dir_path = Path(".").resolve() + data, template, version = load_config(dir_path) + + # 2) Determine which fields the user actually provided + # Use Click's parameter source tracking to identify command-line provided parameters + user_input_fields = set() + + if ctx and hasattr(ctx, 'params') and model_config: + # Check which parameters were provided via command line (not defaults) + for param_name, param_value in ctx.params.items(): + # Skip if the parameter source indicates it came from default + param_source = ctx.get_parameter_source(param_name) + if param_source and param_source.name == 'COMMANDLINE': + user_input_fields.add(param_name) + + if not user_input_fields: + click.secho("⚠️ No arguments provided to configure.", fg="yellow") + return + + # 3) Build merged config with user input + full_cfg, comment_map = build_config_from_schema( + template=template, + version=version, + model_config=model_config, + existing_config=data, + user_provided_fields=user_input_fields + ) + + # 4) Validate the merged config, but only check user-provided fields + all_validation_errors = validate_config_against_model(full_cfg, template, version) + user_input_errors = filter_validation_errors_for_user_input(all_validation_errors, user_input_fields) + + is_valid = display_validation_results( + user_input_errors, + success_message="User input is valid!" 
+    is_valid = display_validation_results(
+        user_input_errors,
+        success_message="Configuration updated successfully!",
+        error_prefix="Invalid input arguments:"
+    )
+
+    if not is_valid:
+        click.secho("❌ config.yaml was not updated due to invalid input.", fg="red")
+        sys.exit(1)
+
+    # 5) Write out the updated config.yaml (only if user input is valid)
+    try:
+        save_config_yaml(
+            prefill=full_cfg,
+            comment_map=comment_map,
+            directory=str(dir_path),
+        )
+        click.secho("✔️ config.yaml updated successfully.", fg="green")
+    except Exception as e:
+        click.secho(f"💥 Could not update config.yaml: {e}", fg="red")
+        sys.exit(1)
+
+
+@click.command("validate")
+def validate():
+    """
+    Validate this directory's config.yaml against the appropriate schema.
+    """
+    dir_path = Path(".").resolve()
+    load_config_and_validate(dir_path)
+
+
+@click.command(name="_default_create")
+@click.option("--region", "-r", default=None, help="AWS region; defaults to the region from your AWS configuration")
+def _default_create(region):
+    """
+    Validate configuration and render template files for deployment.
+
+    This command performs the following operations:
+
+    1. Loads and validates the config.yaml file in the current directory
+    2. Determines the template type (CFN for CloudFormation or CRD for Kubernetes)
+    3. Locates the appropriate Jinja template file:
+       - cfn_params.jinja for CloudFormation templates
+       - k8s.jinja for Kubernetes CRD templates
+    4. Validates the configuration using the appropriate schema:
+       - HpClusterStack validation for CFN templates
+       - Registry-based validation for CRD templates
+    5. Renders the Jinja template with configuration values
+    6. Creates a timestamped directory under run/ (e.g., run/20240116T143022/)
+    7. Copies the validated config.yaml to the run directory
+    8. Writes the rendered output:
+       - cfn_params.yaml for CloudFormation templates
+       - k8s.yaml for Kubernetes templates
+
+    The generated files in the run directory can be used for actual deployment
+    to SageMaker HyperPod clusters or CloudFormation stacks.
+
+    Prerequisites:
+    - Must be run in a directory initialized with 'hyp init'
+    - config.yaml and the appropriate template file must exist
+    """
+    dir_path = Path('.').resolve()
+    config_file = dir_path / 'config.yaml'
+
+    # 1) Load config to determine template type
+    data, template, version = load_config_and_validate(dir_path)
+
+    # 2) Determine the correct Jinja file based on template type
+    info = TEMPLATES[template]
+    schema_type = info["schema_type"]
+    if schema_type == CFN:
+        jinja_file = dir_path / 'cfn_params.jinja'
+    else:
+        jinja_file = dir_path / 'k8s.jinja'
+
+    # 3) Ensure files exist
+    if not config_file.is_file() or not jinja_file.is_file():
+        click.secho(f"❌ Missing config.yaml or {jinja_file.name}. Run `hyp init` first.", fg="red")
+        sys.exit(1)
+
+    # 4) Validate config using the consolidated function
+    validation_errors = validate_config_against_model(data, template, version)
+    is_valid = display_validation_results(
+        validation_errors,
+        success_message="Configuration is valid!",
+        error_prefix="Validation errors:"
+    )
+
+    if not is_valid:
+        sys.exit(1)
+
+    # 5) Render the Jinja template with configuration values
+    try:
+        template_source = jinja_file.read_text()
+        tpl = Template(template_source)
+
+        # For CFN templates, prepare arrays for the Jinja template
+        if schema_type == CFN:
+            # Gather numbered instance_group_settings / rig_settings entries (up to 20 each)
+            instance_group_settings = []
+            rig_settings = []
+            for i in range(1, 21):
+                ig_key = f'instance_group_settings{i}'
+                rig_key = f'rig_settings{i}'
+                if ig_key in data:
+                    instance_group_settings.append(data[ig_key])
+                if rig_key in data:
+                    rig_settings.append(data[rig_key])
+
+            # Add arrays to the template context
+            template_data = dict(data)
+            template_data['instance_group_settings'] = instance_group_settings
+            template_data['rig_settings'] = rig_settings
+            rendered = tpl.render(**template_data)
+        else:
+            rendered = tpl.render(**data)
+    except Exception as e:
+        click.secho(f"❌ Failed to render template: {e}", fg="red")
+        sys.exit(1)
+
+    # 6) Prepare the run/ directory and write files
+    run_root = dir_path / 'run'
+    run_root.mkdir(exist_ok=True)
+    timestamp = datetime.now().strftime('%Y%m%dT%H%M%S')
+    out_dir = run_root / timestamp
+    out_dir.mkdir()
+
+    try:
+        shutil.copy(config_file, out_dir / 'config.yaml')
+        output_file = 'cfn_params.yaml' if schema_type == CFN else 'k8s.yaml'
+        with open(out_dir / output_file, 'w', encoding='utf-8') as f:
+            f.write(rendered)
+        click.secho(f"✔️ Files written to {out_dir}", fg="green")
+    except Exception as e:
+        click.secho(f"❌ Failed to write run files: {e}", fg="red")
+        sys.exit(1)
+
+    # 7) Make the downstream call
+    try:
+        if region is None:
+            region = get_aws_default_region()
+            click.secho(f"Submitting to default region: {region}.", fg="yellow")
+
+        if schema_type == CFN:
+            add_default_az_ids_to_config(out_dir, region)
+
+            from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack_helper
+            create_cluster_stack_helper(config_file=f"{out_dir}/config.yaml",
+                                        region=region)
+        else:
+            dir_path = Path(".").resolve()
+            data, template, version = load_config(dir_path)
+            namespace = data.get("namespace", "default")
+            registry = TEMPLATES[template]["registry"]
+            model = registry.get(version)
+            if model:
+                # Filter out CLI metadata fields before passing to the model
+                from sagemaker.hyperpod.cli.init_utils import filter_cli_metadata_fields
+                filtered_config = filter_cli_metadata_fields(data)
+                flat = model(**filtered_config)
+                domain = flat.to_domain()
+                if template in ("hyp-custom-endpoint", "hyp-jumpstart-endpoint"):
+                    domain.create(namespace=namespace)
+                elif template == "hyp-pytorch-job":
+                    # Currently aligned with pytorch_create; open to refactoring and simplification.
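+                    # Unpack the rendered domain config (a dict for this template) into
+                    # HyperPodPytorchJob kwargs: metadata, replica specs, and the optional
+                    # nproc_per_node / run_policy fields.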
+                    job_name = domain.get("name")
+                    namespace = domain.get("namespace")
+                    spec = domain.get("spec")
+
+                    # Prepare metadata
+                    metadata_kwargs = {"name": job_name}
+                    if namespace:
+                        metadata_kwargs["namespace"] = namespace
+
+                    # Prepare job kwargs
+                    job_kwargs = {
+                        "metadata": Metadata(**metadata_kwargs),
+                        "replica_specs": spec.get("replica_specs"),
+                    }
+
+                    # Add nproc_per_node if present
+                    if "nproc_per_node" in spec:
+                        job_kwargs["nproc_per_node"] = spec.get("nproc_per_node")
+
+                    # Add run_policy if present
+                    if "run_policy" in spec:
+                        job_kwargs["run_policy"] = spec.get("run_policy")
+
+                    job = HyperPodPytorchJob(**job_kwargs)
+                    job.create()
+
+    except Exception as e:
+        click.secho(f"❌ Failed to submit the command: {e}", fg="red")
+        sys.exit(1)
diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py
index bef71203..f0c4c829 100644
--- a/src/sagemaker/hyperpod/cli/commands/training.py
+++ b/src/sagemaker/hyperpod/cli/commands/training.py
@@ -331,6 +331,38 @@ def pytorch_get_logs(job_name: str, pod_name: str, namespace: str):
 def pytorch_get_operator_logs(since_hours: float):
     """Get operator logs for pytorch training jobs."""
     logs = HyperPodPytorchJob.get_operator_logs(since_hours=since_hours)
-
+    # Use the common log display utility for consistent formatting across all job types
     display_formatted_logs(logs, title="PyTorch Operator Logs")
+
+
+@click.command("hyp-pytorch-job",
+               help="""Execute commands in pods associated with a HyperPod PyTorch job.
+
+Usage Format:
+    hyp exec hyp-pytorch-job --job-name <job-name> [-p <pod-name>] [--all-pods] -- <command>""")
+@click.option("--job-name", required=True, help="Required. The name of the job to execute the command within.")
+@click.option("--pod", "-p", help="The name of the pod to execute the command in. (Required: specify either --pod or --all-pods)")
+@click.option("--all-pods", is_flag=True, help="Execute the command in all pods associated with the job. (Required: specify either --pod or --all-pods)")
+@click.option("--namespace", "-n", default="default", help="Optional. The namespace of the job.")
+@click.option("--container", help="Optional. The container name to execute the command in.")
+@click.argument("command", nargs=-1, required=True)
+@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "exec_pytorchjob_cli")
+def pytorch_exec(job_name: str, pod: str, all_pods: bool, namespace: str, container: str, command: tuple):
+    """Execute commands in pods associated with a HyperPod PyTorch job."""
+    if (all_pods and pod) or not (all_pods or pod):
+        raise click.UsageError("Must specify exactly one of the following: --all-pods, --pod")
+
+    try:
+        job = HyperPodPytorchJob.get(name=job_name, namespace=namespace)
+        output = job.exec_command(list(command), pod, all_pods, container)
+        if output:
+            click.echo(output)
+        else:
+            click.echo("Command executed successfully (no output)")
+    except ValueError as e:
+        # User input validation errors
+        raise click.UsageError(str(e))
+    except Exception as e:
+        # Other errors (API, network, etc.)
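+        # Wrapped in UsageError so the CLI exits non-zero with a readable message.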
+ raise click.UsageError(f"Failed to execute command: {str(e)}") diff --git a/src/sagemaker/hyperpod/cli/constants/init_constants.py b/src/sagemaker/hyperpod/cli/constants/init_constants.py new file mode 100644 index 00000000..d600b666 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/constants/init_constants.py @@ -0,0 +1,319 @@ +from sagemaker.hyperpod.cli.templates.cfn_cluster_creation import CLOUDFORMATION_CLUSTER_CREATION_TEMPLATE +from sagemaker.hyperpod.cli.templates.k8s_js_endpoint_template import KUBERNETES_JS_ENDPOINT_TEMPLATE +from sagemaker.hyperpod.cli.templates.k8s_custom_endpoint_template import KUBERNETES_CUSTOM_ENDPOINT_TEMPLATE +from sagemaker.hyperpod.cli.templates.k8s_pytorch_job_template import KUBERNETES_PYTORCH_JOB_TEMPLATE + +from hyperpod_jumpstart_inference_template.registry import SCHEMA_REGISTRY as JS_REG +from hyperpod_custom_inference_template.registry import SCHEMA_REGISTRY as C_REG +from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY as P_REG + +# Here is the list of existing templates supported +# You can onboard new template by adding the mapping here + +CRD = "crd" +CFN = "cfn" +TEMPLATES = { + # "hyp-jumpstart-endpoint": { + # "registry": JS_REG, + # "schema_pkg": "hyperpod_jumpstart_inference_template", + # "schema_type": CRD, + # 'template': KUBERNETES_JS_ENDPOINT_TEMPLATE, + # 'type': "jinja" + # }, + # "hyp-custom-endpoint": { + # "registry": C_REG, + # "schema_pkg": "hyperpod_custom_inference_template", + # "schema_type": CRD, + # 'template': KUBERNETES_CUSTOM_ENDPOINT_TEMPLATE, + # 'type': "jinja" + # }, + # "hyp-pytorch-job": { + # "registry": P_REG, + # "schema_pkg": "hyperpod_pytorch_job_template", + # "schema_type": CRD, + # 'template': KUBERNETES_PYTORCH_JOB_TEMPLATE, + # 'type': "jinja" + # }, + "cluster-stack": { + "schema_pkg": "hyperpod_cluster_stack_template", + "schema_type": CFN, + 'template': CLOUDFORMATION_CLUSTER_CREATION_TEMPLATE, + 'type': "jinja" + } +} + + +USAGE_GUIDE_TEXT_CFN = """# SageMaker HyperPod CLI - Initialization Workflow + +This document explains the initialization workflow and related commands for the SageMaker HyperPod CLI. + +## Table of Contents +- [Init Command](#init-command) +- [Configure Command](#configure-command) +- [Reset Command](#reset-command) +- [Validate Command](#validate-command) +- [Create Command](#create-command) + +## Init Command + +The `init` command creates a scaffold for your HyperPod cluster stack configuration. It generates a `config.yaml` file, a CFN template (`cfn_params.jinja`), and a README with usage instructions. + +### Basic Usage + +```bash +hyp init +``` + +Example: +```bash +hyp init cluster-stack +``` + +This creates the following files in your current directory: +``` +├── config.yaml # Configuration file with default values +├── cfn_params.jinja # Cloudformation template with placeholders +└── README.md # Usage instructions +``` + +### Specifying a Directory + +You can specify a target directory for initialization: + +```bash +hyp init cluster-stack +cd +``` + +### Edge Cases + +**Re-initializing the same template:** +``` +hyp init cluster-stack +⚠️ config.yaml already initialized as 'cluster-stack'. +Overwrite? [y/N]: +``` + +**Initializing with a different template:** +``` +hyp init hyp-custom-endpoint +⚠️ Directory already initialized as 'cluster-stack'. +⚠️ It is highly unrecommended to initiate this directory with a different template. +⚠️ Recommended path is create a new folder and then init with 'hyp-custom-endpoint'. 
+Do you want to re-initialize this directory with hyp-custom-endpoint? [y/N]:
+```
+
+## Configure Command
+
+The `configure` command updates specific fields in your `config.yaml` file without modifying other values.
+
+```bash
+hyp configure \
+    --stack-name my-stack \
+    --create-fsx-stack False
+```
+
+## Reset Command
+
+The `reset` command resets your `config.yaml` to default values while preserving the template type and version.
+
+```bash
+hyp reset
+```
+
+## Validate Command
+
+The `validate` command checks your `config.yaml` against the JSON schema to ensure all required fields are present and valid.
+
+```bash
+hyp validate
+```
+
+## Create Command
+
+The `create` command processes your configuration and creates the cluster stack. It injects values from `config.yaml` into the `cfn_params.jinja` template and creates a timestamped record in the `run` directory.
+
+```bash
+hyp create
+```
+
+After submission, your directory structure will look like:
+```
+├── config.yaml
+├── cfn_params.jinja
+├── README.md
+└── run/
+    └── 20250716T152203/
+        ├── config.yaml        # Copy of the config used for this run
+        └── cfn_params.yaml    # Generated CloudFormation parameters
+```
+
+## Workflow Example
+
+A typical workflow might look like:
+
+1. Initialize a new cluster stack configuration:
+   ```bash
+   hyp init cluster-stack
+   ```
+
+2. Configure required parameters:
+   ```bash
+   hyp configure \
+       --stack-name my-stack \
+       --create-fsx-stack False
+   ```
+
+3. Validate the configuration:
+   ```bash
+   hyp validate
+   ```
+
+4. Create the cluster stack request:
+   ```bash
+   hyp create
+   ```
+
+5. Check the status of your cluster stack:
+   ```bash
+   hyp list cluster-stack
+   ```
+"""
+
+USAGE_GUIDE_TEXT_CRD = """# SageMaker HyperPod CLI - Initialization Workflow
+
+This document explains the initialization workflow and related commands for the SageMaker HyperPod CLI.
+
+## Table of Contents
+- [Init Command](#init-command)
+- [Configure Command](#configure-command)
+- [Reset Command](#reset-command)
+- [Validate Command](#validate-command)
+- [Create Command](#create-command)
+
+## Init Command
+
+The `init` command creates a scaffold for your HyperPod endpoint configuration. It generates a `config.yaml` file, a Kubernetes template (`k8s.jinja`), and a README with usage instructions.
+
+### Basic Usage
+
+```bash
+hyp init <template>
+```
+
+Example:
+```bash
+hyp init hyp-jumpstart-endpoint
+```
+
+This creates the following files in your current directory:
+```
+├── config.yaml    # Configuration file with default values
+├── k8s.jinja      # Kubernetes template with placeholders
+└── README.md      # Usage instructions
+```
+
+### Specifying a Directory
+
+You can specify a target directory for initialization:
+
+```bash
+hyp init hyp-jumpstart-endpoint <directory>
+cd <directory>
+```
+
+### Edge Cases
+
+**Re-initializing the same template:**
+```
+hyp init hyp-jumpstart-endpoint
+⚠️ config.yaml already initialized as 'hyp-jumpstart-endpoint'.
+Override? [y/N]:
+```
+
+**Initializing with a different template:**
+```
+hyp init hyp-custom-endpoint
+⚠️ Directory already initialized as 'hyp-jumpstart-endpoint'.
+⚠️ Re-initializing this directory with a different template is strongly discouraged.
+⚠️ The recommended path is to create a new folder and init it with 'hyp-custom-endpoint'.
+Do you want to re-initialize this directory with hyp-custom-endpoint? [y/N]:
+```
+
+## Configure Command
+
+The `configure` command updates specific fields in your `config.yaml` file without modifying other values.
+
+```bash
+hyp configure \
+    --instance-type ml.g5.12xlarge \
+    --model-version 2.0.4
+```
+
+## Reset Command
+
+The `reset` command resets your `config.yaml` to default values while preserving the template type and version.
+
+```bash
+hyp reset
+```
+
+## Validate Command
+
+The `validate` command checks your `config.yaml` against the JSON schema to ensure all required fields are present and valid.
+
+```bash
+hyp validate
+```
+
+## Create Command
+
+The `create` command processes your configuration and creates the endpoint. It injects values from `config.yaml` into the `k8s.jinja` template and creates a timestamped record in the `run` directory.
+
+```bash
+hyp create
+```
+
+After submission, your directory structure will look like:
+```
+├── config.yaml
+├── k8s.jinja
+├── README.md
+└── run/
+    └── 20250716T152203/
+        ├── config.yaml    # Copy of the config used for this run
+        └── k8s.yaml       # Generated Kubernetes manifest
+```
+
+## Workflow Example
+
+A typical workflow might look like:
+
+1. Initialize a new endpoint configuration:
+   ```bash
+   hyp init hyp-jumpstart-endpoint
+   ```
+
+2. Configure required parameters:
+   ```bash
+   hyp configure \
+       --model-id meta-textgeneration-llama-3-70b \
+       --instance-type ml.g5.8xlarge \
+       --endpoint-name my-llama-endpoint
+   ```
+
+3. Validate the configuration:
+   ```bash
+   hyp validate
+   ```
+
+4. Create the endpoint creation request:
+   ```bash
+   hyp create
+   ```
+
+5. Check the status of your endpoint:
+   ```bash
+   hyp list hyp-jumpstart-endpoint
+   ```
+"""
diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py
index c395845d..9012dee8 100644
--- a/src/sagemaker/hyperpod/cli/hyp_cli.py
+++ b/src/sagemaker/hyperpod/cli/hyp_cli.py
@@ -4,11 +4,13 @@
 import os
 import subprocess
 from pydantic import BaseModel, ValidationError, Field
-from typing import Optional
+from typing import Optional, Union
 from importlib.metadata import version, PackageNotFoundError
 
 from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \
     get_monitoring
+from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack, describe_cluster_stack, \
+    list_cluster_stacks, update_cluster
 from sagemaker.hyperpod.cli.commands.training import (
     pytorch_create,
     list_jobs,
@@ -17,6 +19,7 @@
     pytorch_list_pods,
     pytorch_get_logs,
     pytorch_get_operator_logs,
+    pytorch_exec,
 )
 from sagemaker.hyperpod.cli.commands.inference import (
     js_create,
@@ -36,7 +39,16 @@
     custom_get_operator_logs,
 )
 
+from sagemaker.hyperpod.cli.commands.init import (
+    init,
+    reset,
+    configure,
+    validate,
+    _default_create
+)
+
 def get_package_version(package_name):
     try:
         return version(package_name)
@@ -58,33 +70,63 @@ def print_version(ctx, param, value):
     click.echo(f"hyperpod-jumpstart-inference-template version: {jumpstart_inference_version}")
     ctx.exit()
 
-@click.group()
+
+@click.group(context_settings={'max_content_width': 200})
 @click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, help='Show version information')
 def cli():
     pass
 
 
 class CLICommand(click.Group):
-    pass
-
-
-@cli.group(cls=CLICommand)
+    def __init__(self, *args, default_cmd: Union[str, None] = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.default_cmd = default_cmd
+
+    def parse_args(self, ctx, args):
+        # Only inject default subcommand when:
+        # - user didn't name a subcommand, and
+        # - user didn't ask for help
+        if self.default_cmd:
+            # any non-flag token that is a known subcommand?
+            has_subcmd = any((not a.startswith("-")) and (a in self.commands) for a in args)
+            asked_for_help = any(a in ("-h", "--help") for a in args)
+            if (not has_subcmd) and (not asked_for_help):
+                args = [self.default_cmd] + args
+        return super().parse_args(ctx, args)
+
+
+@cli.group(cls=CLICommand, default_cmd='_default_create')
 def create():
-    """Create endpoints or pytorch jobs."""
+    """
+    Create endpoints, PyTorch jobs, or cluster stacks.
+
+    When 'hyp create' is run without a subcommand (the init experience), it
+    validates the configuration in the current directory and renders template
+    files for deployment. The generated files in the run directory can then be
+    used for actual deployment to SageMaker HyperPod clusters or CloudFormation
+    stacks.
+
+    Prerequisites for directly calling 'hyp create':
+    - Must be run in a directory initialized with 'hyp init'
+    - config.yaml and the appropriate template file must exist
+    """
     pass
 
 
 @cli.group(cls=CLICommand)
 def list():
-    """List endpoints or pytorch jobs."""
+    """List endpoints, PyTorch jobs, or cluster stacks."""
     pass
 
 
 @cli.group(cls=CLICommand)
 def describe():
-    """Describe endpoints or pytorch jobs."""
+    """Describe endpoints, PyTorch jobs, or cluster stacks."""
    pass
 
+@cli.group(cls=CLICommand)
+def update():
+    """Update an existing HyperPod cluster configuration."""
+    pass
 
 @cli.group(cls=CLICommand)
 def delete():
@@ -116,17 +158,34 @@ def get_operator_logs():
     pass
 
 
+@cli.group(cls=CLICommand)
+def exec():
+    """Execute commands in pods for endpoints or pytorch jobs."""
+    pass
+
+
+cli.add_command(init)
+cli.add_command(reset)
+cli.add_command(configure)
+cli.add_command(validate)
+
 create.add_command(pytorch_create)
 create.add_command(js_create)
 create.add_command(custom_create)
+_default_create.hidden = True
+create.add_command(_default_create)
 
 list.add_command(list_jobs)
 list.add_command(js_list)
 list.add_command(custom_list)
+list.add_command(list_cluster_stacks)
 
 describe.add_command(pytorch_describe)
 describe.add_command(js_describe)
 describe.add_command(custom_describe)
+describe.add_command(describe_cluster_stack)
+
+update.add_command(update_cluster)
 
 delete.add_command(pytorch_delete)
 delete.add_command(js_delete)
@@ -151,7 +210,9 @@ def get_operator_logs():
 cli.add_command(set_cluster_context)
 cli.add_command(get_cluster_context)
 cli.add_command(get_monitoring)
+# cli.add_command(create_cluster_stack)  # Not supported yet
 
+exec.add_command(pytorch_exec)
 
 if __name__ == "__main__":
     cli()
diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py
index db44c77a..5ecf2395 100644
--- a/src/sagemaker/hyperpod/cli/inference_utils.py
+++ b/src/sagemaker/hyperpod/cli/inference_utils.py
@@ -41,17 +41,17 @@ def wrapped_func(*args, **kwargs):
         domain = flat.to_domain()
         return func(name, namespace, version, domain)
 
-    # 2) inject JSON flags only if they exist in the schema
+    # 2) inject JSON flags only when they exist in the schema's properties
     schema = load_schema_for_version(version, schema_pkg)
     props = schema.get("properties", {})
-    
+
     json_flags = {
         "env": ("JSON object of environment variables, e.g. "
                 '\'{"VAR1":"foo","VAR2":"bar"}\''),
         "dimensions": ("JSON object of dimensions, e.g. "
                        '\'{"VAR1":"foo","VAR2":"bar"}\''),
         "resources_limits": ('JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\''),
         "resources_requests": ('JSON object of resource requests, e.g. \'{"cpu":"1","memory":"2Gi"}\''),
     }
-    
+
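+    # Each matching flag wraps the command in another click.option decorator,
+    # so a given schema version only exposes the JSON flags it declares.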
\'{"cpu":"1","memory":"2Gi"}\''), } - + for flag_name, help_text in json_flags.items(): if flag_name in props: wrapped_func = click.option( @@ -99,4 +99,4 @@ def wrapped_func(*args, **kwargs): return wrapped_func - return decorator + return decorator \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/init_utils.py b/src/sagemaker/hyperpod/cli/init_utils.py new file mode 100644 index 00000000..a2dfed5e --- /dev/null +++ b/src/sagemaker/hyperpod/cli/init_utils.py @@ -0,0 +1,949 @@ +import importlib +import json +import logging +import pkgutil +import click +from typing import Callable, Tuple +import os +import yaml +import sys +from pathlib import Path +import functools +from pydantic import ValidationError +from sagemaker.hyperpod.common.utils import ( + region_to_az_ids +) +from typing import List, Any +from sagemaker.hyperpod.cli.constants.init_constants import ( + TEMPLATES, + CRD, + CFN +) +from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack + +log = logging.getLogger() + +def save_template(template: str, directory_path: Path) -> bool: + """ + Save the appropriate k8s template based on the template type. + """ + try: + if TEMPLATES[template]["schema_type"] == CRD: + save_k8s_jinja(directory=str(directory_path), content=TEMPLATES[template]["template"]) + elif TEMPLATES[template]["schema_type"] == CFN: + save_cfn_jinja(directory=str(directory_path), content=TEMPLATES[template]["template"]) + return True + except Exception as e: + click.secho(f"⚠️ Template generation failed: {e}", fg="yellow") + return False + +def save_cfn_jinja(directory: str, content: str): + Path(directory).mkdir(parents=True, exist_ok=True) + path = os.path.join(directory, "cfn_params.jinja") + + with open(path, "w", encoding="utf-8") as f: + f.write(content) + click.secho(f"Cloudformation Parameters Jinja template saved to: {path}") + return path + +def save_k8s_jinja(directory: str, content: str): + Path(directory).mkdir(parents=True, exist_ok=True) + path = os.path.join(directory, "k8s.jinja") + with open(path, "w", encoding="utf-8") as f: + f.write(content) + print(f"K8s Jinja template saved to: {path}") + return path + + +def filter_cli_metadata_fields(config_data: dict) -> dict: + """ + Filter out CLI metadata fields that should not be passed to Pydantic models. + + Args: + config_data: Configuration data dictionary + + Returns: + Filtered dictionary without CLI metadata fields + """ + return { + k: v for k, v in config_data.items() + if k not in ('template', 'version') and v is not None + } + + +def get_latest_version_from_registry(template: str) -> str: + """ + Get the latest version available in the registry for a given template. 
+
+    Args:
+        template: Template name
+
+    Returns:
+        Latest version string (e.g., "1.0", "2.0")
+    """
+    template_info = TEMPLATES.get(template)
+    if not template_info:
+        raise click.ClickException(f"Unknown template: {template}")
+
+    if template_info.get("schema_type") == CFN:
+        # CFN templates don't have versioned registries; return the default
+        return "1.0"
+
+    registry = template_info.get("registry")
+    if not registry:
+        raise click.ClickException(f"No registry found for template: {template}")
+
+    # Get all available versions and return the latest
+    available_versions = list(registry.keys())
+    if not available_versions:
+        raise click.ClickException(f"No versions available in registry for template: {template}")
+
+    # Sort versions to get the latest (assuming semantic versioning)
+    # Convert to tuples for proper version comparison (e.g., "1.0" -> (1, 0))
+    def version_key(v):
+        try:
+            return tuple(map(int, v.split('.')))
+        except ValueError:
+            # Fallback for non-numeric versions
+            return (0, 0)
+
+    latest_version = max(available_versions, key=version_key)
+    return str(latest_version)
+
+
+def get_default_version_for_template(template: str) -> str:
+    """
+    Get the default version for a template (the latest available).
+
+    Args:
+        template: Template name
+
+    Returns:
+        Default version string
+    """
+    # Check that the template exists first
+    if template not in TEMPLATES:
+        raise click.ClickException(f"Unknown template: {template}")
+
+    try:
+        return get_latest_version_from_registry(template)
+    except Exception:
+        raise click.ClickException(f"Could not get the latest version for template: {template}")
+
+
+def load_schema_for_version(version: str, schema_pkg: str) -> dict:
+    ver_pkg = f"{schema_pkg}.v{str(version).replace('.', '_')}"
+    raw = pkgutil.get_data(ver_pkg, "schema.json")
+    if raw is None:
+        raise click.ClickException(f"Could not load schema.json for version {version}")
+    return json.loads(raw)
+
+
+def generate_click_command(
+    *,
+    version_key_arg: str = "version",
+    template_arg_name: str = "template",
+) -> Callable:
+    """
+    Decorator that:
+    - injects a --<property> option for every property in the current template's schema (detected from config.yaml)
+    - only applies to the configure command; returns a minimal no-op decorator otherwise
+    """
+
+    # Only execute the full decorator logic for the configure command
+    is_configure_command = len(sys.argv) > 1 and sys.argv[1] == "configure"
+
+    if not is_configure_command:
+        # Return a minimal decorator that doesn't add any options
+        def decorator(func: Callable) -> Callable:
+            return func
+        return decorator
+
+    config_file = Path(".").resolve() / "config.yaml"
+    if not config_file.is_file():
+        click.secho("❌ No config.yaml found. Run 'hyp init