Skip to content

Commit eb3c1d0

Browse files
pintaoz-aws and pintaoz
authored and committed
Add integ tests for topology annotations (#180)
* Add labels to top level metadata v1.1
* Move topology labels to annotations
* Update topology parameter names
* Add unit test
* Topology integ tests
* Add invalid test case
* Add empty test case

---------

Co-authored-by: pintaoz <[email protected]>
1 parent 5f23ff0 commit eb3c1d0

File tree

3 files changed

+131
-3
lines changed

3 files changed

+131
-3
lines changed

src/sagemaker/hyperpod/cli/commands/training.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ def pytorch_describe(job_name: str, namespace: str):
160160
click.echo("=" * 80)
161161
click.echo(f"Name: {job.metadata.name}")
162162
click.echo(f"Namespace: {job.metadata.namespace}")
163+
click.echo(f"Labels: {job.metadata.labels}")
164+
click.echo(f"Annotations: {job.metadata.annotations}")
163165

164166
# Print Spec details
165167
click.echo("\nSpec:")

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,11 +235,9 @@ def get_logs_from_pod(self, pod_name: str, container: Optional[str] = None) -> s
235235

236236

237237
def _load_hp_job(response: dict) -> HyperPodPytorchJob:
238-
name = response["metadata"]["name"]
239-
namespace = response["metadata"]["namespace"]
240238

241239
spec = _HyperPodPytorchJob.model_validate(response["spec"], by_name=True)
242-
metadata = Metadata(name=name, namespace=namespace)
240+
metadata = Metadata(**response["metadata"])
243241

244242
if "status" in response:
245243
status = HyperPodPytorchJobStatus.model_validate(
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import pytest
2+
import time
3+
import json
4+
5+
from sagemaker.hyperpod.cli.utils import setup_logger
6+
from test.integration_tests.utils import execute_command
7+
8+
logger = setup_logger(__name__)
9+
10+
# Namespace passed to `--namespace` for the create/describe/delete commands below.
NAMESPACE = "hyperpod-ns-team1"
11+
# Queue name passed to `--queue-name` when creating jobs.
QUEUE = "hyperpod-ns-team1-localqueue"
12+
# Topology value passed to `--required-topology` / `--preferred-topology` and
# expected back in the annotations printed by `hyp describe`.
TOPOLOGY = "topology.k8s.aws/network-node-layer-1"
13+
14+
class TestTopologyIntegration:
    """Integration tests for topology-related CLI commands.

    Each test drives the ``hyp`` CLI through ``execute_command``. The positive
    tests create a job with a topology flag, check that ``hyp describe`` prints
    the matching annotation, and always delete the job afterwards. The negative
    tests check that job creation is rejected.
    """

    @staticmethod
    def _job_command(action, job_name):
        """Build a ``hyp <action> hyp-pytorch-job`` command scoped to NAMESPACE."""
        return [
            "hyp", action, "hyp-pytorch-job",
            "--job-name", job_name,
            "--namespace", NAMESPACE,
        ]

    def _check_topology_roundtrip(self, job_name, kind):
        """Create a job with ``--<kind>-topology`` and verify its annotation.

        Args:
            job_name: Name of the job to create, describe and delete.
            kind: Either ``"required"`` or ``"preferred"``; selects both the CLI
                flag and the annotation key expected in the describe output.
        """
        create_cmd = [
            "hyp", "create", "hyp-pytorch-job",
            "--version", "1.1",
            "--job-name", job_name,
            "--image", "pytorch:latest",
            "--pull-policy", "IfNotPresent",
            "--tasks-per-node", "1",
            "--queue-name", QUEUE,
            "--namespace", NAMESPACE,
            f"--{kind}-topology", TOPOLOGY,
        ]

        result = execute_command(create_cmd)
        assert result.returncode == 0

        # From here on the job exists, so it must be deleted even when one of
        # the assertions below fails; otherwise it leaks and can collide with
        # later runs.
        try:
            assert "Using version: 1.1" in result.stdout
            logger.info(f"Successfully created job with {kind} topology: {job_name}")

            result = execute_command(self._job_command("describe", job_name))
            assert result.returncode == 0
            expected = (
                f"Annotations: {{'kueue.x-k8s.io/podset-{kind}-topology': '{TOPOLOGY}'}}"
            )
            assert expected in result.stdout
        finally:
            result = execute_command(self._job_command("delete", job_name))
            assert result.returncode == 0
            logger.info(f"Successfully deleted job: {job_name}")

    @staticmethod
    def _assert_create_rejected(create_cmd):
        """Assert that running ``create_cmd`` fails.

        Failure is accepted in either form: ``execute_command`` raising a
        ``RuntimeError`` carrying the expected message, or returning a result
        with a non-zero return code. A successful command fails the test, so
        the check can no longer pass vacuously.
        """
        try:
            result = execute_command(create_cmd)
        except RuntimeError as e:
            assert "Failed to execute command: hyp create hyp-pytorch-job" in str(e)
        else:
            assert result.returncode != 0, (
                "Job creation unexpectedly succeeded for command: "
                + " ".join(create_cmd)
            )

    def test_create_job_with_required_topology(self, test_job_name):
        """Test creating a job with --required-topology parameter"""
        self._check_topology_roundtrip(test_job_name, "required")

    def test_create_job_with_preferred_topology(self, test_job_name):
        """Test creating a job with --preferred-topology parameter"""
        self._check_topology_roundtrip(test_job_name, "preferred")

    def test_invalid_topology_parameter(self, test_job_name):
        """Test that invalid topology parameters are handled correctly"""
        create_cmd = [
            "hyp", "create", "hyp-pytorch-job",
            "--version", "1.1",
            "--job-name", test_job_name,
            "--image", "pytorch:latest",
            "--required-topology",
            "topology.k8s.aws/network-node-layer-6",  # invalid topology annotation
        ]
        self._assert_create_rejected(create_cmd)

    def test_empty_topology_parameter(self, test_job_name):
        """Test that a topology flag given without a value is handled correctly"""
        create_cmd = [
            "hyp", "create", "hyp-pytorch-job",
            "--version", "1.1",
            "--job-name", test_job_name,
            "--image", "pytorch:latest",
            "--preferred-topology",  # empty topology annotation
        ]
        self._assert_create_rejected(create_cmd)

0 commit comments

Comments
 (0)