Skip to content

Commit e627229

Browse files
Satish615jam-jee
authored and committed
Add integration tests for gpu quota allocation feature (#184)
* add integration tests for GPU quota allocation feature * add ValueError assertions for invalid test cases
1 parent eb3c1d0 commit e627229

File tree

1 file changed

+278
-0
lines changed

1 file changed

+278
-0
lines changed
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
import pytest
import time
import json
import subprocess

from sagemaker.hyperpod.cli.utils import setup_logger
from test.integration_tests.utils import execute_command

logger = setup_logger(__name__)

# Namespace and Kueue local queue that every test in this module targets.
NAMESPACE = "hyperpod-ns-team1"
QUEUE = "hyperpod-ns-team1-localqueue"
class TestGpuQuotaAllocationIntegration:
    """Integration tests for GPU quota allocation related CLI commands.

    Positive tests create a ``hyp-pytorch-job`` with a combination of the
    quota flags (``--accelerators``/``--vcpu``/``--memory`` and their
    ``*-limit`` variants), verify the resolved pod resource Requests/Limits
    via ``hyp describe``, then delete the job (even when a describe
    assertion fails, so test jobs don't leak in the cluster). Negative
    tests assert the CLI rejects invalid flag combinations with a
    ValueError.
    """

    # Error text shared by the two "node-count vs. quota flags" negative tests.
    _EITHER_NODE_COUNT_ERROR = [
        "ValueError: Either node-count or a combination of accelerators, vcpu, ",
        "memory-in-gib must be specified for instance-type ml.g5.8xlarge",
    ]

    def _build_create_cmd(self, job_name, quota_args):
        """Assemble a ``hyp create hyp-pytorch-job`` command.

        quota_args: per-test quota/instance-type flags, inserted between the
        shared flags and the queue/namespace suffix.
        """
        return [
            "hyp", "create", "hyp-pytorch-job",
            "--version", "1.1",
            "--job-name", job_name,
            "--image", "pytorch:latest",
            "--pull-policy", "IfNotPresent",
            "--tasks-per-node", "1",
            *quota_args,
            "--queue-name", QUEUE,
            "--namespace", NAMESPACE,
        ]

    def _create_job(self, job_name, quota_args):
        """Create the job and assert the CLI accepted it."""
        result = execute_command(self._build_create_cmd(job_name, quota_args))
        assert result.returncode == 0
        assert "Using version: 1.1" in result.stdout
        logger.info(f"Successfully created job with required gpu quota parameters: {job_name}")

    def _describe_job(self, job_name):
        """Run ``hyp describe`` for the job; assert success and return the result."""
        result = execute_command([
            "hyp", "describe", "hyp-pytorch-job",
            "--job-name", job_name,
            "--namespace", NAMESPACE,
        ])
        logger.info(f"describe result: {result}")
        assert result.returncode == 0
        return result

    def _delete_job(self, job_name):
        """Delete the job and assert the CLI accepted it."""
        result = execute_command([
            "hyp", "delete", "hyp-pytorch-job",
            "--job-name", job_name,
            "--namespace", NAMESPACE,
        ])
        assert result.returncode == 0
        logger.info(f"Successfully deleted job: {job_name}")

    def _assert_create_fails(self, job_name, quota_args, expected_fragments):
        """Run a create command expected to fail and assert on its error text.

        Uses subprocess.run directly (not execute_command) because the
        command is expected to exit non-zero.
        """
        result = subprocess.run(
            self._build_create_cmd(job_name, quota_args),
            capture_output=True,
            text=True,
        )
        assert result.returncode != 0
        for fragment in expected_fragments:
            assert fragment in result.stdout
        return result

    def test_create_job_with_integer_quota_parameters(self, test_job_name):
        """Integer --vcpu/--memory values resolve to whole-cpu and Gi quantities."""
        self._create_job(test_job_name, [
            "--accelerators", "1",
            "--instance-type", "ml.g5.8xlarge",
            "--vcpu", "3",
            "--memory", "1",
            "--accelerators-limit", "1",
            "--vcpu-limit", "4",
            "--memory-limit", "2",
        ])
        try:
            result = self._describe_job(test_job_name)
            assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1'}" in result.stdout
            assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
        finally:
            self._delete_job(test_job_name)

    def test_create_job_with_float_quota_parameters(self, test_job_name):
        """Float --vcpu/--memory values resolve to milli-cpu / milli-byte quantities."""
        self._create_job(test_job_name, [
            "--accelerators", "1",
            "--instance-type", "ml.g5.8xlarge",
            "--vcpu", "3.6",
            "--memory", "1",
            "--accelerators-limit", "1",
            "--vcpu-limit", "4.8",
            "--memory-limit", "2.7",
        ])
        try:
            result = self._describe_job(test_job_name)
            assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
            assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
        finally:
            self._delete_job(test_job_name)

    def test_create_job_with_only_accelerators_parameter(self, test_job_name):
        """Omitted --vcpu/--memory requests fall back to instance-type defaults."""
        self._create_job(test_job_name, [
            "--accelerators", "1",
            "--instance-type", "ml.g5.8xlarge",
            "--accelerators-limit", "1",
        ])
        try:
            result = self._describe_job(test_job_name)
            assert " Limits: {'nvidia.com/gpu': '1'}" in result.stdout
            assert " Requests: {'cpu': '32', 'memory': '128Gi', 'nvidia.com/gpu': '1'}" in result.stdout
        finally:
            self._delete_job(test_job_name)

    def test_create_job_with_accelerators_memory_parameters(self, test_job_name):
        """--accelerators plus --memory only: cpu request falls back to the default."""
        self._create_job(test_job_name, [
            "--accelerators", "1",
            "--memory", "1.9",
            "--instance-type", "ml.g5.8xlarge",
            "--accelerators-limit", "1",
            "--memory-limit", "2.7",
        ])
        try:
            result = self._describe_job(test_job_name)
            assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
            assert " Requests: {'cpu': '32', 'memory': '2040109465600m', 'nvidia.com/gpu': '1'}" in result.stdout
        finally:
            self._delete_job(test_job_name)

    def test_invalid_node_count_accelerators_parameter(self, test_job_name):
        """Supplying both --node-count and the quota parameters is rejected."""
        self._assert_create_fails(test_job_name, [
            "--accelerators", "1",
            "--instance-type", "ml.g5.8xlarge",
            "--vcpu", "3",
            "--memory", "1",
            "--accelerators-limit", "1",
            "--vcpu-limit", "4",
            "--memory-limit", "2",
            "--node-count", "1",
        ], self._EITHER_NODE_COUNT_ERROR)

    def test_invalid_no_node_count_or_quota_parameter(self, test_job_name):
        """Supplying neither --node-count nor any quota parameter is rejected."""
        self._assert_create_fails(test_job_name, [
            "--instance-type", "ml.g5.8xlarge",
        ], self._EITHER_NODE_COUNT_ERROR)

    def test_invalid_instance_type_parameter(self, test_job_name):
        """An unknown --instance-type value is rejected."""
        self._assert_create_fails(test_job_name, [
            "--accelerators", "1",
            "--instance-type", "ml.n5.8xlarge",
            "--vcpu", "3",
            "--memory", "1",
            "--accelerators-limit", "1",
            "--vcpu-limit", "4",
            "--memory-limit", "2",
            "--node-count", "1",
        ], ["ValueError: Invalid instance-type ml.n5.8xlarge"])
        logger.info("Successfully verified invalid instance type error")

0 commit comments

Comments
 (0)