Skip to content

Commit 15c7a1a

Browse files
mx26pol, Marta Aleszewicz
authored and committed
Changed neuron key to neurondevice. (#177)
Co-authored-by: Marta Aleszewicz <[email protected]>
1 parent c2f44f5 commit 15c7a1a

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/quota_allocation_util.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -268,7 +268,7 @@ def _get_accelerator_type_and_count(instance_type: str) -> Tuple[Optional[str],
268268

269269
# Determine the appropriate key based on instance type
270270
if trainium_count > 0:
271-
accelerator_key = "aws.amazon.com/neuron"
271+
accelerator_key = "aws.amazon.com/neurondevice"
272272
instance_accelerator_count = trainium_count
273273
elif gpu_count > 0:
274274
accelerator_key = "nvidia.com/gpu"

test/unit_tests/cli/test_quota_allocation_util.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,9 @@ def test_has_gpu_quota_allocation_resources(self, memory_in_gib, vcpu, accelerat
6464
("ml.g5.12xlarge", "nvidia.com/gpu", 4),
6565
("ml.g6.48xlarge", "nvidia.com/gpu", 8),
6666
# Trainium instances
67-
("ml.trn1.32xlarge", "aws.amazon.com/neuron", 16),
68-
("ml.trn1n.32xlarge", "aws.amazon.com/neuron", 16),
69-
("ml.trn2.48xlarge", "aws.amazon.com/neuron", 16),
67+
("ml.trn1.32xlarge", "aws.amazon.com/neurondevice", 16),
68+
("ml.trn1n.32xlarge", "aws.amazon.com/neurondevice", 16),
69+
("ml.trn2.48xlarge", "aws.amazon.com/neurondevice", 16),
7070
# CPU-only instances
7171
("ml.c5.large", None, 0),
7272
("ml.m5.xlarge", None, 0),
@@ -110,7 +110,7 @@ def test_get_resources_from_compute_quotas_trainium_instance(self):
110110
result = _get_resources_from_compute_quotas("ml.trn1.32xlarge", None, None, 8)
111111
# ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GB memory
112112
# 8 trainium is half, so we should get half of CPU and memory
113-
assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neuron": 8}
113+
assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8}
114114

115115
def test_get_resources_from_compute_quotas_cpu_only_instance(self):
116116
result = _get_resources_from_compute_quotas("ml.c5.large", 1.0, 2.0, 1)
@@ -137,8 +137,8 @@ def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self):
137137
("ml.g5.xlarge", 1, {"cpu": "4", "memory": "16Gi", "nvidia.com/gpu": 1}),
138138
("ml.g5.xlarge", 3, {"cpu": "12", "memory": "48Gi", "nvidia.com/gpu": 3}),
139139
# Trainium instances
140-
("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neuron": 16}),
141-
("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neuron": 32}),
140+
("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16}),
141+
("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32}),
142142
# CPU-only instances
143143
("ml.c5.large", 1, {"cpu": "2", "memory": "4Gi"}),
144144
("ml.c5.large", 5, {"cpu": "10", "memory": "20Gi"}),
@@ -177,7 +177,7 @@ def test_get_limits_zero_values(self):
177177

178178
def test_get_limits_trainium_instance(self):
179179
result = _get_limits("ml.trn1.32xlarge", 8.0, 32.0, 4)
180-
assert result == {"cpu": "8.0", "memory": "32.0Gi", "aws.amazon.com/neuron": 4}
180+
assert result == {"cpu": "8.0", "memory": "32.0Gi", "aws.amazon.com/neurondevice": 4}
181181

182182
def test_get_limits_cpu_only_instance(self):
183183
result = _get_limits("ml.c5.large", 2.0, 8.0, 1)

0 commit comments

Comments
 (0)