Skip to content

Commit 015f7e8

Browse files
authored
Breaking: Update sagemaker-training version >=5.0.0 (#260)
* Breaking: Update sagemaker-training version >=5.0.0
* update test dependency
* Add latest version and python version to README
* fix GitHub README shield badges
* fix tests
1 parent 5eaa126 commit 015f7e8

File tree

11 files changed

+64
-47
lines changed

11 files changed

+64
-47
lines changed

README.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
1-
21
==================================
32
SageMaker PyTorch Training Toolkit
43
==================================
54

5+
.. image:: https://img.shields.io/pypi/v/sagemaker-pytorch-training.svg
6+
:target: https://pypi.python.org/pypi/sagemaker-pytorch-training
7+
:alt: Latest Version
8+
9+
.. image:: https://img.shields.io/pypi/pyversions/sagemaker-pytorch-training.svg
10+
:target: https://pypi.python.org/pypi/sagemaker-pytorch-training
11+
:alt: Supported Python Versions
12+
13+
614
SageMaker PyTorch Training Toolkit is an open-source library for using PyTorch to train models on Amazon SageMaker.
715

816
This toolkit depends on and extends the base `SageMaker Training Toolkit <https://github.com/aws/sagemaker-training-toolkit>`__ with PyTorch specific support.

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.9.1.dev0
1+
3.0.0

setup.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def read(fname):
2525

2626

2727
test_dependencies = ['boto3', 'coverage==6.5.0', 'flake8', 'future', 'mock', 'pytest', 'pytest-cov',
28-
'pytest-xdist', 'sagemaker[local]<2', 'torch', 'torchvision', 'tox']
28+
'pytest-xdist', 'sagemaker[local]', 'torch', 'torchvision', 'tox']
2929

3030
setup(
3131
name='sagemaker_pytorch_training',
@@ -48,12 +48,11 @@ def read(fname):
4848
"Natural Language :: English",
4949
"License :: OSI Approved :: Apache Software License",
5050
"Programming Language :: Python",
51-
'Programming Language :: Python :: 3.7',
5251
'Programming Language :: Python :: 3.8',
5352
'Programming Language :: Python :: 3.9',
5453
],
5554

56-
install_requires=['retrying', 'sagemaker-training>=4.3.0,<=4.8.3', 'six>=1.12.0'],
55+
install_requires=['retrying', 'sagemaker-training>=5.0.0,<6.0.0', 'six>=1.12.0'],
5756
extras_require={
5857
'test': test_dependencies
5958
},

test/integration/local/test_distributed_training.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ def fixture_dist_gpu_backend(request):
3535
def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
3636
estimator = PyTorch(entry_point=dist_operations_path,
3737
role=ROLE,
38-
image_name=image_uri,
39-
train_instance_count=2,
40-
train_instance_type='local',
38+
image_uri=image_uri,
39+
instance_count=2,
40+
instance_type='local',
4141
sagemaker_session=sagemaker_local_session,
4242
hyperparameters={'backend': dist_cpu_backend},
4343
output_path='file://{}'.format(tmpdir))
@@ -49,9 +49,9 @@ def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_s
4949
def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdir):
5050
estimator = PyTorch(entry_point=dist_operations_path,
5151
role=ROLE,
52-
image_name=image_uri,
53-
train_instance_count=1,
54-
train_instance_type='local_gpu',
52+
image_uri=image_uri,
53+
instance_count=1,
54+
instance_type='local_gpu',
5555
sagemaker_session=sagemaker_local_session,
5656
hyperparameters={'backend': 'nccl'},
5757
output_path='file://{}'.format(tmpdir))
@@ -63,9 +63,9 @@ def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdi
6363
def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
6464
estimator = PyTorch(entry_point=mnist_script,
6565
role=ROLE,
66-
image_name=image_uri,
67-
train_instance_count=2,
68-
train_instance_type='local',
66+
image_uri=image_uri,
67+
instance_count=2,
68+
instance_type='local',
6969
sagemaker_session=sagemaker_local_session,
7070
hyperparameters={'backend': 'nccl'},
7171
output_path='file://{}'.format(tmpdir))
@@ -81,9 +81,9 @@ def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir):
8181
def test_mnist_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir):
8282
estimator = PyTorch(entry_point=mnist_script,
8383
role=ROLE,
84-
image_name=image_uri,
85-
train_instance_count=2,
86-
train_instance_type='local',
84+
image_uri=image_uri,
85+
instance_count=2,
86+
instance_type='local',
8787
sagemaker_session=sagemaker_local_session,
8888
hyperparameters={'backend': dist_cpu_backend},
8989
output_path='file://{}'.format(tmpdir))

test/integration/local/test_horovod.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ def test_horovod_simple(sagemaker_local_session, image_uri, framework_version, t
3131
estimator = PyTorch(
3232
entry_point=os.path.join(resources_path, 'horovod', 'simple.py'),
3333
role='SageMakerRole',
34-
train_instance_type="local_gpu",
34+
instance_type="local_gpu",
3535
sagemaker_session=sagemaker_local_session,
36-
train_instance_count=instances,
37-
image_name=image_uri,
36+
instance_count=instances,
37+
image_uri=image_uri,
3838
output_path=output_path,
3939
framework_version=framework_version,
4040
hyperparameters={'sagemaker_mpi_enabled': True,
@@ -66,10 +66,10 @@ def test_horovod_training(sagemaker_local_session, image_uri, framework_version,
6666
estimator = PyTorch(
6767
entry_point=os.path.join(resources_path, 'horovod', 'train.py'),
6868
role='SageMakerRole',
69-
train_instance_type="local_gpu",
69+
instance_type="local_gpu",
7070
sagemaker_session=sagemaker_local_session,
71-
train_instance_count=1,
72-
image_name=image_uri,
71+
instance_count=1,
72+
image_uri=image_uri,
7373
framework_version=framework_version,
7474
hyperparameters={'sagemaker_mpi_enabled': True,
7575
'sagemaker_mpi_num_of_processes_per_host': 2,

test/integration/local/test_requirements.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ def test_requirements_file(image_uri, instance_type, sagemaker_local_session, tm
2323
entry_point=requirements_script,
2424
source_dir=requirements_dir,
2525
role=ROLE,
26-
image_name=image_uri,
27-
train_instance_count=1,
28-
train_instance_type=instance_type,
26+
image_uri=image_uri,
27+
instance_count=1,
28+
instance_type=instance_type,
2929
sagemaker_session=sagemaker_local_session,
3030
output_path='file://{}'.format(tmpdir)
3131
)

test/integration/local/test_single_machine_training.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
def test_mnist(image_uri, processor, instance_type, sagemaker_local_session, tmpdir):
2424
estimator = PyTorch(entry_point=mnist_script,
2525
role=ROLE,
26-
image_name=image_uri,
27-
train_instance_count=1,
28-
train_instance_type=instance_type,
26+
image_uri=image_uri,
27+
instance_count=1,
28+
instance_type=instance_type,
2929
sagemaker_session=sagemaker_local_session,
3030
hyperparameters={'processor': processor},
3131
output_path='file://{}'.format(tmpdir))

test/integration/sagemaker/test_distributed_operations.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pytest
1919
from sagemaker import utils
2020
from sagemaker.pytorch import PyTorch
21+
from sagemaker.local import LocalSession
2122
from six.moves.urllib.parse import urlparse
2223

2324
from integration import data_dir, dist_operations_path, mnist_script, DEFAULT_TIMEOUT
@@ -31,13 +32,17 @@
3132
@pytest.mark.skip_test_in_region
3233
def test_dist_operations_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
3334
instance_type = instance_type or 'ml.c4.xlarge'
35+
if "local" in instance_type:
36+
sagemaker_session = LocalSession()
3437
_test_dist_operations(sagemaker_session, image_uri, instance_type, dist_cpu_backend)
3538

3639

3740
@pytest.mark.skip_cpu
3841
@pytest.mark.deploy_test
3942
def test_dist_operations_gpu(sagemaker_session, instance_type, image_uri, dist_gpu_backend):
4043
instance_type = instance_type or 'ml.p2.xlarge'
44+
if "local" in instance_type:
45+
sagemaker_session = LocalSession()
4146
_test_dist_operations(sagemaker_session, image_uri, instance_type, dist_gpu_backend)
4247

4348

@@ -52,9 +57,9 @@ def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend):
5257
with timeout(minutes=DEFAULT_TIMEOUT):
5358
pytorch = PyTorch(entry_point=mnist_script,
5459
role='SageMakerRole',
55-
train_instance_count=2,
56-
image_name=image_uri,
57-
train_instance_type=MULTI_GPU_INSTANCE,
60+
instance_count=2,
61+
image_uri=image_uri,
62+
instance_type=MULTI_GPU_INSTANCE,
5863
sagemaker_session=sagemaker_session,
5964
debugger_hook_config=False,
6065
hyperparameters={'backend': dist_gpu_backend})
@@ -70,10 +75,10 @@ def _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_back
7075
with timeout(minutes=DEFAULT_TIMEOUT):
7176
pytorch = PyTorch(entry_point=dist_operations_path,
7277
role='SageMakerRole',
73-
train_instance_count=train_instance_count,
74-
train_instance_type=instance_type,
78+
instance_count=train_instance_count,
79+
instance_type=instance_type,
7580
sagemaker_session=sagemaker_session,
76-
image_name=image_uri,
81+
image_uri=image_uri,
7782
debugger_hook_config=False,
7883
hyperparameters={'backend': dist_backend})
7984

test/integration/sagemaker/test_horovod.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ def test_horovod_simple(
4444
estimator = PyTorch(
4545
entry_point=os.path.join(resources_path, "horovod", "simple.py"),
4646
role="SageMakerRole",
47-
train_instance_type=train_instance_type,
47+
instance_type=train_instance_type,
4848
sagemaker_session=sagemaker_session,
49-
train_instance_count=instances,
50-
image_name=image_uri,
49+
instance_count=instances,
50+
image_uri=image_uri,
5151
output_path=output_path,
5252
framework_version=framework_version,
5353
hyperparameters={
@@ -100,10 +100,10 @@ def test_horovod_training(
100100
estimator = PyTorch(
101101
entry_point=os.path.join(resources_path, "horovod", "train.py"),
102102
role="SageMakerRole",
103-
train_instance_type=train_instance_type,
103+
instance_type=train_instance_type,
104104
sagemaker_session=sagemaker_session,
105-
train_instance_count=instances,
106-
image_name=image_uri,
105+
instance_count=instances,
106+
image_uri=image_uri,
107107
framework_version=framework_version,
108108
hyperparameters={
109109
"sagemaker_mpi_enabled": True,

test/integration/sagemaker/test_mnist.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pytest
1616
from sagemaker import utils
1717
from sagemaker.pytorch import PyTorch
18+
from sagemaker.local import LocalSession
1819

1920
from integration import training_dir, mnist_script, DEFAULT_TIMEOUT
2021
from integration.sagemaker.timeout import timeout
@@ -23,23 +24,27 @@
2324
@pytest.mark.skip_gpu
2425
def test_mnist_distributed_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend):
2526
instance_type = instance_type or 'ml.c4.xlarge'
27+
if "local" in instance_type:
28+
sagemaker_session = LocalSession()
2629
_test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_cpu_backend)
2730

2831

2932
@pytest.mark.skip_cpu
3033
def test_mnist_distributed_gpu(sagemaker_session, image_uri, instance_type, dist_gpu_backend):
3134
instance_type = instance_type or 'ml.p2.xlarge'
35+
if "local" in instance_type:
36+
sagemaker_session = LocalSession()
3237
_test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_gpu_backend)
3338

3439

3540
def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_backend):
3641
with timeout(minutes=DEFAULT_TIMEOUT):
3742
pytorch = PyTorch(entry_point=mnist_script,
3843
role='SageMakerRole',
39-
train_instance_count=2,
40-
train_instance_type=instance_type,
44+
instance_count=2,
45+
instance_type=instance_type,
4146
sagemaker_session=sagemaker_session,
42-
image_name=image_uri,
47+
image_uri=image_uri,
4348
debugger_hook_config=False,
4449
hyperparameters={'backend': dist_backend, 'epochs': 2})
4550
training_input = pytorch.sagemaker_session.upload_data(path=training_dir,

0 commit comments

Comments
 (0)