Skip to content

Commit 3756cbf

Browse files
Merge branch 'develop' into develop
2 parents 6817bcd + 41bbe1f commit 3756cbf

File tree

4 files changed

+16
-16
lines changed

4 files changed

+16
-16
lines changed

CHANGELOG.md

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@ CHANGELOG
99
- Add support for adding and removing shared storages at cluster update by updating `SharedStorage` configuration.
1010
- Add new configuration parameter `DeletionPolicy` for EFS and FSx for Lustre shared storage
1111
to support storage retention on deletion.
12-
- Add support for AWS Trainium instances.
1312
- Add support for Slurm Accounting.
1413
- Add support for on-demand capacity reservations.
1514
- Add support for specifying the supported IMDS version in cluster and build image configurations via the `Imds/ImdsSettings` property.

tests/integration-tests/tests/trainium/test_trainium.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,12 @@ def test_trainium(
2929
bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
3030
bucket.upload_file(str(test_datadir / "neuron-installation.sh"), "neuron-installation.sh")
3131

32-
# FIXME remove suppress_validators after GA
3332
cluster_config = pcluster_config_reader(bucket_name=bucket_name)
34-
cluster = clusters_factory(cluster_config, suppress_validators="type:InstanceTypeBaseAMICompatibleValidator")
33+
cluster = clusters_factory(cluster_config)
3534
remote_command_executor = RemoteCommandExecutor(cluster)
3635
scheduler_commands = scheduler_commands_factory(remote_command_executor)
3736

38-
# TODO uncomment allreduce test
37+
# TODO uncomment allreduce test once the bug in the collective library is fixed
3938
# _test_allreduce_single_node(test_datadir, remote_command_executor, scheduler_commands)
4039
_test_ccl_two_nodes(test_datadir, remote_command_executor, scheduler_commands)
4140

tests/integration-tests/tests/trainium/test_trainium/test_trainium/neuron-allreduce.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ import os
2424
import torch_xla.core.xla_model as xm
2525
import torch
2626
import torch.distributed as dist
27-
from torch_xla.neuron.distributed import xla_backend
27+
from torch_xla.distributed import xla_backend
2828
2929
#os.environ["NEURON_RT_LOG_LEVEL"] = "INFO"
3030
#os.environ["NEURON_RT_LOG_LOCATION"] = "syslog"

tests/integration-tests/tests/trainium/test_trainium/test_trainium/neuron-installation.sh

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -33,9 +33,13 @@ TEMPORARY_ARTIFACTS_BUCKET_PATH=s3://aws-parallelcluster-beta/neuron/
3333

3434
_ubuntu_installation() {
3535
# Configure Linux for Neuron repository updates
36-
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
36+
sudo tee /etc/apt/sources.list.d/neuron-private.list > /dev/null <<EOF
3737
deb https://${REPO_USER}:${REPO_SECRET}@apt.${REPO_SUFFIX} focal main
3838
EOF
39+
40+
sudo apt-get update -y
41+
sudo apt-get install -y aws-neuronx-runtime-lib=2.* aws-neuronx-collectives=2.*
42+
3943
wget -qO - https://${REPO_USER}:${REPO_SECRET}@apt.${REPO_SUFFIX}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
4044

4145
# Install packages from S3 --> FIXME they should be installed from configured repository
@@ -61,13 +65,14 @@ EOF
6165
}
6266

6367
_rhel_installation() {
64-
# Install dkms driver. This is not required, installation is performed at AMI creation time
65-
sudo tee /etc/yum.repos.d/neuron.repo > /dev/null <<EOF
66-
[neuron]
68+
sudo tee /etc/yum.repos.d/neuron-private.repo > /dev/null <<EOF
69+
[neuron-private]
6770
name=Neuron YUM Repository
6871
baseurl=https://${REPO_USER}:${REPO_SECRET}@yum.${REPO_SUFFIX}
6972
enabled=1
7073
EOF
74+
sudo yum install -y aws-neuronx-runtime-lib-2.* aws-neuronx-collectives-2.*
75+
7176
sudo rpm --import https://${REPO_USER}:${REPO_SECRET}@yum.${REPO_SUFFIX}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
7277

7378
# Install packages from S3 --> FIXME they should be installed from configured repository
@@ -79,7 +84,6 @@ EOF
7984

8085

8186
_dkms_ubuntu_installation() {
82-
# Install dkms driver. This is not required, installation is performed at AMI creation time
8387
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
8488
deb https://apt.repos.neuron.amazonaws.com focal main
8589
EOF
@@ -91,7 +95,6 @@ EOF
9195

9296

9397
_dkms_rhel_installation() {
94-
# Install dkms driver. This is not required, installation is performed at AMI creation time
9598
sudo tee /etc/yum.repos.d/neuron.repo > /dev/null <<EOF
9699
[neuron]
97100
name=Neuron YUM Repository
@@ -111,12 +114,12 @@ function main() {
111114
local OS="$(grep "^ID=" /etc/os-release | cut -d"=" -f 2 | xargs)"
112115
case ${OS} in
113116
ubuntu)
114-
_dkms_ubuntu_installation # not needed, installed at AMI creation time
117+
_dkms_ubuntu_installation
115118
_ubuntu_installation
116119
USER=ubuntu
117120
;;
118121
amzn)
119-
_dkms_rhel_installation # not needed, installed at AMI creation time
122+
_dkms_rhel_installation
120123
_rhel_installation
121124
USER=ec2-user
122125
;;
@@ -131,9 +134,8 @@ function main() {
131134
pip3 install -U pip
132135
pip3 install pytest
133136

134-
# Install packages from beta repo --> FIXME they should be installed from official PyPI
135-
python3 -m pip config set global.extra-index-url "https://${REPO_USER}:${REPO_SECRET}@pip.${REPO_SUFFIX}"
136-
PIPS=$(aws secretsmanager get-secret-value --secret-id arn:aws:secretsmanager:us-east-1:447714826191:secret:TrainiumPreviewRepository --region us-east-1 --query 'SecretString' --output text | jq -r '.pips')
137+
python3 -m pip config set global.extra-index-url "https://pip.repos.neuron.amazonaws.com"
138+
PIPS='torch-neuronx==1.11.0.1.* neuronx-cc==2.* transformers'
137139
pip3 install ${PIPS}
138140
}
139141

0 commit comments

Comments
 (0)