Skip to content

Commit 363716a

Browse files
committed
A3 High: adopt startup-script solution for network delay
1 parent 45f6501 commit 363716a

File tree

1 file changed

+1
-25
lines changed

1 file changed

+1
-25
lines changed

examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml

Lines changed: 1 addition & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ deployment_groups:
5050
docker:
5151
enabled: true
5252
world_writable: true
53+
enable_gpu_network_wait_online: true
5354
configure_ssh_host_patterns:
5455
- 10.0.0.*
5556
- 10.1.0.*
@@ -116,31 +117,6 @@ deployment_groups:
116117
* - nofile 1048576
117118
* - cpu unlimited
118119
* - rtprio unlimited
119-
- type: data
120-
destination: /etc/systemd/system/delay-a3.service
121-
content: |
122-
[Unit]
123-
Description=Delay A3 boot until all network interfaces are routable
124-
After=network-online.target
125-
Wants=network-online.target
126-
Before=google-startup-scripts.service
127-
128-
[Service]
129-
ExecCondition=/bin/bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-highgpu-8g$"'
130-
ExecStart=/usr/lib/systemd/systemd-networkd-wait-online -i enp6s0 -i enp12s0 -i enp134s0 -i enp140s0 -o routable --timeout=120
131-
ExecStartPost=/bin/sleep 10
132-
133-
[Install]
134-
WantedBy=multi-user.target
135-
- type: shell
136-
destination: enable_delay_a3.sh
137-
content: |
138-
#!/bin/bash
139-
set -e -o pipefail
140-
# workaround b/309016676 (systemd-resolved restarts 4 times causing DNS
141-
# resolution failures during google-startup-scripts.service)
142-
systemctl daemon-reload
143-
systemctl enable delay-a3.service
144120
- type: data
145121
destination: /etc/enroot/enroot.conf
146122
content: |

0 commit comments

Comments
 (0)