-
Notifications
You must be signed in to change notification settings - Fork 333
/
Copy pathnvidia-mig.yml
53 lines (47 loc) · 1.82 KB
/
nvidia-mig.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
---
# Install the NVIDIA MIG Manager tooling on all MIG-capable nodes
# Copy the custom MIG config over to all nodes and apply it
# Cluster-wide config is set in group_vars/config.yml by mig_manager_profile
# Per-node config can be configured in the inventory file
# Check for MIG capabilities and software on nodes
- hosts: all
  vars:
    # Fallback number of seconds to wait for a rebooted node to accept
    # connections again. A `reboot_timeout` defined elsewhere (inventory,
    # group_vars, extra vars) takes precedence over this value.
    mig_manager_reboot_timeout: 900
  tasks:
    # Probe the current MIG mode of every GPU. grep exits 0 only when at
    # least one output line does not contain 'N/A'; `has_mig.rc` then gates
    # every later task. `failed_when: false` keeps hosts without MIG GPUs
    # (or without nvidia-smi) in the play. pipefail makes a failing
    # nvidia-smi produce a non-zero rc even if it wrote a line to stdout.
    - name: check for MIG capable devices
      shell: |
        set -o pipefail
        nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep -v 'N/A'
      args:
        executable: /bin/bash
      register: has_mig
      failed_when: false
      # Read-only query: never report the host as changed.
      changed_when: false

    - name: Install MIG Manager
      include_role:
        name: nvidia-mig-manager
      when:
        - has_mig.rc == 0

    # TODO: Consider storing a custom copy of the hooks.yaml configuration
    # alongside the config.yaml
    # `mig_manager_config` is not defined in this file; presumably it comes
    # from the nvidia-mig-manager role or group_vars -- TODO confirm.
    - name: copy cluster-wide mig config file
      copy:
        src: "../../config/nvidia-mig-config.yml"
        dest: "{{ mig_manager_config }}"
      when: has_mig.rc == 0

    # Apply the selected profile (`mig_manager_profile`) from the copied
    # config file, passing the hooks file via -k.
    - name: Apply MIG configuration
      command: >-
        nvidia-mig-parted apply
        -f {{ mig_manager_config }}
        -c {{ mig_manager_profile }}
        -k {{ mig_manager_hooks }}
      when: has_mig.rc == 0

    # Reboot nodes if necessary and poll for them to come up.
    # The shutdown is fired asynchronously (async/poll: 0) after a short
    # sleep so the task can return before the connection drops.
    # Gated on has_mig like the wait task below, so a node is never rebooted
    # by this play without also being waited for.
    - name: Reboot if necessary
      shell: sleep 2 && /sbin/shutdown -r now "Reboot required"
      async: 1
      poll: 0
      when:
        - has_mig.rc == 0
        - reboot_required | default(false)

    - name: Wait for server to reboot (if required)
      wait_for_connection:
        delay: 15
        timeout: "{{ reboot_timeout | default(mig_manager_reboot_timeout) }}"
      when:
        - has_mig.rc == 0
        - reboot_required | default(false)

    # Verify the node's MIG state against the requested profile; the command
    # is presumed to exit non-zero on mismatch, failing the task -- TODO
    # confirm against the nvidia-mig-parted documentation.
    - name: Assert MIG configuration was applied
      command: >-
        nvidia-mig-parted assert
        -f {{ mig_manager_config }}
        -c {{ mig_manager_profile }}
      when: has_mig.rc == 0
      # Verification only: never report the host as changed.
      changed_when: false