forked from Genesis-Embodied-AI/Genesis
-
Notifications
You must be signed in to change notification settings - Fork 0
160 lines (138 loc) · 6.13 KB
/
production.yml
File metadata and controls
160 lines (138 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
name: Production

on:
  # Trigger the workflow on pushes to the main branch, or on any pull request targeting main.
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

concurrency:
  # Cancel workflows that are still running when a branch associated with a PR is updated,
  # BUT don't do anything for workflows that are not triggered by PRs.
  # `github.head_ref` is only set for pull requests, hence the fallback to `github.ref`.
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

# Environment variables shared by every job of this workflow.
# All values are quoted: they reach the steps as strings regardless of how they are written here.
env:
  # Note that secrets are not passed to workflows that are triggered by a pull request from a fork
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  HF_HUB_DOWNLOAD_TIMEOUT: "60"
  # Version suffix of the container image (`genesis-v<ver>.sqsh`) used by the jobs.
  GENESIS_IMAGE_VER: "1_14"
  # Passed to `srun --time` by the jobs.
  TIMEOUT_MINUTES: "60"
  FORCE_COLOR: "1"
  PY_COLORS: "1"
  MADRONA_DISABLE_CUDA_HEAP_SIZE: "1"
  OMNI_KIT_ACCEPT_EULA: "yes"
  OMNI_KIT_ALLOW_ROOT: "1"

jobs:
  # Runs the pytest suite inside a Slurm-scheduled container, once per value of GS_ENABLE_NDARRAY.
  # Matrix entries run one at a time (`max-parallel: 1`) and the first failure cancels the rest.
  unit-tests:
    name: production-unit_tests-${{ matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }}

    runs-on: [self-hosted, coreweave, genesis-world]

    strategy:
      fail-fast: true
      max-parallel: 1
      matrix:
        GS_ENABLE_NDARRAY: ["0", "1"]

    env:
      GS_ENABLE_NDARRAY: ${{ matrix.GS_ENABLE_NDARRAY }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # The tests themselves only run for pull requests; on push events this step is skipped.
      - name: Run unit tests
        if: github.event_name == 'pull_request'
        run: |
          # Unique job name, persisted to later steps so that the clean-up step can cancel the job by name.
          SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)"
          echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> "${GITHUB_ENV}"

          mkdir -p "${HOME}/.cache" "${HOME}/.venv"

          # TODO: USD baking does not currently support Python 3.11 since
          # NVIDIA does not currently release `omniverse-kit==107.3` on PyPI.
          # See: https://github.com/Genesis-Embodied-AI/Genesis/pull/1300
          #
          # NOTE: The mount entries below must start at the very beginning of their line: the trailing
          # backslash joins them directly onto `--container-mounts=`, so any leading whitespace would
          # split the option into separate words.
          srun \
            --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
            --container-mounts=\
          "${HOME}/.venv":/root/.venv,\
          "${HOME}/.cache":/root/.cache,\
          "${{ github.workspace }}":/root/workspace \
            --no-container-mount-home --container-workdir=/root/workspace \
            --export="NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY}" \
            --partition=hpc-mid --nodes=1 --gpus=8 --exclusive --time="${TIMEOUT_MINUTES}" \
            --job-name="${SLURM_JOB_NAME}" \
            bash -e -s << 'EOF'
            # Bootstrap the virtual environment only if the mounted directory is still empty.
            if test -n "$(find /root/.venv -maxdepth 0 -empty)"; then
              python3 -m venv --system-site-packages /root/.venv
              source /root/.venv/bin/activate
              pip install --no-input --upgrade pip pkg-info wheel
              pip install --no-input --ignore-installed --upgrade blinker pyparsing setuptools
            fi
            source /root/.venv/bin/activate

            pip install --no-input --extra-index-url https://pypi.nvidia.com/ omniverse-kit
            pip install --no-input ".[dev,render,usd]"

            pytest -v -ra --backend gpu --dev --forked ./tests
          EOF

      # Runs whatever the outcome of the previous steps; SLURM_JOB_NAME is empty if the job was never submitted.
      - name: Kill srun job systematically
        if: always()
        run: |
          if [ -n "${SLURM_JOB_NAME}" ] ; then
            scancel --user="${USER}" --name="${SLURM_JOB_NAME}"
          fi

  # Runs the tests marked "benchmarks" once the unit tests have completed, once per value of
  # GS_ENABLE_NDARRAY, then displays and uploads the resulting speed-test report.
  benchmarks:
    name: production-benchmarks-${{ matrix.GS_ENABLE_NDARRAY == '0' && 'field' || 'ndarray' }}

    needs: unit-tests
    runs-on: [self-hosted, coreweave, genesis-world]

    strategy:
      matrix:
        GS_ENABLE_NDARRAY: ["0", "1"]

    env:
      # Note that secrets are not passed to workflows that are triggered by a pull request from a fork
      WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
      GS_ENABLE_NDARRAY: ${{ matrix.GS_ENABLE_NDARRAY }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Checking out the full history is required because shallow cloning marks HEAD as "grafted".
          # This breaks remote tracking, thereby making it impossible to detect whether a commit is
          # contained in upstream main.
          fetch-depth: 0

      - name: Run benchmarks
        run: |
          # Unique job name, persisted to later steps (clean-up, report display and upload).
          SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)"
          echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> "${GITHUB_ENV}"

          # WANDB_API_KEY is only forwarded to the container for the main branch of the upstream repository.
          SLURM_ENV_VARS="NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY}"
          if [[ "${{ github.repository }}" == 'Genesis-Embodied-AI/Genesis' && "${{ github.ref }}" == 'refs/heads/main' ]] ; then
            SLURM_ENV_VARS="${SLURM_ENV_VARS},WANDB_API_KEY"
          fi

          # NOTE: The mount entries below must start at the very beginning of their line: the trailing
          # backslash joins them directly onto `--container-mounts=`, so any leading whitespace would
          # split the option into separate words.
          srun \
            --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
            --container-mounts=\
          "${HOME}/.venv":/root/.venv,\
          /mnt/data/artifacts:/mnt/data/artifacts,\
          "${{ github.workspace }}":/root/workspace \
            --no-container-mount-home --container-workdir=/root/workspace \
            --export="${SLURM_ENV_VARS}" \
            --partition=hpc-mid --nodes=1 --gpus=8 --exclusive --time="${TIMEOUT_MINUTES}" \
            --job-name="${SLURM_JOB_NAME}" \
            bash -e -s << 'EOF'
            # sudo apt update
            # sudo apt install -y tmate
            # tmate -S /tmp/tmate.sock new-session -d
            # tmate -S /tmp/tmate.sock wait tmate-ready
            # tmate -S /tmp/tmate.sock display -p '#{tmate_ssh}'

            source /root/.venv/bin/activate
            pip install --no-input ".[dev,render]"

            pytest --print -x -m "benchmarks" ./tests
            # The heredoc is quoted, so SLURM_JOB_NAME is expanded by the shell running this script,
            # not by the runner's shell that submits the job.
            cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"

            # tmate -S /tmp/tmate.sock wait tmate-exit
          EOF

      # Runs whatever the outcome of the previous steps; SLURM_JOB_NAME is empty if the job was never submitted.
      - name: Kill srun job systematically
        if: always()
        run: |
          if [ -n "${SLURM_JOB_NAME}" ] ; then
            scancel --user="${USER}" --name="${SLURM_JOB_NAME}"
          fi

      - name: Display benchmark stats
        run: |
          cat "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"

      # The artifact name includes the matrix value so that the two matrix entries do not share a name.
      - name: Upload benchmark stats as artifact
        uses: actions/upload-artifact@v4
        with:
          name: speed-test-${{ matrix.GS_ENABLE_NDARRAY }}
          path: "/mnt/data/artifacts/speed_test_${{ env.SLURM_JOB_NAME }}.txt"