Skip to content

Commit 514f5d9

Browse files
authored
CI Updates (#500)
* send weekly fail email * branch-aware checkout for tracer add rodinia to weekly * update weekly for spinlock * update weekly * fix weekly * delete old traces after new generated finish. * increase timeout * mkdir
1 parent ff9a5d6 commit 514f5d9

File tree

2 files changed

+103
-18
lines changed

2 files changed

+103
-18
lines changed

.github/workflows/main.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,41 @@ jobs:
267267
run: |
268268
source ./env-setup/12.8_env_setup.sh
269269
rm -rf ./gpu-simulator/gpgpu-sim
270+
271+
# Clone gpgpu-sim with fork-aware branch selection
272+
echo "Cloning gpgpu-sim with fork-aware branch selection..."
273+
git clone --quiet git@github.com:accel-sim/gpgpu-sim_distribution.git ./gpu-simulator/gpgpu-sim
274+
275+
# Try to checkout the same branch from the same owner's fork first
276+
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
277+
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
278+
else
279+
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
280+
fi
281+
current_repo=$(echo ${{ github.repository }} | cut -d'/' -f2)
282+
283+
gpgpusim_repo=$(echo $current_repo | sed 's/accel-sim-framework/gpgpu-sim_distribution/')
284+
285+
echo "Attempting to checkout branch '$BRANCH_NAME' from '$current_owner/$gpgpusim_repo'"
286+
287+
# First, try to add the fork owner's repository as a remote and check if the branch exists
288+
if git -C ./gpu-simulator/gpgpu-sim/ remote add fork-owner git@github.com:$current_owner/$gpgpusim_repo.git 2>/dev/null; then
289+
# Check if the branch exists in the fork owner's repository
290+
if git -C ./gpu-simulator/gpgpu-sim/ ls-remote fork-owner | grep -q "refs/heads/$BRANCH_NAME"; then
291+
echo "Found branch '$BRANCH_NAME' in '$current_owner/$gpgpusim_repo' repository, checking it out"
292+
git -C ./gpu-simulator/gpgpu-sim/ fetch fork-owner
293+
git -C ./gpu-simulator/gpgpu-sim/ checkout -B $BRANCH_NAME fork-owner/$BRANCH_NAME
294+
else
295+
echo "Branch '$BRANCH_NAME' not found in '$current_owner/$gpgpusim_repo' repository, falling back to upstream dev branch"
296+
git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
297+
fi
298+
# Remove the temporary remote
299+
git -C ./gpu-simulator/gpgpu-sim/ remote remove fork-owner
300+
else
301+
echo "Could not add '$current_owner/$gpgpusim_repo' remote, falling back to upstream dev branch"
302+
git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
303+
fi
304+
270305
source ./gpu-simulator/setup_environment.sh
271306
make clean -C gpu-simulator
272307
make -j20 -C gpu-simulator

.github/workflows/weekly.yml

Lines changed: 68 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ on:
55
# push:
66
schedule:
77
- cron: '0 20 * * FRI' # 8:00 PM every Friday
8-
8+
env:
9+
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
910
jobs:
1011
Tracer-Weekly:
12+
timeout-minutes: 720
1113
if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }}
1214
runs-on: tgrogers-gpu01
1315
defaults:
@@ -38,6 +40,7 @@ jobs:
3840
git -C ./gpu-app-collection/ submodule update --init -- ./src/cuda/cuda-samples
3941
source ./gpu-app-collection/src/setup_environment
4042
ln -s /home/tgrogers-raid/a/common/data_dirs ./gpu-app-collection/
43+
make -j8 -C ./gpu-app-collection/src rodinia_2.0-ft
4144
make -j8 -C ./gpu-app-collection/src rodinia-3.1
4245
make -j8 -C ./gpu-app-collection/src GPU_Microbenchmark
4346
# make -j8 -C ./gpu-app-collection/src Deepbench_nvidia
@@ -49,30 +52,22 @@ jobs:
4952
source ./env-setup/12.8_env_setup.sh
5053
source ./gpu-app-collection/src/setup_environment
5154
rm -rf ./hw_run/
55+
./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
5256
rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
5357
mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
54-
ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces ./hw_run
55-
./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark -D 7
56-
# ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
58+
mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run
59+
# ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
5760
- name: generate-spinlock-traces-spinlock_handling
5861
run: |
5962
source ./env-setup/12.8_env_setup.sh
6063
source ./gpu-app-collection/src/setup_environment
6164
rm -rf ./hw_run/
6265
./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward
63-
mv ./hw_run ./hw_run_fast_forward
66+
mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward
6467
./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none
65-
mv ./hw_run ./hw_run_none
66-
- name: test-new-traces-spinlock_handling
67-
# Test only fast-forwarded traces as the none one takes too long to run (~2-3 hr)
68-
run: |
69-
source ./env-setup/12.8_env_setup.sh
70-
source ./gpu-simulator/setup_environment.sh
71-
./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
72-
./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
73-
# ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
74-
# ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none
68+
mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none
7569
SASS-Weekly:
70+
timeout-minutes: 720
7671
needs: [Tracer-Weekly]
7772
if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }}
7873
runs-on: tgrogers-raid
@@ -93,14 +88,69 @@ jobs:
9388
run: |
9489
source ./env-setup/12.8_env_setup.sh
9590
rm -rf ./gpu-simulator/gpgpu-sim
91+
92+
# Clone gpgpu-sim with fork-aware branch selection
93+
echo "Cloning gpgpu-sim with fork-aware branch selection..."
94+
git clone --quiet git@github.com:accel-sim/gpgpu-sim_distribution.git ./gpu-simulator/gpgpu-sim
95+
96+
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
97+
current_branch=$BRANCH_NAME
98+
current_repo=$(echo $GITHUB_REPOSITORY | cut -d'/' -f2)
99+
100+
gpgpusim_repo=$(echo $current_repo | sed 's/accel-sim-framework/gpgpu-sim_distribution/')
101+
102+
echo "Attempting to checkout branch '$BRANCH_NAME' from '$current_owner/$gpgpusim_repo'"
103+
104+
# First, try to add the fork owner's repository as a remote and check if the branch exists
105+
if git -C ./gpu-simulator/gpgpu-sim/ remote add fork-owner git@github.com:$current_owner/$gpgpusim_repo.git 2>/dev/null; then
106+
# Check if the branch exists in the fork owner's repository
107+
if git -C ./gpu-simulator/gpgpu-sim/ ls-remote fork-owner | grep -q "refs/heads/$BRANCH_NAME"; then
108+
echo "Found branch '$BRANCH_NAME' in '$current_owner/$gpgpusim_repo' repository, checking it out"
109+
git -C ./gpu-simulator/gpgpu-sim/ fetch fork-owner
110+
git -C ./gpu-simulator/gpgpu-sim/ checkout -B $BRANCH_NAME fork-owner/$BRANCH_NAME
111+
else
112+
echo "Branch '$BRANCH_NAME' not found in '$current_owner/$gpgpusim_repo' repository, falling back to accel-sim dev branch"
113+
git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
114+
fi
115+
# Remove the temporary remote
116+
git -C ./gpu-simulator/gpgpu-sim/ remote remove fork-owner
117+
else
118+
echo "Could not add '$current_owner/$gpgpusim_repo' remote, falling back to upstream dev branch"
119+
git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
120+
fi
121+
96122
source ./gpu-simulator/setup_environment.sh
97123
make clean -C gpu-simulator
98124
make -j -C gpu-simulator
99125
- name: run SASS
100126
run: |
101127
source ./env-setup/12.8_env_setup.sh
102128
source ./gpu-simulator/setup_environment.sh
103-
ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces ./hw_run
104-
# ./util/job_launching/run_simulations.py -B rodinia-3.1,GPU_Microbenchmark,sdk-4.2-scaled,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -C QV100-SASS-5B_INSN -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
105-
./util/job_launching/run_simulations.py -B rodinia-3.1,GPU_Microbenchmark -C QV100-SASS-5B_INSN -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
129+
ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run ./hw_run
130+
./util/job_launching/run_simulations.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -C QV100-SASS -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
106131
./util/job_launching/monitor_func_test.py -T 12 -S 1800 -I -v -s weekly-stats-per-app.csv -N weekly-$$
132+
- name: test-new-traces-spinlock_handling
133+
# Test only fast-forwarded traces as the none one takes too long to run (~2-3 hr)
134+
run: |
135+
source ./env-setup/12.8_env_setup.sh
136+
source ./gpu-simulator/setup_environment.sh
137+
./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
138+
./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
139+
# ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
140+
# ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none
141+
failures:
142+
if: failure()
143+
env:
144+
ACTION_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
145+
REPORT_URL: ""
146+
runs-on: tgrogers-raid
147+
needs: [Tracer-Weekly, SASS-Weekly]
148+
steps:
149+
- uses: actions/checkout@v4
150+
- name: Notify Failure
151+
run: |
152+
# Setup envs
153+
git clone --quiet --branch cluster-ubuntu git@github.com:purdue-aalp/env-setup.git
154+
source ./env-setup/common/common_inc.sh
155+
export BRANCH_NAME="Weekly Tests"
156+
python3 .github/scripts/send_ci_email.py -t failure

0 commit comments

Comments
 (0)