Skip to content

Commit 4dccb2c

Browse files
committed
WIP: Enable testing on Eris
1 parent 3e4801b commit 4dccb2c

File tree

4 files changed

+144
-45
lines changed

4 files changed

+144
-45
lines changed

.ci/env/eris.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/sh
2+
# Sourced by .ci/env/hostenv.sh for Eris hosts; the positional parameters are the modules to load.
3+
echo "Setting up ERIS environment"
4+
workingDirectory=$PWD
5+
6+
ulimit -s unlimited
7+
8+
. /nfs/coe-sw/lmod/8.7/init/bash
9+
10+
#module use /home/agopal/software/modules
11+
# Overwriting the system value of MODULEPATH to include our own modules
12+
export MODULEPATH=/home/agopal/software/modules/compilers:/home/agopal/software/modules/cdep
13+
14+
echo "Loading modules : $*"
15+
cmd="module --force purge"
16+
echo "$cmd" && eval "${cmd}"
17+
18+
# We should be handed in the modules to load
19+
while [ $# -gt 0 ]; do
20+
cmd="module load $1"
21+
echo "$cmd" && eval "${cmd}"
22+
shift
23+
done
24+
25+
# Go back to the working directory in case the HPC configuration changed it
26+
if [ "$workingDirectory" != "$PWD" ]; then
27+
echo "Eris module loading changed working directory"
28+
echo " Moving back to $workingDirectory"
29+
cd "$workingDirectory"
30+
fi

.ci/env/hostenv.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ fi
1111
if [ $( contains ${hostname} hsn.de.hpc ) -eq 0 ]; then
1212
# Derecho HPC SuSE PBS
1313
. .ci/env/derecho.sh
14+
elif [ $( contains ${hostname} eris ) -eq 0 ]; then
15+
# Eris (Lmod-based module environment)
16+
. .ci/env/eris.sh
1417
else
1518
echo "No known environment for '${hostname}', using current"
1619
fi

.ci/mpas_tests.jsonc

Lines changed: 85 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,12 @@
1616
//".*make.*cpu.*::args_build_options" : [ "-b", "-j $NUM_PROCS" ],
1717
".*make.*::args_build_options" : [ "-b", "-j $NUM_PROCS"],
1818
".*gnu.*::args_target" : [ "-t", "gnu" ],
19-
".*make.*nvhpc.*::args_target" : [ "-t", "nvhpc" ],
19+
".*nvhpc.*::args_target" : [ "-t", "nvhpc" ],
20+
".*intel.*::args_target" : [ "-t", "intel" ],
2021
".*make.*nvhpc.*gpu.*::args_openacc" : [ "-g" ],
2122
".*test_.*::args_namelist" : [ "-b", "atmosphere_model"],
22-
".*test_jw.*::args_namelist" : [ "-f", "/glade/campaign/mmm/wmr/mpas_ci/jw_baroclinic"],
23-
".*test_conus.*::args_namelist" : [ "-f", "/glade/campaign/mmm/wmr/mpas_ci/conus_lam"],
24-
".*test_.*cpu.*::args_namelist" :
25-
[
26-
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE"
27-
],
28-
".*test_.*gpu.*::args_namelist" :
29-
[
30-
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE set_gpu_rank "
31-
],
23+
".*test_jw.*::args_namelist" : [ "-f", "jw_baroclinic"],
24+
".*test_conus.*::args_namelist" : [ "-f", "conus_lam"],
3225
//".*test.*(base|parallel|multinode).*::args_namelist" :
3326
// [
3427
// "-q", "01:00:00",
@@ -57,6 +50,25 @@
5750
"-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH"
5851
]
5952
},
53+
"eris" :
54+
{
55+
"submission" : "LOCAL",
56+
"arguments" :
57+
{
58+
".*build.*::base_env_numprocs" : [ "-e", "NUM_PROCS=4" ],
59+
".*test_.*::base_env_numprocs" : [ "-e", "NUM_PROCS=4" ],
60+
".*test_.*.*::args" : [ "-s", "LOCAL"],
61+
".*test_.*::args" : [ "-r", "/users/agopal/mpas_ci"],
62+
".*gnu.*::common_modules" : [ "gnu/12.1.0","openmpi/5.0.5", "pnetcdf/1.13.0", "netcdf/4.8.1", "cdo/2.5.2" ],
63+
".*intel.*::common_modules" : [ "intel/2023.0.0","openmpi/5.0.5", "pnetcdf/1.13.0", "cdo/2.5.2" ],
64+
//".*gnu.*::test_modules" : [ "gcc/12.2.0","openmpi/4.1.4", "pnetcdf/1.12.3", "netcdf/4.8.1" ],
65+
//".*::path" : [ "-e", "PNETCDF=/users/agopal/software/pnetcdf-1.13.0/install,PATH=/home/agopal/software/openmpi-5.0.5/install/bin:$PATH,LD_LIBRARY_PATH=/nfs/coe-sw/compilers/gnu/12.1.0/lib64/:$LD_LIBRARY_PATH" ],
66+
".*test_.*cpu.*::args_namelist" :
67+
[
68+
"-p", "mpirun -n $CI_NTASKS "
69+
]
70+
}
71+
},
6072
// Derecho-specifics
6173
"hsn.de.hpc" :
6274
{
@@ -108,12 +120,21 @@
108120
{
109121
// We want NUM_PROCS to match ncpus
110122
//"base_env_numprocs" : [ "-e", "NUM_PROCS=16, LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH" ],
123+
".*test_.*.*::args" : [ "-s", "LOCAL"],
111124
"base_env_numprocs" : [ "-e", "NUM_PROCS=16" ],
112125
"global_modules" : [ "ncarenv/23.09"],
113126
//"very_last_modules" : [ "cray-mpich", "parallel-netcdf" ],
114127
".*gnu.*::test_modules" : [ "gcc/12.2.0","cray-mpich", "parallel-netcdf", "netcdf" ],
115-
".*nvhpc.*::test_modules" : [ "nvhpc/24.3","cray-mpich","ncarcompilers", "parallel-netcdf", "cuda" ]
116-
128+
".*nvhpc.*::test_modules" : [ "nvhpc/24.3","cray-mpich","ncarcompilers", "parallel-netcdf", "cuda" ],
129+
".*test_.*cpu.*::args_namelist" :
130+
[
131+
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE"
132+
],
133+
".*test_.*gpu.*::args_namelist" :
134+
[
135+
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE set_gpu_rank "
136+
],
137+
".*gnu.*test_.*::args_namelist_dir" : ["-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH" ]
117138
}
118139
}
119140
},
@@ -125,7 +146,6 @@
125146
"arguments" :
126147
{
127148
// Remove auto-filled dir and supply sub-testcase specific ones
128-
".*test_.*::args_namelist_dir" : ["-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH" ]
129149
}
130150
},
131151
"steps" :
@@ -138,21 +158,21 @@
138158
{
139159
"submit_options" : { "timelimit" : "00:15:00" },
140160
"command" : ".ci/tests/runTestCases.sh",
141-
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "base"]
161+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "base"]
142162
//"dependencies" : { "make-release" : "afterok" }
143163
},
144164
"test_jw-restart-cpu" :
145165
{
146166
"submit_options" : { "timelimit" : "00:15:00" },
147167
"command" : ".ci/tests/runTestCases.sh",
148-
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "restart"],
168+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "restart"],
149169
"dependencies" : { "test_jw-base-cpu" : "afterok" }
150170
},
151171
"test_jw-mpi-cpu" :
152172
{
153173
"submit_options" : { "timelimit" : "00:15:00" },
154174
"command" : ".ci/tests/runTestCases.sh",
155-
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "mpi"],
175+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "mpi"],
156176
"dependencies" : { "test_jw-restart-cpu" : "afterok" }
157177
}
158178
}
@@ -177,11 +197,49 @@
177197
{
178198
"submit_options" : { "timelimit" : "00:15:00" },
179199
"command" : ".ci/tests/runTestCases.sh",
180-
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "base"],
200+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "base"],
181201
"dependencies" : { "make-release-double" : "afterok" }
182202
}
183203
}
184204
},
205+
"intel" :
206+
{
207+
"submit_options" :
208+
{
209+
"arguments" :
210+
{
211+
// Remove auto-filled dir and supply sub-testcase specific ones
212+
}
213+
},
214+
"steps" :
215+
{
216+
"make-release" :
217+
{
218+
"command" : ".ci/tests/build.sh"
219+
},
220+
"test_jw-base-cpu" :
221+
{
222+
"submit_options" : { "timelimit" : "00:15:00" },
223+
"command" : ".ci/tests/runTestCases.sh",
224+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "base"],
225+
"dependencies" : { "make-release" : "afterok" }
226+
},
227+
"test_jw-restart-cpu" :
228+
{
229+
"submit_options" : { "timelimit" : "00:15:00" },
230+
"command" : ".ci/tests/runTestCases.sh",
231+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "restart"],
232+
"dependencies" : { "test_jw-base-cpu" : "afterok" }
233+
},
234+
"test_jw-mpi-cpu" :
235+
{
236+
"submit_options" : { "timelimit" : "00:15:00" },
237+
"command" : ".ci/tests/runTestCases.sh",
238+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "mpi"],
239+
"dependencies" : { "test_jw-restart-cpu" : "afterok" }
240+
}
241+
}
242+
},
185243
"nvhpc-cpu" :
186244
{
187245
"steps" :
@@ -194,21 +252,21 @@
194252
{
195253
"submit_options" : { "timelimit" : "00:15:00" },
196254
"command" : ".ci/tests/runTestCases.sh",
197-
"arguments" : ["-d", "cpu", "-t", "nvhpc", "-c", "jw", "-g", "base"],
255+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "base"],
198256
"dependencies" : { "make-nvhpc-release-cpu" : "afterok" }
199257
},
200258
"test_jw-nvhpc-restart-cpu" :
201259
{
202260
"submit_options" : { "timelimit" : "00:15:00" },
203261
"command" : ".ci/tests/runTestCases.sh",
204-
"arguments" : ["-d", "cpu", "-t", "nvhpc", "-c", "jw", "-g", "restart"],
262+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "restart"],
205263
"dependencies" : { "test_jw-nvhpc-base-cpu" : "afterok" }
206264
},
207265
"test_jw-nvhpc-perf-cpu" :
208266
{
209267
"submit_options" : { "timelimit" : "00:15:00" },
210268
"command" : ".ci/tests/runTestCases.sh",
211-
"arguments" : ["-d", "cpu", "-t", "nvhpc", "-c", "jw", "-g", "perf"],
269+
"arguments" : ["-d", "cpu", "-c", "jw", "-g", "perf"],
212270
"dependencies" : { "test_jw-nvhpc-base-cpu" : "afterok" }
213271
}
214272

@@ -226,42 +284,42 @@
226284
{
227285
"submit_options" : { "timelimit" : "00:15:00" },
228286
"command" : ".ci/tests/runTestCases.sh",
229-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "jw", "-g", "base"],
287+
"arguments" : ["-d", "gpu", "-c", "jw", "-g", "base"],
230288
"dependencies" : { "make-nvhpc-release-gpu" : "afterok" }
231289
},
232290
"test_jw-nvhpc-restart-gpu" :
233291
{
234292
"submit_options" : { "timelimit" : "00:15:00" },
235293
"command" : ".ci/tests/runTestCases.sh",
236-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "jw", "-g", "restart"],
294+
"arguments" : ["-d", "gpu", "-c", "jw", "-g", "restart"],
237295
"dependencies" : { "test_jw-nvhpc-base-gpu" : "afterok" }
238296
},
239297
"test_jw-nvhpc-multigpu-gpu" :
240298
{
241299
"submit_options" : { "timelimit" : "00:15:00" },
242300
"command" : ".ci/tests/runTestCases.sh",
243-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "jw", "-g", "multigpu"],
301+
"arguments" : ["-d", "gpu", "-c", "jw", "-g", "multigpu"],
244302
"dependencies" : { "test_jw-nvhpc-restart-gpu" : "afterok" }
245303
},
246304
"test_jw-nvhpc-perf-gpu" :
247305
{
248306
"submit_options" : { "timelimit" : "00:15:00" },
249307
"command" : ".ci/tests/runTestCases.sh",
250-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "jw", "-g", "perf"],
308+
"arguments" : ["-d", "gpu", "-c", "jw", "-g", "perf"],
251309
"dependencies" : { "test_jw-nvhpc-restart-gpu" : "afterok" }
252310
},
253311
"test_conus-nvhpc-base-gpu" :
254312
{
255313
"submit_options" : { "timelimit" : "00:15:00" },
256314
"command" : ".ci/tests/runTestCases.sh",
257-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "conus", "-g", "base"],
315+
"arguments" : ["-d", "gpu", "-c", "conus", "-g", "base"],
258316
"dependencies" : { "test_jw-nvhpc-multigpu-gpu" : "afterok" }
259317
},
260318
"test_conus-nvhpc-restart-gpu" :
261319
{
262320
"submit_options" : { "timelimit" : "00:15:00" },
263321
"command" : ".ci/tests/runTestCases.sh",
264-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "conus", "-g", "restart"],
322+
"arguments" : ["-d", "gpu", "-c", "conus", "-g", "restart"],
265323
"dependencies" : { "test_conus-nvhpc-base-gpu" : "afterok" }
266324
}
267325

.ci/tests/runTestCases.sh

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ while getopts c:g:r:b:f:d:q:y:z:w:p:t:k:s:i:e:nh opt; do
6666
testType="$OPTARG"
6767
;;
6868
r)
69-
rootDir="$OPTARG"
69+
caseInputPrefix="$OPTARG"
7070
;;
7171
b)
7272
mpasExecutable="$OPTARG"
@@ -101,6 +101,9 @@ while getopts c:g:r:b:f:d:q:y:z:w:p:t:k:s:i:e:nh opt; do
101101
k)
102102
diffExec="$OPTARG"
103103
;;
104+
s)
105+
submissionType="$OPTARG"
106+
;;
104107
e)
105108
envVars="$envVars,$OPTARG"
106109
;;
@@ -156,19 +159,28 @@ baserunDir=${testcase}_${toolchain}_base_${device}_${precision}
156159
TESTNAME="${testcase} ${toolchain} ${testType} ${device} ${precision}"
157160
echo "TEST : $TESTNAME"
158161

159-
# from https://ncar-hpc-docs.readthedocs.io/en/latest/pbs/job-scripts/#derecho
160-
CI_NNODES=$(cat ${PBS_NODEFILE} | sort | uniq | wc -l)
161-
CI_NTASKS=$(cat ${PBS_NODEFILE} | sort | wc -l)
162-
CI_TASKS_PER_NODE=$((${CI_NTASKS} / ${CI_NNODES}))
163162

164-
echo "CI_NNODES: $CI_NNODES , CI_NTASKS: $CI_NTASKS , CI_TASKS_PER_NODE: $CI_TASKS_PER_NODE"
163+
if [ "$submissionType" = "PBS" ]; then
164+
# from https://ncar-hpc-docs.readthedocs.io/en/latest/pbs/job-scripts/#derecho
165+
CI_NNODES=$(sort -u "${PBS_NODEFILE}" | wc -l)
166+
CI_NTASKS=$(sort "${PBS_NODEFILE}" | wc -l)
167+
CI_TASKS_PER_NODE=$((${CI_NTASKS} / ${CI_NNODES}))
168+
echo "CI_NNODES: $CI_NNODES , CI_NTASKS: $CI_NTASKS , CI_TASKS_PER_NODE: $CI_TASKS_PER_NODE"
169+
elif [ "$submissionType" = "LOCAL" ]; then
170+
echo "using local execution"
171+
CI_NNODES=1 CI_NTASKS=2 CI_TASKS_PER_NODE=2 # single node; define the same CI_* variables the PBS branch does
172+
else
173+
echo "Error: Invalid submissionType '$submissionType'. Must be one of 'PBS' or 'LOCAL'."
174+
exit 1
175+
fi
165176

166177

167178
log_file="log.atmosphere.0000.out"
168179

169180
# Re-evaluate input values for delayed expansion
170-
eval "rootDir=\$( realpath \"$rootDir\" )"
171-
eval "caseInputDir=\$( realpath \"$caseInputDir\" )"
181+
eval "caseInputPrefix=\$( realpath \"$caseInputPrefix\" )"
182+
eval "caseInputDir=\$( realpath \"$caseInputPrefix/$caseInputDir\" )"
183+
echo "caseInputDir: $caseInputDir"
172184
eval "parallelExec=\"$parallelExec\""
173185
eval "runDir=\"$runDir\""
174186
eval "baserunDir=\"$baserunDir\""
@@ -178,9 +190,11 @@ runDir=$( realpath $runDir )
178190
baserunDir=$( realpath $baserunDir )
179191

180192
rm -rf $runDir
181-
mkdir -p $runDir
193+
#mkdir -p $runDir
194+
195+
python testing_and_setup/atmosphere/setup_run_dir.py -a "$runDir" || exit $?
182196

183-
ln -sf $workingDirectory/$mpasExecutable $runDir/$mpasExecutable
197+
#ln -sf $workingDirectory/$mpasExecutable $runDir/$mpasExecutable
184198

185199
eval "mpasExecutable=\$( realpath \"$runDir/$mpasExecutable\" )"
186200

@@ -212,11 +226,9 @@ eval "repo_timestamp=\$( git show --no-patch --format=%ci )"
212226
cd $runDir || exit $?
213227
# TODO: Clean up previous runs
214228

215-
216-
# Copy namelist
217229
echo "Setting $caseInputDir/namelist.atmosphere as namelist.atmosphere "
218-
# TODO: remove old namelist.input which may be a symlink in which case this would have failed
219-
#rm namelist.input
230+
# Overwrite the following files from the defaults provided by setup_run_dir.py
231+
# to the ones provided by the caseInputDir
220232
cp $caseInputDir/namelist.atmosphere namelist.atmosphere || exit $?
221233
cp $caseInputDir/streams.atmosphere streams.atmosphere || exit $?
222234
cp $caseInputDir/stream_list.atmosphere.output stream_list.atmosphere.output || exit $?
@@ -269,10 +281,6 @@ elif [ "$testcase" = "conus" ]; then
269281
restart_compare_time='2019-09-01_00.20.00'
270282
cp $caseInputDir/stream_list.atmosphere.diagnostics . || exit $?
271283
cp $caseInputDir/stream_list.atmosphere.surface . || exit $?
272-
ln -sf $workingDirectory/*.TBL . || exit $?
273-
ln -sf $workingDirectory/*.DBL . || exit $?
274-
ln -sf $caseInputDir/*.DBL . || exit $?
275-
ln -sf $workingDirectory/*_DATA . || exit $?
276284
fi
277285

278286

0 commit comments

Comments
 (0)