Skip to content

Commit 3e4801b

Browse files
committed
Some more changes
1 parent c3ca7f1 commit 3e4801b

File tree

3 files changed

+134
-82
lines changed

3 files changed

+134
-82
lines changed

.ci/mpas_refgen.jsonc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
},
4949
".*test_conus.*refgen.*cpu.*::node_select" :
5050
{
51-
"-l " : { "select" : 1, "ncpus" : 64, "mpiprocs" : 4 }
51+
"-l " : { "select" : 1, "ncpus" : 64, "mpiprocs" : 1 }
5252
},
5353
".*test_jw.*parallel.*cpu.*::node_select" :
5454
{

.ci/mpas_tests.jsonc

Lines changed: 88 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,51 @@
1111

1212
".*make.*::args_build_core" : [ "-c", "atmosphere" ],
1313
".*debug.*::args_build_debug" : [ "-d" ],
14+
".*make.*double.*::args_build_precision" : [ "-p" ],
1415
//".*make.*gpu.*::args_build_options_0" : [ "-g" ],
15-
".*make.*cpu.*::args_build_options" : [ "-b", "-j $NUM_PROCS" ],
16-
".*make.*gpu.*::args_build_options" : [ "-b", "-j $NUM_PROCS"],
17-
".*make.*gnu.*::args_target" : [ "-t", "gnu" ],
16+
//".*make.*cpu.*::args_build_options" : [ "-b", "-j $NUM_PROCS" ],
17+
".*make.*::args_build_options" : [ "-b", "-j $NUM_PROCS"],
18+
".*gnu.*::args_target" : [ "-t", "gnu" ],
1819
".*make.*nvhpc.*::args_target" : [ "-t", "nvhpc" ],
19-
".*make.*nvhpc.*gpu.*::args_openacc" : [ "-g" ]
20+
".*make.*nvhpc.*gpu.*::args_openacc" : [ "-g" ],
21+
".*test_.*::args_namelist" : [ "-b", "atmosphere_model"],
22+
".*test_jw.*::args_namelist" : [ "-f", "/glade/campaign/mmm/wmr/mpas_ci/jw_baroclinic"],
23+
".*test_conus.*::args_namelist" : [ "-f", "/glade/campaign/mmm/wmr/mpas_ci/conus_lam"],
24+
".*test_.*cpu.*::args_namelist" :
25+
[
26+
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE"
27+
],
28+
".*test_.*gpu.*::args_namelist" :
29+
[
30+
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE set_gpu_rank "
31+
],
32+
//".*test.*(base|parallel|multinode).*::args_namelist" :
33+
// [
34+
// "-q", "01:00:00",
35+
// "-w", "02:00:00"
36+
// ],
37+
".*test_jw.*restart.*::args_namelist" :
38+
[
39+
"-y", "0000-01-01_01:00:00",
40+
"-z", "01:00:00"
41+
],
42+
".*test_conus.*restart.*::args_namelist" :
43+
[
44+
"-y", "2019-09-01_00:10:00",
45+
"-z", "00:10:00"
46+
],
47+
".*test_jw.*perf.*::args_namelist" :
48+
[
49+
"-z", "01:00:00"
50+
],
51+
".*test_conus.*perf.*::args_namelist" :
52+
[
53+
"-z", "00:10:00"
54+
],
55+
".*test_.*gnu.*::args_namelist" :
56+
[
57+
"-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH"
58+
]
2059
},
2160
// Derecho-specifics
2261
"hsn.de.hpc" :
@@ -34,7 +73,7 @@
3473
{
3574
"-l " : { "select" : 1, "ncpus" : 1 }
3675
},
37-
".*test_jw.*parallel.*cpu.*::node_select" :
76+
".*test_jw.*mpi.*cpu.*::node_select" :
3877
{
3978
"-l " : { "select" : 1, "ncpus" : 16, "mpiprocs" : 4 }
4079
},
@@ -73,45 +112,8 @@
73112
"global_modules" : [ "ncarenv/23.09"],
74113
//"very_last_modules" : [ "cray-mpich", "parallel-netcdf" ],
75114
".*gnu.*::test_modules" : [ "gcc/12.2.0","cray-mpich", "parallel-netcdf", "netcdf" ],
76-
".*nvhpc.*::test_modules" : [ "nvhpc/24.3","cray-mpich","ncarcompilers", "parallel-netcdf", "cuda" ],
77-
".*test.*::args_namelist" : [ "-b", "atmosphere_model"],
78-
".*test_jw.*::args_namelist" : [ "-f", "/glade/campaign/mmm/wmr/mpas_ci/jw_baroclinic"],
79-
".*test_conus.*::args_namelist" : [ "-f", "/glade/campaign/mmm/wmr/mpas_ci/conus_lam"],
80-
".*test_.*cpu.*::args_namelist" :
81-
[
82-
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE"
83-
],
84-
".*test_.*gpu.*::args_namelist" :
85-
[
86-
"-p", "mpiexec -n $CI_NTASKS -ppn $CI_TASKS_PER_NODE set_gpu_rank "
87-
],
88-
//".*test.*(base|parallel|multinode).*::args_namelist" :
89-
// [
90-
// "-q", "01:00:00",
91-
// "-w", "02:00:00"
92-
// ],
93-
".*test_jw.*restart.*::args_namelist" :
94-
[
95-
"-y", "0000-01-01_01:00:00",
96-
"-z", "01:00:00"
97-
],
98-
".*test_conus.*restart.*::args_namelist" :
99-
[
100-
"-y", "2019-09-01_00:10:00",
101-
"-z", "00:10:00"
102-
],
103-
".*test_jw.*perf.*::args_namelist" :
104-
[
105-
"-z", "01:00:00"
106-
],
107-
".*test_conus.*perf.*::args_namelist" :
108-
[
109-
"-z", "00:10:00"
110-
],
111-
".*test.*gnu.*::args_namelist" :
112-
[
113-
"-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH"
114-
]
115+
".*nvhpc.*::test_modules" : [ "nvhpc/24.3","cray-mpich","ncarcompilers", "parallel-netcdf", "cuda" ]
116+
115117
}
116118
}
117119
},
@@ -123,28 +125,60 @@
123125
"arguments" :
124126
{
125127
// Remove auto-filled dir and supply sub-testcase specific ones
126-
".*testcase.*::args_namelist_dir" : [ ]
128+
".*test_.*::args_namelist_dir" : ["-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH" ]
127129
}
128130
},
129131
"steps" :
130132
{
131-
"make-release" :
133+
// "make-release" :
134+
// {
135+
// "command" : ".ci/tests/build.sh"
136+
// },
137+
"test_jw-base-cpu" :
132138
{
133-
"command" : ".ci/tests/build.sh"
139+
"submit_options" : { "timelimit" : "00:15:00" },
140+
"command" : ".ci/tests/runTestCases.sh",
141+
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "base"]
142+
//"dependencies" : { "make-release" : "afterok" }
134143
},
135-
"testcase-jw-base" :
144+
"test_jw-restart-cpu" :
136145
{
137146
"submit_options" : { "timelimit" : "00:15:00" },
138147
"command" : ".ci/tests/runTestCases.sh",
139-
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "base"],
140-
"dependencies" : { "make-release" : "afterok" }
148+
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "restart"],
149+
"dependencies" : { "test_jw-base-cpu" : "afterok" }
141150
},
142-
"testcase-jw-restart" :
151+
"test_jw-mpi-cpu" :
143152
{
144153
"submit_options" : { "timelimit" : "00:15:00" },
145154
"command" : ".ci/tests/runTestCases.sh",
146-
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "restart"],
147-
"dependencies" : { "testcase-jw-base" : "afterok" }
155+
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "mpi"],
156+
"dependencies" : { "test_jw-restart-cpu" : "afterok" }
157+
}
158+
}
159+
},
160+
"gnu-double" :
161+
{
162+
"submit_options" :
163+
{
164+
"arguments" :
165+
{
166+
// Remove auto-filled dir and supply sub-testcase specific ones
167+
".*test_.*::args_namelist_dir" : ["-e", "LD_LIBRARY_PATH=/glade/u/apps/derecho/23.09/spack/opt/spack/parallel-netcdf/1.12.3/cray-mpich/8.1.27/gcc/12.2.0/sq5u/lib:/glade/u/apps/derecho/23.09/spack/opt/spack/netcdf/4.9.2/gcc/12.2.0/gjc6/lib:\\\\$LD_LIBRARY_PATH" ]
168+
}
169+
},
170+
"steps" :
171+
{
172+
"make-release-double" :
173+
{
174+
"command" : ".ci/tests/build.sh"
175+
},
176+
"test_jw-base-cpu" :
177+
{
178+
"submit_options" : { "timelimit" : "00:15:00" },
179+
"command" : ".ci/tests/runTestCases.sh",
180+
"arguments" : ["-d", "cpu", "-t", "gnu", "-c", "jw", "-g", "base"],
181+
"dependencies" : { "make-release-double" : "afterok" }
148182
}
149183
}
150184
},
@@ -206,7 +240,7 @@
206240
{
207241
"submit_options" : { "timelimit" : "00:15:00" },
208242
"command" : ".ci/tests/runTestCases.sh",
209-
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "jw", "-g", "parallel"],
243+
"arguments" : ["-d", "gpu", "-t", "nvhpc", "-c", "jw", "-g", "multigpu"],
210244
"dependencies" : { "test_jw-nvhpc-restart-gpu" : "afterok" }
211245
},
212246
"test_jw-nvhpc-perf-gpu" :

.ci/tests/runTestCases.sh

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ help()
1717
echo " -z <duration> Run duration"
1818
echo " -w <interval> Output interval"
1919
echo " -p <mpirun cmd> Parallel launch command (MPI), e.g. mpirun, mpiexec_mpt, mpiexec -np 8 --oversubscribe"
20-
echo " -t <target> Target for the test"
20+
echo " -t <toolchain> Toolchain for the test"
2121
echo " -k <diff exec> Diff executable"
2222
echo " -s <folder> Save result data to prefix location, full path for run constructed as <work>/<thisfolder>/<namelist>/"
2323
echo " -i <folder> Folder for bitwise-identical results, full path for run constructed as <work>/<thisfolder>/<namelist>/"
@@ -96,7 +96,7 @@ while getopts c:g:r:b:f:d:q:y:z:w:p:t:k:s:i:e:nh opt; do
9696
parallelExec="$OPTARG"
9797
;;
9898
t)
99-
target="$OPTARG"
99+
toolchain="$OPTARG"
100100
;;
101101
k)
102102
diffExec="$OPTARG"
@@ -121,24 +121,39 @@ if [ ! -z $envVars ]; then
121121
setenvStr "$envVars"
122122
fi
123123

124-
125124
# Check if testcase is valid
126125
if [ "$testcase" != "jw" ] && [ "$testcase" != "conus" ] && [ "$testcase" != "aquaplanet" ]; then
127-
echo "Error: Invalid testcase '$testcase'. Must be one of 'jw', 'conus', or 'aquaplanet'."
128-
exit 1
126+
echo "Error: Invalid testcase '$testcase'. Must be one of 'jw', 'conus', or 'aquaplanet'."
127+
exit 1
129128
fi
130129

131-
132130
# Check if testType is valid
133-
if [ "$testType" != "base" ] && [ "$testType" != "refgen" ] && [ "$testType" != "restart" ] && [ "$testType" != "mpi" ] && [ "$testType" != "multigpu" ] && [ "$testType" != "omp" ] && [ "$testType" != "perf" ]; then
134-
echo "Error: Invalid testType '$testType'. Must be one of 'base', 'restart', 'mpi', or 'omp'."
135-
exit 1
131+
if [ "$testType" != "base" ] && [ "$testType" != "refgen" ] &&
132+
[ "$testType" != "restart" ] && [ "$testType" != "mpi" ] && [ "$testType" != "multinode" ] &&
133+
[ "$testType" != "multigpu" ] && [ "$testType" != "omp" ] &&
134+
[ "$testType" != "perfgen" ] && [ "$testType" != "perfcmp" ]; then
135+
echo "Error: Invalid testType '$testType'. Must be one of 'base', 'refgen', 'restart', 'mpi', 'multinode', 'multigpu', 'omp', 'perfgen', or 'perfcmp'."
136+
exit 1
137+
fi
138+
139+
# Check if toolchain is valid
140+
if [ "$toolchain" != "gnu" ] && [ "$toolchain" != "nvhpc" ] && [ "$toolchain" != "intel" ]; then
141+
echo "Error: Invalid toolchain '$toolchain'. Must be one of 'gnu', 'nvhpc', or 'intel'."
142+
exit 1
143+
fi
144+
145+
# Check if device is valid
146+
if [ "$device" != "cpu" ] && [ "$device" != "gpu" ]; then
147+
echo "Error: Invalid device '$device'. Must be one of 'cpu' or 'gpu'."
148+
exit 1
136149
fi
137-
runDir=${testcase}_${target}_${testType}_${device}_${precision}
138150

139-
baserunDir=${testcase}_${target}_base_${device}_${precision}
140151

141-
TESTNAME="${testcase} ${target} ${testType} ${device} ${precision}"
152+
runDir=${testcase}_${toolchain}_${testType}_${device}_${precision}
153+
154+
baserunDir=${testcase}_${toolchain}_base_${device}_${precision}
155+
156+
TESTNAME="${testcase} ${toolchain} ${testType} ${device} ${precision}"
142157
echo "TEST : $TESTNAME"
143158

144159
# from https://ncar-hpc-docs.readthedocs.io/en/latest/pbs/job-scripts/#derecho
@@ -193,19 +208,14 @@ eval "repo_id=\$( git rev-parse --short=20 HEAD )" # github actions doesn't fet
193208
eval "repo_id_short=\$( git rev-parse --short=10 HEAD )" # github actions doesn't fetch tags yet
194209
eval "repo_timestamp=\$( git show --no-patch --format=%ci )"
195210

196-
# Clean up previous runs
197-
# rm wrfinput_d* wrfbdy_d* wrfout_d* wrfchemi_d* wrf_chem_input_d* rsl* real.print.out* wrf.print.out* wrf_d0*_runstats.out qr_acr_qg_V4.dat fort.98 fort.88 -rf
198-
199-
200211
# Go to run location now - We only operate here from now on
201212
cd $runDir || exit $?
202-
# Clean up previous runs
203-
# rm wrfinput_d* wrfbdy_d* wrfout_d* wrfchemi_d* wrf_chem_input_d* rsl* real.print.out* wrf.print.out* wrf_d0*_runstats.out qr_acr_qg_V4.dat fort.98 fort.88 -rf
213+
# TODO: Clean up previous runs
204214

205215

206216
# Copy namelist
207217
echo "Setting $caseInputDir/namelist.atmosphere as namelist.atmosphere "
208-
# remove old namelist.input which may be a symlink in which case this would have failed
218+
# TODO: remove old namelist.input which may be a symlink in which case this would have failed
209219
#rm namelist.input
210220
cp $caseInputDir/namelist.atmosphere namelist.atmosphere || exit $?
211221
cp $caseInputDir/streams.atmosphere streams.atmosphere || exit $?
@@ -221,6 +231,7 @@ if [ -n "$outputInterval" ]; then
221231
stream_replace "output" "output_interval" "$outputInterval" streams.atmosphere
222232
fi
223233

234+
224235
if [ "$testType" = "restart" ]; then
225236
nml_replace "config_do_restart" "true" namelist.atmosphere
226237
nml_replace_quotes "config_run_duration" "$runDuration" namelist.atmosphere
@@ -315,19 +326,19 @@ if [ "$testType" = "restart" ] || [ "$testType" = "mpi" ] || [ "$testType" = "mu
315326

316327
elif [ "$testType" = "base" ]; then
317328

318-
head_sha=$( cat $caseInputDir/reference/head_${target}_${device}_${precision} ) || exit $?
329+
head_sha=$( cat $caseInputDir/reference/head_${toolchain}_${device}_${precision} ) || exit $?
319330
echo "Comparing base with reference SHA: $head_sha"
320-
echo "Restart file: $caseInputDir/reference/restart.${restart_compare_time}_${head_sha}_${target}_${device}.nc"
321-
diff_output $runDir/restart.${restart_compare_time}.nc "$caseInputDir/reference/restart.${restart_compare_time}_${head_sha}_${target}_${device}_${precision}.nc"
331+
echo "Restart file: $caseInputDir/reference/restart.${restart_compare_time}_${head_sha}_${toolchain}_${device}_${precision}.nc"
332+
diff_output $runDir/restart.${restart_compare_time}.nc "$caseInputDir/reference/restart.${restart_compare_time}_${head_sha}_${toolchain}_${device}_${precision}.nc"
322333
result=$?
323334

324335
elif [ "$testType" = "refgen" ]; then
325336

326-
mv "$runDir/restart.${restart_compare_time}.nc" "$caseInputDir/reference/restart.${restart_compare_time}_${repo_id_short}_${target}_${device}_${precision}.nc" || exit $?
337+
mv "$runDir/restart.${restart_compare_time}.nc" "$caseInputDir/reference/restart.${restart_compare_time}_${repo_id_short}_${toolchain}_${device}_${precision}.nc" || exit $?
327338

328-
echo "${repo_id_short}" > $caseInputDir/reference/head_${target}_${device}_${precision} || exit $?
339+
echo "${repo_id_short}" > $caseInputDir/reference/head_${toolchain}_${device}_${precision} || exit $?
329340

330-
elif [ "$testType" = "perf" ]; then
341+
elif [ "$testType" = "perfgen" ] || [ "$testType" = "perfcmp" ]; then
331342

332343
log_file_path=$runDir/$log_file
333344

@@ -351,9 +362,12 @@ elif [ "$testType" = "perf" ]; then
351362

352363
machine="derecho"
353364

354-
eval "python $workingDirectory/.ci/tests/perf_stats.py $db_file $testcase $machine $device $target $repo_id $totaltime_1,$totaltime_2,$totaltime_3,$totaltime_4,$totaltime_5"
365+
if [ "$testType" = "perfgen" ]; then
366+
eval "python $workingDirectory/.ci/tests/perf_stats.py $db_file $testcase $machine $device $toolchain $repo_id $totaltime_1,$totaltime_2,$totaltime_3,$totaltime_4,$totaltime_5"
367+
elif [ "$testType" = "perfcmp" ]; then
368+
eval "python $workingDirectory/.ci/tests/query_perf_db.py compare_to_ref $db_file $testcase $machine $device $toolchain $totaltime_1,$totaltime_2,$totaltime_3,$totaltime_4,$totaltime_5"
369+
fi
355370
result=$?
356-
#eval "python $workingDirectory/.ci/tests/query_perf_db.py compare_to_ref $db_file $testcase $machine $device $target $totaltime_1,$totaltime_2,$totaltime_3,$totaltime_4,$totaltime_5"
357371

358372
fi
359373

@@ -365,6 +379,10 @@ else
365379
echo "TEST $TESTNAME FAIL"
366380
exit 1
367381
fi
382+
383+
384+
385+
368386
#if [ -z "$errorMsg" ]; then
369387
# Unlink everything we linked in
370388
#ls $data/ | xargs -I{} rm {}

0 commit comments

Comments
 (0)