1919
2020from tests .common .assertions import assert_no_errors_in_logs
2121from tests .common .osu_common import run_individual_osu_benchmark
22- from tests .common .utils import fetch_instance_slots , get_installed_parallelcluster_version , run_system_analyzer
22+ from tests .common .utils import (
23+ fetch_instance_slots ,
24+ get_installed_parallelcluster_version ,
25+ run_system_analyzer ,
26+ write_file ,
27+ )
2328
2429# We collected OSU benchmarks results for c5n.18xlarge only.
2530OSU_BENCHMARKS_INSTANCES = ["c5n.18xlarge" ]
@@ -61,6 +66,8 @@ def test_osu(
6166
6267 benchmark_failures = []
6368
69+ output_dir = request .config .getoption ("output_dir" )
70+
6471 # Run OSU benchmarks in efa-enabled queue.
6572 for mpi_version in mpi_variants :
6673 benchmark_failures .extend (
@@ -69,6 +76,7 @@ def test_osu(
6976 remote_command_executor ,
7077 scheduler_commands ,
7178 test_datadir ,
79+ output_dir ,
7280 os ,
7381 instance ,
7482 slots_per_instance ,
@@ -81,6 +89,7 @@ def test_osu(
8189 remote_command_executor ,
8290 scheduler_commands ,
8391 test_datadir ,
92+ output_dir ,
8493 os ,
8594 instance ,
8695 num_instances = 32 ,
@@ -108,6 +117,7 @@ def _test_osu_benchmarks_pt2pt(
108117 remote_command_executor ,
109118 scheduler_commands ,
110119 test_datadir ,
120+ output_dir ,
111121 os ,
112122 instance ,
113123 slots_per_instance ,
@@ -120,10 +130,11 @@ def _test_osu_benchmarks_pt2pt(
120130 accepted_number_of_failures = 4
121131
122132 failed_benchmarks = []
133+ benchmark_group = "pt2pt"
123134 for benchmark_name in ["osu_latency" , "osu_bibw" ]:
124135 _ , output = run_individual_osu_benchmark (
125136 mpi_version ,
126- "pt2pt" ,
137+ benchmark_group ,
127138 benchmark_name ,
128139 partition ,
129140 remote_command_executor ,
@@ -132,7 +143,9 @@ def _test_osu_benchmarks_pt2pt(
132143 slots_per_instance ,
133144 test_datadir ,
134145 )
135- failures = _check_osu_benchmarks_results (test_datadir , os , instance , mpi_version , benchmark_name , output )
146+ failures = _check_osu_benchmarks_results (
147+ test_datadir , output_dir , os , instance , mpi_version , benchmark_name , output
148+ )
136149 if failures > accepted_number_of_failures :
137150 failed_benchmarks .append (f"{ mpi_version } -{ benchmark_name } " )
138151
@@ -144,6 +157,7 @@ def _test_osu_benchmarks_collective(
144157 remote_command_executor ,
145158 scheduler_commands ,
146159 test_datadir ,
160+ output_dir ,
147161 os ,
148162 instance ,
149163 num_instances ,
@@ -154,10 +168,11 @@ def _test_osu_benchmarks_collective(
154168 accepted_number_of_failures = 3
155169
156170 failed_benchmarks = []
171+ benchmark_group = "collective"
157172 for benchmark_name in ["osu_allgather" , "osu_bcast" , "osu_allreduce" , "osu_alltoall" ]:
158173 _ , output = run_individual_osu_benchmark (
159174 mpi_version ,
160- "collective" ,
175+ benchmark_group ,
161176 benchmark_name ,
162177 partition ,
163178 remote_command_executor ,
@@ -167,7 +182,9 @@ def _test_osu_benchmarks_collective(
167182 test_datadir ,
168183 timeout = 24 ,
169184 )
170- failures = _check_osu_benchmarks_results (test_datadir , os , instance , mpi_version , benchmark_name , output )
185+ failures = _check_osu_benchmarks_results (
186+ test_datadir , output_dir , os , instance , mpi_version , benchmark_name , output
187+ )
171188 if failures > accepted_number_of_failures :
172189 failed_benchmarks .append (f"{ mpi_version } -{ benchmark_name } " )
173190
@@ -213,12 +230,13 @@ def _test_osu_benchmarks_multiple_bandwidth(
213230 assert_that (float (max_bandwidth )).is_greater_than (expected_bandwidth )
214231
215232
216- def _check_osu_benchmarks_results (test_datadir , os , instance , mpi_version , benchmark_name , output ):
233+ def _check_osu_benchmarks_results (test_datadir , output_dir , os , instance , mpi_version , benchmark_name , output ):
217234 logging .info (output )
218235 # Check avg latency for all packet sizes
219236 failures = 0
220237 metric_data = []
221238 metric_namespace = "ParallelCluster/test_efa"
239+ evaluation_output = ""
222240 for packet_size , value in re .findall (r"(\d+)\s+(\d+)\." , output ):
223241 with open (
224242 str (test_datadir / "osu_benchmarks" / "results" / os / instance / mpi_version / benchmark_name ),
@@ -236,11 +254,17 @@ def _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, bench
236254
237255 is_failure = int (value ) > tolerated_value
238256
257+ percentage_diff = (float (value ) - float (tolerated_value )) / float (tolerated_value ) * 100
258+
259+ outcome = "DEGRADATION" if percentage_diff > 0 else "IMPROVEMENT"
260+
239261 message = (
240- f"{ mpi_version } - { benchmark_name } - packet size { packet_size } : "
241- f"tolerated: { tolerated_value } , current: { value } "
262+ f"{ outcome } : { mpi_version } - { benchmark_name } - packet size { packet_size } : "
263+ f"tolerated: { tolerated_value } , current: { value } , percentage_diff: { percentage_diff } % "
242264 )
243265
266+ evaluation_output += f"\n { message } "
267+
244268 dimensions = {
245269 "PclusterVersion" : get_installed_parallelcluster_version (),
246270 "MpiVariant" : mpi_version ,
@@ -263,6 +287,11 @@ def _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, bench
263287 logging .error (message )
264288 else :
265289 logging .info (message )
290+ write_file (
291+ dirname = f"{ output_dir } /osu-results" ,
292+ filename = f"{ os } -{ instance } -{ mpi_version } -{ benchmark_name } -evaluation.out" ,
293+ content = evaluation_output ,
294+ )
266295 boto3 .client ("cloudwatch" ).put_metric_data (Namespace = metric_namespace , MetricData = metric_data )
267296
268297 return failures
0 commit comments