@@ -277,6 +277,7 @@ def analyze_data(
277
277
)
278
278
279
279
messages = []
280
+ fails = []
280
281
for dimension_set , metric , result , unit in failures :
281
282
# Sanity check as described above
282
283
if abs (statistics .mean (relative_changes_by_metric [metric ])) <= noise_threshold :
@@ -291,18 +292,30 @@ def analyze_data(
291
292
old_mean = statistics .mean (processed_emf_a [dimension_set ][metric ][0 ])
292
293
new_mean = statistics .mean (processed_emf_b [dimension_set ][metric ][0 ])
293
294
295
+ change_unit = format_with_reduced_unit (result .statistic , unit )
296
+ change_p = result .statistic / old_mean
297
+ old_unit = format_with_reduced_unit (old_mean , unit )
298
+ new_unit = format_with_reduced_unit (new_mean , unit )
299
+
300
+ fail = dict (dimension_set )
301
+ fail ["diff" ] = change_p
302
+ fails .append (fail )
303
+
294
304
msg = (
295
305
f"\033 [0;32m[Firecracker A/B-Test Runner]\033 [0m A/B-testing shows a change of "
296
- f"{ format_with_reduced_unit ( result . statistic , unit ) } , or { result . statistic / old_mean :.2%} , "
297
- f"(from { format_with_reduced_unit ( old_mean , unit ) } to { format_with_reduced_unit ( new_mean , unit ) } ) "
306
+ f"{ change_unit } , or { change_p :.2%} , "
307
+ f"(from { old_unit } to { new_unit } ) "
298
308
f"for metric \033 [1m{ metric } \033 [0m with \033 [0;31m\033 [1mp={ result .pvalue } \033 [0m. "
299
309
f"This means that observing a change of this magnitude or worse, assuming that performance "
300
310
f"characteristics did not change across the tested commits, has a probability of { result .pvalue :.2%} . "
301
311
f"Tested Dimensions:\n { json .dumps (dict (dimension_set ), indent = 2 , sort_keys = True )} "
302
312
)
303
313
messages .append (msg )
304
314
305
- assert not messages , "\n " + "\n " .join (messages )
315
+ if messages :
316
+ with open ("test_results/ab.json" , "w" ) as f :
317
+ json .dump ({"fails" : fails }, f , indent = 2 , sort_keys = True )
318
+ assert False , "\n " + "\n " .join (messages )
306
319
print ("No regressions detected!" )
307
320
308
321
0 commit comments