Skip to content

Commit bf5d9da

Browse files
committed
feat(a/b): save A/B fail results
Save A/B test failures so that they can be analyzed offline. Signed-off-by: Egor Lazarchuk <[email protected]>
1 parent f2ef491 commit bf5d9da

File tree

1 file changed

+16
-3
lines changed

1 file changed

+16
-3
lines changed

tools/ab_test.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ def analyze_data(
277277
)
278278

279279
messages = []
280+
fails = []
280281
for dimension_set, metric, result, unit in failures:
281282
# Sanity check as described above
282283
if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
@@ -291,18 +292,30 @@ def analyze_data(
291292
old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
292293
new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])
293294

295+
change_unit = format_with_reduced_unit(result.statistic, unit)
296+
change_p = result.statistic / old_mean
297+
old_unit = format_with_reduced_unit(old_mean, unit)
298+
new_unit = format_with_reduced_unit(new_mean, unit)
299+
300+
fail = dict(dimension_set)
301+
fail["diff"] = change_p
302+
fails.append(fail)
303+
294304
msg = (
295305
f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
296-
f"{format_with_reduced_unit(result.statistic, unit)}, or {result.statistic / old_mean:.2%}, "
297-
f"(from {format_with_reduced_unit(old_mean, unit)} to {format_with_reduced_unit(new_mean, unit)}) "
306+
f"{change_unit}, or {change_p:.2%}, "
307+
f"(from {old_unit} to {new_unit}) "
298308
f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
299309
f"This means that observing a change of this magnitude or worse, assuming that performance "
300310
f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
301311
f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2, sort_keys=True)}"
302312
)
303313
messages.append(msg)
304314

305-
assert not messages, "\n" + "\n".join(messages)
315+
if messages:
316+
with open("test_results/ab.json", "w") as f:
317+
json.dump({"fails": fails}, f, indent=2, sort_keys=True)
318+
assert False, "\n" + "\n".join(messages)
306319
print("No regressions detected!")
307320

308321

0 commit comments

Comments
 (0)