diff --git a/pdm.lock b/pdm.lock index 914f146..a2ef3b2 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "test"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:e273b4e2a24a70e4b0bbcb98ac1f7cec78e390d595ac221ab0c85eb21c5edc29" +content_hash = "sha256:7979ae553052f5db03a354a1a2a85fdf5b3e319aa60803091a97c43e62e46dc1" [[metadata.targets]] requires_python = ">=3.12" @@ -281,6 +281,31 @@ files = [ {file = "json5-0.10.0.tar.gz", hash = "sha256:e66941c8f0a02026943c52c2eb34ebeb2a6f819a0be05920a6f5243cd30fd559"}, ] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +requires_python = ">=3.8" +summary = "Python port of markdown-it. Markdown parsing, done right!" +groups = ["default"] +dependencies = [ + "mdurl~=0.1", +] +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +requires_python = ">=3.7" +summary = "Markdown URL utilities" +groups = ["default"] +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "mock" version = "5.1.0" @@ -354,6 +379,17 @@ files = [ {file = "polars-1.20.0.tar.gz", hash = "sha256:e8e9e3156fae02b58e276e5f2c16a5907a79b38617a9e2d731b533d87798f451"}, ] +[[package]] +name = "pygments" +version = "2.19.1" +requires_python = ">=3.8" +summary = "Pygments is a syntax highlighting package written in Python." 
+groups = ["default"] +files = [ + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, +] + [[package]] name = "pympler" version = "1.1" @@ -482,6 +518,39 @@ files = [ {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] +[[package]] +name = "rich" +version = "13.9.4" +requires_python = ">=3.8.0" +summary = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +groups = ["default"] +dependencies = [ + "markdown-it-py>=2.2.0", + "pygments<3.0.0,>=2.13.0", + "typing-extensions<5.0,>=4.0.0; python_version < \"3.11\"", +] +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[[package]] +name = "rich-click" +version = "1.8.5" +requires_python = ">=3.7" +summary = "Format click help output nicely with rich" +groups = ["default"] +dependencies = [ + "click>=7", + "importlib-metadata; python_version < \"3.8\"", + "rich>=10.7", + "typing-extensions>=4", +] +files = [ + {file = "rich_click-1.8.5-py3-none-any.whl", hash = "sha256:0fab7bb5b66c15da17c210b4104277cd45f3653a7322e0098820a169880baee0"}, + {file = "rich_click-1.8.5.tar.gz", hash = "sha256:a3eebe81da1c9da3c32f3810017c79bd687ff1b3fa35bfc9d8a3338797f1d1a1"}, +] + [[package]] name = "soupsieve" version = "2.6" @@ -520,7 +589,7 @@ name = "typing-extensions" version = "4.12.2" requires_python = ">=3.8" summary = "Backported and Experimental Type Hints for Python 3.8+" -groups = ["test"] +groups = ["default", "test"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, diff --git a/pyproject.toml b/pyproject.toml index 7cfe6da..a21a2da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "File oriented ASV benchmark comparer" authors = [ {name = "Rohit Goswami", email = "rgoswami@ieee.org"}, ] -dependencies = ["asv @ git+https://github.com/airspeed-velocity/asv/", "click>=8.1.8", "asv-runner>=0.2.1", "polars>=1.20.0"] +dependencies = ["asv @ git+https://github.com/airspeed-velocity/asv/", "click>=8.1.8", "asv-runner>=0.2.1", "polars>=1.20.0", "rich-click>=1.8.5"] requires-python = ">=3.12" readme = "README.md" license = {text = "MIT"} diff --git a/src/asv_spyglass/_asv_ro.py b/src/asv_spyglass/_asv_ro.py index 8e249e4..7e8e001 100644 --- a/src/asv_spyglass/_asv_ro.py +++ b/src/asv_spyglass/_asv_ro.py @@ -1,8 +1,6 @@ import itertools -import json import re from pathlib import Path -from typing import Iterator, Union from asv.util import load_json as asv_json_load @@ -14,19 +12,21 @@ class ReadOnlyASVBenchmarks: api_version = 2 - def __init__(self, benchmarks_file: Path, regex: Union[str, list[str]] = None): + def __init__(self, benchmarks_file: Path, regex: str | list[str] = None): """ Initialize and load benchmarks from a JSON file, optionally filtering them. Args: benchmarks_file (Path): Path to the benchmarks JSON file. - regex (Union[str, list[str]], optional): Regular expression(s) to filter benchmarks. 
- Defaults to None (all benchmarks included). + regex (Union[str, list[str]], optional): Regular expression(s) to + filter benchmarks. Defaults to None (all benchmarks included). """ d = asv_json_load(getstrform(benchmarks_file), api_version=self.api_version) self._base_benchmarks = {} # Store all benchmarks here self._benchmark_selection = {} # Track selected parameter combinations - self.filtered_benchmarks = {} # Store selected benchmarks here after parameter expansion + self.filtered_benchmarks = ( + {} + ) # Store selected benchmarks here after parameter expansion if not regex: regex = [] diff --git a/src/asv_spyglass/_num.py b/src/asv_spyglass/_num.py new file mode 100644 index 0000000..0dfdf4d --- /dev/null +++ b/src/asv_spyglass/_num.py @@ -0,0 +1,90 @@ +import math +from dataclasses import dataclass + + +@dataclass +class BenchNum: + val: float + err: float | None + unit: str | None + + +class Ratio: + """ + Represents the ratio between two numeric values (t2 / t1), handling + potential issues like division by zero, NaN, and None values. It also + supports marking a ratio as insignificant. + + Attributes: + _t1 (float | None): The first value (denominator). + _t2 (float | None): The second value (numerator). + val (float): The calculated ratio, which can be: + - The actual ratio if both _t1 and _t2 are valid numbers and _t1 is + not zero. + - math.inf if either _t1 or _t2 is None or NaN, or if _t1 is zero. + is_insignificant (bool): A flag indicating whether the ratio should be + considered statistically insignificant. Defaults to False. + + Methods: + __init__(self, t1: float | None, t2: float | None): + Initializes a Ratio object, calculating the ratio if possible. + + __repr__(self): + Returns a string representation of the ratio: + - "n/a" if the ratio is undefined (val is math.inf). + - "~" followed by the formatted ratio if is_insignificant is True. + - A formatted string with 2 decimal places otherwise. + + _is_invalid(self, t1, t2): + A private helper method to check if the inputs are invalid for ratio + calculation. + """ + + def __init__(self, t1: float | None, t2: float | None): + """ + Initializes a Ratio object. + + Args: + t1 (float | None): The first value (denominator). + t2 (float | None): The second value (numerator). + """ + self._t1 = t1 + self._t2 = t2 + self.is_insignificant = False + self.val = None + + if self._is_invalid(t1, t2): + self.val = math.inf + else: + try: + self.val = t2 / t1 + except ZeroDivisionError: + self.val = math.inf + + def __repr__(self): + """ + Returns a string representation of the ratio. + + If val is math.inf, returns "n/a". + If is_insignificant is True, returns "~" followed by the formatted ratio. + Otherwise, returns the ratio formatted to 2 decimal places. + """ + if self.val == math.inf: + return "n/a" + elif self.is_insignificant: + return "~" + f"{self.val:6.2f}".strip() + else: + return f"{self.val:6.2f}" + + def _is_invalid(self, t1, t2): + """ + Checks if the inputs are invalid for ratio calculation. + + Args: + t1 (float | None): The first value. + t2 (float | None): The second value. + + Returns: + bool: True if either t1 or t2 is None or NaN, False otherwise. 
+ """ + return t1 is None or t2 is None or math.isnan(t1) or math.isnan(t2) diff --git a/src/asv_spyglass/changes.py b/src/asv_spyglass/changes.py new file mode 100644 index 0000000..2b5a28f --- /dev/null +++ b/src/asv_spyglass/changes.py @@ -0,0 +1,95 @@ +import enum +from dataclasses import dataclass + + +class ResultColor(enum.StrEnum): + DEFAULT = "white" + GREEN = enum.auto() + RED = enum.auto() + LIGHTGREY = "light_grey" + + +class ResultMark(enum.StrEnum): + BETTER = "-" + WORSE = "+" + FAILURE = "!" + FIXED = "*" + INCOMPARABLE = "x" + UNCHANGED = " " + INSIGNIFICANT = "~" + + +class AfterIs(enum.Enum): + LUKEWARM = 0 + WORSE = -1 + BETTER = 1 + + +@dataclass +class ASVChangeInfo: + mark: ResultMark + color: ResultColor + description: str + state: AfterIs + before: str + after: str + + +@dataclass +class Incomparable(ASVChangeInfo): + mark: ResultMark = ResultMark.INCOMPARABLE + color: ResultColor = ResultColor.LIGHTGREY + description: str = "Not comparable" + state: AfterIs = AfterIs.LUKEWARM + before: str = "" + after: str = "" + + +@dataclass +class Failure(ASVChangeInfo): + mark: ResultMark = ResultMark.FAILURE + color: ResultColor = ResultColor.RED + description: str = "Introduced a failure" + state: AfterIs = AfterIs.WORSE + before: str = "Succeeded" + after: str = "Failed" + + +@dataclass +class Fixed(ASVChangeInfo): + mark: ResultMark = ResultMark.FIXED + color: ResultColor = ResultColor.GREEN + description: str = "Fixed a failure" + state: AfterIs = AfterIs.BETTER + before: str = "Failed" + after: str = "Succeeded" + + +@dataclass +class NoChange(ASVChangeInfo): + mark: ResultMark = ResultMark.UNCHANGED + color: ResultColor = ResultColor.DEFAULT + description: str = "Both failed or either was skipped or no significant change" + state: AfterIs = AfterIs.LUKEWARM + before: str = "" + after: str = "" + + +@dataclass +class Better(ASVChangeInfo): + mark: ResultMark = ResultMark.BETTER + color: ResultColor = ResultColor.GREEN + description: str = "Relative improvement" + state: AfterIs = AfterIs.BETTER + before: str = "Worse" + after: str = "Better" + + +@dataclass +class Worse(ASVChangeInfo): + mark: ResultMark = ResultMark.WORSE + color: ResultColor = ResultColor.RED + description: str = "Relatively worse" + state: AfterIs = AfterIs.WORSE + before: str = "Better" + after: str = "Worse" diff --git a/src/asv_spyglass/cli.py b/src/asv_spyglass/cli.py index c26f6f9..910a50a 100644 --- a/src/asv_spyglass/cli.py +++ b/src/asv_spyglass/cli.py @@ -3,13 +3,17 @@ import click import polars as pl from asv import results +from rich import box +from rich.console import Console +from rich.table import Table +from rich_click import RichCommand, RichGroup from asv_spyglass._asv_ro import ReadOnlyASVBenchmarks -from asv_spyglass._aux import getstrform +from asv_spyglass.changes import ResultMark from asv_spyglass.compare import ResultPreparer, do_compare -@click.group() +@click.group(cls=RichGroup) def cli(): """ Command-line interface for ASV benchmark analysis. 
@@ -17,7 +21,7 @@ def cli(): pass -@cli.command() +@cli.command(cls=RichCommand) @click.argument("b1", type=click.Path(exists=True), required=True) @click.argument("b2", type=click.Path(exists=True), required=True) @click.argument("bconf", type=click.Path(exists=True), required=True) @@ -44,14 +48,65 @@ def cli(): show_default=True, help="Sort output by change, ratio, or name.", ) -def compare(b1, b2, bconf, factor, split, only_changed, sort): # Renamed to 'compare' +def compare(b1, b2, bconf, factor, split, only_changed, sort): """ Compare two ASV result files. """ - print(do_compare(b1, b2, bconf, factor, split, only_changed, sort)) + all_tables = do_compare(b1, b2, bconf, factor, split, only_changed, sort) + console = Console() -@cli.command() + for key, table_data in all_tables.items(): + if not only_changed: + console.print("") + console.print(table_data["title"], style="bold") + console.print("") + + table = Table( + title=table_data["title"], + show_header=True, + header_style="bold magenta", + box=box.SIMPLE, + ) + + for header in table_data["headers"]: + table.add_column( + header, justify="right" if header != "Benchmark (Parameter)" else "left" + ) + + for row in table_data["table_data"]: + change_mark = row[0] + row_style = "" + + # Determine row style based on change_mark + if change_mark == ResultMark.BETTER: + row_style = "green" + elif change_mark == ResultMark.WORSE: + row_style = "red" + elif change_mark == ResultMark.FAILURE: + row_style = "red" + elif change_mark == ResultMark.FIXED: + row_style = "green" + elif change_mark == ResultMark.INCOMPARABLE: + row_style = "light_grey" + elif change_mark == ResultMark.UNCHANGED: + row_style = "white" + elif change_mark == ResultMark.INSIGNIFICANT: + row_style = "white" + + table.add_row(*row, style=row_style) + + console.print(table) + + # Print summary of worsened/improved status + if not split: + if (av_x := sum([x.value for x in table_data["states"]])) > 0: + console.print("[bold green]Net Improvement![/]") + elif av_x < 0: + console.print("[bold red]Net Regression![/]") + + +@cli.command(cls=RichCommand) @click.argument("bres", type=click.Path(exists=True), required=True) @click.argument("bdat", type=click.Path(exists=True), required=True) @click.option( diff --git a/src/asv_spyglass/compare.py b/src/asv_spyglass/compare.py index ab91c4c..30e43e2 100644 --- a/src/asv_spyglass/compare.py +++ b/src/asv_spyglass/compare.py @@ -1,17 +1,23 @@ -import math from pathlib import Path import polars as pl -import tabulate from asv import results from asv.commands.compare import _is_result_better, _isna, unroll_result -from asv.console import log from asv.util import human_value -from asv_runner.console import color_print -from asv_runner.statistics import get_err from asv_spyglass._asv_ro import ReadOnlyASVBenchmarks -from asv_spyglass.results import PreparedResult, result_iter +from asv_spyglass._num import Ratio +from asv_spyglass.changes import ( + ASVChangeInfo, + Better, + Failure, + Fixed, + Incomparable, + NoChange, + ResultMark, + Worse, +) +from asv_spyglass.results import ASVBench, PreparedResult, result_iter class ResultPreparer: @@ -88,229 +94,241 @@ def prepare(self, result_data): ) -def do_compare( - b1, - b2, - bdat, - factor=1.1, - split=False, - only_changed=False, - sort="default", - machine=None, - env_spec=None, - use_stats=True, -): - # Load results - res_1 = results.Results.load(b1) - res_2 = results.Results.load(b2) +def _get_change_info( + asv1: ASVBench, asv2: ASVBench, factor, use_stats +) -> ASVChangeInfo: + 
if ( + asv1.version is not None + and asv2.version is not None + and asv1.version != asv2.version + ): + # not comparable + return Incomparable() + elif asv1.time is not None and asv2.time is None: + # introduced a failure + return Failure() + elif asv1.time is None and asv2.time is not None: + # fixed a failure + return Fixed() + elif asv1.time is None and asv2.time is None: + # both failed + return NoChange() + elif _isna(asv1.time) or _isna(asv2.time): + # either one was skipped + return NoChange() + elif _is_result_better( + asv2.time, + asv1.time, + asv2.stats_n_samples, + asv1.stats_n_samples, + factor, + use_stats=use_stats, + ): + return Better() + elif _is_result_better( + asv1.time, + asv2.time, + asv1.stats_n_samples, + asv2.stats_n_samples, + factor, + use_stats=use_stats, + ): + return Worse() + else: + return NoChange() - # Initialize benchmarks - benchmarks = ReadOnlyASVBenchmarks(Path(bdat)).benchmarks - # Prepare results using the ResultPreparer class - preparer = ResultPreparer(benchmarks) - prepared_results_1 = preparer.prepare(res_1) - prepared_results_2 = preparer.prepare(res_2) - # Kanged from compare.py - - # Extract data from prepared results - results_1 = prepared_results_1.results - results_2 = prepared_results_2.results - ss_1 = prepared_results_1.stats - ss_2 = prepared_results_2.stats - versions_1 = prepared_results_1.versions - versions_2 = prepared_results_2.versions - units = prepared_results_1.units - - machine_env_names = set() - mname_1 = f"{prepared_results_1.machine_name}/{prepared_results_1.env_name}" - mname_2 = f"{prepared_results_2.machine_name}/{prepared_results_2.env_name}" - machine_env_names.add(mname_1) - machine_env_names.add(mname_2) - - benchmarks_1 = set(results_1.keys()) - benchmarks_2 = set(results_2.keys()) - joint_benchmarks = sorted(list(benchmarks_1 | benchmarks_2)) - bench = {} +def _create_comparison_dataframe( + pr1: PreparedResult, + pr2: PreparedResult, + factor: float, + only_changed: bool, + use_stats: bool, +): + """ + Creates a Polars DataFrame comparing results from two PreparedResult objects. - if split: - bench["green"] = [] - bench["red"] = [] - bench["lightgrey"] = [] - bench["default"] = [] - else: - bench["all"] = [] + Args: + pr1 (PreparedResult): The first PreparedResult object. + pr2 (PreparedResult): The second PreparedResult object. + factor (float): The factor used for determining significance. + only_changed (bool): Whether to only include changed benchmarks. + use_stats (bool): Whether to use statistical significance. - worsened = False - improved = False + Returns: + pl.DataFrame: A DataFrame with comparison data. 
+ """ + data = [] + machine_env_names = { + f"{pr1.machine_name}/{pr1.env_name}", + f"{pr2.machine_name}/{pr2.env_name}", + } + + for benchmark in sorted(set(pr1.results.keys()) | set(pr2.results.keys())): + asv1 = ASVBench(benchmark, pr1) + asv2 = ASVBench(benchmark, pr2) + + ratio = Ratio(asv1.time, asv2.time) + diffinfo = _get_change_info( + asv1, + asv2, + factor, + use_stats, + ) - for benchmark in joint_benchmarks: - if benchmark in results_1: - time_1 = results_1[benchmark] - else: - time_1 = math.nan + if isinstance(diffinfo, NoChange): + # Mark statistically insignificant results + if _is_result_better( + asv1.time, asv2.time, None, None, factor + ) or _is_result_better(asv2.time, asv1.time, None, None, factor): + ratio.is_insignificant = True + + if only_changed and diffinfo.mark in ( + ResultMark.UNCHANGED, + ResultMark.INCOMPARABLE, + ResultMark.FIXED, + ): + continue - if benchmark in results_2: - time_2 = results_2[benchmark] + # Determine benchmark name format + if len(machine_env_names) > 1: + benchmark_name = ( + f"{benchmark} [{pr1.machine_name}/{pr1.env_name}" + f" -> {pr2.machine_name}/{pr2.env_name}]" + ) else: - time_2 = math.nan + benchmark_name = benchmark - if benchmark in ss_1 and ss_1[benchmark][0]: - err_1 = get_err(time_1, ss_1[benchmark][0]) - else: - err_1 = None + assert asv1.unit == asv2.unit, "Units for benchmark must match" + + data.append( + { + "benchmark": benchmark_name, + "mark": diffinfo.mark, + "before": asv1.time, + "after": asv2.time, + "ratio": ratio.val, + "is_insignificant": ratio.is_insignificant, + "err_before": asv1.err, + "err_after": asv2.err, + "unit": asv1.unit or asv2.unit, # Use unit from asv1 or asv2 + "color": diffinfo.color.name.lower(), + "state": diffinfo.state, + } + ) - if benchmark in ss_2 and ss_2[benchmark][0]: - err_2 = get_err(time_2, ss_2[benchmark][0]) - else: - err_2 = None + return pl.DataFrame(data) - version_1 = versions_1.get(benchmark) - version_2 = versions_2.get(benchmark) - if _isna(time_1) or _isna(time_2): - ratio = "n/a" - ratio_num = 1e9 - else: - try: - ratio_num = time_2 / time_1 - ratio = f"{ratio_num:6.2f}" - except ZeroDivisionError: - ratio_num = 1e9 - ratio = "n/a" - - if version_1 is not None and version_2 is not None and version_1 != version_2: - # not comparable - color = "lightgrey" - mark = "x" - elif time_1 is not None and time_2 is None: - # introduced a failure - color = "red" - mark = "!" - worsened = True - elif time_1 is None and time_2 is not None: - # fixed a failure - color = "green" - mark = " " - improved = True - elif time_1 is None and time_2 is None: - # both failed - color = "default" - mark = " " - elif _isna(time_1) or _isna(time_2): - # either one was skipped - color = "default" - mark = " " - elif _is_result_better( - time_2, - time_1, - ss_2.get(benchmark), - ss_1.get(benchmark), - factor, - use_stats=use_stats, - ): - color = "green" - mark = "-" - improved = True - elif _is_result_better( - time_1, - time_2, - ss_1.get(benchmark), - ss_2.get(benchmark), - factor, - use_stats=use_stats, - ): - color = "red" - mark = "+" - worsened = True - else: - color = "default" - mark = " " +def _format_comparison_tables( + df: pl.DataFrame, sort: str, name_1: str, name_2: str, split: bool +): + """ + Formats a comparison DataFrame into tables for display. 
- # Mark statistically insignificant results - if _is_result_better( - time_1, time_2, None, None, factor - ) or _is_result_better(time_2, time_1, None, None, factor): - ratio = "~" + ratio.strip() + Args: + df (pl.DataFrame): The comparison DataFrame. + sort (str): The sorting method ("ratio" or "name"). + name_1 (str): The name of the first commit/result. + name_2 (str): The name of the second commit/result. - if only_changed and mark in (" ", "x"): - continue + Returns: + dict: A dictionary of formatted tables. + """ - unit = units[benchmark] + # Sort the DataFrame + if sort == "ratio": + df = df.sort("ratio", descending=True) + elif sort == "name": + df = df.sort("benchmark") - details = "{0:1s} {1:>15s} {2:>15s} {3:>8s} ".format( - mark, - human_value(time_1, unit, err=err_1), - human_value(time_2, unit, err=err_2), - ratio, - ) - split_line = details.split() - if len(machine_env_names) > 1: - benchmark_name = f"{benchmark} [{mname_1} -> {mname_2}]" - else: - benchmark_name = benchmark - if len(split_line) == 4: - split_line += [benchmark_name] - else: - split_line = [" "] + split_line + [benchmark_name] - if split: - bench[color].append(split_line) - else: - bench["all"].append(split_line) + # Construct the table data for each category + all_tables = {} if split: - keys = ["green", "default", "red", "lightgrey"] + colors = ["green", "default", "red", "lightgrey"] else: - keys = ["all"] + colors = ["all"] + df = df.with_columns(pl.lit("all").alias("color")) - titles = {} - titles["green"] = "Benchmarks that have improved:" - titles["default"] = "Benchmarks that have stayed the same:" - titles["red"] = "Benchmarks that have got worse:" - titles["lightgrey"] = "Benchmarks that are not comparable:" - titles["all"] = "All benchmarks:" + for color in colors: + if color != "all": + filtered_df = df.filter(pl.col("color") == color) + else: + filtered_df = df + + if not filtered_df.is_empty(): + table_data = [] + for row in filtered_df.iter_rows(named=True): + table_data.append( + [ + str(row["mark"]), + human_value(row["before"], row["unit"], err=row["err_before"]), + human_value(row["after"], row["unit"], err=row["err_after"]), + str(Ratio(row["before"], row["after"])), + row["benchmark"], + ] + ) + if color == "all": + title = "All benchmarks:" + else: + title = { + "green": "Benchmarks that have improved:", + "default": "Benchmarks that have stayed the same:", + "red": "Benchmarks that have got worse:", + "lightgrey": "Benchmarks that are not comparable:", + }[color] + + all_tables[color] = { + "title": title, + "headers": [ + "Change", + f"Before {name_1}", + f"After {name_2}", + "Ratio", + "Benchmark (Parameter)", + ], + "table_data": table_data, + "states": filtered_df.select(pl.col("state")).to_series().to_list(), + } + + return all_tables - log.flush() - for key in keys: - if len(bench[key]) == 0: - continue +def do_compare( + b1, + b2, + bdat, + factor=1.1, + split=False, + only_changed=False, + sort="default", + machine=None, + env_spec=None, + use_stats=True, +): + # Load results + res_1 = results.Results.load(b1) + res_2 = results.Results.load(b2) - if not only_changed: - color_print("") - color_print(titles[key]) - color_print("") + # Initialize benchmarks + benchmarks = ReadOnlyASVBenchmarks(Path(bdat)).benchmarks - name_1 = False # commit_names.get(hash_1) - if name_1: - name_1 = f"<{name_1}>" - else: - name_1 = "" + # Prepare results + preparer = ResultPreparer(benchmarks) + pr1 = preparer.prepare(res_1) + pr2 = preparer.prepare(res_2) - name_2 = False # 
commit_names.get(hash_2) - if name_2: - name_2 = f"<{name_2}>" - else: - name_2 = "" - - if sort == "default": - pass - elif sort == "ratio": - bench[key].sort(key=lambda v: v[3], reverse=True) - elif sort == "name": - bench[key].sort(key=lambda v: v[2]) - else: - raise ValueError("Unknown 'sort'") - - print(worsened, improved) - return tabulate.tabulate( - bench[key], - headers=[ - "Change", - f"Before {name_1}", - f"After {name_2}", - "Ratio", - "Benchmark (Parameter)", - ], - tablefmt="github", - ) + # Create the comparison DataFrame + df = _create_comparison_dataframe(pr1, pr2, factor, only_changed, use_stats) + + # Get commit names or use empty strings + name_1 = "" # commit_names.get(hash_1, "") + name_2 = "" # commit_names.get(hash_2, "") + name_1 = f"<{name_1}>" if name_1 else "" + name_2 = f"<{name_2}>" if name_2 else "" + + # Format the DataFrame into tables + all_tables = _format_comparison_tables(df, sort, name_1, name_2, split) + + return all_tables diff --git a/src/asv_spyglass/results.py b/src/asv_spyglass/results.py index 826f41a..9e2c812 100644 --- a/src/asv_spyglass/results.py +++ b/src/asv_spyglass/results.py @@ -1,11 +1,11 @@ import dataclasses +import math import re from collections import namedtuple +from dataclasses import dataclass, field import polars as pl -from asv import results - -from asv_spyglass._asv_ro import ReadOnlyASVBenchmarks +from asv_runner.statistics import get_err ASVResult = namedtuple( "ASVResult", @@ -41,7 +41,7 @@ def result_iter(bdot): ) -@dataclasses.dataclass +@dataclass class PreparedResult: """Augmented with information from the benchmarks.json""" @@ -54,8 +54,8 @@ class PreparedResult: param_names: list def __iter__(self): - for field in dataclasses.fields(self): - yield getattr(self, field.name) + for _field in dataclasses.fields(self): + yield getattr(self, _field.name) def to_df(self): """ @@ -111,3 +111,23 @@ def to_df(self): data.append(row) return pl.DataFrame(data) + + +@dataclass +class ASVBench: + name: str + _pr: PreparedResult + time: float = field(init=False) + stats_n_samples: tuple = field(init=False) + err: float | None = field(init=False) + version: str | None = field(init=False) + unit: str | None = field(init=False) + + def __post_init__(self): + self.time = self._pr.results.get(self.name, math.nan) + self.stats_n_samples = self._pr.stats.get(self.name, (None,)) + self.err = None + if self.name in self._pr.stats and self.stats_n_samples[0]: + self.err = get_err(self.time, self.stats_n_samples[0]) + self.version = self._pr.versions.get(self.name) + self.unit = self._pr.units.get(self.name) diff --git a/tests/approved_files/test_results.test_do_compare.approved.txt b/tests/approved_files/test_results.test_do_compare.approved.txt index 3f502e9..0d762d7 100644 --- a/tests/approved_files/test_results.test_do_compare.approved.txt +++ b/tests/approved_files/test_results.test_do_compare.approved.txt @@ -1,3 +1,14 @@ -| Change | Before | After | Ratio | Benchmark (Parameter) | -|----------|-----------|------------|---------|-------------------------------------------------------------------------------------------------------| -| - | 94.8±30μs | 28.4±0.2μs | 0.3 | benchmarks.TimeSuite.time_add_arr [rgx1gen11/conda-py3.11-numpy -> rgx1gen11/virtualenv-py3.12-numpy] | +{'all': {'headers': ['Change', + 'Before ', + 'After ', + 'Ratio', + 'Benchmark (Parameter)'], + 'states': [], + 'table_data': [['-', + '94.8±30μs', + '28.4±0.2μs', + ' 0.30', + 'benchmarks.TimeSuite.time_add_arr ' + '[rgx1gen11/conda-py3.11-numpy -> ' + 
'rgx1gen11/virtualenv-py3.12-numpy]']], + 'title': 'All benchmarks:'}} diff --git a/tests/test_num.py b/tests/test_num.py new file mode 100644 index 0000000..a3e69bc --- /dev/null +++ b/tests/test_num.py @@ -0,0 +1,58 @@ +import math + +from asv_spyglass._num import Ratio + + +def test_ratio_normal_calculation(): + ratio = Ratio(t1=10, t2=5) + assert ratio.val == 0.5 + assert str(ratio) == " 0.50" + + +def test_ratio_t2_bigger(): + ratio = Ratio(t1=5, t2=10) + assert ratio.val == 2.0 + assert str(ratio) == " 2.00" + + +def test_ratio_division_by_zero(): + ratio = Ratio(t1=0, t2=10) + assert ratio.val == math.inf + assert str(ratio) == "n/a" + + +def test_ratio_t1_is_nan(): + ratio = Ratio(t1=float("nan"), t2=10) + assert ratio.val == math.inf + assert str(ratio) == "n/a" + + +def test_ratio_t2_is_nan(): + ratio = Ratio(t1=10, t2=float("nan")) + assert ratio.val == math.inf + assert str(ratio) == "n/a" + + +def test_ratio_both_nan(): + ratio = Ratio(t1=float("nan"), t2=float("nan")) + assert ratio.val == math.inf + assert str(ratio) == "n/a" + + +def test_ratio_t1_none(): + ratio = Ratio(t1=None, t2=10) + assert ratio.val == math.inf + assert str(ratio) == "n/a" + + +def test_ratio_t2_none(): + ratio = Ratio(t1=10, t2=None) + assert ratio.val == math.inf + assert str(ratio) == "n/a" + + +def test_ratio_is_insignificant(): + ratio = Ratio(t1=10, t2=500) + ratio.is_insignificant = True + assert ratio.val == 50 + assert str(ratio) == '~50.00' diff --git a/tests/test_results.py b/tests/test_results.py index d9e80db..2aad977 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -21,10 +21,12 @@ def test_result_iter(shared_datadir): def test_do_compare(shared_datadir): verify( - do_compare( - getstrform(shared_datadir / "a0f29428-conda-py3.11-numpy.json"), - getstrform(shared_datadir / "a0f29428-virtualenv-py3.12-numpy.json"), - shared_datadir / "asv_samples_a0f29428_benchmarks.json", + pp.pformat( + do_compare( + getstrform(shared_datadir / "a0f29428-conda-py3.11-numpy.json"), + getstrform(shared_datadir / "a0f29428-virtualenv-py3.12-numpy.json"), + shared_datadir / "asv_samples_a0f29428_benchmarks.json", + ) ) ) diff --git a/tests/test_ro.py b/tests/test_ro.py index c35afbd..0bf29ea 100644 --- a/tests/test_ro.py +++ b/tests/test_ro.py @@ -1,5 +1,4 @@ import pprint as pp -from pathlib import Path from approvaltests.approvals import verify
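
Usage sketch (illustrative, not part of the patch): the new Ratio helper in src/asv_spyglass/_num.py and the change dataclasses in src/asv_spyglass/changes.py are self-contained and can be exercised directly. The timings mirror the values in the approved comparison output; the snippet assumes this branch of asv-spyglass is importable.

    # Illustrative only -- exercises the helpers introduced in this patch.
    from asv_spyglass._num import Ratio
    from asv_spyglass.changes import Better, NoChange, ResultMark, Worse

    # Ratio computes t2 / t1 and guards against None, NaN, and zero denominators.
    print(Ratio(94.8e-06, 28.4e-06))   # "  0.30" -- the improvement in the approved output
    print(Ratio(0.0, 28.4e-06))        # "n/a"    -- zero denominator
    print(Ratio(None, 28.4e-06))       # "n/a"    -- missing first measurement

    # The comparison flags near-identical timings as insignificant.
    r = Ratio(94.8e-06, 95.0e-06)
    r.is_insignificant = True
    print(r)                           # "~1.00"

    # Each change class carries its mark/colour/state defaults, so callers
    # only pick the right class to instantiate.
    for info in (Better(), Worse(), NoChange()):
        print(str(info.mark), info.color, info.description)
    assert Better().mark is ResultMark.BETTER

The same defaults drive the CLI rendering: do_compare() now returns a mapping of table groups (title, headers, table_data, states), and cli.py turns each ResultMark into a rich row style and sums the AfterIs states to report a net improvement or regression.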