Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bbttest/bbt/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
"strong_interpretation_raw",
]

# Allowed values for an ``interpretation`` argument: either the "weak" or the
# "strong" interpretation.
InterpretationTypes = Literal[
"weak",
"strong",
]

# Every literal value declared by ReportedPropertyColumnType, materialised as a
# list via typing.get_args.
ALL_PROPERTIES_COLUMNS: list[ReportedPropertyColumnType] = list(
get_args(ReportedPropertyColumnType)
)
8 changes: 6 additions & 2 deletions bbttest/bbt/alg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@
import pandas as pd
from tqdm.auto import tqdm

from .const import UNNAMED_COLUMNS_WARNING_TEMPLATE

# NOTE(review): presumably column indices into the win table built by this
# module — confirm against the code that indexes with them.
ALG1_COL = 2
ALG2_COL = 3
TIE_COL = 4

logger = log.getLogger(__name__)


# Warning text with two ``str.format`` placeholders: ``algorithms_names`` and
# ``dataset_col``.
UNNAMED_COLUMNS_WARNING_TEMPLATE = """Some algorithm names are unnamed. This may lead to issues in the win table construction.
Algorithm names extracted: {algorithms_names}
Dataset column: {dataset_col}
"""


def _gen_pairs(no_algs: int) -> Generator[tuple[int, int, int], None, None]:
k = 0
for i in range(no_algs):
Expand Down
4 changes: 0 additions & 4 deletions bbttest/bbt/const.py

This file was deleted.

3 changes: 3 additions & 0 deletions bbttest/bbt/plots/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._critical_difference import plot_cdd_diagram

__all__ = ["plot_cdd_diagram"]
196 changes: 196 additions & 0 deletions bbttest/bbt/plots/_critical_difference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

# Warning message used by ``_plot_cdd_diagram`` when it is given an empty list
# of equivalence bars.
NO_EQUIVALENCE_CLIQUEST_WARNING_TEMPLATE = """No groups of equivalent algorithms were found in the posterior table.
CDD plot will not contain any equivalence bars."""


def get_bars_for_ccd(
    posterior_df: pd.DataFrame,
    models_df: pd.DataFrame,
    interpretation_col: str,
) -> list[tuple[int, int]]:
    """Calculate equivalence bars using the equivalence cliques in the posterior table.

    An undirected graph is built with one edge per posterior row whose
    ``interpretation_col`` value is ``"="``. Every maximal clique of that graph
    becomes one bar spanning the smallest and the largest ``pos`` of the
    clique's models.

    Parameters
    ----------
    posterior_df : pd.DataFrame
        Pairwise comparison table with ``left_model``, ``right_model`` and
        ``interpretation_col`` columns.
    models_df : pd.DataFrame
        Table with a ``model`` column and a ``pos`` column holding each model's
        position on the diagram.
    interpretation_col : str
        Name of the column of ``posterior_df`` holding the interpretation sign.

    Returns
    -------
    list[tuple[int, int]]
        ``(min_pos, max_pos)`` for each equivalence clique, without duplicates,
        in the order the cliques were first encountered.
    """
    if posterior_df.empty:
        return []

    # Keep only the pairs declared equivalent and add them as edges in one go.
    equivalent = posterior_df[posterior_df[interpretation_col] == "="]
    g = nx.Graph()
    g.add_edges_from(zip(equivalent["left_model"], equivalent["right_model"]))

    # A dict is used as an insertion-ordered set: two different cliques can
    # cover exactly the same span, and drawing that span twice adds nothing.
    bars: dict[tuple[int, int], None] = {}
    for clique in nx.find_cliques(g):
        clique_pos = models_df.loc[models_df["model"].isin(clique), "pos"]
        if clique_pos.empty:
            # None of the clique members is listed in models_df; min()/max()
            # would yield NaN and an undrawable bar.
            continue
        bars[(clique_pos.min(), clique_pos.max())] = None

    return list(bars)


def assign_bar_position(
    bars: list[tuple[int, int]], min_distance: int = 1
) -> list[int]:
    """Order the bars vertically to minimize the size of the plot.

    Each bar is widened by ``min_distance`` on both sides and then placed on
    the first row whose last bar ends strictly before the widened bar starts.
    Two bars therefore share a row only when the gap between them is strictly
    greater than ``2 * min_distance``.

    Bars are visited by ascending start position, so the number of rows does
    not depend on the order in which the bars were passed in.

    Parameters
    ----------
    bars : list[tuple[int, int]]
        ``(start, end)`` span of every bar.
    min_distance : int, optional
        Padding added to both ends of every bar. Defaults to 1.

    Returns
    -------
    list[int]
        Row id of each bar, aligned with the order of ``bars``. Row ids start
        at 0. An empty input yields an empty list.
    """
    if not bars:
        return []

    # First-fit only uses the minimum number of rows when intervals are
    # processed left to right; the result is still reported in input order.
    visit_order = sorted(range(len(bars)), key=lambda idx: bars[idx][0])

    row_ends: list = []  # padded end of the right-most bar on each row
    rows_assignments = [0] * len(bars)

    for bar_idx in visit_order:
        start, end = bars[bar_idx]
        padded_start = start - min_distance
        padded_end = end + min_distance

        for row_id, row_end in enumerate(row_ends):
            if row_end < padded_start:
                # This row is free to the left of the bar: reuse it.
                row_ends[row_id] = padded_end
                rows_assignments[bar_idx] = row_id
                break
        else:
            # Every existing row is occupied: open a new one.
            rows_assignments[bar_idx] = len(row_ends)
            row_ends.append(padded_end)

    return rows_assignments


def _plot_cdd_diagram(
    models_df: pd.DataFrame,
    bars: list[tuple[int, int]],
    bars_positions: list[int],
    bar_y_spacing: float = 0.12,
    ax: plt.Axes | None = None,
    xlabel_spacing: int = 5,
    draw_equivalence_lines_to_axis: bool = True,
) -> plt.Axes:
    """Draw a critical difference diagram on a matplotlib Axes.

    Parameters
    ----------
    models_df : pd.DataFrame
        Table with a ``model`` column (tick labels) and a ``pos`` column
        (position of each model on the ruler).
    bars : list[tuple[int, int]]
        ``(min_pos, max_pos)`` span of every equivalence bar.
    bars_positions : list[int]
        Row index of every bar, aligned with ``bars``.
    bar_y_spacing : float, optional
        Vertical distance between two consecutive bar rows.
    ax : plt.Axes | None, optional
        Axes to draw on; a new figure and axes are created when None.
    xlabel_spacing : int, optional
        Step between the intermediate rank labels written under the ruler.
    draw_equivalence_lines_to_axis : bool, optional
        When True, thin lines run from both bar ends up towards the ruler;
        otherwise only short end caps are drawn.

    Returns
    -------
    plt.Axes
        The axes the diagram was drawn on.
    """
    if ax is None:
        ax = plt.subplots()[1]

    n_models = len(models_df)
    ruler_y = 0

    def _mirror(position):
        # The horizontal axis is flipped so that position 1 ends up on the right.
        return n_models - position + 1

    # Horizontal ruler
    ax.hlines(ruler_y, 0.5, n_models + 0.5, color="black", linewidth=2)

    # One tick plus a rotated label per model
    for position, label in zip(models_df["pos"], models_df["model"]):
        tick_x = _mirror(position)
        ax.vlines(tick_x, ruler_y, ruler_y + 0.15, color="black", linewidth=1.2)
        ax.text(
            tick_x,
            ruler_y + 0.2,
            label,
            ha="left",
            va="bottom",
            fontsize=8,
            rotation=45,
        )

    first_row_y = ruler_y - 0.4

    if len(bars) > 0:
        max_bar_pos = max(bars_positions)
    else:
        warnings.warn(NO_EQUIVALENCE_CLIQUEST_WARNING_TEMPLATE, UserWarning)
        max_bar_pos = 0

    # Equivalence bars, one horizontal segment each
    for bar_idx, (min_pos, max_pos) in enumerate(bars):
        bar_y = first_row_y - bars_positions[bar_idx] * bar_y_spacing
        left_x = _mirror(max_pos)
        right_x = _mirror(min_pos)

        ax.hlines(bar_y, left_x, right_x, color="black", linewidth=2.5)

        if draw_equivalence_lines_to_axis:
            cap_top, cap_width = -0.25, 0.5
        else:
            cap_top, cap_width = bar_y + 0.05, 1.5
        for cap_x in (left_x, right_x):
            ax.vlines(cap_x, bar_y, cap_top, color="black", linewidth=cap_width)

    # Bold rank numbers at both ends of the ruler
    label_y = ruler_y - 0.1
    for end_x, end_label in ((1, str(n_models)), (n_models, "1")):
        ax.text(
            end_x,
            label_y,
            end_label,
            ha="center",
            va="top",
            fontsize=8,
            fontweight="bold",
        )

    # Regular-weight rank numbers in between
    rank = xlabel_spacing + 1
    while rank < n_models:
        ax.text(_mirror(rank), label_y, str(rank), ha="center", va="top", fontsize=8)
        rank += xlabel_spacing

    # Fit the view around the ruler and the lowest bar row
    min_bar_y = first_row_y - max_bar_pos * bar_y_spacing
    ax.set_xlim(0, n_models + 1)
    ax.set_ylim(min_bar_y - 0.3, 2.5)
    ax.axis("off")

    # Direction legend under the bars
    ax.text(
        0.5,
        min_bar_y - 0.1,
        "← worse better →",
        fontsize=8,
        style="italic",
    )

    return ax


def plot_cdd_diagram(
    models_df: pd.DataFrame,
    posterior_df: pd.DataFrame,
    interpretation_col: str,
    ax: plt.Axes | None = None,
    **kwargs,
) -> plt.Axes:
    """Plot a critical difference diagram.

    Derives the equivalence bars from ``posterior_df`` with
    ``get_bars_for_ccd``, stacks them with ``assign_bar_position`` and hands
    everything to ``_plot_cdd_diagram``. Extra keyword arguments are forwarded
    to ``_plot_cdd_diagram`` unchanged.
    """
    equivalence_bars = get_bars_for_ccd(
        posterior_df=posterior_df,
        models_df=models_df,
        interpretation_col=interpretation_col,
    )
    bar_rows = assign_bar_position(equivalence_bars)

    plot_arguments = {
        "models_df": models_df,
        "bars": equivalence_bars,
        "bars_positions": bar_rows,
        "ax": ax,
    }
    return _plot_cdd_diagram(**plot_arguments, **kwargs)
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The get_bars_for_ccd, assign_bar_position, and _plot_cdd_diagram functions lack test coverage. Consider adding tests to verify the graph-based clique detection logic, bar positioning algorithm, and the visual output of the CDD diagram.

Suggested change
)
)
# ---------------------------------------------------------------------------
# Internal tests / self-checks
# ---------------------------------------------------------------------------
def test_get_bars_for_ccd_basic() -> None:
    """Check get_bars_for_ccd on a posterior table with one known clique.

    A, B and C are pairwise equivalent while D is only compared with a
    non-equivalent sign, so exactly one bar from position 1 to 3 is expected.
    """
    model_names = ["A", "B", "C", "D"]
    models_df = pd.DataFrame({"model": model_names, "pos": [1, 2, 3, 4]})

    # (left, right, sign): the first three rows make A, B, C a clique.
    comparisons = [
        ("A", "B", "="),
        ("B", "C", "="),
        ("A", "C", "="),
        ("C", "D", "<"),
    ]
    posterior_df = pd.DataFrame(
        comparisons, columns=["left_model", "right_model", "interp"]
    )

    bars = get_bars_for_ccd(
        posterior_df=posterior_df,
        models_df=models_df,
        interpretation_col="interp",
    )

    assert len(bars) == 1
    assert bars[0] == (1, 3)
def test_assign_bar_position_non_overlapping() -> None:
    """Disjoint bars must all land on row 0 when no padding is applied."""
    disjoint_bars = [(0, 1), (2, 3), (4, 5)]

    rows = assign_bar_position(disjoint_bars, min_distance=0)

    # One row id per bar, and that id is always the first row.
    assert len(rows) == len(disjoint_bars)
    assert set(rows) == {0}
def test_assign_bar_position_overlapping() -> None:
    """Overlapping bars must never be assigned the same row."""
    # Chain of overlaps: bar 0 with bar 1, and bar 1 with bar 2.
    bars = [(0, 3), (2, 5), (4, 7)]

    positions = assign_bar_position(bars, min_distance=0)

    assert len(positions) == len(bars)
    # The chain cannot fit on a single row.
    assert max(positions) >= 1

    for i, (s1, e1) in enumerate(bars):
        for j, (s2, e2) in enumerate(bars[i + 1 :], start=i + 1):
            overlapping = e1 > s2 and e2 > s1
            if overlapping:
                assert positions[i] != positions[j]
def test_plot_cdd_diagram_smoke() -> None:
    """Smoke-test _plot_cdd_diagram with three models and a single bar.

    The call must complete without raising and hand back a matplotlib Axes.
    """
    columns = {
        "model": ["A", "B", "C"],
        "pos": [1, 2, 3],
        "mean": [0.1, 0.2, 0.3],
    }
    models_df = pd.DataFrame(columns)

    # One bar covering every model, placed on the first row.
    single_bar = [(1, 3)]
    single_bar_rows = [0]

    fig, ax = plt.subplots()
    try:
        result_ax = _plot_cdd_diagram(
            models_df=models_df,
            bars=single_bar,
            bars_positions=single_bar_rows,
            ax=ax,
        )
    finally:
        # Always release the figure, even if plotting failed.
        plt.close(fig)

    assert isinstance(result_ax, plt.Axes)

Copilot uses AI. Check for mistakes.
99 changes: 92 additions & 7 deletions bbttest/bbt/py_bbt.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from collections.abc import Iterable, Sequence
from typing import Literal

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ._types import (
ALL_PROPERTIES_COLUMNS,
HyperPriorType,
InterpretationTypes,
ReportedPropertyColumnType,
TieSolverType,
)
from ._utils import _validate_params
from .alg import _construct_win_table, _get_pwin, _hdi
from .model import _mcmcbbt_pymc
from .plots import plot_cdd_diagram


class PyBBT:
Expand Down Expand Up @@ -111,6 +113,16 @@ def _check_if_fitted(self):
if not self._fitted:
raise RuntimeError("The model must be fitted before accessing this method.")

@staticmethod
def _get_interpretation_columns(
    interpretation: InterpretationTypes,
) -> ReportedPropertyColumnType:
    """Return the raw interpretation column name for ``interpretation``.

    ``"weak"`` selects ``"weak_interpretation_raw"``; any other value selects
    ``"strong_interpretation_raw"``.
    """
    if interpretation == "weak":
        return "weak_interpretation_raw"
    return "strong_interpretation_raw"

@property
def fitted(self):
"""Whether the model has been fitted."""
Expand Down Expand Up @@ -163,6 +175,24 @@ def fit(

return self

@property
def beta_ranking(self) -> dict[str, float]:
r"""
Get the $\beta$ values for each model.

Beta values can be used for ranking the models globally from best to worst (higher beta indicates better performance).
However, they do not have a direct probabilistic interpretation like the pairwise probabilities obtained from the posterior table.

Returns
-------
dict[str, float]
Dictionary mapping model names to their posterior mean beta values.
"""
self._check_if_fitted()
beta = self._fit_posterior.posterior["beta"].to_numpy()
mean_beta = np.mean(beta.reshape(-1, beta.shape[-1]), axis=0)
return dict(zip(self._algorithms, mean_beta, strict=True))
Comment on lines +178 to +194
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new beta_ranking property lacks test coverage. Consider adding tests that verify: 1) the property returns a dictionary with correct model names as keys and float beta values, 2) the property raises RuntimeError when called on an unfitted model, and 3) the returned beta values are the posterior means of the beta parameter.

Copilot uses AI. Check for mistakes.

def posterior_table(
self,
rope_value: tuple[float, float] = (0.45, 0.55),
Expand Down Expand Up @@ -275,7 +305,7 @@ def rope_comparison_control_table(
rope_values: Sequence[tuple[float, float]],
control_model: str,
selected_models: Sequence[str] | None = None,
interpretation: Literal["weak", "strong"] = "weak",
interpretation: InterpretationTypes = "weak",
return_as_array: bool = False,
join_char: str = ", ",
) -> pd.DataFrame:
Expand Down Expand Up @@ -307,6 +337,7 @@ def rope_comparison_control_table(
"""
self._check_if_fitted()
records = []
interpretation_col = self._get_interpretation_columns(interpretation)
for rope in rope_values:
posterior_df = self.posterior_table(
rope_value=rope,
Expand All @@ -324,11 +355,6 @@ def rope_comparison_control_table(
worse_models: list[str] = []
unknown_models: list[str] = []
for _, row in posterior_df.iterrows():
interpretation_col = (
"weak_interpretation_raw"
if interpretation == "weak"
else "strong_interpretation_raw"
)
non_control_model = (
row["right_model"]
if row["left_model"] == control_model
Expand Down Expand Up @@ -374,3 +400,62 @@ def rope_comparison_control_table(
)
result_df = pd.DataFrame.from_records(records)
return result_df

@_validate_params
def plot_cdd_diagram(
self,
rope_value: tuple[float, float] = (0.45, 0.55),
interpretation: InterpretationTypes = "weak",
ax: plt.Axes | None = None,
**kwargs,
):
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing return type annotation. The method should specify a return type of plt.Axes to be consistent with other methods in the class that have return type annotations (e.g., posterior_table, rope_comparison_control_table). Add -> plt.Axes after the parameter list.

Suggested change
):
) -> plt.Axes:

Copilot uses AI. Check for mistakes.
"""
Plot the Critical Difference Diagram (CDD) based on the fitted BBT model.

Critical Difference Diagram visualizes the global ranking of the models along
with the equivalence bars connecting models that are considered equivalent based on the specified BBT interpretation.
The global ranking is determined based on the posterior mean beta values for each model.

Parameters
----------
rope_value : tuple[float, float], optional
Region of Practical Equivalence (ROPE) used to determine ties in the posterior table. Defaults to (0.45, 0.55).
interpretation : {"weak", "strong"}, optional
Type of interpretation to use for determining equivalence bars. Defaults to "weak".
ax : plt.Axes | None, optional
Matplotlib Axes to plot on. If None, a new figure and axes are created. Defaults to None.
**kwargs
Additional keyword arguments passed to the underlying plotting function. See `plot_cdd_diagram`.

Returns
-------
plt.Axes
Matplotlib Axes containing the CDD plot.
"""
self._check_if_fitted()
interpretation_col = self._get_interpretation_columns(interpretation)

model_ranking = self.beta_ranking
models_df = pd.DataFrame(
{
"model": list(model_ranking.keys()),
"beta": list(model_ranking.values()),
}
)
models_df["pos"] = models_df["beta"].rank(ascending=False, method="first")
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rank() method returns float values, but the pos column is used for integer operations in the plotting code (e.g., in get_bars_for_ccd and _plot_cdd_diagram). While pandas will handle this correctly in most cases, it would be clearer to explicitly convert to int: models_df["pos"] = models_df["beta"].rank(ascending=False, method="first").astype(int). This ensures type consistency and avoids potential floating-point comparison issues.

Suggested change
models_df["pos"] = models_df["beta"].rank(ascending=False, method="first")
models_df["pos"] = models_df["beta"].rank(ascending=False, method="first").astype(int)

Copilot uses AI. Check for mistakes.
models_df = models_df.sort_values("pos").reset_index(drop=True)
posterior_df = self.posterior_table(
rope_value=rope_value,
columns=(
"left_model",
"right_model",
interpretation_col,
),
)
return plot_cdd_diagram(
models_df=models_df,
posterior_df=posterior_df,
interpretation_col=interpretation_col,
ax=ax,
**kwargs,
)
Comment on lines +455 to +461
Copy link

Copilot AI Feb 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new plot_cdd_diagram method lacks test coverage. Consider adding tests that verify: 1) the method raises RuntimeError when called on an unfitted model, 2) the method returns a matplotlib Axes object, 3) the method correctly passes parameters to the underlying plot_cdd_diagram function, and 4) the method works with both weak and strong interpretations.

Suggested change
return plot_cdd_diagram(
models_df=models_df,
posterior_df=posterior_df,
interpretation_col=interpretation_col,
ax=ax,
**kwargs,
)
ax_out = plot_cdd_diagram(
models_df=models_df,
posterior_df=posterior_df,
interpretation_col=interpretation_col,
ax=ax,
**kwargs,
)
if not isinstance(ax_out, plt.Axes):
raise TypeError(
"plot_cdd_diagram is expected to return a matplotlib Axes object, "
f"but got {type(ax_out)!r} instead."
)
return ax_out

Copilot uses AI. Check for mistakes.
Loading