Conversation
There was a problem hiding this comment.
Pull request overview
This PR adds Critical Difference Diagram (CDD) visualization functionality to the BBT model, allowing users to visualize global model rankings with equivalence bars based on Bayesian Bradley-Terry comparisons.
Changes:
- Added matplotlib and networkx as dependencies for plotting functionality
- Introduced new
beta_rankingproperty to PyBBT for accessing posterior mean beta values - Added
plot_cdd_diagrammethod to PyBBT for creating CDD visualizations - Refactored interpretation column selection logic into a helper method
Reviewed changes
Copilot reviewed 7 out of 8 changed files in this pull request and generated 10 comments.
Show a summary per file
| File | Description |
|---|---|
| pyproject.toml | Added matplotlib and networkx dependencies |
| uv.lock | Updated lock file with new dependencies (networkx 3.6.1, matplotlib 3.10.7) |
| bbttest/bbt/_types.py | Added InterpretationTypes type alias for "weak"/"strong" interpretations |
| bbttest/bbt/const.py | Removed file - constant moved to alg.py |
| bbttest/bbt/alg.py | Moved UNNAMED_COLUMNS_WARNING_TEMPLATE from const.py |
| bbttest/bbt/py_bbt.py | Added beta_ranking property, plot_cdd_diagram method, and _get_interpretation_columns helper; refactored rope_comparison_control_table |
| bbttest/bbt/plots/init.py | New plots module initialization |
| bbttest/bbt/plots/_critical_difference.py | New plotting implementation with graph-based clique detection and CDD rendering |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| bars_positions=bars_positions, | ||
| ax=ax, | ||
| **kwargs, | ||
| ) |
There was a problem hiding this comment.
The get_bars_for_ccd, assign_bar_position, and _plot_cdd_diagram functions lack test coverage. Consider adding tests to verify the graph-based clique detection logic, bar positioning algorithm, and the visual output of the CDD diagram.
| ) | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Internal tests / self-checks | |
| # --------------------------------------------------------------------------- | |
| def test_get_bars_for_ccd_basic() -> None: | |
| """Basic sanity check for get_bars_for_ccd. | |
| Builds a small posterior table with a known equivalence structure and | |
| verifies that the resulting bars correspond to the min/max positions of | |
| each equivalence clique. | |
| """ | |
| models_df = pd.DataFrame( | |
| { | |
| "model": ["A", "B", "C", "D"], | |
| "pos": [1, 2, 3, 4], | |
| } | |
| ) | |
| posterior_df = pd.DataFrame( | |
| [ | |
| {"left_model": "A", "right_model": "B", "interp": "="}, | |
| {"left_model": "B", "right_model": "C", "interp": "="}, | |
| # A, B, C form a clique (all equivalent); D is isolated | |
| {"left_model": "A", "right_model": "C", "interp": "="}, | |
| {"left_model": "C", "right_model": "D", "interp": "<"}, | |
| ] | |
| ) | |
| bars = get_bars_for_ccd( | |
| posterior_df=posterior_df, | |
| models_df=models_df, | |
| interpretation_col="interp", | |
| ) | |
| # Only one equivalence group: models A (pos 1), B (pos 2), C (pos 3) | |
| # so we expect a single bar spanning from 1 to 3. | |
| assert len(bars) == 1 | |
| assert bars[0] == (1, 3) | |
| def test_assign_bar_position_non_overlapping() -> None: | |
| """Check that non-overlapping bars are placed on the same row.""" | |
| bars = [(0, 1), (2, 3), (4, 5)] | |
| positions = assign_bar_position(bars, min_distance=0) | |
| # All bars are disjoint; the greedy algorithm should be able to place | |
| # them all on the same row. | |
| assert len(positions) == len(bars) | |
| assert set(positions) == {0} | |
| def test_assign_bar_position_overlapping() -> None: | |
| """Check that overlapping bars are not placed on the same row.""" | |
| # Bar 0 overlaps with bar 1, bar 1 overlaps with bar 2 | |
| bars = [(0, 3), (2, 5), (4, 7)] | |
| positions = assign_bar_position(bars, min_distance=0) | |
| assert len(positions) == len(bars) | |
| # At least two rows are required for these overlapping intervals. | |
| assert max(positions) >= 1 | |
| # Overlapping bars should not share the same row id. | |
| for i in range(len(bars)): | |
| for j in range(i + 1, len(bars)): | |
| s1, e1 = bars[i] | |
| s2, e2 = bars[j] | |
| if not (e1 <= s2 or e2 <= s1): | |
| # Bars i and j overlap; they must be on different rows. | |
| assert positions[i] != positions[j] | |
| def test_plot_cdd_diagram_smoke() -> None: | |
| """Smoke test for _plot_cdd_diagram. | |
| Ensures that the function can be called with a minimal, valid input and | |
| returns a matplotlib Axes instance without raising an exception. | |
| """ | |
| models_df = pd.DataFrame( | |
| { | |
| "model": ["A", "B", "C"], | |
| "pos": [1, 2, 3], | |
| "mean": [0.1, 0.2, 0.3], | |
| } | |
| ) | |
| # A single bar spanning all three models on row 0 | |
| bars = [(1, 3)] | |
| bars_positions = [0] | |
| fig, ax = plt.subplots() | |
| try: | |
| result_ax = _plot_cdd_diagram( | |
| models_df=models_df, | |
| bars=bars, | |
| bars_positions=bars_positions, | |
| ax=ax, | |
| ) | |
| finally: | |
| plt.close(fig) | |
| assert isinstance(result_ax, plt.Axes) |
| @property | ||
| def beta_ranking(self) -> dict[str, float]: | ||
| r""" | ||
| Get the $\beta$ values for each model. | ||
|
|
||
| Beta values can be used for ranking the models globally from best to worst (higher beta indicates better performance). | ||
| However, they do not have a direct probabilistic interpretation like the pairwise probabilities obtained from the posterior table. | ||
|
|
||
| Returns | ||
| ------- | ||
| dict[str, float] | ||
| Dictionary mapping model names to their posterior mean beta values. | ||
| """ | ||
| self._check_if_fitted() | ||
| beta = self._fit_posterior.posterior["beta"].to_numpy() | ||
| mean_beta = np.mean(beta.reshape(-1, beta.shape[-1]), axis=0) | ||
| return dict(zip(self._algorithms, mean_beta, strict=True)) |
There was a problem hiding this comment.
The new beta_ranking property lacks test coverage. Consider adding tests that verify: 1) the property returns a dictionary with correct model names as keys and float beta values, 2) the property raises RuntimeError when called on an unfitted model, and 3) the returned beta values are the posterior means of the beta parameter.
| return plot_cdd_diagram( | ||
| models_df=models_df, | ||
| posterior_df=posterior_df, | ||
| interpretation_col=interpretation_col, | ||
| ax=ax, | ||
| **kwargs, | ||
| ) |
There was a problem hiding this comment.
The new plot_cdd_diagram method lacks test coverage. Consider adding tests that verify: 1) the method raises RuntimeError when called on an unfitted model, 2) the method returns a matplotlib Axes object, 3) the method correctly passes parameters to the underlying plot_cdd_diagram function, and 4) the method works with both weak and strong interpretations.
| return plot_cdd_diagram( | |
| models_df=models_df, | |
| posterior_df=posterior_df, | |
| interpretation_col=interpretation_col, | |
| ax=ax, | |
| **kwargs, | |
| ) | |
| ax_out = plot_cdd_diagram( | |
| models_df=models_df, | |
| posterior_df=posterior_df, | |
| interpretation_col=interpretation_col, | |
| ax=ax, | |
| **kwargs, | |
| ) | |
| if not isinstance(ax_out, plt.Axes): | |
| raise TypeError( | |
| "plot_cdd_diagram is expected to return a matplotlib Axes object, " | |
| f"but got {type(ax_out)!r} instead." | |
| ) | |
| return ax_out |
| interpretation: InterpretationTypes = "weak", | ||
| ax: plt.Axes | None = None, | ||
| **kwargs, | ||
| ): |
There was a problem hiding this comment.
Missing return type annotation. The method should specify a return type of plt.Axes to be consistent with other methods in the class that have return type annotations (e.g., posterior_table, rope_comparison_control_table). Add -> plt.Axes after the parameter list.
| ): | |
| ) -> plt.Axes: |
| "beta": list(model_ranking.values()), | ||
| } | ||
| ) | ||
| models_df["pos"] = models_df["beta"].rank(ascending=False, method="first") |
There was a problem hiding this comment.
The rank() method returns float values, but the pos column is used for integer operations in the plotting code (e.g., in get_bars_for_ccd and _plot_cdd_diagram). While pandas will handle this correctly in most cases, it would be clearer to explicitly convert to int: models_df["pos"] = models_df["beta"].rank(ascending=False, method="first").astype(int). This ensures type consistency and avoids potential floating-point comparison issues.
| models_df["pos"] = models_df["beta"].rank(ascending=False, method="first") | |
| models_df["pos"] = models_df["beta"].rank(ascending=False, method="first").astype(int) |
No description provided.