From 29a3c06276322c7fe4f762fc5d962144fdcacc88 Mon Sep 17 00:00:00 2001 From: Teytaud Date: Wed, 30 Mar 2022 09:15:59 +0200 Subject: [PATCH 1/2] Relevant weights if not all contexts run equally When different contexts have been run a different number of times, then it's better to first aggregate by settings. This is equivalent to the previous code: - if all settings have been replicated the same number of times - asymptotically, if missing runs are equally distributed among different settings and we replicated sufficiently many times --- nevergrad/benchmark/plotting.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nevergrad/benchmark/plotting.py b/nevergrad/benchmark/plotting.py index 5d2ab6415..6fc37b4f8 100644 --- a/nevergrad/benchmark/plotting.py +++ b/nevergrad/benchmark/plotting.py @@ -603,7 +603,10 @@ def make_data(df: pd.DataFrame, normalized_loss: bool = False) -> tp.Dict[str, t ["optimizer_name", "budget", "loss"] + (["pseudotime"] if "pseudotime" in df.columns else []), ] ) - groupeddf = df.groupby(["optimizer_name", "budget"]) + # We first aggregate equivalent rows. The only point of this is that we want all contexts to have the same + # weight, in e.g. xpresults_all.png, even if not all contexts have been run the same number of times. + compact_df = df.groupby(df.columns).mean() # We first aggregate equal contexts. 
+ groupeddf = compact_df.groupby(["optimizer_name", "budget"]) means = groupeddf.mean() stds = groupeddf.std() optim_vals: tp.Dict[str, tp.Dict[str, np.ndarray]] = {} From 53afc4d55a2017cefa1a4736f20895a4af6f752a Mon Sep 17 00:00:00 2001 From: Teytaud Date: Wed, 30 Mar 2022 09:33:26 +0200 Subject: [PATCH 2/2] Exclude measurement columns (loss, time, seed) from the context grouping keys --- nevergrad/benchmark/plotting.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/nevergrad/benchmark/plotting.py b/nevergrad/benchmark/plotting.py index 6fc37b4f8..2722ef8e1 100644 --- a/nevergrad/benchmark/plotting.py +++ b/nevergrad/benchmark/plotting.py @@ -605,7 +605,18 @@ def make_data(df: pd.DataFrame, normalized_loss: bool = False) -> tp.Dict[str, t ) # We first aggregate equivalent rows. The only point of this is that we want all contexts to have the same # weight, in e.g. xpresults_all.png, even if not all contexts have been run the same number of times. - compact_df = df.groupby(df.columns).mean() # We first aggregate equal contexts. + descriptors = sorted( + set(df.columns) + - { + "pseudotime", + "time", + "elapsed_time", + "elapsed_budget", + "loss", + "seed", + } + ) + compact_df = df.groupby(list(descriptors)).mean() # We first aggregate equal contexts. groupeddf = compact_df.groupby(["optimizer_name", "budget"]) means = groupeddf.mean() stds = groupeddf.std()