Commit 4dd22f1

Feat: Add Support for multiple test-set benchmarks (#296)
* updated splits to support multiple test sets
1 parent 52d3f8d commit 4dd22f1

6 files changed: 346 additions & 127 deletions


polaris/benchmark/_benchmark_v2.py

Lines changed: 22 additions & 19 deletions
@@ -30,6 +30,7 @@ class BenchmarkV2Specification(
 
     Attributes:
         dataset: The dataset the benchmark specification is based on.
+        splits: The predefined train-test splits to use for evaluation.
        n_classes: The number of classes for each of the target columns.
        readme: Markdown text that can be used to provide a formatted description of the benchmark.
        artifact_version: The version of the benchmark.
@@ -85,7 +86,7 @@ def _validate_split_in_dataset(self) -> Self:
         - All indices are valid given the dataset
         """
         dataset_length = len(self.dataset)
-        if self.split.max_index >= dataset_length:
+        if self.max_index >= dataset_length:
             raise InvalidBenchmarkError("The predefined split contains invalid indices")
 
         return self
@@ -102,17 +103,24 @@ def _validate_cols_in_dataset(self) -> Self:
 
         return self
 
-    def _get_test_sets(
+    def _get_splits(
         self, hide_targets=True, featurization_fn: Callable | None = None
-    ) -> dict[str, Subset]:
+    ) -> dict[str, tuple[Subset, Subset]]:
         """
-        Construct the test set(s), given the split in the benchmark specification. Used
-        internally to construct the test set for client use and evaluation.
+        Construct all train-test split pairs, given the splits in the benchmark specification.
+        Used internally to construct the splits for client use and evaluation.
         """
         # TODO: We need a subset class that can handle very large index sets without copying or materializing all of them
         return {
-            label: self._get_subset(index_set.indices, hide_targets, featurization_fn)
-            for label, index_set in self.split.test_items()
+            label: (
+                self._get_subset(
+                    split.training.indices, hide_targets=False, featurization_fn=featurization_fn
+                ),
+                self._get_subset(
+                    split.test.indices, hide_targets=hide_targets, featurization_fn=featurization_fn
+                ),
+            )
+            for label, split in self.split_items()
         }
 
     def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subset:
@@ -129,8 +137,8 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs
 
     def get_train_test_split(
         self, featurization_fn: Callable | None = None
-    ) -> tuple[Subset, dict[str, Subset]]:
-        """Construct the train and test sets, given the split in the benchmark specification.
+    ) -> dict[str, tuple[Subset, Subset]]:
+        """Construct the train and test sets for all splits, given the splits in the benchmark specification.
 
         Returns [`Subset`][polaris.dataset.Subset] objects, which offer several ways of accessing the data
         and can thus easily serve as a basis to build framework-specific (e.g. PyTorch, Tensorflow)
@@ -141,15 +149,10 @@ def get_train_test_split(
         expects an input in the format specified by the `input_format` parameter.
 
         Returns:
-            A tuple with the train `Subset` and test `Subset` objects.
-            If there are multiple test sets, these are returned in a dictionary and each test set has
-            an associated name. The targets of the test set can not be accessed.
+            A dictionary mapping split labels to (train, test) tuples of `Subset` objects.
+            The targets of the test sets cannot be accessed.
         """
-        train = self._get_subset(
-            self.split.training.indices, hide_targets=False, featurization_fn=featurization_fn
-        )
-        test = self._get_test_sets(hide_targets=True, featurization_fn=featurization_fn)
-        return train, test
+        return self._get_splits(hide_targets=True, featurization_fn=featurization_fn)
 
     def upload_to_hub(
         self,
@@ -208,8 +211,8 @@ def submit_predictions(
             benchmark_artifact_id=self.artifact_id,
             predictions=predictions,
             target_labels=list(self.target_cols),
-            test_set_labels=self.test_set_labels,
-            test_set_sizes=self.test_set_sizes,
+            test_set_labels=self.split_labels,
+            test_set_sizes=self.n_test_datapoints,
             contributors=contributors or [],
             model=model,
             description=description,

polaris/benchmark/_split_v2.py

Lines changed: 58 additions & 55 deletions
@@ -57,120 +57,123 @@ def deserialize(index_set: bytes) -> "IndexSet":
 
 
 class SplitV2(BaseModel):
+    """
+    A single train-test split pair containing training and test index sets.
+
+    This represents one train-test split with training and test sets.
+    Multiple SplitV2 instances can be used together for cross-validation scenarios.
+    """
+
     training: IndexSet
     test: IndexSet
 
     @field_validator("training", "test", mode="before")
     @classmethod
-    def _parse_index_sets(cls, v: bytes | IndexSet) -> bytes | IndexSet:
-        """
-        Accepted a binary serialized IndexSet
-        """
+    def _parse_index_set(cls, v: bytes | IndexSet) -> IndexSet:
+        """Accept a binary serialized IndexSet"""
         if isinstance(v, bytes):
             return IndexSet.deserialize(v)
         return v
 
     @field_validator("training")
     @classmethod
     def _validate_training_set(cls, v: IndexSet) -> IndexSet:
-        """
-        Training index set can be empty (zero-shot)
-        """
+        """Training index set can be empty (zero-shot)"""
         if v.datapoints == 0:
-            logger.info(
-                "This benchmark only specifies a test set. It will return an empty train set in `get_train_test_split()`"
+            logger.debug(
+                "This train-test split only specifies a test set. It will return an empty train set in `get_train_test_split()`"
             )
         return v
 
     @field_validator("test")
     @classmethod
     def _validate_test_set(cls, v: IndexSet) -> IndexSet:
-        """
-        Test index set cannot be empty
-        """
+        """Test index set cannot be empty"""
         if v.datapoints == 0:
-            raise InvalidBenchmarkError("The predefined split contains empty test partitions")
+            raise InvalidBenchmarkError("Test set cannot be empty")
         return v
 
     @model_validator(mode="after")
     def validate_set_overlap(self) -> Self:
-        """
-        The training and test index sets do not overlap
-        """
+        """The training and test index sets do not overlap"""
         if self.training.intersect(self.test):
             raise InvalidBenchmarkError("The predefined split specifies overlapping train and test sets")
         return self
 
     @property
     def n_train_datapoints(self) -> int:
-        """
-        The size of the train set.
-        """
+        """The size of the train set."""
         return self.training.datapoints
 
     @property
-    def n_test_sets(self) -> int:
-        """
-        The number of test sets
-        """
-        # TODO: Until we support multi-test benchmarks
-        return 1
-
-    @property
-    def n_test_datapoints(self) -> dict[str, int]:
-        """
-        The size of (each of) the test set(s).
-        """
-        # TODO: Until we support multi-test benchmarks
-        return {"test": self.test.datapoints}
+    def n_test_datapoints(self) -> int:
+        """The size of the test set."""
+        return self.test.datapoints
 
     @property
     def max_index(self) -> int:
-        # TODO: Until we support multi-test benchmarks (need)
-        return max(self.training.indices.max(), self.test.indices.max())
+        """Maximum index across train and test sets"""
+        max_indices = []
 
-    def test_items(self) -> Generator[tuple[str, IndexSet], None, None]:
-        # TODO: Until we support multi-test benchmarks
-        yield "test", self.test
+        # Only add max if the bitmap is not empty
+        if len(self.training.indices) > 0:
+            max_indices.append(self.training.indices.max())
+        max_indices.append(self.test.indices.max())
+
+        return max(max_indices)
 
 
 class SplitSpecificationV2Mixin(BaseModel):
     """
-    Mixin class to add a split field to a benchmark. This is the V2 implementation.
+    Mixin class to add splits field to a benchmark. This is the V2 implementation.
 
-    The internal representation for the split is a roaring bitmap,
+    The internal representation for the splits uses roaring bitmaps,
     which drastically improves scalability over the V1 implementation.
 
     Attributes:
-        split: The predefined train-test split to use for evaluation.
+        splits: The predefined train-test splits to use for evaluation.
     """
 
-    split: SplitV2
+    splits: dict[str, SplitV2]
+
+    @model_validator(mode="after")
+    def validate_splits_not_empty(self) -> Self:
+        """Ensure at least one split is provided"""
+        if not self.splits:
+            raise InvalidBenchmarkError("At least one split must be specified")
+        return self
 
     @computed_field
     @property
-    def n_train_datapoints(self) -> int:
-        """The size of the train set."""
-        return self.split.n_train_datapoints
+    def n_splits(self) -> int:
+        """The number of splits"""
+        return len(self.splits)
 
     @computed_field
     @property
-    def n_test_sets(self) -> int:
-        """The number of test sets"""
-        return self.split.n_test_sets
+    def split_labels(self) -> list[str]:
+        """Labels of all splits"""
+        return list(self.splits.keys())
 
     @computed_field
     @property
-    def n_test_datapoints(self) -> dict[str, int]:
-        """The size of (each of) the test set(s)."""
-        return self.split.n_test_datapoints
+    def n_train_datapoints(self) -> dict[str, int]:
+        """The size of the train set for each split."""
+        return {label: split.n_train_datapoints for label, split in self.splits.items()}
 
     @computed_field
     @property
-    def test_set_sizes(self) -> dict[str, int]:
-        return {label: index_set.datapoints for label, index_set in self.split.test_items()}
+    def n_test_datapoints(self) -> dict[str, int]:
+        """The size of the test set for each split."""
+        return {label: split.n_test_datapoints for label, split in self.splits.items()}
 
     @computed_field
     @property
-    def test_set_labels(self) -> list[str]:
-        return list(label for label, _ in self.split.test_items())
+    def max_index(self) -> int:
+        """Maximum index across all splits"""
+        return max(split.max_index for split in self.splits.values())
+
+    def split_items(self) -> Generator[tuple[str, SplitV2], None, None]:
+        """Yield all splits with their labels"""
+        for label, split in self.splits.items():
+            yield label, split
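
The mixin now holds `splits: dict[str, SplitV2]`, and its computed fields are keyed by split label. A short sketch of what constructing and inspecting a two-split specification could look like (the labels and index values are illustrative only):

    from polaris.benchmark._split_v2 import IndexSet, SplitV2

    # Illustrative indices; a real benchmark would derive these from its dataset.
    splits = {
        "random": SplitV2(training=IndexSet(indices=[0, 1, 2, 3]), test=IndexSet(indices=[4, 5])),
        "scaffold": SplitV2(training=IndexSet(indices=[0, 2, 4]), test=IndexSet(indices=[1, 3, 5])),
    }

    # On a benchmark built from these splits, the computed fields would then read:
    #   split_labels       -> ["random", "scaffold"]
    #   n_train_datapoints -> {"random": 4, "scaffold": 3}
    #   n_test_datapoints  -> {"random": 2, "scaffold": 3}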

polaris/hub/client.py

Lines changed: 27 additions & 6 deletions
@@ -484,12 +484,33 @@ def _get_v2_benchmark(self, owner: str | HubOwner, slug: str) -> BenchmarkV2Spec
 
         response_data["dataset"] = self.get_dataset(*response_data["dataset"]["artifactId"].split("/"))
 
-        split = {}
-        for label, url in response_data.get("split", {}).items():
-            with fsspec.open(url, mode="rb") as f:
-                split[label] = f.read()
-
-        return BenchmarkV2Specification(**{**response_data, "split": split})
+        # Handle split data - each split contains training and test data
+        split_data = response_data["split"]
+        splits = {}
+
+        # Import SplitV2 and IndexSet for creating proper split objects
+        from polaris.benchmark._split_v2 import SplitV2, IndexSet
+
+        for split_label, split_urls in split_data.items():
+            # Each split should have 'training' and 'test' objects with filePath, datapoints, md5Checksum
+            split_indices = {}
+            for data_type, url_info in split_urls.items():
+                # Extract the actual URL from the filePath field
+                url = url_info["filePath"]
+                with fsspec.open(url, mode="rb") as f:
+                    # Deserialize the roaring bitmap data into an IndexSet
+                    roaring_data = f.read()
+                    index_set = IndexSet.deserialize(roaring_data)
+                    split_indices[data_type] = index_set
+
+            # Create a SplitV2 object from the training and test IndexSets
+            splits[split_label] = SplitV2(training=split_indices["training"], test=split_indices["test"])
+
+        # Remove the original 'split' field and add 'splits' field
+        response_data.pop("split", None)
+        response_data["splits"] = splits
+
+        return BenchmarkV2Specification(**response_data)
 
     def upload_results(
         self,
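
The rewritten `_get_v2_benchmark` assumes each entry under the response's `split` key carries separate `training` and `test` descriptors, each with a `filePath` pointing at a serialized roaring bitmap. A hedged sketch of that payload shape (the URLs, sizes, and checksum values are placeholders):

    # Hypothetical shape of response_data["split"] that the new loop walks.
    response_split = {
        "split_1": {
            "training": {"filePath": "https://hub.example/splits/split_1/training.bin", "datapoints": 5, "md5Checksum": "..."},
            "test": {"filePath": "https://hub.example/splits/split_1/test.bin", "datapoints": 3, "md5Checksum": "..."},
        },
    }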

polaris/hub/oauth.py

Lines changed: 1 addition & 2 deletions
@@ -95,8 +95,7 @@ class DatasetV2Paths(ArtifactPaths):
 
 class BenchmarkV2Paths(ArtifactPaths):
     training: AnyUrlString = Field(json_schema_extra={"file": True})
-    test: AnyUrlString = Field(json_schema_extra={"file": True})
-    test_2: int = 0
+    test_sets: dict[str, AnyUrlString] = Field(json_schema_extra={"file": True})
 
 
 class PredictionPaths(ArtifactPaths):
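
`BenchmarkV2Paths` now maps each named test set to its own storage URL alongside the single training URL. A sketch of the corresponding payload (any base `ArtifactPaths` fields are omitted and the URLs are placeholders):

    # Hypothetical paths payload after this change.
    benchmark_v2_paths = {
        "training": "https://hub.example/benchmarks/my-benchmark/training.bin",
        "test_sets": {
            "split_1": "https://hub.example/benchmarks/my-benchmark/split_1/test.bin",
            "split_2": "https://hub.example/benchmarks/my-benchmark/split_2/test.bin",
        },
    }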

tests/conftest.py

Lines changed: 25 additions & 3 deletions
@@ -403,15 +403,36 @@ def test_benchmark_v2(test_dataset_v2, test_org_owner):
         name="v2-benchmark-float-dtype",
         owner=test_org_owner,
         dataset=test_dataset_v2,
-        split=split,
+        splits={"default": split},
         target_cols=["A"],
         input_cols=["B"],
     )
     return benchmark
 
 
+@pytest.fixture(scope="function")
+def test_benchmark_v2_multiple_test_sets(test_dataset_v2, test_org_owner):
+    benchmark = BenchmarkV2Specification(
+        name="v2-benchmark-multiple-test-sets",
+        owner=test_org_owner,
+        dataset=test_dataset_v2,
+        splits={
+            "split_1": SplitV2(training=IndexSet(indices=[0, 1, 2, 3, 4]), test=IndexSet(indices=[5, 6, 7])),
+            "split_2": SplitV2(training=IndexSet(indices=[0, 1, 2, 3, 5, 6]), test=IndexSet(indices=[4, 7])),
+            "split_3": SplitV2(
+                training=IndexSet(indices=[0, 1, 2, 4, 7]), test=IndexSet(indices=[3, 5, 6, 8])
+            ),
+        },
+        target_cols=["A"],
+        input_cols=["B"],
+    )
+
+    return benchmark
+
+
 @pytest.fixture(scope="function")
 def v2_benchmark_with_rdkit_object_dtype(tmp_path, test_org_owner):
+    """Fixture for a benchmark with RDKit object dtype"""
     from polaris.utils.zarr.codecs import RDKitMolCodec
 
     zarr_path = tmp_path / "test_rdkit_object_dtype.zarr"
@@ -442,7 +463,7 @@ def v2_benchmark_with_rdkit_object_dtype(tmp_path, test_org_owner):
         name="v2-benchmark-rdkit-object-dtype",
         owner=test_org_owner,
         dataset=dataset,
-        split=split,
+        splits={"test": split},
         target_cols=["expt"],
         input_cols=["smiles"],
     )
@@ -451,6 +472,7 @@ def v2_benchmark_with_rdkit_object_dtype(tmp_path, test_org_owner):
 
 @pytest.fixture(scope="function")
 def v2_benchmark_with_atomarray_object_dtype(tmp_path, test_org_owner):
+    """Fixture for a benchmark with AtomArray object dtype"""
     from polaris.utils.zarr.codecs import AtomArrayCodec
 
     zarr_path = tmp_path / "test_atomarray_object_dtype.zarr"
@@ -481,7 +503,7 @@ def v2_benchmark_with_atomarray_object_dtype(tmp_path, test_org_owner):
         name="v2-benchmark-atomarray-object-dtype",
         owner=test_org_owner,
         dataset=dataset,
-        split=split,
+        splits={"test": split},
         target_cols=["expt"],
         input_cols=["smiles"],
     )
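
A hedged sketch of a test that the new `test_benchmark_v2_multiple_test_sets` fixture could support; the test itself is not part of this commit, and its assertions mirror the computed fields added in `_split_v2.py`:

    def test_multiple_test_set_properties(test_benchmark_v2_multiple_test_sets):
        # Hypothetical test: checks the per-split computed fields on the fixture above.
        benchmark = test_benchmark_v2_multiple_test_sets
        assert benchmark.n_splits == 3
        assert benchmark.split_labels == ["split_1", "split_2", "split_3"]
        assert benchmark.n_test_datapoints == {"split_1": 3, "split_2": 2, "split_3": 4}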
