Updating split validator to allow duplicates across test sets (#158)

Andrewq11 · mercuryseries · web-flow · commit 4bace27e01b1 · 2024-07-26T13:03:06.000-04:00
* updating split validator to allow duplicates across test sets

* Using only unique indices in test set to check for out-of-bounds indices

* removing print statement

* fixing spacing

* Update polaris/benchmark/_base.py

Co-authored-by: Honoré Hounwanou <mercuryseries@gmail.com>

* adding cleaner way to combine all test set indices

---------

Co-authored-by: Honoré Hounwanou <mercuryseries@gmail.com>
diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py
@@ -182,30 +182,40 @@ def _validate_split(cls, v, info: ValidationInfo):
             raise InvalidBenchmarkError("The predefined split contains empty test partitions")
 
         train_idx_list = v[0]
-        test_idx_list = list(i for part in v[1].values() for i in part) if isinstance(v[1], dict) else v[1]
+        full_test_idx_list = list(chain.from_iterable(v[1].values())) if isinstance(v[1], dict) else v[1]
 
         if len(train_idx_list) == 0:
             logger.info(
                 "This benchmark only specifies a test set. It will return an empty train set in `get_train_test_split()`"
             )
 
         train_idx_set = set(train_idx_list)
-        test_idx_set = set(test_idx_list)
+        full_test_idx_set = set(full_test_idx_list)
 
         # The train and test indices do not overlap
-        if len(train_idx_set & test_idx_set) > 0:
+        if len(train_idx_set & full_test_idx_set) > 0:
             raise InvalidBenchmarkError("The predefined split specifies overlapping train and test sets")
 
-        # Duplicate indices
+        # Check for duplicate indices within the train set
         if len(train_idx_set) != len(train_idx_list):
             raise InvalidBenchmarkError("The training set contains duplicate indices")
-        if len(test_idx_set) != len(test_idx_list):
+
+        # Check for duplicate indices within a given test set. Because a user can specify
+        # multiple test sets for a given benchmark and it is acceptable for indices to be shared
+        # across test sets, we check for duplicates in each test set independently.
+        if isinstance(v[1], dict):
+            for test_set_name, test_set_idx_list in v[1].items():
+                if len(test_set_idx_list) != len(set(test_set_idx_list)):
+                    raise InvalidBenchmarkError(
+                        f'Test set with name "{test_set_name}" contains duplicate indices'
+                    )
+        elif len(full_test_idx_set) != len(full_test_idx_list):
             raise InvalidBenchmarkError("The test set contains duplicate indices")
 
         # All indices are valid given the dataset
         if info.data["dataset"] is not None:
             max_i = len(info.data["dataset"])
-            if any(i < 0 or i >= max_i for i in chain(train_idx_list, test_idx_list)):
+            if any(i < 0 or i >= max_i for i in chain(train_idx_list, full_test_idx_set)):
                 raise InvalidBenchmarkError("The predefined split contains invalid indices")
 
         return v
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -53,6 +53,13 @@ def test_split_verification(is_single_task, test_single_task_benchmark, test_mul
         cls(split=(train_split + train_split[:1], test_split), **default_kwargs)
     with pytest.raises(ValidationError):
         cls(split=(train_split, test_split + test_split[:1]), **default_kwargs)
+    with pytest.raises(ValidationError):
+        cls(
+            split=(train_split, {"test1": test_split, "test2": test_split + test_split[:1]}), **default_kwargs
+        )
+
+    # It should _not_ fail with duplicate indices across test partitions
+    cls(split=(train_split, {"test1": test_split, "test2": test_split}), **default_kwargs)
     # It should _not_ fail with missing indices
     cls(split=(train_split[:-1], test_split), **default_kwargs)
     # It should _not_ fail with an empty train set