From 908edbaa1277bf3c277093b9d5528ed12a7dcada Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 15:57:25 -0400 Subject: [PATCH 01/11] add check property Signed-off-by: Adam Merberg --- pandera/api/checks.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pandera/api/checks.py b/pandera/api/checks.py index d651afc80..d82f779a7 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -34,6 +34,7 @@ def __init__( description: Optional[str] = None, statistics: Optional[dict[str, Any]] = None, strategy: Optional[Any] = None, + supports_unique_optimization: bool = False, **check_kwargs, ) -> None: """Apply a validation function to a data object. @@ -98,6 +99,13 @@ def __init__( :param strategy: A hypothesis strategy, used for implementing data synthesis strategies for this check. See the :ref:`User Guide ` for more details. + :param supports_unique_optimization: If True, indicates that this check + can be safely executed on unique values only, rather than the full + dataset. This enables significant performance optimizations for + MultiIndex validation when dealing with large datasets containing + duplicate values. The check function must be idempotent and produce + the same result whether applied to unique values or full values. + *New in version 0.21.0* :param check_kwargs: key-word arguments to pass into ``check_fn`` :example: @@ -177,6 +185,7 @@ def __init__( self.n_failure_cases = n_failure_cases self.title = title self.description = description + self.supports_unique_optimization = supports_unique_optimization if groupby is None and groups is not None: raise ValueError( @@ -268,6 +277,9 @@ def greater_than(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "greater_than", kwargs, @@ -285,6 +297,9 @@ def greater_than_or_equal_to(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "greater_than_or_equal_to", kwargs, @@ -302,6 +317,9 @@ def less_than(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "less_than", kwargs, @@ -319,6 +337,9 @@ def less_than_or_equal_to(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "less_than_or_equal_to", kwargs, @@ -361,6 +382,9 @@ def in_range( f"The combination of min_value = {min_value} and " f"max_value = {max_value} defines an empty interval!" 
) + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "in_range", kwargs, @@ -391,6 +415,9 @@ def isin(cls, allowed_values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument allowed_values must be iterable. Got {allowed_values}" ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "isin", kwargs, @@ -420,6 +447,9 @@ def notin(cls, forbidden_values: Iterable, **kwargs) -> "Check": "Argument forbidden_values must be iterable. " f"Got {forbidden_values}" ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "notin", kwargs, @@ -441,6 +471,9 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check": raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_matches", kwargs, From 8ac3362eece52dbc0dacce5e823c037f32a27f42 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 16:36:46 -0400 Subject: [PATCH 02/11] initial implementation of unique optimization Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 153 +++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 13 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 1b83a1b72..7402e611f 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -529,24 +529,38 @@ def validate( # Iterate over the expected index levels and validate each level with its # corresponding ``Index`` schema component. for level_pos, index_schema in level_mapping: - stub_df = pd.DataFrame( - index=check_obj.index.get_level_values(level_pos) - ) # We've already taken care of coercion, so we can disable it now. index_schema = deepcopy(index_schema) index_schema.coerce = False + # Check if we can optimize validation for this level + can_optimize = self._can_optimize_level(index_schema) + try: - # Validate using the schema for this level - index_schema.validate( - stub_df, - head=head, - tail=tail, - sample=sample, - random_state=random_state, - lazy=lazy, - inplace=True, - ) + if can_optimize: + # Use optimized validation with unique values only + self._validate_level_optimized( + check_obj.index, + level_pos, + index_schema, + head=head, + tail=tail, + sample=sample, + random_state=random_state, + lazy=lazy, + ) + else: + # Fall back to traditional validation with full materialization + self._validate_level_with_full_materialization( + check_obj.index, + level_pos, + index_schema, + head=head, + tail=tail, + sample=sample, + random_state=random_state, + lazy=lazy, + ) except (SchemaError, SchemaErrors) as exc: self._collect_or_raise(error_handler, exc, schema) @@ -564,6 +578,119 @@ def validate( return check_obj + def _can_optimize_level(self, index_schema) -> bool: + """Check if we can optimize validation for this level. 
+
+        :param index_schema: The schema for this level
+        :returns: True if optimization can be applied to this level
+        """
+        # Check if all checks support unique optimization
+        # Note that if there are no checks all([]) returns True
+        return all(
+            self._check_supports_unique_optimization(check)
+            for check in index_schema.checks
+        )
+
+    def _check_supports_unique_optimization(self, check) -> bool:
+        """Determine if a check can operate on unique values only.
+
+        :param check: The check to analyze
+        :returns: True if the check supports unique value optimization
+        """
+        # Check if the check has explicit support for optimization
+        # All built-in checks that support optimization have this property set in Phase 1
+        if hasattr(check, "supports_unique_optimization"):
+            return check.supports_unique_optimization
+
+        # Conservative default for checks without the property (shouldn't happen for modern checks)
+        return False
+
+    def _validate_level_optimized(
+        self,
+        multiindex: pd.MultiIndex,
+        level_pos: int,
+        index_schema,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = False,
+    ) -> None:
+        """Validate a level using unique values optimization.
+
+        :param multiindex: The MultiIndex being validated
+        :param level_pos: Position of this level in the MultiIndex
+        :param index_schema: The schema for this level
+        :param head: validate the first n rows
+        :param tail: validate the last n rows
+        :param sample: validate a random sample of n rows
+        :param random_state: random seed for sampling
+        :param lazy: if True, collect errors instead of raising immediately
+        """
+        try:
+            # Use unique values directly from MultiIndex levels
+            unique_values = multiindex.unique(level=level_pos)
+            unique_stub_df = pd.DataFrame(index=unique_values)
+
+            # Run validation on unique values only
+            index_schema.validate(
+                unique_stub_df,
+                head=head,
+                tail=tail,
+                sample=sample,
+                random_state=random_state,
+                lazy=lazy,
+                inplace=True,
+            )
+            # If we get here, all unique values passed validation
+
+        except (SchemaError, SchemaErrors):
+            # Validation failed on unique values, need to materialize full values
+            # for proper error reporting with correct indices
+            self._validate_level_with_full_materialization(
+                multiindex,
+                level_pos,
+                index_schema,
+                head=head,
+                tail=tail,
+                sample=sample,
+                random_state=random_state,
+                lazy=lazy,
+            )
+
+    def _validate_level_with_full_materialization(
+        self,
+        multiindex: pd.MultiIndex,
+        level_pos: int,
+        index_schema,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = False,
+    ) -> None:
+        """Validate a level using full materialization.
+
+        This materializes all values (including duplicates) for validation.
+        Used both as a fallback when optimization isn't possible and as a
+        second pass when optimized validation detects a failure, so that
+        errors are reported against the correct indices.
+ """ + # Materialize the full level values + full_values = multiindex.get_level_values(level_pos) + full_stub_df = pd.DataFrame(index=full_values) + + # Run validation on full materialized values + index_schema.validate( + full_stub_df, + head=head, + tail=tail, + sample=sample, + random_state=random_state, + lazy=lazy, + inplace=True, + ) + def _check_strict( self, check_obj: pd.MultiIndex, From 43b51d26122212ecfcc64ecec203f6dc7e5418d3 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 20:57:39 -0400 Subject: [PATCH 03/11] add tests for optimization Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 8 +- tests/pandas/test_schema_components.py | 277 +++++++++++++++++++++++++ 2 files changed, 281 insertions(+), 4 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 7402e611f..6fb310c06 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -632,18 +632,18 @@ def _validate_level_optimized( unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) - # Run validation on unique values only + # Run validation on unique values only, using lazy=False to cut to + # full validation as soon as we hit a failure + index_schema.validate( unique_stub_df, head=head, tail=tail, sample=sample, random_state=random_state, - lazy=lazy, + lazy=False, inplace=True, ) - # If we get here, all unique values passed validation - except (SchemaError, SchemaErrors): # Validation failed on unique values, need to materialize full values # for proper error reporting with correct indices diff --git a/tests/pandas/test_schema_components.py b/tests/pandas/test_schema_components.py index b4d65b773..a5de22cc5 100644 --- a/tests/pandas/test_schema_components.py +++ b/tests/pandas/test_schema_components.py @@ -2,6 +2,7 @@ import copy from typing import Any, Optional +from unittest.mock import patch, MagicMock import numpy as np import pandas as pd @@ -890,6 +891,282 @@ def test_multiindex_incorrect_input(indexes) -> None: MultiIndex(indexes) +def test_multiindex_optimization_with_sub_index() -> None: + """Test that MultiIndex optimization works correctly with sub-MultiIndices. + + This test ensures that when validating MultiIndex levels, the optimization + correctly determines the unique values of the level. Typically, this means + using multiindex.unique(level=level_pos) and NOT multiindex.levels[level_pos], + because .levels can contain values that don't actually appear in sub-MultiIndices + (e.g. after slicing or filtering operations). 
+ """ + # Create a large MultiIndex with specific values + original_mi = pd.MultiIndex.from_arrays( + [ + ["valid", "INVALID", "valid", "valid"] + * 500, # Some invalid values + list(range(2000)), + ], + names=["status", "id"], + ) + + # Create original DataFrame + df = pd.DataFrame({"value": range(2000)}, index=original_mi) + + # Filter to create a sub-MultiIndex that excludes all 'INVALID' entries + # This is a common real-world scenario: filtering data but keeping MultiIndex structure + filtered_df = df[df.index.get_level_values("status") == "valid"] + + # Create schema that would reject 'INVALID' values + schema = DataFrameSchema( + columns={"value": Column(int)}, + index=MultiIndex( + [ + Index( + String, + checks=[Check.str_matches(r"^valid$")], + name="status", + ), + Index( + Int, checks=[Check.greater_than_or_equal_to(0)], name="id" + ), + ] + ), + ) + + # This should pass because filtered_df only contains 'valid' values + # If the optimization incorrectly used .levels[level_pos], this would fail + # because filtered_df.index.levels[0] still contains 'INVALID' from the original + validated_df = schema.validate(filtered_df) + assert isinstance(validated_df, pd.DataFrame) + assert len(validated_df) > 0 + + # Verify that the optimization is actually beneficial by ensuring we have duplicates + level0_unique = filtered_df.index.unique(level=0) + level0_total = len(filtered_df.index.get_level_values(0)) + assert ( + len(level0_unique) < level0_total + ), "Should have duplicates for optimization to be beneficial" + + # Additional verification: manually check that .levels would contain phantom values + level0_levels = filtered_df.index.levels[0] + level0_actual = filtered_df.index.unique(level=0) + phantom_values = set(level0_levels) - set(level0_actual) + assert ( + len(phantom_values) > 0 + ), "Test setup should create phantom values in .levels" + assert ( + "INVALID" in phantom_values + ), "Should have 'INVALID' as phantom value" + + +@pytest.mark.parametrize( + "schema,expected_optimized_calls,expected_full_calls,expected_optimized_levels,expected_full_levels", + [ + # All optimizable checks -> optimized path for both levels + ( + DataFrameSchema( + columns={"value": Column(int)}, + index=MultiIndex( + [ + Index( + String, + checks=[ + Check.str_matches( + r"^(cat|dog)$" + ), # Optimizable + Check.isin(["cat", "dog"]), # Optimizable + ], + name="animal", + ), + Index( + Int, + checks=[ + Check.greater_than_or_equal_to( + 0 + ), # Optimizable + Check.less_than(1000), # Optimizable + ], + name="id", + ), + ] + ), + ), + 2, + 0, + [0, 1], + [], + ), + # Mixed checks -> full materialization for level with non-optimizable, optimized for others + ( + DataFrameSchema( + columns={"value": Column(int)}, + index=MultiIndex( + [ + Index( + String, + checks=[ + Check.str_matches( + r"^(cat|dog)$" + ), # Optimizable + Check( + lambda s: len(s) > 50, + supports_unique_optimization=False, + ), # NOT optimizable + ], + name="animal", + ), + Index( + Int, + checks=[ + Check.greater_than_or_equal_to( + 0 + ), # Optimizable + ], + name="id", + ), + ] + ), + ), + 1, + 1, + [1], + [0], + ), + ], +) +def test_multiindex_optimization_path_selection( + schema: DataFrameSchema, + expected_optimized_calls: int, + expected_full_calls: int, + expected_optimized_levels: list[int], + expected_full_levels: list[int], +) -> None: + """Test that MultiIndex validation chooses the correct optimization path.""" + # Create test MultiIndex with duplicates for optimization benefit + mi = pd.MultiIndex.from_arrays( + [ + 
["cat", "dog", "cat", "dog"] * 100, # Lots of duplicates + list(range(400)), + ], + names=["animal", "id"], + ) + df = pd.DataFrame({"value": range(400)}, index=mi) + + # Mock the backend methods to track which path is taken + with ( + patch( + "pandera.backends.pandas.components.MultiIndexBackend._validate_level_optimized" + ) as mock_optimized, + patch( + "pandera.backends.pandas.components.MultiIndexBackend._validate_level_with_full_materialization" + ) as mock_full, + ): + + schema.validate(df) + + # Verify correct number of calls + assert ( + mock_optimized.call_count == expected_optimized_calls + ), f"Expected {expected_optimized_calls} calls to optimized path, got {mock_optimized.call_count}" + assert ( + mock_full.call_count == expected_full_calls + ), f"Expected {expected_full_calls} calls to full materialization, got {mock_full.call_count}" + + # Verify correct levels were called with correct methods + if expected_optimized_calls > 0: + optimized_calls = [ + call[0][1] for call in mock_optimized.call_args_list + ] # Extract level_pos argument + assert sorted(optimized_calls) == sorted( + expected_optimized_levels + ), f"Expected optimized calls for levels {expected_optimized_levels}, got {optimized_calls}" + + if expected_full_calls > 0: + full_calls = [call[0][1] for call in mock_full.call_args_list] + assert sorted(full_calls) == sorted( + expected_full_levels + ), f"Expected full calls for levels {expected_full_levels}, got {full_calls}" + + +@pytest.mark.parametrize( + "checks,expected_can_optimize", + [ + # Schema with all optimizable checks + ([Check.str_matches(r"^test$"), Check.isin(["test"])], True), + # Schema with mixed checks (includes non-optimizable) + ( + [ + Check.str_matches(r"^test$"), + Check( + lambda s: len(s) > 100, supports_unique_optimization=False + ), + ], + False, + ), + # Schema with no checks + ([], True), + # Schema with only non-optimizable checks + ( + [ + Check( + lambda s: s.nunique() > 10, + supports_unique_optimization=False, + ) + ], + False, + ), + ], +) +def test_multiindex_can_optimize_level( + checks: list, expected_can_optimize: bool +) -> None: + """Test the _can_optimize_level decision logic.""" + from pandera.backends.pandas.components import MultiIndexBackend + + backend = MultiIndexBackend() + schema = Index(String, checks=checks) + + result = backend._can_optimize_level(schema) + assert result is expected_can_optimize + + +@pytest.mark.parametrize( + "check,expected_supports_optimization", + [ + # Built-in optimizable check + (Check.greater_than(5), True), + # Explicitly non-optimizable check + ( + Check( + lambda s: s.nunique() > 10, supports_unique_optimization=False + ), + False, + ), + # Custom check marked as optimizable + ( + Check( + lambda s: s.str.len() > 2, supports_unique_optimization=True + ), + True, + ), + # Built-in optimizable check - isin + (Check.isin(["test"]), True), + # Built-in optimizable check - str_matches + (Check.str_matches(r"^test$"), True), + ], +) +def test_check_supports_unique_optimization( + check, expected_supports_optimization: bool +) -> None: + """Test individual check support detection for unique optimization.""" + from pandera.backends.pandas.components import MultiIndexBackend + + backend = MultiIndexBackend() + result = backend._check_supports_unique_optimization(check) + assert result is expected_supports_optimization + + def test_index_validation_pandas_string_dtype(): """Test that pandas string type is correctly validated.""" From 6b0c9240ca6f075d17ff49106a3524e69b1c67b8 Mon Sep 17 
00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 21:45:02 -0400 Subject: [PATCH 04/11] set more checks to allow optimization Signed-off-by: Adam Merberg --- pandera/api/checks.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pandera/api/checks.py b/pandera/api/checks.py index d82f779a7..61e22a2eb 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -245,6 +245,9 @@ def equal_to(cls, value: Any, **kwargs) -> "Check": :param value: values in this data object must be equal to this value. """ + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "equal_to", kwargs, @@ -258,6 +261,9 @@ def not_equal_to(cls, value: Any, **kwargs) -> "Check": :param value: This value must not occur in the data object. """ + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "not_equal_to", kwargs, @@ -497,6 +503,9 @@ def str_contains( raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_contains", kwargs, @@ -512,7 +521,9 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ - + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_startswith", kwargs, @@ -527,6 +538,9 @@ def str_endswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_endswith", kwargs, @@ -551,6 +565,9 @@ def str_length( "At least a minimum or a maximum need to be specified. Got " "None." ) + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_length", kwargs, @@ -575,6 +592,9 @@ def unique_values_eq(cls, values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument values must be iterable. 
Got {values}" ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "unique_values_eq", kwargs, From 1bfcb84fe2a5cb4c5a62360cdccf20361d880af1 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 22:37:11 -0400 Subject: [PATCH 05/11] improve name Signed-off-by: Adam Merberg --- pandera/api/checks.py | 81 +++++++++++++------------- pandera/backends/pandas/components.py | 18 +++--- tests/pandas/test_schema_components.py | 20 +++---- 3 files changed, 56 insertions(+), 63 deletions(-) diff --git a/pandera/api/checks.py b/pandera/api/checks.py index 61e22a2eb..111b0a537 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -34,7 +34,7 @@ def __init__( description: Optional[str] = None, statistics: Optional[dict[str, Any]] = None, strategy: Optional[Any] = None, - supports_unique_optimization: bool = False, + determined_by_unique: bool = False, **check_kwargs, ) -> None: """Apply a validation function to a data object. @@ -99,13 +99,12 @@ def __init__( :param strategy: A hypothesis strategy, used for implementing data synthesis strategies for this check. See the :ref:`User Guide ` for more details. - :param supports_unique_optimization: If True, indicates that this check - can be safely executed on unique values only, rather than the full - dataset. This enables significant performance optimizations for - MultiIndex validation when dealing with large datasets containing - duplicate values. The check function must be idempotent and produce - the same result whether applied to unique values or full values. - *New in version 0.21.0* + :param determined_by_unique: If True, indicates that this check's + result is fully determined by the unique values in the data, meaning + duplicate values don't affect the outcome. This enables significant + performance optimizations for MultiIndex validation when dealing with + large datasets. If True, the check function must produce the same result + whether applied to unique values or full values. :param check_kwargs: key-word arguments to pass into ``check_fn`` :example: @@ -185,7 +184,7 @@ def __init__( self.n_failure_cases = n_failure_cases self.title = title self.description = description - self.supports_unique_optimization = supports_unique_optimization + self.determined_by_unique = determined_by_unique if groupby is None and groups is not None: raise ValueError( @@ -245,9 +244,9 @@ def equal_to(cls, value: Any, **kwargs) -> "Check": :param value: values in this data object must be equal to this value. """ - # Set supports_unique_optimization=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + # Set determined_by_unique=True by default since this check + # is determined by unique values only + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "equal_to", kwargs, @@ -261,9 +260,9 @@ def not_equal_to(cls, value: Any, **kwargs) -> "Check": :param value: This value must not occur in the data object. 
""" - # Set supports_unique_optimization=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + # Set determined_by_unique=True by default since this check + # is determined by unique values only + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "not_equal_to", kwargs, @@ -283,9 +282,9 @@ def greater_than(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than", kwargs, @@ -303,9 +302,9 @@ def greater_than_or_equal_to(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than_or_equal_to", kwargs, @@ -323,9 +322,9 @@ def less_than(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than", kwargs, @@ -343,9 +342,9 @@ def less_than_or_equal_to(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than_or_equal_to", kwargs, @@ -388,9 +387,9 @@ def in_range( f"The combination of min_value = {min_value} and " f"max_value = {max_value} defines an empty interval!" ) - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "in_range", kwargs, @@ -421,9 +420,9 @@ def isin(cls, allowed_values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument allowed_values must be iterable. Got {allowed_values}" ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "isin", kwargs, @@ -453,9 +452,9 @@ def notin(cls, forbidden_values: Iterable, **kwargs) -> "Check": "Argument forbidden_values must be iterable. 
" f"Got {forbidden_values}" ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "notin", kwargs, @@ -477,9 +476,9 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check": raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_matches", kwargs, @@ -503,9 +502,9 @@ def str_contains( raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_contains", kwargs, @@ -521,9 +520,9 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_startswith", kwargs, @@ -538,9 +537,9 @@ def str_endswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_endswith", kwargs, @@ -565,9 +564,9 @@ def str_length( "At least a minimum or a maximum need to be specified. Got " "None." ) - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_length", kwargs, @@ -592,9 +591,9 @@ def unique_values_eq(cls, values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument values must be iterable. 
Got {values}" ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "unique_values_eq", kwargs, diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 6fb310c06..1ab79a5c1 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -584,23 +584,23 @@ def _can_optimize_level(self, index_schema) -> bool: :param index_schema: The schema for this level :returns: True if optimization can be applied to this level """ - # Check if all checks support unique optimization + # Check whether all checks are determined by unique values # Note that if there are no checks all([]) returns True return all( - self._check_supports_unique_optimization(check) + self._check_determined_by_unique(check) for check in index_schema.checks ) - def _check_supports_unique_optimization(self, check) -> bool: - """Determine if a check can operate on unique values only. + def _check_determined_by_unique(self, check) -> bool: + """Determine if a check is determined by unique values only. :param check: The check to analyze - :returns: True if the check supports unique value optimization + :returns: True if the check result is determined by unique values """ - # Check if the check has explicit support for optimization - # All built-in checks that support optimization have this property set in Phase 1 - if hasattr(check, "supports_unique_optimization"): - return check.supports_unique_optimization + # Check if the check result is determined by unique values + # All built-in checks that are determined by unique values have this property set + if hasattr(check, "determined_by_unique"): + return check.determined_by_unique # Conservative default for checks without the property (shouldn't happen for modern checks) return False diff --git a/tests/pandas/test_schema_components.py b/tests/pandas/test_schema_components.py index a5de22cc5..1e2fd4e68 100644 --- a/tests/pandas/test_schema_components.py +++ b/tests/pandas/test_schema_components.py @@ -1011,7 +1011,7 @@ def test_multiindex_optimization_with_sub_index() -> None: ), # Optimizable Check( lambda s: len(s) > 50, - supports_unique_optimization=False, + determined_by_unique=False, ), # NOT optimizable ], name="animal", @@ -1098,9 +1098,7 @@ def test_multiindex_optimization_path_selection( ( [ Check.str_matches(r"^test$"), - Check( - lambda s: len(s) > 100, supports_unique_optimization=False - ), + Check(lambda s: len(s) > 100, determined_by_unique=False), ], False, ), @@ -1111,7 +1109,7 @@ def test_multiindex_optimization_path_selection( [ Check( lambda s: s.nunique() > 10, - supports_unique_optimization=False, + determined_by_unique=False, ) ], False, @@ -1138,16 +1136,12 @@ def test_multiindex_can_optimize_level( (Check.greater_than(5), True), # Explicitly non-optimizable check ( - Check( - lambda s: s.nunique() > 10, supports_unique_optimization=False - ), + Check(lambda s: s.nunique() > 10, determined_by_unique=False), False, ), # Custom check marked as optimizable ( - Check( - lambda s: s.str.len() > 2, supports_unique_optimization=True - ), + Check(lambda s: s.str.len() > 2, determined_by_unique=True), True, ), # Built-in optimizable check - isin @@ -1156,14 +1150,14 @@ def test_multiindex_can_optimize_level( 
(Check.str_matches(r"^test$"), True), ], ) -def test_check_supports_unique_optimization( +def test_check_determined_by_unique( check, expected_supports_optimization: bool ) -> None: """Test individual check support detection for unique optimization.""" from pandera.backends.pandas.components import MultiIndexBackend backend = MultiIndexBackend() - result = backend._check_supports_unique_optimization(check) + result = backend._check_determined_by_unique(check) assert result is expected_supports_optimization From 5a8f29f822009535d6697c1666ec601f0ea8dc47 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 22:44:35 -0400 Subject: [PATCH 06/11] improved comments Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 1ab79a5c1..0a1539e34 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -628,13 +628,14 @@ def _validate_level_optimized( :param lazy: if True, collect errors instead of raising immediately """ try: - # Use unique values directly from MultiIndex levels + # Use unique values. Note that we use the MultiIndex.unique() method + # to get the unique values, rather than multiindex.levels[level_pos] + # which can have extra values that don't appear in the full data. unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) # Run validation on unique values only, using lazy=False to cut to # full validation as soon as we hit a failure - index_schema.validate( unique_stub_df, head=head, From c083cb3544f7ba1f34cca100d88a152e6258d3ec Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 09:07:17 -0400 Subject: [PATCH 07/11] consolidation Signed-off-by: Adam Merberg --- pandera/api/base/checks.py | 16 +++++++++- pandera/api/checks.py | 60 ++++++++++---------------------------- 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/pandera/api/base/checks.py b/pandera/api/base/checks.py index 0f4790699..41f14747e 100644 --- a/pandera/api/base/checks.py +++ b/pandera/api/base/checks.py @@ -123,9 +123,23 @@ def from_builtin_check_name( init_kwargs, error: Union[str, Callable], statistics: Optional[dict[str, Any]] = None, + defaults: Optional[dict[str, Any]] = None, **check_kwargs, ): - """Create a Check object from a built-in check's name.""" + """Create a Check object from a built-in check's name. + + :param name: Name of the built-in check function + :param init_kwargs: Keyword arguments to pass to the Check constructor + :param error: Error message or callable for this check + :param statistics: Raw check constraint values + :param defaults: Default values to apply to init_kwargs if not already set + :param check_kwargs: Additional keyword arguments + """ + # Apply defaults to init_kwargs if provided + if defaults: + for key, value in defaults.items(): + init_kwargs.setdefault(key, value) + kws = {**init_kwargs, **check_kwargs} if "error" not in kws: kws["error"] = error diff --git a/pandera/api/checks.py b/pandera/api/checks.py index 111b0a537..526ab289f 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -244,13 +244,11 @@ def equal_to(cls, value: Any, **kwargs) -> "Check": :param value: values in this data object must be equal to this value. 
""" - # Set determined_by_unique=True by default since this check - # is determined by unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "equal_to", kwargs, error=f"equal_to({value})", + defaults={"determined_by_unique": True}, value=value, ) @@ -260,13 +258,11 @@ def not_equal_to(cls, value: Any, **kwargs) -> "Check": :param value: This value must not occur in the data object. """ - # Set determined_by_unique=True by default since this check - # is determined by unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "not_equal_to", kwargs, error=f"not_equal_to({value})", + defaults={"determined_by_unique": True}, value=value, ) @@ -282,13 +278,11 @@ def greater_than(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than", kwargs, error=f"greater_than({min_value})", + defaults={"determined_by_unique": True}, min_value=min_value, ) @@ -302,13 +296,11 @@ def greater_than_or_equal_to(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than_or_equal_to", kwargs, error=f"greater_than_or_equal_to({min_value})", + defaults={"determined_by_unique": True}, min_value=min_value, ) @@ -322,13 +314,11 @@ def less_than(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than", kwargs, error=f"less_than({max_value})", + defaults={"determined_by_unique": True}, max_value=max_value, ) @@ -342,13 +332,11 @@ def less_than_or_equal_to(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than_or_equal_to", kwargs, error=f"less_than_or_equal_to({max_value})", + defaults={"determined_by_unique": True}, max_value=max_value, ) @@ -387,13 +375,11 @@ def in_range( f"The combination of min_value = {min_value} and " f"max_value = {max_value} defines an empty interval!" ) - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "in_range", kwargs, error=f"in_range({min_value}, {max_value})", + defaults={"determined_by_unique": True}, min_value=min_value, max_value=max_value, include_min=include_min, @@ -420,13 +406,11 @@ def isin(cls, allowed_values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument allowed_values must be iterable. 
Got {allowed_values}" ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "isin", kwargs, error=f"isin({allowed_values})", + defaults={"determined_by_unique": True}, statistics={"allowed_values": allowed_values}, allowed_values=allowed_values_mod, ) @@ -452,13 +436,11 @@ def notin(cls, forbidden_values: Iterable, **kwargs) -> "Check": "Argument forbidden_values must be iterable. " f"Got {forbidden_values}" ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "notin", kwargs, error=f"notin({forbidden_values})", + defaults={"determined_by_unique": True}, statistics={"forbidden_values": forbidden_values}, forbidden_values=forbidden_values_mod, ) @@ -476,13 +458,11 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check": raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_matches", kwargs, error=f"str_matches('{pattern}')", + defaults={"determined_by_unique": True}, statistics={"pattern": pattern}, pattern=pattern, ) @@ -502,13 +482,11 @@ def str_contains( raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_contains", kwargs, error=f"str_contains('{pattern}')", + defaults={"determined_by_unique": True}, statistics={"pattern": pattern}, pattern=pattern, ) @@ -520,13 +498,11 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_startswith", kwargs, error=f"str_startswith('{string}')", + defaults={"determined_by_unique": True}, string=string, ) @@ -537,13 +513,11 @@ def str_endswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_endswith", kwargs, error=f"str_endswith('{string}')", + defaults={"determined_by_unique": True}, string=string, ) @@ -564,13 +538,11 @@ def str_length( "At least a minimum or a maximum need to be specified. Got " "None." 
) - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_length", kwargs, error=f"str_length({min_value}, {max_value})", + defaults={"determined_by_unique": True}, min_value=min_value, max_value=max_value, ) @@ -591,13 +563,11 @@ def unique_values_eq(cls, values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument values must be iterable. Got {values}" ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "unique_values_eq", kwargs, error=f"unique_values_eq({values})", + defaults={"determined_by_unique": True}, statistics={"values": values_mod}, values=values_mod, ) From 313d99cddf76a5f69a5d025d9b810dee1b19ea96 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 09:55:12 -0400 Subject: [PATCH 08/11] update comment re unique and nulls Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 0a1539e34..24df990fc 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -628,9 +628,11 @@ def _validate_level_optimized( :param lazy: if True, collect errors instead of raising immediately """ try: - # Use unique values. Note that we use the MultiIndex.unique() method + # Use unique values. Note that we use the MultiIndex.unique method # to get the unique values, rather than multiindex.levels[level_pos] # which can have extra values that don't appear in the full data. + # Additionally, multiindex.unique will include nan if present, + # whereas multiindex.levels[level_pos] will not. unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) From d8827462b36adde2039617325cac531ddbe6aa78 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 15:01:15 -0400 Subject: [PATCH 09/11] cleanup Signed-off-by: Adam Merberg --- pandera/api/base/checks.py | 10 +--- pandera/api/checks.py | 1 + pandera/backends/pandas/components.py | 50 +++++++------------ tests/pandas/test_schema_components.py | 69 -------------------------- 4 files changed, 19 insertions(+), 111 deletions(-) diff --git a/pandera/api/base/checks.py b/pandera/api/base/checks.py index 41f14747e..089e2b133 100644 --- a/pandera/api/base/checks.py +++ b/pandera/api/base/checks.py @@ -126,15 +126,7 @@ def from_builtin_check_name( defaults: Optional[dict[str, Any]] = None, **check_kwargs, ): - """Create a Check object from a built-in check's name. 
- - :param name: Name of the built-in check function - :param init_kwargs: Keyword arguments to pass to the Check constructor - :param error: Error message or callable for this check - :param statistics: Raw check constraint values - :param defaults: Default values to apply to init_kwargs if not already set - :param check_kwargs: Additional keyword arguments - """ + """Create a Check object from a built-in check's name.""" # Apply defaults to init_kwargs if provided if defaults: for key, value in defaults.items(): diff --git a/pandera/api/checks.py b/pandera/api/checks.py index 526ab289f..9fe7599a7 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -498,6 +498,7 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ + return cls.from_builtin_check_name( "str_startswith", kwargs, diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 24df990fc..a547c033e 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -460,6 +460,8 @@ def validate( if not inplace: check_obj = check_obj.copy() + validate_full_df = not (head or tail or sample) + # Ensure the object has a MultiIndex if not is_multiindex(check_obj.index): # Allow an exception for a *single-level* Index when the schema also @@ -533,8 +535,14 @@ def validate( index_schema = deepcopy(index_schema) index_schema.coerce = False - # Check if we can optimize validation for this level - can_optimize = self._can_optimize_level(index_schema) + # Check if we can optimize validation for this level. We skip optimization + # if we're validating only a subset of the data because subsetting the data + # doesn't commute with taking unique values, which can lead to inconsistent + # results. For instance, the check may fail on the first n unique values but + # pass on the first n values. + can_optimize = validate_full_df and self._can_optimize_level( + index_schema + ) try: if can_optimize: @@ -543,14 +551,10 @@ def validate( check_obj.index, level_pos, index_schema, - head=head, - tail=tail, - sample=sample, - random_state=random_state, lazy=lazy, ) else: - # Fall back to traditional validation with full materialization + # Fall back to validating all of the values. self._validate_level_with_full_materialization( check_obj.index, level_pos, @@ -599,21 +603,13 @@ def _check_determined_by_unique(self, check) -> bool: """ # Check if the check result is determined by unique values # All built-in checks that are determined by unique values have this property set - if hasattr(check, "determined_by_unique"): - return check.determined_by_unique - - # Conservative default for checks without the property (shouldn't happen for modern checks) - return False + return getattr(check, "determined_by_unique", False) def _validate_level_optimized( self, multiindex: pd.MultiIndex, level_pos: int, index_schema, - head: Optional[int] = None, - tail: Optional[int] = None, - sample: Optional[int] = None, - random_state: Optional[int] = None, lazy: bool = False, ) -> None: """Validate a level using unique values optimization. 
@@ -621,18 +617,14 @@ def _validate_level_optimized( :param multiindex: The MultiIndex being validated :param level_pos: Position of this level in the MultiIndex :param index_schema: The schema for this level - :param head: validate the first n rows - :param tail: validate the last n rows - :param sample: validate a random sample of n rows - :param random_state: random seed for sampling :param lazy: if True, collect errors instead of raising immediately """ try: - # Use unique values. Note that we use the MultiIndex.unique method - # to get the unique values, rather than multiindex.levels[level_pos] - # which can have extra values that don't appear in the full data. - # Additionally, multiindex.unique will include nan if present, - # whereas multiindex.levels[level_pos] will not. + # Use unique values. Use the MultiIndex.unique method rather than + # multiindex.levels[level_pos] which can have extra values that + # don't appear in the full data. Additionally, multiindex.unique + # will include nan if present, whereas multiindex.levels[level_pos] + # will not. unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) @@ -640,10 +632,6 @@ def _validate_level_optimized( # full validation as soon as we hit a failure index_schema.validate( unique_stub_df, - head=head, - tail=tail, - sample=sample, - random_state=random_state, lazy=False, inplace=True, ) @@ -654,10 +642,6 @@ def _validate_level_optimized( multiindex, level_pos, index_schema, - head=head, - tail=tail, - sample=sample, - random_state=random_state, lazy=lazy, ) diff --git a/tests/pandas/test_schema_components.py b/tests/pandas/test_schema_components.py index 1e2fd4e68..fb5594706 100644 --- a/tests/pandas/test_schema_components.py +++ b/tests/pandas/test_schema_components.py @@ -891,75 +891,6 @@ def test_multiindex_incorrect_input(indexes) -> None: MultiIndex(indexes) -def test_multiindex_optimization_with_sub_index() -> None: - """Test that MultiIndex optimization works correctly with sub-MultiIndices. - - This test ensures that when validating MultiIndex levels, the optimization - correctly determines the unique values of the level. Typically, this means - using multiindex.unique(level=level_pos) and NOT multiindex.levels[level_pos], - because .levels can contain values that don't actually appear in sub-MultiIndices - (e.g. after slicing or filtering operations). 
- """ - # Create a large MultiIndex with specific values - original_mi = pd.MultiIndex.from_arrays( - [ - ["valid", "INVALID", "valid", "valid"] - * 500, # Some invalid values - list(range(2000)), - ], - names=["status", "id"], - ) - - # Create original DataFrame - df = pd.DataFrame({"value": range(2000)}, index=original_mi) - - # Filter to create a sub-MultiIndex that excludes all 'INVALID' entries - # This is a common real-world scenario: filtering data but keeping MultiIndex structure - filtered_df = df[df.index.get_level_values("status") == "valid"] - - # Create schema that would reject 'INVALID' values - schema = DataFrameSchema( - columns={"value": Column(int)}, - index=MultiIndex( - [ - Index( - String, - checks=[Check.str_matches(r"^valid$")], - name="status", - ), - Index( - Int, checks=[Check.greater_than_or_equal_to(0)], name="id" - ), - ] - ), - ) - - # This should pass because filtered_df only contains 'valid' values - # If the optimization incorrectly used .levels[level_pos], this would fail - # because filtered_df.index.levels[0] still contains 'INVALID' from the original - validated_df = schema.validate(filtered_df) - assert isinstance(validated_df, pd.DataFrame) - assert len(validated_df) > 0 - - # Verify that the optimization is actually beneficial by ensuring we have duplicates - level0_unique = filtered_df.index.unique(level=0) - level0_total = len(filtered_df.index.get_level_values(0)) - assert ( - len(level0_unique) < level0_total - ), "Should have duplicates for optimization to be beneficial" - - # Additional verification: manually check that .levels would contain phantom values - level0_levels = filtered_df.index.levels[0] - level0_actual = filtered_df.index.unique(level=0) - phantom_values = set(level0_levels) - set(level0_actual) - assert ( - len(phantom_values) > 0 - ), "Test setup should create phantom values in .levels" - assert ( - "INVALID" in phantom_values - ), "Should have 'INVALID' as phantom value" - - @pytest.mark.parametrize( "schema,expected_optimized_calls,expected_full_calls,expected_optimized_levels,expected_full_levels", [ From e9eee9feda9c677af8a0d34eab2aa5a02768a1ea Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 22:50:42 -0400 Subject: [PATCH 10/11] exclude pyspark indexes from optimization for now Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index a547c033e..cbb641b66 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -462,16 +462,16 @@ def validate( validate_full_df = not (head or tail or sample) + is_pyspark_index = ( + type(check_obj).__module__.startswith("pyspark.pandas") + and hasattr(check_obj.index, "__module__") + and check_obj.index.__module__.startswith("pyspark.pandas") + ) # Ensure the object has a MultiIndex if not is_multiindex(check_obj.index): # Allow an exception for a *single-level* Index when the schema also # describes exactly one level to maintain compatibility (e.g. pyspark.pandas # often materializes a single-level MultiIndex as a plain Index). 
- is_pyspark_index = ( - type(check_obj).__module__.startswith("pyspark.pandas") - and hasattr(check_obj.index, "__module__") - and check_obj.index.__module__.startswith("pyspark.pandas") - ) if len(schema.indexes) == 1 and ( is_index(check_obj.index) or is_pyspark_index @@ -540,8 +540,10 @@ def validate( # doesn't commute with taking unique values, which can lead to inconsistent # results. For instance, the check may fail on the first n unique values but # pass on the first n values. - can_optimize = validate_full_df and self._can_optimize_level( - index_schema + can_optimize = ( + validate_full_df + and not is_pyspark_index + and self._can_optimize_level(index_schema) ) try: From 5b8b3bf564ae949754deefc36d696bc6e75a5a95 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Wed, 20 Aug 2025 14:32:19 -0400 Subject: [PATCH 11/11] Revert "exclude pyspark indexes from optimization for now" This reverts commit e9eee9feda9c677af8a0d34eab2aa5a02768a1ea. Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index cbb641b66..a547c033e 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -462,16 +462,16 @@ def validate( validate_full_df = not (head or tail or sample) - is_pyspark_index = ( - type(check_obj).__module__.startswith("pyspark.pandas") - and hasattr(check_obj.index, "__module__") - and check_obj.index.__module__.startswith("pyspark.pandas") - ) # Ensure the object has a MultiIndex if not is_multiindex(check_obj.index): # Allow an exception for a *single-level* Index when the schema also # describes exactly one level to maintain compatibility (e.g. pyspark.pandas # often materializes a single-level MultiIndex as a plain Index). + is_pyspark_index = ( + type(check_obj).__module__.startswith("pyspark.pandas") + and hasattr(check_obj.index, "__module__") + and check_obj.index.__module__.startswith("pyspark.pandas") + ) if len(schema.indexes) == 1 and ( is_index(check_obj.index) or is_pyspark_index @@ -540,10 +540,8 @@ def validate( # doesn't commute with taking unique values, which can lead to inconsistent # results. For instance, the check may fail on the first n unique values but # pass on the first n values. - can_optimize = ( - validate_full_df - and not is_pyspark_index - and self._can_optimize_level(index_schema) + can_optimize = validate_full_df and self._can_optimize_level( + index_schema ) try:
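
Taken together, these patches let element-wise MultiIndex checks run once per
unique level value instead of once per row, falling back to full
materialization when a failure needs per-row error indices. Below is a
minimal usage sketch of the resulting API, assuming a pandera build that
includes this series (the determined_by_unique flag does not exist in any
released pandera version; the schema mirrors the tests added in PATCH 03):

    import pandas as pd
    import pandera as pa

    # A MultiIndex whose first level is heavily duplicated: 400 rows but
    # only two unique "animal" values, so element-wise checks on that level
    # can be evaluated against the unique values alone.
    mi = pd.MultiIndex.from_arrays(
        [["cat", "dog"] * 200, list(range(400))],
        names=["animal", "id"],
    )
    df = pd.DataFrame({"value": range(400)}, index=mi)

    schema = pa.DataFrameSchema(
        columns={"value": pa.Column(int)},
        index=pa.MultiIndex(
            [
                # Built-in checks such as isin() default to
                # determined_by_unique=True, so this level takes the
                # optimized path.
                pa.Index(
                    str,
                    checks=[pa.Check.isin(["cat", "dog"])],
                    name="animal",
                ),
                # A custom check can opt in explicitly, but only if its
                # result really is unaffected by duplicates; a check that
                # counts rows or unique values must leave the flag at its
                # False default so the level is fully materialized.
                pa.Index(
                    int,
                    checks=[pa.Check(lambda s: s >= 0, determined_by_unique=True)],
                    name="id",
                ),
            ]
        ),
    )

    # Each level's checks run once per unique value; on failure the backend
    # re-validates the full level so reported error indices stay correct.
    validated = schema.validate(df)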