From 908edbaa1277bf3c277093b9d5528ed12a7dcada Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 15:57:25 -0400 Subject: [PATCH 01/11] add check property Signed-off-by: Adam Merberg --- pandera/api/checks.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pandera/api/checks.py b/pandera/api/checks.py index d651afc80..d82f779a7 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -34,6 +34,7 @@ def __init__( description: Optional[str] = None, statistics: Optional[dict[str, Any]] = None, strategy: Optional[Any] = None, + supports_unique_optimization: bool = False, **check_kwargs, ) -> None: """Apply a validation function to a data object. @@ -98,6 +99,13 @@ def __init__( :param strategy: A hypothesis strategy, used for implementing data synthesis strategies for this check. See the :ref:`User Guide ` for more details. + :param supports_unique_optimization: If True, indicates that this check + can be safely executed on unique values only, rather than the full + dataset. This enables significant performance optimizations for + MultiIndex validation when dealing with large datasets containing + duplicate values. The check function must be idempotent and produce + the same result whether applied to unique values or full values. + *New in version 0.21.0* :param check_kwargs: key-word arguments to pass into ``check_fn`` :example: @@ -177,6 +185,7 @@ def __init__( self.n_failure_cases = n_failure_cases self.title = title self.description = description + self.supports_unique_optimization = supports_unique_optimization if groupby is None and groups is not None: raise ValueError( @@ -268,6 +277,9 @@ def greater_than(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "greater_than", kwargs, @@ -285,6 +297,9 @@ def greater_than_or_equal_to(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "greater_than_or_equal_to", kwargs, @@ -302,6 +317,9 @@ def less_than(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "less_than", kwargs, @@ -319,6 +337,9 @@ def less_than_or_equal_to(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "less_than_or_equal_to", kwargs, @@ -361,6 +382,9 @@ def in_range( f"The combination of min_value = {min_value} and " f"max_value = {max_value} defines an empty interval!" 
) + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "in_range", kwargs, @@ -391,6 +415,9 @@ def isin(cls, allowed_values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument allowed_values must be iterable. Got {allowed_values}" ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "isin", kwargs, @@ -420,6 +447,9 @@ def notin(cls, forbidden_values: Iterable, **kwargs) -> "Check": "Argument forbidden_values must be iterable. " f"Got {forbidden_values}" ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "notin", kwargs, @@ -441,6 +471,9 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check": raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_matches", kwargs, From 8ac3362eece52dbc0dacce5e823c037f32a27f42 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 16:36:46 -0400 Subject: [PATCH 02/11] initial implementation of unique optimization Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 153 +++++++++++++++++++++++--- 1 file changed, 140 insertions(+), 13 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 1b83a1b72..7402e611f 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -529,24 +529,38 @@ def validate( # Iterate over the expected index levels and validate each level with its # corresponding ``Index`` schema component. for level_pos, index_schema in level_mapping: - stub_df = pd.DataFrame( - index=check_obj.index.get_level_values(level_pos) - ) # We've already taken care of coercion, so we can disable it now. index_schema = deepcopy(index_schema) index_schema.coerce = False + # Check if we can optimize validation for this level + can_optimize = self._can_optimize_level(index_schema) + try: - # Validate using the schema for this level - index_schema.validate( - stub_df, - head=head, - tail=tail, - sample=sample, - random_state=random_state, - lazy=lazy, - inplace=True, - ) + if can_optimize: + # Use optimized validation with unique values only + self._validate_level_optimized( + check_obj.index, + level_pos, + index_schema, + head=head, + tail=tail, + sample=sample, + random_state=random_state, + lazy=lazy, + ) + else: + # Fall back to traditional validation with full materialization + self._validate_level_with_full_materialization( + check_obj.index, + level_pos, + index_schema, + head=head, + tail=tail, + sample=sample, + random_state=random_state, + lazy=lazy, + ) except (SchemaError, SchemaErrors) as exc: self._collect_or_raise(error_handler, exc, schema) @@ -564,6 +578,119 @@ def validate( return check_obj + def _can_optimize_level(self, index_schema) -> bool: + """Check if we can optimize validation for this level. 
+
+        :param index_schema: The schema for this level
+        :returns: True if optimization can be applied to this level
+        """
+        # Check if all checks support unique optimization
+        # Note that if there are no checks all([]) returns True
+        return all(
+            self._check_supports_unique_optimization(check)
+            for check in index_schema.checks
+        )
+
+    def _check_supports_unique_optimization(self, check) -> bool:
+        """Determine if a check can operate on unique values only.
+
+        :param check: The check to analyze
+        :returns: True if the check supports unique value optimization
+        """
+        # Check if the check has explicit support for optimization
+        # All built-in checks that support optimization have this property set in Phase 1
+        if hasattr(check, "supports_unique_optimization"):
+            return check.supports_unique_optimization
+
+        # Conservative default for checks without the property (shouldn't happen for modern checks)
+        return False
+
+    def _validate_level_optimized(
+        self,
+        multiindex: pd.MultiIndex,
+        level_pos: int,
+        index_schema,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = False,
+    ) -> None:
+        """Validate a level using unique values optimization.
+
+        :param multiindex: The MultiIndex being validated
+        :param level_pos: Position of this level in the MultiIndex
+        :param index_schema: The schema for this level
+        :param head: validate the first n rows
+        :param tail: validate the last n rows
+        :param sample: validate a random sample of n rows
+        :param random_state: random seed for sampling
+        :param lazy: if True, collect errors instead of raising immediately
+        """
+        try:
+            # Use unique values directly from MultiIndex levels
+            unique_values = multiindex.unique(level=level_pos)
+            unique_stub_df = pd.DataFrame(index=unique_values)
+
+            # Run validation on unique values only
+            index_schema.validate(
+                unique_stub_df,
+                head=head,
+                tail=tail,
+                sample=sample,
+                random_state=random_state,
+                lazy=lazy,
+                inplace=True,
+            )
+            # If we get here, all unique values passed validation
+
+        except (SchemaError, SchemaErrors):
+            # Validation failed on unique values, need to materialize full values
+            # for proper error reporting with correct indices
+            self._validate_level_with_full_materialization(
+                multiindex,
+                level_pos,
+                index_schema,
+                head=head,
+                tail=tail,
+                sample=sample,
+                random_state=random_state,
+                lazy=lazy,
+            )
+
+    def _validate_level_with_full_materialization(
+        self,
+        multiindex: pd.MultiIndex,
+        level_pos: int,
+        index_schema,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = False,
+    ) -> None:
+        """Validate a level using full materialization.
+
+        This materializes all values (including duplicates) for validation.
+        Used both as a fallback when optimization isn't possible and as a
+        second pass when optimized validation detects a failure, so that
+        errors are reported against the correct indices.
+ """ + # Materialize the full level values + full_values = multiindex.get_level_values(level_pos) + full_stub_df = pd.DataFrame(index=full_values) + + # Run validation on full materialized values + index_schema.validate( + full_stub_df, + head=head, + tail=tail, + sample=sample, + random_state=random_state, + lazy=lazy, + inplace=True, + ) + def _check_strict( self, check_obj: pd.MultiIndex, From 43b51d26122212ecfcc64ecec203f6dc7e5418d3 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 20:57:39 -0400 Subject: [PATCH 03/11] add tests for optimization Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 8 +- tests/pandas/test_schema_components.py | 277 +++++++++++++++++++++++++ 2 files changed, 281 insertions(+), 4 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 7402e611f..6fb310c06 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -632,18 +632,18 @@ def _validate_level_optimized( unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) - # Run validation on unique values only + # Run validation on unique values only, using lazy=False to cut to + # full validation as soon as we hit a failure + index_schema.validate( unique_stub_df, head=head, tail=tail, sample=sample, random_state=random_state, - lazy=lazy, + lazy=False, inplace=True, ) - # If we get here, all unique values passed validation - except (SchemaError, SchemaErrors): # Validation failed on unique values, need to materialize full values # for proper error reporting with correct indices diff --git a/tests/pandas/test_schema_components.py b/tests/pandas/test_schema_components.py index b4d65b773..a5de22cc5 100644 --- a/tests/pandas/test_schema_components.py +++ b/tests/pandas/test_schema_components.py @@ -2,6 +2,7 @@ import copy from typing import Any, Optional +from unittest.mock import patch, MagicMock import numpy as np import pandas as pd @@ -890,6 +891,282 @@ def test_multiindex_incorrect_input(indexes) -> None: MultiIndex(indexes) +def test_multiindex_optimization_with_sub_index() -> None: + """Test that MultiIndex optimization works correctly with sub-MultiIndices. + + This test ensures that when validating MultiIndex levels, the optimization + correctly determines the unique values of the level. Typically, this means + using multiindex.unique(level=level_pos) and NOT multiindex.levels[level_pos], + because .levels can contain values that don't actually appear in sub-MultiIndices + (e.g. after slicing or filtering operations). 
+ """ + # Create a large MultiIndex with specific values + original_mi = pd.MultiIndex.from_arrays( + [ + ["valid", "INVALID", "valid", "valid"] + * 500, # Some invalid values + list(range(2000)), + ], + names=["status", "id"], + ) + + # Create original DataFrame + df = pd.DataFrame({"value": range(2000)}, index=original_mi) + + # Filter to create a sub-MultiIndex that excludes all 'INVALID' entries + # This is a common real-world scenario: filtering data but keeping MultiIndex structure + filtered_df = df[df.index.get_level_values("status") == "valid"] + + # Create schema that would reject 'INVALID' values + schema = DataFrameSchema( + columns={"value": Column(int)}, + index=MultiIndex( + [ + Index( + String, + checks=[Check.str_matches(r"^valid$")], + name="status", + ), + Index( + Int, checks=[Check.greater_than_or_equal_to(0)], name="id" + ), + ] + ), + ) + + # This should pass because filtered_df only contains 'valid' values + # If the optimization incorrectly used .levels[level_pos], this would fail + # because filtered_df.index.levels[0] still contains 'INVALID' from the original + validated_df = schema.validate(filtered_df) + assert isinstance(validated_df, pd.DataFrame) + assert len(validated_df) > 0 + + # Verify that the optimization is actually beneficial by ensuring we have duplicates + level0_unique = filtered_df.index.unique(level=0) + level0_total = len(filtered_df.index.get_level_values(0)) + assert ( + len(level0_unique) < level0_total + ), "Should have duplicates for optimization to be beneficial" + + # Additional verification: manually check that .levels would contain phantom values + level0_levels = filtered_df.index.levels[0] + level0_actual = filtered_df.index.unique(level=0) + phantom_values = set(level0_levels) - set(level0_actual) + assert ( + len(phantom_values) > 0 + ), "Test setup should create phantom values in .levels" + assert ( + "INVALID" in phantom_values + ), "Should have 'INVALID' as phantom value" + + +@pytest.mark.parametrize( + "schema,expected_optimized_calls,expected_full_calls,expected_optimized_levels,expected_full_levels", + [ + # All optimizable checks -> optimized path for both levels + ( + DataFrameSchema( + columns={"value": Column(int)}, + index=MultiIndex( + [ + Index( + String, + checks=[ + Check.str_matches( + r"^(cat|dog)$" + ), # Optimizable + Check.isin(["cat", "dog"]), # Optimizable + ], + name="animal", + ), + Index( + Int, + checks=[ + Check.greater_than_or_equal_to( + 0 + ), # Optimizable + Check.less_than(1000), # Optimizable + ], + name="id", + ), + ] + ), + ), + 2, + 0, + [0, 1], + [], + ), + # Mixed checks -> full materialization for level with non-optimizable, optimized for others + ( + DataFrameSchema( + columns={"value": Column(int)}, + index=MultiIndex( + [ + Index( + String, + checks=[ + Check.str_matches( + r"^(cat|dog)$" + ), # Optimizable + Check( + lambda s: len(s) > 50, + supports_unique_optimization=False, + ), # NOT optimizable + ], + name="animal", + ), + Index( + Int, + checks=[ + Check.greater_than_or_equal_to( + 0 + ), # Optimizable + ], + name="id", + ), + ] + ), + ), + 1, + 1, + [1], + [0], + ), + ], +) +def test_multiindex_optimization_path_selection( + schema: DataFrameSchema, + expected_optimized_calls: int, + expected_full_calls: int, + expected_optimized_levels: list[int], + expected_full_levels: list[int], +) -> None: + """Test that MultiIndex validation chooses the correct optimization path.""" + # Create test MultiIndex with duplicates for optimization benefit + mi = pd.MultiIndex.from_arrays( + [ + 
["cat", "dog", "cat", "dog"] * 100, # Lots of duplicates + list(range(400)), + ], + names=["animal", "id"], + ) + df = pd.DataFrame({"value": range(400)}, index=mi) + + # Mock the backend methods to track which path is taken + with ( + patch( + "pandera.backends.pandas.components.MultiIndexBackend._validate_level_optimized" + ) as mock_optimized, + patch( + "pandera.backends.pandas.components.MultiIndexBackend._validate_level_with_full_materialization" + ) as mock_full, + ): + + schema.validate(df) + + # Verify correct number of calls + assert ( + mock_optimized.call_count == expected_optimized_calls + ), f"Expected {expected_optimized_calls} calls to optimized path, got {mock_optimized.call_count}" + assert ( + mock_full.call_count == expected_full_calls + ), f"Expected {expected_full_calls} calls to full materialization, got {mock_full.call_count}" + + # Verify correct levels were called with correct methods + if expected_optimized_calls > 0: + optimized_calls = [ + call[0][1] for call in mock_optimized.call_args_list + ] # Extract level_pos argument + assert sorted(optimized_calls) == sorted( + expected_optimized_levels + ), f"Expected optimized calls for levels {expected_optimized_levels}, got {optimized_calls}" + + if expected_full_calls > 0: + full_calls = [call[0][1] for call in mock_full.call_args_list] + assert sorted(full_calls) == sorted( + expected_full_levels + ), f"Expected full calls for levels {expected_full_levels}, got {full_calls}" + + +@pytest.mark.parametrize( + "checks,expected_can_optimize", + [ + # Schema with all optimizable checks + ([Check.str_matches(r"^test$"), Check.isin(["test"])], True), + # Schema with mixed checks (includes non-optimizable) + ( + [ + Check.str_matches(r"^test$"), + Check( + lambda s: len(s) > 100, supports_unique_optimization=False + ), + ], + False, + ), + # Schema with no checks + ([], True), + # Schema with only non-optimizable checks + ( + [ + Check( + lambda s: s.nunique() > 10, + supports_unique_optimization=False, + ) + ], + False, + ), + ], +) +def test_multiindex_can_optimize_level( + checks: list, expected_can_optimize: bool +) -> None: + """Test the _can_optimize_level decision logic.""" + from pandera.backends.pandas.components import MultiIndexBackend + + backend = MultiIndexBackend() + schema = Index(String, checks=checks) + + result = backend._can_optimize_level(schema) + assert result is expected_can_optimize + + +@pytest.mark.parametrize( + "check,expected_supports_optimization", + [ + # Built-in optimizable check + (Check.greater_than(5), True), + # Explicitly non-optimizable check + ( + Check( + lambda s: s.nunique() > 10, supports_unique_optimization=False + ), + False, + ), + # Custom check marked as optimizable + ( + Check( + lambda s: s.str.len() > 2, supports_unique_optimization=True + ), + True, + ), + # Built-in optimizable check - isin + (Check.isin(["test"]), True), + # Built-in optimizable check - str_matches + (Check.str_matches(r"^test$"), True), + ], +) +def test_check_supports_unique_optimization( + check, expected_supports_optimization: bool +) -> None: + """Test individual check support detection for unique optimization.""" + from pandera.backends.pandas.components import MultiIndexBackend + + backend = MultiIndexBackend() + result = backend._check_supports_unique_optimization(check) + assert result is expected_supports_optimization + + def test_index_validation_pandas_string_dtype(): """Test that pandas string type is correctly validated.""" From 6b0c9240ca6f075d17ff49106a3524e69b1c67b8 Mon Sep 17 
00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 21:45:02 -0400 Subject: [PATCH 04/11] set more checks to allow optimization Signed-off-by: Adam Merberg --- pandera/api/checks.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pandera/api/checks.py b/pandera/api/checks.py index d82f779a7..61e22a2eb 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -245,6 +245,9 @@ def equal_to(cls, value: Any, **kwargs) -> "Check": :param value: values in this data object must be equal to this value. """ + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "equal_to", kwargs, @@ -258,6 +261,9 @@ def not_equal_to(cls, value: Any, **kwargs) -> "Check": :param value: This value must not occur in the data object. """ + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "not_equal_to", kwargs, @@ -497,6 +503,9 @@ def str_contains( raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_contains", kwargs, @@ -512,7 +521,9 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ - + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_startswith", kwargs, @@ -527,6 +538,9 @@ def str_endswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_endswith", kwargs, @@ -551,6 +565,9 @@ def str_length( "At least a minimum or a maximum need to be specified. Got " "None." ) + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "str_length", kwargs, @@ -575,6 +592,9 @@ def unique_values_eq(cls, values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument values must be iterable. 
Got {values}" ) from exc + # Set supports_unique_optimization=True by default since this check + # is safe to run on unique values only + kwargs.setdefault("supports_unique_optimization", True) return cls.from_builtin_check_name( "unique_values_eq", kwargs, From 1bfcb84fe2a5cb4c5a62360cdccf20361d880af1 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 22:37:11 -0400 Subject: [PATCH 05/11] improve name Signed-off-by: Adam Merberg --- pandera/api/checks.py | 81 +++++++++++++------------- pandera/backends/pandas/components.py | 18 +++--- tests/pandas/test_schema_components.py | 20 +++---- 3 files changed, 56 insertions(+), 63 deletions(-) diff --git a/pandera/api/checks.py b/pandera/api/checks.py index 61e22a2eb..111b0a537 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -34,7 +34,7 @@ def __init__( description: Optional[str] = None, statistics: Optional[dict[str, Any]] = None, strategy: Optional[Any] = None, - supports_unique_optimization: bool = False, + determined_by_unique: bool = False, **check_kwargs, ) -> None: """Apply a validation function to a data object. @@ -99,13 +99,12 @@ def __init__( :param strategy: A hypothesis strategy, used for implementing data synthesis strategies for this check. See the :ref:`User Guide ` for more details. - :param supports_unique_optimization: If True, indicates that this check - can be safely executed on unique values only, rather than the full - dataset. This enables significant performance optimizations for - MultiIndex validation when dealing with large datasets containing - duplicate values. The check function must be idempotent and produce - the same result whether applied to unique values or full values. - *New in version 0.21.0* + :param determined_by_unique: If True, indicates that this check's + result is fully determined by the unique values in the data, meaning + duplicate values don't affect the outcome. This enables significant + performance optimizations for MultiIndex validation when dealing with + large datasets. If True, the check function must produce the same result + whether applied to unique values or full values. :param check_kwargs: key-word arguments to pass into ``check_fn`` :example: @@ -185,7 +184,7 @@ def __init__( self.n_failure_cases = n_failure_cases self.title = title self.description = description - self.supports_unique_optimization = supports_unique_optimization + self.determined_by_unique = determined_by_unique if groupby is None and groups is not None: raise ValueError( @@ -245,9 +244,9 @@ def equal_to(cls, value: Any, **kwargs) -> "Check": :param value: values in this data object must be equal to this value. """ - # Set supports_unique_optimization=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + # Set determined_by_unique=True by default since this check + # is determined by unique values only + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "equal_to", kwargs, @@ -261,9 +260,9 @@ def not_equal_to(cls, value: Any, **kwargs) -> "Check": :param value: This value must not occur in the data object. 
""" - # Set supports_unique_optimization=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + # Set determined_by_unique=True by default since this check + # is determined by unique values only + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "not_equal_to", kwargs, @@ -283,9 +282,9 @@ def greater_than(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than", kwargs, @@ -303,9 +302,9 @@ def greater_than_or_equal_to(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than_or_equal_to", kwargs, @@ -323,9 +322,9 @@ def less_than(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than", kwargs, @@ -343,9 +342,9 @@ def less_than_or_equal_to(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than_or_equal_to", kwargs, @@ -388,9 +387,9 @@ def in_range( f"The combination of min_value = {min_value} and " f"max_value = {max_value} defines an empty interval!" ) - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "in_range", kwargs, @@ -421,9 +420,9 @@ def isin(cls, allowed_values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument allowed_values must be iterable. Got {allowed_values}" ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "isin", kwargs, @@ -453,9 +452,9 @@ def notin(cls, forbidden_values: Iterable, **kwargs) -> "Check": "Argument forbidden_values must be iterable. 
" f"Got {forbidden_values}" ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "notin", kwargs, @@ -477,9 +476,9 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check": raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_matches", kwargs, @@ -503,9 +502,9 @@ def str_contains( raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_contains", kwargs, @@ -521,9 +520,9 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_startswith", kwargs, @@ -538,9 +537,9 @@ def str_endswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_endswith", kwargs, @@ -565,9 +564,9 @@ def str_length( "At least a minimum or a maximum need to be specified. Got " "None." ) - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_length", kwargs, @@ -592,9 +591,9 @@ def unique_values_eq(cls, values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument values must be iterable. 
Got {values}" ) from exc - # Set supports_unique_optimization=True by default since this check + # Set determined_by_unique=True by default since this check # is safe to run on unique values only - kwargs.setdefault("supports_unique_optimization", True) + kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "unique_values_eq", kwargs, diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 6fb310c06..1ab79a5c1 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -584,23 +584,23 @@ def _can_optimize_level(self, index_schema) -> bool: :param index_schema: The schema for this level :returns: True if optimization can be applied to this level """ - # Check if all checks support unique optimization + # Check whether all checks are determined by unique values # Note that if there are no checks all([]) returns True return all( - self._check_supports_unique_optimization(check) + self._check_determined_by_unique(check) for check in index_schema.checks ) - def _check_supports_unique_optimization(self, check) -> bool: - """Determine if a check can operate on unique values only. + def _check_determined_by_unique(self, check) -> bool: + """Determine if a check is determined by unique values only. :param check: The check to analyze - :returns: True if the check supports unique value optimization + :returns: True if the check result is determined by unique values """ - # Check if the check has explicit support for optimization - # All built-in checks that support optimization have this property set in Phase 1 - if hasattr(check, "supports_unique_optimization"): - return check.supports_unique_optimization + # Check if the check result is determined by unique values + # All built-in checks that are determined by unique values have this property set + if hasattr(check, "determined_by_unique"): + return check.determined_by_unique # Conservative default for checks without the property (shouldn't happen for modern checks) return False diff --git a/tests/pandas/test_schema_components.py b/tests/pandas/test_schema_components.py index a5de22cc5..1e2fd4e68 100644 --- a/tests/pandas/test_schema_components.py +++ b/tests/pandas/test_schema_components.py @@ -1011,7 +1011,7 @@ def test_multiindex_optimization_with_sub_index() -> None: ), # Optimizable Check( lambda s: len(s) > 50, - supports_unique_optimization=False, + determined_by_unique=False, ), # NOT optimizable ], name="animal", @@ -1098,9 +1098,7 @@ def test_multiindex_optimization_path_selection( ( [ Check.str_matches(r"^test$"), - Check( - lambda s: len(s) > 100, supports_unique_optimization=False - ), + Check(lambda s: len(s) > 100, determined_by_unique=False), ], False, ), @@ -1111,7 +1109,7 @@ def test_multiindex_optimization_path_selection( [ Check( lambda s: s.nunique() > 10, - supports_unique_optimization=False, + determined_by_unique=False, ) ], False, @@ -1138,16 +1136,12 @@ def test_multiindex_can_optimize_level( (Check.greater_than(5), True), # Explicitly non-optimizable check ( - Check( - lambda s: s.nunique() > 10, supports_unique_optimization=False - ), + Check(lambda s: s.nunique() > 10, determined_by_unique=False), False, ), # Custom check marked as optimizable ( - Check( - lambda s: s.str.len() > 2, supports_unique_optimization=True - ), + Check(lambda s: s.str.len() > 2, determined_by_unique=True), True, ), # Built-in optimizable check - isin @@ -1156,14 +1150,14 @@ def test_multiindex_can_optimize_level( 
(Check.str_matches(r"^test$"), True), ], ) -def test_check_supports_unique_optimization( +def test_check_determined_by_unique( check, expected_supports_optimization: bool ) -> None: """Test individual check support detection for unique optimization.""" from pandera.backends.pandas.components import MultiIndexBackend backend = MultiIndexBackend() - result = backend._check_supports_unique_optimization(check) + result = backend._check_determined_by_unique(check) assert result is expected_supports_optimization From 5a8f29f822009535d6697c1666ec601f0ea8dc47 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Mon, 18 Aug 2025 22:44:35 -0400 Subject: [PATCH 06/11] improved comments Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 1ab79a5c1..0a1539e34 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -628,13 +628,14 @@ def _validate_level_optimized( :param lazy: if True, collect errors instead of raising immediately """ try: - # Use unique values directly from MultiIndex levels + # Use unique values. Note that we use the MultiIndex.unique() method + # to get the unique values, rather than multiindex.levels[level_pos] + # which can have extra values that don't appear in the full data. unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) # Run validation on unique values only, using lazy=False to cut to # full validation as soon as we hit a failure - index_schema.validate( unique_stub_df, head=head, From c083cb3544f7ba1f34cca100d88a152e6258d3ec Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 09:07:17 -0400 Subject: [PATCH 07/11] consolidation Signed-off-by: Adam Merberg --- pandera/api/base/checks.py | 16 +++++++++- pandera/api/checks.py | 60 ++++++++++---------------------------- 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/pandera/api/base/checks.py b/pandera/api/base/checks.py index 0f4790699..41f14747e 100644 --- a/pandera/api/base/checks.py +++ b/pandera/api/base/checks.py @@ -123,9 +123,23 @@ def from_builtin_check_name( init_kwargs, error: Union[str, Callable], statistics: Optional[dict[str, Any]] = None, + defaults: Optional[dict[str, Any]] = None, **check_kwargs, ): - """Create a Check object from a built-in check's name.""" + """Create a Check object from a built-in check's name. + + :param name: Name of the built-in check function + :param init_kwargs: Keyword arguments to pass to the Check constructor + :param error: Error message or callable for this check + :param statistics: Raw check constraint values + :param defaults: Default values to apply to init_kwargs if not already set + :param check_kwargs: Additional keyword arguments + """ + # Apply defaults to init_kwargs if provided + if defaults: + for key, value in defaults.items(): + init_kwargs.setdefault(key, value) + kws = {**init_kwargs, **check_kwargs} if "error" not in kws: kws["error"] = error diff --git a/pandera/api/checks.py b/pandera/api/checks.py index 111b0a537..526ab289f 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -244,13 +244,11 @@ def equal_to(cls, value: Any, **kwargs) -> "Check": :param value: values in this data object must be equal to this value. 
""" - # Set determined_by_unique=True by default since this check - # is determined by unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "equal_to", kwargs, error=f"equal_to({value})", + defaults={"determined_by_unique": True}, value=value, ) @@ -260,13 +258,11 @@ def not_equal_to(cls, value: Any, **kwargs) -> "Check": :param value: This value must not occur in the data object. """ - # Set determined_by_unique=True by default since this check - # is determined by unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "not_equal_to", kwargs, error=f"not_equal_to({value})", + defaults={"determined_by_unique": True}, value=value, ) @@ -282,13 +278,11 @@ def greater_than(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than", kwargs, error=f"greater_than({min_value})", + defaults={"determined_by_unique": True}, min_value=min_value, ) @@ -302,13 +296,11 @@ def greater_than_or_equal_to(cls, min_value: Any, **kwargs) -> "Check": """ if min_value is None: raise ValueError("min_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "greater_than_or_equal_to", kwargs, error=f"greater_than_or_equal_to({min_value})", + defaults={"determined_by_unique": True}, min_value=min_value, ) @@ -322,13 +314,11 @@ def less_than(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than", kwargs, error=f"less_than({max_value})", + defaults={"determined_by_unique": True}, max_value=max_value, ) @@ -342,13 +332,11 @@ def less_than_or_equal_to(cls, max_value: Any, **kwargs) -> "Check": """ if max_value is None: raise ValueError("max_value must not be None") - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "less_than_or_equal_to", kwargs, error=f"less_than_or_equal_to({max_value})", + defaults={"determined_by_unique": True}, max_value=max_value, ) @@ -387,13 +375,11 @@ def in_range( f"The combination of min_value = {min_value} and " f"max_value = {max_value} defines an empty interval!" ) - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "in_range", kwargs, error=f"in_range({min_value}, {max_value})", + defaults={"determined_by_unique": True}, min_value=min_value, max_value=max_value, include_min=include_min, @@ -420,13 +406,11 @@ def isin(cls, allowed_values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument allowed_values must be iterable. 
Got {allowed_values}" ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "isin", kwargs, error=f"isin({allowed_values})", + defaults={"determined_by_unique": True}, statistics={"allowed_values": allowed_values}, allowed_values=allowed_values_mod, ) @@ -452,13 +436,11 @@ def notin(cls, forbidden_values: Iterable, **kwargs) -> "Check": "Argument forbidden_values must be iterable. " f"Got {forbidden_values}" ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "notin", kwargs, error=f"notin({forbidden_values})", + defaults={"determined_by_unique": True}, statistics={"forbidden_values": forbidden_values}, forbidden_values=forbidden_values_mod, ) @@ -476,13 +458,11 @@ def str_matches(cls, pattern: Union[str, re.Pattern], **kwargs) -> "Check": raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_matches", kwargs, error=f"str_matches('{pattern}')", + defaults={"determined_by_unique": True}, statistics={"pattern": pattern}, pattern=pattern, ) @@ -502,13 +482,11 @@ def str_contains( raise ValueError( f'pattern="{pattern}" cannot be compiled as regular expression' ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_contains", kwargs, error=f"str_contains('{pattern}')", + defaults={"determined_by_unique": True}, statistics={"pattern": pattern}, pattern=pattern, ) @@ -520,13 +498,11 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_startswith", kwargs, error=f"str_startswith('{string}')", + defaults={"determined_by_unique": True}, string=string, ) @@ -537,13 +513,11 @@ def str_endswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_endswith", kwargs, error=f"str_endswith('{string}')", + defaults={"determined_by_unique": True}, string=string, ) @@ -564,13 +538,11 @@ def str_length( "At least a minimum or a maximum need to be specified. Got " "None." 
) - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "str_length", kwargs, error=f"str_length({min_value}, {max_value})", + defaults={"determined_by_unique": True}, min_value=min_value, max_value=max_value, ) @@ -591,13 +563,11 @@ def unique_values_eq(cls, values: Iterable, **kwargs) -> "Check": raise ValueError( f"Argument values must be iterable. Got {values}" ) from exc - # Set determined_by_unique=True by default since this check - # is safe to run on unique values only - kwargs.setdefault("determined_by_unique", True) return cls.from_builtin_check_name( "unique_values_eq", kwargs, error=f"unique_values_eq({values})", + defaults={"determined_by_unique": True}, statistics={"values": values_mod}, values=values_mod, ) From 313d99cddf76a5f69a5d025d9b810dee1b19ea96 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 09:55:12 -0400 Subject: [PATCH 08/11] update comment re unique and nulls Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 0a1539e34..24df990fc 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -628,9 +628,11 @@ def _validate_level_optimized( :param lazy: if True, collect errors instead of raising immediately """ try: - # Use unique values. Note that we use the MultiIndex.unique() method + # Use unique values. Note that we use the MultiIndex.unique method # to get the unique values, rather than multiindex.levels[level_pos] # which can have extra values that don't appear in the full data. + # Additionally, multiindex.unique will include nan if present, + # whereas multiindex.levels[level_pos] will not. unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) From d8827462b36adde2039617325cac531ddbe6aa78 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 15:01:15 -0400 Subject: [PATCH 09/11] cleanup Signed-off-by: Adam Merberg --- pandera/api/base/checks.py | 10 +--- pandera/api/checks.py | 1 + pandera/backends/pandas/components.py | 50 +++++++------------ tests/pandas/test_schema_components.py | 69 -------------------------- 4 files changed, 19 insertions(+), 111 deletions(-) diff --git a/pandera/api/base/checks.py b/pandera/api/base/checks.py index 41f14747e..089e2b133 100644 --- a/pandera/api/base/checks.py +++ b/pandera/api/base/checks.py @@ -126,15 +126,7 @@ def from_builtin_check_name( defaults: Optional[dict[str, Any]] = None, **check_kwargs, ): - """Create a Check object from a built-in check's name. 
- - :param name: Name of the built-in check function - :param init_kwargs: Keyword arguments to pass to the Check constructor - :param error: Error message or callable for this check - :param statistics: Raw check constraint values - :param defaults: Default values to apply to init_kwargs if not already set - :param check_kwargs: Additional keyword arguments - """ + """Create a Check object from a built-in check's name.""" # Apply defaults to init_kwargs if provided if defaults: for key, value in defaults.items(): diff --git a/pandera/api/checks.py b/pandera/api/checks.py index 526ab289f..9fe7599a7 100644 --- a/pandera/api/checks.py +++ b/pandera/api/checks.py @@ -498,6 +498,7 @@ def str_startswith(cls, string: str, **kwargs) -> "Check": :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ + return cls.from_builtin_check_name( "str_startswith", kwargs, diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index 24df990fc..a547c033e 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -460,6 +460,8 @@ def validate( if not inplace: check_obj = check_obj.copy() + validate_full_df = not (head or tail or sample) + # Ensure the object has a MultiIndex if not is_multiindex(check_obj.index): # Allow an exception for a *single-level* Index when the schema also @@ -533,8 +535,14 @@ def validate( index_schema = deepcopy(index_schema) index_schema.coerce = False - # Check if we can optimize validation for this level - can_optimize = self._can_optimize_level(index_schema) + # Check if we can optimize validation for this level. We skip optimization + # if we're validating only a subset of the data because subsetting the data + # doesn't commute with taking unique values, which can lead to inconsistent + # results. For instance, the check may fail on the first n unique values but + # pass on the first n values. + can_optimize = validate_full_df and self._can_optimize_level( + index_schema + ) try: if can_optimize: @@ -543,14 +551,10 @@ def validate( check_obj.index, level_pos, index_schema, - head=head, - tail=tail, - sample=sample, - random_state=random_state, lazy=lazy, ) else: - # Fall back to traditional validation with full materialization + # Fall back to validating all of the values. self._validate_level_with_full_materialization( check_obj.index, level_pos, @@ -599,21 +603,13 @@ def _check_determined_by_unique(self, check) -> bool: """ # Check if the check result is determined by unique values # All built-in checks that are determined by unique values have this property set - if hasattr(check, "determined_by_unique"): - return check.determined_by_unique - - # Conservative default for checks without the property (shouldn't happen for modern checks) - return False + return getattr(check, "determined_by_unique", False) def _validate_level_optimized( self, multiindex: pd.MultiIndex, level_pos: int, index_schema, - head: Optional[int] = None, - tail: Optional[int] = None, - sample: Optional[int] = None, - random_state: Optional[int] = None, lazy: bool = False, ) -> None: """Validate a level using unique values optimization. 
@@ -621,18 +617,14 @@ def _validate_level_optimized( :param multiindex: The MultiIndex being validated :param level_pos: Position of this level in the MultiIndex :param index_schema: The schema for this level - :param head: validate the first n rows - :param tail: validate the last n rows - :param sample: validate a random sample of n rows - :param random_state: random seed for sampling :param lazy: if True, collect errors instead of raising immediately """ try: - # Use unique values. Note that we use the MultiIndex.unique method - # to get the unique values, rather than multiindex.levels[level_pos] - # which can have extra values that don't appear in the full data. - # Additionally, multiindex.unique will include nan if present, - # whereas multiindex.levels[level_pos] will not. + # Use unique values. Use the MultiIndex.unique method rather than + # multiindex.levels[level_pos] which can have extra values that + # don't appear in the full data. Additionally, multiindex.unique + # will include nan if present, whereas multiindex.levels[level_pos] + # will not. unique_values = multiindex.unique(level=level_pos) unique_stub_df = pd.DataFrame(index=unique_values) @@ -640,10 +632,6 @@ def _validate_level_optimized( # full validation as soon as we hit a failure index_schema.validate( unique_stub_df, - head=head, - tail=tail, - sample=sample, - random_state=random_state, lazy=False, inplace=True, ) @@ -654,10 +642,6 @@ def _validate_level_optimized( multiindex, level_pos, index_schema, - head=head, - tail=tail, - sample=sample, - random_state=random_state, lazy=lazy, ) diff --git a/tests/pandas/test_schema_components.py b/tests/pandas/test_schema_components.py index 1e2fd4e68..fb5594706 100644 --- a/tests/pandas/test_schema_components.py +++ b/tests/pandas/test_schema_components.py @@ -891,75 +891,6 @@ def test_multiindex_incorrect_input(indexes) -> None: MultiIndex(indexes) -def test_multiindex_optimization_with_sub_index() -> None: - """Test that MultiIndex optimization works correctly with sub-MultiIndices. - - This test ensures that when validating MultiIndex levels, the optimization - correctly determines the unique values of the level. Typically, this means - using multiindex.unique(level=level_pos) and NOT multiindex.levels[level_pos], - because .levels can contain values that don't actually appear in sub-MultiIndices - (e.g. after slicing or filtering operations). 
- """ - # Create a large MultiIndex with specific values - original_mi = pd.MultiIndex.from_arrays( - [ - ["valid", "INVALID", "valid", "valid"] - * 500, # Some invalid values - list(range(2000)), - ], - names=["status", "id"], - ) - - # Create original DataFrame - df = pd.DataFrame({"value": range(2000)}, index=original_mi) - - # Filter to create a sub-MultiIndex that excludes all 'INVALID' entries - # This is a common real-world scenario: filtering data but keeping MultiIndex structure - filtered_df = df[df.index.get_level_values("status") == "valid"] - - # Create schema that would reject 'INVALID' values - schema = DataFrameSchema( - columns={"value": Column(int)}, - index=MultiIndex( - [ - Index( - String, - checks=[Check.str_matches(r"^valid$")], - name="status", - ), - Index( - Int, checks=[Check.greater_than_or_equal_to(0)], name="id" - ), - ] - ), - ) - - # This should pass because filtered_df only contains 'valid' values - # If the optimization incorrectly used .levels[level_pos], this would fail - # because filtered_df.index.levels[0] still contains 'INVALID' from the original - validated_df = schema.validate(filtered_df) - assert isinstance(validated_df, pd.DataFrame) - assert len(validated_df) > 0 - - # Verify that the optimization is actually beneficial by ensuring we have duplicates - level0_unique = filtered_df.index.unique(level=0) - level0_total = len(filtered_df.index.get_level_values(0)) - assert ( - len(level0_unique) < level0_total - ), "Should have duplicates for optimization to be beneficial" - - # Additional verification: manually check that .levels would contain phantom values - level0_levels = filtered_df.index.levels[0] - level0_actual = filtered_df.index.unique(level=0) - phantom_values = set(level0_levels) - set(level0_actual) - assert ( - len(phantom_values) > 0 - ), "Test setup should create phantom values in .levels" - assert ( - "INVALID" in phantom_values - ), "Should have 'INVALID' as phantom value" - - @pytest.mark.parametrize( "schema,expected_optimized_calls,expected_full_calls,expected_optimized_levels,expected_full_levels", [ From e9eee9feda9c677af8a0d34eab2aa5a02768a1ea Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Tue, 19 Aug 2025 22:50:42 -0400 Subject: [PATCH 10/11] exclude pyspark indexes from optimization for now Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index a547c033e..cbb641b66 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -462,16 +462,16 @@ def validate( validate_full_df = not (head or tail or sample) + is_pyspark_index = ( + type(check_obj).__module__.startswith("pyspark.pandas") + and hasattr(check_obj.index, "__module__") + and check_obj.index.__module__.startswith("pyspark.pandas") + ) # Ensure the object has a MultiIndex if not is_multiindex(check_obj.index): # Allow an exception for a *single-level* Index when the schema also # describes exactly one level to maintain compatibility (e.g. pyspark.pandas # often materializes a single-level MultiIndex as a plain Index). 
- is_pyspark_index = ( - type(check_obj).__module__.startswith("pyspark.pandas") - and hasattr(check_obj.index, "__module__") - and check_obj.index.__module__.startswith("pyspark.pandas") - ) if len(schema.indexes) == 1 and ( is_index(check_obj.index) or is_pyspark_index @@ -540,8 +540,10 @@ def validate( # doesn't commute with taking unique values, which can lead to inconsistent # results. For instance, the check may fail on the first n unique values but # pass on the first n values. - can_optimize = validate_full_df and self._can_optimize_level( - index_schema + can_optimize = ( + validate_full_df + and not is_pyspark_index + and self._can_optimize_level(index_schema) ) try: From 5b8b3bf564ae949754deefc36d696bc6e75a5a95 Mon Sep 17 00:00:00 2001 From: Adam Merberg Date: Wed, 20 Aug 2025 14:32:19 -0400 Subject: [PATCH 11/11] Revert "exclude pyspark indexes from optimization for now" This reverts commit e9eee9feda9c677af8a0d34eab2aa5a02768a1ea. Signed-off-by: Adam Merberg --- pandera/backends/pandas/components.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandera/backends/pandas/components.py b/pandera/backends/pandas/components.py index cbb641b66..a547c033e 100644 --- a/pandera/backends/pandas/components.py +++ b/pandera/backends/pandas/components.py @@ -462,16 +462,16 @@ def validate( validate_full_df = not (head or tail or sample) - is_pyspark_index = ( - type(check_obj).__module__.startswith("pyspark.pandas") - and hasattr(check_obj.index, "__module__") - and check_obj.index.__module__.startswith("pyspark.pandas") - ) # Ensure the object has a MultiIndex if not is_multiindex(check_obj.index): # Allow an exception for a *single-level* Index when the schema also # describes exactly one level to maintain compatibility (e.g. pyspark.pandas # often materializes a single-level MultiIndex as a plain Index). + is_pyspark_index = ( + type(check_obj).__module__.startswith("pyspark.pandas") + and hasattr(check_obj.index, "__module__") + and check_obj.index.__module__.startswith("pyspark.pandas") + ) if len(schema.indexes) == 1 and ( is_index(check_obj.index) or is_pyspark_index @@ -540,10 +540,8 @@ def validate( # doesn't commute with taking unique values, which can lead to inconsistent # results. For instance, the check may fail on the first n unique values but # pass on the first n values. - can_optimize = ( - validate_full_df - and not is_pyspark_index - and self._can_optimize_level(index_schema) + can_optimize = validate_full_df and self._can_optimize_level( + index_schema ) try:
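
Taken together, these patches let element-wise MultiIndex checks run once per
unique level value instead of once per row, falling back to full
materialization when a failure needs per-row error indices. Below is a
minimal usage sketch of the resulting API, assuming a pandera build that
includes this series (the determined_by_unique flag does not exist in any
released pandera version; the schema mirrors the tests added in PATCH 03):

    import pandas as pd
    import pandera as pa

    # A MultiIndex whose first level is heavily duplicated: 400 rows but
    # only two unique "animal" values, so element-wise checks on that level
    # can be evaluated against the unique values alone.
    mi = pd.MultiIndex.from_arrays(
        [["cat", "dog"] * 200, list(range(400))],
        names=["animal", "id"],
    )
    df = pd.DataFrame({"value": range(400)}, index=mi)

    schema = pa.DataFrameSchema(
        columns={"value": pa.Column(int)},
        index=pa.MultiIndex(
            [
                # Built-in checks such as isin() default to
                # determined_by_unique=True, so this level takes the
                # optimized path.
                pa.Index(
                    str,
                    checks=[pa.Check.isin(["cat", "dog"])],
                    name="animal",
                ),
                # A custom check can opt in explicitly, but only if its
                # result really is unaffected by duplicates; a check that
                # counts rows or unique values must leave the flag at its
                # False default so the level is fully materialized.
                pa.Index(
                    int,
                    checks=[pa.Check(lambda s: s >= 0, determined_by_unique=True)],
                    name="id",
                ),
            ]
        ),
    )

    # Each level's checks run once per unique value; on failure the backend
    # re-validates the full level so reported error indices stay correct.
    validated = schema.validate(df)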