From f330a99d9a5140bdaa8800c540f234a31c1bbef7 Mon Sep 17 00:00:00 2001 From: Cangyuan Li Date: Tue, 16 Sep 2025 10:21:25 -0700 Subject: [PATCH 1/4] docs: fix docstring error in is_id --- src/checkedframe/_checks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/checkedframe/_checks.py b/src/checkedframe/_checks.py index e7b9bdc..79cf610 100644 --- a/src/checkedframe/_checks.py +++ b/src/checkedframe/_checks.py @@ -1370,9 +1370,10 @@ def is_id(subset: str | list[str]) -> Check: class MySchema(cf.Schema): - __dataframe_checks__ = [cf.Check.is_id("group")] group = cf.String() + _id_check = cf.Check.is_id("group") + df = pl.DataFrame({"group": ["A", "B", "A"]}) MySchema.validate(df) @@ -1382,8 +1383,7 @@ class MySchema(cf.Schema): .. code-block:: text SchemaError: Found 1 error(s) - __dataframe__: 1 error(s) - - is_id failed: 'group' must uniquely identify the DataFrame + * is_id failed for 3 / 3 (100.00%) rows: group must uniquely identify the DataFrame """ return Check( From 2851d6902a4f86a88f3d9e51f116cd53fa5ee5b2 Mon Sep 17 00:00:00 2001 From: Cangyuan Li Date: Tue, 16 Sep 2025 10:45:53 -0700 Subject: [PATCH 2/4] perf: take `n_unique` fast path in `is_id` when possible --- src/checkedframe/_checks.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/checkedframe/_checks.py b/src/checkedframe/_checks.py index 79cf610..9fe7a1c 100644 --- a/src/checkedframe/_checks.py +++ b/src/checkedframe/_checks.py @@ -261,7 +261,14 @@ def _is_sorted(s: nw.Series, descending: bool) -> bool: def _is_id(df: nw.DataFrame, subset: str | list[str]) -> bool: n_rows = df.shape[0] - n_unique_rows = df.select(subset).unique().shape[0] + + # n_unique on dataframes is not available in narwhals, so if we have only one + # column specified as the subset, take a potential fast path, otherwise fall back to + # a generic version + if isinstance(subset, str): + n_unique_rows = df[subset].n_unique() + else: + n_unique_rows =
df.select(subset).unique().shape[0] return n_rows == n_unique_rows From 0fa6a24ebf0c5283019285518ec25b4a672c0077 Mon Sep 17 00:00:00 2001 From: Cangyuan Li Date: Tue, 16 Sep 2025 14:45:40 -0700 Subject: [PATCH 3/4] chore: ignore ambiguous variable name --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7b736cc..077fd46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ pyperclip = ["pyperclip>=1.0.0"] [dependency-groups] typing = ["typing_extensions"] docs = [ - "checkedframe", "ghp-import==2.1.0", "pydata-sphinx-theme==0.16.1", "sphinx==8.2.3", @@ -53,6 +52,8 @@ profile = "black" plugins = ["checkedframe.mypy"] allow_redefinition = true +[tool.ruff] +ignore = ["E741"] [tool.ruff.format] docstring-code-format = true From f0f1654c2a0198745bc86272ecec8a45f99480dc Mon Sep 17 00:00:00 2001 From: Cangyuan Li Date: Tue, 16 Sep 2025 14:46:09 -0700 Subject: [PATCH 4/4] feat: add built-in check for cardinality ratio between two columns --- src/checkedframe/_checks.py | 92 ++++++++++++++++++++++++++++++++++++- tests/test_checks.py | 38 +++++++++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/src/checkedframe/_checks.py b/src/checkedframe/_checks.py index 9fe7a1c..11bab51 100644 --- a/src/checkedframe/_checks.py +++ b/src/checkedframe/_checks.py @@ -2,10 +2,11 @@ import functools import inspect -from collections.abc import Collection, Sequence -from typing import Any, Callable, Literal, Optional, get_type_hints +from collections.abc import Collection, Iterable, Sequence +from typing import Any, Callable, Literal, Optional, get_args, get_type_hints import narwhals.stable.v1 as nw +import narwhals.stable.v1.typing as nwt from narwhals.stable.v1.dependencies import ( get_cudf, get_modin, @@ -371,6 +372,69 @@ def _str_contains(name: str, pattern: str, literal: bool = False) -> nw.Expr: return nw.col(name).str.contains(pattern, literal=literal) +CardinalityRatio = 
Literal["1:1", "1:m", "m:1"] + + +def _cardinality_ratio( + df: nw.DataFrame, + left: str, + right: str, + cardinality: CardinalityRatio, + by: str | list[str] | None = None, + allow_duplicates: bool = False, +): + index_col = "__checkedframe_temp_cardinality_ratio_private_index__" + result_col = left + + original_lf = df.with_row_index(index_col).lazy() + + if by is None: + by = "__checkedframe_temp_cardinality_ratio_private_by__" + original_lf = original_lf.with_columns(nw.lit(1).alias(by)) + + if isinstance(by, str): + by = [by] + + lf = original_lf.select(left, right, *by) + + if allow_duplicates: + lf = lf.unique() + + if cardinality == "1:1": + result_lf = ( + lf.group_by(by) + .agg( + nw.col(left).n_unique().__eq__(nw.len()), + nw.col(right).n_unique().__eq__(nw.len()), + ) + .select(*by, nw.col(left).__and__(nw.col(right)).alias(result_col)) + ) + elif cardinality == "1:m": + result_lf = ( + lf.group_by(by) + .agg(nw.col(left).n_unique().__eq__(nw.len()).alias(result_col)) + .select(*by, result_col) + ) + elif cardinality == "m:1": + result_lf = ( + lf.group_by(by) + .agg(nw.col(right).n_unique().__eq__(nw.len()).alias(result_col)) + .select(*by, result_col) + ) + else: + raise ValueError( + f"Invalid cardinality `{cardinality}`, must be one of `{get_args(CardinalityRatio)}`" + ) + + return ( + original_lf.select(index_col, *by) + .join(result_lf, on=by, how="left") + .sort(index_col) # joins are not guaranteed to preserve order + .select(result_col) + .collect()[result_col] + ) + + CheckInputType = Optional[Literal["auto", "Frame", "str", "Series"]] CheckReturnType = Literal["auto", "bool", "Expr", "Series"] @@ -1401,3 +1465,27 @@ class MySchema(cf.Schema): name="is_id", description=f"{subset} must uniquely identify the DataFrame", ) + + @staticmethod + def cardinality_ratio( + left: str, + right: str, + cardinality: CardinalityRatio, + by: str | list[str] | None = None, + allow_duplicates: bool = False, + ) -> Check: + return Check( + 
func=functools.partial( + _cardinality_ratio, + left=left, + right=right, + cardinality=cardinality, + by=by, + allow_duplicates=allow_duplicates, + ), + input_type="Frame", + return_type="Series", + native=False, + name="cardinality_ratio", + description=f"The relationship between {left} and {right} must be {cardinality} (by={by}, allow_duplicates={allow_duplicates})", + ) diff --git a/tests/test_checks.py b/tests/test_checks.py index 535c4d5..96fcc70 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -4,6 +4,7 @@ import pytest import checkedframe as cf +from checkedframe.exceptions import SchemaError ENGINES = [pd.DataFrame, pl.DataFrame] @@ -338,3 +339,40 @@ class S(cf.Schema): _c = cf.Check.is_id(["a", "b"]) S.validate(df) + + +@pytest.mark.parametrize("engine", ENGINES) +def test_cardinality_ratio(engine): + df = engine( + { + "feature": ["f1", "f1", "f1", "f2"], + "special_value": [-1, -6, -4, -7], + "imputed": [None, None, "MAX_WIN_P1", None], + "reason": ["o1", "o1", "o2", "o3"], + } + ) + + class S(cf.Schema): + _c = cf.Check.cardinality_ratio("imputed", "reason", "1:1", by="feature") + + with pytest.raises(SchemaError): + S.validate(df) + + class S(cf.Schema): + _c = cf.Check.cardinality_ratio("imputed", "reason", "1:m", by="feature") + + with pytest.raises(SchemaError): + S.validate(df) + + class S(cf.Schema): + _c = cf.Check.cardinality_ratio("imputed", "reason", "m:1", by="feature") + + with pytest.raises(SchemaError): + S.validate(df) + + class S(cf.Schema): + _c = cf.Check.cardinality_ratio( + "imputed", "reason", "m:1", by="feature", allow_duplicates=True + ) + + S.validate(df)