added tests for l-p relaxations
Co-authored-by: Kajetan Schweighofer <[email protected]>
AndreFCruz and kschweig committed Apr 24, 2024
1 parent fd1c707 commit f3c4c9f
Showing 5 changed files with 118 additions and 41 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -114,12 +114,13 @@ can be chosen by altering the `l_p_norm` parameter.

A few useful values:
- `l_p_norm="inf"` **[default]** evaluates equalized-odds as the maximum
between group-wise TPR and FPR differences (as shown above).
between group-wise TPR and FPR differences (as shown above);
- `l_p_norm=1` evaluates equalized-odds as the average of the
absolute difference in group-wise TPR and FPR.
- this is also known as `average_abs_odds_difference`.
absolute difference in group-wise TPR and FPR;
- this is also known as `average_abs_odds_difference`;
- specifically, the l-1 distance is *twice* the "average absolute odds" metric, so change `tolerance` accordingly;
- `l_p_norm=p` for any other positive integer $p$: computes the distance between group-wise ROC
points using the specified l-p norm.
points using the specified l-p norm;
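
As a quick illustration (not part of this commit), here is a minimal usage sketch; it assumes the `RelaxedThresholdOptimizer` constructor arguments and `fit`/call signatures exercised by the tests below, and the toy data and score function are made up:

```python
import numpy as np
from error_parity import RelaxedThresholdOptimizer

# Made-up toy data: scores correlated with labels, two sensitive groups.
rng = np.random.default_rng(42)
n = 1_000
group = rng.integers(0, 2, size=n)
y = rng.binomial(1, 0.4, size=n)
X = (y + rng.normal(scale=0.8, size=n)).reshape(-1, 1)
predictor = lambda X: 1.0 / (1.0 + np.exp(-X[:, 0]))  # any callable returning scores in [0, 1]

clf = RelaxedThresholdOptimizer(
    predictor=predictor,
    constraint="equalized_odds",
    tolerance=0.05,
    l_p_norm=np.inf,  # default; l_p_norm=1 constrains the l-1 (average-abs-odds style) distance
)
clf.fit(X=X, y=y, group=group)
y_pred = clf(X, group=group)  # post-processed binary predictions
```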

The actual equalized odds constraint implemented is:

4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -26,7 +26,7 @@ def rng(random_seed: int) -> np.random.Generator:
return np.random.default_rng(random_seed)


@pytest.fixture(params=[0.01, 0.02, 0.05, 0.1, 0.2, 1.0])
@pytest.fixture(params=[0, 0.01, 0.02, 0.05, 0.1, 0.2, 1.0])
def constraint_slack(request) -> float:
"""Fixture for constraint slack/violation (fairness tolerance)."""
return request.param
@@ -135,7 +135,7 @@ def y_true(
n_samples = len(sensitive_attribute)

# Different levels of gaussian noise per group
group_noise = [0.2 + rng.random() / 2 for _ in range(n_groups)]
group_noise = [0.1 + rng.random() * 0.5 for _ in range(n_groups)]

# Generate predictions
label_prevalence = 0.2 + (rng.random() * 0.6) # in [0.2, 0.8]
48 changes: 24 additions & 24 deletions tests/test_constraints.py
@@ -12,6 +12,8 @@
from error_parity.roc_utils import compute_roc_point_from_predictions
from error_parity.evaluation import evaluate_fairness

from .utils import check_metric_tolerance


def test_synthetic_data_generation(
y_true: np.ndarray,
@@ -34,26 +36,6 @@ def test_synthetic_data_generation(
f"Synthetic data generated has group-{g} AUC of {group_auc}"


def get_metric_abs_tolerance(group_size: int) -> float:
"""Reasonable value for metric fulfillment given the inherent randomization
of predictions and the size of the group over which the metric is computed.
"""
return (0.5 * group_size) ** (-1 / 2)
# return group_size ** (-1/2)


def check_metric_tolerance(
theory_val: float, empirical_val, group_size: int, metric_name: str = ""
) -> bool:
"""Checks that the empirical value is within a reasonable tolerance of the expected theoretical value."""
assert np.isclose(
theory_val,
empirical_val,
atol=get_metric_abs_tolerance(group_size),
rtol=0.01,
), f"> '{metric_name}' mismatch; expected {theory_val:.3}; got {empirical_val:.3};"


def test_invalid_constraint_name():
with pytest.raises(ValueError):
_ = RelaxedThresholdOptimizer(
@@ -62,7 +44,7 @@ def test_invalid_constraint_name():
)


def test_equalized_odds_lp_relaxation(
def test_equalized_odds_lp_relaxation_fulfillment(
X_features: np.ndarray,
y_true: np.ndarray,
sensitive_attribute: np.ndarray,
@@ -146,6 +128,23 @@ def check_constraint_fulfillment(
f"expected less than {postprocessed_clf.tolerance};"
)

# # NOTE: This test is disabled as it is too strict for some cases
# # Check that, for tight constraints, theoretical solution is near the constraint boundary
# # > i.e., if constraint tolerance is small, then the theoretical solution found should have
# # constraint violation very close to that tolerance, as lower violation would mean a
# # lower-cost solution could've been found.
# TIGHT_CONSTRAINT_THRESHOLD = 0.04
# UNDERSHOOT_TOLERANCE = 0.01
# if postprocessed_clf.tolerance <= TIGHT_CONSTRAINT_THRESHOLD:
# assert (
# postprocessed_clf.constraint_violation()
# >= postprocessed_clf.tolerance - UNDERSHOOT_TOLERANCE
# ), (
# f"Solution violates tight '{fairness_constraint}_l{postprocessed_clf.l_p_norm}' constraint; "
# f"got: {postprocessed_clf.constraint_violation()}; "
# f"expected near {postprocessed_clf.tolerance};"
# )

# Optimal binarized predictions
y_pred_binary = postprocessed_clf(X_features, group=sensitive_attribute)

@@ -224,11 +223,12 @@ def check_constraint_fulfillment(

# Assert realized constraint violation is close to theoretical solution found
check_metric_tolerance(
# NOTE: it's fine if actual violation is below slack (and not fine if above)
empirical_val=max(empirical_constraint_violation - postprocessed_clf.tolerance, 0),
theory_val=0.0,
empirical_val=empirical_constraint_violation,
# NOTE: we're more lenient for tolerances close to 0, as 0 is literally impossible to achieve
theory_val=max(postprocessed_clf.constraint_violation(), 0.01),
group_size=smallest_denominator,
metric_name=f"{fairness_constraint} violation above slack",
less_or_equal=True, # constraint violation below slack is acceptable
)

# Check realized global ROC point
55 changes: 44 additions & 11 deletions tests/test_evaluation.py
@@ -2,6 +2,10 @@

import logging
import pytest
import numpy as np

from error_parity import RelaxedThresholdOptimizer
from error_parity.cvxpy_utils import SOLUTION_TOLERANCE
from error_parity.evaluation import _safe_division


@@ -26,14 +30,43 @@ def test_valid_safe_division(caplog, rng):
assert "error" not in caplog.text.lower()


def test_equalized_odds_measure():
pass
# # Check realized constraint violation
# groupwise_differences = [
# np.linalg.norm(
# actual_group_roc_points[i] - actual_group_roc_points[j],
# ord=np.inf,
# )
# for i, j in product(unique_groups, unique_groups)
# if i < j
# ]
def test_equalized_odds_relaxation_costs(
X_features: np.ndarray,
y_true: np.ndarray,
sensitive_attribute: np.ndarray,
predictor: callable,
constraint_slack: float,
random_seed: int,
):
"""Tests whether l-p norms follow standard orders (lower p -> higher norm)."""

results = {}
sorted_p_norms = (1, 2, 3, 10, np.inf)
for norm in sorted_p_norms:
# Fit postprocessing to data
clf = RelaxedThresholdOptimizer(
predictor=predictor,
constraint="equalized_odds",
tolerance=constraint_slack,
false_pos_cost=1,
false_neg_cost=1,
seed=random_seed,
l_p_norm=norm,
)
clf.fit(X=X_features, y=y_true, group=sensitive_attribute)

# Store results
results[norm] = clf.cost()

# Check that l-p norms with lower p yield higher costs (tighter constraints, lower unfairness)
for idx in range(1, len(sorted_p_norms)):

lower_p_norm = sorted_p_norms[idx - 1]
higher_p_norm = sorted_p_norms[idx]

lower_p_cost = results[lower_p_norm]
higher_p_cost = results[higher_p_norm]

# Assert lower-p costs are higher (accuracy is lower)
assert lower_p_cost > higher_p_cost - SOLUTION_TOLERANCE, \
f"l-{lower_p_norm} cost: {lower_p_cost} < l-{higher_p_norm} cost: {higher_p_cost}"
43 changes: 43 additions & 0 deletions tests/utils.py
@@ -0,0 +1,43 @@
import numpy as np


def get_metric_abs_tolerance(group_size: int) -> float:
"""Reasonable value for metric fulfillment given the inherent randomization
of predictions and the size of the group over which the metric is computed.
"""
return (0.1 * group_size) ** (-1 / 1.7) # tighter for larger groups, less tight for smaller groups
# return group_size ** (-1/2)


def check_metric_tolerance(
theory_val: float,
empirical_val: float,
group_size: int,
metric_name: str = "",
less_or_equal: bool = False,
) -> bool:
"""Checks that empirical value approximately matches theoretical value.
Parameters
----------
theory_val : float
The theoretical value to fulfill for the metrics.
empirical_val : float
The actual realized value for the metric.
group_size : int
The smallest group size over which the metric is evaluated.
metric_name : str, optional
The metric's name, by default "". This is used for debugging purposes.
less_or_equal : bool, optional
Whether a lower empirical value compared to theory is fine, by default
False.
"""
if less_or_equal and empirical_val <= theory_val:
return True

assert np.isclose(
theory_val,
empirical_val,
atol=get_metric_abs_tolerance(group_size),
rtol=0.01,
), f"> '{metric_name}' mismatch; expected {theory_val:.3}; got {empirical_val:.3};"
