capitalone · taylorfturner · Sep 20, 2023 · Sep 18, 2023 · Sep 18, 2023 · Sep 20, 2023
@@ -1,6 +1,7 @@
 """Contains class for categorical column profiler."""
 from __future__ import annotations
 
+import math
 from collections import defaultdict
 from operator import itemgetter
 from typing import cast
@@ -304,7 +305,14 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
                     other_profile._categories.items(), key=itemgetter(1), reverse=True
                 )
             )
-
+            if cat_count1.keys() == cat_count2.keys():
+                total_psi = 0.0
+                for key in cat_count1.keys():
+                    perc_A = cat_count1[key] / self.sample_size
+                    perc_B = cat_count2[key] / other_profile.sample_size
+                    total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
+
+                differences["statistics"]["psi"] = total_psi
             differences["statistics"][
                 "categorical_count"
             ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2)

@@ -1,4 +1,5 @@
 import json
+import math
 import os
 import unittest
 from collections import defaultdict
@@ -756,6 +757,44 @@ def test_categorical_diff(self):
         }
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
+        # Test that diff reports psi when both profiles share the same categories
+        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
+        profile = CategoricalColumn(df_categorical.name)
+        profile.update(df_categorical)
+
+        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        profile2 = CategoricalColumn(df_categorical.name)
+        profile2.update(df_categorical)
+
+        # Calculate expected_psi
+        expected_psi = 0
+        bin_perc = [4 / 8, 3 / 8, 1 / 8]
+        bin_perc_2 = [3 / 7, 2 / 7, 2 / 7]
+        for perc_A, perc_B in zip(bin_perc, bin_perc_2):
+            expected_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
+
+        # chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
+        # df = categories - 1
+        # p-value found through using chi2 CDF
+        expected_diff = {
+            "categorical": "unchanged",
+            "statistics": {
+                "unique_count": "unchanged",
+                "unique_ratio": -0.05357142857142855,
+                "chi2-test": {
+                    "chi2-statistic": 0.6122448979591839,
+                    "df": 2,
+                    "p-value": 0.7362964551863367,
+                },
+                "categories": "unchanged",
+                "gini_impurity": -0.059311224489795866,
+                "unalikeability": -0.08333333333333326,
+                "psi": expected_psi,
+                "categorical_count": {"y": 1, "n": 1, "maybe": -1},
+            },
+        }
+        self.assertDictEqual(expected_diff, profile.diff(profile2))
+
     def test_unalikeability(self):
         df_categorical = pd.Series(["a", "a"])
         profile = CategoricalColumn(df_categorical.name)