From 86aa39a5a77265865d45494afcd33be19d08205d Mon Sep 17 00:00:00 2001 From: ksneab7 Date: Wed, 20 Sep 2023 08:53:59 -0400 Subject: [PATCH] Added warning to psi calculation for if it is not calculated --- .../profilers/categorical_column_profile.py | 14 +++++++++++--- .../profilers/test_categorical_column_profile.py | 11 +++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 3a813b26..e135a45b 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -2,6 +2,7 @@ from __future__ import annotations import math +import warnings from collections import defaultdict from operator import itemgetter from typing import cast @@ -305,14 +306,21 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: other_profile._categories.items(), key=itemgetter(1), reverse=True ) ) + total_psi = 0.0 if cat_count1.keys() == cat_count2.keys(): - total_psi = 0.0 for key in cat_count1.keys(): perc_A = cat_count1[key] / self.sample_size perc_B = cat_count2[key] / other_profile.sample_size total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A) - - differences["statistics"]["psi"] = total_psi + else: + warnings.warn( + "psi was not calculated due to the differences in categories " + "of the profiles. Differences:\n" + f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}\n" + "defaulting psi value to 0...", + RuntimeWarning, + ) + differences["statistics"]["psi"] = total_psi differences["statistics"][ "categorical_count" ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 37b85f4f..8c6e1d5d 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -726,10 +726,17 @@ def test_categorical_diff(self): "df": 2, "p-value": 0.3099238764710244, }, + "psi": 0, }, } - - self.assertDictEqual(expected_diff, profile.diff(profile2)) + with self.assertWarnsRegex( + RuntimeWarning, + "psi was not calculated due to the differences in categories " + "of the profiles. Differences:\n{'maybe'}\n" + "defaulting psi value to 0...", + ): + test_profile_diff = profile.diff(profile2) + self.assertDictEqual(expected_diff, test_profile_diff) # Test with one categorical column matching df_not_categorical = pd.Series(