From acae2da6faf82fc897242c1bc6a719c69dd8512a Mon Sep 17 00:00:00 2001
From: ksneab7 <91956551+ksneab7@users.noreply.github.com>
Date: Thu, 21 Sep 2023 13:05:02 -0400
Subject: [PATCH] Staging/main/0.10.4 (#1029)

* modified the assignees for issue creation (#1016)

* Minor: Profiler Path Fix in Example Notebook (#1021)

* Bump actions/checkout from 3 to 4 (#1024)

Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Taylor Turner <taylorfturner@gmail.com>

* Make sure random_state is a list before indexed assignment (#968)

* Make sure random_state is a list before indexed assignment

Currently, a mypy error occurs because we attempt to assign to
random_state[1] when random_state has type
Union[list[Any], tuple[Any]]. Tuples are immutable so this is a type
error.

We fix this by making random_state into a list before doing indexed
assignment on it.

* Add type guards for random_state

* Check random_state before random_state[1]

Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com>

* Reorder conditions for consistency

Co-authored-by: Taylor Turner <taylorfturner@gmail.com>

---------

Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com>
Co-authored-by: Taylor Turner <taylorfturner@gmail.com>

* added psi calculation to categorical columns (#1027)

* added psi calculation to categorical columns

* Changed test value to non-calculated assignment

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Navid Nafiuzzaman <mxn4459@rit.edu>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Taylor Turner <taylorfturner@gmail.com>
Co-authored-by: Junho Lee <53921230+junholee6a@users.noreply.github.com>
Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com>
---
 .github/ISSUE_TEMPLATE/bug_report.md          |  2 +-
 .github/ISSUE_TEMPLATE/documentation_issue.md |  2 +-
 .github/ISSUE_TEMPLATE/feature_request.md     |  2 +-
 .github/ISSUE_TEMPLATE/open_issue.md          |  2 +-
 .github/workflows/publish-python-package.yml  |  2 +-
 .github/workflows/test-python-package.yml     |  2 +-
 dataprofiler/labelers/data_processing.py      |  8 ++--
 .../profilers/categorical_column_profile.py   | 16 ++++++++
 .../test_categorical_column_profile.py        | 41 ++++++++++++++++++-
 examples/merge_profile_list.ipynb             |  4 +-
 10 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 6b5bf0bb7..679c0e407 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -3,7 +3,7 @@ name: Bug report
 about: Create a report to help us improve
 title: ''
 labels: Bug
-assignees: JGSweets, ksneab7, micdavis, taylorfturner
+assignees: ksneab7, micdavis, taylorfturner, tyfarnan
 
 ---
 
diff --git a/.github/ISSUE_TEMPLATE/documentation_issue.md b/.github/ISSUE_TEMPLATE/documentation_issue.md
index b0802a943..367516ec9 100644
--- a/.github/ISSUE_TEMPLATE/documentation_issue.md
+++ b/.github/ISSUE_TEMPLATE/documentation_issue.md
@@ -3,7 +3,7 @@ name: Documentation Issue
 about: Is there an issue with the documentation?
 title: ''
 labels: Documentation
-assignees: JGSweets, ksneab7, micdavis, taylorfturner
+assignees: ksneab7, micdavis, taylorfturner, tyfarnan
 
 ---
 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 33697919d..a1650b194 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -3,7 +3,7 @@ name: Feature request
 about: Suggest an idea for this project
 title: ''
 labels: New Feature
-assignees: JGSweets, ksneab7, micdavis, taylorfturner
+assignees: ksneab7, micdavis, taylorfturner, tyfarnan
 
 ---
 
diff --git a/.github/ISSUE_TEMPLATE/open_issue.md b/.github/ISSUE_TEMPLATE/open_issue.md
index 126e83d86..dfe5b5f3b 100644
--- a/.github/ISSUE_TEMPLATE/open_issue.md
+++ b/.github/ISSUE_TEMPLATE/open_issue.md
@@ -3,6 +3,6 @@ name: Open Issue
 about: Open an issue other than a bug, feature, or documentation issue
 title: ''
 labels: ''
-assignees: JGSweets, ksneab7, micdavis, taylorfturner
+assignees: ksneab7, micdavis, taylorfturner, tyfarnan
 
 ---
diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml
index 3c593d4ed..0120180fd 100644
--- a/.github/workflows/publish-python-package.yml
+++ b/.github/workflows/publish-python-package.yml
@@ -16,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml
index 41f7036e9..21130b471 100644
--- a/.github/workflows/test-python-package.yml
+++ b/.github/workflows/test-python-package.yml
@@ -19,7 +19,7 @@ jobs:
         python-version: [3.8, 3.9, "3.10"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v4
       with:
diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py
index be1a3fee4..2213fd72d 100644
--- a/dataprofiler/labelers/data_processing.py
+++ b/dataprofiler/labelers/data_processing.py
@@ -1555,8 +1555,8 @@ def __init__(
             random_state = random.Random(random_state)
         elif isinstance(random_state, (list, tuple)) and len(random_state) == 3:
             # tuple required for random state to be set, lists do not work
-            if isinstance(random_state[1], list):
-                random_state[1] = tuple(random_state[1])  # type: ignore
+            if isinstance(random_state, list) and isinstance(random_state[1], list):
+                random_state[1] = tuple(random_state[1])
             if isinstance(random_state, list):
                 random_state = tuple(random_state)
             temp_random_state = random.Random()
@@ -1894,8 +1894,8 @@ def __init__(
             random_state = random.Random(random_state)
         elif isinstance(random_state, (list, tuple)) and len(random_state) == 3:
             # tuple required for random state to be set, lists do not work
-            if isinstance(random_state[1], list):
-                random_state[1] = tuple(random_state[1])  # type: ignore
+            if isinstance(random_state, list) and isinstance(random_state[1], list):
+                random_state[1] = tuple(random_state[1])
             if isinstance(random_state, list):
                 random_state = tuple(random_state)
             temp_random_state = random.Random()
diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 1376cc38e..c85b195a1 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -1,6 +1,8 @@
 """Contains class for categorical column profiler."""
 from __future__ import annotations
 
+import math
+import warnings
 from collections import defaultdict
 from operator import itemgetter
 from typing import cast
@@ -304,6 +306,20 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
                     other_profile._categories.items(), key=itemgetter(1), reverse=True
                 )
             )
+            if cat_count1.keys() == cat_count2.keys():
+                total_psi = 0.0
+                for key in cat_count1.keys():
+                    perc_A = cat_count1[key] / self.sample_size
+                    perc_B = cat_count2[key] / other_profile.sample_size
+                    total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
+                    differences["statistics"]["psi"] = total_psi
+            else:
+                warnings.warn(
+                    "psi was not calculated due to the differences in categories "
+                    "of the profiles. Differences:\n"
+                    f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}",
+                    RuntimeWarning,
+                )
 
             differences["statistics"][
                 "categorical_count"
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
index 10be10c58..5bdbbb83c 100644
--- a/dataprofiler/tests/profilers/test_categorical_column_profile.py
+++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -728,8 +728,13 @@ def test_categorical_diff(self):
                 },
             },
         }
-
-        self.assertDictEqual(expected_diff, profile.diff(profile2))
+        with self.assertWarnsRegex(
+            RuntimeWarning,
+            "psi was not calculated due to the differences in categories "
+            "of the profiles. Differences:\n{'maybe'}",
+        ):
+            test_profile_diff = profile.diff(profile2)
+        self.assertDictEqual(expected_diff, test_profile_diff)
 
         # Test with one categorical column matching
         df_not_categorical = pd.Series(
@@ -756,6 +761,38 @@ def test_categorical_diff(self):
         }
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
+        # Test diff with psi enabled
+        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
+        profile = CategoricalColumn(df_categorical.name)
+        profile.update(df_categorical)
+
+        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        profile2 = CategoricalColumn(df_categorical.name)
+        profile2.update(df_categorical)
+
+        # chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
+        # df = categories - 1
+        # psi = sum over categories of (% in Sample A - % in Sample B) * ln(%A / %B)
+        # p-value found through using chi2 CDF
+        expected_diff = {
+            "categorical": "unchanged",
+            "statistics": {
+                "unique_count": "unchanged",
+                "unique_ratio": -0.05357142857142855,
+                "chi2-test": {
+                    "chi2-statistic": 0.6122448979591839,
+                    "df": 2,
+                    "p-value": 0.7362964551863367,
+                },
+                "categories": "unchanged",
+                "gini_impurity": -0.059311224489795866,
+                "unalikeability": -0.08333333333333326,
+                "psi": 0.16814961527477595,
+                "categorical_count": {"y": 1, "n": 1, "maybe": -1},
+            },
+        }
+        self.assertDictEqual(expected_diff, profile.diff(profile2))
+
     def test_unalikeability(self):
         df_categorical = pd.Series(["a", "a"])
         profile = CategoricalColumn(df_categorical.name)
diff --git a/examples/merge_profile_list.ipynb b/examples/merge_profile_list.ipynb
index 4654caf8f..7a6d8005a 100644
--- a/examples/merge_profile_list.ipynb
+++ b/examples/merge_profile_list.ipynb
@@ -37,10 +37,10 @@
     "try:\n",
     "    sys.path.insert(0, '..')\n",
     "    import dataprofiler as dp\n",
-    "    from dataprofiler.profilers.utils import merge_profile_list\n",
+    "    from dataprofiler.profilers.profiler_utils import merge_profile_list\n",
     "except ImportError:\n",
     "    import dataprofiler as dp\n",
-    "    from dataprofiler.profilers.utils import merge_profile_list\n",
+    "    from dataprofiler.profilers.profiler_utils import merge_profile_list\n",
     "\n",
     "# remove extra tf loggin\n",
     "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)"