From acae2da6faf82fc897242c1bc6a719c69dd8512a Mon Sep 17 00:00:00 2001 From: ksneab7 <91956551+ksneab7@users.noreply.github.com> Date: Thu, 21 Sep 2023 13:05:02 -0400 Subject: [PATCH] Staging/main/0.10.4 (#1029) * modified the assignees for issue creation (#1016) * Minor: Profiler Path Fix in Example Notebook (#1021) * Bump actions/checkout from 3 to 4 (#1024) Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Taylor Turner * Make sure random_state is a list before indexed assignment (#968) * Make sure random_state is a list before indexed assignment Currently, a mypy error occurs because we attempt to assign to random_state[1] when random_state has type Union[list[Any], tuple[Any]]. Tuples are immutable so this is a type error. We fix this by making random_state into a list before doing indexed assignment on it. 
* Add type guards for random_state * Check random_state before random_state[1] Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com> * Reorder conditions for consistency Co-authored-by: Taylor Turner --------- Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com> Co-authored-by: Taylor Turner * added psi calculation to categorical columns (#1027) * added psi calculation to categorical columns * Changed test value to non-calculated assignment --------- Signed-off-by: dependabot[bot] Co-authored-by: Navid Nafiuzzaman Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Taylor Turner Co-authored-by: Junho Lee <53921230+junholee6a@users.noreply.github.com> Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- .github/ISSUE_TEMPLATE/documentation_issue.md | 2 +- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- .github/ISSUE_TEMPLATE/open_issue.md | 2 +- .github/workflows/publish-python-package.yml | 2 +- .github/workflows/test-python-package.yml | 2 +- dataprofiler/labelers/data_processing.py | 8 ++-- .../profilers/categorical_column_profile.py | 16 ++++++++ .../test_categorical_column_profile.py | 41 ++++++++++++++++++- examples/merge_profile_list.ipynb | 4 +- 10 files changed, 67 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 6b5bf0bb7..679c0e407 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -3,7 +3,7 @@ name: Bug report about: Create a report to help us improve title: '' labels: Bug -assignees: JGSweets, ksneab7, micdavis, taylorfturner +assignees: ksneab7, micdavis, taylorfturner, tyfarnan --- diff --git a/.github/ISSUE_TEMPLATE/documentation_issue.md b/.github/ISSUE_TEMPLATE/documentation_issue.md index b0802a943..367516ec9 100644 --- a/.github/ISSUE_TEMPLATE/documentation_issue.md 
+++ b/.github/ISSUE_TEMPLATE/documentation_issue.md @@ -3,7 +3,7 @@ name: Documentation Issue about: Is there an issue with the documentation? title: '' labels: Documentation -assignees: JGSweets, ksneab7, micdavis, taylorfturner +assignees: ksneab7, micdavis, taylorfturner, tyfarnan --- diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 33697919d..a1650b194 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -3,7 +3,7 @@ name: Feature request about: Suggest an idea for this project title: '' labels: New Feature -assignees: JGSweets, ksneab7, micdavis, taylorfturner +assignees: ksneab7, micdavis, taylorfturner, tyfarnan --- diff --git a/.github/ISSUE_TEMPLATE/open_issue.md b/.github/ISSUE_TEMPLATE/open_issue.md index 126e83d86..dfe5b5f3b 100644 --- a/.github/ISSUE_TEMPLATE/open_issue.md +++ b/.github/ISSUE_TEMPLATE/open_issue.md @@ -3,6 +3,6 @@ name: Open Issue about: Open an issue other than a bug, feature, or documentation issue title: '' labels: '' -assignees: JGSweets, ksneab7, micdavis, taylorfturner +assignees: ksneab7, micdavis, taylorfturner, tyfarnan --- diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml index 3c593d4ed..0120180fd 100644 --- a/.github/workflows/publish-python-package.yml +++ b/.github/workflows/publish-python-package.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml index 41f7036e9..21130b471 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-python-package.yml @@ -19,7 +19,7 @@ jobs: python-version: [3.8, 3.9, "3.10"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ 
matrix.python-version }} uses: actions/setup-python@v4 with: diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index be1a3fee4..2213fd72d 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -1555,8 +1555,8 @@ def __init__( random_state = random.Random(random_state) elif isinstance(random_state, (list, tuple)) and len(random_state) == 3: # tuple required for random state to be set, lists do not work - if isinstance(random_state[1], list): - random_state[1] = tuple(random_state[1]) # type: ignore + if isinstance(random_state, list) and isinstance(random_state[1], list): + random_state[1] = tuple(random_state[1]) if isinstance(random_state, list): random_state = tuple(random_state) temp_random_state = random.Random() @@ -1894,8 +1894,8 @@ def __init__( random_state = random.Random(random_state) elif isinstance(random_state, (list, tuple)) and len(random_state) == 3: # tuple required for random state to be set, lists do not work - if isinstance(random_state[1], list): - random_state[1] = tuple(random_state[1]) # type: ignore + if isinstance(random_state, list) and isinstance(random_state[1], list): + random_state[1] = tuple(random_state[1]) if isinstance(random_state, list): random_state = tuple(random_state) temp_random_state = random.Random() diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 1376cc38e..c85b195a1 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -1,6 +1,8 @@ """Contains class for categorical column profiler.""" from __future__ import annotations +import math +import warnings from collections import defaultdict from operator import itemgetter from typing import cast @@ -304,6 +306,20 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: other_profile._categories.items(), 
key=itemgetter(1), reverse=True ) ) + if cat_count1.keys() == cat_count2.keys(): + total_psi = 0.0 + for key in cat_count1.keys(): + perc_A = cat_count1[key] / self.sample_size + perc_B = cat_count2[key] / other_profile.sample_size + total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A) + differences["statistics"]["psi"] = total_psi + else: + warnings.warn( + "psi was not calculated due to the differences in categories " + "of the profiles. Differences:\n" + f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}", + RuntimeWarning, + ) differences["statistics"][ "categorical_count" diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 10be10c58..5bdbbb83c 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -728,8 +728,13 @@ def test_categorical_diff(self): }, }, } - - self.assertDictEqual(expected_diff, profile.diff(profile2)) + with self.assertWarnsRegex( + RuntimeWarning, + "psi was not calculated due to the differences in categories " + "of the profiles. 
Differences:\n{'maybe'}", + ): + test_profile_diff = profile.diff(profile2) + self.assertDictEqual(expected_diff, test_profile_diff) # Test with one categorical column matching df_not_categorical = pd.Series( @@ -756,6 +761,38 @@ def test_categorical_diff(self): } self.assertDictEqual(expected_diff, profile.diff(profile2)) + # Test diff with psi enabled + df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"]) + profile = CategoricalColumn(df_categorical.name) + profile.update(df_categorical) + + df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"]) + profile2 = CategoricalColumn(df_categorical.name) + profile2.update(df_categorical) + + # chi2-statistic = sum((observed-expected)^2/expected for each category in each column) + # df = categories - 1 + # psi = sum((% of records in sample A - % of records in sample B) * ln(A / B) for each category) + # p-value found through using chi2 CDF + expected_diff = { + "categorical": "unchanged", + "statistics": { + "unique_count": "unchanged", + "unique_ratio": -0.05357142857142855, + "chi2-test": { + "chi2-statistic": 0.6122448979591839, + "df": 2, + "p-value": 0.7362964551863367, + }, + "categories": "unchanged", + "gini_impurity": -0.059311224489795866, + "unalikeability": -0.08333333333333326, + "psi": 0.16814961527477595, + "categorical_count": {"y": 1, "n": 1, "maybe": -1}, + }, + } + self.assertDictEqual(expected_diff, profile.diff(profile2)) + def test_unalikeability(self): df_categorical = pd.Series(["a", "a"]) profile = CategoricalColumn(df_categorical.name) diff --git a/examples/merge_profile_list.ipynb b/examples/merge_profile_list.ipynb index 4654caf8f..7a6d8005a 100644 --- a/examples/merge_profile_list.ipynb +++ b/examples/merge_profile_list.ipynb @@ -37,10 +37,10 @@ "try:\n", " sys.path.insert(0, '..')\n", " import dataprofiler as dp\n", - " from dataprofiler.profilers.utils import merge_profile_list\n", + " from dataprofiler.profilers.profiler_utils import merge_profile_list\n", "except
ImportError:\n", " import dataprofiler as dp\n", - " from dataprofiler.profilers.utils import merge_profile_list\n", + " from dataprofiler.profilers.profiler_utils import merge_profile_list\n", "\n", "# remove extra tf loggin\n", "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)"