From 671017cb0757909387c0a915ec541389b813ccc9 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 27 Feb 2024 15:50:36 -0600
Subject: [PATCH 01/10] Add polars to datetime_column_profile

---
 .../profilers/datetime_column_profile.py      | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py
index af99283a9..7b2b57988 100644
--- a/dataprofiler/profilers/datetime_column_profile.py
+++ b/dataprofiler/profilers/datetime_column_profile.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
@@ -256,8 +257,7 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
         profile: dict = dict()
         activated_date_formats: list = list()
         len_df = len(df_series)
-
-        is_row_datetime = pd.Series(np.full((len(df_series)), False))
+        is_row_datetime = pd.Series(np.full((len_df), False))
 
         min_value = None
         max_value = None
@@ -275,18 +275,19 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
             )
         )
 
-        df_dates = valid_dates[~valid_dates.isnull()]
+        df_dates = pl.Series(valid_dates[~valid_dates.isnull()])
 
-        if "%b" in date_format and not df_dates.empty:
+        if "%b" in date_format and not df_dates.is_empty():
             may_month = 5  # May can be %b or %B we want to force, so check
-            all_may = df_dates.apply(lambda x: x.month == may_month).all()
+            all_may = df_dates.map_elements(lambda x: x.month == may_month)
+            all_may = pl.Series(all_may).all()
             if all_may:
-                valid_dates[:] = np.nan
-                df_dates = pd.Series([], dtype=object)
+                valid_dates[:] = None
+                df_dates = pl.Series([])
 
         # Create mask to avoid null dates
         null_date_mask = valid_dates.isnull()
-        np_date_array = df_dates.values
+        np_date_array = df_dates.to_numpy()
 
         # check off any values which were found to be datetime
         is_row_datetime[~is_row_datetime] = (~null_date_mask).values
@@ -298,18 +299,18 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
             max_idx = np.argmax(np_date_array)
 
             # Selects the min, max value objects for comparison
-            tmp_min_value_obj = df_dates.iloc[min_idx]
-            tmp_max_value_obj = df_dates.iloc[max_idx]
+            tmp_min_value_obj = df_dates.item(int(min_idx))
+            tmp_max_value_obj = df_dates.item(int(max_idx))
 
             # If minimum value, keep reference
             if tmp_min_value_obj < min_value_obj:
                 min_value = df_series[~null_date_mask].iloc[min_idx]
-                min_value_obj = tmp_min_value_obj
+                min_value_obj = pd.Timestamp(tmp_min_value_obj)
 
             # If maximum value, keep reference
             if tmp_max_value_obj > max_value_obj:
                 max_value = df_series[~null_date_mask].iloc[max_idx]
-                max_value_obj = tmp_max_value_obj
+                max_value_obj = pd.Timestamp(tmp_max_value_obj)
 
         df_series = df_series[null_date_mask]

From 280c7cc399d373d5c99e1513763e8dfbe5c5418d Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 5 Mar 2024 15:56:04 -0600
Subject: [PATCH 02/10] Polars added to unstructured labeler

---
 .../profilers/unstructured_labeler_profile.py | 19 +++++++++-----
 .../test_unstructured_labeler_profile.py      | 25 ++++++++++---------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 1c7b16c0f..2a021fe81 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -3,6 +3,7 @@
 
 from collections import defaultdict
 
+import polars as pl
 from pandas import Series
 
 from ..labelers.base_data_labeler import BaseDataLabeler
@@ -102,7 +103,7 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi
 
         return merged_profile
 
-    def report(self, remove_disabled_flag: bool = False) -> dict:
+    def report(self) -> dict:
         """
         Return profile object.
@@ -176,6 +177,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
             df_series_clean, predictions.copy(), self.data_labeler.label_mapping
         )
 
+        df_series_clean = pl.Series(df_series_clean)
         # Update counts and percent values
         self._update_word_label_counts(df_series_clean, format_predictions["pred"])
         self._update_true_char_label_counts(predictions["pred"])
@@ -188,7 +190,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
         # CHARACTERS/WORDS PROCESSED
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series) -> None:
+    def update(self, df_series: Series | pl.Series) -> None:
         """Update profile."""
         if len(df_series) == 0:
             return
@@ -196,6 +198,9 @@ def update(self, df_series: Series) -> None:
             char_sample_size=self.char_sample_size,
             word_sample_size=self.word_sample_size,
         )
+
+        if type(df_series) is pl.Series:
+            df_series = df_series.to_pandas()
         self._update_helper(df_series, profile)
 
     @property
@@ -278,7 +283,7 @@ def _update_true_char_label_counts(self, predictions: list) -> None:
             self.char_sample_size += len(sample)
 
     def _update_postprocess_char_label_counts(
-        self, df_series_clean: Series, format_predictions: dict
+        self, df_series_clean: Series | pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the postprocess character label counts.
@@ -292,7 +297,8 @@ def _update_postprocess_char_label_counts(
         """
         char_label_counts = self.entity_counts["postprocess_char_level"]
 
-        for index, result in enumerate(zip(df_series_clean, format_predictions)):
+        df_series_clean = pl.Series(df_series_clean)
+        for result in zip(df_series_clean, format_predictions):
             text, entities = result
             index = 0
             for entity in entities:
@@ -308,7 +314,7 @@ def _update_postprocess_char_label_counts(
                 char_label_counts["UNKNOWN"] += len(text) - index
 
     def _update_word_label_counts(
-        self, df_series_clean: Series, format_predictions: dict
+        self, df_series_clean: Series | pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the sorted dictionary of each entity count.
@@ -321,7 +327,8 @@ def _update_word_label_counts(
         """
         word_label_counts = self.entity_counts["word_level"]
 
-        for index, result in enumerate(zip(df_series_clean, format_predictions)):
+        df_series_clean = pl.Series(df_series_clean)
+        for result in zip(df_series_clean, format_predictions):
             text, entities = result
             begin_word_idx = -1
             index = 0
diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
index c7aa8b0c5..ce93c516e 100644
--- a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
@@ -3,6 +3,7 @@
 from unittest import mock
 
 import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers import profiler_utils
 from dataprofiler.profilers.unstructured_labeler_profile import (
@@ -15,7 +16,7 @@ def test_char_level_counts(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["abc123", "Bob", "!@##$%"])
+        sample = pl.Series(["abc123", "Bob", "!@##$%"])
 
         # running update
         default.update(sample)
@@ -34,7 +35,7 @@ def test_advanced_sample(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
@@ -56,7 +57,7 @@ def test_word_level_NER_label_counts(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000049939232194912."
@@ -78,7 +79,7 @@ def test_statistics(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
@@ -123,7 +124,7 @@ def test_profile(self, processor_class_mock, model_class_mock):
         # initialize labeler profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["a"])
+        sample = pl.Series(["a"])
         expected_profile = dict(
             entity_counts={
                 "postprocess_char_level": defaultdict(int, {"UNKNOWN": 1}),
@@ -163,15 +164,15 @@ def test_report(self, processor_class_mock, model_class_mock):
         # initialize labeler profile
         profile = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["a"])
+        sample = pl.Series(["a"])
 
         time_array = [float(i) for i in range(4, 0, -1)]
         with mock.patch("time.time", side_effect=lambda: time_array.pop()):
             profile.update(sample)
 
             report1 = profile.profile
-            report2 = profile.report(remove_disabled_flag=False)
-            report3 = profile.report(remove_disabled_flag=True)
+            report2 = profile.report()
+            report3 = profile.report()
 
             self.assertDictEqual(report1, report2)
             self.assertDictEqual(report1, report3)
@@ -192,7 +193,7 @@ def test_entity_percentages(self, mock1, mock2):
        profile.entity_counts["true_char_level"]["TEST"] = 16
        profile.entity_counts["word_level"]["UNKNOWN"] = 5
        profile.entity_counts["word_level"]["TEST"] = 5
-        profile.update(pd.Series(["a"]))
+        profile.update(pl.Series(["a"]))
 
        expected_percentages = {
            "postprocess_char_level": defaultdict(int, {"UNKNOWN": 0.3, "TEST": 0.7}),
@@ -275,7 +276,7 @@ def test_diff(self, mock1, mock2):
         profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["word_level"]["TEST"] = 5
         profiler1.entity_counts["word_level"]["UNIQUE1"] = 5
-        profiler1.update(pd.Series(["a"]))
+        profiler1.update(pl.Series(["a"]))
 
         profiler2 = UnstructuredLabelerProfile()
         profiler2.char_sample_size = 20
@@ -289,7 +290,7 @@ def test_diff(self, mock1, mock2):
         profiler2.entity_counts["word_level"]["UNKNOWN"] = 2
         profiler2.entity_counts["word_level"]["TEST"] = 4
         profiler2.entity_counts["word_level"]["UNIQUE2"] = 4
-        profiler2.update(pd.Series(["a"]))
+        profiler2.update(pl.Series(["a"]))
 
         expected_diff = {
             "entity_counts": {
@@ -342,7 +343,7 @@ def test_diff(self, mock1, mock2):
         profiler1.entity_counts["postprocess_char_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["true_char_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
-        profiler1.update(pd.Series(["a"]))
+        profiler1.update(pl.Series(["a"]))
 
         profiler2 = UnstructuredLabelerProfile()
         profile2 = profiler2.profile

From faa51e680e683146dc741a538aae15534b3d7ec4 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Wed, 27 Mar 2024 14:21:15 -0500
Subject: [PATCH 03/10] Quick fix for keras and tensorflow

---
 dataprofiler/profilers/unstructured_labeler_profile.py | 10 +++++-----
 .../profilers/test_unstructured_labeler_profile.py     |  5 ++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 2a021fe81..120a42114 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -103,7 +103,7 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi
 
         return merged_profile
 
-    def report(self) -> dict:
+    def report(self, remove_disabled_flag: bool = False) -> dict:
         """
         Return profile object.
@@ -156,7 +156,7 @@ def label_encoding(self) -> list[str]:
         return self.data_labeler.labels
 
     @BaseColumnProfiler._timeit(name="data_labeler_predict")
-    def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
+    def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known profile.
@@ -190,7 +190,7 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         # CHARACTERS/WORDS PROCESSED
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series | pl.Series) -> None:
+    def update(self, df_series: pl.Series) -> None:
         """Update profile."""
         if len(df_series) == 0:
             return
@@ -283,7 +283,7 @@ def _update_true_char_label_counts(self, predictions: list) -> None:
             self.char_sample_size += len(sample)
 
     def _update_postprocess_char_label_counts(
-        self, df_series_clean: Series | pl.Series, format_predictions: dict
+        self, df_series_clean: pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the postprocess character label counts.
@@ -314,7 +314,7 @@ def _update_postprocess_char_label_counts(
 
     def _update_word_label_counts(
-        self, df_series_clean: Series | pl.Series, format_predictions: dict
+        self, df_series_clean: pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the sorted dictionary of each entity count.
diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
index ce93c516e..0af957c2a 100644
--- a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
@@ -2,7 +2,6 @@
 from collections import defaultdict
 from unittest import mock
 
-import pandas as pd
 import polars as pl
 
 from dataprofiler.profilers import profiler_utils
@@ -171,8 +170,8 @@ def test_report(self, processor_class_mock, model_class_mock):
             profile.update(sample)
 
             report1 = profile.profile
-            report2 = profile.report()
-            report3 = profile.report()
+            report2 = profile.report(remove_disabled_flag=False)
+            report3 = profile.report(remove_disabled_flag=True)
 
             self.assertDictEqual(report1, report2)
             self.assertDictEqual(report1, report3)

From f6b197052f63ad3e1f716828874e30f268ca7067 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Thu, 28 Mar 2024 16:14:53 -0500
Subject: [PATCH 04/10] Add polars to unstructured text

---
 .../profilers/unstructured_labeler_profile.py | 19 ++---
 .../profilers/unstructured_text_profile.py    | 20 +++--
 .../test_unstructured_text_profile.py         | 78 +++++++++----------
 3 files changed, 59 insertions(+), 58 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 120a42114..74bd4cae2 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -4,7 +4,6 @@
 from collections import defaultdict
 
 import polars as pl
-from pandas import Series
 
 from ..labelers.base_data_labeler import BaseDataLabeler
 from ..labelers.data_labelers import DataLabeler
@@ -161,23 +160,25 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known profile.
 
         :param df_series_clean: df series with nulls removed
-        :type df_series_clean: pandas.core.series.Series
+        :type df_series_clean: polars.Series
         :param profile: profile dictionary
         :type profile: dict
         :return: None
         """
+
+        data_ndarray = df_series_clean.to_numpy()
+
         # this will get char_level predictions as output
-        predictions = self.data_labeler.predict(df_series_clean)
+        predictions = self.data_labeler.predict(data_ndarray)
 
         # also store spacy/NER format
         postprocessor = CharPostprocessor(
             use_word_level_argmax=True, output_format="NER"
         )
         format_predictions = postprocessor.process(
-            df_series_clean, predictions.copy(), self.data_labeler.label_mapping
+            data_ndarray, predictions.copy(), self.data_labeler.label_mapping
         )
 
-        df_series_clean = pl.Series(df_series_clean)
         # Update counts and percent values
         self._update_word_label_counts(df_series_clean, format_predictions["pred"])
         self._update_true_char_label_counts(predictions["pred"])
@@ -199,8 +200,6 @@ def update(self, df_series: pl.Series) -> None:
             word_sample_size=self.word_sample_size,
         )
 
-        if type(df_series) is pl.Series:
-            df_series = df_series.to_pandas()
         self._update_helper(df_series, profile)
 
     @property
@@ -289,7 +288,7 @@ def _update_postprocess_char_label_counts(
         Update the postprocess character label counts.
 
         :param df_series_clean: df series with nulls removed
-        :type df_series_clean: pandas.core.series.Series
+        :type df_series_clean: polars.Series
         :param format_predictions: contains dict of samples with predictions
             on the character level in congruence with the word level predictions
         :type format_predictions: Dict
         """
         char_label_counts = self.entity_counts["postprocess_char_level"]
 
-        df_series_clean = pl.Series(df_series_clean)
         for result in zip(df_series_clean, format_predictions):
             text, entities = result
             index = 0
@@ -320,14 +318,13 @@ def _update_word_label_counts(
         Update the sorted dictionary of each entity count.
 
         :param df_series_clean: df series with nulls removed
-        :type df_series_clean: pandas.core.series.Series
+        :type df_series_clean: polars.Series
         :param format_predictions: Dictionary of sample text and entities
         :type format_predictions: dict
         :return: None
         """
         word_label_counts = self.entity_counts["word_level"]
 
-        df_series_clean = pl.Series(df_series_clean)
         for result in zip(df_series_clean, format_predictions):
             text, entities = result
             begin_word_idx = -1
             index = 0
diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py
index 96b7d0625..214b10044 100644
--- a/dataprofiler/profilers/unstructured_text_profile.py
+++ b/dataprofiler/profilers/unstructured_text_profile.py
@@ -8,6 +8,7 @@
 
 from numpy import ndarray
 from pandas import DataFrame, Series
+import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
@@ -667,7 +668,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="vocab")
     def _update_vocab(
         self,
-        data: list | ndarray | DataFrame,
+        data: list | ndarray | DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -675,7 +676,7 @@ def _update_vocab(
         Find the vocabulary counts used in the text samples.
 
         :param data: list or array of data from which to extract vocab
-        :type data: Union[list, numpy.array, pandas.DataFrame]
+        :type data: Union[list, numpy.array, pandas.DataFrame, polars.DataFrame]
         :param prev_dependent_properties: Contains all the previous properties
             that the calculations depend on.
         :type prev_dependent_properties: dict
@@ -690,7 +691,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="words")
     def _update_words(
         self,
-        data: list | ndarray | DataFrame,
+        data: list | ndarray | DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -698,7 +699,7 @@ def _update_words(
         Find unique words and word count used in the text samples.
 
         :param data: list or array of data from which to extract vocab
-        :type data: Union[list, numpy.array, pandas.DataFrame]
+        :type data: Union[list, numpy.array, pandas.DataFrame, polars.DataFrame]
         :param prev_dependent_properties: Contains all the previous properties
             that the calculations depend on.
         :type prev_dependent_properties: dict
@@ -720,12 +721,12 @@ def _update_words(
             if w and w.lower() not in self._stop_words:
                 self.word_count.update({w: c})
 
-    def _update_helper(self, data: Series, profile: dict) -> None:
+    def _update_helper(self, data: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known null parameters.
 
         :param data: df series with nulls removed
-        :type data: pandas.core.series.Series
+        :type data: polars.Series
         :param profile: text profile dictionary
         :type profile: dict
         :return: None
@@ -733,12 +734,12 @@ def _update_helper(self, data: pl.Series, profile: dict) -> None:
         self.sample_size += profile.pop("sample_size")
         self.metadata = profile
 
-    def update(self, data: Series) -> TextProfiler:
+    def update(self, data: Series | pl.Series) -> TextProfiler:
         """
         Update the column profile.
 
         :param data: df series
-        :type data: pandas.core.series.Series
+        :type data: polars.Series
         :return: updated TextProfiler
         :rtype: TextProfiler
         """
@@ -748,6 +749,9 @@ def update(self, data: Series) -> TextProfiler:
 
         profile = dict(sample_size=len_data)
 
+        if type(data) is pl.Series:
+            data = data.to_pandas()
+
         BaseColumnProfiler._perform_property_calcs(
             self,  # type: ignore
             self.__calculations,
diff --git a/dataprofiler/tests/profilers/test_unstructured_text_profile.py b/dataprofiler/tests/profilers/test_unstructured_text_profile.py
index 39b1d1bb8..3ebab8294 100644
--- a/dataprofiler/tests/profilers/test_unstructured_text_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_text_profile.py
@@ -1,6 +1,6 @@
 import unittest
 
-import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers.profiler_options import TextProfilerOptions
 from dataprofiler.profilers.unstructured_text_profile import TextProfiler
@@ -9,7 +9,7 @@ class TestUnstructuredTextProfile(unittest.TestCase):
     def test_text_profile_update_and_name(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -17,7 +17,7 @@ def test_vocab(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -53,7 +53,7 @@ def test_vocab(self):
         self.assertListEqual(sorted(expected_vocab), sorted(profile["vocab"]))
 
         # Update the data again
-        sample = pd.Series(["Grant knows how to code", "Grant will code with Bob"])
+        sample = pl.Series(["Grant knows how to code", "Grant will code with Bob"])
         text_profile.update(sample)
 
         profile = text_profile.profile
@@ -92,7 +92,7 @@ def test_words_and_word_count(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -114,7 +114,7 @@ def test_words_and_word_count(self):
         self.assertDictEqual(expected_word_count, profile["word_count"])
 
         # Update the data again
-        sample = pd.Series(["Grant knows how to code", "Grant will code with Bob"])
+        sample = pl.Series(["Grant knows how to code", "Grant will code with Bob"])
         text_profile.update(sample)
 
         profile = text_profile.profile
@@ -137,7 +137,7 @@ def test_sample_size(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -146,7 +146,7 @@ def test_sample_size(self):
         self.assertEqual(2, text_profile.sample_size)
 
         # Update the data again
-        sample = pd.Series(["Grant knows how to code", "Grant will code with Bob"])
+        sample = pl.Series(["Grant knows how to code", "Grant will code with Bob"])
         text_profile.update(sample)
 
         # Assert sample size is accurate
@@ -154,7 +154,7 @@ def test_timing(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -166,11 +166,11 @@ def test_merge_profiles(self):
         text_profile1 = TextProfiler("Name")
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
 
         text_profile2 = TextProfiler("Name")
-        sample = pd.Series(["Bob and \"Grant\", 'are' friends"])
+        sample = pl.Series(["Bob and \"Grant\", 'are' friends"])
         text_profile2.update(sample)
 
         text_profile3 = text_profile1 + text_profile2
@@ -231,11 +231,11 @@ def test_diff_profiles(self):
         text_profile1 = TextProfiler("Name")
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
 
         text_profile2 = TextProfiler("Name")
-        sample = pd.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
+        sample = pl.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
         text_profile2.update(sample)
 
         expected_diff = {
@@ -271,13 +271,13 @@ def test_diff_profiles(self):
         # Test when one profiler is not case sensitive
         text_profile1 = TextProfiler("Name")
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
 
         options = TextProfilerOptions()
         options.is_case_sensitive = False
         text_profile2 = TextProfiler("Name", options=options)
-        sample = pd.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
+        sample = pl.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
         text_profile2.update(sample)
 
         expected_diff = {
@@ -314,14 +314,14 @@ def test_case_sensitivity(self):
         text_profile1 = TextProfiler("Name")
         text_profile1._is_case_sensitive = False
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
         profile = text_profile1.profile
         expected_word_count = {"grant": 1, "hello": 1, "name": 1}
         self.assertDictEqual(expected_word_count, profile["word_count"])
 
         text_profile2 = TextProfiler("Name")
-        sample = pd.Series(["Bob and \"Grant\", 'are' friends"])
+        sample = pl.Series(["Bob and \"Grant\", 'are' friends"])
         text_profile2.update(sample)
         profile = text_profile2.profile
         expected_word_count = {"Grant": 1, "Bob": 1, "friends": 1}
@@ -367,11 +367,11 @@ def test_case_sensitivity(self):
     def test_merge_most_common_chars_count(self):
         ### default values of most common chars for both profiles
         text_profile1 = TextProfiler("Name")
-        sample1 = pd.Series(["this is test,", " this is a test sentence"])
+        sample1 = pl.Series(["this is test,", " this is a test sentence"])
         text_profile1.update(sample1)
 
         text_profile2 = TextProfiler("Name")
-        sample2 = pd.Series(["this is", "this"])
+        sample2 = pl.Series(["this is", "this"])
         text_profile2.update(sample2)
 
         text_profile3 = text_profile1 + text_profile2
@@ -437,12 +437,12 @@ def test_merge_most_common_words_count(self):
         ### default values of most common words for both profiles
         text_profile1 = TextProfiler("Name")
         text_profile1._stop_words = set()  # set stop_words to empty for easy inspection
-        sample1 = pd.Series(["this is test,", " this is a test sentence"])
+        sample1 = pl.Series(["this is test,", " this is a test sentence"])
         text_profile1.update(sample1)
 
         text_profile2 = TextProfiler("Name")
         text_profile2._stop_words = set()  # set stop_words to empty for easy inspection
-        sample2 = pd.Series(["this is", "this"])
+        sample2 = pl.Series(["this is", "this"])
         text_profile2.update(sample2)
 
         text_profile3 = text_profile1 + text_profile2
@@ -494,7 +494,7 @@ def test_options_default(self):
         # input with one sample
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "Test": 1, "test": 1}
@@ -518,7 +518,7 @@ def test_options_default(self):
         # input with two samples
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test,", " a Test sentence.!!!"])
+        sample = pl.Series(["This is test,", " a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "Test": 1, "test": 1}
@@ -553,7 +553,7 @@ def test_report(self):
         options.words.is_enabled = False
 
         profiler = TextProfiler("Name", options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         profiler.update(sample)
 
         report = profiler.report(remove_disabled_flag=True)
@@ -584,7 +584,7 @@ def test_report(self):
         options.words.is_enabled = False
 
         profiler = TextProfiler("Name", options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         profiler.update(sample)
 
         report = profiler.report(remove_disabled_flag=True)
@@ -600,7 +600,7 @@ def test_options_case_sensitive(self):
         # input with one sample
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "test": 2}
@@ -624,7 +624,7 @@ def test_options_case_sensitive(self):
         # input with two samples
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test,", " a Test sentence.!!!"])
+        sample = pl.Series(["This is test,", " a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "test": 2}
@@ -655,7 +655,7 @@ def test_options_stop_words(self):
         ## input with one sample
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         text_profile.update(sample)
 
expected_word_count = {"This": 1, "Test": 1, "test": 1} @@ -679,7 +679,7 @@ def test_options_stop_words(self): ## input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {"This": 1, "Test": 1, "test": 1} @@ -707,7 +707,7 @@ def test_options_stop_words(self): ## input with one sample text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test, a Test sentence.!!!"]) + sample = pl.Series(["This is test, a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = { @@ -738,7 +738,7 @@ def test_options_stop_words(self): ## input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = { @@ -774,7 +774,7 @@ def test_options_vocab_update(self): # input with one sample text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test, a Test sentence.!!!"]) + sample = pl.Series(["This is test, a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {"sentence": 1, "Test": 1, "test": 1} @@ -784,7 +784,7 @@ def test_options_vocab_update(self): # input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {"sentence": 1, "Test": 1, "test": 1} @@ -799,7 +799,7 @@ def test_options_words_update(self): # input with one sample text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test, a Test sentence.!!!"]) + sample = pl.Series(["This is test, a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {} @@ -823,7 +823,7 @@ def test_options_words_update(self): # input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {} @@ -851,7 +851,7 @@ def test_options_most_common_chars_count(self): options.top_k_chars = None text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) text_profile.update(sample) @@ -875,7 +875,7 @@ def test_options_most_common_chars_count(self): options.top_k_chars = 3 text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) text_profile.update(sample) @@ -920,7 +920,7 @@ def test_options_most_common_words_count(self): options.stop_words = [] # set stop_words to empty list for easy inspection text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) text_profile.update(sample) @@ -934,7 +934,7 @@ def test_options_most_common_words_count(self): options.stop_words = [] # set stop_words to empty list for easy inspection text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) 
         text_profile.update(sample)

From 73feade6ea457e636b891604c4adb26d7736a283 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Fri, 29 Mar 2024 12:27:12 -0500
Subject: [PATCH 05/10] Correct polars usage

---
 .../profilers/unstructured_labeler_profile.py |  1 -
 .../profilers/unstructured_text_profile.py    | 39 ++++++++++++++-----
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 74bd4cae2..22bd852d4 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -165,7 +165,6 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         :type profile: dict
         :return: None
         """
-
         data_ndarray = df_series_clean.to_numpy()
 
         # this will get char_level predictions as output
diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py
index 214b10044..51835f7ae 100644
--- a/dataprofiler/profilers/unstructured_text_profile.py
+++ b/dataprofiler/profilers/unstructured_text_profile.py
@@ -6,9 +6,9 @@
 import warnings
 from collections import Counter, defaultdict
 
+import polars as pl
 from numpy import ndarray
 from pandas import DataFrame, Series
-import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
@@ -691,7 +691,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="words")
     def _update_words(
         self,
-        data: list | ndarray | DataFrame | pl.DataFrame,
+        data: list | ndarray | DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -709,12 +709,29 @@ def _update_words(
         :return: None
         """
         if not self._is_case_sensitive:
-            words = (
-                [w.strip(string.punctuation) for w in row.lower().split()]
-                for row in data
-            )
+            if type(data) is pl.DataFrame:
+                words = (
+                    [
+                        w.strip(string.punctuation)
+                        for w in row.str.to_lowercase().str.split(by=" ")
+                    ]
+                    for row in data
+                )
+            else:
+                words = (
+                    [w.strip(string.punctuation) for w in row.lower().split()]
+                    for row in data
+                )
         else:
-            words = ([w.strip(string.punctuation) for w in row.split()] for row in data)
+            if type(data) is pl.DataFrame:
+                words = (
+                    [w.strip(string.punctuation) for w in row.str.split(by=" ")]
+                    for row in data
+                )
+            else:
+                words = (
+                    [w.strip(string.punctuation) for w in row.split()] for row in data
+                )
 
         word_count = Counter(itertools.chain.from_iterable(words))
 
         for w, c in word_count.items():
@@ -750,16 +767,18 @@ def update(self, data: Series | pl.Series) -> TextProfiler:
         profile = dict(sample_size=len_data)
 
-        if type(data) is pl.Series:
-            data = data.to_pandas()
+        if type(data) is pl.Series:
+            data_pandas = data.to_pandas()
+        else:
+            data_pandas = data
 
         BaseColumnProfiler._perform_property_calcs(
             self,  # type: ignore
             self.__calculations,
-            df_series=data,
+            df_series=data_pandas,
             prev_dependent_properties={},
             subset_properties=profile,
         )
 
-        self._update_helper(data, profile)
+        self._update_helper(pl.Series(data), profile)
 
         return self

From 32fa3e542ece46b2b128f920a21e40b460ced637 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Fri, 22 Mar 2024 11:42:44 -0500
Subject: [PATCH 06/10] Dask version

---
 requirements-test.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/requirements-test.txt b/requirements-test.txt
index 6c981cf9c..3bec5009d 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,9 @@
 coverage>=5.0.1
+<<<<<<< HEAD
 dask>=2.29.0,<2024.2.0
+=======
+dask>=2.29.0, <2024.2.0
+>>>>>>> f4b7f9b (Dask version)
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From b665e1b09a14976c2d523d51de9fa45c6d1c947c Mon Sep 17 00:00:00 2001
From: ubd725
Date: Wed, 27 Mar 2024 14:21:15 -0500
Subject: [PATCH 07/10] Quick fix for keras and tensorflow

---
 requirements-test.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 3bec5009d..6c981cf9c 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,9 +1,5 @@
 coverage>=5.0.1
-<<<<<<< HEAD
 dask>=2.29.0,<2024.2.0
-=======
-dask>=2.29.0, <2024.2.0
->>>>>>> f4b7f9b (Dask version)
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From 4a5fc2dc17fafecbccf7daec0b53df08ed0a946f Mon Sep 17 00:00:00 2001
From: ubd725
Date: Mon, 1 Apr 2024 14:49:09 -0500
Subject: [PATCH 08/10] Minor polars updates

---
 dataprofiler/profilers/profiler_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py
index a81dca7a5..66f33c773 100644
--- a/dataprofiler/profilers/profiler_utils.py
+++ b/dataprofiler/profilers/profiler_utils.py
@@ -752,7 +752,7 @@ def perform_chi_squared_test_for_homogeneity(
     # If one or less categories, we have zero/negative degrees of freedom,
     # which is not an appropriate value for this context
     num_cats = len(cat_counts)
-    if len(cat_counts) <= 1:
+    if num_cats <= 1:
         warnings.warn(
             "Insufficient number of categories. "
             "Chi-squared test cannot be performed.",

From b3633eb0788730fb857dcf032dbe02eeb1c3beae Mon Sep 17 00:00:00 2001
From: ubd725
Date: Wed, 10 Apr 2024 15:59:41 -0500
Subject: [PATCH 09/10] Change type for isinstance

---
 dataprofiler/profilers/unstructured_text_profile.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py
index 51835f7ae..12871aa6c 100644
--- a/dataprofiler/profilers/unstructured_text_profile.py
+++ b/dataprofiler/profilers/unstructured_text_profile.py
@@ -691,7 +691,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="words")
     def _update_words(
         self,
-        data: list | ndarray | DataFrame,
+        data: list | ndarray | DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -709,7 +709,7 @@ def _update_words(
         :return: None
         """
         if not self._is_case_sensitive:
-            if type(data) is pl.DataFrame:
+            if isinstance(data, pl.DataFrame):
                 words = (
                     [
                         w.strip(string.punctuation)
                         for w in row.str.to_lowercase().str.split(by=" ")
                     ]
                     for row in data
                 )
             else:
                 words = (
                     [w.strip(string.punctuation) for w in row.lower().split()]
                     for row in data
                 )
         else:
-            if type(data) is pl.DataFrame:
+            if isinstance(data, pl.DataFrame):
                 words = (
                     [w.strip(string.punctuation) for w in row.str.split(by=" ")]
                     for row in data
                 )
             else:
                 words = (
                     [w.strip(string.punctuation) for w in row.split()] for row in data
                 )
@@ -766,7 +766,7 @@ def update(self, data: Series | pl.Series) -> TextProfiler:
 
         profile = dict(sample_size=len_data)
 
-        if type(data) is pl.Series:
+        if isinstance(data, pl.Series):
             data_pandas = data.to_pandas()
         else:
             data_pandas = data

From 8f8b528fb6c227adabe491be70edf2ebc0a3e65c Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 23 Apr 2024 11:59:26 -0500
Subject: [PATCH 10/10] Categorical polars update

---
 .../profilers/categorical_column_profile.py   | 24 ++++--
 .../test_categorical_column_profile.py        | 75 ++++++++++---------
 2 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 1ca630900..23e057215 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -7,6 +7,7 @@
 from typing import cast
 
 import datasketches
+import polars as pl
 from pandas import DataFrame, Series
 
 from .. import dp_logging
@@ -474,7 +475,7 @@ def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float):
             return True
         return False
 
-    def _update_stop_condition(self, data: DataFrame):
+    def _update_stop_condition(self, data: DataFrame | pl.DataFrame):
         """Return value stop_condition_is_met given stop conditions.
 
         :param data: Dataframe currently being processed by categorical profiler
@@ -497,8 +498,8 @@ def _get_categories_cms(self, df_series, len_df):
         """Return count min sketch and heavy hitters for both the batch and stream case.
 
         :param df_series: Series currently being processed by categorical profiler
-        :type df_series: Series
-        :param len_df: the total number of samples iin df_series
+        :type df_series: polars.Series
+        :param len_df: the total number of samples in df_series
         :type len_df: int
         :return: cms, heavy_hitter_dict, missing_heavy_hitter_dict
         """
@@ -601,13 +602,13 @@ def _get_categories_full(self, df_series) -> dict:
        :return: dict of counts for each unique value
        :rtype: dict
        """
-        category_count: dict = df_series.value_counts(dropna=False).to_dict()
+        category_count: dict = Series(df_series).value_counts(dropna=False).to_dict()
         return category_count
 
     @BaseColumnProfiler._timeit(name="categories")
     def _update_categories(
         self,
-        df_series: DataFrame,
+        df_series: DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -657,7 +658,9 @@ def _update_categories(
             if self._stop_condition_is_met:
                 self._categories = {}
 
-    def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
+    def _update_helper(
+        self, df_series_clean: Series | pl.Series, profile: dict
+    ) -> None:
         """
         Update col profile properties with clean dataset and its known profile.
 
@@ -669,7 +672,7 @@ def _update_helper(
         """
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series) -> CategoricalColumn:
+    def update(self, df_series: pl.Series | Series) -> CategoricalColumn:
         """
         Update the column profile.
@@ -682,12 +685,17 @@ def update(self, df_series: Series) -> CategoricalColumn:
         if len(df_series) == 0 or self._stop_condition_is_met:
             return self
 
+        if isinstance(df_series, pl.Series):
+            pandas_df = df_series.to_pandas()
+        else:
+            pandas_df = df_series
+
         profile = dict(sample_size=len(df_series))
         CategoricalColumn._update_categories(self, df_series)
         BaseColumnProfiler._perform_property_calcs(
             self,
             self.__calculations,
-            df_series=df_series,
+            df_series=pandas_df,
             prev_dependent_properties={},
             subset_properties=profile,
         )
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
index 55d2ea68e..5a403dec9 100644
--- a/dataprofiler/tests/profilers/test_categorical_column_profile.py
+++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers import CategoricalColumn
 from dataprofiler.profilers.json_decoder import load_column_profile
@@ -51,7 +52,7 @@ def test_correct_categorical_model_string(self):
         self.assertCountEqual(categories, profile.categories)
 
     def test_stop_condition_is_met_initially(self):
-        dataset = pd.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
+        dataset = pl.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
         profile = CategoricalColumn("test dataset")
         profile.max_sample_size_to_check_stop_condition = 0
         profile.stop_condition_unique_value_ratio = 0
@@ -368,7 +369,7 @@ def test_categorical_mapping(self):
         self.assertNotEqual(num_nan_count, len(column_profile.null_types_index["NaN"]))
 
     def test_true_categorical_report(self):
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",
                 "a",
@@ -415,7 +416,7 @@ def test_true_categorical_report(self):
         self.assertEqual(report, expected_profile)
 
     def test_false_categorical_report(self):
-        df_non_categorical = pd.Series(list(map(str, range(0, 20))))
+        df_non_categorical = pl.Series(list(map(str, range(0, 20))))
         profile = CategoricalColumn(df_non_categorical.name)
         profile.update(df_non_categorical)
 
@@ -433,7 +434,7 @@ def test_false_categorical_report(self):
         self.assertEqual(report, expected_profile)
 
     def test_report(self):
-        df_non_categorical = pd.Series(list(map(str, range(0, 20))))
+        df_non_categorical = pl.Series(list(map(str, range(0, 20))))
         profile = CategoricalColumn(df_non_categorical.name)
         profile.update(df_non_categorical)
 
@@ -681,32 +682,32 @@ def test_categorical_merge(self):
     def test_gini_impurity(self):
         # Normal test
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         expected_val = ((4 / 7) * (3 / 7)) + ((4 / 7) * (3 / 7))
         self.assertAlmostEqual(profile.gini_impurity, expected_val)
 
         # One class only test
-        df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "y", "y", "y"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         expected_val = 0
         self.assertEqual(profile.gini_impurity, expected_val)
 
         # Empty test
-        df_categorical = pd.Series([])
+        df_categorical = pl.Series([])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.gini_impurity, None)
 
     def test_categorical_diff(self):
         # test psi new category in another profile
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
 
-        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
         expected_diff = {
@@ -734,7 +735,7 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, actual_diff)
 
         # Test with one categorical column matching
-        df_not_categorical = pd.Series(
+        df_not_categorical = pl.Series(
             [
                 "THIS",
                 "is",
@@ -759,11 +760,11 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
         # Test diff with psi enabled
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
 
-        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
@@ -787,32 +788,32 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
     def test_unalikeability(self):
-        df_categorical = pd.Series(["a", "a"])
+        df_categorical = pl.Series(["a", "a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 0)
 
-        df_categorical = pd.Series(["a", "c", "b"])
+        df_categorical = pl.Series(["a", "c", "b"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 1)
 
-        df_categorical = pd.Series(["a", "a", "a", "b", "b", "b"])
+        df_categorical = pl.Series(["a", "a", "a", "b", "b", "b"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 18 / 30)
 
-        df_categorical = pd.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
+        df_categorical = pl.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 2 * (10 + 15 + 6) / 90)
 
-        df_categorical = pd.Series(["a"])
+        df_categorical = pl.Series(["a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(0, profile.unalikeability)
 
-        df_categorical = pd.Series([])
+        df_categorical = pl.Series([])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(None, profile.unalikeability)
@@ -820,7 +821,7 @@ def test_top_k_categories_change(self):
         # Test if top_k_categories is None
         options = CategoricalOptions()
-        df_series = pd.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
+        df_series = pl.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
         profile = CategoricalColumn(df_series.name, options)
         profile.update(df_series)
         self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 7)
@@ -831,7 +832,7 @@ def test_top_k_categories_change(self):
         # Test if top_k_categories is greater than the count of categories
         options.top_k_categories = 6
-        df_series = pd.Series(["a", "a", "b", "c", "d"])
+        df_series = pl.Series(["a", "a", "b", "c", "d"])
         profile = CategoricalColumn(df_series.name, options)
         profile.update(df_series)
         self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 4)
@@ -947,7 +948,7 @@ def test_json_decode_after_update(self):
 
         # Actual deserialization
         # Build expected CategoricalColumn
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",
                 "a",
@@ -973,7 +974,7 @@ def test_json_decode_after_update(self):
 
         test_utils.assert_profiles_equal(deserialized, expected_profile)
 
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",  # add existing
                 "d",  # add new
@@ -987,7 +988,7 @@ def test_json_decode_after_update(self):
         assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1}
 
     def test_cms_max_num_heavy_hitters(self):
-        df_categorical = pd.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10)
+        df_categorical = pl.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1002,8 +1003,8 @@ def test_cms_max_num_heavy_hitters(self):
         self.assertTrue(profile.sample_size >= 10)
 
     def test_cms_update_hybrid_batch_stream(self):
-        dataset = pd.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14)
-        dataset1 = pd.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1)
+        dataset = pl.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14)
+        dataset1 = pl.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1031,8 +1032,8 @@ def test_cms_update_hybrid_batch_stream(self):
 
     def test_cms_profile_merge_via_add(self):
 
-        dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
-        dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
+        dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
+        dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
 
         expected_categories = ["b", "c"]
         expected_categories_dict = {"b": 22, "c": 23}
@@ -1074,8 +1075,8 @@ def test_cms_profile_merge_via_add(self):
 
     def test_cms_profile_min_max_num_heavy_hitters(self):
 
-        dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
-        dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
+        dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
+        dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1097,8 +1098,8 @@ def test_cms_profile_min_max_num_heavy_hitters(self):
 
     def test_cms_catch_overwriting_with_missing_dict(self):
 
-        dataset = pd.Series(["b"] * 2 + ["c"] * 14)
-        dataset1 = pd.Series(["b"] * 5 + ["c"] * 10)
+        dataset = pl.Series(["b"] * 2 + ["c"] * 14)
+        dataset1 = pl.Series(["b"] * 5 + ["c"] * 10)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1126,7 +1127,7 @@ def test_cms_catch_overwriting_with_missing_dict(self):
 
     def test_cms_vs_full_mismatch_merge(self):
 
-        dataset = pd.Series(["b"] * 2 + ["c"] * 14)
+        dataset = pl.Series(["b"] * 2 + ["c"] * 14)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1176,7 +1177,7 @@ def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self):
         ]
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1200,7 +1201,7 @@ def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
         )
         cat_sentence_list = list_unique_values * num_sentences
 
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1226,7 +1227,7 @@ def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
         cat_sentence_list = list_unique_values * num_sentences
 
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1255,7 +1256,7 @@ def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
         cat_sentence_list[-3] = self.test_sentence_upper3 + str(num_sentences - 2)
 
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1279,7 +1280,7 @@ def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORI
         ]
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"