From 671017cb0757909387c0a915ec541389b813ccc9 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 27 Feb 2024 15:50:36 -0600
Subject: [PATCH 01/10] Add polars to datetime_column_profile

---
 .../profilers/datetime_column_profile.py      | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py
index af99283a9..7b2b57988 100644
--- a/dataprofiler/profilers/datetime_column_profile.py
+++ b/dataprofiler/profilers/datetime_column_profile.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
@@ -256,8 +257,7 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
         profile: dict = dict()
         activated_date_formats: list = list()
         len_df = len(df_series)
-
-        is_row_datetime = pd.Series(np.full((len(df_series)), False))
+        is_row_datetime = pd.Series(np.full((len_df), False))
 
         min_value = None
         max_value = None
@@ -275,18 +275,19 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
             )
         )
 
-        df_dates = valid_dates[~valid_dates.isnull()]
+        df_dates = pl.Series(valid_dates[~valid_dates.isnull()])
 
-        if "%b" in date_format and not df_dates.empty:
+        if "%b" in date_format and not df_dates.is_empty():
             may_month = 5  # May can be %b or %B we want to force, so check
-            all_may = df_dates.apply(lambda x: x.month == may_month).all()
+            all_may = df_dates.map_elements(lambda x: x.month == may_month)
+            all_may = pl.Series(all_may).all()
             if all_may:
-                valid_dates[:] = np.nan
-                df_dates = pd.Series([], dtype=object)
+                valid_dates[:] = None
+                df_dates = pl.Series([])
 
         # Create mask to avoid null dates
         null_date_mask = valid_dates.isnull()
-        np_date_array = df_dates.values
+        np_date_array = df_dates.to_numpy()
 
         # check off any values which were found to be datetime
         is_row_datetime[~is_row_datetime] = (~null_date_mask).values
@@ -298,18 +299,18 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
             max_idx = np.argmax(np_date_array)
 
             # Selects the min, max value objects for comparison
-            tmp_min_value_obj = df_dates.iloc[min_idx]
-            tmp_max_value_obj = df_dates.iloc[max_idx]
+            tmp_min_value_obj = df_dates.item(int(min_idx))
+            tmp_max_value_obj = df_dates.item(int(max_idx))
 
             # If minimum value, keep reference
             if tmp_min_value_obj < min_value_obj:
                 min_value = df_series[~null_date_mask].iloc[min_idx]
-                min_value_obj = tmp_min_value_obj
+                min_value_obj = pd.Timestamp(tmp_min_value_obj)
 
             # If maximum value, keep reference
             if tmp_max_value_obj > max_value_obj:
                 max_value = df_series[~null_date_mask].iloc[max_idx]
-                max_value_obj = tmp_max_value_obj
+                max_value_obj = pd.Timestamp(tmp_max_value_obj)
 
         df_series = df_series[null_date_mask]

From 280c7cc399d373d5c99e1513763e8dfbe5c5418d Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 5 Mar 2024 15:56:04 -0600
Subject: [PATCH 02/10] Polars added to unstructured labeler

---
 .../profilers/unstructured_labeler_profile.py | 19 +++++++++-----
 .../test_unstructured_labeler_profile.py      | 25 ++++++++++---------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 1c7b16c0f..2a021fe81 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -3,6 +3,7 @@
 
 from collections import defaultdict
 
+import polars as pl
 from pandas import Series
 
 from ..labelers.base_data_labeler import BaseDataLabeler
@@ -102,7 +103,7 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi
 
         return merged_profile
 
-    def report(self, remove_disabled_flag: bool = False) -> dict:
+    def report(self) -> dict:
         """
         Return profile object.
@@ -176,6 +177,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
             df_series_clean, predictions.copy(), self.data_labeler.label_mapping
         )
 
+        df_series_clean = pl.Series(df_series_clean)
         # Update counts and percent values
         self._update_word_label_counts(df_series_clean, format_predictions["pred"])
         self._update_true_char_label_counts(predictions["pred"])
@@ -188,7 +190,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
         # CHARACTERS/WORDS PROCESSED
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series) -> None:
+    def update(self, df_series: Series | pl.Series) -> None:
         """Update profile."""
         if len(df_series) == 0:
             return
@@ -196,6 +198,9 @@ def update(self, df_series: Series) -> None:
             char_sample_size=self.char_sample_size,
             word_sample_size=self.word_sample_size,
         )
+
+        if type(df_series) is pl.Series:
+            df_series = df_series.to_pandas()
         self._update_helper(df_series, profile)
 
     @property
@@ -278,7 +283,7 @@ def _update_true_char_label_counts(self, predictions: list) -> None:
             self.char_sample_size += len(sample)
 
     def _update_postprocess_char_label_counts(
-        self, df_series_clean: Series, format_predictions: dict
+        self, df_series_clean: Series | pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the postprocess character label counts.
@@ -292,7 +297,8 @@ def _update_postprocess_char_label_counts(
         """
         char_label_counts = self.entity_counts["postprocess_char_level"]
 
-        for index, result in enumerate(zip(df_series_clean, format_predictions)):
+        df_series_clean = pl.Series(df_series_clean)
+        for result in zip(df_series_clean, format_predictions):
             text, entities = result
             index = 0
             for entity in entities:
@@ -308,7 +314,7 @@ def _update_postprocess_char_label_counts(
                 char_label_counts["UNKNOWN"] += len(text) - index
 
     def _update_word_label_counts(
-        self, df_series_clean: Series, format_predictions: dict
+        self, df_series_clean: Series | pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the sorted dictionary of each entity count.
@@ -321,7 +327,8 @@ def _update_word_label_counts(
         """
         word_label_counts = self.entity_counts["word_level"]
 
-        for index, result in enumerate(zip(df_series_clean, format_predictions)):
+        df_series_clean = pl.Series(df_series_clean)
+        for result in zip(df_series_clean, format_predictions):
             text, entities = result
             begin_word_idx = -1
             index = 0
diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
index c7aa8b0c5..ce93c516e 100644
--- a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
@@ -3,6 +3,7 @@
 from unittest import mock
 
 import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers import profiler_utils
 from dataprofiler.profilers.unstructured_labeler_profile import (
@@ -15,7 +16,7 @@ def test_char_level_counts(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["abc123", "Bob", "!@##$%"])
+        sample = pl.Series(["abc123", "Bob", "!@##$%"])
 
         # running update
         default.update(sample)
@@ -34,7 +35,7 @@ def test_advanced_sample(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
@@ -56,7 +57,7 @@ def test_word_level_NER_label_counts(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000049939232194912."
@@ -78,7 +79,7 @@ def test_statistics(self):
         # setting up objects/profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(
+        sample = pl.Series(
             [
                 "Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
                 "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912."
@@ -123,7 +124,7 @@ def test_profile(self, processor_class_mock, model_class_mock):
         # initialize labeler profile
         default = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["a"])
+        sample = pl.Series(["a"])
         expected_profile = dict(
             entity_counts={
                 "postprocess_char_level": defaultdict(int, {"UNKNOWN": 1}),
@@ -163,15 +164,15 @@ def test_report(self, processor_class_mock, model_class_mock):
         # initialize labeler profile
         profile = UnstructuredLabelerProfile()
 
-        sample = pd.Series(["a"])
+        sample = pl.Series(["a"])
 
         time_array = [float(i) for i in range(4, 0, -1)]
         with mock.patch("time.time", side_effect=lambda: time_array.pop()):
             profile.update(sample)
 
             report1 = profile.profile
-            report2 = profile.report(remove_disabled_flag=False)
-            report3 = profile.report(remove_disabled_flag=True)
+            report2 = profile.report()
+            report3 = profile.report()
 
             self.assertDictEqual(report1, report2)
             self.assertDictEqual(report1, report3)
@@ -192,7 +193,7 @@ def test_entity_percentages(self, mock1, mock2):
        profile.entity_counts["true_char_level"]["TEST"] = 16
        profile.entity_counts["word_level"]["UNKNOWN"] = 5
        profile.entity_counts["word_level"]["TEST"] = 5
-        profile.update(pd.Series(["a"]))
+        profile.update(pl.Series(["a"]))
 
        expected_percentages = {
            "postprocess_char_level": defaultdict(int, {"UNKNOWN": 0.3, "TEST": 0.7}),
@@ -275,7 +276,7 @@ def test_diff(self, mock1, mock2):
         profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["word_level"]["TEST"] = 5
         profiler1.entity_counts["word_level"]["UNIQUE1"] = 5
-        profiler1.update(pd.Series(["a"]))
+        profiler1.update(pl.Series(["a"]))
 
         profiler2 = UnstructuredLabelerProfile()
         profiler2.char_sample_size = 20
@@ -289,7 +290,7 @@ def test_diff(self, mock1, mock2):
         profiler2.entity_counts["word_level"]["UNKNOWN"] = 2
         profiler2.entity_counts["word_level"]["TEST"] = 4
         profiler2.entity_counts["word_level"]["UNIQUE2"] = 4
-        profiler2.update(pd.Series(["a"]))
+        profiler2.update(pl.Series(["a"]))
 
         expected_diff = {
             "entity_counts": {
@@ -342,7 +343,7 @@ def test_diff(self, mock1, mock2):
         profiler1.entity_counts["postprocess_char_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["true_char_level"]["UNKNOWN"] = 5
         profiler1.entity_counts["word_level"]["UNKNOWN"] = 5
-        profiler1.update(pd.Series(["a"]))
+        profiler1.update(pl.Series(["a"]))
 
         profiler2 = UnstructuredLabelerProfile()
         profile2 = profiler2.profile

From faa51e680e683146dc741a538aae15534b3d7ec4 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Wed, 27 Mar 2024 14:21:15 -0500
Subject: [PATCH 03/10] Quick fix for keras and tensorflow

---
 dataprofiler/profilers/unstructured_labeler_profile.py | 10 +++++-----
 .../profilers/test_unstructured_labeler_profile.py     |  5 ++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 2a021fe81..120a42114 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -103,7 +103,7 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi
 
         return merged_profile
 
-    def report(self) -> dict:
+    def report(self, remove_disabled_flag: bool = False) -> dict:
         """
         Return profile object.
@@ -156,7 +156,7 @@ def label_encoding(self) -> list[str]:
         return self.data_labeler.labels
 
     @BaseColumnProfiler._timeit(name="data_labeler_predict")
-    def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
+    def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known profile.
@@ -190,7 +190,7 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         # CHARACTERS/WORDS PROCESSED
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series | pl.Series) -> None:
+    def update(self, df_series: pl.Series) -> None:
         """Update profile."""
         if len(df_series) == 0:
             return
@@ -283,7 +283,7 @@ def _update_true_char_label_counts(self, predictions: list) -> None:
             self.char_sample_size += len(sample)
 
     def _update_postprocess_char_label_counts(
-        self, df_series_clean: Series | pl.Series, format_predictions: dict
+        self, df_series_clean: pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the postprocess character label counts.
@@ -314,7 +314,7 @@ def _update_postprocess_char_label_counts(
 
     def _update_word_label_counts(
-        self, df_series_clean: Series | pl.Series, format_predictions: dict
+        self, df_series_clean: pl.Series, format_predictions: dict
     ) -> None:
         """
         Update the sorted dictionary of each entity count.
diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
index ce93c516e..0af957c2a 100644
--- a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
@@ -2,7 +2,6 @@
 from collections import defaultdict
 from unittest import mock
 
-import pandas as pd
 import polars as pl
 
 from dataprofiler.profilers import profiler_utils
@@ -171,8 +170,8 @@ def test_report(self, processor_class_mock, model_class_mock):
             profile.update(sample)
 
             report1 = profile.profile
-            report2 = profile.report()
-            report3 = profile.report()
+            report2 = profile.report(remove_disabled_flag=False)
+            report3 = profile.report(remove_disabled_flag=True)
 
             self.assertDictEqual(report1, report2)
             self.assertDictEqual(report1, report3)

From f6b197052f63ad3e1f716828874e30f268ca7067 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Thu, 28 Mar 2024 16:14:53 -0500
Subject: [PATCH 04/10] Add polars to unstructured text

---
 .../profilers/unstructured_labeler_profile.py | 19 ++---
 .../profilers/unstructured_text_profile.py    | 20 +++--
 .../test_unstructured_text_profile.py         | 78 +++++++++----------
 3 files changed, 59 insertions(+), 58 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 120a42114..74bd4cae2 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -4,7 +4,6 @@
 from collections import defaultdict
 
 import polars as pl
-from pandas import Series
 
 from ..labelers.base_data_labeler import BaseDataLabeler
 from ..labelers.data_labelers import DataLabeler
@@ -161,23 +160,25 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known profile.
 
         :param df_series_clean: df series with nulls removed
-        :type df_series_clean: pandas.core.series.Series
+        :type df_series_clean: polars.Series
         :param profile: profile dictionary
         :type profile: dict
         :return: None
         """
+
+        data_ndarray = df_series_clean.to_numpy()
+
         # this will get char_level predictions as output
-        predictions = self.data_labeler.predict(df_series_clean)
+        predictions = self.data_labeler.predict(data_ndarray)
 
         # also store spacy/NER format
         postprocessor = CharPostprocessor(
             use_word_level_argmax=True, output_format="NER"
         )
         format_predictions = postprocessor.process(
-            df_series_clean, predictions.copy(), self.data_labeler.label_mapping
+            data_ndarray, predictions.copy(), self.data_labeler.label_mapping
         )
 
-        df_series_clean = pl.Series(df_series_clean)
         # Update counts and percent values
         self._update_word_label_counts(df_series_clean, format_predictions["pred"])
         self._update_true_char_label_counts(predictions["pred"])
@@ -199,8 +200,6 @@ def update(self, df_series: pl.Series) -> None:
             word_sample_size=self.word_sample_size,
         )
 
-        if type(df_series) is pl.Series:
-            df_series = df_series.to_pandas()
         self._update_helper(df_series, profile)
 
     @property
@@ -289,7 +288,7 @@ def _update_postprocess_char_label_counts(
         Update the postprocess character label counts.
 
         :param df_series_clean: df series with nulls removed
-        :type df_series_clean: pandas.core.series.Series
+        :type df_series_clean: polars.Series
         :param format_predictions: contains dict of samples with predictions
             on the character level in congruence with the word level predictions
         :type format_predictions: Dict
         """
         char_label_counts = self.entity_counts["postprocess_char_level"]
 
-        df_series_clean = pl.Series(df_series_clean)
         for result in zip(df_series_clean, format_predictions):
             text, entities = result
             index = 0
@@ -320,14 +318,13 @@ def _update_word_label_counts(
         Update the sorted dictionary of each entity count.
 
         :param df_series_clean: df series with nulls removed
-        :type df_series_clean: pandas.core.series.Series
+        :type df_series_clean: polars.Series
         :param format_predictions: Dictionary of sample text and entities
         :type format_predictions: dict
         :return: None
         """
         word_label_counts = self.entity_counts["word_level"]
 
-        df_series_clean = pl.Series(df_series_clean)
         for result in zip(df_series_clean, format_predictions):
             text, entities = result
             begin_word_idx = -1
             index = 0
diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py
index 96b7d0625..214b10044 100644
--- a/dataprofiler/profilers/unstructured_text_profile.py
+++ b/dataprofiler/profilers/unstructured_text_profile.py
@@ -8,6 +8,7 @@
 
 from numpy import ndarray
 from pandas import DataFrame, Series
+import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
@@ -667,7 +668,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="vocab")
     def _update_vocab(
         self,
-        data: list | ndarray | DataFrame,
+        data: list | ndarray | DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -675,7 +676,7 @@ def _update_vocab(
         Find the vocabulary counts used in the text samples.
 
         :param data: list or array of data from which to extract vocab
-        :type data: Union[list, numpy.array, pandas.DataFrame]
+        :type data: Union[list, numpy.array, pandas.DataFrame, polars.DataFrame]
         :param prev_dependent_properties: Contains all the previous properties
             that the calculations depend on.
         :type prev_dependent_properties: dict
@@ -690,7 +691,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="words")
     def _update_words(
         self,
-        data: list | ndarray | DataFrame,
+        data: list | ndarray | DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -698,7 +699,7 @@ def _update_words(
         Find unique words and word count used in the text samples.
 
         :param data: list or array of data from which to extract vocab
-        :type data: Union[list, numpy.array, pandas.DataFrame]
+        :type data: Union[list, numpy.array, pandas.DataFrame, polars.DataFrame]
         :param prev_dependent_properties: Contains all the previous properties
             that the calculations depend on.
         :type prev_dependent_properties: dict
@@ -720,12 +721,12 @@ def _update_words(
             if w and w.lower() not in self._stop_words:
                 self.word_count.update({w: c})
 
-    def _update_helper(self, data: Series, profile: dict) -> None:
+    def _update_helper(self, data: pl.Series, profile: dict) -> None:
         """
         Update col profile properties with clean dataset and its known null parameters.
 
         :param data: df series with nulls removed
-        :type data: pandas.core.series.Series
+        :type data: polars.Series
         :param profile: text profile dictionary
         :type profile: dict
         :return: None
@@ -733,12 +734,12 @@ def _update_helper(self, data: pl.Series, profile: dict) -> None:
         self.sample_size += profile.pop("sample_size")
         self.metadata = profile
 
-    def update(self, data: Series) -> TextProfiler:
+    def update(self, data: Series | pl.Series) -> TextProfiler:
         """
         Update the column profile.
 
         :param data: df series
-        :type data: pandas.core.series.Series
+        :type data: polars.Series
         :return: updated TextProfiler
         :rtype: TextProfiler
         """
@@ -748,6 +749,9 @@ def update(self, data: Series) -> TextProfiler:
 
         profile = dict(sample_size=len_data)
 
+        if type(data) is pl.Series:
+            data = data.to_pandas()
+
         BaseColumnProfiler._perform_property_calcs(
             self,  # type: ignore
             self.__calculations,
diff --git a/dataprofiler/tests/profilers/test_unstructured_text_profile.py b/dataprofiler/tests/profilers/test_unstructured_text_profile.py
index 39b1d1bb8..3ebab8294 100644
--- a/dataprofiler/tests/profilers/test_unstructured_text_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_text_profile.py
@@ -1,6 +1,6 @@
 import unittest
 
-import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers.profiler_options import TextProfilerOptions
 from dataprofiler.profilers.unstructured_text_profile import TextProfiler
@@ -9,7 +9,7 @@ class TestUnstructuredTextProfile(unittest.TestCase):
     def test_text_profile_update_and_name(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -17,7 +17,7 @@ def test_vocab(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -53,7 +53,7 @@ def test_vocab(self):
         self.assertListEqual(sorted(expected_vocab), sorted(profile["vocab"]))
 
         # Update the data again
-        sample = pd.Series(["Grant knows how to code", "Grant will code with Bob"])
+        sample = pl.Series(["Grant knows how to code", "Grant will code with Bob"])
         text_profile.update(sample)
 
         profile = text_profile.profile
@@ -92,7 +92,7 @@ def test_words_and_word_count(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -114,7 +114,7 @@ def test_words_and_word_count(self):
         self.assertDictEqual(expected_word_count, profile["word_count"])
 
         # Update the data again
-        sample = pd.Series(["Grant knows how to code", "Grant will code with Bob"])
+        sample = pl.Series(["Grant knows how to code", "Grant will code with Bob"])
         text_profile.update(sample)
 
         profile = text_profile.profile
@@ -137,7 +137,7 @@ def test_sample_size(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -146,7 +146,7 @@ def test_sample_size(self):
         self.assertEqual(2, text_profile.sample_size)
 
         # Update the data again
-        sample = pd.Series(["Grant knows how to code", "Grant will code with Bob"])
+        sample = pl.Series(["Grant knows how to code", "Grant will code with Bob"])
         text_profile.update(sample)
 
         # Assert sample size is accurate
@@ -154,7 +154,7 @@ def test_timing(self):
         text_profile = TextProfiler("Name")
-        sample = pd.Series(
+        sample = pl.Series(
             ["Hello my name is: Grant.!!!", "Bob and \"Grant\", 'are' friends"]
         )
         text_profile.update(sample)
@@ -166,11 +166,11 @@ def test_merge_profiles(self):
         text_profile1 = TextProfiler("Name")
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
 
         text_profile2 = TextProfiler("Name")
-        sample = pd.Series(["Bob and \"Grant\", 'are' friends"])
+        sample = pl.Series(["Bob and \"Grant\", 'are' friends"])
         text_profile2.update(sample)
 
         text_profile3 = text_profile1 + text_profile2
@@ -231,11 +231,11 @@ def test_diff_profiles(self):
         text_profile1 = TextProfiler("Name")
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
 
         text_profile2 = TextProfiler("Name")
-        sample = pd.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
+        sample = pl.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
         text_profile2.update(sample)
 
         expected_diff = {
@@ -271,13 +271,13 @@ def test_diff_profiles(self):
         # Test when one profiler is not case sensitive
         text_profile1 = TextProfiler("Name")
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
 
         options = TextProfilerOptions()
         options.is_case_sensitive = False
         text_profile2 = TextProfiler("Name", options=options)
-        sample = pd.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
+        sample = pl.Series(["Bob and \"grant\", 'are' friends Grant Grant"])
         text_profile2.update(sample)
 
         expected_diff = {
@@ -314,14 +314,14 @@ def test_case_sensitivity(self):
         text_profile1 = TextProfiler("Name")
         text_profile1._is_case_sensitive = False
-        sample = pd.Series(["Hello my name is: Grant.!!!"])
+        sample = pl.Series(["Hello my name is: Grant.!!!"])
         text_profile1.update(sample)
         profile = text_profile1.profile
         expected_word_count = {"grant": 1, "hello": 1, "name": 1}
         self.assertDictEqual(expected_word_count, profile["word_count"])
 
         text_profile2 = TextProfiler("Name")
-        sample = pd.Series(["Bob and \"Grant\", 'are' friends"])
+        sample = pl.Series(["Bob and \"Grant\", 'are' friends"])
         text_profile2.update(sample)
         profile = text_profile2.profile
         expected_word_count = {"Grant": 1, "Bob": 1, "friends": 1}
@@ -367,11 +367,11 @@ def test_case_sensitivity(self):
     def test_merge_most_common_chars_count(self):
         ### default values of most common chars for both profiles
         text_profile1 = TextProfiler("Name")
-        sample1 = pd.Series(["this is test,", " this is a test sentence"])
+        sample1 = pl.Series(["this is test,", " this is a test sentence"])
         text_profile1.update(sample1)
 
         text_profile2 = TextProfiler("Name")
-        sample2 = pd.Series(["this is", "this"])
+        sample2 = pl.Series(["this is", "this"])
         text_profile2.update(sample2)
 
         text_profile3 = text_profile1 + text_profile2
@@ -437,12 +437,12 @@ def test_merge_most_common_words_count(self):
         ### default values of most common words for both profiles
         text_profile1 = TextProfiler("Name")
         text_profile1._stop_words = set()  # set stop_words to empty for easy inspection
-        sample1 = pd.Series(["this is test,", " this is a test sentence"])
+        sample1 = pl.Series(["this is test,", " this is a test sentence"])
         text_profile1.update(sample1)
 
         text_profile2 = TextProfiler("Name")
         text_profile2._stop_words = set()  # set stop_words to empty for easy inspection
-        sample2 = pd.Series(["this is", "this"])
+        sample2 = pl.Series(["this is", "this"])
         text_profile2.update(sample2)
 
         text_profile3 = text_profile1 + text_profile2
@@ -494,7 +494,7 @@ def test_options_default(self):
         # input with one sample
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "Test": 1, "test": 1}
@@ -518,7 +518,7 @@ def test_options_default(self):
         # input with two samples
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test,", " a Test sentence.!!!"])
+        sample = pl.Series(["This is test,", " a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "Test": 1, "test": 1}
@@ -553,7 +553,7 @@ def test_report(self):
         options.words.is_enabled = False
 
         profiler = TextProfiler("Name", options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         profiler.update(sample)
 
         report = profiler.report(remove_disabled_flag=True)
@@ -584,7 +584,7 @@ def test_report(self):
         options.words.is_enabled = False
 
         profiler = TextProfiler("Name", options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         profiler.update(sample)
 
         report = profiler.report(remove_disabled_flag=True)
@@ -600,7 +600,7 @@ def test_options_case_sensitive(self):
         # input with one sample
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "test": 2}
@@ -624,7 +624,7 @@ def test_options_case_sensitive(self):
         # input with two samples
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test,", " a Test sentence.!!!"])
+        sample = pl.Series(["This is test,", " a Test sentence.!!!"])
         text_profile.update(sample)
 
         expected_word_count = {"sentence": 1, "test": 2}
@@ -655,7 +655,7 @@ def test_options_stop_words(self):
         ## input with one sample
         text_profile = TextProfiler("Name", options=options)
-        sample = pd.Series(["This is test, a Test sentence.!!!"])
+        sample = pl.Series(["This is test, a Test sentence.!!!"])
         text_profile.update(sample)
 
expected_word_count = {"This": 1, "Test": 1, "test": 1} @@ -679,7 +679,7 @@ def test_options_stop_words(self): ## input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {"This": 1, "Test": 1, "test": 1} @@ -707,7 +707,7 @@ def test_options_stop_words(self): ## input with one sample text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test, a Test sentence.!!!"]) + sample = pl.Series(["This is test, a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = { @@ -738,7 +738,7 @@ def test_options_stop_words(self): ## input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = { @@ -774,7 +774,7 @@ def test_options_vocab_update(self): # input with one sample text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test, a Test sentence.!!!"]) + sample = pl.Series(["This is test, a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {"sentence": 1, "Test": 1, "test": 1} @@ -784,7 +784,7 @@ def test_options_vocab_update(self): # input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {"sentence": 1, "Test": 1, "test": 1} @@ -799,7 +799,7 @@ def test_options_words_update(self): # input with one sample text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test, a Test sentence.!!!"]) + sample = pl.Series(["This is test, a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {} @@ -823,7 +823,7 @@ def test_options_words_update(self): # input with two samples text_profile = TextProfiler("Name", options=options) - sample = pd.Series(["This is test,", " a Test sentence.!!!"]) + sample = pl.Series(["This is test,", " a Test sentence.!!!"]) text_profile.update(sample) expected_word_count = {} @@ -851,7 +851,7 @@ def test_options_most_common_chars_count(self): options.top_k_chars = None text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) text_profile.update(sample) @@ -875,7 +875,7 @@ def test_options_most_common_chars_count(self): options.top_k_chars = 3 text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) text_profile.update(sample) @@ -920,7 +920,7 @@ def test_options_most_common_words_count(self): options.stop_words = [] # set stop_words to empty list for easy inspection text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) text_profile.update(sample) @@ -934,7 +934,7 @@ def test_options_most_common_words_count(self): options.stop_words = [] # set stop_words to empty list for easy inspection text_profile = TextProfiler("Name", options=options) - sample = pd.Series( + sample = pl.Series( ["this is test,", " this is a test sentence", "this is", "this"] ) 
         text_profile.update(sample)

From 73feade6ea457e636b891604c4adb26d7736a283 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Fri, 29 Mar 2024 12:27:12 -0500
Subject: [PATCH 05/10] Correct polars usage

---
 .../profilers/unstructured_labeler_profile.py |  1 -
 .../profilers/unstructured_text_profile.py    | 39 ++++++++++++++-----
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py
index 74bd4cae2..22bd852d4 100644
--- a/dataprofiler/profilers/unstructured_labeler_profile.py
+++ b/dataprofiler/profilers/unstructured_labeler_profile.py
@@ -165,7 +165,6 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None:
         :type profile: dict
         :return: None
         """
-
         data_ndarray = df_series_clean.to_numpy()
 
         # this will get char_level predictions as output
diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py
index 214b10044..51835f7ae 100644
--- a/dataprofiler/profilers/unstructured_text_profile.py
+++ b/dataprofiler/profilers/unstructured_text_profile.py
@@ -6,9 +6,9 @@
 import warnings
 from collections import Counter, defaultdict
 
+import polars as pl
 from numpy import ndarray
 from pandas import DataFrame, Series
-import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
@@ -691,7 +691,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="words")
     def _update_words(
         self,
-        data: list | ndarray | DataFrame | pl.DataFrame,
+        data: list | ndarray | DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -709,12 +709,29 @@ def _update_words(
         :return: None
         """
         if not self._is_case_sensitive:
-            words = (
-                [w.strip(string.punctuation) for w in row.lower().split()]
-                for row in data
-            )
+            if type(data) is pl.DataFrame:
+                words = (
+                    [
+                        w.strip(string.punctuation)
+                        for w in row.str.to_lowercase().str.split(by=" ")
+                    ]
+                    for row in data
+                )
+            else:
+                words = (
+                    [w.strip(string.punctuation) for w in row.lower().split()]
+                    for row in data
+                )
         else:
-            words = ([w.strip(string.punctuation) for w in row.split()] for row in data)
+            if type(data) is pl.DataFrame:
+                words = (
+                    [w.strip(string.punctuation) for w in row.str.split(by=" ")]
+                    for row in data
+                )
+            else:
+                words = (
+                    [w.strip(string.punctuation) for w in row.split()] for row in data
+                )
 
         word_count = Counter(itertools.chain.from_iterable(words))
 
         for w, c in word_count.items():
@@ -750,16 +767,18 @@ def update(self, data: Series | pl.Series) -> TextProfiler:
         profile = dict(sample_size=len_data)
 
-        if type(data) is pl.Series:
-            data = data.to_pandas()
+        if type(data) is pl.Series:
+            data_pandas = data.to_pandas()
+        else:
+            data_pandas = data
 
         BaseColumnProfiler._perform_property_calcs(
             self,  # type: ignore
             self.__calculations,
-            df_series=data,
+            df_series=data_pandas,
             prev_dependent_properties={},
             subset_properties=profile,
         )
 
-        self._update_helper(data, profile)
+        self._update_helper(pl.Series(data), profile)
 
         return self

From 32fa3e542ece46b2b128f920a21e40b460ced637 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Fri, 22 Mar 2024 11:42:44 -0500
Subject: [PATCH 06/10] Dask version

---
 requirements-test.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/requirements-test.txt b/requirements-test.txt
index 6c981cf9c..3bec5009d 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,9 @@
 coverage>=5.0.1
+<<<<<<< HEAD
 dask>=2.29.0,<2024.2.0
+=======
+dask>=2.29.0, <2024.2.0
+>>>>>>> f4b7f9b (Dask version)
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From b665e1b09a14976c2d523d51de9fa45c6d1c947c Mon Sep 17 00:00:00 2001
From: ubd725
Date: Wed, 27 Mar 2024 14:21:15 -0500
Subject: [PATCH 07/10] Quick fix for keras and tensorflow

---
 requirements-test.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 3bec5009d..6c981cf9c 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,9 +1,5 @@
 coverage>=5.0.1
-<<<<<<< HEAD
 dask>=2.29.0,<2024.2.0
-=======
-dask>=2.29.0, <2024.2.0
->>>>>>> f4b7f9b (Dask version)
 fsspec>=0.3.3
 pytest>=6.0.1
 pytest-cov>=2.8.1

From 4a5fc2dc17fafecbccf7daec0b53df08ed0a946f Mon Sep 17 00:00:00 2001
From: ubd725
Date: Mon, 1 Apr 2024 14:49:09 -0500
Subject: [PATCH 08/10] Minor polars updates

---
 dataprofiler/profilers/profiler_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py
index a81dca7a5..66f33c773 100644
--- a/dataprofiler/profilers/profiler_utils.py
+++ b/dataprofiler/profilers/profiler_utils.py
@@ -752,7 +752,7 @@ def perform_chi_squared_test_for_homogeneity(
     # If one or less categories, we have zero/negative degrees of freedom,
     # which is not an appropriate value for this context
     num_cats = len(cat_counts)
-    if len(cat_counts) <= 1:
+    if num_cats <= 1:
         warnings.warn(
             "Insufficient number of categories. "
             "Chi-squared test cannot be performed.",

From b3633eb0788730fb857dcf032dbe02eeb1c3beae Mon Sep 17 00:00:00 2001
From: ubd725
Date: Wed, 10 Apr 2024 15:59:41 -0500
Subject: [PATCH 09/10] Change type for isinstance

---
 dataprofiler/profilers/unstructured_text_profile.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py
index 51835f7ae..12871aa6c 100644
--- a/dataprofiler/profilers/unstructured_text_profile.py
+++ b/dataprofiler/profilers/unstructured_text_profile.py
@@ -691,7 +691,7 @@ def profile(self) -> dict:
     @BaseColumnProfiler._timeit(name="words")
     def _update_words(
         self,
-        data: list | ndarray | DataFrame,
+        data: list | ndarray | DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -709,7 +709,7 @@ def _update_words(
         :return: None
         """
         if not self._is_case_sensitive:
-            if type(data) is pl.DataFrame:
+            if isinstance(data, pl.DataFrame):
                 words = (
                     [
                         w.strip(string.punctuation)
                         for w in row.str.to_lowercase().str.split(by=" ")
                     ]
                     for row in data
                 )
             else:
                 words = (
                     [w.strip(string.punctuation) for w in row.lower().split()]
                     for row in data
                 )
         else:
-            if type(data) is pl.DataFrame:
+            if isinstance(data, pl.DataFrame):
                 words = (
                     [w.strip(string.punctuation) for w in row.str.split(by=" ")]
                     for row in data
                 )
             else:
                 words = (
                     [w.strip(string.punctuation) for w in row.split()] for row in data
                 )
@@ -766,7 +766,7 @@ def update(self, data: Series | pl.Series) -> TextProfiler:
 
         profile = dict(sample_size=len_data)
 
-        if type(data) is pl.Series:
+        if isinstance(data, pl.Series):
             data_pandas = data.to_pandas()
         else:
             data_pandas = data

From 8f8b528fb6c227adabe491be70edf2ebc0a3e65c Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 23 Apr 2024 11:59:26 -0500
Subject: [PATCH 10/10] Categorical polars update

---
 .../profilers/categorical_column_profile.py   | 24 ++++--
 .../test_categorical_column_profile.py        | 75 ++++++++++---------
 2 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 1ca630900..23e057215 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -7,6 +7,7 @@
 from typing import cast
 
 import datasketches
+import polars as pl
 from pandas import DataFrame, Series
 
 from .. import dp_logging
@@ -474,7 +475,7 @@ def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float):
             return True
         return False
 
-    def _update_stop_condition(self, data: DataFrame):
+    def _update_stop_condition(self, data: DataFrame | pl.DataFrame):
         """Return value stop_condition_is_met given stop conditions.
 
         :param data: Dataframe currently being processed by categorical profiler
@@ -497,8 +498,8 @@ def _get_categories_cms(self, df_series, len_df):
         """Return count min sketch and heavy hitters for both the batch and stream case.
 
         :param df_series: Series currently being processed by categorical profiler
-        :type df_series: Series
-        :param len_df: the total number of samples iin df_series
+        :type df_series: polars.Series
+        :param len_df: the total number of samples in df_series
         :type len_df: int
         :return: cms, heavy_hitter_dict, missing_heavy_hitter_dict
         """
@@ -601,13 +602,13 @@ def _get_categories_full(self, df_series) -> dict:
        :return: dict of counts for each unique value
        :rtype: dict
        """
-        category_count: dict = df_series.value_counts(dropna=False).to_dict()
+        category_count: dict = Series(df_series).value_counts(dropna=False).to_dict()
         return category_count
 
     @BaseColumnProfiler._timeit(name="categories")
     def _update_categories(
         self,
-        df_series: DataFrame,
+        df_series: DataFrame | pl.DataFrame,
         prev_dependent_properties: dict = None,
         subset_properties: dict = None,
     ) -> None:
@@ -657,7 +658,9 @@ def _update_categories(
             if self._stop_condition_is_met:
                 self._categories = {}
 
-    def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
+    def _update_helper(
+        self, df_series_clean: Series | pl.Series, profile: dict
+    ) -> None:
         """
         Update col profile properties with clean dataset and its known profile.
 
@@ -669,7 +672,7 @@ def _update_helper(
         """
         self._update_column_base_properties(profile)
 
-    def update(self, df_series: Series) -> CategoricalColumn:
+    def update(self, df_series: pl.Series | Series) -> CategoricalColumn:
         """
         Update the column profile.
@@ -682,12 +685,17 @@ def update(self, df_series: Series) -> CategoricalColumn:
         if len(df_series) == 0 or self._stop_condition_is_met:
             return self
 
+        if isinstance(df_series, pl.Series):
+            pandas_df = df_series.to_pandas()
+        else:
+            pandas_df = df_series
+
         profile = dict(sample_size=len(df_series))
         CategoricalColumn._update_categories(self, df_series)
         BaseColumnProfiler._perform_property_calcs(
             self,
             self.__calculations,
-            df_series=df_series,
+            df_series=pandas_df,
             prev_dependent_properties={},
             subset_properties=profile,
         )
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
index 55d2ea68e..5a403dec9 100644
--- a/dataprofiler/tests/profilers/test_categorical_column_profile.py
+++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers import CategoricalColumn
 from dataprofiler.profilers.json_decoder import load_column_profile
@@ -51,7 +52,7 @@ def test_correct_categorical_model_string(self):
         self.assertCountEqual(categories, profile.categories)
 
     def test_stop_condition_is_met_initially(self):
-        dataset = pd.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
+        dataset = pl.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
         profile = CategoricalColumn("test dataset")
         profile.max_sample_size_to_check_stop_condition = 0
         profile.stop_condition_unique_value_ratio = 0
@@ -368,7 +369,7 @@ def test_categorical_mapping(self):
         self.assertNotEqual(num_nan_count, len(column_profile.null_types_index["NaN"]))
 
     def test_true_categorical_report(self):
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",
                 "a",
@@ -415,7 +416,7 @@ def test_true_categorical_report(self):
         self.assertEqual(report, expected_profile)
 
     def test_false_categorical_report(self):
-        df_non_categorical = pd.Series(list(map(str, range(0, 20))))
+        df_non_categorical = pl.Series(list(map(str, range(0, 20))))
         profile = CategoricalColumn(df_non_categorical.name)
         profile.update(df_non_categorical)
 
@@ -433,7 +434,7 @@ def test_false_categorical_report(self):
         self.assertEqual(report, expected_profile)
 
     def test_report(self):
-        df_non_categorical = pd.Series(list(map(str, range(0, 20))))
+        df_non_categorical = pl.Series(list(map(str, range(0, 20))))
         profile = CategoricalColumn(df_non_categorical.name)
         profile.update(df_non_categorical)
 
@@ -681,32 +682,32 @@ def test_categorical_merge(self):
     def test_gini_impurity(self):
         # Normal test
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         expected_val = ((4 / 7) * (3 / 7)) + ((4 / 7) * (3 / 7))
         self.assertAlmostEqual(profile.gini_impurity, expected_val)
 
         # One class only test
-        df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "y", "y", "y"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         expected_val = 0
         self.assertEqual(profile.gini_impurity, expected_val)
 
         # Empty test
-        df_categorical = pd.Series([])
+        df_categorical = pl.Series([])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.gini_impurity, None)
 
     def test_categorical_diff(self):
         # test psi new category in another profile
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
 
-        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
         expected_diff = {
@@ -734,7 +735,7 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, actual_diff)
 
         # Test with one categorical column matching
-        df_not_categorical = pd.Series(
+        df_not_categorical = pl.Series(
             [
                 "THIS",
                 "is",
@@ -759,11 +760,11 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
         # Test diff with psi enabled
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
 
-        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
@@ -787,32 +788,32 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
     def test_unalikeability(self):
-        df_categorical = pd.Series(["a", "a"])
+        df_categorical = pl.Series(["a", "a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 0)
 
-        df_categorical = pd.Series(["a", "c", "b"])
+        df_categorical = pl.Series(["a", "c", "b"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 1)
 
-        df_categorical = pd.Series(["a", "a", "a", "b", "b", "b"])
+        df_categorical = pl.Series(["a", "a", "a", "b", "b", "b"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 18 / 30)
 
-        df_categorical = pd.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
+        df_categorical = pl.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 2 * (10 + 15 + 6) / 90)
 
-        df_categorical = pd.Series(["a"])
+        df_categorical = pl.Series(["a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(0, profile.unalikeability)
 
-        df_categorical = pd.Series([])
+        df_categorical = pl.Series([])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(None, profile.unalikeability)
@@ -820,7 +821,7 @@ def test_top_k_categories_change(self):
         # Test if top_k_categories is None
         options = CategoricalOptions()
-        df_series = pd.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
+        df_series = pl.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
         profile = CategoricalColumn(df_series.name, options)
         profile.update(df_series)
         self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 7)
@@ -831,7 +832,7 @@ def test_top_k_categories_change(self):
         # Test if top_k_categories is greater than the count of categories
         options.top_k_categories = 6
-        df_series = pd.Series(["a", "a", "b", "c", "d"])
+        df_series = pl.Series(["a", "a", "b", "c", "d"])
         profile = CategoricalColumn(df_series.name, options)
         profile.update(df_series)
         self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 4)
@@ -947,7 +948,7 @@ def test_json_decode_after_update(self):
 
         # Actual deserialization
         # Build expected CategoricalColumn
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",
                 "a",
@@ -973,7 +974,7 @@ def test_json_decode_after_update(self):
 
         test_utils.assert_profiles_equal(deserialized, expected_profile)
 
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",  # add existing
                 "d",  # add new
@@ -987,7 +988,7 @@ def test_json_decode_after_update(self):
         assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1}
 
     def test_cms_max_num_heavy_hitters(self):
-        df_categorical = pd.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10)
+        df_categorical = pl.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1002,8 +1003,8 @@ def test_cms_max_num_heavy_hitters(self):
         self.assertTrue(profile.sample_size >= 10)
 
     def test_cms_update_hybrid_batch_stream(self):
-        dataset = pd.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14)
-        dataset1 = pd.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1)
+        dataset = pl.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14)
+        dataset1 = pl.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1031,8 +1032,8 @@ def test_cms_update_hybrid_batch_stream(self):
 
     def test_cms_profile_merge_via_add(self):
 
-        dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
-        dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
+        dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
+        dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
 
         expected_categories = ["b", "c"]
         expected_categories_dict = {"b": 22, "c": 23}
@@ -1074,8 +1075,8 @@ def test_cms_profile_merge_via_add(self):
 
     def test_cms_profile_min_max_num_heavy_hitters(self):
 
-        dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
-        dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
+        dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
+        dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1097,8 +1098,8 @@ def test_cms_profile_min_max_num_heavy_hitters(self):
 
     def test_cms_catch_overwriting_with_missing_dict(self):
 
-        dataset = pd.Series(["b"] * 2 + ["c"] * 14)
-        dataset1 = pd.Series(["b"] * 5 + ["c"] * 10)
+        dataset = pl.Series(["b"] * 2 + ["c"] * 14)
+        dataset1 = pl.Series(["b"] * 5 + ["c"] * 10)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1126,7 +1127,7 @@ def test_cms_catch_overwriting_with_missing_dict(self):
 
     def test_cms_vs_full_mismatch_merge(self):
 
-        dataset = pd.Series(["b"] * 2 + ["c"] * 14)
+        dataset = pl.Series(["b"] * 2 + ["c"] * 14)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1176,7 +1177,7 @@ def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self):
         ]
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1200,7 +1201,7 @@ def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
         )
         cat_sentence_list = list_unique_values * num_sentences
 
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1226,7 +1227,7 @@ def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
         cat_sentence_list = list_unique_values * num_sentences
 
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1255,7 +1256,7 @@ def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
         cat_sentence_list[-3] = self.test_sentence_upper3 + str(num_sentences - 2)
 
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"
         ]
@@ -1279,7 +1280,7 @@ def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORI
         ]
         len_unique = len(set(cat_sentence_list))
-        cat_sentence_df = pd.Series(cat_sentence_list)
+        cat_sentence_df = pl.Series(cat_sentence_list)
         column_profile = StructuredColProfiler(cat_sentence_df)
         cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
             "category"