diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml index d2b43419f..3c593d4ed 100644 --- a/.github/workflows/publish-python-package.yml +++ b/.github/workflows/publish-python-package.yml @@ -7,6 +7,8 @@ name: Publish Python Package on: release: types: [created] + branches: + - 'release/*' jobs: deploy: diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 3e433d85d..58ea61179 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,7 +1,5 @@ """Contains functions for data readers.""" import json -import os -import random import re import urllib from collections import OrderedDict @@ -28,7 +26,7 @@ from chardet.universaldetector import UniversalDetector from typing_extensions import TypeGuard -from .. import dp_logging, settings +from .. import dp_logging, rng_utils from .._typing import JSONType, Url from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer # NOQA @@ -315,11 +313,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list: kinv = 1 / sample_nrows W = 1.0 - rng = random.Random(x=settings._seed) - if "DATAPROFILER_SEED" in os.environ and settings._seed is None: - seed = os.environ.get("DATAPROFILER_SEED") - if seed: - rng = random.Random(int(seed)) + rng = rng_utils.get_random_number_generator() while True: W *= rng.random() ** kinv @@ -334,7 +328,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list: except StopIteration: break # Append new, replace old with dummy, and keep track of order - remove_index = rng.randrange(sample_nrows) + remove_index = rng.integers(0, sample_nrows) values[indices[remove_index]] = str(None) indices[remove_index] = len(values) values.append(newval) @@ -824,7 +818,6 @@ def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO: "Content-length" in url.headers and int(url.headers["Content-length"]) >= 1024**3 ): - raise ValueError( "The downloaded file from the url may not be " "larger than 1GB" ) diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py index a4eb0b1d2..032c2ea38 100644 --- a/dataprofiler/labelers/base_model.py +++ b/dataprofiler/labelers/base_model.py @@ -32,7 +32,7 @@ def __new__( class BaseModel(metaclass=abc.ABCMeta): """For labeling data.""" - _BaseModel__subclasses: dict[str, type[BaseModel]] = {} + __subclasses: dict[str, type[BaseModel]] = {} __metaclass__ = abc.ABCMeta # boolean if the label mapping requires the mapping for index 0 reserved @@ -90,7 +90,7 @@ def __eq__(self, other: object) -> bool: def _register_subclass(cls) -> None: """Register a subclass for the class factory.""" if not inspect.isabstract(cls): - cls._BaseModel__subclasses[cls.__name__.lower()] = cls + cls.__subclasses[cls.__name__.lower()] = cls @property def label_mapping(self) -> dict[str, int]: @@ -156,7 +156,7 @@ def get_class(cls, class_name: str) -> type[BaseModel] | None: from .column_name_model import ColumnNameModel # NOQA from .regex_model import RegexModel # NOQA - return cls._BaseModel__subclasses.get(class_name.lower(), None) + return cls.__subclasses.get(class_name.lower(), None) def get_parameters(self, param_list: list[str] | None = None) -> dict: """ diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 4613d05de..be1a3fee4 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -49,16 +49,14 @@ def __init__(self, **parameters: 
Any) -> None: def _register_subclass(cls) -> None: """Register a subclass for the class factory.""" if not inspect.isabstract(cls): - cls._BaseDataProcessor__subclasses[ # type: ignore - cls.__name__.lower() - ] = cls + cls.__subclasses[cls.__name__.lower()] = cls @classmethod - def get_class(cls: type[Processor], class_name: str) -> type[Processor] | None: + def get_class( + cls: type[BaseDataProcessor], class_name: str + ) -> type[BaseDataProcessor] | None: """Get class of BaseDataProcessor object.""" - return cls._BaseDataProcessor__subclasses.get( # type: ignore - class_name.lower(), None - ) + return cls.__subclasses.get(class_name.lower(), None) def __eq__(self, other: object) -> bool: """ @@ -129,7 +127,7 @@ def set_params(self, **kwargs: Any) -> None: self._parameters[param] = kwargs[param] @abc.abstractmethod - def process(self, *args: Any) -> Any: + def process(self, *args: Any, **kwargs: Any) -> Any: """Process data.""" raise NotImplementedError() @@ -169,13 +167,15 @@ def __init__(self, **parameters: Any) -> None: super().__init__(**parameters) @abc.abstractmethod - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32, - ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]: + ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[ + np.ndarray, np.ndarray + ] | np.ndarray: """Preprocess data.""" raise NotImplementedError() @@ -191,7 +191,7 @@ def __init__(self, **parameters: Any) -> None: super().__init__(**parameters) @abc.abstractmethod - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -240,7 +240,7 @@ def help(cls) -> None: ) print(help_str) - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -668,7 +668,7 @@ def gen_none() -> Generator[None, None, None]: if batch_data["samples"]: yield batch_data - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -735,8 +735,8 @@ def process( # type: ignore X_train = np.array( [[sentence] for sentence in batch_data["samples"]], dtype=object ) - if labels is not None: - num_classes = max(label_mapping.values()) + 1 # type: ignore + if labels is not None and label_mapping is not None: + num_classes = max(label_mapping.values()) + 1 Y_train = tf.keras.utils.to_categorical( batch_data["labels"], num_classes @@ -836,7 +836,7 @@ def _validate_parameters(self, parameters: dict) -> None: if errors: raise ValueError("\n".join(errors)) - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -1269,7 +1269,7 @@ def match_sentence_lengths( return results - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -1439,7 +1439,7 @@ def convert_to_unstructured_format( return text, entities - def process( # type: ignore + def process( self, data: np.ndarray, labels: np.ndarray | None = None, @@ -1503,8 +1503,12 @@ def process( # type: ignore unstructured_label_set, ) = self.convert_to_unstructured_format(batch_data, batch_labels) unstructured_data[ind] = unstructured_text - if labels is not None: - unstructured_labels[ind] = unstructured_label_set # type: ignore + if ( + labels is not None + and unstructured_labels is not None + and unstructured_label_set is not None + ): + unstructured_labels[ind] = unstructured_label_set if labels is not None: np_unstruct_labels = 
np.array(unstructured_labels, dtype="object") @@ -1800,7 +1804,7 @@ def convert_to_structured_analysis( return results - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2022,7 +2026,7 @@ def split_prediction(results: dict) -> None: pred, axis=1, ord=1, keepdims=True ) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2160,7 +2164,7 @@ def _save_processor(self, dirpath: str) -> None: ) as fp: json.dump(params, fp) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, @@ -2253,7 +2257,7 @@ def help(cls) -> None: ) print(help_str) - def process( # type: ignore + def process( self, data: np.ndarray, results: dict, diff --git a/dataprofiler/profilers/__init__.py b/dataprofiler/profilers/__init__.py index 64e33e384..4b068fcb0 100644 --- a/dataprofiler/profilers/__init__.py +++ b/dataprofiler/profilers/__init__.py @@ -28,7 +28,7 @@ DataLabelerOptions, DateTimeOptions, FloatOptions, - HistogramOption, + HistogramAndQuantilesOption, HyperLogLogOptions, IntOptions, ModeOption, @@ -66,7 +66,8 @@ json_decoder._options = { BooleanOption.__name__: BooleanOption, - HistogramOption.__name__: HistogramOption, + "HistogramOption": HistogramAndQuantilesOption, + HistogramAndQuantilesOption.__name__: HistogramAndQuantilesOption, ModeOption.__name__: ModeOption, BaseInspectorOptions.__name__: BaseInspectorOptions, NumericalOptions.__name__: NumericalOptions, diff --git a/dataprofiler/profilers/base_column_profilers.py b/dataprofiler/profilers/base_column_profilers.py index d9c183c99..1ef5b75fe 100644 --- a/dataprofiler/profilers/base_column_profilers.py +++ b/dataprofiler/profilers/base_column_profilers.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from . import utils +from . import profiler_utils from .profiler_options import BaseInspectorOptions, BaseOption BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler") @@ -76,7 +76,7 @@ def _timeit(method: Callable = None, name: str = None) -> Callable: :param name: key argument for the times dictionary :type name: str """ - return utils.method_timeit(method, name) + return profiler_utils.method_timeit(method, name) @staticmethod def _filter_properties_w_options( @@ -173,7 +173,7 @@ def _add_helper( else: raise ValueError(f"Column names unmatched: {other1.name} != {other2.name}") - self.times = utils.add_nested_dictionaries(other1.times, other2.times) + self.times = profiler_utils.add_nested_dictionaries(other1.times, other2.times) self.sample_size = other1.sample_size + other2.sample_size diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index caaf3778e..1376cc38e 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -8,7 +8,7 @@ import datasketches from pandas import DataFrame, Series -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import CategoricalOptions @@ -131,7 +131,7 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn: elif not self.cms and not other.cms: # If both profiles have not met stop condition if not (self._stop_condition_is_met or other._stop_condition_is_met): - merged_profile._categories = utils.add_nested_dictionaries( + merged_profile._categories = profiler_utils.add_nested_dictionaries( self._categories, other._categories ) @@ -250,7 +250,7 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: # Make sure other_profile's type matches this class differences: dict = super().diff(other_profile, options) - differences["categorical"] = utils.find_diff_of_strings_and_bools( + differences["categorical"] = profiler_utils.find_diff_of_strings_and_bools( self.is_match, other_profile.is_match ) @@ -258,13 +258,13 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: [ ( "unique_count", - utils.find_diff_of_numbers( + profiler_utils.find_diff_of_numbers( self.unique_count, other_profile.unique_count ), ), ( "unique_ratio", - utils.find_diff_of_numbers( + profiler_utils.find_diff_of_numbers( self.unique_ratio, other_profile.unique_ratio ), ), @@ -275,19 +275,25 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: if self.is_match and other_profile.is_match: differences["statistics"][ "chi2-test" - ] = utils.perform_chi_squared_test_for_homogeneity( + ] = profiler_utils.perform_chi_squared_test_for_homogeneity( self._categories, self.sample_size, other_profile._categories, other_profile.sample_size, ) - differences["statistics"]["categories"] = utils.find_diff_of_lists_and_sets( + differences["statistics"][ + "categories" + ] = profiler_utils.find_diff_of_lists_and_sets( self.categories, other_profile.categories ) - differences["statistics"]["gini_impurity"] = utils.find_diff_of_numbers( + differences["statistics"][ + "gini_impurity" + ] = profiler_utils.find_diff_of_numbers( self.gini_impurity, other_profile.gini_impurity ) - differences["statistics"]["unalikeability"] = utils.find_diff_of_numbers( + differences["statistics"][ + "unalikeability" + ] = profiler_utils.find_diff_of_numbers( self.unalikeability, other_profile.unalikeability ) cat_count1 = dict( @@ -299,9 +305,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: ) ) - differences["statistics"]["categorical_count"] = utils.find_diff_of_dicts( - cat_count1, cat_count2 - ) + differences["statistics"][ + "categorical_count" + ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2) return differences @@ -532,7 +538,7 @@ def _merge_categories_cms( for k in (x for x in heavy_hitter_dict2 if x not in heavy_hitter_dict1): heavy_hitter_dict1[k] = cms1.get_estimate(k) - categories = utils.add_nested_dictionaries( + categories = profiler_utils.add_nested_dictionaries( heavy_hitter_dict2, heavy_hitter_dict1 ) @@ -604,7 +610,7 @@ def _update_categories( ) else: category_count = self._get_categories_full(df_series) - self._categories = utils.add_nested_dictionaries( + self._categories = profiler_utils.add_nested_dictionaries( self._categories, category_count ) self._update_stop_condition(df_series) diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py index e3a8ecb16..07edf13dc 100644 --- a/dataprofiler/profilers/column_profile_compilers.py +++ 
b/dataprofiler/profilers/column_profile_compilers.py @@ -8,7 +8,7 @@ from pandas import Series -from . import utils +from . import profiler_utils from .categorical_column_profile import CategoricalColumn from .data_labeler_column_profile import DataLabelerColumn from .datetime_column_profile import DateTimeColumn @@ -106,7 +106,7 @@ def _create_profile( df_series.name, options=profiler_options ) except Exception as e: - utils.warn_on_profile(profiler.type, e) + profiler_utils.warn_on_profile(profiler.type, e) # Update profile after creation self.update_profile(df_series, pool) @@ -338,7 +338,7 @@ def diff( if all_profiles: for key in all_profiles: if key in self._profiles and key in other._profiles: - diff = utils.find_diff_of_numbers( + diff = profiler_utils.find_diff_of_numbers( self._profiles[key].data_type_ratio, other._profiles[key].data_type_ratio, ) @@ -352,7 +352,7 @@ def diff( data_type1 = self.selected_data_type data_type2 = other.selected_data_type if data_type1 is not None or data_type2 is not None: - diff_profile["data_type"] = utils.find_diff_of_strings_and_bools( + diff_profile["data_type"] = profiler_utils.find_diff_of_strings_and_bools( data_type1, data_type2 ) # Find diff of matching profile statistics diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index 9487278d6..d9bfe1ee9 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -9,7 +9,7 @@ from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import DataLabelerOptions @@ -325,7 +325,7 @@ def load_from_dict(cls, data, config: dict | None = None) -> DataLabelerColumn: data_labeler_load_attr = data.pop("data_labeler") if data_labeler_load_attr: - data_labeler_object = utils.reload_labeler_from_options_or_get_new( + data_labeler_object = profiler_utils.reload_labeler_from_options_or_get_new( data_labeler_load_attr, config ) if data_labeler_object is not None: @@ -379,9 +379,13 @@ def diff(self, other_profile: DataLabelerColumn, options: dict = None) -> dict: other_label_rep = other_profile.label_representation differences = { - "data_label": utils.find_diff_of_lists_and_sets(self_labels, other_labels), - "avg_predictions": utils.find_diff_of_dicts(avg_preds, other_avg_preds), - "label_representation": utils.find_diff_of_dicts( + "data_label": profiler_utils.find_diff_of_lists_and_sets( + self_labels, other_labels + ), + "avg_predictions": profiler_utils.find_diff_of_dicts( + avg_preds, other_avg_preds + ), + "label_representation": profiler_utils.find_diff_of_dicts( label_rep, other_label_rep ), } diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py index fc7801dd3..af99283a9 100644 --- a/dataprofiler/profilers/datetime_column_profile.py +++ b/dataprofiler/profilers/datetime_column_profile.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .profiler_options import DateTimeOptions @@ -114,7 +114,7 @@ def __add__(self, other: DateTimeColumn) -> DateTimeColumn: merged_profile.max = other.max merged_profile._dt_obj_max = other._dt_obj_max - merged_profile.date_formats = utils._combine_unique_sets( + merged_profile.date_formats = profiler_utils._combine_unique_sets( self.date_formats, other.date_formats ) return merged_profile @@ -192,13 +192,13 @@ def diff(self, other_profile: DateTimeColumn, options: dict = None) -> dict: super().diff(other_profile, options) differences = { - "min": utils.find_diff_of_dates( + "min": profiler_utils.find_diff_of_dates( self._dt_obj_min, other_profile._dt_obj_min ), - "max": utils.find_diff_of_dates( + "max": profiler_utils.find_diff_of_dates( self._dt_obj_max, other_profile._dt_obj_max ), - "format": utils.find_diff_of_lists_and_sets( + "format": profiler_utils.find_diff_of_lists_and_sets( self.date_formats, other_profile.date_formats ), } diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index b816f221c..bc426a447 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .numerical_column_stats import NumericStatsMixin from .profiler_options import FloatOptions @@ -137,7 +137,7 @@ def diff(self, other_profile: FloatColumn, options: dict = None) -> dict: other_precision = other_profile.profile["precision"] precision_diff = dict() for key in self.profile["precision"].keys(): - precision_diff[key] = utils.find_diff_of_numbers( + precision_diff[key] = profiler_utils.find_diff_of_numbers( self.profile["precision"][key], other_precision[key] ) precision_diff.pop("confidence_level") diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index 123961d88..0680a29a7 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -1,6 +1,7 @@ """Class and functions to calculate and profile properties of graph data.""" from __future__ import annotations +import importlib import pickle from collections import defaultdict from datetime import datetime @@ -10,9 +11,10 @@ import numpy as np import pandas as pd import scipy.stats as st +from packaging import version from ..data_readers.graph_data import GraphData -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import ProfilerOptions @@ -116,34 +118,34 @@ def diff(self, other_profile: GraphProfiler, options: dict = None) -> dict: ) diff_profile = { - "num_nodes": utils.find_diff_of_numbers( + "num_nodes": profiler_utils.find_diff_of_numbers( self._num_nodes, other_profile._num_nodes ), - "num_edges": utils.find_diff_of_numbers( + "num_edges": profiler_utils.find_diff_of_numbers( self._num_edges, other_profile._num_edges ), - "categorical_attributes": utils.find_diff_of_lists_and_sets( + "categorical_attributes": profiler_utils.find_diff_of_lists_and_sets( self._categorical_attributes, other_profile._categorical_attributes ), - "continuous_attributes": utils.find_diff_of_lists_and_sets( + "continuous_attributes": profiler_utils.find_diff_of_lists_and_sets( self._continuous_attributes, other_profile._continuous_attributes ), - "avg_node_degree": utils.find_diff_of_numbers( + "avg_node_degree": profiler_utils.find_diff_of_numbers( self._avg_node_degree, other_profile._avg_node_degree ), - "global_max_component_size": utils.find_diff_of_numbers( + "global_max_component_size": profiler_utils.find_diff_of_numbers( self._global_max_component_size, other_profile._global_max_component_size, ), - "continuous_distribution": utils.find_diff_of_dicts_with_diff_keys( + "continuous_distribution": profiler_utils.find_diff_of_dicts_with_diff_keys( self._continuous_distribution, other_profile._continuous_distribution, ), - "categorical_distribution": utils.find_diff_of_dicts_with_diff_keys( + "categorical_distribution": profiler_utils.find_diff_of_dicts_with_diff_keys( # noqa: E501 self._categorical_distribution, other_profile._categorical_distribution, ), - "times": utils.find_diff_of_dicts(self.times, other_profile.times), + "times": profiler_utils.find_diff_of_dicts(self.times, other_profile.times), } return diff_profile @@ -391,6 +393,11 @@ def _get_continuous_distribution( st.lognorm, st.gamma, ] + + scipy_gte_1_11_0 = version.parse( + importlib.metadata.version("scipy") + ) >= version.parse("1.11.0") + for attribute in attributes: if attribute in continuous_attributes: data_as_list = self._attribute_data_as_list(graph, attribute) @@ -401,7 +408,14 @@ def _get_continuous_distribution( for distribution in distribution_candidates: # compute fit, mle, kolmogorov-smirnov test to test fit, and pdf - fit = distribution.fit(df) + + # scipy 1.11.0 updated the way they handle + # the loc parameter in fit() for lognorm + if distribution == st.lognorm and scipy_gte_1_11_0: + fit = distribution.fit(df, superfit=True) + + else: + fit = distribution.fit(df) mle = distribution.nnlf(fit, df) if mle <= best_mle: diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py index 16bc2e148..fb4ff8cb9 100644 --- a/dataprofiler/profilers/json_decoder.py +++ b/dataprofiler/profilers/json_decoder.py @@ -1,6 +1,7 @@ """Contains methods to decode components of a Profiler.""" from __future__ import annotations +import warnings from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -72,6 +73,14 @@ def get_option_class(class_name: str) -> type[BaseOption]: options_class: type[BaseOption] | None = _options.get(class_name) if options_class is None: raise ValueError(f"Invalid option class {class_name} " f"failed to load.") + + if class_name == "HistogramOption": + warnings.warn( + f"{class_name} will be deprecated in the future. 
During the JSON encode" + " process, HistogramOption is mapped to HistogramAndQuantilesOption. " + "Please begin utilizing the new HistogramAndQuantilesOption class.", + DeprecationWarning, + ) return options_class diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 707e916db..2b35c8792 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -13,7 +13,7 @@ import pandas as pd import scipy.stats -from . import histogram_utils, utils +from . import histogram_utils, profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import NumericalOptions @@ -82,7 +82,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self._mode_is_enabled: bool = True self.num_zeros: int | np.int64 = np.int64(0) self.num_negatives: int | np.int64 = np.int64(0) - self._num_quantiles: int = 1000 # TODO: add to options + self._num_quantiles: int = 1000 # By default, we use 1000 quantiles if options: self.bias_correction = options.bias_correction.is_enabled @@ -90,6 +90,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self._median_is_enabled = options.median.is_enabled self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled self._mode_is_enabled = options.mode.is_enabled + self._num_quantiles = options.histogram_and_quantiles.num_quantiles bin_count_or_method = options.histogram_and_quantiles.bin_count_or_method if isinstance(bin_count_or_method, str): self.histogram_bin_method_names = [bin_count_or_method] @@ -497,20 +498,26 @@ def diff( ) differences = { - "min": utils.find_diff_of_numbers(self.min, other_profile.min), - "max": utils.find_diff_of_numbers(self.max, other_profile.max), - "sum": utils.find_diff_of_numbers(self.sum, other_profile.sum), - "mean": utils.find_diff_of_numbers(self.mean, other_profile.mean), - "median": utils.find_diff_of_numbers(self.median, other_profile.median), - "mode": utils.find_diff_of_lists_and_sets(self.mode, other_profile.mode), - "median_absolute_deviation": utils.find_diff_of_numbers( + "min": profiler_utils.find_diff_of_numbers(self.min, other_profile.min), + "max": profiler_utils.find_diff_of_numbers(self.max, other_profile.max), + "sum": profiler_utils.find_diff_of_numbers(self.sum, other_profile.sum), + "mean": profiler_utils.find_diff_of_numbers(self.mean, other_profile.mean), + "median": profiler_utils.find_diff_of_numbers( + self.median, other_profile.median + ), + "mode": profiler_utils.find_diff_of_lists_and_sets( + self.mode, other_profile.mode + ), + "median_absolute_deviation": profiler_utils.find_diff_of_numbers( self.median_abs_deviation, other_profile.median_abs_deviation, ), - "variance": utils.find_diff_of_numbers( + "variance": profiler_utils.find_diff_of_numbers( self.variance, other_profile.variance ), - "stddev": utils.find_diff_of_numbers(self.stddev, other_profile.stddev), + "stddev": profiler_utils.find_diff_of_numbers( + self.stddev, other_profile.stddev + ), "t-test": self._perform_t_test( self.mean, self.variance, @@ -1844,7 +1851,7 @@ def _get_skewness( ): return - batch_biased_skewness = utils.biased_skew(df_series) + batch_biased_skewness = profiler_utils.biased_skew(df_series) subset_properties["biased_skewness"] = batch_biased_skewness batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] @@ -1888,7 +1895,7 @@ def _get_kurtosis( ): return - batch_biased_kurtosis = 
utils.biased_kurt(df_series) + batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) subset_properties["biased_kurtosis"] = batch_biased_kurtosis batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index c6a369d8d..308262324 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -7,7 +7,7 @@ import numpy as np from pandas import DataFrame, Series -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import OrderOptions @@ -362,7 +362,7 @@ def diff(self, other_profile: OrderColumn, options: dict = None) -> dict: super().diff(other_profile, options) differences = { - "order": utils.find_diff_of_strings_and_bools( + "order": profiler_utils.find_diff_of_strings_and_bools( self.order, other_profile.order ) } diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index fc2a2246e..113d19ef2 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -5,7 +5,6 @@ import copy import json import logging -import os import pickle import random import re @@ -20,11 +19,11 @@ import pandas as pd from HLL import HyperLogLog -from .. import data_readers, dp_logging, settings +from .. import data_readers, dp_logging, rng_utils from ..data_readers.data import Data from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler -from . import utils +from . import profiler_utils from .column_profile_compilers import ( BaseCompiler, ColumnDataLabelerCompiler, @@ -271,7 +270,7 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di comp_diff = self.profiles[key].diff( other_profile.profiles[key], options=options ) - unordered_profile = utils.recursive_dict_update( + unordered_profile = profiler_utils.recursive_dict_update( unordered_profile, comp_diff ) @@ -287,16 +286,16 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di unordered_profile["statistics"].update( { - "sample_size": utils.find_diff_of_numbers( + "sample_size": profiler_utils.find_diff_of_numbers( self.sample_size, other_profile.sample_size ), - "null_count": utils.find_diff_of_numbers( + "null_count": profiler_utils.find_diff_of_numbers( self.null_count, other_profile.null_count ), - "null_types": utils.find_diff_of_lists_and_sets( + "null_types": profiler_utils.find_diff_of_lists_and_sets( self.null_types, other_profile.null_types ), - "null_types_index": utils.find_diff_of_dicts_with_diff_keys( + "null_types_index": profiler_utils.find_diff_of_dicts_with_diff_keys( self.null_types_index, other_profile.null_types_index ), } @@ -337,7 +336,7 @@ def report(self, remove_disabled_flag: bool = False) -> OrderedDict: """Return profile.""" unordered_profile: dict = dict() for profile in self.profiles.values(): - unordered_profile = utils.recursive_dict_update( + unordered_profile = profiler_utils.recursive_dict_update( unordered_profile, profile.report(remove_disabled_flag) ) @@ -429,7 +428,7 @@ def _update_base_stats(self, base_stats: dict) -> None: self._last_batch_size = base_stats["sample_size"] self.sample = base_stats["sample"] self.null_count += base_stats["null_count"] - self.null_types = utils._combine_unique_sets( + self.null_types = 
profiler_utils._combine_unique_sets( self.null_types, list(base_stats["null_types"].keys()) ) @@ -438,7 +437,7 @@ def _update_base_stats(self, base_stats: dict) -> None: base_nti = base_stats["null_types"] # Check if indices overlap, if they do, adjust attributes accordingly - if utils.overlap(self._min_id, self._max_id, base_min, base_max): + if profiler_utils.overlap(self._min_id, self._max_id, base_min, base_max): warnings.warn( f"Overlapping indices detected. To resolve, indices " f"where null data present will be shifted forward " @@ -602,11 +601,11 @@ def clean_data_and_get_base_stats( # Select generator depending if sample_ids availability if sample_ids is None: - sample_ind_generator = utils.shuffle_in_chunks( + sample_ind_generator = profiler_utils.shuffle_in_chunks( len_df, chunk_size=sample_size ) else: - sample_ind_generator = utils.partition( + sample_ind_generator = profiler_utils.partition( sample_ids[0], chunk_size=sample_size ) @@ -654,14 +653,7 @@ def clean_data_and_get_base_stats( df_series = df_series.loc[true_sample_list] total_na = total_sample_size - len(true_sample_list) - rng = np.random.default_rng(settings._seed) - - if "DATAPROFILER_SEED" in os.environ and settings._seed is None: - seed = os.environ.get("DATAPROFILER_SEED") - if isinstance(seed, int): - rng = np.random.default_rng(int(seed)) - else: - warnings.warn("Seed should be an integer", RuntimeWarning) + rng = rng_utils.get_random_number_generator() base_stats = { "sample_size": total_sample_size, @@ -755,7 +747,7 @@ def __init__( self.options.set({"data_labeler.data_labeler_object": data_labeler}) except Exception as e: - utils.warn_on_profile("data_labeler", e) + profiler_utils.warn_on_profile("data_labeler", e) self.options.set({"data_labeler.is_enabled": False}) def _add_error_checks(self, other: BaseProfiler) -> None: @@ -801,7 +793,9 @@ def __add__(self, other: BaseProfiler) -> BaseProfiler: merged_profile.total_samples = self.total_samples + other.total_samples - merged_profile.times = utils.add_nested_dictionaries(self.times, other.times) + merged_profile.times = profiler_utils.add_nested_dictionaries( + self.times, other.times + ) return merged_profile @@ -826,10 +820,10 @@ def diff(self, other_profile: BaseProfiler, options: dict = None) -> dict: ( "global_stats", { - "file_type": utils.find_diff_of_strings_and_bools( + "file_type": profiler_utils.find_diff_of_strings_and_bools( self.file_type, other_profile.file_type ), - "encoding": utils.find_diff_of_strings_and_bools( + "encoding": profiler_utils.find_diff_of_strings_and_bools( self.encoding, other_profile.encoding ), }, @@ -1080,7 +1074,7 @@ def _restore_data_labelers(self, data_labeler: BaseDataLabeler = None) -> None: self.options.set({"data_labeler.data_labeler_object": data_labeler}) except Exception as e: - utils.warn_on_profile("data_labeler", e) + profiler_utils.warn_on_profile("data_labeler", e) self.options.set({"data_labeler.is_enabled": False}) self.options.set({"data_labeler.data_labeler_object": data_labeler}) @@ -1334,13 +1328,13 @@ def diff( # type: ignore[override] report["global_stats"].update( { - "samples_used": utils.find_diff_of_numbers( + "samples_used": profiler_utils.find_diff_of_numbers( self.total_samples, other_profile.total_samples ), - "empty_line_count": utils.find_diff_of_numbers( + "empty_line_count": profiler_utils.find_diff_of_numbers( self._empty_line_count, other_profile._empty_line_count ), - "memory_size": utils.find_diff_of_numbers( + "memory_size": profiler_utils.find_diff_of_numbers( 
self.memory_size, other_profile.memory_size ), } @@ -1444,7 +1438,7 @@ def load_from_dict( """ raise NotImplementedError("UnstructuredProfiler deserialization not supported.") - @utils.method_timeit(name="clean_and_base_stats") + @profiler_utils.method_timeit(name="clean_and_base_stats") def _clean_data_and_get_base_stats( self, data: pd.Series, sample_size: int, min_true_samples: int = None ) -> tuple[pd.Series, dict]: @@ -1481,10 +1475,14 @@ def _clean_data_and_get_base_stats( data = data.apply(str) # get memory size - base_stats: dict = {"memory_size": utils.get_memory_size(data, unit="M")} + base_stats: dict = { + "memory_size": profiler_utils.get_memory_size(data, unit="M") + } # Setup sample generator - sample_ind_generator = utils.shuffle_in_chunks(len_data, chunk_size=sample_size) + sample_ind_generator = profiler_utils.shuffle_in_chunks( + len_data, chunk_size=sample_size + ) true_sample_list = set() total_sample_size = 0 @@ -1869,34 +1867,34 @@ def diff( # type: ignore[override] report = super().diff(other_profile, options) report["global_stats"].update( { - "samples_used": utils.find_diff_of_numbers( + "samples_used": profiler_utils.find_diff_of_numbers( self._max_col_samples_used, other_profile._max_col_samples_used ), - "column_count": utils.find_diff_of_numbers( + "column_count": profiler_utils.find_diff_of_numbers( len(self._profile), len(other_profile._profile) ), - "row_count": utils.find_diff_of_numbers( + "row_count": profiler_utils.find_diff_of_numbers( self.total_samples, other_profile.total_samples ), - "row_has_null_ratio": utils.find_diff_of_numbers( + "row_has_null_ratio": profiler_utils.find_diff_of_numbers( self._get_row_has_null_ratio(), other_profile._get_row_has_null_ratio(), ), - "row_is_null_ratio": utils.find_diff_of_numbers( + "row_is_null_ratio": profiler_utils.find_diff_of_numbers( self._get_row_is_null_ratio(), other_profile._get_row_is_null_ratio(), ), - "unique_row_ratio": utils.find_diff_of_numbers( + "unique_row_ratio": profiler_utils.find_diff_of_numbers( self._get_unique_row_ratio(), other_profile._get_unique_row_ratio() ), - "duplicate_row_count": utils.find_diff_of_numbers( + "duplicate_row_count": profiler_utils.find_diff_of_numbers( self._get_duplicate_row_count(), other_profile._get_duplicate_row_count(), ), - "correlation_matrix": utils.find_diff_of_matrices( + "correlation_matrix": profiler_utils.find_diff_of_matrices( self.correlation_matrix, other_profile.correlation_matrix ), - "chi2_matrix": utils.find_diff_of_matrices( + "chi2_matrix": profiler_utils.find_diff_of_matrices( self.chi2_matrix, other_profile.chi2_matrix ), "profile_schema": defaultdict(list), @@ -1916,7 +1914,7 @@ def diff( # type: ignore[override] report["global_stats"][ "profile_schema" - ] = utils.find_diff_of_dicts_with_diff_keys( + ] = profiler_utils.find_diff_of_dicts_with_diff_keys( self_profile_schema, other_profile_schema ) @@ -2193,7 +2191,7 @@ def _get_duplicate_row_count(self) -> int | None: ) return 0 - @utils.method_timeit(name="row_stats") + @profiler_utils.method_timeit(name="row_stats") def _update_row_statistics( self, data: pd.DataFrame, sample_ids: list[int] = None ) -> None: @@ -2347,7 +2345,7 @@ def _get_correlation( return corr_mat - @utils.method_timeit(name="correlation") + @profiler_utils.method_timeit(name="correlation") def _update_correlation( self, clean_samples: dict, prev_dependent_properties: dict ) -> None: @@ -2371,7 +2369,7 @@ def _update_correlation( batch_properties["count"], ) - @utils.method_timeit(name="correlation") + 
@profiler_utils.method_timeit(name="correlation") def _merge_correlation(self, other: StructuredProfiler) -> pd.DataFrame: """ Merge correlation matrix from two profiles. @@ -2570,7 +2568,7 @@ def _update_chi2(self) -> np.ndarray: if not profiler2.is_match: continue - results = utils.perform_chi_squared_test_for_homogeneity( + results = profiler_utils.perform_chi_squared_test_for_homogeneity( profiler1.categorical_counts, profiler1.sample_size, profiler2.categorical_counts, @@ -2834,7 +2832,7 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: yield e # Shuffle indices once and share with columns - sample_ids = [*utils.shuffle_in_chunks(len(data), len(data))] + sample_ids = [*profiler_utils.shuffle_in_chunks(len(data), len(data))] # If there are no minimum true samples, you can sort to save time if min_true_samples in [None, 0]: @@ -2869,12 +2867,17 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: ) ) + # If options.multiprocess is enabled, auto-toggle multiprocessing + auto_multiprocess_toggle = False + if self.options.multiprocess.is_enabled: + auto_multiprocess_toggle = profiler_utils.auto_multiprocess_toggle(data) + # Generate pool and estimate datasize pool = None - if self.options.multiprocess.is_enabled: + if auto_multiprocess_toggle: est_data_size = data[:50000].memory_usage(index=False, deep=True).sum() est_data_size = (est_data_size / min(50000, len(data))) * len(data) - pool, pool_size = utils.generate_pool( + pool, pool_size = profiler_utils.generate_pool( max_pool_size=None, data_size=est_data_size, cols=len(data.columns) ) @@ -2992,8 +2995,8 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: # Process and label the data notification_str = "Calculating the statistics... " pool = None - if self.options.multiprocess.is_enabled: - pool, pool_size = utils.generate_pool(4, est_data_size) + if auto_multiprocess_toggle: + pool, pool_size = profiler_utils.generate_pool(4, est_data_size) if pool: notification_str += " (with " + str(pool_size) + " processes)" diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index f34876a55..e3d10696b 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -9,7 +9,7 @@ from typing import Any, Generic, TypeVar, cast from ..labelers.base_data_labeler import BaseDataLabeler -from . import utils +from . import profiler_utils from .json_decoder import load_option BaseOptionT = TypeVar("BaseOptionT", bound="BaseOption") @@ -210,13 +210,14 @@ def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]: return errors -class HistogramOption(BooleanOption["HistogramOption"]): +class HistogramAndQuantilesOption(BooleanOption["HistogramAndQuantilesOption"]): """For setting histogram options.""" def __init__( self, is_enabled: bool = True, bin_count_or_method: str | int | list[str] = "auto", + num_quantiles: int = 1000, ) -> None: """ Initialize Options for histograms. 
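Reviewer note, not part of the patch: the renamed option and its new `num_quantiles` field can be exercised directly, mirroring the tests added later in this diff. A minimal sketch, assuming only the `HistogramAndQuantilesOption` API shown in these hunks:

```python
from dataprofiler.profilers.profiler_options import HistogramAndQuantilesOption

# Defaults per this patch: is_enabled=True, bin_count_or_method="auto",
# num_quantiles=1000.
option = HistogramAndQuantilesOption()

# num_quantiles is settable like any other option attribute.
option.set({"num_quantiles": 500})
assert option.num_quantiles == 500
assert option.validate(raise_error=False) is None  # no validation errors

# Validation rejects anything that is not a positive integer.
option.num_quantiles = 0
errors = option.validate(raise_error=False)
# ["HistogramAndQuantilesOption.num_quantiles must be a positive integer."]
```
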
@@ -226,11 +227,16 @@ def __init__( :ivar bin_count_or_method: bin count or the method with which to calculate histograms :vartype bin_count_or_method: Union[str, int, list(str)] + :ivar num_quantiles: number of quantiles + :vartype num_quantiles: int """ self.bin_count_or_method = bin_count_or_method + self.num_quantiles = num_quantiles super().__init__(is_enabled=is_enabled) - def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: + def _validate_helper( + self, variable_path: str = "HistogramAndQuantilesOption" + ) -> list[str]: """ Validate the options do not conflict and cause errors. @@ -260,6 +266,12 @@ def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: "than 1, a string, or list of strings from the " "following: {}.".format(variable_path, valid_methods) ) + + if self.num_quantiles is not None and ( + not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 + ): + errors.append(f"{variable_path}.num_quantiles must be a positive integer.") + return errors @@ -396,7 +408,9 @@ def __init__(self) -> None: self.median_abs_deviation: BooleanOption = BooleanOption(is_enabled=True) self.num_zeros: BooleanOption = BooleanOption(is_enabled=True) self.num_negatives: BooleanOption = BooleanOption(is_enabled=True) - self.histogram_and_quantiles: HistogramOption = HistogramOption() + self.histogram_and_quantiles: HistogramAndQuantilesOption = ( + HistogramAndQuantilesOption() + ) # By default, we correct for bias self.bias_correction: BooleanOption = BooleanOption(is_enabled=True) BaseInspectorOptions.__init__(self) @@ -1308,7 +1322,7 @@ def load_from_dict( data_labeler_object = None data_labeler_load_attr = data.pop("data_labeler_object", {}) if data_labeler_load_attr: - data_labeler_object = utils.reload_labeler_from_options_or_get_new( + data_labeler_object = profiler_utils.reload_labeler_from_options_or_get_new( data_labeler_load_attr, config ) if data_labeler_object: diff --git a/dataprofiler/profilers/utils.py b/dataprofiler/profilers/profiler_utils.py similarity index 96% rename from dataprofiler/profilers/utils.py rename to dataprofiler/profilers/profiler_utils.py index 09bfbac18..a3ed375b4 100644 --- a/dataprofiler/profilers/utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -7,7 +7,6 @@ import functools import math import multiprocessing as mp -import os import time import warnings from abc import abstractmethod @@ -31,13 +30,14 @@ import scipy from pandas import DataFrame, Series -from .. import settings from ..labelers.data_labelers import DataLabeler if TYPE_CHECKING: from ..labelers.base_data_labeler import BaseDataLabeler from . import profile_builder +from .. 
import rng_utils + def recursive_dict_update(d: dict, update_d: dict) -> dict: """ @@ -109,14 +109,7 @@ def shuffle_in_chunks( if not data_length or data_length == 0 or not chunk_size or chunk_size == 0: return [] - rng = np.random.default_rng(settings._seed) - - if "DATAPROFILER_SEED" in os.environ and settings._seed is None: - seed = os.environ.get("DATAPROFILER_SEED") - if isinstance(seed, int): - rng = np.random.default_rng(int(seed)) - else: - warnings.warn("Seed should be an integer", RuntimeWarning) + rng = rng_utils.get_random_number_generator() indices = KeyDict() j = 0 @@ -184,6 +177,34 @@ def partition(data: list, chunk_size: int) -> Generator[list, None, Any]: yield data[idx : idx + chunk_size] +def auto_multiprocess_toggle( + data: DataFrame, + num_rows_threshold: int = 750000, + num_cols_threshold: int = 20, +) -> bool: + """ + Automate multiprocessing toggle depending on dataset sizes. + + :param data: a dataset + :type data: pandas.DataFrame + :param num_rows_threshold: threshold for number of rows to + use multiprocess + :type num_rows_threshold: int + :param num_cols_threshold: threshold for number of columns + to use multiprocess + :type num_cols_threshold: int + :return: recommended option.multiprocess.is_enabled value + :rtype: bool + """ + # If the number of rows or columns exceed their respective threshold, + # we want to turn on multiprocessing + if data.shape[0] > num_rows_threshold or data.shape[1] > num_cols_threshold: + return True + # Otherwise, we do not turn on multiprocessing + else: + return False + + def suggest_pool_size(data_size: int = None, cols: int = None) -> int | None: """ Suggest the pool size based on resources. diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index e8446dcb8..bea8dbd68 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .numerical_column_stats import NumericStatsMixin from .profiler_options import TextOptions @@ -111,7 +111,9 @@ def diff(self, other_profile: TextColumn, options: dict = None) -> dict: differences = NumericStatsMixin.diff(self, other_profile, options) del differences["psi"] - vocab_diff = utils.find_diff_of_lists_and_sets(self.vocab, other_profile.vocab) + vocab_diff = profiler_utils.find_diff_of_lists_and_sets( + self.vocab, other_profile.vocab + ) differences["vocab"] = vocab_diff return differences @@ -149,7 +151,7 @@ def _update_vocab( :return: None """ data_flat = set(itertools.chain(*data)) - self.vocab = utils._combine_unique_sets(self.vocab, data_flat) + self.vocab = profiler_utils._combine_unique_sets(self.vocab, data_flat) def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: """ diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py index d07f2647d..1c7b16c0f 100644 --- a/dataprofiler/profilers/unstructured_labeler_profile.py +++ b/dataprofiler/profilers/unstructured_labeler_profile.py @@ -8,7 +8,7 @@ from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler from ..labelers.data_processing import CharPostprocessor -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import DataLabelerOptions @@ -87,14 +87,16 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi options.data_labeler_object = self.data_labeler merged_profile = UnstructuredLabelerProfile(options=options) - merged_profile.entity_counts = utils.add_nested_dictionaries( + merged_profile.entity_counts = profiler_utils.add_nested_dictionaries( self.entity_counts, other.entity_counts ) merged_profile.char_sample_size = self.char_sample_size + other.char_sample_size merged_profile.word_sample_size = self.word_sample_size + other.word_sample_size - merged_profile.times = utils.add_nested_dictionaries(self.times, other.times) + merged_profile.times = profiler_utils.add_nested_dictionaries( + self.times, other.times + ) merged_profile._update_percentages() @@ -133,10 +135,10 @@ def diff( entity_counts_diff = {} entity_percentages_diff = {} for key in ["word_level", "true_char_level", "postprocess_char_level"]: - entity_percentages_diff[key] = utils.find_diff_of_dicts( + entity_percentages_diff[key] = profiler_utils.find_diff_of_dicts( self.entity_percentages[key], other_profile.entity_percentages[key] ) - entity_counts_diff[key] = utils.find_diff_of_dicts( + entity_counts_diff[key] = profiler_utils.find_diff_of_dicts( self.entity_counts[key], other_profile.entity_counts[key] ) diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py index ffc9eb503..96b7d0625 100644 --- a/dataprofiler/profilers/unstructured_text_profile.py +++ b/dataprofiler/profilers/unstructured_text_profile.py @@ -9,7 +9,7 @@ from numpy import ndarray from pandas import DataFrame, Series -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import TextProfilerOptions @@ -610,18 +610,20 @@ def diff(self, other_profile: TextProfiler, options: dict = None) -> dict: other_word_count = {k.lower(): v for k, v in other_word_count.items()} diff: dict = {} - diff["vocab"] = utils.find_diff_of_lists_and_sets( + diff["vocab"] = profiler_utils.find_diff_of_lists_and_sets( list(self.vocab_count.keys()), list(other_profile.vocab_count.keys()) ) - diff["vocab_count"] = utils.find_diff_of_dicts_with_diff_keys( + diff["vocab_count"] = profiler_utils.find_diff_of_dicts_with_diff_keys( dict(self.vocab_count.most_common(self._top_k_chars)), dict(other_profile.vocab_count.most_common(self._top_k_chars)), ) - diff["words"] = utils.find_diff_of_lists_and_sets(self_words, other_words) + diff["words"] = profiler_utils.find_diff_of_lists_and_sets( + self_words, other_words + ) - diff["word_count"] = utils.find_diff_of_dicts_with_diff_keys( + diff["word_count"] = profiler_utils.find_diff_of_dicts_with_diff_keys( self_word_count, other_word_count ) diff --git a/dataprofiler/rng_utils.py b/dataprofiler/rng_utils.py new file mode 100644 index 000000000..329066658 --- /dev/null +++ b/dataprofiler/rng_utils.py @@ -0,0 +1,19 @@ +"""Create a random number generator using a manual seed DATAPROFILER_SEED.""" +import os +import warnings + +import numpy as np + +from . 
import settings + + +def get_random_number_generator() -> np.random._generator.Generator: + """Create a random number generator using a manual seed DATAPROFILER_SEED.""" + rng = np.random.default_rng(settings._seed) + if "DATAPROFILER_SEED" in os.environ and settings._seed is None: + seed: str = os.environ.get("DATAPROFILER_SEED", "") + try: + rng = np.random.default_rng(int(seed)) + except ValueError: + warnings.warn("Seed should be an integer", RuntimeWarning) + return rng diff --git a/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py b/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py index ef906e084..e2794c78a 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py @@ -196,7 +196,7 @@ def test_json_encode(self): self.assertDictEqual(expected, json.loads(serialized)) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode(self, mock_BaseDataLabeler): diff --git a/dataprofiler/tests/profilers/profiler_options/test_float_options.py b/dataprofiler/tests/profilers/profiler_options/test_float_options.py index 044faa04e..9b67e3534 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_float_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_float_options.py @@ -94,7 +94,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py new file mode 100644 index 000000000..17abad647 --- /dev/null +++ b/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py @@ -0,0 +1,311 @@ +import json + +from dataprofiler.profilers.json_decoder import load_option +from dataprofiler.profilers.json_encoder import ProfileEncoder +from dataprofiler.profilers.profiler_options import HistogramAndQuantilesOption + +from .. 
import utils as test_utils +from .test_boolean_option import TestBooleanOption + + +class TestHistogramAndQuantilesOption(TestBooleanOption): + + option_class = HistogramAndQuantilesOption + keys = [] + + def test_init(self): + option = self.get_options() + self.assertTrue(option.is_enabled) + self.assertEqual(option.bin_count_or_method, "auto") + self.assertEqual(option.num_quantiles, 1000) + + def test_set_helper(self): + option = self.get_options() + + # validate, variable path being passed + expected_error = ( + "type object 'test.bin_count_or_method' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"bin_count_or_method.is_enabled": True}, "test") + + # validate, variable path being passed + expected_error = ( + "type object 'test.num_quantiles' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"num_quantiles.is_enabled": True}, "test") + + def test_set(self): + option = self.get_options() + + params_to_check = [ + dict(prop="is_enabled", value_list=[False, True]), + dict( + prop="bin_count_or_method", + value_list=[ + None, + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ["sturges", "doane"], + 1, + 10, + 100, + 1000, + 99, + 10000000, + ], + ), + ] + + # this code can be abstracted to limit code everywhere else + # AKA, params_to_check would be the only needed code plus raise errors + def _assert_set_helper(prop, value): + option.set({prop: value}) + self.assertEqual(value, getattr(option, prop), msg=prop) + + for params in params_to_check: + prop, value_list = params["prop"], params["value_list"] + for value in value_list: + _assert_set_helper(prop, value) + + # Treat bin_count_or_method as a BooleanOption + expected_error = ( + "type object 'bin_count_or_method' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option.set({"bin_count_or_method.is_enabled": True}) + + # Treat num_quantiles as a BooleanOption + expected_error = "type object 'num_quantiles' has no attribute 'is_enabled'" + with self.assertRaisesRegex(AttributeError, expected_error): + option.set({"num_quantiles.is_enabled": True}) + + # Test set option for num_quantiles + option.set({"num_quantiles": 50}) + self.assertEqual(option.num_quantiles, 50) + + def test_validate_helper(self): + super().test_validate_helper() + + optpth = self.get_options_path() + + # Default configuration + option = self.get_options(num_quantiles=1000) + self.assertEqual([], option._validate_helper()) + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles + option = self.get_options(num_quantiles="Hello World") + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles may not be zero, must be greater than one(1) + option = self.get_options(num_quantiles=0) + expected_error = 
[f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a negative integer + option = self.get_options(num_quantiles=-5) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + def test_validate(self): + + super().test_validate() + + optpth = self.get_options_path() + + params_to_check = [ + # non errors + dict(prop="is_enabled", value_list=[False, True], errors=[]), + dict( + prop="bin_count_or_method", + value_list=[ + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ["sturges", "doane"], + 1, + 10, + 100, + 1000, + 99, + 10000000, + ], + errors=[], + ), + # errors + dict( + prop="bin_count_or_method", + value_list=[ + -1, + 1.2, + 1.0, + [], + False, + "whoops", + ["doane", "incorrect"], + "1", + ], + errors=[ + "HistogramAndQuantilesOption.bin_count_or_method must be an integer " + "more than 1, a string, or list of strings from the " + "following: ['auto', 'fd', 'doane', 'scott', 'rice', " + "'sturges', 'sqrt']." + ], + ), + ] + + # this code can be abstracted to limit code everywhere else + # AKA, for loop below could be abstracted to a utils func + + # Default configuration is valid + option = self.get_options() + self.assertIsNone(option.validate(raise_error=False)) + + for params in params_to_check: + prop, value_list, expected_errors = ( + params["prop"], + params["value_list"], + params["errors"], + ) + option = self.get_options() + for value in value_list: + setattr(option, prop, value) + validate_errors = option.validate(raise_error=False) + if expected_errors: + self.assertListEqual( + expected_errors, + validate_errors, + msg=f"Errored for prop: {prop}, value: {value}.", + ) + else: + self.assertIsNone( + validate_errors, + msg=f"Errored for prop: {prop}, value: {value}.", + ) + + # this time testing raising an error + option.bin_count_or_method = "fake method" + expected_error = ( + r"HistogramAndQuantilesOption.bin_count_or_method must be an integer more than " + r"1, a string, or list of strings from the following: " + r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']." 
+ ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles cannot be a string, must be an int + option = self.get_options(num_quantiles="Hello World") + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=0) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a negative integer + option = self.get_options(num_quantiles=-5) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + def test_eq(self): + super().test_eq() + + options = self.get_options() + options2 = self.get_options() + options.bin_count_or_method = "sturges" + self.assertNotEqual(options, options2) + options2.bin_count_or_method = "doane" + self.assertNotEqual(options, options2) + options2.bin_count_or_method = "sturges" + self.assertEqual(options, options2) + options.num_quantiles = 30 + self.assertNotEqual(options, options2) + options2.num_quantiles = 50 + self.assertNotEqual(options, options2) + options2.num_quantiles = 30 + self.assertEqual(options, options2) + + def test_json_encode(self): + option = HistogramAndQuantilesOption( + is_enabled=False, bin_count_or_method="doane" + ) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "HistogramAndQuantilesOption", + "data": { + "bin_count_or_method": "doane", + "num_quantiles": 1000, + "is_enabled": False, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) + + def test_json_decode_warn(self): + old_histogram = { + "class": "HistogramOption", + "data": { + "bin_count_or_method": "doane", + "is_enabled": False, + }, + } + + expected = HistogramAndQuantilesOption( + is_enabled=False, bin_count_or_method="doane" + ) + + expected_string = json.dumps(old_histogram, cls=ProfileEncoder) + + expected_warning = ( + "HistogramOption will be deprecated in the future. During the JSON encode " + "process, HistogramOption is mapped to HistogramAndQuantilesOption. " + "Please begin utilizing the new HistogramAndQuantilesOption class." 
+ ) + + with self.assertWarnsRegex(DeprecationWarning, expected_warning): + deserialized = load_option(json.loads(expected_string)) + test_utils.assert_profiles_equal(deserialized, expected) diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py deleted file mode 100644 index 4bf3a3b16..000000000 --- a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py +++ /dev/null @@ -1,190 +0,0 @@ -import json - -from dataprofiler.profilers.json_encoder import ProfileEncoder -from dataprofiler.profilers.profiler_options import HistogramOption - -from .test_boolean_option import TestBooleanOption - - -class TestHistogramOption(TestBooleanOption): - - option_class = HistogramOption - keys = [] - - def test_init(self): - option = self.get_options() - self.assertTrue(option.is_enabled) - self.assertEqual(option.bin_count_or_method, "auto") - - def test_set_helper(self): - option = self.get_options() - - # validate, variable path being passed - expected_error = ( - "type object 'test.bin_count_or_method' has no " "attribute 'is_enabled'" - ) - with self.assertRaisesRegex(AttributeError, expected_error): - option._set_helper({"bin_count_or_method.is_enabled": True}, "test") - - def test_set(self): - option = self.get_options() - - params_to_check = [ - dict(prop="is_enabled", value_list=[False, True]), - dict( - prop="bin_count_or_method", - value_list=[ - None, - "auto", - "fd", - "doane", - "scott", - "rice", - "sturges", - "sqrt", - ["sturges", "doane"], - 1, - 10, - 100, - 1000, - 99, - 10000000, - ], - ), - ] - - # this code can be abstracted to limit code everywhere else - # AKA, params_to_check would be the only needed code plus raise errors - def _assert_set_helper(prop, value): - option.set({prop: value}) - self.assertEqual(value, getattr(option, prop), msg=prop) - - for params in params_to_check: - prop, value_list = params["prop"], params["value_list"] - for value in value_list: - _assert_set_helper(prop, value) - - # Treat bin_count_or_method as a BooleanOption - expected_error = ( - "type object 'bin_count_or_method' has no attribute " "'is_enabled'" - ) - with self.assertRaisesRegex(AttributeError, expected_error): - option.set({"bin_count_or_method.is_enabled": True}) - - def test_validate_helper(self): - super().test_validate_helper() - - def test_validate(self): - - super().test_validate() - - params_to_check = [ - # non errors - dict(prop="is_enabled", value_list=[False, True], errors=[]), - dict( - prop="bin_count_or_method", - value_list=[ - "auto", - "fd", - "doane", - "scott", - "rice", - "sturges", - "sqrt", - ["sturges", "doane"], - 1, - 10, - 100, - 1000, - 99, - 10000000, - ], - errors=[], - ), - # errors - dict( - prop="bin_count_or_method", - value_list=[ - -1, - 1.2, - 1.0, - [], - False, - "whoops", - ["doane", "incorrect"], - "1", - ], - errors=[ - "HistogramOption.bin_count_or_method must be an integer " - "more than 1, a string, or list of strings from the " - "following: ['auto', 'fd', 'doane', 'scott', 'rice', " - "'sturges', 'sqrt']." 
- ], - ), - ] - - # # this code can be abstracted to limit code everywhere else - # # AKA, for loop below could be abstracted to a utils func - - # Default configuration is valid - option = self.get_options() - self.assertIsNone(option.validate(raise_error=False)) - - for params in params_to_check: - prop, value_list, expected_errors = ( - params["prop"], - params["value_list"], - params["errors"], - ) - option = self.get_options() - for value in value_list: - setattr(option, prop, value) - validate_errors = option.validate(raise_error=False) - if expected_errors: - self.assertListEqual( - expected_errors, - validate_errors, - msg=f"Errored for prop: {prop}, value: {value}.", - ) - else: - self.assertIsNone( - validate_errors, - msg=f"Errored for prop: {prop}, value: {value}.", - ) - - # this time testing raising an error - option.bin_count_or_method = "fake method" - expected_error = ( - r"HistogramOption.bin_count_or_method must be an integer more than " - r"1, a string, or list of strings from the following: " - r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']." - ) - with self.assertRaisesRegex(ValueError, expected_error): - option.validate() - - def test_eq(self): - super().test_eq() - - options = self.get_options() - options2 = self.get_options() - options.bin_count_or_method = "sturges" - self.assertNotEqual(options, options2) - options2.bin_count_or_method = "doane" - self.assertNotEqual(options, options2) - options2.bin_count_or_method = "sturges" - self.assertEqual(options, options2) - - def test_json_encode(self): - option = HistogramOption(is_enabled=False, bin_count_or_method="doane") - - serialized = json.dumps(option, cls=ProfileEncoder) - - expected = { - "class": "HistogramOption", - "data": { - "bin_count_or_method": "doane", - "is_enabled": False, - }, - } - - self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_int_options.py b/dataprofiler/tests/profilers/profiler_options/test_int_options.py index 317d1ff64..b767f3f3e 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_int_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_int_options.py @@ -86,7 +86,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py index 03d6c01db..ad0833d80 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py @@ -422,7 +422,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_text_options.py b/dataprofiler/tests/profilers/profiler_options/test_text_options.py index 57814126d..b26509e91 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_text_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_text_options.py @@ -128,7 +128,7 @@ def test_json_encode(self): "data": {"is_enabled": False}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { 
diff --git a/dataprofiler/tests/profilers/test_base_column_profilers.py b/dataprofiler/tests/profilers/test_base_column_profilers.py index eb2ed764d..4ab7182cf 100644 --- a/dataprofiler/tests/profilers/test_base_column_profilers.py +++ b/dataprofiler/tests/profilers/test_base_column_profilers.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils from dataprofiler.profilers.base_column_profilers import ( BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler, @@ -185,7 +185,7 @@ def test_cannot_instantiate(self): def test_combine_unqiue_sets(self): a = [1, 2, 3] b = [3, 1, 4, -1] - c = utils._combine_unique_sets(a, b) + c = profiler_utils._combine_unique_sets(a, b) self.assertCountEqual([1, 2, 3, 4, -1], c) def test__init__(self): diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 957bb694f..35617d1e8 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -611,7 +611,7 @@ def test_json_decode_after_update(self): @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) @mock.patch( diff --git a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py index 5b25f939c..35f448aea 100644 --- a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py +++ b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py @@ -7,7 +7,7 @@ import pandas as pd from dataprofiler.labelers import BaseDataLabeler -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils from dataprofiler.profilers.data_labeler_column_profile import DataLabelerColumn from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder @@ -376,7 +376,7 @@ def test_diff(self, mock_instance): diff = profiler1.diff(profiler2) expected_diff = { - "data_label": utils.find_diff_of_lists_and_sets( + "data_label": profiler_utils.find_diff_of_lists_and_sets( ["a", "b", "c"], ["b", "c", "d"] ), "avg_predictions": {"a": "unchanged", "b": -0.70, "c": 0.70}, @@ -485,7 +485,9 @@ def test_json_encode_after_update(self, mock_instance): self.assertEqual(expected, serialized) - @mock.patch("dataprofiler.profilers.utils.DataLabeler", spec=BaseDataLabeler) + @mock.patch( + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler + ) def test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler): self._setup_data_labeler_mock(mock_BaseDataLabeler) mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler @@ -526,7 +528,9 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler): class_as_dict["data"]["data_labeler"] = {"from_disk": "test"} deserialized = load_column_profile(class_as_dict, config) - @mock.patch("dataprofiler.profilers.utils.DataLabeler", spec=BaseDataLabeler) + @mock.patch( + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler + ) def test_json_decode_after_update( self, mock_utils_DataLabeler, mock_BaseDataLabeler ): diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 86e721a33..b7a2bfab7 100644 --- 
a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -37,6 +37,7 @@ def test_base_case(self): self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) self.assertIsNone(profiler.quantiles) + self.assertEqual(profiler._num_quantiles, 1000) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): @@ -1837,9 +1838,10 @@ def test_json_encode_after_update(self, time): data = np.array([0.0, 5.0, 10.0]) df = pd.Series(data).apply(str) - int_options = FloatOptions() - int_options.histogram_and_quantiles.bin_count_or_method = 5 - profiler = FloatColumn("0.0", int_options) + float_options = FloatOptions() + float_options.histogram_and_quantiles.bin_count_or_method = 5 + float_options.histogram_and_quantiles.num_quantiles = 4 + profiler = FloatColumn("0.0", float_options) mocked_quantiles = [0.25, 0.50, 0.75] with mock.patch.object( @@ -1884,7 +1886,7 @@ def test_json_encode_after_update(self, time): "_mode_is_enabled": True, "num_zeros": 1, "num_negatives": 0, - "_num_quantiles": 1000, + "_num_quantiles": 4, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 2.0, diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 01e624d20..d224a57a0 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -37,6 +37,7 @@ def test_base_case(self): self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) self.assertIsNone(profiler.quantiles) + self.assertEqual(profiler._num_quantiles, 1000) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index d01a7c382..4294dfd40 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -420,7 +420,6 @@ def test_from_dict_helper(self): ) expected_profile._stored_histogram = mock_saved_profile["_stored_histogram"] expected_profile.quantiles = None - expected_profile._stored_histogram["histogram"] = { "bin_counts": None, "bin_edges": None, diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 7a309e946..9507fe20f 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -327,7 +327,12 @@ def test_correlation(self, *mock): # sum((x - np.mean(x))*(y-np.mean(y))) / # np.sqrt(sum((x - np.mean(x)**2)))/np.sqrt(sum((y - np.mean(y)**2))) profile_options = dp.ProfilerOptions() - profile_options.set({"correlation.is_enabled": True}) + profile_options.set( + { + "correlation.is_enabled": True, + "structured_options.multiprocess.is_enabled": False, + } + ) # data with a sole numeric column data = pd.DataFrame([1.0, 8.0, 1.0, -2.0, 5.0]) @@ -580,7 +585,12 @@ def test_merge_correlation(self, *mocks): def test_correlation_update(self): profile_options = dp.ProfilerOptions() - profile_options.set({"correlation.is_enabled": True}) + profile_options.set( + { + "correlation.is_enabled": True, + "structured_options.multiprocess.is_enabled": False, + } + ) # Test with all numeric columns data = pd.DataFrame( @@ -776,12 +786,14 @@ def 
test_correlation_selected_columns(self, *mocks): def test_chi2(self, *mocks): # Empty data = pd.DataFrame([]) - profiler = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data, options=profile_options) self.assertIsNone(profiler.chi2_matrix) # Single column data = pd.DataFrame({"a": ["y", "y", "n", "n", "y"]}) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array([1]) self.assertEqual(expected_mat, profiler.chi2_matrix) @@ -793,7 +805,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] ) @@ -808,7 +820,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] ) @@ -823,7 +835,7 @@ def test_chi2(self, *mocks): } ) - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) expected_mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]) np.testing.assert_array_almost_equal(expected_mat, profiler.chi2_matrix) @@ -840,8 +852,10 @@ def test_merge_chi2(self, *mocks): "c": ["n", "maybe", "n", "n", "n", "y", "y"], } ) - profiler1 = dp.StructuredProfiler(None) - profiler2 = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler1 = dp.StructuredProfiler(None, options=profile_options) + profiler2 = dp.StructuredProfiler(data, options=profile_options) with mock.patch( "dataprofiler.profilers.profile_builder." 
"StructuredProfiler._add_error_checks" @@ -862,8 +876,8 @@ def test_merge_chi2(self, *mocks): data1 = data[:4] data2 = data[4:] - profiler1 = dp.StructuredProfiler(data1) - profiler2 = dp.StructuredProfiler(data2) + profiler1 = dp.StructuredProfiler(data1, options=profile_options) + profiler2 = dp.StructuredProfiler(data2, options=profile_options) profiler3 = profiler1 + profiler2 expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -880,8 +894,8 @@ def test_merge_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler1 = dp.StructuredProfiler(data1) - profiler2 = dp.StructuredProfiler(data2) + profiler1 = dp.StructuredProfiler(data1, options=profile_options) + profiler2 = dp.StructuredProfiler(data2, options=profile_options) profiler3 = profiler1 + profiler2 expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] @@ -918,7 +932,9 @@ def test_update_chi2(self, *mocks): } ) data2 = pd.DataFrame({"a": [], "b": [], "c": []}) - profiler = dp.StructuredProfiler(data1) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -934,7 +950,7 @@ def test_update_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.309924, 0.404638], [0.309924, 1, 0.548812], [0.404638, 0.548812, 1]] @@ -952,7 +968,7 @@ def test_update_chi2(self, *mocks): data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array( [[1, 0.007295, 0.007295], [0.007295, 1, 0.015609], [0.007295, 0.015609, 1]] @@ -969,7 +985,7 @@ def test_update_chi2(self, *mocks): ) data1 = data[:4] data2 = data[4:] - profiler = dp.StructuredProfiler(data1) + profiler = dp.StructuredProfiler(data1, options=profile_options) profiler.update_profile(data2) expected_mat = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]) np.testing.assert_array_almost_equal(expected_mat, profiler.chi2_matrix) @@ -1203,7 +1219,12 @@ def test_report_remove_disabled_flag(self): # with options to disable FloatColumn `precision` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"precision.is_enabled": False}) + profiler_options.set( + { + "precision.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1215,7 +1236,12 @@ def test_report_remove_disabled_flag(self): # with options to disable NumericalMixIn cal `min` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"min.is_enabled": False}) + profiler_options.set( + { + "min.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1225,7 +1251,12 @@ def test_report_remove_disabled_flag(self): # with options to disable TextColumn 
cal `vocab` # and with remove_disabled_flag == True profiler_options = ProfilerOptions() - profiler_options.set({"vocab.is_enabled": False}) + profiler_options.set( + { + "vocab.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report(report_options={"remove_disabled_flag": True}) @@ -1234,7 +1265,12 @@ # with profiler options and default remove_disabled_flag profiler_options = ProfilerOptions() - profiler_options.set({"min.is_enabled": False}) + profiler_options.set( + { + "min.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) profiler = dp.StructuredProfiler(data=data, options=profiler_options) report = profiler.report() @@ -1242,7 +1278,9 @@ self.assertIn("min", report["data_stats"][iter_value]["statistics"]) # w/o profiler options and default remove_disabled_flag - profiler = dp.StructuredProfiler(data=data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profile_options) report = profiler.report() for iter_value in range(0, len(data.columns) - 1): @@ -1370,7 +1408,11 @@ def recursive_test_helper(report, prev_key=None): def test_data_label_assigned(self): # only use 5 samples - trained_schema = dp.StructuredProfiler(self.aws_dataset, samples_per_update=5) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + trained_schema = dp.StructuredProfiler( + self.aws_dataset, samples_per_update=5, options=profile_options + ) report = trained_schema.report() has_non_null_column = False for i in range(len(report["data_stats"])): @@ -1545,7 +1587,7 @@ def test_save_and_load_json_file(self): save_profile.save(save_method="json") mock_file.seek(0) with mock.patch( - "dataprofiler.profilers.utils.DataLabeler.load_from_library", + "dataprofiler.profilers.profiler_utils.DataLabeler.load_from_library", return_value=data_labeler, ): load_profile = dp.StructuredProfiler.load("mock.json", "JSON") @@ -1754,7 +1796,10 @@ def test_duplicate_columns(self): [[1, 2, 3, 4, 5, 6], [10, 20, 30, 40, 50, 60]], columns=["a", "b", "a", "b", "c", "d"], ) - profiler = dp.StructuredProfiler(data) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) + + profiler = dp.StructuredProfiler(data, options=profile_options) # Ensure columns are correctly allocated to profiles in list expected_mapping = {"a": [0, 2], "b": [1, 3], "c": [4], "d": [5]} @@ -1812,9 +1857,11 @@ def test_unique_col_permutation(self, *mocks): perm_data = pd.DataFrame( [[4, 3, 2, 1], [8, 7, 6, 5]], columns=["d", "c", "b", "a"] ) + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) # Test via add - first_profiler = dp.StructuredProfiler(data) + first_profiler = dp.StructuredProfiler(data, options=profile_options) perm_profiler = dp.StructuredProfiler(perm_data) profiler = first_profiler + perm_profiler @@ -1834,7 +1881,7 @@ ) # Test via update - profiler = dp.StructuredProfiler(data) + profiler = dp.StructuredProfiler(data, options=profile_options) profiler.update_profile(perm_data) for col_idx in range(len(profiler._profile)): @@ -2462,7 +2509,7 @@ def 
test_json_encode_after_update(self, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): @@ -2496,7 +2543,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode_after_update( @@ -3166,7 +3213,7 @@ def test_json_encode_after_update(self, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): @@ -3187,7 +3234,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode_after_update( @@ -4047,11 +4094,13 @@ def test_report_remove_disabled_flag(self): def test_save_and_load_pkl_file(self): data_folder = "dataprofiler/tests/data/" test_files = ["txt/code.txt", "txt/sentence-10x.txt"] + profile_options = dp.ProfilerOptions() + profile_options.set({"structured_options.multiprocess.is_enabled": False}) for test_file in test_files: # Create Data and StructuredProfiler objects data = dp.Data(os.path.join(data_folder, test_file)) - save_profile = UnstructuredProfiler(data) + save_profile = UnstructuredProfiler(data, options=profile_options) # If profile _empty_line_count = 0, it won't test if the variable is # saved correctly since that is also the default value. Ensure @@ -4112,7 +4161,12 @@ def test_save_and_load_no_labeler(self): data = "this is my test data: 123-456-7890" profile_options = dp.ProfilerOptions() - profile_options.set({"data_labeler.is_enabled": False}) + profile_options.set( + { + "data_labeler.is_enabled": False, + "structured_options.multiprocess.is_enabled": False, + } + ) save_profile = dp.UnstructuredProfiler(data, options=profile_options) diff --git a/dataprofiler/tests/profilers/test_utils.py b/dataprofiler/tests/profilers/test_profiler_utils.py similarity index 67% rename from dataprofiler/tests/profilers/test_utils.py rename to dataprofiler/tests/profilers/test_profiler_utils.py index 10350b2b2..4eee1963a 100644 --- a/dataprofiler/tests/profilers/test_utils.py +++ b/dataprofiler/tests/profilers/test_profiler_utils.py @@ -7,26 +7,26 @@ import dataprofiler as dp from dataprofiler.labelers.base_data_labeler import BaseDataLabeler -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils class TestShuffleInChunks(unittest.TestCase): """ - Validates utils.shuffle_in_chunks is properly working. + Validates profiler_utils.shuffle_in_chunks is properly working. """ def test_full_sample(self): """ Check if can shuffle full sample. """ - sample = next(utils.shuffle_in_chunks(data_length=10, chunk_size=10)) + sample = next(profiler_utils.shuffle_in_chunks(data_length=10, chunk_size=10)) self.assertCountEqual(sample, list(range(10))) def test_even_chunk_sample(self): """ Check if can shuffle sample where chunk size is evenly divisible. 
""" - sample_gen = utils.shuffle_in_chunks(data_length=12, chunk_size=3) + sample_gen = profiler_utils.shuffle_in_chunks(data_length=12, chunk_size=3) all_values = set() num_chunks = 0 @@ -41,7 +41,7 @@ def test_uneven_chunk_sample(self): """ Check if can shuffle sample where chunk size is not evenly divisible. """ - sample_gen = utils.shuffle_in_chunks(data_length=100, chunk_size=7) + sample_gen = profiler_utils.shuffle_in_chunks(data_length=100, chunk_size=7) all_values = set() num_chunks = 0 @@ -60,50 +60,65 @@ def test_find_diff(self): # Ensure lists and sets are handled appropriately self.assertEqual( - [[], [3, 2], [2]], utils.find_diff_of_lists_and_sets([3, 2], [2, 3, 2]) + [[], [3, 2], [2]], + profiler_utils.find_diff_of_lists_and_sets([3, 2], [2, 3, 2]), ) self.assertEqual( - [[1], [2, 3], [4]], utils.find_diff_of_lists_and_sets([1, 2, 3], [2, 3, 4]) + [[1], [2, 3], [4]], + profiler_utils.find_diff_of_lists_and_sets([1, 2, 3], [2, 3, 4]), ) - self.assertEqual("unchanged", utils.find_diff_of_lists_and_sets({3, 2}, {2, 3})) self.assertEqual( - [[1], [2, 3], [4]], utils.find_diff_of_lists_and_sets({1, 2, 3}, {2, 3, 4}) + "unchanged", profiler_utils.find_diff_of_lists_and_sets({3, 2}, {2, 3}) ) - self.assertEqual("unchanged", utils.find_diff_of_lists_and_sets({2, 3}, [2, 3])) self.assertEqual( - [[1], [2, 3], [4]], utils.find_diff_of_lists_and_sets([1, 2, 3], {2, 3, 4}) + [[1], [2, 3], [4]], + profiler_utils.find_diff_of_lists_and_sets({1, 2, 3}, {2, 3, 4}), ) self.assertEqual( - [None, {1, 2}], utils.find_diff_of_lists_and_sets(None, {1, 2}) + "unchanged", profiler_utils.find_diff_of_lists_and_sets({2, 3}, [2, 3]) + ) + self.assertEqual( + [[1], [2, 3], [4]], + profiler_utils.find_diff_of_lists_and_sets([1, 2, 3], {2, 3, 4}), + ) + self.assertEqual( + [None, {1, 2}], profiler_utils.find_diff_of_lists_and_sets(None, {1, 2}) + ) + self.assertEqual( + "unchanged", profiler_utils.find_diff_of_lists_and_sets(None, None) ) - self.assertEqual("unchanged", utils.find_diff_of_lists_and_sets(None, None)) # Ensure ints and floats are handled appropriately - self.assertEqual(1, utils.find_diff_of_numbers(5, 4)) - self.assertEqual(1.0, utils.find_diff_of_numbers(5.0, 4.0)) - self.assertEqual(1.0, utils.find_diff_of_numbers(5.0, 4)) - self.assertEqual("unchanged", utils.find_diff_of_numbers(5.0, 5.0)) - self.assertEqual("unchanged", utils.find_diff_of_numbers(5, 5.0)) - self.assertEqual([4, None], utils.find_diff_of_numbers(4, None)) - self.assertEqual("unchanged", utils.find_diff_of_numbers(None, None)) + self.assertEqual(1, profiler_utils.find_diff_of_numbers(5, 4)) + self.assertEqual(1.0, profiler_utils.find_diff_of_numbers(5.0, 4.0)) + self.assertEqual(1.0, profiler_utils.find_diff_of_numbers(5.0, 4)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_numbers(5.0, 5.0)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_numbers(5, 5.0)) + self.assertEqual([4, None], profiler_utils.find_diff_of_numbers(4, None)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_numbers(None, None)) # Ensure strings are handled appropriately self.assertEqual( - "unchanged", utils.find_diff_of_strings_and_bools("Hello", "Hello") + "unchanged", profiler_utils.find_diff_of_strings_and_bools("Hello", "Hello") + ) + self.assertEqual( + ["Hello", "team"], + profiler_utils.find_diff_of_strings_and_bools("Hello", "team"), ) self.assertEqual( - ["Hello", "team"], utils.find_diff_of_strings_and_bools("Hello", "team") + "unchanged", profiler_utils.find_diff_of_strings_and_bools(None, None) ) - 
self.assertEqual("unchanged", utils.find_diff_of_strings_and_bools(None, None)) # Ensure dates are handled appropriately a = datetime(2021, 6, 28) b = datetime(2021, 6, 27, 1) - self.assertEqual("unchanged", utils.find_diff_of_dates(a, a)) - self.assertEqual("+23:00:00", utils.find_diff_of_dates(a, b)) - self.assertEqual("-23:00:00", utils.find_diff_of_dates(b, a)) - self.assertEqual(["06/28/21 00:00:00", None], utils.find_diff_of_dates(a, None)) - self.assertEqual("unchanged", utils.find_diff_of_dates(None, None)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_dates(a, a)) + self.assertEqual("+23:00:00", profiler_utils.find_diff_of_dates(a, b)) + self.assertEqual("-23:00:00", profiler_utils.find_diff_of_dates(b, a)) + self.assertEqual( + ["06/28/21 00:00:00", None], profiler_utils.find_diff_of_dates(a, None) + ) + self.assertEqual("unchanged", profiler_utils.find_diff_of_dates(None, None)) # Ensure that differencing dictionaries is handled appropriately dict1 = { @@ -131,7 +146,9 @@ def test_find_diff(self): "f": ["hi2", None], "g": [None, 15], } - self.assertDictEqual(expected_diff, utils.find_diff_of_dicts(dict1, dict2)) + self.assertDictEqual( + expected_diff, profiler_utils.find_diff_of_dicts(dict1, dict2) + ) dict1 = { "nested_key_one": {"fruit": ["apple", "banana", "orange"], "yes_no": False}, @@ -167,7 +184,9 @@ def test_find_diff(self): "additional_key": [None, "random_string"], } - self.assertDictEqual(expected_diff, utils.find_diff_of_dicts(dict1, dict2)) + self.assertDictEqual( + expected_diff, profiler_utils.find_diff_of_dicts(dict1, dict2) + ) def test_diff_of_dicts_with_diff_keys(self): dict1 = {"unique1": 1, "shared1": 2, "shared2": 3} @@ -181,11 +200,13 @@ def test_diff_of_dicts_with_diff_keys(self): # Assert difference is appropriate self.assertListEqual( - expected, utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) + expected, profiler_utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) ) # Assert empty dicts are unchanged - self.assertEqual("unchanged", utils.find_diff_of_dicts_with_diff_keys({}, {})) + self.assertEqual( + "unchanged", profiler_utils.find_diff_of_dicts_with_diff_keys({}, {}) + ) # Assert all edge cases work a = datetime(2021, 6, 28) @@ -215,7 +236,7 @@ def test_diff_of_dicts_with_diff_keys(self): {"unique2": 5}, ] self.assertListEqual( - expected, utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) + expected, profiler_utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) ) dict1 = { @@ -259,14 +280,14 @@ def test_diff_of_dicts_with_diff_keys(self): ] self.assertListEqual( - expected, utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) + expected, profiler_utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) ) def test_list_diff_with_nan(self): # when lists are same length list_1 = [np.nan, 1.5, 6.7] list_2 = [np.nan, 1.5, np.nan] - diff_1 = utils.find_diff_of_lists_and_sets(list_1, list_2) + diff_1 = profiler_utils.find_diff_of_lists_and_sets(list_1, list_2) expected_diff_1 = [[6.7], [np.nan, 1.5], [np.nan]] for x, y in zip(diff_1, expected_diff_1): @@ -276,7 +297,7 @@ def test_list_diff_with_nan(self): # when lists aren't the same length list_3 = [np.nan, 1.5, 6.7, np.nan, np.nan, np.nan] list_4 = [4.2, 1.5, np.nan] - diff_2 = utils.find_diff_of_lists_and_sets(list_3, list_4) + diff_2 = profiler_utils.find_diff_of_lists_and_sets(list_3, list_4) expected_diff_2 = [[6.7, np.nan, np.nan, np.nan], [np.nan, 1.5], [4.2]] for x, y in zip(diff_2, expected_diff_2): @@ -285,7 +306,7 @@ def test_list_diff_with_nan(self): list_5 = [np.nan, 
np.nan] list_6 = [np.nan] - diff_3 = utils.find_diff_of_lists_and_sets(list_5, list_6) + diff_3 = profiler_utils.find_diff_of_lists_and_sets(list_5, list_6) expected_diff_3 = [[np.nan], [np.nan], []] for x, y in zip(diff_3, expected_diff_3): @@ -294,7 +315,7 @@ def test_list_diff_with_nan(self): list_7 = [np.nan, 3] list_8 = [np.nan, 3] - diff_4 = utils.find_diff_of_lists_and_sets(list_7, list_8) + diff_4 = profiler_utils.find_diff_of_lists_and_sets(list_7, list_8) expected_diff_4 = "unchanged" self.assertEqual(diff_4, expected_diff_4) @@ -307,7 +328,7 @@ def test_find_diff_of_matrices(self): # Check matrix subtraction of same size matrices expected_matrix = [[-10.0, np.nan, 3.0], [3.0, 0.0, 4.0], [np.nan, -12.0, 8.0]] - diff_matrix = utils.find_diff_of_matrices(matrix1, matrix2) + diff_matrix = profiler_utils.find_diff_of_matrices(matrix1, matrix2) comparison = ( (expected_matrix == diff_matrix) | (np.isnan(expected_matrix) & np.isnan(diff_matrix)) @@ -315,13 +336,15 @@ def test_find_diff_of_matrices(self): self.assertEqual(True, comparison) # Check matrix subtraction of same exact matrices - self.assertEqual("unchanged", utils.find_diff_of_matrices(matrix1, matrix1)) + self.assertEqual( + "unchanged", profiler_utils.find_diff_of_matrices(matrix1, matrix1) + ) # Check matrix subtraction with different sized matrices matrix1 = [[1, 2], [1, 2]] - self.assertIsNone(utils.find_diff_of_matrices(matrix1, matrix2)) + self.assertIsNone(profiler_utils.find_diff_of_matrices(matrix1, matrix2)) # Check matrix with none - self.assertIsNone(utils.find_diff_of_matrices(matrix1, None)) + self.assertIsNone(profiler_utils.find_diff_of_matrices(matrix1, None)) def test_get_memory_size(self): """ @@ -333,27 +356,30 @@ def test_get_memory_size(self): "Currently only supports the memory size unit " r"in \['B', 'K', 'M', 'G'\]", ): - utils.get_memory_size([], unit="wrong_unit") + profiler_utils.get_memory_size([], unit="wrong_unit") # test with different data sizes - self.assertEqual(0, utils.get_memory_size([])) + self.assertEqual(0, profiler_utils.get_memory_size([])) self.assertEqual( - 33 / 1024**2, utils.get_memory_size(["This is test, a Test sentence.!!!"]) + 33 / 1024**2, + profiler_utils.get_memory_size(["This is test, a Test sentence.!!!"]), ) self.assertEqual( 33 / 1024**2, - utils.get_memory_size(["This is test,", " a Test sentence.!!!"]), + profiler_utils.get_memory_size(["This is test,", " a Test sentence.!!!"]), ) self.assertEqual( 33 / 1024**3, - utils.get_memory_size(["This is test, a Test sentence.!!!"], unit="G"), + profiler_utils.get_memory_size( + ["This is test, a Test sentence.!!!"], unit="G" + ), ) @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler", spec=BaseDataLabeler) class TestProfileDistributedMerge(unittest.TestCase): """ - Validates utils.merge_profile_list is properly working. + Validates profiler_utils.merge_profile_list is properly working. 
""" @staticmethod @@ -393,7 +419,9 @@ def test_merge_profile_list(self, mock_data_labeler, *mocks): profile_two = dp.Profiler(data[2:]) list_of_profiles = [profile_one, profile_two] - single_profile = utils.merge_profile_list(list_of_profiles=list_of_profiles) + single_profile = profiler_utils.merge_profile_list( + list_of_profiles=list_of_profiles + ) single_report = single_profile.report() self.assertEqual(1, len(single_report["data_stats"])) @@ -428,7 +456,9 @@ def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): profile_three = dp.Profiler(data[2:]) list_of_profiles = [profile_one, profile_two, profile_three] - single_profile = utils.merge_profile_list(list_of_profiles=list_of_profiles) + single_profile = profiler_utils.merge_profile_list( + list_of_profiles=list_of_profiles + ) single_report = single_profile.report() self.assertEqual(1, len(single_report["data_stats"])) @@ -439,3 +469,52 @@ def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): self.assertEqual(1, single_report["data_stats"][0]["statistics"]["min"]) self.assertEqual(60.0, single_report["data_stats"][0]["statistics"]["max"]) + + +class TestAutoMultiProcessToggle(unittest.TestCase): + + """ + Validate profile_utils.auto_multiprocess_toggle is properly working. + """ + + def test_auto_multiprocess_toggle(self): + rows_threshold = 5 + cols_threshold = 10 + + # Test for no multiprocessing for sufficiently small datasets + data = pd.DataFrame(np.random.random((2, 5))) + self.assertFalse( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + data = pd.DataFrame(np.random.random((5, 10))) + self.assertFalse( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + + # Test for multiprocessing with only rows passing threshold + data = pd.DataFrame(np.random.random((6, 10))) + self.assertTrue( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + + # Test for multiprocessing with only columns passing threshold + data = pd.DataFrame(np.random.random((5, 11))) + self.assertTrue( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + + # Test for multiprocessing with both rows and columns passing threshold + data = pd.DataFrame(np.random.random((6, 11))) + self.assertTrue( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 0d578c6e9..98b87acbe 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from dataprofiler.profilers import TextColumn, utils +from dataprofiler.profilers import TextColumn, profiler_utils from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import TextOptions @@ -584,7 +584,7 @@ def test_diff(self): "median_absolute_deviation": -0.5, "variance": profile1["variance"] - profile2["variance"], "stddev": profile1["stddev"] - profiler2["stddev"], - "vocab": utils.find_diff_of_lists_and_sets( + "vocab": profiler_utils.find_diff_of_lists_and_sets( profile1["vocab"], profile2["vocab"] ), "t-test": { diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py 
b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py index d3a47108b..c7aa8b0c5 100644 --- a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py +++ b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py @@ -4,7 +4,7 @@ import pandas as pd -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils from dataprofiler.profilers.unstructured_labeler_profile import ( UnstructuredLabelerProfile, ) diff --git a/dataprofiler/tests/profilers/utils.py b/dataprofiler/tests/profilers/utils.py index bef54763d..d85dbf380 100644 --- a/dataprofiler/tests/profilers/utils.py +++ b/dataprofiler/tests/profilers/utils.py @@ -11,7 +11,7 @@ from dataprofiler.profilers.column_profile_compilers import BaseCompiler from dataprofiler.profilers.profile_builder import BaseProfiler, StructuredColProfiler from dataprofiler.profilers.profiler_options import BaseOption -from dataprofiler.profilers.utils import find_diff_of_dicts +from dataprofiler.profilers.profiler_utils import find_diff_of_dicts from dataprofiler.tests.test_utils import patched_assert_warns diff --git a/dataprofiler/tests/test_rng_utils.py b/dataprofiler/tests/test_rng_utils.py new file mode 100644 index 000000000..6ee2ed35c --- /dev/null +++ b/dataprofiler/tests/test_rng_utils.py @@ -0,0 +1,53 @@ +"""Validates that generator intakes DATAPROFILER_SEED properly.""" +import os +import unittest +import unittest.mock + +from .. import rng_utils + + +class TestGetRandomNumberGenerator(unittest.TestCase): + """Validates get_random_number_generator() is properly working.""" + + @unittest.mock.patch.dict(os.environ, {"DATAPROFILER_SEED": "0"}) + @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=123) + def test_dataprofiler_seed_true_settings_seed_false(self): + """Test for DATAPROFILER_SEED in os.environ and settings._seed!=None.""" + with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator: + rng_utils.get_random_number_generator() + self.assertEqual(mock_np_generator.call_count, 1) + mock_np_generator.assert_called_with(123) + + @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=None) + @unittest.mock.patch.dict("os.environ", clear=True) + def test_dataprofiler_seed_false_settings_seed_true(self): + """Test for DATAPROFILER_SEED not in os.environ and settings._seed==None.""" + with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator: + rng_utils.get_random_number_generator() + self.assertEqual(mock_np_generator.call_count, 1) + mock_np_generator.assert_called_with(None) + + @unittest.mock.patch.dict(os.environ, {"DATAPROFILER_SEED": "123"}) + @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=None) + def test_dataprofiler_seed_true_settings_seed_true(self): + """Test for DATAPROFILER_SEED in os.environ and settings._seed==None.""" + with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator: + rng_utils.get_random_number_generator() + self.assertEqual(mock_np_generator.call_count, 2) + mock_np_generator.assert_called_with(123) + + @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=123) + @unittest.mock.patch.dict("os.environ", clear=True) + def test_dataprofiler_seed_false_settings_seed_false(self): + """Test for DATAPROFILER_SEED not in os.environ and settings._seed!=None.""" + with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator: + rng_utils.get_random_number_generator() + self.assertEqual(mock_np_generator.call_count, 1) + 
mock_np_generator.assert_called_with(123) + + @unittest.mock.patch.dict(os.environ, {"DATAPROFILER_SEED": "George Washington"}) + @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=None) + def test_warning_raised(self): + """Test that warning raises if seed is not an integer.""" + with self.assertWarnsRegex(RuntimeWarning, "Seed should be an integer"): + rng_utils.get_random_number_generator() diff --git a/dataprofiler/version.py b/dataprofiler/version.py index 070b4c9e3..0808d3c3e 100644 --- a/dataprofiler/version.py +++ b/dataprofiler/version.py @@ -2,7 +2,7 @@ MAJOR = 0 MINOR = 10 -MICRO = 2 +MICRO = 3 POST = None # otherwise None VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO) diff --git a/examples/add_new_model_to_data_labeler.ipynb b/examples/add_new_model_to_data_labeler.ipynb index 3f59297bc..1495e6a85 100644 --- a/examples/add_new_model_to_data_labeler.ipynb +++ b/examples/add_new_model_to_data_labeler.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "228bb2a6", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cab7a569", "metadata": {}, @@ -39,6 +41,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e90728ab", "metadata": {}, @@ -47,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3d61981c", "metadata": {}, @@ -75,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "745ed0d4", "metadata": {}, @@ -83,6 +88,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7375b0c0", "metadata": {}, @@ -105,15 +111,19 @@ "source": [ "import tensorflow as tf\n", "import numpy as np\n", - "from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel, F1Score, \\\n", - " create_glove_char, build_embd_dictionary\n", + "from dataprofiler.labelers.character_level_cnn_model import (\n", + " CharacterLevelCnnModel,\n", + " create_glove_char,\n", + " build_embd_dictionary,\n", + ")\n", "from dataprofiler.labelers.base_model import BaseModel\n", + "from dataprofiler.labelers.labeler_utils import F1Score\n", + "\n", "\n", "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", "#########################################################\n", "#########################################################\n", "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", - "\n", " # boolean if the label mapping requires the mapping for index 0 reserved\n", " requires_zero_mapping = True\n", "\n", @@ -121,26 +131,26 @@ " \"\"\"\n", " LSTM Model Initializer\n", " \"\"\"\n", - " \n", + "\n", " # parameter initialization\n", " if not parameters:\n", " parameters = {}\n", - " parameters.setdefault('max_length', 3400)\n", - " parameters.setdefault('max_char_encoding_id', 127)\n", - " parameters.setdefault('dim_embed', 64)\n", - " parameters.setdefault('size_fc', [32, 32])\n", - " parameters.setdefault('dropout', 0.1)\n", + " parameters.setdefault(\"max_length\", 3400)\n", + " parameters.setdefault(\"max_char_encoding_id\", 127)\n", + " parameters.setdefault(\"dim_embed\", 64)\n", + " parameters.setdefault(\"size_fc\", [32, 32])\n", + " parameters.setdefault(\"dropout\", 0.1)\n", " # new parameters for LSTM model\n", " #########################################################\n", " #########################################################\n", - " parameters.setdefault('size_lstm', [64])\n", - " parameters.setdefault('rec_dropout', 0.1)\n", - " parameters.setdefault('activation', \"tanh\")\n", - " 
parameters.setdefault('recurrent_activation', \"sigmoid\")\n",
+    "        parameters.setdefault(\"size_lstm\", [64])\n",
+    "        parameters.setdefault(\"rec_dropout\", 0.1)\n",
+    "        parameters.setdefault(\"activation\", \"tanh\")\n",
+    "        parameters.setdefault(\"recurrent_activation\", \"sigmoid\")\n",
     "        #########################################################\n",
     "        #########################################################\n",
-    "        parameters.setdefault('default_label', \"UNKNOWN\")\n",
-    "        parameters['pad_label'] = 'PAD'\n",
+    "        parameters.setdefault(\"default_label\", \"UNKNOWN\")\n",
+    "        parameters[\"pad_label\"] = \"PAD\"\n",
     "        self._epoch_id = 0\n",
     "\n",
     "        # reconstruct flags for model\n",
@@ -155,36 +165,66 @@
     "        present.\n",
     "        \"\"\"\n",
     "        errors = []\n",
-    "        list_of_necessary_params = ['max_length', 'max_char_encoding_id',\n",
-    "                                    'dim_embed', 'size_fc', 'dropout',\n",
-    "                                    'size_lstm', 'rec_dropout', 'activation', \n",
-    "                                    'recurrent_activation', 'default_label', \n",
-    "                                    'pad_label']\n",
+    "        list_of_necessary_params = [\n",
+    "            \"max_length\",\n",
+    "            \"max_char_encoding_id\",\n",
+    "            \"dim_embed\",\n",
+    "            \"size_fc\",\n",
+    "            \"dropout\",\n",
+    "            \"size_lstm\",\n",
+    "            \"rec_dropout\",\n",
+    "            \"activation\",\n",
+    "            \"recurrent_activation\",\n",
+    "            \"default_label\",\n",
+    "            \"pad_label\",\n",
+    "        ]\n",
     "        # Make sure the necessary parameters are present and valid.\n",
     "        for param in parameters:\n",
-    "            if param in ['max_length', 'max_char_encoding_id', 'dim_embed',\n",
-    "                         'size_conv']:\n",
-    "                if not isinstance(parameters[param], (int, float)) \\\n",
-    "                        or parameters[param] < 0:\n",
-    "                    errors.append(param + \" must be a valid integer or float \"\n",
-    "                                          \"greater than 0.\")\n",
-    "            elif param in ['dropout', 'rec_dropout']:  # additional check for rec_dropout\n",
-    "                if not isinstance(parameters[param], (int, float)) \\\n",
-    "                        or parameters[param] < 0 or parameters[param] > 1:\n",
-    "                    errors.append(param + \" must be a valid integer or float \"\n",
-    "                                          \"from 0 to 1.\")\n",
-    "            elif param == 'size_fc' or param == 'size_lstm':  # additional check for size_lstm\n",
-    "                if not isinstance(parameters[param], list) \\\n",
-    "                        or len(parameters[param]) == 0:\n",
-    "                    errors.append(param + \" must be a non-empty list of \"\n",
-    "                                          \"integers.\")\n",
+    "            if param in [\n",
+    "                \"max_length\",\n",
+    "                \"max_char_encoding_id\",\n",
+    "                \"dim_embed\",\n",
+    "                \"size_conv\",\n",
+    "            ]:\n",
+    "                if (\n",
+    "                    not isinstance(parameters[param], (int, float))\n",
+    "                    or parameters[param] < 0\n",
+    "                ):\n",
+    "                    errors.append(\n",
+    "                        param + \" must be a valid integer or float \" \"greater than 0.\"\n",
+    "                    )\n",
+    "            elif param in [\n",
+    "                \"dropout\",\n",
+    "                \"rec_dropout\",\n",
+    "            ]:  # additional check for rec_dropout\n",
+    "                if (\n",
+    "                    not isinstance(parameters[param], (int, float))\n",
+    "                    or parameters[param] < 0\n",
+    "                    or parameters[param] > 1\n",
+    "                ):\n",
+    "                    errors.append(\n",
+    "                        param + \" must be a valid integer or float \" \"from 0 to 1.\"\n",
+    "                    )\n",
+    "            elif (\n",
+    "                param == \"size_fc\" or param == \"size_lstm\"\n",
+    "            ):  # additional check for size_lstm\n",
+    "                if (\n",
+    "                    not isinstance(parameters[param], list)\n",
+    "                    or len(parameters[param]) == 0\n",
+    "                ):\n",
+    "                    errors.append(param + \" must be a non-empty list of \" \"integers.\")\n",
     "                else:\n",
     "                    for item in parameters[param]:\n",
     "                        if not isinstance(item, int):\n",
-    "                            errors.append(param + \" must be a non-empty \"\n",
-    "                                                  \"list of integers.\")\n",
+    "                            errors.append(\n",
+    "                                param + \" must be a non-empty \" \"list of integers.\"\n",
+    "                            )\n",
     "                            break\n",
-    "            elif param in ['default_label', 'activation', 'recurrent_activation']:  # additional check for activation and recurrent_activation\n",
+    "            elif param in [\n",
+    "                \"default_label\",\n",
+    "                \"activation\",\n",
+    "                \"recurrent_activation\",\n",
+    "            ]:  # additional check for activation and recurrent_activation\n",
     "                if not isinstance(parameters[param], str):\n",
     "                    error = str(param) + \" must be a string.\"\n",
     "                    errors.append(error)\n",
@@ -194,7 +234,7 @@
     "            if param not in list_of_necessary_params:\n",
     "                errors.append(param + \" is not an accepted parameter.\")\n",
     "        if errors:\n",
-    "            raise ValueError('\\n'.join(errors))\n",
+    "            raise ValueError(\"\\n\".join(errors))\n",
     "\n",
     "    def _construct_model(self):\n",
     "        \"\"\"\n",
@@ -204,41 +244,44 @@
     "        :return: None\n",
     "        \"\"\"\n",
     "        num_labels = self.num_labels\n",
-    "        default_ind = self.label_mapping[self._parameters['default_label']]\n",
+    "        default_ind = self.label_mapping[self._parameters[\"default_label\"]]\n",
     "\n",
     "        # Reset model\n",
     "        tf.keras.backend.clear_session()\n",
     "\n",
     "        # generate glove embedding\n",
-    "        create_glove_char(self._parameters['dim_embed'])\n",
+    "        create_glove_char(self._parameters[\"dim_embed\"])\n",
     "\n",
     "        # generate model\n",
     "        self._model = tf.keras.models.Sequential()\n",
     "\n",
     "        # default parameters\n",
-    "        max_length = self._parameters['max_length']\n",
-    "        max_char_encoding_id = self._parameters['max_char_encoding_id']\n",
+    "        max_length = self._parameters[\"max_length\"]\n",
+    "        max_char_encoding_id = self._parameters[\"max_char_encoding_id\"]\n",
     "\n",
     "        # Encoding layer\n",
     "        def encoding_function(input_str):\n",
     "            char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n",
-    "                input_str, max_char_encoding_id, max_length)\n",
+    "                input_str, max_char_encoding_id, max_length\n",
+    "            )\n",
     "            return char_in_vector\n",
     "\n",
     "        self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n",
     "\n",
     "        self._model.add(\n",
-    "            tf.keras.layers.Lambda(encoding_function,\n",
-    "                                   output_shape=tuple([max_length])))\n",
+    "            tf.keras.layers.Lambda(encoding_function, output_shape=tuple([max_length]))\n",
+    "        )\n",
     "\n",
     "        # Create a pre-trained weight matrix\n",
     "        # character encoding indices range from 0 to max_char_encoding_id,\n",
     "        # we add one extra index for out-of-vocabulary character\n",
     "        embed_file = os.path.join(\n",
-    "            \"../dataprofiler/labelers\", \"embeddings/glove-reduced-{}D.txt\".format(\n",
-    "                self._parameters['dim_embed']))\n",
-    "        embedding_matrix = np.zeros((max_char_encoding_id + 2,\n",
-    "                                     self._parameters['dim_embed']))\n",
+    "            \"../dataprofiler/labelers\",\n",
+    "            \"embeddings/glove-reduced-{}D.txt\".format(self._parameters[\"dim_embed\"]),\n",
+    "        )\n",
+    "        embedding_matrix = np.zeros(\n",
+    "            (max_char_encoding_id + 2, self._parameters[\"dim_embed\"])\n",
+    "        )\n",
     "        embedding_dict = build_embd_dictionary(embed_file)\n",
     "\n",
     "        input_shape = tuple([max_length])\n",
@@ -247,70 +290,74 @@
     "            if chr(ascii_num) in embedding_dict:\n",
     "                embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n",
     "\n",
-    "        self._model.add(tf.keras.layers.Embedding(\n",
-    "            max_char_encoding_id + 2,\n",
-    "            self._parameters['dim_embed'],\n",
-    "            weights=[embedding_matrix],\n",
-    "            input_length=input_shape[0],\n",
-    "            trainable=True))\n",
-    "        \n",
+    "        self._model.add(\n",
+    "            tf.keras.layers.Embedding(\n",
+    "                max_char_encoding_id + 2,\n",
+    "                self._parameters[\"dim_embed\"],\n",
+    "                weights=[embedding_matrix],\n",
+    "                input_length=input_shape[0],\n",
+    "                trainable=True,\n",
+    "            )\n",
+    "        )\n",
+    "\n",
     "        # Add the lstm layers\n",
     "        #########################################################\n",
     "        #########################################################\n",
-    "        for size in self._parameters['size_lstm']:\n",
+    "        for size in self._parameters[\"size_lstm\"]:\n",
     "            self._model.add(\n",
-    "                tf.keras.layers.LSTM(units=size, \n",
-    "                                     recurrent_dropout=self._parameters['rec_dropout'], \n",
-    "                                     activation=self._parameters['activation'],\n",
-    "                                     recurrent_activation=self._parameters['recurrent_activation'],\n",
-    "                                     return_sequences=True))\n",
-    "            if self._parameters['dropout']:\n",
-    "                self._model.add(tf.keras.layers.Dropout(self._parameters['dropout']))\n",
+    "                tf.keras.layers.LSTM(\n",
+    "                    units=size,\n",
+    "                    recurrent_dropout=self._parameters[\"rec_dropout\"],\n",
+    "                    activation=self._parameters[\"activation\"],\n",
+    "                    recurrent_activation=self._parameters[\"recurrent_activation\"],\n",
+    "                    return_sequences=True,\n",
+    "                )\n",
+    "            )\n",
+    "            if self._parameters[\"dropout\"]:\n",
+    "                self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n",
     "        #########################################################\n",
     "        #########################################################\n",
     "\n",
     "        # Add the fully connected layers\n",
-    "        for size in self._parameters['size_fc']:\n",
-    "            self._model.add(\n",
-    "                tf.keras.layers.Dense(units=size, activation='relu'))\n",
-    "            if self._parameters['dropout']:\n",
-    "                self._model.add(\n",
-    "                    tf.keras.layers.Dropout(self._parameters['dropout']))\n",
+    "        for size in self._parameters[\"size_fc\"]:\n",
+    "            self._model.add(tf.keras.layers.Dense(units=size, activation=\"relu\"))\n",
+    "            if self._parameters[\"dropout\"]:\n",
+    "                self._model.add(tf.keras.layers.Dropout(self._parameters[\"dropout\"]))\n",
     "\n",
     "        # Add the final Softmax layer\n",
-    "        self._model.add(\n",
-    "            tf.keras.layers.Dense(num_labels, activation='softmax'))\n",
+    "        self._model.add(tf.keras.layers.Dense(num_labels, activation=\"softmax\"))\n",
     "\n",
     "        # Output the model into a .pb file for TensorFlow\n",
     "        argmax_layer = tf.keras.backend.argmax(self._model.output)\n",
     "\n",
     "        # Create confidence layers\n",
     "        final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n",
-    "            num_labels, threshold=0.0, default_ind=default_ind)\n",
+    "            num_labels, threshold=0.0, default_ind=default_ind\n",
+    "        )\n",
     "\n",
-    "        argmax_outputs = self._model.outputs + \\\n",
-    "                         [argmax_layer,\n",
-    "                          final_predicted_layer(argmax_layer, self._model.output)]\n",
+    "        argmax_outputs = self._model.outputs + [\n",
+    "            argmax_layer,\n",
+    "            final_predicted_layer(argmax_layer, self._model.output),\n",
+    "        ]\n",
     "        self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n",
     "\n",
     "        # Compile the model\n",
-    "        softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]\n",
+    "        softmax_output_layer_name = self._model.outputs[0].name.split(\"/\")[0]\n",
     "        losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n",
     "\n",
     "        # use f1 score metric\n",
-    "        f1_score_training = F1Score(num_classes=num_labels, average='micro')\n",
-    "        metrics = {softmax_output_layer_name: ['acc', f1_score_training]}\n",
+    "        f1_score_training = F1Score(num_classes=num_labels, average=\"micro\")\n",
+    "        metrics = {softmax_output_layer_name: [\"acc\", f1_score_training]}\n",
     "\n",
-    "        self._model.compile(loss=losses,\n",
-    "                            optimizer=\"adam\",\n",
-    "                            metrics=metrics)\n",
+    "        self._model.compile(loss=losses, optimizer=\"adam\", metrics=metrics)\n",
     "\n",
     "        self._epoch_id = 0\n",
     "        self._model_num_labels = num_labels\n",
-    "        self._model_default_ind = default_ind\n"
+    "        self._model_default_ind = default_ind"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "d66bd25c",
    "metadata": {},
@@ -319,6 +366,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "479f407a",
    "metadata": {},
@@ -365,6 +413,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "14b78c69",
    "metadata": {},
@@ -406,6 +455,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "cc60ff8a",
    "metadata": {},
diff --git a/examples/structured_profilers.ipynb b/examples/structured_profilers.ipynb
index 10f9eddd2..b6a4409c9 100644
--- a/examples/structured_profilers.ipynb
+++ b/examples/structured_profilers.ipynb
@@ -245,7 +245,7 @@
     "\n",
     "Below, let's remove the histogram and increase the number of samples to the labeler component (1,000 samples). \n",
     "\n",
-    "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)."
+    "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)."
    ]
   },
   {
diff --git a/examples/unstructured_profilers.ipynb b/examples/unstructured_profilers.ipynb
index 82169af5a..9ab754cc7 100644
--- a/examples/unstructured_profilers.ipynb
+++ b/examples/unstructured_profilers.ipynb
@@ -178,7 +178,7 @@
     "\n",
     "Below, let's remove the vocab count and set the stop words. \n",
     "\n",
-    "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)."
+    "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)."
    ]
   },
   {
diff --git a/requirements.txt b/requirements.txt
index 994ec78de..7c8aa0b99 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,9 +10,10 @@ fastavro>=1.0.0.post1
 python-snappy>=0.5.4
 charset-normalizer>=1.3.6
 psutil>=4.0.0
-scipy>=1.4.1,<1.11.0
+scipy>=1.10.0
 requests>=2.28.1
 networkx>=2.5.1
 typing-extensions>=3.10.0.2
 HLL>=2.0.3
 datasketches>=4.1.0
+packaging>=23.0
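
For reference, the profiler-notebook cells touched above configure profiling through `ProfilerOptions`. A minimal sketch of the configuration those cells describe (disabling the histogram, sending 1,000 samples to the labeler), assuming the documented `dataprofiler` options API and a hypothetical input CSV:

    import dataprofiler as dp

    data = dp.Data("my_dataset.csv")  # hypothetical example file

    options = dp.ProfilerOptions()
    options.set(
        {
            "histogram_and_quantiles.is_enabled": False,  # remove the histogram
            "data_labeler.max_sample_size": 1000,  # rows sampled for the labeler
        }
    )

    profile = dp.Profiler(data, options=options)
    report = profile.report(report_options={"output_format": "compact"})

Likewise, the labeler-notebook cell above swaps the example model's middle layers for an LSTM stack. A self-contained sketch of just that stack, using defaults mirroring the cell's `setdefault` calls (the `dropout` value here is an assumption) and a placeholder input in place of the notebook's character-encoding and GloVe embedding layers:

    import tensorflow as tf

    # Assumed hyperparameters, mirroring the notebook cell's defaults.
    params = {
        "size_lstm": [64],
        "rec_dropout": 0.1,
        "activation": "tanh",
        "recurrent_activation": "sigmoid",
        "dropout": 0.1,  # assumed; the notebook inherits this elsewhere
    }

    model = tf.keras.models.Sequential()
    # Placeholder input: sequences of 64-dim vectors stand in for the
    # notebook's char-encoding Lambda + GloVe embedding layers.
    model.add(tf.keras.layers.Input(shape=(None, 64)))
    for size in params["size_lstm"]:
        model.add(
            tf.keras.layers.LSTM(
                units=size,
                recurrent_dropout=params["rec_dropout"],
                activation=params["activation"],
                recurrent_activation=params["recurrent_activation"],
                return_sequences=True,  # keep per-character outputs for labeling
            )
        )
        if params["dropout"]:
            model.add(tf.keras.layers.Dropout(params["dropout"]))
    model.summary()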