diff --git a/.gitignore b/.gitignore index 0a12bc7be..2f5d5259b 100644 --- a/.gitignore +++ b/.gitignore @@ -134,6 +134,3 @@ venv.bak/ env3/ *.bak - -#Pipfiles -Pipfile* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd1768f0b..a2047e1bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: # Black: format Python code # https://github.com/psf/black/blob/master/.pre-commit-hooks.yaml - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 24.10.0 hooks: - id: black types: [file, python] @@ -38,7 +38,7 @@ repos: # Mypy: Optional static type checking # https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.982 + rev: v1.11.2 hooks: - id: mypy exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/) @@ -108,16 +108,19 @@ repos: rev: "0.48" hooks: - id: check-manifest - additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas', - 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', - 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', - 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3'] + additional_dependencies: + [ + 'matplotlib', 'h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas', + 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', + 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', + 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3', + ] # Pyupgrade - standardize and modernize Python syntax for newer versions of the language - repo: https://github.com/asottile/pyupgrade rev: v3.3.0 hooks: - id: pyupgrade - args: ["--py38-plus"] + args: ["--py39-plus"] # Autoflake - cleanup unused variables and imports - repo: https://github.com/PyCQA/autoflake rev: v2.0.0 diff --git a/MANIFEST.in b/MANIFEST.in index 0ace6ebe9..9a62e405e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,4 +17,4 @@ recursive-include resources *.json recursive-include resources *.pb recursive-include resources *.py -recursive-include dataprofiler/labelers/embeddings/ *.txt +recursive-include dataprofiler/labelers/embeddings/*.txt diff --git a/Makefile b/Makefile index f659ae309..5867b94be 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,19 @@ -setup: requirements.txt requirements-dev.txt requirements-test.txt - python3 -m venv venv +PYTHON_VERSION ?= python3.9 +VENV_DIR ?= venv +REQ_FILES := requirements.txt requirements-dev.txt requirements-test.txt requirements-ml.txt requirements-reports.txt - . venv/bin/activate && \ - pip3 install -r requirements.txt && \ - pip3 install -r requirements-dev.txt && \ - pip3 install -r requirements-ml.txt && \ - pip3 install -r requirements-reports.txt && \ - pip3 install -r requirements-test.txt && \ +check-python: + @$(PYTHON_VERSION) --version | grep -E "Python (3\.9|3\.10|3\.11)" || \ + (echo "Python 3.9, 3.10, or 3.11 is required. Ensure $(PYTHON_VERSION) is installed and try again." && exit 1) + +setup: check-python $(REQ_FILES) + @$(PYTHON_VERSION) -m venv $(VENV_DIR) + . $(VENV_DIR)/bin/activate && \ + pip3 install --no-cache-dir -r requirements-ml.txt && \ + pip3 install --no-cache-dir -r requirements.txt && \ + pip3 install --no-cache-dir -r requirements-dev.txt && \ + pip3 install --no-cache-dir -r requirements-reports.txt && \ + pip3 install --no-cache-dir -r requirements-test.txt && \ pip3 install -e . 
&& \ pre-commit install && \ pre-commit run @@ -15,4 +22,15 @@ format: pre-commit run test: - DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py" + DATAPROFILER_SEED=0 $(VENV_DIR)/bin/python -m unittest discover -p "test*.py" + +clean: + rm -rf .pytest_cache __pycache__ + +help: + @echo "Makefile Commands:" + @echo " setup - Set up the virtual environment with Python $(PYTHON_VERSION)" + @echo " format - Format the code using pre-commit hooks" + @echo " test - Run unit tests with unittest" + @echo " clean - Remove temporary files (caches), but keep the virtual environment" + @echo " help - Display this help message" diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py index 5f218bd85..f23cf6494 100644 --- a/dataprofiler/__init__.py +++ b/dataprofiler/__init__.py @@ -1,4 +1,5 @@ """Package for dataprofiler.""" + from . import settings from .data_readers.data import Data from .dp_logging import get_logger, set_verbosity diff --git a/dataprofiler/_typing.py b/dataprofiler/_typing.py index fa362d1b1..399d3ad43 100644 --- a/dataprofiler/_typing.py +++ b/dataprofiler/_typing.py @@ -1,9 +1,10 @@ """Contains typing aliases.""" -from typing import Dict, List, NewType, Union + +from typing import NewType, Union import numpy as np import pandas as pd DataArray = Union[pd.DataFrame, pd.Series, np.ndarray] -JSONType = Union[str, int, float, bool, None, List, Dict] +JSONType = Union[str, int, float, bool, None, list, dict] Url = NewType("Url", str) diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py index 720b9d1f7..79769f800 100644 --- a/dataprofiler/data_readers/avro_data.py +++ b/dataprofiler/data_readers/avro_data.py @@ -1,6 +1,7 @@ """Contains class for saving and loading spreadsheet data.""" + from io import BytesIO, StringIO -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import fastavro @@ -20,7 +21,7 @@ def __init__( self, input_file_path: Optional[str] = None, data: Optional[Any] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ) -> None: """ Initialize Data class for loading datasets of type AVRO. @@ -60,7 +61,7 @@ def file_encoding(self, value: Any) -> None: """ pass - def _load_data_from_file(self, input_file_path: str) -> List: + def _load_data_from_file(self, input_file_path: str) -> list: """Load data from file.""" with FileOrBufferHandler(input_file_path, "rb") as input_file: # Currently, string reading with 'r' option has the unicode issue, @@ -68,14 +69,14 @@ def _load_data_from_file(self, input_file_path: str) -> List: # some special compression codec, e.g., snappy. Then, binary mode # reading is currently used to get the dict-formatted lines. df_reader = fastavro.reader(input_file) - lines: List = list() + lines: list = list() for line in df_reader: lines.append(line) return lines @classmethod def is_match( - cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None + cls, file_path: Union[str, StringIO, BytesIO], options: Optional[dict] = None ) -> bool: """ Test the given file to check if the file has valid AVRO format or not. @@ -103,7 +104,7 @@ def is_match( return is_valid_avro @classmethod - def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict: + def _get_nested_key(cls, dict_line: dict, nested_key: dict) -> dict: """ Update nested keys from a dictionary and the current nested key. 
@@ -131,7 +132,7 @@ def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict: return nested_key @classmethod - def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict: + def _get_nested_keys_from_dicts(cls, dicts: list[dict]) -> dict: """ Extract nested keys from a list of dictionaries. @@ -143,13 +144,13 @@ def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict: :type dicts: list(dict) :return: a dictionary containing nested keys """ - nested_keys: Dict = {} + nested_keys: dict = {} for dict_line in dicts: nested_keys = cls._get_nested_key(dict_line, nested_keys) return nested_keys @classmethod - def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict: + def _get_schema_avro(cls, nested_keys: dict, schema_avro: dict) -> dict: """ Update avro schema from the nested keys and the current avro schema. @@ -190,7 +191,7 @@ def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict: if type(value) is dict: # here, the null option to specify keys not required # for every lines - schema_avro_temp: Dict[str, Any] = { + schema_avro_temp: dict[str, Any] = { "name": key, "type": [{"name": key, "type": "record", "fields": []}, "null"], } diff --git a/dataprofiler/data_readers/base_data.py b/dataprofiler/data_readers/base_data.py index 27d8d5deb..74c695755 100644 --- a/dataprofiler/data_readers/base_data.py +++ b/dataprofiler/data_readers/base_data.py @@ -1,9 +1,11 @@ """Contains abstract class for data loading and saving.""" + import locale import sys from collections import OrderedDict +from collections.abc import Generator from io import StringIO -from typing import Any, Dict, Generator, List, Optional, Union +from typing import Any, Optional, Union import numpy as np import pandas as pd @@ -21,7 +23,7 @@ class BaseData: info: Optional[str] = None def __init__( - self, input_file_path: Optional[str], data: Any, options: Dict + self, input_file_path: Optional[str], data: Any, options: dict ) -> None: """ Initialize Base class for loading a dataset. @@ -42,7 +44,7 @@ def __init__( # Public properties self.input_file_path = input_file_path - self.options: Optional[Dict] = options + self.options: Optional[dict] = options # 'Private' properties # _data_formats: dict containing data_formats (key) and function @@ -56,10 +58,10 @@ def __init__( # constant across function calls. 
# _tmp_file_name: randomly set variables for file name usable by system # _file_encoding: contains the suggested file encoding for reading data - self._data_formats: Dict[str, Any] = OrderedDict() + self._data_formats: dict[str, Any] = OrderedDict() self._selected_data_format: Optional[str] = None self._data: Optional[Any] = data - self._batch_info: Dict = dict(perm=list(), iter=0) + self._batch_info: dict = dict(perm=list(), iter=0) self._tmp_file_name: Optional[str] = None self._file_encoding: Optional[str] = options.get("encoding", None) @@ -137,7 +139,7 @@ def file_encoding(self, value: str) -> None: self._file_encoding = value @staticmethod - def _check_and_return_options(options: Optional[Dict]) -> Dict: + def _check_and_return_options(options: Optional[dict]) -> dict: """Return options or raise error.""" if not options: options = dict() @@ -151,7 +153,7 @@ def _load_data(self, data: Optional[Any] = None) -> None: def get_batch_generator( self, batch_size: int - ) -> Generator[Union[pd.DataFrame, List], None, None]: + ) -> Generator[Union[pd.DataFrame, list], None, None]: """Get batch generator.""" data_length = len(self.data) indices = np.random.permutation(data_length) @@ -162,12 +164,12 @@ def get_batch_generator( yield list(self.data[k] for k in indices[i : i + batch_size]) @classmethod - def is_match(cls, input_file_path: str, options: Optional[Dict]) -> bool: + def is_match(cls, input_file_path: str, options: Optional[dict]) -> bool: """Return true if match, false otherwise.""" raise NotImplementedError() def reload( - self, input_file_path: Optional[str], data: Any, options: Optional[Dict] + self, input_file_path: Optional[str], data: Any, options: Optional[dict] ) -> None: """ Reload the data class with a new dataset. diff --git a/dataprofiler/data_readers/csv_data.py b/dataprofiler/data_readers/csv_data.py index 7e13d4075..5cc1fe299 100644 --- a/dataprofiler/data_readers/csv_data.py +++ b/dataprofiler/data_readers/csv_data.py @@ -1,10 +1,11 @@ """Contains class that saves and loads spreadsheet data.""" + import csv import random import re from collections import Counter from io import StringIO -from typing import Dict, List, Optional, Tuple, Union, cast +from typing import Optional, Union, cast import numpy as np import pandas as pd @@ -26,7 +27,7 @@ def __init__( self, input_file_path: Optional[str] = None, data: Optional[pd.DataFrame] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ): """ Initialize Data class for loading datasets of type CSV. 
@@ -82,7 +83,7 @@ def __init__( self._selected_data_format: str = options.get("data_format", "dataframe") self._delimiter: Optional[str] = options.get("delimiter", None) self._quotechar: Optional[str] = options.get("quotechar", None) - self._selected_columns: List[str] = options.get("selected_columns", list()) + self._selected_columns: list[str] = options.get("selected_columns", list()) self._header: Optional[Union[str, int]] = options.get("header", "auto") self._checked_header: bool = "header" in options and self._header != "auto" self._default_delimiter: str = "," @@ -97,7 +98,7 @@ def __init__( self._quotechar = self._default_quotechar @property - def selected_columns(self) -> List[str]: + def selected_columns(self) -> list[str]: """Return selected columns.""" return self._selected_columns @@ -127,7 +128,7 @@ def is_structured(self) -> bool: return self.data_format == "dataframe" @staticmethod - def _check_and_return_options(options: Optional[Dict]) -> Dict: + def _check_and_return_options(options: Optional[dict]) -> dict: """ Ensure options are valid inputs to the data reader. @@ -184,9 +185,9 @@ def _check_and_return_options(options: Optional[Dict]) -> Dict: def _guess_delimiter_and_quotechar( data_as_str: str, quotechar: Optional[str] = None, - preferred: List[str] = [",", "\t"], - omitted: List[str] = ['"', "'"], - ) -> Tuple[Optional[str], Optional[str]]: + preferred: list[str] = [",", "\t"], + omitted: list[str] = ['"', "'"], + ) -> tuple[Optional[str], Optional[str]]: r""" Automatically check for what delimiter exists in a text document. @@ -207,7 +208,7 @@ def _guess_delimiter_and_quotechar( vocab = Counter(data_as_str) if "\n" in vocab: vocab.pop("\n") - omitted_list: List[str] = omitted + omitted_list: list[str] = omitted if quotechar is not None: omitted_list = omitted + [quotechar] for char in omitted_list: @@ -384,7 +385,7 @@ def _guess_header_row( quotechar = '"' # Determine type for every cell - header_check_list: List[List[str]] = [] + header_check_list: list[list[str]] = [] only_string_flag = True # Requires additional checks for row in data_as_str.split("\n"): @@ -403,7 +404,7 @@ def _guess_header_row( # Flags differences in types between each row (true/false) potential_header = header_check_list[0] - differences: List[List[bool]] = [] + differences: list[list[bool]] = [] for i in range(0, len(header_check_list)): differences.append([]) @@ -612,7 +613,7 @@ def _load_data_from_file(self, input_file_path: str) -> pd.DataFrame: encoding=self.file_encoding, ) - def _get_data_as_records(self, data: pd.DataFrame) -> List[str]: + def _get_data_as_records(self, data: pd.DataFrame) -> list[str]: """Return data as records.""" sep = self.delimiter if self.delimiter else self._default_delimiter quote = self.quotechar if self.quotechar else self._default_quotechar @@ -621,7 +622,7 @@ def _get_data_as_records(self, data: pd.DataFrame) -> List[str]: return super()._get_data_as_records(data) @classmethod - def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool: + def is_match(cls, file_path: str, options: Optional[dict] = None) -> bool: """ Check if first 1000 lines of given file has valid delimited format. @@ -745,7 +746,7 @@ def reload( self, input_file_path: Optional[str] = None, data: Optional[pd.DataFrame] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ): """ Reload the data class with a new dataset. 
diff --git a/dataprofiler/data_readers/data.py b/dataprofiler/data_readers/data.py index 8a3e6d94a..e2dcfe8d4 100644 --- a/dataprofiler/data_readers/data.py +++ b/dataprofiler/data_readers/data.py @@ -1,7 +1,7 @@ """Contains factory class reading various kinds of data.""" from io import BytesIO -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union from .. import dp_logging from .avro_data import AVROData @@ -18,7 +18,7 @@ class Data: """Factory class for reading various kinds of data.""" - data_classes: List[Dict] = [ + data_classes: list[dict] = [ dict(data_class=JSONData, kwargs=dict()), dict(data_class=GraphData, kwargs=dict()), dict(data_class=CSVData, kwargs=dict()), @@ -32,7 +32,7 @@ def __new__( input_file_path: Optional[Union[str, BytesIO]] = None, data: Optional[Any] = None, data_type: Optional[str] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ): """ Create Factory Data object. diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 611d25dc3..4065a6b42 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,4 +1,5 @@ """Contains functions for data readers.""" + import json import logging import os @@ -6,21 +7,12 @@ import re import urllib from collections import OrderedDict +from collections.abc import Generator, Iterator from io import BytesIO, StringIO, TextIOWrapper from itertools import islice from math import floor, log, log1p -from typing import ( - Any, - Dict, - Generator, - Iterator, - List, - Optional, - Pattern, - Tuple, - Union, - cast, -) +from re import Pattern +from typing import Any, Optional, Union, cast import boto3 import botocore @@ -39,7 +31,7 @@ logger = dp_logging.get_child_logger(__name__) -def data_generator(data_list: List[str]) -> Generator[str, None, None]: +def data_generator(data_list: list[str]) -> Generator[str, None, None]: """ Take a list and return a generator on the list. @@ -122,10 +114,10 @@ def unicode_to_str(data: JSONType, ignore_dicts: bool = False) -> JSONType: def json_to_dataframe( - json_lines: List[JSONType], - selected_columns: Optional[List[str]] = None, + json_lines: list[JSONType], + selected_columns: Optional[list[str]] = None, read_in_string: bool = False, -) -> Tuple[pd.DataFrame, pd.Series]: +) -> tuple[pd.DataFrame, pd.Series]: """ Take list of json objects and return dataframe representing json list. @@ -165,9 +157,9 @@ def json_to_dataframe( def read_json_df( data_generator: Generator, - selected_columns: Optional[List[str]] = None, + selected_columns: Optional[list[str]] = None, read_in_string: bool = False, -) -> Tuple[pd.DataFrame, pd.Series]: +) -> tuple[pd.DataFrame, pd.Series]: """ Return an iterator that returns a chunk of data as dataframe in each call. @@ -193,7 +185,7 @@ def read_json_df( each call as well as original dtypes of the dataframe columns. :rtype: tuple(pd.DataFrame, pd.Series(dtypes)) """ - lines: List[JSONType] = list() + lines: list[JSONType] = list() k = 0 while True: try: @@ -222,9 +214,9 @@ def read_json_df( def read_json( data_generator: Iterator, - selected_columns: Optional[List[str]] = None, + selected_columns: Optional[list[str]] = None, read_in_string: bool = False, -) -> List[JSONType]: +) -> list[JSONType]: """ Return the lines of a json. 
@@ -249,7 +241,7 @@ def read_json( :return: returns the lines of a json file :rtype: list(dict) """ - lines: List[JSONType] = list() + lines: list[JSONType] = list() k = 0 while True: try: @@ -372,7 +364,7 @@ def read_csv_df( delimiter: Optional[str], header: Optional[int], sample_nrows: Optional[int] = None, - selected_columns: List[str] = [], + selected_columns: list[str] = [], read_in_string: bool = False, encoding: Optional[str] = "utf-8", ) -> pd.DataFrame: @@ -393,7 +385,7 @@ def read_csv_df( :return: Iterator :rtype: pd.DataFrame """ - args: Dict[str, Any] = { + args: dict[str, Any] = { "delimiter": delimiter, "header": header, "iterator": True, @@ -476,9 +468,9 @@ def convert_unicode_col_to_utf8(input_df: pd.DataFrame) -> pd.DataFrame: def sample_parquet( file_path: str, sample_nrows: int, - selected_columns: Optional[List[str]] = None, + selected_columns: Optional[list[str]] = None, read_in_string: bool = False, -) -> Tuple[pd.DataFrame, pd.Series]: +) -> tuple[pd.DataFrame, pd.Series]: """ Read parquet file, sample specified number of rows from it and return a data frame. @@ -521,9 +513,9 @@ def sample_parquet( def read_parquet_df( file_path: str, sample_nrows: Optional[int] = None, - selected_columns: Optional[List[str]] = None, + selected_columns: Optional[list[str]] = None, read_in_string: bool = False, -) -> Tuple[pd.DataFrame, pd.Series]: +) -> tuple[pd.DataFrame, pd.Series]: """ Return an iterator that returns one row group each time. @@ -569,7 +561,7 @@ def read_parquet_df( def read_text_as_list_of_strs( file_path: str, encoding: Optional[str] = None -) -> List[str]: +) -> list[str]: """ Return list of strings relative to the chunk size. @@ -744,7 +736,7 @@ def find_nth_loc( search_query: Optional[str] = None, n: int = 0, ignore_consecutive: bool = True, -) -> Tuple[int, int]: +) -> tuple[int, int]: """ Search string via search_query and return nth index in which query occurs. @@ -877,7 +869,7 @@ def is_valid_url(url_as_string: Any) -> TypeGuard[Url]: return all([result.scheme, result.netloc]) -def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO: +def url_to_bytes(url_as_string: Url, options: dict) -> BytesIO: """ Read in URL and converts it to a byte stream. 
diff --git a/dataprofiler/data_readers/filepath_or_buffer.py b/dataprofiler/data_readers/filepath_or_buffer.py index 56c21e28e..f5ee552c6 100644 --- a/dataprofiler/data_readers/filepath_or_buffer.py +++ b/dataprofiler/data_readers/filepath_or_buffer.py @@ -1,6 +1,7 @@ """Contains functions and classes for handling filepaths and buffers.""" + from io import BytesIO, StringIO, TextIOWrapper -from typing import IO, Any, Optional, Type, Union, cast +from typing import IO, Any, Optional, Union, cast from typing_extensions import TypeGuard @@ -54,7 +55,7 @@ def __init__( self.seek_whence: int = seek_whence self._encoding: Optional[str] = encoding self.original_type: Union[ - Type[str], Type[StringIO], Type[BytesIO], Type[IO] + type[str], type[StringIO], type[BytesIO], type[IO] ] = type(filepath_or_buffer) self._is_wrapped: bool = False diff --git a/dataprofiler/data_readers/graph_data.py b/dataprofiler/data_readers/graph_data.py index 337408a68..be49e3c00 100644 --- a/dataprofiler/data_readers/graph_data.py +++ b/dataprofiler/data_readers/graph_data.py @@ -1,6 +1,7 @@ """Contains class for identifying, reading, and loading graph data.""" + import csv -from typing import Dict, List, Optional, Union, cast +from typing import Optional, Union, cast import networkx as nx @@ -19,7 +20,7 @@ def __init__( self, input_file_path: Optional[str] = None, data: Optional[nx.Graph] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ) -> None: """ Initialize Data class for identifying, reading, and loading graph data. @@ -64,14 +65,14 @@ def __init__( self._source_node: Optional[int] = options.get("source_node", None) self._destination_node: Optional[int] = options.get("destination_node", None) - self._target_keywords: List[str] = options.get( + self._target_keywords: list[str] = options.get( "target_keywords", ["target", "destination", "dst"] ) - self._source_keywords: List[str] = options.get( + self._source_keywords: list[str] = options.get( "source_keywords", ["source", "src", "origin"] ) - self._graph_keywords: List[str] = options.get("graph_keywords", ["node"]) - self._column_names: Optional[List[str]] = options.get("column_names", None) + self._graph_keywords: list[str] = options.get("graph_keywords", ["node"]) + self._column_names: Optional[list[str]] = options.get("column_names", None) self._delimiter: Optional[str] = options.get("delimiter", None) self._quotechar: Optional[str] = options.get("quotechar", None) self._header: Optional[Union[str, int]] = options.get("header", "auto") @@ -82,7 +83,7 @@ def __init__( @classmethod def _find_target_string_in_column( - self, column_names: List[str], keyword_list: List[str] + self, column_names: list[str], keyword_list: list[str] ) -> int: """Find out if col name contains keyword that could refer to target node col.""" column_name_symbols = ["_", ".", "-"] @@ -116,9 +117,9 @@ def csv_column_names( header: Optional[int], delimiter: Optional[str], encoding: str = "utf-8", - ) -> List[str]: + ) -> list[str]: """Fetch a list of column names from the csv file.""" - column_names: List[str] = [] + column_names: list[str] = [] if delimiter is None: delimiter = "," if header is None: @@ -141,7 +142,7 @@ def csv_column_names( return column_names @classmethod - def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool: + def is_match(cls, file_path: str, options: Optional[dict] = None) -> bool: """ Determine whether the file is a graph. 
@@ -157,16 +158,16 @@ def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool: header: int = options.get("header", 0) delimiter: str = options.get("delimiter", ",") encoding: str = options.get("encoding", "utf-8") - column_names: List[str] = cls.csv_column_names( + column_names: list[str] = cls.csv_column_names( file_path, header, delimiter, encoding ) - source_keywords: List[str] = options.get( + source_keywords: list[str] = options.get( "source_keywords", ["source", "src", "origin"] ) - target_keywords: List[str] = options.get( + target_keywords: list[str] = options.get( "target_keywords", ["target", "destination", "dst"] ) - graph_keywords: List[str] = options.get("graph_keywords", ["node"]) + graph_keywords: list[str] = options.get("graph_keywords", ["node"]) source_index: int = cls._find_target_string_in_column( column_names, source_keywords ) diff --git a/dataprofiler/data_readers/json_data.py b/dataprofiler/data_readers/json_data.py index 93e5d7e6f..f1e378fcd 100644 --- a/dataprofiler/data_readers/json_data.py +++ b/dataprofiler/data_readers/json_data.py @@ -1,10 +1,11 @@ """Contains class to save and load json data.""" + import json import re import warnings from collections import OrderedDict from io import StringIO -from typing import Dict, List, Optional, Union +from typing import Optional, Union import numpy as np import pandas as pd @@ -25,7 +26,7 @@ def __init__( self, input_file_path: Optional[str] = None, data: Optional[Union[str, pd.DataFrame]] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ): """ Initialize Data class for loading datasets of type JSON. @@ -71,23 +72,23 @@ def __init__( self._data_formats["records"] = self._get_data_as_records self._data_formats["json"] = self._get_data_as_json - self._data_formats[ - "flattened_dataframe" - ] = self._get_data_as_flattened_dataframe + self._data_formats["flattened_dataframe"] = ( + self._get_data_as_flattened_dataframe + ) self._selected_data_format: str = options.get( "data_format", "flattened_dataframe" ) - self._payload_keys: List[str] = options.get("payload_keys", ["data", "payload"]) + self._payload_keys: list[str] = options.get("payload_keys", ["data", "payload"]) if not isinstance(self._payload_keys, list): self._payload_keys = [self._payload_keys] self._key_separator: str = options.get("key_separator", ".") - self._selected_keys: Optional[List[str]] = options.get("selected_keys", list()) + self._selected_keys: Optional[list[str]] = options.get("selected_keys", list()) self._metadata: Optional[pd.DataFrame] = None if data is not None: self._load_data(data) @property - def selected_keys(self) -> Optional[List[str]]: + def selected_keys(self) -> Optional[list[str]]: """Return selected keys.""" return self._selected_keys @@ -280,7 +281,7 @@ def _load_data_from_file(self, input_file_path: str) -> JSONType: ) return data - def _get_data_as_records(self, data: Union[pd.DataFrame, Dict, List]) -> List[str]: + def _get_data_as_records(self, data: Union[pd.DataFrame, dict, list]) -> list[str]: """ Extract the data as a record format. @@ -297,7 +298,7 @@ def _get_data_as_records(self, data: Union[pd.DataFrame, Dict, List]) -> List[st ) return super()._get_data_as_records(data) - def _get_data_as_json(self, data: Union[pd.DataFrame, Dict, List]) -> List[str]: + def _get_data_as_json(self, data: Union[pd.DataFrame, dict, list]) -> list[str]: """ Extract the data as a json format. 
@@ -310,7 +311,7 @@ def _get_data_as_json(self, data: Union[pd.DataFrame, Dict, List]) -> List[str]: char_per_line = min(len(data), self.SAMPLES_PER_LINE_DEFAULT) return list(map("".join, zip(*[iter(data)] * char_per_line))) - def _get_data_as_df(self, data: Union[pd.DataFrame, Dict, List]) -> pd.DataFrame: + def _get_data_as_df(self, data: Union[pd.DataFrame, dict, list]) -> pd.DataFrame: """ Extract the data as pandas formats it. @@ -329,7 +330,7 @@ def _get_data_as_df(self, data: Union[pd.DataFrame, Dict, List]) -> pd.DataFrame return data @classmethod - def _convert_flat_to_nested_cols(cls, dic: Dict, separator: str = ".") -> Dict: + def _convert_flat_to_nested_cols(cls, dic: dict, separator: str = ".") -> dict: """ Convert a flat dict to nested dict. @@ -366,7 +367,7 @@ def _convert_flat_to_nested_cols(cls, dic: Dict, separator: str = ".") -> Dict: @classmethod def is_match( - cls, file_path: Union[str, StringIO], options: Optional[Dict] = None + cls, file_path: Union[str, StringIO], options: Optional[dict] = None ) -> bool: """ Test whether first 1000 lines of file has valid JSON format or not. @@ -425,7 +426,7 @@ def reload( self, input_file_path: Optional[str] = None, data: Optional[Union[str, pd.DataFrame]] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ) -> None: """ Reload the data class with a new dataset. diff --git a/dataprofiler/data_readers/parquet_data.py b/dataprofiler/data_readers/parquet_data.py index 4fa567b8d..ee6253162 100644 --- a/dataprofiler/data_readers/parquet_data.py +++ b/dataprofiler/data_readers/parquet_data.py @@ -1,6 +1,7 @@ """Contains class to save and load parquet data.""" + from io import BytesIO, StringIO -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import pandas as pd import pyarrow.parquet as pq @@ -19,7 +20,7 @@ def __init__( self, input_file_path: Optional[str] = None, data: Optional[Union[pd.DataFrame, str]] = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ): """ Initialize Data class for loading datasets of type PARQUET. 
@@ -60,7 +61,7 @@ def __init__( self._data_formats["records"] = self._get_data_as_records self._data_formats["json"] = self._get_data_as_json self._selected_data_format: str = options.get("data_format", "dataframe") - self._selected_columns: List[str] = options.get("selected_columns", list()) + self._selected_columns: list[str] = options.get("selected_columns", list()) self._sample_nrows: Optional[int] = options.get("sample_nrows", None) if data is not None: @@ -80,7 +81,7 @@ def file_encoding(self, value: Any) -> None: pass @property - def selected_columns(self) -> List[str]: + def selected_columns(self) -> list[str]: """Return selected columns.""" return self._selected_columns @@ -114,14 +115,14 @@ def _load_data_from_file(self, input_file_path: str) -> pd.DataFrame: self._original_df_dtypes = original_df_dtypes return data - def _get_data_as_records(self, data: pd.DataFrame) -> List[str]: + def _get_data_as_records(self, data: pd.DataFrame) -> list[str]: """Return data records.""" # split into row samples separate by `\n` data = data.to_json(orient="records", lines=True) data = data.splitlines() return super()._get_data_as_records(data) - def _get_data_as_json(self, data: pd.DataFrame) -> List[str]: + def _get_data_as_json(self, data: pd.DataFrame) -> list[str]: """Return json data.""" data = data.to_json(orient="records") chars_per_line = min(len(data), self.SAMPLES_PER_LINE_DEFAULT) @@ -129,7 +130,7 @@ def _get_data_as_json(self, data: pd.DataFrame) -> List[str]: @classmethod def is_match( - cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None + cls, file_path: Union[str, StringIO, BytesIO], options: Optional[dict] = None ) -> bool: """ Test the given file to check if the file has valid Parquet format. @@ -164,7 +165,7 @@ def reload( self, input_file_path: Optional[str] = None, data: Any = None, - options: Optional[Dict] = None, + options: Optional[dict] = None, ) -> None: """ Reload the data class with a new dataset. 
diff --git a/dataprofiler/data_readers/structured_mixins.py b/dataprofiler/data_readers/structured_mixins.py index 3587291f1..c1c3e9cca 100644 --- a/dataprofiler/data_readers/structured_mixins.py +++ b/dataprofiler/data_readers/structured_mixins.py @@ -1,6 +1,7 @@ """Contains mixin data class for loading datasets of tye SpreadSheet.""" + from logging import Logger -from typing import Any, Dict, List, Optional, Union, cast +from typing import Any, Optional, Union, cast import pandas as pd @@ -26,10 +27,10 @@ class SpreadSheetDataMixin: """ def __init__( - self, input_file_path: Optional[str], data: Any, options: Dict + self, input_file_path: Optional[str], data: Any, options: dict ) -> None: """Initialize spreadsheet mixin object.""" - self._data_formats: Dict = dict() + self._data_formats: dict = dict() self._data_formats["dataframe"] = self._get_data_as_df self._original_df_dtypes: Optional[pd.Series] self.input_file_path: Optional[str] = input_file_path @@ -69,7 +70,7 @@ def _get_data_as_df(self, data: pd.DataFrame) -> pd.DataFrame: ) return data - def _get_data_as_records(self, data: Any) -> List[str]: + def _get_data_as_records(self, data: Any) -> list[str]: """Return data records.""" records_per_line = min(len(data), self.SAMPLES_PER_LINE_DEFAULT) data = [ @@ -80,4 +81,4 @@ def _get_data_as_records(self, data: Any) -> List[str]: ) for i in range((len(data) + records_per_line - 1) // records_per_line) ] - return cast(List[str], data) + return cast(list[str], data) diff --git a/dataprofiler/data_readers/text_data.py b/dataprofiler/data_readers/text_data.py index fd9dc2413..117ca2e37 100644 --- a/dataprofiler/data_readers/text_data.py +++ b/dataprofiler/data_readers/text_data.py @@ -1,7 +1,7 @@ """Contains class for saving and loading text files.""" from io import StringIO -from typing import Dict, List, Optional, Union, cast +from typing import Optional, Union, cast from . import data_utils from .base_data import BaseData @@ -15,8 +15,8 @@ class TextData(BaseData): def __init__( self, input_file_path: Optional[str] = None, - data: Optional[List[str]] = None, - options: Optional[Dict] = None, + data: Optional[list[str]] = None, + options: Optional[dict] = None, ) -> None: """ Initialize Data class for loading datasets of type TEXT. @@ -74,7 +74,7 @@ def is_structured(self) -> bool: """Determine compatibility with StructuredProfiler.""" return False - def _load_data(self, data: Optional[List[str]] = None) -> None: + def _load_data(self, data: Optional[list[str]] = None) -> None: """Load data.""" if data is not None: self._data = data @@ -83,7 +83,7 @@ def _load_data(self, data: Optional[List[str]] = None) -> None: cast(str, self.input_file_path), self.file_encoding ) - def _get_data_as_text(self, data: Union[str, List[str]]) -> List[str]: + def _get_data_as_text(self, data: Union[str, list[str]]) -> list[str]: """Return data as text.""" if isinstance(data, list) and len(data) and isinstance(data[0], str): data = "".join(data) @@ -105,7 +105,7 @@ def tokenize(self) -> None: raise NotImplementedError("Tokenizing does not currently exist for text data.") @classmethod - def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool: + def is_match(cls, file_path: str, options: Optional[dict] = None) -> bool: """ Return True if all are text files. 
@@ -127,8 +127,8 @@ def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool: def reload( self, input_file_path: Optional[str] = None, - data: Optional[List[str]] = None, - options: Optional[Dict] = None, + data: Optional[list[str]] = None, + options: Optional[dict] = None, ) -> None: """ Reload the data class with a new dataset. diff --git a/dataprofiler/labelers/__init__.py b/dataprofiler/labelers/__init__.py index 1b2302fcf..a355ead23 100644 --- a/dataprofiler/labelers/__init__.py +++ b/dataprofiler/labelers/__init__.py @@ -26,6 +26,7 @@ 2. structured_model 3. regex_model """ + # import data labelers # import models from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index 201f78998..efb387377 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -1,4 +1,5 @@ """Contains abstract classes from which labeler classes will inherit.""" + from __future__ import annotations import json diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py index 032c2ea38..fcfcddbec 100644 --- a/dataprofiler/labelers/base_model.py +++ b/dataprofiler/labelers/base_model.py @@ -1,11 +1,12 @@ """Contains abstract classes for labeling data.""" + from __future__ import annotations import abc import copy import inspect import warnings -from typing import Any, Callable, Type, TypeVar, cast +from typing import Any, Callable, TypeVar, cast from dataprofiler._typing import DataArray @@ -22,7 +23,7 @@ def __new__( ) -> type[T]: """Create auto registration object and return new class.""" new_class = cast( - Type[T], + type[T], super().__new__(cls, clsname, bases, attrs), ) new_class._register_subclass() diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index a4a44e03a..57087c0d0 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -1,4 +1,5 @@ """Contains class for training data labeler model.""" + from __future__ import annotations import copy @@ -537,9 +538,9 @@ def predict( confidences[ allocation_index : allocation_index + num_samples_in_batch ] = model_output[0].numpy() - predictions[ - allocation_index : allocation_index + num_samples_in_batch - ] = model_output[1].numpy() + predictions[allocation_index : allocation_index + num_samples_in_batch] = ( + model_output[1].numpy() + ) allocation_index += num_samples_in_batch diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 2cbb7051a..78fd2152e 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -1,4 +1,5 @@ """Contains classes for char data labeling.""" + from __future__ import annotations import copy @@ -877,9 +878,9 @@ def predict( confidences[ allocation_index : allocation_index + num_samples_in_batch ] = model_output[0].numpy() - predictions[ - allocation_index : allocation_index + num_samples_in_batch - ] = model_output[1].numpy() + predictions[allocation_index : allocation_index + num_samples_in_batch] = ( + model_output[1].numpy() + ) sentence_lengths[ allocation_index : allocation_index + num_samples_in_batch ] = list(map(lambda x: len(x[0]), batch_data)) diff --git a/dataprofiler/labelers/classification_report_utils.py b/dataprofiler/labelers/classification_report_utils.py index 
28e742e32..3d248bcfa 100644 --- a/dataprofiler/labelers/classification_report_utils.py +++ b/dataprofiler/labelers/classification_report_utils.py @@ -1,4 +1,5 @@ """Contains functions for classification.""" + from __future__ import annotations import warnings diff --git a/dataprofiler/labelers/column_name_model.py b/dataprofiler/labelers/column_name_model.py index d698cfd6a..e5ff59184 100644 --- a/dataprofiler/labelers/column_name_model.py +++ b/dataprofiler/labelers/column_name_model.py @@ -1,4 +1,5 @@ """Contains class for column name data labeling model.""" + from __future__ import annotations import json @@ -175,7 +176,7 @@ def _reconstruct_model(self) -> None: pass def _need_to_reconstruct_model(self) -> bool: - pass + raise NotImplementedError() def reset_weights(self) -> None: """Reset weights function.""" diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index a6d9932b7..48dbc7834 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -1,4 +1,5 @@ """Module to train and choose between structured and unstructured data labelers.""" + from __future__ import annotations import os diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index d53980a35..a849863a6 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -1,4 +1,5 @@ """Contains pre-built processors for data labeling/processing.""" + from __future__ import annotations import abc @@ -11,7 +12,8 @@ import types import warnings from collections import Counter -from typing import Any, Generator, Iterable, TypeVar, cast +from collections.abc import Generator, Iterable +from typing import Any, TypeVar, cast import numpy as np import numpy.typing as npt @@ -173,9 +175,11 @@ def process( labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32, - ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[ - np.ndarray, np.ndarray - ] | np.ndarray: + ) -> ( + Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] + | tuple[np.ndarray, np.ndarray] + | np.ndarray + ): """Preprocess data.""" raise NotImplementedError() @@ -1947,9 +1951,11 @@ def _validate_parameters(self, parameters: dict) -> None: # being changed and is already set aggregation_func = parameters.get( "aggregation_func", - self._parameters.get("aggregation_func") - if hasattr(self, "_parameters") - else None, + ( + self._parameters.get("aggregation_func") + if hasattr(self, "_parameters") + else None + ), ) if value is None and aggregation_func == "priority": errors.append( diff --git a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index 3a24886f3..2883042b3 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -1,10 +1,11 @@ """Contains functions for the data labeler.""" + from __future__ import annotations import logging import os import warnings -from typing import Any, Callable, Dict, cast +from typing import Any, Callable, cast import numpy as np import scipy @@ -169,7 +170,7 @@ def evaluate_accuracy( conf_mat_pd.to_csv(confusion_matrix_file) f1_report: dict = cast( - Dict, + dict, classification_report( conf_mat, labels=label_indexes, target_names=label_names, output_dict=True ), diff --git a/dataprofiler/labelers/regex_model.py b/dataprofiler/labelers/regex_model.py index c6a690c17..3c8a27ab2 100644 --- 
a/dataprofiler/labelers/regex_model.py +++ b/dataprofiler/labelers/regex_model.py @@ -1,4 +1,5 @@ """Contains class for regex data labeling model.""" + from __future__ import annotations import copy @@ -166,7 +167,7 @@ def _reconstruct_model(self) -> None: pass def _need_to_reconstruct_model(self) -> bool: - pass + raise NotImplementedError() def reset_weights(self) -> None: """Reset weights.""" diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 2d587f7b4..54676a176 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -1,7 +1,8 @@ """Contains functions for checking for installations/dependencies.""" + import sys import warnings -from typing import Any, Callable, List +from typing import Any, Callable def warn_missing_module(labeler_function: str, module_name: str) -> None: @@ -22,7 +23,7 @@ def warn_missing_module(labeler_function: str, module_name: str) -> None: warnings.warn(warning_msg, RuntimeWarning, stacklevel=3) -def require_module(names: List[str]) -> Callable: +def require_module(names: list[str]) -> Callable: """ Check if a set of modules exists in sys.modules prior to running function. diff --git a/dataprofiler/plugins/__init__.py b/dataprofiler/plugins/__init__.py index fbe52182e..1673a15ec 100644 --- a/dataprofiler/plugins/__init__.py +++ b/dataprofiler/plugins/__init__.py @@ -1,7 +1,7 @@ import importlib import os -from .decorators import plugin_decorator, plugins_dict +from dataprofiler.plugins.decorators import plugins_dict def load_plugins(): diff --git a/dataprofiler/plugins/decorators.py b/dataprofiler/plugins/decorators.py index c781f4300..d839707ca 100644 --- a/dataprofiler/plugins/decorators.py +++ b/dataprofiler/plugins/decorators.py @@ -1,8 +1,9 @@ """Contains function for generating plugins data.""" + from collections import defaultdict -from typing import Any, DefaultDict, Dict +from typing import Any, DefaultDict -plugins_dict: DefaultDict[str, Dict[str, Any]] = defaultdict(dict) +plugins_dict: DefaultDict[str, dict[str, Any]] = defaultdict(dict) def plugin_decorator(typ, name): diff --git a/dataprofiler/profilers/__init__.py b/dataprofiler/profilers/__init__.py index 4b068fcb0..14834794b 100644 --- a/dataprofiler/profilers/__init__.py +++ b/dataprofiler/profilers/__init__.py @@ -1,4 +1,5 @@ """Package for providing statistics and predictions for a given dataset.""" + from . 
import json_decoder from .base_column_profilers import BaseColumnProfiler from .categorical_column_profile import CategoricalColumn diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 1ca630900..d64f5aa5b 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -1,4 +1,5 @@ """Contains class for categorical column profiler.""" + from __future__ import annotations import math @@ -277,28 +278,28 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: # These stats are only diffed if both profiles are categorical if self.is_match and other_profile.is_match: - differences["statistics"][ - "chi2-test" - ] = profiler_utils.perform_chi_squared_test_for_homogeneity( - self._categories, - self.sample_size, - other_profile._categories, - other_profile.sample_size, + differences["statistics"]["chi2-test"] = ( + profiler_utils.perform_chi_squared_test_for_homogeneity( + self._categories, + self.sample_size, + other_profile._categories, + other_profile.sample_size, + ) ) - differences["statistics"][ - "categories" - ] = profiler_utils.find_diff_of_lists_and_sets( - self.categories, other_profile.categories + differences["statistics"]["categories"] = ( + profiler_utils.find_diff_of_lists_and_sets( + self.categories, other_profile.categories + ) ) - differences["statistics"][ - "gini_impurity" - ] = profiler_utils.find_diff_of_numbers( - self.gini_impurity, other_profile.gini_impurity + differences["statistics"]["gini_impurity"] = ( + profiler_utils.find_diff_of_numbers( + self.gini_impurity, other_profile.gini_impurity + ) ) - differences["statistics"][ - "unalikeability" - ] = profiler_utils.find_diff_of_numbers( - self.unalikeability, other_profile.unalikeability + differences["statistics"]["unalikeability"] = ( + profiler_utils.find_diff_of_numbers( + self.unalikeability, other_profile.unalikeability + ) ) cat_count1 = dict( sorted(self._categories.items(), key=itemgetter(1), reverse=True) @@ -326,9 +327,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: ) differences["statistics"]["psi"] = total_psi - differences["statistics"][ - "categorical_count" - ] = profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count) + differences["statistics"]["categorical_count"] = ( + profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count) + ) return differences diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py index 07edf13dc..cfeb8c699 100644 --- a/dataprofiler/profilers/column_profile_compilers.py +++ b/dataprofiler/profilers/column_profile_compilers.py @@ -1,4 +1,5 @@ """For generating a report.""" + from __future__ import annotations import abc diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index d9bfe1ee9..4a3478efa 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -1,8 +1,9 @@ """Contains class for for profiling data labeler col.""" + from __future__ import annotations import operator -from typing import Dict, cast +from typing import cast import numpy as np from pandas import DataFrame, Series @@ -264,7 +265,7 @@ def data_label(self) -> str | None: map(operator.itemgetter(0), ordered_top_k_rank[is_value_close]) ) top_label = ordered_top_k_rank[0][0] - if cast(Dict, 
self.label_representation)[top_label] < self._min_top_label_prob: + if cast(dict, self.label_representation)[top_label] < self._min_top_label_prob: return "could not determine" return data_label diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py index af99283a9..1042ea0c0 100644 --- a/dataprofiler/profilers/datetime_column_profile.py +++ b/dataprofiler/profilers/datetime_column_profile.py @@ -1,4 +1,5 @@ """Contains class for profiling datetime column.""" + from __future__ import annotations import datetime @@ -216,7 +217,7 @@ def _validate_datetime(date: str, date_format: str) -> datetime.datetime | float :return: either the str converted into a date format, or Nan """ try: - converted_date: (datetime.datetime | float) = datetime.datetime.strptime( + converted_date: datetime.datetime | float = datetime.datetime.strptime( date, date_format ) except (ValueError, TypeError): @@ -237,7 +238,7 @@ def _replace_day_suffix(date: str, pattern: re.Pattern) -> str | float: """ try: new_date: str | float = pattern.sub(r"\1", date) - except (TypeError): + except TypeError: new_date = np.nan return new_date diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index bc426a447..3d6ede326 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -1,4 +1,5 @@ """Float profile analysis for individual col within structured profiling.""" + from __future__ import annotations import copy diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index 0680a29a7..345a0f2e5 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -1,4 +1,5 @@ """Class and functions to calculate and profile properties of graph data.""" + from __future__ import annotations import importlib diff --git a/dataprofiler/profilers/helpers/__init__.py b/dataprofiler/profilers/helpers/__init__.py index 43393433d..2c72b2f35 100644 --- a/dataprofiler/profilers/helpers/__init__.py +++ b/dataprofiler/profilers/helpers/__init__.py @@ -1,4 +1,5 @@ """This package provides helper functions for generating reports.""" + from .report_helpers import _prepare_report, calculate_quantiles __all__ = [ diff --git a/dataprofiler/profilers/helpers/report_helpers.py b/dataprofiler/profilers/helpers/report_helpers.py index 0588252c9..44ac8fb1a 100644 --- a/dataprofiler/profilers/helpers/report_helpers.py +++ b/dataprofiler/profilers/helpers/report_helpers.py @@ -1,4 +1,5 @@ """Contains helper functions for generating report.""" + from __future__ import annotations import math diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py index df230c4c7..033e384bc 100644 --- a/dataprofiler/profilers/histogram_utils.py +++ b/dataprofiler/profilers/histogram_utils.py @@ -7,8 +7,9 @@ A copy of the license for numpy is available here: https://github.com/numpy/numpy/blob/main/LICENSE.txt """ + import operator -from typing import List, Optional, Tuple, Union +from typing import Optional, Union import numpy as np from numpy.lib.histograms import ( # type: ignore[attr-defined] @@ -238,10 +239,10 @@ def _calc_scott_bin_width_from_profile(profile): def _get_bin_edges( a: np.ndarray, - bins: Union[str, int, List], - range: Optional[Tuple[int, int]], + bins: Union[str, int, list], + range: Optional[tuple[int, int]], weights: Optional[np.ndarray], -) -> Tuple[None, int]: +) -> 
tuple[None, int]: """ Compute the bins used internally by `histogram`. diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 014465c71..fe38eec60 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -1,4 +1,5 @@ """Int profile analysis for individual col within structured profiling.""" + from __future__ import annotations import numpy as np diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py index fb4ff8cb9..eb09db0d7 100644 --- a/dataprofiler/profilers/json_decoder.py +++ b/dataprofiler/profilers/json_decoder.py @@ -1,4 +1,5 @@ """Contains methods to decode components of a Profiler.""" + from __future__ import annotations import warnings @@ -116,9 +117,9 @@ def get_structured_col_profiler_class(class_name: str) -> type[StructuredColProf :type class_name: str representing name of class :return: subclass of StructuredColProfiler object """ - struct_col_profiler_class: None | ( - type[StructuredColProfiler] - ) = _structured_col_profiler.get(class_name) + struct_col_profiler_class: None | (type[StructuredColProfiler]) = ( + _structured_col_profiler.get(class_name) + ) if struct_col_profiler_class is None: raise ValueError( f"Invalid structured col profiler class {class_name} " f"failed to load." @@ -153,9 +154,9 @@ def load_column_profile( JSON """ - column_profiler_cls: type[ - BaseColumnProfiler[BaseColumnProfiler] - ] = get_column_profiler_class(serialized_json["class"]) + column_profiler_cls: type[BaseColumnProfiler[BaseColumnProfiler]] = ( + get_column_profiler_class(serialized_json["class"]) + ) return column_profiler_cls.load_from_dict(serialized_json["data"], config) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index fa0666a66..202a704f8 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -6,7 +6,7 @@ import copy import itertools import warnings -from typing import Any, Callable, Dict, List, TypeVar, cast +from typing import Any, Callable, TypeVar, cast import numpy as np import numpy.typing as npt @@ -257,9 +257,9 @@ def _add_helper_merge_profile_histograms( if self.user_set_histogram_bin is None: for method in self.histogram_bin_method_names: - self.histogram_methods[method][ - "suggested_bin_count" - ] = histogram_utils._calculate_bins_from_profile(self, method) + self.histogram_methods[method]["suggested_bin_count"] = ( + histogram_utils._calculate_bins_from_profile(self, method) + ) self._get_quantiles() @@ -1040,10 +1040,7 @@ def _merge_biased_kurtosis( / N**3 ) third_term = ( - 6 - * delta**2 - * (match_count1**2 * M2_2 + match_count2**2 * M2_1) - / N**2 + 6 * delta**2 * (match_count1**2 * M2_2 + match_count2**2 * M2_1) / N**2 ) fourth_term = 4 * delta * (match_count1 * M3_2 - match_count2 * M3_1) / N M4 = first_term + second_term + third_term + fourth_term @@ -1111,7 +1108,7 @@ def _estimate_mode_from_histogram(self) -> list[float]: mode: npt.NDArray[np.float64] = ( bin_edges[highest_idxs] + bin_edges[highest_idxs + 1] # type: ignore ) / 2 - return cast(List[float], mode.tolist()) + return cast(list[float], mode.tolist()) def _estimate_stats_from_histogram(self) -> np.float64: # test estimated mean and var @@ -1548,7 +1545,7 @@ def _get_best_histogram_for_profile(self) -> dict: self.histogram_selection = method best_hist_loss = hist_loss - return cast(Dict, 
self.histogram_methods[self.histogram_selection]["histogram"]) + return cast(dict, self.histogram_methods[self.histogram_selection]["histogram"]) def _get_percentile(self, percentiles: np.ndarray | list[float]) -> list[float]: """ @@ -1586,7 +1583,7 @@ def _get_percentile(self, percentiles: np.ndarray | list[float]) -> list[float]: ) if median_value: quantiles[percentiles == 50] = median_value - return cast(List[float], quantiles.tolist()) + return cast(list[float], quantiles.tolist()) @staticmethod def _fold_histogram( diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index 308262324..6ee96ed56 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -1,8 +1,9 @@ """Index profile analysis for individual col within structured profiling.""" + from __future__ import annotations from abc import abstractmethod -from typing import Protocol, Type, TypeVar, cast +from typing import Protocol, TypeVar, cast import numpy as np from pandas import DataFrame, Series @@ -24,8 +25,8 @@ def __lt__(self: CT, other: CT) -> bool: CT = TypeVar("CT", bound=Comparable) # bc type in class attr causing issues, need to alias -AliasFloatType = Type[np.float64] -AliasStrType = Type[str] +AliasFloatType = type[np.float64] +AliasStrType = type[str] class OrderColumn(BaseColumnProfiler["OrderColumn"]): diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 6e512658f..542c2536f 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -10,9 +10,10 @@ import re import warnings from collections import OrderedDict, defaultdict +from collections.abc import Generator from datetime import datetime from multiprocessing.pool import Pool -from typing import Any, Generator, List, Optional, TypeVar, cast +from typing import Any, Optional, TypeVar, cast import networkx as nx import numpy as np @@ -1919,10 +1920,10 @@ def diff( # type: ignore[override] col_name = other_profile._profile[i].name other_profile_schema[col_name].append(i) - report["global_stats"][ - "profile_schema" - ] = profiler_utils.find_diff_of_dicts_with_diff_keys( - self_profile_schema, other_profile_schema + report["global_stats"]["profile_schema"] = ( + profiler_utils.find_diff_of_dicts_with_diff_keys( + self_profile_schema, other_profile_schema + ) ) # Only find the diff of columns if the schemas are exactly the same @@ -2049,7 +2050,7 @@ def profile(self) -> list[StructuredColProfiler]: :return: list[StructuredColProfiler] """ - return cast(List[StructuredColProfiler], super().profile) + return cast(list[StructuredColProfiler], super().profile) def report(self, report_options: dict = None) -> dict: """Return a report.""" @@ -2101,9 +2102,9 @@ def report(self, report_options: dict = None) -> dict: self.options.null_replication_metrics.is_enabled and i in self._null_replication_metrics ): - report["data_stats"][i][ - "null_replication_metrics" - ] = self._null_replication_metrics[i] + report["data_stats"][i]["null_replication_metrics"] = ( + self._null_replication_metrics[i] + ) return _prepare_report(report, output_format, omit_keys) @@ -2610,9 +2611,11 @@ def _update_null_replication_metrics(self, clean_samples: dict) -> None: total_row_sum = np.asarray( [ - get_data_type_profiler(profile).sum - if get_data_type(profile) not in [None, "datetime"] - else np.nan + ( + get_data_type_profiler(profile).sum + if get_data_type(profile) not in [None, 
"datetime"] + else np.nan + ) for profile in self._profile ] ) @@ -2704,17 +2707,21 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> dict: self_row_sum = np.asarray( [ - get_data_type_profiler(profile).sum - if get_data_type(profile) - else np.nan + ( + get_data_type_profiler(profile).sum + if get_data_type(profile) + else np.nan + ) for profile in self._profile ] ) other_row_sum = np.asarray( [ - get_data_type_profiler(profile).sum - if get_data_type(profile) - else np.nan + ( + get_data_type_profiler(profile).sum + if get_data_type(profile) + else np.nan + ) for profile in other._profile ] ) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 038acf806..077aa3b62 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -9,7 +9,7 @@ from typing import Any, Generic, TypeVar, cast from ..labelers.base_data_labeler import BaseDataLabeler -from ..plugins.__init__ import get_plugins +from ..plugins import get_plugins from . import profiler_utils from .json_decoder import load_option @@ -32,7 +32,7 @@ def properties(self) -> dict[str, BooleanOption]: """ return copy.deepcopy(self.__dict__) - def _set_helper(self, options: dict[str, bool], variable_path: str) -> None: + def _set_helper(self, options: dict, variable_path: str) -> None: """ Set all the options. @@ -100,7 +100,7 @@ def _set_helper(self, options: dict[str, bool], variable_path: str) -> None: f"type object '{error_path}' has no attribute '{option}'" ) - def set(self, options: dict[str, bool]) -> None: + def set(self, options: dict) -> None: """ Set all the options. diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index e38e1b041..e0d883a07 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -1,4 +1,5 @@ """Contains functions for profilers.""" + from __future__ import annotations import collections @@ -10,20 +11,10 @@ import time import warnings from abc import abstractmethod +from collections.abc import Generator, Iterator from itertools import islice from multiprocessing.pool import Pool -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Generator, - Iterator, - Protocol, - TypeVar, - cast, - overload, -) +from typing import TYPE_CHECKING, Any, Callable, Protocol, TypeVar, cast, overload import numpy as np import psutil @@ -51,7 +42,7 @@ def recursive_dict_update(d: dict, update_d: dict) -> dict: if isinstance(v, collections.abc.Mapping) and isinstance( d.get(k, None), collections.abc.Mapping ): - d[k] = recursive_dict_update(d.get(k, {}), cast(Dict, v)) + d[k] = recursive_dict_update(d.get(k, {}), cast(dict, v)) else: d[k] = v return d @@ -417,13 +408,11 @@ def __sub__(self: T, other: T) -> Any: def find_diff_of_numbers( stat1: int | float | np.float64 | np.int64 | None, stat2: int | float | np.float64 | np.int64 | None, -) -> Any: - ... +) -> Any: ... @overload -def find_diff_of_numbers(stat1: T | None, stat2: T | None) -> Any: - ... +def find_diff_of_numbers(stat1: T | None, stat2: T | None) -> Any: ... 
def find_diff_of_numbers(stat1, stat2): diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index bea8dbd68..eb79643fd 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -1,4 +1,5 @@ """Text profile analysis for individual col within structured profiling..""" + from __future__ import annotations import itertools diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py index 1c7b16c0f..22789c4e3 100644 --- a/dataprofiler/profilers/unstructured_labeler_profile.py +++ b/dataprofiler/profilers/unstructured_labeler_profile.py @@ -1,4 +1,5 @@ """Profile analysis for applying labels within unstructured profiling.""" + from __future__ import annotations from collections import defaultdict diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py index 96b7d0625..3f1b6dd7f 100644 --- a/dataprofiler/profilers/unstructured_text_profile.py +++ b/dataprofiler/profilers/unstructured_text_profile.py @@ -1,4 +1,5 @@ """For profiling unstructured text data.""" + from __future__ import annotations import itertools diff --git a/dataprofiler/reports/graphs.py b/dataprofiler/reports/graphs.py index 1f0b43018..8c0e16133 100644 --- a/dataprofiler/reports/graphs.py +++ b/dataprofiler/reports/graphs.py @@ -1,10 +1,11 @@ """Contains functions for generating graph data report.""" + # !/usr/bin/env python3 from __future__ import annotations import math import warnings -from typing import TYPE_CHECKING, List, Union, cast +from typing import TYPE_CHECKING, Union, cast if TYPE_CHECKING: from ..profilers.float_column_profile import FloatColumn @@ -17,6 +18,7 @@ import matplotlib.patches import matplotlib.pyplot as plt import seaborn as sns + from matplotlib.figure import Figure, SubFigure except ImportError: # don't require if using graphs will below recommend to install if not # installed @@ -32,7 +34,7 @@ def plot_histograms( profiler: StructuredProfiler, column_names: list[int | str] | None = None, column_inds: list[int] | None = None, -) -> matplotlib.pyplot.figure: +) -> Figure | SubFigure | None: """ Plot the histograms of column names that are int or float columns. @@ -72,7 +74,7 @@ def plot_histograms( if not column_names and not column_inds: inds_to_graph = list(range(len(profile_list))) elif not column_inds: - for column in cast(List[Union[str, int]], column_names): + for column in cast(list[Union[str, int]], column_names): col = column if isinstance(col, str): col = col.lower() @@ -108,7 +110,7 @@ def is_index_graphable_column(ind_to_graph: int) -> bool: "No plots were constructed" " because no int or float columns were found in columns" ) - return + return None # get proper tile format for graph n = len(inds_to_graph) @@ -153,7 +155,7 @@ def plot_col_histogram( data_type_profiler: IntColumn | FloatColumn, ax: matplotlib.axes.Axes | None = None, title: str | None = None, -) -> matplotlib.axes.Axes: +) -> matplotlib.axes.Axes | None: """ Take input of a Int or Float Column and plot the histogram.
@@ -179,10 +181,11 @@ def plot_col_histogram( ax=ax, ) - ax.set(xlabel="bins") - if title is None: - title = str(data_type_profiler.name) - ax.set_title(title) + if ax: + ax.set(xlabel="bins") + if title is None: + title = str(data_type_profiler.name) + ax.set_title(title) return ax @@ -191,7 +194,7 @@ def plot_missing_values_matrix( profiler: StructuredProfiler, ax: matplotlib.axes.Axes | None = None, title: str | None = None, -) -> matplotlib.pyplot.figure: +) -> Figure | SubFigure | None: """ Generate matrix of bar graphs for missing value locations in cols of struct dataset. @@ -215,7 +218,7 @@ def plot_col_missing_values( col_profiler_list: list[StructuredColProfiler], ax: matplotlib.axes.Axes | None = None, title: str | None = None, -) -> matplotlib.pyplot.figure: +) -> Figure | SubFigure | None: """ Generate bar graph of missing value locations within a col. @@ -243,7 +246,7 @@ def plot_col_missing_values( warnings.warn( "There was no data in the profiles to plot missing " "column values." ) - return + return None # bar width settings and height settings for each null value # width = 1, height = 1 would be no gaps @@ -262,7 +265,8 @@ def plot_col_missing_values( ax = fig.add_subplot(111) is_own_fig = True # in case user passed their own axes - fig = ax.figure + else: + fig = cast(Figure, ax.figure) # loop through eac column plotting their null values for col_id, col_profiler in enumerate(col_profiler_list): diff --git a/dataprofiler/reports/utils.py b/dataprofiler/reports/utils.py index a10b8fe59..a578f9c29 100644 --- a/dataprofiler/reports/utils.py +++ b/dataprofiler/reports/utils.py @@ -1,7 +1,8 @@ """Contains functions for checking for installations/dependencies.""" + import sys import warnings -from typing import Any, Callable, List, TypeVar, cast +from typing import Any, Callable, TypeVar, cast # Generic type for the return of the function "require_module()" F = TypeVar("F", bound=Callable[..., Any]) @@ -25,7 +26,7 @@ def warn_missing_module(graph_func: str, module_name: str) -> None: warnings.warn(warning_msg, RuntimeWarning, stacklevel=3) -def require_module(names: List[str]) -> Callable[[F], F]: +def require_module(names: list[str]) -> Callable[[F], F]: """ Check if a set of modules exists in sys.modules prior to running function. 
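Aside on the require_module decorator touched just above: it wraps a function and, before calling it, checks that every listed module name is already present in sys.modules, routing any missing one through warn_missing_module (a RuntimeWarning). A minimal sketch of the pattern follows; the render_histogram helper and its module list are illustrative assumptions, not DataProfiler API, and the sketch assumes the wrapped call is skipped when a dependency is missing rather than attempted.

from dataprofiler.reports.utils import require_module

try:
    # Soft dependencies: importing here is what puts them into sys.modules,
    # so a missing package degrades to a warning instead of an import error.
    import matplotlib.pyplot as plt  # noqa: F401
    import seaborn as sns
except ImportError:
    pass


@require_module(["matplotlib", "seaborn"])
def render_histogram(values):
    # Hypothetical helper: only reached when both modules imported above.
    ax = sns.histplot(values)
    ax.set(xlabel="bins")
    return ax

With either package absent, calling render_histogram([1, 2, 2, 3]) is expected to emit the warning and yield None, which is consistent with the "| None" return annotations added to the plotting functions above.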
diff --git a/dataprofiler/rng_utils.py b/dataprofiler/rng_utils.py index 329066658..2fd14f0f7 100644 --- a/dataprofiler/rng_utils.py +++ b/dataprofiler/rng_utils.py @@ -1,4 +1,5 @@ """Create a random number generator using a manual seed DATAPROFILER_SEED.""" + import os import warnings diff --git a/dataprofiler/settings.py b/dataprofiler/settings.py index 1ba017f4e..a81c3477c 100644 --- a/dataprofiler/settings.py +++ b/dataprofiler/settings.py @@ -1,2 +1,3 @@ """Configure settings for dataprofiler.""" + _seed = None diff --git a/dataprofiler/tests/plugins/test_plugins.py b/dataprofiler/tests/plugins/test_plugins.py index ec148a526..91677e5c1 100644 --- a/dataprofiler/tests/plugins/test_plugins.py +++ b/dataprofiler/tests/plugins/test_plugins.py @@ -2,7 +2,7 @@ from collections import defaultdict from unittest import mock -from dataprofiler.plugins.__init__ import get_plugins, load_plugins +from dataprofiler.plugins import get_plugins, load_plugins from dataprofiler.plugins.decorators import plugin_decorator, plugins_dict @@ -24,12 +24,12 @@ def test_plugin(): test_get_dict = get_plugins("test") self.assertDictEqual({"mock_test": test_plugin}, test_get_dict) - @mock.patch("dataprofiler.plugins.__init__.importlib.util") - @mock.patch("dataprofiler.plugins.__init__.os.path.isdir") - @mock.patch("dataprofiler.plugins.__init__.os.listdir") + @mock.patch("dataprofiler.plugins.importlib.util") + @mock.patch("dataprofiler.plugins.os.path.isdir") + @mock.patch("dataprofiler.plugins.os.listdir") def test_load_plugin(self, mock_listdir, mock_isdir, mock_importlib_util): - mock_listdir.side_effect = ( - lambda folder_dir: ["__pycache__", "py"] + mock_listdir.side_effect = lambda folder_dir: ( + ["__pycache__", "py"] if folder_dir.endswith("plugins") else ["stillnotrealpy", "a.json", None] ) @@ -38,10 +38,8 @@ def test_load_plugin(self, mock_listdir, mock_isdir, mock_importlib_util): load_plugins() mock_importlib_util.spec_from_file_location.assert_not_called() - mock_listdir.side_effect = ( - lambda folder_dir: ["folder"] - if folder_dir.endswith("plugins") - else ["file.py"] + mock_listdir.side_effect = lambda folder_dir: ( + ["folder"] if folder_dir.endswith("plugins") else ["file.py"] ) mock_spec = mock.Mock() mock_importlib_util.spec_from_file_location.return_value = mock_spec diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index c4e604737..f9bbf14ae 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -88,6 +88,12 @@ def setUpClass(cls): cls.aws_dataset, len(cls.aws_dataset), options=profiler_options ) + @classmethod + def tearDownClass(cls): + from dataprofiler import dp_logging + + dp_logging.set_verbosity(logging.INFO) + @mock.patch( "dataprofiler.profilers.profile_builder.ColumnPrimitiveTypeProfileCompiler" ) diff --git a/dataprofiler/tests/profilers/test_profiler_utils.py b/dataprofiler/tests/profilers/test_profiler_utils.py index 4eee1963a..0ea0c0fc0 100644 --- a/dataprofiler/tests/profilers/test_profiler_utils.py +++ b/dataprofiler/tests/profilers/test_profiler_utils.py @@ -472,7 +472,6 @@ def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): class TestAutoMultiProcessToggle(unittest.TestCase): - """ Validate profile_utils.auto_multiprocess_toggle is properly working. 
""" diff --git a/dataprofiler/tests/space_time_analysis/dataset_generation.py b/dataprofiler/tests/space_time_analysis/dataset_generation.py index a983921da..6b23de32d 100644 --- a/dataprofiler/tests/space_time_analysis/dataset_generation.py +++ b/dataprofiler/tests/space_time_analysis/dataset_generation.py @@ -1,7 +1,7 @@ import copy import json import string -from typing import List, Optional +from typing import Optional import numpy as np import pandas as pd @@ -55,7 +55,7 @@ def convert_data_to_df( np_data: np.array, path: Optional[str] = None, index: bool = False, - column_names: Optional[List[str]] = None, + column_names: Optional[list[str]] = None, ) -> pd.DataFrame: """ Converts np array to a pandas dataframe @@ -129,7 +129,7 @@ def random_floats( def random_string( rng: Generator, - chars: Optional[List[str]] = None, + chars: Optional[list[str]] = None, num_rows: int = 1, str_len_min: int = 1, str_len_max: int = 256, @@ -244,7 +244,7 @@ def random_datetimes( def random_categorical( - rng: Generator, categories: Optional[List[str]] = None, num_rows: int = 1 + rng: Generator, categories: Optional[list[str]] = None, num_rows: int = 1 ) -> np.array: """ Randomly generates an array of categorical chosen out of categories @@ -311,7 +311,7 @@ def random_text( def generate_dataset_by_class( rng: Generator, - columns_to_generate: Optional[List[dict]] = None, + columns_to_generate: Optional[list[dict]] = None, dataset_length: int = 100000, path: Optional[str] = None, ) -> pd.DataFrame: diff --git a/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py b/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py index df57854f6..51207413c 100644 --- a/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py +++ b/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py @@ -1,10 +1,11 @@ """Contains space and time analysis tests for the Dataprofiler""" + import json import os import random import time from collections import defaultdict -from typing import Dict, List, Optional +from typing import Optional import memray import numpy as np @@ -32,7 +33,7 @@ def dp_profile_space_analysis( data: pd.DataFrame, path: str, - options: Optional[Dict] = None, + options: Optional[dict] = None, ) -> StructuredProfiler: """ Generate memray bin file of the space analysis of dp.Profiler function @@ -68,12 +69,12 @@ def dp_merge_space_analysis(profile: StructuredProfiler, path: str): def dp_space_time_analysis( rng: Generator, - sample_sizes: List, + sample_sizes: list, data: pd.DataFrame, path: str = "./time_analysis/structured_profiler_times.json", percent_to_nan: float = 0.0, allow_subsampling: bool = True, - options: Optional[Dict] = None, + options: Optional[dict] = None, space_analysis=True, time_analysis=True, ): diff --git a/dataprofiler/tests/test_data_profiler.py b/dataprofiler/tests/test_data_profiler.py index 9ebdfa039..54a5f2d82 100644 --- a/dataprofiler/tests/test_data_profiler.py +++ b/dataprofiler/tests/test_data_profiler.py @@ -18,6 +18,7 @@ class TestDataProfiler(unittest.TestCase): @classmethod def setUpClass(cls): + import dataprofiler as dp test_dir = os.path.join(MODULE_PATH, "data") cls.input_file_names = [ @@ -25,6 +26,7 @@ def setUpClass(cls): path=os.path.join(test_dir, "csv/aws_honeypot_marx_geo.csv"), type="csv" ), ] + dp.settings._seed = None def test_set_seed(self): import dataprofiler as dp diff --git a/dataprofiler/tests/test_rng_utils.py b/dataprofiler/tests/test_rng_utils.py index 6ee2ed35c..8a4c4d229 100644 
--- a/dataprofiler/tests/test_rng_utils.py +++ b/dataprofiler/tests/test_rng_utils.py @@ -1,4 +1,5 @@ """Validates that generator intakes DATAPROFILER_SEED properly.""" + import os import unittest import unittest.mock diff --git a/dataprofiler/version.py b/dataprofiler/version.py index 1136efae1..b4e4c2b9c 100644 --- a/dataprofiler/version.py +++ b/dataprofiler/version.py @@ -1,8 +1,8 @@ """File contains the version number for the package.""" MAJOR = 0 -MINOR = 11 -MICRO = 0 +MINOR = 13 +MICRO = 1 POST = None # otherwise None VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO) diff --git a/resources/__init__.py b/resources/__init__.py index dd86bffe7..b1d2822d9 100644 --- a/resources/__init__.py +++ b/resources/__init__.py @@ -1,2 +1,3 @@ """Contains resources for labelers.""" + # empty init diff --git a/setup.cfg b/setup.cfg index dd0e2235f..6ec3e8a61 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,9 @@ line_length=88 warn_return_any = True warn_unused_configs = True ignore_missing_imports = True +no_implicit_optional = False +exclude = ^dataprofiler/tests/|^resources/|^examples|venv*/ +disable_error_code = override [check-manifest] ignore-default-rules=True diff --git a/setup.py b/setup.py index f8b5eaf8e..eeca6629b 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ setup( name="DataProfiler", version=__version__, - python_requires=">=3.8", + python_requires=">=3.9", description=DESCRIPTION, long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", diff --git a/tox.ini b/tox.ini index 90d06af06..4ee6081bd 100644 --- a/tox.ini +++ b/tox.ini @@ -16,7 +16,7 @@ deps = -rrequirements-reports.txt -rrequirements-test.txt commands = - python3 -m pytest dataprofiler/tests/ --cov=dataprofiler --cov-fail-under=80 --cov-report=xml:dist/coverage.xml --forked + python3 -m pytest dataprofiler/tests/ --cov=dataprofiler --cov-fail-under=80 --cov-report=xml:coverage.xml --forked # add "docs" to `envlist` to run the docs build #[testenv:docs]
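One closing illustration for the two floors raised at the tail of this diff: python_requires moving to ">=3.9" in setup.py and the version tuple moving to 0.13.1 in version.py. A downstream script that relies on the new behavior could guard its environment explicitly; the sketch below is illustrative and not code from the repository.

import sys

from dataprofiler.version import VERSION  # "%d.%d.%d" over (0, 13, 1) -> "0.13.1"

# Hypothetical downstream guard; both checks are assumptions about consumer
# usage, not something the package performs itself.
if sys.version_info < (3, 9):
    raise RuntimeError("DataProfiler now requires Python >= 3.9")
if tuple(int(part) for part in VERSION.split(".")[:3]) < (0, 13, 0):
    raise RuntimeError(f"expected DataProfiler >= 0.13, found {VERSION}")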