dev-fix/Improvements for Local Development Setup and Unit Test Structure #1168

Merged · 13 commits · Jan 30, 2025
Changes from all commits
3 changes: 0 additions & 3 deletions .gitignore
@@ -134,6 +134,3 @@ venv.bak/
env3/

*.bak

#Pipfiles
Pipfile*
17 changes: 10 additions & 7 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@ repos:
# Black: format Python code
# https://github.com/psf/black/blob/master/.pre-commit-hooks.yaml
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 24.10.0
hooks:
- id: black
types: [file, python]
@@ -38,7 +38,7 @@ repos:
# Mypy: Optional static type checking
# https://github.com/pre-commit/mirrors-mypy
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.982
rev: v1.11.2
hooks:
- id: mypy
exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/)
@@ -108,16 +108,19 @@ repos:
rev: "0.48"
hooks:
- id: check-manifest
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
additional_dependencies:
[
'matplotlib', 'h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3',
]
# Pyupgrade - standardize and modernize Python syntax for newer versions of the language
- repo: https://github.com/asottile/pyupgrade
rev: v3.3.0
hooks:
- id: pyupgrade
args: ["--py38-plus"]
args: ["--py39-plus"]
# Autoflake - cleanup unused variables and imports
- repo: https://github.com/PyCQA/autoflake
rev: v2.0.0
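The pyupgrade bump to --py39-plus rewrites syntax for Python 3.9+, which is what drives the builtin-generic annotations throughout the Python files below. A minimal illustration of that rewrite, not taken from this diff:

    # Before pyupgrade --py39-plus: typing generics
    from typing import Dict, List

    def count_tokens(lines: List[str]) -> Dict[str, int]: ...

    # After: PEP 585 builtin generics, no typing imports required
    def count_tokens(lines: list[str]) -> dict[str, int]: ...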
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -17,4 +17,4 @@ recursive-include resources *.json
recursive-include resources *.pb
recursive-include resources *.py

recursive-include dataprofiler/labelers/embeddings/ *.txt
recursive-include dataprofiler/labelers/embeddings/*.txt
36 changes: 27 additions & 9 deletions Makefile
@@ -1,12 +1,19 @@
setup: requirements.txt requirements-dev.txt requirements-test.txt
python3 -m venv venv
PYTHON_VERSION ?= python3.9
VENV_DIR ?= venv
REQ_FILES := requirements.txt requirements-dev.txt requirements-test.txt requirements-ml.txt requirements-reports.txt

. venv/bin/activate && \
pip3 install -r requirements.txt && \
pip3 install -r requirements-dev.txt && \
pip3 install -r requirements-ml.txt && \
pip3 install -r requirements-reports.txt && \
pip3 install -r requirements-test.txt && \
check-python:
@$(PYTHON_VERSION) --version | grep -E "Python (3\.9|3\.10|3\.11)" || \
(echo "Python 3.9, 3.10, or 3.11 is required. Ensure $(PYTHON_VERSION) is installed and try again." && exit 1)

setup: check-python $(REQ_FILES)
@$(PYTHON_VERSION) -m venv $(VENV_DIR)
. $(VENV_DIR)/bin/activate && \
pip3 install --no-cache-dir -r requirements-ml.txt && \
pip3 install --no-cache-dir -r requirements.txt && \
pip3 install --no-cache-dir -r requirements-dev.txt && \
pip3 install --no-cache-dir -r requirements-reports.txt && \
pip3 install --no-cache-dir -r requirements-test.txt && \
pip3 install -e . && \
pre-commit install && \
pre-commit run
@@ -15,4 +22,15 @@ format:
pre-commit run

test:
DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py"
DATAPROFILER_SEED=0 $(VENV_DIR)/bin/python -m unittest discover -p "test*.py"

clean:
rm -rf .pytest_cache __pycache__

help:
@echo "Makefile Commands:"
@echo " setup - Set up the virtual environment with Python $(PYTHON_VERSION)"
@echo " format - Format the code using pre-commit hooks"
@echo " test - Run unit tests with unittest"
@echo " clean - Remove temporary files (caches), but keep the virtual environment"
@echo " help - Display this help message"
1 change: 1 addition & 0 deletions dataprofiler/__init__.py
@@ -1,4 +1,5 @@
"""Package for dataprofiler."""

from . import settings
from .data_readers.data import Data
from .dp_logging import get_logger, set_verbosity
5 changes: 3 additions & 2 deletions dataprofiler/_typing.py
@@ -1,9 +1,10 @@
"""Contains typing aliases."""
from typing import Dict, List, NewType, Union

from typing import NewType, Union

import numpy as np
import pandas as pd

DataArray = Union[pd.DataFrame, pd.Series, np.ndarray]
JSONType = Union[str, int, float, bool, None, List, Dict]
JSONType = Union[str, int, float, bool, None, list, dict]
Url = NewType("Url", str)
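With JSONType now spelled using builtin generics, the alias works on Python 3.9+ without any typing imports. A minimal sketch of consuming the alias (values are hypothetical):

    from dataprofiler._typing import JSONType

    payload: JSONType = {"counts": [1, 2.5, None], "valid": True}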
21 changes: 11 additions & 10 deletions dataprofiler/data_readers/avro_data.py
@@ -1,6 +1,7 @@
"""Contains class for saving and loading spreadsheet data."""

from io import BytesIO, StringIO
from typing import Any, Dict, List, Optional, Union
from typing import Any, Optional, Union

import fastavro

@@ -20,7 +21,7 @@ def __init__(
self,
input_file_path: Optional[str] = None,
data: Optional[Any] = None,
options: Optional[Dict] = None,
options: Optional[dict] = None,
) -> None:
"""
Initialize Data class for loading datasets of type AVRO.
@@ -60,22 +61,22 @@ def file_encoding(self, value: Any) -> None:
"""
pass

def _load_data_from_file(self, input_file_path: str) -> List:
def _load_data_from_file(self, input_file_path: str) -> list:
"""Load data from file."""
with FileOrBufferHandler(input_file_path, "rb") as input_file:
# Currently, string reading with 'r' option has the unicode issue,
# even when the option encoding='utf-8' is added. It may come from
# some special compression codec, e.g., snappy. Then, binary mode
# reading is currently used to get the dict-formatted lines.
df_reader = fastavro.reader(input_file)
lines: List = list()
lines: list = list()
for line in df_reader:
lines.append(line)
return lines

@classmethod
def is_match(
cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None
cls, file_path: Union[str, StringIO, BytesIO], options: Optional[dict] = None
) -> bool:
"""
Test the given file to check if the file has valid AVRO format or not.
@@ -103,7 +104,7 @@ def is_match(
return is_valid_avro

@classmethod
def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
def _get_nested_key(cls, dict_line: dict, nested_key: dict) -> dict:
"""
Update nested keys from a dictionary and the current nested key.

@@ -131,7 +132,7 @@ def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
return nested_key

@classmethod
def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
def _get_nested_keys_from_dicts(cls, dicts: list[dict]) -> dict:
"""
Extract nested keys from a list of dictionaries.

@@ -143,13 +144,13 @@ def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
:type dicts: list(dict)
:return: a dictionary containing nested keys
"""
nested_keys: Dict = {}
nested_keys: dict = {}
for dict_line in dicts:
nested_keys = cls._get_nested_key(dict_line, nested_keys)
return nested_keys

@classmethod
def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
def _get_schema_avro(cls, nested_keys: dict, schema_avro: dict) -> dict:
"""
Update avro schema from the nested keys and the current avro schema.

@@ -190,7 +191,7 @@ def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
if type(value) is dict:
# here, the null option to specify keys not required
# for every lines
schema_avro_temp: Dict[str, Any] = {
schema_avro_temp: dict[str, Any] = {
"name": key,
"type": [{"name": key, "type": "record", "fields": []}, "null"],
}
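As context for the reader changes above, a usage sketch through the public Data factory (the file name is hypothetical, and attribute names follow the library's documented readers):

    import dataprofiler as dp

    data = dp.Data("events.avro")  # expected to dispatch to the AVRO reader shown above
    print(data.data_type)          # assumed to report "avro"
    contents = data.data           # parsed contents; exact format depends on reader options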
20 changes: 11 additions & 9 deletions dataprofiler/data_readers/base_data.py
@@ -1,9 +1,11 @@
"""Contains abstract class for data loading and saving."""

import locale
import sys
from collections import OrderedDict
from collections.abc import Generator
from io import StringIO
from typing import Any, Dict, Generator, List, Optional, Union
from typing import Any, Optional, Union

import numpy as np
import pandas as pd
@@ -21,7 +23,7 @@ class BaseData:
info: Optional[str] = None

def __init__(
self, input_file_path: Optional[str], data: Any, options: Dict
self, input_file_path: Optional[str], data: Any, options: dict
) -> None:
"""
Initialize Base class for loading a dataset.
@@ -42,7 +44,7 @@ def __init__(

# Public properties
self.input_file_path = input_file_path
self.options: Optional[Dict] = options
self.options: Optional[dict] = options

# 'Private' properties
# _data_formats: dict containing data_formats (key) and function
@@ -56,10 +58,10 @@ def __init__(
# constant across function calls.
# _tmp_file_name: randomly set variables for file name usable by system
# _file_encoding: contains the suggested file encoding for reading data
self._data_formats: Dict[str, Any] = OrderedDict()
self._data_formats: dict[str, Any] = OrderedDict()
self._selected_data_format: Optional[str] = None
self._data: Optional[Any] = data
self._batch_info: Dict = dict(perm=list(), iter=0)
self._batch_info: dict = dict(perm=list(), iter=0)
self._tmp_file_name: Optional[str] = None
self._file_encoding: Optional[str] = options.get("encoding", None)

@@ -137,7 +139,7 @@ def file_encoding(self, value: str) -> None:
self._file_encoding = value

@staticmethod
def _check_and_return_options(options: Optional[Dict]) -> Dict:
def _check_and_return_options(options: Optional[dict]) -> dict:
"""Return options or raise error."""
if not options:
options = dict()
@@ -151,7 +153,7 @@ def _load_data(self, data: Optional[Any] = None) -> None:

def get_batch_generator(
self, batch_size: int
) -> Generator[Union[pd.DataFrame, List], None, None]:
) -> Generator[Union[pd.DataFrame, list], None, None]:
"""Get batch generator."""
data_length = len(self.data)
indices = np.random.permutation(data_length)
@@ -162,12 +164,12 @@ def get_batch_generator(
yield list(self.data[k] for k in indices[i : i + batch_size])

@classmethod
def is_match(cls, input_file_path: str, options: Optional[Dict]) -> bool:
def is_match(cls, input_file_path: str, options: Optional[dict]) -> bool:
"""Return true if match, false otherwise."""
raise NotImplementedError()

def reload(
self, input_file_path: Optional[str], data: Any, options: Optional[Dict]
self, input_file_path: Optional[str], data: Any, options: Optional[dict]
) -> None:
"""
Reload the data class with a new dataset.
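The batch generator's return annotation now uses the builtin list; a brief sketch of the iteration pattern (file name and batch size are hypothetical):

    import dataprofiler as dp

    data = dp.Data("records.csv")
    for batch in data.get_batch_generator(batch_size=100):
        print(len(batch))  # per the annotation, a DataFrame for structured data or a list otherwise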
29 changes: 15 additions & 14 deletions dataprofiler/data_readers/csv_data.py
@@ -1,10 +1,11 @@
"""Contains class that saves and loads spreadsheet data."""

import csv
import random
import re
from collections import Counter
from io import StringIO
from typing import Dict, List, Optional, Tuple, Union, cast
from typing import Optional, Union, cast

import numpy as np
import pandas as pd
@@ -26,7 +27,7 @@ def __init__(
self,
input_file_path: Optional[str] = None,
data: Optional[pd.DataFrame] = None,
options: Optional[Dict] = None,
options: Optional[dict] = None,
):
"""
Initialize Data class for loading datasets of type CSV.
@@ -82,7 +83,7 @@ def __init__(
self._selected_data_format: str = options.get("data_format", "dataframe")
self._delimiter: Optional[str] = options.get("delimiter", None)
self._quotechar: Optional[str] = options.get("quotechar", None)
self._selected_columns: List[str] = options.get("selected_columns", list())
self._selected_columns: list[str] = options.get("selected_columns", list())
self._header: Optional[Union[str, int]] = options.get("header", "auto")
self._checked_header: bool = "header" in options and self._header != "auto"
self._default_delimiter: str = ","
@@ -97,7 +98,7 @@ def __init__(
self._quotechar = self._default_quotechar

@property
def selected_columns(self) -> List[str]:
def selected_columns(self) -> list[str]:
"""Return selected columns."""
return self._selected_columns

@@ -127,7 +128,7 @@ def is_structured(self) -> bool:
return self.data_format == "dataframe"

@staticmethod
def _check_and_return_options(options: Optional[Dict]) -> Dict:
def _check_and_return_options(options: Optional[dict]) -> dict:
"""
Ensure options are valid inputs to the data reader.

@@ -184,9 +185,9 @@ def _check_and_return_options(options: Optional[Dict]) -> Dict:
def _guess_delimiter_and_quotechar(
data_as_str: str,
quotechar: Optional[str] = None,
preferred: List[str] = [",", "\t"],
omitted: List[str] = ['"', "'"],
) -> Tuple[Optional[str], Optional[str]]:
preferred: list[str] = [",", "\t"],
omitted: list[str] = ['"', "'"],
) -> tuple[Optional[str], Optional[str]]:
r"""
Automatically check for what delimiter exists in a text document.

@@ -207,7 +208,7 @@ def _guess_delimiter_and_quotechar(
vocab = Counter(data_as_str)
if "\n" in vocab:
vocab.pop("\n")
omitted_list: List[str] = omitted
omitted_list: list[str] = omitted
if quotechar is not None:
omitted_list = omitted + [quotechar]
for char in omitted_list:
@@ -384,7 +385,7 @@ def _guess_header_row(
quotechar = '"'

# Determine type for every cell
header_check_list: List[List[str]] = []
header_check_list: list[list[str]] = []
only_string_flag = True # Requires additional checks
for row in data_as_str.split("\n"):

@@ -403,7 +404,7 @@

# Flags differences in types between each row (true/false)
potential_header = header_check_list[0]
differences: List[List[bool]] = []
differences: list[list[bool]] = []
for i in range(0, len(header_check_list)):
differences.append([])

@@ -612,7 +613,7 @@ def _load_data_from_file(self, input_file_path: str) -> pd.DataFrame:
encoding=self.file_encoding,
)

def _get_data_as_records(self, data: pd.DataFrame) -> List[str]:
def _get_data_as_records(self, data: pd.DataFrame) -> list[str]:
"""Return data as records."""
sep = self.delimiter if self.delimiter else self._default_delimiter
quote = self.quotechar if self.quotechar else self._default_quotechar
Expand All @@ -621,7 +622,7 @@ def _get_data_as_records(self, data: pd.DataFrame) -> List[str]:
return super()._get_data_as_records(data)

@classmethod
def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool:
def is_match(cls, file_path: str, options: Optional[dict] = None) -> bool:
"""
Check if first 1000 lines of given file has valid delimited format.

@@ -745,7 +746,7 @@ def reload(
self,
input_file_path: Optional[str] = None,
data: Optional[pd.DataFrame] = None,
options: Optional[Dict] = None,
options: Optional[dict] = None,
):
"""
Reload the data class with a new dataset.
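The CSV reader options touched here (delimiter, quotechar, selected_columns, header, data_format) all arrive through the options dict; a sketch of a call site, assuming the Data factory forwards options as in the constructor above (file name, delimiter, and column names are hypothetical):

    import dataprofiler as dp

    options = {"delimiter": ";", "header": 0, "selected_columns": ["id", "name"]}
    data = dp.Data("example.csv", options=options)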