From d386b9e4b9050a71ed53c3032ffc114349e6543f Mon Sep 17 00:00:00 2001 From: cdo03c Date: Tue, 5 Dec 2023 20:07:19 -0500 Subject: [PATCH 01/17] Adds alt text to images --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 82f8418..64092de 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # PyMASq

-<img src="..." /> +<img src="..." alt="MASq Logo" />

## Python-based Mitigation Application and Assessment (MASq) @@ -32,9 +32,9 @@ cd pymasq ### Installing into a Conda Environment ```sh -conda create -n masq python=3.8 -y +conda create -n masq python=3.10 -y conda activate masq -pip install . +pip install -e . ``` To generate the docs @@ -44,7 +44,7 @@ python -m pip install -r ./doc-requirements.txt ```
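
A quick smoke test from Python can confirm the editable install works. The sketch below is a minimal, non-authoritative example based on the API exercised by the test suite later in this patch series (`load_census`, `set_seed`, `pram`, `hashing`); the column choices and the salt size are illustrative only.

```python
# Hedged usage sketch: imports and calls mirror those used in the tests later in this series.
import pymasq
from pymasq.datasets import load_census
from pymasq.mitigations import hashing, pram

pymasq.set_seed(10)

df = load_census().head(100)

# Post-randomization (PRAM) of categorical quasi-identifiers.
prammed = pram(df[["marital_status", "race"]])

# SHA-256 hashing of a single column; salt=16 asks for an auto-generated random salt.
hashed = hashing(df[["education"]], "sha256", salt=16)

print(prammed.head())
print(hashed.head())
```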

-<img src="..." /> +<img src="..." alt="MIT Lincoln Lab Logo" />

## Distribution Statement From 772e216848f9ac4218e41cc8ef5db263c0252329 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 6 Dec 2023 18:57:03 -0500 Subject: [PATCH 02/17] Cleans up code --- setup.cfg | 40 ++++++++++++----------- src/pymasq/config.py | 1 - src/pymasq/datasets/data_generator.py | 2 +- src/pymasq/mitigations/add_noise.py | 1 - src/pymasq/mitigations/geom_transform.py | 2 +- src/pymasq/mitigations/hashing.py | 3 +- src/pymasq/mitigations/local_supp.py | 4 ++- src/pymasq/mitigations/pram.py | 7 ++-- src/pymasq/mitigations/rank_swap.py | 7 ++-- src/pymasq/mitigations/rounding.py | 7 ++-- src/pymasq/mitigations/shuffle.py | 4 +-- src/pymasq/mitigations/substitute.py | 5 ++- src/pymasq/mitigations/topbot_recoding.py | 3 +- src/pymasq/mitigations/truncate.py | 12 ++++--- src/pymasq/mitigations/utils.py | 6 ++-- src/pymasq/models/_base.py | 8 ++--- src/pymasq/models/models.py | 2 +- src/pymasq/optimizations/optimizations.py | 2 +- src/pymasq/optimizations/utils.py | 2 +- src/pymasq/preprocessing/preprocess.py | 8 ++--- src/pymasq/sa/sobol.py | 2 +- src/pymasq/utils/utils.py | 1 - tests/utils/test_cache.py | 10 ++---- 23 files changed, 64 insertions(+), 75 deletions(-) diff --git a/setup.cfg b/setup.cfg index 7bf9a61..31608d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,27 +11,29 @@ author = Cuyler OBrien, Jaime Pena, Evan Young, Brian Levine, Eric Wybenga author_email = cuyler.obrien@ll.mit.edu, jdpena@ll.mit.edu, evan.young@ll.mit.edu [options] -python_requires = >= 3.8 +python_requires = >= 3.9 packages = find: package_dir = = src install_requires = - boruta>=0.3 - bpemb>=0.3.3 - matplotlib>=3.4.2 - numpy>=1.19.3 - pandas>=1.1.3 + boruta~=0.3 + bpemb~=0.3 + matplotlib~=3.5 + numpy~=1.22 + pandas~=1.4 plotly>=4.11.0 - scikit-learn>=0.23 - scipy>=1.5.4 - statsmodels>=0.12 - SALib>=1.4.5 - tensorflow>=2.4.0 - tpot[dask]>=0.11 + SALib~=1.4 + scikit-learn~=1.1 + scipy~=1.8 + statsmodels~=0.13 + tensorflow~=2.9 + tpot[dask]~=0.11 tests_require = - pytest>=3.8 - hypothesis>=4.53.2 beartype>=0.5.1 + hypothesis>=4.53.2 + pytest>=3.8 + pytest-xdist~=3.5 + [options.packages.find] where = src @@ -43,7 +45,7 @@ python_files=test_*.py testpaths=tests [tox:tox] -envlist = py38, py39, coverage, bandit, owasp-depcheck +envlist = py3{9,10,11}, coverage, bandit, owasp-depcheck toxworkdir = build/tox [testenv] @@ -54,7 +56,7 @@ commands = pytest tests --junitxml={toxworkdir}/xunit-tests-{envname}.xml -o jun [testenv:coverage] usedevelop = true -basepython = python3.8 +basepython = python3.10 deps = {[testenv]deps} coverage pytest-cov @@ -62,16 +64,16 @@ commands = pytest --cov-report xml:{toxworkdir}/xunit-coverage.xml --cov-config= [testenv:localcoverage] usedevelop = true -basepython = python3.8 +basepython = python3.10 deps = {[testenv]deps} coverage pytest-cov commands = pytest --cov-report term-missing --cov-config=setup.cfg --cov=pymasq tests [testenv:bandit] -basepython = python3.8 +basepython = python3.10 deps = bandit commands = bandit -f json -o {toxworkdir}/security-bandit.json -r {envsitepackagesdir}/pymasq [testenv:owasp-depcheck] -basepython = python3.8 +basepython = python3.10 diff --git a/src/pymasq/config.py b/src/pymasq/config.py index 855e3ad..57f1b3a 100644 --- a/src/pymasq/config.py +++ b/src/pymasq/config.py @@ -1,6 +1,5 @@ from pathlib import Path from typing import Tuple -from pymasq import ROOT_DIR # Directory where all embeddings and models will be cached CACHE_LOCATION: Path = Path("~/.cache/pymasq").expanduser() diff --git a/src/pymasq/datasets/data_generator.py 
b/src/pymasq/datasets/data_generator.py index 8b88fa0..773c429 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -136,7 +136,7 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: while len(unique_entries) != len(set(unique_entries)): unique_entries = np.random.choice(range(n), l) - non_unique = np.random.choice(unique_entries, n - l) + non_unique = np.random.default_rng().choice(unique_entries, n - l) return list(unique_entries) + list(non_unique) diff --git a/src/pymasq/mitigations/add_noise.py b/src/pymasq/mitigations/add_noise.py index 92e85a8..eca3a34 100644 --- a/src/pymasq/mitigations/add_noise.py +++ b/src/pymasq/mitigations/add_noise.py @@ -13,7 +13,6 @@ VALIDATE_NUMERIC_ON_INPUT, VALIDATE_NUMERIC_ON_OUTPUT, ) -from pymasq.mitigations.utils import _as_series, _as_dataframe from pymasq.utils import validate_numeric, formatting from pymasq import BEARTYPE from pymasq.errors import InputError diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index 940f90a..9e5fef5 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -10,7 +10,7 @@ from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical -from pymasq.utils import formatting, validate_numeric +from pymasq.utils import formatting __all__ = ["geom_transform"] diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index 9446d8e..a178ebf 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Callable, List, Optional, Union import hashlib import numpy as np @@ -8,7 +8,6 @@ from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, - FORMATTING_IGNORE_DTYPES, ) from pymasq.errors import InputError from pymasq.utils import formatting diff --git a/src/pymasq/mitigations/local_supp.py b/src/pymasq/mitigations/local_supp.py index e48a2a4..558f365 100644 --- a/src/pymasq/mitigations/local_supp.py +++ b/src/pymasq/mitigations/local_supp.py @@ -137,7 +137,9 @@ def local_supp( method=method, qual=qual, ) - if not keep_dtypes and type(to_val) != type(data.loc[0, suppress_col]): + if not keep_dtypes and not isinstance( + to_val, type(data.loc[0, suppress_col]) + ): # TODO: switch to logging print( f"WARNING: The datatype of the `suppress_col` ({suppress_col}`) will be changed." diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index df04f55..d1bcb18 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -6,7 +6,6 @@ from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, - FORMATTING_IGNORE_DTYPES, ) from pymasq.errors import InputError, NotInRangeError from pymasq.mitigations.utils import _is_identical @@ -92,9 +91,7 @@ def __randomization( d_pramed[idxs] = np.random.choice( cats, len(idxs), - p=trans.loc[ - cat, - ], + p=trans.loc[cat,], ) return d_pramed @@ -300,7 +297,7 @@ def pram( if len(perturb_cols) != n_pc: if len(perturb_cols) == 0: - raise InputError(f"All values of `data` cannot be NaNs or identical.") + raise InputError("All values of `data` cannot be NaNs or identical.") else: print( "WARNING: ignoring columns that are composed entirely of identical values."
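
The `__randomization` hunk above re-draws each value in a category using that category's row of a transition-probability matrix. A small self-contained sketch of the draw pattern follows; the categories and matrix values here are illustrative, not taken from the library.

```python
# Illustrative PRAM-style draw: positions currently holding `cat` are replaced by
# categories sampled from that category's row of a transition matrix.
import numpy as np
import pandas as pd

cats = ["White", "Black"]
trans = pd.DataFrame({"White": [0.9, 0.2], "Black": [0.1, 0.8]}, index=cats)  # rows sum to 1

cat = "White"
idxs = range(5)  # positions in the column that currently hold `cat`
new_vals = np.random.choice(cats, len(idxs), p=trans.loc[cat, :])
print(new_vals)
```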
diff --git a/src/pymasq/mitigations/rank_swap.py b/src/pymasq/mitigations/rank_swap.py index 057cec6..0a47a13 100644 --- a/src/pymasq/mitigations/rank_swap.py +++ b/src/pymasq/mitigations/rank_swap.py @@ -1,7 +1,6 @@ from typing import Union, List import pandas as pd -import numpy as np from .utils import _as_series @@ -9,11 +8,9 @@ def rank_swap( - data: Union[pd.DataFrame, pd.Series], - cols: Union[str, List[str]] = None, - **kwargs + data: Union[pd.DataFrame, pd.Series], cols: Union[str, List[str]] = None, **kwargs ) -> pd.Series: - """ TODO + """TODO Parameters ---------- diff --git a/src/pymasq/mitigations/rounding.py b/src/pymasq/mitigations/rounding.py index 772f810..68134ee 100644 --- a/src/pymasq/mitigations/rounding.py +++ b/src/pymasq/mitigations/rounding.py @@ -1,4 +1,3 @@ -import math import pandas as pd from typing import List, Union, Optional @@ -29,14 +28,14 @@ def rounding( cols: Optional[Union[List, str, int]] = None, keep_dtypes: bool = True, ) -> pd.DataFrame: - """ Round numerical values to the nearest place value. + """Round numerical values to the nearest place value. Round to the nearest whole number or decimal. Values are always rounded up. Parameters ---------- data : DataFrame, Series, or array_like - The data to be modified. + The data to be modified. magnitude : int (Default: 0) The place value to round to. round_decimal : bool (Default: False) @@ -54,7 +53,7 @@ def rounding( Examples -------- - >>> df = pd.DataFrame(np.random.uniform(0.0, 1000, (10,3))) + >>> df = pd.DataFrame(np.random.uniform(0.0, 1000, (10,3))) 0 1 2 0 790.885012 378.955986 598.524492 1 396.506198 416.688230 801.133469 diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 9e38c55..0cf5b85 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -10,7 +10,7 @@ from pymasq.config import FORMATTING_ON_OUTPUT from pymasq.utils import formatting from pymasq.preprocessing import LabelEncoder_pm -from pymasq.errors import InputError, DataTypeError +from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical @@ -327,4 +327,4 @@ def _shuffle_wrapper( raise InputError( f"Invalid `method` defined; method must be one of ['model', 'corr']. 
(Received: {method}" ) -''' \ No newline at end of file +''' diff --git a/src/pymasq/mitigations/substitute.py b/src/pymasq/mitigations/substitute.py index 951677f..67bb9de 100644 --- a/src/pymasq/mitigations/substitute.py +++ b/src/pymasq/mitigations/substitute.py @@ -1,10 +1,9 @@ import pandas as pd -import re from typing import List, Optional, Union from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES +from pymasq.config import FORMATTING_ON_OUTPUT from pymasq.utils import formatting __all__ = ["substitute"] @@ -13,7 +12,7 @@ def __format_if_list( from_val: Union[str, float, int, List], to_val: Union[str, float, int, List] ): - """ Format input values if at least one of them is a list """ + """Format input values if at least one of them is a list""" if isinstance(from_val, list): to_val = to_val if isinstance(to_val, list) else [to_val] if len(to_val) == 1: diff --git a/src/pymasq/mitigations/topbot_recoding.py b/src/pymasq/mitigations/topbot_recoding.py index 745c073..65dd92b 100644 --- a/src/pymasq/mitigations/topbot_recoding.py +++ b/src/pymasq/mitigations/topbot_recoding.py @@ -1,11 +1,10 @@ import pandas as pd -from typing import Union, List, Optional, Dict, Final +from typing import Union, List, Optional, Final from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, - FORMATTING_IGNORE_DTYPES, VALIDATE_NUMERIC_ON_INPUT, VALIDATE_NUMERIC_ON_OUTPUT, ) diff --git a/src/pymasq/mitigations/truncate.py b/src/pymasq/mitigations/truncate.py index 303b27a..79abbf0 100644 --- a/src/pymasq/mitigations/truncate.py +++ b/src/pymasq/mitigations/truncate.py @@ -43,7 +43,7 @@ def truncate_by_match( Parameters ---------- data : DataFrame or Series - The data to be modified. + The data to be modified. match : str The string to search for. keep_before : bool, optional (Default: True) @@ -70,7 +70,7 @@ def truncate_by_match( 2 Private HS-grad Not-in-family 3 Private 11th Husband 4 Private Bachelors Wife - + >>> truncate_by_match(df[['workclass', 'education', 'relationship']], match='a') workclass education relationship 0 St B Not-in-f @@ -140,7 +140,7 @@ def truncate_by_index( 2 Private HS-grad Not-in-family 3 Private 11th Husband 4 Private Bachelors Wife - + >>> truncate_by_index(df[['workclass', 'education', 'relationship']], idx=1, trim_from='both') workclass education relationship 0 tate-go achelor ot-in-famil @@ -167,7 +167,9 @@ def _truncate_by_index(series, trim_from, idx, end): def truncate( - data: Union[pd.DataFrame, pd.Series], method: str = "index", **kwargs, + data: Union[pd.DataFrame, pd.Series], + method: str = "index", + **kwargs, ) -> pd.DataFrame: """Truncate strings by index or after matching a speficic substring. @@ -235,7 +237,7 @@ def truncate( 2 Private HS-grad Not-in-family 3 Private 11th Husband 4 Private Bachelors Wife - + >>> truncate(df, cols=['workclass', 'education', 'relationship'], method='index', idx=1, trim_from='both') workclass education relationship 0 tate-go achelor ot-in-famil diff --git a/src/pymasq/mitigations/utils.py b/src/pymasq/mitigations/utils.py index 65b56cb..faf7ef1 100644 --- a/src/pymasq/mitigations/utils.py +++ b/src/pymasq/mitigations/utils.py @@ -11,7 +11,7 @@ def _is_identical(s: pd.Series) -> bool: - """ Checks if all values in the input series are identical. 
""" + """Checks if all values in the input series are identical.""" s = s.to_numpy() # s.values (pandas<0.24) return (s[0] == s).all() @@ -19,7 +19,7 @@ def _is_identical(s: pd.Series) -> bool: def _as_series( obj: Union[pd.DataFrame, pd.Series], cols: Optional[Union[str, List[str]]] = None ) -> pd.Series: - """ Convert an object data structure into a Series """ + """Convert an object data structure into a Series""" if isinstance(obj, pd.DataFrame): if cols is None: raise InputError( @@ -38,7 +38,7 @@ def _as_series( def _as_dataframe( obj: Union[pd.DataFrame, pd.Series], cols: Optional[Union[str, List[str]]] = None ) -> pd.DataFrame: - """ Convert an object data structure into a DataFrame """ + """Convert an object data structure into a DataFrame""" if isinstance(obj, pd.DataFrame): if cols is None: return obj.copy() diff --git a/src/pymasq/models/_base.py b/src/pymasq/models/_base.py index 46c8b06..5a0206c 100644 --- a/src/pymasq/models/_base.py +++ b/src/pymasq/models/_base.py @@ -1,11 +1,11 @@ +import os from abc import abstractmethod -from joblib.parallel import DEFAULT_N_JOBS +from typing import Type, Optional, Union + import pandas as pd -import os -from typing import Type, Optional, List, Union -from pymasq.utils import cache import pymasq.config as cfg +from pymasq.utils import cache from pymasq.preprocessing._base import PreprocessorBase from pymasq import BEARTYPE diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 3ea34eb..82c8116 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -1,4 +1,4 @@ -from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER, DEFAULT_SEED +from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER import pandas as pd import numpy as np from typing import List, Optional, Type, Any, Union diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index e17d541..4383f71 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -456,7 +456,7 @@ def _optimize(self): prob = np.random.random_sample() - if (target.equals(new_target) == False) and ( + if not target.equals(new_target) and ( self._accept_prob(cur_fit, new_fit) > prob ): if self.verbose >= 1: diff --git a/src/pymasq/optimizations/utils.py b/src/pymasq/optimizations/utils.py index a134def..a36ca9e 100644 --- a/src/pymasq/optimizations/utils.py +++ b/src/pymasq/optimizations/utils.py @@ -1,6 +1,6 @@ from typing import Tuple import pandas as pd -from typing import Callable, Dict, List, Optional, Union +from typing import Callable, Dict, List, Union from pymasq import BEARTYPE from pymasq.optimizations import IterativeSearch diff --git a/src/pymasq/preprocessing/preprocess.py b/src/pymasq/preprocessing/preprocess.py index a3f8587..705a2a9 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -384,7 +384,7 @@ def _organize_columns( dropped_cols.extend(ignore_columns) if sensitive_col or ignore_columns: - input_data = df.drop(dropped_cols, 1).copy() + input_data = df.drop(columns=dropped_cols, axis=1).copy() else: input_data = df.copy() @@ -548,7 +548,7 @@ def encode( print("Splitting Data into Numerical and Categorical Data...") if sensitive_col or ignore_columns: - input_data = df.drop(dropped_cols, 1).copy() + input_data = df.drop(columns=dropped_cols, axis=1).copy() ignore_col_data = df.loc[:, ignore_columns].copy() else: input_data = df.copy() @@ -678,14 +678,14 @@ def encode( [y, ignore_col_data, 
numerical_imputed_normalized, binary] + categorical_embeddings + textual_embeddings, - 1, + axis=1, ) return pd.concat( [ignore_col_data, numerical_imputed_normalized, binary] + categorical_embeddings + textual_embeddings, - 1, + axis=1, ) diff --git a/src/pymasq/sa/sobol.py b/src/pymasq/sa/sobol.py index c316a32..60b62fb 100644 --- a/src/pymasq/sa/sobol.py +++ b/src/pymasq/sa/sobol.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from pymasq.errors import DataTypeError, InputError -from SALib.sample import saltelli +from SALib.sample import sobol as saltelli from SALib.analyze import sobol from typing import Dict, Optional, Tuple, Final diff --git a/src/pymasq/utils/utils.py b/src/pymasq/utils/utils.py index a974089..2eb6d23 100644 --- a/src/pymasq/utils/utils.py +++ b/src/pymasq/utils/utils.py @@ -8,7 +8,6 @@ from pymasq import BEARTYPE from pymasq import config -from pymasq.errors import InputError __all__ = ["BOTH", "as_dataframe", "validate_numeric", "formatting", "freq_calc"] diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index c1e64ad..3157e43 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -8,9 +8,6 @@ from pymasq.datasets import load_census from pymasq.models.models import LogisticRegressionClassifier, RFClassifier from pymasq.preprocessing import LabelEncoder_pm, EmbeddingsEncoder - -# from pymasq.errors import InputError, DataTypeError - from pymasq.utils import cache @@ -36,7 +33,7 @@ def my_df(): ( LogisticRegressionClassifier, LabelEncoder_pm, - 0.6, + 0.5, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.larsCV.2bd270eec04828b035a1facfbb35f355.pkl", """larsCV. Description: Preprocessed with First ten rows: @@ -57,7 +54,7 @@ def my_df(): ( RFClassifier, EmbeddingsEncoder, - 0.5, + 0.61, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.ENCV.e81a5b5eb0df48bc68540d7b71342a7d.pkl", """ENCV. 
Description: Preprocessed with First ten rows: @@ -121,10 +118,9 @@ def test_cache(my_df, combo): except Exception as e: print("This error is a desired outcome of the test:") print("\t", e, "\n") - pass cfg.CACHE_HMAC_KEY = "my key" # Assert to see if description was saved descriptions = cache.cache_info(dir_name) assert descriptions[key] == desc - shutil.rmtree(dir_name) \ No newline at end of file + shutil.rmtree(dir_name) From cff46b151316027f381b8f52fab9be5447927562 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 6 Dec 2023 19:00:57 -0500 Subject: [PATCH 03/17] Cleans up python style --- src/pymasq/mitigations/global_recode.py | 12 ++--- src/pymasq/mitigations/shuffle.py | 4 +- src/pymasq/models/_base.py | 2 +- src/pymasq/models/models.py | 14 +++--- src/pymasq/preprocessing/__init__.py | 4 +- src/pymasq/preprocessing/preprocess.py | 59 ++++++++++++------------- tests/classifiers/test_classifiers.py | 8 ++-- tests/preprocessing/test_preprocess.py | 6 +-- tests/utils/test_cache.py | 6 +-- 9 files changed, 56 insertions(+), 59 deletions(-) diff --git a/src/pymasq/mitigations/global_recode.py b/src/pymasq/mitigations/global_recode.py index 73d4fb0..3deb79b 100644 --- a/src/pymasq/mitigations/global_recode.py +++ b/src/pymasq/mitigations/global_recode.py @@ -1,4 +1,4 @@ -from pymasq.preprocessing.preprocess import LabelEncoder_pm +from pymasq.preprocessing.preprocess import LabelEncoderPM import pandas as pd import numpy as np @@ -29,23 +29,23 @@ def __gr_equidistant(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for `equidistant` method """ + """Global Recode for `equidistant` method""" return np.linspace(data.min(), data.max(), breaks) def __gr_log_equidistant(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for `log_equidistant` method """ + """Global Recode for `log_equidistant` method""" data_log = np.log(data) return np.exp(np.linspace(data_log.min(), data_log.max(), breaks)) def __gr_equal_quantity(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for `equal` method """ + """Global Recode for `equal` method""" return data.quantile(np.linspace(0, 1, breaks)) def __gr_order_of_magnitude(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for order of `magnitude` method. """ + """Global Recode for order of `magnitude` method.""" data_log = np.log10(data) return np.power(10, np.linspace(data_log.min(), data_log.max(), breaks)) @@ -194,7 +194,7 @@ def global_recode( ) ) if ret_ints: - le = LabelEncoder_pm() + le = LabelEncoderPM() return le.encode(data_recode.astype(str)) return data_recode diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 0cf5b85..961a433 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -9,7 +9,7 @@ from pymasq import BEARTYPE from pymasq.config import FORMATTING_ON_OUTPUT from pymasq.utils import formatting -from pymasq.preprocessing import LabelEncoder_pm +from pymasq.preprocessing import LabelEncoderPM from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical @@ -214,7 +214,7 @@ def shuffle( "WARNING: ignoring columns that are composed entirely of identical values." 
) - _data = LabelEncoder_pm.encode(df=pd.concat([x, y], axis=1)) + _data = LabelEncoderPM.encode(df=pd.concat([x, y], axis=1)) resp_cols = y.columns.to_list() pred_cols = x.columns.to_list() diff --git a/src/pymasq/models/_base.py b/src/pymasq/models/_base.py index 5a0206c..2cc0efa 100644 --- a/src/pymasq/models/_base.py +++ b/src/pymasq/models/_base.py @@ -87,7 +87,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain : boolean, optional (Default: False) Re-runs and saves over existing TPOT model for the given file path. diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 82c8116..0f59b9d 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -84,7 +84,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -180,7 +180,7 @@ def train( preprocessor : PreprocessorBase (Default: None) A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -288,7 +288,7 @@ def train( preprocessor : PreprocessorBase (Default: None) A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -397,7 +397,7 @@ def train( preprocessor : PreprocessorBase (Default: None) A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -498,7 +498,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -604,7 +604,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) scoring : string or callable, optional (Default: 'f1') @@ -785,7 +785,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. 
Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) scoring : string or callable, optional (Default: 'f1') diff --git a/src/pymasq/preprocessing/__init__.py b/src/pymasq/preprocessing/__init__.py index 4f6f142..cc23c91 100644 --- a/src/pymasq/preprocessing/__init__.py +++ b/src/pymasq/preprocessing/__init__.py @@ -1,11 +1,11 @@ from .preprocess import ( EmbeddingsEncoder, - LabelEncoder_pm, + LabelEncoderPM, ) from .entity_embedding import embed_entities __all__ = [ "embed_entities", "EmbeddingsEncoder", - "LabelEncoder_pm", + "LabelEncoderPM", ] diff --git a/src/pymasq/preprocessing/preprocess.py b/src/pymasq/preprocessing/preprocess.py index 705a2a9..96e9331 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -16,7 +16,7 @@ from pymasq import BEARTYPE # This file contains two children of PreprocessorBase -# 1. LabelEncoder_pm +# 1. LabelEncoderPM # 2. EmbeddingsEncoder ################# @@ -30,7 +30,7 @@ } -class LabelEncoder_pm(PreprocessorBase): +class LabelEncoderPM(PreprocessorBase): """ This class manages an instance of sklearn's LabelEncoder. Encodes categorical data only, as integers. @@ -38,7 +38,6 @@ class LabelEncoder_pm(PreprocessorBase): def __init__(self): super().__init__() - pass @staticmethod @BEARTYPE @@ -84,7 +83,7 @@ def encode(df: Union[pd.Series, pd.DataFrame], **kwargs) -> pd.DataFrame: @staticmethod @BEARTYPE def encode_both( - df_A: pd.DataFrame, df_B: pd.DataFrame, **kwargs + df_a: pd.DataFrame, df_b: pd.DataFrame, **kwargs ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Takes two dataframes and uses sklearn's LabelEncoder on categorical columns only to relabel @@ -100,28 +99,28 @@ def encode_both( Parameter --------- - df_A: pdf.DataFrame: + df_a: pdf.DataFrame: The data frame to encode. - df_B: pdf.DataFrame: + df_b: pdf.DataFrame: The data frame to encode. Returns ------- Tuple[pd.DataFrame, pd.DataFrame]: pd.DataFrame - df_A data frame now preprocessed so that categorical data is relabeled as integers. + df_a data frame now preprocessed so that categorical data is relabeled as integers. pd.DataFrame - df_B data frame now preprocessed so that categorical data is relabeled as integers. + df_b data frame now preprocessed so that categorical data is relabeled as integers. - Column order remains consistent with original dataframes. df_A and df_B are not modified. + Column order remains consistent with original dataframes. df_a and df_b are not modified. 
""" le = skLabelEncoder() # make a copy - df_a = df_A.copy() - df_b = df_B.copy() - if set(df_a.columns) != set(df_B.columns): - raise InputError("df_A and df_B must have same columns") + df_a = df_a.copy() + df_b = df_b.copy() + if set(df_a.columns) != set(df_b.columns): + raise InputError("df_a and df_b must have same columns") col_order = df_a.columns.tolist() # join together; mark each so we can separate again later @@ -150,10 +149,10 @@ def encode_both( [cat_cols.apply(le.fit_transform), num_cols], join="outer", axis=1 ) # split up again, and drop the extra column - df_A_enc = both.loc[both[class_col] == 0].drop(class_col, axis=1) - df_B_enc = both.loc[both[class_col] == 1].drop(class_col, axis=1) + df_a_enc = both.loc[both[class_col] == 0].drop(class_col, axis=1) + df_b_enc = both.loc[both[class_col] == 1].drop(class_col, axis=1) - return df_A_enc[col_order], df_B_enc[col_order] + return df_a_enc[col_order], df_b_enc[col_order] ################# @@ -193,7 +192,6 @@ class EmbeddingsEncoder(PreprocessorBase): def __init__(self): super().__init__() - pass @staticmethod def sentence_bpe_vectors( @@ -229,8 +227,8 @@ def sentence_bpe_vectors( @staticmethod @BEARTYPE def encode_both( - df_A: pd.DataFrame, - df_B: pd.DataFrame, + df_a: pd.DataFrame, + df_b: pd.DataFrame, sensitive_col: Optional[Union[List, str]] = None, seed: int = 1234, ) -> Tuple[pd.DataFrame, pd.DataFrame]: @@ -240,10 +238,10 @@ def encode_both( Parameters ---------- - df_A : pd.DataFrame + df_a : pd.DataFrame data frame containing the binary label column and the other variables of interest - df_B : pd.DataFrame + df_b : pd.DataFrame data frame containing the binary label column and the other variables of interest sensitive_col : str or List[str] (Default: None) @@ -255,20 +253,20 @@ def encode_both( ------- Tuple: pd.DataFrame - The encoded version of df_A + The encoded version of df_a pd.DataFrame - The encoded version of df_B - df_A and df_B are not modified. + The encoded version of df_b + df_a and df_b are not modified. 
""" - if set(df_A.columns) != set(df_B.columns): - raise InputError("df_A and df_B must have same columns.") + if set(df_a.columns) != set(df_b.columns): + raise InputError("df_a and df_b must have same columns.") # pick a column name that isn't in the dataset - class_col = utils.uniq_col_name(df_A) + class_col = utils.uniq_col_name(df_a) # make one dataframe for pre-processing, otherwise preprocess_data won't be consistent - orig_df_copy = df_A.copy() - mod_df_copy = df_B.copy() + orig_df_copy = df_a.copy() + mod_df_copy = df_b.copy() orig_df_copy[class_col] = 0 mod_df_copy[class_col] = 1 comb_for_proprocessing = pd.concat( @@ -293,7 +291,6 @@ def encode_both( .drop([class_col], axis=1) .reset_index(drop=True) ) - # return both return orig_df_proc, mod_df_proc @staticmethod @@ -693,5 +690,5 @@ def encode( preprocessor_fn = { None: PreprocessorBase, "embeddings": EmbeddingsEncoder, - "label_encode": LabelEncoder_pm, + "label_encode": LabelEncoderPM, } diff --git a/tests/classifiers/test_classifiers.py b/tests/classifiers/test_classifiers.py index 887215e..62ddc12 100644 --- a/tests/classifiers/test_classifiers.py +++ b/tests/classifiers/test_classifiers.py @@ -6,7 +6,7 @@ import pymasq.config as cfg from pathlib import Path from pymasq.datasets import load_census -from pymasq.preprocessing import LabelEncoder_pm, EmbeddingsEncoder +from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.models.models import ( LogisticRegressionClassifier, TpotClassifier, @@ -33,11 +33,11 @@ def my_df(): @pytest.mark.parametrize( "combo", [ - (LogisticRegressionClassifier, LabelEncoder_pm, 0.5), + (LogisticRegressionClassifier, LabelEncoderPM, 0.5), (LogisticRegressionClassifier, EmbeddingsEncoder, 0.5), - (RFClassifier, LabelEncoder_pm, 1.0), + (RFClassifier, LabelEncoderPM, 1.0), (RFClassifier, EmbeddingsEncoder, 1.0), - (TpotClassifier, LabelEncoder_pm, 0.77), + (TpotClassifier, LabelEncoderPM, 0.77), (TpotClassifier, EmbeddingsEncoder, 0.86), ], ) diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index 35c3647..af518a5 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -7,7 +7,7 @@ from pymasq.datasets import load_census -from pymasq.preprocessing import embed_entities, LabelEncoder_pm, EmbeddingsEncoder +from pymasq.preprocessing import embed_entities, LabelEncoderPM, EmbeddingsEncoder # from pymasq.errors import InputError, DataTypeError @@ -242,7 +242,7 @@ def test_label_encode_1(my_df): 10 37 280464 5 1 1 0 1 """ - enc, _ = LabelEncoder_pm.encode_both(my_df, my_df) + enc, _ = LabelEncoderPM.encode_both(my_df, my_df) assert ( enc.to_json() == '{"age":{"0":39,"1":50,"2":38,"3":53,"4":28,"5":37,"6":49,"7":52,"8":31,"9":42,"10":37},"fnlwgt":{"0":77516,"1":83311,"2":215646,"3":234721,"4":338409,"5":284582,"6":160187,"7":209642,"8":45781,"9":159449,"10":280464},"education":{"0":2,"1":2,"2":3,"3":0,"4":2,"5":4,"6":1,"7":3,"8":4,"9":2,"10":5},"marital_status":{"0":3,"1":1,"2":0,"3":1,"4":1,"5":1,"6":2,"7":1,"8":3,"9":1,"10":1},"sex":{"0":1,"1":1,"2":1,"3":1,"4":0,"5":0,"6":0,"7":1,"8":0,"9":1,"10":1},"capital_gain":{"0":2174,"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":14084,"9":5178,"10":0},"income_level":{"0":0,"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":1,"8":1,"9":1,"10":1}}' @@ -290,7 +290,7 @@ def test_label_encode_2(my_df): """ my_df1 = my_df[my_df["marital_status"].isin(["Never-married", "Divorced"])] my_df2 = my_df[~my_df["marital_status"].isin(["Never-married", "Divorced"])] - enc1, enc2 
= LabelEncoder_pm.encode_both(my_df1, my_df2) + enc1, enc2 = LabelEncoderPM.encode_both(my_df1, my_df2) assert set(enc1.marital_status).isdisjoint(set(enc2.marital_status)) assert ( enc1.to_json() diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index 3157e43..43775fb 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -7,7 +7,7 @@ from pathlib import Path from pymasq.datasets import load_census from pymasq.models.models import LogisticRegressionClassifier, RFClassifier -from pymasq.preprocessing import LabelEncoder_pm, EmbeddingsEncoder +from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.utils import cache @@ -32,10 +32,10 @@ def my_df(): [ ( LogisticRegressionClassifier, - LabelEncoder_pm, + LabelEncoderPM, 0.5, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.larsCV.2bd270eec04828b035a1facfbb35f355.pkl", - """larsCV. Description: Preprocessed with + """larsCV. Description: Preprocessed with First ten rows: age fnlwgt education ... sex capital_gain income_level 0 39 77516 Bachelors ... Male 2174 <=50K From c1debab3d9658560322c38a8611ae3ce829d7ffd Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 6 Dec 2023 19:24:44 -0500 Subject: [PATCH 04/17] Updates formatting --- docs/source/conf.py | 18 ++++----- src/pymasq/datasets/__init__.py | 8 +++- src/pymasq/datasets/_base.py | 5 ++- src/pymasq/errors/__init__.py | 26 ++++++------- src/pymasq/metrics/suda.py | 2 +- src/pymasq/mitigations/geom_transform.py | 8 ++-- src/pymasq/mitigations/hashing.py | 22 +++++++---- src/pymasq/mitigations/microaggregation.py | 8 +--- src/pymasq/optimizations/_base.py | 21 ++++++----- src/pymasq/optimizations/optimizations.py | 8 +--- src/pymasq/preprocessing/_base.py | 2 +- src/pymasq/utils/utils.py | 4 +- tests/mitigations/test_geom_transforms.py | 16 ++++---- tests/mitigations/test_hashing.py | 10 ++--- tests/mitigations/test_microaggregation.py | 31 ++++++++------- tests/mitigations/test_pram.py | 19 +++++----- tests/mitigations/test_shuffle.py | 7 ++-- tests/optimizations/test_optimizations.py | 44 +++++++++++----------- tests/optimizations/test_utils.py | 1 - 19 files changed, 129 insertions(+), 131 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index c05ad8b..23ecc49 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2,7 +2,7 @@ import sys import sphinx_rtd_theme -sys.path.insert(0, os.path.abspath(os.path.join('..','..'))) +sys.path.insert(0, os.path.abspath(os.path.join("..", ".."))) # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -22,12 +22,12 @@ # -- Project information ----------------------------------------------------- -project = 'pymasq' -copyright = '2022, MITLL' -author = 'MITLL' +project = "pymasq" +copyright = "2022, MITLL" +author = "MITLL" # The full version, including alpha/beta/rc tags -release = '1.0' +release = "1.0" # -- General configuration --------------------------------------------------- @@ -36,14 +36,14 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.napoleon', # NumPy & Google style docstring support + "sphinx.ext.napoleon", # NumPy & Google style docstring support "sphinx_rtd_theme", ] napoleon_google_docstring = False # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -56,9 +56,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/src/pymasq/datasets/__init__.py b/src/pymasq/datasets/__init__.py index cfa0f58..01dd262 100644 --- a/src/pymasq/datasets/__init__.py +++ b/src/pymasq/datasets/__init__.py @@ -2,7 +2,13 @@ The :mod:`pymasq.datasets` module includes utilities to load tabular datasets. """ -from ._base import load_data, load_census, load_loan, load_prestige, load_bank_attrition_rates +from ._base import ( + load_data, + load_census, + load_loan, + load_prestige, + load_bank_attrition_rates, +) from .data_generator import gen_geom_seq, gen_bin_df, gen_num_df from .utils import rand_cat_change diff --git a/src/pymasq/datasets/_base.py b/src/pymasq/datasets/_base.py index 52d1ec6..081f778 100644 --- a/src/pymasq/datasets/_base.py +++ b/src/pymasq/datasets/_base.py @@ -77,11 +77,12 @@ def load_loan(): """ return load_data("loan.csv") + def load_bank_attrition_rates(): """Load and return the Bank Attrition Rates dataset. - A manager at the bank is disturbed with more and more customers leaving their credit card services. - They would really appreciate if one could predict for them who is gonna get churned so + A manager at the bank is disturbed with more and more customers leaving their credit card services. + They would really appreciate if one could predict for them who is gonna get churned so they can proactively go to the customer to provide them better services and turn customers' decisions in the opposite direction. ============== ============== diff --git a/src/pymasq/errors/__init__.py b/src/pymasq/errors/__init__.py index c9cd3a9..c7effa7 100644 --- a/src/pymasq/errors/__init__.py +++ b/src/pymasq/errors/__init__.py @@ -1,34 +1,34 @@ - """ Expose public exceptions & warnings """ + class InputError(Exception): - """ Exception raised for errors in the input value. """ + """Exception raised for errors in the input value.""" class DataTypeError(Exception): - """ Exception raised for errors in the data type. """ - - + """Exception raised for errors in the data type.""" + + class SumNotEqualToOneError(ValueError): - """ Exception for sum of values not equal to 1. """ - + """Exception for sum of values not equal to 1.""" + class NotInRangeError(ValueError): - """ Exception for values not in specified interval. """ + """Exception for values not in specified interval.""" class LessThanZeroError(ValueError): - """ Exceptions for values < 0. """ + """Exceptions for values < 0.""" class LessThanOrEqualToZeroError(ValueError): - """ Exceptions for values <= 0. 
""" + """Exceptions for values <= 0.""" class NoMutationAvailableError(ValueError): - """ Exception when all mutations have been discarded and not replaced """ + """Exception when all mutations have been discarded and not replaced""" __all__ = [ @@ -38,5 +38,5 @@ class NoMutationAvailableError(ValueError): "NotInRangeError", "LessThanZeroError", "LessThanOrEqualToZeroError", - "NoMutationAvailableError" -] \ No newline at end of file + "NoMutationAvailableError", +] diff --git a/src/pymasq/metrics/suda.py b/src/pymasq/metrics/suda.py index 9901be4..f84de28 100644 --- a/src/pymasq/metrics/suda.py +++ b/src/pymasq/metrics/suda.py @@ -4,5 +4,5 @@ def suda(df: pd.DataFrame, cols: List[str], **kwargs: Dict[Any, Any]) -> pd.DataFrame: - """ TODO """ + """TODO""" return df diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index 9e5fef5..cf46976 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -279,19 +279,17 @@ def geom_transform( # Randomized expansion sign = np.sign(bo) - bo = np.add(abs(bo), abs(np.random.uniform(size=bo.shape) * magnitude)) + bo = np.add(abs(bo), abs(np.random.Generator.uniform(size=bo.shape) * magnitude)) bo = (bo * sign).T bo = bo * data[perturb_cols].std().values + data[perturb_cols].mean().values shuff_idx = data.index if shuffle: - shuff_idx = np.random.choice( + shuff_idx = np.random.Generator.choice( range(bo.shape[0]), size=(bo.shape[0]), replace=False ) - data.loc[:, perturb_cols] = bo[ - shuff_idx, - ] + data.loc[:, perturb_cols] = bo[shuff_idx,] if len(sensitive_col) != 0: data.loc[:, sensitive_col] = data.loc[shuff_idx, sensitive_col].reset_index( drop=True diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index a178ebf..5dcdf55 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -36,10 +36,10 @@ def hashing( a function name in the `hashlib` Python library [1]_. Else, it will apply the user-defined function. Algorithms listed in `hashlib.algorithms_guaranteed` are prefererd. salt : list, str, or int, Optional - The salt, or random data, to add to `data` to perturb it before hashing occurs. - If left as `None`, then no salt will be added to `data`. If `salt` is a list, - then it must be of the same length as `data`. If `salt` is a string, then the same salt value - will be added to each value in `data`. If `salt` is an integer, then a random salt of that bit size + The salt, or random data, to add to `data` to perturb it before hashing occurs. + If left as `None`, then no salt will be added to `data`. If `salt` is a list, + then it must be of the same length as `data`. If `salt` is a string, then the same salt value + will be added to each value in `data`. If `salt` is an integer, then a random salt of that bit size will automatically be generated (note that 16 and 32 are typical salt bit sizes). Generated salts can be stored by specifying the `store_salts` parameter. Please refer to [2]_ for additional information on the importance of salts. @@ -166,13 +166,17 @@ def hashing( salt, index=data.index, columns=data.columns, dtype=bytes ) if salt_df.shape != data.shape: - raise InputError(f"Incorrect `salt` dimensions; expected {data.shape}. (Received: {salt_df.shape})") + raise InputError( + f"Incorrect `salt` dimensions; expected {data.shape}. 
(Received: {salt_df.shape})" + ) elif isinstance(salt, str): salt_df[:] = salt.encode() elif isinstance(salt, int): salt_df = salt_df.applymap(lambda v: os.urandom(salt)) else: - raise InputError(f"Invalid `salt` type; only types allowed are `list`, `str`, and `int`. (Received: {type(salt)})") + raise InputError( + f"Invalid `salt` type; only types allowed are `list`, `str`, and `int`. (Received: {type(salt)})" + ) data = (data + salt_df) if append_salt else (salt_df + data) @@ -187,7 +191,9 @@ def hashing( if "shake" in str(hash_func): # TODO: change to logging - print(f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length.") + print( + f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length." + ) return data.applymap(lambda v: hash_func(v).hexdigest(16)) - + return data.applymap(lambda v: hash_func(v).hexdigest()) diff --git a/src/pymasq/mitigations/microaggregation.py b/src/pymasq/mitigations/microaggregation.py index f531190..37e4d10 100644 --- a/src/pymasq/mitigations/microaggregation.py +++ b/src/pymasq/mitigations/microaggregation.py @@ -578,18 +578,14 @@ def _knn(pwds, aggr): for _ in range((len(data) // aggr) - 1): max_val_idx = np.nanargmax(mah_dists) min_val_idxs = _knn(pw_dists[:, max_val_idx], aggr) - pw_dists[ - min_val_idxs, - ] = np.nan + pw_dists[min_val_idxs,] = np.nan mah_dists[min_val_idxs] = np.nan z[min_val_idxs] = np.mean(z[min_val_idxs], axis=0) min_val_idxs = np.unique( np.argwhere(~np.isnan(pw_dists))[:, 0] ) # get idx of remaining non-nan values - z[min_val_idxs,] = z[min_val_idxs,].mean( - axis=0 - ) # merge w above + z[min_val_idxs,] = z[min_val_idxs,].mean(axis=0) # merge w above mat = (z * data.std().to_numpy()) + data.mean().to_numpy() diff --git a/src/pymasq/optimizations/_base.py b/src/pymasq/optimizations/_base.py index 30d11a9..baa22b7 100644 --- a/src/pymasq/optimizations/_base.py +++ b/src/pymasq/optimizations/_base.py @@ -6,7 +6,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union -# import pymasq from pymasq import BEARTYPE import pymasq.mitigations as mits import pymasq.metrics as mets @@ -17,7 +16,6 @@ LessThanZeroError, NoMutationAvailableError, ) -import sys class OptimizationBase: @@ -106,7 +104,6 @@ def __init__( exit_on_error: bool = True, # Don't change to False without considering impact on pytests. **kwargs, ): - self.target = target self.mutations = mutations self.metrics = metrics @@ -136,12 +133,12 @@ def __init__( f"A probability `p` must be defined for each mutation in `mutations`. (Received: {mutations})." ) prob_sum = sum(probs) - if prob_sum == 0.0: + if np.isclose(prob_sum, 0.0, rtol=1e-09, atol=1e-09): probs = self._distribute(len(mutations)) self.mutations = [ dict(m, **{"p": probs[i]}) for i, m in enumerate(mutations) ] - elif round(prob_sum, 5) != 1.0: + elif not np.isclose(round(prob_sum, 5), 1.0, rtol=1e-09, atol=1e-09): raise SumNotEqualToOneError( f"Mitigation probabilities must sum to 1. (Received: {prob_sum})" ) @@ -152,13 +149,13 @@ def __init__( f"An importance weighting `weight` must be defined for each metric in `metrics`. 
(Received: {metrics})" ) weight_sum = sum(weights) - if weight_sum == 0.0: + if np.isclose(weight_sum, 0.0, rtol=1e-09, atol=1e-09): weights = self._distribute(len(metrics)) [ v.update({"weight": weights[i]}) for i, v in enumerate(self.metrics.values()) ] - elif weight_sum != 1.0: + elif not np.isclose(weight_sum, 1.0, rtol=1e-09, atol=1e-09): raise SumNotEqualToOneError( f"Metric importance weightings must sum to 1. (Received: {weight_sum})" ) @@ -204,7 +201,9 @@ def _validate_input_sums( except KeyError: sums.append(0.0) # if n_defined == 0, then none were defined - if n_defined != 0.0 and n_defined != len(values): + if not np.isclose(n_defined, 0.0, rtol=1e-09, atol=1e-09) and n_defined != len( + values + ): # TODO: future iterations should distribute missing values and/or normalize return None return sums @@ -409,7 +408,7 @@ def _mutate( mut = None if self.randomize_mutations: probs = [v["p"] for v in mutations] - mut = np.random.choice(mutations, p=probs) + mut = np.random.Generator.choice(mutations, p=probs) if not self.reuse_mutations and mutations: # redistribute according to initial weighting mut_idx = mutations.index(mut) @@ -439,7 +438,9 @@ def _mutate( result = func(target, **args) except Exception as e: if self.verbose >= 2: - print(f"[Warning] mutation {func.__name__} failed with args:={args}") + print( + f"[Warning] mutation {func.__name__} failed with args:={args} and error: {e}" + ) raise if isinstance(result, pd.Series): col_args = args.get("col", args.get("cols", None)) diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index 4383f71..32b2b76 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -92,7 +92,6 @@ def _optimize(self): best_fit = cur_fit while all([cur_fit > self.theta, self._iters > 0]): - if self.verbose: print("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: @@ -225,7 +224,6 @@ def __init__(self, *args, retry: int = 3, **kwargs): @BEARTYPE def _optimize(self): - target = self._target retry = self.retry @@ -240,7 +238,6 @@ def _optimize(self): ) while all([cur_fit > self.theta, self._iters > 0, retry > 0]): - if self.verbose: print("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: @@ -429,7 +426,6 @@ def _optimize(self): best_fit = cur_fit while all([best_fit > self.theta, self._iters > 0]): - if self.verbose: print("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: @@ -584,7 +580,6 @@ def __init__( return_best: bool = False, **kwargs, ): - kwargs["headers"] = ["perm_num"] super().__init__(*args, **kwargs) @@ -631,7 +626,7 @@ def _optimize(self): if self.randomize_mutations: # Note: only matters when `num_perms` is set. - test = np.random.shuffle(self._mutations) + test = np.random.Generator.shuffle(self._mutations) for num_perms, mutation_perms in enumerate( itertools.permutations(self._mutations, self.size_perms) @@ -643,7 +638,6 @@ def _optimize(self): stop = False for mutation in mutation_perms: - if self.verbose: print("\t-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: diff --git a/src/pymasq/preprocessing/_base.py b/src/pymasq/preprocessing/_base.py index 9efa3ef..45fdaaa 100644 --- a/src/pymasq/preprocessing/_base.py +++ b/src/pymasq/preprocessing/_base.py @@ -23,4 +23,4 @@ def encode_both(self): encoded as [0,1] and [1,2]. 
In contract, two distinct calls to encode() will return [0,1] and [0,1] """ - pass \ No newline at end of file + pass diff --git a/src/pymasq/utils/utils.py b/src/pymasq/utils/utils.py index 2eb6d23..e1fd5c9 100644 --- a/src/pymasq/utils/utils.py +++ b/src/pymasq/utils/utils.py @@ -18,7 +18,7 @@ @BEARTYPE def as_dataframe(obj, cols: Optional[Union[List, str, int]] = None): - """ Convert an object data structure into a DataFrame """ + """Convert an object data structure into a DataFrame""" if isinstance(obj, (list, np.ndarray)): cols = None if cols is not None: @@ -103,7 +103,7 @@ def _formatting_wrapper(data, *args, **kwargs): def is_identical(s: pd.Series) -> bool: - """ Checks if all values in the input series are identical. """ + """Checks if all values in the input series are identical.""" s = s.to_numpy() # s.values (pandas<0.24) return (s[0] == s).all() diff --git a/tests/mitigations/test_geom_transforms.py b/tests/mitigations/test_geom_transforms.py index d539bee..1b6d91e 100644 --- a/tests/mitigations/test_geom_transforms.py +++ b/tests/mitigations/test_geom_transforms.py @@ -11,7 +11,7 @@ def my_rand_df(): ncols = 5 colnames = "abcdefghijklmnopqrstuvwxyz" df = pd.DataFrame( - np.random.random_integers(0, 100, (100, ncols)), + np.random.Generator.random_integers(0, 100, (100, ncols)), columns=[colnames[i] for i in range(ncols)], ) return df @@ -32,7 +32,7 @@ def my_non_numeric_df(): ncols = 3 colnames = list("abcdefghijklmnopqrstuvwxyz") df = pd.DataFrame( - np.random.choice(colnames, size=(100, ncols), replace=True), + np.random.Generator.choice(colnames, size=(100, ncols), replace=True), columns=colnames[:ncols], ) return df @@ -95,7 +95,7 @@ def test_geom_transform_error_single_column(my_rand_df): def test_geom_transform_different_values_for_perturb_cols(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "c" rdf = geom_transform( @@ -109,7 +109,7 @@ def test_geom_transform_different_values_for_perturb_cols(my_rand_df): def test_geom_transform_cols_not_specified_no_perturbed(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "d" ignore_cols = ["c"] @@ -134,7 +134,7 @@ def test_geom_transform_cols_not_specified_no_perturbed(my_rand_df): def test_geom_transform_same_values_for_sensitive_col(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "c" rdf = geom_transform( @@ -149,7 +149,7 @@ def test_geom_transform_same_values_for_sensitive_col(my_rand_df): def test_geom_transform_same_values_in_proper_order_for_sensitive_col(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "c" rdf = geom_transform( @@ -164,7 +164,7 @@ def test_geom_transform_same_values_in_proper_order_for_sensitive_col(my_rand_df def test_geom_transform_returns_same_shapes(my_rand_df): - """ Ensure geom_transform returns the same dataframe shapes """ + """Ensure geom_transform returns the same dataframe shapes""" perturb_cols = ["a", "b"] sensitive_col = "d" @@ -184,4 +184,4 @@ def 
test_geom_transform_returns_same_shapes(my_rand_df): ).shape assert in_size_1 == out_size_1 - assert in_size_2 == out_size_2 \ No newline at end of file + assert in_size_2 == out_size_2 diff --git a/tests/mitigations/test_hashing.py b/tests/mitigations/test_hashing.py index 4f4719e..c04b670 100644 --- a/tests/mitigations/test_hashing.py +++ b/tests/mitigations/test_hashing.py @@ -28,7 +28,7 @@ def my_df(): @pytest.fixture def salts(): df = _my_df() - return np.random.choice(["a", "b", "c"], size=df.shape).tolist() + return np.random.Generator.choice(["a", "b", "c"], size=df.shape).tolist() @pytest.mark.parametrize("hash_func", (ALGORITHMS)) @@ -36,12 +36,12 @@ def test_hashing_all_hashlib_guaranteed_algorithms(my_df, hash_func): """ Test all hashing algorithms that are guaranteed to be supported by hashlib, regardless of OS platform. """ - e = None + rdf = None try: rdf = hashing(my_df, hash_func) except Exception as e: - print("Raised Exception") - assert e is None + print(f"Raised Exception: {e}") + assert rdf is not None @pytest.mark.parametrize("hash_func", (ALGORITHMS)) @@ -119,4 +119,4 @@ def test_hashing_hardcoded_salt(my_df, salts, hash_func): Test that salts can be passed in by user and yield different values """ sdf = hashing(my_df, hash_func, salt=salts) - assert not sdf.equals(my_df) \ No newline at end of file + assert not sdf.equals(my_df) diff --git a/tests/mitigations/test_microaggregation.py b/tests/mitigations/test_microaggregation.py index c62efa2..82a59b7 100644 --- a/tests/mitigations/test_microaggregation.py +++ b/tests/mitigations/test_microaggregation.py @@ -5,15 +5,14 @@ import pandas as pd import pytest -from pymasq import config - -config.FORMATTING_ON_OUTPUT = True - -from pymasq import set_seed +from pymasq import config, set_seed from pymasq.datasets import load_loan +from pymasq.errors import InputError, LessThanOrEqualToZeroError, NotInRangeError from pymasq.mitigations import microaggregation as magg from pymasq.mitigations.microaggregation import MaggMethods -from pymasq.errors import InputError, LessThanOrEqualToZeroError, NotInRangeError + + +config.FORMATTING_ON_OUTPUT = True METHODS = [ @@ -32,7 +31,7 @@ @pytest.fixture def rand_df(): - return pd.DataFrame(np.random.randint(1, NUM_RECORDS, (NUM_RECORDS, 4))) + return pd.DataFrame(np.random.Generator.randint(1, NUM_RECORDS, (NUM_RECORDS, 4))) @pytest.fixture @@ -44,14 +43,14 @@ def my_df(): def test_magg_error_if_invalid_method(my_df): - """ Test that microaggregation throws an InputError if incorrect method is supplied. """ + """Test that microaggregation throws an InputError if incorrect method is supplied.""" with pytest.raises(InputError): magg(my_df, method=None, aggr=2) @pytest.mark.parametrize("method", METHODS) def test_magg_returns_same_dimensions_and_column_names(my_df, method): - """ Test that microaggregation returns the same dimensions and column names. 
""" + """Test that microaggregation returns the same dimensions and column names.""" kwargs = {} if method == MaggMethods.ADVANCED: kwargs = MAGG_ADVANCED_KWARGS @@ -62,7 +61,7 @@ def test_magg_returns_same_dimensions_and_column_names(my_df, method): @pytest.mark.parametrize("method", (METHODS)) def test_magg_aggr_is_valid(my_df, method): - """ Test for NotInRangeError when `aggr` not in [1, len(my_df)] """ + """Test for NotInRangeError when `aggr` not in [1, len(my_df)]""" aggr = 0 kwargs = {} if method == MaggMethods.ADVANCED: @@ -77,7 +76,7 @@ def test_magg_aggr_is_valid(my_df, method): @pytest.mark.parametrize("method", (METHODS)) def test_magg_unique_vals_is_one(my_df, method): - """ Test the number of unique values returned for `aggr` is 1. """ + """Test the number of unique values returned for `aggr` is 1.""" kwargs = {} if method == MaggMethods.ADVANCED: kwargs = MAGG_ADVANCED_KWARGS @@ -85,13 +84,13 @@ def test_magg_unique_vals_is_one(my_df, method): test_df = magg( my_df, method=method, aggr=aggr, keep_dtypes=True, **kwargs ) # .astype(int) - assert True == np.allclose(my_df, test_df, 1, 1) + assert np.allclose(my_df, test_df, 1, 1) is True @pytest.mark.parametrize("method", METHODS) @pytest.mark.parametrize("aggr", [2] + [n for n in range(10, NUM_RECORDS, 10)]) def test_magg_unique_vals_greater_than_one(my_df, method, aggr): - """ Test the number of unique values returned for `aggr` is greater than 1. """ + """Test the number of unique values returned for `aggr` is greater than 1.""" kwargs = {} if method == MaggMethods.ADVANCED: kwargs = MAGG_ADVANCED_KWARGS @@ -101,19 +100,19 @@ def test_magg_unique_vals_greater_than_one(my_df, method, aggr): def test_magg_quantile_not_in_range(my_df): - """ Test that quantile-based microaggregation throws an NotInRangeError if aggr > len(my_df). """ + """Test that quantile-based microaggregation throws an NotInRangeError if aggr > len(my_df).""" with pytest.raises(NotInRangeError): magg(my_df, method="quantile", aggr=len(my_df) + 1) def test_magg_advanced_required_extra_parameters(my_df): - """ Test that advanced-based microaggregation throws an InputError if neither clust or reduct are specified. """ + """Test that advanced-based microaggregation throws an InputError if neither clust or reduct are specified.""" with pytest.raises(InputError): magg(my_df, method="advanced") def test_magg_advanced_error_if_invalid_methods(my_df): - """ Test that advanced-based microaggregation throws an InputError when input kwargs are not valid. 
""" + """Test that advanced-based microaggregation throws an InputError when input kwargs are not valid.""" with pytest.raises(InputError): magg(my_df.copy(), method="advanced", clust="INVALID") magg(my_df.copy(), method="advanced", reduct="INVALID") diff --git a/tests/mitigations/test_pram.py b/tests/mitigations/test_pram.py index 59f2cf8..57f5478 100644 --- a/tests/mitigations/test_pram.py +++ b/tests/mitigations/test_pram.py @@ -3,13 +3,12 @@ import pytest import pymasq +from pymasq.datasets import load_census +from pymasq.errors import InputError, NotInRangeError +from pymasq.mitigations import pram pymasq.set_seed(10) -from pymasq.mitigations import pram -from pymasq.errors import InputError, NotInRangeError -from pymasq.datasets import load_census - @pytest.fixture def my_df(): @@ -34,7 +33,7 @@ def my_numerical_df(): nrows = 10 max_val = 1000000 return pd.DataFrame( - np.random.random_integers(0, max_val, (nrows, ncols)), + np.random.Generator.random_integers(0, max_val, (nrows, ncols)), columns=[f"c{i}" for i in range(ncols)], ) @@ -92,7 +91,7 @@ def test_pram_probs_invalid_dict(my_df): def test_pram_probs_valid_dict(my_df): - """ Ensure that specifying probabilities results in that number of changes on average """ + """Ensure that specifying probabilities results in that number of changes on average""" probs = dict( race=pd.DataFrame({"White": 0.5, "Black": 0.5}, index=["White", "Black"]) ) @@ -124,25 +123,25 @@ def test_pram_returns_same_shapes(my_df): def test_pram_probs_equal_0(my_df): - """ at least 1 value changed """ + """at least 1 value changed""" r = pram(my_df, probs=0) assert not all((r == my_df).all()) def test_pram_probs_equal_1(my_df): - """ no change in data """ + """no change in data""" r = pram(my_df, probs=1) assert all((r == my_df).all()) def test_pram_alpha_equal_0(my_df): - """ no change in data """ + """no change in data""" r = pram(my_df, alpha=0) assert all((r == my_df).all()) def test_pram_alpha_equal_1(my_df): - """ at least 1 value changed """ + """at least 1 value changed""" r = pram(my_df, alpha=1) assert not all((r == my_df).all()) diff --git a/tests/mitigations/test_shuffle.py b/tests/mitigations/test_shuffle.py index c772d99..67babab 100644 --- a/tests/mitigations/test_shuffle.py +++ b/tests/mitigations/test_shuffle.py @@ -6,9 +6,8 @@ from pymasq.mitigations import ( s, # shuffle.py module shuffle, - MODEL, ) -from pymasq.errors import InputError, DataTypeError +from pymasq.errors import InputError @pytest.fixture @@ -88,7 +87,7 @@ def test_shuffle_cols_not_numeric(my_df): def test_shuffle_same_mean_different_values(loan_df): - """ Test that values are perturbed and retain the same mean while also in different order """ + """Test that values are perturbed and retain the same mean while also in different order""" shuffle_cols = ["ApplicantIncome", "LoanAmount"] cor_cols = ["Education", "Loan_Status"] shuffled = shuffle( @@ -124,4 +123,4 @@ def test_shuffle_returns_same_shapes(loan_df): ).shape assert in_size_1 == out_size_1 - assert in_size_2 == out_size_2 \ No newline at end of file + assert in_size_2 == out_size_2 diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index de8c096..7c56d26 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -2,26 +2,26 @@ # coding: utf-8 import copy +import hashlib import itertools import json import numpy as np import pandas as pd import pytest +import random from scipy.special import perm +from sklearn.utils import 
shuffle import pymasq - -pymasq.BEARTYPE = lambda func: func - -from pymasq.datasets import load_census -from pymasq import optimizations as opts from pymasq import mitigations as mits +from pymasq import optimizations as opts from pymasq import set_seed +from pymasq.datasets import load_census + + +pymasq.BEARTYPE = lambda func: func -import random -from sklearn.utils import shuffle -import hashlib set_seed(1) @@ -68,7 +68,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: np.random.rand(): {"weight": 1}} +rands = {lambda: np.random.Generator.rand(): {"weight": 1}} # Test standard termination conditions @@ -150,7 +150,7 @@ def _terminates_correctly(res, fit, log): ], ) def test_optimizations_returns(my_df, my_mutations, my_metrics, my_iters, my_theta): - """ Test the return variables of all `pymasq.optimization` algorithms. """ + """Test the return variables of all `pymasq.optimization` algorithms.""" def _returns_correctly(algo): result = algo.optimize() @@ -310,10 +310,8 @@ def _randomize_mutations_correctly(res, fit, log): return any( [ - len(mut_log_unique) == len(my_mutations), # randomize = True - all( - mut_log[: len(_my_mutations)] == _my_mutations - ), # randomize = False + len(mut_log_unique) == len(my_mutations), + all(mut_log[: len(_my_mutations)] == _my_mutations), ] ) @@ -375,7 +373,7 @@ def _randomize_mutations_correctly(res, fit, log): (ones, np.inf, 0.0, 100), ], ) -def test_IncrementalSearch( +def test_incremental_search( my_df, my_mutations, my_metrics, my_iters, my_theta, my_retry ): """ @@ -411,7 +409,7 @@ def test_IncrementalSearch( (zeros, np.inf, 1.0, None, None), # theta ], ) -def test_ExhaustiveSearch( +def test_exhaustive_search( my_df, my_mutations, my_metrics, my_iters, my_theta, my_num_perms, my_size_perms ): """ @@ -422,7 +420,9 @@ def test_ExhaustiveSearch( def _terminates_correctly(res, fit, log): if not np.isinf(my_iters): assert log.shape[0] == (my_iters + 1) - elif my_theta == 1.0 or my_theta == 0.9: + elif np.isclose(my_theta, 1.0, rtol=1e-09, atol=1e-09) or np.isclose( + my_theta, 0.9, rtol=1e-09, atol=1e-09 + ): assert log.iloc[-1]["fitness"] <= my_theta else: # terminates via permutations @@ -461,7 +461,7 @@ def test_exit_on_error(): def throw_error_mut(*args, **kwargs): df = args[0] choice = random.choices([True, False], weights=[1, 2]) - if choice[0] == True: + if choice[0] is True: raise Exception("Mutation error thrown on purpose.") else: df = shuffle(df) @@ -469,16 +469,16 @@ def throw_error_mut(*args, **kwargs): return df def rand_metric(df, *args, **kwargs): - Hash = hashlib.sha512 - MAX_HASH_PLUS_ONE = 2 ** (Hash().digest_size * 8) + hash_func = hashlib.sha512 + MAX_HASH_PLUS_ONE = 2 ** (hash_func().digest_size * 8) seed = str(df).encode() - hash_digest = Hash(seed).digest() + hash_digest = hash_func(seed).digest() hash_int = int.from_bytes(hash_digest, "big") return np.round(hash_int / MAX_HASH_PLUS_ONE, 4) # Float division def throw_error_metric(df, *args, **kwargs): choice = random.choices([True, False], weights=[1, 2]) - if choice[0] == True: + if choice[0] is True: raise Exception("Metrics error thrown on purpose.") else: return rand_metric(df) diff --git a/tests/optimizations/test_utils.py b/tests/optimizations/test_utils.py index ba1f79e..c1ef169 100644 --- a/tests/optimizations/test_utils.py +++ b/tests/optimizations/test_utils.py @@ -8,7 +8,6 @@ def test_apply_and_evaluate(): - # This checks that the output of apply_and_evaluate is the same as # if 
we called iterativeSearch(), which performs the metrics at each step From 381900cf334dd68936e5a9f0e213c09fc7277390 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 7 Dec 2023 19:44:43 -0500 Subject: [PATCH 05/17] Fixes bugs --- src/pymasq/preprocessing/entity_embedding.py | 2 +- src/pymasq/preprocessing/preprocess.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pymasq/preprocessing/entity_embedding.py b/src/pymasq/preprocessing/entity_embedding.py index 1b0b319..5dcbdcb 100755 --- a/src/pymasq/preprocessing/entity_embedding.py +++ b/src/pymasq/preprocessing/entity_embedding.py @@ -123,7 +123,7 @@ def embed_entities( # Converts categories represented by integers to strings so that the # label encoder will work and the classes can be determined later - categorical_df[column] = categorical_df[column].astype(str) + categorical_df.loc[:, column] = categorical_df.loc[:,column].astype(str) le = LabelEncoder() X_train = le.fit_transform(categorical_df[column]) diff --git a/src/pymasq/preprocessing/preprocess.py b/src/pymasq/preprocessing/preprocess.py index 96e9331..6d7dcfc 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -131,7 +131,7 @@ def encode_both( df_b[class_col] = 1 # append b to a; and then split out the categorical (non-numerical) columns - cat_cols = df_a.append(df_b).select_dtypes(exclude=["number"]) + cat_cols = df_a._append(df_b).select_dtypes(exclude=["number"]) # cast everything to string in case we have a mix of floats and string, # otherwise LabelEncoder will choke/die. # This should never happen, but does in our pytests, so who knows. @@ -140,7 +140,7 @@ def encode_both( ) # append b to a; and then split out the non-categorical (numerical) columns - num_cols = df_a.append(df_b).select_dtypes(include=["number"]) + num_cols = df_a._append(df_b).select_dtypes(include=["number"]) # concatenate, but relabel the cat_cols first if cat_cols.empty: both = num_cols From 1af4d8d0069d4b328994691ab690a2cef7e2419b Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 7 Dec 2023 20:02:10 -0500 Subject: [PATCH 06/17] fixes optimization tests --- src/pymasq/optimizations/_base.py | 9 ++++++--- src/pymasq/optimizations/optimizations.py | 2 +- tests/optimizations/test_optimizations.py | 3 +-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pymasq/optimizations/_base.py b/src/pymasq/optimizations/_base.py index baa22b7..8db85f7 100644 --- a/src/pymasq/optimizations/_base.py +++ b/src/pymasq/optimizations/_base.py @@ -6,10 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from pymasq import BEARTYPE + import pymasq.mitigations as mits import pymasq.metrics as mets +from pymasq import BEARTYPE +from pymasq.config import DEFAULT_SEED from pymasq.errors import ( SumNotEqualToOneError, NotInRangeError, @@ -17,6 +19,7 @@ NoMutationAvailableError, ) +rg = np.random.Generator(np.random.PCG64(DEFAULT_SEED)) class OptimizationBase: """Base class for the optimization algorithms. 
@@ -408,7 +411,7 @@ def _mutate( mut = None if self.randomize_mutations: probs = [v["p"] for v in mutations] - mut = np.random.Generator.choice(mutations, p=probs) + mut = rg.choice(mutations, p=probs) if not self.reuse_mutations and mutations: # redistribute according to initial weighting mut_idx = mutations.index(mut) @@ -541,4 +544,4 @@ def update(self, record: Dict[str, Any]): """ record = self._pretty_values(record) df = pd.DataFrame.from_records(record) - self.log = self.log.append(df, ignore_index=True) + self.log = self.log._append(df, ignore_index=True) diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index 32b2b76..9697b21 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -626,7 +626,7 @@ def _optimize(self): if self.randomize_mutations: # Note: only matters when `num_perms` is set. - test = np.random.Generator.shuffle(self._mutations) + test = np.random.shuffle(self._mutations) for num_perms, mutation_perms in enumerate( itertools.permutations(self._mutations, self.size_perms) diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index 7c56d26..e4da8a4 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -25,7 +25,6 @@ set_seed(1) - @pytest.fixture def my_df(): df = load_census() @@ -68,7 +67,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: np.random.Generator.rand(): {"weight": 1}} +rands = {lambda: np.random.rand(): {"weight": 1}} # Test standard termination conditions From e164df7f2a2f4f16c79705f4b3530e63ad312e57 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 7 Dec 2023 20:17:16 -0500 Subject: [PATCH 07/17] fixes mitigation tests --- src/pymasq/datasets/data_generator.py | 8 +++++--- src/pymasq/metrics/utility_scores.py | 2 +- src/pymasq/mitigations/geom_transform.py | 12 +++++++----- tests/mitigations/test_geom_transforms.py | 6 ++++-- tests/mitigations/test_hashing.py | 4 +++- tests/mitigations/test_pram.py | 4 +++- 6 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/pymasq/datasets/data_generator.py b/src/pymasq/datasets/data_generator.py index 773c429..c2d4c6b 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -7,7 +7,9 @@ from .utils import rand_cat_change from pymasq import BEARTYPE +from pymasq.config import DEFAULT_SEED +rg = np.random.default_rng(DEFAULT_SEED) @BEARTYPE def gen_geom_seq(start: float = 0.5, n: int = 6, rate: float = 2.0) -> List[float]: @@ -132,11 +134,11 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: List of integer values for the sensitive column """ - unique_entries = np.random.choice(range(n), l) + unique_entries = rg.choice(range(n), l) while len(unique_entries) != len(set(unique_entries)): - unique_entries = np.random.choice(range(n), l) + unique_entries = rg.choice(range(n), l) - non_unique = np.random.Generator.choice(unique_entries, n - l) + non_unique = rg.Generator.choice(unique_entries, n - l) return list(unique_entries) + list(non_unique) diff --git a/src/pymasq/metrics/utility_scores.py b/src/pymasq/metrics/utility_scores.py index 459edd6..f115bb1 100644 --- a/src/pymasq/metrics/utility_scores.py +++ b/src/pymasq/metrics/utility_scores.py @@ -201,7 +201,7 @@ def propensity_score( # Encode the two data frames (at once for consistent encodings) preprocessor_fn = 
preprocess.preprocessor_fn[preprocessor] orig_enc, mod_enc = preprocessor_fn.encode_both( - df_A=orig_df, df_B=mod_df, sensitive_col=sensitive_col + df_a=orig_df, df_b=mod_df, sensitive_col=sensitive_col ) # Create a unique column name to mark from which dataframe a row came from class_col = utils.uniq_col_name(orig_df) diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index cf46976..6c89a63 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -7,7 +7,7 @@ from typing import List, Optional, Union from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES +from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES, DEFAULT_SEED from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical from pymasq.utils import formatting @@ -15,6 +15,7 @@ __all__ = ["geom_transform"] +rg = np.random.default_rng(DEFAULT_SEED) SKIP_ROTATION_ANGLES = [30, 45, 60, 90, 120, 135, 150, 180] MAX_DEGREES = 180 @@ -127,7 +128,8 @@ def geom_transform( Examples -------- - >>> df = pd.DataFrame(np.random.random_integers(0, 100, (10,3))) + >>> rg = np.random.default_rng(1234) + >>> df = pd.DataFrame(rg.integers(0, 100, (10,3))) 0 1 2 3 0 72 13 92 91 1 55 63 65 76 @@ -261,7 +263,7 @@ def geom_transform( # Translation Matrix Generation/Application idtrans = np.eye(ncols + 1) # add a new row for the homogeneous coordinate - idtrans[:ncols, ncols:] = np.random.uniform(size=(ncols, 1)) + idtrans[:ncols, ncols:] = rg.uniform(size=(ncols, 1)) # multidim translations; adding ones column for homogeneous coordinate multitrans = np.concatenate((bo, np.ones(shape=(bo.shape[0], 1))), axis=1) @@ -279,13 +281,13 @@ def geom_transform( # Randomized expansion sign = np.sign(bo) - bo = np.add(abs(bo), abs(np.random.Generator.uniform(size=bo.shape) * magnitude)) + bo = np.add(abs(bo), abs(rg.uniform(size=bo.shape) * magnitude)) bo = (bo * sign).T bo = bo * data[perturb_cols].std().values + data[perturb_cols].mean().values shuff_idx = data.index if shuffle: - shuff_idx = np.random.Generator.choice( + shuff_idx = rg.choice( range(bo.shape[0]), size=(bo.shape[0]), replace=False ) diff --git a/tests/mitigations/test_geom_transforms.py b/tests/mitigations/test_geom_transforms.py index 1b6d91e..a18f367 100644 --- a/tests/mitigations/test_geom_transforms.py +++ b/tests/mitigations/test_geom_transforms.py @@ -2,16 +2,18 @@ import pandas as pd import pytest +from pymasq.config import DEFAULT_SEED from pymasq.mitigations import geom_transform from pymasq.errors import InputError +rg = np.random.default_rng(DEFAULT_SEED) @pytest.fixture def my_rand_df(): ncols = 5 colnames = "abcdefghijklmnopqrstuvwxyz" df = pd.DataFrame( - np.random.Generator.random_integers(0, 100, (100, ncols)), + rg.integers(0, 100, (100, ncols)), columns=[colnames[i] for i in range(ncols)], ) return df @@ -32,7 +34,7 @@ def my_non_numeric_df(): ncols = 3 colnames = list("abcdefghijklmnopqrstuvwxyz") df = pd.DataFrame( - np.random.Generator.choice(colnames, size=(100, ncols), replace=True), + rg.choice(colnames, size=(100, ncols), replace=True), columns=colnames[:ncols], ) return df diff --git a/tests/mitigations/test_hashing.py b/tests/mitigations/test_hashing.py index c04b670..1e746cc 100644 --- a/tests/mitigations/test_hashing.py +++ b/tests/mitigations/test_hashing.py @@ -6,12 +6,14 @@ import hashlib import numpy as np +from pymasq.config import DEFAULT_SEED from pymasq.datasets import 
load_census from pymasq.mitigations import hashing ALGORITHMS = hashlib.algorithms_guaranteed +rg = np.random.default_rng(DEFAULT_SEED) def _my_df(): df = load_census() @@ -28,7 +30,7 @@ def my_df(): @pytest.fixture def salts(): df = _my_df() - return np.random.Generator.choice(["a", "b", "c"], size=df.shape).tolist() + return rg.choice(["a", "b", "c"], size=df.shape).tolist() @pytest.mark.parametrize("hash_func", (ALGORITHMS)) diff --git a/tests/mitigations/test_pram.py b/tests/mitigations/test_pram.py index 57f5478..0ed5c40 100644 --- a/tests/mitigations/test_pram.py +++ b/tests/mitigations/test_pram.py @@ -3,12 +3,14 @@ import pytest import pymasq +from pymasq.config import DEFAULT_SEED from pymasq.datasets import load_census from pymasq.errors import InputError, NotInRangeError from pymasq.mitigations import pram pymasq.set_seed(10) +rg = np.random.default_rng(DEFAULT_SEED) @pytest.fixture def my_df(): @@ -33,7 +35,7 @@ def my_numerical_df(): nrows = 10 max_val = 1000000 return pd.DataFrame( - np.random.Generator.random_integers(0, max_val, (nrows, ncols)), + rg.integers(0, max_val, (nrows, ncols)), columns=[f"c{i}" for i in range(ncols)], ) From 64eae4208bf479febdd5beb92762f238de1d76f0 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Sat, 23 Dec 2023 09:01:44 -0500 Subject: [PATCH 08/17] Updates --- src/pymasq/mitigations/add_noise.py | 13 ++++++++----- src/pymasq/mitigations/hashing.py | 10 ++++++---- src/pymasq/mitigations/local_supp.py | 10 ++++++---- src/pymasq/mitigations/pram.py | 6 ++++-- src/pymasq/mitigations/rank_swap.py | 2 +- src/pymasq/mitigations/shuffle.py | 6 ++++-- src/pymasq/mitigations/utils.py | 2 +- src/pymasq/models/models.py | 4 ++-- tests/mitigations/test_pram.py | 7 ++++--- tests/optimizations/test_optimizations.py | 6 +++--- 10 files changed, 39 insertions(+), 27 deletions(-) diff --git a/src/pymasq/mitigations/add_noise.py b/src/pymasq/mitigations/add_noise.py index eca3a34..77ac99b 100644 --- a/src/pymasq/mitigations/add_noise.py +++ b/src/pymasq/mitigations/add_noise.py @@ -9,6 +9,7 @@ from typing import List, Optional, Union, Final from pymasq.config import ( + DEFAULT_SEED, FORMATTING_ON_OUTPUT, VALIDATE_NUMERIC_ON_INPUT, VALIDATE_NUMERIC_ON_OUTPUT, @@ -36,13 +37,15 @@ OUTLIERS: Final = "outliers" -class OUTLIERS_INTERPOLATION_METHODS: +class outliersInterpolationMethods: LINEAR = "linear" LOWER = "lower" HIGHER = "higher" MIDPOINT = "midpoint" NEAREST = "nearest" +rg = np.random.default_rng(DEFAULT_SEED) + @formatting(on_output=FORMATTING_ON_OUTPUT) @validate_numeric( @@ -114,10 +117,10 @@ def add_noise_additive( if centered: delta = np.sqrt(1 - np.square(magnitude)) loc = (1 - delta) / magnitude - noise = np.random.normal(loc=loc * data.mean(), scale=std, size=data.shape) + noise = rg.normal(loc=loc * data.mean(), scale=std, size=data.shape) data *= delta return data.add(magnitude * noise) - return data + np.random.normal(scale=magnitude * std, size=data.shape) + return data + rg.normal(scale=magnitude * std, size=data.shape) @formatting(on_output=FORMATTING_ON_OUTPUT) @@ -239,7 +242,7 @@ def add_noise_correlated( ] ).transpose() # Transposes the data to have the column/row orientation match the input data - return data_encoded + np.random.multivariate_normal( + return data_encoded + rg.multivariate_normal( pd.Series([0] * data_encoded.shape[1]), (magnitude / 100.0) * data_encoded.cov(), size=data_encoded.shape[0], @@ -415,7 +418,7 @@ def add_noise_outliers( outliers = np.unique(np.append(quant_outliers, dist_outliers)) std = 1.96 * data.std() / 
np.sqrt(len(data)) * (magnitude / 100.0) - noise = np.random.normal(scale=std, size=(len(outliers), len(data.columns))) + noise = rg.normal(scale=std, size=(len(outliers), len(data.columns))) data.iloc[outliers, :] += noise diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index 5dcdf55..547f6c0 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -1,8 +1,9 @@ +import hashlib +import logging +import os from typing import Callable, List, Optional, Union -import hashlib import numpy as np -import os import pandas as pd from pymasq import BEARTYPE @@ -14,6 +15,8 @@ __all__ = ["hashing"] +logger = logging.getLogger(__name__) + @formatting(on_output=FORMATTING_ON_OUTPUT, ignore_dtypes=True) @BEARTYPE @@ -190,8 +193,7 @@ def hashing( hash_func = getattr(hashlib, hash_func) if "shake" in str(hash_func): - # TODO: change to logging - print( + logger.warning( f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length." ) return data.applymap(lambda v: hash_func(v).hexdigest(16)) diff --git a/src/pymasq/mitigations/local_supp.py b/src/pymasq/mitigations/local_supp.py index 558f365..a3b535e 100644 --- a/src/pymasq/mitigations/local_supp.py +++ b/src/pymasq/mitigations/local_supp.py @@ -1,7 +1,8 @@ -import pandas as pd - +import logging from typing import Any, List, Optional, Union +import pandas as pd + from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, @@ -13,6 +14,8 @@ __all__ = ["local_supp"] +logger = logging.getLogger(__name__) + @formatting(on_output=FORMATTING_ON_OUTPUT, ignore_dtypes=True) # fmt: off @BEARTYPE @@ -140,8 +143,7 @@ def local_supp( if not keep_dtypes and not isinstance( type(to_val), type(data.loc[0, suppress_col]) ): - # TODO: switch to logging - print( + logger.warning( f"WARNING: The datatype of the `suppress_col` ({suppress_col}`) will be changed." ) diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index d1bcb18..828580f 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -5,6 +5,7 @@ from pymasq import BEARTYPE from pymasq.config import ( + DEFAULT_SEED, FORMATTING_ON_OUTPUT, ) from pymasq.errors import InputError, NotInRangeError @@ -14,6 +15,7 @@ __all__ = ["pram"] +rg = np.random.default_rng(DEFAULT_SEED) def __calc_transition_matrix( data: pd.Series, @@ -39,7 +41,7 @@ def __calc_transition_matrix( pandas.DataFrame with transition probabilities for each category. 
""" ncats = len(cats) - runif = np.random.uniform(low=probs, size=ncats) + runif = rg.uniform(low=probs, size=ncats) tri = (1 - runif) / (ncats - 1) prob_mat = np.zeros(shape=(ncats, ncats)) @@ -88,7 +90,7 @@ def __randomization( for cat in cats: idxs = data.index.where(data == cat).dropna() if len(idxs) > 0: - d_pramed[idxs] = np.random.choice( + d_pramed[idxs] = rg.choice( cats, len(idxs), p=trans.loc[cat,], diff --git a/src/pymasq/mitigations/rank_swap.py b/src/pymasq/mitigations/rank_swap.py index 0a47a13..f01ddd1 100644 --- a/src/pymasq/mitigations/rank_swap.py +++ b/src/pymasq/mitigations/rank_swap.py @@ -8,7 +8,7 @@ def rank_swap( - data: Union[pd.DataFrame, pd.Series], cols: Union[str, List[str]] = None, **kwargs + data: Union[pd.DataFrame, pd.Series], cols: Union[str, List[str]] = None, ) -> pd.Series: """TODO diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 961a433..79dd284 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -7,7 +7,7 @@ import scipy.stats as ss from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT +from pymasq.config import DEFAULT_SEED, FORMATTING_ON_OUTPUT from pymasq.utils import formatting from pymasq.preprocessing import LabelEncoderPM from pymasq.errors import InputError @@ -32,6 +32,8 @@ CORRELATIVE: Final = "corr" MODEL: Final = "model" +rg = np.random.default_rng(DEFAULT_SEED) + @BEARTYPE def _reverse_map(data: pd.DataFrame, y_star: pd.DataFrame) -> pd.DataFrame: @@ -232,7 +234,7 @@ def shuffle( ystar1 = predictors.dot(pxs.dot(pssinv).T) sigma = pxx - pxs.dot(pssinv.dot(psx)) - e1 = np.random.multivariate_normal( + e1 = rg.multivariate_normal( mean=[0] * len(resp_cols), cov=sigma, size=_data.shape[0] ) y_star = ystar1 + e1 diff --git a/src/pymasq/mitigations/utils.py b/src/pymasq/mitigations/utils.py index faf7ef1..62f58b7 100644 --- a/src/pymasq/mitigations/utils.py +++ b/src/pymasq/mitigations/utils.py @@ -81,7 +81,7 @@ def __calc_freq( freq_df = df.groupby(cols).count()[sensitive_col] freq_df = freq_df.rename("samp_fq") freq_df = freq_df.reset_index() - result = pd.merge(df, freq_df, how="outer", on=cols) + result = pd.merge(df, freq_df, how="outer", on=cols, validate="m:1") result["pop_fq"] = result["samp_fq"].values * weights return result diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 0f59b9d..5e19ede 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -909,9 +909,9 @@ def predict(self, x_test: pd.DataFrame, y_true: pd.Series) -> float: except: continue if Y_predict_prob_array is None: - raise (f"No prediction method available for {self.trained}") + raise Exception (f"No prediction method available for {self.trained}") - return mape(y_true=y_true, y_score=Y_predict_prob_array) + return mape(y_true=y_true, y_pred=Y_predict_prob_array) # For translation from text to callable functions diff --git a/tests/mitigations/test_pram.py b/tests/mitigations/test_pram.py index 0ed5c40..a0b5d98 100644 --- a/tests/mitigations/test_pram.py +++ b/tests/mitigations/test_pram.py @@ -1,14 +1,14 @@ +import logging import numpy as np import pandas as pd import pytest -import pymasq from pymasq.config import DEFAULT_SEED from pymasq.datasets import load_census from pymasq.errors import InputError, NotInRangeError from pymasq.mitigations import pram -pymasq.set_seed(10) +logger = logging.getLogger(__name__) rg = np.random.default_rng(DEFAULT_SEED) @@ -114,7 +114,8 @@ def test_pram_probs_valid_dict(my_df): def 
test_pram_numerical_cast_to_categorical(my_numerical_df): try: pram(my_numerical_df) - except: + except Exception as e: + logger.exception(e) assert False, "Numerical dataframe should not have raised error." diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index e4da8a4..35a8961 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -16,14 +16,14 @@ import pymasq from pymasq import mitigations as mits from pymasq import optimizations as opts -from pymasq import set_seed +from pymasq.config import DEFAULT_SEED from pymasq.datasets import load_census pymasq.BEARTYPE = lambda func: func -set_seed(1) +rg = np.random.default_rng(DEFAULT_SEED) @pytest.fixture def my_df(): @@ -67,7 +67,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: np.random.rand(): {"weight": 1}} +rands = {lambda: rg(): {"weight": 1}} # Test standard termination conditions From d2a7a79bf7f823ee509962fdd03b266f819788d7 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 08:33:06 -0500 Subject: [PATCH 09/17] updates truncate tests --- tests/mitigations/test_truncate.py | 81 ++++++++++++++++-------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/tests/mitigations/test_truncate.py b/tests/mitigations/test_truncate.py index e7eaacc..99af40c 100644 --- a/tests/mitigations/test_truncate.py +++ b/tests/mitigations/test_truncate.py @@ -1,19 +1,24 @@ #!/usr/bin/env python # coding: utf-8 +import logging import pytest +import pandas as pd + from pymasq.datasets import load_census from pymasq.mitigations import truncate, INDEX, MATCH, START, END, BOTH +logger = logging.getLogger(__name__) @pytest.fixture def my_df(): df = load_census() - cols = ["fnlwgt", "education", "marital_status", "sex", "capital_gain"] + cols = ["fnlwgt", "education", "marital_status", selected_col, "capital_gain"] df = df.loc[:10, cols] return df +selected_col: str = "sex" # ----- Method: Index Tests ----- def test_truncate_index_1(my_df): @@ -22,8 +27,8 @@ def test_truncate_index_1(my_df): supplied. Should only keep characters [0:3) """ - ret = truncate(my_df["sex"], idx=3) - assert ret.isin(["e", "ale"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], idx=3) + assert ret[selected_col].isin(["e", "ale"]).all() def test_truncate_index_2(my_df): @@ -32,8 +37,8 @@ def test_truncate_index_2(my_df): supplied. Should only keep characters [0:-1) """ - ret = truncate(my_df["sex"], method=INDEX, idx=-1) - assert ret.isin(["e", "e"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=-1) + assert ret[selected_col].isin(["e", "e"]).all() def test_truncate_index_3(my_df): @@ -42,8 +47,8 @@ def test_truncate_index_3(my_df): supplied and trim_from=END Should not keep any characters """ - ret = truncate(my_df["sex"], method=INDEX, idx=0, trim_from=END) - assert ret.isin([""]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=0, trim_from=END) + assert ret[selected_col].isin([""]).all() def test_truncate_index_4(my_df): @@ -52,8 +57,8 @@ def test_truncate_index_4(my_df): idx supplied. (idx > longest string in the column). 
Should not keep all characters """ - ret = truncate(my_df["sex"], method=INDEX, end=100) - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, end=100) + assert ret[selected_col].isin(["Male", "Female"]).all() def test_truncate_index_5(my_df): @@ -62,8 +67,8 @@ def test_truncate_index_5(my_df): supplied. Should keep characters [1:3) """ - ret = truncate(my_df["sex"], method=INDEX, idx=1, end=3) - assert ret.isin(["al", "em"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, end=3) + assert ret[selected_col].isin(["al", "em"]).all() def test_truncate_index_6(my_df): @@ -72,16 +77,16 @@ def test_truncate_index_6(my_df): supplied. Should not keep any characters """ - ret = truncate(my_df["sex"], method=INDEX, idx=3, end=1) - assert ret.isin([""]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, end=1) + assert ret[selected_col].isin([""]).all() def test_truncate_input_7(my_df): """ Test that truncate returns same value if no idx or end supplied """ - ret = truncate(my_df["sex"], method=INDEX) - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX) + assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: match Tests ----- @@ -91,8 +96,8 @@ def test_truncate_match_1(my_df): part of all strings in the specified column. Should only keep characters before "al" for all values """ - ret = truncate(my_df["sex"], method=MATCH, match="al") - assert ret.isin(["M", "Fem"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="al") + assert ret[selected_col].isin(["M", "Fem"]).all() def test_truncate_match_2(my_df): @@ -102,8 +107,8 @@ def test_truncate_match_2(my_df): Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret = truncate(my_df["sex"], method=MATCH, match="em") - assert ret.isin(["Male", "F"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="em") + assert ret[selected_col].isin(["Male", "F"]).all() def test_truncate_match_3(my_df): @@ -112,8 +117,8 @@ def test_truncate_match_3(my_df): match any string in the specified column Should keep all characters """ - ret = truncate(my_df["sex"], method=MATCH, match="cat") - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="cat") + assert ret[selected_col].isin(["Male", "Female"]).all() def test_truncate_match_4(my_df): @@ -123,8 +128,8 @@ def test_truncate_match_4(my_df): Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret = truncate(my_df["sex"], method=MATCH, match="EM", ignore_case=True) - assert ret.isin(["Male", "F"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="EM", ignore_case=True) + assert ret[selected_col].isin(["Male", "F"]).all() def test_truncate_match_5(my_df): @@ -133,8 +138,8 @@ def test_truncate_match_5(my_df): Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. 
""" - ret = truncate(my_df["sex"], method=MATCH, match=".*", ignore_case=True) - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match=".*", ignore_case=True) + assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: More Index Tests ----- @@ -145,8 +150,8 @@ def test_truncate_index_11(my_df): Test that truncate runs correctly for the INDEX method with only a valid `n` supplied Should only keep characters [3:] """ - ret = truncate(my_df["sex"], method=INDEX, idx=3) - assert ret.isin(["e", "ale"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3) + assert ret[selected_col].isin(["e", "ale"]).all() def test_truncate_index_12(my_df): @@ -155,8 +160,8 @@ def test_truncate_index_12(my_df): and trim_from=END Should only keep characters [:-3] """ - ret = truncate(my_df["sex"], method=INDEX, idx=3, trim_from=END) - assert ret.isin(["M", "Fem"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, trim_from=END) + assert ret[selected_col].isin(["M", "Fem"]).all() def test_truncate_index_13(my_df): @@ -165,8 +170,8 @@ def test_truncate_index_13(my_df): and trim_from=BOTH Should only keep characters [1:-1] """ - ret = truncate(my_df["sex"], method=INDEX, idx=1, trim_from=BOTH) - assert ret.isin(["al", "emal"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, trim_from=BOTH) + assert ret[selected_col].isin(["al", "emal"]).all() def test_truncate_index_14(my_df): @@ -175,8 +180,8 @@ def test_truncate_index_14(my_df): greater than some of the string lengths but not others and trim_from=START Should only keep the last "e" in "Female" """ - ret = truncate(my_df["sex"], method=INDEX, idx=5, trim_from=START) - assert ret.isin(["", "e"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=5, trim_from=START) + assert ret[selected_col].isin(["", "e"]).all() def test_truncate_index_15(my_df): @@ -185,15 +190,15 @@ def test_truncate_index_15(my_df): greater or equal to than half the length of some strings but not others, and trim_from=BOTH Should only keep characters "ma" from "Female" """ - ret = truncate(my_df["sex"], method=INDEX, idx=2, trim_from=BOTH) - assert ret.isin(["", "ma"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=2, trim_from=BOTH) + assert ret[selected_col].isin(["", "ma"]).all() def test_truncate_index_16(my_df): """ Test that truncate runs correctly for the INDEX method with a large value of `n` supplied and trim_from=START - Should only keep characters "ma" from "Female" + Should not keep any """ - ret = truncate(my_df["sex"], method=INDEX, idx=100, trim_from=START) - assert ret.isin([""]).all() + ret: pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=100, trim_from=START) + assert ret[selected_col].isin([""]).all() From ea5c61e82d4eed71ff6fa89393dee65603e11756 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 16:13:50 -0500 Subject: [PATCH 10/17] fixes truncate tests --- docs/source/conf.py | 2 +- src/pymasq/mitigations/pram.py | 4 +- src/pymasq/mitigations/truncate.py | 6 +++ tests/mitigations/test_truncate.py | 81 ++++++++++++++++-------------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 23ecc49..1ffae56 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -27,7 +27,7 @@ author = "MITLL" # The full version, including alpha/beta/rc tags -release = "1.0" +release = "1.1" # -- General 
configuration --------------------------------------------------- diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index 828580f..17e368f 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -50,13 +50,13 @@ def __calc_transition_matrix( cat_codes = data.cat.codes + 1 sum_cats = np.nansum(cat_codes) - freqs = data.value_counts() / sum_cats # scaled category frequencies + freqs: pd.Series = data.value_counts() / sum_cats # scaled category frequencies scaled_prob_mat = prob_mat.copy() for i in range(ncats): s = sum(freqs * prob_mat[:, i]) for j in range(ncats): - scaled_prob_mat[i, j] = prob_mat[j, i] * (freqs[j] / s) + scaled_prob_mat[i, j] = prob_mat[j, i] * (freqs.iloc[j] / s) trans_probs = prob_mat @ scaled_prob_mat scaled_trans_probs = alpha * trans_probs + (1 - alpha) * np.identity(ncats) diff --git a/src/pymasq/mitigations/truncate.py b/src/pymasq/mitigations/truncate.py index 79abbf0..f077068 100644 --- a/src/pymasq/mitigations/truncate.py +++ b/src/pymasq/mitigations/truncate.py @@ -87,6 +87,9 @@ def _truncate_by_match(series, match, ignore_case, keep_before): return series.apply( lambda x: re.split(re.escape(match), x, 1, flags=re.IGNORECASE) ).str[0 if keep_before else -1] + + if isinstance(data, pd.Series): + return pd.DataFrame(_truncate_by_match(data, match=match, ignore_case=ignore_case, keep_before=keep_before)) return data.apply( _truncate_by_match, @@ -162,6 +165,9 @@ def _truncate_by_index(series, trim_from, idx, end): raise InputError( f"`trim_from` must be one of ['start', 'end', 'both', None]. (Received: {trim_from})" ) + + if isinstance(data, pd.Series): + return pd.DataFrame(_truncate_by_index(data, trim_from=trim_from, idx=idx, end=end)) return data.apply(_truncate_by_index, trim_from=trim_from, idx=idx, end=end) diff --git a/tests/mitigations/test_truncate.py b/tests/mitigations/test_truncate.py index 99af40c..3fab210 100644 --- a/tests/mitigations/test_truncate.py +++ b/tests/mitigations/test_truncate.py @@ -12,193 +12,198 @@ logger = logging.getLogger(__name__) @pytest.fixture -def my_df(): +def truncate_df(): df = load_census() - cols = ["fnlwgt", "education", "marital_status", selected_col, "capital_gain"] + cols = ["fnlwgt", "education", "marital_status", "sex", "capital_gain"] df = df.loc[:10, cols] return df selected_col: str = "sex" # ----- Method: Index Tests ----- -def test_truncate_index_1(my_df): +def test_truncate_index_1(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a positive idx supplied. Should only keep characters [0:3) """ - ret:pd.DataFrame = truncate(my_df[selected_col], idx=3) + # Test Series input + ret:pd.Series = truncate(truncate_df[selected_col], idx=3) + assert ret.isin(["e", "ale"]).all() + + # Test Dataframe input + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], idx=3) assert ret[selected_col].isin(["e", "ale"]).all() -def test_truncate_index_2(my_df): +def test_truncate_index_2(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a negative idx supplied. 
Should only keep characters [0:-1) """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=-1) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=-1) assert ret[selected_col].isin(["e", "e"]).all() -def test_truncate_index_3(my_df): +def test_truncate_index_3(truncate_df): """ Test that truncate runs correctly for the INDEX method with only idx of 0 supplied and trim_from=END Should not keep any characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=0, trim_from=END) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=0, trim_from=END) assert ret[selected_col].isin([""]).all() -def test_truncate_index_4(my_df): +def test_truncate_index_4(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a very large idx supplied. (idx > longest string in the column). Should not keep all characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, end=100) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, end=100) assert ret[selected_col].isin(["Male", "Female"]).all() -def test_truncate_index_5(my_df): +def test_truncate_index_5(truncate_df): """ Test that truncate runs correctly for the INDEX method with idx and end supplied. Should keep characters [1:3) """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, end=3) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=1, end=3) assert ret[selected_col].isin(["al", "em"]).all() -def test_truncate_index_6(my_df): +def test_truncate_index_6(truncate_df): """ Test that truncate runs correctly for the INDEX method with idx > end supplied. Should not keep any characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, end=1) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=3, end=1) assert ret[selected_col].isin([""]).all() -def test_truncate_input_7(my_df): +def test_truncate_input_7(truncate_df): """ Test that truncate returns same value if no idx or end supplied """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX) assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: match Tests ----- -def test_truncate_match_1(my_df): +def test_truncate_match_1(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern that matches a part of all strings in the specified column. Should only keep characters before "al" for all values """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="al") + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="al") assert ret[selected_col].isin(["M", "Fem"]).all() -def test_truncate_match_2(my_df): +def test_truncate_match_2(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern that matches a part of all strings in the specified column. Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. 
""" - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="em") + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="em") assert ret[selected_col].isin(["Male", "F"]).all() -def test_truncate_match_3(my_df): +def test_truncate_match_3(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern that does not match any string in the specified column Should keep all characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="cat") + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="cat") assert ret[selected_col].isin(["Male", "Female"]).all() -def test_truncate_match_4(my_df): +def test_truncate_match_4(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern matches only when ignorecase is True Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="EM", ignore_case=True) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="EM", ignore_case=True) assert ret[selected_col].isin(["Male", "F"]).all() -def test_truncate_match_5(my_df): +def test_truncate_match_5(truncate_df): """ Test pattern matches are properly escaped by the regex expression Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match=".*", ignore_case=True) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match=".*", ignore_case=True) assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: More Index Tests ----- -def test_truncate_index_11(my_df): +def test_truncate_index_11(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a valid `n` supplied Should only keep characters [3:] """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=3) assert ret[selected_col].isin(["e", "ale"]).all() -def test_truncate_index_12(my_df): +def test_truncate_index_12(truncate_df): """ Test that truncate runs correctly for the INDEX method with a valid `n` supplied and trim_from=END Should only keep characters [:-3] """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, trim_from=END) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=3, trim_from=END) assert ret[selected_col].isin(["M", "Fem"]).all() -def test_truncate_index_13(my_df): +def test_truncate_index_13(truncate_df): """ Test that truncate runs correctly for the INDEX method with a valid `n` supplied and trim_from=BOTH Should only keep characters [1:-1] """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, trim_from=BOTH) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=1, trim_from=BOTH) assert ret[selected_col].isin(["al", "emal"]).all() -def test_truncate_index_14(my_df): +def test_truncate_index_14(truncate_df): """ Test that truncate runs correctly for the INDEX method with a value of `n` supplied greater than some of the string lengths but not others and trim_from=START Should only keep the last "e" in "Female" """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=5, trim_from=START) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], 
method=INDEX, idx=5, trim_from=START) assert ret[selected_col].isin(["", "e"]).all() -def test_truncate_index_15(my_df): +def test_truncate_index_15(truncate_df): """ Test that truncate runs correctly for the INDEX method with a value of `n` supplied greater or equal to than half the length of some strings but not others, and trim_from=BOTH Should only keep characters "ma" from "Female" """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=2, trim_from=BOTH) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=2, trim_from=BOTH) assert ret[selected_col].isin(["", "ma"]).all() -def test_truncate_index_16(my_df): +def test_truncate_index_16(truncate_df): """ Test that truncate runs correctly for the INDEX method with a large value of `n` supplied and trim_from=START Should not keep any """ - ret: pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=100, trim_from=START) + ret: pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=100, trim_from=START) assert ret[selected_col].isin([""]).all() From a7678074f9d0ecc35f819ed00db771db56f22f62 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 16:29:21 -0500 Subject: [PATCH 11/17] Fixes optimization tests --- src/pymasq/mitigations/hashing.py | 4 ++-- tests/optimizations/test_optimizations.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index 547f6c0..d1595b1 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -196,6 +196,6 @@ def hashing( logger.warning( f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length." 
) - return data.applymap(lambda v: hash_func(v).hexdigest(16)) + return data.map(lambda v: hash_func(v).hexdigest(16)) - return data.applymap(lambda v: hash_func(v).hexdigest()) + return data.map(lambda v: hash_func(v).hexdigest()) diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index 35a8961..c8dba80 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -67,7 +67,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: rg(): {"weight": 1}} +rands = {lambda: rg.random(1).item(): {"weight": 1}} # Test standard termination conditions From c822e8e3c3d7c488213527acb75b6e0b60db27da Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 17:45:29 -0500 Subject: [PATCH 12/17] Updates --- src/pymasq/datasets/data_generator.py | 2 +- src/pymasq/metrics/auc_scores.py | 2 +- src/pymasq/metrics/risk_scores.py | 14 +++++++------- src/pymasq/metrics/utility_scores.py | 2 +- src/pymasq/mitigations/microaggregation.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/pymasq/datasets/data_generator.py b/src/pymasq/datasets/data_generator.py index c2d4c6b..ffa42ae 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -138,7 +138,7 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: while len(unique_entries) != len(set(unique_entries)): unique_entries = rg.choice(range(n), l) - non_unique = rg.Generator.choice(unique_entries, n - l) + non_unique = rg.choice(unique_entries, n - l) return list(unique_entries) + list(non_unique) diff --git a/src/pymasq/metrics/auc_scores.py b/src/pymasq/metrics/auc_scores.py index b3d5051..d663def 100644 --- a/src/pymasq/metrics/auc_scores.py +++ b/src/pymasq/metrics/auc_scores.py @@ -134,7 +134,7 @@ def auc_score( ) # Encode the two data frames (at once for consistent encodings) orig_enc, mod_enc = preprocessor_fn.encode_both( - df_A=orig_df, df_B=mod_df, sensitive_col=sensitive_col + df_a=orig_df, df_b=mod_df, sensitive_col=sensitive_col ) # Train the classifer based on only the original data classifer_fn.train( diff --git a/src/pymasq/metrics/risk_scores.py b/src/pymasq/metrics/risk_scores.py index a808abd..6630f70 100644 --- a/src/pymasq/metrics/risk_scores.py +++ b/src/pymasq/metrics/risk_scores.py @@ -284,7 +284,7 @@ def _diversity( def l_diversity( df: pd.DataFrame, sensitive_col: str, - L: int = 2, + l_thresh: int = 2, method: Optional[str] = None, ) -> float: """ @@ -298,7 +298,7 @@ def l_diversity( sensitive_col : str, The name of the column containing the data that is being obscured by mitigations - L : int, optional + l_thresh : int, optional The threshold by which the closeness of the q-blocks and the full dataset are compared (Default: 2) @@ -324,14 +324,14 @@ def l_diversity( else: raise ValueError(f"method must be '{DISTINCT}' or '{ENTROPY}'") - return sum([1.0 if ld <= L else 0.0 for ld in l_div]) / len(l_div) + return sum([1.0 if ld <= l_thresh else 0.0 for ld in l_div]) / len(l_div) @BEARTYPE def is_l_diverse( df: pd.DataFrame, sensitive_col: str, - L: int = 2, + l_thresh: int = 2, method: Optional[str] = None, ) -> bool: """ @@ -348,7 +348,7 @@ def is_l_diverse( sensitive_col : str The name of the column containing the data that is being obscured by mitigations - L : int, optional + l_thresh : int, optional The threshold by which the closeness of the q-blocks and the full dataset are compared. 
Default is arbitrary. (Default: 2) @@ -377,9 +377,9 @@ def is_l_diverse( """ if method is None or method == DISTINCT: - return _diversity(df, sensitive_col, _unique_count) <= L + return _diversity(df, sensitive_col, _unique_count) <= l_thresh elif method == ENTROPY: - return _diversity(df, sensitive_col, _entropy_count) <= np.log(L) + return _diversity(df, sensitive_col, _entropy_count) <= np.log(l_thresh) raise ValueError(f"method must be '{DISTINCT}' or '{ENTROPY}'") diff --git a/src/pymasq/metrics/utility_scores.py b/src/pymasq/metrics/utility_scores.py index f115bb1..4df4f5f 100644 --- a/src/pymasq/metrics/utility_scores.py +++ b/src/pymasq/metrics/utility_scores.py @@ -60,7 +60,7 @@ def jensen_shannon( # Encode the two data frames (at once for consistent encodings) preprocessor_fn = preprocess.preprocessor_fn[preprocessor] orig_enc, mod_enc = preprocessor_fn.encode_both( - df_A=orig_df, df_B=mod_df, sensitive_col=sensitive_col + df_a=orig_df, df_b=mod_df, sensitive_col=sensitive_col ) # remove sensitive column diff --git a/src/pymasq/mitigations/microaggregation.py b/src/pymasq/mitigations/microaggregation.py index 37e4d10..a629b30 100644 --- a/src/pymasq/mitigations/microaggregation.py +++ b/src/pymasq/mitigations/microaggregation.py @@ -124,7 +124,7 @@ def _scaling( raise ImportError( "Unable to import `tensorly` library to perform `robust` scaling; run ´pip3 install tensorly` from within your project environment to install it." ) - scaled_data, _ = robust_pca(data.values.astype(np.float)) + scaled_data, _ = robust_pca(data.values.astype(float)) return scaled_data if callable(scale): return scale(data, **kwargs) @@ -561,7 +561,7 @@ def robust_magg( pw_dists = pairwise_distances(z) - if not all(np.diagonal(pw_dists)) == 0: + if all(np.diagonal(pw_dists)) != 0: np.fill_diagonal(pw_dists, 0) mcd = MinCovDet(random_state=seed).fit(z) From 972f96cc8f60f7faf7ed71fc4a3256db331e922c Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 4 Jan 2024 04:48:18 -0500 Subject: [PATCH 13/17] Updates naming conventions --- src/pymasq/datasets/data_generator.py | 18 +++---- src/pymasq/metrics/risk_scores.py | 75 +++++++++++++-------------- tests/metrics/test_risk_scores.py | 19 +++---- tests/utils/test_cache.py | 2 +- 4 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/pymasq/datasets/data_generator.py b/src/pymasq/datasets/data_generator.py index ffa42ae..17268f2 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -119,12 +119,12 @@ def gen_num_df(n: int = 1000, seed: int = 1234) -> pd.DataFrame: @BEARTYPE -def _l_div_sensitive_gen(l: int, n: int) -> List: +def _l_div_sensitive_gen(l_div: int, n: int) -> List[int]: """ Generates the sensitive variable for generate_l_diverse_table for each equivalence class Parameters ---------- - l : int + l_div : int The specified diversity that the equivalence class needs to be n : int The size of the equivalence class (i.e. 
the lenght of the list returned) @@ -134,17 +134,17 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: List of integer values for the sensitive column """ - unique_entries = rg.choice(range(n), l) + unique_entries = rg.choice(range(n), l_div) while len(unique_entries) != len(set(unique_entries)): - unique_entries = rg.choice(range(n), l) + unique_entries = rg.choice(range(n), l_div) - non_unique = rg.choice(unique_entries, n - l) + non_unique = rg.choice(unique_entries, n - l_div) return list(unique_entries) + list(non_unique) @BEARTYPE def generate_l_diverse_table( - l: Union[int, List[int]], + l_div: Union[int, List[int]], num_col: int = 5, num_q_blocks: int = 5, q_block_sizes: Union[int, List[int]] = 5, @@ -153,7 +153,7 @@ def generate_l_diverse_table( Used for testing l-diversity. Creates a data set that is l-diverse for given l. Parameters ---------- - l : Union[int, List[int]] + l_div : Union[int, List[int]] The specified diversity that the data set needs to be TODO: need to expand this to allow float l parameters for entropy num_col : int, optional The number of columns (in addition to the sensitive column) the data set should have @@ -180,10 +180,10 @@ def generate_l_diverse_table( if isinstance(q_block_sizes, int) else q_block_sizes ) - l = [l] * num_q_blocks if not isinstance(l, list) else l + l_div: List[int] = [l_div] * num_q_blocks if not isinstance(l_div, list) else l_div for n in range(num_q_blocks): - senn = _l_div_sensitive_gen(l[n], q_block_sizes[n]) + senn = _l_div_sensitive_gen(l_div[n], q_block_sizes[n]) col_names["sensitive"] += senn for cn in col_names: if cn != "sensitive": diff --git a/src/pymasq/metrics/risk_scores.py b/src/pymasq/metrics/risk_scores.py index 6630f70..57f0b03 100644 --- a/src/pymasq/metrics/risk_scores.py +++ b/src/pymasq/metrics/risk_scores.py @@ -1,11 +1,9 @@ -from typing import List, Callable, Dict, Union, Final -from pymasq.errors import InputError, NotInRangeError +from typing import List, Callable, Dict, Optional, Union, Final import numpy as np import pandas as pd from copy import copy -from typing import Callable, Dict, Final, List, Optional, Union from pymasq import BEARTYPE from pymasq.errors import InputError, NotInRangeError @@ -185,7 +183,7 @@ def is_k_anon_col( .rename(columns={"size": "k_count"}) ) adf["is_k_anon"] = adf["k_count"] > k - return pd.merge(df, adf, on=key_vars) + return pd.merge(df, adf, on=key_vars, how="left", validate="many_to_one") @BEARTYPE @@ -377,9 +375,9 @@ def is_l_diverse( """ if method is None or method == DISTINCT: - return _diversity(df, sensitive_col, _unique_count) <= l_thresh + return _diversity(df, sensitive_col, _unique_count)["l-diversity"] <= l_thresh elif method == ENTROPY: - return _diversity(df, sensitive_col, _entropy_count) <= np.log(l_thresh) + return _diversity(df, sensitive_col, _entropy_count)["l-diversity"] <= np.log(l_thresh) raise ValueError(f"method must be '{DISTINCT}' or '{ENTROPY}'") @@ -556,8 +554,9 @@ def _closeness( grp_qi = df.groupby(qi) # get the closeness qs = _get_probs(df, sensitive_col) - fun = lambda x: fxn(qs, x) - div = grp_qi[sensitive_col].agg(fun) + def _func(x): + return fxn(qs, x) + div = grp_qi[sensitive_col].agg(_func) counts = grp_qi[sensitive_col].agg("count") _t_closeness = [] @@ -661,15 +660,15 @@ def is_t_close( @BEARTYPE -def indiv_risk_approx(fk: Union[int, float], Fk: Union[int, float]) -> float: +def indiv_risk_approx(samp_freq: Union[int, float], pop_freq: Union[int, float]) -> float: """ calculates the approximate individual risk Parameters ---------- 
- fk : int or float + samp_freq : int or float the sample frequency of the row's combination of quasi-identifier values - Fk : int or float + pop_freq : int or float the population frequence of the row's combination of quasi-identifier values Returns @@ -682,28 +681,28 @@ def indiv_risk_approx(fk: Union[int, float], Fk: Union[int, float]) -> float: TODO """ - if fk == Fk: - return 1 / float(fk) + if samp_freq == pop_freq: + return 1 / float(samp_freq) - pk = float(fk) / float(Fk) + pk = float(samp_freq) / float(pop_freq) - if fk > 2: - return pk / (fk - (1 - pk)) - if fk == 2: + if samp_freq > 2: + return pk / (samp_freq - (1 - pk)) + if samp_freq == 2: return (pk / (1 - pk)) - (((pk / (1 - pk)) ^ 2) * np.log(1 / pk)) return (pk / (1 - pk)) * np.log(1 / pk) @BEARTYPE -def indiv_risk_exact(fk: int, Fk: float) -> float: +def indiv_risk_exact(samp_freq: int, pop_freq: float) -> float: """ calculates the exact individual risk Parameters ---------- - fk : int + samp_freq : int the sample frequency of the row's combination of quasi-identifier values - Fk : int + pop_freq : int the population frequence of the row's combination of quasi-identifier values Returns @@ -716,32 +715,32 @@ def indiv_risk_exact(fk: int, Fk: float) -> float: TODO """ - if fk == Fk: - return 1 / float(fk) + if samp_freq == pop_freq: + return 1 / float(samp_freq) - pk = float(fk) / float(Fk) + pk = float(samp_freq) / float(pop_freq) - def B(fk, pk, i): - b1 = (fk - 1 - i) ^ 2 / ((i + 2) * (fk - 2 - i)) - b2 = (pk ^ (i + 2 - fk) - 1) / (pk ^ (i + 1 - fk) - 1) + def b_func(samp_freq, pk, i): + b1 = (samp_freq - 1 - i) ^ 2 / ((i + 2) * (samp_freq - 2 - i)) + b2 = (pk ^ (i + 2 - samp_freq) - 1) / (pk ^ (i + 1 - samp_freq) - 1) return b1 * b2 - def BB(fk, pk): + def bb_func(samp_freq, pk): bb = 0 - for m in range(fk - 2): + for m in range(samp_freq - 2): b = 1 for m2 in range(m + 1): - b = b * B(fk, pk, m2) + b = b * b_func(samp_freq, pk, m2) bb = bb + (-1) ^ (m + 1) * b return bb - first = (pk / (1 - pk)) ^ fk - third = (-1) ^ fk * np.log(pk) + first = (pk / (1 - pk)) ^ samp_freq + third = (-1) ^ samp_freq * np.log(pk) - if fk > 2: - A = (pk ^ (1 - fk) - 1) / (fk - 1) - return first * ((A * (1 + BB(fk, pk))) + third) - if fk == 2: + if samp_freq > 2: + A = (pk ^ (1 - samp_freq) - 1) / (samp_freq - 1) + return first * ((A * (1 + bb_func(samp_freq, pk))) + third) + if samp_freq == 2: return (pk / (1 - pk)) - (((pk / (1 - pk)) ^ 2) * np.log(1 / pk)) return (pk / (1 - pk)) * np.log(1 / pk) @@ -816,7 +815,7 @@ def indiv_risk( f"Method must be in ['{APPROX}', '{EXACT}'] Method given was {method}" ) - return pd.merge(df, freq_count, how="left", on=quasi_cols + ["order"])["risk"] + return pd.merge(df, freq_count, how="left", on=quasi_cols + ["order"], validate="many_to_one")["risk"] @BEARTYPE @@ -868,7 +867,7 @@ def beta_likeness( InputError This error is raised when a `beta` value of <= 0 is supplied. 
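
The approximate individual-risk docstring above describes a piecewise function of the sample frequency and the population frequency. Note that `^` in the hunk reads as R-style exponentiation; in Python it is bitwise XOR, so an executable version of the same piecewise formula would presumably use `**`, as in this sketch (the function name is illustrative):

```python
import numpy as np

def approx_individual_risk(samp_freq: float, pop_freq: float) -> float:
    """Piecewise approximation of re-identification risk for one row."""
    if samp_freq == pop_freq:
        return 1.0 / float(samp_freq)
    pk = float(samp_freq) / float(pop_freq)
    if samp_freq > 2:
        return pk / (samp_freq - (1 - pk))
    if samp_freq == 2:
        return (pk / (1 - pk)) - ((pk / (1 - pk)) ** 2) * np.log(1 / pk)
    return (pk / (1 - pk)) * np.log(1 / pk)

# A quasi-identifier combination seen twice in the sample and ten times in the population:
print(round(approx_individual_risk(2, 10), 4))  # ~0.1494
```
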
""" - if not beta > 0: + if beta <= 0: raise InputError("beta must be a value greater than 0") qi = ( # Generate a list of all quasi-indicators (qi) [colname for colname in df.columns if colname != sensitive_col] @@ -891,7 +890,7 @@ def beta_likeness( item, sensitive_col ) # get the frequencies of SA values in the equivalence class for key in sa_ec.keys(): - if not sa_all[key] < sa_ec[key]: # satisfies the requirement that p_i < q_i + if sa_all[key] >= sa_ec[key]: # satisfies the requirement that p_i < q_i continue dist = (sa_ec[key] - sa_all[key]) / sa_all[key] # (q_i - p_i) / p_i if enhanced: diff --git a/tests/metrics/test_risk_scores.py b/tests/metrics/test_risk_scores.py index f935765..d96fdf5 100644 --- a/tests/metrics/test_risk_scores.py +++ b/tests/metrics/test_risk_scores.py @@ -28,6 +28,7 @@ def my_df(): LETTER_SET = ["A", "B", "C", "A", "B", "A", "C", "C", "B"] +true_assert_statement: str = "Should be True" @pytest.fixture def letter_df(): @@ -39,7 +40,7 @@ def test_l_diversity_all_same(): Tests l-diversity function """ df = generate_l_diverse_table(2) - assert l_diversity(df, "sensitive", 3) == 1.0, "Should be True" + assert l_diversity(df, "sensitive", 3) == pytest.approx(1.0), true_assert_statement def test_l_diversity_variety(): @@ -47,7 +48,7 @@ def test_l_diversity_variety(): Tests l-diversity function """ df = generate_l_diverse_table([2, 3, 3, 2, 2]) - assert l_diversity(df, "sensitive", 2) == 0.6, "Should be True" + assert l_diversity(df, "sensitive", 2) == pytest.approx(0.6), true_assert_statement def test_t_closeness_num(): @@ -70,7 +71,7 @@ def test_t_closeness_num(): assert ( t_closeness(tc_table, "sensitive", test=True, datatype="numeric") == expected_result - ), "Should be True" + ), true_assert_statement def test_t_closeness_cat(): @@ -93,7 +94,7 @@ def test_t_closeness_cat(): assert ( t_closeness(tc_table, "sensitive", test=True, datatype="categorical") == expected_result - ), "Should be True" + ), true_assert_statement def test_t_closeness(): @@ -102,8 +103,8 @@ def test_t_closeness(): """ tc_table = generate_t_close_table(LETTER_SET) assert ( - t_closeness(tc_table, "sensitive", datatype="categorical", t=0.0) == 1.0 - ), "Should be True" + t_closeness(tc_table, "sensitive", datatype="categorical", t=0.0) == pytest.approx(1.0) + ), true_assert_statement def test_beta_likeness_1(letter_df): @@ -119,7 +120,7 @@ def test_beta_likeness_2(letter_df): Tests beta-likeness on a toy dataset with a very small beta (any information gain should fail) """ assert ( - beta_likeness(letter_df, "sensitive", beta=1e-9) == 4.0 / 9.0 + beta_likeness(letter_df, "sensitive", beta=1e-9) == pytest.approx(4.0 / 9.0) ), "Should fail beta likeness on the 2 A's in EC2 and 2 C's in EC3" @@ -158,7 +159,7 @@ def test_auc_score_1(my_df, method, preprocessor): preprocessor=preprocessor, **kwargs, ) - == 1.0 + == pytest.approx(1.0) ), "Result should be equal to 1.0 (i.e. True)" @@ -186,7 +187,7 @@ def test_auc_score_2(my_df, method, preprocessor): ), 3, ) - assert score == 1.0, "Result should be equal to 1.0 (i.e. True)" + assert score == pytest.approx(1.0), "Result should be equal to 1.0 (i.e. True)" answer_key = { diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index 43775fb..c348dec 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -54,7 +54,7 @@ def my_df(): ( RFClassifier, EmbeddingsEncoder, - 0.61, + 0.57, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.ENCV.e81a5b5eb0df48bc68540d7b71342a7d.pkl", """ENCV. 
Description: Preprocessed with First ten rows: From 392d53ee772dd4a1e8181becaf4f561957ea4d30 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 4 Jan 2024 04:54:19 -0500 Subject: [PATCH 14/17] Updates readme --- tests/integration/README.md | 38 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index aad113e..5a02e38 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -1,4 +1,5 @@ -Integration Testing +# Integration Testing + ------------------- The integration tests will test each `pymasq.mitigation` and `pymasq.metric` available using `pymasq.optimization` procedures. @@ -8,8 +9,9 @@ Any new functionality that is to be tested must be specified in its own configur The `template_config.yaml` file describes the expected format of the configurations. -User Guide ----------- +## User Guide + +------------------- The `integration.py` script is run via the command line. @@ -22,13 +24,13 @@ For top-level use: ### Help Display available actions. - + $ python integration.py [ -h | --help ] ### Verbose Display additional logging info to terminal. _Optional: default is False_. - + $ python integration.py [ -v | --verbose ] ### Test Configuration @@ -37,39 +39,33 @@ Set the complete file path of the test configuration YAML file to use. $ python integration.py [ --test-config ] -### Iterations +### Iterations Set the number of `iters` to run the optimization procedures. This will **not** overwrite `iters` if set in the config file. _Optional: default is 1000000000_. $ python integration.py [ -i | --iters ] +## Configuration Files -Configuration Files ------------------- The integration tests run with the parameters specified in two YAML configuration files, `core_config.yaml` and `test_config.yaml`. -These files should define the configuration of all tests to be run. A third configuration file, `template_config.yaml`, is also included -and provides the schema for how proper configuration files can be defined. +These files should define the configuration of all tests to be run. A third configuration file, `template_config.yaml`, is also included and provides the schema for how proper configuration files can be defined. - The `core_config.yaml` contains the configurations for the mitigations and metrics that -have been vetted previously. +have been vetted previously. > **This file should only be modified when adding a new mitigation or metric that has already been tested.** -- The `test_config.yaml` is intended to include an example configuration file for new functionality. -The configuration in this file will add to or update/overwrite the configuration loaded from `config_core.yaml`. Use the +- The `test_config.yaml` is intended to include an example configuration file for new functionality. The configuration in this file will add to or update/overwrite the configuration loaded from `config_core.yaml`. Use the `--test-config` flag to specify the file path to a different configuration file to be tested. Note that comments to the YAML files can be included by adding "`#`" in any part of the file. -Default Behavior ----------------- -If no optimization procedure is defined the in the configuration file to be tested (e.g., `test_config.yaml`), then -only the `pymasq.optimization.ExhaustiveSearch` procedure will be run. This procedure will test -all permutations in `pymasq.mitigations` and may be time-consuming. 
In this case, you can use the `--iters` flag to -constraint the number of iterations to run. +## Default Behavior + +------------------- -Note that permutations are not applied and evaluated all at once, but rather incrementally. -That is, a mitigation strategy composed of 3 mitigations, will have 6 permutations and will run for 18 iterations, -while a mitigation strategy composed of 6 mitigations will have 720 permutations and will run for 4,320 iterations. +If no optimization procedure is defined the in the configuration file to be tested (e.g., `test_config.yaml`), then only the `pymasq.optimization.ExhaustiveSearch` procedure will be run. This procedure will test all permutations in `pymasq.mitigations` and may be time-consuming. In this case, you can use the `--iters` flag to constraint the number of iterations to run. +Note that permutations are not applied and evaluated all at once, but rather incrementally. That is, a mitigation strategy composed of 3 mitigations, will have 6 permutations and will run for 18 iterations, while a mitigation strategy composed of 6 mitigations will have 720 permutations and will run for 4,320 iterations. From 1633fb2c9234f5f0b2466c5181e2f32cb83fc0c5 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 4 Jan 2024 05:06:00 -0500 Subject: [PATCH 15/17] Updates to py310 --- setup.cfg | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/setup.cfg b/setup.cfg index 31608d1..effd9d5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,7 @@ author = Cuyler OBrien, Jaime Pena, Evan Young, Brian Levine, Eric Wybenga author_email = cuyler.obrien@ll.mit.edu, jdpena@ll.mit.edu, evan.young@ll.mit.edu [options] -python_requires = >= 3.9 +python_requires = >= 3.10 packages = find: package_dir = = src @@ -29,11 +29,8 @@ install_requires = tensorflow~=2.9 tpot[dask]~=0.11 tests_require = - beartype>=0.5.1 - hypothesis>=4.53.2 - pytest>=3.8 - pytest-xdist~=3.5 - + beartype>=0.5.1 + pytest~=7.4 [options.packages.find] where = src @@ -45,7 +42,7 @@ python_files=test_*.py testpaths=tests [tox:tox] -envlist = py3{9,10,11}, coverage, bandit, owasp-depcheck +envlist = py3{10,11}, coverage, bandit, owasp-depcheck toxworkdir = build/tox [testenv] From c3b301da37752ddf46704ba98cf097e5d2fc9376 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Sat, 6 Jan 2024 15:57:49 -0500 Subject: [PATCH 16/17] remove boruta because it has not been updated --- setup.cfg | 3 +- src/pymasq/__init__.py | 2 +- src/pymasq/kve/kve.py | 127 +------------------------- tests/classifiers/test_classifiers.py | 4 +- tests/kve/test_kve.py | 96 +------------------ 5 files changed, 10 insertions(+), 222 deletions(-) diff --git a/setup.cfg b/setup.cfg index effd9d5..19ef812 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,10 +16,9 @@ packages = find: package_dir = = src install_requires = - boruta~=0.3 bpemb~=0.3 matplotlib~=3.5 - numpy~=1.22 + numpy~=1.26 pandas~=1.4 plotly>=4.11.0 SALib~=1.4 diff --git a/src/pymasq/__init__.py b/src/pymasq/__init__.py index 37450ba..9687132 100644 --- a/src/pymasq/__init__.py +++ b/src/pymasq/__init__.py @@ -1,6 +1,6 @@ from os import path -__version__ = "0.6.5" +__version__ = "0.6.6" try: diff --git a/src/pymasq/kve/kve.py b/src/pymasq/kve/kve.py index edba759..a9ccac1 100644 --- a/src/pymasq/kve/kve.py +++ b/src/pymasq/kve/kve.py @@ -8,8 +8,6 @@ from numpy import ndarray import pandas as pd import statsmodels.api as sm -import json -from boruta import BorutaPy from pandas.api.types import is_numeric_dtype from sklearn.ensemble import RandomForestClassifier, 
RandomForestRegressor from sklearn.feature_selection import RFECV @@ -26,12 +24,10 @@ "key_variable_exploration", "importance_scores", "random_forest_scores", - "boruta_scores", "rfe_scores", "stepwise_scores", "stepwise_selection", "RANDOM_FOREST", - "BORUTA", "RFE", "INCLUDE", "VARIABLE", @@ -43,7 +39,6 @@ RANDOM_FOREST: Final = "Random_Forest" -BORUTA: Final = "Boruta" RFE: Final = "RFE" STEPWISE: Final = "Stepwise" INCLUDE: Final = "Include" @@ -136,7 +131,7 @@ def key_variable_exploration( **kwargs Additional arguments to be passed to `importance_Scores`: - * methods : Tuple[str], optional Default: ('rf', 'boruta', 'rfe', 'stepwise') + * methods : Tuple[str], optional Default: ('rf', 'rfe', 'stepwise') Names of the ranking methods to run. Returns @@ -162,7 +157,7 @@ def key_variable_exploration( normalize=True, ) - methods = kwargs.get("methods", (RANDOM_FOREST, BORUTA, RFE, STEPWISE)) + methods = kwargs.get("methods", (RANDOM_FOREST, RFE, STEPWISE)) categories = len(df[sensitive_col].dropna().unique()) if categories < 2: print( @@ -238,7 +233,7 @@ def importance_scores( Number of categories in the senestive column used to determine the type of model used in feature selection, -1 indicates the column is continuous - methods : Tuple[str], optional (Default: "Random_Forest","Boruta","RFE", "Stepwise") + methods : Tuple[str], optional (Default: "Random_Forest","RFE", "Stepwise") Names of the ranking methods to run verbose : int {0, 1, 2}, (Default: 0) @@ -256,7 +251,7 @@ def importance_scores( "callback", None ) # callable function that emits to main server if methods is None: - methods = (RANDOM_FOREST, BORUTA, RFE, STEPWISE) + methods = (RANDOM_FOREST, RFE, STEPWISE) method_len = float(len(methods)) # instantiated for progress emits method_count = 1 # instantiated for progress emits x_rf = input_df.drop([sensitive_col], axis=1) @@ -274,15 +269,6 @@ def importance_scores( if progress_reporter is not None: progress_reporter(method_count / method_len) method_count += 1 - if BORUTA in methods and x_train.shape[0] >= 250: - if verbose > 0: - print("Running Boruta...") - score_dict[f"{BORUTA}_{INCLUDE}"] = boruta_scores( - x_train, y, verbose=verbose, categories=categories - ) - if progress_reporter is not None: - progress_reporter(method_count / method_len) - method_count += 1 if RFE in methods: if verbose > 0: print("Running Recursive Feature Elimination...") @@ -392,109 +378,6 @@ def random_forest_scores( return rf.feature_importances_, include -@BEARTYPE -def boruta_scores( - x_train: pd.DataFrame, - y: pd.Series, - categories: int, - n_estimators: int = 1000, - n_jobs: int = -1, - random_state: int = 1234, - verbose: int = 0, - max_iter: int = 50, -) -> List[str]: - """ - Boruta is an all relevant feature selection method, while most other are - minimal optimal; this means it tries to find all features carrying - information usable for prediction, rather than finding a possibly compact - subset of features on which some classifier has a minimal error - - - NOTE: Does not work with small data, requires >250 rows - - Parameters - ---------- - x_train : pd.DataFrame - A dataframe containing all input variables for training the model - - y : pd.Series - A series containing the ground truth labels or numbers - - categories: int - number of categories in the senestive column used to determine the type - of model used in feature selection, -1 indicates the column is continuous - - n_estimators : int, optional (Default: 1000) - Number of trees that are constructed during the random forest - - 
n_jobs : int, optional (Default: -1) - Number of workers to use for parallel processing - - -1 indicates use all available workers - - random_state: int, optional (Default: 1234) - Integer seed for setting the random state in the model - - verbose : int {0, 1, 2}, optional (Default 2) - Level of reporting from the algorithms: - - 0 disables verbose logging - - 2 is step-by-step reporting - - max_iter: int, optional (Default: 50) - The number of maximum iterations to perform. - - Returns - ------- - List[str] - list of strings, contains whether a feature should be included in - further analysis: - - "yes": boruta ranking = 1 - - "maybe": boruta ranking = 2 - - "no": boruta ranking >= 3 - - References - ---------- - https://medium.com/@indreshbhattacharyya/feature-selection-categorical-feature-selection-boruta-light-gbm-chi-square-bf47e94e2558 - - """ - if x_train.shape[0] < 250: - print("Requires > 250 rows to be stable") - return [] - if categories >= 2: - rf = RandomForestClassifier( - n_estimators=n_estimators, - n_jobs=n_jobs, - verbose=verbose, - random_state=random_state, - ) - else: - rf = RandomForestRegressor( - n_estimators=n_estimators, - n_jobs=n_jobs, - verbose=verbose, - random_state=random_state, - ) - boruta_selector = BorutaPy( - rf, - verbose=verbose, - n_estimators="auto", - random_state=random_state, - max_iter=max_iter, - ) - if isinstance(x_train, np.ndarray): - boruta_selector.fit(x_train, y) - else: - boruta_selector.fit(x_train.values, y.values) - include = [] - for r in list(boruta_selector.ranking_): - if r == 1: - include.append("yes") - elif r == 2: - include.append("maybe") - else: - include.append("no") - return include - - @BEARTYPE def rfe_scores( x_train: pd.DataFrame, @@ -638,7 +521,7 @@ def rfe_scores( multi_class="ovr", ) else: - estimator = LinearRegression(normalize=True, n_jobs=n_jobs) + estimator = LinearRegression(n_jobs=n_jobs) rfecv_selector = RFECV(estimator, step=step, cv=cv, verbose=verbose, n_jobs=n_jobs) rfecv_selector.fit(x_train, y) return ["yes" if r == 1 else "no" for r in list(rfecv_selector.ranking_)] diff --git a/tests/classifiers/test_classifiers.py b/tests/classifiers/test_classifiers.py index 62ddc12..c220321 100644 --- a/tests/classifiers/test_classifiers.py +++ b/tests/classifiers/test_classifiers.py @@ -37,8 +37,8 @@ def my_df(): (LogisticRegressionClassifier, EmbeddingsEncoder, 0.5), (RFClassifier, LabelEncoderPM, 1.0), (RFClassifier, EmbeddingsEncoder, 1.0), - (TpotClassifier, LabelEncoderPM, 0.77), - (TpotClassifier, EmbeddingsEncoder, 0.86), + (TpotClassifier, LabelEncoderPM, 0.8), + (TpotClassifier, EmbeddingsEncoder, 0.81), ], ) def test_classifiers(my_df, combo): diff --git a/tests/kve/test_kve.py b/tests/kve/test_kve.py index d2ab2fa..d5d0b7e 100644 --- a/tests/kve/test_kve.py +++ b/tests/kve/test_kve.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from pymasq.kve import random_forest_scores, boruta_scores, rfe_scores, stepwise_scores +from pymasq.kve import random_forest_scores, rfe_scores, stepwise_scores from pymasq.datasets import gen_num_df, gen_bin_df, load_census from pymasq.preprocessing import EmbeddingsEncoder from pymasq import ROOT_DIR @@ -71,28 +71,6 @@ def test_random_forest_cont(my_df): ) assert len(rf[1]) > 0, "Should be True" - -def test_boruta_cont(my_df): - """ - Tests boruta_scores if passed a continuous variable for y - """ - sensitive_col = "age" - my_df = EmbeddingsEncoder.encode( - my_df, - sensitive_col=sensitive_col, - cache_location=ROOT_DIR + "/datasets/data/cache", - ) - rf = 
boruta_scores( - x_train=my_df.drop(sensitive_col, axis=1), - y=my_df[sensitive_col], - verbose=0, - categories=-1, - max_iter=5, - n_estimators=20, - ) - assert len(rf[1]) > 0, "Should be True" - - def test_rfe_cont(my_df): """ Tests rfe_scores if passed a continuous variable for y @@ -134,29 +112,6 @@ def test_random_forest_multiclass(my_df): assert len(rf[1]) > 0, "Should be True" -def test_boruta_multiclass(my_df): - """ - Tests boruta_scores if passed a variable with number of categories > 2 for y - """ - sensitive_col = "education" - my_df = EmbeddingsEncoder.encode( - my_df, - sensitive_col=sensitive_col, - cache_location=ROOT_DIR + "/datasets/data/cache", - ) - y = my_df[sensitive_col] - n_cats = len(y.dropna().unique()) - rf = boruta_scores( - x_train=my_df.drop(sensitive_col, axis=1), - y=y, - verbose=0, - categories=n_cats, - max_iter=5, - n_estimators=20, - ) - assert len(rf[1]) > 0, "Should be True" - - def test_rfe_multiclass(my_df): """ Tests rfe_scores if passed a variable with number of categories > 2 for y @@ -194,20 +149,6 @@ def test_random_forest_bin(bin_df): ), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]" -def test_boruta_bin(bin_df): - """ - Tests boruta_scores feature importance ranks for a binary dataframe - of a given size. - """ - y = bin_df["Label"] - n_cats = len(y.dropna().unique()) - assert boruta_scores( - x_train=bin_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats - ) == ["yes"] * 5 + [ - "maybe" - ], "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'maybe']" - - def test_rfe_bin(bin_df): """ Tests rfe_scores feature importance ranks for a binary dataframe @@ -241,24 +182,6 @@ def test_random_forest_num(num_df): ), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]" -def test_boruta_num(num_df): - """ - Tests boruta_scores feature importance ranks for a numeric dataframe - of a given size. - """ - y = num_df["Label"] - n_cats = len(y.dropna().unique()) - assert ( - boruta_scores( - x_train=num_df.drop("Label", axis=1), - y=y, - verbose=0, - categories=n_cats, - ) - == ["yes"] * 6 - ), "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'yes']" - - def test_rfe_num(num_df): """ Tests rfe_scores feature importance ranks for a numeric dataframe @@ -277,23 +200,6 @@ def test_rfe_num(num_df): ), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']" -def test_boruta_comb(comb_df): - """ - Tests boruta_scores feature importance ranks for a combined dataframe - of a given size. 
- """ - if comb_df.shape[0] <= 2000: - assert True - y = comb_df["Label"] - n_cats = len(y.dropna().unique()) - scores = boruta_scores( - x_train=comb_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats - ) - assert ( - scores == ["yes"] * 5 + ["maybe"] + ["yes"] * 6 - ), "One 'maybe' at index 5, otherwise all 'yes" - - def test_random_forest_comb(comb_df): """ Tests random_forest_scores feature importance ranks for a combined dataframe From bed7a7d5c581be44fdbb53fceb3739b5bd222e82 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 17 Jan 2024 20:41:05 -0500 Subject: [PATCH 17/17] replaces print statements with logs --- src/pymasq/config.py | 5 ++ src/pymasq/kve/kve.py | 14 ++-- src/pymasq/metrics/utils.py | 5 +- src/pymasq/mitigations/geom_transform.py | 7 +- src/pymasq/mitigations/microaggregation.py | 2 +- src/pymasq/mitigations/pram.py | 9 ++- src/pymasq/mitigations/shuffle.py | 4 +- src/pymasq/models/_base.py | 7 +- src/pymasq/models/models.py | 36 ++++++---- src/pymasq/optimizations/_base.py | 24 +++---- src/pymasq/optimizations/optimizations.py | 73 ++++++++++---------- src/pymasq/preprocessing/entity_embedding.py | 12 ++-- src/pymasq/preprocessing/preprocess.py | 26 ++++--- src/pymasq/utils/cache.py | 30 ++++---- src/pymasq/utils/utils.py | 13 ++-- tests/classifiers/test_classifiers.py | 21 +++--- tests/integration/integration.py | 30 ++++---- tests/metrics/test_utility_scores.py | 10 +-- tests/mitigations/test_global_recode.py | 16 +++-- tests/mitigations/test_hashing.py | 12 ++-- tests/preprocessing/test_preprocess.py | 13 ++-- tests/utils/test_cache.py | 16 +++-- 22 files changed, 217 insertions(+), 168 deletions(-) diff --git a/src/pymasq/config.py b/src/pymasq/config.py index 57f1b3a..f965170 100644 --- a/src/pymasq/config.py +++ b/src/pymasq/config.py @@ -1,6 +1,8 @@ from pathlib import Path from typing import Tuple +import numpy as np + # Directory where all embeddings and models will be cached CACHE_LOCATION: Path = Path("~/.cache/pymasq").expanduser() @@ -26,6 +28,7 @@ CLASSIFIER_MODELS: Tuple[str] = ("logreg", "rfclass", "tpotclass") DEFAULT_LOGISITIC_REGRESSION_SOLVER: str = "saga" +DEFAULT_MODEL_ITERATIONS: int = 1000 # Byte Pair Encoding default language and dimensionality for vectors BPE_LANG: str = "en" @@ -39,3 +42,5 @@ # Default number of parallel processors, set to -1 for all processors DEFAULT_N_JOBS: int = -1 + +rg = np.random.default_rng(DEFAULT_SEED) \ No newline at end of file diff --git a/src/pymasq/kve/kve.py b/src/pymasq/kve/kve.py index a9ccac1..75add12 100644 --- a/src/pymasq/kve/kve.py +++ b/src/pymasq/kve/kve.py @@ -160,7 +160,7 @@ def key_variable_exploration( methods = kwargs.get("methods", (RANDOM_FOREST, RFE, STEPWISE)) categories = len(df[sensitive_col].dropna().unique()) if categories < 2: - print( + logger.info( "The kve function requires two categories for binary classification and the {} column has {} class".format( sensitive_col, categories ) @@ -173,7 +173,7 @@ def key_variable_exploration( df, sensitive_col, categories=categories, verbose=verbose, **kwargs ) if verbose > 0: - print("Building ranking...") + logger.info("Building ranking...") include_cols = [c for c in rank_df.columns if INCLUDE in c] rank_df[INCLUDE] = rank_df.apply( @@ -261,7 +261,7 @@ def importance_scores( score_dict = {} if RANDOM_FOREST in methods: if verbose > 0: - print("Running Random Forest...") + logger.info("Running Random Forest...") ( score_dict[RANDOM_FOREST], score_dict[f"{RANDOM_FOREST}_{INCLUDE}"], @@ -271,7 +271,7 @@ def importance_scores( 
method_count += 1 if RFE in methods: if verbose > 0: - print("Running Recursive Feature Elimination...") + logger.info("Running Recursive Feature Elimination...") score_dict[f"{RFE}_{INCLUDE}"] = rfe_scores( x_train, y, verbose=verbose, categories=categories ) @@ -280,7 +280,7 @@ def importance_scores( method_count += 1 if STEPWISE in methods: if verbose > 0: - print("Running Stepwise...") + logger.info("Running Stepwise...") score_dict[f"{STEPWISE}_{INCLUDE}"] = stepwise_scores( x_rf, y_rf, verbose=verbose ) @@ -654,7 +654,7 @@ def stepwise_selection( tested.append(best_feature) changed = True if verbose > 0: - print("Add {:30} with p-value {:.6}".format(best_feature, best_pval)) + logger.info("Add {:30} with p-value {:.6}".format(best_feature, best_pval)) # backward step model = sm.OLS(y, sm.add_constant(pd.DataFrame(x_train[included]))).fit() @@ -666,7 +666,7 @@ def stepwise_selection( worst_feature = included[pvalues.argmax()] included.remove(worst_feature) if verbose > 0: - print("Drop {:30} with p-value {:.6}".format(worst_feature, worst_pval)) + logger.info("Drop {:30} with p-value {:.6}".format(worst_feature, worst_pval)) if not changed: break count += 1 diff --git a/src/pymasq/metrics/utils.py b/src/pymasq/metrics/utils.py index 54c9f02..b33043f 100644 --- a/src/pymasq/metrics/utils.py +++ b/src/pymasq/metrics/utils.py @@ -1,3 +1,5 @@ +import logging + from pymasq.config import CATEGORY_THRESHOLD import pandas as pd @@ -7,6 +9,7 @@ __all__ = ["uniq_col_name", "_get_model_task"] +logger = logging.getLogger(__name__) @BEARTYPE def uniq_col_name(df, prefix: str = "class") -> str: @@ -61,7 +64,7 @@ def _get_model_task( elif is_numeric_dtype(sensitive_col): return "regression" else: - print( + logger.info( "The number of unique categories: {} is greater than the threshold of {} and is dtype {}".format( num_unique, cat_threshold, sensitive_col.dtype ) diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index 6c89a63..d5dd118 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -1,4 +1,5 @@ import itertools +import logging import numpy as np import pandas as pd @@ -7,7 +8,7 @@ from typing import List, Optional, Union from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES, DEFAULT_SEED +from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES, rg from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical from pymasq.utils import formatting @@ -15,7 +16,7 @@ __all__ = ["geom_transform"] -rg = np.random.default_rng(DEFAULT_SEED) +logger = logging.getLogger(__name__) SKIP_ROTATION_ANGLES = [30, 45, 60, 90, 120, 135, 150, 180] MAX_DEGREES = 180 @@ -231,7 +232,7 @@ def geom_transform( f"The values of `data[{perturb_cols}]` are all identical and therefore cannot be used for correlation." ) else: - print( + logger.info( "WARNING: ignoring columns that are composed entirely of identical values." 
) elif len(perturb_cols) == 1: diff --git a/src/pymasq/mitigations/microaggregation.py b/src/pymasq/mitigations/microaggregation.py index a629b30..a86196a 100644 --- a/src/pymasq/mitigations/microaggregation.py +++ b/src/pymasq/mitigations/microaggregation.py @@ -548,7 +548,7 @@ def robust_magg( # test data for normality; z-scores are only meaningful for normally distributed data result = shapiro(data) if result.pvalue < 0.05: - print( + logger.info( f"Warning: data not normally distributed; fails Shapiro-Wilk test (p={result.pvalue})." ) diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index 17e368f..6c2c9d5 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -1,8 +1,9 @@ +import logging +from typing import Dict, List, Optional, Union + import pandas as pd import numpy as np -from typing import Dict, List, Optional, Union - from pymasq import BEARTYPE from pymasq.config import ( DEFAULT_SEED, @@ -15,6 +16,8 @@ __all__ = ["pram"] +logger = logging.getLogger(__name__) + rg = np.random.default_rng(DEFAULT_SEED) def __calc_transition_matrix( @@ -301,7 +304,7 @@ def pram( if len(perturb_cols) == 0: raise InputError("All values of `data` cannot be NaNs or identical.") else: - print( + logger.info( "WARNING: ignoring columns that are composed entirely of identical values." ) diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 79dd284..af37fa5 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -1,3 +1,4 @@ +import logging import math from typing import Union, List, Final, Optional @@ -25,6 +26,7 @@ "MODEL", ] +logger = logging.getLogger(__name__) SPEARMAN: Final = "spearman" PEARSON: Final = "pearson" @@ -212,7 +214,7 @@ def shuffle( f"The values of `data[{cor_cols}]` are all identical and therefore cannot be used for correlation." ) else: - print( + logger.info( "WARNING: ignoring columns that are composed entirely of identical values." ) diff --git a/src/pymasq/models/_base.py b/src/pymasq/models/_base.py index 2cc0efa..898a9c3 100644 --- a/src/pymasq/models/_base.py +++ b/src/pymasq/models/_base.py @@ -1,3 +1,4 @@ +import logging import os from abc import abstractmethod from typing import Type, Optional, Union @@ -9,6 +10,8 @@ from pymasq.preprocessing._base import PreprocessorBase from pymasq import BEARTYPE +logger = logging.getLogger(__name__) + class ModelingBase: """ @@ -106,7 +109,7 @@ def train( if not retrain: self.load_trained_model(df, verbose) # sets self.trained from file if self.trained and verbose > 0: - print( + logger.info( f"{self.name}: loading trained model from cache. 
(Set retrain=True to ignore cache.)" ) @@ -186,7 +189,7 @@ def save_trained_model( verbose=verbose, ) if verbose > 0: - print(f"{self.name} model trained and saved to: {filename}") + logger.info(f"{self.name} model trained and saved to: {filename}") @BEARTYPE def load_trained_model( diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 5e19ede..d4debf1 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -1,14 +1,16 @@ -from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER +import logging +from typing import List, Optional, Type, Any, Union + import pandas as pd import numpy as np -from typing import List, Optional, Type, Any, Union -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder, StandardScaler from tpot import TPOTClassifier, TPOTRegressor from sklearn.metrics import roc_auc_score from sklearn.linear_model import ElasticNetCV, ElasticNet, LarsCV, LogisticRegressionCV from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from pymasq import BEARTYPE +from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER, DEFAULT_MODEL_ITERATIONS from pymasq.models._base import ModelingBase from pymasq.preprocessing._base import PreprocessorBase @@ -24,6 +26,8 @@ ######################### +logger = logging.getLogger(__name__) + def mape( y_true: Union[pd.Series, List[float]], y_pred: Union[pd.Series, List[float]] @@ -106,7 +110,7 @@ def train( # We didn't: train the model and cache it. if verbose > 0: - print("Training LarsCV model ") + logger.info("Training LarsCV model ") x_train = df_enc.drop(y_column, axis=1) y = df_enc[y_column] self.trained = LarsCV(n_jobs=self.n_jobs) @@ -198,7 +202,7 @@ def train( # no cache found, we need to train. if verbose > 0: - print("Training ElasticNetCV model ") + logger.info("Training ElasticNetCV model ") x_train = df_enc.drop(y_column, axis=1) y = df_enc[y_column] @@ -263,8 +267,9 @@ def __init__( A string defining the type of modeling task """ super().__init__( - name="logreg", cache_location=cache_location, modeling_task=modeling_task + name="logreg", cache_location=cache_location, modeling_task=modeling_task, ) + self.scaler = StandardScaler() @BEARTYPE def train( @@ -307,16 +312,18 @@ def train( # no cache found, we need to train. if verbose > 0: - print("Training Logistic Regression model ") + logger.info("Training Logistic Regression model ") x_train = df_enc.drop(y_column, axis=1) + x_scaled = self.scaler.fit_transform(x_train) y = LabelEncoder().fit_transform(df_enc[y_column]) self.trained = LogisticRegressionCV( random_state=self.seed, n_jobs=self.n_jobs, solver=DEFAULT_LOGISITIC_REGRESSION_SOLVER, + max_iter=DEFAULT_MODEL_ITERATIONS, ) - self.trained.fit(x_train, y) + self.trained.fit(x_scaled, y) # save to cache self.save_trained_model( @@ -344,12 +351,13 @@ def predict(self, x_test: pd.DataFrame, y_true: pd.Series) -> float: """ assert self.trained is not None + x_scaled = self.scaler.fit_transform(x_test) if pd.Series(y_true).nunique() == 2: - y_predict = self.trained.predict(x_test) + y_predict = self.trained.predict(x_scaled) return roc_auc_score(y_true=y_true.tolist(), y_score=y_predict) else: - y_predict = self.trained.predict_proba(x_test) + y_predict = self.trained.predict_proba(x_scaled) return roc_auc_score( y_true=y_true.tolist(), y_score=y_predict[:, 1:], multi_class="ovr" ) @@ -416,7 +424,7 @@ def train( # no cache found, we need to train. 
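
The logistic-regression changes above add a `StandardScaler` and raise `max_iter`, the usual recipe for getting the saga solver to converge on unscaled features. In the conventional scikit-learn pattern the scaler is fit once on training data and only `transform` is reused at prediction time, as in this generic illustration (not pymasq's exact class):

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn scaling statistics on training data only
X_test_scaled = scaler.transform(X_test)        # reuse the same statistics at predict time

clf = LogisticRegressionCV(solver="saga", max_iter=1000, random_state=1234)
clf.fit(X_train_scaled, y_train)
print(clf.score(X_test_scaled, y_test))
```
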
if verbose > 0: - print("Training Logistic Regression model ") + logger.info("Training Logistic Regression model ") x_train = df_enc.drop(y_column, axis=1) y = LabelEncoder().fit_transform(df_enc[y_column]) @@ -518,7 +526,7 @@ def train( # We didn't: train the model and cache it. if verbose > 0: - print("Training LarsCV model ") + logger.info("Training LarsCV model ") x_train = df_enc.drop(y_column, axis=1) y = LabelEncoder().fit_transform(df_enc[y_column]) self.trained = RandomForestRegressor(n_jobs=self.n_jobs, random_state=self.seed) @@ -671,7 +679,7 @@ def train( # No cache available, we need to train if verbose > 0: - print(f"{type(self).__name__} Training new model.") + logger.info(f"{type(self).__name__} Training new model.") tpot = TPOTClassifier( generations=int(generations), @@ -852,7 +860,7 @@ def train( # No cache available, we need to train if verbose > 0: - print(f"{type(self).__name__} Training new model.") + logger.info(f"{type(self).__name__} Training new model.") tpot = TPOTRegressor( generations=int(generations), diff --git a/src/pymasq/optimizations/_base.py b/src/pymasq/optimizations/_base.py index 8db85f7..e81d472 100644 --- a/src/pymasq/optimizations/_base.py +++ b/src/pymasq/optimizations/_base.py @@ -1,15 +1,14 @@ import copy import inspect -import numpy as np -import pandas as pd +import logging from abc import abstractmethod - from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import numpy as np +import pandas as pd import pymasq.mitigations as mits import pymasq.metrics as mets - from pymasq import BEARTYPE from pymasq.config import DEFAULT_SEED from pymasq.errors import ( @@ -19,6 +18,7 @@ NoMutationAvailableError, ) +logger = logging.getLogger(__name__) rg = np.random.Generator(np.random.PCG64(DEFAULT_SEED)) class OptimizationBase: @@ -167,7 +167,7 @@ def __init__( if not self.reuse_mutations and self.iters > n_mutations: self.iters = n_mutations if self.verbose: - print( + logger.info( ">>> [Info]: The number of iterations (%i)" % (iters), "cannot exceed the number of mutations specified (%i)" % (n_mutations), @@ -251,7 +251,7 @@ def optimize(self) -> Tuple[pd.DataFrame, float, pd.DataFrame]: A dataframe with the records of each dataframe, mutation, and fitness value accross the optimization """ if self.verbose: - print("[Starting ...]") + logger.info("[Starting ...]") self._target = self.target.copy() self._iters = self.iters @@ -261,7 +261,7 @@ def optimize(self) -> Tuple[pd.DataFrame, float, pd.DataFrame]: target, fit, logbook = self._optimize() # algo-specific if self.verbose: - print("[... Search Complete]") + logger.info("[... Search Complete]") if self.progress_reporter: self.progress_reporter(1.0) @@ -312,7 +312,7 @@ def _evaluate(self, target) -> Tuple[float, List[Tuple]]: func = getattr(mets, func) if self.verbose >= 2: - print("\t[Evaluation]: %s" % (func)) + logger.info("\t[Evaluation]: %s" % (func)) params = copy.deepcopy(args.get("params", {})) @@ -339,7 +339,7 @@ def _evaluate(self, target) -> Tuple[float, List[Tuple]]: raise else: if self.verbose >= 2: - print(f"[Warning] exception {func.__name__}: {e}") + logger.info(f"[Warning] exception {func.__name__}: {e}") raise fitnesses.append((func.__name__, value, args["weight"])) @@ -403,7 +403,7 @@ def _mutate( if not self.reuse_mutations and not mutations: if self.verbose: - print( + logger.info( ">>> [NOOP] No mutations to apply (consider changing `reuse_mutations`)." 
) return target, {} # NOOP; all mitigations used and removed @@ -435,13 +435,13 @@ def _mutate( func = getattr(mits, func) if self.verbose >= 2: - print("\t[Mutation]: %s" % (func), args) + logger.info("\t[Mutation]: %s" % (func), args) try: result = func(target, **args) except Exception as e: if self.verbose >= 2: - print( + logger.info( f"[Warning] mutation {func.__name__} failed with args:={args} and error: {e}" ) raise diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index 9697b21..9eda759 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -1,13 +1,16 @@ import itertools +import logging from typing import Optional import numpy as np from scipy.special import perm from pymasq import BEARTYPE +from pymasq.config import rg from pymasq.errors import LessThanOrEqualToZeroError, NotInRangeError from pymasq.optimizations._base import OptimizationBase +logger = logging.getLogger(__name__) class IterativeSearch(OptimizationBase): """Iterative (sequential) optimization algorithm. @@ -93,7 +96,7 @@ def _optimize(self): while all([cur_fit > self.theta, self._iters > 0]): if self.verbose: - print("-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -105,13 +108,13 @@ def _optimize(self): new_fit, fit_log, met_errors = self._safe_evaluate(new_target) error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | " % (cur_fit), "New fitness: %.5f | " % (new_fit), "Best fitness: %.5f" % (best_fit), ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) cur_fit = new_fit target = new_target @@ -125,12 +128,12 @@ def _optimize(self): ) if cur_fit <= self.theta and self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") self._iters -= 1 if self._iters <= 0 and self.verbose: - print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") if self.return_best: return best_target, best_fit, self._logbook @@ -239,7 +242,7 @@ def _optimize(self): while all([cur_fit > self.theta, self._iters > 0, retry > 0]): if self.verbose: - print("-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -252,11 +255,11 @@ def _optimize(self): error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | New fitness: %.5f" % (cur_fit, new_fit) ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) if new_fit < cur_fit: cur_fit = new_fit @@ -264,7 +267,7 @@ def _optimize(self): else: retry -= 1 if self.verbose >= 2: - print(">>> Retries left: %i" % (retry)) + logger.info(">>> Retries left: %i" % (retry)) self._record_stats( fitness=cur_fit, @@ -275,16 +278,15 @@ def _optimize(self): ) if cur_fit <= self.theta and self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") - if retry <= 0: - if self.verbose: - print(">>> [Terminating]: Max number of retries reached") + if retry <= 0 and self.verbose: + logger.info(">>> [Terminating]: Max number of retries reached") self._iters -= 1 if self._iters <= 0 and self.verbose: - 
print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") return target, cur_fit, self._logbook @@ -427,7 +429,7 @@ def _optimize(self): while all([best_fit > self.theta, self._iters > 0]): if self.verbose: - print("-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -442,31 +444,30 @@ def _optimize(self): error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | " % (cur_fit), "New fitness: %.5f | " % (new_fit), "Best fitness: %.5f" % (best_fit), ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) - prob = np.random.random_sample() + prob = rg.random() if not target.equals(new_target) and ( self._accept_prob(cur_fit, new_fit) > prob ): - if self.verbose >= 1: - print( - ">> New solution accepted", - "(inferior solution)" if cur_fit < new_fit else "", - ) + if self.verbose >= 1 and cur_fit < new_fit: + logger.info( + ">> New solution accepted" + ) cur_fit = new_fit target = new_target accepted = True if new_fit < best_fit: if self.verbose >= 1: - print(f">> New [best] solution found: {new_fit} < {best_fit}") + logger.info(f">> New [best] solution found: {new_fit} < {best_fit}") best_fit = new_fit best_target = new_target @@ -482,10 +483,10 @@ def _optimize(self): self.temperature *= 1 - self.alpha if cur_fit <= self.theta and self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") if self._iters <= 0 and self.verbose: - print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") return best_target, best_fit, self._logbook @@ -621,25 +622,25 @@ def _optimize(self): if any([cur_fit <= self.theta, self._iters <= 0]): if self.verbose: - print(">>> [Terminating]: Solution found or Iterations Complete") + logger.info(">>> [Terminating]: Solution found or Iterations Complete") return target, cur_fit, self._logbook if self.randomize_mutations: # Note: only matters when `num_perms` is set. 
- test = np.random.shuffle(self._mutations) + rg.shuffle(self._mutations) for num_perms, mutation_perms in enumerate( itertools.permutations(self._mutations, self.size_perms) ): if self.verbose: - print("-- Permutation: [%i] --" % (num_perms)) + logger.info("-- Permutation: [%i] --" % (num_perms)) target = self._target.copy() stop = False for mutation in mutation_perms: if self.verbose: - print("\t-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("\t-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -654,13 +655,13 @@ def _optimize(self): error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | " % (cur_fit), "New fitness: %.5f | " % (new_fit), "Best fitness: %.5f" % (best_fit), ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) cur_fit = new_fit target = new_target @@ -679,7 +680,7 @@ def _optimize(self): if cur_fit <= self.theta: if self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") stop = True break @@ -687,16 +688,16 @@ def _optimize(self): if self._iters <= 0: if self.verbose: - print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") stop = True break if self.verbose: - print("\n") + logger.info("\n") if (num_perms + 1) >= self.max_perms: if self.verbose: - print(">>> [Terminating]: Number of permutations complete") + logger.info(">>> [Terminating]: Number of permutations complete") stop = True if stop: diff --git a/src/pymasq/preprocessing/entity_embedding.py b/src/pymasq/preprocessing/entity_embedding.py index 5dcbdcb..185098e 100755 --- a/src/pymasq/preprocessing/entity_embedding.py +++ b/src/pymasq/preprocessing/entity_embedding.py @@ -1,4 +1,5 @@ import hashlib +import logging from pathlib import Path from typing import Dict, Optional, Union @@ -16,6 +17,7 @@ from pymasq.utils import cache import pymasq.config as cfg +logger = logging.getLogger(__name__) def embed_cache_fn(column: pd.Series, cache_location: Path) -> Path: """ @@ -93,7 +95,7 @@ def embed_entities( seed = cfg.DEFAULT_SEED if seed is None else seed if verbose > 0: - print(f"Tensor flow seed set to {seed}.") + logger.info(f"Tensor flow seed set to {seed}.") tf_set_seed(seed) embed_dict = {} @@ -113,19 +115,19 @@ def embed_entities( # ignore description embed_dict[column], _ = cache.load_cache(filename) if verbose > 1: - print("\t Cache file found and loaded for column", column) + logger.info("\t Cache file found and loaded for column {column}") # returns none if a file was found but hmac didn't match if embed_dict[column] is not None: continue if verbose > 1: - print("\tembed_entities: No cache available for ", column) + logger.info(f"\tembed_entities: No cache available for {column}") # Converts categories represented by integers to strings so that the # label encoder will work and the classes can be determined later categorical_df.loc[:, column] = categorical_df.loc[:,column].astype(str) le = LabelEncoder() - X_train = le.fit_transform(categorical_df[column]) + x_train = le.fit_transform(categorical_df[column]) model = Sequential() model.add( @@ -161,7 +163,7 @@ def embed_entities( sgd = SGD(learning_rate=learning_rate) model.compile(optimizer=sgd, loss=loss_array, metrics=metrics_array) model.fit( - X_train, + x_train, y, epochs=epochs, verbose=0, diff --git a/src/pymasq/preprocessing/preprocess.py 
b/src/pymasq/preprocessing/preprocess.py index 6d7dcfc..e4abf98 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -1,5 +1,7 @@ +import logging from time import time from typing import Tuple, List, Union, Optional + import numpy as np import pandas as pd from bpemb import BPEmb @@ -21,6 +23,8 @@ ################# +logger = logging.getLogger(__name__) + REDUCTION_METHODS = { "pca": PCA, "trucated": TruncatedSVD, @@ -508,12 +512,12 @@ def encode( cache_location = Path(cache_location) if verbose > 0: - print("Preprocessing Data...") + logger.info("Preprocessing Data...") start = time() cache_location.mkdir(parents=True, exist_ok=True) if verbose > 0: - print("cache_location for preprocess is: " + str(cache_location)) + logger.info("cache_location for preprocess is: " + str(cache_location)) # Remove the sensitive column and other columns from consideration. # We'll add them back in later. @@ -542,7 +546,7 @@ def encode( ignore_col_data = None if verbose > 0: - print("Splitting Data into Numerical and Categorical Data...") + logger.info("Splitting Data into Numerical and Categorical Data...") if sensitive_col or ignore_columns: input_data = df.drop(columns=dropped_cols, axis=1).copy() @@ -554,7 +558,7 @@ def encode( binary = input_data.loc[:, binary_columns] if binary_columns: if verbose > 0: - print("Imputing Missing Binary Data...") + logger.info("Imputing Missing Binary Data...") simple_imputer = SimpleImputer(strategy="most_frequent") binary = pd.DataFrame( simple_imputer.fit_transform(input_data[binary_columns]), @@ -565,7 +569,7 @@ def encode( numerical_imputed_normalized = pd.DataFrame() if numerical_columns: if verbose > 0: - print("Imputing Missing Numerical Data...") + logger.info("Imputing Missing Numerical Data...") simple_imputer = SimpleImputer(strategy="mean") simple_imputer.fit(input_data[numerical_columns]) numerical_imputed = pd.DataFrame( @@ -593,7 +597,7 @@ def encode( categorical_embeddings = [] if categorical_columns: if verbose > 0: - print("Imputing Missing Categorical Data...") + logger.info("Imputing Missing Categorical Data...") simple_imputer = SimpleImputer(fill_value="None", strategy="constant") simple_imputer.fit(input_data[categorical_columns]) categorical_imputed = pd.DataFrame( @@ -612,7 +616,7 @@ def encode( columns=numerical_columns, ) if verbose > 0: - print("Creating/Loading Categorical Data Embeddings...") + logger.info("Creating/Loading Categorical Data Embeddings...") new_embeddings = embed_entities( target_df=y, @@ -635,7 +639,7 @@ def encode( textual_embeddings = [] if textual_columns: if verbose > 0: - print("Imputing Missing Textual Data...") + logger.info("Imputing Missing Textual Data...") simple_imputer = SimpleImputer( missing_values="", fill_value="None", strategy="constant" ) @@ -646,10 +650,10 @@ def encode( columns=textual_columns, ) if verbose > 0: - print("Creating Textual Data Embeddings...") + logger.info("Creating Textual Data Embeddings...") for col in textual_columns: if verbose > 0: - print("\t" + col) + logger.info("\t" + col) sents = textual_imputed[col].str.lower().str.replace("[!?:/]", " ") textual_embedding_array = EmbeddingsEncoder.sentence_bpe_vectors( @@ -668,7 +672,7 @@ def encode( textual_embeddings.append(textual_embedding) if verbose > 0: - print("Preprocessing took: {} seconds".format(round(time() - start, 2))) + logger.info("Preprocessing took: {} seconds".format(round(time() - start, 2))) if sensitive_col: return pd.concat( diff --git a/src/pymasq/utils/cache.py 
b/src/pymasq/utils/cache.py index c082f5c..87d2bd0 100644 --- a/src/pymasq/utils/cache.py +++ b/src/pymasq/utils/cache.py @@ -1,17 +1,21 @@ -import pickle import hashlib -from pathlib import Path import hmac import glob +import logging +import os +import pickle import shutil +from pathlib import Path from typing import Optional, Tuple, Dict, Union -from pandas.util import hash_pandas_object + import pandas as pd -from pymasq import BEARTYPE +from pandas.util import hash_pandas_object + import pymasq.config as cfg +from pymasq import BEARTYPE from pymasq.errors import InputError -import os +logger = logging.getLogger(__name__) def _hmac(data: object) -> str: """ @@ -63,7 +67,7 @@ def save( filename = f"{fn_prefix}.{_hmac(pickled_data)}.pkl" if verbose > 0: - print(f"Saving. hmac key is: {cfg.CACHE_HMAC_KEY}") + logger.info(f"Saving. hmac key is: {cfg.CACHE_HMAC_KEY}") with open(filename, "wb") as fd: fd.write(pickled_data) @@ -112,7 +116,7 @@ def load_cache( # check the hmac of the file (unless ignore) if str(digest) != file.split(".")[-2] and not ignore_hmac: if verbose > 0: - print( + logger.info( f"""" Error: hmac of file ({digest}) does not match the hmac stored in the filename ({file.split('.')[-2]}) for hmac key of '{cfg.CACHE_HMAC_KEY}' for file: {file} @@ -120,8 +124,8 @@ def load_cache( ) continue if verbose > 0: - print(f"Expected hmac: {str(digest)}") - print(f"Filename hmac: {file.split('.')[-2]}") + logger.info(f"Expected hmac: {str(digest)}") + logger.info(f"Filename hmac: {file.split('.')[-2]}") # read in the data try: @@ -129,7 +133,7 @@ def load_cache( description, data = pickle.load(fd) fd.close() if verbose > 0: - print(f"{description}") + logger.info(f"{description}") return data, description except Exception as e: raise InputError(f"Error loading cache file from {prefix_path}: {e}") @@ -161,14 +165,14 @@ def cache_info(file_or_path: str) -> Dict[str, str]: Files without valid hmacs are not listed. 
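
The cache module above names each pickle after an HMAC of its payload and refuses to load any file whose recomputed digest does not match the one embedded in the filename. The general shape of that integrity check, reduced to the standard library (the key, prefix, and digest choice here are placeholders, not pymasq's actual `_hmac` internals):

```python
import hashlib
import hmac
import pickle

KEY = b"example-hmac-key"  # placeholder for cfg.CACHE_HMAC_KEY

def tag(payload: bytes) -> str:
    return hmac.new(KEY, payload, hashlib.sha256).hexdigest()

obj = {"description": "toy cache entry", "data": [1, 2, 3]}
payload = pickle.dumps(obj)
filename = f"cache_entry.{tag(payload)}.pkl"  # digest is embedded in the filename

# On load: recompute the digest and compare before ever unpickling.
expected = filename.split(".")[-2]
if hmac.compare_digest(tag(payload), expected):
    print(pickle.loads(payload))
else:
    print("digest mismatch; refusing to unpickle")
```
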
""" - print("Checking all files in ", file_or_path) + logger.info(f"Checking all files in {file_or_path}") result = {} for file in glob.glob(file_or_path + "/*.pkl"): - print(f"\n----{file}----") + logger.info(f"\n----{file}----") try: _, description = load_cache(prefix_path=file) except Exception as e: - print(e) + logger.info(e) continue if description is not None: result[file] = description diff --git a/src/pymasq/utils/utils.py b/src/pymasq/utils/utils.py index e1fd5c9..21c55aa 100644 --- a/src/pymasq/utils/utils.py +++ b/src/pymasq/utils/utils.py @@ -1,10 +1,11 @@ -import inspect import functools +import inspect +import logging +from typing import Final, List, Optional, Union + import numpy as np import pandas as pd - from pandas.api.types import is_numeric_dtype -from typing import Final, List, Optional, Union from pymasq import BEARTYPE from pymasq import config @@ -12,6 +13,8 @@ __all__ = ["BOTH", "as_dataframe", "validate_numeric", "formatting", "freq_calc"] +logger = logging.getLogger(__name__) + BOTH: Final = "both" @@ -85,7 +88,7 @@ def _formatting_wrapper(data, *args, **kwargs): data = data.astype(dtypes) except: # TODO: switch to logging - print("WARNING: Unable to keep original datatypes.") + logger.info("WARNING: Unable to keep original datatypes.") if on_output: if input_type == pd.Series: @@ -146,7 +149,7 @@ def freq_calc( freq_df = data.groupby(quasi_cols).count()[sensitive_col] freq_df = freq_df.rename("samp_fq").reset_index() - freqs = pd.merge(data, freq_df, how="outer", on=quasi_cols) + freqs = pd.merge(data, freq_df, how="outer", on=quasi_cols, validate="many_to_one") weights = as_dataframe(weights) if weights else pd.Series([1] * freqs.shape[0]) freqs["pop_fq"] = freqs["samp_fq"].values * weights diff --git a/tests/classifiers/test_classifiers.py b/tests/classifiers/test_classifiers.py index c220321..9f9bc2e 100644 --- a/tests/classifiers/test_classifiers.py +++ b/tests/classifiers/test_classifiers.py @@ -1,10 +1,11 @@ #!/usr/bin/env python # coding: utf-8 - -import shutil +import logging import pytest -import pymasq.config as cfg +import shutil from pathlib import Path + +import pymasq.config as cfg from pymasq.datasets import load_census from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.models.models import ( @@ -13,6 +14,8 @@ RFClassifier, ) +logger = logging.getLogger(__name__) + @pytest.fixture def my_df(): @@ -43,7 +46,7 @@ def my_df(): ) def test_classifiers(my_df, combo): classifier_type, preprocessor, answer = combo - print(classifier_type) + logger.info(classifier_type) # check that the classifier gets the expected value given a set hmac key and set seed dir_name = "cache_test" @@ -72,17 +75,17 @@ def test_classifiers(my_df, combo): # should make use of cache enc = preprocessor.encode(my_df, cache_location=dir_name, verbose=1) - print(type(enc.drop(["sex"], axis=1))) - print(type(enc.sex)) + logger.info(type(enc.drop(["sex"], axis=1))) + logger.info(type(enc.sex)) score = classifier.predict(x_test=enc.drop(["sex"], axis=1), y_true=enc.sex) - print(f"{classifier.name}, {preprocessor}: {score}") + logger.info(f"{classifier.name}, {preprocessor}: {score}") assert round(score, 2) == answer, "Scores should match (trial {}, {})".format( classifier_type, preprocessor ) # Check if the cached file loads, and that the hmac checks out - print(f"\n{classifier.name}, {preprocessor} load") + logger.info(f"\n{classifier.name}, {preprocessor} load") classifier.load_trained_model(my_df, verbose=1) - print("removing cache") + 
logger.info("removing cache") shutil.rmtree(dir_name) diff --git a/tests/integration/integration.py b/tests/integration/integration.py index 7da1a31..dbda821 100644 --- a/tests/integration/integration.py +++ b/tests/integration/integration.py @@ -1,19 +1,17 @@ import argparse import json -import numpy as np -import pandas as pd +import logging import os import yaml import pymasq -pymasq.set_seed(123) - -from pymasq import mitigations as mits -from pymasq import metrics as mets from pymasq import optimizations as opts from pymasq import datasets +pymasq.set_seed(123) + +logger = logging.getLogger(__name__) ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) CORE_CFG_FNAME = os.path.join(ROOT_DIR, "core_config.yaml") @@ -76,13 +74,13 @@ def get_configs(test_cfg): opts_cfg = cfg.get("optimizations", None) if VERBOSE: - print( - "========== [ Dataset ] ==========\n", - json.dumps(data_cfg, indent=4), + logger.info( + f"""========== [ Dataset ] ==========\n, + {json.dumps(data_cfg, indent=4)}, "\n========== [Mitigations] ==========\n", - json.dumps(mits_cfg, indent=4), + {json.dumps(mits_cfg, indent=4)}, "\n========== [Metrics] ==========\n", - json.dumps(mets_cfg, indent=4), + {json.dumps(mets_cfg, indent=4)},""" ) return data_cfg, mits_cfg, mets_cfg, opts_cfg @@ -104,7 +102,7 @@ def get_data(data_cfg): df = df.loc[:, cols if isinstance(cols, list) else [cols]] if VERBOSE: - print(df, "\n", df.shape) + logger.info(df, "\n", df.shape) return df @@ -127,8 +125,8 @@ def run(args): mod_df, fit, log = algo.optimize() if VERBOSE: - print("\n============== %s ===============\n" % (opt)) - print(mod_df, "\n", fit, "\n", log) + logger.info("\n============== %s ===============\n" % (opt)) + logger.info(mod_df, "\n", fit, "\n", log) else: # if no optimizations specified, then simply run ExhaustiveSearch @@ -143,9 +141,9 @@ def run(args): mod_df, fit, log = algo.optimize() if VERBOSE: - print(mod_df, "\n", fit, "\n", log) + logger.info(mod_df, "\n", fit, "\n", log) - print("[Tests: Complete]") + logger.info("[Tests: Complete]") if __name__ == "__main__": diff --git a/tests/metrics/test_utility_scores.py b/tests/metrics/test_utility_scores.py index 7d48060..bf0dbf0 100644 --- a/tests/metrics/test_utility_scores.py +++ b/tests/metrics/test_utility_scores.py @@ -1,4 +1,5 @@ -from random import sample, gauss, seed +import logging +from random import sample, gauss import pandas as pd import pytest @@ -11,6 +12,7 @@ params = [5000, 10000, 100000] seed = 1234 +logger = logging.getLogger(__name__) @pytest.fixture(scope="session", params=params) def orig_bin_df(request): @@ -116,7 +118,7 @@ def test_propensity_score_identical(my_df): """ Tests propensity_score for identical data frames """ - print() + logger.info() for classifier, pp in [ ("logreg", ["embeddings", "label_encode"]), ("rfclass", ["embeddings", "label_encode"]), @@ -143,7 +145,7 @@ def test_propensity_score_identical(my_df): method=classifier, preprocessor=preprocessor, ) - print(f"{classifier}/{preprocessor}: {round(score,2)}") + logger.info(f"{classifier}/{preprocessor}: {round(score,2)}") assert ( round(score, 2) <= 0.0 ), f"{classifier}/{preprocessor}: Should be 0.0 but is round({score},2)={round(score,2)}" @@ -185,7 +187,7 @@ def test_propensity_score_moderate_change(my_df): method=classifier, preprocessor=preprocessor, ) - print(f"{classifier}/{preprocessor}: {round(score,2)}") + logger.info(f"{classifier}/{preprocessor}: {round(score,2)}") exp = expected.pop() assert ( round(score, 2) == exp, diff --git 
a/tests/mitigations/test_global_recode.py b/tests/mitigations/test_global_recode.py index 042d5a4..aa5068c 100644 --- a/tests/mitigations/test_global_recode.py +++ b/tests/mitigations/test_global_recode.py @@ -1,18 +1,20 @@ +import logging + import pandas as pd -import numpy as np from pymasq import config -config.FORMATTING_ON_OUTPUT = True - from pymasq.mitigations import ( global_recode, EQUAL, - EQUIDISTANT, MAGNITUDE, LOG_EQUIDISTANT, ) +config.FORMATTING_ON_OUTPUT = True + +logger = logging.getLogger(__name__) + def test_global_recode_labels_ordered(): one_to_ten = range(1, 11) @@ -25,7 +27,7 @@ def test_global_recode_labels_ordered(): ordered=True, ) result = global_recode(series, bins=3, ordered=True, labels=["A", "B", "C"]) - print(result) + logger.info(result) assert all(result == expected_result), "This should be true" @@ -73,12 +75,12 @@ def test_global_recode_no_labels(): dtype="category", ordered=True, ) - print("EXPECTED RESULT ====>", expected_result) + logger.info(f"EXPECTED RESULT ====>{expected_result}") result = global_recode( series, bins=5, ) - print("RESULT ====>", result) + logger.info(f"RESULT ====>{result}") assert all(result == expected_result), "This should be true" diff --git a/tests/mitigations/test_hashing.py b/tests/mitigations/test_hashing.py index 1e746cc..302babe 100644 --- a/tests/mitigations/test_hashing.py +++ b/tests/mitigations/test_hashing.py @@ -1,19 +1,19 @@ #!/usr/bin/env python # coding: utf-8 -import pytest - +import logging import hashlib -import numpy as np -from pymasq.config import DEFAULT_SEED +import pytest + +from pymasq.config import rg from pymasq.datasets import load_census from pymasq.mitigations import hashing ALGORITHMS = hashlib.algorithms_guaranteed -rg = np.random.default_rng(DEFAULT_SEED) +logger = logging.getLogger(__name__) def _my_df(): df = load_census() @@ -42,7 +42,7 @@ def test_hashing_all_hashlib_guaranteed_algorithms(my_df, hash_func): try: rdf = hashing(my_df, hash_func) except Exception as e: - print(f"Raised Exception: {e}") + logger.info(f"Raised Exception: {e}") assert rdf is not None diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index af518a5..22cd47a 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -1,16 +1,17 @@ #!/usr/bin/env python # coding: utf-8 -import pytest +import logging +import pytest from numpy import NaN from pymasq.datasets import load_census - from pymasq.preprocessing import embed_entities, LabelEncoderPM, EmbeddingsEncoder # from pymasq.errors import InputError, DataTypeError +logger = logging.getLogger(__name__) @pytest.fixture def my_df(): @@ -114,8 +115,8 @@ def test_embed_entites_7(my_df): # Tests that embed_entities returns arrays for each education category given two target columns. 
# """ # ret = embed_entities(my_df[["sex", "marital_status"]], my_df[["education"]]) -# print(my_df["education"].nunique()) -# print(ret["education"].shape[0]) +# logger.info(my_df["education"].nunique()) +# logger.info(ret["education"].shape[0]) # assert my_df["education"].nunique() == ret["education"].shape[0] @@ -129,8 +130,8 @@ def test_embed_entites_9(my_df): cache_location=None, retrain=True, ) - print(my_df["education"].nunique()) - print(ret["education"].shape[0]) + logger.info(my_df["education"].nunique()) + logger.info(ret["education"].shape[0]) assert my_df["education"].nunique() == ret["education"].shape[0] diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index c348dec..8b9995e 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -1,15 +1,19 @@ #!/usr/bin/env python # coding: utf-8 +import logging import shutil +from pathlib import Path + import pytest + import pymasq.config as cfg -from pathlib import Path from pymasq.datasets import load_census from pymasq.models.models import LogisticRegressionClassifier, RFClassifier from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.utils import cache +logger = logging.getLogger(__name__) @pytest.fixture def my_df(): @@ -76,7 +80,7 @@ def my_df(): ) def test_cache(my_df, combo): classifier_type, preprocessor, answer, key, desc = combo - print(classifier_type) + logger.info(classifier_type) dir_name = "cache_test" Path(dir_name).mkdir(exist_ok=True) @@ -101,13 +105,13 @@ def test_cache(my_df, combo): ) enc = preprocessor.encode(my_df, cache_location=None) score = classifier.predict(x_test=enc.drop(["sex"], axis=1), y_true=enc.sex) - print(f"{classifier.name}, {preprocessor}: {score}") + logger.info(f"{classifier.name}, {preprocessor}: {score}") assert round(score, 2) == answer, "Scores should match (trial {}) {} and {}".format( combo, score, answer ) # Check if the cached file loads, and that the hmac checks out - print(f"\n{classifier.name}, {preprocessor} load") + logger.info(f"\n{classifier.name}, {preprocessor} load") classifier.load_trained_model(my_df, verbose=1) # Test that changing the hmac will cause a failure @@ -116,8 +120,8 @@ def test_cache(my_df, combo): classifier.load_trained_model(my_df) raise ("This test should have failed because the hmac key changed") except Exception as e: - print("This error is a desired outcome of the test:") - print("\t", e, "\n") + logger.info("This error is a desired outcome of the test:") + logger.exception(e) cfg.CACHE_HMAC_KEY = "my key" # Assert to see if description was saved