From d386b9e4b9050a71ed53c3032ffc114349e6543f Mon Sep 17 00:00:00 2001 From: cdo03c Date: Tue, 5 Dec 2023 20:07:19 -0500 Subject: [PATCH 01/17] Adds alt text to images --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 82f8418..64092de 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # PyMASq

-<img src="..." /> +<img src="..." alt="MASq Logo" />

## Python-based Mitigation Application and Assessment (MASq) @@ -32,9 +32,9 @@ cd pymasq ### Installing into a Conda Environment ```sh -conda create -n masq python=3.8 -y +conda create -n masq python=3.10 -y conda activate masq -pip install . +pip install -e . ``` To generate the docs @@ -44,7 +44,7 @@ python -m pip install -r ./doc-requirements.txt ```
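
A quick smoke test from Python can confirm the editable install works. The sketch below is a minimal, non-authoritative example based on the API exercised by the test suite later in this patch series (`load_census`, `set_seed`, `pram`, `hashing`); the column choices and the salt size are illustrative only.

```python
# Hedged usage sketch: imports and calls mirror those used in the tests later in this series.
import pymasq
from pymasq.datasets import load_census
from pymasq.mitigations import hashing, pram

pymasq.set_seed(10)

df = load_census().head(100)

# Post-randomization (PRAM) of categorical quasi-identifiers.
prammed = pram(df[["marital_status", "race"]])

# SHA-256 hashing of a single column; salt=16 asks for an auto-generated random salt.
hashed = hashing(df[["education"]], "sha256", salt=16)

print(prammed.head())
print(hashed.head())
```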

-<img src="..." /> +<img src="..." alt="MIT Lincoln Lab Logo" />

## Distribution Statement From 772e216848f9ac4218e41cc8ef5db263c0252329 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 6 Dec 2023 18:57:03 -0500 Subject: [PATCH 02/17] Cleans up code --- setup.cfg | 40 ++++++++++++----------- src/pymasq/config.py | 1 - src/pymasq/datasets/data_generator.py | 2 +- src/pymasq/mitigations/add_noise.py | 1 - src/pymasq/mitigations/geom_transform.py | 2 +- src/pymasq/mitigations/hashing.py | 3 +- src/pymasq/mitigations/local_supp.py | 4 ++- src/pymasq/mitigations/pram.py | 7 ++-- src/pymasq/mitigations/rank_swap.py | 7 ++-- src/pymasq/mitigations/rounding.py | 7 ++-- src/pymasq/mitigations/shuffle.py | 4 +-- src/pymasq/mitigations/substitute.py | 5 ++- src/pymasq/mitigations/topbot_recoding.py | 3 +- src/pymasq/mitigations/truncate.py | 12 ++++--- src/pymasq/mitigations/utils.py | 6 ++-- src/pymasq/models/_base.py | 8 ++--- src/pymasq/models/models.py | 2 +- src/pymasq/optimizations/optimizations.py | 2 +- src/pymasq/optimizations/utils.py | 2 +- src/pymasq/preprocessing/preprocess.py | 8 ++--- src/pymasq/sa/sobol.py | 2 +- src/pymasq/utils/utils.py | 1 - tests/utils/test_cache.py | 10 ++---- 23 files changed, 64 insertions(+), 75 deletions(-) diff --git a/setup.cfg b/setup.cfg index 7bf9a61..31608d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,27 +11,29 @@ author = Cuyler OBrien, Jaime Pena, Evan Young, Brian Levine, Eric Wybenga author_email = cuyler.obrien@ll.mit.edu, jdpena@ll.mit.edu, evan.young@ll.mit.edu [options] -python_requires = >= 3.8 +python_requires = >= 3.9 packages = find: package_dir = = src install_requires = - boruta>=0.3 - bpemb>=0.3.3 - matplotlib>=3.4.2 - numpy>=1.19.3 - pandas>=1.1.3 + boruta~=0.3 + bpemb~=0.3 + matplotlib~=3.5 + numpy~=1.22 + pandas~=1.4 plotly>=4.11.0 - scikit-learn>=0.23 - scipy>=1.5.4 - statsmodels>=0.12 - SALib>=1.4.5 - tensorflow>=2.4.0 - tpot[dask]>=0.11 + SALib~=1.4 + scikit-learn~=1.1 + scipy~=1.8 + statsmodels~=0.13 + tensorflow~=2.9 + tpot[dask]~=0.11 tests_require = - pytest>=3.8 - hypothesis>=4.53.2 beartype>=0.5.1 + hypothesis>=4.53.2 + pytest>=3.8 + pytest-xdist~=3.5 + [options.packages.find] where = src @@ -43,7 +45,7 @@ python_files=test_*.py testpaths=tests [tox:tox] -envlist = py38, py39, coverage, bandit, owasp-depcheck +envlist = py3{9,10,11}, coverage, bandit, owasp-depcheck toxworkdir = build/tox [testenv] @@ -54,7 +56,7 @@ commands = pytest tests --junitxml={toxworkdir}/xunit-tests-{envname}.xml -o jun [testenv:coverage] usedevelop = true -basepython = python3.8 +basepython = python3.10 deps = {[testenv]deps} coverage pytest-cov @@ -62,16 +64,16 @@ commands = pytest --cov-report xml:{toxworkdir}/xunit-coverage.xml --cov-config= [testenv:localcoverage] usedevelop = true -basepython = python3.8 +basepython = python3.10 deps = {[testenv]deps} coverage pytest-cov commands = pytest --cov-report term-missing --cov-config=setup.cfg --cov=pymasq tests [testenv:bandit] -basepython = python3.8 +basepython = python3.10 deps = bandit commands = bandit -f json -o {toxworkdir}/security-bandit.json -r {envsitepackagesdir}/pymasq [testenv:owasp-depcheck] -basepython = python3.8 +basepython = python3.10 diff --git a/src/pymasq/config.py b/src/pymasq/config.py index 855e3ad..57f1b3a 100644 --- a/src/pymasq/config.py +++ b/src/pymasq/config.py @@ -1,6 +1,5 @@ from pathlib import Path from typing import Tuple -from pymasq import ROOT_DIR # Directory where all embeddings and models will be cached CACHE_LOCATION: Path = Path("~/.cache/pymasq").expanduser() diff --git a/src/pymasq/datasets/data_generator.py 
b/src/pymasq/datasets/data_generator.py index 8b88fa0..773c429 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -136,7 +136,7 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: while len(unique_entries) != len(set(unique_entries)): unique_entries = np.random.choice(range(n), l) - non_unique = np.random.choice(unique_entries, n - l) + non_unique = np.random.default_rng().choice(unique_entries, n - l) return list(unique_entries) + list(non_unique) diff --git a/src/pymasq/mitigations/add_noise.py b/src/pymasq/mitigations/add_noise.py index 92e85a8..eca3a34 100644 --- a/src/pymasq/mitigations/add_noise.py +++ b/src/pymasq/mitigations/add_noise.py @@ -13,7 +13,6 @@ VALIDATE_NUMERIC_ON_INPUT, VALIDATE_NUMERIC_ON_OUTPUT, ) -from pymasq.mitigations.utils import _as_series, _as_dataframe from pymasq.utils import validate_numeric, formatting from pymasq import BEARTYPE from pymasq.errors import InputError diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index 940f90a..9e5fef5 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -10,7 +10,7 @@ from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical -from pymasq.utils import formatting, validate_numeric +from pymasq.utils import formatting __all__ = ["geom_transform"] diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index 9446d8e..a178ebf 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Callable, List, Optional, Union import hashlib import numpy as np @@ -8,7 +8,6 @@ from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, - FORMATTING_IGNORE_DTYPES, ) from pymasq.errors import InputError from pymasq.utils import formatting diff --git a/src/pymasq/mitigations/local_supp.py b/src/pymasq/mitigations/local_supp.py index e48a2a4..558f365 100644 --- a/src/pymasq/mitigations/local_supp.py +++ b/src/pymasq/mitigations/local_supp.py @@ -137,7 +137,9 @@ def local_supp( method=method, qual=qual, ) - if not keep_dtypes and type(to_val) != type(data.loc[0, suppress_col]): + if not keep_dtypes and not isinstance( + to_val, type(data.loc[0, suppress_col]) + ): # TODO: switch to logging print( f"WARNING: The datatype of the `suppress_col` ({suppress_col}`) will be changed." diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index df04f55..d1bcb18 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -6,7 +6,6 @@ from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, - FORMATTING_IGNORE_DTYPES, ) from pymasq.errors import InputError, NotInRangeError from pymasq.mitigations.utils import _is_identical @@ -92,9 +91,7 @@ def __randomization( d_pramed[idxs] = np.random.choice( cats, len(idxs), - p=trans.loc[ - cat, - ], + p=trans.loc[cat,], ) return d_pramed @@ -300,7 +297,7 @@ def pram( if len(perturb_cols) != n_pc: if len(perturb_cols) == 0: - raise InputError(f"All values of `data` cannot be NaNs or identical.") + raise InputError("All values of `data` cannot be NaNs or identical.") else: print( "WARNING: ignoring columns that are composed entirely of identical values."
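
The `__randomization` hunk above re-draws each value in a category using that category's row of a transition-probability matrix. A small self-contained sketch of the draw pattern follows; the categories and matrix values here are illustrative, not taken from the library.

```python
# Illustrative PRAM-style draw: positions currently holding `cat` are replaced by
# categories sampled from that category's row of a transition matrix.
import numpy as np
import pandas as pd

cats = ["White", "Black"]
trans = pd.DataFrame({"White": [0.9, 0.2], "Black": [0.1, 0.8]}, index=cats)  # rows sum to 1

cat = "White"
idxs = range(5)  # positions in the column that currently hold `cat`
new_vals = np.random.choice(cats, len(idxs), p=trans.loc[cat, :])
print(new_vals)
```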
diff --git a/src/pymasq/mitigations/rank_swap.py b/src/pymasq/mitigations/rank_swap.py index 057cec6..0a47a13 100644 --- a/src/pymasq/mitigations/rank_swap.py +++ b/src/pymasq/mitigations/rank_swap.py @@ -1,7 +1,6 @@ from typing import Union, List import pandas as pd -import numpy as np from .utils import _as_series @@ -9,11 +8,9 @@ def rank_swap( - data: Union[pd.DataFrame, pd.Series], - cols: Union[str, List[str]] = None, - **kwargs + data: Union[pd.DataFrame, pd.Series], cols: Union[str, List[str]] = None, **kwargs ) -> pd.Series: - """ TODO + """TODO Parameters ---------- diff --git a/src/pymasq/mitigations/rounding.py b/src/pymasq/mitigations/rounding.py index 772f810..68134ee 100644 --- a/src/pymasq/mitigations/rounding.py +++ b/src/pymasq/mitigations/rounding.py @@ -1,4 +1,3 @@ -import math import pandas as pd from typing import List, Union, Optional @@ -29,14 +28,14 @@ def rounding( cols: Optional[Union[List, str, int]] = None, keep_dtypes: bool = True, ) -> pd.DataFrame: - """ Round numerical values to the nearest place value. + """Round numerical values to the nearest place value. Round to the nearest whole number or decimal. Values are always rounded up. Parameters ---------- data : DataFrame, Series, or array_like - The data to be modified. + The data to be modified. magnitude : int (Default: 0) The place value to round to. round_decimal : bool (Default: False) @@ -54,7 +53,7 @@ def rounding( Examples -------- - >>> df = pd.DataFrame(np.random.uniform(0.0, 1000, (10,3))) + >>> df = pd.DataFrame(np.random.uniform(0.0, 1000, (10,3))) 0 1 2 0 790.885012 378.955986 598.524492 1 396.506198 416.688230 801.133469 diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 9e38c55..0cf5b85 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -10,7 +10,7 @@ from pymasq.config import FORMATTING_ON_OUTPUT from pymasq.utils import formatting from pymasq.preprocessing import LabelEncoder_pm -from pymasq.errors import InputError, DataTypeError +from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical @@ -327,4 +327,4 @@ def _shuffle_wrapper( raise InputError( f"Invalid `method` defined; method must be one of ['model', 'corr']. 
(Received: {method}" ) -''' \ No newline at end of file +''' diff --git a/src/pymasq/mitigations/substitute.py b/src/pymasq/mitigations/substitute.py index 951677f..67bb9de 100644 --- a/src/pymasq/mitigations/substitute.py +++ b/src/pymasq/mitigations/substitute.py @@ -1,10 +1,9 @@ import pandas as pd -import re from typing import List, Optional, Union from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES +from pymasq.config import FORMATTING_ON_OUTPUT from pymasq.utils import formatting __all__ = ["substitute"] @@ -13,7 +12,7 @@ def __format_if_list( from_val: Union[str, float, int, List], to_val: Union[str, float, int, List] ): - """ Format input values if at least one of them is a list """ + """Format input values if at least one of them is a list""" if isinstance(from_val, list): to_val = to_val if isinstance(to_val, list) else [to_val] if len(to_val) == 1: diff --git a/src/pymasq/mitigations/topbot_recoding.py b/src/pymasq/mitigations/topbot_recoding.py index 745c073..65dd92b 100644 --- a/src/pymasq/mitigations/topbot_recoding.py +++ b/src/pymasq/mitigations/topbot_recoding.py @@ -1,11 +1,10 @@ import pandas as pd -from typing import Union, List, Optional, Dict, Final +from typing import Union, List, Optional, Final from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, - FORMATTING_IGNORE_DTYPES, VALIDATE_NUMERIC_ON_INPUT, VALIDATE_NUMERIC_ON_OUTPUT, ) diff --git a/src/pymasq/mitigations/truncate.py b/src/pymasq/mitigations/truncate.py index 303b27a..79abbf0 100644 --- a/src/pymasq/mitigations/truncate.py +++ b/src/pymasq/mitigations/truncate.py @@ -43,7 +43,7 @@ def truncate_by_match( Parameters ---------- data : DataFrame or Series - The data to be modified. + The data to be modified. match : str The string to search for. keep_before : bool, optional (Default: True) @@ -70,7 +70,7 @@ def truncate_by_match( 2 Private HS-grad Not-in-family 3 Private 11th Husband 4 Private Bachelors Wife - + >>> truncate_by_match(df[['workclass', 'education', 'relationship']], match='a') workclass education relationship 0 St B Not-in-f @@ -140,7 +140,7 @@ def truncate_by_index( 2 Private HS-grad Not-in-family 3 Private 11th Husband 4 Private Bachelors Wife - + >>> truncate_by_index(df[['workclass', 'education', 'relationship']], idx=1, trim_from='both') workclass education relationship 0 tate-go achelor ot-in-famil @@ -167,7 +167,9 @@ def _truncate_by_index(series, trim_from, idx, end): def truncate( - data: Union[pd.DataFrame, pd.Series], method: str = "index", **kwargs, + data: Union[pd.DataFrame, pd.Series], + method: str = "index", + **kwargs, ) -> pd.DataFrame: """Truncate strings by index or after matching a speficic substring. @@ -235,7 +237,7 @@ def truncate( 2 Private HS-grad Not-in-family 3 Private 11th Husband 4 Private Bachelors Wife - + >>> truncate(df, cols=['workclass', 'education', 'relationship'], method='index', idx=1, trim_from='both') workclass education relationship 0 tate-go achelor ot-in-famil diff --git a/src/pymasq/mitigations/utils.py b/src/pymasq/mitigations/utils.py index 65b56cb..faf7ef1 100644 --- a/src/pymasq/mitigations/utils.py +++ b/src/pymasq/mitigations/utils.py @@ -11,7 +11,7 @@ def _is_identical(s: pd.Series) -> bool: - """ Checks if all values in the input series are identical. 
""" + """Checks if all values in the input series are identical.""" s = s.to_numpy() # s.values (pandas<0.24) return (s[0] == s).all() @@ -19,7 +19,7 @@ def _is_identical(s: pd.Series) -> bool: def _as_series( obj: Union[pd.DataFrame, pd.Series], cols: Optional[Union[str, List[str]]] = None ) -> pd.Series: - """ Convert an object data structure into a Series """ + """Convert an object data structure into a Series""" if isinstance(obj, pd.DataFrame): if cols is None: raise InputError( @@ -38,7 +38,7 @@ def _as_series( def _as_dataframe( obj: Union[pd.DataFrame, pd.Series], cols: Optional[Union[str, List[str]]] = None ) -> pd.DataFrame: - """ Convert an object data structure into a DataFrame """ + """Convert an object data structure into a DataFrame""" if isinstance(obj, pd.DataFrame): if cols is None: return obj.copy() diff --git a/src/pymasq/models/_base.py b/src/pymasq/models/_base.py index 46c8b06..5a0206c 100644 --- a/src/pymasq/models/_base.py +++ b/src/pymasq/models/_base.py @@ -1,11 +1,11 @@ +import os from abc import abstractmethod -from joblib.parallel import DEFAULT_N_JOBS +from typing import Type, Optional, Union + import pandas as pd -import os -from typing import Type, Optional, List, Union -from pymasq.utils import cache import pymasq.config as cfg +from pymasq.utils import cache from pymasq.preprocessing._base import PreprocessorBase from pymasq import BEARTYPE diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 3ea34eb..82c8116 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -1,4 +1,4 @@ -from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER, DEFAULT_SEED +from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER import pandas as pd import numpy as np from typing import List, Optional, Type, Any, Union diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index e17d541..4383f71 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -456,7 +456,7 @@ def _optimize(self): prob = np.random.random_sample() - if (target.equals(new_target) == False) and ( + if not target.equals(new_target) and ( self._accept_prob(cur_fit, new_fit) > prob ): if self.verbose >= 1: diff --git a/src/pymasq/optimizations/utils.py b/src/pymasq/optimizations/utils.py index a134def..a36ca9e 100644 --- a/src/pymasq/optimizations/utils.py +++ b/src/pymasq/optimizations/utils.py @@ -1,6 +1,6 @@ from typing import Tuple import pandas as pd -from typing import Callable, Dict, List, Optional, Union +from typing import Callable, Dict, List, Union from pymasq import BEARTYPE from pymasq.optimizations import IterativeSearch diff --git a/src/pymasq/preprocessing/preprocess.py b/src/pymasq/preprocessing/preprocess.py index a3f8587..705a2a9 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -384,7 +384,7 @@ def _organize_columns( dropped_cols.extend(ignore_columns) if sensitive_col or ignore_columns: - input_data = df.drop(dropped_cols, 1).copy() + input_data = df.drop(columns=dropped_cols, axis=1).copy() else: input_data = df.copy() @@ -548,7 +548,7 @@ def encode( print("Splitting Data into Numerical and Categorical Data...") if sensitive_col or ignore_columns: - input_data = df.drop(dropped_cols, 1).copy() + input_data = df.drop(columns=dropped_cols, axis=1).copy() ignore_col_data = df.loc[:, ignore_columns].copy() else: input_data = df.copy() @@ -678,14 +678,14 @@ def encode( [y, ignore_col_data, 
numerical_imputed_normalized, binary] + categorical_embeddings + textual_embeddings, - 1, + axis=1, ) return pd.concat( [ignore_col_data, numerical_imputed_normalized, binary] + categorical_embeddings + textual_embeddings, - 1, + axis=1, ) diff --git a/src/pymasq/sa/sobol.py b/src/pymasq/sa/sobol.py index c316a32..60b62fb 100644 --- a/src/pymasq/sa/sobol.py +++ b/src/pymasq/sa/sobol.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from pymasq.errors import DataTypeError, InputError -from SALib.sample import saltelli +from SALib.sample import sobol as saltelli from SALib.analyze import sobol from typing import Dict, Optional, Tuple, Final diff --git a/src/pymasq/utils/utils.py b/src/pymasq/utils/utils.py index a974089..2eb6d23 100644 --- a/src/pymasq/utils/utils.py +++ b/src/pymasq/utils/utils.py @@ -8,7 +8,6 @@ from pymasq import BEARTYPE from pymasq import config -from pymasq.errors import InputError __all__ = ["BOTH", "as_dataframe", "validate_numeric", "formatting", "freq_calc"] diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index c1e64ad..3157e43 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -8,9 +8,6 @@ from pymasq.datasets import load_census from pymasq.models.models import LogisticRegressionClassifier, RFClassifier from pymasq.preprocessing import LabelEncoder_pm, EmbeddingsEncoder - -# from pymasq.errors import InputError, DataTypeError - from pymasq.utils import cache @@ -36,7 +33,7 @@ def my_df(): ( LogisticRegressionClassifier, LabelEncoder_pm, - 0.6, + 0.5, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.larsCV.2bd270eec04828b035a1facfbb35f355.pkl", """larsCV. Description: Preprocessed with First ten rows: @@ -57,7 +54,7 @@ def my_df(): ( RFClassifier, EmbeddingsEncoder, - 0.5, + 0.61, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.ENCV.e81a5b5eb0df48bc68540d7b71342a7d.pkl", """ENCV. 
Description: Preprocessed with First ten rows: @@ -121,10 +118,9 @@ def test_cache(my_df, combo): except Exception as e: print("This error is a desired outcome of the test:") print("\t", e, "\n") - pass cfg.CACHE_HMAC_KEY = "my key" # Assert to see if description was saved descriptions = cache.cache_info(dir_name) assert descriptions[key] == desc - shutil.rmtree(dir_name) \ No newline at end of file + shutil.rmtree(dir_name) From cff46b151316027f381b8f52fab9be5447927562 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 6 Dec 2023 19:00:57 -0500 Subject: [PATCH 03/17] Cleans up python style --- src/pymasq/mitigations/global_recode.py | 12 ++--- src/pymasq/mitigations/shuffle.py | 4 +- src/pymasq/models/_base.py | 2 +- src/pymasq/models/models.py | 14 +++--- src/pymasq/preprocessing/__init__.py | 4 +- src/pymasq/preprocessing/preprocess.py | 59 ++++++++++++------------- tests/classifiers/test_classifiers.py | 8 ++-- tests/preprocessing/test_preprocess.py | 6 +-- tests/utils/test_cache.py | 6 +-- 9 files changed, 56 insertions(+), 59 deletions(-) diff --git a/src/pymasq/mitigations/global_recode.py b/src/pymasq/mitigations/global_recode.py index 73d4fb0..3deb79b 100644 --- a/src/pymasq/mitigations/global_recode.py +++ b/src/pymasq/mitigations/global_recode.py @@ -1,4 +1,4 @@ -from pymasq.preprocessing.preprocess import LabelEncoder_pm +from pymasq.preprocessing.preprocess import LabelEncoderPM import pandas as pd import numpy as np @@ -29,23 +29,23 @@ def __gr_equidistant(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for `equidistant` method """ + """Global Recode for `equidistant` method""" return np.linspace(data.min(), data.max(), breaks) def __gr_log_equidistant(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for `log_equidistant` method """ + """Global Recode for `log_equidistant` method""" data_log = np.log(data) return np.exp(np.linspace(data_log.min(), data_log.max(), breaks)) def __gr_equal_quantity(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for `equal` method """ + """Global Recode for `equal` method""" return data.quantile(np.linspace(0, 1, breaks)) def __gr_order_of_magnitude(data: pd.Series, breaks: int) -> pd.Series: - """ Global Recode for order of `magnitude` method. """ + """Global Recode for order of `magnitude` method.""" data_log = np.log10(data) return np.power(10, np.linspace(data_log.min(), data_log.max(), breaks)) @@ -194,7 +194,7 @@ def global_recode( ) ) if ret_ints: - le = LabelEncoder_pm() + le = LabelEncoderPM() return le.encode(data_recode.astype(str)) return data_recode diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 0cf5b85..961a433 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -9,7 +9,7 @@ from pymasq import BEARTYPE from pymasq.config import FORMATTING_ON_OUTPUT from pymasq.utils import formatting -from pymasq.preprocessing import LabelEncoder_pm +from pymasq.preprocessing import LabelEncoderPM from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical @@ -214,7 +214,7 @@ def shuffle( "WARNING: ignoring columns that are composed entirely of identical values." 
) - _data = LabelEncoder_pm.encode(df=pd.concat([x, y], axis=1)) + _data = LabelEncoderPM.encode(df=pd.concat([x, y], axis=1)) resp_cols = y.columns.to_list() pred_cols = x.columns.to_list() diff --git a/src/pymasq/models/_base.py b/src/pymasq/models/_base.py index 5a0206c..2cc0efa 100644 --- a/src/pymasq/models/_base.py +++ b/src/pymasq/models/_base.py @@ -87,7 +87,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain : boolean, optional (Default: False) Re-runs and saves over existing TPOT model for the given file path. diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 82c8116..0f59b9d 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -84,7 +84,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -180,7 +180,7 @@ def train( preprocessor : PreprocessorBase (Default: None) A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -288,7 +288,7 @@ def train( preprocessor : PreprocessorBase (Default: None) A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -397,7 +397,7 @@ def train( preprocessor : PreprocessorBase (Default: None) A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -498,7 +498,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) retrain: bool (Default: False) Ignore cached results and retrain @@ -604,7 +604,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) scoring : string or callable, optional (Default: 'f1') @@ -785,7 +785,7 @@ def train( preprocessor : PreprocessorBase A child of PreprocessorBase class indicating what preprocessor to use. 
Options are: - pymasq.preprocessing.EmbeddingsEncoder - - pymasq.preprocessing.LabelEncoder_pm + - pymasq.preprocessing.LabelEncoderPM - None (i.e., the data is already pre-processed) scoring : string or callable, optional (Default: 'f1') diff --git a/src/pymasq/preprocessing/__init__.py b/src/pymasq/preprocessing/__init__.py index 4f6f142..cc23c91 100644 --- a/src/pymasq/preprocessing/__init__.py +++ b/src/pymasq/preprocessing/__init__.py @@ -1,11 +1,11 @@ from .preprocess import ( EmbeddingsEncoder, - LabelEncoder_pm, + LabelEncoderPM, ) from .entity_embedding import embed_entities __all__ = [ "embed_entities", "EmbeddingsEncoder", - "LabelEncoder_pm", + "LabelEncoderPM", ] diff --git a/src/pymasq/preprocessing/preprocess.py b/src/pymasq/preprocessing/preprocess.py index 705a2a9..96e9331 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -16,7 +16,7 @@ from pymasq import BEARTYPE # This file contains two children of PreprocessorBase -# 1. LabelEncoder_pm +# 1. LabelEncoderPM # 2. EmbeddingsEncoder ################# @@ -30,7 +30,7 @@ } -class LabelEncoder_pm(PreprocessorBase): +class LabelEncoderPM(PreprocessorBase): """ This class manages an instance of sklearn's LabelEncoder. Encodes categorical data only, as integers. @@ -38,7 +38,6 @@ class LabelEncoder_pm(PreprocessorBase): def __init__(self): super().__init__() - pass @staticmethod @BEARTYPE @@ -84,7 +83,7 @@ def encode(df: Union[pd.Series, pd.DataFrame], **kwargs) -> pd.DataFrame: @staticmethod @BEARTYPE def encode_both( - df_A: pd.DataFrame, df_B: pd.DataFrame, **kwargs + df_a: pd.DataFrame, df_b: pd.DataFrame, **kwargs ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Takes two dataframes and uses sklearn's LabelEncoder on categorical columns only to relabel @@ -100,28 +99,28 @@ def encode_both( Parameter --------- - df_A: pdf.DataFrame: + df_a: pdf.DataFrame: The data frame to encode. - df_B: pdf.DataFrame: + df_b: pdf.DataFrame: The data frame to encode. Returns ------- Tuple[pd.DataFrame, pd.DataFrame]: pd.DataFrame - df_A data frame now preprocessed so that categorical data is relabeled as integers. + df_a data frame now preprocessed so that categorical data is relabeled as integers. pd.DataFrame - df_B data frame now preprocessed so that categorical data is relabeled as integers. + df_b data frame now preprocessed so that categorical data is relabeled as integers. - Column order remains consistent with original dataframes. df_A and df_B are not modified. + Column order remains consistent with original dataframes. df_a and df_b are not modified. 
""" le = skLabelEncoder() # make a copy - df_a = df_A.copy() - df_b = df_B.copy() - if set(df_a.columns) != set(df_B.columns): - raise InputError("df_A and df_B must have same columns") + df_a = df_a.copy() + df_b = df_b.copy() + if set(df_a.columns) != set(df_b.columns): + raise InputError("df_a and df_b must have same columns") col_order = df_a.columns.tolist() # join together; mark each so we can separate again later @@ -150,10 +149,10 @@ def encode_both( [cat_cols.apply(le.fit_transform), num_cols], join="outer", axis=1 ) # split up again, and drop the extra column - df_A_enc = both.loc[both[class_col] == 0].drop(class_col, axis=1) - df_B_enc = both.loc[both[class_col] == 1].drop(class_col, axis=1) + df_a_enc = both.loc[both[class_col] == 0].drop(class_col, axis=1) + df_b_enc = both.loc[both[class_col] == 1].drop(class_col, axis=1) - return df_A_enc[col_order], df_B_enc[col_order] + return df_a_enc[col_order], df_b_enc[col_order] ################# @@ -193,7 +192,6 @@ class EmbeddingsEncoder(PreprocessorBase): def __init__(self): super().__init__() - pass @staticmethod def sentence_bpe_vectors( @@ -229,8 +227,8 @@ def sentence_bpe_vectors( @staticmethod @BEARTYPE def encode_both( - df_A: pd.DataFrame, - df_B: pd.DataFrame, + df_a: pd.DataFrame, + df_b: pd.DataFrame, sensitive_col: Optional[Union[List, str]] = None, seed: int = 1234, ) -> Tuple[pd.DataFrame, pd.DataFrame]: @@ -240,10 +238,10 @@ def encode_both( Parameters ---------- - df_A : pd.DataFrame + df_a : pd.DataFrame data frame containing the binary label column and the other variables of interest - df_B : pd.DataFrame + df_b : pd.DataFrame data frame containing the binary label column and the other variables of interest sensitive_col : str or List[str] (Default: None) @@ -255,20 +253,20 @@ def encode_both( ------- Tuple: pd.DataFrame - The encoded version of df_A + The encoded version of df_a pd.DataFrame - The encoded version of df_B - df_A and df_B are not modified. + The encoded version of df_b + df_a and df_b are not modified. 
""" - if set(df_A.columns) != set(df_B.columns): - raise InputError("df_A and df_B must have same columns.") + if set(df_a.columns) != set(df_b.columns): + raise InputError("df_a and df_b must have same columns.") # pick a column name that isn't in the dataset - class_col = utils.uniq_col_name(df_A) + class_col = utils.uniq_col_name(df_a) # make one dataframe for pre-processing, otherwise preprocess_data won't be consistent - orig_df_copy = df_A.copy() - mod_df_copy = df_B.copy() + orig_df_copy = df_a.copy() + mod_df_copy = df_b.copy() orig_df_copy[class_col] = 0 mod_df_copy[class_col] = 1 comb_for_proprocessing = pd.concat( @@ -293,7 +291,6 @@ def encode_both( .drop([class_col], axis=1) .reset_index(drop=True) ) - # return both return orig_df_proc, mod_df_proc @staticmethod @@ -693,5 +690,5 @@ def encode( preprocessor_fn = { None: PreprocessorBase, "embeddings": EmbeddingsEncoder, - "label_encode": LabelEncoder_pm, + "label_encode": LabelEncoderPM, } diff --git a/tests/classifiers/test_classifiers.py b/tests/classifiers/test_classifiers.py index 887215e..62ddc12 100644 --- a/tests/classifiers/test_classifiers.py +++ b/tests/classifiers/test_classifiers.py @@ -6,7 +6,7 @@ import pymasq.config as cfg from pathlib import Path from pymasq.datasets import load_census -from pymasq.preprocessing import LabelEncoder_pm, EmbeddingsEncoder +from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.models.models import ( LogisticRegressionClassifier, TpotClassifier, @@ -33,11 +33,11 @@ def my_df(): @pytest.mark.parametrize( "combo", [ - (LogisticRegressionClassifier, LabelEncoder_pm, 0.5), + (LogisticRegressionClassifier, LabelEncoderPM, 0.5), (LogisticRegressionClassifier, EmbeddingsEncoder, 0.5), - (RFClassifier, LabelEncoder_pm, 1.0), + (RFClassifier, LabelEncoderPM, 1.0), (RFClassifier, EmbeddingsEncoder, 1.0), - (TpotClassifier, LabelEncoder_pm, 0.77), + (TpotClassifier, LabelEncoderPM, 0.77), (TpotClassifier, EmbeddingsEncoder, 0.86), ], ) diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index 35c3647..af518a5 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -7,7 +7,7 @@ from pymasq.datasets import load_census -from pymasq.preprocessing import embed_entities, LabelEncoder_pm, EmbeddingsEncoder +from pymasq.preprocessing import embed_entities, LabelEncoderPM, EmbeddingsEncoder # from pymasq.errors import InputError, DataTypeError @@ -242,7 +242,7 @@ def test_label_encode_1(my_df): 10 37 280464 5 1 1 0 1 """ - enc, _ = LabelEncoder_pm.encode_both(my_df, my_df) + enc, _ = LabelEncoderPM.encode_both(my_df, my_df) assert ( enc.to_json() == '{"age":{"0":39,"1":50,"2":38,"3":53,"4":28,"5":37,"6":49,"7":52,"8":31,"9":42,"10":37},"fnlwgt":{"0":77516,"1":83311,"2":215646,"3":234721,"4":338409,"5":284582,"6":160187,"7":209642,"8":45781,"9":159449,"10":280464},"education":{"0":2,"1":2,"2":3,"3":0,"4":2,"5":4,"6":1,"7":3,"8":4,"9":2,"10":5},"marital_status":{"0":3,"1":1,"2":0,"3":1,"4":1,"5":1,"6":2,"7":1,"8":3,"9":1,"10":1},"sex":{"0":1,"1":1,"2":1,"3":1,"4":0,"5":0,"6":0,"7":1,"8":0,"9":1,"10":1},"capital_gain":{"0":2174,"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":14084,"9":5178,"10":0},"income_level":{"0":0,"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":1,"8":1,"9":1,"10":1}}' @@ -290,7 +290,7 @@ def test_label_encode_2(my_df): """ my_df1 = my_df[my_df["marital_status"].isin(["Never-married", "Divorced"])] my_df2 = my_df[~my_df["marital_status"].isin(["Never-married", "Divorced"])] - enc1, enc2 
= LabelEncoder_pm.encode_both(my_df1, my_df2) + enc1, enc2 = LabelEncoderPM.encode_both(my_df1, my_df2) assert set(enc1.marital_status).isdisjoint(set(enc2.marital_status)) assert ( enc1.to_json() diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index 3157e43..43775fb 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -7,7 +7,7 @@ from pathlib import Path from pymasq.datasets import load_census from pymasq.models.models import LogisticRegressionClassifier, RFClassifier -from pymasq.preprocessing import LabelEncoder_pm, EmbeddingsEncoder +from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.utils import cache @@ -32,10 +32,10 @@ def my_df(): [ ( LogisticRegressionClassifier, - LabelEncoder_pm, + LabelEncoderPM, 0.5, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.larsCV.2bd270eec04828b035a1facfbb35f355.pkl", - """larsCV. Description: Preprocessed with + """larsCV. Description: Preprocessed with First ten rows: age fnlwgt education ... sex capital_gain income_level 0 39 77516 Bachelors ... Male 2174 <=50K From c1debab3d9658560322c38a8611ae3ce829d7ffd Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 6 Dec 2023 19:24:44 -0500 Subject: [PATCH 04/17] Updates formatting --- docs/source/conf.py | 18 ++++----- src/pymasq/datasets/__init__.py | 8 +++- src/pymasq/datasets/_base.py | 5 ++- src/pymasq/errors/__init__.py | 26 ++++++------- src/pymasq/metrics/suda.py | 2 +- src/pymasq/mitigations/geom_transform.py | 8 ++-- src/pymasq/mitigations/hashing.py | 22 +++++++---- src/pymasq/mitigations/microaggregation.py | 8 +--- src/pymasq/optimizations/_base.py | 21 ++++++----- src/pymasq/optimizations/optimizations.py | 8 +--- src/pymasq/preprocessing/_base.py | 2 +- src/pymasq/utils/utils.py | 4 +- tests/mitigations/test_geom_transforms.py | 16 ++++---- tests/mitigations/test_hashing.py | 10 ++--- tests/mitigations/test_microaggregation.py | 31 ++++++++------- tests/mitigations/test_pram.py | 19 +++++----- tests/mitigations/test_shuffle.py | 7 ++-- tests/optimizations/test_optimizations.py | 44 +++++++++++----------- tests/optimizations/test_utils.py | 1 - 19 files changed, 129 insertions(+), 131 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index c05ad8b..23ecc49 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2,7 +2,7 @@ import sys import sphinx_rtd_theme -sys.path.insert(0, os.path.abspath(os.path.join('..','..'))) +sys.path.insert(0, os.path.abspath(os.path.join("..", ".."))) # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -22,12 +22,12 @@ # -- Project information ----------------------------------------------------- -project = 'pymasq' -copyright = '2022, MITLL' -author = 'MITLL' +project = "pymasq" +copyright = "2022, MITLL" +author = "MITLL" # The full version, including alpha/beta/rc tags -release = '1.0' +release = "1.0" # -- General configuration --------------------------------------------------- @@ -36,14 +36,14 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.napoleon', # NumPy & Google style docstring support + "sphinx.ext.napoleon", # NumPy & Google style docstring support "sphinx_rtd_theme", ] napoleon_google_docstring = False # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -56,9 +56,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/src/pymasq/datasets/__init__.py b/src/pymasq/datasets/__init__.py index cfa0f58..01dd262 100644 --- a/src/pymasq/datasets/__init__.py +++ b/src/pymasq/datasets/__init__.py @@ -2,7 +2,13 @@ The :mod:`pymasq.datasets` module includes utilities to load tabular datasets. """ -from ._base import load_data, load_census, load_loan, load_prestige, load_bank_attrition_rates +from ._base import ( + load_data, + load_census, + load_loan, + load_prestige, + load_bank_attrition_rates, +) from .data_generator import gen_geom_seq, gen_bin_df, gen_num_df from .utils import rand_cat_change diff --git a/src/pymasq/datasets/_base.py b/src/pymasq/datasets/_base.py index 52d1ec6..081f778 100644 --- a/src/pymasq/datasets/_base.py +++ b/src/pymasq/datasets/_base.py @@ -77,11 +77,12 @@ def load_loan(): """ return load_data("loan.csv") + def load_bank_attrition_rates(): """Load and return the Bank Attrition Rates dataset. - A manager at the bank is disturbed with more and more customers leaving their credit card services. - They would really appreciate if one could predict for them who is gonna get churned so + A manager at the bank is disturbed with more and more customers leaving their credit card services. + They would really appreciate if one could predict for them who is gonna get churned so they can proactively go to the customer to provide them better services and turn customers' decisions in the opposite direction. ============== ============== diff --git a/src/pymasq/errors/__init__.py b/src/pymasq/errors/__init__.py index c9cd3a9..c7effa7 100644 --- a/src/pymasq/errors/__init__.py +++ b/src/pymasq/errors/__init__.py @@ -1,34 +1,34 @@ - """ Expose public exceptions & warnings """ + class InputError(Exception): - """ Exception raised for errors in the input value. """ + """Exception raised for errors in the input value.""" class DataTypeError(Exception): - """ Exception raised for errors in the data type. """ - - + """Exception raised for errors in the data type.""" + + class SumNotEqualToOneError(ValueError): - """ Exception for sum of values not equal to 1. """ - + """Exception for sum of values not equal to 1.""" + class NotInRangeError(ValueError): - """ Exception for values not in specified interval. """ + """Exception for values not in specified interval.""" class LessThanZeroError(ValueError): - """ Exceptions for values < 0. """ + """Exceptions for values < 0.""" class LessThanOrEqualToZeroError(ValueError): - """ Exceptions for values <= 0. 
""" + """Exceptions for values <= 0.""" class NoMutationAvailableError(ValueError): - """ Exception when all mutations have been discarded and not replaced """ + """Exception when all mutations have been discarded and not replaced""" __all__ = [ @@ -38,5 +38,5 @@ class NoMutationAvailableError(ValueError): "NotInRangeError", "LessThanZeroError", "LessThanOrEqualToZeroError", - "NoMutationAvailableError" -] \ No newline at end of file + "NoMutationAvailableError", +] diff --git a/src/pymasq/metrics/suda.py b/src/pymasq/metrics/suda.py index 9901be4..f84de28 100644 --- a/src/pymasq/metrics/suda.py +++ b/src/pymasq/metrics/suda.py @@ -4,5 +4,5 @@ def suda(df: pd.DataFrame, cols: List[str], **kwargs: Dict[Any, Any]) -> pd.DataFrame: - """ TODO """ + """TODO""" return df diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index 9e5fef5..cf46976 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -279,19 +279,17 @@ def geom_transform( # Randomized expansion sign = np.sign(bo) - bo = np.add(abs(bo), abs(np.random.uniform(size=bo.shape) * magnitude)) + bo = np.add(abs(bo), abs(np.random.Generator.uniform(size=bo.shape) * magnitude)) bo = (bo * sign).T bo = bo * data[perturb_cols].std().values + data[perturb_cols].mean().values shuff_idx = data.index if shuffle: - shuff_idx = np.random.choice( + shuff_idx = np.random.Generator.choice( range(bo.shape[0]), size=(bo.shape[0]), replace=False ) - data.loc[:, perturb_cols] = bo[ - shuff_idx, - ] + data.loc[:, perturb_cols] = bo[shuff_idx,] if len(sensitive_col) != 0: data.loc[:, sensitive_col] = data.loc[shuff_idx, sensitive_col].reset_index( drop=True diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index a178ebf..5dcdf55 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -36,10 +36,10 @@ def hashing( a function name in the `hashlib` Python library [1]_. Else, it will apply the user-defined function. Algorithms listed in `hashlib.algorithms_guaranteed` are prefererd. salt : list, str, or int, Optional - The salt, or random data, to add to `data` to perturb it before hashing occurs. - If left as `None`, then no salt will be added to `data`. If `salt` is a list, - then it must be of the same length as `data`. If `salt` is a string, then the same salt value - will be added to each value in `data`. If `salt` is an integer, then a random salt of that bit size + The salt, or random data, to add to `data` to perturb it before hashing occurs. + If left as `None`, then no salt will be added to `data`. If `salt` is a list, + then it must be of the same length as `data`. If `salt` is a string, then the same salt value + will be added to each value in `data`. If `salt` is an integer, then a random salt of that bit size will automatically be generated (note that 16 and 32 are typical salt bit sizes). Generated salts can be stored by specifying the `store_salts` parameter. Please refer to [2]_ for additional information on the importance of salts. @@ -166,13 +166,17 @@ def hashing( salt, index=data.index, columns=data.columns, dtype=bytes ) if salt_df.shape != data.shape: - raise InputError(f"Incorrect `salt` dimensions; expected {data.shape}. (Received: {salt_df.shape})") + raise InputError( + f"Incorrect `salt` dimensions; expected {data.shape}. 
(Received: {salt_df.shape})" + ) elif isinstance(salt, str): salt_df[:] = salt.encode() elif isinstance(salt, int): salt_df = salt_df.applymap(lambda v: os.urandom(salt)) else: - raise InputError(f"Invalid `salt` type; only types allowed are `list`, `str`, and `int`. (Received: {type(salt)})") + raise InputError( + f"Invalid `salt` type; only types allowed are `list`, `str`, and `int`. (Received: {type(salt)})" + ) data = (data + salt_df) if append_salt else (salt_df + data) @@ -187,7 +191,9 @@ def hashing( if "shake" in str(hash_func): # TODO: change to logging - print(f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length.") + print( + f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length." + ) return data.applymap(lambda v: hash_func(v).hexdigest(16)) - + return data.applymap(lambda v: hash_func(v).hexdigest()) diff --git a/src/pymasq/mitigations/microaggregation.py b/src/pymasq/mitigations/microaggregation.py index f531190..37e4d10 100644 --- a/src/pymasq/mitigations/microaggregation.py +++ b/src/pymasq/mitigations/microaggregation.py @@ -578,18 +578,14 @@ def _knn(pwds, aggr): for _ in range((len(data) // aggr) - 1): max_val_idx = np.nanargmax(mah_dists) min_val_idxs = _knn(pw_dists[:, max_val_idx], aggr) - pw_dists[ - min_val_idxs, - ] = np.nan + pw_dists[min_val_idxs,] = np.nan mah_dists[min_val_idxs] = np.nan z[min_val_idxs] = np.mean(z[min_val_idxs], axis=0) min_val_idxs = np.unique( np.argwhere(~np.isnan(pw_dists))[:, 0] ) # get idx of remaining non-nan values - z[min_val_idxs,] = z[min_val_idxs,].mean( - axis=0 - ) # merge w above + z[min_val_idxs,] = z[min_val_idxs,].mean(axis=0) # merge w above mat = (z * data.std().to_numpy()) + data.mean().to_numpy() diff --git a/src/pymasq/optimizations/_base.py b/src/pymasq/optimizations/_base.py index 30d11a9..baa22b7 100644 --- a/src/pymasq/optimizations/_base.py +++ b/src/pymasq/optimizations/_base.py @@ -6,7 +6,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union -# import pymasq from pymasq import BEARTYPE import pymasq.mitigations as mits import pymasq.metrics as mets @@ -17,7 +16,6 @@ LessThanZeroError, NoMutationAvailableError, ) -import sys class OptimizationBase: @@ -106,7 +104,6 @@ def __init__( exit_on_error: bool = True, # Don't change to False without considering impact on pytests. **kwargs, ): - self.target = target self.mutations = mutations self.metrics = metrics @@ -136,12 +133,12 @@ def __init__( f"A probability `p` must be defined for each mutation in `mutations`. (Received: {mutations})." ) prob_sum = sum(probs) - if prob_sum == 0.0: + if np.isclose(prob_sum, 0.0, rtol=1e-09, atol=1e-09): probs = self._distribute(len(mutations)) self.mutations = [ dict(m, **{"p": probs[i]}) for i, m in enumerate(mutations) ] - elif round(prob_sum, 5) != 1.0: + elif not np.isclose(round(prob_sum, 5), 1.0, rtol=1e-09, atol=1e-09): raise SumNotEqualToOneError( f"Mitigation probabilities must sum to 1. (Received: {prob_sum})" ) @@ -152,13 +149,13 @@ def __init__( f"An importance weighting `weight` must be defined for each metric in `metrics`. 
(Received: {metrics})" ) weight_sum = sum(weights) - if weight_sum == 0.0: + if np.isclose(weight_sum, 0.0, rtol=1e-09, atol=1e-09): weights = self._distribute(len(metrics)) [ v.update({"weight": weights[i]}) for i, v in enumerate(self.metrics.values()) ] - elif weight_sum != 1.0: + elif not np.isclose(weight_sum, 1.0, rtol=1e-09, atol=1e-09): raise SumNotEqualToOneError( f"Metric importance weightings must sum to 1. (Received: {weight_sum})" ) @@ -204,7 +201,9 @@ def _validate_input_sums( except KeyError: sums.append(0.0) # if n_defined == 0, then none were defined - if n_defined != 0.0 and n_defined != len(values): + if not np.isclose(n_defined, 0.0, rtol=1e-09, atol=1e-09) and n_defined != len( + values + ): # TODO: future iterations should distribute missing values and/or normalize return None return sums @@ -409,7 +408,7 @@ def _mutate( mut = None if self.randomize_mutations: probs = [v["p"] for v in mutations] - mut = np.random.choice(mutations, p=probs) + mut = np.random.Generator.choice(mutations, p=probs) if not self.reuse_mutations and mutations: # redistribute according to initial weighting mut_idx = mutations.index(mut) @@ -439,7 +438,9 @@ def _mutate( result = func(target, **args) except Exception as e: if self.verbose >= 2: - print(f"[Warning] mutation {func.__name__} failed with args:={args}") + print( + f"[Warning] mutation {func.__name__} failed with args:={args} and error: {e}" + ) raise if isinstance(result, pd.Series): col_args = args.get("col", args.get("cols", None)) diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index 4383f71..32b2b76 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -92,7 +92,6 @@ def _optimize(self): best_fit = cur_fit while all([cur_fit > self.theta, self._iters > 0]): - if self.verbose: print("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: @@ -225,7 +224,6 @@ def __init__(self, *args, retry: int = 3, **kwargs): @BEARTYPE def _optimize(self): - target = self._target retry = self.retry @@ -240,7 +238,6 @@ def _optimize(self): ) while all([cur_fit > self.theta, self._iters > 0, retry > 0]): - if self.verbose: print("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: @@ -429,7 +426,6 @@ def _optimize(self): best_fit = cur_fit while all([best_fit > self.theta, self._iters > 0]): - if self.verbose: print("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: @@ -584,7 +580,6 @@ def __init__( return_best: bool = False, **kwargs, ): - kwargs["headers"] = ["perm_num"] super().__init__(*args, **kwargs) @@ -631,7 +626,7 @@ def _optimize(self): if self.randomize_mutations: # Note: only matters when `num_perms` is set. - test = np.random.shuffle(self._mutations) + test = np.random.Generator.shuffle(self._mutations) for num_perms, mutation_perms in enumerate( itertools.permutations(self._mutations, self.size_perms) @@ -643,7 +638,6 @@ def _optimize(self): stop = False for mutation in mutation_perms: - if self.verbose: print("\t-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: diff --git a/src/pymasq/preprocessing/_base.py b/src/pymasq/preprocessing/_base.py index 9efa3ef..45fdaaa 100644 --- a/src/pymasq/preprocessing/_base.py +++ b/src/pymasq/preprocessing/_base.py @@ -23,4 +23,4 @@ def encode_both(self): encoded as [0,1] and [1,2]. 
In contract, two distinct calls to encode() will return [0,1] and [0,1] """ - pass \ No newline at end of file + pass diff --git a/src/pymasq/utils/utils.py b/src/pymasq/utils/utils.py index 2eb6d23..e1fd5c9 100644 --- a/src/pymasq/utils/utils.py +++ b/src/pymasq/utils/utils.py @@ -18,7 +18,7 @@ @BEARTYPE def as_dataframe(obj, cols: Optional[Union[List, str, int]] = None): - """ Convert an object data structure into a DataFrame """ + """Convert an object data structure into a DataFrame""" if isinstance(obj, (list, np.ndarray)): cols = None if cols is not None: @@ -103,7 +103,7 @@ def _formatting_wrapper(data, *args, **kwargs): def is_identical(s: pd.Series) -> bool: - """ Checks if all values in the input series are identical. """ + """Checks if all values in the input series are identical.""" s = s.to_numpy() # s.values (pandas<0.24) return (s[0] == s).all() diff --git a/tests/mitigations/test_geom_transforms.py b/tests/mitigations/test_geom_transforms.py index d539bee..1b6d91e 100644 --- a/tests/mitigations/test_geom_transforms.py +++ b/tests/mitigations/test_geom_transforms.py @@ -11,7 +11,7 @@ def my_rand_df(): ncols = 5 colnames = "abcdefghijklmnopqrstuvwxyz" df = pd.DataFrame( - np.random.random_integers(0, 100, (100, ncols)), + np.random.Generator.random_integers(0, 100, (100, ncols)), columns=[colnames[i] for i in range(ncols)], ) return df @@ -32,7 +32,7 @@ def my_non_numeric_df(): ncols = 3 colnames = list("abcdefghijklmnopqrstuvwxyz") df = pd.DataFrame( - np.random.choice(colnames, size=(100, ncols), replace=True), + np.random.Generator.choice(colnames, size=(100, ncols), replace=True), columns=colnames[:ncols], ) return df @@ -95,7 +95,7 @@ def test_geom_transform_error_single_column(my_rand_df): def test_geom_transform_different_values_for_perturb_cols(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "c" rdf = geom_transform( @@ -109,7 +109,7 @@ def test_geom_transform_different_values_for_perturb_cols(my_rand_df): def test_geom_transform_cols_not_specified_no_perturbed(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "d" ignore_cols = ["c"] @@ -134,7 +134,7 @@ def test_geom_transform_cols_not_specified_no_perturbed(my_rand_df): def test_geom_transform_same_values_for_sensitive_col(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "c" rdf = geom_transform( @@ -149,7 +149,7 @@ def test_geom_transform_same_values_for_sensitive_col(my_rand_df): def test_geom_transform_same_values_in_proper_order_for_sensitive_col(my_rand_df): - """ Ensure geom_transform returns different values for perturb_cols """ + """Ensure geom_transform returns different values for perturb_cols""" perturb_cols = ["a", "b"] sensitive_col = "c" rdf = geom_transform( @@ -164,7 +164,7 @@ def test_geom_transform_same_values_in_proper_order_for_sensitive_col(my_rand_df def test_geom_transform_returns_same_shapes(my_rand_df): - """ Ensure geom_transform returns the same dataframe shapes """ + """Ensure geom_transform returns the same dataframe shapes""" perturb_cols = ["a", "b"] sensitive_col = "d" @@ -184,4 +184,4 @@ def 
test_geom_transform_returns_same_shapes(my_rand_df): ).shape assert in_size_1 == out_size_1 - assert in_size_2 == out_size_2 \ No newline at end of file + assert in_size_2 == out_size_2 diff --git a/tests/mitigations/test_hashing.py b/tests/mitigations/test_hashing.py index 4f4719e..c04b670 100644 --- a/tests/mitigations/test_hashing.py +++ b/tests/mitigations/test_hashing.py @@ -28,7 +28,7 @@ def my_df(): @pytest.fixture def salts(): df = _my_df() - return np.random.choice(["a", "b", "c"], size=df.shape).tolist() + return np.random.Generator.choice(["a", "b", "c"], size=df.shape).tolist() @pytest.mark.parametrize("hash_func", (ALGORITHMS)) @@ -36,12 +36,12 @@ def test_hashing_all_hashlib_guaranteed_algorithms(my_df, hash_func): """ Test all hashing algorithms that are guaranteed to be supported by hashlib, regardless of OS platform. """ - e = None + rdf = None try: rdf = hashing(my_df, hash_func) except Exception as e: - print("Raised Exception") - assert e is None + print(f"Raised Exception: {e}") + assert rdf is not None @pytest.mark.parametrize("hash_func", (ALGORITHMS)) @@ -119,4 +119,4 @@ def test_hashing_hardcoded_salt(my_df, salts, hash_func): Test that salts can be passed in by user and yield different values """ sdf = hashing(my_df, hash_func, salt=salts) - assert not sdf.equals(my_df) \ No newline at end of file + assert not sdf.equals(my_df) diff --git a/tests/mitigations/test_microaggregation.py b/tests/mitigations/test_microaggregation.py index c62efa2..82a59b7 100644 --- a/tests/mitigations/test_microaggregation.py +++ b/tests/mitigations/test_microaggregation.py @@ -5,15 +5,14 @@ import pandas as pd import pytest -from pymasq import config - -config.FORMATTING_ON_OUTPUT = True - -from pymasq import set_seed +from pymasq import config, set_seed from pymasq.datasets import load_loan +from pymasq.errors import InputError, LessThanOrEqualToZeroError, NotInRangeError from pymasq.mitigations import microaggregation as magg from pymasq.mitigations.microaggregation import MaggMethods -from pymasq.errors import InputError, LessThanOrEqualToZeroError, NotInRangeError + + +config.FORMATTING_ON_OUTPUT = True METHODS = [ @@ -32,7 +31,7 @@ @pytest.fixture def rand_df(): - return pd.DataFrame(np.random.randint(1, NUM_RECORDS, (NUM_RECORDS, 4))) + return pd.DataFrame(np.random.Generator.randint(1, NUM_RECORDS, (NUM_RECORDS, 4))) @pytest.fixture @@ -44,14 +43,14 @@ def my_df(): def test_magg_error_if_invalid_method(my_df): - """ Test that microaggregation throws an InputError if incorrect method is supplied. """ + """Test that microaggregation throws an InputError if incorrect method is supplied.""" with pytest.raises(InputError): magg(my_df, method=None, aggr=2) @pytest.mark.parametrize("method", METHODS) def test_magg_returns_same_dimensions_and_column_names(my_df, method): - """ Test that microaggregation returns the same dimensions and column names. 
""" + """Test that microaggregation returns the same dimensions and column names.""" kwargs = {} if method == MaggMethods.ADVANCED: kwargs = MAGG_ADVANCED_KWARGS @@ -62,7 +61,7 @@ def test_magg_returns_same_dimensions_and_column_names(my_df, method): @pytest.mark.parametrize("method", (METHODS)) def test_magg_aggr_is_valid(my_df, method): - """ Test for NotInRangeError when `aggr` not in [1, len(my_df)] """ + """Test for NotInRangeError when `aggr` not in [1, len(my_df)]""" aggr = 0 kwargs = {} if method == MaggMethods.ADVANCED: @@ -77,7 +76,7 @@ def test_magg_aggr_is_valid(my_df, method): @pytest.mark.parametrize("method", (METHODS)) def test_magg_unique_vals_is_one(my_df, method): - """ Test the number of unique values returned for `aggr` is 1. """ + """Test the number of unique values returned for `aggr` is 1.""" kwargs = {} if method == MaggMethods.ADVANCED: kwargs = MAGG_ADVANCED_KWARGS @@ -85,13 +84,13 @@ def test_magg_unique_vals_is_one(my_df, method): test_df = magg( my_df, method=method, aggr=aggr, keep_dtypes=True, **kwargs ) # .astype(int) - assert True == np.allclose(my_df, test_df, 1, 1) + assert np.allclose(my_df, test_df, 1, 1) is True @pytest.mark.parametrize("method", METHODS) @pytest.mark.parametrize("aggr", [2] + [n for n in range(10, NUM_RECORDS, 10)]) def test_magg_unique_vals_greater_than_one(my_df, method, aggr): - """ Test the number of unique values returned for `aggr` is greater than 1. """ + """Test the number of unique values returned for `aggr` is greater than 1.""" kwargs = {} if method == MaggMethods.ADVANCED: kwargs = MAGG_ADVANCED_KWARGS @@ -101,19 +100,19 @@ def test_magg_unique_vals_greater_than_one(my_df, method, aggr): def test_magg_quantile_not_in_range(my_df): - """ Test that quantile-based microaggregation throws an NotInRangeError if aggr > len(my_df). """ + """Test that quantile-based microaggregation throws an NotInRangeError if aggr > len(my_df).""" with pytest.raises(NotInRangeError): magg(my_df, method="quantile", aggr=len(my_df) + 1) def test_magg_advanced_required_extra_parameters(my_df): - """ Test that advanced-based microaggregation throws an InputError if neither clust or reduct are specified. """ + """Test that advanced-based microaggregation throws an InputError if neither clust or reduct are specified.""" with pytest.raises(InputError): magg(my_df, method="advanced") def test_magg_advanced_error_if_invalid_methods(my_df): - """ Test that advanced-based microaggregation throws an InputError when input kwargs are not valid. 
""" + """Test that advanced-based microaggregation throws an InputError when input kwargs are not valid.""" with pytest.raises(InputError): magg(my_df.copy(), method="advanced", clust="INVALID") magg(my_df.copy(), method="advanced", reduct="INVALID") diff --git a/tests/mitigations/test_pram.py b/tests/mitigations/test_pram.py index 59f2cf8..57f5478 100644 --- a/tests/mitigations/test_pram.py +++ b/tests/mitigations/test_pram.py @@ -3,13 +3,12 @@ import pytest import pymasq +from pymasq.datasets import load_census +from pymasq.errors import InputError, NotInRangeError +from pymasq.mitigations import pram pymasq.set_seed(10) -from pymasq.mitigations import pram -from pymasq.errors import InputError, NotInRangeError -from pymasq.datasets import load_census - @pytest.fixture def my_df(): @@ -34,7 +33,7 @@ def my_numerical_df(): nrows = 10 max_val = 1000000 return pd.DataFrame( - np.random.random_integers(0, max_val, (nrows, ncols)), + np.random.Generator.random_integers(0, max_val, (nrows, ncols)), columns=[f"c{i}" for i in range(ncols)], ) @@ -92,7 +91,7 @@ def test_pram_probs_invalid_dict(my_df): def test_pram_probs_valid_dict(my_df): - """ Ensure that specifying probabilities results in that number of changes on average """ + """Ensure that specifying probabilities results in that number of changes on average""" probs = dict( race=pd.DataFrame({"White": 0.5, "Black": 0.5}, index=["White", "Black"]) ) @@ -124,25 +123,25 @@ def test_pram_returns_same_shapes(my_df): def test_pram_probs_equal_0(my_df): - """ at least 1 value changed """ + """at least 1 value changed""" r = pram(my_df, probs=0) assert not all((r == my_df).all()) def test_pram_probs_equal_1(my_df): - """ no change in data """ + """no change in data""" r = pram(my_df, probs=1) assert all((r == my_df).all()) def test_pram_alpha_equal_0(my_df): - """ no change in data """ + """no change in data""" r = pram(my_df, alpha=0) assert all((r == my_df).all()) def test_pram_alpha_equal_1(my_df): - """ at least 1 value changed """ + """at least 1 value changed""" r = pram(my_df, alpha=1) assert not all((r == my_df).all()) diff --git a/tests/mitigations/test_shuffle.py b/tests/mitigations/test_shuffle.py index c772d99..67babab 100644 --- a/tests/mitigations/test_shuffle.py +++ b/tests/mitigations/test_shuffle.py @@ -6,9 +6,8 @@ from pymasq.mitigations import ( s, # shuffle.py module shuffle, - MODEL, ) -from pymasq.errors import InputError, DataTypeError +from pymasq.errors import InputError @pytest.fixture @@ -88,7 +87,7 @@ def test_shuffle_cols_not_numeric(my_df): def test_shuffle_same_mean_different_values(loan_df): - """ Test that values are perturbed and retain the same mean while also in different order """ + """Test that values are perturbed and retain the same mean while also in different order""" shuffle_cols = ["ApplicantIncome", "LoanAmount"] cor_cols = ["Education", "Loan_Status"] shuffled = shuffle( @@ -124,4 +123,4 @@ def test_shuffle_returns_same_shapes(loan_df): ).shape assert in_size_1 == out_size_1 - assert in_size_2 == out_size_2 \ No newline at end of file + assert in_size_2 == out_size_2 diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index de8c096..7c56d26 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -2,26 +2,26 @@ # coding: utf-8 import copy +import hashlib import itertools import json import numpy as np import pandas as pd import pytest +import random from scipy.special import perm +from sklearn.utils import 
shuffle import pymasq - -pymasq.BEARTYPE = lambda func: func - -from pymasq.datasets import load_census -from pymasq import optimizations as opts from pymasq import mitigations as mits +from pymasq import optimizations as opts from pymasq import set_seed +from pymasq.datasets import load_census + + +pymasq.BEARTYPE = lambda func: func -import random -from sklearn.utils import shuffle -import hashlib set_seed(1) @@ -68,7 +68,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: np.random.rand(): {"weight": 1}} +rands = {lambda: np.random.Generator.rand(): {"weight": 1}} # Test standard termination conditions @@ -150,7 +150,7 @@ def _terminates_correctly(res, fit, log): ], ) def test_optimizations_returns(my_df, my_mutations, my_metrics, my_iters, my_theta): - """ Test the return variables of all `pymasq.optimization` algorithms. """ + """Test the return variables of all `pymasq.optimization` algorithms.""" def _returns_correctly(algo): result = algo.optimize() @@ -310,10 +310,8 @@ def _randomize_mutations_correctly(res, fit, log): return any( [ - len(mut_log_unique) == len(my_mutations), # randomize = True - all( - mut_log[: len(_my_mutations)] == _my_mutations - ), # randomize = False + len(mut_log_unique) == len(my_mutations), + all(mut_log[: len(_my_mutations)] == _my_mutations), ] ) @@ -375,7 +373,7 @@ def _randomize_mutations_correctly(res, fit, log): (ones, np.inf, 0.0, 100), ], ) -def test_IncrementalSearch( +def test_incremental_search( my_df, my_mutations, my_metrics, my_iters, my_theta, my_retry ): """ @@ -411,7 +409,7 @@ def test_IncrementalSearch( (zeros, np.inf, 1.0, None, None), # theta ], ) -def test_ExhaustiveSearch( +def test_exhaustive_search( my_df, my_mutations, my_metrics, my_iters, my_theta, my_num_perms, my_size_perms ): """ @@ -422,7 +420,9 @@ def test_ExhaustiveSearch( def _terminates_correctly(res, fit, log): if not np.isinf(my_iters): assert log.shape[0] == (my_iters + 1) - elif my_theta == 1.0 or my_theta == 0.9: + elif np.isclose(my_theta, 1.0, rtol=1e-09, atol=1e-09) or np.isclose( + my_theta, 0.9, rtol=1e-09, atol=1e-09 + ): assert log.iloc[-1]["fitness"] <= my_theta else: # terminates via permutations @@ -461,7 +461,7 @@ def test_exit_on_error(): def throw_error_mut(*args, **kwargs): df = args[0] choice = random.choices([True, False], weights=[1, 2]) - if choice[0] == True: + if choice[0] is True: raise Exception("Mutation error thrown on purpose.") else: df = shuffle(df) @@ -469,16 +469,16 @@ def throw_error_mut(*args, **kwargs): return df def rand_metric(df, *args, **kwargs): - Hash = hashlib.sha512 - MAX_HASH_PLUS_ONE = 2 ** (Hash().digest_size * 8) + hash_func = hashlib.sha512 + MAX_HASH_PLUS_ONE = 2 ** (hash_func().digest_size * 8) seed = str(df).encode() - hash_digest = Hash(seed).digest() + hash_digest = hash_func(seed).digest() hash_int = int.from_bytes(hash_digest, "big") return np.round(hash_int / MAX_HASH_PLUS_ONE, 4) # Float division def throw_error_metric(df, *args, **kwargs): choice = random.choices([True, False], weights=[1, 2]) - if choice[0] == True: + if choice[0] is True: raise Exception("Metrics error thrown on purpose.") else: return rand_metric(df) diff --git a/tests/optimizations/test_utils.py b/tests/optimizations/test_utils.py index ba1f79e..c1ef169 100644 --- a/tests/optimizations/test_utils.py +++ b/tests/optimizations/test_utils.py @@ -8,7 +8,6 @@ def test_apply_and_evaluate(): - # This checks that the output of apply_and_evaluate is the same as # if 
we called iterativeSearch(), which performs the metrics at each step From 381900cf334dd68936e5a9f0e213c09fc7277390 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 7 Dec 2023 19:44:43 -0500 Subject: [PATCH 05/17] Fixes bugs --- src/pymasq/preprocessing/entity_embedding.py | 2 +- src/pymasq/preprocessing/preprocess.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pymasq/preprocessing/entity_embedding.py b/src/pymasq/preprocessing/entity_embedding.py index 1b0b319..5dcbdcb 100755 --- a/src/pymasq/preprocessing/entity_embedding.py +++ b/src/pymasq/preprocessing/entity_embedding.py @@ -123,7 +123,7 @@ def embed_entities( # Converts categories represented by integers to strings so that the # label encoder will work and the classes can be determined later - categorical_df[column] = categorical_df[column].astype(str) + categorical_df.loc[:, column] = categorical_df.loc[:,column].astype(str) le = LabelEncoder() X_train = le.fit_transform(categorical_df[column]) diff --git a/src/pymasq/preprocessing/preprocess.py b/src/pymasq/preprocessing/preprocess.py index 96e9331..6d7dcfc 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -131,7 +131,7 @@ def encode_both( df_b[class_col] = 1 # append b to a; and then split out the categorical (non-numerical) columns - cat_cols = df_a.append(df_b).select_dtypes(exclude=["number"]) + cat_cols = df_a._append(df_b).select_dtypes(exclude=["number"]) # cast everything to string in case we have a mix of floats and string, # otherwise LabelEncoder will choke/die. # This should never happen, but does in our pytests, so who knows. @@ -140,7 +140,7 @@ def encode_both( ) # append b to a; and then split out the non-categorical (numerical) columns - num_cols = df_a.append(df_b).select_dtypes(include=["number"]) + num_cols = df_a._append(df_b).select_dtypes(include=["number"]) # concatenate, but relabel the cat_cols first if cat_cols.empty: both = num_cols From 1af4d8d0069d4b328994691ab690a2cef7e2419b Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 7 Dec 2023 20:02:10 -0500 Subject: [PATCH 06/17] fixes optimization tests --- src/pymasq/optimizations/_base.py | 9 ++++++--- src/pymasq/optimizations/optimizations.py | 2 +- tests/optimizations/test_optimizations.py | 3 +-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pymasq/optimizations/_base.py b/src/pymasq/optimizations/_base.py index baa22b7..8db85f7 100644 --- a/src/pymasq/optimizations/_base.py +++ b/src/pymasq/optimizations/_base.py @@ -6,10 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from pymasq import BEARTYPE + import pymasq.mitigations as mits import pymasq.metrics as mets +from pymasq import BEARTYPE +from pymasq.config import DEFAULT_SEED from pymasq.errors import ( SumNotEqualToOneError, NotInRangeError, @@ -17,6 +19,7 @@ NoMutationAvailableError, ) +rg = np.random.Generator(np.random.PCG64(DEFAULT_SEED)) class OptimizationBase: """Base class for the optimization algorithms. 
@@ -408,7 +411,7 @@ def _mutate( mut = None if self.randomize_mutations: probs = [v["p"] for v in mutations] - mut = np.random.Generator.choice(mutations, p=probs) + mut = rg.choice(mutations, p=probs) if not self.reuse_mutations and mutations: # redistribute according to initial weighting mut_idx = mutations.index(mut) @@ -541,4 +544,4 @@ def update(self, record: Dict[str, Any]): """ record = self._pretty_values(record) df = pd.DataFrame.from_records(record) - self.log = self.log.append(df, ignore_index=True) + self.log = self.log._append(df, ignore_index=True) diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index 32b2b76..9697b21 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -626,7 +626,7 @@ def _optimize(self): if self.randomize_mutations: # Note: only matters when `num_perms` is set. - test = np.random.Generator.shuffle(self._mutations) + test = np.random.shuffle(self._mutations) for num_perms, mutation_perms in enumerate( itertools.permutations(self._mutations, self.size_perms) diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index 7c56d26..e4da8a4 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -25,7 +25,6 @@ set_seed(1) - @pytest.fixture def my_df(): df = load_census() @@ -68,7 +67,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: np.random.Generator.rand(): {"weight": 1}} +rands = {lambda: np.random.rand(): {"weight": 1}} # Test standard termination conditions From e164df7f2a2f4f16c79705f4b3530e63ad312e57 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 7 Dec 2023 20:17:16 -0500 Subject: [PATCH 07/17] fixes mitigation tests --- src/pymasq/datasets/data_generator.py | 8 +++++--- src/pymasq/metrics/utility_scores.py | 2 +- src/pymasq/mitigations/geom_transform.py | 12 +++++++----- tests/mitigations/test_geom_transforms.py | 6 ++++-- tests/mitigations/test_hashing.py | 4 +++- tests/mitigations/test_pram.py | 4 +++- 6 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/pymasq/datasets/data_generator.py b/src/pymasq/datasets/data_generator.py index 773c429..c2d4c6b 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -7,7 +7,9 @@ from .utils import rand_cat_change from pymasq import BEARTYPE +from pymasq.config import DEFAULT_SEED +rg = np.random.default_rng(DEFAULT_SEED) @BEARTYPE def gen_geom_seq(start: float = 0.5, n: int = 6, rate: float = 2.0) -> List[float]: @@ -132,11 +134,11 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: List of integer values for the sensitive column """ - unique_entries = np.random.choice(range(n), l) + unique_entries = rg.choice(range(n), l) while len(unique_entries) != len(set(unique_entries)): - unique_entries = np.random.choice(range(n), l) + unique_entries = rg.choice(range(n), l) - non_unique = np.random.Generator.choice(unique_entries, n - l) + non_unique = rg.Generator.choice(unique_entries, n - l) return list(unique_entries) + list(non_unique) diff --git a/src/pymasq/metrics/utility_scores.py b/src/pymasq/metrics/utility_scores.py index 459edd6..f115bb1 100644 --- a/src/pymasq/metrics/utility_scores.py +++ b/src/pymasq/metrics/utility_scores.py @@ -201,7 +201,7 @@ def propensity_score( # Encode the two data frames (at once for consistent encodings) preprocessor_fn = 
preprocess.preprocessor_fn[preprocessor] orig_enc, mod_enc = preprocessor_fn.encode_both( - df_A=orig_df, df_B=mod_df, sensitive_col=sensitive_col + df_a=orig_df, df_b=mod_df, sensitive_col=sensitive_col ) # Create a unique column name to mark from which dataframe a row came from class_col = utils.uniq_col_name(orig_df) diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index cf46976..6c89a63 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -7,7 +7,7 @@ from typing import List, Optional, Union from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES +from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES, DEFAULT_SEED from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical from pymasq.utils import formatting @@ -15,6 +15,7 @@ __all__ = ["geom_transform"] +rg = np.random.default_rng(DEFAULT_SEED) SKIP_ROTATION_ANGLES = [30, 45, 60, 90, 120, 135, 150, 180] MAX_DEGREES = 180 @@ -127,7 +128,8 @@ def geom_transform( Examples -------- - >>> df = pd.DataFrame(np.random.random_integers(0, 100, (10,3))) + >>> rg = np.random.default_rng(1234) + >>> df = pd.DataFrame(rg.integers(0, 100, (10,3))) 0 1 2 3 0 72 13 92 91 1 55 63 65 76 @@ -261,7 +263,7 @@ def geom_transform( # Translation Matrix Generation/Application idtrans = np.eye(ncols + 1) # add a new row for the homogeneous coordinate - idtrans[:ncols, ncols:] = np.random.uniform(size=(ncols, 1)) + idtrans[:ncols, ncols:] = rg.uniform(size=(ncols, 1)) # multidim translations; adding ones column for homogeneous coordinate multitrans = np.concatenate((bo, np.ones(shape=(bo.shape[0], 1))), axis=1) @@ -279,13 +281,13 @@ def geom_transform( # Randomized expansion sign = np.sign(bo) - bo = np.add(abs(bo), abs(np.random.Generator.uniform(size=bo.shape) * magnitude)) + bo = np.add(abs(bo), abs(rg.uniform(size=bo.shape) * magnitude)) bo = (bo * sign).T bo = bo * data[perturb_cols].std().values + data[perturb_cols].mean().values shuff_idx = data.index if shuffle: - shuff_idx = np.random.Generator.choice( + shuff_idx = rg.choice( range(bo.shape[0]), size=(bo.shape[0]), replace=False ) diff --git a/tests/mitigations/test_geom_transforms.py b/tests/mitigations/test_geom_transforms.py index 1b6d91e..a18f367 100644 --- a/tests/mitigations/test_geom_transforms.py +++ b/tests/mitigations/test_geom_transforms.py @@ -2,16 +2,18 @@ import pandas as pd import pytest +from pymasq.config import DEFAULT_SEED from pymasq.mitigations import geom_transform from pymasq.errors import InputError +rg = np.random.default_rng(DEFAULT_SEED) @pytest.fixture def my_rand_df(): ncols = 5 colnames = "abcdefghijklmnopqrstuvwxyz" df = pd.DataFrame( - np.random.Generator.random_integers(0, 100, (100, ncols)), + rg.integers(0, 100, (100, ncols)), columns=[colnames[i] for i in range(ncols)], ) return df @@ -32,7 +34,7 @@ def my_non_numeric_df(): ncols = 3 colnames = list("abcdefghijklmnopqrstuvwxyz") df = pd.DataFrame( - np.random.Generator.choice(colnames, size=(100, ncols), replace=True), + rg.choice(colnames, size=(100, ncols), replace=True), columns=colnames[:ncols], ) return df diff --git a/tests/mitigations/test_hashing.py b/tests/mitigations/test_hashing.py index c04b670..1e746cc 100644 --- a/tests/mitigations/test_hashing.py +++ b/tests/mitigations/test_hashing.py @@ -6,12 +6,14 @@ import hashlib import numpy as np +from pymasq.config import DEFAULT_SEED from pymasq.datasets import 
load_census from pymasq.mitigations import hashing ALGORITHMS = hashlib.algorithms_guaranteed +rg = np.random.default_rng(DEFAULT_SEED) def _my_df(): df = load_census() @@ -28,7 +30,7 @@ def my_df(): @pytest.fixture def salts(): df = _my_df() - return np.random.Generator.choice(["a", "b", "c"], size=df.shape).tolist() + return rg.choice(["a", "b", "c"], size=df.shape).tolist() @pytest.mark.parametrize("hash_func", (ALGORITHMS)) diff --git a/tests/mitigations/test_pram.py b/tests/mitigations/test_pram.py index 57f5478..0ed5c40 100644 --- a/tests/mitigations/test_pram.py +++ b/tests/mitigations/test_pram.py @@ -3,12 +3,14 @@ import pytest import pymasq +from pymasq.config import DEFAULT_SEED from pymasq.datasets import load_census from pymasq.errors import InputError, NotInRangeError from pymasq.mitigations import pram pymasq.set_seed(10) +rg = np.random.default_rng(DEFAULT_SEED) @pytest.fixture def my_df(): @@ -33,7 +35,7 @@ def my_numerical_df(): nrows = 10 max_val = 1000000 return pd.DataFrame( - np.random.Generator.random_integers(0, max_val, (nrows, ncols)), + rg.integers(0, max_val, (nrows, ncols)), columns=[f"c{i}" for i in range(ncols)], ) From 64eae4208bf479febdd5beb92762f238de1d76f0 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Sat, 23 Dec 2023 09:01:44 -0500 Subject: [PATCH 08/17] Updates --- src/pymasq/mitigations/add_noise.py | 13 ++++++++----- src/pymasq/mitigations/hashing.py | 10 ++++++---- src/pymasq/mitigations/local_supp.py | 10 ++++++---- src/pymasq/mitigations/pram.py | 6 ++++-- src/pymasq/mitigations/rank_swap.py | 2 +- src/pymasq/mitigations/shuffle.py | 6 ++++-- src/pymasq/mitigations/utils.py | 2 +- src/pymasq/models/models.py | 4 ++-- tests/mitigations/test_pram.py | 7 ++++--- tests/optimizations/test_optimizations.py | 6 +++--- 10 files changed, 39 insertions(+), 27 deletions(-) diff --git a/src/pymasq/mitigations/add_noise.py b/src/pymasq/mitigations/add_noise.py index eca3a34..77ac99b 100644 --- a/src/pymasq/mitigations/add_noise.py +++ b/src/pymasq/mitigations/add_noise.py @@ -9,6 +9,7 @@ from typing import List, Optional, Union, Final from pymasq.config import ( + DEFAULT_SEED, FORMATTING_ON_OUTPUT, VALIDATE_NUMERIC_ON_INPUT, VALIDATE_NUMERIC_ON_OUTPUT, @@ -36,13 +37,15 @@ OUTLIERS: Final = "outliers" -class OUTLIERS_INTERPOLATION_METHODS: +class outliersInterpolationMethods: LINEAR = "linear" LOWER = "lower" HIGHER = "higher" MIDPOINT = "midpoint" NEAREST = "nearest" +rg = np.random.default_rng(DEFAULT_SEED) + @formatting(on_output=FORMATTING_ON_OUTPUT) @validate_numeric( @@ -114,10 +117,10 @@ def add_noise_additive( if centered: delta = np.sqrt(1 - np.square(magnitude)) loc = (1 - delta) / magnitude - noise = np.random.normal(loc=loc * data.mean(), scale=std, size=data.shape) + noise = rg.normal(loc=loc * data.mean(), scale=std, size=data.shape) data *= delta return data.add(magnitude * noise) - return data + np.random.normal(scale=magnitude * std, size=data.shape) + return data + rg.normal(scale=magnitude * std, size=data.shape) @formatting(on_output=FORMATTING_ON_OUTPUT) @@ -239,7 +242,7 @@ def add_noise_correlated( ] ).transpose() # Transposes the data to have the column/row orientation match the input data - return data_encoded + np.random.multivariate_normal( + return data_encoded + rg.multivariate_normal( pd.Series([0] * data_encoded.shape[1]), (magnitude / 100.0) * data_encoded.cov(), size=data_encoded.shape[0], @@ -415,7 +418,7 @@ def add_noise_outliers( outliers = np.unique(np.append(quant_outliers, dist_outliers)) std = 1.96 * data.std() / 
np.sqrt(len(data)) * (magnitude / 100.0) - noise = np.random.normal(scale=std, size=(len(outliers), len(data.columns))) + noise = rg.normal(scale=std, size=(len(outliers), len(data.columns))) data.iloc[outliers, :] += noise diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index 5dcdf55..547f6c0 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -1,8 +1,9 @@ +import hashlib +import logging +import os from typing import Callable, List, Optional, Union -import hashlib import numpy as np -import os import pandas as pd from pymasq import BEARTYPE @@ -14,6 +15,8 @@ __all__ = ["hashing"] +logger = logging.getLogger(__name__) + @formatting(on_output=FORMATTING_ON_OUTPUT, ignore_dtypes=True) @BEARTYPE @@ -190,8 +193,7 @@ def hashing( hash_func = getattr(hashlib, hash_func) if "shake" in str(hash_func): - # TODO: change to logging - print( + logger.warning( f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length." ) return data.applymap(lambda v: hash_func(v).hexdigest(16)) diff --git a/src/pymasq/mitigations/local_supp.py b/src/pymasq/mitigations/local_supp.py index 558f365..a3b535e 100644 --- a/src/pymasq/mitigations/local_supp.py +++ b/src/pymasq/mitigations/local_supp.py @@ -1,7 +1,8 @@ -import pandas as pd - +import logging from typing import Any, List, Optional, Union +import pandas as pd + from pymasq import BEARTYPE from pymasq.config import ( FORMATTING_ON_OUTPUT, @@ -13,6 +14,8 @@ __all__ = ["local_supp"] +logger = logging.getLogger(__name__) + @formatting(on_output=FORMATTING_ON_OUTPUT, ignore_dtypes=True) # fmt: off @BEARTYPE @@ -140,8 +143,7 @@ def local_supp( if not keep_dtypes and not isinstance( type(to_val), type(data.loc[0, suppress_col]) ): - # TODO: switch to logging - print( + logger.warning( f"WARNING: The datatype of the `suppress_col` ({suppress_col}`) will be changed." ) diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index d1bcb18..828580f 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -5,6 +5,7 @@ from pymasq import BEARTYPE from pymasq.config import ( + DEFAULT_SEED, FORMATTING_ON_OUTPUT, ) from pymasq.errors import InputError, NotInRangeError @@ -14,6 +15,7 @@ __all__ = ["pram"] +rg = np.random.default_rng(DEFAULT_SEED) def __calc_transition_matrix( data: pd.Series, @@ -39,7 +41,7 @@ def __calc_transition_matrix( pandas.DataFrame with transition probabilities for each category. 
""" ncats = len(cats) - runif = np.random.uniform(low=probs, size=ncats) + runif = rg.uniform(low=probs, size=ncats) tri = (1 - runif) / (ncats - 1) prob_mat = np.zeros(shape=(ncats, ncats)) @@ -88,7 +90,7 @@ def __randomization( for cat in cats: idxs = data.index.where(data == cat).dropna() if len(idxs) > 0: - d_pramed[idxs] = np.random.choice( + d_pramed[idxs] = rg.choice( cats, len(idxs), p=trans.loc[cat,], diff --git a/src/pymasq/mitigations/rank_swap.py b/src/pymasq/mitigations/rank_swap.py index 0a47a13..f01ddd1 100644 --- a/src/pymasq/mitigations/rank_swap.py +++ b/src/pymasq/mitigations/rank_swap.py @@ -8,7 +8,7 @@ def rank_swap( - data: Union[pd.DataFrame, pd.Series], cols: Union[str, List[str]] = None, **kwargs + data: Union[pd.DataFrame, pd.Series], cols: Union[str, List[str]] = None, ) -> pd.Series: """TODO diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 961a433..79dd284 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -7,7 +7,7 @@ import scipy.stats as ss from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT +from pymasq.config import DEFAULT_SEED, FORMATTING_ON_OUTPUT from pymasq.utils import formatting from pymasq.preprocessing import LabelEncoderPM from pymasq.errors import InputError @@ -32,6 +32,8 @@ CORRELATIVE: Final = "corr" MODEL: Final = "model" +rg = np.random.default_rng(DEFAULT_SEED) + @BEARTYPE def _reverse_map(data: pd.DataFrame, y_star: pd.DataFrame) -> pd.DataFrame: @@ -232,7 +234,7 @@ def shuffle( ystar1 = predictors.dot(pxs.dot(pssinv).T) sigma = pxx - pxs.dot(pssinv.dot(psx)) - e1 = np.random.multivariate_normal( + e1 = rg.multivariate_normal( mean=[0] * len(resp_cols), cov=sigma, size=_data.shape[0] ) y_star = ystar1 + e1 diff --git a/src/pymasq/mitigations/utils.py b/src/pymasq/mitigations/utils.py index faf7ef1..62f58b7 100644 --- a/src/pymasq/mitigations/utils.py +++ b/src/pymasq/mitigations/utils.py @@ -81,7 +81,7 @@ def __calc_freq( freq_df = df.groupby(cols).count()[sensitive_col] freq_df = freq_df.rename("samp_fq") freq_df = freq_df.reset_index() - result = pd.merge(df, freq_df, how="outer", on=cols) + result = pd.merge(df, freq_df, how="outer", on=cols, validate="m:1") result["pop_fq"] = result["samp_fq"].values * weights return result diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 0f59b9d..5e19ede 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -909,9 +909,9 @@ def predict(self, x_test: pd.DataFrame, y_true: pd.Series) -> float: except: continue if Y_predict_prob_array is None: - raise (f"No prediction method available for {self.trained}") + raise Exception (f"No prediction method available for {self.trained}") - return mape(y_true=y_true, y_score=Y_predict_prob_array) + return mape(y_true=y_true, y_pred=Y_predict_prob_array) # For translation from text to callable functions diff --git a/tests/mitigations/test_pram.py b/tests/mitigations/test_pram.py index 0ed5c40..a0b5d98 100644 --- a/tests/mitigations/test_pram.py +++ b/tests/mitigations/test_pram.py @@ -1,14 +1,14 @@ +import logging import numpy as np import pandas as pd import pytest -import pymasq from pymasq.config import DEFAULT_SEED from pymasq.datasets import load_census from pymasq.errors import InputError, NotInRangeError from pymasq.mitigations import pram -pymasq.set_seed(10) +logger = logging.getLogger(__name__) rg = np.random.default_rng(DEFAULT_SEED) @@ -114,7 +114,8 @@ def test_pram_probs_valid_dict(my_df): def 
test_pram_numerical_cast_to_categorical(my_numerical_df): try: pram(my_numerical_df) - except: + except Exception as e: + logger.exception(e) assert False, "Numerical dataframe should not have raised error." diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index e4da8a4..35a8961 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -16,14 +16,14 @@ import pymasq from pymasq import mitigations as mits from pymasq import optimizations as opts -from pymasq import set_seed +from pymasq.config import DEFAULT_SEED from pymasq.datasets import load_census pymasq.BEARTYPE = lambda func: func -set_seed(1) +rg = np.random.default_rng(DEFAULT_SEED) @pytest.fixture def my_df(): @@ -67,7 +67,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: np.random.rand(): {"weight": 1}} +rands = {lambda: rg(): {"weight": 1}} # Test standard termination conditions From d2a7a79bf7f823ee509962fdd03b266f819788d7 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 08:33:06 -0500 Subject: [PATCH 09/17] updates truncate tests --- tests/mitigations/test_truncate.py | 81 ++++++++++++++++-------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/tests/mitigations/test_truncate.py b/tests/mitigations/test_truncate.py index e7eaacc..99af40c 100644 --- a/tests/mitigations/test_truncate.py +++ b/tests/mitigations/test_truncate.py @@ -1,19 +1,24 @@ #!/usr/bin/env python # coding: utf-8 +import logging import pytest +import pandas as pd + from pymasq.datasets import load_census from pymasq.mitigations import truncate, INDEX, MATCH, START, END, BOTH +logger = logging.getLogger(__name__) @pytest.fixture def my_df(): df = load_census() - cols = ["fnlwgt", "education", "marital_status", "sex", "capital_gain"] + cols = ["fnlwgt", "education", "marital_status", selected_col, "capital_gain"] df = df.loc[:10, cols] return df +selected_col: str = "sex" # ----- Method: Index Tests ----- def test_truncate_index_1(my_df): @@ -22,8 +27,8 @@ def test_truncate_index_1(my_df): supplied. Should only keep characters [0:3) """ - ret = truncate(my_df["sex"], idx=3) - assert ret.isin(["e", "ale"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], idx=3) + assert ret[selected_col].isin(["e", "ale"]).all() def test_truncate_index_2(my_df): @@ -32,8 +37,8 @@ def test_truncate_index_2(my_df): supplied. Should only keep characters [0:-1) """ - ret = truncate(my_df["sex"], method=INDEX, idx=-1) - assert ret.isin(["e", "e"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=-1) + assert ret[selected_col].isin(["e", "e"]).all() def test_truncate_index_3(my_df): @@ -42,8 +47,8 @@ def test_truncate_index_3(my_df): supplied and trim_from=END Should not keep any characters """ - ret = truncate(my_df["sex"], method=INDEX, idx=0, trim_from=END) - assert ret.isin([""]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=0, trim_from=END) + assert ret[selected_col].isin([""]).all() def test_truncate_index_4(my_df): @@ -52,8 +57,8 @@ def test_truncate_index_4(my_df): idx supplied. (idx > longest string in the column). 
Should not keep all characters """ - ret = truncate(my_df["sex"], method=INDEX, end=100) - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, end=100) + assert ret[selected_col].isin(["Male", "Female"]).all() def test_truncate_index_5(my_df): @@ -62,8 +67,8 @@ def test_truncate_index_5(my_df): supplied. Should keep characters [1:3) """ - ret = truncate(my_df["sex"], method=INDEX, idx=1, end=3) - assert ret.isin(["al", "em"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, end=3) + assert ret[selected_col].isin(["al", "em"]).all() def test_truncate_index_6(my_df): @@ -72,16 +77,16 @@ def test_truncate_index_6(my_df): supplied. Should not keep any characters """ - ret = truncate(my_df["sex"], method=INDEX, idx=3, end=1) - assert ret.isin([""]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, end=1) + assert ret[selected_col].isin([""]).all() def test_truncate_input_7(my_df): """ Test that truncate returns same value if no idx or end supplied """ - ret = truncate(my_df["sex"], method=INDEX) - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX) + assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: match Tests ----- @@ -91,8 +96,8 @@ def test_truncate_match_1(my_df): part of all strings in the specified column. Should only keep characters before "al" for all values """ - ret = truncate(my_df["sex"], method=MATCH, match="al") - assert ret.isin(["M", "Fem"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="al") + assert ret[selected_col].isin(["M", "Fem"]).all() def test_truncate_match_2(my_df): @@ -102,8 +107,8 @@ def test_truncate_match_2(my_df): Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret = truncate(my_df["sex"], method=MATCH, match="em") - assert ret.isin(["Male", "F"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="em") + assert ret[selected_col].isin(["Male", "F"]).all() def test_truncate_match_3(my_df): @@ -112,8 +117,8 @@ def test_truncate_match_3(my_df): match any string in the specified column Should keep all characters """ - ret = truncate(my_df["sex"], method=MATCH, match="cat") - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="cat") + assert ret[selected_col].isin(["Male", "Female"]).all() def test_truncate_match_4(my_df): @@ -123,8 +128,8 @@ def test_truncate_match_4(my_df): Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret = truncate(my_df["sex"], method=MATCH, match="EM", ignore_case=True) - assert ret.isin(["Male", "F"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="EM", ignore_case=True) + assert ret[selected_col].isin(["Male", "F"]).all() def test_truncate_match_5(my_df): @@ -133,8 +138,8 @@ def test_truncate_match_5(my_df): Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. 
""" - ret = truncate(my_df["sex"], method=MATCH, match=".*", ignore_case=True) - assert ret.isin(["Male", "Female"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match=".*", ignore_case=True) + assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: More Index Tests ----- @@ -145,8 +150,8 @@ def test_truncate_index_11(my_df): Test that truncate runs correctly for the INDEX method with only a valid `n` supplied Should only keep characters [3:] """ - ret = truncate(my_df["sex"], method=INDEX, idx=3) - assert ret.isin(["e", "ale"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3) + assert ret[selected_col].isin(["e", "ale"]).all() def test_truncate_index_12(my_df): @@ -155,8 +160,8 @@ def test_truncate_index_12(my_df): and trim_from=END Should only keep characters [:-3] """ - ret = truncate(my_df["sex"], method=INDEX, idx=3, trim_from=END) - assert ret.isin(["M", "Fem"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, trim_from=END) + assert ret[selected_col].isin(["M", "Fem"]).all() def test_truncate_index_13(my_df): @@ -165,8 +170,8 @@ def test_truncate_index_13(my_df): and trim_from=BOTH Should only keep characters [1:-1] """ - ret = truncate(my_df["sex"], method=INDEX, idx=1, trim_from=BOTH) - assert ret.isin(["al", "emal"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, trim_from=BOTH) + assert ret[selected_col].isin(["al", "emal"]).all() def test_truncate_index_14(my_df): @@ -175,8 +180,8 @@ def test_truncate_index_14(my_df): greater than some of the string lengths but not others and trim_from=START Should only keep the last "e" in "Female" """ - ret = truncate(my_df["sex"], method=INDEX, idx=5, trim_from=START) - assert ret.isin(["", "e"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=5, trim_from=START) + assert ret[selected_col].isin(["", "e"]).all() def test_truncate_index_15(my_df): @@ -185,15 +190,15 @@ def test_truncate_index_15(my_df): greater or equal to than half the length of some strings but not others, and trim_from=BOTH Should only keep characters "ma" from "Female" """ - ret = truncate(my_df["sex"], method=INDEX, idx=2, trim_from=BOTH) - assert ret.isin(["", "ma"]).all() + ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=2, trim_from=BOTH) + assert ret[selected_col].isin(["", "ma"]).all() def test_truncate_index_16(my_df): """ Test that truncate runs correctly for the INDEX method with a large value of `n` supplied and trim_from=START - Should only keep characters "ma" from "Female" + Should not keep any """ - ret = truncate(my_df["sex"], method=INDEX, idx=100, trim_from=START) - assert ret.isin([""]).all() + ret: pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=100, trim_from=START) + assert ret[selected_col].isin([""]).all() From ea5c61e82d4eed71ff6fa89393dee65603e11756 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 16:13:50 -0500 Subject: [PATCH 10/17] fixes truncate tests --- docs/source/conf.py | 2 +- src/pymasq/mitigations/pram.py | 4 +- src/pymasq/mitigations/truncate.py | 6 +++ tests/mitigations/test_truncate.py | 81 ++++++++++++++++-------------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 23ecc49..1ffae56 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -27,7 +27,7 @@ author = "MITLL" # The full version, including alpha/beta/rc tags -release = "1.0" +release = "1.1" # -- General 
configuration --------------------------------------------------- diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index 828580f..17e368f 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -50,13 +50,13 @@ def __calc_transition_matrix( cat_codes = data.cat.codes + 1 sum_cats = np.nansum(cat_codes) - freqs = data.value_counts() / sum_cats # scaled category frequencies + freqs: pd.Series = data.value_counts() / sum_cats # scaled category frequencies scaled_prob_mat = prob_mat.copy() for i in range(ncats): s = sum(freqs * prob_mat[:, i]) for j in range(ncats): - scaled_prob_mat[i, j] = prob_mat[j, i] * (freqs[j] / s) + scaled_prob_mat[i, j] = prob_mat[j, i] * (freqs.iloc[j] / s) trans_probs = prob_mat @ scaled_prob_mat scaled_trans_probs = alpha * trans_probs + (1 - alpha) * np.identity(ncats) diff --git a/src/pymasq/mitigations/truncate.py b/src/pymasq/mitigations/truncate.py index 79abbf0..f077068 100644 --- a/src/pymasq/mitigations/truncate.py +++ b/src/pymasq/mitigations/truncate.py @@ -87,6 +87,9 @@ def _truncate_by_match(series, match, ignore_case, keep_before): return series.apply( lambda x: re.split(re.escape(match), x, 1, flags=re.IGNORECASE) ).str[0 if keep_before else -1] + + if isinstance(data, pd.Series): + return pd.DataFrame(_truncate_by_match(data, match=match, ignore_case=ignore_case, keep_before=keep_before)) return data.apply( _truncate_by_match, @@ -162,6 +165,9 @@ def _truncate_by_index(series, trim_from, idx, end): raise InputError( f"`trim_from` must be one of ['start', 'end', 'both', None]. (Received: {trim_from})" ) + + if isinstance(data, pd.Series): + return pd.DataFrame(_truncate_by_index(data, trim_from=trim_from, idx=idx, end=end)) return data.apply(_truncate_by_index, trim_from=trim_from, idx=idx, end=end) diff --git a/tests/mitigations/test_truncate.py b/tests/mitigations/test_truncate.py index 99af40c..3fab210 100644 --- a/tests/mitigations/test_truncate.py +++ b/tests/mitigations/test_truncate.py @@ -12,193 +12,198 @@ logger = logging.getLogger(__name__) @pytest.fixture -def my_df(): +def truncate_df(): df = load_census() - cols = ["fnlwgt", "education", "marital_status", selected_col, "capital_gain"] + cols = ["fnlwgt", "education", "marital_status", "sex", "capital_gain"] df = df.loc[:10, cols] return df selected_col: str = "sex" # ----- Method: Index Tests ----- -def test_truncate_index_1(my_df): +def test_truncate_index_1(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a positive idx supplied. Should only keep characters [0:3) """ - ret:pd.DataFrame = truncate(my_df[selected_col], idx=3) + # Test Series input + ret:pd.Series = truncate(truncate_df[selected_col], idx=3) + assert ret.isin(["e", "ale"]).all() + + # Test Dataframe input + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], idx=3) assert ret[selected_col].isin(["e", "ale"]).all() -def test_truncate_index_2(my_df): +def test_truncate_index_2(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a negative idx supplied. 
Should only keep characters [0:-1) """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=-1) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=-1) assert ret[selected_col].isin(["e", "e"]).all() -def test_truncate_index_3(my_df): +def test_truncate_index_3(truncate_df): """ Test that truncate runs correctly for the INDEX method with only idx of 0 supplied and trim_from=END Should not keep any characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=0, trim_from=END) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=0, trim_from=END) assert ret[selected_col].isin([""]).all() -def test_truncate_index_4(my_df): +def test_truncate_index_4(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a very large idx supplied. (idx > longest string in the column). Should not keep all characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, end=100) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, end=100) assert ret[selected_col].isin(["Male", "Female"]).all() -def test_truncate_index_5(my_df): +def test_truncate_index_5(truncate_df): """ Test that truncate runs correctly for the INDEX method with idx and end supplied. Should keep characters [1:3) """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, end=3) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=1, end=3) assert ret[selected_col].isin(["al", "em"]).all() -def test_truncate_index_6(my_df): +def test_truncate_index_6(truncate_df): """ Test that truncate runs correctly for the INDEX method with idx > end supplied. Should not keep any characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, end=1) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=3, end=1) assert ret[selected_col].isin([""]).all() -def test_truncate_input_7(my_df): +def test_truncate_input_7(truncate_df): """ Test that truncate returns same value if no idx or end supplied """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX) assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: match Tests ----- -def test_truncate_match_1(my_df): +def test_truncate_match_1(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern that matches a part of all strings in the specified column. Should only keep characters before "al" for all values """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="al") + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="al") assert ret[selected_col].isin(["M", "Fem"]).all() -def test_truncate_match_2(my_df): +def test_truncate_match_2(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern that matches a part of all strings in the specified column. Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. 
""" - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="em") + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="em") assert ret[selected_col].isin(["Male", "F"]).all() -def test_truncate_match_3(my_df): +def test_truncate_match_3(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern that does not match any string in the specified column Should keep all characters """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="cat") + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="cat") assert ret[selected_col].isin(["Male", "Female"]).all() -def test_truncate_match_4(my_df): +def test_truncate_match_4(truncate_df): """ Test that truncate runs correctly for the MATCH method with a pattern matches only when ignorecase is True Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match="EM", ignore_case=True) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match="EM", ignore_case=True) assert ret[selected_col].isin(["Male", "F"]).all() -def test_truncate_match_5(my_df): +def test_truncate_match_5(truncate_df): """ Test pattern matches are properly escaped by the regex expression Should only keep characters before "em" ("F")for entries with value "Female" and the full entry "Male" for the others. """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=MATCH, match=".*", ignore_case=True) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=MATCH, match=".*", ignore_case=True) assert ret[selected_col].isin(["Male", "Female"]).all() # ----- Method: More Index Tests ----- -def test_truncate_index_11(my_df): +def test_truncate_index_11(truncate_df): """ Test that truncate runs correctly for the INDEX method with only a valid `n` supplied Should only keep characters [3:] """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=3) assert ret[selected_col].isin(["e", "ale"]).all() -def test_truncate_index_12(my_df): +def test_truncate_index_12(truncate_df): """ Test that truncate runs correctly for the INDEX method with a valid `n` supplied and trim_from=END Should only keep characters [:-3] """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=3, trim_from=END) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=3, trim_from=END) assert ret[selected_col].isin(["M", "Fem"]).all() -def test_truncate_index_13(my_df): +def test_truncate_index_13(truncate_df): """ Test that truncate runs correctly for the INDEX method with a valid `n` supplied and trim_from=BOTH Should only keep characters [1:-1] """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=1, trim_from=BOTH) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=1, trim_from=BOTH) assert ret[selected_col].isin(["al", "emal"]).all() -def test_truncate_index_14(my_df): +def test_truncate_index_14(truncate_df): """ Test that truncate runs correctly for the INDEX method with a value of `n` supplied greater than some of the string lengths but not others and trim_from=START Should only keep the last "e" in "Female" """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=5, trim_from=START) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], 
method=INDEX, idx=5, trim_from=START) assert ret[selected_col].isin(["", "e"]).all() -def test_truncate_index_15(my_df): +def test_truncate_index_15(truncate_df): """ Test that truncate runs correctly for the INDEX method with a value of `n` supplied greater or equal to than half the length of some strings but not others, and trim_from=BOTH Should only keep characters "ma" from "Female" """ - ret:pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=2, trim_from=BOTH) + ret:pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=2, trim_from=BOTH) assert ret[selected_col].isin(["", "ma"]).all() -def test_truncate_index_16(my_df): +def test_truncate_index_16(truncate_df): """ Test that truncate runs correctly for the INDEX method with a large value of `n` supplied and trim_from=START Should not keep any """ - ret: pd.DataFrame = truncate(my_df[selected_col], method=INDEX, idx=100, trim_from=START) + ret: pd.DataFrame = truncate(truncate_df[[selected_col]], method=INDEX, idx=100, trim_from=START) assert ret[selected_col].isin([""]).all() From a7678074f9d0ecc35f819ed00db771db56f22f62 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 16:29:21 -0500 Subject: [PATCH 11/17] Fixes optimization tests --- src/pymasq/mitigations/hashing.py | 4 ++-- tests/optimizations/test_optimizations.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pymasq/mitigations/hashing.py b/src/pymasq/mitigations/hashing.py index 547f6c0..d1595b1 100644 --- a/src/pymasq/mitigations/hashing.py +++ b/src/pymasq/mitigations/hashing.py @@ -196,6 +196,6 @@ def hashing( logger.warning( f"Warning: the default length of the hexdigest is set to 16; to alter the length, pass in `{hash_func}` as a callable defined with your prefered length." 
) - return data.applymap(lambda v: hash_func(v).hexdigest(16)) + return data.map(lambda v: hash_func(v).hexdigest(16)) - return data.applymap(lambda v: hash_func(v).hexdigest()) + return data.map(lambda v: hash_func(v).hexdigest()) diff --git a/tests/optimizations/test_optimizations.py b/tests/optimizations/test_optimizations.py index 35a8961..c8dba80 100644 --- a/tests/optimizations/test_optimizations.py +++ b/tests/optimizations/test_optimizations.py @@ -67,7 +67,7 @@ def my_mutations(): # evaluation functions zeros = {lambda: 0: {"weight": 1}} ones = {lambda: 1: {"weight": 1}} -rands = {lambda: rg(): {"weight": 1}} +rands = {lambda: rg.random(1).item(): {"weight": 1}} # Test standard termination conditions From c822e8e3c3d7c488213527acb75b6e0b60db27da Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 27 Dec 2023 17:45:29 -0500 Subject: [PATCH 12/17] Updates --- src/pymasq/datasets/data_generator.py | 2 +- src/pymasq/metrics/auc_scores.py | 2 +- src/pymasq/metrics/risk_scores.py | 14 +++++++------- src/pymasq/metrics/utility_scores.py | 2 +- src/pymasq/mitigations/microaggregation.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/pymasq/datasets/data_generator.py b/src/pymasq/datasets/data_generator.py index c2d4c6b..ffa42ae 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -138,7 +138,7 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: while len(unique_entries) != len(set(unique_entries)): unique_entries = rg.choice(range(n), l) - non_unique = rg.Generator.choice(unique_entries, n - l) + non_unique = rg.choice(unique_entries, n - l) return list(unique_entries) + list(non_unique) diff --git a/src/pymasq/metrics/auc_scores.py b/src/pymasq/metrics/auc_scores.py index b3d5051..d663def 100644 --- a/src/pymasq/metrics/auc_scores.py +++ b/src/pymasq/metrics/auc_scores.py @@ -134,7 +134,7 @@ def auc_score( ) # Encode the two data frames (at once for consistent encodings) orig_enc, mod_enc = preprocessor_fn.encode_both( - df_A=orig_df, df_B=mod_df, sensitive_col=sensitive_col + df_a=orig_df, df_b=mod_df, sensitive_col=sensitive_col ) # Train the classifer based on only the original data classifer_fn.train( diff --git a/src/pymasq/metrics/risk_scores.py b/src/pymasq/metrics/risk_scores.py index a808abd..6630f70 100644 --- a/src/pymasq/metrics/risk_scores.py +++ b/src/pymasq/metrics/risk_scores.py @@ -284,7 +284,7 @@ def _diversity( def l_diversity( df: pd.DataFrame, sensitive_col: str, - L: int = 2, + l_thresh: int = 2, method: Optional[str] = None, ) -> float: """ @@ -298,7 +298,7 @@ def l_diversity( sensitive_col : str, The name of the column containing the data that is being obscured by mitigations - L : int, optional + l_thresh : int, optional The threshold by which the closeness of the q-blocks and the full dataset are compared (Default: 2) @@ -324,14 +324,14 @@ def l_diversity( else: raise ValueError(f"method must be '{DISTINCT}' or '{ENTROPY}'") - return sum([1.0 if ld <= L else 0.0 for ld in l_div]) / len(l_div) + return sum([1.0 if ld <= l_thresh else 0.0 for ld in l_div]) / len(l_div) @BEARTYPE def is_l_diverse( df: pd.DataFrame, sensitive_col: str, - L: int = 2, + l_thresh: int = 2, method: Optional[str] = None, ) -> bool: """ @@ -348,7 +348,7 @@ def is_l_diverse( sensitive_col : str The name of the column containing the data that is being obscured by mitigations - L : int, optional + l_thresh : int, optional The threshold by which the closeness of the q-blocks and the full dataset are compared. 
Default is arbitrary. (Default: 2) @@ -377,9 +377,9 @@ def is_l_diverse( """ if method is None or method == DISTINCT: - return _diversity(df, sensitive_col, _unique_count) <= L + return _diversity(df, sensitive_col, _unique_count) <= l_thresh elif method == ENTROPY: - return _diversity(df, sensitive_col, _entropy_count) <= np.log(L) + return _diversity(df, sensitive_col, _entropy_count) <= np.log(l_thresh) raise ValueError(f"method must be '{DISTINCT}' or '{ENTROPY}'") diff --git a/src/pymasq/metrics/utility_scores.py b/src/pymasq/metrics/utility_scores.py index f115bb1..4df4f5f 100644 --- a/src/pymasq/metrics/utility_scores.py +++ b/src/pymasq/metrics/utility_scores.py @@ -60,7 +60,7 @@ def jensen_shannon( # Encode the two data frames (at once for consistent encodings) preprocessor_fn = preprocess.preprocessor_fn[preprocessor] orig_enc, mod_enc = preprocessor_fn.encode_both( - df_A=orig_df, df_B=mod_df, sensitive_col=sensitive_col + df_a=orig_df, df_b=mod_df, sensitive_col=sensitive_col ) # remove sensitive column diff --git a/src/pymasq/mitigations/microaggregation.py b/src/pymasq/mitigations/microaggregation.py index 37e4d10..a629b30 100644 --- a/src/pymasq/mitigations/microaggregation.py +++ b/src/pymasq/mitigations/microaggregation.py @@ -124,7 +124,7 @@ def _scaling( raise ImportError( "Unable to import `tensorly` library to perform `robust` scaling; run ´pip3 install tensorly` from within your project environment to install it." ) - scaled_data, _ = robust_pca(data.values.astype(np.float)) + scaled_data, _ = robust_pca(data.values.astype(float)) return scaled_data if callable(scale): return scale(data, **kwargs) @@ -561,7 +561,7 @@ def robust_magg( pw_dists = pairwise_distances(z) - if not all(np.diagonal(pw_dists)) == 0: + if all(np.diagonal(pw_dists)) != 0: np.fill_diagonal(pw_dists, 0) mcd = MinCovDet(random_state=seed).fit(z) From 972f96cc8f60f7faf7ed71fc4a3256db331e922c Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 4 Jan 2024 04:48:18 -0500 Subject: [PATCH 13/17] Updates naming conventions --- src/pymasq/datasets/data_generator.py | 18 +++---- src/pymasq/metrics/risk_scores.py | 75 +++++++++++++-------------- tests/metrics/test_risk_scores.py | 19 +++---- tests/utils/test_cache.py | 2 +- 4 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/pymasq/datasets/data_generator.py b/src/pymasq/datasets/data_generator.py index ffa42ae..17268f2 100644 --- a/src/pymasq/datasets/data_generator.py +++ b/src/pymasq/datasets/data_generator.py @@ -119,12 +119,12 @@ def gen_num_df(n: int = 1000, seed: int = 1234) -> pd.DataFrame: @BEARTYPE -def _l_div_sensitive_gen(l: int, n: int) -> List: +def _l_div_sensitive_gen(l_div: int, n: int) -> List[int]: """ Generates the sensitive variable for generate_l_diverse_table for each equivalence class Parameters ---------- - l : int + l_div : int The specified diversity that the equivalence class needs to be n : int The size of the equivalence class (i.e. 
the lenght of the list returned) @@ -134,17 +134,17 @@ def _l_div_sensitive_gen(l: int, n: int) -> List: List of integer values for the sensitive column """ - unique_entries = rg.choice(range(n), l) + unique_entries = rg.choice(range(n), l_div) while len(unique_entries) != len(set(unique_entries)): - unique_entries = rg.choice(range(n), l) + unique_entries = rg.choice(range(n), l_div) - non_unique = rg.choice(unique_entries, n - l) + non_unique = rg.choice(unique_entries, n - l_div) return list(unique_entries) + list(non_unique) @BEARTYPE def generate_l_diverse_table( - l: Union[int, List[int]], + l_div: Union[int, List[int]], num_col: int = 5, num_q_blocks: int = 5, q_block_sizes: Union[int, List[int]] = 5, @@ -153,7 +153,7 @@ def generate_l_diverse_table( Used for testing l-diversity. Creates a data set that is l-diverse for given l. Parameters ---------- - l : Union[int, List[int]] + l_div : Union[int, List[int]] The specified diversity that the data set needs to be TODO: need to expand this to allow float l parameters for entropy num_col : int, optional The number of columns (in addition to the sensitive column) the data set should have @@ -180,10 +180,10 @@ def generate_l_diverse_table( if isinstance(q_block_sizes, int) else q_block_sizes ) - l = [l] * num_q_blocks if not isinstance(l, list) else l + l_div: List[int] = [l_div] * num_q_blocks if not isinstance(l_div, list) else l_div for n in range(num_q_blocks): - senn = _l_div_sensitive_gen(l[n], q_block_sizes[n]) + senn = _l_div_sensitive_gen(l_div[n], q_block_sizes[n]) col_names["sensitive"] += senn for cn in col_names: if cn != "sensitive": diff --git a/src/pymasq/metrics/risk_scores.py b/src/pymasq/metrics/risk_scores.py index 6630f70..57f0b03 100644 --- a/src/pymasq/metrics/risk_scores.py +++ b/src/pymasq/metrics/risk_scores.py @@ -1,11 +1,9 @@ -from typing import List, Callable, Dict, Union, Final -from pymasq.errors import InputError, NotInRangeError +from typing import List, Callable, Dict, Optional, Union, Final import numpy as np import pandas as pd from copy import copy -from typing import Callable, Dict, Final, List, Optional, Union from pymasq import BEARTYPE from pymasq.errors import InputError, NotInRangeError @@ -185,7 +183,7 @@ def is_k_anon_col( .rename(columns={"size": "k_count"}) ) adf["is_k_anon"] = adf["k_count"] > k - return pd.merge(df, adf, on=key_vars) + return pd.merge(df, adf, on=key_vars, how="left", validate="many_to_one") @BEARTYPE @@ -377,9 +375,9 @@ def is_l_diverse( """ if method is None or method == DISTINCT: - return _diversity(df, sensitive_col, _unique_count) <= l_thresh + return _diversity(df, sensitive_col, _unique_count)["l-diversity"] <= l_thresh elif method == ENTROPY: - return _diversity(df, sensitive_col, _entropy_count) <= np.log(l_thresh) + return _diversity(df, sensitive_col, _entropy_count)["l-diversity"] <= np.log(l_thresh) raise ValueError(f"method must be '{DISTINCT}' or '{ENTROPY}'") @@ -556,8 +554,9 @@ def _closeness( grp_qi = df.groupby(qi) # get the closeness qs = _get_probs(df, sensitive_col) - fun = lambda x: fxn(qs, x) - div = grp_qi[sensitive_col].agg(fun) + def _func(x): + return fxn(qs, x) + div = grp_qi[sensitive_col].agg(_func) counts = grp_qi[sensitive_col].agg("count") _t_closeness = [] @@ -661,15 +660,15 @@ def is_t_close( @BEARTYPE -def indiv_risk_approx(fk: Union[int, float], Fk: Union[int, float]) -> float: +def indiv_risk_approx(samp_freq: Union[int, float], pop_freq: Union[int, float]) -> float: """ calculates the approximate individual risk Parameters ---------- 
- fk : int or float + samp_freq : int or float the sample frequency of the row's combination of quasi-identifier values - Fk : int or float + pop_freq : int or float the population frequence of the row's combination of quasi-identifier values Returns @@ -682,28 +681,28 @@ def indiv_risk_approx(fk: Union[int, float], Fk: Union[int, float]) -> float: TODO """ - if fk == Fk: - return 1 / float(fk) + if samp_freq == pop_freq: + return 1 / float(samp_freq) - pk = float(fk) / float(Fk) + pk = float(samp_freq) / float(pop_freq) - if fk > 2: - return pk / (fk - (1 - pk)) - if fk == 2: + if samp_freq > 2: + return pk / (samp_freq - (1 - pk)) + if samp_freq == 2: return (pk / (1 - pk)) - (((pk / (1 - pk)) ^ 2) * np.log(1 / pk)) return (pk / (1 - pk)) * np.log(1 / pk) @BEARTYPE -def indiv_risk_exact(fk: int, Fk: float) -> float: +def indiv_risk_exact(samp_freq: int, pop_freq: float) -> float: """ calculates the exact individual risk Parameters ---------- - fk : int + samp_freq : int the sample frequency of the row's combination of quasi-identifier values - Fk : int + pop_freq : int the population frequence of the row's combination of quasi-identifier values Returns @@ -716,32 +715,32 @@ def indiv_risk_exact(fk: int, Fk: float) -> float: TODO """ - if fk == Fk: - return 1 / float(fk) + if samp_freq == pop_freq: + return 1 / float(samp_freq) - pk = float(fk) / float(Fk) + pk = float(samp_freq) / float(pop_freq) - def B(fk, pk, i): - b1 = (fk - 1 - i) ^ 2 / ((i + 2) * (fk - 2 - i)) - b2 = (pk ^ (i + 2 - fk) - 1) / (pk ^ (i + 1 - fk) - 1) + def b_func(samp_freq, pk, i): + b1 = (samp_freq - 1 - i) ^ 2 / ((i + 2) * (samp_freq - 2 - i)) + b2 = (pk ^ (i + 2 - samp_freq) - 1) / (pk ^ (i + 1 - samp_freq) - 1) return b1 * b2 - def BB(fk, pk): + def bb_func(samp_freq, pk): bb = 0 - for m in range(fk - 2): + for m in range(samp_freq - 2): b = 1 for m2 in range(m + 1): - b = b * B(fk, pk, m2) + b = b * b_func(samp_freq, pk, m2) bb = bb + (-1) ^ (m + 1) * b return bb - first = (pk / (1 - pk)) ^ fk - third = (-1) ^ fk * np.log(pk) + first = (pk / (1 - pk)) ^ samp_freq + third = (-1) ^ samp_freq * np.log(pk) - if fk > 2: - A = (pk ^ (1 - fk) - 1) / (fk - 1) - return first * ((A * (1 + BB(fk, pk))) + third) - if fk == 2: + if samp_freq > 2: + A = (pk ^ (1 - samp_freq) - 1) / (samp_freq - 1) + return first * ((A * (1 + bb_func(samp_freq, pk))) + third) + if samp_freq == 2: return (pk / (1 - pk)) - (((pk / (1 - pk)) ^ 2) * np.log(1 / pk)) return (pk / (1 - pk)) * np.log(1 / pk) @@ -816,7 +815,7 @@ def indiv_risk( f"Method must be in ['{APPROX}', '{EXACT}'] Method given was {method}" ) - return pd.merge(df, freq_count, how="left", on=quasi_cols + ["order"])["risk"] + return pd.merge(df, freq_count, how="left", on=quasi_cols + ["order"], validate="many_to_one")["risk"] @BEARTYPE @@ -868,7 +867,7 @@ def beta_likeness( InputError This error is raised when a `beta` value of <= 0 is supplied. 
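
The approximate individual-risk docstring above describes a piecewise function of the sample frequency and the population frequency. Note that `^` in the hunk reads as R-style exponentiation; in Python it is bitwise XOR, so an executable version of the same piecewise formula would presumably use `**`, as in this sketch (the function name is illustrative):

```python
import numpy as np

def approx_individual_risk(samp_freq: float, pop_freq: float) -> float:
    """Piecewise approximation of re-identification risk for one row."""
    if samp_freq == pop_freq:
        return 1.0 / float(samp_freq)
    pk = float(samp_freq) / float(pop_freq)
    if samp_freq > 2:
        return pk / (samp_freq - (1 - pk))
    if samp_freq == 2:
        return (pk / (1 - pk)) - ((pk / (1 - pk)) ** 2) * np.log(1 / pk)
    return (pk / (1 - pk)) * np.log(1 / pk)

# A quasi-identifier combination seen twice in the sample and ten times in the population:
print(round(approx_individual_risk(2, 10), 4))  # ~0.1494
```
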
""" - if not beta > 0: + if beta <= 0: raise InputError("beta must be a value greater than 0") qi = ( # Generate a list of all quasi-indicators (qi) [colname for colname in df.columns if colname != sensitive_col] @@ -891,7 +890,7 @@ def beta_likeness( item, sensitive_col ) # get the frequencies of SA values in the equivalence class for key in sa_ec.keys(): - if not sa_all[key] < sa_ec[key]: # satisfies the requirement that p_i < q_i + if sa_all[key] >= sa_ec[key]: # satisfies the requirement that p_i < q_i continue dist = (sa_ec[key] - sa_all[key]) / sa_all[key] # (q_i - p_i) / p_i if enhanced: diff --git a/tests/metrics/test_risk_scores.py b/tests/metrics/test_risk_scores.py index f935765..d96fdf5 100644 --- a/tests/metrics/test_risk_scores.py +++ b/tests/metrics/test_risk_scores.py @@ -28,6 +28,7 @@ def my_df(): LETTER_SET = ["A", "B", "C", "A", "B", "A", "C", "C", "B"] +true_assert_statement: str = "Should be True" @pytest.fixture def letter_df(): @@ -39,7 +40,7 @@ def test_l_diversity_all_same(): Tests l-diversity function """ df = generate_l_diverse_table(2) - assert l_diversity(df, "sensitive", 3) == 1.0, "Should be True" + assert l_diversity(df, "sensitive", 3) == pytest.approx(1.0), true_assert_statement def test_l_diversity_variety(): @@ -47,7 +48,7 @@ def test_l_diversity_variety(): Tests l-diversity function """ df = generate_l_diverse_table([2, 3, 3, 2, 2]) - assert l_diversity(df, "sensitive", 2) == 0.6, "Should be True" + assert l_diversity(df, "sensitive", 2) == pytest.approx(0.6), true_assert_statement def test_t_closeness_num(): @@ -70,7 +71,7 @@ def test_t_closeness_num(): assert ( t_closeness(tc_table, "sensitive", test=True, datatype="numeric") == expected_result - ), "Should be True" + ), true_assert_statement def test_t_closeness_cat(): @@ -93,7 +94,7 @@ def test_t_closeness_cat(): assert ( t_closeness(tc_table, "sensitive", test=True, datatype="categorical") == expected_result - ), "Should be True" + ), true_assert_statement def test_t_closeness(): @@ -102,8 +103,8 @@ def test_t_closeness(): """ tc_table = generate_t_close_table(LETTER_SET) assert ( - t_closeness(tc_table, "sensitive", datatype="categorical", t=0.0) == 1.0 - ), "Should be True" + t_closeness(tc_table, "sensitive", datatype="categorical", t=0.0) == pytest.approx(1.0) + ), true_assert_statement def test_beta_likeness_1(letter_df): @@ -119,7 +120,7 @@ def test_beta_likeness_2(letter_df): Tests beta-likeness on a toy dataset with a very small beta (any information gain should fail) """ assert ( - beta_likeness(letter_df, "sensitive", beta=1e-9) == 4.0 / 9.0 + beta_likeness(letter_df, "sensitive", beta=1e-9) == pytest.approx(4.0 / 9.0) ), "Should fail beta likeness on the 2 A's in EC2 and 2 C's in EC3" @@ -158,7 +159,7 @@ def test_auc_score_1(my_df, method, preprocessor): preprocessor=preprocessor, **kwargs, ) - == 1.0 + == pytest.approx(1.0) ), "Result should be equal to 1.0 (i.e. True)" @@ -186,7 +187,7 @@ def test_auc_score_2(my_df, method, preprocessor): ), 3, ) - assert score == 1.0, "Result should be equal to 1.0 (i.e. True)" + assert score == pytest.approx(1.0), "Result should be equal to 1.0 (i.e. True)" answer_key = { diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index 43775fb..c348dec 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -54,7 +54,7 @@ def my_df(): ( RFClassifier, EmbeddingsEncoder, - 0.61, + 0.57, "cache_test/053cb5e57bfa9b5c9568625cb22588dd.ENCV.e81a5b5eb0df48bc68540d7b71342a7d.pkl", """ENCV. 
Description: Preprocessed with First ten rows: From 392d53ee772dd4a1e8181becaf4f561957ea4d30 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 4 Jan 2024 04:54:19 -0500 Subject: [PATCH 14/17] Updates readme --- tests/integration/README.md | 38 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index aad113e..5a02e38 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -1,4 +1,5 @@ -Integration Testing +# Integration Testing + ------------------- The integration tests will test each `pymasq.mitigation` and `pymasq.metric` available using `pymasq.optimization` procedures. @@ -8,8 +9,9 @@ Any new functionality that is to be tested must be specified in its own configur The `template_config.yaml` file describes the expected format of the configurations. -User Guide ----------- +## User Guide + +------------------- The `integration.py` script is run via the command line. @@ -22,13 +24,13 @@ For top-level use: ### Help Display available actions. - + $ python integration.py [ -h | --help ] ### Verbose Display additional logging info to terminal. _Optional: default is False_. - + $ python integration.py [ -v | --verbose ] ### Test Configuration @@ -37,39 +39,33 @@ Set the complete file path of the test configuration YAML file to use. $ python integration.py [ --test-config ] -### Iterations +### Iterations Set the number of `iters` to run the optimization procedures. This will **not** overwrite `iters` if set in the config file. _Optional: default is 1000000000_. $ python integration.py [ -i | --iters ] +## Configuration Files -Configuration Files ------------------- The integration tests run with the parameters specified in two YAML configuration files, `core_config.yaml` and `test_config.yaml`. -These files should define the configuration of all tests to be run. A third configuration file, `template_config.yaml`, is also included -and provides the schema for how proper configuration files can be defined. +These files should define the configuration of all tests to be run. A third configuration file, `template_config.yaml`, is also included and provides the schema for how proper configuration files can be defined. - The `core_config.yaml` contains the configurations for the mitigations and metrics that -have been vetted previously. +have been vetted previously. > **This file should only be modified when adding a new mitigation or metric that has already been tested.** -- The `test_config.yaml` is intended to include an example configuration file for new functionality. -The configuration in this file will add to or update/overwrite the configuration loaded from `config_core.yaml`. Use the +- The `test_config.yaml` is intended to include an example configuration file for new functionality. The configuration in this file will add to or update/overwrite the configuration loaded from `config_core.yaml`. Use the `--test-config` flag to specify the file path to a different configuration file to be tested. Note that comments to the YAML files can be included by adding "`#`" in any part of the file. -Default Behavior ----------------- -If no optimization procedure is defined the in the configuration file to be tested (e.g., `test_config.yaml`), then -only the `pymasq.optimization.ExhaustiveSearch` procedure will be run. This procedure will test -all permutations in `pymasq.mitigations` and may be time-consuming. 
In this case, you can use the `--iters` flag to -constraint the number of iterations to run. +## Default Behavior + +------------------- -Note that permutations are not applied and evaluated all at once, but rather incrementally. -That is, a mitigation strategy composed of 3 mitigations, will have 6 permutations and will run for 18 iterations, -while a mitigation strategy composed of 6 mitigations will have 720 permutations and will run for 4,320 iterations. +If no optimization procedure is defined the in the configuration file to be tested (e.g., `test_config.yaml`), then only the `pymasq.optimization.ExhaustiveSearch` procedure will be run. This procedure will test all permutations in `pymasq.mitigations` and may be time-consuming. In this case, you can use the `--iters` flag to constraint the number of iterations to run. +Note that permutations are not applied and evaluated all at once, but rather incrementally. That is, a mitigation strategy composed of 3 mitigations, will have 6 permutations and will run for 18 iterations, while a mitigation strategy composed of 6 mitigations will have 720 permutations and will run for 4,320 iterations. From 1633fb2c9234f5f0b2466c5181e2f32cb83fc0c5 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Thu, 4 Jan 2024 05:06:00 -0500 Subject: [PATCH 15/17] Updates to py310 --- setup.cfg | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/setup.cfg b/setup.cfg index 31608d1..effd9d5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,7 +11,7 @@ author = Cuyler OBrien, Jaime Pena, Evan Young, Brian Levine, Eric Wybenga author_email = cuyler.obrien@ll.mit.edu, jdpena@ll.mit.edu, evan.young@ll.mit.edu [options] -python_requires = >= 3.9 +python_requires = >= 3.10 packages = find: package_dir = = src @@ -29,11 +29,8 @@ install_requires = tensorflow~=2.9 tpot[dask]~=0.11 tests_require = - beartype>=0.5.1 - hypothesis>=4.53.2 - pytest>=3.8 - pytest-xdist~=3.5 - + beartype>=0.5.1 + pytest~=7.4 [options.packages.find] where = src @@ -45,7 +42,7 @@ python_files=test_*.py testpaths=tests [tox:tox] -envlist = py3{9,10,11}, coverage, bandit, owasp-depcheck +envlist = py3{10,11}, coverage, bandit, owasp-depcheck toxworkdir = build/tox [testenv] From c3b301da37752ddf46704ba98cf097e5d2fc9376 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Sat, 6 Jan 2024 15:57:49 -0500 Subject: [PATCH 16/17] remove boruta because it has not been updated --- setup.cfg | 3 +- src/pymasq/__init__.py | 2 +- src/pymasq/kve/kve.py | 127 +------------------------- tests/classifiers/test_classifiers.py | 4 +- tests/kve/test_kve.py | 96 +------------------ 5 files changed, 10 insertions(+), 222 deletions(-) diff --git a/setup.cfg b/setup.cfg index effd9d5..19ef812 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,10 +16,9 @@ packages = find: package_dir = = src install_requires = - boruta~=0.3 bpemb~=0.3 matplotlib~=3.5 - numpy~=1.22 + numpy~=1.26 pandas~=1.4 plotly>=4.11.0 SALib~=1.4 diff --git a/src/pymasq/__init__.py b/src/pymasq/__init__.py index 37450ba..9687132 100644 --- a/src/pymasq/__init__.py +++ b/src/pymasq/__init__.py @@ -1,6 +1,6 @@ from os import path -__version__ = "0.6.5" +__version__ = "0.6.6" try: diff --git a/src/pymasq/kve/kve.py b/src/pymasq/kve/kve.py index edba759..a9ccac1 100644 --- a/src/pymasq/kve/kve.py +++ b/src/pymasq/kve/kve.py @@ -8,8 +8,6 @@ from numpy import ndarray import pandas as pd import statsmodels.api as sm -import json -from boruta import BorutaPy from pandas.api.types import is_numeric_dtype from sklearn.ensemble import RandomForestClassifier, 
RandomForestRegressor from sklearn.feature_selection import RFECV @@ -26,12 +24,10 @@ "key_variable_exploration", "importance_scores", "random_forest_scores", - "boruta_scores", "rfe_scores", "stepwise_scores", "stepwise_selection", "RANDOM_FOREST", - "BORUTA", "RFE", "INCLUDE", "VARIABLE", @@ -43,7 +39,6 @@ RANDOM_FOREST: Final = "Random_Forest" -BORUTA: Final = "Boruta" RFE: Final = "RFE" STEPWISE: Final = "Stepwise" INCLUDE: Final = "Include" @@ -136,7 +131,7 @@ def key_variable_exploration( **kwargs Additional arguments to be passed to `importance_Scores`: - * methods : Tuple[str], optional Default: ('rf', 'boruta', 'rfe', 'stepwise') + * methods : Tuple[str], optional Default: ('rf', 'rfe', 'stepwise') Names of the ranking methods to run. Returns @@ -162,7 +157,7 @@ def key_variable_exploration( normalize=True, ) - methods = kwargs.get("methods", (RANDOM_FOREST, BORUTA, RFE, STEPWISE)) + methods = kwargs.get("methods", (RANDOM_FOREST, RFE, STEPWISE)) categories = len(df[sensitive_col].dropna().unique()) if categories < 2: print( @@ -238,7 +233,7 @@ def importance_scores( Number of categories in the senestive column used to determine the type of model used in feature selection, -1 indicates the column is continuous - methods : Tuple[str], optional (Default: "Random_Forest","Boruta","RFE", "Stepwise") + methods : Tuple[str], optional (Default: "Random_Forest","RFE", "Stepwise") Names of the ranking methods to run verbose : int {0, 1, 2}, (Default: 0) @@ -256,7 +251,7 @@ def importance_scores( "callback", None ) # callable function that emits to main server if methods is None: - methods = (RANDOM_FOREST, BORUTA, RFE, STEPWISE) + methods = (RANDOM_FOREST, RFE, STEPWISE) method_len = float(len(methods)) # instantiated for progress emits method_count = 1 # instantiated for progress emits x_rf = input_df.drop([sensitive_col], axis=1) @@ -274,15 +269,6 @@ def importance_scores( if progress_reporter is not None: progress_reporter(method_count / method_len) method_count += 1 - if BORUTA in methods and x_train.shape[0] >= 250: - if verbose > 0: - print("Running Boruta...") - score_dict[f"{BORUTA}_{INCLUDE}"] = boruta_scores( - x_train, y, verbose=verbose, categories=categories - ) - if progress_reporter is not None: - progress_reporter(method_count / method_len) - method_count += 1 if RFE in methods: if verbose > 0: print("Running Recursive Feature Elimination...") @@ -392,109 +378,6 @@ def random_forest_scores( return rf.feature_importances_, include -@BEARTYPE -def boruta_scores( - x_train: pd.DataFrame, - y: pd.Series, - categories: int, - n_estimators: int = 1000, - n_jobs: int = -1, - random_state: int = 1234, - verbose: int = 0, - max_iter: int = 50, -) -> List[str]: - """ - Boruta is an all relevant feature selection method, while most other are - minimal optimal; this means it tries to find all features carrying - information usable for prediction, rather than finding a possibly compact - subset of features on which some classifier has a minimal error - - - NOTE: Does not work with small data, requires >250 rows - - Parameters - ---------- - x_train : pd.DataFrame - A dataframe containing all input variables for training the model - - y : pd.Series - A series containing the ground truth labels or numbers - - categories: int - number of categories in the senestive column used to determine the type - of model used in feature selection, -1 indicates the column is continuous - - n_estimators : int, optional (Default: 1000) - Number of trees that are constructed during the random forest - - 
n_jobs : int, optional (Default: -1) - Number of workers to use for parallel processing - - -1 indicates use all available workers - - random_state: int, optional (Default: 1234) - Integer seed for setting the random state in the model - - verbose : int {0, 1, 2}, optional (Default 2) - Level of reporting from the algorithms: - - 0 disables verbose logging - - 2 is step-by-step reporting - - max_iter: int, optional (Default: 50) - The number of maximum iterations to perform. - - Returns - ------- - List[str] - list of strings, contains whether a feature should be included in - further analysis: - - "yes": boruta ranking = 1 - - "maybe": boruta ranking = 2 - - "no": boruta ranking >= 3 - - References - ---------- - https://medium.com/@indreshbhattacharyya/feature-selection-categorical-feature-selection-boruta-light-gbm-chi-square-bf47e94e2558 - - """ - if x_train.shape[0] < 250: - print("Requires > 250 rows to be stable") - return [] - if categories >= 2: - rf = RandomForestClassifier( - n_estimators=n_estimators, - n_jobs=n_jobs, - verbose=verbose, - random_state=random_state, - ) - else: - rf = RandomForestRegressor( - n_estimators=n_estimators, - n_jobs=n_jobs, - verbose=verbose, - random_state=random_state, - ) - boruta_selector = BorutaPy( - rf, - verbose=verbose, - n_estimators="auto", - random_state=random_state, - max_iter=max_iter, - ) - if isinstance(x_train, np.ndarray): - boruta_selector.fit(x_train, y) - else: - boruta_selector.fit(x_train.values, y.values) - include = [] - for r in list(boruta_selector.ranking_): - if r == 1: - include.append("yes") - elif r == 2: - include.append("maybe") - else: - include.append("no") - return include - - @BEARTYPE def rfe_scores( x_train: pd.DataFrame, @@ -638,7 +521,7 @@ def rfe_scores( multi_class="ovr", ) else: - estimator = LinearRegression(normalize=True, n_jobs=n_jobs) + estimator = LinearRegression(n_jobs=n_jobs) rfecv_selector = RFECV(estimator, step=step, cv=cv, verbose=verbose, n_jobs=n_jobs) rfecv_selector.fit(x_train, y) return ["yes" if r == 1 else "no" for r in list(rfecv_selector.ranking_)] diff --git a/tests/classifiers/test_classifiers.py b/tests/classifiers/test_classifiers.py index 62ddc12..c220321 100644 --- a/tests/classifiers/test_classifiers.py +++ b/tests/classifiers/test_classifiers.py @@ -37,8 +37,8 @@ def my_df(): (LogisticRegressionClassifier, EmbeddingsEncoder, 0.5), (RFClassifier, LabelEncoderPM, 1.0), (RFClassifier, EmbeddingsEncoder, 1.0), - (TpotClassifier, LabelEncoderPM, 0.77), - (TpotClassifier, EmbeddingsEncoder, 0.86), + (TpotClassifier, LabelEncoderPM, 0.8), + (TpotClassifier, EmbeddingsEncoder, 0.81), ], ) def test_classifiers(my_df, combo): diff --git a/tests/kve/test_kve.py b/tests/kve/test_kve.py index d2ab2fa..d5d0b7e 100644 --- a/tests/kve/test_kve.py +++ b/tests/kve/test_kve.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from pymasq.kve import random_forest_scores, boruta_scores, rfe_scores, stepwise_scores +from pymasq.kve import random_forest_scores, rfe_scores, stepwise_scores from pymasq.datasets import gen_num_df, gen_bin_df, load_census from pymasq.preprocessing import EmbeddingsEncoder from pymasq import ROOT_DIR @@ -71,28 +71,6 @@ def test_random_forest_cont(my_df): ) assert len(rf[1]) > 0, "Should be True" - -def test_boruta_cont(my_df): - """ - Tests boruta_scores if passed a continuous variable for y - """ - sensitive_col = "age" - my_df = EmbeddingsEncoder.encode( - my_df, - sensitive_col=sensitive_col, - cache_location=ROOT_DIR + "/datasets/data/cache", - ) - rf = 
boruta_scores( - x_train=my_df.drop(sensitive_col, axis=1), - y=my_df[sensitive_col], - verbose=0, - categories=-1, - max_iter=5, - n_estimators=20, - ) - assert len(rf[1]) > 0, "Should be True" - - def test_rfe_cont(my_df): """ Tests rfe_scores if passed a continuous variable for y @@ -134,29 +112,6 @@ def test_random_forest_multiclass(my_df): assert len(rf[1]) > 0, "Should be True" -def test_boruta_multiclass(my_df): - """ - Tests boruta_scores if passed a variable with number of categories > 2 for y - """ - sensitive_col = "education" - my_df = EmbeddingsEncoder.encode( - my_df, - sensitive_col=sensitive_col, - cache_location=ROOT_DIR + "/datasets/data/cache", - ) - y = my_df[sensitive_col] - n_cats = len(y.dropna().unique()) - rf = boruta_scores( - x_train=my_df.drop(sensitive_col, axis=1), - y=y, - verbose=0, - categories=n_cats, - max_iter=5, - n_estimators=20, - ) - assert len(rf[1]) > 0, "Should be True" - - def test_rfe_multiclass(my_df): """ Tests rfe_scores if passed a variable with number of categories > 2 for y @@ -194,20 +149,6 @@ def test_random_forest_bin(bin_df): ), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]" -def test_boruta_bin(bin_df): - """ - Tests boruta_scores feature importance ranks for a binary dataframe - of a given size. - """ - y = bin_df["Label"] - n_cats = len(y.dropna().unique()) - assert boruta_scores( - x_train=bin_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats - ) == ["yes"] * 5 + [ - "maybe" - ], "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'maybe']" - - def test_rfe_bin(bin_df): """ Tests rfe_scores feature importance ranks for a binary dataframe @@ -241,24 +182,6 @@ def test_random_forest_num(num_df): ), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]" -def test_boruta_num(num_df): - """ - Tests boruta_scores feature importance ranks for a numeric dataframe - of a given size. - """ - y = num_df["Label"] - n_cats = len(y.dropna().unique()) - assert ( - boruta_scores( - x_train=num_df.drop("Label", axis=1), - y=y, - verbose=0, - categories=n_cats, - ) - == ["yes"] * 6 - ), "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'yes']" - - def test_rfe_num(num_df): """ Tests rfe_scores feature importance ranks for a numeric dataframe @@ -277,23 +200,6 @@ def test_rfe_num(num_df): ), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']" -def test_boruta_comb(comb_df): - """ - Tests boruta_scores feature importance ranks for a combined dataframe - of a given size. 
- """ - if comb_df.shape[0] <= 2000: - assert True - y = comb_df["Label"] - n_cats = len(y.dropna().unique()) - scores = boruta_scores( - x_train=comb_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats - ) - assert ( - scores == ["yes"] * 5 + ["maybe"] + ["yes"] * 6 - ), "One 'maybe' at index 5, otherwise all 'yes" - - def test_random_forest_comb(comb_df): """ Tests random_forest_scores feature importance ranks for a combined dataframe From bed7a7d5c581be44fdbb53fceb3739b5bd222e82 Mon Sep 17 00:00:00 2001 From: cdo03c Date: Wed, 17 Jan 2024 20:41:05 -0500 Subject: [PATCH 17/17] replaces print statements with logs --- src/pymasq/config.py | 5 ++ src/pymasq/kve/kve.py | 14 ++-- src/pymasq/metrics/utils.py | 5 +- src/pymasq/mitigations/geom_transform.py | 7 +- src/pymasq/mitigations/microaggregation.py | 2 +- src/pymasq/mitigations/pram.py | 9 ++- src/pymasq/mitigations/shuffle.py | 4 +- src/pymasq/models/_base.py | 7 +- src/pymasq/models/models.py | 36 ++++++---- src/pymasq/optimizations/_base.py | 24 +++---- src/pymasq/optimizations/optimizations.py | 73 ++++++++++---------- src/pymasq/preprocessing/entity_embedding.py | 12 ++-- src/pymasq/preprocessing/preprocess.py | 26 ++++--- src/pymasq/utils/cache.py | 30 ++++---- src/pymasq/utils/utils.py | 13 ++-- tests/classifiers/test_classifiers.py | 21 +++--- tests/integration/integration.py | 30 ++++---- tests/metrics/test_utility_scores.py | 10 +-- tests/mitigations/test_global_recode.py | 16 +++-- tests/mitigations/test_hashing.py | 12 ++-- tests/preprocessing/test_preprocess.py | 13 ++-- tests/utils/test_cache.py | 16 +++-- 22 files changed, 217 insertions(+), 168 deletions(-) diff --git a/src/pymasq/config.py b/src/pymasq/config.py index 57f1b3a..f965170 100644 --- a/src/pymasq/config.py +++ b/src/pymasq/config.py @@ -1,6 +1,8 @@ from pathlib import Path from typing import Tuple +import numpy as np + # Directory where all embeddings and models will be cached CACHE_LOCATION: Path = Path("~/.cache/pymasq").expanduser() @@ -26,6 +28,7 @@ CLASSIFIER_MODELS: Tuple[str] = ("logreg", "rfclass", "tpotclass") DEFAULT_LOGISITIC_REGRESSION_SOLVER: str = "saga" +DEFAULT_MODEL_ITERATIONS: int = 1000 # Byte Pair Encoding default language and dimensionality for vectors BPE_LANG: str = "en" @@ -39,3 +42,5 @@ # Default number of parallel processors, set to -1 for all processors DEFAULT_N_JOBS: int = -1 + +rg = np.random.default_rng(DEFAULT_SEED) \ No newline at end of file diff --git a/src/pymasq/kve/kve.py b/src/pymasq/kve/kve.py index a9ccac1..75add12 100644 --- a/src/pymasq/kve/kve.py +++ b/src/pymasq/kve/kve.py @@ -160,7 +160,7 @@ def key_variable_exploration( methods = kwargs.get("methods", (RANDOM_FOREST, RFE, STEPWISE)) categories = len(df[sensitive_col].dropna().unique()) if categories < 2: - print( + logger.info( "The kve function requires two categories for binary classification and the {} column has {} class".format( sensitive_col, categories ) @@ -173,7 +173,7 @@ def key_variable_exploration( df, sensitive_col, categories=categories, verbose=verbose, **kwargs ) if verbose > 0: - print("Building ranking...") + logger.info("Building ranking...") include_cols = [c for c in rank_df.columns if INCLUDE in c] rank_df[INCLUDE] = rank_df.apply( @@ -261,7 +261,7 @@ def importance_scores( score_dict = {} if RANDOM_FOREST in methods: if verbose > 0: - print("Running Random Forest...") + logger.info("Running Random Forest...") ( score_dict[RANDOM_FOREST], score_dict[f"{RANDOM_FOREST}_{INCLUDE}"], @@ -271,7 +271,7 @@ def importance_scores( 
method_count += 1 if RFE in methods: if verbose > 0: - print("Running Recursive Feature Elimination...") + logger.info("Running Recursive Feature Elimination...") score_dict[f"{RFE}_{INCLUDE}"] = rfe_scores( x_train, y, verbose=verbose, categories=categories ) @@ -280,7 +280,7 @@ def importance_scores( method_count += 1 if STEPWISE in methods: if verbose > 0: - print("Running Stepwise...") + logger.info("Running Stepwise...") score_dict[f"{STEPWISE}_{INCLUDE}"] = stepwise_scores( x_rf, y_rf, verbose=verbose ) @@ -654,7 +654,7 @@ def stepwise_selection( tested.append(best_feature) changed = True if verbose > 0: - print("Add {:30} with p-value {:.6}".format(best_feature, best_pval)) + logger.info("Add {:30} with p-value {:.6}".format(best_feature, best_pval)) # backward step model = sm.OLS(y, sm.add_constant(pd.DataFrame(x_train[included]))).fit() @@ -666,7 +666,7 @@ def stepwise_selection( worst_feature = included[pvalues.argmax()] included.remove(worst_feature) if verbose > 0: - print("Drop {:30} with p-value {:.6}".format(worst_feature, worst_pval)) + logger.info("Drop {:30} with p-value {:.6}".format(worst_feature, worst_pval)) if not changed: break count += 1 diff --git a/src/pymasq/metrics/utils.py b/src/pymasq/metrics/utils.py index 54c9f02..b33043f 100644 --- a/src/pymasq/metrics/utils.py +++ b/src/pymasq/metrics/utils.py @@ -1,3 +1,5 @@ +import logging + from pymasq.config import CATEGORY_THRESHOLD import pandas as pd @@ -7,6 +9,7 @@ __all__ = ["uniq_col_name", "_get_model_task"] +logger = logging.getLogger(__name__) @BEARTYPE def uniq_col_name(df, prefix: str = "class") -> str: @@ -61,7 +64,7 @@ def _get_model_task( elif is_numeric_dtype(sensitive_col): return "regression" else: - print( + logger.info( "The number of unique categories: {} is greater than the threshold of {} and is dtype {}".format( num_unique, cat_threshold, sensitive_col.dtype ) diff --git a/src/pymasq/mitigations/geom_transform.py b/src/pymasq/mitigations/geom_transform.py index 6c89a63..d5dd118 100644 --- a/src/pymasq/mitigations/geom_transform.py +++ b/src/pymasq/mitigations/geom_transform.py @@ -1,4 +1,5 @@ import itertools +import logging import numpy as np import pandas as pd @@ -7,7 +8,7 @@ from typing import List, Optional, Union from pymasq import BEARTYPE -from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES, DEFAULT_SEED +from pymasq.config import FORMATTING_ON_OUTPUT, FORMATTING_IGNORE_DTYPES, rg from pymasq.errors import InputError from pymasq.mitigations.utils import _is_identical from pymasq.utils import formatting @@ -15,7 +16,7 @@ __all__ = ["geom_transform"] -rg = np.random.default_rng(DEFAULT_SEED) +logger = logging.getLogger(__name__) SKIP_ROTATION_ANGLES = [30, 45, 60, 90, 120, 135, 150, 180] MAX_DEGREES = 180 @@ -231,7 +232,7 @@ def geom_transform( f"The values of `data[{perturb_cols}]` are all identical and therefore cannot be used for correlation." ) else: - print( + logger.info( "WARNING: ignoring columns that are composed entirely of identical values." 
) elif len(perturb_cols) == 1: diff --git a/src/pymasq/mitigations/microaggregation.py b/src/pymasq/mitigations/microaggregation.py index a629b30..a86196a 100644 --- a/src/pymasq/mitigations/microaggregation.py +++ b/src/pymasq/mitigations/microaggregation.py @@ -548,7 +548,7 @@ def robust_magg( # test data for normality; z-scores are only meaningful for normally distributed data result = shapiro(data) if result.pvalue < 0.05: - print( + logger.info( f"Warning: data not normally distributed; fails Shapiro-Wilk test (p={result.pvalue})." ) diff --git a/src/pymasq/mitigations/pram.py b/src/pymasq/mitigations/pram.py index 17e368f..6c2c9d5 100644 --- a/src/pymasq/mitigations/pram.py +++ b/src/pymasq/mitigations/pram.py @@ -1,8 +1,9 @@ +import logging +from typing import Dict, List, Optional, Union + import pandas as pd import numpy as np -from typing import Dict, List, Optional, Union - from pymasq import BEARTYPE from pymasq.config import ( DEFAULT_SEED, @@ -15,6 +16,8 @@ __all__ = ["pram"] +logger = logging.getLogger(__name__) + rg = np.random.default_rng(DEFAULT_SEED) def __calc_transition_matrix( @@ -301,7 +304,7 @@ def pram( if len(perturb_cols) == 0: raise InputError("All values of `data` cannot be NaNs or identical.") else: - print( + logger.info( "WARNING: ignoring columns that are composed entirely of identical values." ) diff --git a/src/pymasq/mitigations/shuffle.py b/src/pymasq/mitigations/shuffle.py index 79dd284..af37fa5 100644 --- a/src/pymasq/mitigations/shuffle.py +++ b/src/pymasq/mitigations/shuffle.py @@ -1,3 +1,4 @@ +import logging import math from typing import Union, List, Final, Optional @@ -25,6 +26,7 @@ "MODEL", ] +logger = logging.getLogger(__name__) SPEARMAN: Final = "spearman" PEARSON: Final = "pearson" @@ -212,7 +214,7 @@ def shuffle( f"The values of `data[{cor_cols}]` are all identical and therefore cannot be used for correlation." ) else: - print( + logger.info( "WARNING: ignoring columns that are composed entirely of identical values." ) diff --git a/src/pymasq/models/_base.py b/src/pymasq/models/_base.py index 2cc0efa..898a9c3 100644 --- a/src/pymasq/models/_base.py +++ b/src/pymasq/models/_base.py @@ -1,3 +1,4 @@ +import logging import os from abc import abstractmethod from typing import Type, Optional, Union @@ -9,6 +10,8 @@ from pymasq.preprocessing._base import PreprocessorBase from pymasq import BEARTYPE +logger = logging.getLogger(__name__) + class ModelingBase: """ @@ -106,7 +109,7 @@ def train( if not retrain: self.load_trained_model(df, verbose) # sets self.trained from file if self.trained and verbose > 0: - print( + logger.info( f"{self.name}: loading trained model from cache. 
(Set retrain=True to ignore cache.)" ) @@ -186,7 +189,7 @@ def save_trained_model( verbose=verbose, ) if verbose > 0: - print(f"{self.name} model trained and saved to: {filename}") + logger.info(f"{self.name} model trained and saved to: {filename}") @BEARTYPE def load_trained_model( diff --git a/src/pymasq/models/models.py b/src/pymasq/models/models.py index 5e19ede..d4debf1 100644 --- a/src/pymasq/models/models.py +++ b/src/pymasq/models/models.py @@ -1,14 +1,16 @@ -from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER +import logging +from typing import List, Optional, Type, Any, Union + import pandas as pd import numpy as np -from typing import List, Optional, Type, Any, Union -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder, StandardScaler from tpot import TPOTClassifier, TPOTRegressor from sklearn.metrics import roc_auc_score from sklearn.linear_model import ElasticNetCV, ElasticNet, LarsCV, LogisticRegressionCV from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from pymasq import BEARTYPE +from pymasq.config import DEFAULT_LOGISITIC_REGRESSION_SOLVER, DEFAULT_MODEL_ITERATIONS from pymasq.models._base import ModelingBase from pymasq.preprocessing._base import PreprocessorBase @@ -24,6 +26,8 @@ ######################### +logger = logging.getLogger(__name__) + def mape( y_true: Union[pd.Series, List[float]], y_pred: Union[pd.Series, List[float]] @@ -106,7 +110,7 @@ def train( # We didn't: train the model and cache it. if verbose > 0: - print("Training LarsCV model ") + logger.info("Training LarsCV model ") x_train = df_enc.drop(y_column, axis=1) y = df_enc[y_column] self.trained = LarsCV(n_jobs=self.n_jobs) @@ -198,7 +202,7 @@ def train( # no cache found, we need to train. if verbose > 0: - print("Training ElasticNetCV model ") + logger.info("Training ElasticNetCV model ") x_train = df_enc.drop(y_column, axis=1) y = df_enc[y_column] @@ -263,8 +267,9 @@ def __init__( A string defining the type of modeling task """ super().__init__( - name="logreg", cache_location=cache_location, modeling_task=modeling_task + name="logreg", cache_location=cache_location, modeling_task=modeling_task, ) + self.scaler = StandardScaler() @BEARTYPE def train( @@ -307,16 +312,18 @@ def train( # no cache found, we need to train. if verbose > 0: - print("Training Logistic Regression model ") + logger.info("Training Logistic Regression model ") x_train = df_enc.drop(y_column, axis=1) + x_scaled = self.scaler.fit_transform(x_train) y = LabelEncoder().fit_transform(df_enc[y_column]) self.trained = LogisticRegressionCV( random_state=self.seed, n_jobs=self.n_jobs, solver=DEFAULT_LOGISITIC_REGRESSION_SOLVER, + max_iter=DEFAULT_MODEL_ITERATIONS, ) - self.trained.fit(x_train, y) + self.trained.fit(x_scaled, y) # save to cache self.save_trained_model( @@ -344,12 +351,13 @@ def predict(self, x_test: pd.DataFrame, y_true: pd.Series) -> float: """ assert self.trained is not None + x_scaled = self.scaler.fit_transform(x_test) if pd.Series(y_true).nunique() == 2: - y_predict = self.trained.predict(x_test) + y_predict = self.trained.predict(x_scaled) return roc_auc_score(y_true=y_true.tolist(), y_score=y_predict) else: - y_predict = self.trained.predict_proba(x_test) + y_predict = self.trained.predict_proba(x_scaled) return roc_auc_score( y_true=y_true.tolist(), y_score=y_predict[:, 1:], multi_class="ovr" ) @@ -416,7 +424,7 @@ def train( # no cache found, we need to train. 
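
The logistic-regression changes above add a `StandardScaler` and raise `max_iter`, the usual recipe for getting the saga solver to converge on unscaled features. In the conventional scikit-learn pattern the scaler is fit once on training data and only `transform` is reused at prediction time, as in this generic illustration (not pymasq's exact class):

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn scaling statistics on training data only
X_test_scaled = scaler.transform(X_test)        # reuse the same statistics at predict time

clf = LogisticRegressionCV(solver="saga", max_iter=1000, random_state=1234)
clf.fit(X_train_scaled, y_train)
print(clf.score(X_test_scaled, y_test))
```
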
if verbose > 0: - print("Training Logistic Regression model ") + logger.info("Training Logistic Regression model ") x_train = df_enc.drop(y_column, axis=1) y = LabelEncoder().fit_transform(df_enc[y_column]) @@ -518,7 +526,7 @@ def train( # We didn't: train the model and cache it. if verbose > 0: - print("Training LarsCV model ") + logger.info("Training LarsCV model ") x_train = df_enc.drop(y_column, axis=1) y = LabelEncoder().fit_transform(df_enc[y_column]) self.trained = RandomForestRegressor(n_jobs=self.n_jobs, random_state=self.seed) @@ -671,7 +679,7 @@ def train( # No cache available, we need to train if verbose > 0: - print(f"{type(self).__name__} Training new model.") + logger.info(f"{type(self).__name__} Training new model.") tpot = TPOTClassifier( generations=int(generations), @@ -852,7 +860,7 @@ def train( # No cache available, we need to train if verbose > 0: - print(f"{type(self).__name__} Training new model.") + logger.info(f"{type(self).__name__} Training new model.") tpot = TPOTRegressor( generations=int(generations), diff --git a/src/pymasq/optimizations/_base.py b/src/pymasq/optimizations/_base.py index 8db85f7..e81d472 100644 --- a/src/pymasq/optimizations/_base.py +++ b/src/pymasq/optimizations/_base.py @@ -1,15 +1,14 @@ import copy import inspect -import numpy as np -import pandas as pd +import logging from abc import abstractmethod - from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import numpy as np +import pandas as pd import pymasq.mitigations as mits import pymasq.metrics as mets - from pymasq import BEARTYPE from pymasq.config import DEFAULT_SEED from pymasq.errors import ( @@ -19,6 +18,7 @@ NoMutationAvailableError, ) +logger = logging.getLogger(__name__) rg = np.random.Generator(np.random.PCG64(DEFAULT_SEED)) class OptimizationBase: @@ -167,7 +167,7 @@ def __init__( if not self.reuse_mutations and self.iters > n_mutations: self.iters = n_mutations if self.verbose: - print( + logger.info( ">>> [Info]: The number of iterations (%i)" % (iters), "cannot exceed the number of mutations specified (%i)" % (n_mutations), @@ -251,7 +251,7 @@ def optimize(self) -> Tuple[pd.DataFrame, float, pd.DataFrame]: A dataframe with the records of each dataframe, mutation, and fitness value accross the optimization """ if self.verbose: - print("[Starting ...]") + logger.info("[Starting ...]") self._target = self.target.copy() self._iters = self.iters @@ -261,7 +261,7 @@ def optimize(self) -> Tuple[pd.DataFrame, float, pd.DataFrame]: target, fit, logbook = self._optimize() # algo-specific if self.verbose: - print("[... Search Complete]") + logger.info("[... Search Complete]") if self.progress_reporter: self.progress_reporter(1.0) @@ -312,7 +312,7 @@ def _evaluate(self, target) -> Tuple[float, List[Tuple]]: func = getattr(mets, func) if self.verbose >= 2: - print("\t[Evaluation]: %s" % (func)) + logger.info("\t[Evaluation]: %s" % (func)) params = copy.deepcopy(args.get("params", {})) @@ -339,7 +339,7 @@ def _evaluate(self, target) -> Tuple[float, List[Tuple]]: raise else: if self.verbose >= 2: - print(f"[Warning] exception {func.__name__}: {e}") + logger.info(f"[Warning] exception {func.__name__}: {e}") raise fitnesses.append((func.__name__, value, args["weight"])) @@ -403,7 +403,7 @@ def _mutate( if not self.reuse_mutations and not mutations: if self.verbose: - print( + logger.info( ">>> [NOOP] No mutations to apply (consider changing `reuse_mutations`)." 
) return target, {} # NOOP; all mitigations used and removed @@ -435,13 +435,13 @@ def _mutate( func = getattr(mits, func) if self.verbose >= 2: - print("\t[Mutation]: %s" % (func), args) + logger.info("\t[Mutation]: %s" % (func), args) try: result = func(target, **args) except Exception as e: if self.verbose >= 2: - print( + logger.info( f"[Warning] mutation {func.__name__} failed with args:={args} and error: {e}" ) raise diff --git a/src/pymasq/optimizations/optimizations.py b/src/pymasq/optimizations/optimizations.py index 9697b21..9eda759 100644 --- a/src/pymasq/optimizations/optimizations.py +++ b/src/pymasq/optimizations/optimizations.py @@ -1,13 +1,16 @@ import itertools +import logging from typing import Optional import numpy as np from scipy.special import perm from pymasq import BEARTYPE +from pymasq.config import rg from pymasq.errors import LessThanOrEqualToZeroError, NotInRangeError from pymasq.optimizations._base import OptimizationBase +logger = logging.getLogger(__name__) class IterativeSearch(OptimizationBase): """Iterative (sequential) optimization algorithm. @@ -93,7 +96,7 @@ def _optimize(self): while all([cur_fit > self.theta, self._iters > 0]): if self.verbose: - print("-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -105,13 +108,13 @@ def _optimize(self): new_fit, fit_log, met_errors = self._safe_evaluate(new_target) error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | " % (cur_fit), "New fitness: %.5f | " % (new_fit), "Best fitness: %.5f" % (best_fit), ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) cur_fit = new_fit target = new_target @@ -125,12 +128,12 @@ def _optimize(self): ) if cur_fit <= self.theta and self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") self._iters -= 1 if self._iters <= 0 and self.verbose: - print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") if self.return_best: return best_target, best_fit, self._logbook @@ -239,7 +242,7 @@ def _optimize(self): while all([cur_fit > self.theta, self._iters > 0, retry > 0]): if self.verbose: - print("-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -252,11 +255,11 @@ def _optimize(self): error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | New fitness: %.5f" % (cur_fit, new_fit) ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) if new_fit < cur_fit: cur_fit = new_fit @@ -264,7 +267,7 @@ def _optimize(self): else: retry -= 1 if self.verbose >= 2: - print(">>> Retries left: %i" % (retry)) + logger.info(">>> Retries left: %i" % (retry)) self._record_stats( fitness=cur_fit, @@ -275,16 +278,15 @@ def _optimize(self): ) if cur_fit <= self.theta and self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") - if retry <= 0: - if self.verbose: - print(">>> [Terminating]: Max number of retries reached") + if retry <= 0 and self.verbose: + logger.info(">>> [Terminating]: Max number of retries reached") self._iters -= 1 if self._iters <= 0 and self.verbose: - 
print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") return target, cur_fit, self._logbook @@ -427,7 +429,7 @@ def _optimize(self): while all([best_fit > self.theta, self._iters > 0]): if self.verbose: - print("-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -442,31 +444,30 @@ def _optimize(self): error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | " % (cur_fit), "New fitness: %.5f | " % (new_fit), "Best fitness: %.5f" % (best_fit), ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) - prob = np.random.random_sample() + prob = rg.random() if not target.equals(new_target) and ( self._accept_prob(cur_fit, new_fit) > prob ): - if self.verbose >= 1: - print( - ">> New solution accepted", - "(inferior solution)" if cur_fit < new_fit else "", - ) + if self.verbose >= 1 and cur_fit < new_fit: + logger.info( + ">> New solution accepted" + ) cur_fit = new_fit target = new_target accepted = True if new_fit < best_fit: if self.verbose >= 1: - print(f">> New [best] solution found: {new_fit} < {best_fit}") + logger.info(f">> New [best] solution found: {new_fit} < {best_fit}") best_fit = new_fit best_target = new_target @@ -482,10 +483,10 @@ def _optimize(self): self.temperature *= 1 - self.alpha if cur_fit <= self.theta and self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") if self._iters <= 0 and self.verbose: - print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") return best_target, best_fit, self._logbook @@ -621,25 +622,25 @@ def _optimize(self): if any([cur_fit <= self.theta, self._iters <= 0]): if self.verbose: - print(">>> [Terminating]: Solution found or Iterations Complete") + logger.info(">>> [Terminating]: Solution found or Iterations Complete") return target, cur_fit, self._logbook if self.randomize_mutations: # Note: only matters when `num_perms` is set. 
- test = np.random.shuffle(self._mutations) + rg.shuffle(self._mutations) for num_perms, mutation_perms in enumerate( itertools.permutations(self._mutations, self.size_perms) ): if self.verbose: - print("-- Permutation: [%i] --" % (num_perms)) + logger.info("-- Permutation: [%i] --" % (num_perms)) target = self._target.copy() stop = False for mutation in mutation_perms: if self.verbose: - print("\t-- Iteration [%i] --" % (self._max_iters - self._iters)) + logger.info("\t-- Iteration [%i] --" % (self._max_iters - self._iters)) if self.progress_reporter: self.progress_reporter( round(1 - (self._iters / (self._max_iters * 1.0)), 2) @@ -654,13 +655,13 @@ def _optimize(self): error_log += met_errors if self.verbose >= 2: - print( + logger.info( ">> Current fitness: %.5f | " % (cur_fit), "New fitness: %.5f | " % (new_fit), "Best fitness: %.5f" % (best_fit), ) if self.verbose >= 3: - print(new_target) + logger.info(new_target) cur_fit = new_fit target = new_target @@ -679,7 +680,7 @@ def _optimize(self): if cur_fit <= self.theta: if self.verbose: - print(">>> [Terminating]: Solution found") + logger.info(">>> [Terminating]: Solution found") stop = True break @@ -687,16 +688,16 @@ def _optimize(self): if self._iters <= 0: if self.verbose: - print(">>> [Terminating]: Iterations complete") + logger.info(">>> [Terminating]: Iterations complete") stop = True break if self.verbose: - print("\n") + logger.info("\n") if (num_perms + 1) >= self.max_perms: if self.verbose: - print(">>> [Terminating]: Number of permutations complete") + logger.info(">>> [Terminating]: Number of permutations complete") stop = True if stop: diff --git a/src/pymasq/preprocessing/entity_embedding.py b/src/pymasq/preprocessing/entity_embedding.py index 5dcbdcb..185098e 100755 --- a/src/pymasq/preprocessing/entity_embedding.py +++ b/src/pymasq/preprocessing/entity_embedding.py @@ -1,4 +1,5 @@ import hashlib +import logging from pathlib import Path from typing import Dict, Optional, Union @@ -16,6 +17,7 @@ from pymasq.utils import cache import pymasq.config as cfg +logger = logging.getLogger(__name__) def embed_cache_fn(column: pd.Series, cache_location: Path) -> Path: """ @@ -93,7 +95,7 @@ def embed_entities( seed = cfg.DEFAULT_SEED if seed is None else seed if verbose > 0: - print(f"Tensor flow seed set to {seed}.") + logger.info(f"Tensor flow seed set to {seed}.") tf_set_seed(seed) embed_dict = {} @@ -113,19 +115,19 @@ def embed_entities( # ignore description embed_dict[column], _ = cache.load_cache(filename) if verbose > 1: - print("\t Cache file found and loaded for column", column) + logger.info("\t Cache file found and loaded for column {column}") # returns none if a file was found but hmac didn't match if embed_dict[column] is not None: continue if verbose > 1: - print("\tembed_entities: No cache available for ", column) + logger.info(f"\tembed_entities: No cache available for {column}") # Converts categories represented by integers to strings so that the # label encoder will work and the classes can be determined later categorical_df.loc[:, column] = categorical_df.loc[:,column].astype(str) le = LabelEncoder() - X_train = le.fit_transform(categorical_df[column]) + x_train = le.fit_transform(categorical_df[column]) model = Sequential() model.add( @@ -161,7 +163,7 @@ def embed_entities( sgd = SGD(learning_rate=learning_rate) model.compile(optimizer=sgd, loss=loss_array, metrics=metrics_array) model.fit( - X_train, + x_train, y, epochs=epochs, verbose=0, diff --git a/src/pymasq/preprocessing/preprocess.py 
b/src/pymasq/preprocessing/preprocess.py index 6d7dcfc..e4abf98 100644 --- a/src/pymasq/preprocessing/preprocess.py +++ b/src/pymasq/preprocessing/preprocess.py @@ -1,5 +1,7 @@ +import logging from time import time from typing import Tuple, List, Union, Optional + import numpy as np import pandas as pd from bpemb import BPEmb @@ -21,6 +23,8 @@ ################# +logger = logging.getLogger(__name__) + REDUCTION_METHODS = { "pca": PCA, "trucated": TruncatedSVD, @@ -508,12 +512,12 @@ def encode( cache_location = Path(cache_location) if verbose > 0: - print("Preprocessing Data...") + logger.info("Preprocessing Data...") start = time() cache_location.mkdir(parents=True, exist_ok=True) if verbose > 0: - print("cache_location for preprocess is: " + str(cache_location)) + logger.info("cache_location for preprocess is: " + str(cache_location)) # Remove the sensitive column and other columns from consideration. # We'll add them back in later. @@ -542,7 +546,7 @@ def encode( ignore_col_data = None if verbose > 0: - print("Splitting Data into Numerical and Categorical Data...") + logger.info("Splitting Data into Numerical and Categorical Data...") if sensitive_col or ignore_columns: input_data = df.drop(columns=dropped_cols, axis=1).copy() @@ -554,7 +558,7 @@ def encode( binary = input_data.loc[:, binary_columns] if binary_columns: if verbose > 0: - print("Imputing Missing Binary Data...") + logger.info("Imputing Missing Binary Data...") simple_imputer = SimpleImputer(strategy="most_frequent") binary = pd.DataFrame( simple_imputer.fit_transform(input_data[binary_columns]), @@ -565,7 +569,7 @@ def encode( numerical_imputed_normalized = pd.DataFrame() if numerical_columns: if verbose > 0: - print("Imputing Missing Numerical Data...") + logger.info("Imputing Missing Numerical Data...") simple_imputer = SimpleImputer(strategy="mean") simple_imputer.fit(input_data[numerical_columns]) numerical_imputed = pd.DataFrame( @@ -593,7 +597,7 @@ def encode( categorical_embeddings = [] if categorical_columns: if verbose > 0: - print("Imputing Missing Categorical Data...") + logger.info("Imputing Missing Categorical Data...") simple_imputer = SimpleImputer(fill_value="None", strategy="constant") simple_imputer.fit(input_data[categorical_columns]) categorical_imputed = pd.DataFrame( @@ -612,7 +616,7 @@ def encode( columns=numerical_columns, ) if verbose > 0: - print("Creating/Loading Categorical Data Embeddings...") + logger.info("Creating/Loading Categorical Data Embeddings...") new_embeddings = embed_entities( target_df=y, @@ -635,7 +639,7 @@ def encode( textual_embeddings = [] if textual_columns: if verbose > 0: - print("Imputing Missing Textual Data...") + logger.info("Imputing Missing Textual Data...") simple_imputer = SimpleImputer( missing_values="", fill_value="None", strategy="constant" ) @@ -646,10 +650,10 @@ def encode( columns=textual_columns, ) if verbose > 0: - print("Creating Textual Data Embeddings...") + logger.info("Creating Textual Data Embeddings...") for col in textual_columns: if verbose > 0: - print("\t" + col) + logger.info("\t" + col) sents = textual_imputed[col].str.lower().str.replace("[!?:/]", " ") textual_embedding_array = EmbeddingsEncoder.sentence_bpe_vectors( @@ -668,7 +672,7 @@ def encode( textual_embeddings.append(textual_embedding) if verbose > 0: - print("Preprocessing took: {} seconds".format(round(time() - start, 2))) + logger.info("Preprocessing took: {} seconds".format(round(time() - start, 2))) if sensitive_col: return pd.concat( diff --git a/src/pymasq/utils/cache.py 
b/src/pymasq/utils/cache.py index c082f5c..87d2bd0 100644 --- a/src/pymasq/utils/cache.py +++ b/src/pymasq/utils/cache.py @@ -1,17 +1,21 @@ -import pickle import hashlib -from pathlib import Path import hmac import glob +import logging +import os +import pickle import shutil +from pathlib import Path from typing import Optional, Tuple, Dict, Union -from pandas.util import hash_pandas_object + import pandas as pd -from pymasq import BEARTYPE +from pandas.util import hash_pandas_object + import pymasq.config as cfg +from pymasq import BEARTYPE from pymasq.errors import InputError -import os +logger = logging.getLogger(__name__) def _hmac(data: object) -> str: """ @@ -63,7 +67,7 @@ def save( filename = f"{fn_prefix}.{_hmac(pickled_data)}.pkl" if verbose > 0: - print(f"Saving. hmac key is: {cfg.CACHE_HMAC_KEY}") + logger.info(f"Saving. hmac key is: {cfg.CACHE_HMAC_KEY}") with open(filename, "wb") as fd: fd.write(pickled_data) @@ -112,7 +116,7 @@ def load_cache( # check the hmac of the file (unless ignore) if str(digest) != file.split(".")[-2] and not ignore_hmac: if verbose > 0: - print( + logger.info( f"""" Error: hmac of file ({digest}) does not match the hmac stored in the filename ({file.split('.')[-2]}) for hmac key of '{cfg.CACHE_HMAC_KEY}' for file: {file} @@ -120,8 +124,8 @@ def load_cache( ) continue if verbose > 0: - print(f"Expected hmac: {str(digest)}") - print(f"Filename hmac: {file.split('.')[-2]}") + logger.info(f"Expected hmac: {str(digest)}") + logger.info(f"Filename hmac: {file.split('.')[-2]}") # read in the data try: @@ -129,7 +133,7 @@ def load_cache( description, data = pickle.load(fd) fd.close() if verbose > 0: - print(f"{description}") + logger.info(f"{description}") return data, description except Exception as e: raise InputError(f"Error loading cache file from {prefix_path}: {e}") @@ -161,14 +165,14 @@ def cache_info(file_or_path: str) -> Dict[str, str]: Files without valid hmacs are not listed. 
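
The cache module above names each pickle after an HMAC of its payload and refuses to load any file whose recomputed digest does not match the one embedded in the filename. The general shape of that integrity check, reduced to the standard library (the key, prefix, and digest choice here are placeholders, not pymasq's actual `_hmac` internals):

```python
import hashlib
import hmac
import pickle

KEY = b"example-hmac-key"  # placeholder for cfg.CACHE_HMAC_KEY

def tag(payload: bytes) -> str:
    return hmac.new(KEY, payload, hashlib.sha256).hexdigest()

obj = {"description": "toy cache entry", "data": [1, 2, 3]}
payload = pickle.dumps(obj)
filename = f"cache_entry.{tag(payload)}.pkl"  # digest is embedded in the filename

# On load: recompute the digest and compare before ever unpickling.
expected = filename.split(".")[-2]
if hmac.compare_digest(tag(payload), expected):
    print(pickle.loads(payload))
else:
    print("digest mismatch; refusing to unpickle")
```
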
""" - print("Checking all files in ", file_or_path) + logger.info(f"Checking all files in {file_or_path}") result = {} for file in glob.glob(file_or_path + "/*.pkl"): - print(f"\n----{file}----") + logger.info(f"\n----{file}----") try: _, description = load_cache(prefix_path=file) except Exception as e: - print(e) + logger.info(e) continue if description is not None: result[file] = description diff --git a/src/pymasq/utils/utils.py b/src/pymasq/utils/utils.py index e1fd5c9..21c55aa 100644 --- a/src/pymasq/utils/utils.py +++ b/src/pymasq/utils/utils.py @@ -1,10 +1,11 @@ -import inspect import functools +import inspect +import logging +from typing import Final, List, Optional, Union + import numpy as np import pandas as pd - from pandas.api.types import is_numeric_dtype -from typing import Final, List, Optional, Union from pymasq import BEARTYPE from pymasq import config @@ -12,6 +13,8 @@ __all__ = ["BOTH", "as_dataframe", "validate_numeric", "formatting", "freq_calc"] +logger = logging.getLogger(__name__) + BOTH: Final = "both" @@ -85,7 +88,7 @@ def _formatting_wrapper(data, *args, **kwargs): data = data.astype(dtypes) except: # TODO: switch to logging - print("WARNING: Unable to keep original datatypes.") + logger.info("WARNING: Unable to keep original datatypes.") if on_output: if input_type == pd.Series: @@ -146,7 +149,7 @@ def freq_calc( freq_df = data.groupby(quasi_cols).count()[sensitive_col] freq_df = freq_df.rename("samp_fq").reset_index() - freqs = pd.merge(data, freq_df, how="outer", on=quasi_cols) + freqs = pd.merge(data, freq_df, how="outer", on=quasi_cols, validate="many_to_one") weights = as_dataframe(weights) if weights else pd.Series([1] * freqs.shape[0]) freqs["pop_fq"] = freqs["samp_fq"].values * weights diff --git a/tests/classifiers/test_classifiers.py b/tests/classifiers/test_classifiers.py index c220321..9f9bc2e 100644 --- a/tests/classifiers/test_classifiers.py +++ b/tests/classifiers/test_classifiers.py @@ -1,10 +1,11 @@ #!/usr/bin/env python # coding: utf-8 - -import shutil +import logging import pytest -import pymasq.config as cfg +import shutil from pathlib import Path + +import pymasq.config as cfg from pymasq.datasets import load_census from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.models.models import ( @@ -13,6 +14,8 @@ RFClassifier, ) +logger = logging.getLogger(__name__) + @pytest.fixture def my_df(): @@ -43,7 +46,7 @@ def my_df(): ) def test_classifiers(my_df, combo): classifier_type, preprocessor, answer = combo - print(classifier_type) + logger.info(classifier_type) # check that the classifier gets the expected value given a set hmac key and set seed dir_name = "cache_test" @@ -72,17 +75,17 @@ def test_classifiers(my_df, combo): # should make use of cache enc = preprocessor.encode(my_df, cache_location=dir_name, verbose=1) - print(type(enc.drop(["sex"], axis=1))) - print(type(enc.sex)) + logger.info(type(enc.drop(["sex"], axis=1))) + logger.info(type(enc.sex)) score = classifier.predict(x_test=enc.drop(["sex"], axis=1), y_true=enc.sex) - print(f"{classifier.name}, {preprocessor}: {score}") + logger.info(f"{classifier.name}, {preprocessor}: {score}") assert round(score, 2) == answer, "Scores should match (trial {}, {})".format( classifier_type, preprocessor ) # Check if the cached file loads, and that the hmac checks out - print(f"\n{classifier.name}, {preprocessor} load") + logger.info(f"\n{classifier.name}, {preprocessor} load") classifier.load_trained_model(my_df, verbose=1) - print("removing cache") + 
logger.info("removing cache") shutil.rmtree(dir_name) diff --git a/tests/integration/integration.py b/tests/integration/integration.py index 7da1a31..dbda821 100644 --- a/tests/integration/integration.py +++ b/tests/integration/integration.py @@ -1,19 +1,17 @@ import argparse import json -import numpy as np -import pandas as pd +import logging import os import yaml import pymasq -pymasq.set_seed(123) - -from pymasq import mitigations as mits -from pymasq import metrics as mets from pymasq import optimizations as opts from pymasq import datasets +pymasq.set_seed(123) + +logger = logging.getLogger(__name__) ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) CORE_CFG_FNAME = os.path.join(ROOT_DIR, "core_config.yaml") @@ -76,13 +74,13 @@ def get_configs(test_cfg): opts_cfg = cfg.get("optimizations", None) if VERBOSE: - print( - "========== [ Dataset ] ==========\n", - json.dumps(data_cfg, indent=4), + logger.info( + f"""========== [ Dataset ] ==========\n, + {json.dumps(data_cfg, indent=4)}, "\n========== [Mitigations] ==========\n", - json.dumps(mits_cfg, indent=4), + {json.dumps(mits_cfg, indent=4)}, "\n========== [Metrics] ==========\n", - json.dumps(mets_cfg, indent=4), + {json.dumps(mets_cfg, indent=4)},""" ) return data_cfg, mits_cfg, mets_cfg, opts_cfg @@ -104,7 +102,7 @@ def get_data(data_cfg): df = df.loc[:, cols if isinstance(cols, list) else [cols]] if VERBOSE: - print(df, "\n", df.shape) + logger.info(df, "\n", df.shape) return df @@ -127,8 +125,8 @@ def run(args): mod_df, fit, log = algo.optimize() if VERBOSE: - print("\n============== %s ===============\n" % (opt)) - print(mod_df, "\n", fit, "\n", log) + logger.info("\n============== %s ===============\n" % (opt)) + logger.info(mod_df, "\n", fit, "\n", log) else: # if no optimizations specified, then simply run ExhaustiveSearch @@ -143,9 +141,9 @@ def run(args): mod_df, fit, log = algo.optimize() if VERBOSE: - print(mod_df, "\n", fit, "\n", log) + logger.info(mod_df, "\n", fit, "\n", log) - print("[Tests: Complete]") + logger.info("[Tests: Complete]") if __name__ == "__main__": diff --git a/tests/metrics/test_utility_scores.py b/tests/metrics/test_utility_scores.py index 7d48060..bf0dbf0 100644 --- a/tests/metrics/test_utility_scores.py +++ b/tests/metrics/test_utility_scores.py @@ -1,4 +1,5 @@ -from random import sample, gauss, seed +import logging +from random import sample, gauss import pandas as pd import pytest @@ -11,6 +12,7 @@ params = [5000, 10000, 100000] seed = 1234 +logger = logging.getLogger(__name__) @pytest.fixture(scope="session", params=params) def orig_bin_df(request): @@ -116,7 +118,7 @@ def test_propensity_score_identical(my_df): """ Tests propensity_score for identical data frames """ - print() + logger.info() for classifier, pp in [ ("logreg", ["embeddings", "label_encode"]), ("rfclass", ["embeddings", "label_encode"]), @@ -143,7 +145,7 @@ def test_propensity_score_identical(my_df): method=classifier, preprocessor=preprocessor, ) - print(f"{classifier}/{preprocessor}: {round(score,2)}") + logger.info(f"{classifier}/{preprocessor}: {round(score,2)}") assert ( round(score, 2) <= 0.0 ), f"{classifier}/{preprocessor}: Should be 0.0 but is round({score},2)={round(score,2)}" @@ -185,7 +187,7 @@ def test_propensity_score_moderate_change(my_df): method=classifier, preprocessor=preprocessor, ) - print(f"{classifier}/{preprocessor}: {round(score,2)}") + logger.info(f"{classifier}/{preprocessor}: {round(score,2)}") exp = expected.pop() assert ( round(score, 2) == exp, diff --git 
a/tests/mitigations/test_global_recode.py b/tests/mitigations/test_global_recode.py index 042d5a4..aa5068c 100644 --- a/tests/mitigations/test_global_recode.py +++ b/tests/mitigations/test_global_recode.py @@ -1,18 +1,20 @@ +import logging + import pandas as pd -import numpy as np from pymasq import config -config.FORMATTING_ON_OUTPUT = True - from pymasq.mitigations import ( global_recode, EQUAL, - EQUIDISTANT, MAGNITUDE, LOG_EQUIDISTANT, ) +config.FORMATTING_ON_OUTPUT = True + +logger = logging.getLogger(__name__) + def test_global_recode_labels_ordered(): one_to_ten = range(1, 11) @@ -25,7 +27,7 @@ def test_global_recode_labels_ordered(): ordered=True, ) result = global_recode(series, bins=3, ordered=True, labels=["A", "B", "C"]) - print(result) + logger.info(result) assert all(result == expected_result), "This should be true" @@ -73,12 +75,12 @@ def test_global_recode_no_labels(): dtype="category", ordered=True, ) - print("EXPECTED RESULT ====>", expected_result) + logger.info(f"EXPECTED RESULT ====>{expected_result}") result = global_recode( series, bins=5, ) - print("RESULT ====>", result) + logger.info(f"RESULT ====>{result}") assert all(result == expected_result), "This should be true" diff --git a/tests/mitigations/test_hashing.py b/tests/mitigations/test_hashing.py index 1e746cc..302babe 100644 --- a/tests/mitigations/test_hashing.py +++ b/tests/mitigations/test_hashing.py @@ -1,19 +1,19 @@ #!/usr/bin/env python # coding: utf-8 -import pytest - +import logging import hashlib -import numpy as np -from pymasq.config import DEFAULT_SEED +import pytest + +from pymasq.config import rg from pymasq.datasets import load_census from pymasq.mitigations import hashing ALGORITHMS = hashlib.algorithms_guaranteed -rg = np.random.default_rng(DEFAULT_SEED) +logger = logging.getLogger(__name__) def _my_df(): df = load_census() @@ -42,7 +42,7 @@ def test_hashing_all_hashlib_guaranteed_algorithms(my_df, hash_func): try: rdf = hashing(my_df, hash_func) except Exception as e: - print(f"Raised Exception: {e}") + logger.info(f"Raised Exception: {e}") assert rdf is not None diff --git a/tests/preprocessing/test_preprocess.py b/tests/preprocessing/test_preprocess.py index af518a5..22cd47a 100644 --- a/tests/preprocessing/test_preprocess.py +++ b/tests/preprocessing/test_preprocess.py @@ -1,16 +1,17 @@ #!/usr/bin/env python # coding: utf-8 -import pytest +import logging +import pytest from numpy import NaN from pymasq.datasets import load_census - from pymasq.preprocessing import embed_entities, LabelEncoderPM, EmbeddingsEncoder # from pymasq.errors import InputError, DataTypeError +logger = logging.getLogger(__name__) @pytest.fixture def my_df(): @@ -114,8 +115,8 @@ def test_embed_entites_7(my_df): # Tests that embed_entities returns arrays for each education category given two target columns. 
# """ # ret = embed_entities(my_df[["sex", "marital_status"]], my_df[["education"]]) -# print(my_df["education"].nunique()) -# print(ret["education"].shape[0]) +# logger.info(my_df["education"].nunique()) +# logger.info(ret["education"].shape[0]) # assert my_df["education"].nunique() == ret["education"].shape[0] @@ -129,8 +130,8 @@ def test_embed_entites_9(my_df): cache_location=None, retrain=True, ) - print(my_df["education"].nunique()) - print(ret["education"].shape[0]) + logger.info(my_df["education"].nunique()) + logger.info(ret["education"].shape[0]) assert my_df["education"].nunique() == ret["education"].shape[0] diff --git a/tests/utils/test_cache.py b/tests/utils/test_cache.py index c348dec..8b9995e 100644 --- a/tests/utils/test_cache.py +++ b/tests/utils/test_cache.py @@ -1,15 +1,19 @@ #!/usr/bin/env python # coding: utf-8 +import logging import shutil +from pathlib import Path + import pytest + import pymasq.config as cfg -from pathlib import Path from pymasq.datasets import load_census from pymasq.models.models import LogisticRegressionClassifier, RFClassifier from pymasq.preprocessing import LabelEncoderPM, EmbeddingsEncoder from pymasq.utils import cache +logger = logging.getLogger(__name__) @pytest.fixture def my_df(): @@ -76,7 +80,7 @@ def my_df(): ) def test_cache(my_df, combo): classifier_type, preprocessor, answer, key, desc = combo - print(classifier_type) + logger.info(classifier_type) dir_name = "cache_test" Path(dir_name).mkdir(exist_ok=True) @@ -101,13 +105,13 @@ def test_cache(my_df, combo): ) enc = preprocessor.encode(my_df, cache_location=None) score = classifier.predict(x_test=enc.drop(["sex"], axis=1), y_true=enc.sex) - print(f"{classifier.name}, {preprocessor}: {score}") + logger.info(f"{classifier.name}, {preprocessor}: {score}") assert round(score, 2) == answer, "Scores should match (trial {}) {} and {}".format( combo, score, answer ) # Check if the cached file loads, and that the hmac checks out - print(f"\n{classifier.name}, {preprocessor} load") + logger.info(f"\n{classifier.name}, {preprocessor} load") classifier.load_trained_model(my_df, verbose=1) # Test that changing the hmac will cause a failure @@ -116,8 +120,8 @@ def test_cache(my_df, combo): classifier.load_trained_model(my_df) raise ("This test should have failed because the hmac key changed") except Exception as e: - print("This error is a desired outcome of the test:") - print("\t", e, "\n") + logger.info("This error is a desired outcome of the test:") + logger.exception(e) cfg.CACHE_HMAC_KEY = "my key" # Assert to see if description was saved