Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Feature/update python versions #5

Draft
wants to merge 17 commits into
base: 1.0.1-SNAPSHOT
Choose a base branch
from
Prev Previous commit
Next Next commit
remove boruta because it has not been updated
cdo03c committed Jan 6, 2024
commit c3b301da37752ddf46704ba98cf097e5d2fc9376
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -16,10 +16,9 @@ packages = find:
package_dir =
= src
install_requires =
boruta~=0.3
bpemb~=0.3
matplotlib~=3.5
numpy~=1.22
numpy~=1.26
pandas~=1.4
plotly>=4.11.0
SALib~=1.4
2 changes: 1 addition & 1 deletion src/pymasq/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os import path

__version__ = "0.6.5"
__version__ = "0.6.6"


try:
127 changes: 5 additions & 122 deletions src/pymasq/kve/kve.py
Original file line number Diff line number Diff line change
@@ -8,8 +8,6 @@
from numpy import ndarray
import pandas as pd
import statsmodels.api as sm
import json
from boruta import BorutaPy
from pandas.api.types import is_numeric_dtype
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFECV
@@ -26,12 +24,10 @@
"key_variable_exploration",
"importance_scores",
"random_forest_scores",
"boruta_scores",
"rfe_scores",
"stepwise_scores",
"stepwise_selection",
"RANDOM_FOREST",
"BORUTA",
"RFE",
"INCLUDE",
"VARIABLE",
@@ -43,7 +39,6 @@


RANDOM_FOREST: Final = "Random_Forest"
BORUTA: Final = "Boruta"
RFE: Final = "RFE"
STEPWISE: Final = "Stepwise"
INCLUDE: Final = "Include"
@@ -136,7 +131,7 @@ def key_variable_exploration(

**kwargs
Additional arguments to be passed to `importance_Scores`:
* methods : Tuple[str], optional Default: ('rf', 'boruta', 'rfe', 'stepwise')
* methods : Tuple[str], optional Default: ('rf', 'rfe', 'stepwise')
Names of the ranking methods to run.

Returns
@@ -162,7 +157,7 @@ def key_variable_exploration(
normalize=True,
)

methods = kwargs.get("methods", (RANDOM_FOREST, BORUTA, RFE, STEPWISE))
methods = kwargs.get("methods", (RANDOM_FOREST, RFE, STEPWISE))
categories = len(df[sensitive_col].dropna().unique())
if categories < 2:
print(
@@ -238,7 +233,7 @@ def importance_scores(
Number of categories in the senestive column used to determine the type
of model used in feature selection, -1 indicates the column is continuous

methods : Tuple[str], optional (Default: "Random_Forest","Boruta","RFE", "Stepwise")
methods : Tuple[str], optional (Default: "Random_Forest","RFE", "Stepwise")
Names of the ranking methods to run

verbose : int {0, 1, 2}, (Default: 0)
@@ -256,7 +251,7 @@ def importance_scores(
"callback", None
) # callable function that emits to main server
if methods is None:
methods = (RANDOM_FOREST, BORUTA, RFE, STEPWISE)
methods = (RANDOM_FOREST, RFE, STEPWISE)
method_len = float(len(methods)) # instantiated for progress emits
method_count = 1 # instantiated for progress emits
x_rf = input_df.drop([sensitive_col], axis=1)
@@ -274,15 +269,6 @@ def importance_scores(
if progress_reporter is not None:
progress_reporter(method_count / method_len)
method_count += 1
if BORUTA in methods and x_train.shape[0] >= 250:
if verbose > 0:
print("Running Boruta...")
score_dict[f"{BORUTA}_{INCLUDE}"] = boruta_scores(
x_train, y, verbose=verbose, categories=categories
)
if progress_reporter is not None:
progress_reporter(method_count / method_len)
method_count += 1
if RFE in methods:
if verbose > 0:
print("Running Recursive Feature Elimination...")
@@ -392,109 +378,6 @@ def random_forest_scores(
return rf.feature_importances_, include


@BEARTYPE
def boruta_scores(
x_train: pd.DataFrame,
y: pd.Series,
categories: int,
n_estimators: int = 1000,
n_jobs: int = -1,
random_state: int = 1234,
verbose: int = 0,
max_iter: int = 50,
) -> List[str]:
"""
Boruta is an all relevant feature selection method, while most other are
minimal optimal; this means it tries to find all features carrying
information usable for prediction, rather than finding a possibly compact
subset of features on which some classifier has a minimal error


NOTE: Does not work with small data, requires >250 rows

Parameters
----------
x_train : pd.DataFrame
A dataframe containing all input variables for training the model

y : pd.Series
A series containing the ground truth labels or numbers

categories: int
number of categories in the senestive column used to determine the type
of model used in feature selection, -1 indicates the column is continuous

n_estimators : int, optional (Default: 1000)
Number of trees that are constructed during the random forest

n_jobs : int, optional (Default: -1)
Number of workers to use for parallel processing
- -1 indicates use all available workers

random_state: int, optional (Default: 1234)
Integer seed for setting the random state in the model

verbose : int {0, 1, 2}, optional (Default 2)
Level of reporting from the algorithms:
- 0 disables verbose logging
- 2 is step-by-step reporting

max_iter: int, optional (Default: 50)
The number of maximum iterations to perform.

Returns
-------
List[str]
list of strings, contains whether a feature should be included in
further analysis:
- "yes": boruta ranking = 1
- "maybe": boruta ranking = 2
- "no": boruta ranking >= 3

References
----------
https://medium.com/@indreshbhattacharyya/feature-selection-categorical-feature-selection-boruta-light-gbm-chi-square-bf47e94e2558

"""
if x_train.shape[0] < 250:
print("Requires > 250 rows to be stable")
return []
if categories >= 2:
rf = RandomForestClassifier(
n_estimators=n_estimators,
n_jobs=n_jobs,
verbose=verbose,
random_state=random_state,
)
else:
rf = RandomForestRegressor(
n_estimators=n_estimators,
n_jobs=n_jobs,
verbose=verbose,
random_state=random_state,
)
boruta_selector = BorutaPy(
rf,
verbose=verbose,
n_estimators="auto",
random_state=random_state,
max_iter=max_iter,
)
if isinstance(x_train, np.ndarray):
boruta_selector.fit(x_train, y)
else:
boruta_selector.fit(x_train.values, y.values)
include = []
for r in list(boruta_selector.ranking_):
if r == 1:
include.append("yes")
elif r == 2:
include.append("maybe")
else:
include.append("no")
return include


@BEARTYPE
def rfe_scores(
x_train: pd.DataFrame,
@@ -638,7 +521,7 @@ def rfe_scores(
multi_class="ovr",
)
else:
estimator = LinearRegression(normalize=True, n_jobs=n_jobs)
estimator = LinearRegression(n_jobs=n_jobs)
rfecv_selector = RFECV(estimator, step=step, cv=cv, verbose=verbose, n_jobs=n_jobs)
rfecv_selector.fit(x_train, y)
return ["yes" if r == 1 else "no" for r in list(rfecv_selector.ranking_)]
4 changes: 2 additions & 2 deletions tests/classifiers/test_classifiers.py
Original file line number Diff line number Diff line change
@@ -37,8 +37,8 @@ def my_df():
(LogisticRegressionClassifier, EmbeddingsEncoder, 0.5),
(RFClassifier, LabelEncoderPM, 1.0),
(RFClassifier, EmbeddingsEncoder, 1.0),
(TpotClassifier, LabelEncoderPM, 0.77),
(TpotClassifier, EmbeddingsEncoder, 0.86),
(TpotClassifier, LabelEncoderPM, 0.8),
(TpotClassifier, EmbeddingsEncoder, 0.81),
],
)
def test_classifiers(my_df, combo):
96 changes: 1 addition & 95 deletions tests/kve/test_kve.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
import pandas as pd
import pytest

from pymasq.kve import random_forest_scores, boruta_scores, rfe_scores, stepwise_scores
from pymasq.kve import random_forest_scores, rfe_scores, stepwise_scores
from pymasq.datasets import gen_num_df, gen_bin_df, load_census
from pymasq.preprocessing import EmbeddingsEncoder
from pymasq import ROOT_DIR
@@ -71,28 +71,6 @@ def test_random_forest_cont(my_df):
)
assert len(rf[1]) > 0, "Should be True"


def test_boruta_cont(my_df):
"""
Tests boruta_scores if passed a continuous variable for y
"""
sensitive_col = "age"
my_df = EmbeddingsEncoder.encode(
my_df,
sensitive_col=sensitive_col,
cache_location=ROOT_DIR + "/datasets/data/cache",
)
rf = boruta_scores(
x_train=my_df.drop(sensitive_col, axis=1),
y=my_df[sensitive_col],
verbose=0,
categories=-1,
max_iter=5,
n_estimators=20,
)
assert len(rf[1]) > 0, "Should be True"


def test_rfe_cont(my_df):
"""
Tests rfe_scores if passed a continuous variable for y
@@ -134,29 +112,6 @@ def test_random_forest_multiclass(my_df):
assert len(rf[1]) > 0, "Should be True"


def test_boruta_multiclass(my_df):
"""
Tests boruta_scores if passed a variable with number of categories > 2 for y
"""
sensitive_col = "education"
my_df = EmbeddingsEncoder.encode(
my_df,
sensitive_col=sensitive_col,
cache_location=ROOT_DIR + "/datasets/data/cache",
)
y = my_df[sensitive_col]
n_cats = len(y.dropna().unique())
rf = boruta_scores(
x_train=my_df.drop(sensitive_col, axis=1),
y=y,
verbose=0,
categories=n_cats,
max_iter=5,
n_estimators=20,
)
assert len(rf[1]) > 0, "Should be True"


def test_rfe_multiclass(my_df):
"""
Tests rfe_scores if passed a variable with number of categories > 2 for y
@@ -194,20 +149,6 @@ def test_random_forest_bin(bin_df):
), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]"


def test_boruta_bin(bin_df):
"""
Tests boruta_scores feature importance ranks for a binary dataframe
of a given size.
"""
y = bin_df["Label"]
n_cats = len(y.dropna().unique())
assert boruta_scores(
x_train=bin_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats
) == ["yes"] * 5 + [
"maybe"
], "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'maybe']"


def test_rfe_bin(bin_df):
"""
Tests rfe_scores feature importance ranks for a binary dataframe
@@ -241,24 +182,6 @@ def test_random_forest_num(num_df):
), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]"


def test_boruta_num(num_df):
"""
Tests boruta_scores feature importance ranks for a numeric dataframe
of a given size.
"""
y = num_df["Label"]
n_cats = len(y.dropna().unique())
assert (
boruta_scores(
x_train=num_df.drop("Label", axis=1),
y=y,
verbose=0,
categories=n_cats,
)
== ["yes"] * 6
), "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'yes']"


def test_rfe_num(num_df):
"""
Tests rfe_scores feature importance ranks for a numeric dataframe
@@ -277,23 +200,6 @@ def test_rfe_num(num_df):
), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']"


def test_boruta_comb(comb_df):
"""
Tests boruta_scores feature importance ranks for a combined dataframe
of a given size.
"""
if comb_df.shape[0] <= 2000:
assert True
y = comb_df["Label"]
n_cats = len(y.dropna().unique())
scores = boruta_scores(
x_train=comb_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats
)
assert (
scores == ["yes"] * 5 + ["maybe"] + ["yes"] * 6
), "One 'maybe' at index 5, otherwise all 'yes"


def test_random_forest_comb(comb_df):
"""
Tests random_forest_scores feature importance ranks for a combined dataframe