Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Feature/update python versions #5

Draft
wants to merge 17 commits into
base: 1.0.1-SNAPSHOT
Choose a base branch
from
Prev Previous commit
Next Next commit
remove boruta because it has not been updated
cdo03c committed Jan 6, 2024
commit c3b301da37752ddf46704ba98cf097e5d2fc9376
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -16,10 +16,9 @@ packages = find:
package_dir =
= src
install_requires =
boruta~=0.3
bpemb~=0.3
matplotlib~=3.5
numpy~=1.22
numpy~=1.26
pandas~=1.4
plotly>=4.11.0
SALib~=1.4
2 changes: 1 addition & 1 deletion src/pymasq/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os import path

__version__ = "0.6.5"
__version__ = "0.6.6"


try:
127 changes: 5 additions & 122 deletions src/pymasq/kve/kve.py
Original file line number Diff line number Diff line change
@@ -8,8 +8,6 @@
from numpy import ndarray
import pandas as pd
import statsmodels.api as sm
import json
from boruta import BorutaPy
from pandas.api.types import is_numeric_dtype
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFECV
@@ -26,12 +24,10 @@
"key_variable_exploration",
"importance_scores",
"random_forest_scores",
"boruta_scores",
"rfe_scores",
"stepwise_scores",
"stepwise_selection",
"RANDOM_FOREST",
"BORUTA",
"RFE",
"INCLUDE",
"VARIABLE",
@@ -43,7 +39,6 @@


RANDOM_FOREST: Final = "Random_Forest"
BORUTA: Final = "Boruta"
RFE: Final = "RFE"
STEPWISE: Final = "Stepwise"
INCLUDE: Final = "Include"
@@ -136,7 +131,7 @@ def key_variable_exploration(

**kwargs
Additional arguments to be passed to `importance_Scores`:
* methods : Tuple[str], optional Default: ('rf', 'boruta', 'rfe', 'stepwise')
* methods : Tuple[str], optional Default: ('rf', 'rfe', 'stepwise')
Names of the ranking methods to run.

Returns
@@ -162,7 +157,7 @@ def key_variable_exploration(
normalize=True,
)

methods = kwargs.get("methods", (RANDOM_FOREST, BORUTA, RFE, STEPWISE))
methods = kwargs.get("methods", (RANDOM_FOREST, RFE, STEPWISE))
categories = len(df[sensitive_col].dropna().unique())
if categories < 2:
print(
@@ -238,7 +233,7 @@ def importance_scores(
Number of categories in the senestive column used to determine the type
of model used in feature selection, -1 indicates the column is continuous

methods : Tuple[str], optional (Default: "Random_Forest","Boruta","RFE", "Stepwise")
methods : Tuple[str], optional (Default: "Random_Forest","RFE", "Stepwise")
Names of the ranking methods to run

verbose : int {0, 1, 2}, (Default: 0)
@@ -256,7 +251,7 @@ def importance_scores(
"callback", None
) # callable function that emits to main server
if methods is None:
methods = (RANDOM_FOREST, BORUTA, RFE, STEPWISE)
methods = (RANDOM_FOREST, RFE, STEPWISE)
method_len = float(len(methods)) # instantiated for progress emits
method_count = 1 # instantiated for progress emits
x_rf = input_df.drop([sensitive_col], axis=1)
@@ -274,15 +269,6 @@ def importance_scores(
if progress_reporter is not None:
progress_reporter(method_count / method_len)
method_count += 1
if BORUTA in methods and x_train.shape[0] >= 250:
if verbose > 0:
print("Running Boruta...")
score_dict[f"{BORUTA}_{INCLUDE}"] = boruta_scores(
x_train, y, verbose=verbose, categories=categories
)
if progress_reporter is not None:
progress_reporter(method_count / method_len)
method_count += 1
if RFE in methods:
if verbose > 0:
print("Running Recursive Feature Elimination...")
@@ -392,109 +378,6 @@ def random_forest_scores(
return rf.feature_importances_, include


@BEARTYPE
def boruta_scores(
x_train: pd.DataFrame,
y: pd.Series,
categories: int,
n_estimators: int = 1000,
n_jobs: int = -1,
random_state: int = 1234,
verbose: int = 0,
max_iter: int = 50,
) -> List[str]:
"""
Boruta is an all relevant feature selection method, while most other are
minimal optimal; this means it tries to find all features carrying
information usable for prediction, rather than finding a possibly compact
subset of features on which some classifier has a minimal error


NOTE: Does not work with small data, requires >250 rows

Parameters
----------
x_train : pd.DataFrame
A dataframe containing all input variables for training the model

y : pd.Series
A series containing the ground truth labels or numbers

categories: int
number of categories in the senestive column used to determine the type
of model used in feature selection, -1 indicates the column is continuous

n_estimators : int, optional (Default: 1000)
Number of trees that are constructed during the random forest

n_jobs : int, optional (Default: -1)
Number of workers to use for parallel processing
- -1 indicates use all available workers

random_state: int, optional (Default: 1234)
Integer seed for setting the random state in the model

verbose : int {0, 1, 2}, optional (Default 2)
Level of reporting from the algorithms:
- 0 disables verbose logging
- 2 is step-by-step reporting

max_iter: int, optional (Default: 50)
The number of maximum iterations to perform.

Returns
-------
List[str]
list of strings, contains whether a feature should be included in
further analysis:
- "yes": boruta ranking = 1
- "maybe": boruta ranking = 2
- "no": boruta ranking >= 3

References
----------
https://medium.com/@indreshbhattacharyya/feature-selection-categorical-feature-selection-boruta-light-gbm-chi-square-bf47e94e2558

"""
if x_train.shape[0] < 250:
print("Requires > 250 rows to be stable")
return []
if categories >= 2:
rf = RandomForestClassifier(
n_estimators=n_estimators,
n_jobs=n_jobs,
verbose=verbose,
random_state=random_state,
)
else:
rf = RandomForestRegressor(
n_estimators=n_estimators,
n_jobs=n_jobs,
verbose=verbose,
random_state=random_state,
)
boruta_selector = BorutaPy(
rf,
verbose=verbose,
n_estimators="auto",
random_state=random_state,
max_iter=max_iter,
)
if isinstance(x_train, np.ndarray):
boruta_selector.fit(x_train, y)
else:
boruta_selector.fit(x_train.values, y.values)
include = []
for r in list(boruta_selector.ranking_):
if r == 1:
include.append("yes")
elif r == 2:
include.append("maybe")
else:
include.append("no")
return include


@BEARTYPE
def rfe_scores(
x_train: pd.DataFrame,
@@ -638,7 +521,7 @@ def rfe_scores(
multi_class="ovr",
)
else:
estimator = LinearRegression(normalize=True, n_jobs=n_jobs)
estimator = LinearRegression(n_jobs=n_jobs)
rfecv_selector = RFECV(estimator, step=step, cv=cv, verbose=verbose, n_jobs=n_jobs)
rfecv_selector.fit(x_train, y)
return ["yes" if r == 1 else "no" for r in list(rfecv_selector.ranking_)]
4 changes: 2 additions & 2 deletions tests/classifiers/test_classifiers.py
Original file line number Diff line number Diff line change
@@ -37,8 +37,8 @@ def my_df():
(LogisticRegressionClassifier, EmbeddingsEncoder, 0.5),
(RFClassifier, LabelEncoderPM, 1.0),
(RFClassifier, EmbeddingsEncoder, 1.0),
(TpotClassifier, LabelEncoderPM, 0.77),
(TpotClassifier, EmbeddingsEncoder, 0.86),
(TpotClassifier, LabelEncoderPM, 0.8),
(TpotClassifier, EmbeddingsEncoder, 0.81),
],
)
def test_classifiers(my_df, combo):
96 changes: 1 addition & 95 deletions tests/kve/test_kve.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
import pandas as pd
import pytest

from pymasq.kve import random_forest_scores, boruta_scores, rfe_scores, stepwise_scores
from pymasq.kve import random_forest_scores, rfe_scores, stepwise_scores
from pymasq.datasets import gen_num_df, gen_bin_df, load_census
from pymasq.preprocessing import EmbeddingsEncoder
from pymasq import ROOT_DIR
@@ -71,28 +71,6 @@ def test_random_forest_cont(my_df):
)
assert len(rf[1]) > 0, "Should be True"


def test_boruta_cont(my_df):
"""
Tests boruta_scores if passed a continuous variable for y
"""
sensitive_col = "age"
my_df = EmbeddingsEncoder.encode(
my_df,
sensitive_col=sensitive_col,
cache_location=ROOT_DIR + "/datasets/data/cache",
)
rf = boruta_scores(
x_train=my_df.drop(sensitive_col, axis=1),
y=my_df[sensitive_col],
verbose=0,
categories=-1,
max_iter=5,
n_estimators=20,
)
assert len(rf[1]) > 0, "Should be True"


def test_rfe_cont(my_df):
"""
Tests rfe_scores if passed a continuous variable for y
@@ -134,29 +112,6 @@ def test_random_forest_multiclass(my_df):
assert len(rf[1]) > 0, "Should be True"


def test_boruta_multiclass(my_df):
"""
Tests boruta_scores if passed a variable with number of categories > 2 for y
"""
sensitive_col = "education"
my_df = EmbeddingsEncoder.encode(
my_df,
sensitive_col=sensitive_col,
cache_location=ROOT_DIR + "/datasets/data/cache",
)
y = my_df[sensitive_col]
n_cats = len(y.dropna().unique())
rf = boruta_scores(
x_train=my_df.drop(sensitive_col, axis=1),
y=y,
verbose=0,
categories=n_cats,
max_iter=5,
n_estimators=20,
)
assert len(rf[1]) > 0, "Should be True"


def test_rfe_multiclass(my_df):
"""
Tests rfe_scores if passed a variable with number of categories > 2 for y
@@ -194,20 +149,6 @@ def test_random_forest_bin(bin_df):
), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]"


def test_boruta_bin(bin_df):
"""
Tests boruta_scores feature importance ranks for a binary dataframe
of a given size.
"""
y = bin_df["Label"]
n_cats = len(y.dropna().unique())
assert boruta_scores(
x_train=bin_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats
) == ["yes"] * 5 + [
"maybe"
], "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'maybe']"


def test_rfe_bin(bin_df):
"""
Tests rfe_scores feature importance ranks for a binary dataframe
@@ -241,24 +182,6 @@ def test_random_forest_num(num_df):
), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']]"


def test_boruta_num(num_df):
"""
Tests boruta_scores feature importance ranks for a numeric dataframe
of a given size.
"""
y = num_df["Label"]
n_cats = len(y.dropna().unique())
assert (
boruta_scores(
x_train=num_df.drop("Label", axis=1),
y=y,
verbose=0,
categories=n_cats,
)
== ["yes"] * 6
), "Should be ['yes', 'yes', 'yes', 'yes', 'yes', 'yes']"


def test_rfe_num(num_df):
"""
Tests rfe_scores feature importance ranks for a numeric dataframe
@@ -277,23 +200,6 @@ def test_rfe_num(num_df):
), "Should be ['yes', 'no', 'no', 'no', 'no', 'no']"


def test_boruta_comb(comb_df):
"""
Tests boruta_scores feature importance ranks for a combined dataframe
of a given size.
"""
if comb_df.shape[0] <= 2000:
assert True
y = comb_df["Label"]
n_cats = len(y.dropna().unique())
scores = boruta_scores(
x_train=comb_df.drop("Label", axis=1), y=y, verbose=0, categories=n_cats
)
assert (
scores == ["yes"] * 5 + ["maybe"] + ["yes"] * 6
), "One 'maybe' at index 5, otherwise all 'yes"


def test_random_forest_comb(comb_df):
"""
Tests random_forest_scores feature importance ranks for a combined dataframe