Skip to content

Commit 239fc42

Browse files
danielmlow and satra authored
cleaner v03: empirical p-value, performance table, README, feature_importance and permutation_importance (#43)
* empirical p-value, performance table, improved README, feature_importance, permutation_importance * added example images, removed example output directories * added example images, removed example output directories * Update README.md Co-authored-by: Satrajit Ghosh <[email protected]> * empirical p-value, performance table, README, feature_importance and permutation_importance * edited with pre-commit * removed clear_locks * added new feature importance arguments to tests/test_classifier.py. * Delete .Rhistory Co-authored-by: danielmlow <danielmlow@> Co-authored-by: Satrajit Ghosh <[email protected]>
1 parent 149b819 commit 239fc42

12 files changed

+536
-87
lines changed

README.md

+158-52
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,41 @@
1+
{"filename": "breast_cancer.csv",
2+
"x_indices": ["radius_mean", "texture_mean","perimeter_mean", "area_mean", "smoothness_mean",
3+
"compactness_mean", "concavity_mean", "concave points_mean",
4+
"symmetry_mean", "fractal_dimension_mean", "radius_se",
5+
"texture_se", "perimeter_se", "area_se", "smoothness_se",
6+
"compactness_se", "concavity_se", "concave points_se",
7+
"symmetry_se", "fractal_dimension_se", "radius_worst",
8+
"texture_worst", "perimeter_worst", "area_worst",
9+
"smoothness_worst", "compactness_worst", "concavity_worst",
10+
"concave points_worst", "symmetry_worst", "fractal_dimension_worst"],
11+
"target_vars": ["target"],
12+
"group_var": null,
13+
"n_splits": 100,
14+
"test_size": 0.2,
15+
"clf_info": [
16+
["sklearn.ensemble", "AdaBoostClassifier"],
17+
["sklearn.naive_bayes", "GaussianNB"],
18+
[ ["sklearn.impute", "SimpleImputer"],
19+
["sklearn.preprocessing", "StandardScaler"],
20+
["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]],
21+
["sklearn.ensemble", "RandomForestClassifier", {"n_estimators": 100}],
22+
["sklearn.ensemble", "ExtraTreesClassifier", {"n_estimators": 100, "class_weight": "balanced"}],
23+
["sklearn.linear_model", "LogisticRegressionCV", {"solver": "liblinear", "penalty": "l1"}],
24+
["sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}],
25+
["sklearn.svm", "SVC", {"probability": true},
26+
[{"kernel": ["rbf", "linear"], "C": [1, 10, 100, 1000]}]],
27+
["sklearn.neighbors", "KNeighborsClassifier", {},
28+
[{"n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19],
29+
"weights": ["uniform", "distance"]}]]
30+
],
31+
"permute": [true, false],
32+
"gen_feature_importance": false,
33+
"gen_permutation_importance": false,
34+
"permutation_importance_n_repeats": 5,
35+
"permutation_importance_scoring": "accuracy",
36+
"gen_shap": true,
37+
"nsamples": "auto",
38+
"l1_reg": "aic",
39+
"plot_top_n_shap": 16,
40+
"metrics": ["roc_auc_score", "f1_score", "precision_score", "recall_score"]
41+
}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
{"filename": "breast_cancer.csv",
2+
"x_indices": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
3+
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
4+
"target_vars": ["target"],
5+
"group_var": null,
6+
"n_splits": 3,
7+
"test_size": 0.2,
8+
"clf_info": [
9+
["sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}],
10+
[ ["sklearn.impute", "SimpleImputer"],
11+
["sklearn.preprocessing", "StandardScaler"],
12+
["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]
13+
]
14+
],
15+
"permute": [false, true],
16+
"gen_feature_importance": false,
17+
"gen_permutation_importance": false,
18+
"permutation_importance_n_repeats": 5,
19+
"permutation_importance_scoring": "accuracy",
20+
"gen_shap": true,
21+
"nsamples": 100,
22+
"l1_reg": "aic",
23+
"plot_top_n_shap": 16,
24+
"metrics": ["roc_auc_score", "accuracy_score"]
25+
}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
{"filename": "breast_cancer.csv",
2+
"x_indices": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
3+
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
4+
"target_vars": ["target"],
5+
"group_var": null,
6+
"n_splits": 3,
7+
"test_size": 0.2,
8+
"clf_info": [
9+
["sklearn.svm", "SVC", {"kernel": "linear"}],
10+
["sklearn.linear_model", "LogisticRegression", {"penalty": "l1", "solver":"liblinear"}]
11+
],
12+
"permute": [false, true],
13+
"gen_feature_importance": true,
14+
"gen_permutation_importance": true,
15+
"permutation_importance_n_repeats": 5,
16+
"permutation_importance_scoring": "accuracy",
17+
"gen_shap": true,
18+
"nsamples": 100,
19+
"l1_reg": "aic",
20+
"plot_top_n_shap": 16,
21+
"metrics": ["roc_auc_score", "precision_score", "recall_score"]
22+
}
+22
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
{"filename": "diabetes_table.csv",
2+
"x_indices": [0,1,2,3,4,5,6,7,8,9],
3+
"target_vars": ["target"],
4+
"group_var": null,
5+
"n_splits": 4,
6+
"test_size": 0.2,
7+
"clf_info": [
8+
["sklearn.linear_model","RidgeCV",{"fit_intercept": true,"normalize": true}],
9+
["sklearn.linear_model","LassoCV",{"fit_intercept": true,"normalize": true}],
10+
["sklearn.linear_model","ElasticNetCV",{"fit_intercept": true,"normalize": true}]
11+
],
12+
"permute": [true,false],
13+
"gen_feature_importance": false,
14+
"gen_permutation_importance": false,
15+
"permutation_importance_n_repeats": 5,
16+
"permutation_importance_scoring": null,
17+
"gen_shap": true,
18+
"nsamples": 100,
19+
"l1_reg": "aic",
20+
"plot_top_n_shap": 10,
21+
"metrics":["explained_variance_score","mean_squared_error","mean_absolute_error"]
22+
}

examples/shap_example.png

192 KB
Loading
56.6 KB
Loading
Loading

pydra_ml/classifier.py

+37
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
calc_metric,
1313
create_model,
1414
gen_splits,
15+
get_feature_importance,
16+
get_permutation_importance,
1517
get_shap,
1618
read_file,
1719
train_test_kernel,
@@ -43,6 +45,14 @@
4345
annotate({"return": {"score": ty.Any, "output": ty.Any}})(calc_metric)
4446
)
4547

48+
get_feature_importance_pdt = task(
49+
annotate({"return": {"feature_importance": ty.Any}})(get_feature_importance)
50+
)
51+
52+
get_permutation_importance_pdt = task(
53+
annotate({"return": {"permutation_importance": ty.Any}})(get_permutation_importance)
54+
)
55+
4656
get_shap_pdt = task(annotate({"return": {"shaps": ty.Any}})(get_shap))
4757

4858
create_model_pdt = task(
@@ -99,6 +109,28 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
99109
)
100110
)
101111
wf.metric.combine("fit_clf.split_index")
112+
wf.add(
113+
get_feature_importance_pdt(
114+
name="feature_importance",
115+
permute=wf.lzin.permute,
116+
model=wf.fit_clf.lzout.model,
117+
gen_feature_importance=wf.lzin.gen_feature_importance,
118+
)
119+
)
120+
wf.feature_importance.combine("fit_clf.split_index")
121+
wf.add(
122+
get_permutation_importance_pdt(
123+
name="permutation_importance",
124+
X=wf.readcsv.lzout.X,
125+
y=wf.readcsv.lzout.Y,
126+
permute=wf.lzin.permute,
127+
model=wf.fit_clf.lzout.model,
128+
permutation_importance_n_repeats=wf.lzin.permutation_importance_n_repeats,
129+
permutation_importance_scoring=wf.lzin.permutation_importance_scoring,
130+
gen_permutation_importance=wf.lzin.gen_permutation_importance,
131+
)
132+
)
133+
wf.permutation_importance.combine("fit_clf.split_index")
102134
wf.add(
103135
get_shap_pdt(
104136
name="shap",
@@ -124,6 +156,11 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
124156
[
125157
("output", wf.metric.lzout.output),
126158
("score", wf.metric.lzout.score),
159+
("feature_importance", wf.feature_importance.lzout.feature_importance),
160+
(
161+
"permutation_importance",
162+
wf.permutation_importance.lzout.permutation_importance,
163+
),
127164
("shaps", wf.shap.lzout.shaps),
128165
("feature_names", wf.readcsv.lzout.feature_names),
129166
("model", wf.create_model.lzout.model),

0 commit comments

Comments
 (0)