Merge pull request #33 from satra/master

satra · web-flow · commit 5833902aa4a6 · 2020-12-07T21:02:57.000-05:00
enh: add trained model saving
diff --git a/README.md b/README.md
@@ -171,6 +171,19 @@ Each model contains:
         amount of predictions and F the different SHAP values for each feature.
         `shaps` is empty if `gen_shap` is set to `false` or if `permute` is set
         to true.
+        - `model`: A pickled version of the model trained on all the input data.
+        One can use this model to predict on new data that has the same number
+        and order of features as the data used for training. For example:
+        ```python
+        import pickle as pk
+        import numpy as np
+        with open("results-20201208T010313.229190.pkl", "rb") as fp:
+            data = pk.load(fp)
+        trained_model = data[0][1].output.model
+        trained_model.predict(np.random.rand(1, 30))
+        ```
+        Check `data[N][0]["ml_wf.permute"]` first: if it is true, the model in
+        `data[N]` was trained on permuted labels and should not be used.
 - One figure per metric with performance distribution across splits (with or
 without null distribution trained on permuted labels)
 - One figure per any metric with the word `score` in it reporting the results of
@@ -202,7 +215,7 @@ The actual numeric values are stored in a correspondingly named pkl file.
 ## Debugging
 
 You will need to understand a bit of pydra to know how to debug this application for
-now. If the process crashes, the easiest way to restart is to remove the `cache-wf` 
+now. If the process crashes, the easiest way to restart is to remove the `cache-wf`
 folder first. However, if you are rerunning, you could also remove any `.lock` file
 in the `cache-wf`directory.
 
diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py
@@ -5,7 +5,14 @@
 from pydra.utils.messenger import AuditFlag, FileMessenger
 import typing as ty
 import os
-from .tasks import read_file, gen_splits, train_test_kernel, calc_metric, get_shap
+from .tasks import (
+    read_file,
+    gen_splits,
+    train_test_kernel,
+    calc_metric,
+    get_shap,
+    create_model,
+)
 from .report import gen_report
 
 # Create pydra tasks
@@ -36,6 +43,10 @@
 
 get_shap_pdt = task(annotate({"return": {"shaps": ty.Any}})(get_shap))
 
+create_model_pdt = task(
+    annotate({"return": {"output": ty.Any, "model": ty.Any}})(create_model)
+)
+
 
 def gen_workflow(inputs, cache_dir=None, cache_locations=None):
     wf = pydra.Workflow(
@@ -98,12 +109,22 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
         )
     )
     wf.shap.combine("fit_clf.split_index")
+    wf.add(
+        create_model_pdt(
+            name="create_model",
+            X=wf.readcsv.lzout.X,
+            y=wf.readcsv.lzout.Y,
+            clf_info=wf.lzin.clf_info,
+            permute=wf.lzin.permute,
+        )
+    )
     wf.set_output(
         [
             ("output", wf.metric.lzout.output),
             ("score", wf.metric.lzout.score),
             ("shaps", wf.shap.lzout.shaps),
             ("feature_names", wf.readcsv.lzout.feature_names),
+            ("model", wf.create_model.lzout.model),
         ]
     )
     return wf
diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py
@@ -134,3 +134,49 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
     explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5))
     shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg)
     return shaps
+
+
+def create_model(X, y, clf_info, permute):
+    """Train a model on all of the data (no train/test split).
+
+    :param X: Input features
+    :param y: Target variables (array-like; flattened to 1-D before fitting)
+    :param clf_info: ``[module, name, params, param_grid]`` (the last two are
+        optional), or a list of such lists to build a multi-step pipeline
+    :param permute: whether to fit on randomly permuted targets
+    :return: ``((y, predicted), pipe)``: the unpermuted targets paired with
+        the predictions on ``X``, and the fitted pipeline
+    """
+    from sklearn.pipeline import Pipeline
+    import numpy as np
+
+    def to_instance(clf_info):
+        # Import the estimator by module/attribute name and instantiate it.
+        mod = __import__(clf_info[0], fromlist=[clf_info[1]])
+        params = clf_info[2] if len(clf_info) > 2 else {}
+        clf = getattr(mod, clf_info[1])(**params)
+        if len(clf_info) == 4:
+            from sklearn.model_selection import GridSearchCV
+
+            # A fourth entry is the parameter grid to search over.
+            clf = GridSearchCV(clf, param_grid=clf_info[3])
+        return clf
+
+    if isinstance(clf_info[0], list):
+        # Process as a pipeline constructor; each step is named by val[1].
+        pipe = Pipeline([(val[1], to_instance(val)) for val in clf_info])
+    else:
+        clf = to_instance(clf_info)
+        from sklearn.preprocessing import StandardScaler
+
+        # A single estimator is preceded by a standardization step.
+        pipe = Pipeline([("std", StandardScaler()), (clf_info[1], clf)])
+
+    # np.asarray accepts lists and other array-likes, not only ndarrays.
+    y = np.asarray(y).ravel()
+    # In permuted mode the pipeline is fit against shuffled targets.
+    y_fit = y[np.random.permutation(len(y))] if permute else y
+    pipe.fit(X, y_fit)
+    # Predictions are made on the training data itself, not a held-out set.
+    predicted = pipe.predict(X)
+    return (y, predicted), pipe
diff --git a/pydra_ml/tests/test_classifier.py b/pydra_ml/tests/test_classifier.py
@@ -1,5 +1,6 @@
 import os
 from ..classifier import gen_workflow, run_workflow
+import numpy as np
 
 
 def test_classifier(tmpdir):
@@ -32,6 +33,8 @@ def test_classifier(tmpdir):
     assert results[0][0]["ml_wf.clf_info"][1] == "MLPClassifier"
     assert results[0][0]["ml_wf.permute"]
     assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]
+    assert hasattr(results[2][1].output.model, "predict")
+    assert isinstance(results[2][1].output.model.predict(np.ones((1, 30))), np.ndarray)
 
 
 def test_regressor(tmpdir):
@@ -69,3 +72,5 @@ def test_regressor(tmpdir):
     assert results[0][0]["ml_wf.clf_info"][-1][1] == "MLPRegressor"
     assert results[0][0]["ml_wf.permute"]
     assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]
+    assert hasattr(results[2][1].output.model, "predict")
+    assert isinstance(results[2][1].output.model.predict(np.ones((1, 10))), np.ndarray)