diff --git a/DESCRIPTION b/DESCRIPTION index 712a72f..2c6bb2f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: xspliner Title: Assisted Model Building, using Surrogate Black-Box Models to Train Interpretable Spline Based Additive Models -Version: 0.0.2.9005 +Version: 0.0.3 Authors@R: c( person("Krystian", "Igras", email = "krystian8207@gmail.com", role = c("aut", "cre")), person("Przemyslaw", "Biecek", role = c("aut", "ths"))) @@ -20,7 +20,7 @@ Imports: stats, magrittr, purrr, tidyr, - pROC + pROC (>= 1.15.3) Suggests: ALEPlot, factorMerger, diff --git a/NEWS.md b/NEWS.md index 353fcae..e56a594 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# xspliner 0.0.3 +* Summary method extended with comparison statistics +* Extended methods for auto-extracting model metadata (lhs, response) +* Added comparison plot for factor responses +* Documentation extended with new examples +* Ability to specify glm options +* Added more informative progress messages + # xspliner 0.0.2 * Specify parameters hierarchy * Use of S3 diff --git a/R/methods-xspliner.R b/R/methods-xspliner.R index 013ffd2..9feb481 100644 --- a/R/methods-xspliner.R +++ b/R/methods-xspliner.R @@ -66,14 +66,18 @@ measure_diff_roc <- function(a, b, measure = max) { } fit_roc_diff <- function(surrogate_scores, original_scores, original_labels) { - is_score_surrogate <- all(surrogate_scores <= 1 && surrogate_scores >=0) - is_score_original <- all(original_scores <= 1 && original_scores >=0) + is_score_surrogate <- all(surrogate_scores <= 1) && all(surrogate_scores >=0) + is_score_original <- all(original_scores <= 1) && all(original_scores >=0) if (is_score_original && is_score_surrogate) { roc_surrogate <- pROC::roc(original_labels, surrogate_scores, direction="<") roc_original <- pROC::roc(original_labels, original_scores, direction="<") thresholds <- union(roc_surrogate$thresholds, roc_original$thresholds) - roc_surrogate_on_thresholds <- pROC::coords(roc_surrogate, x = thresholds, input = 
"threshold", ret = c("se", "1-sp")) - roc_original_on_thresholds <- pROC::coords(roc_original, x = thresholds, input = "threshold", ret = c("se", "1-sp")) + roc_surrogate_on_thresholds <- pROC::coords( + roc_surrogate, x = thresholds, input = "threshold", ret = c("se", "1-sp"), transpose = TRUE + ) + roc_original_on_thresholds <- pROC::coords( + roc_original, x = thresholds, input = "threshold", ret = c("se", "1-sp"), transpose = TRUE + ) list( max = measure_diff_roc(roc_surrogate_on_thresholds, roc_original_on_thresholds), mean = measure_diff_roc(roc_surrogate_on_thresholds, roc_original_on_thresholds, measure = mean) @@ -183,27 +187,27 @@ compare_summary <- function(surrogate, original, surrogate_pred_fun, original_pr #' summary(iris.xs, model = iris.rf, newdata = data) #' #' # Classification model -#' data <- droplevels(iris[51:150, ]) # selecting only two species data -#' iris.rf <- randomForest(Species ~ ., data = data) -#' iris.xs <- xspline(iris.rf) -#' -#' # Comparing summaries requires providing prediction function -#' # Prediction as probability for success -#' prob_rf <- function(object, newdata) predict(object, newdata = newdata, type = "prob")[, 2] -#' prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response") -#' summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(prob_xs, prob_rf)) -#' # Prediction as final category -#' response_rf <- function(object, newdata) predict(object, newdata = newdata) -#' response_xs <- function(object, newdata) { -#' y_levels <- levels(newdata[[environment(object)$response]]) -#' factor( -#' y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], -#' levels = y_levels -#' ) -#' } -#' response_rf(iris.rf, newdata = data) -#' response_xs(iris.xs, newdata = data) -#' summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(response_xs, response_rf)) +# data <- droplevels(iris[51:150, ]) # selecting only two species data +# iris.rf <- 
randomForest(Species ~ ., data = data) +# iris.xs <- xspline(iris.rf) +# +# # Comparing summaries requires providing prediction function +# # Prediction as probability for success +# prob_rf <- function(object, newdata) predict(object, newdata = newdata, type = "prob")[, 2] +# prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response") +# summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(prob_xs, prob_rf)) +# # Prediction as final category +# response_rf <- function(object, newdata) predict(object, newdata = newdata) +# response_xs <- function(object, newdata) { +# y_levels <- levels(newdata[[environment(object)$response]]) +# factor( +# y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], +# levels = y_levels +# ) +# } +# response_rf(iris.rf, newdata = data) +# response_xs(iris.xs, newdata = data) +# summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(response_xs, response_rf)) #' #' @export summary.xspliner <- function(object, predictor, ..., model = NULL, newdata = NULL, diff --git a/R/utils-formula-build.R b/R/utils-formula-build.R index 1b14c52..f900a0e 100644 --- a/R/utils-formula-build.R +++ b/R/utils-formula-build.R @@ -204,7 +204,7 @@ get_predictors_classes <- function(data) { try_get <- function(possible) { possible_response <- try(possible, silent = TRUE) if (!("try-error" %in% class(possible_response))) { - if (length(possible_response) == 0 || possible_response == "NULL") { + if (length(possible_response) == 0 || identical(possible_response, "NULL")) { NULL } else { possible_response diff --git a/cran-comments.md b/cran-comments.md index b4b4639..feff630 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,39 +1,42 @@ ## Test environments -* local Ubuntu 18.10, R 3.5.1 -* Rhub Ubuntu Linux 16.04 LTS, R-release, GCC +* local check + Ubuntu 18.10, R 3.6.1 * win-builder + R version 3.5.3 (2019-03-11) + R Under development (unstable) (2019-09-02 r77130) + 
R version 3.6.1 (2019-07-05) -## R CMD check results +## `R CMD check xspliner_0.0.3.tar.gz --as-cran` results ``` Status: OK ``` -## check_rhub() result +## win-builder result ``` -* checking CRAN incoming feasibility ... NOTE -Maintainer: ‘Krystian Igras ’ - -New submission - -Possibly mis-spelled words in DESCRIPTION: - interpretable (9:41) - Interpretable (3:15) - -* checking examples ... NOTE -Examples with CPU or elapsed time > 5s -user system elapsed -xspline 5.92 0.164 6.217 - -0 errors | 0 warnings | 2 notes +* using log directory 'd:/RCompile/CRANguest/R-oldrelease/xspliner.Rcheck' +* using R version 3.5.3 (2019-03-11) +* using platform: x86_64-w64-mingw32 (64-bit) +... +* DONE +Status: OK ``` -# win-builder result - ``` -* checking CRAN incoming feasibility ... NOTE -Maintainer: 'Krystian Igras ' +* using log directory 'd:/RCompile/CRANguest/R-devel/xspliner.Rcheck' +* using R Under development (unstable) (2019-09-02 r77130) +* using platform: x86_64-w64-mingw32 (64-bit) +... +* DONE +Status: OK +``` -Status: 1 NOTE +``` +* using log directory 'd:/RCompile/CRANguest/R-release/xspliner.Rcheck' +* using R version 3.6.1 (2019-07-05) +* using platform: x86_64-w64-mingw32 (64-bit) +... +* DONE +Status: OK ``` diff --git a/docs/articles/automation.html b/docs/articles/automation.html index dac9949..0afd4d3 100644 --- a/docs/articles/automation.html +++ b/docs/articles/automation.html @@ -88,7 +88,7 @@

Automate your work

Krystian Igras

-

2019-08-31

+

2019-09-05

@@ -106,16 +106,16 @@

Local transition and effect

In previous sections we learned how to specify parameters for the effect and transition of single variable. Let’s name them local parameters.

A quick example you can see below:

-
library(xspliner)
-library(randomForest)
-
-rf_iris <- randomForest(Petal.Width ~  Sepal.Length + Petal.Length + Species, data = iris)
-model_xs <- xspline(Petal.Width ~ 
-  Sepal.Length + 
-  xs(Petal.Length, effect = list(grid.resolution = 100), transition = list(bs = "cr")) + 
-  xf(Species, transition = list(stat = "loglikelihood", value = 4)),
-  model = rf_iris)
-summary(model_xs)
+
library(xspliner)
+library(randomForest)
+
+rf_iris <- randomForest(Petal.Width ~  Sepal.Length + Petal.Length + Species, data = iris)
+model_xs <- xspline(Petal.Width ~ 
+  Sepal.Length + 
+  xs(Petal.Length, effect = list(grid.resolution = 100), transition = list(bs = "cr")) + 
+  xf(Species, transition = list(stat = "loglikelihood", value = 4)),
+  model = rf_iris)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ Sepal.Length + xs(Petal.Length) + 
@@ -123,23 +123,23 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.66812 -0.07307 -0.02386 0.10341 0.45717 +## -0.67011 -0.06856 -0.02511 0.09891 0.45816 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.32737 0.21623 -6.139 7.56e-09 *** -## Sepal.Length 0.03222 0.03547 0.908 0.365 -## xs(Petal.Length) 1.85685 0.35456 5.237 5.63e-07 *** -## xf(Species)versicolor 0.08438 0.17014 0.496 0.621 -## xf(Species)virginica 0.39232 0.22838 1.718 0.088 . +## (Intercept) -1.24091 0.20204 -6.142 7.44e-09 *** +## Sepal.Length 0.02889 0.03555 0.813 0.418 +## xs(Petal.Length) 1.84981 0.34695 5.332 3.65e-07 *** +## xf(Species)versicolor -0.01324 0.18502 -0.072 0.943 +## xf(Species)virginica 0.30189 0.24099 1.253 0.212 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03096157) +## (Dispersion parameter for gaussian family taken to be 0.03078324) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4894 on 145 degrees of freedom -## AIC: -88.655 +## Residual deviance: 4.4636 on 145 degrees of freedom +## AIC: -89.521 ## ## Number of Fisher Scoring iterations: 2

When the black box model is based on higher amount of variables it can be problematic to specify local parameters for each predictor. Also formula becomes large and hard to read.

@@ -149,15 +149,15 @@

Global transition and effect

It’s more common that we use similar configuration for each variable, as it’s simple and allows us to build and experiment faster. To do this you can specify xs_opts and xf_opts of xspline function. Let’s name them global parameters.

Each of global parameters can specify effect and transition, that should be used for xs and xf transformations respectively. Then you just need to use base xs and xf symbols without parameters:

-
model_xs <- xspline(Petal.Width ~ 
-  Sepal.Length + 
-  xs(Petal.Length) + 
-  xf(Species),
-  model = rf_iris, 
-  xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
-  xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ 
+  Sepal.Length + 
+  xs(Petal.Length) + 
+  xf(Species),
+  model = rf_iris, 
+  xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
+  xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ Sepal.Length + xs(Petal.Length) + 
@@ -165,35 +165,35 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.66812 -0.07307 -0.02386 0.10341 0.45717 +## -0.67011 -0.06856 -0.02511 0.09891 0.45816 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.32737 0.21623 -6.139 7.56e-09 *** -## Sepal.Length 0.03222 0.03547 0.908 0.365 -## xs(Petal.Length) 1.85685 0.35456 5.237 5.63e-07 *** -## xf(Species)versicolor 0.08438 0.17014 0.496 0.621 -## xf(Species)virginica 0.39232 0.22838 1.718 0.088 . +## (Intercept) -1.24091 0.20204 -6.142 7.44e-09 *** +## Sepal.Length 0.02889 0.03555 0.813 0.418 +## xs(Petal.Length) 1.84981 0.34695 5.332 3.65e-07 *** +## xf(Species)versicolor -0.01324 0.18502 -0.072 0.943 +## xf(Species)virginica 0.30189 0.24099 1.253 0.212 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03096157) +## (Dispersion parameter for gaussian family taken to be 0.03078324) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4894 on 145 degrees of freedom -## AIC: -88.655 +## Residual deviance: 4.4636 on 145 degrees of freedom +## AIC: -89.521 ## ## Number of Fisher Scoring iterations: 2

But still you can specify local parameters that override the global ones.

-
model_xs <- xspline(Petal.Width ~ 
-  xs(Sepal.Length, transition = list(k = 10)) + 
-  xs(Petal.Length) + 
-  xf(Species),
-  model = rf_iris, 
-  xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
-  xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ 
+  xs(Sepal.Length, transition = list(k = 10)) + 
+  xs(Petal.Length) + 
+  xf(Species),
+  model = rf_iris, 
+  xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
+  xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -201,23 +201,23 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67121 -0.07495 -0.03046 0.09856 0.45944 +## -0.67309 -0.07436 -0.03025 0.09788 0.45994 ## ## Coefficients: -## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.46014 0.26089 -5.597 1.06e-07 *** -## xs(Sepal.Length) 0.31132 0.31424 0.991 0.3235 -## xs(Petal.Length) 1.80229 0.37818 4.766 4.53e-06 *** -## xf(Species)versicolor 0.09996 0.17451 0.573 0.5677 -## xf(Species)virginica 0.41648 0.23573 1.767 0.0794 . +## Estimate Std. Error t value Pr(>|t|) +## (Intercept) -1.368640 0.266276 -5.140 8.74e-07 *** +## xs(Sepal.Length) 0.278368 0.331361 0.840 0.402 +## xs(Petal.Length) 1.808687 0.374137 4.834 3.37e-06 *** +## xf(Species)versicolor 0.001331 0.192390 0.007 0.994 +## xf(Species)virginica 0.323242 0.251553 1.285 0.201 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03092842) +## (Dispersion parameter for gaussian family taken to be 0.03077369) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4846 on 145 degrees of freedom -## AIC: -88.816 +## Residual deviance: 4.4622 on 145 degrees of freedom +## AIC: -89.568 ## ## Number of Fisher Scoring iterations: 2

In this case the Sepal.Length variable will be transformed with a thin plate regression spline (bs = "tp" is the default for mgcv::s) with basis dimension equal to 10. At the same time Petal.Length will be transformed with cubic splines.

@@ -227,7 +227,7 @@

Default transition and effect

As you can see in project objects reference, you may find there xs_opts_default and xf_opts_default objects. These objects specify default parameters for xs and xf transformations.

- +
xs_opts_default
## $effect
 ## $effect$type
 ## [1] "pdp"
@@ -239,7 +239,7 @@ 

## ## $transition$monotonic ## [1] "not"

- +
xf_opts_default
## $effect
 ## $effect$type
 ## [1] "ice"
@@ -267,11 +267,11 @@ 

Transform each formula predictor

If you want to transform each predictor of specified formula and not using xs and xf variables you can omit it using consider parameter for xspline function.

Possible values are "specials" the default one and "all". For automatic transformation of each variable without specifying xs and xf symbols just set consider = "all" and pass standard formula into xspline function:

-
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
-  model = rf_iris,
-  consider = "all"
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
+  model = rf_iris,
+  consider = "all"
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -279,34 +279,34 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67513 -0.07534 -0.03094 0.09496 0.46835 +## -0.67517 -0.07722 -0.03076 0.09586 0.46877 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 *** -## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963 -## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 *** -## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926 -## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 . +## (Intercept) -1.36474 0.27156 -5.026 1.46e-06 *** +## xs(Sepal.Length) 0.31522 0.33594 0.938 0.350 +## xs(Petal.Length) 1.74404 0.36633 4.761 4.62e-06 *** +## xf(Species)versicolor 0.03733 0.18736 0.199 0.842 +## xf(Species)virginica 0.36985 0.24541 1.507 0.134 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03099558) +## (Dispersion parameter for gaussian family taken to be 0.03090884) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4944 on 145 degrees of freedom -## AIC: -88.49 +## Residual deviance: 4.4818 on 145 degrees of freedom +## AIC: -88.911 ## ## Number of Fisher Scoring iterations: 2

Then each predictor is transformed with xs and xf symbols and use of default parameters or global ones when specified.

xs is used for integer and numeric variables - xf for factors.

By default xspline function tries to extract the data from model (rf_model) call and xspline’s parent.frame() environment then uses it to determine predictor types. So to be sure that variable types are sourced correctly a good practice here is to add data parameter, storing black box’s training data.

-
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
-  model = rf_iris,
-  data = iris,
-  consider = "all"
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
+  model = rf_iris,
+  data = iris,
+  consider = "all"
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -314,23 +314,23 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67513 -0.07534 -0.03094 0.09496 0.46835 +## -0.67517 -0.07722 -0.03076 0.09586 0.46877 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 *** -## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963 -## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 *** -## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926 -## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 . +## (Intercept) -1.36474 0.27156 -5.026 1.46e-06 *** +## xs(Sepal.Length) 0.31522 0.33594 0.938 0.350 +## xs(Petal.Length) 1.74404 0.36633 4.761 4.62e-06 *** +## xf(Species)versicolor 0.03733 0.18736 0.199 0.842 +## xf(Species)virginica 0.36985 0.24541 1.507 0.134 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03099558) +## (Dispersion parameter for gaussian family taken to be 0.03090884) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4944 on 145 degrees of freedom -## AIC: -88.49 +## Residual deviance: 4.4818 on 145 degrees of freedom +## AIC: -88.911 ## ## Number of Fisher Scoring iterations: 2

@@ -340,13 +340,13 @@

In some cases you may want to transform only quantitative or qualitative predictors. Looking into default parameters xs_opts_default and xf_opts_default we may find alter parameter for transition.

The parameter is used to specify if predictor for which xs or xf was specified needs to be transformed or used as a bare variable in formula. You can specify it in the local or global transition parameter. In this case using the global one is more reasonable.

So, in order to transform only continuous variables just set alter = "always" (what is default) for xs_opts and alter = "never" for xf_opts:

-
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
-  model = rf_iris,
-  data = iris,
-  consider = "all",
-  xf_opts = list(transition = list(alter = "never"))
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+  model = rf_iris,
+  data = iris,
+  consider = "all",
+  xf_opts = list(transition = list(alter = "never"))
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -354,33 +354,33 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67513 -0.07534 -0.03094 0.09496 0.46835 +## -0.67517 -0.07722 -0.03076 0.09586 0.46877 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 *** -## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963 -## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 *** -## Speciesversicolor 0.1181 0.1716 0.688 0.4926 -## Speciesvirginica 0.4411 0.2321 1.901 0.0593 . +## (Intercept) -1.36474 0.27156 -5.026 1.46e-06 *** +## xs(Sepal.Length) 0.31522 0.33594 0.938 0.350 +## xs(Petal.Length) 1.74404 0.36633 4.761 4.62e-06 *** +## Speciesversicolor 0.03733 0.18736 0.199 0.842 +## Speciesvirginica 0.36985 0.24541 1.507 0.134 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03099558) +## (Dispersion parameter for gaussian family taken to be 0.03090884) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4944 on 145 degrees of freedom -## AIC: -88.49 +## Residual deviance: 4.4818 on 145 degrees of freedom +## AIC: -88.911 ## ## Number of Fisher Scoring iterations: 2

For transformation of factors only:

-
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
-  model = rf_iris,
-  data = iris,
-  consider = "all",
-  xs_opts = list(transition = list(alter = "never"))
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+  model = rf_iris,
+  data = iris,
+  consider = "all",
+  xs_opts = list(transition = list(alter = "never"))
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ Sepal.Length + Petal.Length + 
@@ -413,8 +413,8 @@ 

Model based dot formula

For many existing models in R we usually can specify “dot formulas”, when used predictors are sourced from provided data. xspliner can also handle the form. Let’s return here for iris random forest model.

-
model_xs <- xspline(Petal.Width ~ ., model = rf_iris)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ ., model = rf_iris)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -422,23 +422,23 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67513 -0.07534 -0.03094 0.09496 0.46835 +## -0.67517 -0.07722 -0.03076 0.09586 0.46877 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 *** -## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963 -## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 *** -## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926 -## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 . +## (Intercept) -1.36474 0.27156 -5.026 1.46e-06 *** +## xs(Sepal.Length) 0.31522 0.33594 0.938 0.350 +## xs(Petal.Length) 1.74404 0.36633 4.761 4.62e-06 *** +## xf(Species)versicolor 0.03733 0.18736 0.199 0.842 +## xf(Species)virginica 0.36985 0.24541 1.507 0.134 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03099558) +## (Dispersion parameter for gaussian family taken to be 0.03090884) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4944 on 145 degrees of freedom -## AIC: -88.49 +## Residual deviance: 4.4818 on 145 degrees of freedom +## AIC: -88.911 ## ## Number of Fisher Scoring iterations: 2

Good practice here is to provide data parameter as well to detect predictors classes, and model type (classification or regression).

@@ -450,11 +450,11 @@

  • xspline provided data parameter, excluding formula response
  • To assure correct predictors usage, you may also specify predictor names vector in predictors parameter, and data (optional) to assure source of variable classes:

    - +
    model_xs <- xspline(Petal.Width ~ ., 
    +                    model = rf_iris,
    +                    predictors = colnames(iris)[-c(2, 4)],
    +                    data = iris)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
    @@ -462,23 +462,23 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67513 -0.07534 -0.03094 0.09496 0.46835 +## -0.67517 -0.07722 -0.03076 0.09586 0.46877 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 *** -## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963 -## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 *** -## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926 -## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 . +## (Intercept) -1.36474 0.27156 -5.026 1.46e-06 *** +## xs(Sepal.Length) 0.31522 0.33594 0.938 0.350 +## xs(Petal.Length) 1.74404 0.36633 4.761 4.62e-06 *** +## xf(Species)versicolor 0.03733 0.18736 0.199 0.842 +## xf(Species)virginica 0.36985 0.24541 1.507 0.134 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03099558) +## (Dispersion parameter for gaussian family taken to be 0.03090884) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4944 on 145 degrees of freedom -## AIC: -88.49 +## Residual deviance: 4.4818 on 145 degrees of freedom +## AIC: -88.911 ## ## Number of Fisher Scoring iterations: 2

    In above examples each predictor is transformed by default. You can exclude needed, by specifying global alter = "never" parameters, or bare.

    @@ -490,8 +490,8 @@

    Omit formula

    As we could see in previous section, using dot formula the predictors are sourced from provided black box. Why cannot we fully extract formula from black box? We can.

    Let’s use previously built rf_iris model:

    -
    model_xs <- xspline(rf_iris)
    -summary(model_xs)
    +
    model_xs <- xspline(rf_iris)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
    @@ -499,23 +499,23 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67513 -0.07534 -0.03094 0.09496 0.46835 +## -0.67517 -0.07722 -0.03076 0.09586 0.46877 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 *** -## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963 -## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 *** -## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926 -## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 . +## (Intercept) -1.36474 0.27156 -5.026 1.46e-06 *** +## xs(Sepal.Length) 0.31522 0.33594 0.938 0.350 +## xs(Petal.Length) 1.74404 0.36633 4.761 4.62e-06 *** +## xf(Species)versicolor 0.03733 0.18736 0.199 0.842 +## xf(Species)virginica 0.36985 0.24541 1.507 0.134 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03099558) +## (Dispersion parameter for gaussian family taken to be 0.03090884) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.4944 on 145 degrees of freedom -## AIC: -88.49 +## Residual deviance: 4.4818 on 145 degrees of freedom +## AIC: -88.911 ## ## Number of Fisher Scoring iterations: 2

    Works! Can it be simpler? Actually not because of black box based transformation and theory, but we can provide some model based parameters upfront using DALEX’s explainer object (see next section).

    @@ -525,29 +525,29 @@

  • Building the black box with formula
  • Black box training data stores factors in long form (the factor is one column, not spread into binary wide form)
  • -

    Most of R based ML models satisfy the above condition. One of exceptions is XGBoost which in actual xspliner version needs to be handled in the more complex xspline call. You can see it in use cases

    +

    Most of R based ML models satisfy the above condition. One of exceptions is XGBoost which in actual xspliner version needs to be handled in the more complex xspline call. You can see it in use cases

    Excluding predictors from transformation

    For this example consider again Boston Housing Data from pdp package, and build simple svm model for predicting chas variable:

    -
    library(pdp)
    -library(e1071)
    -data(boston)
    -svm_boston <- svm(chas ~ cmedv + rad + lstat, data = boston, probability = TRUE)
    -str(boston[, "rad"])
    +
    library(pdp)
    +library(e1071)
    +data(boston)
    +svm_boston <- svm(chas ~ cmedv + rad + lstat, data = boston, probability = TRUE)
    +str(boston[, "rad"])
    ##  int [1:506] 1 2 2 3 3 3 5 5 5 5 ...
    -
    unique(boston$rad)
    +
    unique(boston$rad)
    ## [1]  1  2  3  5  4  8  6  7 24

    As we can see rad variable is integer and has only 9 unique values. As a result spline approximation may be misleading, and not possible to perform. We decide here to omit rad variable when performing transformation, nevertheless remaining predictors should be transformed.

    At first setup model based transformation:

    - +
    xs_model <- xspline(svm_boston)
    ## Error in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): A term has fewer unique covariate combinations than specified maximum degrees of freedom

    As we can see, the error was returned due to the form of rad variable.

    How to exclude rad from transformation? We can use xspline’s bare parameter, responsible for specifying predictor which shouldn’t be transformed.

    - +
    xs_model <- xspline(
    +  svm_boston,
    +  bare = "rad")
    +summary(xs_model)
    ## 
     ## Call:
     ## stats::glm(formula = chas ~ xs(cmedv) + rad + xs(lstat), family = family, 
    @@ -559,10 +559,10 @@ 

    ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) -## (Intercept) 1.027e+02 1.203e+02 0.853 0.39357 -## xs(cmedv) 9.507e+01 3.677e+01 2.585 0.00972 ** -## rad -8.456e-04 2.149e-02 -0.039 0.96861 -## xs(lstat) -1.402e+01 1.149e+02 -0.122 0.90292 +## (Intercept) 50.9339706 61.1731524 0.833 0.40506 +## xs(cmedv) 48.2617778 18.6661709 2.586 0.00972 ** +## rad -0.0008446 0.0214853 -0.039 0.96864 +## xs(lstat) -7.1136654 58.3310899 -0.122 0.90294 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## @@ -583,21 +583,21 @@

    Integration with DALEX

    As mentioned in the previous section, xspliner is integrated with DALEX package. The main function from the package explain returns useful black box data (such as bare black model or training data) that can be used by xspline function.

    Just check below example

    -
    library(DALEX)
    -rf_boston <- randomForest(lstat ~ cmedv + crim + chas, data = boston)
    -explainer <- explain(rf_boston, label = "boston")
    +
    library(DALEX)
    +rf_boston <- randomForest(lstat ~ cmedv + crim + chas, data = boston)
    +explainer <- explain(rf_boston, label = "boston")
    ## Preparation of a new explainer is initiated
     ##   -> model label       :  boston 
     ##   -> data              :  506  rows  4  cols ([33mextracted from the model[39m)
     ##   -> target variable   :  not specified! ([31mWARNING[39m)
     ##   -> predict function  :  yhat.randomForest  will be used ([33mdefault[39m)
    -##   -> predicted values  :  numerical, min =  5.975977 , mean =  12.65206 , max =  24.93637  
    +##   -> predicted values  :  numerical, min =  6.213462 , mean =  12.6528 , max =  24.49117  
     ##   -> residual function :  difference between y and yhat ([33mdefault[39m)
     ## [32mA new explainer has been created![39m
    - +
    model <- xspline(
    +  explainer
    +)
    +summary(model)
    ## 
     ## Call:
     ## stats::glm(formula = lstat ~ xs(cmedv) + xs(crim) + xf(chas), 
    @@ -605,22 +605,22 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -11.2276 -2.3748 -0.6923 1.6867 21.1417 +## -11.3904 -2.3979 -0.6708 1.7025 21.1816 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -15.95068 1.44893 -11.009 < 2e-16 *** -## xs(cmedv) 1.70141 0.07246 23.480 < 2e-16 *** -## xs(crim) 0.54515 0.14498 3.760 0.00019 *** -## xf(chas)1 1.42731 0.69468 2.055 0.04043 * +## (Intercept) -16.91139 1.61080 -10.499 < 2e-16 *** +## xs(cmedv) 1.72018 0.07356 23.386 < 2e-16 *** +## xs(crim) 0.60945 0.15948 3.821 0.000149 *** +## xf(chas)1 1.41359 0.69607 2.031 0.042802 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 15.41681) +## (Dispersion parameter for gaussian family taken to be 15.48482) ## ## Null deviance: 25752.4 on 505 degrees of freedom -## Residual deviance: 7739.2 on 502 degrees of freedom -## AIC: 2826.1 +## Residual deviance: 7773.4 on 502 degrees of freedom +## AIC: 2828.3 ## ## Number of Fisher Scoring iterations: 2

    You can provide your own xspline’s parameters that overwrite that sourced from explainer.

    diff --git a/docs/articles/cases.html b/docs/articles/cases.html index 28f7361..624af1f 100644 --- a/docs/articles/cases.html +++ b/docs/articles/cases.html @@ -88,7 +88,7 @@

    Use cases

    Krystian Igras

    -

    2019-08-31

    +

    2019-09-05

    @@ -100,12 +100,12 @@

    2019-08-31

    XGBoost case

    -
    library(xgboost)
    -library(xspliner)
    -library(breakDown)
    -HR <- HR_data
    -
    -str(HR_data)
    +
    library(xgboost)
    +library(xspliner)
    +library(breakDown)
    +HR <- HR_data
    +
    +str(HR_data)
    ## 'data.frame':    14999 obs. of  10 variables:
     ##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
     ##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
    @@ -117,17 +117,17 @@ 

    ## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ... ## $ sales : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ... ## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...

    -
    model_matrix_train <- model.matrix(left ~ . -1, HR)
    -data_train <- xgb.DMatrix(model_matrix_train, label = HR$left)
    -param <- list(max_depth = 2, objective = "binary:logistic")
    -
    -HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
    -model_xs <- xspline(HR_xgb_model, lhs = "left", response = "left", predictors = colnames(HR)[-7],
    -                 data = HR, form = "additive", family = "binomial", link = "logit",
    -                 bare = c("number_project", "time_spend_company", "Work_accident", "promotion_last_5years"),
    -                 xs_opts = list(effect = list(train = model_matrix_train)),
    -                 xf_opts = list(transition = list(alter = "never"))) 
    -summary(model_xs)
    +
    model_matrix_train <- model.matrix(left ~ . -1, HR)
    +data_train <- xgb.DMatrix(model_matrix_train, label = HR$left)
    +param <- list(max_depth = 2, objective = "binary:logistic")
    +
    +HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
    +model_xs <- xspline(HR_xgb_model, lhs = "left", response = "left", predictors = colnames(HR)[-7],
    +                 data = HR, form = "additive", family = "binomial", link = "logit",
    +                 bare = c("number_project", "time_spend_company", "Work_accident", "promotion_last_5years"),
    +                 xs_opts = list(effect = list(train = model_matrix_train)),
    +                 xf_opts = list(transition = list(alter = "never"))) 
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = left ~ xs(satisfaction_level) + xs(last_evaluation) + 
    @@ -174,19 +174,19 @@ 

    Partially constant approximation

    -
    library(DALEX)
    -library(caret)
    -library(xspliner)
    -data(apartments)
    -
    -set.seed(123)
    -variable <- "construction.year"
    -regr_rf <- train(m2.price ~ ., data = apartments, method = "rf", ntree = 100)
    -model_xs <- xspline(regr_rf, data = apartments, bare = c("floor", "no.rooms"),
    -                    xs_opts = list(transition = list(bs = "ps", fx = FALSE, k = 20, m = -1)))
    +
    library(DALEX)
    +library(caret)
    +library(xspliner)
    +data(apartments)
    +
    +set.seed(123)
    +variable <- "construction.year"
    +regr_rf <- train(m2.price ~ ., data = apartments, method = "rf", ntree = 100)
    +model_xs <- xspline(regr_rf, data = apartments, bare = c("floor", "no.rooms"),
    +                    xs_opts = list(transition = list(bs = "ps", fx = FALSE, k = 20, m = -1)))
    ## Warning: Column `orig` joining factors with different levels, coercing to
     ## character vector
    -
    plot(model_xs, "surface")
    +
    plot(model_xs, "surface")

    diff --git a/docs/articles/discrete.html b/docs/articles/discrete.html index 5891449..0e11903 100644 --- a/docs/articles/discrete.html +++ b/docs/articles/discrete.html @@ -88,7 +88,7 @@

    Classification and discrete predictors

    Krystian Igras

    -

    2019-08-31

    +

    2019-09-05

    @@ -103,8 +103,8 @@

    Qualitative predictors

    As before, let’s explain the approach based on a random forest model. For this case we use HR_data from the breakDown package.

    Let’s load the data

    - +
    HR_data <- breakDown::HR_data
    +str(HR_data)
    ## 'data.frame':    14999 obs. of  10 variables:
     ##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
     ##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
    @@ -117,8 +117,8 @@ 

    ## $ sales : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ... ## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...

    and build random forest in which we predict average_montly_hours based on last_evaluation, salary and satisfaction_level predictors.

    -
    library(randomForest)
    -model_rf <- randomForest(average_montly_hours ~ last_evaluation + salary + satisfaction_level, data = HR_data)
    +
    library(randomForest)
    +model_rf <- randomForest(average_montly_hours ~ last_evaluation + salary + satisfaction_level, data = HR_data)

    We’re going to make some transformation/simplification on salary variable. To do so, we need to transform formula passed into xspline function.

    Similarly to continuous variable it is enough to use xf symbol on salary variable, i.e. use formula:

    average_monthly_hours ~ last_evaluation + xf(salary) + satisfaction_level
    @@ -149,13 +149,13 @@

    In order to customize variable transition, just specified (inherited from above functions) parameters inside transition parameter of xf formula symbol. For example to use “fast-adaptive” method for groups merging with optimal partition at GIC statistics value of 4, we set:

    xf(salary, transition = list(method = "fast-adaptive", value = 4))

    In below example, we will transform salary predictor with cutting of GIC statistics at value = 2. As in continuous case we need to use the formula within xspline function:

    - +
    library(xspliner)
    +model_xs <- xspline(
    +  average_montly_hours ~ last_evaluation + xf(salary, transition = list(value = 2)) + satisfaction_level,
    +  model = model_rf
    +)
    +
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = average_montly_hours ~ last_evaluation + 
    @@ -183,13 +183,13 @@ 

    ## Number of Fisher Scoring iterations: 2

    Checking out the model summary, we can see that the “low” and “medium” values were merged into a single level (generating the “lowmedium” level).

    It can be also found by:

    -
    summary(model_xs, "salary")
    +
    summary(model_xs, "salary")
    ##     orig      pred
     ## 1   high      high
     ## 2    low lowmedium
     ## 3 medium lowmedium

    The graphical result is fully sourced from factorMerger. It is enough to run:

    -
    plot(model_xs, "salary")
    +
    plot(model_xs, "salary")

    @@ -199,17 +199,17 @@

    xspliner can work with classification problems as well. As the final GLM model can work only with binary classification, the only limit here is the number of levels for predicted value (equals to 2).

    Let’s check below example based on SVM algorithm (e1071::svm), and modified iris data.

    Preparing data (we drop “setosa” level on Species value):

    -
    iris_data <- droplevels(iris[iris$Species != "setosa", ])
    +
    iris_data <- droplevels(iris[iris$Species != "setosa", ])

    Building SVM:

    -
    library(e1071) 
    -library(xspliner)
    -model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    -                 data = iris_data, probability = TRUE)
    +
    library(e1071) 
    +library(xspliner)
    +model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    +                 data = iris_data, probability = TRUE)

    When the base model response variable is of class factor (or integer with two unique values) then xspliner automatically detects classification problem. To force specific model response distribution you can set family and link parameters. In this case we can use xspliner in standard way.

    As each predictor is continuous variable, let’s transform it with xs usage on standard parameters, and build the model:

    -
    model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    -                    model = model_svm)
    -summary(model_xs)
    +
    model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    +                    model = model_svm)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Species ~ xs(Sepal.Length) + xs(Sepal.Width) + 
    @@ -221,11 +221,11 @@ 

    ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) -## (Intercept) 0.7125 1.3240 0.538 0.59049 -## xs(Sepal.Length) 11.3979 16.4565 0.693 0.48856 -## xs(Sepal.Width) 7.6981 4.7353 1.626 0.10402 -## xs(Petal.Length) 2.9544 1.2031 2.456 0.01406 * -## xs(Petal.Width) 3.3565 1.2797 2.623 0.00872 ** +## (Intercept) 1.369 1.742 0.786 0.43189 +## xs(Sepal.Length) 11.796 17.032 0.693 0.48856 +## xs(Sepal.Width) 7.967 4.901 1.626 0.10402 +## xs(Petal.Length) 3.058 1.245 2.456 0.01406 * +## xs(Petal.Width) 3.474 1.324 2.623 0.00872 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## @@ -237,7 +237,7 @@

    ## ## Number of Fisher Scoring iterations: 8

    Simple plot for Petal.Width shows that approximation almost fully covers the PDP.

    -
    plot(model_xs, "Petal.Width")
    +
    plot(model_xs, "Petal.Width")

    diff --git a/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png index bf49c8c..dca4c32 100644 Binary files a/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png index 4aa6d9b..51a799d 100644 Binary files a/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/articles/extras.html b/docs/articles/extras.html index 5bcb8f6..b18e4bf 100644 --- a/docs/articles/extras.html +++ b/docs/articles/extras.html @@ -88,7 +88,7 @@

    Extra information about the package

    Krystian Igras

    -

    2019-08-31

    +

    2019-09-05

    @@ -110,24 +110,24 @@

  • “auto” compares the increasing and decreasing approximations and chooses the better one (based on the \(R^2\) statistic)
  • Let’s see below example:

    -
    library(randomForest)
    -library(pdp)
    -library(xspliner)
    -data(boston)
    -set.seed(123)
    -boston_rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    -model_xs <- xspline(
    -  cmedv ~
    -    xs(lstat, transition = list(k = 6), effect = list(type = "pdp", grid.resolution = 100)) +
    -    xs(ptratio, transition = list(k = 5), effect = list(type = "pdp", grid.resolution = 100)) +
    -    age,
    -  model = boston_rf,
    -  xs_opts = list(transition = list(monotonic = "auto"))
    -)
    -
    -plot(model_xs, "ptratio", plot_deriv = TRUE)
    +
    library(randomForest)
    +library(pdp)
    +library(xspliner)
    +data(boston)
    +set.seed(123)
    +boston_rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    +model_xs <- xspline(
    +  cmedv ~
    +    xs(lstat, transition = list(k = 6), effect = list(type = "pdp", grid.resolution = 100)) +
    +    xs(ptratio, transition = list(k = 5), effect = list(type = "pdp", grid.resolution = 100)) +
    +    age,
    +  model = boston_rf,
    +  xs_opts = list(transition = list(monotonic = "auto"))
    +)
    +
    +plot(model_xs, "ptratio", plot_deriv = TRUE)

    -
    plot(model_xs, "lstat", plot_deriv = TRUE)
    +
    plot(model_xs, "lstat", plot_deriv = TRUE)

    @@ -143,20 +143,20 @@

    compare_stat - function of lm class object. It defines statistic that should be used in decision between spline model and linear one. The function should have the attribute higher. When the attribute has "better" value then the model with higher statistic value is chosen.

    You can see the feature in above example:

    -
    set.seed(123)
    -boston_rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    -model_pdp_auto <- xspline(
    -  cmedv ~
    -    xs(lstat, transition = list(k = 6), effect = list(type = "pdp", grid.resolution = 60)) +
    -    xs(ptratio, transition = list(k = 4), effect = list(type = "pdp", grid.resolution = 40)) +
    -    age,
    -  model = boston_rf,
    -  xs_opts = list(transition = list(alter = "auto"))
    -)
    -
    -# aic statistic is used by default
    -
    -summary(model_pdp_auto)
    +
    set.seed(123)
    +boston_rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    +model_pdp_auto <- xspline(
    +  cmedv ~
    +    xs(lstat, transition = list(k = 6), effect = list(type = "pdp", grid.resolution = 60)) +
    +    xs(ptratio, transition = list(k = 4), effect = list(type = "pdp", grid.resolution = 40)) +
    +    age,
    +  model = boston_rf,
    +  xs_opts = list(transition = list(alter = "auto"))
    +)
    +
    +# aic statistic is used by default
    +
    +summary(model_pdp_auto)
    ## 
     ## Call:
     ## stats::glm(formula = cmedv ~ xs(lstat) + ptratio + age, family = family, 
    @@ -191,19 +191,19 @@ 

    Link parameters stores info about what function should be used to transform the response. The transformation is used in the final model fitting. The standard link is the identity (for gaussian distribution) - for binomial distribution logit is used.

    See more at ??stats::family.glm.

    xspline function allows you to decide which response should be used in the final model. Let’s check the example below in which poisson distribution with log link is used.

    -
    library(xspliner)
    -library(randomForest)
    -x <- rnorm(100)
    -z <- rnorm(100)
    -y <- rpois(100, exp(1 + x + z))
    -data <- data.frame(x, y, z)
    -model_rf <- randomForest(y ~ x + z, data = data)
    -model_xs_1 <- xspline(model_rf)
    -model_xs_2 <- xspline(model_rf, family = poisson(), link = "log")
    +
    library(xspliner)
    +library(randomForest)
    +x <- rnorm(100)
    +z <- rnorm(100)
    +y <- rpois(100, exp(1 + x + z))
    +data <- data.frame(x, y, z)
    +model_rf <- randomForest(y ~ x + z, data = data)
    +model_xs_1 <- xspline(model_rf)
    +model_xs_2 <- xspline(model_rf, family = poisson(), link = "log")

    Let’s compare two models by checking its AIC statistics:

    - +
    model_xs_1$aic
    ## [1] 672.5753
    - +
    model_xs_2$aic
    ## [1] 580.0274

    As we can see the second model is better.

    @@ -211,15 +211,15 @@

    Transformed response

    In some cases you may want to transform the model response with your own function. Let’s check the example below with a random forest model:

    -
    set.seed(123)
    -x <- rnorm(100, 10)
    -z <- rnorm(100, 10)
    -y <- x * z * rnorm(100, 1, 0.1)
    -data <- data.frame(x, z, y)
    -model_rf <- randomForest(log(y) ~ x + z, data = data)
    +
    set.seed(123)
    +x <- rnorm(100, 10)
    +z <- rnorm(100, 10)
    +y <- x * z * rnorm(100, 1, 0.1)
    +data <- data.frame(x, z, y)
    +model_rf <- randomForest(log(y) ~ x + z, data = data)

    In this case log transformation for y, removes interaction of x and z. In xspliner same transformation is used by default:

    -
    model_xs <- xspline(model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = log(y) ~ xs(x) + xs(z), family = family, 
    @@ -244,7 +244,7 @@ 

    ## AIC: -193.88 ## ## Number of Fisher Scoring iterations: 2

    -
    plot(model_xs, model = model_rf, data = data)
    +
    plot(model_xs, model = model_rf, data = data)

    @@ -253,14 +253,14 @@

    When interactions between predictors occur, black box models in fact perform much better than linear models. xspliner offers using formulas with variable interactions.

    You can do it in two possible forms.

    Lets start with creating data and building black box:

    -
    x <- rnorm(100)
    -z <- rnorm(100)
    -y <- x + x * z + z + rnorm(100, 0, 0.1)
    -data <- data.frame(x, y, z)
    -model_rf <- randomForest(y ~ x + z, data = data)
    +
    x <- rnorm(100)
    +z <- rnorm(100)
    +y <- x + x * z + z + rnorm(100, 0, 0.1)
    +data <- data.frame(x, y, z)
    +model_rf <- randomForest(y ~ x + z, data = data)

    The first option is specifying formula with * sign, as in standard linear models.

    -
    model_xs <- xspline(y ~ x * z, model = model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(y ~ x * z, model = model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = y ~ x * z, family = family, data = data)
    @@ -285,11 +285,11 @@ 

    ## AIC: -153.1 ## ## Number of Fisher Scoring iterations: 2

    -
    plot(model_xs, model = model_rf, data = data)
    +
    plot(model_xs, model = model_rf, data = data)

    The second one is adding form parameter equal to “multiplicative” in case of passing just the model or dot formula.

    -
    model_xs <- xspline(model_rf, form = "multiplicative")
    -summary(model_xs)
    +
    model_xs <- xspline(model_rf, form = "multiplicative")
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = y ~ xs(x) * xs(z), family = family, data = data)
    @@ -314,10 +314,10 @@ 

    ## AIC: 94.46 ## ## Number of Fisher Scoring iterations: 2

    -
    plot(model_xs, model = model_rf, data = data)
    +
    plot(model_xs, model = model_rf, data = data)

    -
    model_xs <- xspline(y ~ ., model = model_rf, form = "multiplicative")
    -summary(model_xs)
    +
    model_xs <- xspline(y ~ ., model = model_rf, form = "multiplicative")
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = y ~ xs(x) * xs(z), family = family, data = data)
    @@ -342,23 +342,23 @@ 

    ## AIC: 94.46 ## ## Number of Fisher Scoring iterations: 2

    -
    plot(model_xs, model = model_rf, data = data)
    +
    plot(model_xs, model = model_rf, data = data)

    Subset formula

    Every example we saw before used the same variables in the black box and the xspliner model. In fact this is not obligatory. How can this be used? For example, to build a simpler model based on a reduced set of predictors. Let’s see the example below:

    -
    library(randomForest)
    -library(xspliner)
    -data(airquality)
    -air <- na.omit(airquality)
    -model_rf <- randomForest(Ozone ~ ., data = air)
    -varImpPlot(model_rf)
    +
    library(randomForest)
    +library(xspliner)
    +data(airquality)
    +air <- na.omit(airquality)
    +model_rf <- randomForest(Ozone ~ ., data = air)
    +varImpPlot(model_rf)

    As we can see, the Wind and Temp variables are of the highest importance. Let’s build the xspliner model based on just these two variables.

    -
    model_xs <- xspline(Ozone ~ xs(Wind) + xs(Temp), model = model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(Ozone ~ xs(Wind) + xs(Temp), model = model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Ozone ~ xs(Wind) + xs(Temp), family = family, 
    @@ -383,11 +383,11 @@ 

    ## AIC: 960.62 ## ## Number of Fisher Scoring iterations: 2

    -
    plot(model_xs, model = model_rf, data = air)
    +
    plot(model_xs, model = model_rf, data = air)

    Or model including variables interaction:

    -
    model_xs <- xspline(Ozone ~ xs(Wind) * xs(Temp), model = model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(Ozone ~ xs(Wind) * xs(Temp), model = model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Ozone ~ xs(Wind) * xs(Temp), family = family, 
    @@ -413,7 +413,7 @@ 

    ## AIC: 953.43 ## ## Number of Fisher Scoring iterations: 2

    -
    plot(model_xs, model = model_rf, data = air)
    +
    plot(model_xs, model = model_rf, data = air)

    diff --git a/docs/articles/graphics.html b/docs/articles/graphics.html index a666aeb..a1edcb8 100644 --- a/docs/articles/graphics.html +++ b/docs/articles/graphics.html @@ -88,7 +88,7 @@

    Graphics

    Krystian Igras

    -

    2019-08-31

    +

    2019-09-05

    @@ -104,9 +104,9 @@

    As you actually saw in the previous sections, the model transformation can be presented on simple graphics. In continuous predictor case, the transformation is plotted as estimated spline - in case of discrete predictor, factorMerger plot is used.

    For quantitative predictors some additional options are offered. Let’s check them on the below examples.

    Let’s use already known Boston Housing Data, and a random forest model.

    -
    library(pdp)
    -data(boston)
    -str(boston)
    +
    library(pdp)
    +data(boston)
    +str(boston)
    ## 'data.frame':    506 obs. of  16 variables:
     ##  $ lon    : num  -71 -71 -70.9 -70.9 -70.9 ...
     ##  $ lat    : num  42.3 42.3 42.3 42.3 42.3 ...
    @@ -124,29 +124,29 @@ 

    ## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ... ## $ b : num 397 397 393 395 397 ... ## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...

    -
    data(boston)
    -set.seed(123)
    -
    -library(randomForest)
    -boston.rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    +
    data(boston)
    +set.seed(123)
    +
    +library(randomForest)
    +boston.rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)

    Also let’s build xspliner model with lstat and ptratio transformation.

    -
    library(xspliner)
    -model_xs <- xspline(cmedv ~ xs(lstat) + xs(ptratio) + age, model = boston.rf)
    +
    library(xspliner)
    +model_xs <- xspline(cmedv ~ xs(lstat) + xs(ptratio) + age, model = boston.rf)

    In order to plot specified predictor transformation just use:

    -
    plot(model_xs, "lstat")
    +
    plot(model_xs, "lstat")

    You can also add some additional information for the plot.

    Plotting training data

    Just add two parameters: data = <training data> and flag plot_data = TRUE.

    -
    plot(model_xs, "lstat", data = boston, plot_data = TRUE)
    +
    plot(model_xs, "lstat", data = boston, plot_data = TRUE)

    Plotting approximation derivative

    The derivative is plotted on scale of the second axis. To display it just add: plot_deriv = TRUE.

    -
    plot(model_xs, "lstat", plot_deriv = TRUE)
    +
    plot(model_xs, "lstat", plot_deriv = TRUE)

    You can also specify if the model effect or transition should be displayed. Parameters responsible for that are plot_response and plot_approx respectively.

    For example below we display just the approximation and derivative for ptratio variable:

    -
    plot(model_xs, "ptratio", plot_response = FALSE, plot_deriv = TRUE)
    +
    plot(model_xs, "ptratio", plot_response = FALSE, plot_deriv = TRUE)

    @@ -154,52 +154,52 @@

    Models comparison

    As we may want to compare the final GLM model with its parent black box, xspliner provides one simple tool.

    For comparison just add model parameter for plot method, and provide data:

    -
    plot(model_xs, model = boston.rf, data = boston)
    +
    plot(model_xs, model = boston.rf, data = boston)

    The resulting graphics compares predicted values for both GLM and black box model.

    For predicting values standard predict method is used: function(object, newdata) predict(object, newdata). So for regression models the results are on the same scale.

    The notable difference occurs in the classification models. GLM models by default return “link” function values, so for classification it can be any real number. Contrary to that, randomForest function returns predicted levels.

    To avoid the problem, predictor_funs parameter was added. This is the list of prediction functions for each model (in order: black box, xspliner). Let’s see it on SVM example:

    -
    iris_data <- droplevels(iris[iris$Species != "setosa", ])
    -
    -library(e1071) 
    -library(xspliner)
    -model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    -                 data = iris_data, probability = TRUE)
    -
    -model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    -                    model = model_svm)
    +
    iris_data <- droplevels(iris[iris$Species != "setosa", ])
    +
    +library(e1071) 
    +library(xspliner)
    +model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    +                 data = iris_data, probability = TRUE)
    +
    +model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    +                    model = model_svm)

    Now we specify predict functions to return probability of virginica response.

    -
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")
    +
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    +prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")

    And plot the result

    -
    plot(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(prob_xs, prob_svm)
    -)  
    +
    plot(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(prob_xs, prob_svm)
    +)  

    It is also possible to sort the values of heatmap according to chosen model:

    -
    plot(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(prob_xs, prob_svm),
    -     sort_by = "svm"
    -)  
    +
    plot(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(prob_xs, prob_svm),
    +     sort_by = "svm"
    +)  

    In case of class predictions, let’s create class prediction function first:

    -
    class_svm <- function(object, newdata) predict(object, newdata = newdata)
    -response_levels <- levels(iris_data$Species)
    -class_xs <- function(object, newdata) {
    -  probs <- predict(object, newdata = newdata, type = "response")
    -  factor(ifelse(probs > 0.5, response_levels[2], response_levels[1]), levels = response_levels)
    -}
    +
    class_svm <- function(object, newdata) predict(object, newdata = newdata)
    +response_levels <- levels(iris_data$Species)
    +class_xs <- function(object, newdata) {
    +  probs <- predict(object, newdata = newdata, type = "response")
    +  factor(ifelse(probs > 0.5, response_levels[2], response_levels[1]), levels = response_levels)
    +}

    And plot the result:

    -
    plot(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(class_xs, class_svm)
    -)  
    +
    plot(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(class_xs, class_svm)
    +)  

    Sorting values according to specified model is also possible:

    -
    plot(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(class_xs, class_svm),
    -     sort_by = "svm"
    -)  
    +
    plot(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(class_xs, class_svm),
    +     sort_by = "svm"
    +)  

    @@ -207,23 +207,23 @@

    Following above approach it’s easy to generate similar graphics for higher amount of models.

    Just include additional models inside compare_with parameter (named list), and add corresponding predict functions to them to predictor_funs parameter (if omitted, the default one is used).

    See below example on airquality data

    -
    library(mgcv)
    -
    -data(airquality)
    -ozone <- subset(na.omit(airquality),
    -                select = c("Ozone", "Solar.R", "Wind", "Temp"))
    -set.seed(123)
    -
    -model_rf <- randomForest(Ozone ~ ., data = ozone)
    -model_xs <- xspline(Ozone ~ xs(Solar.R) + xs(Wind) + xs(Temp), model_rf, data = ozone)
    -model_glm <- glm(Ozone ~ ., data = ozone)
    -model_gam <- mgcv::gam(Ozone ~ s(Solar.R) + s(Wind) + s(Temp), data = ozone)
    -
    -plot(model_xs, 
    -     model = model_rf, 
    -     data = ozone, 
    -     compare_with = list(glm = model_glm, gam = model_gam),
    -     sort_by = "xspliner")
    +
    library(mgcv)
    +
    +data(airquality)
    +ozone <- subset(na.omit(airquality),
    +                select = c("Ozone", "Solar.R", "Wind", "Temp"))
    +set.seed(123)
    +
    +model_rf <- randomForest(Ozone ~ ., data = ozone)
    +model_xs <- xspline(Ozone ~ xs(Solar.R) + xs(Wind) + xs(Temp), model_rf, data = ozone)
    +model_glm <- glm(Ozone ~ ., data = ozone)
    +model_gam <- mgcv::gam(Ozone ~ s(Solar.R) + s(Wind) + s(Temp), data = ozone)
    +
    +plot(model_xs, 
    +     model = model_rf, 
    +     data = ozone, 
    +     compare_with = list(glm = model_glm, gam = model_gam),
    +     sort_by = "xspliner")

    @@ -231,13 +231,13 @@

    Plotting more transitions at once

    If you want to display many transitions on one plot just pass xspliner model to plot:

    -
    plot(model_xs)
    +
    plot(model_xs)

    For models trained on many predictors, plots for the first 6 transformed variables will be displayed. To change that value just set the n_plots parameter:

    -
    plot(model_xs, n_plots = 2)
    +
    plot(model_xs, n_plots = 2)

    You can select interesting variables to plot just passing predictor names in vector:

    -
    plot(model_xs, c("Wind", "Temp"))
    +
    plot(model_xs, c("Wind", "Temp"))

    diff --git a/docs/articles/methods.html b/docs/articles/methods.html index ab29d78..77c892d 100644 --- a/docs/articles/methods.html +++ b/docs/articles/methods.html @@ -88,7 +88,7 @@

    Methods and xspliner environment

    Krystian Igras

    -

    2019-08-31

    +

    2019-09-05

    @@ -101,42 +101,42 @@

    2019-08-31

    Predict

    As xspliner final model is GLM, predict method is just wrapper of stats::predict.glm function. Let’s see it on the below example:

    -
    library(xspliner)
    -library(randomForest)
    -library(magrittr)
    -
    -rf_iris <- randomForest(Petal.Width ~  Sepal.Length + Petal.Length + Species, data = iris)
    -model_xs <- xspline(Petal.Width ~ 
    -  Sepal.Length + 
    -  xs(Petal.Length, effect = list(grid.resolution = 100), transition = list(bs = "cr")) + 
    -  xf(Species, transition = list(stat = "loglikelihood", value = -300)),
    -  model = rf_iris)
    -newdata <- data.frame(
    -  Sepal.Length = 10, 
    -  Petal.Length = 2, 
    -  Species = factor("virginica", levels = levels(iris$Species)))
    -predict(model_xs, newdata = newdata)
    +
    library(xspliner)
    +library(randomForest)
    +library(magrittr)
    +
    +rf_iris <- randomForest(Petal.Width ~  Sepal.Length + Petal.Length + Species, data = iris)
    +model_xs <- xspline(Petal.Width ~ 
    +  Sepal.Length + 
    +  xs(Petal.Length, effect = list(grid.resolution = 100), transition = list(bs = "cr")) + 
    +  xf(Species, transition = list(stat = "loglikelihood", value = -300)),
    +  model = rf_iris)
    +newdata <- data.frame(
    +  Sepal.Length = 10, 
    +  Petal.Length = 2, 
    +  Species = factor("virginica", levels = levels(iris$Species)))
    +predict(model_xs, newdata = newdata)
    ##          1 
    -## -0.3811033
    +## -0.3923137

    Print

    Print method works similarly to the summary. In case of passing just the model, standard print.glm is used.

    -
    print(model_xs)
    +
    print(model_xs)
    ## 
     ## Call:  stats::glm(formula = Petal.Width ~ Sepal.Length + xs(Petal.Length) + 
     ##     xf(Species), family = family, data = data)
     ## 
     ## Coefficients:
     ##                    (Intercept)                    Sepal.Length  
    -##                       -1.96540                        -0.01363  
    +##                      -2.244882                       -0.009104  
     ##               xs(Petal.Length)  xf(Species)versicolorvirginica  
    -##                        3.13004                        -0.62584  
    +##                       3.340926                       -0.654606  
     ## 
     ## Degrees of Freedom: 149 Total (i.e. Null);  146 Residual
     ## Null Deviance:       86.57 
    -## Residual Deviance: 5.14  AIC: -70.36
    +## Residual Deviance: 5.172 AIC: -69.44

    Predictor based print

    @@ -144,7 +144,7 @@

    Standard usage print(xspliner_object, variable_name)

    Quantitative variable

    When predictor is the quantitative variable its transition is based on GAM model. For this case print uses standard print.gam method.

    -
    print(model_xs, "Petal.Length")
    +
    print(model_xs, "Petal.Length")
    ## 
     ## Family: gaussian 
     ## Link function: identity 
    @@ -153,21 +153,21 @@ 

    ## yhat ~ s(Petal.Length, bs = "cr") ## ## Estimated degrees of freedom: -## 8.82 total = 9.82 +## 8.83 total = 9.83 ## -## GCV score: 0.001429038

    +## GCV score: 0.001315672

    Qualitative variable

    In case of qualitative predictor, standard print.factorMerger method is used.

    -
    print(model_xs, "Species")
    +
    print(model_xs, "Species")
    ## Family: gaussian Factor Merger.
     ## 
     ## Factor levels were merged in the following order:
     ## 
     ##      groupA       groupB                     model   pvalVsFull   pvalVsPrevious
     ## ---  -----------  --------------------  ----------  -----------  ---------------
    -## 0                                        -269.6600            1                1
    -## 1    versicolor   virginica              -283.9595            0                0
    -## 2    setosa       versicolorvirginica    -354.5249            0                0
    +## 0 -257.6859 1 1 +## 1 versicolor virginica -273.8312 0 0 +## 2 setosa versicolorvirginica -352.9511 0 0

    @@ -184,31 +184,31 @@

    Extracting effect

    Each transition is built on top of the black box response data. For example the default response for quantitative variables is PDP - for qualitative ones ICE.

    In order to extract the effect, use the transition method with the type parameter equal to "data"

    -
    transition(model_xs, predictor = "Petal.Length", type = "data") %>% 
    -  head
    +
    transition(model_xs, predictor = "Petal.Length", type = "data") %>% 
    +  head
    ##   Petal.Length      yhat
    -## 1     1.000000 0.7207022
    -## 2     1.059596 0.7206924
    -## 3     1.119192 0.7206924
    -## 4     1.178788 0.7230503
    -## 5     1.238384 0.7248216
    -## 6     1.297980 0.7266156
    -
    transition(model_xs, predictor = "Species", type = "data") %>% 
    -  head
    +## 1 1.000000 0.7501526 +## 2 1.059596 0.7499859 +## 3 1.119192 0.7499859 +## 4 1.178788 0.7531005 +## 5 1.238384 0.7549714 +## 6 1.297980 0.7583468 +
    transition(model_xs, predictor = "Species", type = "data") %>% 
    +  head
    ##      Species      yhat yhat.id
    -## 1     setosa 0.2440234       1
    -## 2 versicolor 0.6806043       1
    -## 3  virginica 0.8353717       1
    -## 4     setosa 0.2169328       2
    -## 5 versicolor 0.6781563       2
    -## 6  virginica 0.8346654       2
    +## 1 setosa 0.2437544 1 +## 2 versicolor 0.6966708 1 +## 3 virginica 0.8499297 1 +## 4 setosa 0.2157778 2 +## 5 versicolor 0.6923198 2 +## 6 virginica 0.8465953 2

    Extracting transition model

    After the transition is built, a specific model is created based on the continuity of the variable. In the case of a quantitative predictor we build a GAM model in order to get a spline approximation of the effect. In the case of a qualitative predictor we build a factorMerger object and get the optimal factor division from it.

    To extract the model, use transition method with type = "base":

    -
    transition(model_xs, predictor = "Petal.Length", type = "base")
    +
    transition(model_xs, predictor = "Petal.Length", type = "base")
    ## 
     ## Family: gaussian 
     ## Link function: identity 
    @@ -217,31 +217,31 @@ 

    ## yhat ~ s(Petal.Length, bs = "cr") ## ## Estimated degrees of freedom: -## 8.82 total = 9.82 +## 8.83 total = 9.83 ## -## GCV score: 0.001429038

    -
    transition(model_xs, predictor = "Species", type = "base")
    +## GCV score: 0.001315672 +
    transition(model_xs, predictor = "Species", type = "base")
    ## Family: gaussian Factor Merger.
     ## 
     ## Factor levels were merged in the following order:
     ## 
     ##      groupA       groupB                     model   pvalVsFull   pvalVsPrevious
     ## ---  -----------  --------------------  ----------  -----------  ---------------
    -## 0                                        -269.6600            1                1
    -## 1    versicolor   virginica              -283.9595            0                0
    -## 2    setosa       versicolorvirginica    -354.5249            0                0
    +## 0 -257.6859 1 1 +## 1 versicolor virginica -273.8312 0 0 +## 2 setosa versicolorvirginica -352.9511 0 0

    Extracting transition function

    The final result of building transition is transformation function, that is used in the final GLM model estimation.

    To extract the function just use transition method with type = "function".

    -
    petal_length_xs <- transition(model_xs, predictor = "Petal.Length", type = "function")
    -x <- seq(1, 7, length.out = 50)
    -plot(x, petal_length_xs(x))
    +
    petal_length_xs <- transition(model_xs, predictor = "Petal.Length", type = "function")
    +x <- seq(1, 7, length.out = 50)
    +plot(x, petal_length_xs(x))

    -
    species_xf <- transition(model_xs, predictor = "Species", type = "function")
    -species_xf(c("setosa", "versicolor", "virginica"))
    +
    species_xf <- transition(model_xs, predictor = "Species", type = "function")
    +species_xf(c("setosa", "versicolor", "virginica"))
    ## [1] setosa              versicolorvirginica versicolorvirginica
     ## Levels: setosa versicolorvirginica
    @@ -254,7 +254,7 @@

    GLM summary

    The standard summary method is just a wrapper for summary.glm. In order to use it, just type:

    -
    summary(model_xs)
    +
    summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Petal.Width ~ Sepal.Length + xs(Petal.Length) + 
    @@ -262,22 +262,22 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.68602 -0.08184 -0.01356 0.10264 0.50215 +## -0.68163 -0.07833 -0.02037 0.10712 0.50143 ## ## Coefficients: -## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.96540 0.14013 -14.025 < 2e-16 *** -## Sepal.Length -0.01363 0.03631 -0.375 0.708 -## xs(Petal.Length) 3.13004 0.21589 14.498 < 2e-16 *** -## xf(Species)versicolorvirginica -0.62584 0.12133 -5.158 7.99e-07 *** +## Estimate Std. Error t value Pr(>|t|) +## (Intercept) -2.244882 0.146470 -15.327 < 2e-16 *** +## Sepal.Length -0.009104 0.036244 -0.251 0.802 +## xs(Petal.Length) 3.340926 0.231635 14.423 < 2e-16 *** +## xf(Species)versicolorvirginica -0.654606 0.123765 -5.289 4.41e-07 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03520518) +## (Dispersion parameter for gaussian family taken to be 0.03542157) ## -## Null deviance: 86.57 on 149 degrees of freedom -## Residual deviance: 5.14 on 146 degrees of freedom -## AIC: -70.357 +## Null deviance: 86.5699 on 149 degrees of freedom +## Residual deviance: 5.1715 on 146 degrees of freedom +## AIC: -69.438 ## ## Number of Fisher Scoring iterations: 2

    @@ -288,7 +288,7 @@

    Standard usage summary(xspliner_object, variable_name)

    Quantitative variable

    When predictor is quantitative variable its transition is based on GAM model. For this case summary displays summary of that model.

    -
    summary(model_xs, "Petal.Length")
    +
    summary(model_xs, "Petal.Length")
    ## 
     ## Family: gaussian 
     ## Link function: identity 
    @@ -298,21 +298,21 @@ 

    ## ## Parametric coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) 1.20102 0.00359 334.6 <2e-16 *** +## (Intercept) 1.207191 0.003444 350.5 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Approximate significance of smooth terms: ## edf Ref.df F p-value -## s(Petal.Length) 8.818 8.99 756.3 <2e-16 *** +## s(Petal.Length) 8.827 8.991 733.4 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## R-sq.(adj) = 0.986 Deviance explained = 98.7% -## GCV = 0.001429 Scale est. = 0.0012887 n = 100

    +## R-sq.(adj) = 0.985 Deviance explained = 98.7% +## GCV = 0.0013157 Scale est. = 0.0011864 n = 100

    Qualitative variable

    In the case of a qualitative predictor, the method displays a data.frame storing information on how the factor levels were merged during the transition.

    -
    summary(model_xs, "Species")
    +
    summary(model_xs, "Species")
    ##         orig                pred
     ## 1     setosa              setosa
     ## 2 versicolor versicolorvirginica
    @@ -324,15 +324,15 @@ 

    Providing model parameter instead of predictor, the summary displays a few statistics that compares original model with surrogate one. All statistics definitions are included in summary.xspline documentation.

    Here we show one example for classification model.

    For this example we use ISLR::Default data and build svm model as black box. The model aims to predict default variable, indicating whether the customer defaulted on their debt.

    -
    library(xspliner)
    -library(e1071)
    -set.seed(1)
    -data <- ISLR::Default
    -default.svm <- svm(default ~ ., data = data, probability = TRUE)
    -default.xs <- xspline(default ~ student + xs(balance) + xs(income), model = default.svm)
    +
    library(xspliner)
    +library(e1071)
    +set.seed(1)
    +data <- ISLR::Default
    +default.svm <- svm(default ~ ., data = data, probability = TRUE)
    +default.xs <- xspline(default ~ student + xs(balance) + xs(income), model = default.svm)

    In order to check the summary, we need to specify prediction functions for each model. In this case predictions are probabilities of success:

    -
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")
    +
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    +prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")

    Almost every summary statistic compares the models based on some data.

    In this case we’re going to compare models on training data providing:

      @@ -343,32 +343,22 @@

    • prediction_funs as a list of prediction functions (for surrogate and original model respectively)
    -
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(prob_xs, prob_svm))
    +
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(prob_xs, prob_svm))
    ## Models comparison 
     ##   1 - Max prediction normed-diff:  0.5268109 
     ##   R^2:  0.9185403
    ## Setting levels: control = No, case = Yes
     ## Setting levels: control = No, case = Yes
    -
    ## Warning in coords.roc(roc_surrogate, x = thresholds, input = "threshold", :
    -## An upcoming version of pROC will set the 'transpose' argument to FALSE
    -## by default. Set transpose = TRUE explicitly to keep the current behavior,
    -## or transpose = FALSE to adopt the new one and silence this warning. Type
    -## help(coords_transpose) for additional information.
    -
    ## Warning in coords.roc(roc_original, x = thresholds, input = "threshold", :
    -## An upcoming version of pROC will set the 'transpose' argument to FALSE
    -## by default. Set transpose = TRUE explicitly to keep the current behavior,
    -## or transpose = FALSE to adopt the new one and silence this warning. Type
    -## help(coords_transpose) for additional information.
    ##   1 - Max ROC diff:  0.8712113 
     ##   1 - Mean ROC diff:  0.9506292

    Another set of statistics is generated for prediction functions that return response levels.

    -
    response_svm <- function(object, newdata) predict(object, newdata = newdata)
    -response_xs <- function(object, newdata) {
    -  y_levels <- levels(newdata[[environment(object)$response]])
    -  factor(y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], levels = y_levels)
    -}
    +
    response_svm <- function(object, newdata) predict(object, newdata = newdata)
    +response_xs <- function(object, newdata) {
    +  y_levels <- levels(newdata[[environment(object)$response]])
    +  factor(y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], levels = y_levels)
    +}

    And similarly to previous example:

    -
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(response_xs, response_svm))
    +
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(response_xs, response_svm))
    ## Models comparison 
     ##   Mean predictions similarity:  0.9966 
     ##   ACC Black Box:  0.9719 
    diff --git a/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png b/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png
    index 55bc1ff..7d49c8d 100644
    Binary files a/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png and b/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png differ
    diff --git a/docs/articles/xspliner.html b/docs/articles/xspliner.html
    index af7af94..a6897eb 100644
    --- a/docs/articles/xspliner.html
    +++ b/docs/articles/xspliner.html
    @@ -88,7 +88,7 @@
           

    Basic theory and usage

    Krystian Igras

    -

    2019-08-31

    +

    2019-09-05

    @@ -150,29 +150,29 @@

    Defining formula

    This sections shows, how to build formula interpretable by xspliner package using Boston Housing Data from pdp package.

    Read the data

    - +
    data(boston)
    +str(boston)
    +#> 'data.frame':    506 obs. of  16 variables:
    +#>  $ lon    : num  -71 -71 -70.9 -70.9 -70.9 ...
    +#>  $ lat    : num  42.3 42.3 42.3 42.3 42.3 ...
    +#>  $ cmedv  : num  24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
    +#>  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
    +#>  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
    +#>  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
    +#>  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
    +#>  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
    +#>  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
    +#>  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
    +#>  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
    +#>  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
    +#>  $ tax    : int  296 242 242 222 222 222 311 311 311 311 ...
    +#>  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
    +#>  $ b      : num  397 397 393 395 397 ...
    +#>  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...

    We’re going to build model based on formula

    cmedv ~ rm + lstat + nox

    So let’s build a random forest black box model, that we use as a base for further steps:

    - +
    boston_rf <- randomForest(cmedv ~ rm + lstat + nox, data = boston)

    Now let’s specify which variables should be transformed. In this example only nox variable will use random forest effect (PDP or ALEPlot).

    To indicate transformation use xs(nos) symbol.

    So we have:

    @@ -201,17 +201,17 @@

    Below we will use PDP random forest effect, that returns 40 response points for nox variable. By checking ?pdp::partial we can see that grid.resolution parameter specifies predictor grid.

    Now we should just specify: xs(nox, effect = list(type = "pdp", grid.resolution = 40)).

    Let’s verify correctness of this parameters with the bare usage of pdp::partial:

    - +
    rf_effect <- pdp::partial(boston_rf, "nox", grid.resolution = 40)
    +head(rf_effect)
    +#>         nox     yhat
    +#> 1 0.3850000 23.38131
    +#> 2 0.3974615 23.40352
    +#> 3 0.4099231 23.49511
    +#> 4 0.4223846 23.62153
    +#> 5 0.4348462 23.60978
    +#> 6 0.4473077 23.69874
    +nrow(rf_effect)
    +#> [1] 40

    We got data.frame with predictor and response values containing 40 rows. So parameter should correctly specify our expectations.

    Remark

    Here we can see that response function is presented as data.frame with two columns:

    @@ -253,40 +253,40 @@

    Having the formula defined, we almost have all the required data provided to build GLM. The only one left, that the approach requires is the black box model that is the basis of our resulting model.

    We will use here model_rf random forest model that was built before.

    The final step is to use xspliner::xspline to build the desired model.

    -
    xp_model <- xspline(
    -  cmedv ~ rm + lstat +
    -    xs(nox, 
    -       effect = list(type = "pdp", grid.resolution = 40), 
    -       transition = list(k = 10, bs = "cr")),
    -  model = boston_rf
    -)
    +
    xp_model <- xspline(
    +  cmedv ~ rm + lstat +
    +    xs(nox, 
    +       effect = list(type = "pdp", grid.resolution = 40), 
    +       transition = list(k = 10, bs = "cr")),
    +  model = boston_rf
    +)

    Lets check the model summary:

    - +
    summary(xp_model)
    +#> 
    +#> Call:
    +#> stats::glm(formula = cmedv ~ rm + lstat + xs(nox), family = family, 
    +#>     data = data)
    +#> 
    +#> Deviance Residuals: 
    +#>      Min        1Q    Median        3Q       Max  
    +#> -13.4873   -3.2935   -0.8747    1.9991   26.9926  
    +#> 
    +#> Coefficients:
    +#>              Estimate Std. Error t value Pr(>|t|)    
    +#> (Intercept) -34.29942    5.05680  -6.783 3.32e-11 ***
    +#> rm            5.23681    0.41600  12.589  < 2e-16 ***
    +#> lstat        -0.52504    0.04355 -12.057  < 2e-16 ***
    +#> xs(nox)       1.31670    0.16255   8.100 4.20e-15 ***
    +#> ---
    +#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +#> 
    +#> (Dispersion parameter for gaussian family taken to be 26.81888)
    +#> 
    +#>     Null deviance: 42578  on 505  degrees of freedom
    +#> Residual deviance: 13463  on 502  degrees of freedom
    +#> AIC: 3106.2
    +#> 
    +#> Number of Fisher Scoring iterations: 2

    As the final model is just the GLM, the summary is also standard. One difference that we can see here is the formula call. It has missing xs parameters, and xs is treated as standard R function (previously it was just the symbol used in formula).

    More details about can be found in following sections:

      @@ -304,7 +304,7 @@

      Use cases - see xspliner in action with examples

    Here we shortly show how the plot random forest effect and transition for nox variable:

    -
    plot(xp_model, "nox")
    +
    plot(xp_model, "nox")

    diff --git a/docs/index.html b/docs/index.html index 00ca215..3c6c9a4 100644 --- a/docs/index.html +++ b/docs/index.html @@ -87,9 +87,9 @@
    -
    +

    -xspliner’s pipeline: model %>% xspline(…) and analyze

    +xspliner’s pipeline: model %>% xspline(…) and analyze

    diff --git a/docs/news/index.html b/docs/news/index.html index 8f127c1..0f524dc 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -115,9 +115,22 @@

    Change log

    -
    +

    -xspliner 0.0.2 2018-12-21 +xspliner 0.0.3 Unreleased +

    +
      +
    • Summary method extended with comparison statistics
    • +
    • Extended methods for auto-extracting model metadata (lhs, response)
    • +
    • Added comparison plot for factor responses
    • +
    • Documentation extended with new examples
    • +
    • Ability to specify glm options
    • +
    • Added more informative progress messages
    • +
    +
    +
    +

    +xspliner 0.0.2 2018-12-21

    • Specify parameters hierarchy
    • @@ -138,7 +151,8 @@

      Contents

    diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index f795d0b..b2fe7fe 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,4 +1,4 @@ -pandoc: 2.2.1 +pandoc: 2.7.3 pkgdown: 1.3.0 pkgdown_sha: ~ articles: diff --git a/docs/reference/build_xspliner.html b/docs/reference/build_xspliner.html index bffdbbd..c0b9fe2 100644 --- a/docs/reference/build_xspliner.html +++ b/docs/reference/build_xspliner.html @@ -117,7 +117,7 @@

    Helper function for building GLM object with transformed variables.

    build_xspliner(formula, model, data, xf_opts = xf_opts_default,
       xs_opts = xs_opts_default, link = "identity", family = "gaussian",
    -  env = parent.frame(), compare_stat = aic, control)
    + env = parent.frame(), compare_stat = aic, control, ...)

    Arguments

    @@ -165,6 +165,10 @@

    Ar

    + + + +
    control

    Fitting settings. See glm.control.

    ...

    Another parameters passed from chosen method. Not used.

    diff --git a/docs/reference/index.html b/docs/reference/index.html index e39a449..7381c47 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -112,7 +112,7 @@ diff --git a/docs/reference/summary.xspliner.html b/docs/reference/summary.xspliner.html index 47b213e..39db124 100644 --- a/docs/reference/summary.xspliner.html +++ b/docs/reference/summary.xspliner.html @@ -222,78 +222,8 @@

    Examp #> R^2: 0.9960708 #> MSE Black Box: 0.02251746 #> MSE Surrogate: 0.02989727
    -# Classification model -data <- droplevels(iris[51:150, ]) # selecting only two species data -iris.rf <- randomForest(Species ~ ., data = data) -iris.xs <- xspline(iris.rf) +# Classification model -# Comparing summaries requires providing prediction function -# Prediction as probability for success -prob_rf <- function(object, newdata) predict(object, newdata = newdata, type = "prob")[, 2] -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response") -summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(prob_xs, prob_rf))
    #> Models comparison -#> 1 - Max prediction normed-diff: 0.3908407 -#> R^2: 0.958412
    #> Setting levels: control = versicolor, case = virginica
    #> Setting levels: control = versicolor, case = virginica
    #> Warning: An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
    #> Warning: An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
    #> 1 - Max ROC diff: 0.56 -#> 1 - Mean ROC diff: 0.8443484
    # Prediction as final category -response_rf <- function(object, newdata) predict(object, newdata = newdata) -response_xs <- function(object, newdata) { - y_levels <- levels(newdata[[environment(object)$response]]) - factor( - y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], - levels = y_levels - ) -} -response_rf(iris.rf, newdata = data)
    #> 51 52 53 54 55 56 57 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 58 59 60 61 62 63 64 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 65 66 67 68 69 70 71 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 72 73 74 75 76 77 78 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 79 80 81 82 83 84 85 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 86 87 88 89 90 91 92 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 93 94 95 96 97 98 99 -#> versicolor versicolor versicolor versicolor versicolor versicolor versicolor -#> 100 101 102 103 104 105 106 -#> versicolor virginica virginica virginica virginica virginica virginica -#> 107 108 109 110 111 112 113 -#> virginica virginica virginica virginica virginica virginica virginica -#> 114 115 116 117 118 119 120 -#> virginica virginica virginica virginica virginica virginica virginica -#> 121 122 123 124 125 126 127 -#> virginica virginica virginica virginica virginica virginica virginica -#> 128 129 130 131 132 133 134 -#> virginica virginica virginica virginica virginica virginica virginica -#> 135 136 137 138 139 140 141 -#> virginica virginica virginica virginica virginica virginica virginica -#> 142 143 144 145 146 147 148 -#> virginica virginica virginica virginica virginica virginica virginica -#> 149 150 -#> virginica virginica -#> Levels: versicolor virginica
    response_xs(iris.xs, newdata = data)
    #> [1] versicolor versicolor versicolor versicolor versicolor versicolor -#> [7] versicolor versicolor versicolor versicolor versicolor versicolor -#> [13] versicolor versicolor versicolor versicolor versicolor versicolor -#> [19] versicolor versicolor versicolor versicolor versicolor versicolor -#> [25] versicolor versicolor versicolor virginica versicolor versicolor -#> [31] versicolor versicolor versicolor virginica versicolor versicolor -#> [37] versicolor versicolor versicolor versicolor versicolor versicolor -#> [43] versicolor versicolor versicolor versicolor versicolor versicolor -#> [49] versicolor versicolor virginica virginica virginica virginica -#> [55] virginica virginica virginica virginica virginica virginica -#> [61] virginica virginica virginica virginica virginica virginica -#> [67] virginica virginica virginica virginica virginica virginica -#> [73] virginica virginica virginica virginica virginica virginica -#> [79] virginica virginica virginica virginica virginica versicolor -#> [85] virginica virginica virginica virginica virginica virginica -#> [91] virginica virginica virginica virginica virginica virginica -#> [97] virginica virginica virginica virginica -#> Levels: versicolor virginica
    summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(response_xs, response_rf))
    #> Models comparison -#> Mean predictions similarity: 0.97 -#> ACC Black Box: 1 -#> ACC Surrogate: 0.97

    plot(xs_iris, "Sepal.Length")
    @@ -238,23 +238,23 @@

    Examp #> #> Deviance Residuals: #> Min 1Q Median 3Q Max -#> -0.67881 -0.07692 -0.03106 0.09672 0.46649 +#> -0.67489 -0.07877 -0.03048 0.09804 0.46780 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) -#> (Intercept) -1.52021 0.29110 -5.222 6.02e-07 *** -#> xs(Sepal.Length) 0.33583 0.35190 0.954 0.3415 -#> xs(Petal.Length) 1.84254 0.38360 4.803 3.85e-06 *** -#> xf(Species)versicolor 0.07022 0.17859 0.393 0.6947 -#> xf(Species)virginica 0.39507 0.23856 1.656 0.0999 . +#> (Intercept) -1.40810 0.30407 -4.631 8.03e-06 *** +#> xs(Sepal.Length) 0.39342 0.36819 1.069 0.2871 +#> xs(Petal.Length) 1.67365 0.35387 4.730 5.28e-06 *** +#> xf(Species)versicolor 0.08582 0.17800 0.482 0.6304 +#> xf(Species)virginica 0.41758 0.23705 1.762 0.0802 . #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> -#> (Dispersion parameter for gaussian family taken to be 0.03105335) +#> (Dispersion parameter for gaussian family taken to be 0.03102474) #> #> Null deviance: 86.5699 on 149 degrees of freedom -#> Residual deviance: 4.5027 on 145 degrees of freedom -#> AIC: -88.211 +#> Residual deviance: 4.4986 on 145 degrees of freedom +#> AIC: -88.349 #> #> Number of Fisher Scoring iterations: 2 #>

    plot(xs_iris, "Sepal.Length")
    @@ -269,7 +269,7 @@

    Examp #> -> data : 150 rows 4 cols (extracted from the model) #> -> target variable : not specified! (WARNING) #> -> predict function : yhat.randomForest will be used (default) -#> -> predicted values : numerical, min = 0.1991898 , mean = 1.199876 , max = 2.134114 +#> -> predicted values : numerical, min = 0.1977761 , mean = 1.199116 , max = 2.143874 #> -> residual function : difference between y and yhat (default) #> A new explainer has been created!

    xs_iris <- xspline(rf_iris) summary(xs_iris)
    #> @@ -279,23 +279,23 @@

    Examp #> #> Deviance Residuals: #> Min 1Q Median 3Q Max -#> -0.67881 -0.07692 -0.03106 0.09672 0.46649 +#> -0.67489 -0.07877 -0.03048 0.09804 0.46780 #> #> Coefficients: #> Estimate Std. Error t value Pr(>|t|) -#> (Intercept) -1.52021 0.29110 -5.222 6.02e-07 *** -#> xs(Sepal.Length) 0.33583 0.35190 0.954 0.3415 -#> xs(Petal.Length) 1.84254 0.38360 4.803 3.85e-06 *** -#> xf(Species)versicolor 0.07022 0.17859 0.393 0.6947 -#> xf(Species)virginica 0.39507 0.23856 1.656 0.0999 . +#> (Intercept) -1.40810 0.30407 -4.631 8.03e-06 *** +#> xs(Sepal.Length) 0.39342 0.36819 1.069 0.2871 +#> xs(Petal.Length) 1.67365 0.35387 4.730 5.28e-06 *** +#> xf(Species)versicolor 0.08582 0.17800 0.482 0.6304 +#> xf(Species)virginica 0.41758 0.23705 1.762 0.0802 . #> --- #> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 #> -#> (Dispersion parameter for gaussian family taken to be 0.03105335) +#> (Dispersion parameter for gaussian family taken to be 0.03102474) #> #> Null deviance: 86.5699 on 149 degrees of freedom -#> Residual deviance: 4.5027 on 145 degrees of freedom -#> AIC: -88.211 +#> Residual deviance: 4.4986 on 145 degrees of freedom +#> AIC: -88.349 #> #> Number of Fisher Scoring iterations: 2 #>

    plot(xs_iris, "Sepal.Length")
    diff --git a/man/summary.xspliner.Rd b/man/summary.xspliner.Rd index 5069391..e33a0a9 100644 --- a/man/summary.xspliner.Rd +++ b/man/summary.xspliner.Rd @@ -84,26 +84,5 @@ summary(iris.xs, "Species") summary(iris.xs, model = iris.rf, newdata = data) # Classification model -data <- droplevels(iris[51:150, ]) # selecting only two species data -iris.rf <- randomForest(Species ~ ., data = data) -iris.xs <- xspline(iris.rf) - -# Comparing summaries requires providing prediction function -# Prediction as probability for success -prob_rf <- function(object, newdata) predict(object, newdata = newdata, type = "prob")[, 2] -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response") -summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(prob_xs, prob_rf)) -# Prediction as final category -response_rf <- function(object, newdata) predict(object, newdata = newdata) -response_xs <- function(object, newdata) { - y_levels <- levels(newdata[[environment(object)$response]]) - factor( - y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], - levels = y_levels - ) -} -response_rf(iris.rf, newdata = data) -response_xs(iris.xs, newdata = data) -summary(iris.xs, model = iris.rf, newdata = data, prediction_funs = list(response_xs, response_rf)) } diff --git a/vignettes/automation.Rmd b/vignettes/automation.Rmd index ae692c3..b4424d8 100644 --- a/vignettes/automation.Rmd +++ b/vignettes/automation.Rmd @@ -206,7 +206,7 @@ They are: - Building the black box with formula - Black box training data stores factors in long form (the factor is one column, not spread into binary wide form) -Most of R based ML models satisfy the above condition. One of exceptions is XGBoost which in actual xspliner version needs to be handled in the more complex xspline call. You can see it in [use cases](./use_cases.html) +Most of R based ML models satisfy the above condition. 
One of the exceptions is XGBoost, which in the current xspliner version needs to be handled with a more complex xspline call. You can see it in [use cases](./cases.html) ## Excluding predictors from transformation