Merge pull request #9 from automl/w04_hop100
t04 chapter of w04_hpo lecture
jakob-r authored May 11, 2020
2 parents 0547be4 + 2d5dc38 commit 895ffc2
Showing 11 changed files with 98 additions and 72 deletions.
108 changes: 62 additions & 46 deletions w04_hpo_basics/code/benchmark.R
@@ -35,6 +35,7 @@ learner = lrn("classif.svm", predict_type = "prob", cost = 1, gamma = 1, type =
measure = msr("classif.acc")

n_evalss = c(25,100)

#define n tuners
tuner_terms = lapply(n_evalss, function(n_evals) {
tuners = tnrs(c("grid_search", "random_search"))
@@ -45,22 +46,22 @@ tuner_terms = lapply(n_evalss, function(n_evals) {

source(file = fs::path_join(c(mlr3tuningdir, "attic", "TunerECRSimpleEA.R")))
tuners$TunerECRSimpleEA = TunerECRSimpleEA$new()
tuners$TunerECRSimpleEA$param_set$values$sdev = 0.5

lapply(tuners, function(x) list(tuner = x, term = term("evals", n_evals = n_evals)))
})
tuner_terms = unlist(tuner_terms, recursive = FALSE)
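# tuner_terms is now a flat list with one (tuner, terminator) pair per
# tuner x budget combination (e.g. random_search with 25 and with 100 evals)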

#define m tasks
tasks = tsks(c("spam", "sonar")) #, "german_credit"))
tasks = c(tasks,
lapply(c(optdigits = 28, ionosphere = 59), function(id) {
tsk("oml", data_id = id)
}))
# tasks = tsks(c("spam", "sonar")) #, "german_credit"))
# tasks = c(tasks,
# lapply(c(optdigits = 28, ionosphere = 59), function(id) {
# tsk("oml", data_id = id)
# }))
tasks = tsks(c("spam"))

ps = ParamSet$new(params = list(
ParamDbl$new("cost", lower = -5, upper = 5),
ParamDbl$new("gamma", lower = -5, upper = 5)
ParamDbl$new("cost", lower = -15, upper = 15),
ParamDbl$new("gamma", lower = -15, upper = 15)
))
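# bounds are on the exponent scale; ps$trafo below maps a sampled x to 2^x
# (inferred from the [2^-15, 2^15] ranges stated in t04_practical.tex)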

ps$trafo = function(x, param_set) {
@@ -74,6 +75,8 @@ rsmp_outer = rsmp("cv", folds = 10)
#rsmp_outer = rsmp("cv", folds = 2)

learners = Map(function(ps, tuner_terms) {
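# encapsulate train/predict via the "evaluate" package so a failing SVM fit
# is caught instead of aborting the whole benchmark; the featureless learner
# then provides fallback predictions for failed resampling iterations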
learner$encapsulate = c(train = "evaluate", predict = "evaluate")
learner$fallback = lrn("classif.featureless")
learner = AutoTuner$new(learner = learner, resampling = rsmp_tuning, measures = measure, terminator = tuner_terms$term, search_space = ps, tuner = tuner_terms$tuner)
learner$id = paste0(learner$id, ".", tuner_terms[[1]]$term$param_set$values$n_evals)
return(learner)
@@ -131,6 +134,7 @@ if (!fs::file_exists("benchmark_res.rds")) {
batchMap(function(design, ...) {
#makes inner resampling folds the same if the outer resampling is the same?
set.seed(as.integer(substr(stri_replace_all_regex(design$resampling[[1]]$hash, "[a-z]", ""),0,9)))
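# %??% (mlr3misc) is a null-coalescing operator: use the configured fold
# count if present, otherwise fall back to 10 parallel cores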
options(mc.cores = design$resampling[[1]]$param_set$values$folds %??% 10)
future::plan(multiprocess)
res = benchmark(design = design, ...)
for (i in seq_row(res$data)) {
@@ -142,7 +146,7 @@ if (!fs::file_exists("benchmark_res.rds")) {
}, store_models = TRUE, design = split(design, seq_row(design)))

#testJob(1)
submitJobs(resources = list(ncpus = rsmp_outer$param_set$values$folds %??% 10))
submitJobs(resources = list(ncpus = rsmp_outer$param_set$values$folds %??% 10, memory = 16000))
waitForJobs()
res = reduceResultsList(findDone())
res_tune = res[[1]]
@@ -163,7 +167,8 @@ if (!fs::file_exists("benchmark_res.rds")) {

res = readRDS("benchmark_res.rds")

baseline_res = res$baseline$aggregate(measures = measure)
xx = res$baseline$aggregate(measures = measure)
res_baseline = map_dtr(xx$resample_result, function(x) x$score(measure))


#build dt for plotting
@@ -178,88 +183,94 @@ res_compl = unnest(res_compl, "opt_x", prefix = "opt.x.")
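# grid search evaluates configurations in an arbitrary deterministic order,
# so shuffle its evaluation order before computing best-so-far curves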
res_compl[tuner == "TunerGridSearch", nr := sample(x = nr), by = .(task_id, learner_id, uhash, iteration, budget)]
setkey(res_compl, task_id, learner_id, uhash, iteration, budget, nr)

res_compl[, classif.auc.cummax := cummax(classif.auc), by = .(task_id, learner_id, tuner, uhash, iteration, budget)]
res_compl[, classif.acc.cummax := cummax(classif.acc), by = .(task_id, learner_id, tuner, uhash, iteration, budget)]
res_compl[, tuner := stri_replace_first_fixed(tuner, "Tuner", "")]
res_compl = res_compl[nr <= budget,]

res_outer = res$tune$score(measures = measure)

#reduce size
rm(res)
#gc()


theme_set(theme_bw())

tuner_names = c("GridSearch", "RandomSearch", "CMAES", "Untuned", "Heuristic")
tuner_colors = set_names(RColorBrewer::brewer.pal(7, "Set1"), tuner_names)
EVAL_ITERS = 100
DATASET = "spam"
tuners_select = c("GridSearch", "RandomSearch", "CMAES", "(1+1)-EA" = "ECRSimpleEA", "Untuned", "Heuristic")
names(tuners_select) = ifelse(nzchar(names(tuners_select)), names(tuners_select), tuners_select)
tuner_colors = set_names(RColorBrewer::brewer.pal(length(tuners_select), "Set1"), names(tuners_select))

res_compl[, tuner := factor(tuner, levels = tuner_names)]
res_compl = res_compl[budget == 100 & task_id == "spam",]
# reduce data
res_compl[, tuner := factor(tuner, levels = tuners_select, labels = names(tuners_select))]
res_compl = res_compl[budget == EVAL_ITERS & task_id %in% DATASET,]

#tune curve for iter = 1
g = ggplot(res_compl[iteration == 1,], aes(y = classif.auc.cummax, x = nr, color = tuner))
g = ggplot(res_compl[iteration == 1,], aes(y = classif.acc.cummax, x = nr, color = tuner))
g = g + geom_line()
g = g + geom_point(aes(y = classif.auc), alpha = 0.1)
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.7, 1))
g = g + geom_point(aes(y = classif.acc), alpha = 0.1)
g = g + facet_wrap("task_id", scales = "free")
#g = g + coord_cartesian(ylim = c(0.7, 1))
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_iter_1.png", g, height = 5, width = 7)

#tune curve for all iters
g = ggplot(res_compl, aes(y = classif.auc.cummax, x = nr, color = tuner, group = paste0(tuner, iteration)))
g = g + geom_line()
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.7, 1))
g = ggplot(res_compl, aes(y = classif.acc.cummax, x = nr, color = tuner, group = paste0(tuner, iteration)))
g = g + geom_line(alpha = 0.5)
g = g + facet_wrap("task_id", scales = "free")
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_iter_all.png", g, height = 5, width = 7)

#tune curve for all iters averaged
g = ggplot(res_compl, aes(y = classif.auc.cummax, x = nr, color = tuner))
g = ggplot(res_compl, aes(y = classif.acc.cummax, x = nr, color = tuner))
g = g + stat_summary(geom = "line", fun = median)
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.925, 0.99))
g = g + facet_wrap("task_id", scales = "free")
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_median.png", g, height = 5, width = 7)

#tune curve for all iters averaged + individual
g = ggplot(res_compl, aes(y = classif.auc.cummax, x = nr, color = tuner))
g = ggplot(res_compl, aes(y = classif.acc.cummax, x = nr, color = tuner))
g = g + geom_line(alpha = 0.2, mapping = aes(group = paste0(tuner, iteration)))
g = g + stat_summary(geom = "line", fun = median)
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.9, 1))
g = g + facet_wrap("task_id", scales = "free")
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_iter_all_median.png", g, height = 5, width = 7)

# outer performance
res_outer = res$tune$score(measures = measure)
res_outer = res_outer[map_lgl(learner, function(x) x$model$tuning_instance$terminator$param_set$values$n_evals == 100) & task_id == "spam", ]
res_outer = res_outer[map_lgl(learner, function(x) x$model$tuning_instance$terminator$param_set$values$n_evals == EVAL_ITERS) & task_id %in% DATASET, ]
res_outer[, tuner := map_chr(learner, function(x) class(x$tuner)[[1]])]
res_outer[, tuner := stri_replace_first_fixed(tuner, "Tuner", "")]
res_baseline = res$baseline$score(measures = measure)[task_id == "spam", ]
res_baseline[, tuner := ifelse(stri_detect_fixed(learner_id, "default"), "Heuristic", "Untuned")]
res_combined = rbind(res_baseline, res_outer)
res_combined[, tuner:=factor(tuner, levels = tuner_names)]
res_combined = rbind(res_baseline[task_id %in% DATASET,], res_outer, fill = TRUE)
res_combined[, tuner:=factor(tuner, levels = tuners_select, labels = names(tuners_select))]
settings = list(
tuners = list(name = "tuners", tuners = unique(res_outer$tuner)),
untuned = list(name = "default", tuners = c(unique(res_outer$tuner), "Untuned")),
all = list(name = "all", tuners = unique(res_combined$tuner))
)
for (s in settings) {
g = ggplot(res_combined[tuner %in% s$tuners], aes(x = tuner, y = classif.auc, fill = tuner))
g = ggplot(res_combined[tuner %in% s$tuners,], aes(x = tuner, y = classif.acc, fill = tuner))
g = g + geom_boxplot()
g = g + scale_fill_manual(values = tuner_colors)
g = g + facet_grid(task_id~.)
g = g + labs(y = "AUC", x = NULL, fill = "Tuner", title = "SVM performance on outer test set")
g = g + labs(y = "ACC", x = NULL, fill = "Tuner", title = "SVM performance on outer test set")
g = g + coord_flip(ylim = s$ylim)
if (interactive()) {
print(g)
@@ -268,16 +279,21 @@ for (s in settings) {
}

#tune x space
#gs = lapply(unique(res_compl$task_id), function(this_task_id) {
this_task_id = "spam"
g = ggplot(res_compl[task_id == this_task_id & iteration == 1], aes(x = opt.x.cost, y = opt.x.gamma, size = classif.auc, color = classif.auc))
gs = lapply(unique(res_compl$task_id), function(this_task_id) {
#this_task_id = "spam"
g = ggplot(res_compl[task_id == this_task_id & iteration == 1], aes(x = opt.x.cost, y = opt.x.gamma, size = classif.acc, color = classif.acc))
g = g + geom_point()
g = g + facet_grid(task_id~tuner)
g = g + scale_radius() + scale_colour_viridis_c() + scale_y_log10() + scale_x_log10()
g = g + labs(color = "AUC", size = "AUC", x = "cost", y = "gamma")
g = g + labs(color = "ACC", size = "ACC", x = "cost", y = "gamma")
g + theme_bw()
#})
#g = marrangeGrob(gs, ncol = 2, nrow = 1, top = "Tuning eta and lambda for xgboost (nrounds = 100)")
})
if (length(DATASET) > 1) {
g = marrangeGrob(gs, ncol = 2, top = "Tuning cost and gamma for SVM (kernel = radial)")
} else {
g = gs[[1]]
}

if (interactive()) {
print(g)
}
Binary file modified w04_hpo_basics/images/benchmark_boxplot_all.png
Binary file modified w04_hpo_basics/images/benchmark_boxplot_default.png
Binary file modified w04_hpo_basics/images/benchmark_boxplot_tuners.png
Binary file modified w04_hpo_basics/images/benchmark_curve_iter_1.png
Binary file modified w04_hpo_basics/images/benchmark_curve_iter_all.png
Binary file modified w04_hpo_basics/images/benchmark_curve_iter_all_median.png
Binary file modified w04_hpo_basics/images/benchmark_curve_median.png
Binary file modified w04_hpo_basics/images/benchmark_scatter.png
Binary file added w04_hpo_basics/t04_practical.pdf
Binary file not shown.
62 changes: 36 additions & 26 deletions w04_hpo_basics/t04_practical.tex
@@ -15,30 +15,37 @@

\begin{itemize}
\item Knowledge about hyperparameters can help to guide the optimization
\item For example, it can be benificial to sample hyperparameters on a non-uniform scale.
\item For example, it can be beneficial to sample hyperparameters on a non-uniform scale.
\end{itemize}

\vspace{0.2cm}
Example: regularization hyperparameter ($C$ or \emph{cost}) of SVM: $[0.001, 1000.0]$
Example: regularization hyperparameter ($C$ or \emph{cost}) of SVM: $[2^{-5}, 2^{5}] \approx [0.03, 32]$

\begin{itemize}
\item The distance between $999.9$ and $1000.0$ should not be the same as between $0.1$ and $0.2$.
\item We might want to sample here from from a log-scale, e.g., $[10^{\conf_l}, 10^{\conf_u}]$ with $\conf_l = -3$ and $\conf_u = 3$.
\item The distance between $10$ and $20$ should be the same as between $0.1$ and $0.2$.
\item We might want to sample here from a log-scale, e.g., $[2^{\conf_l}, 2^{\conf_u}]$ with $\conf_l = -5$ and $\conf_u = 5$; a code sketch follows the figure below.
\end{itemize}

\begin{figure}[htb]
\centering

\begin{tikzpicture}[auto]%[scale=1.5]
\draw [->](-0.3,0)-- (12.3,0) coordinate;
\draw [->](-0.3,-2)-- (12.3,-2) coordinate;
\foreach \x/\xtext/\xxtext in {-3/-3/0.001, -2/-2/ , -1/-1/, 0/0/, 1/1/, 2/2/100, 3/3/1000} {
\draw [very thick] ({\x*2+6},-2pt) -- ++(0, 4pt) node[xshift = -6pt, yshift=-3pt,anchor=south west,baseline]{\strut$\xtext$};
\draw [very thick] ({10^(\x)*0.012},-2cm+2pt) -- ++(0,-4pt) node[anchor=north]{$\xxtext$};
\draw [->] ({\x*2+6},-2pt) .. controls ({\x*2+6},-0.5) and ({10^(\x)*0.012},-1.5) .. ({10^(\x)*0.012},-2cm+2pt);
% \foreach \x/\xtext/\xxtext in {-15/-15/0.000031, -10/-10/ , -5/-5/, 0/0/1, 5/5/32, 10/10/1024, 15/15/32768} {
\def\xM{5} % max exponent
\def\xW{12} % max width in cm
\foreach \x/\xtext/\xxtext in {-5/-5/.03, -2.5/-2.5/, 0/0/1, 2.5/2.5/5.66, 5/5/32} {
\def\xA{{(\x + \xM) * (\xW / (2 * \xM))}} %untrafoed val in cm (0 to 12)
\def\xB{{\xW * 2^(\x-\xM)}} %trafoed val in cm
\draw [very thick] (\xA,-2pt) -- ++(0, 4pt) node[xshift = -6pt, yshift=-3pt,anchor=south west,baseline]{\strut$\xtext$};
\draw [very thick] (\xB,-2cm+2pt) -- ++(0,-4pt) node[anchor=north]{\scriptsize $\xxtext$};
\draw [->] (\xA,-2pt) .. controls (\xA,-0.5) and (\xB,-1.5) .. (\xB,-2cm+2pt);
}
\node[] at (-0.7,-0.1) (t1) {$\conf$};
\node[] at (-0.7,-1.9) (t2) {$10^{\conf}$};
\node[] at (-0.7,-1.9) (t2) {$2^{\conf}$};
\end{tikzpicture}

\end{figure}
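For reference, the search space in \texttt{benchmark.R} of this commit realizes this transformation. A minimal sketch, assuming the \texttt{paradox} API used there (the trafo body is collapsed in the diff, so the $2^x$ mapping is inferred from the slide ranges):

\begin{verbatim}
library(paradox)
ps = ParamSet$new(params = list(
  ParamDbl$new("cost", lower = -5, upper = 5),  # bounds on the exponent scale
  ParamDbl$new("gamma", lower = -5, upper = 5)
))
ps$trafo = function(x, param_set) {
  x$cost = 2^x$cost    # sample uniformly in the exponent ...
  x$gamma = 2^x$gamma  # ... and evaluate the learner at 2^x
  x
}
\end{verbatim}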

\framebreak
@@ -49,12 +56,12 @@
\item If $\finconf$ is close to the border of $\pcs$, the ranges should be increased (or a different scale should be selected); a small check for this is sketched below.
\item Meta-Learning can help to decide which hyperparameters should be tuned in which ranges.
\end{itemize}
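A minimal sketch of such a border check (the helper name is illustrative, not part of the lecture code):

\begin{verbatim}
# flag tuned values that land within tol * range of a box boundary
near_border = function(value, lower, upper, tol = 0.05) {
  span = upper - lower
  value < lower + tol * span | value > upper - tol * span
}
near_border(14.2, lower = -15, upper = 15)  # TRUE: consider widening the range
\end{verbatim}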
\vspace{0.5cm}
Example:
\begin{itemize}
\item The ranges for $\text{cost} \in [10^{-3}, 10^3]$ and $\gamma \in [10^{-3}, 10^3]$ are rather small.
\item More common would be ranges like $\text{cost} \in [2^{-15}, 2^{15}]$ and $\gamma \in [2^{-15}, 2^{15}]$.
\end{itemize}
% \vspace{0.5cm}
% Example:
% \begin{itemize}
% \item The ranges for $\text{cost} \in [10^{-3}, 10^3]$ and $\gamma \in [10^{-3}, 10^3]$ are rather small.
% \item More common would be ranges like $\text{cost} \in [2^{-15}, 2^{15}]$ and $\gamma \in [2^{-15}, 2^{15}]$.
% \end{itemize}

\end{frame}

@@ -67,18 +74,20 @@

\begin{itemize}
\item The learning algorithm is a support vector machine (SVM) with RBF kernel.
\item The hyperparmeters we optimize are
\item The hyperparameters we optimize are
\begin{itemize}
\item Cost parameter $\text{cost} \in [10^{-3}, 10^3]$.
\item Kernel parameter $\gamma \in [10^{-3}, 10^3]$.
\item Cost parameter $\text{cost} \in [2^{-15}, 2^{15}]$.
\item Kernel parameter $\gamma \in [2^{-15}, 2^{15}]$.
\end{itemize}
\item We compare three different optimizer
\item We compare four different optimizers
\begin{itemize}
\item Random search
\item Grid search
\item \textit{CMAES} - an evolutionary algorithm, that generates offsprings from a multivariate normal distribution (We didn't cover this algorithm).
\item A $(\mu+\lambda)$-selection evolutionary algorithm with $\mu = 1$, $\lambda = 1$ and Gauss mutation with $\sigma = 1$.
\item \textit{CMAES} - an evolutionary algorithm that generates offspring from a multivariate normal distribution (we did not cover this algorithm).
\end{itemize}
\item We use a 5-fold cross-validation and optimize the area under the ROC curve (AUC).
\item We use 5-fold cross-validation and optimize the classification accuracy (ACC).
\item All methods are allowed a budget of $100$ evaluations.
\end{itemize}
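Condensed, this setup corresponds to the following sketch, mirroring \texttt{benchmark.R} in this commit (the 2020 \texttt{mlr3tuning} API is assumed; the exact learner arguments are abbreviated in the diff):

\begin{verbatim}
library(mlr3)
library(mlr3learners)
library(mlr3tuning)

learner = lrn("classif.svm", predict_type = "prob",
  type = "C-classification", kernel = "radial")
at = AutoTuner$new(
  learner = learner,
  resampling = rsmp("cv", folds = 5),         # inner 5-fold CV
  measures = msr("classif.acc"),              # optimize accuracy
  terminator = term("evals", n_evals = 100),  # budget of 100 evaluations
  search_space = ps,                          # cost and gamma on the 2^x scale
  tuner = tnr("random_search")                # or grid_search, cmaes, ...
)
\end{verbatim}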

\end{frame}
@@ -101,7 +110,8 @@

\begin{itemize}
\item Both \emph{Grid search} and \emph{random search} have many evaluations in regions with bad performance ($\gamma>1$).
\item \emph{CMAES} only explores a small region.
\item \emph{CMAES} only explores a small region.
\item \emph{(1+1)-EA} does not converge.
\end{itemize}
\end{column}%
\begin{column}{0.5\textwidth}
@@ -171,7 +181,7 @@
\begin{column}{0.4\textwidth}
\footnotesize

The box plots show the distribution of the AUC-values that were measured on the \emph{outer test set} with a 10-fold CV.
The box plots show the distribution of the ACC-values that were measured on the \emph{outer test set} with a 10-fold CV.

Note:

Expand Down Expand Up @@ -213,13 +223,13 @@
\begin{itemize}
\item Static defaults of hyperparameters, e.g., $\conf = (\text{cost},\gamma) = (1,1)$, are rarely a good choice.
\item A simple extension is to compute defaults based on some simple dataset characteristics.
\item The best know example is the formular for the size of the random subset of features to consider as a split in a random forest: $\sqrt{p}$, where $p$ is the number of features.
\item The best-known example is the formula for the size of the random subset of features to consider at each split in a random forest: $\sqrt{p}$, where $p$ is the number of features.
\item For the RBF-SVM a data dependent default for the $\gamma$ parameter can be computed by
\begin{itemize}
\item The pairwise distances $\|\x - \tilde \x \|$ between points of a random subset containing $50\%$ of the data points are calculated.
\item The estimate is based upon the $0.1$ and $0.9$ quantiles of these distances.
\item Basically any value between those two bounds will produce good results.
\item Take the mean of the $0.1$ and $0.9$ quantile of these distances as an estimate for $\sigma$ and compute $\gamma = \tfrac{1}{2\sigma^2}$.
\item Take the mean of the $0.1$ and $0.9$ quantiles of these distances as an estimate for $\gamma$ (see the sketch below).
\end{itemize}
\item These simple defaults can work well and don't require expensive tuning procedures.
\end{itemize}
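A small sketch of this heuristic (the function name is illustrative; \texttt{kernlab::sigest} implements a closely related estimator):

\begin{verbatim}
estimate_gamma = function(X) {
  idx = sample(nrow(X), size = floor(0.5 * nrow(X)))  # random 50% subset
  d = as.numeric(dist(X[idx, , drop = FALSE]))        # pairwise distances
  q = quantile(d, probs = c(0.1, 0.9))                # lower and upper bound
  mean(q)  # any value between the bounds works; the slides take the mean
}
\end{verbatim}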
@@ -233,7 +243,7 @@
\footnotesize

\begin{itemize}
\item With the previously discussed data dependet default of $\gamma$ and $\text{cost} = 1$.
\item With the previously discussed data-dependent default of $\gamma$ and $\text{cost} = 1$.
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
