Merge pull request #9 from automl/w04_hop100
t04 chapter of w04_hpo lecture
jakob-r authored May 11, 2020
2 parents 0547be4 + 2d5dc38 commit 895ffc2
Showing 11 changed files with 98 additions and 72 deletions.
108 changes: 62 additions & 46 deletions w04_hpo_basics/code/benchmark.R
@@ -35,6 +35,7 @@ learner = lrn("classif.svm", predict_type = "prob", cost = 1, gamma = 1, type =
measure = msr("classif.acc")

n_evalss = c(25,100)

#define n tuners
tuner_terms = lapply(n_evalss, function(n_evals) {
tuners = tnrs(c("grid_search", "random_search"))
@@ -45,22 +46,22 @@ tuner_terms = lapply(n_evalss, function(n_evals) {

source(file = fs::path_join(c(mlr3tuningdir, "attic", "TunerECRSimpleEA.R")))
tuners$TunerECRSimpleEA = TunerECRSimpleEA$new()
tuners$TunerECRSimpleEA$param_set$values$sdev = 0.5

lapply(tuners, function(x) list(tuner = x, term = term("evals", n_evals = n_evals)))
})
tuner_terms = unlist(tuner_terms, recursive = FALSE)
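# tuner_terms is now a flat list with one (tuner, terminator) pair per
# tuner x budget combination (e.g. random_search with 25 and with 100 evals)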

#define m tasks
tasks = tsks(c("spam", "sonar")) #, "german_credit"))
tasks = c(tasks,
lapply(c(optdigits = 28, ionosphere = 59), function(id) {
tsk("oml", data_id = id)
}))
# tasks = tsks(c("spam", "sonar")) #, "german_credit"))
# tasks = c(tasks,
# lapply(c(optdigits = 28, ionosphere = 59), function(id) {
# tsk("oml", data_id = id)
# }))
tasks = tsks(c("spam"))

ps = ParamSet$new(params = list(
ParamDbl$new("cost", lower = -5, upper = 5),
ParamDbl$new("gamma", lower = -5, upper = 5)
ParamDbl$new("cost", lower = -15, upper = 15),
ParamDbl$new("gamma", lower = -15, upper = 15)
))
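# bounds are on the exponent scale; ps$trafo below maps a sampled x to 2^x
# (inferred from the [2^-15, 2^15] ranges stated in t04_practical.tex)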

ps$trafo = function(x, param_set) {
@@ -74,6 +75,8 @@ rsmp_outer = rsmp("cv", folds = 10)
#rsmp_outer = rsmp("cv", folds = 2)

learners = Map(function(ps, tuner_terms) {
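# encapsulate train/predict via the "evaluate" package so a failing SVM fit
# is caught instead of aborting the whole benchmark; the featureless learner
# then provides fallback predictions for failed resampling iterations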
learner$encapsulate = c(train = "evaluate", predict = "evaluate")
learner$fallback = lrn("classif.featureless")
learner = AutoTuner$new(learner = learner, resampling = rsmp_tuning, measures = measure, terminator = tuner_terms$term, search_space = ps, tuner = tuner_terms$tuner)
learner$id = paste0(learner$id, ".", tuner_terms[[1]]$term$param_set$values$n_evals)
return(learner)
@@ -131,6 +134,7 @@ if (!fs::file_exists("benchmark_res.rds")) {
batchMap(function(design, ...) {
#makes inner resampling folds the same if the outer resampling is the same?
set.seed(as.integer(substr(stri_replace_all_regex(design$resampling[[1]]$hash, "[a-z]", ""),0,9)))
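# %??% (mlr3misc) is a null-coalescing operator: use the configured fold
# count if present, otherwise fall back to 10 parallel cores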
options(mc.cores = design$resampling[[1]]$param_set$values$folds %??% 10)
future::plan(multiprocess)
res = benchmark(design = design, ...)
for (i in seq_row(res$data)) {
@@ -142,7 +146,7 @@ if (!fs::file_exists("benchmark_res.rds")) {
}, store_models = TRUE, design = split(design, seq_row(design)))

#testJob(1)
submitJobs(resources = list(ncpus = rsmp_outer$param_set$values$folds %??% 10))
submitJobs(resources = list(ncpus = rsmp_outer$param_set$values$folds %??% 10, memory = 16000))
waitForJobs()
res = reduceResultsList(findDone())
res_tune = res[[1]]
@@ -163,7 +167,8 @@ if (!fs::file_exists("benchmark_res.rds")) {

res = readRDS("benchmark_res.rds")

baseline_res = res$baseline$aggregate(measures = measure)
xx = res$baseline$aggregate(measures = measure)
res_baseline = map_dtr(xx$resample_result, function(x) x$score(measure))


#build dt for plotting
@@ -178,88 +183,94 @@ res_compl = unnest(res_compl, "opt_x", prefix = "opt.x.")
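# grid search evaluates configurations in an arbitrary deterministic order,
# so shuffle its evaluation order before computing best-so-far curves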
res_compl[tuner == "TunerGridSearch", nr := sample(x = nr), by = .(task_id, learner_id, uhash, iteration, budget)]
setkey(res_compl, task_id, learner_id, uhash, iteration, budget, nr)

res_compl[, classif.auc.cummax := cummax(classif.auc), by = .(task_id, learner_id, tuner, uhash, iteration, budget)]
res_compl[, classif.acc.cummax := cummax(classif.acc), by = .(task_id, learner_id, tuner, uhash, iteration, budget)]
res_compl[, tuner := stri_replace_first_fixed(tuner, "Tuner", "")]
res_compl = res_compl[nr <= budget,]

res_outer = res$tune$score(measures = measure)

#reduce size
rm(res)
#gc()


theme_set(theme_bw())

tuner_names = c("GridSearch", "RandomSearch", "CMAES", "Untuned", "Heuristic")
tuner_colors = set_names(RColorBrewer::brewer.pal(7, "Set1"), tuner_names)
EVAL_ITERS = 100
DATASET = "spam"
tuners_select = c("GridSearch", "RandomSearch", "CMAES", "(1+1)-EA" = "ECRSimpleEA", "Untuned", "Heuristic")
names(tuners_select) = ifelse(nzchar(names(tuners_select)), names(tuners_select), tuners_select)
tuner_colors = set_names(RColorBrewer::brewer.pal(length(tuners_select), "Set1"), names(tuners_select))

res_compl[, tuner := factor(tuner, levels = tuner_names)]
res_compl = res_compl[budget == 100 & task_id == "spam",]
# reduce data
res_compl[, tuner := factor(tuner, levels = tuners_select, labels = names(tuners_select))]
res_compl = res_compl[budget == EVAL_ITERS & task_id %in% DATASET,]

#tune curve for iter = 1
g = ggplot(res_compl[iteration == 1,], aes(y = classif.auc.cummax, x = nr, color = tuner))
g = ggplot(res_compl[iteration == 1,], aes(y = classif.acc.cummax, x = nr, color = tuner))
g = g + geom_line()
g = g + geom_point(aes(y = classif.auc), alpha = 0.1)
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.7, 1))
g = g + geom_point(aes(y = classif.acc), alpha = 0.1)
g = g + facet_wrap("task_id", scales = "free")
#g = g + coord_cartesian(ylim = c(0.7, 1))
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_iter_1.png", g, height = 5, width = 7)

#tune curve for all iters
g = ggplot(res_compl, aes(y = classif.auc.cummax, x = nr, color = tuner, group = paste0(tuner, iteration)))
g = g + geom_line()
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.7, 1))
g = ggplot(res_compl, aes(y = classif.acc.cummax, x = nr, color = tuner, group = paste0(tuner, iteration)))
g = g + geom_line(alpha = 0.5)
g = g + facet_wrap("task_id", scales = "free")
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_iter_all.png", g, height = 5, width = 7)

#tune curve for all iters averaged
g = ggplot(res_compl, aes(y = classif.auc.cummax, x = nr, color = tuner))
g = ggplot(res_compl, aes(y = classif.acc.cummax, x = nr, color = tuner))
g = g + stat_summary(geom = "line", fun = median)
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.925, 0.99))
g = g + facet_wrap("task_id", scales = "free")
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_median.png", g, height = 5, width = 7)

#tune curve for all iters averaged + individual
g = ggplot(res_compl, aes(y = classif.auc.cummax, x = nr, color = tuner))
g = ggplot(res_compl, aes(y = classif.acc.cummax, x = nr, color = tuner))
g = g + geom_line(alpha = 0.2, mapping = aes(group = paste0(tuner, iteration)))
g = g + stat_summary(geom = "line", fun = median)
g = g + facet_wrap("task_id")
g = g + coord_cartesian(ylim = c(0.9, 1))
g = g + facet_wrap("task_id", scales = "free")
g = g + scale_color_manual(values = tuner_colors)
g = g + labs(y = "AUC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
g = g + labs(y = "ACC", x = "eval", title = "Tuning cost and gamma for SVM (kernel = radial)", fill = "Tuner")
if (interactive()) {
print(g)
}
ggsave("../images/benchmark_curve_iter_all_median.png", g, height = 5, width = 7)

# outer performance
res_outer = res$tune$score(measures = measure)
res_outer = res_outer[map_lgl(learner, function(x) x$model$tuning_instance$terminator$param_set$values$n_evals == 100) & task_id == "spam", ]
res_outer = res_outer[map_lgl(learner, function(x) x$model$tuning_instance$terminator$param_set$values$n_evals == EVAL_ITERS) & task_id %in% DATASET, ]
res_outer[, tuner := map_chr(learner, function(x) class(x$tuner)[[1]])]
res_outer[, tuner := stri_replace_first_fixed(tuner, "Tuner", "")]
res_baseline = res$baseline$score(measures = measure)[task_id == "spam", ]
res_baseline[, tuner := ifelse(stri_detect_fixed(learner_id, "default"), "Heuristic", "Untuned")]
res_combined = rbind(res_baseline, res_outer)
res_combined[, tuner:=factor(tuner, levels = tuner_names)]
res_combined = rbind(res_baseline[task_id %in% DATASET,], res_outer, fill = TRUE)
res_combined[, tuner:=factor(tuner, levels = tuners_select, labels = names(tuners_select))]
settings = list(
tuners = list(name = "tuners", tuners = unique(res_outer$tuner)),
untuned = list(name = "default", tuners = c(unique(res_outer$tuner), "Untuned")),
all = list(name = "all", tuners = unique(res_combined$tuner))
)
for (s in settings) {
g = ggplot(res_combined[tuner %in% s$tuners], aes(x = tuner, y = classif.auc, fill = tuner))
g = ggplot(res_combined[tuner %in% s$tuners,], aes(x = tuner, y = classif.acc, fill = tuner))
g = g + geom_boxplot()
g = g + scale_fill_manual(values = tuner_colors)
g = g + facet_grid(task_id~.)
g = g + labs(y = "AUC", x = NULL, fill = "Tuner", title = "SVM performance on outer test set")
g = g + labs(y = "ACC", x = NULL, fill = "Tuner", title = "SVM performance on outer test set")
g = g + coord_flip(ylim = s$ylim)
if (interactive()) {
print(g)
@@ -268,16 +279,21 @@ for (s in settings) {
}

#tune x space
#gs = lapply(unique(res_compl$task_id), function(this_task_id) {
this_task_id = "spam"
g = ggplot(res_compl[task_id == this_task_id & iteration == 1], aes(x = opt.x.cost, y = opt.x.gamma, size = classif.auc, color = classif.auc))
gs = lapply(unique(res_compl$task_id), function(this_task_id) {
#this_task_id = "spam"
g = ggplot(res_compl[task_id == this_task_id & iteration == 1], aes(x = opt.x.cost, y = opt.x.gamma, size = classif.acc, color = classif.acc))
g = g + geom_point()
g = g + facet_grid(task_id~tuner)
g = g + scale_radius() + scale_colour_viridis_c() + scale_y_log10() + scale_x_log10()
g = g + labs(color = "AUC", size = "AUC", x = "cost", y = "gamma")
g = g + labs(color = "ACC", size = "ACC", x = "cost", y = "gamma")
g + theme_bw()
#})
#g = marrangeGrob(gs, ncol = 2, nrow = 1, top = "Tuning eta and lambda for xgboost (nrounds = 100)")
})
if (length(DATASET) > 1) {
g = marrangeGrob(gs, ncol = 2, top = "Tuning cost and gamma for SVM (kernel = radial)")
} else {
g = gs[[1]]
}

if (interactive()) {
print(g)
}
Binary file modified w04_hpo_basics/images/benchmark_boxplot_all.png
Binary file modified w04_hpo_basics/images/benchmark_boxplot_default.png
Binary file modified w04_hpo_basics/images/benchmark_boxplot_tuners.png
Binary file modified w04_hpo_basics/images/benchmark_curve_iter_1.png
Binary file modified w04_hpo_basics/images/benchmark_curve_iter_all.png
Binary file modified w04_hpo_basics/images/benchmark_curve_iter_all_median.png
Binary file modified w04_hpo_basics/images/benchmark_curve_median.png
Binary file modified w04_hpo_basics/images/benchmark_scatter.png
Binary file added w04_hpo_basics/t04_practical.pdf
Binary file not shown.
62 changes: 36 additions & 26 deletions w04_hpo_basics/t04_practical.tex
@@ -15,30 +15,37 @@

\begin{itemize}
\item Knowledge about hyperparameters can help to guide the optimization
\item For example, it can be benificial to sample hyperparameters on a non-uniform scale.
\item For example, it can be beneficial to sample hyperparameters on a non-uniform scale.
\end{itemize}

\vspace{0.2cm}
Example: regularization hyperparameter ($C$ or \emph{cost}) of SVM: $[0.001, 1000.0]$
Example: regularization hyperparameter ($C$ or \emph{cost}) of SVM: $[2^{-5}, 2^{5}] \approx [0.03, 32]$

\begin{itemize}
\item The distance between $999.9$ and $1000.0$ should not be the same as between $0.1$ and $0.2$.
\item We might want to sample here from from a log-scale, e.g., $[10^{\conf_l}, 10^{\conf_u}]$ with $\conf_l = -3$ and $\conf_u = 3$.
\item The distance between $10$ and $20$ should be the same as between $0.1$ and $0.2$.
\item We might want to sample here from a log-scale, e.g., $[2^{\conf_l}, 2^{\conf_u}]$ with $\conf_l = -5$ and $\conf_u = 5$; a code sketch follows the figure below.
\end{itemize}

\begin{figure}[htb]
\centering

\begin{tikzpicture}[auto]%[scale=1.5]
\draw [->](-0.3,0)-- (12.3,0) coordinate;
\draw [->](-0.3,-2)-- (12.3,-2) coordinate;
\foreach \x/\xtext/\xxtext in {-3/-3/0.001, -2/-2/ , -1/-1/, 0/0/, 1/1/, 2/2/100, 3/3/1000} {
\draw [very thick] ({\x*2+6},-2pt) -- ++(0, 4pt) node[xshift = -6pt, yshift=-3pt,anchor=south west,baseline]{\strut$\xtext$};
\draw [very thick] ({10^(\x)*0.012},-2cm+2pt) -- ++(0,-4pt) node[anchor=north]{$\xxtext$};
\draw [->] ({\x*2+6},-2pt) .. controls ({\x*2+6},-0.5) and ({10^(\x)*0.012},-1.5) .. ({10^(\x)*0.012},-2cm+2pt);
% \foreach \x/\xtext/\xxtext in {-15/-15/0.000031, -10/-10/ , -5/-5/, 0/0/1, 5/5/32, 10/10/1024, 15/15/32768} {
\def\xM{5} % max exponent
\def\xW{12} % max width in cm
\foreach \x/\xtext/\xxtext in {-5/-5/.03, -2.5/-2.5/, 0/0/1, 2.5/2.5/5.66, 5/5/32} {
\def\xA{{(\x + \xM) * (\xW / (2 * \xM))}} %untrafoed val in cm (0 to 12)
\def\xB{{\xW * 2^(\x-\xM)}} %trafoed val in cm
\draw [very thick] (\xA,-2pt) -- ++(0, 4pt) node[xshift = -6pt, yshift=-3pt,anchor=south west,baseline]{\strut$\xtext$};
\draw [very thick] (\xB,-2cm+2pt) -- ++(0,-4pt) node[anchor=north]{\scriptsize $\xxtext$};
\draw [->] (\xA,-2pt) .. controls (\xA,-0.5) and (\xB,-1.5) .. (\xB,-2cm+2pt);
}
\node[] at (-0.7,-0.1) (t1) {$\conf$};
\node[] at (-0.7,-1.9) (t2) {$10^{\conf}$};
\node[] at (-0.7,-1.9) (t2) {$2^{\conf}$};
\end{tikzpicture}

\end{figure}
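For reference, the search space in \texttt{benchmark.R} of this commit realizes this transformation. A minimal sketch, assuming the \texttt{paradox} API used there (the trafo body is collapsed in the diff, so the $2^x$ mapping is inferred from the slide ranges):

\begin{verbatim}
library(paradox)
ps = ParamSet$new(params = list(
  ParamDbl$new("cost", lower = -5, upper = 5),  # bounds on the exponent scale
  ParamDbl$new("gamma", lower = -5, upper = 5)
))
ps$trafo = function(x, param_set) {
  x$cost = 2^x$cost    # sample uniformly in the exponent ...
  x$gamma = 2^x$gamma  # ... and evaluate the learner at 2^x
  x
}
\end{verbatim}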

\framebreak
@@ -49,12 +56,12 @@
\item If $\finconf$ is close to the border of $\pcs$, the ranges should be increased (or a different scale should be selected); a small check for this is sketched below.
\item Meta-Learning can help to decide which hyperparameters should be tuned in which ranges.
\end{itemize}
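A minimal sketch of such a border check (the helper name is illustrative, not part of the lecture code):

\begin{verbatim}
# flag tuned values that land within tol * range of a box boundary
near_border = function(value, lower, upper, tol = 0.05) {
  span = upper - lower
  value < lower + tol * span | value > upper - tol * span
}
near_border(14.2, lower = -15, upper = 15)  # TRUE: consider widening the range
\end{verbatim}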
\vspace{0.5cm}
Example:
\begin{itemize}
\item The ranges for $\text{cost} \in [10^{-3}, 10^3]$ and $\gamma \in [10^{-3}, 10^3]$ are rather small.
\item More common would be ranges like $\text{cost} \in [2^{-15}, 2^{15}]$ and $\gamma \in [2^{-15}, 2^{15}]$.
\end{itemize}
% \vspace{0.5cm}
% Example:
% \begin{itemize}
% \item The ranges for $\text{cost} \in [10^{-3}, 10^3]$ and $\gamma \in [10^{-3}, 10^3]$ are rather small.
% \item More common would be ranges like $\text{cost} \in [2^{-15}, 2^{15}]$ and $\gamma \in [2^{-15}, 2^{15}]$.
% \end{itemize}

\end{frame}

@@ -67,18 +74,20 @@

\begin{itemize}
\item The learning algorithm is a support vector machine (SVM) with RBF kernel.
\item The hyperparmeters we optimize are
\item The hyperparameters we optimize are
\begin{itemize}
\item Cost parameter $\text{cost} \in [10^{-3}, 10^3]$.
\item Kernel parameter $\gamma \in [10^{-3}, 10^3]$.
\item Cost parameter $\text{cost} \in [2^{-15}, 2^{15}]$.
\item Kernel parameter $\gamma \in [2^{-15}, 2^{15}]$.
\end{itemize}
\item We compare three different optimizer
\item We compare four different optimizers
\begin{itemize}
\item Random search
\item Grid search
\item \textit{CMAES} - an evolutionary algorithm, that generates offsprings from a multivariate normal distribution (We didn't cover this algorithm).
\item A $(\mu+\lambda)$-selection evolutionary algorithm with $\mu = 1$, $\lambda = 1$ and Gauss mutation with $\sigma = 1$.
\item \textit{CMAES} - an evolutionary algorithm that generates offspring from a multivariate normal distribution (we did not cover this algorithm).
\end{itemize}
\item We use a 5-fold cross-validation and optimize the area under the ROC curve (AUC).
\item We use 5-fold cross-validation and optimize the classification accuracy (ACC).
\item All methods are allowed a budget of $100$ evaluations.
\end{itemize}
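Condensed, this setup corresponds to the following sketch, mirroring \texttt{benchmark.R} in this commit (the 2020 \texttt{mlr3tuning} API is assumed; the exact learner arguments are abbreviated in the diff):

\begin{verbatim}
library(mlr3)
library(mlr3learners)
library(mlr3tuning)

learner = lrn("classif.svm", predict_type = "prob",
  type = "C-classification", kernel = "radial")
at = AutoTuner$new(
  learner = learner,
  resampling = rsmp("cv", folds = 5),         # inner 5-fold CV
  measures = msr("classif.acc"),              # optimize accuracy
  terminator = term("evals", n_evals = 100),  # budget of 100 evaluations
  search_space = ps,                          # cost and gamma on the 2^x scale
  tuner = tnr("random_search")                # or grid_search, cmaes, ...
)
\end{verbatim}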

\end{frame}
@@ -101,7 +110,8 @@

\begin{itemize}
\item Both \emph{Grid search} and \emph{random search} have many evaluations in regions with bad performance ($\gamma>1$).
\item \emph{CMAES} only explores a small region.
\item \emph{CMAES} only explores a small region.
\item \emph{(1+1)-EA} does not converge.
\end{itemize}
\end{column}%
\begin{column}{0.5\textwidth}
@@ -171,7 +181,7 @@
\begin{column}{0.4\textwidth}
\footnotesize

The box plots show the distribution of the AUC-values that were measured on the \emph{outer test set} with a 10-fold CV.
The box plots show the distribution of the ACC-values that were measured on the \emph{outer test set} with a 10-fold CV.

Note:

Expand Down Expand Up @@ -213,13 +223,13 @@
\begin{itemize}
\item Static defaults of hyperparameters, e.g., $\conf = (\text{cost},\gamma) = (1,1)$, are rarely a good choice.
\item A simple extension is to compute defaults based on some simple dataset characteristics.
\item The best know example is the formular for the size of the random subset of features to consider as a split in a random forest: $\sqrt{p}$, where $p$ is the number of features.
\item The best-known example is the formula for the size of the random subset of features to consider at each split in a random forest: $\sqrt{p}$, where $p$ is the number of features.
\item For the RBF-SVM a data dependent default for the $\gamma$ parameter can be computed by
\begin{itemize}
\item The pairwise distances $\|\x - \tilde \x \|$ between points of a random subset containing $50\%$ of the data points are calculated.
\item The estimate is based upon the $0.1$ and $0.9$ quantiles of these distances.
\item Basically any value between those two bounds will produce good results.
\item Take the mean of the $0.1$ and $0.9$ quantile of these distances as an estimate for $\sigma$ and compute $\gamma = \tfrac{1}{2\sigma^2}$.
\item Take the mean of the $0.1$ and $0.9$ quantiles of these distances as an estimate for $\gamma$ (see the sketch below).
\end{itemize}
\item These simple defaults can work well and don't require expensive tuning procedures.
\end{itemize}
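A small sketch of this heuristic (the function name is illustrative; \texttt{kernlab::sigest} implements a closely related estimator):

\begin{verbatim}
estimate_gamma = function(X) {
  idx = sample(nrow(X), size = floor(0.5 * nrow(X)))  # random 50% subset
  d = as.numeric(dist(X[idx, , drop = FALSE]))        # pairwise distances
  q = quantile(d, probs = c(0.1, 0.9))                # lower and upper bound
  mean(q)  # any value between the bounds works; the slides take the mean
}
\end{verbatim}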
@@ -233,7 +243,7 @@
\footnotesize

\begin{itemize}
\item With the previously discussed data dependet default of $\gamma$ and $\text{cost} = 1$.
\item With the previously discussed data-dependent default of $\gamma$ and $\text{cost} = 1$.
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
