diff --git a/README.md b/README.md index 13fa779..97e99a5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ # offline-parameter-tuning Code for the offline parameter tuning paper (submitted to IDA 2020) + +To replicate the plots, see demo_lif_bandit.R and demo_tbl_bandit.R diff --git a/bandit_continuum_function_bimodal.R b/bandit_continuum_function_bimodal.R new file mode 100644 index 0000000..4e059d2 --- /dev/null +++ b/bandit_continuum_function_bimodal.R @@ -0,0 +1,133 @@ +#' @export +library("truncnorm") +ContinuumBanditBimodal <- R6::R6Class( + inherit = Bandit, + class = FALSE, + public = list( + arm_function = NULL, + mu1 = NULL, + sd1 = NULL, + mu2 = NULL, + sd2 = NULL, + class_name = "ContinuumBanditBimodal", + initialize = function() { + self$arm_function <- function(x, mu1, sd1, mu2, sd2) { + y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1) + y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2) + return(y1 + y2 + rnorm(length(x), 0, 0.01)) + } + super$initialize() + self$d <- 1 + self$k <- 1 + }, + post_initialization = function(){ + self$mu1 <- runif(1, 0.15, 0.35) + self$sd1 <- runif(1, 0.1, 0.2) + self$mu2 <- runif(1, 0.65, 0.85) + self$sd2 <- runif(1, 0.1, 0.2) + }, + get_context = function(t) { + context <- list() + context$k <- self$k + context$d <- self$d + context + }, + get_reward = function(t, context, action) { + reward <- list( + reward = self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2), + optimal_reward = self$mu2 + ) + } + ) +) + +#' Bandit: ContinuumBandit +#' +#' A function-based continuum multi-armed bandit +#' where arms are chosen from a subset of the real line and the mean rewards +#' are assumed to be a continuous function of the arms. +#' +#' @section Usage: +#' \preformatted{ +#' bandit <- ContinuumBandit$new(FUN) +#' } +#' +#' @name ContinuumBandit +#' +#' +#' @section Arguments: +#' \describe{ +#' \item{FUN}{continuous function.} +#' } +#' +#' @section Methods: +#' +#' \describe{ +#' +#' \item{\code{new(FUN)}}{ generates and instantiates a new \code{ContinuumBandit} instance. } +#' +#' \item{\code{get_context(t)}}{ +#' argument: +#' \itemize{ +#' \item \code{t}: integer, time step \code{t}. +#' } +#' returns a named \code{list} +#' containing the current \code{d x k} dimensional matrix \code{context$X}, +#' the number of arms \code{context$k} and the number of features \code{context$d}. +#' } +#' +#' \item{\code{get_reward(t, context, action)}}{ +#' arguments: +#' \itemize{ +#' \item \code{t}: integer, time step \code{t}. +#' \item \code{context}: list, containing the current \code{context$X} (d x k context matrix), +#' \code{context$k} (number of arms) and \code{context$d} (number of context features) +#' (as set by \code{bandit}). +#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}). +#' } +#' returns a named \code{list} containing \code{reward$reward} and, where computable, +#' \code{reward$optimal} (used by "oracle" policies and to calculate regret).
+#' } +# +#' } +#' +#' @seealso +#' +#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}}, +#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}} +#' +#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}}, +#' \code{\link{OfflineReplayEvaluatorBandit}} +#' +#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}} +#' +#' @examples +#' \dontrun{ +#' +#' horizon <- 1500 +#' simulations <- 100 +#' +#' continuous_arms <- function(x) { +#' -0.1*(x - 5) ^ 2 + 3.5 + rnorm(length(x),0,0.4) +#' } +#' +#' int_time <- 100 +#' amplitude <- 0.2 +#' learn_rate <- 0.3 +#' omega <- 2*pi/int_time +#' x0_start <- 2.0 +#' +#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start) +#' +#' bandit <- ContinuumBandit$new(FUN = continuous_arms) +#' +#' agent <- Agent$new(policy,bandit) +#' +#' history <- Simulator$new( agents = agent, +#' horizon = horizon, +#' simulations = simulations, +#' save_theta = TRUE )$run() +#' +#' plot(history, type = "average", regret = FALSE) +#' } +NULL diff --git a/bandit_continuum_function_unimodal.R b/bandit_continuum_function_unimodal.R new file mode 100644 index 0000000..da399ae --- /dev/null +++ b/bandit_continuum_function_unimodal.R @@ -0,0 +1,126 @@ +#' @export +ContinuumBanditUnimodal <- R6::R6Class( + inherit = Bandit, + class = FALSE, + public = list( + arm_function = NULL, + c1 = NULL, + c2 = NULL, + class_name = "ContinuumBanditUnimodal", + initialize = function() { + self$c2 <- 1 + self$arm_function <- function(x, c1 = 0.25, c2 = 0.75) { + -(x - c1) ^ 2 + c2 + rnorm(length(x), 0, 0.01) + } + super$initialize() + self$d <- 1 + self$k <- 1 + }, + post_initialization = function(){ + self$c1 <- runif(1,0.25,0.75) + }, + get_context = function(t) { + context <- list() + context$k <- self$k + context$d <- self$d + context + }, + get_reward = function(t, context, action) { + reward <- list( + reward = self$arm_function(action$choice, self$c1, self$c2), + optimal_reward = self$c2 + ) + } + ) +) + +#' Bandit: ContinuumBandit +#' +#' A function based continuum multi-armed bandit +#' where arms are chosen from a subset of the real line and the mean rewards +#' are assumed to be a continuous function of the arms. +#' +#' @section Usage: +#' \preformatted{ +#' bandit <- ContinuumBandit$new(FUN) +#' } +#' +#' @name ContinuumBandit +#' +#' +#' @section Arguments: +#' \describe{ +#' \item{FUN}{continuous function.} +#' } +#' +#' @section Methods: +#' +#' \describe{ +#' +#' \item{\code{new(FUN)}}{ generates and instantializes a new \code{ContinuumBandit} instance. } +#' +#' \item{\code{get_context(t)}}{ +#' argument: +#' \itemize{ +#' \item \code{t}: integer, time step \code{t}. +#' } +#' returns a named \code{list} +#' containing the current \code{d x k} dimensional matrix \code{context$X}, +#' the number of arms \code{context$k} and the number of features \code{context$d}. +#' } +#' +#' \item{\code{get_reward(t, context, action)}}{ +#' arguments: +#' \itemize{ +#' \item \code{t}: integer, time step \code{t}. +#' \item \code{context}: list, containing the current \code{context$X} (d x k context matrix), +#' \code{context$k} (number of arms) and \code{context$d} (number of context features) +#' (as set by \code{bandit}). +#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}). 
+#' } +#' returns a named \code{list} containing \code{reward$reward} and, where computable, +#' \code{reward$optimal} (used by "oracle" policies and to calculate regret). +#' } +# +#' } +#' +#' @seealso +#' +#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}}, +#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}} +#' +#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}}, +#' \code{\link{OfflineReplayEvaluatorBandit}} +#' +#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}} +#' +#' @examples +#' \dontrun{ +#' +#' horizon <- 1500 +#' simulations <- 100 +#' +#' continuous_arms <- function(x) { +#' -0.1*(x - 5) ^ 2 + 3.5 + rnorm(length(x),0,0.4) +#' } +#' +#' int_time <- 100 +#' amplitude <- 0.2 +#' learn_rate <- 0.3 +#' omega <- 2*pi/int_time +#' x0_start <- 2.0 +#' +#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start) +#' +#' bandit <- ContinuumBandit$new(FUN = continuous_arms) +#' +#' agent <- Agent$new(policy,bandit) +#' +#' history <- Simulator$new( agents = agent, +#' horizon = horizon, +#' simulations = simulations, +#' save_theta = TRUE )$run() +#' +#' plot(history, type = "average", regret = FALSE) +#' } +NULL diff --git a/bandit_continuum_offon.R b/bandit_continuum_offon.R new file mode 100644 index 0000000..df930c0 --- /dev/null +++ b/bandit_continuum_offon.R @@ -0,0 +1,42 @@ +#' @export +OnlineOfflineContinuumBandit <- R6::R6Class( + inherit = Bandit, + class = FALSE, + private = list( + S = NULL + ), + public = list( + class_name = "OnlineOfflineContinuumBandit", + delta = NULL, + horizon = NULL, + choice = NULL, + arm_function = NULL, + initialize = function(FUN, delta, horizon) { + self$arm_function <- FUN + self$horizon <- horizon + self$delta <- delta + self$k <- 1 + }, + post_initialization = function() { + self$choice <- runif(self$horizon, min=0, max=1) + private$S <- data.frame(self$choice, self$arm_function(self$choice)) + private$S <- private$S[sample(nrow(private$S)),] + colnames(private$S) <- c('choice', 'reward') + }, + get_context = function(index) { + context <- list() + context$k <- self$k + context + }, + get_reward = function(index, context, action) { + reward_at_index <- as.double(private$S$reward[[index]]) + if (abs(private$S$choice[[index]] - action$choice) < self$delta) { + reward <- list( + reward = reward_at_index + ) + } else { + NULL + } + } + ) +) diff --git a/bandit_continuum_offon_kern.R b/bandit_continuum_offon_kern.R new file mode 100644 index 0000000..50d2c09 --- /dev/null +++ b/bandit_continuum_offon_kern.R @@ -0,0 +1,51 @@ +#' @export +OnlineOfflineContinuumBanditKernel <- R6::R6Class( + inherit = Bandit, + class = FALSE, + private = list( + S = NULL, + n = NULL + ), + public = list( + class_name = "OnlineOfflineContinuumBanditKernel", + delta = NULL, + c1 = NULL, + c2 = NULL, + arm_function = NULL, + choice = NULL, + h = NULL, + kernel = NULL, + horizon = NULL, + initialize = function(FUN, horizon) { + self$arm_function <- FUN + self$k <- 1 + self$horizon <- horizon + self$h <- horizon^(-1/5) + self$kernel <- function(action_true, action_choice, bandwith){ 1/sqrt(2*pi)*exp(((action_choice - action_true) / bandwith)^2/2) } + }, + post_initialization = function() { + self$choice <- runif(self$horizon, min=0, max=1) + private$S <- data.frame(self$choice, self$arm_function(self$choice)) + private$S <- private$S[sample(nrow(private$S)),] + colnames(private$S) <- c('choice', 
'reward') + private$n <- 0 + }, + get_context = function(index) { + context <- list() + context$k <- self$k + context + }, + get_reward = function(index, context, action) { + reward_at_index <- as.double(private$S$reward[[index]]) + #kern_value <- self$kernel(action_true = private$S$choice[[index]], action_choice = action$choice, bandwith = self$h) + temp_u <- (action$choice - private$S$choice[[index]]) / self$h + kern_value <- 1/sqrt(2*pi) * exp(-temp_u^2 / 2) + #inc(private$n) <- 1 + #print(paste0("Kernel value: ", kern_value, "action choice: ", action$choice, "true action: ", private$S$choice[[index]], "divy: ", temp_u)) + reward <- list( + reward = (kern_value * reward_at_index), + optimal_reward = self$c2 + ) + } + ) +) diff --git a/demo_lif_bandit.R b/demo_lif_bandit.R new file mode 100644 index 0000000..7604eac --- /dev/null +++ b/demo_lif_bandit.R @@ -0,0 +1,234 @@ +# This is the demo file for the online and offline parameter tuning of Lock-in Feedback +# For policy and bandit specific code, please look at the files (as sourced above). +# First make sure to install contextual +# (see https://github.com/Nth-iteration-labs/contextual for a how to). +# +# For any questions, please contact the authors. + +library(contextual) +library(here) +library(ggplot2) + +source("./bandit_continuum_function_unimodal.R") +source("./bandit_continuum_function_bimodal.R") +source("./bandit_continuum_offon.R") +source("./bandit_continuum_offon_kern.R") +source("./policy_cont_lif_randstart.R") + +############################################################# +# # +# Online evaluation for LiF # +# # +############################################################# + +### Set seed +set.seed(1) + +### Set number of interactions (horizon) and number of repeats (simulations) +### In the paper we used a horizon of 10000 and 10000 simulations +horizon <- 10000 +simulations <- 10 + +### Set LiF specific parameters. Start point is set within the policy itself. 
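### For reference: at time t, LifPolicyRandstart proposes the action
### x0 + amplitude * cos(omega * t), clipped to [0, 1], and once int_time
### observations have been collected it nudges x0 by learn_rate times the mean
### of the demodulated signal. A minimal standalone sketch of the oscillation,
### with hypothetical example values (not used by the experiments below):
lif_proposal <- function(x0, amplitude, omega, t) {
  pmin(pmax(x0 + amplitude * cos(omega * t), 0), 1)  # cosine perturbation around x0, kept in [0, 1]
}
# e.g. lif_proposal(x0 = 0.8, amplitude = 0.1, omega = 2 * pi / 50, t = 1:5)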
+int_time <- 50 +amplitude_list <- seq(0.002, 0.2, length.out = 20) +learn_rate <- .1 +omega <- 2*pi/int_time + +### Set up two different bandits +bandits <- c(ContinuumBanditUnimodal$new(), ContinuumBanditBimodal$new()) + +### Set up all agents with different amplitudes and run them for each bandit +for (bandit in bandits){ + + agents <- list() + + for (i in 1:length(amplitude_list)){ + agents <- append(agents, Agent$new(LifPolicyRandstart$new(int_time, amplitude_list[i], learn_rate, omega), bandit)) + } + + history <- Simulator$new(agents = agents, + horizon = horizon, + simulations = simulations, + do_parallel = TRUE, + save_interval = 10)$run() + + ### Post-processing for plotting + iters <- length(amplitude_list) + reward_rate <- c() + confs <- c() + + for(i in 1:iters){ + reward_rate[[i]] <- history$cumulative[[i]]$cum_reward_rate + confs[[i]] <- history$cumulative[[i]]$cum_reward_rate_sd / sqrt(simulations) * qnorm(0.975) + } + + df <- data.frame(amp = amplitude_list, reward = reward_rate, ci = confs, delta = rep("Online")) + g <- ggplot(data = df, aes(x=amp, y=reward)) + + geom_line(colour = 'red') + + geom_errorbar(aes(ymin=reward-ci, ymax=reward+ci, color='red'), width=0.01) + + labs(x = "Amplitude", y = "Average reward per interaction") + + theme_bw(base_size = 15) + + theme(legend.position = "none") + + ### Saving data to use later + if (bandit$class_name == "ContinuumBanditBimodal"){ + online_lif_bimodal <- df + } else if (bandit$class_name == "ContinuumBanditUnimodal"){ + online_lif_unimodal <- df + } + + print(g) + print(amplitude_list[[which.max(reward_rate)]]) +} + +############################################################# +# # +# Offline evaluation for LiF # +# # +############################################################# + +### Set seed +set.seed(1) + +### Set number of interactions (horizon) and number of repeats (simulations) +### Typically same as in online evaluation +horizon <- 1000 +simulations <- 2 + +### Set LiF specific parameters. Start point is set within the policy itself. +### Typically same as in online evaluation +int_time <- 50 +amplitude_list <- seq(0.002, 0.2, length.out = 20) +learn_rate <- .1 +omega <- 2*pi/int_time + +### Set up functions to make offline dataset +unimodal_data <- function(x) { + c1 <- runif(1, 0.25, 0.75) + c2 <- 1 + return(-(x - c1) ^ 2 + c2 + rnorm(length(x), 0, 0.01)) +} + +bimodal_data <- function(x){ + mu1 <- runif(1, 0.15, 0.2) + sd1 <- runif(1, 0.1, 0.15) + mu2 <- runif(1, 0.7, 0.85) + sd2 <- runif(1, 0.1, 0.15) + y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1) + y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2) + return(y1 + y2 + rnorm(length(x), 0, 0.01)) +} + +functions <- list(list("unimodal", unimodal_data), list("bimodal", bimodal_data)) + +### Set up different delta's for the delta and kernel method. If delta = 0 we resort to the kernel method. 
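### For intuition: OnlineOfflineContinuumBandit only replays a logged interaction
### when the logged action lies within delta of the action the policy proposes,
### while OnlineOfflineContinuumBanditKernel weights every logged reward by a
### Gaussian kernel in the distance between the two actions. A rough sketch of
### the expected share of replayed points, assuming (for illustration only) that
### both logged and proposed actions are uniform on [0, 1]:
usable_fraction <- function(delta, n = 1e5) {
  logged   <- runif(n)                  # logged actions
  proposed <- runif(n)                  # actions proposed by some policy
  mean(abs(logged - proposed) < delta)  # share of interactions that would be replayed
}
# e.g. usable_fraction(0.1) is roughly 0.19 and usable_fraction(0.5) roughly 0.75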
+deltas <- c(0, 0.01, 0.1, 0.5) + +### Pre-allocation +offline_lif_unimodal_kernel <- data.frame() +offline_lif_unimodal <- data.frame() +offline_lif_bimodal_kernel <- data.frame() +offline_lif_bimodal <- data.frame() + +### Set up all agents with different amplitudes and run them for each bandit +### Do this for each specified delta +for (f in functions){ + for (d in deltas){ + if(d == 0){ + bandit <- OnlineOfflineContinuumBanditKernel$new(FUN = f[[2]], horizon = horizon) + } else { + bandit <- OnlineOfflineContinuumBandit$new(FUN = f[[2]], delta = d, horizon = horizon) + } + + agents <- list() + + for (amp in amplitude_list){ + agents <- append(agents, Agent$new(LifPolicyRandstart$new(int_time, amp, learn_rate, omega), bandit)) + } + + history <- Simulator$new(agents = agents, + horizon = horizon, + simulations = simulations, + policy_time_loop = FALSE, + save_interval = 20)$run() + + ### Post-processing for plotting + iters <- length(amplitude_list) + reward_rate <- c() + confs <- c() + + for(k in 1:iters){ + reward_rate[[k]] <- history$cumulative[[k]]$cum_reward_rate + dt <- history$get_data_table() + df_split <- split(dt, dt$agent) + for(dd in df_split){ + dd <- as.data.table(dd) + maxes <- dd[, .I[which.max(t)], by=sim]$V1 + select <- dd[maxes]$cum_reward_rate + confs[[k]] <- sd(select) / sqrt(simulations) * qnorm(0.975) + } + } + history$clear_data_table() + if (d == 0){ + df <- data.frame(amp = amplitude_list, reward = reward_rate, delta = as.factor(rep("Kernel")), ci = confs) + } else { + df <- data.frame(amp = amplitude_list, reward = reward_rate, delta = as.factor(rep(d)), ci = confs) + } + + if (f[[1]] == "bimodal"){ + if (d == 0){ + offline_lif_bimodal_kernel <- rbind(offline_lif_bimodal_kernel, df) + } else { + offline_lif_bimodal <- rbind(offline_lif_bimodal, df) + } + } else if(f[[1]] == "unimodal"){ + if (d == 0){ + offline_lif_unimodal_kernel <- rbind(offline_lif_unimodal_kernel, df) + } else { + offline_lif_unimodal <- rbind(offline_lif_unimodal, df) + } + } + } +} + +### Plotting both online and offline data together +different_plots <- list( + list("unimodal", rbind(online_lif_unimodal, offline_lif_unimodal)), + list("unimodal_kernel", offline_lif_unimodal_kernel), + list("bimodal", rbind(online_lif_bimodal, offline_lif_bimodal)), + list("bimodal_kernel", offline_lif_bimodal_kernel) +) + +for (dif_plot in different_plots){ + if(dif_plot[[1]] == "unimodal_kernel"){ + g <- ggplot(data = dif_plot[[2]], aes(x=amp, y=reward, label = delta)) + + geom_line(aes(colour = as.factor(delta))) + + geom_errorbar(data = dif_plot[[2]], aes(ymin=reward-ci, ymax=reward+ci, color=as.factor(delta)), width=.01) + + geom_vline(xintercept = 0.115, linetype = "dotted", color = "black", size = 1.5) + + theme(legend.position = "right") + #"none" + labs(x = "Amplitude", y = "Average reward per interaction", color="", fill="") + + theme_bw(base_size = 15) + ggsave(g, file=paste0("offline_lif_function_",dif_plot[[1]],".eps"), device="eps") + print(g) + } else if(dif_plot[[1]] == "bimodal_kernel"){ + g <- ggplot(data = dif_plot[[2]], aes(x=amp, y=reward, label = delta)) + + geom_line(aes(colour = as.factor(delta))) + + geom_errorbar(data = dif_plot[[2]], aes(ymin=reward-ci, ymax=reward+ci, color=as.factor(delta)), width=.01) + + geom_vline(xintercept = 0.035, linetype = "dotted", color = "black", size = 1.5) + + theme(legend.position = "right") + #"none" + labs(x = "Amplitude", y = "Average reward per interaction", color="", fill="") + + theme_bw(base_size = 15) + ggsave(g, 
file=paste0("offline_lif_function_",dif_plot[[1]],".eps"), device="eps") + print(g) + } else if(dif_plot[[1]] == "unimodal" || dif_plot[[1]] == "bimodal") { + g <- ggplot(data = dif_plot[[2]], aes(x=amp, y=reward, label = delta)) + + geom_line(aes(colour = as.factor(delta))) + + geom_errorbar(data = dif_plot[[2]], aes(ymin=reward-ci, ymax=reward+ci, color=as.factor(delta)), width=.01) + + theme(legend.position = "right") + #"none" + labs(x = "Amplitude", y = "Average reward per interaction", color="", fill="") + + theme_bw(base_size = 15) + ggsave(g, file=paste0("offline_lif_function_",dif_plot[[1]],"_delta.eps"), device="eps") + print(g) + } +} \ No newline at end of file diff --git a/demo_tbl_bandit.R b/demo_tbl_bandit.R new file mode 100644 index 0000000..24388e5 --- /dev/null +++ b/demo_tbl_bandit.R @@ -0,0 +1,247 @@ +# This is the demo file for the online and offline parameter tuning of Thompson sampling +# for Bayesian linear regression. For policy and bandit specific code, please look at +# the files (as sourced above). First make sure to install contextual +# (see https://github.com/Nth-iteration-labs/contextual for a how to). +# +# For any questions, please contact the authors. + +library(contextual) +library(here) +library(ggplot2) + +source("./bandit_continuum_function_unimodal.R") +source("./bandit_continuum_function_bimodal.R") +source("./bandit_continuum_offon.R") +source("./bandit_continuum_offon_kern.R") +source("./policy_tbl.R") + +############################################################# +# # +# Online evaluation for TBL # +# # +############################################################# + +### Set seed +set.seed(1) + +### Set number of interactions (horizon) and number of repeats (simulations) +### In the paper we used a horizon of 10000 and 10000 simulations +horizon <- 10000 +simulations <- 10 + +### Set TBL specific parameters +J <- matrix(c(5, 4, -4), nrow=1, ncol=3, byrow = TRUE) +err <- 1 +precision_list <- list(matrix(diag(c(0.01,0.01,0.02)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(0.1,0.1,0.2)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(0.4,0.4,1)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(1,1,2)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(2,2,5)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(10,10,20)), nrow=3, ncol=3, byrow = TRUE)) + +### Set up two different bandits +bandits <- c(ContinuumBanditUnimodal$new(), ContinuumBanditBimodal$new()) + +### Set up all agents with different prior precisions and run them for each bandit +for (bandit in bandits){ + + agents <- list() + + for (prec in precision_list){ + agents <- append(agents, Agent$new(ThompsonBayesianLinearPolicy$new(J = J, P = prec, err = err), bandit)) + } + + history <- Simulator$new(agents = agents, + horizon = horizon, + simulations = simulations, + do_parallel = TRUE, + save_interval = 10)$run() + + ### Post-processing for plotting + iters <- length(precision_list) + reward_rate <- c() + confs <- c() + + for(i in 1:iters){ + reward_rate[[i]] <- history$cumulative[[i]]$cum_reward_rate + confs[[i]] <- history$cumulative[[i]]$cum_reward_rate_sd / sqrt(simulations) * qnorm(0.975) + } + + df <- data.frame(prec = 1:length(precision_list), reward = reward_rate, ci = confs, delta = rep("Online")) + g <- ggplot(data = df, aes(x=prec, y=reward)) + + geom_line(colour = 'red') + + geom_errorbar(aes(ymin=reward-ci, ymax=reward+ci, color='red'), width=.2) + + labs(x = "Prior precision", y = "Average reward per interaction") + + scale_x_continuous(breaks = c(1,6), labels=c("Low", 
"High")) + + theme_bw(base_size = 15) + + theme(legend.position = "none") + + ### Saving data to use later + if (bandit$class_name == "ContinuumBanditBimodal"){ + online_tbl_bimodal <- df + } else if (bandit$class_name == "ContinuumBanditUnimodal"){ + online_tbl_unimodal <- df + } + + print(g) + print(precision_list[[which.max(reward_rate)]]) +} + +############################################################# +# # +# Offline evaluation for TBL # +# # +############################################################# + +### Set seed +set.seed(1) + +### Set number of interactions (horizon) and number of repeats (simulations) +### Typically same as in online evaluation +horizon <- 1000 +simulations <- 2 + +### Set TBL specific parameters +### Typically same as in online evaluation +J <- matrix(c(5, 4, -4), nrow=1, ncol=3, byrow = TRUE) +err <- 1 +precision_list <- list(matrix(diag(c(0.01,0.01,0.02)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(0.1,0.1,0.2)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(0.4,0.4,1)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(1,1,2)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(2,2,5)), nrow=3, ncol=3, byrow = TRUE), + matrix(diag(c(10,10,20)), nrow=3, ncol=3, byrow = TRUE)) + +### Set up functions to make offline dataset +unimodal_data <- function(x) { + c1 <- runif(1, 0.25, 0.75) + c2 <- 1 + return(-(x - c1) ^ 2 + c2 + rnorm(length(x), 0, 0.01)) +} + +bimodal_data <- function(x){ + mu1 <- runif(1, 0.15, 0.2) + sd1 <- runif(1, 0.1, 0.15) + mu2 <- runif(1, 0.7, 0.85) + sd2 <- runif(1, 0.1, 0.15) + y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1) + y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2) + return(y1 + y2 + rnorm(length(x), 0, 0.01)) +} + +functions <- list(list("unimodal", unimodal_data), list("bimodal", bimodal_data)) + +### Set up different delta's for the delta and kernel method. If delta = 0 we resort to the kernel method. 
+deltas <- c(0, 0.01, 0.1, 0.5) + +### Pre-allocation +offline_tbl_unimodal_kernel <- data.frame() +offline_tbl_unimodal <- data.frame() +offline_tbl_bimodal_kernel <- data.frame() +offline_tbl_bimodal <- data.frame() + +### Set up all agents with different prior precisions and run them for each bandit +### Do this for each specified delta +for (f in functions){ + for (d in deltas){ + if(d == 0){ + bandit <- OnlineOfflineContinuumBanditKernel$new(FUN = f[[2]], horizon = horizon) + } else { + bandit <- OnlineOfflineContinuumBandit$new(FUN = f[[2]], delta = d, horizon = horizon) + } + + agents <- list() + + for (prec in precision_list){ + agents <- append(agents, Agent$new(ThompsonBayesianLinearPolicy$new(J = J, P = prec, err = err), bandit)) + } + + history <- Simulator$new(agents = agents, + horizon = horizon, + simulations = simulations, + policy_time_loop = FALSE, + save_interval = 20)$run() + + ### Post-processing for plotting + iters <- length(precision_list) + reward_rate <- c() + confs <- c() + + for(k in 1:iters){ + reward_rate[[k]] <- history$cumulative[[k]]$cum_reward_rate + dt <- history$get_data_table() + df_split <- split(dt, dt$agent) + for(dd in df_split){ + dd <- as.data.table(dd) + maxes <- dd[, .I[which.max(t)], by=sim]$V1 + select <- dd[maxes]$cum_reward_rate + confs[[k]] <- sd(select) / sqrt(simulations) * qnorm(0.975) + } + } + + history$clear_data_table() + if (d == 0){ + df <- data.frame(prec = 1:length(precision_list), reward = reward_rate, delta = as.factor(rep("Kernel")), ci = confs) + } else { + df <- data.frame(prec = 1:length(precision_list), reward = reward_rate, delta = as.factor(rep(d)), ci = confs) + } + + if (f[[1]] == "bimodal"){ + if (d == 0){ + offline_tbl_bimodal_kernel <- rbind(offline_tbl_bimodal_kernel, df) + } else { + offline_tbl_bimodal <- rbind(offline_tbl_bimodal, df) + } + } else if(f[[1]] == "unimodal"){ + if (d == 0){ + offline_tbl_unimodal_kernel <- rbind(offline_tbl_unimodal_kernel, df) + } else { + offline_tbl_unimodal <- rbind(offline_tbl_unimodal, df) + } + } + } +} + +### Plotting both online and offline data together +different_plots <- list( + list("unimodal", rbind(online_tbl_unimodal, offline_tbl_unimodal)), + list("unimodal_kernel", offline_tbl_unimodal_kernel), + list("bimodal", rbind(online_tbl_bimodal, offline_tbl_bimodal)), + list("bimodal_kernel", offline_tbl_bimodal_kernel) +) + +for (dif_plot in different_plots){ + if(dif_plot[[1]] == "unimodal_kernel"){ + g <- ggplot(data = dif_plot[[2]], aes(x=prec, y=reward, label = delta)) + + geom_line(aes(colour = as.factor(delta))) + + geom_errorbar(data = dif_plot[[2]], aes(ymin=reward-ci, ymax=reward+ci, color=as.factor(delta)), width=.2) + + geom_vline(xintercept = 4, linetype = "dotted", color = "black", size = 1.5) + + theme(legend.position = "right") + #"none" + labs(x = "Prior precision", y = "Average reward per interaction", color="", fill="") + + scale_x_continuous(breaks = c(1,6), labels=c("Low", "High")) + + theme_bw(base_size = 15) + ggsave(g, file=paste0("offline_tbl_function_",dif_plot[[1]],".eps"), device="eps") + print(g) + } else if(dif_plot[[1]] == "bimodal_kernel"){ + g <- ggplot(data = dif_plot[[2]], aes(x=prec, y=reward, label = delta)) + + geom_line(aes(colour = as.factor(delta))) + + geom_errorbar(data = dif_plot[[2]], aes(ymin=reward-ci, ymax=reward+ci, color=as.factor(delta)), width=.2) + + geom_vline(xintercept = 4, linetype = "dotted", color = "black", size = 1.5) + + theme(legend.position = "right") + #"none" + labs(x = "Prior precision", y = "Average 
reward per interaction", color="", fill="") + + scale_x_continuous(breaks = c(1,6), labels=c("Low", "High")) + + theme_bw(base_size = 15) + ggsave(g, file=paste0("offline_tbl_function_",dif_plot[[1]],".eps"), device="eps") + print(g) + } else if(dif_plot[[1]] == "unimodal" || dif_plot[[1]] == "bimodal") { + g <- ggplot(data = dif_plot[[2]], aes(x=prec, y=reward, label = delta)) + + geom_line(aes(colour = as.factor(delta))) + + geom_errorbar(data = dif_plot[[2]], aes(ymin=reward-ci, ymax=reward+ci, color=as.factor(delta)), width=.2) + + theme(legend.position = "right") + #"none" + labs(x = "Prior precision", y = "Average reward per interaction", color="", fill="") + + scale_x_continuous(breaks = c(1,6), labels=c("Low", "High")) + + theme_bw(base_size = 15) + ggsave(g, file=paste0("offline_tbl_function_",dif_plot[[1]],"_delta.eps"), device="eps") + print(g) + } +} \ No newline at end of file diff --git a/policy_cont_lif_randstart.R b/policy_cont_lif_randstart.R new file mode 100644 index 0000000..c15c466 --- /dev/null +++ b/policy_cont_lif_randstart.R @@ -0,0 +1,96 @@ +#' @export +LifPolicyRandstart <- R6::R6Class( + portable = FALSE, + class = FALSE, + inherit = Policy, + public = list( + first = NULL, + inttime = NULL, # Integration time + amplitude = NULL, # Amplitude + learnrate = NULL, # Learnrate + omega = NULL, # Omega + x0_start = NULL, # x0 start value + class_name = "LifPolicyRandstart", + initialize = function(inttime,amplitude,learnrate,omega) { + super$initialize() + self$inttime <- inttime + self$amplitude <- amplitude + self$learnrate <- learnrate + self$omega <- omega + }, + post_initialization = function(){ + self$x0_start <- runif(1, min = 0.7, max = 1) + self$theta <- list('x0' = x0_start, 'Y' = rep(NA, inttime), 't_real' = 1) + }, + set_parameters = function(context_params) { + }, + get_action = function(t, context) { + action$choice <- self$theta$x0 + amplitude*cos(omega * self$theta$t_real) + if(action$choice < 0){ + action$choice <- 0 + }else if(action$choice > 1){ + action$choice <- 1 + } + action + }, + set_reward = function(t, context, action, reward) { + reward <- reward$reward + y <- amplitude*cos(omega * self$theta$t_real)*reward + self$theta$Y <- c(y, self$theta$Y)[seq_along(self$theta$Y)] + if (self$theta$t_real > inttime) + self$theta$x0 <- self$theta$x0 + learnrate * sum( self$theta$Y ) / inttime + self$theta$t_real <- self$theta$t_real + 1 + self$theta + } + ) +) + +#' Policy: Continuum Bandit Policy with Lock-in Feedback +#' +#' The continuum type Lock-in Feedback (LiF) policy is based on an approach used in physics and engineering, +#' where, if a physical variable y depends on the value of a well controllable physical variable x, +#' the search for argmax x f(x) can be solved via what is nowadays considered as standard electronics. +#' This approach relies on the possibility of making the variable x oscillate at a fixed frequency and to +#' look at the response of the dependent variable y at the very same frequency by means of a lock-in +#' amplifier. The method is particularly suitable when y is immersed in a high noise level, +#' where other more direct methods would fail. Furthermore, should the entire curve +#' shift (or, in other words, if argmax x f(x) changes in time, also known as concept drift), the circuit +#' will automatically adjust to the new situation and quickly reveal the new maximum position. 
+#' This approach is widely used in a very large number of applications, both in industry and research, +#' and is the basis for the Lock-in Feedback (LiF) method. +#' +#' Following this approach, Lock-in Feedback cycles through the following steps, again and again: +#' +#' * Oscillate a controllable independent variable X around a set value at a fixed pace. +#' * Apply the lock-in amplifier algorithm to obtain the amplitude of the outcome variable Y at +#' the pace you set in step 1. +#' * Is the amplitude of this variable zero? Congratulations, you have reached lock-in! +#' That is, you have found the optimal value of Y at the current value of X. +#' Still, this optimal value might shift over time, so move to step 1 and repeat the process to +#' maintain lock-in. +#' * Is the amplitude less than or greater than zero? +#' Then move the set value around which we are oscillating our independent variable X up or down on the basis +#' of the outcome. +#' +#' Now move to step 1 and repeat. +#' +#' @name LifPolicyRandstart +#' +#' @section Usage: +#' \preformatted{policy <- LifPolicyRandstart$new(inttime, amplitude, learnrate, omega)} +#' +#' @references +#' Kaptein, M. C., Van Emden, R., & Iannuzzi, D. (2016). Tracking the decoy: maximizing the decoy effect +#' through sequential experimentation. Palgrave Communications, 2, 16082. +#' +#' @seealso +#' +#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}}, +#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}} +#' +#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}}, +#' \code{\link{OfflineReplayEvaluatorBandit}} +#' +#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}} +NULL + diff --git a/policy_tbl.R b/policy_tbl.R new file mode 100644 index 0000000..238d003 --- /dev/null +++ b/policy_tbl.R @@ -0,0 +1,42 @@ +ThompsonBayesianLinearPolicy <- R6::R6Class( + portable = FALSE, + class = FALSE, + inherit = Policy, + public = list( + class_name = "ThompsonBayesianLinearPolicy", + J = NULL, + P = NULL, + err = NULL, + initialize = function(J = matrix(c(0, 0.025, -0.025), nrow=1, ncol=3, byrow = TRUE), + P = matrix(diag(c(2,2,5)), nrow=3, ncol=3, byrow = TRUE), + err=1) { + super$initialize() + self$J <- J + self$P <- P + self$err <- err + }, + set_parameters = function(context_params) { + self$theta <- list('J' = self$J, 'P' = self$P, 'err' = self$err) + }, + get_action = function(t, context) { + sigma <- solve(self$theta$P, tol = 1e-200) + mu <- sigma %*% matrix(self$theta$J) + betas <- contextual::mvrnorm(n = 1, mu, sigma) + action$choice <- -(betas[2] / (2*betas[3])) + if(action$choice > 1){ + action$choice <- 1 + } else if(action$choice < 0) { + action$choice <- 0 + } + action + }, + set_reward = function(t, context, action, reward) { + y <- reward$reward + x <- action$choice + x <- matrix(c(1,x,x^2), nrow = 1, ncol = 3, byrow = TRUE) + self$theta$J <- (x*y)/self$theta$err + self$theta$J + self$theta$P <- t(x)%*%x + self$theta$P + self$theta + } + ) +)
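
#' Policy: ThompsonBayesianLinearPolicy
#'
#' Thompson sampling for a Bayesian linear model that is quadratic in the action:
#' the policy keeps a Gaussian posterior with precision \code{P} and
#' precision-weighted mean \code{J}, samples coefficients from it and plays the
#' maximiser of the sampled quadratic on [0, 1].
#'
#' @examples
#' \dontrun{
#' # A minimal usage sketch (assumes contextual is loaded and the continuum
#' # bandits above are sourced); it mirrors the online evaluation in
#' # demo_tbl_bandit.R with a much smaller horizon, so results are illustrative.
#' policy  <- ThompsonBayesianLinearPolicy$new(J = matrix(c(5, 4, -4), nrow = 1, ncol = 3),
#'                                             P = diag(c(2, 2, 5)), err = 1)
#' bandit  <- ContinuumBanditUnimodal$new()
#' agent   <- Agent$new(policy, bandit)
#' history <- Simulator$new(agents = agent,
#'                          horizon = 100,
#'                          simulations = 5)$run()
#' plot(history, type = "average", regret = FALSE)
#' }
NULL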