Code commit
g0ulash committed Dec 4, 2019
1 parent b2a8046 commit f235995
Showing 9 changed files with 973 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -1,2 +1,4 @@
# offline-parameter-tuning
Code for the offline parameter tuning paper (submitted to IDA 2020).

To replicate the plots, see demo_lif_bandit.R and demo_tbl_bandit.R.
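A minimal sketch of the on-line simulation workflow behind the demos, assuming the contextual package (Agent, Simulator, plot) plus the LifPolicy and ContinuumBanditBimodal classes in this repository; the parameter values below are illustrative, not the paper's settings:

library(contextual)

horizon     <- 1500
simulations <- 100

int_time   <- 100
amplitude  <- 0.2
learn_rate <- 0.3
omega      <- 2 * pi / int_time
x0_start   <- 0.5

policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)  # illustrative settings
bandit <- ContinuumBanditBimodal$new()  # draws its own mu/sd parameters each simulation
agent  <- Agent$new(policy, bandit)

history <- Simulator$new(agents = agent, horizon = horizon, simulations = simulations)$run()
plot(history, type = "average", regret = FALSE)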
133 changes: 133 additions & 0 deletions bandit_continuum_function_bimodal.R
@@ -0,0 +1,133 @@
#' @export
library("truncnorm")
ContinuumBanditBimodal <- R6::R6Class(
inherit = Bandit,
class = FALSE,
public = list(
arm_function = NULL,
mu1 = NULL,
sd1 = NULL,
mu2 = NULL,
sd2 = NULL,
class_name = "ContinuumBanditBimodal",
initialize = function() {
self$arm_function <- function(x, mu1, sd1, mu2, sd2) {
y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1)
y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2)
return(y1 + y2 + rnorm(length(x), 0, 0.01))
}
super$initialize()
self$d <- 1
self$k <- 1
},
post_initialization = function(){
self$mu1 <- runif(1, 0.15, 0.35)
self$sd1 <- runif(1, 0.1, 0.2)
self$mu2 <- runif(1, 0.65, 0.85)
self$sd2 <- runif(1, 0.1, 0.2)
},
get_context = function(t) {
context <- list()
context$k <- self$k
context$d <- self$d
context
},
get_reward = function(t, context, action) {
reward <- list(
reward = self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2),
optimal_reward = self$mu2
)
}
)
)
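For one draw of the parameters, the expected reward at an arm x in [0, 1] is the sum of two truncated normal densities; a quick visualisation sketch, with (mu1, sd1, mu2, sd2) picked from the ranges used in post_initialization above:

library(truncnorm)

# Illustrative parameter draw within the ranges of post_initialization()
expected_reward <- function(x, mu1 = 0.25, sd1 = 0.15, mu2 = 0.75, sd2 = 0.15) {
  dtruncnorm(x, a = 0, b = 1, mean = mu1, sd = sd1) +
    dtruncnorm(x, a = 0, b = 1, mean = mu2, sd = sd2)
}

# Plot the noiseless reward surface over the unit interval
curve(expected_reward(x), from = 0, to = 1, xlab = "arm x", ylab = "expected reward")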

#' Bandit: ContinuumBandit
#'
#' A function-based continuum multi-armed bandit
#' where arms are chosen from a subset of the real line and the mean rewards
#' are assumed to be a continuous function of the arms.
#'
#' @section Usage:
#' \preformatted{
#' bandit <- ContinuumBandit$new(FUN)
#' }
#'
#' @name ContinuumBandit
#'
#'
#' @section Arguments:
#' \describe{
#' \item{FUN}{continuous function.}
#' }
#'
#' @section Methods:
#'
#' \describe{
#'
#'   \item{\code{new(FUN)}}{ generates and instantiates a new \code{ContinuumBandit} instance. }
#'
#' \item{\code{get_context(t)}}{
#' argument:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' }
#' returns a named \code{list}
#' containing the current \code{d x k} dimensional matrix \code{context$X},
#' the number of arms \code{context$k} and the number of features \code{context$d}.
#' }
#'
#' \item{\code{get_reward(t, context, action)}}{
#' arguments:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' \item \code{context}: list, containing the current \code{context$X} (d x k context matrix),
#' \code{context$k} (number of arms) and \code{context$d} (number of context features)
#' (as set by \code{bandit}).
#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}).
#' }
#' returns a named \code{list} containing \code{reward$reward} and, where computable,
#'      \code{reward$optimal_reward} (used by "oracle" policies and to calculate regret).
#' }
#'
#' }
#'
#' @seealso
#'
#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}},
#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}}
#'
#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}},
#' \code{\link{OfflineReplayEvaluatorBandit}}
#'
#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}}
#'
#' @examples
#' \dontrun{
#'
#' horizon <- 1500
#' simulations <- 100
#'
#' continuous_arms <- function(x) {
#' -0.1*(x - 5) ^ 2 + 3.5 + rnorm(length(x),0,0.4)
#' }
#'
#' int_time <- 100
#' amplitude <- 0.2
#' learn_rate <- 0.3
#' omega <- 2*pi/int_time
#' x0_start <- 2.0
#'
#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)
#'
#' bandit <- ContinuumBandit$new(FUN = continuous_arms)
#'
#' agent <- Agent$new(policy,bandit)
#'
#' history <- Simulator$new( agents = agent,
#' horizon = horizon,
#' simulations = simulations,
#' save_theta = TRUE )$run()
#'
#' plot(history, type = "average", regret = FALSE)
#' }
NULL
126 changes: 126 additions & 0 deletions bandit_continuum_function_unimodal.R
@@ -0,0 +1,126 @@
#' @export
ContinuumBanditUnimodal <- R6::R6Class(
inherit = Bandit,
class = FALSE,
public = list(
arm_function = NULL,
c1 = NULL,
c2 = NULL,
class_name = "ContinuumBanditUnimodal",
initialize = function() {
self$c2 <- 1
self$arm_function <- function(x, c1 = 0.25, c2 = 0.75) {
-(x - c1) ^ 2 + c2 + rnorm(length(x), 0, 0.01)
}
super$initialize()
self$d <- 1
self$k <- 1
},
post_initialization = function(){
self$c1 <- runif(1,0.25,0.75)
},
get_context = function(t) {
context <- list()
context$k <- self$k
context$d <- self$d
context
},
get_reward = function(t, context, action) {
reward <- list(
reward = self$arm_function(action$choice, self$c1, self$c2),
optimal_reward = self$c2
)
}
)
)

#' Bandit: ContinuumBandit
#'
#' A function-based continuum multi-armed bandit
#' where arms are chosen from a subset of the real line and the mean rewards
#' are assumed to be a continuous function of the arms.
#'
#' @section Usage:
#' \preformatted{
#' bandit <- ContinuumBandit$new(FUN)
#' }
#'
#' @name ContinuumBandit
#'
#'
#' @section Arguments:
#' \describe{
#' \item{FUN}{continuous function.}
#' }
#'
#' @section Methods:
#'
#' \describe{
#'
#'   \item{\code{new(FUN)}}{ generates and instantiates a new \code{ContinuumBandit} instance. }
#'
#' \item{\code{get_context(t)}}{
#' argument:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' }
#' returns a named \code{list}
#' containing the current \code{d x k} dimensional matrix \code{context$X},
#' the number of arms \code{context$k} and the number of features \code{context$d}.
#' }
#'
#' \item{\code{get_reward(t, context, action)}}{
#' arguments:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' \item \code{context}: list, containing the current \code{context$X} (d x k context matrix),
#' \code{context$k} (number of arms) and \code{context$d} (number of context features)
#' (as set by \code{bandit}).
#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}).
#' }
#' returns a named \code{list} containing \code{reward$reward} and, where computable,
#'      \code{reward$optimal_reward} (used by "oracle" policies and to calculate regret).
#' }
#'
#' }
#'
#' @seealso
#'
#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}},
#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}}
#'
#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}},
#' \code{\link{OfflineReplayEvaluatorBandit}}
#'
#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}}
#'
#' @examples
#' \dontrun{
#'
#' horizon <- 1500
#' simulations <- 100
#'
#' continuous_arms <- function(x) {
#' -0.1*(x - 5) ^ 2 + 3.5 + rnorm(length(x),0,0.4)
#' }
#'
#' int_time <- 100
#' amplitude <- 0.2
#' learn_rate <- 0.3
#' omega <- 2*pi/int_time
#' x0_start <- 2.0
#'
#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)
#'
#' bandit <- ContinuumBandit$new(FUN = continuous_arms)
#'
#' agent <- Agent$new(policy,bandit)
#'
#' history <- Simulator$new( agents = agent,
#' horizon = horizon,
#' simulations = simulations,
#' save_theta = TRUE )$run()
#'
#' plot(history, type = "average", regret = FALSE)
#' }
NULL
42 changes: 42 additions & 0 deletions bandit_continuum_offon.R
@@ -0,0 +1,42 @@
#' @export
OnlineOfflineContinuumBandit <- R6::R6Class(
inherit = Bandit,
class = FALSE,
private = list(
S = NULL
),
public = list(
class_name = "OnlineOfflineContinuumBandit",
delta = NULL,
horizon = NULL,
choice = NULL,
arm_function = NULL,
initialize = function(FUN, delta, horizon) {
self$arm_function <- FUN
self$horizon <- horizon
self$delta <- delta
self$k <- 1
},
    post_initialization = function() {
      # Logged dataset: arm choices sampled uniformly on [0, 1] by the logging
      # policy, paired with their observed rewards and shuffled once per simulation.
      self$choice <- runif(self$horizon, min = 0, max = 1)
      private$S <- data.frame(self$choice, self$arm_function(self$choice))
      private$S <- private$S[sample(nrow(private$S)), ]
      colnames(private$S) <- c('choice', 'reward')
    },
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
    get_reward = function(index, context, action) {
      reward_at_index <- as.double(private$S$reward[[index]])
      # Rejection-sampling replay: the logged reward is only used when the
      # policy's choice lies within delta of the logged choice; otherwise the
      # sample is discarded by returning NULL.
      if (abs(private$S$choice[[index]] - action$choice) < self$delta) {
        reward <- list(
          reward = reward_at_index
        )
      } else {
        NULL
      }
    }
)
)
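A minimal sketch of wiring this replay bandit up for offline evaluation: the bandit builds a uniformly logged dataset from FUN inside post_initialization, and a replayed reward only counts when the policy's choice falls within delta of the logged choice. The reward function, delta, and policy settings below are illustrative, not the paper's:

horizon <- 10000
delta   <- 0.05

unimodal_reward <- function(x) -(x - 0.5)^2 + 0.75 + rnorm(length(x), 0, 0.01)

bandit <- OnlineOfflineContinuumBandit$new(FUN = unimodal_reward, delta = delta, horizon = horizon)
policy <- LifPolicy$new(100, 0.2, 0.3, 2 * pi / 100, 0.5)  # illustrative settings
agent  <- Agent$new(policy, bandit)

history <- Simulator$new(agents = agent, horizon = horizon, simulations = 10)$run()
plot(history, type = "average", regret = FALSE)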
51 changes: 51 additions & 0 deletions bandit_continuum_offon_kern.R
@@ -0,0 +1,51 @@
#' @export
OnlineOfflineContinuumBanditKernel <- R6::R6Class(
inherit = Bandit,
class = FALSE,
private = list(
S = NULL,
n = NULL
),
public = list(
class_name = "OnlineOfflineContinuumBanditKernel",
delta = NULL,
c1 = NULL,
c2 = NULL,
arm_function = NULL,
choice = NULL,
h = NULL,
kernel = NULL,
horizon = NULL,
initialize = function(FUN, horizon) {
self$arm_function <- FUN
self$k <- 1
self$horizon <- horizon
self$h <- horizon^(-1/5)
      self$kernel <- function(action_true, action_choice, bandwidth){ 1/sqrt(2*pi)*exp(-((action_choice - action_true) / bandwidth)^2/2) }
},
post_initialization = function() {
self$choice <- runif(self$horizon, min=0, max=1)
private$S <- data.frame(self$choice, self$arm_function(self$choice))
private$S <- private$S[sample(nrow(private$S)),]
colnames(private$S) <- c('choice', 'reward')
private$n <- 0
},
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
    get_reward = function(index, context, action) {
      reward_at_index <- as.double(private$S$reward[[index]])
      # Gaussian kernel weight of the scaled distance between the policy's
      # choice and the logged choice (bandwidth h); equivalent to dnorm(temp_u).
      temp_u <- (action$choice - private$S$choice[[index]]) / self$h
      kern_value <- 1/sqrt(2*pi) * exp(-temp_u^2 / 2)
      reward <- list(
        reward = (kern_value * reward_at_index),
        optimal_reward = self$c2
      )
    }
)
)
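The weight applied to each logged reward is the standard normal density of the scaled distance between the policy's choice and the logged choice; a quick base-R check of the formula used in get_reward (values are illustrative):

h      <- 1000^(-1/5)                # bandwidth for a horizon of 1000
logged <- 0.42                       # logged choice (illustrative)
chosen <- 0.40                       # policy's choice (illustrative)
u      <- (chosen - logged) / h
manual <- 1 / sqrt(2 * pi) * exp(-u^2 / 2)
stopifnot(isTRUE(all.equal(manual, dnorm(u))))  # identical to the standard normal density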