Code commit
g0ulash committed Dec 4, 2019
1 parent b2a8046 commit f235995
Showing 9 changed files with 973 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -1,2 +1,4 @@
# offline-parameter-tuning
Code for the offline parameter tuning paper (submitted to IDA 2020).

To replicate the plots, see demo_lif_bandit.R and demo_tbl_bandit.R.
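A minimal sketch of the on-line simulation workflow behind the demos, assuming the contextual package (Agent, Simulator, plot) plus the LifPolicy and ContinuumBanditBimodal classes in this repository; the parameter values below are illustrative, not the paper's settings:

library(contextual)

horizon     <- 1500
simulations <- 100

int_time   <- 100
amplitude  <- 0.2
learn_rate <- 0.3
omega      <- 2 * pi / int_time
x0_start   <- 0.5

policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)  # illustrative settings
bandit <- ContinuumBanditBimodal$new()  # draws its own mu/sd parameters each simulation
agent  <- Agent$new(policy, bandit)

history <- Simulator$new(agents = agent, horizon = horizon, simulations = simulations)$run()
plot(history, type = "average", regret = FALSE)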
133 changes: 133 additions & 0 deletions bandit_continuum_function_bimodal.R
@@ -0,0 +1,133 @@
#' @export
library("truncnorm")
ContinuumBanditBimodal <- R6::R6Class(
inherit = Bandit,
class = FALSE,
public = list(
arm_function = NULL,
mu1 = NULL,
sd1 = NULL,
mu2 = NULL,
sd2 = NULL,
class_name = "ContinuumBanditBimodal",
initialize = function() {
self$arm_function <- function(x, mu1, sd1, mu2, sd2) {
y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1)
y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2)
return(y1 + y2 + rnorm(length(x), 0, 0.01))
}
super$initialize()
self$d <- 1
self$k <- 1
},
post_initialization = function(){
self$mu1 <- runif(1, 0.15, 0.35)
self$sd1 <- runif(1, 0.1, 0.2)
self$mu2 <- runif(1, 0.65, 0.85)
self$sd2 <- runif(1, 0.1, 0.2)
},
get_context = function(t) {
context <- list()
context$k <- self$k
context$d <- self$d
context
},
get_reward = function(t, context, action) {
reward <- list(
reward = self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2),
optimal_reward = self$mu2
)
}
)
)
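For one draw of the parameters, the expected reward at an arm x in [0, 1] is the sum of two truncated normal densities; a quick visualisation sketch, with (mu1, sd1, mu2, sd2) picked from the ranges used in post_initialization above:

library(truncnorm)

# Illustrative parameter draw within the ranges of post_initialization()
expected_reward <- function(x, mu1 = 0.25, sd1 = 0.15, mu2 = 0.75, sd2 = 0.15) {
  dtruncnorm(x, a = 0, b = 1, mean = mu1, sd = sd1) +
    dtruncnorm(x, a = 0, b = 1, mean = mu2, sd = sd2)
}

# Plot the noiseless reward surface over the unit interval
curve(expected_reward(x), from = 0, to = 1, xlab = "arm x", ylab = "expected reward")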

#' Bandit: ContinuumBandit
#'
#' A function-based continuum multi-armed bandit
#' where arms are chosen from a subset of the real line and the mean rewards
#' are assumed to be a continuous function of the arms.
#'
#' @section Usage:
#' \preformatted{
#' bandit <- ContinuumBandit$new(FUN)
#' }
#'
#' @name ContinuumBandit
#'
#'
#' @section Arguments:
#' \describe{
#' \item{FUN}{continuous function.}
#' }
#'
#' @section Methods:
#'
#' \describe{
#'
#'   \item{\code{new(FUN)}}{ generates and instantiates a new \code{ContinuumBandit} instance. }
#'
#' \item{\code{get_context(t)}}{
#' argument:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' }
#' returns a named \code{list}
#' containing the current \code{d x k} dimensional matrix \code{context$X},
#' the number of arms \code{context$k} and the number of features \code{context$d}.
#' }
#'
#' \item{\code{get_reward(t, context, action)}}{
#' arguments:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' \item \code{context}: list, containing the current \code{context$X} (d x k context matrix),
#' \code{context$k} (number of arms) and \code{context$d} (number of context features)
#' (as set by \code{bandit}).
#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}).
#' }
#' returns a named \code{list} containing \code{reward$reward} and, where computable,
#'      \code{reward$optimal_reward} (used by "oracle" policies and to calculate regret).
#' }
#'
#' }
#'
#' @seealso
#'
#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}},
#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}}
#'
#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}},
#' \code{\link{OfflineReplayEvaluatorBandit}}
#'
#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}}
#'
#' @examples
#' \dontrun{
#'
#' horizon <- 1500
#' simulations <- 100
#'
#' continuous_arms <- function(x) {
#' -0.1*(x - 5) ^ 2 + 3.5 + rnorm(length(x),0,0.4)
#' }
#'
#' int_time <- 100
#' amplitude <- 0.2
#' learn_rate <- 0.3
#' omega <- 2*pi/int_time
#' x0_start <- 2.0
#'
#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)
#'
#' bandit <- ContinuumBandit$new(FUN = continuous_arms)
#'
#' agent <- Agent$new(policy,bandit)
#'
#' history <- Simulator$new( agents = agent,
#' horizon = horizon,
#' simulations = simulations,
#' save_theta = TRUE )$run()
#'
#' plot(history, type = "average", regret = FALSE)
#' }
NULL
126 changes: 126 additions & 0 deletions bandit_continuum_function_unimodal.R
@@ -0,0 +1,126 @@
#' @export
ContinuumBanditUnimodal <- R6::R6Class(
inherit = Bandit,
class = FALSE,
public = list(
arm_function = NULL,
c1 = NULL,
c2 = NULL,
class_name = "ContinuumBanditUnimodal",
initialize = function() {
self$c2 <- 1
self$arm_function <- function(x, c1 = 0.25, c2 = 0.75) {
-(x - c1) ^ 2 + c2 + rnorm(length(x), 0, 0.01)
}
super$initialize()
self$d <- 1
self$k <- 1
},
post_initialization = function(){
self$c1 <- runif(1,0.25,0.75)
},
get_context = function(t) {
context <- list()
context$k <- self$k
context$d <- self$d
context
},
get_reward = function(t, context, action) {
reward <- list(
reward = self$arm_function(action$choice, self$c1, self$c2),
optimal_reward = self$c2
)
}
)
)

#' Bandit: ContinuumBandit
#'
#' A function-based continuum multi-armed bandit
#' where arms are chosen from a subset of the real line and the mean rewards
#' are assumed to be a continuous function of the arms.
#'
#' @section Usage:
#' \preformatted{
#' bandit <- ContinuumBandit$new(FUN)
#' }
#'
#' @name ContinuumBandit
#'
#'
#' @section Arguments:
#' \describe{
#' \item{FUN}{continuous function.}
#' }
#'
#' @section Methods:
#'
#' \describe{
#'
#'   \item{\code{new(FUN)}}{ generates and instantiates a new \code{ContinuumBandit} instance. }
#'
#' \item{\code{get_context(t)}}{
#' argument:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' }
#' returns a named \code{list}
#' containing the current \code{d x k} dimensional matrix \code{context$X},
#' the number of arms \code{context$k} and the number of features \code{context$d}.
#' }
#'
#' \item{\code{get_reward(t, context, action)}}{
#' arguments:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' \item \code{context}: list, containing the current \code{context$X} (d x k context matrix),
#' \code{context$k} (number of arms) and \code{context$d} (number of context features)
#' (as set by \code{bandit}).
#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}).
#' }
#' returns a named \code{list} containing \code{reward$reward} and, where computable,
#'      \code{reward$optimal_reward} (used by "oracle" policies and to calculate regret).
#' }
#'
#' }
#'
#' @seealso
#'
#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}},
#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}}
#'
#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}},
#' \code{\link{OfflineReplayEvaluatorBandit}}
#'
#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}}
#'
#' @examples
#' \dontrun{
#'
#' horizon <- 1500
#' simulations <- 100
#'
#' continuous_arms <- function(x) {
#' -0.1*(x - 5) ^ 2 + 3.5 + rnorm(length(x),0,0.4)
#' }
#'
#' int_time <- 100
#' amplitude <- 0.2
#' learn_rate <- 0.3
#' omega <- 2*pi/int_time
#' x0_start <- 2.0
#'
#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)
#'
#' bandit <- ContinuumBandit$new(FUN = continuous_arms)
#'
#' agent <- Agent$new(policy,bandit)
#'
#' history <- Simulator$new( agents = agent,
#' horizon = horizon,
#' simulations = simulations,
#' save_theta = TRUE )$run()
#'
#' plot(history, type = "average", regret = FALSE)
#' }
NULL
42 changes: 42 additions & 0 deletions bandit_continuum_offon.R
@@ -0,0 +1,42 @@
#' @export
OnlineOfflineContinuumBandit <- R6::R6Class(
inherit = Bandit,
class = FALSE,
private = list(
S = NULL
),
public = list(
class_name = "OnlineOfflineContinuumBandit",
delta = NULL,
horizon = NULL,
choice = NULL,
arm_function = NULL,
initialize = function(FUN, delta, horizon) {
self$arm_function <- FUN
self$horizon <- horizon
self$delta <- delta
self$k <- 1
},
    post_initialization = function() {
      # Logged dataset: arm choices sampled uniformly on [0, 1] by the logging
      # policy, paired with their observed rewards and shuffled once per simulation.
      self$choice <- runif(self$horizon, min = 0, max = 1)
      private$S <- data.frame(self$choice, self$arm_function(self$choice))
      private$S <- private$S[sample(nrow(private$S)), ]
      colnames(private$S) <- c('choice', 'reward')
    },
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
    get_reward = function(index, context, action) {
      reward_at_index <- as.double(private$S$reward[[index]])
      # Rejection-sampling replay: the logged reward is only used when the
      # policy's choice lies within delta of the logged choice; otherwise the
      # sample is discarded by returning NULL.
      if (abs(private$S$choice[[index]] - action$choice) < self$delta) {
        reward <- list(
          reward = reward_at_index
        )
      } else {
        NULL
      }
    }
)
)
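A minimal sketch of wiring this replay bandit up for offline evaluation: the bandit builds a uniformly logged dataset from FUN inside post_initialization, and a replayed reward only counts when the policy's choice falls within delta of the logged choice. The reward function, delta, and policy settings below are illustrative, not the paper's:

horizon <- 10000
delta   <- 0.05

unimodal_reward <- function(x) -(x - 0.5)^2 + 0.75 + rnorm(length(x), 0, 0.01)

bandit <- OnlineOfflineContinuumBandit$new(FUN = unimodal_reward, delta = delta, horizon = horizon)
policy <- LifPolicy$new(100, 0.2, 0.3, 2 * pi / 100, 0.5)  # illustrative settings
agent  <- Agent$new(policy, bandit)

history <- Simulator$new(agents = agent, horizon = horizon, simulations = 10)$run()
plot(history, type = "average", regret = FALSE)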
51 changes: 51 additions & 0 deletions bandit_continuum_offon_kern.R
@@ -0,0 +1,51 @@
#' @export
OnlineOfflineContinuumBanditKernel <- R6::R6Class(
inherit = Bandit,
class = FALSE,
private = list(
S = NULL,
n = NULL
),
public = list(
class_name = "OnlineOfflineContinuumBanditKernel",
delta = NULL,
c1 = NULL,
c2 = NULL,
arm_function = NULL,
choice = NULL,
h = NULL,
kernel = NULL,
horizon = NULL,
initialize = function(FUN, horizon) {
self$arm_function <- FUN
self$k <- 1
self$horizon <- horizon
self$h <- horizon^(-1/5)
      self$kernel <- function(action_true, action_choice, bandwidth){ 1/sqrt(2*pi)*exp(-((action_choice - action_true) / bandwidth)^2/2) }
},
post_initialization = function() {
self$choice <- runif(self$horizon, min=0, max=1)
private$S <- data.frame(self$choice, self$arm_function(self$choice))
private$S <- private$S[sample(nrow(private$S)),]
colnames(private$S) <- c('choice', 'reward')
private$n <- 0
},
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
    get_reward = function(index, context, action) {
      reward_at_index <- as.double(private$S$reward[[index]])
      # Gaussian kernel weight of the scaled distance between the policy's
      # choice and the logged choice (bandwidth h); equivalent to dnorm(temp_u).
      temp_u <- (action$choice - private$S$choice[[index]]) / self$h
      kern_value <- 1/sqrt(2*pi) * exp(-temp_u^2 / 2)
      reward <- list(
        reward = (kern_value * reward_at_index),
        optimal_reward = self$c2
      )
    }
)
)
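The weight applied to each logged reward is the standard normal density of the scaled distance between the policy's choice and the logged choice; a quick base-R check of the formula used in get_reward (values are illustrative):

h      <- 1000^(-1/5)                # bandwidth for a horizon of 1000
logged <- 0.42                       # logged choice (illustrative)
chosen <- 0.40                       # policy's choice (illustrative)
u      <- (chosen - logged) / h
manual <- 1 / sqrt(2 * pi) * exp(-u^2 / 2)
stopifnot(isTRUE(all.equal(manual, dnorm(u))))  # identical to the standard normal density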