# bandit_continuum_function_unimodal.R
#' @export
ContinuumBanditUnimodal <- R6::R6Class(
  inherit = Bandit,
  class = FALSE,
  public = list(
    arm_function = NULL,
    c1 = NULL,
    c2 = NULL,
    class_name = "ContinuumBanditUnimodal",
    initialize = function() {
      # The optimal reward is fixed at c2 = 1; the location of the optimum, c1,
      # is drawn anew for every simulation in post_initialization().
      self$c2 <- 1
      self$arm_function <- function(x, c1 = 0.25, c2 = 0.75) {
        # Unimodal expected reward -(x - c1)^2 + c2, observed with Gaussian noise.
        -(x - c1)^2 + c2 + rnorm(length(x), 0, 0.01)
      }
      super$initialize()
      self$d <- 1  # number of context features
      self$k <- 1  # number of (continuous) arms
    },
    post_initialization = function() {
      # Draw the location of the optimum uniformly at random.
      self$c1 <- runif(1, 0.25, 0.75)
    },
    get_context = function(t) {
      context <- list()
      context$k <- self$k
      context$d <- self$d
      context
    },
    get_reward = function(t, context, action) {
      # Return the noisy reward of the chosen arm together with the optimal
      # (noise-free) reward, used by oracle policies and for regret calculation.
      list(
        reward = self$arm_function(action$choice, self$c1, self$c2),
        optimal_reward = self$c2
      )
    }
  )
)
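# Minimal usage sketch (an illustration, not part of the package source file):
# assumes the contextual package is loaded, which supplies the Bandit superclass
# inherited above.
bandit  <- ContinuumBanditUnimodal$new()
bandit$post_initialization()                      # draws c1 uniformly from [0.25, 0.75]
context <- bandit$get_context(t = 1)              # list(k = 1, d = 1)
action  <- list(choice = 0.5)                     # an "arm" is a point on the real line
reward  <- bandit$get_reward(1, context, action)
reward$reward                                     # noisy unimodal reward at x = 0.5
reward$optimal_reward                             # always c2 = 1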
#' Bandit: ContinuumBanditUnimodal
#'
#' A function-based continuum multi-armed bandit
#' where arms are chosen from a subset of the real line and the mean reward is a
#' unimodal continuous function of the arms: \code{-(x - c1)^2 + c2}, with the
#' location of the optimum \code{c1} drawn uniformly from [0.25, 0.75] at the start
#' of each simulation and the optimal mean reward fixed at \code{c2 = 1}.
#'
#' @section Usage:
#' \preformatted{
#' bandit <- ContinuumBanditUnimodal$new()
#' }
#'
#' @name ContinuumBanditUnimodal
#'
#' @section Methods:
#'
#' \describe{
#'
#' \item{\code{new()}}{ generates and instantiates a new \code{ContinuumBanditUnimodal} instance. }
#'
#' \item{\code{get_context(t)}}{
#' argument:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' }
#' returns a named \code{list}
#' containing the number of arms \code{context$k} and
#' the number of context features \code{context$d}.
#' }
#'
#' \item{\code{get_reward(t, context, action)}}{
#' arguments:
#' \itemize{
#' \item \code{t}: integer, time step \code{t}.
#' \item \code{context}: list, containing \code{context$k} (number of arms) and
#' \code{context$d} (number of context features) (as set by \code{bandit}).
#' \item \code{action}: list, containing \code{action$choice} (as set by \code{policy}).
#' }
#' returns a named \code{list} containing \code{reward$reward} and, where computable,
#' \code{reward$optimal_reward} (used by "oracle" policies and to calculate regret).
#' }
#'
#' }
#'
#' @seealso
#'
#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}},
#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}}
#'
#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}},
#' \code{\link{OfflineReplayEvaluatorBandit}}
#'
#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}}
#'
#' @examples
#' \dontrun{
#'
#' horizon <- 1500
#' simulations <- 100
#'
#' int_time <- 100
#' amplitude <- 0.2
#' learn_rate <- 0.3
#' omega <- 2*pi/int_time
#' x0_start <- 2.0
#'
#' policy <- LifPolicy$new(int_time, amplitude, learn_rate, omega, x0_start)
#'
#' bandit <- ContinuumBanditUnimodal$new()
#'
#' agent <- Agent$new(policy, bandit)
#'
#' history <- Simulator$new( agents = agent,
#' horizon = horizon,
#' simulations = simulations,
#' save_theta = TRUE )$run()
#'
#' plot(history, type = "average", regret = FALSE)
#' }
NULL
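# The expected (noise-free) reward of ContinuumBanditUnimodal is unimodal in the chosen
# arm x: f(x) = -(x - c1)^2 + c2, maximised at x = c1 with value c2, so the per-step
# regret of playing arm x equals (x - c1)^2. A standalone base-R sketch of that curve
# (c1 = 0.5 is an illustrative value within [0.25, 0.75], not taken from the class):
c1 <- 0.5
c2 <- 1
x  <- seq(0, 1, by = 0.01)
f  <- -(x - c1)^2 + c2      # expected reward curve
regret <- c2 - f            # equals (x - c1)^2
x[which.max(f)]             # arm with the highest expected reward: 0.5 (= c1)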