
Commit

Updated to have proper offline simulation from first version of the paper. Only plotting is left to do.
g0ulash committed Jul 13, 2020
1 parent f235995 commit 673805d
Showing 17 changed files with 51,509 additions and 45 deletions.
116 changes: 116 additions & 0 deletions 1_offline_ranking.R
@@ -0,0 +1,116 @@
# This is the demo file for the online and offline parameter tuning of Lock-in Feedback
# For policy and bandit specific code, please look at the files sourced below.
# First make sure to install contextual
# (see https://github.com/Nth-iteration-labs/contextual for a how to).
#
# For any questions, please contact the authors.
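# A minimal install sketch for contextual (assuming you want the development version
# from the GitHub repository linked above):
#   install.packages("devtools")
#   devtools::install_github("Nth-iteration-labs/contextual")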

library(contextual)
library(here)
library(ggplot2)
library(cowplot)

source("./bandit_continuum_function_unimodal.R")
source("./bandit_continuum_function_bimodal.R")
source("./bandit_continuum_offon.R")
source("./policy_cont_lif_randstart.R")
source("./policy_tbl.R")
source("./policy_efirst.R")
source("./policy_ur.R")

#############################################################
# #
# Online and Offline evaluation #
# #
#############################################################

### Set seed
set.seed(1)

### Set number of interactions (horizon) and number of repeats (simulations)
### In the paper we used a horizon of 10000 and 10000 simulations
horizon <- 10
simulations <- 1

### Set up two different bandits
bandits <- c(ContinuumBanditUnimodal$new(), ContinuumBanditBimodal$new())

### Set up functions to make offline dataset
unimodal_data <- function(x){
  c1 <- runif(1, 0.25, 0.75)
  c2 <- 1
  return(list("data" = -(x - c1)^2 + c2 + rnorm(length(x), 0, 0.01), "max" = c2))
}

bimodal_data <- function(x){
  mu1 <- runif(1, 0.15, 0.2)
  sd1 <- runif(1, 0.1, 0.15)
  mu2 <- runif(1, 0.7, 0.85)
  sd2 <- runif(1, 0.1, 0.15)
  y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1)
  y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2)
  maxval <- truncnorm::dtruncnorm(mu2, a=0, b=1, mean=mu1, sd=sd1) + truncnorm::dtruncnorm(mu2, a=0, b=1, mean=mu2, sd=sd2)
  return(list("data" = y1 + y2 + rnorm(length(x), 0, 0.01), "max" = maxval))
}
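
### For illustration, each generator maps a vector of arm choices to noisy rewards plus
### the value used as the optimal reward (kept commented out so it does not advance the
### RNG before the simulations below):
###   ex <- unimodal_data(c(0.25, 0.5, 0.75))
###   str(ex)   # list with $data (three noisy rewards) and $max (here 1)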

functions <- list(list("unimodal", unimodal_data))#, list("bimodal", bimodal_data))

### Set up different deltas for the delta method. If delta = 0 we do online
#deltas <- c(0, 0.01, 0.05, 0.1, 0.2, 0.5)
deltas <- c(0, 0.5, 0.2, 0.1, 0.05, 0.01)
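
### The delta method (see get_reward() in bandit_continuum_offon.R) only replays a logged
### (choice, reward) pair when the policy's proposed action lands within delta of the
### logged, uniformly drawn choice, roughly:
###   accepted <- abs(logged_choice - proposed_choice) < delta
### Smaller deltas evaluate the policy more faithfully but discard more of the logged data.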

### Parameters for LiF
int_time <- 10
amplitude <- 0.035
learn_rate <- 2*pi/int_time
omega <- 1
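
### Rough sketch of the Lock-in Feedback (LiF) idea (the actual implementation is in
### policy_cont_lif_randstart.R; the lines below are illustration only):
###   x_t <- x0 + amplitude * cos(omega * t)                # oscillate around current estimate
###   y_t <- observed reward at x_t
###   yw  <- mean(y * cos(omega * t)) over int_time steps   # lock-in integration ~ local slope
###   x0  <- x0 + learn_rate * yw                           # shift x0 towards higher reward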

histories <- vector(mode='list', length=length(deltas))

### Set up all agents and run them for each bandit and each delta
for (f in functions){
  if(f[[1]] == "unimodal"){
    bandit_online <- ContinuumBanditUnimodal$new()
  } else {
    bandit_online <- ContinuumBanditBimodal$new()
  }
  for (i in 1:length(deltas)){
    d <- deltas[i]
    if(d == 0){
      bandit <- bandit_online
    } else {
      bandit <- OnlineOfflineContinuumBandit$new(FUN = f[[2]], max_bool = TRUE, delta = d, horizon = horizon)
    }

    agents <- list(Agent$new(UniformRandomPolicy$new(), bandit),
                   Agent$new(EpsilonFirstLinearRegressionPolicy$new(), bandit),
                   Agent$new(LifPolicyRandstart$new(int_time, amplitude, learn_rate, omega), bandit),
                   Agent$new(ThompsonBayesianLinearPolicy$new(), bandit))

    history <- Simulator$new(agents = agents,
                             horizon = horizon,
                             simulations = simulations,
                             do_parallel = TRUE,
                             save_interval = 10)$run()

    histories[[i]] <- history
  }

  #pdf(paste0("offline_",f[[1]],".pdf"))

  #layout(matrix(c(1,2,3,4,5,6,7,7,7), ncol=3, byrow=TRUE), heights=c(3, 3, 1))
  #par(oma=c(4,4,2,2), las=1)
  ##par(cex=1.05)
  #for(hist in histories){
  #  plot(hist, type="cumulative", no_par = TRUE, disp="ci", legend=FALSE, use_colors=TRUE, trunc_per_agent = FALSE, xlab = "", ylab = "")
  #}
  #par(cex=1.05)
  #mtext("Time step", 1, 3, outer=TRUE, las = 0)
  #mtext("Cumulative regret", 2, 3, outer=TRUE, las = 0)
  #par(mar=c(1,1,1,1))
  #plot.new()
  #legend(x="center", ncol=4, legend=c("UR","E-First","LiF", "TBL"), fill=c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF"), title="Legend")
  #
  #dev.off()
}

73 changes: 73 additions & 0 deletions 2a_offline_empirical.R
@@ -0,0 +1,73 @@
# This is the demo file for the online and offline parameter tuning of Lock-in Feedback
# For policy and bandit specific code, please look at the files sourced below.
# First make sure to install contextual
# (see https://github.com/Nth-iteration-labs/contextual for a how to).
#
# For any questions, please contact the authors.

library(contextual)
library(here)
library(ggplot2)
library(cowplot)

source("./bandit_continuum_offline.R")
source("./policy_cont_lif_randstart.R")
source("./policy_tbl.R")
source("./policy_efirst.R")
source("./policy_ur.R")

#############################################################
# #
# Online and Offline evaluation #
# #
#############################################################

### Set seed
set.seed(1)

### Set number of interactions (horizon) and number of repeats (simulations)
### In the paper we used a horizon equal to the number of rows in the data and 1000 simulations
simulations <- 1000

### Data

dt <- read.table("results.csv", TRUE, sep=",")
dt <- dt[dt$type=="setreward",]
dt$store <- as.factor(dt$context.StoreID)

dt <- dt[dt$store=="15337",]

dt$choice <- dt$split
dt$reward <- (1 - dt$split) * dt$revenue
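### For instance, a logged split of 0.4 on 10 euros of revenue gives the rebate company
### a reward of (1 - 0.4) * 10 = 6 euros.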

horizon <- nrow(dt)

### Set the delta for the delta method
deltas <- 0.1

### Parameters for LiF
int_time <- 10
amplitude <- 0.035
learn_rate <- 2*pi/int_time
omega <- 1


### Set up all agents and run them on the offline bandit
bandit <- OfflineContinuumBandit$new(data = dt, max_bool = TRUE, delta = deltas, horizon = horizon)

agents <- list(Agent$new(UniformRandomPolicy$new(), bandit),
               Agent$new(EpsilonFirstLinearRegressionPolicy$new(), bandit),
               Agent$new(LifPolicyRandstart$new(int_time, amplitude, learn_rate, omega), bandit),
               Agent$new(ThompsonBayesianLinearPolicy$new(), bandit))

history <- Simulator$new(agents = agents,
                         horizon = horizon,
                         simulations = simulations,
                         do_parallel = FALSE,
                         save_interval = 10)$run()


cairo_ps("offline_empirical.eps")
plot(history, regret=FALSE, type="cumulative", legend_labels = c("UR", "E-First", "LiF", "TBL"), disp="ci")
dev.off()

16 changes: 16 additions & 0 deletions 2b_company_results.R
@@ -0,0 +1,16 @@
library(ggplot2)

data <- read.table("results.csv", TRUE, sep=",")
head(data)

data <- data[data$type=="setreward",]
data$store <- as.factor(data$context.StoreID)

data <- data[data$store=="15337",]

dat2 <- aggregate(data, by=list(data$context.UserID), FUN=mean)

dat2$bb <- (1-dat2$split) * dat2$revenue
c <- qplot(dat2$split, dat2$bb, xlab="Split of discount offered to customer", ylab="Profit for rebate company in euros") + stat_smooth() + geom_vline(xintercept = 0.5, linetype = "dashed", color = "red")
ggsave("company_results.eps", c, device="eps")
print(c)
File renamed without changes.
File renamed without changes.
7 changes: 5 additions & 2 deletions bandit_continuum_function_bimodal.R
@@ -33,9 +33,12 @@ ContinuumBanditBimodal <- R6::R6Class(
       context
     },
     get_reward = function(t, context, action) {
+      reward <- self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2)
+      optimal_reward <- self$arm_function(self$mu2, self$mu1, self$sd1, self$mu2, self$sd2)
+
       reward <- list(
-        reward = self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2),
-        optimal_reward = self$mu2
+        reward = reward,
+        optimal_reward = optimal_reward
       )
     }
   )
36 changes: 36 additions & 0 deletions bandit_continuum_offline.R
@@ -0,0 +1,36 @@
#' @export
OfflineContinuumBandit <- R6::R6Class(
  inherit = Bandit,
  class = FALSE,
  public = list(
    class_name = "OfflineContinuumBandit",
    delta = NULL,
    horizon = NULL,
    choice = NULL,
    S = NULL,
    initialize = function(data, max_bool, delta, horizon) {
      self$S <- data
      self$horizon <- horizon
      self$delta <- delta
      self$k <- 1
    },
    post_initialization = function() {
      # Shuffle the logged data.
      self$S <- self$S[sample(nrow(self$S)),]
    },
    get_context = function(index) {
      context <- list()
      context$k <- self$k
      context
    },
    get_reward = function(index, context, action) {
      # Replay the logged reward only if the proposed action lies within delta
      # of the logged choice; otherwise return NULL and discard the sample.
      reward_at_index <- as.double(self$S$reward[[index]])
      if (abs(self$S$choice[[index]] - action$choice) < self$delta) {
        reward <- list(
          reward = reward_at_index
        )
      } else {
        NULL
      }
    }
  )
)
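# A minimal usage sketch (mirroring 2a_offline_empirical.R): the offline bandit expects a
# data.frame with `choice` and `reward` columns, e.g.
#   dt$choice <- dt$split
#   dt$reward <- (1 - dt$split) * dt$revenue
#   bandit    <- OfflineContinuumBandit$new(data = dt, max_bool = TRUE, delta = 0.1,
#                                           horizon = nrow(dt))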
88 changes: 46 additions & 42 deletions bandit_continuum_offon.R
@@ -1,42 +1,46 @@
#' @export
OnlineOfflineContinuumBandit <- R6::R6Class(
inherit = Bandit,
class = FALSE,
private = list(
S = NULL
),
public = list(
class_name = "OnlineOfflineContinuumBandit",
delta = NULL,
horizon = NULL,
choice = NULL,
arm_function = NULL,
initialize = function(FUN, delta, horizon) {
self$arm_function <- FUN
self$horizon <- horizon
self$delta <- delta
self$k <- 1
},
post_initialization = function() {
self$choice <- runif(self$horizon, min=0, max=1)
private$S <- data.frame(self$choice, self$arm_function(self$choice))
private$S <- private$S[sample(nrow(private$S)),]
colnames(private$S) <- c('choice', 'reward')
},
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
get_reward = function(index, context, action) {
reward_at_index <- as.double(private$S$reward[[index]])
if (abs(private$S$choice[[index]] - action$choice) < self$delta) {
reward <- list(
reward = reward_at_index
)
} else {
NULL
}
}
)
)
#' @export
OnlineOfflineContinuumBandit <- R6::R6Class(
inherit = Bandit,
class = FALSE,
public = list(
class_name = "OnlineOfflineContinuumBandit",
delta = NULL,
horizon = NULL,
choice = NULL,
arm_function = NULL,
max_bool = FALSE,
maxval = NULL,
S = NULL,
initialize = function(FUN, max_bool, delta, horizon) {
self$arm_function <- FUN
self$horizon <- horizon
self$delta <- delta
self$k <- 1
self$max_bool <- max_bool
},
post_initialization = function() {
self$choice <- runif(self$horizon, min=0, max=1)
temp_data <- self$arm_function(self$choice)
self$S <- data.frame(self$choice, temp_data$data)
self$maxval <- temp_data$max
self$S <- self$S[sample(nrow(self$S)),]
colnames(self$S) <- c('choice', 'reward')
},
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
get_reward = function(index, context, action) {
reward_at_index <- as.double(self$S$reward[[index]])
if (abs(self$S$choice[[index]] - action$choice) < self$delta) {
reward <- list(
reward = reward_at_index,
optimal_reward = ifelse(self$max_bool, self$maxval, NA)
)
} else {
NULL
}
}
)
)