
Commit

Updated to have proper offline simulation from first version of the paper. Only plotting is left to do.
g0ulash committed Jul 13, 2020
1 parent f235995 commit 673805d
Showing 17 changed files with 51,509 additions and 45 deletions.
116 changes: 116 additions & 0 deletions 1_offline_ranking.R
@@ -0,0 +1,116 @@
# This is the demo file for the online and offline parameter tuning of Lock-in Feedback
# For policy and bandit specific code, please look at the files sourced below.
# First make sure to install contextual
# (see https://github.com/Nth-iteration-labs/contextual for a how to).
#
# For any questions, please contact the authors.
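# A minimal install sketch for contextual (assuming you want the development version
# from the GitHub repository linked above):
#   install.packages("devtools")
#   devtools::install_github("Nth-iteration-labs/contextual")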

library(contextual)
library(here)
library(ggplot2)
library(cowplot)

source("./bandit_continuum_function_unimodal.R")
source("./bandit_continuum_function_bimodal.R")
source("./bandit_continuum_offon.R")
source("./policy_cont_lif_randstart.R")
source("./policy_tbl.R")
source("./policy_efirst.R")
source("./policy_ur.R")

#############################################################
# #
# Online and Offline evaluation #
# #
#############################################################

### Set seed
set.seed(1)

### Set number of interactions (horizon) and number of repeats (simulations)
### In the paper we used a horizon of 10000 and 10000 simulations
horizon <- 10
simulations <- 1

### Set up two different bandits
bandits <- c(ContinuumBanditUnimodal$new(), ContinuumBanditBimodal$new())

### Set up functions to make offline dataset
unimodal_data <- function(x){
  c1 <- runif(1, 0.25, 0.75)
  c2 <- 1
  return(list("data" = -(x - c1)^2 + c2 + rnorm(length(x), 0, 0.01), "max" = c2))
}

bimodal_data <- function(x){
  mu1 <- runif(1, 0.15, 0.2)
  sd1 <- runif(1, 0.1, 0.15)
  mu2 <- runif(1, 0.7, 0.85)
  sd2 <- runif(1, 0.1, 0.15)
  y1 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu1, sd=sd1)
  y2 <- truncnorm::dtruncnorm(x, a=0, b=1, mean=mu2, sd=sd2)
  maxval <- truncnorm::dtruncnorm(mu2, a=0, b=1, mean=mu1, sd=sd1) + truncnorm::dtruncnorm(mu2, a=0, b=1, mean=mu2, sd=sd2)
  return(list("data" = y1 + y2 + rnorm(length(x), 0, 0.01), "max" = maxval))
}
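
### For illustration, each generator maps a vector of arm choices to noisy rewards plus
### the value used as the optimal reward (kept commented out so it does not advance the
### RNG before the simulations below):
###   ex <- unimodal_data(c(0.25, 0.5, 0.75))
###   str(ex)   # list with $data (three noisy rewards) and $max (here 1)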

functions <- list(list("unimodal", unimodal_data))#, list("bimodal", bimodal_data))

### Set up different deltas for the delta method. If delta = 0 we do online
#deltas <- c(0, 0.01, 0.05, 0.1, 0.2, 0.5)
deltas <- c(0, 0.5, 0.2, 0.1, 0.05, 0.01)
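
### The delta method (see get_reward() in bandit_continuum_offon.R) only replays a logged
### (choice, reward) pair when the policy's proposed action lands within delta of the
### logged, uniformly drawn choice, roughly:
###   accepted <- abs(logged_choice - proposed_choice) < delta
### Smaller deltas evaluate the policy more faithfully but discard more of the logged data.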

### Parameters for LiF
int_time <- 10
amplitude <- 0.035
learn_rate <- 2*pi/int_time
omega <- 1
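
### Rough sketch of the Lock-in Feedback (LiF) idea (the actual implementation is in
### policy_cont_lif_randstart.R; the lines below are illustration only):
###   x_t <- x0 + amplitude * cos(omega * t)                # oscillate around current estimate
###   y_t <- observed reward at x_t
###   yw  <- mean(y * cos(omega * t)) over int_time steps   # lock-in integration ~ local slope
###   x0  <- x0 + learn_rate * yw                           # shift x0 towards higher reward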

histories <- vector(mode='list', length=length(deltas))

### Set up all agents and run them for each bandit and each delta
for (f in functions){
  if(f[[1]] == "unimodal"){
    bandit_online <- ContinuumBanditUnimodal$new()
  } else {
    bandit_online <- ContinuumBanditBimodal$new()
  }
  for (i in 1:length(deltas)){
    d <- deltas[i]
    if(d == 0){
      bandit <- bandit_online
    } else {
      bandit <- OnlineOfflineContinuumBandit$new(FUN = f[[2]], max_bool = TRUE, delta = d, horizon = horizon)
    }

    agents <- list(Agent$new(UniformRandomPolicy$new(), bandit),
                   Agent$new(EpsilonFirstLinearRegressionPolicy$new(), bandit),
                   Agent$new(LifPolicyRandstart$new(int_time, amplitude, learn_rate, omega), bandit),
                   Agent$new(ThompsonBayesianLinearPolicy$new(), bandit))

    history <- Simulator$new(agents = agents,
                             horizon = horizon,
                             simulations = simulations,
                             do_parallel = TRUE,
                             save_interval = 10)$run()

    histories[[i]] <- history
  }

  #pdf(paste0("offline_",f[[1]],".pdf"))

  #layout(matrix(c(1,2,3,4,5,6,7,7,7), ncol=3, byrow=TRUE), heights=c(3, 3, 1))
  #par(oma=c(4,4,2,2), las=1)
  ##par(cex=1.05)
  #for(hist in histories){
  #  plot(hist, type="cumulative", no_par = TRUE, disp="ci", legend=FALSE, use_colors=TRUE, trunc_per_agent = FALSE, xlab = "", ylab = "")
  #}
  #par(cex=1.05)
  #mtext("Time step", 1, 3, outer=TRUE, las = 0)
  #mtext("Cumulative regret", 2, 3, outer=TRUE, las = 0)
  #par(mar=c(1,1,1,1))
  #plot.new()
  #legend(x="center", ncol=4, legend=c("UR","E-First","LiF", "TBL"), fill=c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF"), title="Legend")
  #
  #dev.off()
}

73 changes: 73 additions & 0 deletions 2a_offline_empirical.R
@@ -0,0 +1,73 @@
# This is the demo file for the online and offline parameter tuning of Lock-in Feedback
# For policy and bandit specific code, please look at the files sourced below.
# First make sure to install contextual
# (see https://github.com/Nth-iteration-labs/contextual for a how to).
#
# For any questions, please contact the authors.

library(contextual)
library(here)
library(ggplot2)
library(cowplot)

source("./bandit_continuum_offline.R")
source("./policy_cont_lif_randstart.R")
source("./policy_tbl.R")
source("./policy_efirst.R")
source("./policy_ur.R")

#############################################################
# #
# Online and Offline evaluation #
# #
#############################################################

### Set seed
set.seed(1)

### Set number of interactions (horizon) and number of repeats (simulations)
### In the paper we used a horizon equal to the number of rows in the data and 1000 simulations
simulations <- 1000

### Data

dt <- read.table("results.csv", TRUE, sep=",")
dt <- dt[dt$type=="setreward",]
dt$store <- as.factor(dt$context.StoreID)

dt <- dt[dt$store=="15337",]

dt$choice <- dt$split
dt$reward <- (1 - dt$split) * dt$revenue
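### For instance, a logged split of 0.4 on 10 euros of revenue gives the rebate company
### a reward of (1 - 0.4) * 10 = 6 euros.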

horizon <- nrow(dt)

### Set the delta for the delta method
deltas <- 0.1

### Parameters for LiF
int_time <- 10
amplitude <- 0.035
learn_rate <- 2*pi/int_time
omega <- 1


### Set up all agents and run them on the offline bandit
bandit <- OfflineContinuumBandit$new(data = dt, max_bool = TRUE, delta = deltas, horizon = horizon)

agents <- list(Agent$new(UniformRandomPolicy$new(), bandit),
               Agent$new(EpsilonFirstLinearRegressionPolicy$new(), bandit),
               Agent$new(LifPolicyRandstart$new(int_time, amplitude, learn_rate, omega), bandit),
               Agent$new(ThompsonBayesianLinearPolicy$new(), bandit))

history <- Simulator$new(agents = agents,
                         horizon = horizon,
                         simulations = simulations,
                         do_parallel = FALSE,
                         save_interval = 10)$run()


cairo_ps("offline_empirical.eps")
plot(history, regret=FALSE, type="cumulative", legend_labels = c("UR", "E-First", "LiF", "TBL"), disp="ci")
dev.off()

16 changes: 16 additions & 0 deletions 2b_company_results.R
@@ -0,0 +1,16 @@
library(ggplot2)

data <- read.table("results.csv", TRUE, sep=",")
head(data)

data <- data[data$type=="setreward",]
data$store <- as.factor(data$context.StoreID)

data <- data[data$store=="15337",]

dat2 <- aggregate(data, by=list(data$context.UserID), FUN=mean)

dat2$bb <- (1-dat2$split) * dat2$revenue
c <- qplot(dat2$split, dat2$bb, xlab="Split of discount offered to customer", ylab="Profit for rebate company in euros") + stat_smooth() + geom_vline(xintercept = 0.5, linetype = "dashed", color = "red")
ggsave("company_results.eps", c, device="eps")
print(c)
File renamed without changes.
File renamed without changes.
7 changes: 5 additions & 2 deletions bandit_continuum_function_bimodal.R
@@ -33,9 +33,12 @@ ContinuumBanditBimodal <- R6::R6Class(
       context
     },
     get_reward = function(t, context, action) {
+      reward <- self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2)
+      optimal_reward <- self$arm_function(self$mu2, self$mu1, self$sd1, self$mu2, self$sd2)
+
       reward <- list(
-        reward = self$arm_function(action$choice, self$mu1, self$sd1, self$mu2, self$sd2),
-        optimal_reward = self$mu2
+        reward = reward,
+        optimal_reward = optimal_reward
       )
     }
   )
36 changes: 36 additions & 0 deletions bandit_continuum_offline.R
@@ -0,0 +1,36 @@
#' @export
OfflineContinuumBandit <- R6::R6Class(
  inherit = Bandit,
  class = FALSE,
  public = list(
    class_name = "OfflineContinuumBandit",
    delta = NULL,
    horizon = NULL,
    choice = NULL,
    S = NULL,
    initialize = function(data, max_bool, delta, horizon) {
      self$S <- data
      self$horizon <- horizon
      self$delta <- delta
      self$k <- 1
    },
    post_initialization = function() {
      # Shuffle the logged data.
      self$S <- self$S[sample(nrow(self$S)),]
    },
    get_context = function(index) {
      context <- list()
      context$k <- self$k
      context
    },
    get_reward = function(index, context, action) {
      # Replay the logged reward only if the proposed action lies within delta
      # of the logged choice; otherwise return NULL and discard the sample.
      reward_at_index <- as.double(self$S$reward[[index]])
      if (abs(self$S$choice[[index]] - action$choice) < self$delta) {
        reward <- list(
          reward = reward_at_index
        )
      } else {
        NULL
      }
    }
  )
)
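# A minimal usage sketch (mirroring 2a_offline_empirical.R): the offline bandit expects a
# data.frame with `choice` and `reward` columns, e.g.
#   dt$choice <- dt$split
#   dt$reward <- (1 - dt$split) * dt$revenue
#   bandit    <- OfflineContinuumBandit$new(data = dt, max_bool = TRUE, delta = 0.1,
#                                           horizon = nrow(dt))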
88 changes: 46 additions & 42 deletions bandit_continuum_offon.R
@@ -1,42 +1,46 @@
#' @export
OnlineOfflineContinuumBandit <- R6::R6Class(
inherit = Bandit,
class = FALSE,
private = list(
S = NULL
),
public = list(
class_name = "OnlineOfflineContinuumBandit",
delta = NULL,
horizon = NULL,
choice = NULL,
arm_function = NULL,
initialize = function(FUN, delta, horizon) {
self$arm_function <- FUN
self$horizon <- horizon
self$delta <- delta
self$k <- 1
},
post_initialization = function() {
self$choice <- runif(self$horizon, min=0, max=1)
private$S <- data.frame(self$choice, self$arm_function(self$choice))
private$S <- private$S[sample(nrow(private$S)),]
colnames(private$S) <- c('choice', 'reward')
},
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
get_reward = function(index, context, action) {
reward_at_index <- as.double(private$S$reward[[index]])
if (abs(private$S$choice[[index]] - action$choice) < self$delta) {
reward <- list(
reward = reward_at_index
)
} else {
NULL
}
}
)
)
#' @export
OnlineOfflineContinuumBandit <- R6::R6Class(
inherit = Bandit,
class = FALSE,
public = list(
class_name = "OnlineOfflineContinuumBandit",
delta = NULL,
horizon = NULL,
choice = NULL,
arm_function = NULL,
max_bool = FALSE,
maxval = NULL,
S = NULL,
initialize = function(FUN, max_bool, delta, horizon) {
self$arm_function <- FUN
self$horizon <- horizon
self$delta <- delta
self$k <- 1
self$max_bool <- max_bool
},
post_initialization = function() {
self$choice <- runif(self$horizon, min=0, max=1)
temp_data <- self$arm_function(self$choice)
self$S <- data.frame(self$choice, temp_data$data)
self$maxval <- temp_data$max
self$S <- self$S[sample(nrow(self$S)),]
colnames(self$S) <- c('choice', 'reward')
},
get_context = function(index) {
context <- list()
context$k <- self$k
context
},
get_reward = function(index, context, action) {
reward_at_index <- as.double(self$S$reward[[index]])
if (abs(self$S$choice[[index]] - action$choice) < self$delta) {
reward <- list(
reward = reward_at_index,
optimal_reward = ifelse(self$max_bool, self$maxval, NA)
)
} else {
NULL
}
}
)
)