forked from dhanesh123us/GBID
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.R
128 lines (105 loc) · 5.64 KB
/
script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Bimbo XGBoost R script LB:0.457
# Author: Bohdan Pavlyshenko e-mail: [email protected]
#
# One of the main ideas of the script is to use lagged values for target variable.
# Due to memory limits 8GB in the Kaggle cloud R server, we use only one week lag for target variable.
# If to start this script on the computer with 32GB and XGBoost parameter nrounds=150, we can use lagged values
# of target variable for 5 weeks which gives us the leaderbord score 0.45728
rm(list=ls())
gc()
library(data.table)
library(xgboost)
train=fread('train.csv',select = c("Semana",'Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Ruta_SAK', 'Demanda_uni_equil'))
test=fread('test.csv',select = c("Semana",'id','Cliente_ID', 'Producto_ID', 'Agencia_ID', 'Ruta_SAK'))
# Cut the train set to 8 and 9 weeks (Semana) for using only one week lags for target variable.
# If you have enough memory, you can set up condition Semana>3 on the next row for using lagged values of target variable for 5 weeks.
train=train[Semana>3,]
train$id=0
train[,target:=Demanda_uni_equil]
train[,Demanda_uni_equil:=NULL]
train[,tst:=0]
test$target=0
test[,tst:=1]
data=rbind(train,test)
rm(test)
rm(train)
# Creating features for one week lagged values of target variable
data1<-data[Semana>7 & Semana<11,.(Semana=Semana+1,Cliente_ID,Producto_ID,target)]
data=merge(data,data1[Semana>8,.(targetl1=mean(target)), by=.(Semana,Cliente_ID,Producto_ID)],all.x=T, by=c("Semana","Cliente_ID","Producto_ID"))
rm(data1)
gc()
# If you have enough memory, you can set up value TRUE in the following condition to make it possible to use lagged values of target variable
# for lagged 2-5 weeks as additional features
if(TRUE){
data1<-data[Semana>6 & Semana<10,.(Semana=Semana+2,Cliente_ID,Producto_ID,target)]
data=merge(data,data1[Semana>8,.(targetl2=mean(target)), by=.(Semana,Cliente_ID,Producto_ID)],all.x=T, by=c("Semana","Cliente_ID","Producto_ID"))
data1<-data[Semana>5 & Semana<9,.(Semana=Semana+3,Cliente_ID,Producto_ID,target)]
data=merge(data,data1[Semana>8,.(targetl3=mean(target)), by=.(Semana,Cliente_ID,Producto_ID)],all.x=T, by=c("Semana","Cliente_ID","Producto_ID"))
data1<-data[Semana>4 & Semana<8,.(Semana=Semana+4,Cliente_ID,Producto_ID,target)]
data=merge(data,data1[Semana>8,.(targetl4=mean(target)), by=.(Semana,Cliente_ID,Producto_ID)],all.x=T, by=c("Semana","Cliente_ID","Producto_ID"))
data1<-data[Semana>3 & Semana<7,.(Semana=Semana+5,Cliente_ID,Producto_ID,target)]
data=merge(data,data1[Semana>8,.(targetl5=mean(target)), by=.(Semana,Cliente_ID,Producto_ID)],all.x=T, by=c("Semana","Cliente_ID","Producto_ID"))
}
rm(data1)
data=data[Semana>8,]
# Creating frequency features for some factor variables
nAgencia_ID=data[,.(nAgencia_ID=.N),by=.(Agencia_ID,Semana)]
nAgencia_ID=nAgencia_ID[,.(nAgencia_ID=mean(nAgencia_ID,na.rm=T)),by=Agencia_ID]
data=merge(data,nAgencia_ID,by='Agencia_ID',all.x=T)
nRuta_SAK=data[,.(nRuta_SAK=.N),by=.(Ruta_SAK,Semana)]
nRuta_SAK=nRuta_SAK[,.(nRuta_SAK=mean(nRuta_SAK,na.rm=T)),by=Ruta_SAK]
data=merge(data,nRuta_SAK,by='Ruta_SAK',all.x=T)
nCliente_ID=data[,.(nCliente_ID=.N),by=.(Cliente_ID,Semana)]
nCliente_ID=nCliente_ID[,.(nCliente_ID=mean(nCliente_ID,na.rm=T)),by=Cliente_ID]
data=merge(data,nCliente_ID,by='Cliente_ID',all.x=T)
nProducto_ID=data[,.(nProducto_ID=.N),by=.(Producto_ID,Semana)]
nProducto_ID=nProducto_ID[,.(nProducto_ID=mean(nProducto_ID,na.rm=T)),by=Producto_ID]
data=merge(data,nProducto_ID,by='Producto_ID',all.x=T)
data$target=log(data$target+1)
data_train=data[tst==0,]
data_test=data[tst==1,]
features=names(data_train)[!(names(data_train) %in% c('id',"target",'tst'))]
rm(data)
wltst=sample(nrow(data_train),30000)
dval<-xgb.DMatrix(data=data.matrix(data_train[wltst,features,with=FALSE]),
label=data.matrix(data_train[wltst,target]),missing=NA)
watchlist<-list(dval=dval)
clf <- xgb.train(params=list( objective="reg:linear",
booster = "gbtree",
eta=0.1,
max_depth=10,
subsample=0.85,
colsample_bytree=0.7) ,
data = xgb.DMatrix(data=data.matrix(data_train[-wltst,features,with=FALSE]),
label=data.matrix(data_train[-wltst,target]),missing=NA),
nrounds = 75,
verbose = 1,
print_every_n=5,
early_stopping_rounds = 10,
watchlist = watchlist,
maximize = FALSE,
eval_metric='rmse'
)
# Make prediction for the 10th week
data_test1=data_test[Semana==10,]
pred<-predict(clf,xgb.DMatrix(data.matrix(data_test1[,features,with=FALSE]),missing=NA))
res=exp(round(pred,5))-1
# Create lagged values of target variable which will be used as a feature for the 11th week prediction
data_test_lag1=data_test1[,.(Cliente_ID,Producto_ID)]
data_test_lag1$targetl1=res
data_test_lag1=data_test_lag1[,.(targetl1=mean(targetl1)), by=.(Cliente_ID,Producto_ID)]
results=data.frame(id=data_test1$id,Demanda_uni_equil=res)
data_test2=data_test[Semana==11,]
data_test2[,targetl1:=NULL]
# Merge lagged values of target variable to test the set for the 11th week
data_test2=merge(data_test2,data_test_lag1,all.x=T,by=c('Cliente_ID','Producto_ID'))
pred<-predict(clf,xgb.DMatrix(data.matrix(data_test2[,features,with=FALSE]),missing=NA))
res=exp(round(pred,5))-1
res.df=data.frame(id=data_test2$id,Demanda_uni_equil=res)
results=rbind(results, res.df)
results[results[,2]<0,2]=0
results[,2]=round(results[,2],1)
results[,1]=as.integer(results[,1])
class(results[,1])='int32'
options(digits=18)
write.csv(results,file='results2.csv',row.names=F)