## Comparison of Supervised Machine Learning Models to Predict Red Wine Quality ##
#Importing the dataset
data <- read.csv('winequality-red.csv', sep = ';')
str(data)
#Format outcome variable
data$quality <- ifelse(data$quality >= 7, 1, 0)
data$quality <- factor(data$quality, levels = c(0, 1))
#EDA
#Descriptive statistics
summary(data)
#Univariate analysis
#Dependent variable
#Frequency plot
par(mfrow=c(1,1))
barplot(table(data[[12]]),
        main = sprintf('Frequency plot of the variable: %s',
                       colnames(data[12])),
        xlab = colnames(data[12]),
        ylab = 'Frequency')
#Check class bias (class imbalance)
table(data$quality)
round(prop.table((table(data$quality))),2)
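#Aside (not in the original analysis): accuracy of a trivial
#majority-class classifier, as a baseline for the models fitted below
round(max(prop.table(table(data$quality))), 2)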
#Independent variable
#Boxplots
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
  boxplot(x = data[i],
          horizontal = TRUE,
          main = sprintf('Boxplot of the variable: %s',
                         colnames(data[i])),
          xlab = colnames(data[i]))
}
#Histograms
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
  hist(x = data[[i]],
       main = sprintf('Histogram of the variable: %s',
                      colnames(data[i])),
       xlab = colnames(data[i]))
}
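#Optional sketch (not in the original analysis): numeric skewness values
#complement the histograms; assumes the e1071 package, which is not
#used elsewhere in this script
# install.packages('e1071')
# library(e1071)
# sapply(data[-12], e1071::skewness)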
#Bivariate analysis
#Correlation matrix
# install.packages('ggcorrplot')
library(ggcorrplot)
ggcorrplot(round(cor(data[-12]), 2),
           type = "lower",
           lab = TRUE,
           title = 'Correlation matrix of the red wine quality dataset')
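#Follow-up sketch (not in the original analysis): list the variable
#pairs with |r| above 0.6; the 0.6 threshold is an arbitrary choice
corr <- cor(data[-12])
high <- which(abs(corr) > 0.6 & upper.tri(corr), arr.ind = TRUE)
data.frame(var1 = rownames(corr)[high[, 1]],
           var2 = colnames(corr)[high[, 2]],
           r = round(corr[high], 2))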
#DATA PREPARATION
#Missing values
sum(is.na(data))
#Outliers
#Identifying outliers (Tukey fences: 1.5 * IQR beyond the quartiles)
is_outlier <- function(x) {
  return(x < quantile(x, 0.25) - 1.5 * IQR(x) |
           x > quantile(x, 0.75) + 1.5 * IQR(x))
}
outlier <- data.frame(variable = character(),
                      sum_outliers = integer(),
                      stringsAsFactors = FALSE)
for (j in 1:(length(data)-1)){
  variable <- colnames(data[j])
  sum_outliers <- sum(is_outlier(data[[j]]))
  row <- data.frame(variable, sum_outliers)
  outlier <- rbind(outlier, row)
}
#Identifying the percentage of outliers
for (i in 1:nrow(outlier)){
  if (outlier[i, 2]/nrow(data) * 100 >= 5){
    print(paste(outlier[i, 1],
                '=',
                round(outlier[i, 2]/nrow(data) * 100, digits = 2),
                '%'))
  }
}
#Imputing outlier values (columns 4 and 5: residual.sugar and chlorides)
for (i in 4:5){
  for (j in 1:nrow(data)){
    if (data[[j, i]] > as.numeric(quantile(data[[i]], 0.75) +
                                  1.5 * IQR(data[[i]]))){
      if (i == 4){
        data[[j, i]] <- round(mean(data[[i]]), digits = 2)
      } else{
        data[[j, i]] <- round(mean(data[[i]]), digits = 3)
      }
    }
  }
}
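#Alternative sketch (not what this analysis does): cap outliers at the
#upper fence instead of replacing them with the column mean, e.g.:
# upper_fence <- quantile(data$chlorides, 0.75) + 1.5 * IQR(data$chlorides)
# data$chlorides[data$chlorides > upper_fence] <- upper_fence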
#MODELING
#Splitting the dataset into the Training set and Test set
#Stratified sample: the two classes are sampled separately, and the
#training set is balanced by undersampling the majority class
data_ones <- data[which(data$quality == 1), ]
data_zeros <- data[which(data$quality == 0), ]
#Train data
set.seed(123)
train_ones_rows <- sample(1:nrow(data_ones), 0.8*nrow(data_ones))
train_zeros_rows <- sample(1:nrow(data_zeros), 0.8*nrow(data_ones))  #as many 0s as 1s
train_ones <- data_ones[train_ones_rows, ]
train_zeros <- data_zeros[train_zeros_rows, ]
training_set <- rbind(train_ones, train_zeros)
table(training_set$quality)
#Test Data
test_ones <- data_ones[-train_ones_rows, ]
test_zeros <- data_zeros[-train_zeros_rows, ]
test_set <- rbind(test_ones, test_zeros)
table(test_set$quality)
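#Note (an aside): since only 0.8*nrow(data_ones) zeros went into
#training, the remaining zeros all land in the test set, so the test
#class ratio reflects the original imbalance rather than 50/50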
#Logistic Regression
lr = glm(formula = quality ~ .,
         data = training_set,
         family = binomial)
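#Optional (not in the original script): inspect the fitted coefficients
summary(lr)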
#Predictions
prob_pred = predict(lr,
                    type = 'response',
                    newdata = test_set[-12])
# install.packages('InformationValue')
library(InformationValue)
optCutOff <- optimalCutoff(test_set$quality, prob_pred)[1]
y_pred = ifelse(prob_pred > optCutOff, 1, 0)
#Making the confusion matrix
cm_lr = table(test_set[, 12], y_pred)
cm_lr
#Accuracy
accuracy_lr = (cm_lr[1,1] + cm_lr[2,2])/
  (cm_lr[1,1] + cm_lr[2,2] + cm_lr[2,1] + cm_lr[1,2])
accuracy_lr
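#Accuracy alone is a weak summary on this imbalanced test set; a sketch
#of class-wise rates using InformationValue's sensitivity() and
#specificity() (actuals converted to numeric to be safe)
actuals_num <- as.numeric(as.character(test_set$quality))
sensitivity(actuals_num, prob_pred, threshold = optCutOff)
specificity(actuals_num, prob_pred, threshold = optCutOff)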
#ROC curve
# install.packages('ROSE')
library(ROSE)
par(mfrow = c(1, 1))
roc.curve(test_set$quality, y_pred)
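#Aside: roc.curve on hard 0/1 predictions yields a single-point curve;
#a score-based ROC would use the predicted probabilities instead, e.g.:
# roc.curve(test_set$quality, prob_pred)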
#Decision Tree
# install.packages('rpart')
library(rpart)
dt = rpart(formula = quality ~ .,
           data = training_set,
           method = 'class')
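#Optional sketch (not in the original script): visualize the fitted
#tree with base graphics
# plot(dt, uniform = TRUE)
# text(dt, cex = 0.7)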
#Predictions
y_pred = predict(dt,
                 type = 'class',
                 newdata = test_set[-12])
#Making the confusion matrix
cm_dt = table(test_set[, 12], y_pred)
cm_dt
#Accuracy
accuracy_dt = (cm_dt[1,1] + cm_dt[2,2])/
  (cm_dt[1,1] + cm_dt[2,2] + cm_dt[2,1] + cm_dt[1,2])
accuracy_dt
#ROC curve
library(ROSE)
roc.curve(test_set$quality, y_pred)
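#Aside: a probability-based curve is also possible, since predict.rpart
#returns class probabilities with type = 'prob', e.g.:
# roc.curve(test_set$quality, predict(dt, newdata = test_set[-12], type = 'prob')[, 2])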
#Random forest
# install.packages('randomForest')
library(randomForest)
rf = randomForest(x = training_set[-12],
                  y = training_set$quality,
                  ntree = 10)
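#Note: ntree = 10 is far below randomForest's default of 500; a larger
#forest is a natural variation to try, e.g.:
# rf500 <- randomForest(x = training_set[-12], y = training_set$quality, ntree = 500)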
#Predictions
y_pred = predict(rf,
                 type = 'class',
                 newdata = test_set[-12])
#Making the confusion matrix
cm_rf = table(test_set[, 12], y_pred)
cm_rf
#Accuracy
accuracy_rf = (cm_rf[1,1] + cm_rf[2,2])/
  (cm_rf[1,1] + cm_rf[2,2] + cm_rf[2,1] + cm_rf[1,2])
accuracy_rf
#ROC curve
library(ROSE)
roc.curve(test_set$quality, y_pred)
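#Aside: predict.randomForest also returns class probabilities, e.g.:
# roc.curve(test_set$quality, predict(rf, newdata = test_set[-12], type = 'prob')[, 2])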
#Variable importance
library(caret)
varImp(lr)
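#varImp(lr) ranks predictors for the logistic model only; the random
#forest carries its own importance measure (both from randomForest)
importance(rf)
varImpPlot(rf)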