-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPart 1.R
187 lines (125 loc) · 9.62 KB
/
Part 1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#############################################################################################
######################## Data modelling ###########################################
#############################################################################################
# Libraries and dataset
library(dplyr)
source("extra_functions.R")
# Show the current working directory; "cw_dataset.csv" is read relative to it.
getwd()
data <- read.csv(file = "cw_dataset.csv", sep = ",", header = TRUE)

## 0.0 Summary of the raw data
# Structure of the data frame, followed by the share of rows in each class.
str(data)
table(data$Class) / nrow(data)

## 0.0 Turn empty strings into NA so they are handled as missing values.
data[data == ""] <- NA
############## 1.1 Explore the data ##############

## 1.1.i Summary of the data
# Columns 2 to 19 are the numeric attributes (column 1, Sample_ID, is skipped).
dispersion.measures <- summary(data[2:19])
dispersion.measures

# Proportion of missing values per attribute, rounded to two decimals.
percentage.na.values <- round(colSums(is.na(data)[, 2:19]) / nrow(data), 2)
percentage.na.values

## 1.1.ii Histograms of the attributes (three attributes per call)
plot.hist(data, 2, 4, 1, 3)   # Centroid X, Centroid Y, Mass
plot.hist(data, 5, 7, 1, 3)   # Width, Depth, Orientation 0
plot.hist(data, 8, 10, 1, 3)  # Orientation 1, Orientation 2, Orientation 3
plot.hist(data, 11, 13, 1, 3) # Orientation 4, Orientation 5, Orientation 6
plot.hist(data, 14, 16, 1, 3) # Orientation 7, Orientation 8, Orientation 9
plot.hist(data, 17, 19, 1, 3) # Leaf Weight, Leaf Area, Leaf Hue
############## 1.2 Explore the relationships between the attributes, and between the class and the attributes ##############

# Correlation matrix over the numeric columns, dropping the first of them.
# Pairwise-complete observations are used so that NAs do not void a pair.
correlation.matrix <- cor(
  data[vapply(data, is.numeric, logical(1))][-1],
  use = "pairwise.complete.obs"
)
correlation.matrix

# Additional function (from extra_functions.R) to check the multicollinearity,
# called with a threshold of 0.70.
multicollinearity(correlation.matrix, 0.70)
## 1.2.i. Calculate the correlations and produce scatterplots for the
## variables: orientation 4 and orientation 7. What does this correlation
## tell you about the relationships of these variables?
corr.or4.or7 <- correlation.matrix["Orientation..4", "Orientation..7"]
corr.or4.or7

# Scatterplot of Orientation 7 (y) against Orientation 4 (x)
plot(
  x = data$Orientation..4,
  y = data$Orientation..7,
  main = "Orientation 4 vs Orientation 7",
  xlab = "Orientation 4",
  ylab = "Orientation 7",
  pch = 18,
  bty = "n",
  cex.lab = 1.5,
  cex.axis = 1.5,
  cex.sub = 1.5,
  cex.main = 1.5
)
# Least-squares regression line (y ~ x) drawn as a dashed blue line
abline(
  lm(Orientation..7 ~ Orientation..4, data = data),
  col = "blue",
  lwd = 2,
  lty = 2
)
legend(
  "bottom",
  legend = "Linear Trend",
  col = "blue",
  lty = 2,
  lwd = 2,
  bg = "transparent",
  cex = 1.4,
  pt.cex = 1.4,
  bty = "n",
  seg.len = 1.5
)
# The plot was drawn without a frame (bty = "n"); add it after the legend.
box()
## 1.2.ii. Produce scatterplots between the class variable and orientation 4,
## orientation 6 and area variables.
# Re-factor Class explicitly: since R 4.0 read.csv() returns character columns,
# and re-factoring also drops unused levels (e.g. a former "" level).
# The x positions, point colours and axis labels are all derived from the same
# factor levels, so every tick label matches the points drawn above it.
class.factor <- factor(data$Class)
class.codes <- as.integer(class.factor)
class.labels <- levels(class.factor)
class.ticks <- seq_along(class.labels)

par(mfrow = c(1, 3))
plot(
  class.codes, data$Orientation..4,
  main = "Class vs Orientation 4", ylab = "Orientation 4", xlab = "Class",
  xaxt = "n", col = class.codes,
  cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex = 1.5
)
axis(1, at = class.ticks, labels = class.labels, cex.axis = 1.5)
plot(
  class.codes, data$Orientation..6,
  main = "Class vs Orientation 6", ylab = "Orientation 6", xlab = "Class",
  xaxt = "n", col = class.codes,
  cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex = 1.5
)
axis(1, at = class.ticks, labels = class.labels, cex.axis = 1.5)
plot(
  class.codes, data$Leaf.Area,
  main = "Class vs Area", ylab = "Area", xlab = "Class",
  xaxt = "n", col = class.codes,
  cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5, cex = 1.5
)
axis(1, at = class.ticks, labels = class.labels, cex.axis = 1.5)
## 1.2.iii. Produce boxplots for all of the appropriate attributes in the
## dataset. Group each variable according to the class attribute.
# plot.boxplot() comes from extra_functions.R; each call covers three columns.
plot.boxplot(data, 2, 4, 1, 3)   # Centroid X, Centroid Y, Mass
plot.boxplot(data, 5, 7, 1, 3)   # Width, Depth, Orientation 0
plot.boxplot(data, 8, 10, 1, 3)  # Orientation 1, Orientation 2, Orientation 3
plot.boxplot(data, 11, 13, 1, 3) # Orientation 4, Orientation 5, Orientation 6
plot.boxplot(data, 14, 16, 1, 3) # Orientation 7, Orientation 8, Orientation 9
plot.boxplot(data, 17, 19, 1, 3) # Leaf Weight, Leaf Area, Leaf Hue
############## 1.4 Dealing with missing values in R ##############

# 1.4.i. Replace missing values in the dataset using three strategies:
# replacement with 0, mean and median.
# rep.missing.values() comes from extra_functions.R; the second argument
# selects the strategy.
data.mvalues.0 <- rep.missing.values(data, "0")           # Replace with zero
data.mvalues.mean <- rep.missing.values(data, "mean")     # Replace with the mean
data.mvalues.median <- rep.missing.values(data, "median") # Replace with the median

# 1.4.ii. Define, compare and contrast these approaches and their effects on
# the data.
# Replace with 0. Example: Width and Depth
boxplot.replace.mvalues(data, 6) # Boxplot of the Depth variable (column 6)
#boxplot.replace.mvalues(data,5) #Boxplot Width variable
############## 1.5. Attribute transformation ##############
# Transformation techniques: mean centering, normalisation and standardisation.
# The column transforms are written in base R rather than with dplyr's
# deprecated funs() / superseded mutate_at(), so they run without deprecation
# warnings on any dplyr version.

# Positions of the numeric attributes (column 1 is Sample_ID).
numeric.cols <- 2:19

# Apply `f` to each of the columns `cols` of `df` and return the modified copy.
rescale.columns <- function(df, cols, f) {
  df[cols] <- lapply(df[cols], f)
  df
}

# scale() returns a one-column matrix; c() flattens it back to a plain vector.
mean.centre <- function(x) c(scale(x, scale = FALSE))
standardise <- function(x) c(scale(x))

# Mean centering
data.mvalues.0.MC <- rescale.columns(data.mvalues.0, numeric.cols, mean.centre)
data.mvalues.mean.MC <- rescale.columns(data.mvalues.mean, numeric.cols, mean.centre)
data.mvalues.median.MC <- rescale.columns(data.mvalues.median, numeric.cols, mean.centre)

# Normalising between [0,1] (normalising() comes from extra_functions.R)
data.mvalues.0.NM <- normalising(data.mvalues.0, 2, 19)
data.mvalues.mean.NM <- normalising(data.mvalues.mean, 2, 19)
data.mvalues.median.NM <- normalising(data.mvalues.median, 2, 19)

# Standardising (zero mean, unit standard deviation)
data.mvalues.0.ST <- rescale.columns(data.mvalues.0, numeric.cols, standardise)
data.mvalues.mean.ST <- rescale.columns(data.mvalues.mean, numeric.cols, standardise)
data.mvalues.median.ST <- rescale.columns(data.mvalues.median, numeric.cols, standardise)
# Define, compare and contrast these approaches and their effects on the data.
# Boxplot example: Orientation 0 (column 7) in the raw data and after each
# transformation of the mean-imputed dataset, side by side.
par(mfrow = c(1, 4))
boxplot(
  data[, 7],
  ylab = "Orientation 0", main = "Orientation 0", col = "grey66",
  cex.lab = 1.6, cex.axis = 1.6, cex.sub = 1.6, cex.main = 1.6, outcex = 1.8
)
# `ylab` used to be supplied twice in this call; it is now passed once.
boxplot(
  data.mvalues.mean.NM[, 7],
  ylab = "Orientation 0", main = "Or.0: Normalisation", col = "grey66",
  cex.lab = 1.6, cex.axis = 1.6, cex.sub = 1.6, cex.main = 1.6, outcex = 1.8
)
boxplot(
  data.mvalues.mean.MC[, 7],
  ylab = "Orientation 0", main = "Or.0: Mean centering", col = "grey66",
  cex.lab = 1.6, cex.axis = 1.6, cex.sub = 1.6, cex.main = 1.6, outcex = 1.8
)
boxplot(
  data.mvalues.mean.ST[, 7],
  ylab = "Orientation 0", main = "Or.0: Standardization", col = "grey66",
  cex.lab = 1.6, cex.axis = 1.6, cex.sub = 1.6, cex.main = 1.6, outcex = 1.8
)
############## 1.6. Attribute / instance selection ##############
#1.6.i Starting again from the raw data, consider attribute and instance
# deletion strategies to deal with missing values.

# Attribute deletion: drop column 17 (Leaf Weight), which has about 50% of
# its values missing.
data.attribute.instance.deletion <- data[, -17]

# Distribution of missing values per instance (helper from extra_functions.R).
missing.values.rows(data.attribute.instance.deletion)

# Instance deletion: keep the instances with at most 2 missing values, i.e.
# delete every instance with 3 or more. The rows are filtered directly by
# their NA count, so no auxiliary column has to be added and then removed by
# position.
missing.per.row <- rowSums(is.na(data.attribute.instance.deletion))
data.attribute.instance.deletion <- data.attribute.instance.deletion[missing.per.row <= 2, ]
#1.6.ii Start from the raw data, use correlations between attributes to
# reduce the number of attributes. Try to reduce the dataset to contain only
# uncorrelated attributes and no missing values. Explain your choices and
# their effects on the dataset.

# Auxiliary column: number of missing values in each instance.
data.without.missing.values <- cbind(data, empty_values = rowSums(is.na(data)))

# Keep only the complete instances (delete those with 1 or more missing values).
data.without.missing.values <- subset(data.without.missing.values, empty_values == 0)

# Select the uncorrelated attributes by position; the auxiliary column
# (position 21) is not part of the selection.
data.without.missing.values.uncorrelated <- data.without.missing.values[, c(1:5, 7, 17:20)]

# Correlation matrix of the selected attributes (columns 2 to 9).
cor(data.without.missing.values.uncorrelated[2:9])
#1.6.iii Starting from an appropriate version of the dataset, use Principal
# Component Analysis to create a data set with eight attributes. Explain the
# process and the result obtained.
# The version used here: Leaf Weight removed, instances with 3 or more missing
# values deleted, remaining missing values replaced by the attribute mean, and
# attributes standardised.
data.pca <- data[, -17] # Remove the Leaf Weight variable (column 17)

# Keep the instances with at most 2 missing values.
data.pca <- data.pca[rowSums(is.na(data.pca)) <= 2, ]

# After dropping column 17 the numeric attributes sit in columns 2 to 18
# (column 1 is Sample_ID and column 19 is Class).
pca.cols <- 2:18

# Replace the remaining missing values by the attribute mean. Only the numeric
# attributes are imputed: Sample_ID and Class are left untouched (taking the
# mean of a non-numeric column only yields NA and a warning).
data.pca[pca.cols] <- lapply(data.pca[pca.cols], function(x) {
  replace(x, is.na(x), mean(x, na.rm = TRUE))
})

# Standardise the attributes before PCA; c() flattens the matrix from scale().
data.pca[pca.cols] <- lapply(data.pca[pca.cols], function(x) c(scale(x)))

# PCA without the Class attribute. `scale. = TRUE` is kept from the original
# call; the columns are already standardised, so it does not change the result.
pca <- prcomp(data.pca[, pca.cols], scale. = TRUE)
summary.pca <- summary(pca)
summary.pca

# Scores on all principal components, with the class label attached.
input.all.pcs <- data.frame(pca$x)
input.all.pcs$Class <- data.pca$Class
############## Final Datasets ##############
# Dimensions (rows, columns) of every dataset produced above.

# Mean centered
dim(data.mvalues.0.MC)      # Missing values replaced by 0
dim(data.mvalues.mean.MC)   # Missing values replaced by the mean
dim(data.mvalues.median.MC) # Missing values replaced by the median

# Normalised
dim(data.mvalues.0.NM)      # Missing values replaced by 0
dim(data.mvalues.mean.NM)   # Missing values replaced by the mean
dim(data.mvalues.median.NM) # Missing values replaced by the median

# Standardised
dim(data.mvalues.0.ST)      # Missing values replaced by 0
dim(data.mvalues.mean.ST)   # Missing values replaced by the mean
dim(data.mvalues.median.ST) # Missing values replaced by the median

# Attribute / instance selection
dim(data.attribute.instance.deletion)         # Without Leaf Weight; instances with a maximum of 2 missing values
dim(data.without.missing.values.uncorrelated) # Uncorrelated variables and without missing values