-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassignment1.r
154 lines (135 loc) · 5.07 KB
/
assignment1.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
pollutantmean <- function(directory, pollutant, id = 1:332) {
  ## Mean of one pollutant pooled over the readings of several monitors.
  ##
  ## 'directory' is a character vector of length 1 indicating
  ## the location of the CSV files
  ## 'pollutant' is a character vector of length 1 indicating
  ## the name of the pollutant for which we will calculate the
  ## mean; either "sulfate" or "nitrate".
  ## 'id' is an integer vector indicating the monitor ID numbers
  ## to be used
  ## Return the mean of the pollutant across all monitors listed
  ## in the 'id' vector (ignoring NA values). The result is NOT
  ## rounded. An empty 'id' yields NaN.

  # Monitor files are named by zero-padded ID: 001.csv, 002.csv, ...
  paths <- file.path(directory, sprintf("%03d.csv", id))

  readings <- lapply(paths, function(path) {
    data <- read.csv(path)
    if (!pollutant %in% names(data)) {
      stop("column '", pollutant, "' not found in ", path, call. = FALSE)
    }
    data[[pollutant]]
  })

  # Pool every reading first so each observation (not each monitor)
  # carries equal weight in the mean.
  pooled <- as.numeric(unlist(readings, use.names = FALSE))
  mean(pooled, na.rm = TRUE)
}
# course ID: rprog-014
# Example calls to pollutantmean(): sulfate over monitors 1 to 10, nitrate
# over monitors 70 to 72, and nitrate for monitor 23 alone. Each call reads
# its CSV files from the relative directory "specdata" and the result is
# auto-printed at top level.
pollutantmean("specdata", "sulfate", 1:10)
pollutantmean("specdata", "nitrate", 70:72)
pollutantmean("specdata", "nitrate", 23)
complete <- function(directory, id = 1:332) {
  ## Count the completely observed rows in each requested monitor file.
  ##
  ## 'directory' is a character vector of length 1 indicating
  ## the location of the CSV files
  ## 'id' is an integer vector indicating the monitor ID numbers
  ## to be used
  ## Return a data frame of the form:
  ## id nobs
  ## 1 117
  ## 2 1041
  ## ...
  ## where 'id' is the monitor ID number and 'nobs' is the
  ## number of complete cases. Rows follow the order of 'id'.
  ## An empty 'id' yields a data frame with the same two columns
  ## and zero rows.

  # Guard: nothing requested, so return an empty but well-formed frame.
  if (length(id) == 0) {
    return(data.frame(id = integer(0), nobs = integer(0)))
  }

  # Monitor files are named by zero-padded ID: 001.csv, 002.csv, ...
  paths <- file.path(directory, sprintf("%03d.csv", id))

  # A row is complete when no column in it is NA.
  nobs <- vapply(
    paths,
    function(path) sum(complete.cases(read.csv(path))),
    integer(1),
    USE.NAMES = FALSE
  )

  # unname() stops names on 'id' from leaking into the row names.
  data.frame(id = unname(id), nobs = nobs)
}
# Defining 'mydebug' switches on the debug print in corr(), which only
# checks that a variable of this name exists. TRUE is used instead of T
# because T is an ordinary variable that can be reassigned.
mydebug <- TRUE
# Example calls to complete(); each result is auto-printed at top level.
complete("specdata", 1)
complete("specdata", 3)
complete("specdata", c(2, 4, 8, 10, 12))
complete("specdata", 30:25)
corr <- function(directory, threshold = 0) {
  ## Sulfate/nitrate correlation for every monitor file with enough
  ## completely observed rows.
  ##
  ## 'directory' is a character vector of length 1 indicating
  ## the location of the CSV files
  ## 'threshold' is a numeric vector of length 1 indicating the
  ## number of completely observed observations (on all
  ## variables) required to compute the correlation between
  ## nitrate and sulfate; the default is 0
  ## Return a numeric vector of correlations, one per qualifying
  ## file in list.files() order; numeric(0) if no file qualifies.
  ## NOTE: Do not round the result!

  # 'pattern' is a regular expression, not a glob: anchor it so only
  # names ending in ".csv" are read (e.g. "001.csv.bak" is skipped).
  paths <- list.files(directory, pattern = "\\.csv$", full.names = TRUE)

  per_file <- lapply(paths, function(path) {
    data <- read.csv(path)
    good <- data[complete.cases(data), ]
    n_good <- nrow(good)
    # Files with no complete rows are always skipped, even when
    # threshold is 0, because no correlation can be computed.
    if (n_good >= threshold && n_good > 0) {
      cor(good[, "sulfate"], good[, "nitrate"])
    } else {
      NULL
    }
  })

  # unlist() drops the NULL entries of skipped files; as.numeric() turns
  # the all-skipped case (NULL) into numeric(0).
  result <- as.numeric(unlist(per_file, use.names = FALSE))

  # Debug aid: prints whenever a variable named 'mydebug' is visible,
  # whatever its value.
  if (exists("mydebug")) print(result)
  result
}
# Example calls to corr() at several thresholds. 'cr' is rebound by each
# call, so every summary()/head()/length() below describes the most recent
# corr() result. The '##' lines are the outputs recorded with the script.
# While 'mydebug' is defined, corr() also prints the full vector itself.

# Threshold 5000: the recorded output is an empty result.
cr <- corr("specdata", 5000)
summary(cr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
length(cr)
## [1] 0

# Threshold 150.
cr <- corr("specdata", 150)
head(cr)
## [1] -0.01896 -0.14051 -0.04390 -0.06816 -0.12351 -0.07589
summary(cr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2110 -0.0500 0.0946 0.1250 0.2680 0.7630

# Threshold 400.
cr <- corr("specdata", 400)
head(cr)
## [1] -0.01896 -0.04390 -0.06816 -0.07589 0.76313 -0.15783
summary(cr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1760 -0.0311 0.1000 0.1400 0.2680 0.7630

# Default threshold (0).
cr <- corr("specdata")
summary(cr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0000 -0.0528 0.1070 0.1370 0.2780 1.0000
length(cr)
## [1] 323

# High thresholds; the trailing notes give the author's count of
# qualifying files. Only the second result is inspected, because 'cr' is
# overwritten.
cr <- corr("specdata",1000) # three files
cr <- corr("specdata",1050) # one file
length(cr)
summary(cr)