echo = TRUE
The first step is to load and clean the data.
# Load the raw activity data: steps (numeric), date (text), interval (numeric).
tableRaw <- read.csv(
  "activity.csv",
  header = TRUE,
  colClasses = c("numeric", "character", "numeric")
)
# Convert the ISO-formatted date strings (e.g. "2012-10-01") to Date objects.
tableRaw$date <- as.Date(tableRaw$date, format = "%Y-%m-%d")
summary(tableRaw)
## steps date interval
## Min. : 0.0 Min. :2012-10-01 Min. : 0
## 1st Qu.: 0.0 1st Qu.:2012-10-16 1st Qu.: 589
## Median : 0.0 Median :2012-10-31 Median :1178
## Mean : 37.4 Mean :2012-10-31 Mean :1178
## 3rd Qu.: 12.0 3rd Qu.:2012-11-15 3rd Qu.:1766
## Max. :806.0 Max. :2012-11-30 Max. :2355
## NA's :2304
# Preview the first six rows of the raw table.
head(tableRaw, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
- Histogram of the total number of steps taken each day. We can ignore the NA's in the table and add another column, month, to the table.
# Keep only the rows whose step count is present.
tableNA <- tableRaw[!is.na(tableRaw$steps), ]
# Abbreviated month name, stored as a factor whose levels follow the order of
# first appearance in the data (presumably chronological, since the rows are
# listed by date). A plain character column would be faceted alphabetically
# ("Nov" before "Oct"); the former `ordered` argument to format() was
# silently ignored and had no effect on ordering.
tableNA$month <- format(tableNA$date, "%b")
tableNA$month <- factor(tableNA$month, levels = unique(tableNA$month))
head(tableNA)
## steps date interval month
## 289 0 2012-10-02 0 Oct
## 290 0 2012-10-02 5 Oct
## 291 0 2012-10-02 10 Oct
## 292 0 2012-10-02 15 Oct
## 293 0 2012-10-02 20 Oct
## 294 0 2012-10-02 25 Oct
library(ggplot2)
# Bar chart of the total number of steps per day, one panel per month.
# The daily totals are computed up front with aggregate() and drawn with
# stat = "identity"; the previous qplot(stat = "summary", fun.y = sum) call
# relied on arguments that current ggplot2 releases no longer honour.
ggplot(
  aggregate(steps ~ date + month, data = tableNA, FUN = sum),
  aes(x = date, y = steps)
) +
  geom_bar(stat = "identity", width = 0.8) +
  facet_grid(. ~ month, scales = "free") +
  labs(
    title = "Histogram of Total Number of Steps per Day",
    x = "Date",
    y = "Total number of steps"
  )
- Mean Total number of steps per day
# Total steps per day. The formula is passed positionally because the name of
# the first argument of aggregate()'s formula method differs between R
# versions (`formula` before R 4.2.0, `x` from 4.2.0 on).
stepsDay <- aggregate(steps ~ date, data = tableNA, FUN = sum)
mean(stepsDay$steps)
## [1] 10766
- Median Total number of steps per day
# Median of the daily step totals.
median(stepsDay[["steps"]])
## [1] 10765
Aggregation of the steps by intervals
# Mean number of steps for each 5-minute interval, averaged across all days.
# The formula is passed positionally because the name of the first argument
# of aggregate()'s formula method differs between R versions (`formula`
# before R 4.2.0, `x` from 4.2.0 on).
tableAggr <- aggregate(steps ~ interval, data = tableNA, FUN = mean)
head(tableAggr)
## interval steps
## 1 0 1.71698
## 2 5 0.33962
## 3 10 0.13208
## 4 15 0.15094
## 5 20 0.07547
## 6 25 2.09434
# Time series of the average number of steps per 5-minute interval.
# Written with ggplot() + geom_line() because qplot() is deprecated.
ggplot(tableAggr, aes(x = interval, y = steps)) +
  geom_line() +
  labs(
    title = "Time Series Plot of the 5-minute Interval",
    x = "Intervals",
    y = "Average Number of Steps Taken"
  )
# Row(s) of tableAggr holding the largest average step count.
maxSteps <- tableAggr[which(tableAggr$steps == max(tableAggr$steps)), ]
maxSteps
## interval steps
## 104 835 206.2
Thus the 5-minute interval labelled 835 (row 104 of the aggregated table) has the maximum average number of steps, 206.2.
The total number of NA's in table is :
# Count the missing values column by column, then add the counts up.
sum(vapply(tableRaw, function(column) sum(is.na(column)), integer(1)))
## [1] 2304
Replacing the NA's in the table with the average number of steps for the same 5-minute interval:
#' Fill missing step counts with per-interval values.
#'
#' @param dataOld Data frame with at least the columns `steps` and `interval`.
#'   It is not modified; a filled-in copy is returned.
#' @param aggr Data frame with the columns `interval` and `steps`, giving the
#'   replacement value to use for each interval.
#' @return A copy of `dataOld` in which every `NA` in `steps` has been
#'   replaced by the `steps` value of the row of `aggr` with the same
#'   `interval`. Rows whose interval does not occur in `aggr` stay `NA`.
replaceNA <- function(dataOld, aggr) {
  data <- dataOld
  isMissing <- is.na(data$steps)
  # match() looks up all missing rows at once; this also handles zero-row
  # input and intervals absent from `aggr`, both of which made the earlier
  # row-by-row loop fail.
  aggrRow <- match(data$interval[isMissing], aggr$interval)
  data$steps[isMissing] <- aggr$steps[aggrRow]
  data
}
# Build the imputed table: missing step counts are filled from the
# per-interval averages in tableAggr.
tableNew <- replaceNA(dataOld = tableRaw, aggr = tableAggr)
head(tableNew, n = 6L)
## steps date interval
## 1 1.71698 2012-10-01 0
## 2 0.33962 2012-10-01 5
## 3 0.13208 2012-10-01 10
## 4 0.15094 2012-10-01 15
## 5 0.07547 2012-10-01 20
## 6 2.09434 2012-10-01 25
- Histogram of the total number of steps taken each day
# Abbreviated month name, stored as a factor whose levels follow the order of
# first appearance in the data (presumably chronological, since the rows are
# listed by date). A plain character column would be faceted alphabetically
# ("Nov" before "Oct"); the former `ordered` argument to format() was
# silently ignored and had no effect on ordering.
tableNew$month <- format(tableNew$date, "%b")
tableNew$month <- factor(tableNew$month, levels = unique(tableNew$month))
library(ggplot2)
# Bar chart of the total number of steps per day (imputed data), one panel
# per month. The daily totals are computed up front with aggregate() and
# drawn with stat = "identity"; the previous
# qplot(stat = "summary", fun.y = sum) call relied on arguments that current
# ggplot2 releases no longer honour.
ggplot(
  aggregate(steps ~ date + month, data = tableNew, FUN = sum),
  aes(x = date, y = steps)
) +
  geom_bar(stat = "identity", width = 0.8) +
  facet_grid(. ~ month, scales = "free") +
  labs(
    title = "Histogram of Total Number of Steps per Day",
    x = "Date",
    y = "Total number of steps"
  )
- Mean Total number of steps per day
# Total steps per day on the imputed data. The formula is passed positionally
# because the name of the first argument of aggregate()'s formula method
# differs between R versions (`formula` before R 4.2.0, `x` from 4.2.0 on).
stepsDay <- aggregate(steps ~ date, data = tableNew, FUN = sum)
mean(stepsDay$steps)
## [1] 10766
- Median Total number of steps per day
# Median of the daily step totals (imputed data).
with(stepsDay, median(steps))
## [1] 10766
We can observe that the mean is unchanged, but after replacing the NA's in the data frame the median has shifted: the new median (10766) is slightly greater than the old median (10765).
The first step is to label each date with a "weekday" or "weekend" level.
# Label each date as "weekday" or "weekend" using the numeric day of the week
# (POSIXlt wday: 0 = Sunday, ..., 6 = Saturday). Matching on the day names
# returned by format(date, "%A") depends on the session locale and yields NA
# levels when the names are not in English.
tableNew$weekInfo <- factor(
  ifelse(as.POSIXlt(tableNew$date)$wday %in% c(0L, 6L), "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
head(tableNew)
## steps date interval month weekInfo
## 1 1.71698 2012-10-01 0 Oct weekday
## 2 0.33962 2012-10-01 5 Oct weekday
## 3 0.13208 2012-10-01 10 Oct weekday
## 4 0.15094 2012-10-01 15 Oct weekday
## 5 0.07547 2012-10-01 20 Oct weekday
## 6 2.09434 2012-10-01 25 Oct weekday
# Mean steps for every (interval, weekInfo) combination. Aggregating the
# one-column data frame keeps the "steps" column name, so no rename is needed.
tableAggr1 <- aggregate(
  tableNew["steps"],
  by = tableNew[c("interval", "weekInfo")],
  FUN = mean
)
head(tableAggr1, n = 6L)
## interval weekInfo steps
## 1 0 weekday 2.25115
## 2 5 weekday 0.44528
## 3 10 weekday 0.17317
## 4 15 weekday 0.19790
## 5 20 weekday 0.09895
## 6 25 weekday 1.59036
# Average steps per 5-minute interval, one panel each for weekdays and
# weekends. Written with ggplot() + geom_line() because qplot() is deprecated.
ggplot(tableAggr1, aes(x = interval, y = steps)) +
  geom_line() +
  facet_grid(weekInfo ~ .) +
  labs(
    title = "Average number of steps taken across weekends and weekdays",
    x = "Intervals",
    y = "Average Number of Steps Taken"
  )
From the graph we can observe that weekdays have the highest peak, but weekends have a larger number of peaks than weekdays. Thus on weekends the activity is spread more evenly across the day, whereas on weekdays the activity is high for only a short period of time.