sentiment-manning.Rmd

---
title: "Sentiment Analysis Case Study: Peyton Manning vs. Tom Brady"
author: "Dr. Stephen W. Thomas, Queen's University"
date: "July 14, 2017"
output:
  pdf_document:
    highlight: pygments
    number_sections: yes
    toc: no
    toc_depth: '2'
---


```{r}
library(tidytext)
library(RSentiment)
library(cleanNLP)
library(tidyr)
library(dplyr)
library(ggplot2)
library(readr)
library(lubridate)
library(stringr)
library(scales)
```


```{r}
# Convert Twitter-style timestamp strings to calendar dates.
#
# Example input: "Thu Oct 02 15:38:29 +0000 2014"
#
# Only the month, day and year fields (whitespace-separated tokens 2, 3 and 6)
# are used; the time of day and the UTC offset are discarded, so each result
# is midnight GMT on the tweet's date.
#
# x: character vector of timestamps in the format shown above.
# Returns a POSIXct vector the same length as x (NA where parsing fails).
configDate <- function(x) {
  tokens <- strsplit(as.character(x), "\\s+")
  # Rebuild "Mon DD YYYY" for every element, not just the first one.
  date_text <- vapply(
    tokens,
    function(tok) paste(tok[2], tok[3], tok[6]),
    character(1)
  )
  as.POSIXct(date_text, format = "%b %d %Y", tz = "GMT")
}
```

```{r}

# Set to TRUE to rebuild alltweets.csv from the two raw per-player files;
# leave FALSE to load the previously combined file.
createDate <- FALSE
if (isTRUE(createDate)) {
  tweets_manning <- read_csv("manning.csv")
  tweets_brady <- read_csv("brady.csv")
  tweets <- bind_rows(
    tweets_manning %>% mutate(person = "Manning"),
    tweets_brady %>% mutate(person = "Brady")
  )

  # Store each tweet's date as numeric seconds since 1970-01-01 (the plotting
  # chunk converts it back with as.POSIXct(..., origin = "1970-01-01")).
  # Building the whole column at once avoids assigning a POSIXct value into a
  # numeric cell row by row, and also copes with an empty table.
  tweets$timestamp <- vapply(
    tweets$time,
    function(raw_time) as.numeric(configDate(raw_time)),
    numeric(1),
    USE.NAMES = FALSE
  )

  # Keep only the columns used downstream (this also drops the raw `time`).
  tweets <- tweets %>%
    select(id, timestamp, person, text)

  write_csv(tweets, "alltweets.csv")
} else {
  tweets <- read_csv("alltweets.csv", col_types = list(id = col_number()))
}

# Replace the stored ids with a simple 1..n row index.
tweets$id <- seq_len(nrow(tweets))
head(tweets, 10)

# Work on a random 10% sample of the tweets.
# NOTE(review): the sample is unseeded, so the rows kept (and therefore the
# hard-coded ids looked up at the end of this document) differ between runs;
# consider calling set.seed() before sampling.
tweets <- tweets %>%
  sample_frac(0.1)
```

```{r}
# Histogram of tweet timestamps, one facet per player with its own y-axis.
# `timestamp` holds seconds since 1970-01-01, so convert it before plotting.
tweets %>%
  mutate(tweet_time = as.POSIXct(timestamp, origin = "1970-01-01")) %>%
  ggplot(aes(x = tweet_time, fill = person)) +
  geom_histogram(position = "identity", bins = 11, show.legend = FALSE) +
  facet_wrap(~person, ncol = 1, scales = "free_y") +
  scale_x_datetime(
    labels = date_format("%b %d %Y", tz = "GMT"),
    breaks = date_breaks("1 day")
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Date", y = "Number of Tweets")
```

```{r}

# TODO:
# Count exact duplicates
# Count retweets
# Count tweets with url
# Count length of tweets


# Show duplicate tweets: number of occurrences of each distinct text,
# most frequent first.
tweets %>%
  group_by(text) %>%
  summarize(count = n()) %>%
  arrange(desc(count))

# Patterns referenced by the (currently commented-out) regex tokenisation in
# the next chunk, plus extra stop words that are removed after tokenisation.
replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
custom_stop_words <- data.frame(word = c("loan", "business"))

# Remove URLs (http, https and ftp links up to the next whitespace).
# The original ran this identical statement twice; one pass is sufficient.
tweets <- tweets %>%
  mutate(text = gsub("(f|ht)tp(s?)://\\S+", "", text))

tweets
```


Convert the tweets to tidy format (one word per row).
```{r}

# Split each tweet's text into one row per word token.
# Alternative regex-based tokenisation, kept for reference (disabled):
#   filter(!str_detect(text, "^RT")) %>%
#   mutate(text = str_replace_all(text, replace_reg, "")) %>%
#   unnest_tokens(word, text, token = "regex", pattern = unnest_reg)
text_df <- unnest_tokens(tweets, word, text)

text_df
```


```{r}
## Remove stopwords
# Extra words to drop in addition to tidytext's `stop_words`.
# stringsAsFactors = FALSE keeps `word` a character column on R < 4.0, so the
# join below compares character with character instead of coercing a factor.
# NOTE(review): "loan" and "business" look unrelated to these tweets; confirm
# this list is intended for this data set.
custom_stop_words <- data.frame(
  word = c("loan", "business"),
  stringsAsFactors = FALSE
)
text_df <- text_df %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(custom_stop_words, by = "word") %>%
  arrange(id)
```


Do sentiment analysis for each tweet using the NRC lexicon.

```{r}
# Score every tweet with the NRC lexicon.
nrc <- get_sentiments("nrc")
head(text_df)
sents <- text_df %>%
  # Join on the token explicitly rather than relying on an implicit natural
  # join (which prints a "Joining, by = ..." message into the report and would
  # silently change keys if either table gained a shared column).
  inner_join(nrc, by = "word") %>%
  # Number of matched words per tweet and NRC category.
  count(person, id, timestamp, sentiment) %>%
  # One column per category; categories absent from a tweet count as 0.
  spread(sentiment, n, fill = 0) %>%
  # Net score: positive word count minus negative word count.
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
sents
```


Next steps:

- Look at examples
- Quantify sentiment by rating
- Look at the average
- Look at the average over time
- Look at different emotions
- Try different lexicons


```{r}
# Frequency table of net sentiment scores for one player.
#
# scores: data frame with `person` and `sentiment` columns (here, `sents`).
# player: value of `person` to keep, e.g. "Manning".
# Returns one row per distinct score with its count `n`, its share of that
# player's tweets `freq`, and the running total of shares `cum`.
sentiment_distribution <- function(scores, player) {
  scores %>%
    filter(person == player) %>%
    group_by(sentiment) %>%
    summarise(n = n()) %>%
    mutate(freq = n / sum(n), cum = cumsum(freq))
}

sents

# Distribution of net sentiment per tweet, one panel per player.
sents %>%
  ggplot(aes(sentiment)) +
  geom_histogram() +
  facet_wrap(~person)

sentiment_distribution(sents, "Manning")

sentiment_distribution(sents, "Brady")
```

```{r}
# Show the text of the tweet with id 6621.
tweets$text[tweets$id == 6621]
```

```{r}
# Show the tokens that remain for tweet 1739.
filter(text_df, id == 1739)
```