## Demonstration of quanteda's capabilities
##
## Ken Benoit <[email protected]>
## Paul Nulty <[email protected]>
require(quanteda)
help(package="quanteda")
## inspect the UK immigration texts (a character vector supplied with quanteda)
summary(ukimmigTexts)
str(ukimmigTexts)
encoding(ukimmigTexts)
encoding(encodedTexts)
# create a corpus from immigration texts
immigCorpus <- corpus(ukimmigTexts, notes="Created as part of a demo.")
docvars(immigCorpus) <- data.frame(party = docnames(immigCorpus), year = 2010)
summary(immigCorpus)
# explore using kwic
kwic(immigCorpus, "deport", window = 3)
kwic(immigCorpus, "illegal immig*", window = 3)
# extract a document-feature matrix
immigDfm <- dfm(subset(immigCorpus, party=="BNP"))
plot(immigDfm)  # plot() on a dfm draws a word cloud
immigDfm <- dfm(subset(immigCorpus, party=="BNP"), ignoredFeatures = stopwords("english"))
plot(immigDfm, random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))
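# a quick numeric check on the same dfm: its most frequent features
# (topfeatures() is quanteda's own helper; 10 is an arbitrary cutoff)
topfeatures(immigDfm, 10)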
# change units to sentences
immigCorpusSent <- changeunits(immigCorpus, to = "sentences")
summary(immigCorpusSent, 20)
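# sentence-level units mean many more documents; ndoc() confirms the count
ndoc(immigCorpusSent)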
## tokenize some texts
txt <- "#TextAnalysis is MY <3 4U @myhandle gr8 #stuff :-)"
tokenize(txt, removePunct=TRUE)
tokenize(txt, removePunct=TRUE, removeTwitter=TRUE)
(toks <- tokenize(toLower(txt), removePunct=TRUE, removeTwitter=TRUE))
str(toks)
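# toks is a list of character vectors, so base R can count tokens per element
sapply(toks, length)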
# tokenize sentences
(sents <- tokenize(ukimmigTexts[1], what = "sentence", simplify = TRUE)[1:5])
# tokenize characters
tokenize(ukimmigTexts[1], what = "character", simplify = TRUE)[1:100]
## some descriptive statistics
## create a document-feature matrix from the inaugural corpus
summary(inaugCorpus)
presDfm <- dfm(inaugCorpus)
presDfm
docnames(presDfm)
# concatenate by president name
presDfm <- dfm(inaugCorpus, groups="President")
presDfm
docnames(presDfm)
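# peek at the top features for one grouped document
# (a sketch assuming "Obama" is among the grouped docnames, since the corpus runs to 2013)
topfeatures(presDfm["Obama", ], 10)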
# the next examples require the quantedaData package; install it first with
# devtools::install_github("kbenoit/quantedaData")
## show some selection capabilities on Irish budget corpus
data(iebudgetsCorpus, package = "quantedaData")
summary(iebudgetsCorpus, 10)
ieFinMin <- subset(iebudgetsCorpus, number=="01" & debate == "BUDGET")
summary(ieFinMin)
dfmFM <- dfm(ieFinMin)
plot(2008:2012, lexdiv(dfmFM, "C"), xlab="Year", ylab="Herndan's C", type="b",
main = "World's Crudest Lexical Diversity Plot")
# plot some readability statistics
data(SOTUCorpus, package = "quantedaData")
fk <- readability(SOTUCorpus, "Flesch.Kincaid")
year <- lubridate::year(docvars(SOTUCorpus, "Date"))
require(ggplot2)
partyColours <- c("blue", "blue", "black", "black", "red", "red")
p <- ggplot(data = docvars(SOTUCorpus), aes(x = year, y = fk)) + #, group = delivery)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")) +
geom_smooth(alpha=0.2, linetype=1, color="grey70", method = "loess", span = .34) +
xlab("") +
ylab("Flesch-Kincaid") +
geom_point(aes(colour = party)) +
scale_colour_manual(values = partyColours) +
    geom_line(alpha = 0.3, size = 1) +
ggtitle("Text Complexity in State of the Union Addresses") +
theme(plot.title = element_text(lineheight=.8, face="bold"))
quartz(height = 7, width = 12)  # macOS-only device; use dev.new() on other platforms
print(p)
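# saving to a file avoids the platform-specific device entirely
# (a sketch; the filename is arbitrary)
# ggsave("sotu_complexity.pdf", p, height = 7, width = 12)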
## Presidential Inaugural Address Corpus
presDfm <- dfm(inaugCorpus, ignoredFeatures = stopwords("english"))
# compute some document similarities
similarity(presDfm, "1985-Reagan", n=5, margin="documents")
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n=5, margin="documents", method = "cosine")
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n=5, margin="documents", method = "Hellinger")
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n=5, margin="documents", method = "eJaccard")
# compute some term similarities
similarity(presDfm, c("fair", "health", "terror"), method="cosine")
## mining collocations
# form ngrams
txt <- "Hey @kenbenoit #textasdata: The quick, brown fox jumped over the lazy dog!"
(toks1 <- tokenize(toLower(txt), removePunct = TRUE))
tokenize(toLower(txt), removePunct = TRUE, ngrams = 2)
tokenize(toLower(txt), removePunct = TRUE, ngrams = c(1,3))
# form "skip-grams"
tokens <- tokenize(toLower("Insurgents killed in ongoing fighting."),
                   removePunct = TRUE, simplify = TRUE)
skipgrams(tokens, n = 2, k = 2, concatenator = " ")
skipgrams(tokens, n = 3, k = 2, concatenator = " ")
# a lower-level ngram constructor also works on a token vector (note: need to port to C++)
ngrams(tokens, c(1, 3, 5))
# mine bigrams
collocs2 <- collocations(inaugTexts, size = 2, method = "all")
head(collocs2, 20)
# mine trigrams
collocs3 <- collocations(inaugTexts, size = 3, method = "all")
head(collocs3, 20)
# remove collocations containing stopwords, then inspect
head(removeFeatures(collocs2, stopwords("english")), 20)
head(removeFeatures(collocs3, stopwords("english")), 20)