# step1_analyseUrl.R
# auto-install any missing packages
packages <- c("readxl", "dplyr", "stringi", "stringr", "urltools")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())))
}
library(readxl)
library(dplyr)
library(stringi)
library(stringr)
library(urltools)
# configuration: paths to the segment definitions and to the crawler export
siteconf <- "./websites/dataseo/segments.csv"
pathxlsx <- "./websites/dataseo/internal_html.xlsx"
## use xlsx format to prevent read errors with csv and xls
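# a minimal, hypothetical example of what segments.csv is expected to contain
# (one URL substring per line, no header; values below are illustrative only):
#   /blog/
#   /product/
#   /category/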
print("open xlsx....")
# uncomment this guard (and the matching "#}" after the sitename detection) to skip re-reading the xlsx on repeated runs
#if (!exists("urls")) {
ptm <- proc.time()
urls <- read_excel(pathxlsx,
sheet = 1,
col_names = TRUE,
na = "",
skip=1)
# the last row of the export is always NA; drop it
urls <- head(urls,-1)
# the crawler generates empty (all-NA) columns; drop them
urls <- urls[colSums(!is.na(urls)) > 0]
print("urls")
print(proc.time() - ptm)
# detect the site scheme and domain from the first crawled URL
sitename <- paste(scheme(urls[1,]$Address),"://",domain(urls[1,]$Address),sep="")
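# e.g. sitename would be "https://www.example.com" for a hypothetical crawl of that site;
# it is stripped from Address again before the CSV export at the end of the script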
#}
print("urls loaded")
print("-------------------")
###############################################################
ptm <- proc.time()
print("classify urls ")
schemas <- read.csv(siteconf,
header = FALSE,
col.names = "schema",
stringsAsFactors = FALSE
)
schemas <- as.character(schemas[,1])
urls$Category <- "no match"
for (j in seq_along(schemas))
{
  #print(schemas[j])
  urls$Category[which(stri_detect_fixed(urls$Address, schemas[j], case_insensitive = TRUE))] <- schemas[j]
}
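# note: when several segments match the same URL, the last matching line of
# segments.csv wins, because later iterations overwrite earlier assignments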
# the first row of the crawl is treated as the home page
urls$`Category`[1] <- 'Home'
urls$Category <- as.factor(urls$Category)
print("urls classified")
print("-------------------")
print(proc.time() - ptm)
##########################################################################
# Compliant pages: a URL is flagged non-compliant when any of the following holds:
# - HTTP status code other than 200
# - canonical link element different from the URL itself
# - crawl status other than "OK"
# - meta robots containing "noindex"
urls$Compliant <- TRUE
urls$Compliant[which(urls$`Status Code` != 200
| urls$`Canonical Link Element 1` != urls$Address
| urls$Status != "OK"
| grepl("noindex",urls$`Meta Robots 1`)
)] <- FALSE
urls$Compliant <- as.factor(urls$Compliant)
print("Compliant OK")
# Classify by inlinks
# default bucket: no inlinks (Inlinks < 1) or missing values
urls$`Group Inlinks` <- "URLs with No Follow Inlinks"
urls$`Group Inlinks`[which(urls$`Inlinks` == 1 )] <- "URLs with 1 Follow Inlink"
urls$`Group Inlinks`[which(urls$`Inlinks` > 1 & urls$`Inlinks` < 6)] <- "URLs with 2 to 5 Follow Inlinks"
urls$`Group Inlinks`[which(urls$`Inlinks` >= 6 & urls$`Inlinks` < 11 )] <- "URLs with 5 to 10 Follow Inlinks"
urls$`Group Inlinks`[which(urls$`Inlinks` >= 11)] <- "URLs with more than 10 Follow Inlinks"
urls$`Group Inlinks` <- as.factor(urls$`Group Inlinks`)
print("Group Inlinks OK")
# Classify speed from `Response Time` (assumed to be expressed in seconds by the crawler)
urls$Speed <- NA
urls$Speed[which(urls$`Response Time` < 0.501 )] <- "Fast"
urls$Speed[which(urls$`Response Time` >= 0.501 & urls$`Response Time` < 1.001)] <- "Medium"
urls$Speed[which(urls$`Response Time` >= 1.001 & urls$`Response Time` < 2.001)] <- "Slow"
urls$Speed[which(urls$`Response Time` >= 2.001)] <- "Slowest"
urls$Speed <- as.factor(urls$Speed)
print("Speed OK")
# Detect active pages: a page is active when it received at least one GA session;
# the `GA Sessions` column is typically only present when the crawl was run with
# Google Analytics data attached
urls$Active <- FALSE
if ( !is.null(urls$`GA Sessions`) ) {
urls$`GA Sessions`[is.na(urls$`GA Sessions`)] <- "0"
urls$`GA Sessions` <- as.numeric(urls$`GA Sessions`)
urls$Active[which(urls$`GA Sessions` > 0)] <- TRUE
}
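# without GA data every page simply keeps the default Active == FALSE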
urls$Active <- as.factor(urls$Active)
print("Active OK")
# Detect missing and duplicate titles, meta descriptions and H1s
urls$`Status Title` <- 'Unique'
urls$`Status Title`[which(urls$`Title 1 Length` == 0)] <- "No Set"
urls$`Status Description` <- 'Unique'
urls$`Status Description`[which(urls$`Meta Description 1 Length` == 0)] <- "No Set"
urls$`Status H1` <- 'Unique'
urls$`Status H1`[which(urls$`H1-1 Length` == 0)] <- "No Set"
urls$`Status Title`[which(duplicated(urls$`Title 1`))] <- 'Duplicate'
urls$`Status Description`[which(duplicated(urls$`Meta Description 1`))] <- 'Duplicate'
urls$`Status H1`[which(duplicated(urls$`H1-1`))] <- 'Duplicate'
urls$`Status Title` <- as.factor(urls$`Status Title`)
urls$`Status Description` <- as.factor(urls$`Status Description`)
urls$`Status H1` <- as.factor(urls$`Status H1`)
print("DC OK")
urls$`Group WordCount` <- "0 - 150"
urls$`Group WordCount`[which(urls$`Word Count` >=150 & urls$`Word Count` < 250 )] <- "150 - 250"
urls$`Group WordCount`[which(urls$`Word Count` >= 250 & urls$`Word Count` < 500)] <- "250 - 500"
urls$`Group WordCount`[which(urls$`Word Count` >= 500 & urls$`Word Count` < 1000 )] <- "500 - 1000"
urls$`Group WordCount`[which(urls$`Word Count` >= 1000 & urls$`Word Count` < 3000)] <- "1000 - 3000"
urls$`Group WordCount`[which(urls$`Word Count` >= 3000 )] <- "3000 +"
urls$`Group WordCount` <- as.factor(urls$`Group WordCount`)
print("Group WordCount OK")
urls$`Group Visits` <- "0 visit"
urls$`Group Visits`[which(urls$`GA Sessions`==1)] <- "1 visit"
urls$`Group Visits`[which(urls$`GA Sessions`>1)] <- "2 to 10 visits"
urls$`Group Visits`[which(urls$`GA Sessions`>10)] <- "11 to 100 visits"
urls$`Group Visits`[which(urls$`GA Sessions`>100)] <- "100+ visit"
urls$`Group Visits` <- as.factor(urls$`Group Visits`)
print("Group Visits OK")
# treat the HTTP status code as a categorical value rather than a number
urls$`Status Code` <- as.factor(as.character(urls$`Status Code`))
# Stop the clock
print(proc.time() - ptm)
print("-------------------")
#####################################################################
#Generate CSV
print("Generate CSV")
# prepare the csv that is shipped to elasticsearch (via filebeat)
# TODO: add response_time as a double (a commented sketch is appended at the end of this file)
urls_csv <- select(urls, Address,Category,Active,Speed,Compliant,Level,
Inlinks,Outlinks,
`Status Title`,`Status Description`,`Status H1`,
`Group Inlinks`,`Group WordCount`) %>%
mutate(Address=gsub(sitename,"",Address))
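# after the gsub above, Address holds only the path (and query string) of each URL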
# drop the original column names; the downstream filebeat/elasticsearch pipeline presumably defines its own field names
colnames(urls_csv) <- NULL
write.csv2(urls_csv,paste("filebeat-csv/crawled-urls-filebeat-",format(Sys.time(), "%Y%m%d"),".csv",sep=""), row.names = FALSE)
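# a possible, untested sketch for the response_time TODO above: export the raw
# response time as an extra numeric column (this assumes the elasticsearch/filebeat
# side maps the additional field as a double)
# urls_csv <- select(urls, Address, Category, Active, Speed, Compliant, Level,
#                    Inlinks, Outlinks, `Response Time`,
#                    `Status Title`, `Status Description`, `Status H1`,
#                    `Group Inlinks`, `Group WordCount`) %>%
#   mutate(Address = gsub(sitename, "", Address))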