-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBad_word_removal.py
More file actions
77 lines (71 loc) · 2.64 KB
/
Bad_word_removal.py
File metadata and controls
77 lines (71 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Space-separated list of English stop words. Every entry must be separated
# by a space: two fused entries ("whenwhere", "eachfew") were split back into
# "when where" and "each few" so all four words are real entries.
stop_words = "i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now "
# Keyword-hit count of each processed file, filled in by the counting loop.
ls = []
# Running index of each processed file, filled in by the counting loop.
rs = []
# a list of keywords (lowercase; surrounding whitespace is stripped by the
# consumer before comparison). Misspelled entries were corrected:
# "flattening the curve", "lockdown extension", "airborne".
gs = ["covid19","corona","pandemic","coronawarriors",
"incubation period", "community spread", "n95", "quarantine " ,"isolation",
"epidemic","flattening the curve","comorbidity","social distancing","hydroxychloroquine",
"aarogya setu app","lockdown","lockdown extension","virus","infection","airborne","cough","fever","masks","mask"]
# Space-separated punctuation characters that are removed from every word.
replace = ", . - ' ; : / ™ [ ] { } ( ) * - + & ! @ # $ % ^ _ = ` ~"
# Number of links consumed so far from the links file.
i = 0
# change the value of this variable with the number of files that have been parsed
# (the counting loop stops consuming links once `i` reaches this value)
max1 = 100
# Index that will be assigned to the next processed file.
number_pdf = 0
def _count_keyword_hits(lines, stop_set, keyword_set, strip_table):
    """Count keyword hits and countable words in an iterable of text lines.

    Each line is lowercased and split on whitespace; punctuation is deleted
    from every token via ``strip_table``. Tokens that end up empty or that
    are stop words are ignored. Every remaining token counts as one word,
    and as one hit if it is also a keyword.

    Args:
        lines: iterable of ``str`` lines (for example an open text file).
        stop_set: set of lowercase stop words to skip.
        keyword_set: set of lowercase keywords to count.
        strip_table: ``str.translate`` table that deletes punctuation.

    Returns:
        A ``(hits, total_words)`` tuple of ints.
    """
    hits = 0
    total = 0
    for line in lines:
        for token in line.lower().split():
            token = token.translate(strip_table)
            # A token made only of punctuation is empty here and is not a word.
            if not token or token in stop_set:
                continue
            total += 1
            if token in keyword_set:
                hits += 1
    return hits, total


# Build the lookup structures once instead of once per word.
# Stop words are matched as whole words: a substring test against the raw
# string would also discard fragments such as "hen" or "our".
stop_set = set(stop_words.split())
# NOTE: entries of gs that contain a space (e.g. "incubation period") can
# never equal a single token, so this per-token comparison never counts them.
keyword_set = {keyword.strip() for keyword in gs}
# Every entry of `replace` is a single character, so deleting those
# characters in one translate pass equals the chain of str.replace calls.
strip_table = str.maketrans("", "", "".join(replace.split()))

with open("./To_download_links.txt", 'r') as t:
    links = t.read().split()

for link in links:
    # Stop consuming links once the configured limit has been reached.
    if i >= max1:
        break
    i += 1
    # Only the last path component of the link names the cleaned text file.
    link = link.split("/")[-1]
    print(link)
    try:
        # (An earlier run read from "./Newpaper_Cleaned/" instead.)
        # errors='replace' keeps a file with a few undecodable bytes usable
        # instead of dropping the whole file.
        with open("./Better_cleaned/" + link + '.txt',
                  encoding='utf-8', errors='replace') as f:
            counter, total_words = _count_keyword_hits(
                f, stop_set, keyword_set, strip_table)
    except OSError as err:
        # A missing or unreadable file is skipped, but visibly.
        print("Skipping", link, "-", err)
        continue
    print(counter)
    print(total_words)
    # A file without any countable word would otherwise divide by zero.
    percentage = counter / total_words * 100 if total_words else 0.0
    print("Percentage = ", percentage)
    print()
    # Record the index and the count together so rs and ls always have the
    # same length; the plotting code needs equal-length columns.
    rs.append(number_pdf)
    number_pdf += 1
    ls.append(counter)

print(ls)
# The counts are reversed while the indices stay ascending, so index 0 pairs
# with the last processed link (presumably the links file lists the newest
# document first - TODO confirm).
ls.reverse()
print(ls, rs)
# Graph plotting section: draw the per-file keyword counts as a line chart.
from pandas import DataFrame
import matplotlib.pyplot as plt

# Column labels, shared by the frame construction and the plot call.
x_col, y_col = 'Day_number', 'Occurences'

# rs supplies the x values (file index), ls the y values (keyword count).
Data = {x_col: rs, y_col: ls}
df = DataFrame(Data, columns=[x_col, y_col])
df.plot(x=x_col, y=y_col, kind='line')
plt.show()