forked from Yiangos01/ADE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNaiveBayes.py
92 lines (83 loc) · 2.69 KB
/
NaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from pandas import DataFrame
import pandas as pd
import re
import numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from nltk.stem.porter import PorterStemmer
# Build a DataFrame with one row per tweet: a "text" column holding the cleaned tweet text and a "category" column holding its category, indexed by the tweet's topic.
def _clean_tweet(text):
    """Normalise one raw tweet string into the text used for classification.

    Steps, in order: drop stand-alone retweet markers, drop URLs, turn runs
    of ``!.'():,?%`` into a single space, drop @mentions, drop digits,
    lower-case, squeeze any character repeated more than twice down to two
    occurrences, and trim surrounding whitespace.
    """
    # Remove the retweet marker only when it is a word of its own; a plain
    # substring replace would also eat the "rt" inside "party" or "heart".
    text = re.sub(r"\b(?:rt|RT)\b", "", text)
    # "http\S+" already covers "https..." links, so one pass is enough.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"([!.'():,?%]+)", " ", text)
    text = re.sub(r"(@[a-zA-Z0-9_]+)", "", text)
    text = re.sub(r"([0-9]+)", "", text)
    text = text.lower()
    # "soooo" -> "soo": keep at most two consecutive copies of a character.
    text = re.sub(r"(.)\1{1,}", r"\1\1", text)
    return text.strip()


def build_data_frame(file_name):
    """Load a tab-separated tweet file into a DataFrame of text and category.

    The first line of the file is treated as a header and skipped, and blank
    lines are ignored. For every other line, column 5 is the tweet text
    (cleaned with ``_clean_tweet``), column 1 is the category and column 14
    is the topic (all 0-based).

    Consecutive rows that share a topic form a run. Runs are assigned
    alternately to two groups (first run, third run, ... versus second run,
    fourth run, ...), and the result is the first group followed by the
    second group.

    Parameters:
        file_name: path of the tab-separated input file.

    Returns:
        A pandas DataFrame with columns ``text`` and ``category``, indexed by
        topic.

    Raises:
        IndexError: if a non-blank data line has fewer than 15 fields.
    """
    # Function-scope import so this block is self-contained; io.open decodes
    # to text on both Python 2 and Python 3.
    import io

    even_rows, even_topics = [], []
    odd_rows, odd_topics = [], []
    in_even_run = True
    prev_topic = None

    # Undecodable bytes are replaced rather than aborting the whole load.
    with io.open(file_name, encoding="utf-8", errors="replace") as tweets:
        next(tweets, None)  # skip the header line
        for raw in tweets:
            if not raw.strip():
                continue
            # Strip only the line terminator: a bare strip() would also eat
            # leading/trailing tabs and shift or drop empty columns.
            fields = raw.rstrip("\r\n").split("\t")
            topic = fields[14]
            if prev_topic is not None and topic != prev_topic:
                in_even_run = not in_even_run
            prev_topic = topic

            row = {"text": _clean_tweet(fields[5]), "category": fields[1]}
            if in_even_run:
                even_rows.append(row)
                even_topics.append(topic)
            else:
                odd_rows.append(row)
                odd_topics.append(topic)

    dataframe1 = DataFrame(odd_rows, index=odd_topics)
    dataframe2 = DataFrame(even_rows, index=even_topics)
    return pd.concat([dataframe2, dataframe1])
if __name__ == '__main__':
    # Script entry point: load the tweets, then estimate the accuracy of a
    # bag-of-words -> TF-IDF -> multinomial Naive Bayes pipeline with 6-fold
    # cross-validation, printing each fold's accuracy and the mean.
    data = build_data_frame('bb.csv')
    #data=data.groupby(['topics'],as_index=False).sum()
    #data = shuffle(data)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(data)
    #data=data.sample(frac=1) #shuffle dataset
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ])

    # Cross-validating
    # NOTE(review): KFold(n=..., n_folds=...) and iterating the object
    # directly is the signature of the legacy sklearn.cross_validation module
    # imported at the top of this file; newer scikit-learn releases expose
    # KFold from sklearn.model_selection with a different API -- confirm the
    # installed version before upgrading.
    kfold = KFold(n=len(data), n_folds=6)
    scores = []
    for train_ind, test_ind in kfold:
        train_text = data.iloc[train_ind]['text'].values
        train_y = data.iloc[train_ind]['category'].values
        test_text = data.iloc[test_ind]['text'].values
        test_y = data.iloc[test_ind]['category'].values

        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)
        score = accuracy_score(test_y, predictions)
        print(score)
        scores.append(score)

    # Mean accuracy over all folds, as a percentage. The original wrote
    # `scores[]`, which is a syntax error.
    print("Score: {:.2f}".format(sum(scores) / len(scores) * 100))