-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbaseline.py
executable file
·158 lines (129 loc) · 5.9 KB
/
baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
import argparse
import collections
import os.path
import re
import sklearn.naive_bayes
import sklearn.feature_extraction
import sklearn.svm
import sklearn.preprocessing
import evaluate_iest
def arguments():
    """Parse the command-line options of the baseline script.

    Returns:
        argparse.Namespace: holds a single attribute ``datadir``, the
        directory with the training and trial files, converted to an
        absolute path by ``os.path.abspath``.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would replace the program
    # name in the usage line instead of describing the script.
    parser = argparse.ArgumentParser(description="Some baselines for IEST")
    parser.add_argument(
        "-d",
        "--datadir",
        type=os.path.abspath,
        default="/ccl/projects/IEST/",
        help="Path to training and trial data; default: /ccl/projects/IEST/",
    )
    return parser.parse_args()
def read_train_data(filename):
    """Read a UTF-8 file of ``<label><TAB><text>`` lines.

    Each line is stripped of surrounding whitespace and split at the FIRST
    tab only, so a tab inside the text no longer aborts the read. Every
    ``[#TRIGGERWORD#]`` placeholder is removed from the text and runs of
    whitespace (including any remaining tabs) are collapsed to single spaces.

    Args:
        filename: path of the file to read.

    Returns:
        A ``(data, labels)`` tuple of two equally long lists: the cleaned
        texts and their labels, in file order.

    Raises:
        ValueError: if a line contains no tab after stripping (this includes
            blank lines); the message names the file and line number.
    """
    data, labels = [], []
    with open(filename, encoding="utf8") as fh:
        for lineno, line in enumerate(fh, start=1):
            label, sep, text = line.strip().partition("\t")
            if not sep:
                raise ValueError(
                    "{}:{}: expected '<label>\\t<text>', got {!r}".format(
                        filename, lineno, line
                    )
                )
            # The placeholder is a fixed literal, so a plain replace suffices.
            text = text.replace("[#TRIGGERWORD#]", "")
            data.append(" ".join(text.split()))
            labels.append(label)
    return data, labels
def read_labels(filename):
    """Return the lines of a UTF-8 file as a list of labels.

    Every line of the file yields one entry, with leading and trailing
    whitespace (including the newline) removed.

    Args:
        filename: path of the label file.

    Returns:
        list of str, one per line, in file order.
    """
    with open(filename, encoding="utf8") as label_file:
        return list(map(str.strip, label_file))
def majority_baseline(train_labels, test_labels):
    """Score a predictor that always outputs the most frequent training label.

    Args:
        train_labels: labels used to determine the majority class.
        test_labels: gold labels; one constant prediction is produced for
            each of them.

    The gold labels and the constant predictions are handed to
    ``evaluate_iest.calculatePRF``; nothing is returned.
    """
    label_counts = collections.Counter(train_labels)
    majority_label, _ = label_counts.most_common(1)[0]
    constant_prediction = len(test_labels) * [majority_label]
    print("\n## Majority baseline ##\n")
    evaluate_iest.calculatePRF(test_labels, constant_prediction)
def strip_triggerword(data):
    """Remove every ``[#TRIGGERWORD#]`` placeholder from each text.

    Args:
        data: iterable of strings.

    Returns:
        A new list with the placeholder deleted from every string;
        surrounding whitespace is left untouched.
    """
    placeholder = re.compile(r"\[#TRIGGERWORD#\]")
    cleaned = []
    for text in data:
        cleaned.append(placeholder.sub("", text))
    return cleaned
def bow_baseline_naive_bayes(train_data, train_labels, test_data, test_labels):
    """Train and score multinomial Naive Bayes on bag-of-words counts.

    Both text collections are passed through ``strip_triggerword`` first.
    A ``CountVectorizer`` is fitted on the training texts only and then
    applied to the test texts; the predictions on the test texts are handed
    to ``evaluate_iest.calculatePRF`` together with ``test_labels``.

    Args:
        train_data: training texts.
        train_labels: labels aligned with ``train_data``.
        test_data: texts to classify.
        test_labels: gold labels aligned with ``test_data``.
    """
    classifier = sklearn.naive_bayes.MultinomialNB()
    vectorizer = sklearn.feature_extraction.text.CountVectorizer()

    train_texts = strip_triggerword(train_data)
    test_texts = strip_triggerword(test_data)

    # The vocabulary comes from the training texts alone.
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)

    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print("\n## Bag of words (Naive Bayes) ##\n")
    evaluate_iest.calculatePRF(test_labels, predictions.tolist())
def tfidf_baseline_naive_bayes(train_data, train_labels, test_data, test_labels):
    """Train and score multinomial Naive Bayes on tf-idf weighted unigrams.

    The texts are vectorized as given (no trigger-word stripping happens in
    this function). A ``TfidfVectorizer`` is fitted on the training texts
    only; predictions on the test texts are handed to
    ``evaluate_iest.calculatePRF`` together with ``test_labels``.

    Args:
        train_data: training texts.
        train_labels: labels aligned with ``train_data``.
        test_data: texts to classify.
        test_labels: gold labels aligned with ``test_data``.
    """
    classifier = sklearn.naive_bayes.MultinomialNB()
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()

    # Vocabulary and idf weights come from the training texts alone.
    train_matrix = vectorizer.fit_transform(train_data)
    test_matrix = vectorizer.transform(test_data)

    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print("\n## Bag of words tf-idf (Naive Bayes) ##\n")
    evaluate_iest.calculatePRF(test_labels, predictions.tolist())
def bow_baseline_svm(train_data, train_labels, test_data, test_labels):
    """Train and score a linear SVM on bag-of-words counts.

    The texts are vectorized as given (no trigger-word stripping and no
    feature scaling happen in this function). A ``CountVectorizer`` is
    fitted on the training texts only; ``LinearSVC`` predictions on the test
    texts are handed to ``evaluate_iest.calculatePRF`` with ``test_labels``.

    Args:
        train_data: training texts.
        train_labels: labels aligned with ``train_data``.
        test_data: texts to classify.
        test_labels: gold labels aligned with ``test_data``.
    """
    vectorizer = sklearn.feature_extraction.text.CountVectorizer()
    classifier = sklearn.svm.LinearSVC()

    # The vocabulary comes from the training texts alone.
    train_matrix = vectorizer.fit_transform(train_data)
    test_matrix = vectorizer.transform(test_data)

    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print("\n## Bag of words (Linear SVC) ##\n")
    evaluate_iest.calculatePRF(test_labels, predictions.tolist())
def tfidf_baseline_svm(train_data, train_labels, test_data, test_labels):
    """Train and score a linear SVM on tf-idf weighted unigrams.

    The texts are vectorized as given (no trigger-word stripping and no
    feature scaling happen in this function). A ``TfidfVectorizer`` is
    fitted on the training texts only; ``LinearSVC`` predictions on the test
    texts are handed to ``evaluate_iest.calculatePRF`` with ``test_labels``.

    Args:
        train_data: training texts.
        train_labels: labels aligned with ``train_data``.
        test_data: texts to classify.
        test_labels: gold labels aligned with ``test_data``.
    """
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
    classifier = sklearn.svm.LinearSVC()

    # Vocabulary and idf weights come from the training texts alone.
    train_matrix = vectorizer.fit_transform(train_data)
    test_matrix = vectorizer.transform(test_data)

    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print("\n## Bag of words tf-idf (Linear SVC) ##\n")
    evaluate_iest.calculatePRF(test_labels, predictions.tolist())
def bigrams_svm(train_data, train_labels, test_data, test_labels):
    """Train and score a linear SVM on unigram and bigram counts.

    The texts are vectorized as given (no trigger-word stripping and no
    feature scaling happen in this function). A ``CountVectorizer`` with
    ``ngram_range=(1, 2)`` is fitted on the training texts only;
    ``LinearSVC`` predictions on the test texts are handed to
    ``evaluate_iest.calculatePRF`` with ``test_labels``.

    Args:
        train_data: training texts.
        train_labels: labels aligned with ``train_data``.
        test_data: texts to classify.
        test_labels: gold labels aligned with ``test_data``.
    """
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(1, 2)
    )
    classifier = sklearn.svm.LinearSVC()

    # The n-gram vocabulary comes from the training texts alone.
    train_matrix = vectorizer.fit_transform(train_data)
    test_matrix = vectorizer.transform(test_data)

    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print("\n## Bag of uni- and bigrams (Linear SVC) ##\n")
    evaluate_iest.calculatePRF(test_labels, predictions.tolist())
def tfidf_bigrams_svm(train_data, train_labels, test_data, test_labels):
    """Train and score a linear SVM on tf-idf weighted unigrams and bigrams.

    The texts are vectorized as given (no trigger-word stripping and no
    feature scaling happen in this function). A ``TfidfVectorizer`` with
    ``ngram_range=(1, 2)`` is fitted on the training texts only;
    ``LinearSVC`` predictions on the test texts are handed to
    ``evaluate_iest.calculatePRF`` with ``test_labels``.

    Args:
        train_data: training texts.
        train_labels: labels aligned with ``train_data``.
        test_data: texts to classify.
        test_labels: gold labels aligned with ``test_data``.
    """
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        ngram_range=(1, 2)
    )
    classifier = sklearn.svm.LinearSVC()

    # N-gram vocabulary and idf weights come from the training texts alone.
    train_matrix = vectorizer.fit_transform(train_data)
    test_matrix = vectorizer.transform(test_data)

    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print("\n## Bag of uni- and bigrams tf-idf (Linear SVC) ##\n")
    evaluate_iest.calculatePRF(test_labels, predictions.tolist())
def main():
    """Load the training and trial data and run every baseline in turn.

    The data directory comes from the command line (see ``arguments``).
    Training texts and labels are read from the tokenized training file;
    trial texts come from the tokenized trial file (its label column is
    discarded) and the trial gold labels from the separate labels file.
    """
    args = arguments()
    train_path = os.path.join(args.datadir, "train-v3.csv_tokenized.txt")
    trial_path = os.path.join(args.datadir, "trial-v3.csv_tokenized.txt")
    gold_path = os.path.join(args.datadir, "trial-v3.labels")

    train_data, train_labels = read_train_data(train_path)
    trial_data, _ = read_train_data(trial_path)
    trial_labels = read_labels(gold_path)

    print("# Some baselines for IEST #")
    # The majority baseline only looks at labels, so it is called separately.
    majority_baseline(train_labels, trial_labels)

    text_baselines = (
        bow_baseline_naive_bayes,
        tfidf_baseline_naive_bayes,
        bow_baseline_svm,
        tfidf_baseline_svm,
        bigrams_svm,
        tfidf_bigrams_svm,
    )
    for run_baseline in text_baselines:
        run_baseline(train_data, train_labels, trial_data, trial_labels)


if __name__ == "__main__":
    main()