dataset_stats.py
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import movie_reviews, stopwords, subjectivity

from baseline import BaselineExperiment
from settings import STATS_SAVE_PATH
from utils import removeObjectiveSents
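
# Overview: this script computes corpus statistics for three dataset views:
#   1. MR      - the NLTK movie_reviews corpus, one token sequence per document;
#   2. MR_sjv  - the same corpus flattened to one token sequence per sentence;
#   3. SJV     - the NLTK subjectivity corpus (objective vs. subjective sentences);
# plus a fourth view, MR_clean_baseline, where a baseline subjectivity
# classifier first filters objective sentences out of MR.
# Note: the movie_reviews, subjectivity, and stopwords corpora must be
# available locally, e.g. via nltk.download("movie_reviews") and so on.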
def compute_stats(data, name, neg=None, pos=None):
    """Compute corpus statistics for `data`, a list of token sequences.

    `name` labels the dataset; `neg`/`pos`, when given, are the per-class
    subsets of `data` (also lists of token sequences) used for the
    class-level statistics.
    """
    stats = {}
    seq_lens = [len(seq) for seq in data]
    stats["num_sequences"] = len(data)
    stats["num_words"] = sum(seq_lens)
    stats["avg_seq_len"] = np.average(seq_lens).round(decimals=2)
    stats["std_seq_len"] = np.std(seq_lens).round(decimals=2)
    stats["max_seq_len"] = np.max(seq_lens)
    stats["min_seq_len"] = np.min(seq_lens)
    NLTK_STOP_WORDS = set(stopwords.words("english") + list(string.punctuation))
    lexicon = set(w for seq in data for w in seq)
    lexicon_filtered = {w for w in lexicon if w not in NLTK_STOP_WORDS}
    stats["lexicon_size"] = len(lexicon)
    stats["lexicon_size_no_stopwords"] = len(lexicon_filtered)
    if pos is not None and neg is not None:
        filtered_neg = [w for seq in neg for w in seq if w not in NLTK_STOP_WORDS]
        filtered_pos = [w for seq in pos for w in seq if w not in NLTK_STOP_WORDS]
        lexicon_intersection = set(filtered_neg).intersection(set(filtered_pos))
        stats["lexicon_intersection_size"] = len(lexicon_intersection)
        stats["most_common_words_neg"] = [w for w, _ in nltk.FreqDist(filtered_neg).most_common(10)]
        stats["most_common_words_pos"] = [w for w, _ in nltk.FreqDist(filtered_pos).most_common(10)]
        intersect_common_words = set(stats["most_common_words_neg"]).intersection(set(stats["most_common_words_pos"]))
        stats["most_common_words_intersect"] = list(intersect_common_words)
        # Words that appear in only one class (stop words already removed).
        neg_only_words = [w for w in set(filtered_neg) if w not in lexicon_intersection]
        pos_only_words = [w for w in set(filtered_pos) if w not in lexicon_intersection]
        stats["neg_only_words_len"] = len(neg_only_words)
        stats["neg_only_words"] = neg_only_words
        stats["pos_only_words_len"] = len(pos_only_words)
        stats["pos_only_words"] = pos_only_words
    return stats
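
# Illustrative usage (toy data, not part of the pipeline): with one
# single-document class per polarity,
#   toy_neg = [["dull", "plot", "."]]
#   toy_pos = [["great", "film", "!"]]
#   compute_stats(toy_neg + toy_pos, "toy", neg=toy_neg, pos=toy_pos)
# reports num_sequences=2, num_words=6, and, after stop word and punctuation
# removal, disjoint per-class lexicons ("dull"/"plot" vs. "great"/"film").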
if __name__ == "__main__":
    stats = {}

    # Movie review dataset: one flat token sequence per document.
    negative_fileids = movie_reviews.fileids("neg")
    positive_fileids = movie_reviews.fileids("pos")
    mr_neg_words = [movie_reviews.words(fileids=fileid) for fileid in negative_fileids]
    mr_pos_words = [movie_reviews.words(fileids=fileid) for fileid in positive_fileids]
    mr_neg_sents = [movie_reviews.sents(fileids=fileid) for fileid in negative_fileids]
    mr_pos_sents = [movie_reviews.sents(fileids=fileid) for fileid in positive_fileids]
    mr_sents = mr_neg_sents + mr_pos_sents
    mr_words = mr_neg_words + mr_pos_words
    stats["MR"] = compute_stats(mr_words, "MR", neg=mr_neg_words, pos=mr_pos_words)

    # Treat MR as a subjectivity-style dataset: one token sequence per sentence.
    mr_sjv = [sent for doc in mr_sents for sent in doc]
    stats["MR_sjv"] = compute_stats(mr_sjv, "MR_SJV")

    # Subjectivity dataset: fileids are sorted, so index 0 is the objective
    # split and index 1 the subjective one.
    obj_fileid = subjectivity.fileids()[0]   # plot.tok.gt9.5000
    subj_fileid = subjectivity.fileids()[1]  # quote.tok.gt9.5000
    obj_words = subjectivity.sents(fileids=obj_fileid)
    subj_words = subjectivity.sents(fileids=subj_fileid)
    sjv_words = obj_words + subj_words
    stats["SJV"] = compute_stats(sjv_words, "SJV", neg=obj_words, pos=subj_words)

    # Clean MR: train a baseline subjectivity classifier, then drop the
    # sentences it predicts as objective.
    exp_subjectivity = BaselineExperiment(task="subjectivity")
    sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
    mr_vectors = sjv_vectorizer.transform([" ".join(sent) for sent in mr_sjv])
    preds = sjv_classifier.predict(mr_vectors)
    mr_sents_filtered = removeObjectiveSents(mr_sents, preds, tokenized=True)
    # The first 1000 filtered documents are negative and the rest positive,
    # matching the mr_neg_sents + mr_pos_sents concatenation above.
    stats["MR_clean_baseline"] = compute_stats(
        mr_sents_filtered,
        "MR_clean_baseline",
        neg=mr_sents_filtered[:1000],
        pos=mr_sents_filtered[1000:],
    )

    stats_df = pd.DataFrame.from_dict(stats, orient="index")
    stats_df.to_csv(f"{STATS_SAVE_PATH}/datasets.csv")
    print(stats_df)
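
# The resulting CSV has one row per dataset view (MR, MR_sjv, SJV,
# MR_clean_baseline) and one column per statistic computed above; it is
# written to f"{STATS_SAVE_PATH}/datasets.csv".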