import random
from numpy import zeros, sign
from math import exp, log
from collections import defaultdict
import json
import argparse

kSEED = 1701
kBIAS = "BIAS_CONSTANT"
random.seed(kSEED)


def sigmoid(score, threshold=20.0):
    """
    Note: Prevents overflow of exp by capping the magnitude of score at
    threshold (20 by default).

    :param score: A real valued number to convert into a number between 0 and 1
    """

    # You'll want to use this function, but don't modify it
    if abs(score) > threshold:
        score = threshold * sign(score)

    activation = exp(score)
    return activation / (1.0 + activation)
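
# Example: scores past the threshold saturate, so sigmoid(25.0) returns the
# same value as sigmoid(20.0), roughly 1 - 2e-9.
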
class Example:
"""
Class to represent a logistic regression example
"""
def __init__(self, json_line, vocab, use_bias=True):
"""
Create a new example
json_line -- The json object that contains the label ("label") and features as fields
vocab -- The vocabulary to use as features (list)
        use_bias -- Include a bias feature (should be False with PyTorch)
"""
# Use but don't modify this function
self.nonzero = {}
self.y = 1 if json_line["label"] else 0
self.x = zeros(len(vocab))
for feature in json_line:
if feature in vocab:
assert feature != kBIAS, "Bias can't actually appear in document"
self.x[vocab.index(feature)] += float(json_line[feature])
self.nonzero[vocab.index(feature)] = feature
# Initialize the bias feature
if use_bias:
self.x[0] = 1
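

# A hypothetical jsonl input line (generic feature names for illustration):
#   {"label": true, "feature_a": 1.0, "feature_b": 0.5}
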
class LogReg:
def __init__(self, num_features, learning_rate=0.001):
"""
Create a logistic regression classifier
num_features -- The number of features (including bias)
        learning_rate -- The size of each stochastic gradient step
"""
# You *may* want to add additional data members here
self.beta = zeros(num_features)
        self.learning_rate = learning_rate

def progress(self, examples):
"""
Given a set of examples, compute the probability and accuracy, which is returned as a tuple.
examples -- The dataset to score
"""
# You probably don't need to modify this code
logprob = 0.0
num_right = 0
for ii in examples:
p = sigmoid(self.beta.dot(ii.x))
if ii.y == 1:
logprob += log(p)
else:
logprob += log(1.0 - p)
            # Get accuracy: correct when thresholding p at 0.5 matches the label
if abs(ii.y - p) < 0.5:
num_right += 1
return logprob, float(num_right) / float(len(examples))

    def sg_update(self, train_example):
        """
        Compute a stochastic gradient update to improve the log likelihood
        and return the new feature weights.

        train_example -- The example to take the gradient with respect to
        """

        # A minimal sketch of the standard unregularized update: step along
        # the gradient of this example's log likelihood (derivation below)
        p = sigmoid(self.beta.dot(train_example.x))
        self.beta += self.learning_rate * (train_example.y - p) * train_example.x
        return self.beta
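
    # For reference: for a single example (x, y), the log likelihood is
    #   log L(beta) = y * log(sigmoid(beta . x)) + (1 - y) * log(1 - sigmoid(beta . x))
    # and its gradient with respect to beta is (y - sigmoid(beta . x)) * x,
    # the step direction used in sg_update above.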

    def inspect(self, vocab, limit=10):
        """
        A function to find the top features.
        """
        # A minimal sketch: print the limit features with the largest-magnitude weights
        order = sorted(range(len(self.beta)),
                       key=lambda ii: abs(self.beta[ii]), reverse=True)
        for ii in order[:limit]:
            print("%s\t%f" % (vocab[ii], self.beta[ii]))


def read_dataset(filename, vocab):
"""
Reads in a text dataset with a given vocabulary
filename -- json lines file of the dataset
"""
# You should not need to modify this function
assert vocab[0] == kBIAS, \
"First vocab word must be bias term (was %s)" % vocab[0]
dataset = []
with open(filename) as infile:
for line in infile:
ex = Example(json.loads(line), vocab)
dataset.append(ex)
# Shuffle the data so that we don't have order effects
random.shuffle(dataset)
    return dataset


if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--step", help="Initial SG step size",
type=float, default=0.1, required=False)
argparser.add_argument("--vocab", help="Vocabulary of all features",
type=str, default="data/small_guess.vocab")
argparser.add_argument("--train", help="Training set",
type=str, default="data/small_guess.buzztrain.jsonl", required=False)
argparser.add_argument("--test", help="Test set",
type=str, default="data/small_guess.buzzdev.jsonl", required=False)
argparser.add_argument("--passes", help="Number of passes through train",
type=int, default=1, required=False)
args = argparser.parse_args()
with open(args.vocab, 'r') as infile:
vocab = [x.strip() for x in infile]
print("Loaded %i items from vocab %s" % (len(vocab), args.vocab))
train = read_dataset(args.train, vocab=vocab)
test = read_dataset(args.test, vocab=vocab)
print("Read in %i train and %i test" % (len(train), len(test)))
# Initialize model
lr = LogReg(len(vocab), args.step)
# Iterations
update_number = 0
for pp in range(args.passes):
for ii in train:
update_number += 1
lr.sg_update(ii)
if update_number % 100 == 1:
train_lp, train_acc = lr.progress(train)
ho_lp, ho_acc = lr.progress(test)
# lr.inspect(vocab)
print("Update %i\tTProb %f\tHProb %f\tTAcc %f\tHAcc %f" %
(update_number, train_lp, ho_lp, train_acc, ho_acc))
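
# Example invocation (assuming the default data files are present):
#   python sgd.py --step 0.1 --passes 2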