-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathhmm.py
More file actions
159 lines (126 loc) · 5.91 KB
/
Copy pathhmm.py
File metadata and controls
159 lines (126 loc) · 5.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# hmm.py
# Using Python 3.4.3
import math
from functools import reduce
from cs_model import CodeSModel
from collections import namedtuple
"""A namedtuple for use with the hidden Markov model, specifically made for
use with the viterbi algorithm.
"""
HMMNode = namedtuple("HMMNode", ["prob", "p_tag"])
class HiddenMarkovModel:
"""Represents a Hidden Markov Model. The Hidden Markov Model, or HMM
is a statistical Markov Model made to observe a Markov process
where the results are known, but the state of the process is not.
Args:
words (list<str>): list of words
tag_set (list<str>): list of tags
transi_matrix (dict<str -> dict<str -> float>>): transition matrix.
See the method in eval.py. As of now, is a 2x2 matrix.
cs_model (CodeSModel): The code switched language model of the
corpus.
Properties:
words (list<str>): list of words
tag_set (list<str>): list of tags
transi_matrix (dict<str -> dict<str -> float>>): transition matrix.
See the method in eval.py. As of now, is a 2x2 matrix.
cs_model (CodeSModel): The code switched language model of the
corpus.
v (list<list<HMMNode>>): A series of nodes representiing the provided
words. This will be filled with useful info once the method
viterbi is called, and the data processed. There is one set of
nodes for each word, and each of these sets carry one node for
every language tag.
"""
def __init__(self, words, tag_set, transi_matrix, cs_model):
self.words = words
self.tag_set = tag_set
self.transi_matrix = transi_matrix
self.cs_model = cs_model
self.v = [[HMMNode(0, 0) for _ in range(len(tag_set))]
for _ in range(len(words))]
def gen_tags(self):
"""Generate the tags of the language using the viterbi alogrithm to
determine the probabilities, and then determining most likely tags.
Return:
list<str>: A list of tags generated by the retrace method
"""
self.viterbi()
return self.retrace()
def em(self, lang, word):
"""Find the emission property, or how likely it a certain observable
state has been generated from a state.
Here, the observable state is the word, and the hidden state is the
context language.
Args:
lang (str): Which language to match the word to
word (str): Which word to check the likelihood of belonging to a
language
Return:
The probability for a word to exist, given a context
"""
return self.cs_model.prob(lang, word)
def tr(self, ctx, tag):
"""Determines the transmission probability of a node.
The transmission probability is the likelihood of a single state
to transition to another state. That state can be either the
same or different from the original state.
Args:
ctx (str): the source state
tag (str): the target state
Return:
float: probability of transitioning from the given state to the
new state.
"""
return self.transi_matrix[ctx][tag]
def viterbi(self):
"""Runs the viterbi algorithm on the setup of the hidden Markov model.
The following is a description of the algorithm:
in the first node set, for each language, assume a base possibility
for each word
for each language node
craft the set of possible nodes (previous to current state)
determine the most likely previous node for each current node
save the node with a reference to the most likely past node
At the end of the Viterbi algorithm, the most likely paths
from the available bases should be generated.
"""
for k, tag in enumerate(self.tag_set):
self.v[0][k] = HMMNode(math.log(0.5), -1)
for word_index, word in enumerate(self.words[1:]):
# Skip over first word
if word_index == 0:
pass
for tag_index, next_tag in enumerate(self.tag_set):
tran_prob = []
for old_tag_index, prev_tag in enumerate(self.tag_set):
prev_prob = self.v[word_index - 1][old_tag_index].prob + \
self.tr(prev_tag, next_tag)
prev_node = HMMNode(prev_prob, old_tag_index)
tran_prob.append(prev_node)
nextnode = reduce( # Looks for the node with highest prob
lambda n1, n2: n1 if n1.prob > n2.prob else n2,
tran_prob)
em_prob = self.em(self.tag_set[tag_index],
self.words[word_index])
self.v[word_index][tag_index] = HMMNode(em_prob +
nextnode.prob,
nextnode.p_tag)
def retrace(self):
"""Reverse traverses the graph generated by the viterbi algorithm to
find the best path (the one with the highest probability)
Returns:
list<str>: the most likely tag combinations
"""
tags = [""] * len(self.words)
# Find most probable final tag
last = reduce(lambda x, y: x if self.v[len(self.words) - 1][x].prob >
self.v[len(self.words) - 1][y].prob else y,
range(len(self.tag_set)))
tags[len(self.words) - 1] = self.tag_set[last]
# Follow backpointers to most probable previous tags
prev = self.v[len(self.words) - 1][last].p_tag
for k in range(len(self.words) - 2, -1, -1):
tags[k] = self.tag_set[prev]
prev = self.v[k][prev].p_tag
return tags