python-tagger/hmm.py at master · Bilingual-Annotation-Task-Force/python-tagger · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#  hmm.py
#  Using Python 3.4.3

import math
from functools import reduce
from cs_model import CodeSModel
from collections import namedtuple


"""A namedtuple for use with the hidden Markov model, specifically made for
    use with the viterbi algorithm.
"""
HMMNode = namedtuple("HMMNode", ["prob", "p_tag"])


class HiddenMarkovModel:
    """Represents a Hidden Markov Model. The Hidden Markov Model, or HMM
        is a statistical Markov Model made to observe a Markov process
        where the results are known, but the state of the process is not.

    Args:
        words (list<str>): list of words
        tag_set (list<str>): list of tags
        transi_matrix (dict<str -> dict<str -> float>>): transition matrix.
            See the method in eval.py. As of now, is a 2x2 matrix.
        cs_model (CodeSModel): The code switched language model of the
            corpus.

    Properties:
        words (list<str>): list of words
        tag_set (list<str>): list of tags
        transi_matrix (dict<str -> dict<str -> float>>): transition matrix.
            See the method in eval.py. As of now, is a 2x2 matrix.
        cs_model (CodeSModel): The code switched language model of the
            corpus.
        v (list<list<HMMNode>>): A series of nodes representiing the provided
            words.  This will be filled with useful info once the method
            viterbi is called, and the data processed. There is one set of
            nodes for each word, and each of these sets carry one node for
            every language tag.

    """

    def __init__(self, words, tag_set, transi_matrix, cs_model):
        self.words = words
        self.tag_set = tag_set
        self.transi_matrix = transi_matrix
        self.cs_model = cs_model
        self.v = [[HMMNode(0, 0) for _ in range(len(tag_set))]
                  for _ in range(len(words))]

    def gen_tags(self):
        """Generate the tags of the language using the viterbi alogrithm to
            determine the probabilities, and then determining most likely tags.

        Return:
            list<str>: A list of tags generated by the retrace method
        """
        self.viterbi()
        return self.retrace()

    def em(self, lang, word):
        """Find the emission property, or how likely it a certain observable
            state has been generated from a state.

        Here, the observable state is the word, and the hidden state is the
            context language.

        Args:
            lang (str): Which language to match the word to
            word (str): Which word to check the likelihood of belonging to a
                language

        Return:
            The probability for a word to exist, given a context
        """
        return self.cs_model.prob(lang, word)

    def tr(self, ctx, tag):
        """Determines the transmission probability of a node.

        The transmission probability is the likelihood of a single state
            to transition to another state. That state can be either the
            same or different from the original state.

        Args:
            ctx (str): the source state
            tag (str): the target state

        Return:
            float: probability of transitioning from the given state to the
                new state.
        """
        return self.transi_matrix[ctx][tag]

    def viterbi(self):
        """Runs the viterbi algorithm on the setup of the hidden Markov model.

        The following is a description of the algorithm:
            in the first node set, for each language, assume a base possibility
            for each word
                for each language node
                    craft the set of possible nodes (previous to current state)
                determine the most likely previous node for each current node
                save the node with a reference to the most likely past node

        At the end of the Viterbi algorithm, the most likely paths
            from the available bases should be generated.
        """
        for k, tag in enumerate(self.tag_set):
            self.v[0][k] = HMMNode(math.log(0.5), -1)

        for word_index, word in enumerate(self.words[1:]):
            # Skip over first word
            if word_index == 0:
                pass

            for tag_index, next_tag in enumerate(self.tag_set):
                tran_prob = []

                for old_tag_index, prev_tag in enumerate(self.tag_set):
                    prev_prob = self.v[word_index - 1][old_tag_index].prob + \
                                self.tr(prev_tag, next_tag)
                    prev_node = HMMNode(prev_prob, old_tag_index)
                    tran_prob.append(prev_node)

                nextnode = reduce(  # Looks for the node with highest prob
                    lambda n1, n2: n1 if n1.prob > n2.prob else n2,
                    tran_prob)

                em_prob = self.em(self.tag_set[tag_index],
                                  self.words[word_index])
                self.v[word_index][tag_index] = HMMNode(em_prob +
                                                        nextnode.prob,
                                                        nextnode.p_tag)

    def retrace(self):
        """Reverse traverses the graph generated by the viterbi algorithm to
            find the best path (the one with the highest probability)

        Returns:
            list<str>: the most likely tag combinations
        """
        tags = [""] * len(self.words)

        # Find most probable final tag
        last = reduce(lambda x, y: x if self.v[len(self.words) - 1][x].prob >
                      self.v[len(self.words) - 1][y].prob else y,
                      range(len(self.tag_set)))

        tags[len(self.words) - 1] = self.tag_set[last]

        # Follow backpointers to most probable previous tags
        prev = self.v[len(self.words) - 1][last].p_tag
        for k in range(len(self.words) - 2, -1, -1):
            tags[k] = self.tag_set[prev]
            prev = self.v[k][prev].p_tag

        return tags