forked from sequitur-g2p/sequitur-g2p
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgroupedCounts.py
113 lines (96 loc) · 3.13 KB
/
groupedCounts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
__author__ = 'Maximilian Bisani'
__version__ = '$LastChangedRevision: 1667 $'
__date__ = '$LastChangedDate: 2007-06-02 16:32:35 +0200 (Sat, 02 Jun 2007) $'
__copyright__ = 'Copyright (c) 2004-2005 RWTH Aachen University'
__license__ = """
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License Version 2 (June
1991) as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, you will find it at
http://www.gnu.org/licenses/gpl.html, or write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
USA.
Should a provision of no. 9 and 10 of the GNU General Public License
be invalid or become invalid, a valid provision is deemed to have been
agreed upon which comes closest to what the parties intended
commercially. In any case guarantee/warranty shall be limited to gross
negligent actions or intended actions or fraudulent concealment.
"""
import marshal, os
from mGramCounts import AbstractFileStorage
class StoredCounts(AbstractFileStorage):
    """Gzip-compressed on-disk sequence of (history, values) records.

    Records are written and read back through a ``gzip`` subprocess, one
    after another: the history via ``marshal`` and the values via
    ``SparseVector.dump`` / ``SparseVector.load``.  The target path is
    ``self.fname``, which this class never assigns itself (presumably
    AbstractFileStorage sets it -- confirm in mGramCounts).

    NOTE(review): ``self.fname`` is interpolated into a shell command
    without quoting, so paths containing whitespace or shell
    metacharacters will not work as intended.
    """

    def write(self, seq):
        """Write every (history, values) pair of *seq* to the file.

        The pipe is closed even when iterating *seq* or dumping a record
        raises, so the gzip child process is not left dangling.
        """
        pipe = os.popen('gzip -fc >%s' % self.fname, 'wb')
        try:
            for history, values in seq:
                marshal.dump(history, pipe)
                SparseVector.dump(values, pipe)
        finally:
            pipe.close()

    def __iter__(self):
        """Yield the stored (history, values) pairs in the order written.

        Reading stops at the first EOFError raised by either load call.
        The pipe is closed when the generator finishes, fails, or is
        closed early by the consumer.
        """
        pipe = os.popen('gzip -dc %s' % self.fname, 'rb')
        try:
            while True:
                try:
                    history = marshal.load(pipe)
                    values = SparseVector.load(pipe)
                except EOFError:
                    break
                # Yield outside the except scope so only the load calls
                # are treated as end-of-data detection.
                yield (history, values)
        finally:
            pipe.close()
def store(seq, big=False, filename=None):
    """Materialise *seq* so that it can be iterated later.

    With ``big`` false (the default) the items are collected into a plain
    list and *filename* is ignored.  With ``big`` true a
    ``StoredCounts(filename)`` object is created, *seq* is written
    through its ``write`` method, and that object is returned instead.
    """
    if not big:
        return list(seq)
    stored = StoredCounts(filename)
    stored.write(seq)
    return stored
from misc import restartable
import SparseVector
# Short module-level aliases for the SparseVector primitives used below:
# Counts is called with a list of (predicted, value) pairs in contract(),
# sumCounts is called with a list of terms in CountsAccumulator.
Counts = SparseVector.sparse
sumCounts = SparseVector.sumSparse
class NonMonotonousHistoriesError(RuntimeError):
    """Raised by contract() when a history compares less than its predecessor.

    The exception arguments are the previous history followed by the
    offending one.
    """
def contract(seq):
    """Group consecutive ((history, predicted), value) items by history.

    For every run of items sharing the same history, yields one
    ``(history, Counts(pairs))`` tuple where *pairs* is the list of
    ``(predicted, value)`` entries of that run, in input order.

    An empty *seq* yields nothing.

    Raises:
        NonMonotonousHistoriesError: if a history compares less than the
            one of the preceding run, i.e. the input is not ordered by
            history.
    """
    it = iter(seq)
    try:
        # next(it) instead of it.next(): the method form exists only on
        # Python 2 iterators.
        (history, predicted), value = next(it)
    except StopIteration:
        # End explicitly on empty input.  Letting StopIteration escape
        # from a generator body is a RuntimeError on Python 3.7+.
        return
    values = [(predicted, value)]
    for (h, p), v in it:
        if h != history:
            if h < history:
                raise NonMonotonousHistoriesError(history, h)
            # History changed: emit the finished run and start a new one.
            yield history, Counts(values)
            history = h
            values = []
        values.append((p, v))
    # Emit the final, still open run.
    yield history, Counts(values)
# Wrap the generator function with misc.restartable (presumably so the
# result can be iterated more than once -- confirm in misc).
contract = restartable(contract)
class CountsAccumulator(object):
    """Collect many count terms and sum them in batches.

    Terms are kept in ``self.terms``, a list of three levels.  New terms
    land in level 0; once that level holds more than 64 entries,
    ``shrink`` replaces each level holding at least 64 entries by its
    ``sumCounts`` total, pushed onto the next level (presumably to keep
    each individual sumCounts call small -- confirm against SparseVector).
    """

    def __init__(self):
        """Start with three empty levels."""
        self.terms = [list(), list(), list()]

    def set(self, initial=None):
        """Discard all terms and restart with *initial* as the only one."""
        self.terms = [[initial], list(), list()]

    def shrink(self):
        """Collapse levels holding at least 64 terms, lowest level first.

        Stops at the first level that is below the threshold.  The last
        level has no successor, so its total is kept in place as its
        single entry.
        """
        level = 0
        while level < 3 and len(self.terms[level]) >= 64:
            total = sumCounts(self.terms[level])
            if level + 1 < len(self.terms):
                self.terms[level + 1].append(total)
                self.terms[level] = []
            else:
                self.terms[level] = [total]
            level += 1

    def __iadd__(self, counts):
        """Append *counts* as a new term; shrink when level 0 exceeds 64."""
        pending = self.terms[0]
        pending.append(counts)
        if len(pending) > 64:
            self.shrink()
        return self

    def sum(self):
        """Return ``sumCounts`` over all terms of all levels, in level order."""
        flat = []
        for level_terms in self.terms:
            flat.extend(level_terms)
        return sumCounts(flat)
def sumLotsOfCounts(counts):
    """Return the total of all items in *counts*.

    Every item is fed into a fresh CountsAccumulator and the
    accumulator's ``sum()`` is returned.
    """
    accumulator = CountsAccumulator()
    for term in counts:
        accumulator += term
    total = accumulator.sum()
    return total