-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon.py
101 lines (80 loc) · 2.25 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from math import log
from nltk.tokenize import TreebankWordTokenizer
from collections import defaultdict, Counter
from random import random, randint
from glob import glob
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from common import *
from operator import itemgetter
import re
import argparse
import time
import string
import pickle
import os.path
import shutil
import sys
# Extend the standard ASCII punctuation set with full-width / CJK /
# typographic ("smart quote") punctuation so downstream filtering also
# strips characters common in Chinese, Turkish, etc. text.
# NOTE: this mutates the shared `string` module state for the whole process.
string.punctuation += '°!”#€%&/()=?`±´*——:;》@…~≤÷≈‚。,-’*^_•|<>,./¢∞§¶•【】=」¥“'
# Language tags used to key per-language resources and thresholds.
TAG_EN = 'en'    # English
TAG_ZH = 'cmn'   # Mandarin Chinese
TAG_TR = 'tr'    # Turkish
TAG_UZ = 'uz'    # Uzbek
TAG_HA = 'ha'    # Hausa
UNASSIGNED = -1  # sentinel: item not yet assigned
# Sentinel node/level identifiers (presumably for a topic-model tree —
# TODO confirm against the caller; note DOCUMENT shares -1 with UNASSIGNED).
DOCUMENT = -1
TOPIC = -2
kTOKENIZER = TreebankWordTokenizer()  # shared tokenizer instance
CACHE_DIR = './cache'  # where pickled intermediates are stored
EMPTYWORD = ''         # placeholder for a removed/empty token
MIN_LENGTH = 3         # minimum token length kept (most languages)
MIN_LENGTH_CMN = 2     # Chinese words are shorter; lower cutoff
# Per-language score threshold; defaults to 0.0 for unlisted languages.
threshold = defaultdict(float)
threshold[TAG_EN] = 0.2
threshold[TAG_ZH] = 0.25
threshold[TAG_TR] = 0.3
def dict_sample(d, cutoff=-1):
    """
    Sample a key from a dictionary using the values as probabilities (unnormalized).

    d: mapping of key -> positive weight (weights need not sum to 1)
    cutoff: sampling point in [0, 1); -1 (the default) draws a fresh
            uniform random number
    Returns one of d's keys.
    """
    if cutoff == -1:
        cutoff = random()
    normalizer = float(sum(d.values()))
    current = 0.0
    chosen = None
    for key in d:
        assert d[key] > 0  # zero/negative weights would corrupt the CDF walk
        chosen = key
        current += float(d[key]) / normalizer
        if current >= cutoff:
            return key
    # Floating-point rounding can leave `current` fractionally below 1.0,
    # so the loop can fall through even for a valid cutoff.  The original
    # printed a diagnostic and returned None here; return the last key
    # instead so the sampler always yields a valid key.
    return chosen
def lgammln(xx):
    """
    Return the natural log of the gamma function evaluated at xx.

    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
    Uses the classic 6-term Lanczos series approximation
    (originally copied from stats.py by [email protected]).

    Usage: lgammln(xx)
    """
    assert xx > 0, "Arg to gamma function must be > 0; got %f" % xx
    series_coeffs = (76.18009173, -86.50532033, 24.01409822,
                     -1.231739516, 0.120858003e-2, -0.536382e-5)
    z = xx - 1.0
    # Shifted argument for the leading (z + 5.5)^(z+0.5) * e^-(z+5.5) factor,
    # folded directly into log space.
    lead = z + 5.5
    lead -= (z + 0.5) * log(lead)
    # Accumulate the rational series: 1 + sum(c_j / (z + j + 1)).
    accum = 1.0
    denom = z
    for c in series_coeffs:
        denom += 1
        accum += c / denom
    return -lead + log(2.50662827465 * accum)
def tokenize_file(filename):
    """
    Yield the tokens of the text file at `filename`, in order, using the
    module-level Treebank tokenizer.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original relied on garbage collection to close it).
    with open(filename) as handle:
        contents = handle.read()
    for token in kTOKENIZER.tokenize(contents):
        yield token
class RandomWrapper:
    """
    Stand-in for a random number generator that replays a predetermined
    sequence of values, enabling deterministic tests.
    """

    def __init__(self, buff):
        # Reverse in place so each call can pop() from the tail (O(1))
        # while still yielding values in their original order.
        # NOTE: this mutates the caller's list.
        buff.reverse()
        self._buffer = buff

    def __call__(self):
        next_val = self._buffer.pop()
        print("Using random value %0.2f" % next_val)
        return next_val