-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathsample_rules.py
58 lines (41 loc) · 1.44 KB
/
sample_rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# encoding: utf-8
import nltk
from wsd import LOG, Rule, Ruleset
class AllSentencesRule(Rule):
    """Rule that matches every sentence in the text, regardless of length."""

    def __init__(self, ruleset):
        """Register this rule with *ruleset* under the name 'All sentences'."""
        Rule.__init__(self, ruleset, 'All sentences')

    def itermatches(self, text):
        """Yield ``(pattern, spans)`` pairs for each pattern of this rule.

        ``spans`` is an iterator over the ``(start, end)`` character offsets
        of every sentence found by the ruleset's sentence tokenizer.
        """
        for pattern in self.patterns:
            # iter(...) instead of the unidiomatic explicit .__iter__() call.
            yield pattern, iter(self.ruleset.tokenizer.span_tokenize(text))
class LongSentencesRule(Rule):
    """Rule that matches only sentences longer than ``MIN_LENGTH`` characters."""

    # Length threshold in characters (exclusive): spans longer than this match.
    MIN_LENGTH = 200

    def __init__(self, ruleset):
        """Register this rule with *ruleset* under the name 'Long sentences'."""
        Rule.__init__(self, ruleset, 'Long sentences')

    def itermatches(self, text):
        """Yield ``(pattern, spans)`` pairs for each pattern of this rule.

        ``spans`` is a generator over the ``(start, end)`` offsets of
        sentences whose length exceeds ``MIN_LENGTH``.
        """
        for pattern in self.patterns:
            yield pattern, (
                span
                # `for ... in` iterates directly; no explicit .__iter__() needed.
                for span in self.ruleset.tokenizer.span_tokenize(text)
                if span[1] - span[0] > self.MIN_LENGTH
            )
class ShortSentencesRule(Rule):
    """Rule that matches only sentences shorter than ``MAX_LENGTH`` characters."""

    # Length threshold in characters (exclusive): spans shorter than this match.
    MAX_LENGTH = 40

    def __init__(self, ruleset):
        """Register this rule with *ruleset* under the name 'Short sentences'."""
        Rule.__init__(self, ruleset, 'Short sentences')

    def itermatches(self, text):
        """Yield ``(pattern, spans)`` pairs for each pattern of this rule.

        ``spans`` is a generator over the ``(start, end)`` offsets of
        sentences whose length is below ``MAX_LENGTH``.
        """
        for pattern in self.patterns:
            yield pattern, (
                span
                # `for ... in` iterates directly; no explicit .__iter__() needed.
                for span in self.ruleset.tokenizer.span_tokenize(text)
                if span[1] - span[0] < self.MAX_LENGTH
            )
class SampleRuleset(Ruleset):
    """Sample ruleset bundling the short/long/all sentence rules."""

    def __init__(self):
        """Build the ruleset: load the tokenizer and instantiate each rule."""
        Ruleset.__init__(self, 'Sample ruleset')
        # Punkt sentence tokenizer shared by every rule in this set.
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Same rules, same order as before (shortest-filter first).
        self.rules = [ShortSentencesRule(self),
                      LongSentencesRule(self),
                      AllSentencesRule(self)]
def get_rulesets():
    """Return the list of rulesets exported by this module."""
    return [SampleRuleset()]