
Commit 8621d9c

Load model
1 parent 7ddbc58 commit 8621d9c

2 files changed: 173 additions, 1 deletion

README.md

Lines changed: 17 additions & 1 deletion
@@ -1 +1,17 @@
# Triplizer

```python
from corefextraction import InformationExtractor
extractor = InformationExtractor(coreference=True)
text = 'Paul Allen was born on January 21, 1953, in Seattle, Washington, to Kenneth Sam Allen and Edna Faye Allen. Allen attended Lakeside School, a private school in Seattle, where he befriended Bill Gates, two years younger, with whom he shared an enthusiasm for computers.'
triples = extractor.process(text)
for triple in triples:
    print(triple)
'''
Output:
Paul allen was born on january 21 , 1953.
Paul allen attended lakeside school.
Paul allen befriended bill gates.
Paul allen shared an enthusiasm for computers.
'''
```
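For comparison, `sent_tokenize` in corefextraction.py (added in this commit) also accepts a pre-split list of sentences and skips coreference resolution when `coreference=False`. A minimal sketch of that path, not part of the committed README, with example sentences made up for illustration:

```python
from corefextraction import InformationExtractor

# With coreference disabled, a list input is passed through unchanged
# and each sentence is processed independently.
extractor = InformationExtractor(coreference=False)

sentences = [
    'Paul Allen was born on January 21, 1953, in Seattle, Washington.',
    'Paul Allen attended Lakeside School, a private school in Seattle.',
]

for triple in extractor.process(sentences):
    print(triple)
```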

corefextraction.py

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
import re
import torch
import spacy
import neuralcoref
from allennlp.predictors.predictor import Predictor
from nltk.tokenize import sent_tokenize, word_tokenize


# A (subject, verb, object) triple extracted from a single sentence.
class Tripple:
    def __init__(self, arg0, verb, arg1):
        self.subject = arg0.lower()
        self.verb = verb.lower()
        self.object = arg1.lower()

    def __str__(self):
        return f'{self.subject.capitalize()} {self.verb} {self.object}. '

    def __repr__(self):
        return f'Tripple({self.subject!r},{self.verb!r},{self.object!r})'

    def __eq__(self, other):
        # Two triples match when one verb/object pair is contained in the other.
        return ((self.verb in other.verb) and (other.object in self.object)) or \
               ((other.verb in self.verb) and (self.object in other.object))

    def __len__(self):
        return len(str(self))


class InformationExtractor():
    def __init__(self, coreference=False):
        # AllenNLP Open IE predictor; moved to the GPU when one is available.
        self.predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz")
        if torch.cuda.is_available():
            self.predictor._model = self.predictor._model.cuda(0)

        self.spacy_pipeline = spacy.load('en')
        self.coreference = coreference
        if self.coreference:
            # Add neuralcoref to the spaCy pipeline for pronoun resolution.
            coref = neuralcoref.NeuralCoref(self.spacy_pipeline.vocab)
            self.spacy_pipeline.add_pipe(coref, name='neuralcoref')

    # Returns a closure that accumulates 'KEY: value' strings into a dict
    # (not used elsewhere in this module).
    def Arguments(self):
        _dict = dict({})

        def dict_instance(string):
            values = string.split(': ')
            if len(values) > 1:
                _dict[values[0]] = values[1]
            return _dict

        return dict_instance

    def find_tripples(self, string):
        tripples = []
        extraction = self.predictor.predict(
            sentence=string
        )
        # print(extraction)
        for phrase in extraction['verbs']:
            args = dict({})
            subject = None
            action = None
            object1 = None
            object2 = None
            # Pull the labelled '[LABEL: span]' chunks out of the predictor's description string.
            matches = re.findall(r'\[(.+?)\]', phrase['description'])
            for x in matches:
                keyValues = x.split(': ')
                if len(keyValues) > 1:
                    args[keyValues[0]] = keyValues[1]
            if 'ARG0' in args:
                subject = args['ARG0']
            if 'ARG1' in args:
                object1 = args['ARG1']
            if 'ARG2' in args:
                if object1 is not None:
                    object1 = object1 + ' ' + args['ARG2']
                else:
                    object1 = args['ARG2']
            if 'V' in args:
                action = args['V']
            if 'BV' in args:
                action = args['BV'] + ' ' + action
            if 'AV' in args:
                action = action + ' ' + args['AV']

            if subject and action and object1:
                new_tripple = Tripple(subject, action, object1)
                # tripples.append(new_tripple)
                # print(new_tripple)
                if len(tripples):
                    old_tripple = tripples[-1]
                    if old_tripple == new_tripple:
                        # Keep the candidate with the more complete verb phrase.
                        if len(new_tripple.verb) > len(old_tripple.verb):
                            tripples[-1] = new_tripple
                    else:
                        tripples.append(new_tripple)
                else:
                    tripples.append(new_tripple)

        # if tripples:
        #     return max(tripples, key=len)
        # else:
        #     return None
        return tripples

    def process(self, text):
        # Split the input into sentences (with optional coreference resolution),
        # extract triples per sentence, and flatten the result.
        sentences = self.sent_tokenize(text)
        tripples = [self.find_tripples(sent) for sent in sentences]
        tripples = [sent for sent in tripples if sent is not None]
        output = []
        for tripple in tripples:
            output += tripple

        return output

    def sent_tokenize(self, input_):
        # Accepts either a raw string or a pre-split list of sentences.
        if not self.coreference:
            if isinstance(input_, list):
                sentences = input_
            else:
                document = self.spacy_pipeline(input_)
                sentences = [str(sent) for sent in document.sents]
        else:
            if isinstance(input_, list):
                document = self.spacy_pipeline(" ".join(input_))
                sentences = input_
            else:
                document = self.spacy_pipeline(input_)
                sentences = [str(sent) for sent in document.sents]

            if document._.has_coref:
                sentences = self.get_resolved(document, sentences)

        output = sentences
        return output

    # Rewrite each sentence with every coreference mention replaced by the
    # text of its cluster's main mention.
    def get_resolved(self, doc, sentences):
        def get_2d_element(arrays, index):
            # Map a document-level token index to (sentence index, token index).
            j = index
            lens = [len(sent) for sent in arrays]
            for i, length in enumerate(lens):
                j = j - length
                if j < 0:
                    return i, length + j

        resolved_list = []
        tokenizer = spacy.load('en')
        for sent in sentences:
            resolved_list.append(list(tok.text_with_ws for tok in tokenizer(sent)))

        for cluster in doc._.coref_clusters:
            for coref in cluster:
                if coref != cluster.main:
                    # Replace the first token of the mention with the main mention's
                    # text and blank out the remaining tokens of the span.
                    ind1, ind2 = get_2d_element(resolved_list, coref.start)
                    resolved_list[ind1][ind2] = cluster.main.text + doc[coref.end - 1].whitespace_
                    for i in range(coref.start + 1, coref.end):
                        ind3, ind4 = get_2d_element(resolved_list, i)
                        resolved_list[ind3][ind4] = ""

        output = [''.join(sublist) for sublist in resolved_list]
        return output
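A small illustration of the deduplication step in `find_tripples`: `Tripple.__eq__` treats two triples as equal when one verb/object pair is contained in the other, and the candidate with the longer verb phrase wins. The values below are made up for illustration and assume the module's dependencies are importable:

```python
from corefextraction import Tripple

shorter = Tripple('Paul Allen', 'born', 'on January 21, 1953')
longer = Tripple('Paul Allen', 'was born', 'on January 21, 1953')

# Containment-based equality: 'born' is a substring of 'was born' and the
# objects contain each other, so the two triples compare equal.
print(shorter == longer)   # True

# find_tripples keeps the one with the longer verb phrase; its string form
# is what the README example prints.
print(str(longer))         # 'Paul allen was born on january 21, 1953. '
```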

0 commit comments
