# -*- coding: utf-8 -*-
import re

import torch
import spacy
import neuralcoref
from allennlp.predictors.predictor import Predictor


class Tripple:
    """A lowercased (subject, verb, object) triple extracted from one clause."""

    def __init__(self, arg0, verb, arg1):
        self.subject = arg0.lower()
        self.verb = verb.lower()
        self.object = arg1.lower()

    def __str__(self):
        return f'{self.subject.capitalize()} {self.verb} {self.object}. '

    def __repr__(self):
        return f'Tripple({self.subject!r}, {self.verb!r}, {self.object!r})'

    def __eq__(self, other):
        # Substring containment instead of strict equality, so two
        # extractions of the same clause (one with a longer verb or object
        # span) compare as duplicates.
        return (((self.verb in other.verb) and (other.object in self.object))
                or ((other.verb in self.verb) and (self.object in other.object)))

    def __len__(self):
        return len(str(self))
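
# A quick illustration of the containment-based equality (hedged, made-up
# values): these two extractions of the same clause compare equal, so
# find_tripples below keeps only one of them.
#
#   Tripple('the cat', 'sat', 'on the mat quietly') \
#       == Tripple('the cat', 'had sat', 'on the mat')   # -> True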


class InformationExtractor:
    """Extracts subject-verb-object triples from raw text with AllenNLP OpenIE,
    optionally resolving coreferences with neuralcoref first."""

    def __init__(self, coreference=False):
        # Pre-trained AllenNLP Open Information Extraction model.
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz")
        if torch.cuda.is_available():
            self.predictor._model = self.predictor._model.cuda(0)

        self.spacy_pipeline = spacy.load('en')
        self.coreference = coreference
        if self.coreference:
            coref = neuralcoref.NeuralCoref(self.spacy_pipeline.vocab)
            self.spacy_pipeline.add_pipe(coref, name='neuralcoref')

    def Arguments(self):
        # Returns a closure that accumulates "key: value" strings into a
        # single shared dict. (Not used elsewhere in this class.)
        _dict = dict({})

        def dict_instance(string):
            values = string.split(': ')
            if len(values) > 1:
                _dict[values[0]] = values[1]
            return _dict
        return dict_instance
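
    # Hedged illustration of Arguments(): successive calls to the returned
    # closure build up one dict, e.g.
    #
    #   collect = extractor.Arguments()
    #   collect('ARG0: John')     # -> {'ARG0': 'John'}
    #   collect('V: bought')      # -> {'ARG0': 'John', 'V': 'bought'}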

    def find_tripples(self, string):
        tripples = []
        extraction = self.predictor.predict(sentence=string)
        for phrase in extraction['verbs']:
            args = dict({})
            subject = None
            action = None
            object1 = None
            # Each extraction is described as bracketed "TAG: span" pairs;
            # pull them out into a dict.
            matches = re.findall(r'\[(.+?)\]', phrase['description'])
            for x in matches:
                key_values = x.split(': ')
                if len(key_values) > 1:
                    args[key_values[0]] = key_values[1]
            if 'ARG0' in args:
                subject = args['ARG0']
            if 'ARG1' in args:
                object1 = args['ARG1']
            if 'ARG2' in args:
                # Fold ARG2 into the object span.
                if object1 is not None:
                    object1 = object1 + ' ' + args['ARG2']
                else:
                    object1 = args['ARG2']
            if 'V' in args:
                action = args['V']
                # Prepend/append the model's 'BV' and 'AV' spans to the
                # predicate when present.
                if 'BV' in args:
                    action = args['BV'] + ' ' + action
                if 'AV' in args:
                    action = action + ' ' + args['AV']

            if subject and action and object1:
                new_tripple = Tripple(subject, action, object1)
                if tripples and tripples[-1] == new_tripple:
                    # Duplicate of the previous extraction: keep whichever
                    # variant has the longer verb phrase.
                    if len(new_tripple.verb) > len(tripples[-1].verb):
                        tripples[-1] = new_tripple
                else:
                    tripples.append(new_tripple)
        return tripples
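
    # For a sentence like "John bought a car", the model's description is
    # roughly "[ARG0: John] [V: bought] [ARG1: a car]" (illustrative), which
    # the regex above turns into {'ARG0': 'John', 'V': 'bought', 'ARG1': 'a car'}
    # and finally Tripple('John', 'bought', 'a car').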


    def process(self, text):
        sentences = self.sent_tokenize(text)
        tripples = [self.find_tripples(sent) for sent in sentences]
        # Flatten the per-sentence lists of triples into a single list.
        output = []
        for tripple in tripples:
            output += tripple
        return output


    def sent_tokenize(self, input_):
        if not self.coreference:
            if isinstance(input_, list):
                sentences = input_
            else:
                document = self.spacy_pipeline(input_)
                sentences = [str(sent) for sent in document.sents]
        else:
            if isinstance(input_, list):
                # Join pre-split sentences so coreference can be resolved
                # across the whole document.
                document = self.spacy_pipeline(" ".join(input_))
                sentences = input_
            else:
                document = self.spacy_pipeline(input_)
                sentences = [str(sent) for sent in document.sents]

            if document._.has_coref:
                sentences = self.get_resolved(document, sentences)

        return sentences
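
    # Note: with coreference enabled, sent_tokenize returns the sentences
    # with mentions replaced by their antecedents. For example (assuming
    # neuralcoref links the mentions), "John bought a car. He drives it."
    # would come back as ["John bought a car.", "John drives a car."].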

    def get_resolved(self, doc, sentences):
        # Rebuild each sentence with every coreferent mention replaced by
        # the main mention of its cluster.
        def get_2d_element(arrays, index):
            # Map a flat token index into (sentence index, token index).
            j = index
            lens = [len(sent) for sent in arrays]
            for i, length in enumerate(lens):
                j = j - length
                if j < 0:
                    return i, length + j

        # Re-tokenize sentence by sentence; this assumes the per-sentence
        # tokenization lines up with the token offsets in `doc`.
        resolved_list = []
        tokenizer = self.spacy_pipeline.tokenizer
        for sent in sentences:
            resolved_list.append([tok.text_with_ws for tok in tokenizer(sent)])

        for cluster in doc._.coref_clusters:
            for coref in cluster:
                if coref != cluster.main:
                    # Overwrite the mention's first token with the main
                    # mention's text (keeping the trailing whitespace) ...
                    ind1, ind2 = get_2d_element(resolved_list, coref.start)
                    resolved_list[ind1][ind2] = cluster.main.text + doc[coref.end - 1].whitespace_
                    # ... and blank out the mention's remaining tokens.
                    for i in range(coref.start + 1, coref.end):
                        ind3, ind4 = get_2d_element(resolved_list, i)
                        resolved_list[ind3][ind4] = ""
        output = [''.join(sublist) for sublist in resolved_list]
        return output
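

if __name__ == '__main__':
    # Minimal usage sketch with illustrative input text (assumes the OpenIE
    # model download above succeeds and the spaCy 'en' and neuralcoref
    # models are installed).
    extractor = InformationExtractor(coreference=True)
    for tripple in extractor.process('John bought a new car. He drives it every day.'):
        print(tripple)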