-
Notifications
You must be signed in to change notification settings - Fork 116
/
Copy pathmlee.py
279 lines (252 loc) · 10.6 KB
/
mlee.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MLEE is an event extraction corpus consisting of manually annotated abstracts of papers
on angiogenesis. It contains annotations for entities, relations, events and coreferences.
The annotations span molecular, cellular, tissue, and organ-level processes.
"""
from pathlib import Path
from typing import Dict, List
import datasets
from .bigbiohub import kb_features
from .bigbiohub import BigBioConfig
from .bigbiohub import Tasks
# --- Schema view names -------------------------------------------------------
_SOURCE_VIEW_NAME = "source"
_UNIFIED_VIEW_NAME = "bigbio"

# --- Dataset metadata flags --------------------------------------------------
_LANGUAGES = ["English"]
_PUBMED = True
_LOCAL = False

# BibTeX entry for the paper introducing the corpus.
_CITATION = """\
@article{pyysalo2012event,
title={Event extraction across multiple levels of biological organization},
author={Pyysalo, Sampo and Ohta, Tomoko and Miwa, Makoto and Cho, Han-Cheol and Tsujii, Jun'ichi and Ananiadou, Sophia},
journal={Bioinformatics},
volume={28},
number={18},
pages={i575--i581},
year={2012},
publisher={Oxford University Press}
}
"""

# Human-readable summary passed to `datasets.DatasetInfo`.
_DESCRIPTION = """\
MLEE is an event extraction corpus consisting of manually annotated abstracts of papers
on angiogenesis. It contains annotations for entities, relations, events and coreferences
The annotations span molecular, cellular, tissue, and organ-level processes.
"""

# --- Identification ----------------------------------------------------------
_DATASETNAME = "mlee"
_DISPLAYNAME = "MLEE"
_HOMEPAGE = "http://www.nactem.ac.uk/MLEE/"
_LICENSE = "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported"

# Both schemas are built from the same archive; the mapping is keyed by the
# config's `schema` value.
_URLs = {
    schema: "http://www.nactem.ac.uk/MLEE/MLEE-1.0.2-rev1.tar.gz"
    for schema in ("source", "bigbio_kb")
}

# Tasks this corpus carries annotations for.
_SUPPORTED_TASKS = [
    Tasks.EVENT_EXTRACTION,
    Tasks.NAMED_ENTITY_RECOGNITION,
    Tasks.RELATION_EXTRACTION,
    Tasks.COREFERENCE_RESOLUTION,
]

# --- Versions ----------------------------------------------------------------
_SOURCE_VERSION = "1.0.0"
_BIGBIO_VERSION = "1.0.0"
class MLEE(datasets.GeneratorBasedBuilder):
    """Dataset builder for MLEE, an event extraction corpus of angiogenesis abstracts.

    Two configurations are available:

    - ``mlee_source``: the parsed brat annotations (text-bound annotations,
      events, relations, equivalences, attributes and normalizations).
    - ``mlee_bigbio_kb``: the same documents using the ``kb_features`` schema,
      with numbered event-argument roles collapsed via ``_ROLE_MAPPING``.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    BUILDER_CONFIGS = [
        BigBioConfig(
            name="mlee_source",
            version=SOURCE_VERSION,
            description="MLEE source schema",
            schema="source",
            subset_id="mlee",
        ),
        BigBioConfig(
            name="mlee_bigbio_kb",
            # The BigBio view carries its own version, independent of the source one.
            version=BIGBIO_VERSION,
            description="MLEE BigBio schema",
            schema="bigbio_kb",
            subset_id="mlee",
        ),
    ]

    DEFAULT_CONFIG_NAME = "mlee_source"

    # Numbered argument roles are folded into their base role in the KB view.
    _ROLE_MAPPING = {
        "Theme2": "Theme",
        "Instrument2": "Instrument",
        "Participant2": "Participant",
        "Participant3": "Participant",
        "Participant4": "Participant",
    }

    def _info(self):
        """Describe the dataset for the selected configuration.

        The feature schema depends on ``self.config.schema``: ``"source"`` uses
        the brat-style schema spelled out below, ``"bigbio_kb"`` uses
        ``kb_features``.

        Returns:
            A ``datasets.DatasetInfo`` carrying description, features, homepage,
            license and citation.

        Raises:
            ValueError: if the configuration's schema is neither ``"source"``
                nor ``"bigbio_kb"``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "document_id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    # T line in brat, e.g. type or event trigger
                    "text_bound_annotations": [
                        {
                            "offsets": datasets.Sequence([datasets.Value("int32")]),
                            "text": datasets.Sequence(datasets.Value("string")),
                            "type": datasets.Value("string"),
                            "id": datasets.Value("string"),
                        }
                    ],
                    # E line in brat
                    "events": [
                        {
                            # refers to the text_bound_annotation of the trigger
                            "trigger": datasets.Value("string"),
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "arguments": datasets.Sequence(
                                {
                                    "role": datasets.Value("string"),
                                    "ref_id": datasets.Value("string"),
                                }
                            ),
                        }
                    ],
                    # R line in brat
                    "relations": [
                        {
                            "id": datasets.Value("string"),
                            "head": {
                                "ref_id": datasets.Value("string"),
                                "role": datasets.Value("string"),
                            },
                            "tail": {
                                "ref_id": datasets.Value("string"),
                                "role": datasets.Value("string"),
                            },
                            "type": datasets.Value("string"),
                        }
                    ],
                    # Equiv line in brat
                    "equivalences": [
                        {
                            "id": datasets.Value("string"),
                            "ref_ids": datasets.Sequence(datasets.Value("string")),
                        }
                    ],
                    # M or A lines in brat
                    "attributes": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "ref_id": datasets.Value("string"),
                            "value": datasets.Value("string"),
                        }
                    ],
                    # N lines in brat
                    "normalizations": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "ref_id": datasets.Value("string"),
                            # Name of the resource, e.g. "Wikipedia"
                            "resource_name": datasets.Value("string"),
                            # ID in the resource, e.g. 534366
                            "cuid": datasets.Value("string"),
                            # Human readable description/name of the entity,
                            # e.g. "Barack Obama"
                            "text": datasets.Value("string"),
                        }
                    ],
                },
            )
        elif self.config.schema == "bigbio_kb":
            features = kb_features
        else:
            # Without this branch `features` would be unbound below and surface
            # as an opaque UnboundLocalError.
            raise ValueError(f"Invalid config: {self.config.name}")

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            features=features,
            # supervised_keys is not applicable for MLEE, so it is left unset.
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=str(_LICENSE),
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(
        self, dl_manager: datasets.DownloadManager
    ) -> List[datasets.SplitGenerator]:
        """Download the archive and declare the train, validation and test splits.

        Each split maps to one directory of the extracted archive, which is
        handed to ``_generate_examples`` as the ``data_files`` keyword argument:

        - train      -> ``standoff/development/train``
        - validation -> ``standoff/development/test``
        - test       -> ``standoff/test/test``

        Args:
            dl_manager: download manager used to fetch and extract the archive
                registered in ``_URLs`` for the active schema.

        Returns:
            The three ``datasets.SplitGenerator`` objects, in the order
            train, validation, test.
        """
        archive_url = _URLs[self.config.schema]
        extracted_root = Path(dl_manager.download_and_extract(archive_url))
        standoff_dir = extracted_root / "MLEE-1.0.2-rev1" / "standoff"

        split_dirs = (
            (datasets.Split.TRAIN, standoff_dir / "development" / "train"),
            (datasets.Split.VALIDATION, standoff_dir / "development" / "test"),
            (datasets.Split.TEST, standoff_dir / "test" / "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"data_files": split_dir},
            )
            for split_name, split_dir in split_dirs
        ]

    def _standardize_arguments_roles(self, kb_example: Dict) -> Dict:
        """Collapse numbered event-argument roles (e.g. ``Theme2``) to their base role.

        Roles not listed in ``_ROLE_MAPPING`` are left untouched.

        Args:
            kb_example: an example with an ``"events"`` list whose items hold
                an ``"arguments"`` list of dicts with a ``"role"`` key.

        Returns:
            The same ``kb_example`` object; it is modified in place.
        """
        for event in kb_example["events"]:
            for argument in event["arguments"]:
                role = argument["role"]
                argument["role"] = self._ROLE_MAPPING.get(role, role)
        return kb_example

    def _generate_examples(self, data_files: Path):
        """Yield one ``(guid, example)`` pair per abstract found in ``data_files``.

        The contents of ``example`` depend on the chosen configuration: the raw
        brat parse for ``"source"``, or its KB conversion with standardized
        argument roles for ``"bigbio_kb"``.

        Args:
            data_files: directory holding the text files of one split.

        Yields:
            ``(guid, example)`` where ``guid`` is the index of the file in
            sorted order and ``example["id"]`` is ``str(guid)``.

        Raises:
            ValueError: if the configuration's schema is neither ``"source"``
                nor ``"bigbio_kb"``.
        """
        schema = self.config.schema
        if schema not in ("source", "bigbio_kb"):
            raise ValueError(f"Invalid config: {self.config.name}")

        # The module never binds a name `parsing`, so the brat helpers are
        # imported here explicitly.
        # NOTE(review): assumes `.bigbiohub` exposes `parse_brat_file` and
        # `brat_parse_to_bigbio_kb` alongside `kb_features` -- confirm.
        from .bigbiohub import brat_parse_to_bigbio_kb, parse_brat_file

        # glob() order is filesystem-dependent; sort so that ids are reproducible.
        txt_files = sorted(data_files.glob("*txt"))
        for guid, txt_file in enumerate(txt_files):
            example = parse_brat_file(txt_file)
            if schema == "bigbio_kb":
                example = self._standardize_arguments_roles(
                    brat_parse_to_bigbio_kb(example)
                )
            example["id"] = str(guid)
            yield guid, example