Skip to content
Open
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
274 changes: 274 additions & 0 deletions biodatasets/mediqa_ans/mediqa_ans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
Questions. The first summarization collection containing question-driven summaries of answers to consumer health
questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using
extractive or abstractive approaches."""

import itertools as it
import json
import os
from typing import Dict, Iterator, List, Tuple

import datasets

from utils import schemas
from utils.configs import BigBioConfig
from utils.constants import Tasks

# BibTeX citation for the MEDIQA-AnS data paper (Savery et al., 2020).
# NOTE: the original entry had an empty citation key ("@article{,"), which is
# invalid BibTeX; a conventional key has been supplied.
_CITATION = """\
@article{savery2020question,
author={Savery, Max
and Abacha, Asma Ben
and Gayen, Soumya
and Demner-Fushman, Dina},
title={Question-driven summarization of answers to consumer health questions},
journal={Scientific Data},
year={2020},
month={Oct},
day={02},
volume={7},
number={1},
pages={322},
issn={2052-4463},
doi={10.1038/s41597-020-00667-z},
url={https://doi.org/10.1038/s41597-020-00667-z}
}
"""

# Canonical short name used for config/subset ids and the _URLS lookup.
_DATASETNAME = "mediqa_ans"

_DESCRIPTION = """\
Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
Questions. The first summarization collection containing question-driven summaries of answers to consumer health
questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using
extractive or abstractive approaches.
"""

_HOMEPAGE = "https://osf.io/fyg46/"

_LICENSE = "CC0"

# Direct OSF download link; presumably resolves to the dataset JSON file that
# _generate_examples parses — TODO confirm against the OSF project layout.
_URLS = {
    _DATASETNAME: "https://osf.io/fs57e/download",
}

_SUPPORTED_TASKS = [Tasks.SUMMARIZATION]

_SOURCE_VERSION = "1.0.0"

_BIGBIO_VERSION = "1.0.0"


class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
    """
    A dataset of manually generated, question-driven summaries of multi and
    single document answers to consumer health questions.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    # One "all" config exposing the raw nested structure, plus one source and
    # one bigbio_t2t config for every combination of input granularity
    # (page2answer/section2answer), answer cardinality (multi/single) and
    # summary type (abstractive/extractive).
    BUILDER_CONFIGS = [
        BigBioConfig(
            name="mediqa_ans_all_source",
            version=BIGBIO_VERSION,
            description="MEDIQA-AnS All source schema",
            schema="source",
            subset_id="mediqa_ans_all",
        ),
    ]

    for setting1 in ["page2answer", "section2answer"]:
        for setting2 in ["multi", "single"]:
            for setting3 in ["abstractive", "extractive"]:
                BUILDER_CONFIGS.append(
                    BigBioConfig(
                        name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t",
                        version=BIGBIO_VERSION,
                        description=f"MEDIQA-AnS {setting1} {setting2} {setting3} BigBio schema",
                        schema="bigbio_t2t",
                        subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
                    )
                )
                BUILDER_CONFIGS.append(
                    BigBioConfig(
                        name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source",
                        version=BIGBIO_VERSION,
                        description=f"MEDIQA-AnS {setting1} {setting2} {setting3} source schema",
                        schema="source",
                        subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
                    )
                )
    # Without this, the loop variables linger as class attributes.
    del setting1, setting2, setting3

    DEFAULT_CONFIG_NAME = "mediqa_ans_page2answer_multi_abstractive_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return the DatasetInfo (feature schema) for the selected config."""

        if self.config.schema == "source" and self.config.subset_id == "mediqa_ans_all":
            # Raw structure: one record per question, with nested answers.
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "multi_abs_summ": datasets.Value("string"),
                    "multi_ext_summ": datasets.Value("string"),
                    "answers": [
                        {
                            "id": datasets.Value("string"),
                            "answer_abs_summ": datasets.Value("string"),
                            "answer_ext_summ": datasets.Value("string"),
                            "section": datasets.Value("string"),
                            "article": datasets.Value("string"),
                            "url": datasets.Value("string"),
                            "rating": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.schema == "source":
            # Per-setting structure: question plus the selected article text(s)
            # and the selected summary.
            features = datasets.Features(
                {
                    "question": datasets.Value("string"),
                    "question_id": datasets.Value("string"),
                    "summary": datasets.Value("string"),
                    "articles": [
                        {
                            "answer_id": datasets.Value("string"),
                            "text": datasets.Value("string"),
                            "rating": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.schema == "bigbio_t2t":
            features = schemas.text2text_features
        else:
            # Previously `features` was silently left unbound here, which would
            # surface later as an UnboundLocalError; fail fast instead.
            raise NotImplementedError(f"Schema '{self.config.schema}' is not supported")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators. The dataset ships as a single TRAIN split."""

        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download_and_extract(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    # download_and_extract already returns a local path; the
                    # former os.path.join(file_path) was a no-op.
                    "filepath": file_path,
                },
            ),
        ]

    def _generate_examples(self, filepath) -> Iterator[Tuple[int, Dict]]:
        """Yields examples as (key, example) tuples."""

        with open(filepath, "r", encoding="utf8") as infile:
            dataset = json.load(infile)

        uid = it.count(0)
        if self.config.name == "mediqa_ans_all_source":
            # Expose the raw JSON, flattening the id-keyed dicts into lists.
            for example in self._json_dict_to_list(dataset, "id"):
                example["answers"] = self._json_dict_to_list(example["answers"], "id")
                yield example["id"], example
        else:
            # subset_id is "mediqa_ans_<setting1>_<setting2>_<setting3>".
            _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_", 3)
            if self.config.schema == "source":
                for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
                    yield next(uid), example
            elif self.config.schema == "bigbio_t2t":
                for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
                    example = self._source_to_t2t(example)
                    # The BigBio text2text schema declares ids as strings; the
                    # raw int from it.count() would not match the schema.
                    example["id"] = str(next(uid))
                    yield example["id"], example

    def _generate_setting_examples(self, dataset, setting1, setting2, setting3):
        """Yield source-schema examples for one (input, cardinality, summary) setting.

        setting1 selects the article text field ("section2answer" -> "section",
        "page2answer" -> "article"); setting2 selects one example per answer
        ("single") or one example per question ("multi"); setting3 selects the
        abstractive or extractive summary.
        """
        text_key = "section" if setting1 == "section2answer" else "article"
        for question_id, question in dataset.items():
            base = {"question_id": question_id, "question": question["question"]}
            if setting2 == "single":
                summ_key = "answer_abs_summ" if setting3 == "abstractive" else "answer_ext_summ"
                for answer_id, answer in question["answers"].items():
                    example = dict(base)
                    example["articles"] = [
                        {"answer_id": answer_id, "text": answer[text_key], "rating": answer["rating"]}
                    ]
                    example["summary"] = answer[summ_key]
                    yield example
            elif setting2 == "multi":
                example = dict(base)
                example["articles"] = [
                    {"answer_id": answer_id, "text": answer[text_key], "rating": answer["rating"]}
                    for answer_id, answer in question["answers"].items()
                ]
                example["summary"] = (
                    question["multi_abs_summ"] if setting3 == "abstractive" else question["multi_ext_summ"]
                )
                yield example

    def _source_to_t2t(self, example):
        """Convert one source-schema example into the BigBio text2text shape.

        text_1 concatenates question metadata and all answer texts; text_2 is
        the reference summary. The "id" field is filled in by the caller.
        """
        example_ = {}
        example_["document_id"] = ""
        example_["text_1_name"] = ""
        example_["text_2_name"] = ""

        text1 = ""
        text1 += "Question ID: " + example["question_id"] + "\n"
        text1 += "Question: " + example["question"] + "\n"
        for article in example["articles"]:
            text1 += "Answer ID: " + article["answer_id"] + "\n"
            text1 += "Answer: " + article["text"] + "\n"
            text1 += "Rating: " + article["rating"] + "\n"
        example_["text_1"] = text1

        example_["text_2"] = example["summary"]

        return example_

    def _json_dict_to_list(self, mapping, new_key):
        """Flatten a dict-of-dicts into a list of the child dicts, copying each
        outer key into its child under ``new_key``.

        Note: mutates the child dicts in place. (The parameter was previously
        named ``json``, shadowing the imported ``json`` module.)
        """
        flattened = []
        for key, values in mapping.items():
            assert isinstance(values, dict), "Child element is not a dict"
            assert new_key not in values, "New key already in values"
            values[new_key] = key
            flattened.append(values)
        return flattened