diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
index 927558bce..2a7afd16c 100644
--- a/lhotse/cut/set.py
+++ b/lhotse/cut/set.py
@@ -2550,6 +2550,88 @@ def prefetch(self, buffer_size: int = 10) -> "CutSet":
             )
         )
 
+    def to_huggingface_dataset(self):
+        """
+        Converts a CutSet to a HuggingFace Dataset. Currently, only MonoCuts with a single
+        recording source are supported; other cut types will be supported in the future.
+
+        Two formats are currently supported:
+
+        1. If each cut has a single supervision (e.g. LibriSpeech), each cut is represented
+           as a single row (entry) in the HuggingFace dataset, with all the supervision
+           information stored alongside the cut information.
+           The resulting HuggingFace dataset format is:
+
+           ╔═══════════════════╦═══════════════════════════════╗
+           ║ Feature           ║ Type                          ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ id                ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ audio             ║ Audio()                       ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ duration          ║ Value(dtype='float32')        ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ num_channels      ║ Value(dtype='uint16')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ text              ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ speaker           ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ language          ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ {x}_alignment     ║ Sequence(Alignment)           ║
+           ╚═══════════════════╩═══════════════════════════════╝
+
+           where {x} stands for the alignment type (commonly "word" or "phoneme").
+
+           Alignment is represented as:
+
+           ╔═══════════════════╦═══════════════════════════════╗
+           ║ Feature           ║ Type                          ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ symbol            ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ start             ║ Value(dtype='float32')        ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ end               ║ Value(dtype='float32')        ║
+           ╚═══════════════════╩═══════════════════════════════╝
+
+        2. If each cut has multiple supervisions (e.g. AMI), each cut is represented as a
+           single row (entry), while all of its supervisions are stored in a list of
+           dictionaries under the 'segments' key.
+           The resulting HuggingFace dataset format is:
+
+           ╔══════════════╦════════════════════════════════════╗
+           ║ Feature      ║ Type                               ║
+           ╠══════════════╬════════════════════════════════════╣
+           ║ id           ║ Value(dtype='string')              ║
+           ╠══════════════╬════════════════════════════════════╣
+           ║ audio        ║ Audio()                            ║
+           ╠══════════════╬════════════════════════════════════╣
+           ║ duration     ║ Value(dtype='float32')             ║
+           ╠══════════════╬════════════════════════════════════╣
+           ║ num_channels ║ Value(dtype='uint16')              ║
+           ╠══════════════╬════════════════════════════════════╣
+           ║ segments     ║ Sequence(Segment)                  ║
+           ╚══════════════╩════════════════════════════════════╝
+
+           where one Segment is represented as:
+
+           ╔═══════════════════╦═══════════════════════════════╗
+           ║ Feature           ║ Type                          ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ text              ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ start             ║ Value(dtype='float32')        ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ end               ║ Value(dtype='float32')        ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ channel           ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ speaker           ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ language          ║ Value(dtype='string')         ║
+           ╠═══════════════════╬═══════════════════════════════╣
+           ║ {x}_alignment     ║ Sequence(Alignment)           ║
+           ╚═══════════════════╩═══════════════════════════════╝
+
+        :return: A HuggingFace Dataset.
+        """
+        from lhotse.hf import export_cuts_to_hf
+
+        return export_cuts_to_hf(self)
+
     def __repr__(self) -> str:
         try:
             len_val = len(self)
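For orientation, here is a minimal usage sketch of the new method. The manifest path is hypothetical; any CutSet of MonoCuts with a single audio source per cut works:

```python
from lhotse import CutSet

# Hypothetical manifest path -- substitute your own CutSet manifest.
cuts = CutSet.from_file("data/librispeech_cuts_dev-clean.jsonl.gz")

ds = cuts.to_huggingface_dataset()
print(ds.features)    # the schema described in the docstring above
print(ds[0]["text"])  # single-supervision format: text lives at the top level
```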
diff --git a/lhotse/hf.py b/lhotse/hf.py
new file mode 100644
index 000000000..493087ad9
--- /dev/null
+++ b/lhotse/hf.py
@@ -0,0 +1,303 @@
+"""
+╔══════════════════════════════════════╗
+║ Export CutSet to HuggingFace Dataset ║
+╚══════════════════════════════════════╝
+"""
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+from lhotse.cut import CutSet, MonoCut
+from lhotse.utils import is_module_available
+
+
+def contains_only_mono_cuts(cutset: CutSet) -> bool:
+    return all(isinstance(cut, MonoCut) for cut in cutset)
+
+
+def has_one_supervision_per_cut(cutset: CutSet) -> bool:
+    return all(len(cut.supervisions) == 1 for cut in cutset)
+
+
+def has_one_audio_source(cutset: CutSet) -> bool:
+    return all(len(cut.recording.sources) == 1 for cut in cutset)
+
+
+def convert_cuts_info_to_hf(cutset: CutSet) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Converts the cut information into a dictionary compatible with the HuggingFace
+    datasets format.
+
+    :param cutset: A CutSet object.
+    :return: A tuple where the first element is a dictionary of cut attributes
+        and the second element is a dictionary describing the features (schema)
+        of the HuggingFace dataset.
+    """
+    from datasets import Audio, Value
+
+    cut_info = {
+        "id": [cut.id for cut in cutset],
+        "audio": [cut.recording.sources[0].source for cut in cutset],
+        "duration": [cut.duration for cut in cutset],
+        "num_channels": [len(cut.recording.channel_ids) for cut in cutset],
+    }
+    cut_info_description = {
+        "id": Value("string"),
+        "audio": Audio(mono=False),
+        "duration": Value("float32"),
+        "num_channels": Value("uint16"),
+    }
+    return cut_info, cut_info_description
+ """ + from datasets import Audio, Value + + cut_info = { + "id": [cut.id for cut in cutset], + "audio": [cut.recording.sources[0].source for cut in cutset], + "duration": [cut.duration for cut in cutset], + "num_channels": [len(cut.recording.channel_ids) for cut in cutset], + } + cut_info_description = { + "id": Value("string"), + "audio": Audio(mono=False), + "duration": Value("float"), + "num_channels": Value("uint16"), + } + return cut_info, cut_info_description + + +def convert_supervisions_info_to_hf( + cutset: CutSet, + exclude_attributes: Optional[Union[List[str], Set[str]]] = None, +) -> Tuple[List[List[Dict[str, Any]]], Dict[str, Any]]: + """ + Converts cut supervisions into a dictionary compatible with HuggingFace datasets format. + + :param cutset: A CutSet object. + :param exclude_attributes: A list|set of attributes to exclude from the supervisions dicts. + :return: A tuple where the first element is a dictionary + representing the cut attributes and the second element is a dictionary describing the + format of the HuggingFace dataset. + """ + + from datasets import Features, Sequence, Value + + has_speaker = any( + ( + hasattr(cut.supervisions[0], "speaker") + and cut.supervisions[0].speaker is not None + ) + for cut in cutset + ) + has_language = any( + ( + hasattr(cut.supervisions[0], "language") + and cut.supervisions[0].language is not None + ) + for cut in cutset + ) + alignment_types = [ + s.alignment.keys() + for c in cutset + for s in c.supervisions + if s.alignment is not None + ] + alignment_types = set([item for sublist in alignment_types for item in sublist]) + + sup_dicts = [] + for c in cutset: + cut_sup_dicts = [] + for s in c.supervisions: + sup_dict = { + "text": s.text, + } + + if exclude_attributes is None or "start" not in exclude_attributes: + sup_dict["start"] = s.start + + if exclude_attributes is None or "end" not in exclude_attributes: + sup_dict["end"] = s.end + + if exclude_attributes is None or "channel" not in exclude_attributes: + if isinstance(s.channel, list): + sup_dict["channel"] = ",".join(map(str, s.channel)) + else: + sup_dict["channel"] = str(s.channel) + + if has_speaker and ( + exclude_attributes is None or "speaker" not in exclude_attributes + ): + sup_dict["speaker"] = str(s.speaker) + + if has_language and ( + exclude_attributes is None or "language" not in exclude_attributes + ): + sup_dict["language"] = str(s.language) + + if alignment_types and ( + exclude_attributes is None or "alignments" not in exclude_attributes + ): + alignments = {} + for alignment_type in alignment_types: + alignments[alignment_type + "_alignment"] = list( + map( + lambda item: { + "symbol": item.symbol, + "start": item.start, + "end": item.end, + }, + s.alignment[alignment_type], + ) + ) + + sup_dict = {**sup_dict, **alignments} + + cut_sup_dicts.append(sup_dict) + sup_dicts.append(cut_sup_dicts) + + sup_dicts_info = {"text": Value("string")} + + if exclude_attributes is None or "start" not in exclude_attributes: + sup_dicts_info["start"] = Value("float") + + if exclude_attributes is None or "end" not in exclude_attributes: + sup_dicts_info["end"] = Value("float") + + if exclude_attributes is None or "channel" not in exclude_attributes: + sup_dicts_info["channel"] = Value("string") + + if has_speaker and ( + exclude_attributes is None or "speaker" not in exclude_attributes + ): + sup_dicts_info["speaker"] = Value("string") + + if has_language and ( + exclude_attributes is None or "language" not in exclude_attributes + ): + sup_dicts_info["language"] = 
Value("string") + + if alignment_types and ( + exclude_attributes is None or "alignments" not in exclude_attributes + ): + alignment_info = { + "symbol": Value("string"), + "start": Value("float"), + "end": Value("float"), + } + for alignment_type in alignment_types: + sup_dicts_info[alignment_type + "_alignment"] = Sequence( + Features(**alignment_info) + ) + + return sup_dicts, sup_dicts_info + + +def lod_to_dol(lod: List[Dict[str, Any]]) -> Dict[str, List]: + """ + Converts List of Dicts to Dict of Lists. + """ + return {k: [d[k] for d in lod] for k in lod[0].keys()} + + +def export_cuts_to_hf(cutset: CutSet): + """ + Converts a CutSet to a HuggingFace Dataset. Currently, only MonoCut with one recording source is supported. + Other cut types will be supported in the future. + + Currently, two formats are supported: + 1. If each cut has one supervision (e.g. LibriSpeech), each cut is represented as a single row (entry) + in the HuggingFace dataset with all the supervision information stored along the cut information. + The final HuggingFace dataset format is: + ╔═══════════════════╦═══════════════════════════════╗ + ║ Feature ║ Type ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ id ║ Value(dtype='string') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ audio ║ Audio() ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ duration ║ Value(dtype='float32') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ num_channels ║ Value(dtype='uint16') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ text ║ Value(dtype='string') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ speaker ║ Value(dtype='string') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ language ║ Value(dtype='string') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ {x}_alignment ║ Sequence(Alignment) ║ + ╚═══════════════════╩═══════════════════════════════╝ + where x stands for the alignment type (commonly used: "word", "phoneme"). + + Alignment is represented as: + ╔═══════════════════╦═══════════════════════════════╗ + ║ Feature ║ Type ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ symbol ║ Value(dtype='string') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ start ║ Value(dtype='float32') ║ + ╠═══════════════════╬═══════════════════════════════╣ + ║ end ║ Value(dtype='float32') ║ + ╚═══════════════════╩═══════════════════════════════╝ + + + 2. If each cut has multiple supervisions (e.g. AMI), each cut is represented as a single row (entry) + while all the supervisions are stored in a separate list of dictionaries under the 'segments' key. 
+
+
+def export_cuts_to_hf(cutset: CutSet):
+    """
+    Converts a CutSet to a HuggingFace Dataset. Currently, only MonoCuts with a single
+    recording source are supported; other cut types will be supported in the future.
+
+    Two formats are currently supported:
+
+    1. If each cut has a single supervision (e.g. LibriSpeech), each cut is represented
+       as a single row (entry) in the HuggingFace dataset, with all the supervision
+       information stored alongside the cut information.
+       The resulting HuggingFace dataset format is:
+
+       ╔═══════════════════╦═══════════════════════════════╗
+       ║ Feature           ║ Type                          ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ id                ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ audio             ║ Audio()                       ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ duration          ║ Value(dtype='float32')        ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ num_channels      ║ Value(dtype='uint16')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ text              ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ speaker           ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ language          ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ {x}_alignment     ║ Sequence(Alignment)           ║
+       ╚═══════════════════╩═══════════════════════════════╝
+
+       where {x} stands for the alignment type (commonly "word" or "phoneme").
+
+       Alignment is represented as:
+
+       ╔═══════════════════╦═══════════════════════════════╗
+       ║ Feature           ║ Type                          ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ symbol            ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ start             ║ Value(dtype='float32')        ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ end               ║ Value(dtype='float32')        ║
+       ╚═══════════════════╩═══════════════════════════════╝
+
+    2. If each cut has multiple supervisions (e.g. AMI), each cut is represented as a
+       single row (entry), while all of its supervisions are stored in a list of
+       dictionaries under the 'segments' key.
+       The resulting HuggingFace dataset format is:
+
+       ╔══════════════╦════════════════════════════════════╗
+       ║ Feature      ║ Type                               ║
+       ╠══════════════╬════════════════════════════════════╣
+       ║ id           ║ Value(dtype='string')              ║
+       ╠══════════════╬════════════════════════════════════╣
+       ║ audio        ║ Audio()                            ║
+       ╠══════════════╬════════════════════════════════════╣
+       ║ duration     ║ Value(dtype='float32')             ║
+       ╠══════════════╬════════════════════════════════════╣
+       ║ num_channels ║ Value(dtype='uint16')              ║
+       ╠══════════════╬════════════════════════════════════╣
+       ║ segments     ║ Sequence(Segment)                  ║
+       ╚══════════════╩════════════════════════════════════╝
+
+       where one Segment is represented as:
+
+       ╔═══════════════════╦═══════════════════════════════╗
+       ║ Feature           ║ Type                          ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ text              ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ start             ║ Value(dtype='float32')        ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ end               ║ Value(dtype='float32')        ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ channel           ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ speaker           ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ language          ║ Value(dtype='string')         ║
+       ╠═══════════════════╬═══════════════════════════════╣
+       ║ {x}_alignment     ║ Sequence(Alignment)           ║
+       ╚═══════════════════╩═══════════════════════════════╝
+
+    :param cutset: A CutSet object.
+    :return: A HuggingFace Dataset.
+    """
+    if not is_module_available("datasets"):
+        raise ImportError(
+            "Please install the 'datasets' package (pip install datasets)."
+        )
+    from datasets import Dataset, Features, Sequence
+
+    assert contains_only_mono_cuts(
+        cutset
+    ), "Only CutSets containing MonoCuts are supported; other cut types will be supported in the future."
+    assert has_one_audio_source(
+        cutset
+    ), "Only CutSets with one audio source per cut are supported; multi-source cuts are coming soon."
+
+    one_supervision_per_cut = has_one_supervision_per_cut(cutset)
+
+    # The start, end and channel attributes are redundant when there is exactly one
+    # supervision/segment per cut, since start == 0 and end == duration.
+    cut_info, cut_info_description = convert_cuts_info_to_hf(cutset)
+    sup_dicts, sup_dicts_info = convert_supervisions_info_to_hf(
+        cutset,
+        exclude_attributes={"start", "end", "channel"}
+        if one_supervision_per_cut
+        else None,
+    )
+
+    if one_supervision_per_cut:
+        dataset_dict = {
+            **cut_info,
+            **lod_to_dol([x[0] for x in sup_dicts]),
+        }
+        dataset_info = Features(
+            **cut_info_description,
+            **sup_dicts_info,
+        )
+    else:
+        dataset_dict = {
+            **cut_info,
+            "segments": sup_dicts,
+        }
+        dataset_info = Features(
+            segments=Sequence(Features(**sup_dicts_info)),
+            **cut_info_description,
+        )
+
+    return Dataset.from_dict(dataset_dict, features=dataset_info)
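An end-to-end sketch for the multi-supervision path; the manifest path and hub repo id are hypothetical:

```python
from lhotse import CutSet
from lhotse.hf import export_cuts_to_hf

cuts = CutSet.from_file("data/ami_cuts_dev.jsonl.gz")  # hypothetical multi-supervision manifest

ds = export_cuts_to_hf(cuts)  # equivalent to cuts.to_huggingface_dataset()
# Multi-supervision cuts land in the 'segments' column,
# which HF returns as a dict of lists per row:
print(ds[0]["segments"]["text"])

# From here the regular HuggingFace workflow applies, e.g.:
# ds.push_to_hub("my-org/ami-dev")  # hypothetical repo id
```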