-
Notifications
You must be signed in to change notification settings - Fork 227
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add recipe for the Santa Barbara Corpus of Spoken American English (SBCSAE) (#1395)
* initial commit
* transcript fixes
* added SBCSAE download
* Updates sbcsae to properly process mono_channel audio and adds speaker origin as geolocations for speakers
* Fixes a few 0-width segments by adding 0.02 s of padding
* small fix
* Add alignment export option: Exports aligned supervisions along with the original supervisions with or without changing the text after manual inspections and corrections.
* update to cli flags and docs
* added sbcsae to docs and fixed python compatibility
* more python3.8 fixes
---------
Co-authored-by: Matthew Wiesner <[email protected]>
Co-authored-by: Dominik Klement <[email protected]>
Co-authored-by: Piotr Żelasko <[email protected]>
- Loading branch information
1 parent
c8ba6d0
commit d1b078b
Showing
5 changed files
with
1,217 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from typing import Optional, Sequence | ||
|
||
import click | ||
|
||
from lhotse.bin.modes import download, prepare | ||
from lhotse.recipes.sbcsae import download_sbcsae, prepare_sbcsae | ||
from lhotse.utils import Pathlike | ||
|
||
__all__ = ["sbcsae"] | ||
|
||
|
||
@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(dir_okay=True, exists=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "--geolocation",
    is_flag=True,
    type=bool,
    default=False,
    help="Include geographic coordinates of speakers' hometowns in the manifests.",
)
@click.option(
    "--omit-realignments",
    is_flag=True,
    type=bool,
    default=False,
    help="Only output the original corpus segmentation without boundary improvements.",
)
def sbcsae(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    geolocation: bool,
    omit_realignments: bool,
):
    """SBCSAE data preparation."""
    # NOTE: click uses the docstring above as this command's help text, so it
    # is user-facing output and is deliberately left unchanged.
    #
    # Registered under the `prepare` command group. The two positional CLI
    # arguments and the two boolean flags are handed to the recipe untouched:
    # the corpus location positionally, everything else by keyword.
    recipe_options = {
        "output_dir": output_dir,
        "geolocation": geolocation,
        "omit_realignments": omit_realignments,
    }
    prepare_sbcsae(corpus_dir, **recipe_options)
|
||
|
||
@download.command(context_settings={"show_default": True})
@click.argument("target_dir", type=click.Path())
@click.option(
    "--force-download",
    is_flag=True,
    type=bool,
    default=False,
    help="Force download.",
)
def sbcsae(
    target_dir: Pathlike,
    force_download: bool,
):
    """SBCSAE download."""
    # NOTE: click uses the docstring above as this command's help text, so it
    # is user-facing output and is deliberately left unchanged.
    #
    # Registered under the `download` command group. The destination is passed
    # to the recipe positionally and the flag by keyword, unmodified.
    download_options = {"force_download": force_download}
    download_sbcsae(target_dir, **download_options)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.