Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 88 additions & 37 deletions ChildProject/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class AnnotationManager:
IndexColumn(name = 'filter', description = 'source file to filter in (for rttm and alice only)', required = False),
IndexColumn(name = 'annotation_filename', description = 'output formatted annotation location, relative to `annotations/<set>/converted (automatic column, don\'t specify)', filename = True, required = False, generated = True),
IndexColumn(name = 'imported_at', description = 'importation date (automatic column, don\'t specify)', datetime = "%Y-%m-%d %H:%M:%S", required = False, generated = True),
IndexColumn(name = 'type', description = 'annotation storage format', choices = ['csv', 'gz', 'h5'], required = False),
IndexColumn(name = 'package_version', description = 'version of the package used when the importation was performed', regex = r"[0-9]+\.[0-9]+\.[0-9]+", required = False, generated = True),
IndexColumn(name = 'error', description = 'error message in case the annotation could not be imported', required = False, generated = True)
]
Expand All @@ -44,22 +45,22 @@ class AnnotationManager:
IndexColumn(name = 'segment_onset', description = 'segment onset timestamp in milliseconds (since the start of the recording)', regex = r"([0-9]+)", required = True),
IndexColumn(name = 'segment_offset', description = 'segment end time in milliseconds (since the start of the recording)', regex = r"([0-9]+)", required = True),
IndexColumn(name = 'speaker_id', description = 'identity of speaker in the annotation'),
IndexColumn(name = 'speaker_type', description = 'class of speaker (FEM = female adult, MAL = male adult, CHI = key child, OCH = other child)', choices = ['FEM', 'MAL', 'CHI', 'OCH', 'NA']),
IndexColumn(name = 'ling_type', description = '1 if the vocalization contains at least a vowel (ie canonical or non-canonical), 0 if crying or laughing', choices = ['1', '0', 'NA']),
IndexColumn(name = 'vcm_type', description = 'vocal maturity defined as: C (canonical), N (non-canonical), Y (crying) L (laughing), J (junk)', choices = ['C', 'N', 'Y', 'L', 'J', 'NA']),
IndexColumn(name = 'lex_type', description = 'W if meaningful, 0 otherwise', choices = ['W', '0', 'NA']),
IndexColumn(name = 'mwu_type', description = 'M if multiword, 1 if single word -- only filled if lex_type==W', choices = ['M', '1', 'NA']),
IndexColumn(name = 'addressee', description = 'T if target-child-directed, C if other-child-directed, A if adult-directed, U if uncertain or other. Multiple values should be sorted and separated by commas', choices = ['T', 'C', 'A', 'U', 'NA']),
IndexColumn(name = 'speaker_type', description = 'class of speaker (FEM = female adult, MAL = male adult, CHI = key child, OCH = other child)', choices = ['FEM', 'MAL', 'CHI', 'OCH', 'NA'], na = True),
IndexColumn(name = 'ling_type', description = '1 if the vocalization contains at least a vowel (ie canonical or non-canonical), 0 if crying or laughing', choices = ['1', '0', 'NA'], na = True),
IndexColumn(name = 'vcm_type', description = 'vocal maturity defined as: C (canonical), N (non-canonical), Y (crying) L (laughing), J (junk)', choices = ['C', 'N', 'Y', 'L', 'J', 'NA'], na = True),
IndexColumn(name = 'lex_type', description = 'W if meaningful, 0 otherwise', choices = ['W', '0', 'NA'], na = True),
IndexColumn(name = 'mwu_type', description = 'M if multiword, 1 if single word -- only filled if lex_type==W', choices = ['M', '1', 'NA'], na = True),
IndexColumn(name = 'addressee', description = 'T if target-child-directed, C if other-child-directed, A if adult-directed, U if uncertain or other. Multiple values should be sorted and separated by commas', choices = ['T', 'C', 'A', 'U', 'NA'], na = True),
IndexColumn(name = 'transcription', description = 'orthographic transcription of the speach'),
IndexColumn(name = 'phonemes', description = 'amount of phonemes', regex = r'(\d+(\.\d+)?)'),
IndexColumn(name = 'syllables', description = 'amount of syllables', regex = r'(\d+(\.\d+)?)'),
IndexColumn(name = 'words', description = 'amount of words', regex = r'(\d+(\.\d+)?)'),
IndexColumn(name = 'lena_block_type', description = 'whether regarded as part as a pause or a conversation by LENA', choices = ['pause', 'CM', 'CIC', 'CIOCX', 'CIOCAX', 'AMF', 'AICF', 'AIOCF', 'AIOCCXF', 'AMM', 'AICM', 'AIOCM', 'AIOCCXM', 'XM', 'XIOCC', 'XIOCA', 'XIC', 'XIOCAC']),
IndexColumn(name = 'lena_block_number', description = 'number of the LENA pause/conversation the segment belongs to', regex = r"(\d+(\.\d+)?)"),
IndexColumn(name = 'lena_conv_status', description = 'LENA conversation status', choices = ['BC', 'RC', 'EC']),
IndexColumn(name = 'lena_response_count', description = 'LENA turn count within block', regex = r"(\d+(\.\d+)?)"),
IndexColumn(name = 'lena_conv_floor_type', description = '(FI): Floor Initiation, (FH): Floor Holding', choices = ['FI', 'FH']),
IndexColumn(name = 'lena_conv_turn_type', description = 'LENA turn type', choices = ['TIFI', 'TIMI', 'TIFR', 'TIMR', 'TIFE', 'TIME', 'NT']),
IndexColumn(name = 'words', description = 'amount of words', regex = r'(\d+(\.\d+)?)', na = True),
IndexColumn(name = 'lena_block_type', description = 'whether regarded as part as a pause or a conversation by LENA', choices = ['pause', 'CM', 'CIC', 'CIOCX', 'CIOCAX', 'AMF', 'AICF', 'AIOCF', 'AIOCCXF', 'AMM', 'AICM', 'AIOCM', 'AIOCCXM', 'XM', 'XIOCC', 'XIOCA', 'XIC', 'XIOCAC'], na = True),
IndexColumn(name = 'lena_block_number', description = 'number of the LENA pause/conversation the segment belongs to', regex = r"(\d+(\.\d+)?)", na = True),
IndexColumn(name = 'lena_conv_status', description = 'LENA conversation status', choices = ['BC', 'RC', 'EC'], na = True),
IndexColumn(name = 'lena_response_count', description = 'LENA turn count within block', regex = r"(\d+(\.\d+)?)", na = True),
IndexColumn(name = 'lena_conv_floor_type', description = '(FI): Floor Initiation, (FH): Floor Holding', choices = ['FI', 'FH'], na = True),
IndexColumn(name = 'lena_conv_turn_type', description = 'LENA turn type', choices = ['TIFI', 'TIMI', 'TIFR', 'TIMR', 'TIFE', 'TIME', 'NT'], na = True),
IndexColumn(name = 'utterances_count', description = 'utterances count', regex = r"(\d+(\.\d+)?)"),
IndexColumn(name = 'utterances_length', description = 'utterances length', regex = r"([0-9]+)"),
IndexColumn(name = 'non_speech_length', description = 'non-speech length', regex = r"([0-9]+)"),
Expand Down Expand Up @@ -117,17 +118,25 @@ def read(self) -> Tuple[List[str], List[str]]:

return errors, warnings

def write(self):
    """Update the annotations index on disk, enforcing its expected shape.

    Missing ``time_seek``/``range_onset``/``range_offset`` values are
    replaced with 0 and cast to int before the index is saved to
    ``metadata/annotations.csv``.
    """
    columns = ['time_seek', 'range_onset', 'range_offset']
    # Assign the filled result back: calling fillna(inplace=True) on the
    # column selection operates on a copy (chained assignment), silently
    # leaving the NaNs in place — and the subsequent astype(int) would
    # then raise on non-finite values.
    self.annotations[columns] = self.annotations[columns].fillna(0).astype(int)
    self.annotations.to_csv(os.path.join(self.project.path, 'metadata/annotations.csv'), index = False)

def validate_annotation(self, annotation: dict) -> Tuple[List[str], List[str]]:
print("validating {} from {}...".format(annotation['annotation_filename'], annotation['set']))

segments = IndexTable(
'segments',
path = os.path.join(self.project.path, 'annotations', annotation['set'], 'converted', str(annotation['annotation_filename'])),
path = os.path.join(self.project.path, 'annotations', annotation['set'], 'converted', annotation['annotation_filename']),
columns = self.SEGMENTS_COLUMNS
)

try:
segments.read()
segments.df = self._read_annotation(annotation['set'], annotation['annotation_filename'])
except Exception as e:
error_message = "error while trying to read {} from {}:\n\t{}".format(
annotation['annotation_filename'],
Expand Down Expand Up @@ -163,14 +172,38 @@ def validate(self, annotations: pd.DataFrame = None, threads: int = 0) -> Tuple[

return errors, warnings

def write(self):
    """Update the annotations index on disk, enforcing its expected shape.

    Missing ``time_seek``/``range_onset``/``range_offset`` values are
    replaced with 0 and cast to int before the index is saved to
    ``metadata/annotations.csv``.
    """
    columns = ['time_seek', 'range_onset', 'range_offset']
    # Assign the filled result back: calling fillna(inplace=True) on the
    # column selection operates on a copy (chained assignment), silently
    # leaving the NaNs in place — and the subsequent astype(int) would
    # then raise on non-finite values.
    self.annotations[columns] = self.annotations[columns].fillna(0).astype(int)
    self.annotations.to_csv(os.path.join(self.project.path, 'metadata/annotations.csv'), index = False)

def _read_annotation(self, set: str, filename: str):
    """Read a converted annotation file into a DataFrame.

    :param set: name of the annotation set the file belongs to.
    :param filename: name of the converted annotation file; its extension
        determines the storage format (.csv, .gz, .h5 or .parquet).
    :return: the annotation segments as a pandas DataFrame.
    :raises ValueError: if the file extension is not a supported format.
    """
    path = os.path.join(self.project.path, 'annotations', set, 'converted', filename)
    ext = os.path.splitext(filename)[1]

    if ext == '.csv':
        return pd.read_csv(path)
    elif ext == '.gz':
        # .gz files are gzip-compressed CSVs
        return pd.read_csv(path, compression = 'gzip')
    elif ext == '.h5':
        return pd.read_hdf(path)
    elif ext == '.parquet':
        return pd.read_parquet(path)
    else:
        # previous message interpolated a literal '(unknown)' and contained
        # an unbalanced quote; report the offending file instead.
        raise ValueError(f"invalid extension '{ext}' for annotation {set}/{filename}")

def _write_annotation(self, df: pd.DataFrame, set: str, filename: str):
    """Write annotation segments to a converted annotation file.

    :param df: segments to store.
    :param set: name of the annotation set the file belongs to.
    :param filename: destination file name; its extension determines the
        storage format (.csv, .gz, .h5 or .parquet).
    :raises ValueError: if the file extension is not a supported format.
    """
    path = os.path.join(self.project.path, 'annotations', set, 'converted', filename)
    ext = os.path.splitext(filename)[1]

    # the converted/ directory may not exist yet for a new set
    os.makedirs(os.path.dirname(path), exist_ok = True)

    if ext == '.csv':
        df.to_csv(path, index = False)
    elif ext == '.gz':
        # .gz files are gzip-compressed CSVs
        df.to_csv(path, index = False, compression = 'gzip')
    elif ext == '.h5':
        df.to_hdf(path, key = 'segments', mode = 'w', index = False)
    elif ext == '.parquet':
        df.to_parquet(path, index = False)
    else:
        # previous message interpolated a literal '(unknown)' and contained
        # an unbalanced quote; report the offending file instead.
        raise ValueError(f"invalid extension '{ext}' for annotation {set}/{filename}")

def _import_annotation(self, import_function: Callable[[str], pd.DataFrame], annotation: dict):
"""import and convert ``annotation``. This function should not be called outside of this class.

Expand All @@ -183,9 +216,6 @@ def _import_annotation(self, import_function: Callable[[str], pd.DataFrame], ann
"""

source_recording = os.path.splitext(annotation['recording_filename'])[0]
annotation_filename = "{}_{}_{}.csv".format(source_recording, annotation['range_onset'], annotation['range_offset'])
output_filename = os.path.join('annotations', annotation['set'], 'converted', annotation_filename)

path = os.path.join(self.project.path, 'annotations', annotation['set'], 'raw', annotation['raw_filename'])
annotation_format = annotation['format']

Expand Down Expand Up @@ -230,8 +260,21 @@ def _import_annotation(self, import_function: Callable[[str], pd.DataFrame], ann

df.sort_values(sort_columns, inplace = True)

os.makedirs(os.path.dirname(os.path.join(self.project.path, output_filename)), exist_ok = True)
df.to_csv(os.path.join(self.project.path, output_filename), index = False)
if 'type' not in annotation or pd.isnull(annotation['type']):
annotation['type'] = 'csv'

annotation_filename = "{}_{}_{}.{}".format(
source_recording,
annotation['range_onset'],
annotation['range_offset'],
annotation['type']
)

self._write_annotation(
df,
annotation['set'],
annotation_filename
)

annotation['annotation_filename'] = annotation_filename
annotation['imported_at'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
Expand Down Expand Up @@ -409,17 +452,18 @@ def rename_set(self, annotation_set: str, new_set: str, recursive: bool = False,
self.annotations.loc[(self.annotations['set'] == annotation_set), 'set'] = new_set
self.write()

def merge_annotations(self, left_columns, right_columns, columns, output_set, input):
def merge_annotations(self, left_columns, right_columns, columns, output_set, type, input):
left_annotations = input['left_annotations']
right_annotations = input['right_annotations']

annotations = left_annotations.copy()
annotations['format'] = ''
annotations['annotation_filename'] = annotations.apply(
lambda annotation: "{}_{}_{}.csv".format(
lambda annotation: "{}_{}_{}.{}".format(
os.path.splitext(annotation['recording_filename'])[0],
annotation['range_onset'],
annotation['range_offset']
annotation['range_offset'],
type
)
, axis = 1)

Expand Down Expand Up @@ -487,16 +531,19 @@ def merge_annotations(self, left_columns, right_columns, columns, output_set, in

segments = output_segments[output_segments['interval'] == interval]
segments.drop(columns = list(set(segments.columns)-{c.name for c in self.SEGMENTS_COLUMNS}), inplace = True)
segments.to_csv(
os.path.join(self.project.path, 'annotations', annotation_set, 'converted', annotation_filename),
index = False

self._write_annotation(
segments,
annotation_set,
annotation_filename
)

return annotations

def merge_sets(self, left_set: str, right_set: str,
left_columns: List[str], right_columns: List[str],
output_set: str, columns: dict = {},
type = 'csv',
threads = -1
):
"""Merge columns from ``left_set`` and ``right_set`` annotations,
Expand Down Expand Up @@ -546,8 +593,12 @@ def merge_sets(self, left_set: str, right_set: str,
for recording in left_annotations['recording_filename'].unique()
]

pool = mp.Pool(processes = threads if threads > 0 else mp.cpu_count())
annotations = pool.map(partial(self.merge_annotations, left_columns, right_columns, columns, output_set), input_annotations)
with mp.Pool(processes = threads if threads > 0 else mp.cpu_count()) as pool:
annotations = pool.map(
partial(self.merge_annotations, left_columns, right_columns, columns, output_set, type),
input_annotations
)

annotations = pd.concat(annotations)
annotations.drop(columns = list(set(annotations.columns)-{c.name for c in self.INDEX_COLUMNS}), inplace = True)
annotations.fillna({'raw_filename': 'NA'}, inplace = True)
Expand All @@ -570,7 +621,7 @@ def get_segments(self, annotations: pd.DataFrame) -> pd.DataFrame:
segments = []
for index, _annotations in annotations.groupby(['set', 'annotation_filename']):
s, annotation_filename = index
df = pd.read_csv(os.path.join(self.project.path, 'annotations', s, 'converted', annotation_filename))
df = self._read_annotation(s, annotation_filename)

for annotation in _annotations.to_dict(orient = 'records'):
segs = df.copy()
Expand Down
9 changes: 7 additions & 2 deletions ChildProject/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
import re
import datetime
import numpy as np
from typing import Callable

def is_boolean(x):
    """Return True when *x* is the string 'NA' or parses to the integer 0 or 1."""
    if x == 'NA':
        return True
    return int(x) in (0, 1)

class IndexColumn:
def __init__(self, name = "", description = "", required = False,
regex = None, filename = False, datetime = None, function = None, choices = None,
unique = False, generated = False):
unique = False, generated = False,
na = False):
self.name = name
self.description = description
self.required = required
Expand All @@ -21,6 +23,7 @@ def __init__(self, name = "", description = "", required = False,
self.choices = choices
self.unique = unique
self.generated = generated
self.na = bool(na)

def __str__(self):
    # Concise, name-only representation used in logs and validation messages.
    return 'IndexColumn(name = {})'.format(self.name)
Expand Down Expand Up @@ -98,7 +101,9 @@ def validate(self):
if column_attr is None:
continue

if callable(column_attr.function):
if column_attr.na and pd.isnull(row[column_name]):
continue
elif callable(column_attr.function):
try:
ok = column_attr.function(str(row[column_name])) == True
except:
Expand Down
16 changes: 8 additions & 8 deletions examples/valid_raw_data/annotations/input.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
set,recording_filename,time_seek,raw_filename,range_onset,range_offset,format,filter
textgrid,sound.wav,0,example.TextGrid,0,10000,TextGrid,
eaf_basic,sound.wav,0,example.eaf,0,300000,eaf,
eaf_solis,sound.wav,0,example_solis.eaf,0,40000000,eaf,
vtc_rttm,sound.wav,0,example.rttm,1980000,1990000,vtc_rttm,namibie_aiku_20160714_1
alice,sound.wav,0,example_alice.txt,1980000,1990000,alice,namibie_aiku_20160714_1
metrics,sound.wav,0,example_metrics.rttm,0,100000000,vtc_rttm,tsimane2017_C01_20170706
old_its,sound.wav,0,example_lena_old.its,0,100000000,its,
set,recording_filename,time_seek,raw_filename,range_onset,range_offset,format,filter
textgrid,sound.wav,0,example.TextGrid,0,10000,TextGrid,
eaf_basic,sound.wav,0,example.eaf,0,300000,eaf,
eaf_solis,sound.wav,0,example_solis.eaf,0,40000000,eaf,
vtc_rttm,sound.wav,0,example.rttm,1980000,1990000,vtc_rttm,namibie_aiku_20160714_1
alice,sound.wav,0,example_alice.txt,1980000,1990000,alice,namibie_aiku_20160714_1
metrics,sound.wav,0,example_metrics.rttm,0,100000000,vtc_rttm,tsimane2017_C01_20170706
old_its,sound.wav,0,example_lena_old.its,0,100000000,its,
new_its,sound.wav,0,example_lena_new.its,0,100000000,its,
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pydub
pandas>=0.25.0
panoptes_client
pyannote.metrics; python_version >= "3.7.0"
pyarrow
pygamma-agreement; python_version >= "3.6.0"
pylangacq
pympi-ling
Expand All @@ -22,3 +23,4 @@ sklearn
sox
sphinx
sphinx_rtd_theme
tables
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

requires = {
'core': ['pandas>=0.25.0', 'jinja2', 'numpy>=1.16.5', 'sox', 'datalad', 'requests<2.25.0'],
'annotations': ['lxml', 'pympi-ling', 'pylangacq', 'python-dateutil>=2.8.1'],
'annotations': ['lxml', 'pyarrow', 'pympi-ling', 'pylangacq', 'python-dateutil>=2.8.1', 'tables'],
'metrics': ['pyannote.metrics; python_version >= "3.7.0"', 'pygamma-agreement; python_version >= "3.6.0"', 'nltk', 'sklearn'],
'audio': ['librosa', 'pydub', 'pysoundfile'],
'samplers': ['PyYAML'],
Expand Down
16 changes: 9 additions & 7 deletions tests/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,12 @@ def test_its(its):
)#.fillna('NA')
check_its(converted, truth)

def test_import(project):
am = AnnotationManager(project)

@pytest.mark.parametrize('type', ['csv', 'gz', 'h5', 'parquet'])
def test_import(project, type):
input_annotations = pd.read_csv('examples/valid_raw_data/annotations/input.csv')
input_annotations['type'] = type

am = AnnotationManager(project)
am.import_annotations(input_annotations)
am.read()

Expand All @@ -113,16 +115,15 @@ def test_import(project):
]), "some annotations are missing"

errors, warnings = am.validate()
assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"
assert len(errors) == 0 and len(warnings) == 0, f"malformed annotations detected ({errors} {warnings})"

for dataset in ['eaf_basic', 'textgrid', 'eaf_solis']:
annotations = am.annotations[am.annotations['set'] == dataset]
segments = am.get_segments(annotations)
segments.drop(columns = set(annotations.columns) - {'raw_filename'}, inplace = True)
truth = pd.read_csv('tests/truth/{}.csv'.format(dataset))
truth = pd.read_csv('tests/truth/{}.csv'.format(dataset), dtype = {'transcription': str, 'ling_type': str})

print(segments)
print(truth)
print(annotations)

pd.testing.assert_frame_equal(
standardize_dataframe(segments, set(truth.columns.tolist())),
Expand All @@ -146,6 +147,7 @@ def test_intersect(project):
columns = a.columns.tolist()
columns.remove('imported_at')
columns.remove('package_version')
columns.remove('type')

pd.testing.assert_frame_equal(
standardize_dataframe(a, columns),
Expand Down
Loading