Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

## [0.4.6] 2026-02-02

### Added

- added std (standard deviation) metrics on the durations of vocalizations

### Modified

- Derivation and merge of sets no longer fail if writing the metannots file fails; an error is logged instead
- the dialect element in the languages column of children.csv is no longer supported; dialect should be indicated elsewhere
- NA is accepted in datetime elements without warning
- custom is accepted as an annotation format; NA is accepted for monoling, normative, child_sex, start_time_accuracy and dob_accuracy
- allow the innacurate and reported values for dob_criterion / dob_accuracy, to reflect lack of knowledge of the participant's age

## [0.4.5] 2025-12-15

### Fixed
Expand Down
81 changes: 69 additions & 12 deletions ChildProject/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class AnnotationManager:
IndexColumn(
name="format",
description="input annotation format",
choices=[*converters.keys(), "NA"],
choices=[*converters.keys(), "NA", "custom"],
required=False,
),
IndexColumn(
Expand Down Expand Up @@ -789,8 +789,14 @@ def write(self) -> Self:

return self

def _write_set_metadata(self, setname, metadata) -> Self:
def _write_set_metadata(self, setname, metadata, output_as_path: bool=False) -> Self:
if output_as_path:
with open(setname / METANNOTS, 'w') as stream:
yaml.dump(metadata, stream)
return self

assert setname in self.annotations['set'].unique(), f"set must exist"

with open(self.project.path / ANNOTATIONS / setname / METANNOTS, 'w') as stream:
yaml.dump(metadata, stream)
return self
Expand Down Expand Up @@ -1083,6 +1089,7 @@ def _derive_annotation(
derivator: Derivator,
output_set: str,
overwrite_existing: bool = False,
output_as_path = False,
) -> dict:
"""import and convert ``annotation``. This function should not be called outside of this class.

Expand All @@ -1094,6 +1101,8 @@ def _derive_annotation(
:type output_set: str
:param overwrite_existing: use for lines with the same set and annotation_filename to be re-derived and overwritten
:type overwrite_existing: bool
:param output_as_path: used if you want to direct your outputs to any filesystem folder, specified by `output_set`
:type output_as_path: bool
:return: output annotation dictionary (attributes defined according to :ref:`ChildProject.annotations.AnnotationManager.SEGMENTS_COLUMNS`)
:rtype: dict
"""
Expand All @@ -1106,7 +1115,10 @@ def _derive_annotation(
annotation_filename = "{}_{}_{}.csv".format(
source_recording, annotation["range_onset"], annotation["range_offset"]
)
output_filename = ANNOTATIONS / output_set / CONVERTED / annotation_filename
if not output_as_path:
output_filename = ANNOTATIONS / output_set / CONVERTED / annotation_filename
else:
output_filename = Path(output_set) / CONVERTED / annotation_filename

# check if the annotation file already exists in dataset (same filename and same set)
if self.annotations[(self.annotations['set'] == output_set) &
Expand Down Expand Up @@ -1202,11 +1214,18 @@ def bad_derivation(annotation_dict, msg_err, error, path_file):

df.sort_values(sort_columns, inplace=True)

os.makedirs(
(self.project.path / output_filename).parent,
exist_ok=True,
)
df.to_csv(self.project.path / output_filename, index=False)
if output_as_path:
os.makedirs(
output_filename.parent,
exist_ok=True,
)
df.to_csv(output_filename, index=False)
else:
os.makedirs(
(self.project.path / output_filename).parent,
exist_ok=True,
)
df.to_csv(self.project.path / output_filename, index=False)

annotation_result["annotation_filename"] = annotation_filename
annotation_result["imported_at"] = datetime.datetime.now().strftime(
Expand Down Expand Up @@ -1242,6 +1261,29 @@ def derive_annotations(self,
:return: tuple of dataframe of derived annotations, as in :ref:`format-annotations` and dataframe of errors
:rtype: tuple(pd.DataFrame, pd.DataFrame)
"""
return self._derive_annotations(
input_set=input_set,
output_set=output_set,
derivation=derivation,
derivation_metadata=derivation_metadata,
threads=threads,
overwrite_existing=overwrite_existing,
output_as_path=False,
)

def _derive_annotations(self,
input_set: str,
output_set: str,
derivation: Union[str, Callable],
derivation_metadata=None,
threads: int = -1,
overwrite_existing: bool = False,
output_as_path: bool = False,
) -> (pd.DataFrame, pd.DataFrame):
"""
Derive annotations. Same as the public routine, except specifying `output_as_path==True`
will direct your outputs to a chosen folder anywhere on the filesystem
"""
input_processed = self.annotations[self.annotations['set'] == input_set].copy()
assert not input_processed.empty, "Input set {0} does not exist,\
existing sets are in the 'set' column of {1}".format(input_set, ANNOTATIONS_CSV)
Expand All @@ -1268,7 +1310,8 @@ def derive_annotations(self,
partial(self._derive_annotation,
derivator=derivator,
output_set=output_set,
overwrite_existing=overwrite_existing
overwrite_existing=overwrite_existing,
output_as_path=output_as_path,
), axis=1
).to_dict(orient="records")
else:
Expand All @@ -1278,7 +1321,8 @@ def derive_annotations(self,
partial(self._derive_annotation,
derivator=derivator,
output_set=output_set,
overwrite_existing=overwrite_existing
overwrite_existing=overwrite_existing,
output_as_path=output_as_path,
),
input_processed.to_dict(orient="records"),
)
Expand Down Expand Up @@ -1325,7 +1369,17 @@ def derive_annotations(self,
subset=["set", "recording_filename", "range_onset", "range_offset"], keep='last')
# write the derived set metadata only if some lines were correctly imported
if imported.shape[0]:
self._write_set_metadata(output_set, set_metadata)
try:
self._write_set_metadata(output_set, set_metadata, output_as_path)
except Exception as e:
logger.error(f"Could not write set metadata for {output_set}")

if output_as_path:
# At this point the outputs are where they need to be, but the below functions will not run
# Until the set has been added to the dataset. You would have to import yourself manually
# after the fact using an automated importation (and possibly some file/folder renaming)
return imported, errors

self._read_sets_metadata()
self.write()

Expand Down Expand Up @@ -1812,7 +1866,10 @@ def merge_sets(
self.write()
# if the set's metadata exists already, do not write new metadata
if not (self.project.path / ANNOTATIONS / output_set / METANNOTS).exists():
self._write_set_metadata(output_set, new_set_meta)
try:
self._write_set_metadata(output_set, new_set_meta)
except Exception as e:
logger.error(f"Could not write set metadata for {output_set}")
self._read_sets_metadata()

return self
Expand Down
7 changes: 6 additions & 1 deletion ChildProject/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
import random
import logging
import json
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` objects as lists.

    ``json.dumps`` raises ``TypeError`` on sets; this encoder converts them
    to lists (element order is whatever the set iterates in) and defers to
    the base class for every other unsupported type.
    """
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        # Idiomatic delegation: super().default raises TypeError for
        # anything json cannot serialize, preserving the standard behavior.
        return super().default(obj)

# add this to setup,py in the requires section and in requirements.txt
import colorlog
Expand Down Expand Up @@ -693,7 +698,7 @@ def overview(args) -> int:
logger.info(output)

if args.format == 'json':
logger.info(json.dumps(dict))
logger.info(json.dumps(dict, cls=SetEncoder))

return 0

Expand Down
16 changes: 16 additions & 0 deletions ChildProject/pipelines/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,7 @@ def __init__(
segments: Union[str, pd.DataFrame] = None,
by: str = "recording_filename",
threads: int = 1,
include_std: bool = False,
):

self.vtc = vtc
Expand All @@ -619,6 +620,14 @@ def __init__(
["simple_CTC_ph",self.vtc,pd.NA],
])

if include_std:
METRICS = np.concatenate((np.array(
[["std_voc_dur_speaker", self.vtc,'FEM'],
["std_voc_dur_speaker", self.vtc,'MAL'],
["std_voc_dur_speaker", self.vtc,'OCH'],
["std_voc_dur_speaker", self.vtc,'CHI']]
), METRICS))

if self.alice not in am.annotations["set"].values:
print(f"The ALICE set ('{self.alice}') was not found in the index.")
else:
Expand Down Expand Up @@ -652,6 +661,13 @@ def __init__(
["cp_n",self.vcm,pd.NA],
["cp_dur",self.vcm,pd.NA],
])))

if include_std:
METRICS = np.concatenate((METRICS, np.array(
[["std_cry_voc_dur_speaker",self.vcm,"CHI"],
["std_can_voc_dur_speaker",self.vcm,"CHI"],
["std_non_can_voc_dur_speaker",self.vcm,"CHI"]]
)))

METRICS = pd.DataFrame(METRICS, columns=["callable","set","speaker"])

Expand Down
58 changes: 57 additions & 1 deletion ChildProject/pipelines/metricsFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,16 @@ def avg_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> floa
return segments[segments["speaker_type"] == kwargs["speaker"]]["duration"].mean()


@metricFunction({"speaker"}, {"speaker_type", "duration"}, np.nan)
def std_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
    """standard deviation of duration in milliseconds of vocalizations for a given speaker type

    Required keyword arguments:
        - speaker : speaker_type to use
    """
    # Restrict to the requested speaker type, then take the std of durations.
    is_speaker = segments["speaker_type"] == kwargs["speaker"]
    return segments.loc[is_speaker, "duration"].std()


def wc_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
"""number of words for a given speaker type

Expand Down Expand Up @@ -342,14 +352,36 @@ def avg_cry_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) ->
value = segments.loc[(segments["speaker_type"] == kwargs["speaker"]) &
(segments["vcm_type"] == "Y")]["duration"].mean()
else:
segments = segments[segments['speaker_type'] == kwargs["speaker"]]
segments = segments[(segments['speaker_type'] == kwargs["speaker"]) & (segments["child_cry_vfx_len"] > 0)]
value = segments["child_cry_vfx_len"].sum() / segments["cries"].apply(lambda x: len(ast.literal_eval(x))).sum()

if pd.isnull(value):
value = 0
return value


@metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {'speaker_type', "child_cry_vfx_len", "cries"}), np.nan)
def std_cry_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
    """standard deviation of duration of cry vocalizations by a given speaker type (based on vcm_type or lena cries)

    Required keyword arguments:
        - speaker : speaker_type to use
    """
    # Two data layouts are supported (mirroring the decorator's two column sets):
    # 1) vcm-style annotations: one row per vocalization, cries marked vcm_type == "Y".
    if 'vcm_type' in segments.columns and 'duration' in segments.columns:
        value = segments.loc[(segments["speaker_type"] == kwargs["speaker"]) &
                             (segments["vcm_type"] == "Y")]["duration"].std()
    # 2) LENA-style annotations: per-segment total cry duration (child_cry_vfx_len)
    #    plus a "cries" column that is presumably a stringified list, one entry per
    #    individual cry — TODO confirm format against the importer.
    else:
        # Keep only this speaker's segments that actually contain cry time;
        # .copy() avoids SettingWithCopy warnings on the column assignments below.
        segments = segments[
            (segments['speaker_type'] == kwargs["speaker"]) & (segments["child_cry_vfx_len"] > 0)].copy()
        # Approximate each individual cry's duration by splitting the segment's
        # total cry time evenly across the number of cries in that segment.
        segments['cry_dur'] = segments["child_cry_vfx_len"] / segments["cries"].apply(
            lambda x: len(ast.literal_eval(x)))  # split duration of cry in the same voc
        segments['num'] = segments["cries"].apply(lambda x: ast.literal_eval(x))  # have a array to explode
        # explode() repeats each row once per cry, so cry_dur is weighted by
        # the number of cries when taking the std below.
        segments = segments.explode('num')
        value = segments['cry_dur'].std()

    # NOTE(review): unlike avg_cry_voc_dur_speaker, a null result is returned as-is
    # (NaN) rather than coerced to 0 — presumably intentional, since a std over
    # fewer than two values is undefined.
    return value


def can_voc_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> int:
"""number of canonical vocalizations for a given speaker type (based on vcm_type)

Expand Down Expand Up @@ -395,6 +427,18 @@ def avg_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) ->
return value


@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan)
def std_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
    """standard deviation of duration of canonical vocalizations for a given speaker type (based on vcm_type)

    Required keyword arguments:
        - speaker : speaker_type to use
    """
    # Canonical vocalizations are the rows tagged vcm_type == "C" for this speaker.
    selected = (segments["speaker_type"] == kwargs["speaker"]) & (segments["vcm_type"] == "C")
    return segments.loc[selected, "duration"].std()


def non_can_voc_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> int:
"""number of non-canonical vocalizations for a given speaker type (based on vcm_type)

Expand Down Expand Up @@ -443,6 +487,18 @@ def avg_non_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs)
return value


@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan)
def std_non_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
    """standard deviation of duration of non-canonical vocalizations for a given speaker type (based on vcm_type)

    Required keyword arguments:
        - speaker : speaker_type to use
    """
    # Non-canonical vocalizations are the rows tagged vcm_type == "N" for this speaker.
    selected = (segments["speaker_type"] == kwargs["speaker"]) & (segments["vcm_type"] == "N")
    return segments.loc[selected, "duration"].std()


@metricFunction(set(), set(), np.nan)
def lp_n(segments: pd.DataFrame, duration: int, **kwargs) -> float:
"""linguistic proportion on the number of vocalizations for CHI (based on vcm_type or [cries,vfxs,utterances_count] if vcm_type does not exist)
Expand Down
15 changes: 7 additions & 8 deletions ChildProject/projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,11 @@ class ChildProject:
IndexColumn(
name="child_sex",
description="f= female, m=male",
choices=["m", "M", "f", "F"],
choices=["m", "M", "f", "F", 'NA'],
),
IndexColumn(
name="language",
description='language the child is exposed to if child is monolingual; small caps, indicate dialect by name or location if available; eg "france french"; "paris french"',
description='main language the child is exposed to; small caps; eg "french"; "english"',
),
IndexColumn(
name="languages",
Expand All @@ -106,7 +106,7 @@ class ChildProject:
IndexColumn(
name="monoling",
description="whether the child is monolingual (Y) or not (N)",
choices=["Y", "N"],
choices=["Y", "N", 'NA'],
),
IndexColumn(
name="monoling_criterion",
Expand All @@ -115,7 +115,7 @@ class ChildProject:
IndexColumn(
name="normative",
description="whether the child is normative (Y) or not (N)",
choices=["Y", "N"],
choices=["Y", "N", 'NA'],
),
IndexColumn(
name="normative_criterion",
Expand Down Expand Up @@ -144,13 +144,13 @@ class ChildProject:
IndexColumn(
name="dob_criterion",
description="determines whether the date of birth is known exactly or extrapolated e.g. from the age. Dates of birth are assumed to be known exactly if this column is NA or unspecified.",
choices=["extrapolated", "exact"],
choices=["extrapolated", "exact", 'reported', 'innacurate'],
required=False,
),
IndexColumn(
name="dob_accuracy",
description="date of birth accuracy",
choices=["day", "week", "month", "year", "other"],
choices=["day", "week", "month", "year", "other", "innacurate", 'NA'], # innacurate shows the dob isn't representative of the child's age; analysis should not use the age of the participant
),
IndexColumn(
name="discard",
Expand Down Expand Up @@ -243,7 +243,7 @@ class ChildProject:
IndexColumn(
name="start_time_accuracy",
description="Accuracy of start_time for this recording. If not specified, assumes second-accuray.",
choices=["second", "minute", "hour", "reliable"],
choices=["second", "minute", "hour", "reliable", 'NA'],
),
IndexColumn(
name="noisy_setting",
Expand Down Expand Up @@ -450,7 +450,6 @@ def read(self, verbose=False, accumulate=True) -> Self:
verbose,
)

# breakpoint()
if self.ignore_discarded and "discard" in self.ct.df:
self.ct.df['discard'] = pd.to_numeric(self.ct.df["discard"], errors='coerce').fillna(0).astype('Int64').astype('string')
self.discarded_children = self.ct.df[self.ct.df["discard"] == '1']
Expand Down
Loading
Loading