diff --git a/CHANGELOG.md b/CHANGELOG.md
index 808c70e1..3fd7cdca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+## [0.4.6] 2026-02-02
+
+### Added
+
+- added the standard deviation (std) of vocalization durations to the metrics
+
+### Modified
+
+- derivation and merging of sets no longer fail if writing the metannots file fails; an error is logged instead
+- the dialect element of the languages column in children.csv is no longer supported; dialect should be indicated elsewhere
+- NA is accepted in datetime elements without a warning
+- custom is accepted as an annotation format; NA is accepted for monoling, normative, child_sex, start_time_accuracy and dob_accuracy
+- innacurate (dob_criterion, dob_accuracy) and reported (dob_criterion) are allowed to reflect lack of knowledge of the participant's age
+
 ## [0.4.5] 2025-12-15
 
 ### Fixed
diff --git a/ChildProject/annotations.py b/ChildProject/annotations.py
index 43fbb923..aac52a66 100644
--- a/ChildProject/annotations.py
+++ b/ChildProject/annotations.py
@@ -75,7 +75,7 @@ class AnnotationManager:
         IndexColumn(
             name="format",
             description="input annotation format",
-            choices=[*converters.keys(), "NA"],
+            choices=[*converters.keys(), "NA", "custom"],
             required=False,
         ),
         IndexColumn(
@@ -789,8 +789,14 @@ def write(self) -> Self:
 
         return self
 
-    def _write_set_metadata(self, setname, metadata) -> Self:
+    def _write_set_metadata(self, setname, metadata, output_as_path: bool=False) -> Self:
+        if output_as_path:
+            with open(setname / METANNOTS, 'w') as stream:
+                yaml.dump(metadata, stream)
+            return self
+
         assert setname in self.annotations['set'].unique(), f"set must exist"
+
         with open(self.project.path / ANNOTATIONS / setname / METANNOTS, 'w') as stream:
             yaml.dump(metadata, stream)
         return self
@@ -1083,6 +1089,7 @@ def _derive_annotation(
         derivator: Derivator,
         output_set: str,
         overwrite_existing: bool = False,
+        output_as_path = False,
     ) -> dict:
         """import and convert ``annotation``. This function should not be called outside of this class.
@@ -1094,6 +1101,8 @@ def _derive_annotation(
         :type output_set: str
         :param overwrite_existing: use for lines with the same set and annotation_filename to be re-derived and overwritten
         :type overwrite_existing: bool
+        :param output_as_path: used if you want to direct your outputs to any filesystem folder, specified by `output_set`
+        :type output_as_path: bool
         :return: output annotation dictionary (attributes defined according to :ref:`ChildProject.annotations.AnnotationManager.SEGMENTS_COLUMNS`)
         :rtype: dict
         """
@@ -1106,7 +1115,10 @@ def _derive_annotation(
         annotation_filename = "{}_{}_{}.csv".format(
             source_recording, annotation["range_onset"], annotation["range_offset"]
         )
-        output_filename = ANNOTATIONS / output_set / CONVERTED / annotation_filename
+        if not output_as_path:
+            output_filename = ANNOTATIONS / output_set / CONVERTED / annotation_filename
+        else:
+            output_filename = Path(output_set) / CONVERTED / annotation_filename
 
         # check if the annotation file already exists in dataset (same filename and same set)
         if self.annotations[(self.annotations['set'] == output_set) &
@@ -1202,11 +1214,18 @@ def bad_derivation(annotation_dict, msg_err, error, path_file):
 
         df.sort_values(sort_columns, inplace=True)
 
-        os.makedirs(
-            (self.project.path / output_filename).parent,
-            exist_ok=True,
-        )
-        df.to_csv(self.project.path / output_filename, index=False)
+        if output_as_path:
+            os.makedirs(
+                output_filename.parent,
+                exist_ok=True,
+            )
+            df.to_csv(output_filename, index=False)
+        else:
+            os.makedirs(
+                (self.project.path / output_filename).parent,
+                exist_ok=True,
+            )
+            df.to_csv(self.project.path / output_filename, index=False)
 
         annotation_result["annotation_filename"] = annotation_filename
         annotation_result["imported_at"] = datetime.datetime.now().strftime(
@@ -1242,6 +1261,29 @@ def derive_annotations(self,
         :return: tuple of dataframe of derived annotations, as in :ref:`format-annotations` and dataframe of errors
         :rtype: tuple(pd.DataFrame, pd.DataFrame)
         """
+        return self._derive_annotations(
+            input_set=input_set,
+            output_set=output_set,
+            derivation=derivation,
+            derivation_metadata=derivation_metadata,
+            threads=threads,
+            overwrite_existing=overwrite_existing,
+            output_as_path=False,
+        )
+
+    def _derive_annotations(self,
+                            input_set: str,
+                            output_set: str,
+                            derivation: Union[str, Callable],
+                            derivation_metadata=None,
+                            threads: int = -1,
+                            overwrite_existing: bool = False,
+                            output_as_path: bool = False,
+                            ) -> (pd.DataFrame, pd.DataFrame):
+        """
+        Derive annotations.
+        Same as the public routine, except specifying `output_as_path=True`
+        will direct your outputs to a chosen folder anywhere on the filesystem
+        """
         input_processed = self.annotations[self.annotations['set'] == input_set].copy()
         assert not input_processed.empty, "Input set {0} does not exist,\
         existing sets are in the 'set' column of {1}".format(input_set, ANNOTATIONS_CSV)
@@ -1268,7 +1310,8 @@ def derive_annotations(self,
                 partial(self._derive_annotation,
                         derivator=derivator,
                         output_set=output_set,
-                        overwrite_existing=overwrite_existing
+                        overwrite_existing=overwrite_existing,
+                        output_as_path=output_as_path,
                         ), axis=1
             ).to_dict(orient="records")
         else:
@@ -1278,7 +1321,8 @@ def derive_annotations(self,
                     partial(self._derive_annotation,
                             derivator=derivator,
                             output_set=output_set,
-                            overwrite_existing=overwrite_existing
+                            overwrite_existing=overwrite_existing,
+                            output_as_path=output_as_path,
                             ),
                     input_processed.to_dict(orient="records"),
                 )
@@ -1325,7 +1369,17 @@ def derive_annotations(self,
                                           subset=["set", "recording_filename", "range_onset", "range_offset"], keep='last')
         # write the derived set metadata only if some lines were correctly imported
         if imported.shape[0]:
-            self._write_set_metadata(output_set, set_metadata)
+            try:
+                self._write_set_metadata(output_set, set_metadata, output_as_path)
+            except Exception as e:
+                logger.error(f"Could not write set metadata for {output_set}")
+
+        if output_as_path:
+            # at this point the outputs are where they need to be, but the functions below will not run
+            # until the set has been added to the dataset; you would have to import it yourself manually
+            # after the fact using an automated importation (and possibly some file/folder renaming)
+            return imported, errors
+
         self._read_sets_metadata()
         self.write()
@@ -1812,7 +1866,10 @@ def merge_sets(
         self.write()
         # if the set's metadata exists already, do not write new metadata
         if not (self.project.path / ANNOTATIONS / output_set / METANNOTS).exists():
-            self._write_set_metadata(output_set, new_set_meta)
+            try:
+                self._write_set_metadata(output_set, new_set_meta)
+            except Exception as e:
+                logger.error(f"Could not write set metadata for {output_set}")
         self._read_sets_metadata()
 
         return self
diff --git a/ChildProject/cmdline.py b/ChildProject/cmdline.py
index 654b6bcc..066a42cf 100755
--- a/ChildProject/cmdline.py
+++ b/ChildProject/cmdline.py
@@ -28,6 +28,11 @@ import random
 import logging
 import json
+class SetEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, set):
+            return list(obj)
+        return json.JSONEncoder.default(self, obj)
 
 # add this to setup,py in the requires section and in requirements.txt
 import colorlog
 
@@ -693,7 +698,7 @@ def overview(args) -> int:
     logger.info(output)
 
     if args.format == 'json':
-        logger.info(json.dumps(dict))
+        logger.info(json.dumps(dict, cls=SetEncoder))
 
     return 0
 
diff --git a/ChildProject/pipelines/metrics.py b/ChildProject/pipelines/metrics.py
index dd2d17f9..866149d9 100644
--- a/ChildProject/pipelines/metrics.py
+++ b/ChildProject/pipelines/metrics.py
@@ -595,6 +595,7 @@ def __init__(
         segments: Union[str, pd.DataFrame] = None,
         by: str = "recording_filename",
         threads: int = 1,
+        include_std: bool = False,
     ):
 
         self.vtc = vtc
@@ -619,6 +620,14 @@ def __init__(
             ["simple_CTC_ph",self.vtc,pd.NA],
         ])
 
+        if include_std:
+            METRICS = np.concatenate((np.array(
+                [["std_voc_dur_speaker", self.vtc,'FEM'],
+                 ["std_voc_dur_speaker", self.vtc,'MAL'],
+                 ["std_voc_dur_speaker", self.vtc,'OCH'],
+                 ["std_voc_dur_speaker", self.vtc,'CHI']]
+            ), METRICS))
+
         if self.alice not in am.annotations["set"].values:
             print(f"The ALICE set ('{self.alice}') was not found in the index.")
         else:
@@ -652,6 +661,13 @@ def __init__(
                 ["cp_n",self.vcm,pd.NA],
                 ["cp_dur",self.vcm,pd.NA],
             ])))
+
+            if include_std:
+                METRICS = np.concatenate((METRICS, np.array(
+                    [["std_cry_voc_dur_speaker",self.vcm,"CHI"],
+                     ["std_can_voc_dur_speaker",self.vcm,"CHI"],
+                     ["std_non_can_voc_dur_speaker",self.vcm,"CHI"]]
+                )))
 
         METRICS = pd.DataFrame(METRICS, columns=["callable","set","speaker"])
 
diff --git a/ChildProject/pipelines/metricsFunctions.py b/ChildProject/pipelines/metricsFunctions.py
index 3fe2628f..78103a33 100644
--- a/ChildProject/pipelines/metricsFunctions.py
+++ b/ChildProject/pipelines/metricsFunctions.py
@@ -207,6 +207,16 @@ def avg_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
     return segments[segments["speaker_type"] == kwargs["speaker"]]["duration"].mean()
 
+@metricFunction({"speaker"}, {"speaker_type", "duration"}, np.nan)
+def std_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
+    """standard deviation of duration in milliseconds of vocalizations for a given speaker type
+
+    Required keyword arguments:
+    - speaker : speaker_type to use
+    """
+    return segments[segments["speaker_type"] == kwargs["speaker"]]["duration"].std()
+
+
 def wc_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
     """number of words for a given speaker type
 
@@ -342,7 +352,7 @@ def avg_cry_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
         value = segments.loc[(segments["speaker_type"] == kwargs["speaker"]) &
                              (segments["vcm_type"] == "Y")]["duration"].mean()
     else:
-        segments = segments[segments['speaker_type'] == kwargs["speaker"]]
+        segments = segments[(segments['speaker_type'] == kwargs["speaker"]) & (segments["child_cry_vfx_len"] > 0)]
         value = segments["child_cry_vfx_len"].sum() / segments["cries"].apply(lambda x: len(ast.literal_eval(x))).sum()
 
     if pd.isnull(value):
@@ -350,6 +360,28 @@ def avg_cry_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
     return value
 
+@metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {'speaker_type', "child_cry_vfx_len", "cries"}), np.nan)
+def std_cry_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
+    """standard deviation of duration of cry vocalizations by a given speaker type (based on vcm_type or lena cries)
+
+    Required keyword arguments:
+    - speaker : speaker_type to use
+    """
+    if 'vcm_type' in segments.columns and 'duration' in segments.columns:
+        value = segments.loc[(segments["speaker_type"] == kwargs["speaker"]) &
+                             (segments["vcm_type"] == "Y")]["duration"].std()
+    else:
+        segments = segments[
+            (segments['speaker_type'] == kwargs["speaker"]) & (segments["child_cry_vfx_len"] > 0)].copy()
+        segments['cry_dur'] = segments["child_cry_vfx_len"] / segments["cries"].apply(
+            lambda x: len(ast.literal_eval(x)))  # split the duration evenly among the cries of the same vocalization
+        segments['num'] = segments["cries"].apply(lambda x: ast.literal_eval(x))  # have an array to explode
+        segments = segments.explode('num')
+        value = segments['cry_dur'].std()
+
+    return value
+
+
 def can_voc_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> int:
     """number of canonical vocalizations for a given speaker type (based on vcm_type)
 
@@ -395,6 +427,18 @@ def avg_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
     return value
 
+@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan)
+def std_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
+    """standard deviation of duration of canonical vocalizations for a given speaker type (based on vcm_type)
+
+    Required keyword arguments:
+    - speaker : speaker_type to use
+    """
+    value = segments.loc[(segments["speaker_type"] == kwargs["speaker"]) & (segments["vcm_type"] == "C")][
+        "duration"].std()
+    return value
+
+
 def non_can_voc_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> int:
     """number of non-canonical vocalizations for a given speaker type (based on vcm_type)
 
@@ -443,6 +487,18 @@ def avg_non_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
     return value
 
+@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan)
+def std_non_can_voc_dur_speaker(segments: pd.DataFrame, duration: int, **kwargs) -> float:
+    """standard deviation of duration of non-canonical vocalizations for a given speaker type (based on vcm_type)
+
+    Required keyword arguments:
+    - speaker : speaker_type to use
+    """
+    value = segments.loc[(segments["speaker_type"] == kwargs["speaker"]) &
+                         (segments["vcm_type"] == "N")]["duration"].std()
+    return value
+
+
 @metricFunction(set(), set(), np.nan)
 def lp_n(segments: pd.DataFrame, duration: int, **kwargs) -> float:
     """linguistic proportion on the number of vocalizations for CHI (based on vcm_type or [cries,vfxs,utterances_count] if vcm_type does not exist)
diff --git a/ChildProject/projects.py b/ChildProject/projects.py
index a7280f3e..7cc0cb8a 100644
--- a/ChildProject/projects.py
+++ b/ChildProject/projects.py
@@ -87,11 +87,11 @@ class ChildProject:
         IndexColumn(
             name="child_sex",
             description="f= female, m=male",
-            choices=["m", "M", "f", "F"],
+            choices=["m", "M", "f", "F", 'NA'],
         ),
         IndexColumn(
             name="language",
-            description='language the child is exposed to if child is monolingual; small caps, indicate dialect by name or location if available; eg "france french"; "paris french"',
+            description='main language the child is exposed to; small caps; eg "french"; "english"',
         ),
         IndexColumn(
             name="languages",
@@ -106,7 +106,7 @@ class ChildProject:
         IndexColumn(
             name="monoling",
             description="whether the child is monolingual (Y) or not (N)",
-            choices=["Y", "N"],
+            choices=["Y", "N", 'NA'],
         ),
         IndexColumn(
             name="monoling_criterion",
@@ -115,7 +115,7 @@ class ChildProject:
         IndexColumn(
             name="normative",
             description="whether the child is normative (Y) or not (N)",
-            choices=["Y", "N"],
+            choices=["Y", "N", 'NA'],
         ),
         IndexColumn(
             name="normative_criterion",
@@ -144,13 +144,13 @@ class ChildProject:
         IndexColumn(
             name="dob_criterion",
            description="determines whether the date of birth is known exactly or extrapolated e.g. from the age. Dates of birth are assumed to be known exactly if this column is NA or unspecified.",
-            choices=["extrapolated", "exact"],
+            choices=["extrapolated", "exact", 'reported', 'innacurate'],
             required=False,
         ),
         IndexColumn(
             name="dob_accuracy",
             description="date of birth accuracy",
-            choices=["day", "week", "month", "year", "other"],
+            choices=["day", "week", "month", "year", "other", "innacurate", 'NA'],  # innacurate shows the dob isn't representative of the child's age; analysis should not use the age of the participant
         ),
         IndexColumn(
             name="discard",
@@ -243,7 +243,7 @@ class ChildProject:
         IndexColumn(
             name="start_time_accuracy",
             description="Accuracy of start_time for this recording. If not specified, assumes second-accuray.",
-            choices=["second", "minute", "hour", "reliable"],
+            choices=["second", "minute", "hour", "reliable", 'NA'],
         ),
         IndexColumn(
             name="noisy_setting",
@@ -450,7 +450,6 @@ def read(self, verbose=False, accumulate=True) -> Self:
                 verbose,
             )
 
-        # breakpoint()
         if self.ignore_discarded and "discard" in self.ct.df:
             self.ct.df['discard'] = pd.to_numeric(self.ct.df["discard"], errors='coerce').fillna(0).astype('Int64').astype('string')
             self.discarded_children = self.ct.df[self.ct.df["discard"] == '1']
diff --git a/ChildProject/tables.py b/ChildProject/tables.py
index 307dc761..667ba83f 100644
--- a/ChildProject/tables.py
+++ b/ChildProject/tables.py
@@ -231,7 +231,8 @@ def validate(self) -> Tuple[List[str], List[str]]:
                     if column_attr.required and str(row[column_name]) != "NA":
                         errors.append(self.msg(message))
                     elif column_attr.required or str(row[column_name]) != "NA":
-                        warnings.append(self.msg(message))
+                        pass
+                        #warnings.append(self.msg(message))
                 elif column_attr.regex:
                     if not re.fullmatch(column_attr.regex, str(row[column_name])):
                         message = "'{}' does not match the format required for '{}' on line {}, expected '{}'".format(
diff --git a/pyproject.toml b/pyproject.toml
index 7198a957..1752f4b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ChildProject"
-version = '0.4.5'
+version = '0.4.6'
 dependencies = [
     "colorlog",
     "GitPython",
@@ -13,7 +13,7 @@ dependencies = [
     "matplotlib",
     "nltk",
     "numpy>=1.17",
-    "pandas>=2.0.0,<=3.0.0",
+    "pandas>=2.0.0,<3.0.0",
     "panoptes_client",
     "praat-parselmouth",
     "pyannote.core",
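
Reviewer note (not part of the patch): the new std_* metrics mirror their avg_* counterparts, swapping .mean() for .std(), which in pandas is the sample standard deviation (ddof=1). A minimal sketch of what std_voc_dur_speaker computes on a toy segments table; the data and the chi_std name below are invented for illustration and are not taken from the codebase:

    import pandas as pd

    # toy annotation segments: speaker type and vocalization duration in milliseconds
    segments = pd.DataFrame({
        "speaker_type": ["CHI", "CHI", "FEM", "CHI", "MAL"],
        "duration":     [450,   900,   1200,  600,   800],
    })

    # equivalent to std_voc_dur_speaker(segments, duration, speaker="CHI"):
    # keep the speaker's vocalizations, then take the standard deviation of their durations
    chi_std = segments[segments["speaker_type"] == "CHI"]["duration"].std()
    print(chi_std)  # ~229 ms (sample std over 450, 900, 600)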