@@ -368,6 +368,44 @@ def is_metainfo_yaml(key):
             yield dataset_name, key


+def _get_info_from_yaml_key(yaml_key):
+    """Load and parse YAML metadata from an S3 key."""
+    raw = _get_data_from_bucket(yaml_key)
+    return yaml.safe_load(raw) or {}
+
+
+def _parse_size_mb(size_mb_val, dataset_name):
+    """Parse the size (MB) value into a float or NaN with logging on failures."""
+    try:
+        return float(size_mb_val) if size_mb_val is not None else np.nan
+    except (ValueError, TypeError):
+        LOGGER.info(
+            f'Invalid dataset-size-mb {size_mb_val} for dataset {dataset_name}; defaulting to NaN.'
+        )
+        return np.nan
+
+
+def _parse_num_tables(num_tables_val, dataset_name):
+    """Parse the num-tables value into an int or NaN with logging on failures."""
+    if isinstance(num_tables_val, str):
+        try:
+            num_tables_val = float(num_tables_val)
+        except (ValueError, TypeError):
+            LOGGER.info(
+                f'Could not cast num_tables_val {num_tables_val} to float for '
+                f'dataset {dataset_name}; defaulting to NaN.'
+            )
+            num_tables_val = np.nan
+
+    try:
+        return int(num_tables_val) if not pd.isna(num_tables_val) else np.nan
+    except (ValueError, TypeError):
+        LOGGER.info(
+            f'Invalid num-tables {num_tables_val} for dataset {dataset_name} when parsing as int.'
+        )
+        return np.nan
+
+
 def get_available_demos(modality):
     """Get demo datasets available for a ``modality``.

@@ -387,38 +425,10 @@ def get_available_demos(modality):
     tables_info = defaultdict(list)
     for dataset_name, yaml_key in _iter_metainfo_yaml_entries(contents, modality):
         try:
-            raw = _get_data_from_bucket(yaml_key)
-            info = yaml.safe_load(raw) or {}
+            info = _get_info_from_yaml_key(yaml_key)

-            size_mb_val = info.get('dataset-size-mb')
-            try:
-                size_mb = float(size_mb_val) if size_mb_val is not None else np.nan
-            except (ValueError, TypeError):
-                LOGGER.info(
-                    f'Invalid dataset-size-mb {size_mb_val} for dataset '
-                    f'{dataset_name}; defaulting to NaN.'
-                )
-                size_mb = np.nan
-
-            num_tables_val = info.get('num-tables', np.nan)
-            if isinstance(num_tables_val, str):
-                try:
-                    num_tables_val = float(num_tables_val)
-                except (ValueError, TypeError):
-                    LOGGER.info(
-                        f'Could not cast num_tables_val {num_tables_val} to float for '
-                        f'dataset {dataset_name}; defaulting to NaN.'
-                    )
-                    num_tables_val = np.nan
-
-            try:
-                num_tables = int(num_tables_val) if not pd.isna(num_tables_val) else np.nan
-            except (ValueError, TypeError):
-                LOGGER.info(
-                    f'Invalid num-tables {num_tables_val} for '
-                    f'dataset {dataset_name} when parsing as int.'
-                )
-                num_tables = np.nan
+            size_mb = _parse_size_mb(info.get('dataset-size-mb'), dataset_name)
+            num_tables = _parse_num_tables(info.get('num-tables', np.nan), dataset_name)

             tables_info['dataset_name'].append(dataset_name)
             tables_info['size_MB'].append(size_mb)
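
As a reviewer aid, here is a minimal sketch of the behaviour the two extracted parsers are expected to keep. It assumes `_parse_size_mb` and `_parse_num_tables` from the hunk above are in scope together with the module-level `LOGGER`, `np` (numpy) and `pd` (pandas); the dataset name is made up.

    import numpy as np

    # Well-formed and missing sizes behave as before the refactor.
    _parse_size_mb('12.5', 'fake_dataset')    # -> 12.5
    _parse_size_mb(None, 'fake_dataset')      # -> nan (no value in metainfo.yaml)
    _parse_size_mb('twelve', 'fake_dataset')  # -> nan, failure logged via LOGGER.info

    # Table counts: strings are cast to float and then int; NaN skips the int cast.
    _parse_num_tables('3', 'fake_dataset')             # -> 3
    _parse_num_tables(np.nan, 'fake_dataset')          # -> nan
    _parse_num_tables('not-a-number', 'fake_dataset')  # -> nan, failure logged
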
@@ -456,6 +466,53 @@ def _find_text_key(contents, dataset_prefix, filename):
     return None


+def _validate_text_file_content(modality, output_filepath, filename):
+    """Validation for the text file content method."""
+    _validate_modalities(modality)
+    if output_filepath is not None and not str(output_filepath).endswith('.txt'):
+        fname = (filename or '').lower()
+        file_type = 'README' if 'readme' in fname else 'source'
+        raise ValueError(
+            f'The {file_type} can only be saved as a txt file. '
+            "Please provide a filepath ending in '.txt'"
+        )
+
+
+def _raise_warnings(filename, output_filepath):
+    """Warn about missing text resources for a dataset."""
+    if (filename or '').upper() == 'README.TXT':
+        msg = 'No README information is available for this dataset.'
+    elif (filename or '').upper() == 'SOURCE.TXT':
+        msg = 'No source information is available for this dataset.'
+    else:
+        msg = f'No {filename} information is available for this dataset.'
+
+    if output_filepath:
+        msg = f'{msg} The requested file ({output_filepath}) will not be created.'
+
+    warnings.warn(msg, DemoResourceNotFoundWarning)
+
+
+def _save_document(text, output_filepath, filename, dataset_name):
+    """Persist ``text`` to ``output_filepath`` if provided."""
+    if not output_filepath:
+        return
+
+    if os.path.exists(str(output_filepath)):
+        raise ValueError(
+            f"A file named '{output_filepath}' already exists. Please specify a different filepath."
+        )
+
+    try:
+        parent = os.path.dirname(str(output_filepath))
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(output_filepath, 'w', encoding='utf-8') as f:
+            f.write(text)
+    except Exception:
+        LOGGER.info(f'Error saving {filename} for dataset {dataset_name}.')
+
+
 def _get_text_file_content(modality, dataset_name, filename, output_filepath=None):
     """Fetch text file content under the dataset prefix.

@@ -473,29 +530,13 @@ def _get_text_file_content(modality, dataset_name, filename, output_filepath=None):
         str or None:
             The decoded text contents if the file exists, otherwise ``None``.
     """
-    _validate_modalities(modality)
-    if output_filepath is not None and not str(output_filepath).endswith('.txt'):
-        fname = (filename or '').lower()
-        file_type = 'README' if 'readme' in fname else 'source'
-        raise ValueError(
-            f'The {file_type} can only be saved as a txt file. '
-            "Please provide a filepath ending in '.txt'"
-        )
+    _validate_text_file_content(modality, output_filepath, filename)

     dataset_prefix = f'{modality}/{dataset_name}/'
     contents = _list_objects(dataset_prefix)
-
     key = _find_text_key(contents, dataset_prefix, filename)
     if not key:
-        if file_type in ('README', 'SOURCE'):
-            msg = f'No {file_type} information is available for this dataset.'
-        else:
-            msg = f'No {filename} information is available for this dataset.'
-
-        if output_filepath:
-            msg = f'{msg} The requested file ({output_filepath}) will not be created.'
-
-        warnings.warn(msg, DemoResourceNotFoundWarning)
+        _raise_warnings(filename, output_filepath)
         return None

     try:
@@ -505,22 +546,7 @@ def _get_text_file_content(modality, dataset_name, filename, output_filepath=None):
         return None

     text = raw.decode('utf-8', errors='replace')
-    if output_filepath:
-        if os.path.exists(str(output_filepath)):
-            raise ValueError(
-                f"A file named '{output_filepath}' already exists. "
-                'Please specify a different filepath.'
-            )
-        try:
-            parent = os.path.dirname(str(output_filepath))
-            if parent:
-                os.makedirs(parent, exist_ok=True)
-            with open(output_filepath, 'w', encoding='utf-8') as f:
-                f.write(text)
-
-        except Exception:
-            LOGGER.info(f'Error saving {filename} for dataset {dataset_name}.')
-            pass
+    _save_document(text, output_filepath, filename, dataset_name)

     return text

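
Similarly, a small illustrative sketch of the save semantics that `_save_document` now centralises. It assumes the helper from the diff above is in scope; the temporary directory and file names are made up.

    import os
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    readme_path = os.path.join(tmp_dir, 'docs', 'readme.txt')

    _save_document('demo text', None, 'README.txt', 'fake_dataset')         # no output path: nothing is written
    _save_document('demo text', readme_path, 'README.txt', 'fake_dataset')  # creates docs/ and writes the file

    try:
        _save_document('demo text', readme_path, 'README.txt', 'fake_dataset')
    except ValueError:
        pass  # an existing file is never overwritten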