@@ -46,12 +46,12 @@ def __init__(self, values: List, cell_names: List):
4646
4747
4848class CellMetadata (IngestFiles ):
49- ALLOWED_FILE_TYPES = [" text/csv" , " text/plain" , " text/tab-separated-values" ]
49+ ALLOWED_FILE_TYPES = [' text/csv' , ' text/plain' , ' text/tab-separated-values' ]
5050
5151 def __init__ (self , file_path , file_id : str , study_accession : str , * args , ** kwargs ):
5252
5353 IngestFiles .__init__ (
54- self , file_path , self .ALLOWED_FILE_TYPES , open_as = " dataframe"
54+ self , file_path , self .ALLOWED_FILE_TYPES , open_as = ' dataframe'
5555 )
5656 self .headers = self .file .columns .get_level_values (0 )
5757 self .annot_types = self .file .columns .get_level_values (1 )
@@ -61,14 +61,13 @@ def __init__(self, file_path, file_id: str, study_accession: str, *args, **kwarg
6161 # lambda below initializes new key with nested dictionary as value and avoids KeyError
6262 self .issues = defaultdict (lambda : defaultdict (lambda : defaultdict (list )))
6363 self .ontology = defaultdict (lambda : defaultdict (list ))
64- self .type = defaultdict (list )
6564 self .cells = []
6665 self .is_valid_file = self .validate_format ()
6766
6867 @dataclass
6968 class Model :
70- COLLECTION_NAME = " cell_metadata"
71- SUBCOLLECTION_NAME = " data"
69+ COLLECTION_NAME = ' cell_metadata'
70+ SUBCOLLECTION_NAME = ' data'
7271 annot_type : str
7372 doc : Document
7473 subdoc : SubDocument
@@ -84,12 +83,12 @@ def preproccess(self):
8483 self .file .rename (columns = {name : name .upper (), type : type .upper ()}, inplace = True )
8584 # Make sure group annotations are treated as strings
8685 group_columns = self .file .xs (
87- " group" , axis = 1 , level = 1 , drop_level = False
86+ ' group' , axis = 1 , level = 1 , drop_level = False
8887 ).columns .tolist ()
8988 self .file [group_columns ] = self .file [group_columns ].astype (str )
9190 # Find numeric columns, round to 3 decimal places, and cast to floats
9190 numeric_columns = self .file .xs (
92- " numeric" , axis = 1 , level = 1 , drop_level = False
91+ ' numeric' , axis = 1 , level = 1 , drop_level = False
9392 ).columns .tolist ()
9493 self .file [numeric_columns ] = self .file [numeric_columns ].round (3 ).astype (float )
9594
@@ -102,18 +101,18 @@ def transform(self):
102101 yield self .Model (
103102 column_type ,
104103 {
105- " name" : col_name ,
106- " study_accession" : self .study_accession ,
104+ ' name' : col_name ,
105+ ' study_accession' : self .study_accession ,
107106 # save unique values for group type annotations
108- " unique_values" : list (self .file [column ].unique ())
109- if column_type == " group"
107+ ' unique_values' : list (self .file [column ].unique ())
108+ if column_type == ' group'
110109 else [],
111- " annotation_type" : column_type ,
112- " file_id" : self .file_id ,
110+ ' annotation_type' : column_type ,
111+ ' file_id' : self .file_id ,
113112 },
114113 {
115- " cell_names" : list (self .file .iloc [:, 0 ]),
116- " values" : list (self .file [column ]),
114+ ' cell_names' : list (self .file .iloc [:, 0 ]),
115+ ' values' : list (self .file [column ]),
117116 },
118117 )
119118
@@ -132,8 +131,8 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
132131 Subdocuments that are under 1,048,576 bytes.
133132 """
134133
135- size_of_cell_names_field = 10 + 1 # " cell_names" is 10 characters
136- size_of_values_field = 6 + 1 # " values" is 6 characters
134+ size_of_cell_names_field = 10 + 1 # ' cell_names' is 10 characters
135+ size_of_values_field = 6 + 1 # ' values' is 6 characters
137136 starting_sum = (
138137 + len (doc_name )
139138 + 1
@@ -149,17 +148,17 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
149148 sum = starting_sum
150149 annot_type = model .annot_type
151150 # All cells names:[] that are in subdoc
152- cell_names = model .subdoc [" cell_names" ]
151+ cell_names = model .subdoc [' cell_names' ]
153152 # All values:[] that are in subdoc
154- values = model .subdoc [" values" ]
153+ values = model .subdoc [' values' ]
155154
156155 for index , (cell_name , value ) in enumerate (zip (cell_names , values )):
157156
158157 cell_name_storage = len (cell_name ) + 1 + size_of_cell_names_field
159158
160159 # Check annotation type because float and string values have
161160 # different storage values
162- if annot_type == " numeric" :
161+ if annot_type == ' numeric' :
163162 value_storage = size_of_values_field + float_storage
164163 else :
165164 value_storage = len (value ) + 1 + size_of_values_field
@@ -175,10 +174,10 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
175174 end_index = index - 1
176175 # TODO: This can turn into a logging statement
177176 # Please do not remove this. It's needed for testing
178- print (f" { sum } , { index } , { start_index } , { end_index } " )
177+ print (f' { sum } , { index } , { start_index } , { end_index } ' )
179178 yield {
180- " cell_names" : cell_names [start_index :end_index ],
181- " values" : values [start_index :end_index ],
179+ ' cell_names' : cell_names [start_index :end_index ],
180+ ' values' : values [start_index :end_index ],
182181 }
183182 # Reset sum and add storage size at current index
184183 sum = starting_sum + cell_name_storage + value_storage
@@ -206,30 +205,37 @@ def validate_header_keyword(self):
206205 """
207206
208207 valid = False
209- if self .headers [0 ].upper () == " NAME" :
208+ if self .headers [0 ].upper () == ' NAME' :
210209 valid = True
211- if self .headers [0 ] != "NAME" :
212- # ToDO - capture warning below in error report
213- msg = (
214- f'Warning: metadata file keyword "NAME" provided as '
215- f"{ self .headers [0 ]} "
216- )
210+ if self .headers [0 ] != 'NAME' :
211+ msg = f'Metadata file keyword "NAME" provided as ' f"{ self .headers [0 ]} "
217212 self .store_validation_issue ('warn' , 'format' , msg )
218213 else :
219- msg = 'Error: Metadata file header row malformed , missing NAME. (Case Sensitive)'
214+ msg = 'Malformed metadata file header row, missing NAME. (Case Sensitive)'
220215 self .store_validation_issue ('error' , 'format' , msg )
221216 return valid
222217
223218 def validate_unique_header (self ):
224- """Check all metadata header names are unique.
219+ """Check all metadata header names are unique and not empty .
225220 :return: boolean True if valid, False otherwise
226221 """
227222 valid = False
228223 unique_headers = set (self .headers )
229224 if len (unique_headers ) == len (self .headers ):
230225 valid = True
231- if any ("Unnamed" in s for s in list (unique_headers )):
232- msg = "Error: Headers cannot contain empty values"
226+ else :
227+ seen_headers = set ()
228+ duplicate_headers = set ()
229+ for x in self .headers :
230+ if x in seen_headers or seen_headers .add (x ):
231+ duplicate_headers .add (x )
232+ msg = (
233+ f'Duplicated metadata header names are not allowed: { duplicate_headers } '
234+ )
235+ self .store_validation_issue ('error' , 'format' , msg )
236+ valid = False
237+ if any ('Unnamed' in s for s in list (unique_headers )):
238+ msg = 'Headers cannot contain empty values'
233239 self .store_validation_issue ('error' , 'format' , msg )
234240 valid = False
235241 return valid
@@ -239,18 +245,13 @@ def validate_type_keyword(self):
239245 :return: boolean True if valid, False otherwise
240246 """
241247 valid = False
242- if self .annot_types [0 ].upper () == " TYPE" :
248+ if self .annot_types [0 ].upper () == ' TYPE' :
243249 valid = True
244- if self .annot_types [0 ] != "TYPE" :
245- # ToDO - capture warning below in issue report
246- # investigate f-string formatting here
247- msg = (
248- 'Warning: Metadata file keyword TYPE provided as '
249- '{self.metadata_types[0]}'
250- )
250+ if self .annot_types [0 ] != 'TYPE' :
251+ msg = f'Metadata file keyword "TYPE" provided as { self .annot_types [0 ]} '
251252 self .store_validation_issue ('warn' , 'format' , msg )
252253 else :
253- msg = 'Error: Metadata file TYPE row malformed , missing TYPE'
254+ msg = 'Malformed metadata TYPE row, missing TYPE. (Case Sensitive) '
254255 self .store_validation_issue ('error' , 'format' , msg )
255256 return valid
256257
@@ -268,10 +269,17 @@ def validate_type_annotations(self):
268269 # string for error reporting
269270 if 'Unnamed' in t :
270271 invalid_types .append ('<empty value>' )
272+ # A duplicated metadata header name also causes a type annotation issue,
273+ # a side effect of Pandas adding a suffix to make the header unique.
274+ # These suffixed annotations should not be included in the invalid
275+ # type annotation count. Caveat: this exception may cause a miscount of
276+ # type annotation errors if a user-supplied annotation contains a period.
277+ elif '.' in t :
278+ pass
271279 else :
272280 invalid_types .append (t )
273281 if invalid_types :
274- msg = 'Error: TYPE declarations should be group or numeric'
282+ msg = 'TYPE row annotations should be " group" or " numeric" '
275283 self .store_validation_issue ('error' , 'format' , msg , invalid_types )
276284 else :
277285 valid = True
@@ -294,7 +302,7 @@ def validate_against_header_count(self):
294302 )
295303 if not len_headers == len_annot_type :
296304 msg = (
297- f'Error : { len_annot_type } TYPE declarations '
305+ f'Header mismatch : { len_annot_type } TYPE declarations '
298306 f'for { len_headers } column headers'
299307 )
300308 self .store_validation_issue ('error' , 'format' , msg )
@@ -305,10 +313,12 @@ def validate_against_header_count(self):
305313 def validate_format (self ):
306314 """Check all metadata file format criteria for file validity
307315 """
308- return (
309- self .validate_header_keyword ()
310- and self .validate_type_keyword ()
311- and self .validate_type_annotations ()
312- and self .validate_unique_header ()
313- and self .validate_against_header_count ()
316+ return all (
317+ [
318+ self .validate_header_keyword (),
319+ self .validate_type_keyword (),
320+ self .validate_type_annotations (),
321+ self .validate_unique_header (),
322+ self .validate_against_header_count (),
323+ ]
314324 )
0 commit comments