Skip to content

Commit eac7584

Browse files
Merge pull request #39 from broadinstitute/ea-metadata-convention-access
Store metadata convention for Ingest Pipeline to access
2 parents 0692a46 + 0dba75b commit eac7584

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+3982
-4406
lines changed

.circleci/config.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
#
55
version: 2.1
66

7+
orbs:
8+
codecov: codecov/[email protected]
9+
710
jobs:
811
build:
912
docker:
@@ -61,9 +64,7 @@ jobs:
6164
export FIRESTORE_EMULATOR_HOST=localhost:8081
6265
. venv/bin/activate
6366
cd tests
64-
coverage run -m pytest
65-
coverage report --include *scp-ingest-pipeline/ingest*
66-
coverage html --include *scp-ingest-pipeline/ingest*
67+
pytest --cov-report=xml --cov=../ingest/
6768
68-
- store_artifacts:
69-
path: tests/htmlcov
69+
- codecov/upload:
70+
file: tests/coverage.xml

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
File Ingest Pipeline for Single Cell Portal
33

44
[![Build status](https://img.shields.io/circleci/build/github/broadinstitute/scp-ingest-pipeline.svg)](https://circleci.com/gh/broadinstitute/scp-ingest-pipeline)
5+
[![Code coverage](https://codecov.io/gh/broadinstitute/scp-ingest-pipeline/branch/master/graph/badge.svg)](https://codecov.io/gh/broadinstitute/scp-ingest-pipeline)
56

67
The SCP Ingest Pipeline is an ETL pipeline for single-cell RNA-seq data.
78

ingest/cell_metadata.py

Lines changed: 61 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ def __init__(self, values: List, cell_names: List):
4646

4747

4848
class CellMetadata(IngestFiles):
49-
ALLOWED_FILE_TYPES = ["text/csv", "text/plain", "text/tab-separated-values"]
49+
ALLOWED_FILE_TYPES = ['text/csv', 'text/plain', 'text/tab-separated-values']
5050

5151
def __init__(self, file_path, file_id: str, study_accession: str, *args, **kwargs):
5252

5353
IngestFiles.__init__(
54-
self, file_path, self.ALLOWED_FILE_TYPES, open_as="dataframe"
54+
self, file_path, self.ALLOWED_FILE_TYPES, open_as='dataframe'
5555
)
5656
self.headers = self.file.columns.get_level_values(0)
5757
self.annot_types = self.file.columns.get_level_values(1)
@@ -61,14 +61,13 @@ def __init__(self, file_path, file_id: str, study_accession: str, *args, **kwarg
6161
# lambda below initializes new key with nested dictionary as value and avoids KeyError
6262
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
6363
self.ontology = defaultdict(lambda: defaultdict(list))
64-
self.type = defaultdict(list)
6564
self.cells = []
6665
self.is_valid_file = self.validate_format()
6766

6867
@dataclass
6968
class Model:
70-
COLLECTION_NAME = "cell_metadata"
71-
SUBCOLLECTION_NAME = "data"
69+
COLLECTION_NAME = 'cell_metadata'
70+
SUBCOLLECTION_NAME = 'data'
7271
annot_type: str
7372
doc: Document
7473
subdoc: SubDocument
@@ -84,12 +83,12 @@ def preproccess(self):
8483
self.file.rename(columns={name: name.upper(), type: type.upper()}, inplace=True)
8584
# Make sure group annotations are treated as strings
8685
group_columns = self.file.xs(
87-
"group", axis=1, level=1, drop_level=False
86+
'group', axis=1, level=1, drop_level=False
8887
).columns.tolist()
8988
self.file[group_columns] = self.file[group_columns].astype(str)
9089
# Find numeric columns, round to 3 decimals places, and cast to floats
9190
numeric_columns = self.file.xs(
92-
"numeric", axis=1, level=1, drop_level=False
91+
'numeric', axis=1, level=1, drop_level=False
9392
).columns.tolist()
9493
self.file[numeric_columns] = self.file[numeric_columns].round(3).astype(float)
9594

@@ -102,18 +101,18 @@ def transform(self):
102101
yield self.Model(
103102
column_type,
104103
{
105-
"name": col_name,
106-
"study_accession": self.study_accession,
104+
'name': col_name,
105+
'study_accession': self.study_accession,
107106
# save unique values for group type annotations
108-
"unique_values": list(self.file[column].unique())
109-
if column_type == "group"
107+
'unique_values': list(self.file[column].unique())
108+
if column_type == 'group'
110109
else [],
111-
"annotation_type": column_type,
112-
"file_id": self.file_id,
110+
'annotation_type': column_type,
111+
'file_id': self.file_id,
113112
},
114113
{
115-
"cell_names": list(self.file.iloc[:, 0]),
116-
"values": list(self.file[column]),
114+
'cell_names': list(self.file.iloc[:, 0]),
115+
'values': list(self.file[column]),
117116
},
118117
)
119118

@@ -132,8 +131,8 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
132131
Subdocuments that are under 1,048,576 bytes.
133132
"""
134133

135-
size_of_cell_names_field = 10 + 1 # "cell_names" is 10 characters
136-
size_of_values_field = 6 + 1 # "values" is 6 characters
134+
size_of_cell_names_field = 10 + 1 # 'cell_names' is 10 characters
135+
size_of_values_field = 6 + 1 # 'values' is 6 characters
137136
starting_sum = (
138137
+len(doc_name)
139138
+ 1
@@ -149,17 +148,17 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
149148
sum = starting_sum
150149
annot_type = model.annot_type
151150
# All cells names:[] that are in subdoc
152-
cell_names = model.subdoc["cell_names"]
151+
cell_names = model.subdoc['cell_names']
153152
# All values:[] that are in subdoc
154-
values = model.subdoc["values"]
153+
values = model.subdoc['values']
155154

156155
for index, (cell_name, value) in enumerate(zip(cell_names, values)):
157156

158157
cell_name_storage = len(cell_name) + 1 + size_of_cell_names_field
159158

160159
# Check annotation type because float and string values have
161160
# different storage values
162-
if annot_type == "numeric":
161+
if annot_type == 'numeric':
163162
value_storage = size_of_values_field + float_storage
164163
else:
165164
value_storage = len(value) + 1 + size_of_values_field
@@ -175,10 +174,10 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
175174
end_index = index - 1
176175
# TODO: This can turn into a logging statement
177176
# Please do not remove this. It's needed for testing
178-
print(f"{sum}, {index}, {start_index}, {end_index}")
177+
print(f'{sum}, {index}, {start_index}, {end_index}')
179178
yield {
180-
"cell_names": cell_names[start_index:end_index],
181-
"values": values[start_index:end_index],
179+
'cell_names': cell_names[start_index:end_index],
180+
'values': values[start_index:end_index],
182181
}
183182
# Reset sum and add storage size at current index
184183
sum = starting_sum + cell_name_storage + value_storage
@@ -206,30 +205,37 @@ def validate_header_keyword(self):
206205
"""
207206

208207
valid = False
209-
if self.headers[0].upper() == "NAME":
208+
if self.headers[0].upper() == 'NAME':
210209
valid = True
211-
if self.headers[0] != "NAME":
212-
# ToDO - capture warning below in error report
213-
msg = (
214-
f'Warning: metadata file keyword "NAME" provided as '
215-
f"{self.headers[0]}"
216-
)
210+
if self.headers[0] != 'NAME':
211+
msg = f'Metadata file keyword "NAME" provided as ' f"{self.headers[0]}"
217212
self.store_validation_issue('warn', 'format', msg)
218213
else:
219-
msg = 'Error: Metadata file header row malformed, missing NAME. (Case Sensitive)'
214+
msg = 'Malformed metadata file header row, missing NAME. (Case Sensitive)'
220215
self.store_validation_issue('error', 'format', msg)
221216
return valid
222217

223218
def validate_unique_header(self):
224-
"""Check all metadata header names are unique.
219+
"""Check all metadata header names are unique and not empty.
225220
:return: boolean True if valid, False otherwise
226221
"""
227222
valid = False
228223
unique_headers = set(self.headers)
229224
if len(unique_headers) == len(self.headers):
230225
valid = True
231-
if any("Unnamed" in s for s in list(unique_headers)):
232-
msg = "Error: Headers cannot contain empty values"
226+
else:
227+
seen_headers = set()
228+
duplicate_headers = set()
229+
for x in self.headers:
230+
if x in seen_headers or seen_headers.add(x):
231+
duplicate_headers.add(x)
232+
msg = (
233+
f'Duplicated metadata header names are not allowed: {duplicate_headers}'
234+
)
235+
self.store_validation_issue('error', 'format', msg)
236+
valid = False
237+
if any('Unnamed' in s for s in list(unique_headers)):
238+
msg = 'Headers cannot contain empty values'
233239
self.store_validation_issue('error', 'format', msg)
234240
valid = False
235241
return valid
@@ -239,18 +245,13 @@ def validate_type_keyword(self):
239245
:return: boolean True if valid, False otherwise
240246
"""
241247
valid = False
242-
if self.annot_types[0].upper() == "TYPE":
248+
if self.annot_types[0].upper() == 'TYPE':
243249
valid = True
244-
if self.annot_types[0] != "TYPE":
245-
# ToDO - capture warning below in issue report
246-
# investigate f-string formatting here
247-
msg = (
248-
'Warning: Metadata file keyword TYPE provided as '
249-
'{self.metadata_types[0]}'
250-
)
250+
if self.annot_types[0] != 'TYPE':
251+
msg = f'Metadata file keyword "TYPE" provided as {self.annot_types[0]}'
251252
self.store_validation_issue('warn', 'format', msg)
252253
else:
253-
msg = 'Error: Metadata file TYPE row malformed, missing TYPE'
254+
msg = 'Malformed metadata TYPE row, missing TYPE. (Case Sensitive)'
254255
self.store_validation_issue('error', 'format', msg)
255256
return valid
256257

@@ -268,10 +269,17 @@ def validate_type_annotations(self):
268269
# string for error reporting
269270
if 'Unnamed' in t:
270271
invalid_types.append('<empty value>')
272+
# Duplicated metadata header name causes type annotation issue.
273+
# Side effect of Pandas adding a suffix to uniquefy the header.
274+
# These invalid annotations should not be included in invalid
275+
# type annotation count. This exception may cause miscount of
276+
# type annot errors if user-supplied annotation has period.
277+
elif '.' in t:
278+
pass
271279
else:
272280
invalid_types.append(t)
273281
if invalid_types:
274-
msg = 'Error: TYPE declarations should be group or numeric'
282+
msg = 'TYPE row annotations should be "group" or "numeric"'
275283
self.store_validation_issue('error', 'format', msg, invalid_types)
276284
else:
277285
valid = True
@@ -294,7 +302,7 @@ def validate_against_header_count(self):
294302
)
295303
if not len_headers == len_annot_type:
296304
msg = (
297-
f'Error: {len_annot_type} TYPE declarations '
305+
f'Header mismatch: {len_annot_type} TYPE declarations '
298306
f'for {len_headers} column headers'
299307
)
300308
self.store_validation_issue('error', 'format', msg)
@@ -305,10 +313,12 @@ def validate_against_header_count(self):
305313
def validate_format(self):
306314
"""Check all metadata file format criteria for file validity
307315
"""
308-
return (
309-
self.validate_header_keyword()
310-
and self.validate_type_keyword()
311-
and self.validate_type_annotations()
312-
and self.validate_unique_header()
313-
and self.validate_against_header_count()
316+
return all(
317+
[
318+
self.validate_header_keyword(),
319+
self.validate_type_keyword(),
320+
self.validate_type_annotations(),
321+
self.validate_unique_header(),
322+
self.validate_against_header_count(),
323+
]
314324
)

ingest/ingest_files.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0):
8888
file_connections = {
8989
"text/csv": self.open_csv(open_file),
9090
"text/plain": open_file,
91+
"application/json": open_file,
9192
"text/tab-separated-values": self.open_tsv(open_file),
9293
"dataframe": self.open_pandas,
9394
}

ingest/ingest_pipeline.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/metadata_valid.tsv --ingest-cell-metadata
2020
2121
# Ingest Cell Metadata file against convention
22-
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/metadata_valid.tsv --ingest-cell-metadata --validate-convention
22+
!! Please note that you must have permission to the SCP bucket
23+
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/valid_array_v1.1.3.tsv --ingest-cell-metadata --validate-convention
2324
2425
# Ingest dense file
2526
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_100k_cells.txt --matrix-file-type dense
@@ -47,21 +48,18 @@
4748
from google.api_core import exceptions
4849
from google.cloud import firestore
4950
from mtx import Mtx
51+
from ingest_files import IngestFiles
5052
from subsample import SubSample
5153
from loom import Loom
52-
from validation.validate_metadata import (
53-
collect_jsonschema_errors,
54-
validate_collected_ontology_data,
55-
report_issues,
56-
)
54+
from validation.validate_metadata import validate_input_metadata, report_issues
5755

5856
# Ingest file types
5957
EXPRESSION_FILE_TYPES = ["dense", "mtx", "loom"]
6058

6159

6260
class IngestPipeline(object):
6361
# File location for metadata json convention
64-
JSON_CONVENTION = 'DoNotTouch/AMC_v0.8.json'
62+
JSON_CONVENTION = 'gs://fc-bcc55e6c-bec3-4b2e-9fb2-5e1526ddfcd2/metadata_conventions/AMC_v1.1.3/AMC_v1.1.3.json'
6563

6664
def __init__(
6765
self,
@@ -233,10 +231,11 @@ def load_subsample(self, doc):
233231
def has_valid_metadata_convention(self):
234232
""" Determines if cell metadata file follows metadata convention"""
235233
with open(self.JSON_CONVENTION, 'r') as f:
236-
convention = json.load(f)
234+
json_file = IngestFiles(self.JSON_CONVENTION, ['application/json'])
235+
convention = json.load(json_file.file)
236+
validate_input_metadata(self.cell_metadata, convention)
237237

238-
collect_jsonschema_errors(self.cell_metadata, convention)
239-
validate_collected_ontology_data(self.cell_metadata, convention)
238+
f.close()
240239
return not report_issues(self.cell_metadata)
241240

242241
def ingest_expression(self) -> None:
@@ -278,6 +277,7 @@ def ingest_cell_metadata(self):
278277
if self.kwargs['validate_convention'] is not None:
279278
if self.kwargs['validate_convention']:
280279
if self.has_valid_metadata_convention():
280+
print("it works!")
281281
pass
282282
else:
283283
return 1

0 commit comments

Comments
 (0)