Skip to content

Commit 4e9e9ca

Browse files
authored
Merge pull request #47 from broadinstitute/jlc_error2bucket
Write metadata error messages to workspace bucket [SCP-1969]
2 parents 4e5e55d + 1688688 commit 4e9e9ca

File tree

8 files changed

+101
-51
lines changed

8 files changed

+101
-51
lines changed

ingest/ingest_files.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def resolve_path(self, file_path):
6969
"""
7070
if self.is_remote_file:
7171
file_path = self.download_from_bucket(file_path)
72+
self.local_file_path = file_path
7273
# Remove BOM with encoding ='utf - 8 - sig'
7374
if self.is_gzip_file:
7475
open_file = gzip.open(file_path, 'rt', encoding='utf-8-sig')
@@ -106,11 +107,20 @@ def open_file(self, file_path, open_as=None, start_point: int = 0):
106107
if open_as is None:
107108
return file_type, file_connections.get(file_type), open_file
108109
else:
109-
return (
110-
file_type,
111-
file_connections.get("dataframe")(open_file, file_path),
112-
open_file,
113-
)
110+
if self.is_remote_file:
111+
return (
112+
file_type,
113+
file_connections.get("dataframe")(
114+
open_file, self.local_file_path
115+
),
116+
open_file,
117+
)
118+
else:
119+
return (
120+
file_type,
121+
file_connections.get("dataframe")(open_file, file_path),
122+
open_file,
123+
)
114124
else:
115125
raise ValueError(
116126
f"Unsupported file format. Allowed file types are: {' '.join(self.allowed_file_type)}"

ingest/ingest_pipeline.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
from dense import Dense
4747
from gene_data_model import Gene
4848
from google.api_core import exceptions
49+
from google.cloud import storage
4950
from google.cloud import firestore
5051
from mtx import Mtx
5152
from ingest_files import IngestFiles
@@ -226,12 +227,11 @@ def load_subsample(self, doc):
226227

227228
def has_valid_metadata_convention(self):
    """Determine whether the cell metadata file follows the metadata convention.

    Loads the JSON convention referenced by ``self.JSON_CONVENTION``, runs
    ``validate_input_metadata`` on ``self.cell_metadata`` against it, and
    then reports the collected issues via ``report_issues``.

    Returns:
        True when ``report_issues`` reports no errors, False otherwise.
    """
    json_file = IngestFiles(self.JSON_CONVENTION, ['application/json'])
    try:
        convention = json.load(json_file.file)
        validate_input_metadata(self.cell_metadata, convention)
    finally:
        # Close the convention file handle even when parsing or validation
        # raises, so the handle is not leaked on the failure path.
        json_file.file_handle.close()
    return not report_issues(self.cell_metadata)
236236

237237
def ingest_expression(self) -> None:
@@ -331,6 +331,17 @@ def create_cluster_subdoc(scope):
331331
return load_status
332332
return 0
333333

334+
def delocalize_error_file(self):
    """Upload the local validation error file to the Google bucket.

    The relative path 'scp_validation_errors.txt' is uploaded as
    'parse_logs/<file_id>/errors.txt' in the bucket identified by
    ``self.cell_metadata.bucket``, and a confirmation line is printed.
    """
    target_bucket = storage.Client().get_bucket(self.cell_metadata.bucket)
    remote_path = f'parse_logs/{self.file_id}/errors.txt'
    error_blob = target_bucket.blob(remote_path)
    local_error_file = 'scp_validation_errors.txt'
    error_blob.upload_from_filename(local_error_file)
    print(f'File {local_error_file} uploaded to {remote_path}.')
344+
334345

335346
def create_parser():
336347
"""Creates parser for input arguments.
@@ -531,6 +542,8 @@ def main() -> None:
531542
if all(i < 1 for i in status) or len(status) == 0:
532543
sys.exit(os.EX_OK)
533544
else:
545+
if status_cell_metadata > 0 and ingest.cell_metadata.is_remote_file:
546+
ingest.delocalize_error_file()
534547
sys.exit(os.EX_DATAERR)
535548

536549

ingest/validation/validate_metadata.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -398,32 +398,53 @@ def collect_jsonschema_errors(metadata, convention, bq_json=None):
398398
return False
399399

400400

401+
def record_issue(errfile, warnfile, issue_type, msg):
    """Print an issue to the console and write it to the matching issue file.

    Args:
        errfile: Writable text stream that receives 'error' messages.
        warnfile: Writable text stream that receives 'warn' messages.
        issue_type: 'error' or 'warn' selects the file and console color;
            any other value (e.g. None) is printed uncolored and written
            to neither file.
        msg: Message text without a trailing newline; a newline is appended
            when the message is written to a file.
    """
    if issue_type == 'error':
        errfile.write(msg + '\n')
        color = Fore.RED
    elif issue_type == 'warn':
        warnfile.write(msg + '\n')
        color = Fore.YELLOW
    else:
        # Informational messages are console-only and carry no color prefix.
        color = ''
    print(color + msg)
416+
417+
401418
def report_issues(metadata):
    """Report issues in CellMetadata.issues dictionary

    Each non-empty issue category is passed to record_issue, which prints it
    and writes errors to 'scp_validation_errors.txt' and warnings to
    'scp_validation_warnings.txt'. Both files are opened in 'w' mode, so any
    previous content is replaced on every call.

    returns True if errors are reported, False if no errors to report
    """
    logger.debug('Begin: report_issues')

    has_errors = False
    has_warnings = False
    # Context managers guarantee both issue files are flushed and closed even
    # if walking metadata.issues or recording an issue raises part-way through.
    with open('scp_validation_errors.txt', 'w') as error_file, open(
        'scp_validation_warnings.txt', 'w'
    ) as warn_file:
        for issue_type in sorted(metadata.issues.keys()):
            for issue_category, category_dict in metadata.issues[issue_type].items():
                if not category_dict:
                    continue
                category_header = f'\n*** {issue_category} {issue_type} list:'
                record_issue(error_file, warn_file, issue_type, category_header)
                if issue_type == 'error':
                    has_errors = True
                elif issue_type == 'warn':
                    has_warnings = True
                for issue_text, cells in category_dict.items():
                    if cells:
                        # Append how many cells are affected by this issue.
                        issue_msg = f'{issue_text} [ Error count: {len(cells)} ]'
                    else:
                        issue_msg = issue_text
                    record_issue(error_file, warn_file, issue_type, issue_msg)
        if not has_errors and not has_warnings:
            no_issues = 'No errors or warnings detected for input metadata file'
            # issue_type None: printed to the console only, written to no file.
            record_issue(error_file, warn_file, None, no_issues)
    return has_errors
428449

429450

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
NME cell_type cell_type__ontology_label organism_age disease disease__ontology_label species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label sex is_living organism_age__unit organism_age__unit_label is_living ethnicity__ontology_label ethnicity sample_type
2-
TPE group group numeric group group group group group group group group group group group group group group label number group
3-
BM01_16dpp_AAGCAGTGGTAT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh
4-
BM01_16dpp_TAAGCAGTGGTA CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh
5-
BM01_16dpp_CTAAGCAGTGGT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh
6-
BM01_16dpp_CGGTAAACCATT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh
7-
BM01_16dpp_CCGAATTCACCG CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh
1+
NME cell_type cell_type__ontology_label organism_age disease disease__ontology_label species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label sex is_living organism_age__unit organism_age__unit_label is_living ethnicity__ontology_label ethnicity sample_type donor_id biosample_id
2+
TPE group group numeric group group group group group group group group group group group group group group label number group group group
3+
BM01_16dpp_AAGCAGTGGTAT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
4+
BM01_16dpp_TAAGCAGTGGTA CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
5+
BM01_16dpp_CTAAGCAGTGGT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
6+
BM01_16dpp_CGGTAAACCATT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
7+
BM01_16dpp_CCGAATTCACCG CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year yes European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
NAME cell_type cell_type__ontology_label organism_age disease disease__ontology_label species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label is_living organism_age__unit organism_age__unit_label ethnicity__ontology_label sample_type
2-
TYPE group group numeric group group group group group group group group group group group group group group group
3-
BM01_16dpp_AAGCAGTGGTAT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donor - fresh
4-
BM01_16dpp_TAAGCAGTGGTA CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk UO_0000036 year European direct from donor - fresh
5-
BM01_16dpp_CTAAGCAGTGGT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donr - fresh
6-
BM01_16dpp_CGGTAAACCATT CL_0000066 epithelial cell foo MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donor - fresh
7-
BM01_16dpp_AAGCAGTGGTAT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donor - fresh
1+
NAME cell_type cell_type__ontology_label organism_age disease disease__ontology_label species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label is_living organism_age__unit organism_age__unit_label ethnicity__ontology_label sample_type donor_id biosample_id
2+
TYPE group group numeric group group group group group group group group group group group group group group group group group
3+
BM01_16dpp_AAGCAGTGGTAT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donor - fresh BM01 BM01_16dpp_r3
4+
BM01_16dpp_TAAGCAGTGGTA CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk UO_0000036 year European direct from donor - fresh BM01 BM01_16dpp_r3
5+
BM01_16dpp_CTAAGCAGTGGT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donr - fresh BM01 BM01_16dpp_r3
6+
BM01_16dpp_CGGTAAACCATT CL_0000066 epithelial cell foo MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donor - fresh BM01 BM01_16dpp_r3
7+
BM01_16dpp_AAGCAGTGGTAT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk yes UO_0000036 year European direct from donor - fresh BM01 BM01_16dpp_r3
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
NAME cell_type cell_type__ontology_label organism_age disease disease__ontology_label species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label sex is_living organism_age__unit organism_age__unit_label ethnicity__ontology_label ethnicity sample_type
2-
TYPE group group numeric group group group group group group group group group group group group group group group group group
3-
BM01_16dpp_AAGCAGTGGTAT epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens EFO0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh
4-
BM01_16dpp_TAAGCAGTGGTA CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens NCBITaxon_9606 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh
5-
BM01_16dpp_CTAAGCAGTGGT CELL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh
6-
BM01_16dpp_CGGTAAACCATT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 homo sapien GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh
7-
BM01_16dpp_CCGAATTCACCG CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_1001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh
1+
NAME cell_type cell_type__ontology_label organism_age disease disease__ontology_label species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label sex is_living organism_age__unit organism_age__unit_label ethnicity__ontology_label ethnicity sample_type donor_id biosample_id
2+
TYPE group group numeric group group group group group group group group group group group group group group group group group group group
3+
BM01_16dpp_AAGCAGTGGTAT epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens EFO0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
4+
BM01_16dpp_TAAGCAGTGGTA CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens NCBITaxon_9606 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
5+
BM01_16dpp_CTAAGCAGTGGT CELL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
6+
BM01_16dpp_CGGTAAACCATT CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 homo sapien GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3
7+
BM01_16dpp_CCGAATTCACCG CL_0000066 epithelial cell 31 MONDO_0000001 disease or disorder NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_1001913 milk female yes UO_0000036 year European HANCESTRO_0005 direct from donor - fresh BM01 BM01_16dpp_r3

0 commit comments

Comments
 (0)