
Commit 8f55d65

Merge pull request #43 from broadinstitute/jlc_bq_output
generate json files for BigQuery testing [SCP-1922, SCP-1954]
2 parents 7addbe4 + 5848ec0 commit 8f55d65

File tree

7 files changed: +323 -123 lines changed

ingest/validation/validate_metadata.py

Lines changed: 112 additions & 117 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1,5 +1,6 @@
 google-cloud-firestore==1.4.0
 google-cloud-storage==1.16.1
+google-cloud-bigquery==1.21.0
 requests==2.22.0
 numpy==1.16.4
 scipy==1.3.0
convention_to_bq_schema.py (new file)

Lines changed: 108 additions & 0 deletions

"""Produce BigQuery schema JSON file from metadata convention tsv file

DESCRIPTION
This CLI takes a tsv metadata convention and creates a BigQuery schema JSON file.

EXAMPLE
$ python convention_to_bq_schema.py metadata_convention.tsv

"""

import argparse
import csv
import os
import json

REQUIRED_FIELDS = ['CellID', 'biosample_id', 'donor_id']


def create_parser():
    """
    Command line parser for convention_to_bq_schema

    Input: metadata convention tsv file
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('--output-path', '-p', help='Path for output file')
    parser.add_argument('input_convention', help='Metadata convention tsv file')
    return parser


def process_row_type(type_info):
    """Map a metadata convention type to its BigQuery column type"""
    type_map = {
        'integer': 'integer',
        'boolean': 'boolean',
        'string': 'string',
        'number': 'float',
    }
    # unrecognized types will raise AttributeError (get returns None)
    return type_map.get(type_info).upper()


def build_schema(input_convention):
    """
    Build schema as a list of per-column Python dictionaries
    """
    with open(input_convention) as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        schema = []
        for row in reader:
            entry = {}
            entry['name'] = row['attribute']
            entry['type'] = process_row_type(row['type'])
            # handle arrays of values; elif keeps REQUIRED_FIELDS from
            # being reset to NULLABLE
            if row['array']:
                entry['mode'] = 'REPEATED'
            elif row['attribute'] in REQUIRED_FIELDS:
                entry['mode'] = 'REQUIRED'
            else:
                entry['mode'] = 'NULLABLE'
            schema.append(entry)
    return schema


def add_scp_fields_to_schema(schema):
    """Append SCP-specific columns to the convention-derived schema"""
    scp_entry = {'name': 'scp_accession', 'type': 'string', 'mode': 'REQUIRED'}
    schema.append(scp_entry)
    return schema


def generate_output_name(inputname, path='', label='bq_schema'):
    """
    Build output filename from inputname
    """
    head, tail = os.path.split(inputname)
    name, suffix = os.path.splitext(tail)
    if label:
        labeled_name = '.'.join([name, label, 'json'])
    else:
        labeled_name = '.'.join([name, 'json'])
    if path:
        outputname = '/'.join([path, labeled_name])
    elif head:
        outputname = '/'.join([head, labeled_name])
    else:
        outputname = labeled_name
    return outputname


def write_schema(data, inputname, filepath=''):
    """
    Write BigQuery schema as json file
    """
    filename = generate_output_name(inputname, filepath)
    with open(filename, 'w') as jsonfile:
        json.dump(data, jsonfile, sort_keys=True, indent=4)


if __name__ == '__main__':
    args = create_parser().parse_args()
    input_convention = args.input_convention
    output_path = args.output_path
    schema = build_schema(input_convention)
    schema = add_scp_fields_to_schema(schema)
    write_schema(schema, input_convention, output_path)
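
For reference, a minimal sketch of the schema JSON this script would emit, assuming a hypothetical convention containing a required CellID string column and a number-array disease__time_since_onset column (the pairing is illustrative, not taken from the actual convention file). json.dump(..., sort_keys=True) orders each entry's keys alphabetically, and add_scp_fields_to_schema contributes the final entry:

[
    {
        "mode": "REQUIRED",
        "name": "CellID",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "disease__time_since_onset",
        "type": "FLOAT"
    },
    {
        "mode": "REQUIRED",
        "name": "scp_accession",
        "type": "string"
    }
]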
upload_to_bq.py (new file)

Lines changed: 57 additions & 0 deletions

"""Upload NDJSON file to BigQuery

DESCRIPTION
This CLI takes a local NDJSON file and appends it to an existing BigQuery table.
(reference: https://cloud.google.com/bigquery/docs/loading-data-local#loading_data_from_a_local_data_source and
https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json#loading_json_data_into_a_new_table)

EXAMPLE
$ python upload_to_bq.py dataset_id table_id ../../tests/data/valid_arrays_v1.1.3_for_bq_v1.json

"""

import argparse
from google.cloud import bigquery

client = bigquery.Client()


def create_parser():
    """
    Command line parser for upload_to_bq

    Inputs: BigQuery dataset ID, table ID, and NDJSON file to upload
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('dataset_id', help='<project_ID>:<dataset_id>')
    parser.add_argument('table_id', help='BigQuery table_id')
    parser.add_argument('input_json', help='NDJSON file for upload')
    return parser


if __name__ == '__main__':
    args = create_parser().parse_args()
    dataset_id = args.dataset_id
    table_id = args.table_id
    input_json = args.input_json
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    # note: WRITE_APPEND is the default behavior, set explicitly for clarity
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    # API request
    with open(input_json, 'rb') as source_file:
        job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

    job.result()  # Waits for table load to complete.
    print("Job finished.")

    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))

    destination_table = client.get_table(table_ref)
    print("{} rows in destination table.".format(destination_table.num_rows))

tests/data/issues_array_v1.1.2.json

Lines changed: 2 additions & 4 deletions

@@ -21,10 +21,8 @@
     "disease__treated: 'T' is not of type 'boolean'": [
         "BM01_16dpp_CTAAGCAGTGGT"
     ],
-    "disease__time_since_onset: '36|three|1' is not of type 'array'": [
-        "BM01_16dpp_CGGTAAACCATT"
-    ],
-    "disease__time_since_onset: 'zero' is not of type 'array'": [
+    "'disease__time_since_onset' is a dependency of 'disease__time_since_onset__unit'": [
+        "BM01_16dpp_CGGTAAACCATT",
         "BM01_16dpp_CCGAATTCACCG"
     ],
     "ethnicity: nan is not of type 'string'": [
Lines changed: 42 additions & 1 deletion

@@ -1 +1,42 @@
-{"error": {"type": {"organism_age: \"foo\" does not match expected type": ["BM01_16dpp_CGGTAAACCATT"]}, "convention": {"'ethnicity' is a dependency of 'ethnicity__ontology_label'": ["BM01_16dpp_AAGCAGTGGTAT", "BM01_16dpp_TAAGCAGTGGTA", "BM01_16dpp_CTAAGCAGTGGT", "BM01_16dpp_CGGTAAACCATT", "BM01_16dpp_AAGCAGTGGTAT"], "'sex' is a required property": ["BM01_16dpp_AAGCAGTGGTAT", "BM01_16dpp_TAAGCAGTGGTA", "BM01_16dpp_CTAAGCAGTGGT", "BM01_16dpp_CGGTAAACCATT", "BM01_16dpp_AAGCAGTGGTAT"], "is_living: nan is not one of ['yes', 'no', 'unknown']": ["BM01_16dpp_TAAGCAGTGGTA"], "is_living: nan is not of type 'string'": ["BM01_16dpp_TAAGCAGTGGTA"], "sample_type: 'direct from donr - fresh' is not one of ['cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - frozen', 'cultured primary cells']": ["BM01_16dpp_CTAAGCAGTGGT"], "organism_age: 'foo' is not of type 'number'": ["BM01_16dpp_CGGTAAACCATT"]}, "format": {"Duplicate CellID(s) in metadata file": ["BM01_16dpp_AAGCAGTGGTAT"]}}}
+{
+    "error": {
+        "type": {
+            "organism_age: \"foo\" does not match expected type": [
+                "BM01_16dpp_CGGTAAACCATT"
+            ]
+        },
+        "convention": {
+            "'ethnicity' is a dependency of 'ethnicity__ontology_label'": [
+                "BM01_16dpp_AAGCAGTGGTAT",
+                "BM01_16dpp_TAAGCAGTGGTA",
+                "BM01_16dpp_CTAAGCAGTGGT",
+                "BM01_16dpp_CGGTAAACCATT",
+                "BM01_16dpp_AAGCAGTGGTAT"
+            ],
+            "'sex' is a required property": [
+                "BM01_16dpp_AAGCAGTGGTAT",
+                "BM01_16dpp_TAAGCAGTGGTA",
+                "BM01_16dpp_CTAAGCAGTGGT",
+                "BM01_16dpp_CGGTAAACCATT",
+                "BM01_16dpp_AAGCAGTGGTAT"
+            ],
+            "is_living: nan is not one of ['yes', 'no', 'unknown']": [
+                "BM01_16dpp_TAAGCAGTGGTA"
+            ],
+            "is_living: nan is not of type 'string'": [
+                "BM01_16dpp_TAAGCAGTGGTA"
+            ],
+            "sample_type: 'direct from donr - fresh' is not one of ['cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - frozen', 'cultured primary cells']": [
+                "BM01_16dpp_CTAAGCAGTGGT"
+            ],
+            "'organism_age' is a dependency of 'organism_age__unit'": [
+                "BM01_16dpp_CGGTAAACCATT"
+            ]
+        },
+        "format": {
+            "Duplicate CellID(s) in metadata file": [
+                "BM01_16dpp_AAGCAGTGGTAT"
+            ]
+        }
+    }
+}

tests/test_validate_metadata.py

Lines changed: 1 addition & 1 deletion

@@ -84,7 +84,7 @@ def test_header_format(self):
 
     def test_convention_content(self):
         """Metadata convention should be valid jsonschema
-        """
+        """
 
         args = '../tests/data/AMC_invalid.json ../tests/data/valid_v1.1.1.tsv'
         metadata, convention = self.setup_metadata(args)

(whitespace-only change to the docstring's closing quotes)
