
Commit 8f55d65

Merge pull request #43 from broadinstitute/jlc_bq_output
generate json files for BigQuery testing [SCP-1922, SCP-1954]
2 parents 7addbe4 + 5848ec0 commit 8f55d65

File tree

7 files changed: +323 -123 lines changed

ingest/validation/validate_metadata.py

Lines changed: 112 additions & 117 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1,5 +1,6 @@
 google-cloud-firestore==1.4.0
 google-cloud-storage==1.16.1
+google-cloud-bigquery==1.21.0
 requests==2.22.0
 numpy==1.16.4
 scipy==1.3.0
convention_to_bq_schema.py (new file)

Lines changed: 108 additions & 0 deletions

"""Produce BigQuery schema JSON file from metadata convention tsv file

DESCRIPTION
This CLI takes a tsv metadata convention and creates a BigQuery schema JSON file.

EXAMPLE
$ python convention_to_bq_schema.py metadata_convention.tsv

"""

import argparse
import csv
import os
import json

REQUIRED_FIELDS = ['CellID', 'biosample_id', 'donor_id']


def create_parser():
    """
    Command line parser for convention_to_bq_schema

    Input: metadata convention tsv file
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('--output-path', '-p', help='Path for output file')
    parser.add_argument('input_convention', help='Metadata convention tsv file')
    return parser


def process_row_type(type_info):
    """Map a metadata convention type to its BigQuery column type"""
    type_map = {
        'integer': 'integer',
        'boolean': 'boolean',
        'string': 'string',
        'number': 'float',
    }
    # unrecognized types will raise AttributeError (get returns None)
    return type_map.get(type_info).upper()


def build_schema(input_convention):
    """
    Build schema as a list of per-column Python dictionaries
    """
    with open(input_convention) as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        schema = []
        for row in reader:
            entry = {}
            entry['name'] = row['attribute']
            entry['type'] = process_row_type(row['type'])
            # handle arrays of values; elif keeps REQUIRED_FIELDS from
            # being reset to NULLABLE
            if row['array']:
                entry['mode'] = 'REPEATED'
            elif row['attribute'] in REQUIRED_FIELDS:
                entry['mode'] = 'REQUIRED'
            else:
                entry['mode'] = 'NULLABLE'
            schema.append(entry)
    return schema


def add_scp_fields_to_schema(schema):
    """Append SCP-specific columns to the convention-derived schema"""
    scp_entry = {'name': 'scp_accession', 'type': 'string', 'mode': 'REQUIRED'}
    schema.append(scp_entry)
    return schema


def generate_output_name(inputname, path='', label='bq_schema'):
    """
    Build output filename from inputname
    """
    head, tail = os.path.split(inputname)
    name, suffix = os.path.splitext(tail)
    if label:
        labeled_name = '.'.join([name, label, 'json'])
    else:
        labeled_name = '.'.join([name, 'json'])
    if path:
        outputname = '/'.join([path, labeled_name])
    elif head:
        outputname = '/'.join([head, labeled_name])
    else:
        outputname = labeled_name
    return outputname


def write_schema(data, inputname, filepath=''):
    """
    Write BigQuery schema as json file
    """
    filename = generate_output_name(inputname, filepath)
    with open(filename, 'w') as jsonfile:
        json.dump(data, jsonfile, sort_keys=True, indent=4)


if __name__ == '__main__':
    args = create_parser().parse_args()
    input_convention = args.input_convention
    output_path = args.output_path
    schema = build_schema(input_convention)
    schema = add_scp_fields_to_schema(schema)
    write_schema(schema, input_convention, output_path)
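
For reference, a minimal sketch of the schema JSON this script would emit, assuming a hypothetical convention containing a required CellID string column and a number-array disease__time_since_onset column (the pairing is illustrative, not taken from the actual convention file). json.dump(..., sort_keys=True) orders each entry's keys alphabetically, and add_scp_fields_to_schema contributes the final entry:

[
    {
        "mode": "REQUIRED",
        "name": "CellID",
        "type": "STRING"
    },
    {
        "mode": "REPEATED",
        "name": "disease__time_since_onset",
        "type": "FLOAT"
    },
    {
        "mode": "REQUIRED",
        "name": "scp_accession",
        "type": "string"
    }
]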
upload_to_bq.py (new file)

Lines changed: 57 additions & 0 deletions

"""Upload NDJSON file to BigQuery

DESCRIPTION
This CLI takes a local NDJSON file and appends it to an existing BigQuery table.
(reference: https://cloud.google.com/bigquery/docs/loading-data-local#loading_data_from_a_local_data_source and
https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json#loading_json_data_into_a_new_table)

EXAMPLE
$ python upload_to_bq.py dataset_id table_id ../../tests/data/valid_arrays_v1.1.3_for_bq_v1.json

"""

import argparse
from google.cloud import bigquery

client = bigquery.Client()


def create_parser():
    """
    Command line parser for upload_to_bq

    Inputs: BigQuery dataset ID, table ID, and NDJSON file to upload
    """
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('dataset_id', help='<project_ID>:<dataset_id>')
    parser.add_argument('table_id', help='BigQuery table_id')
    parser.add_argument('input_json', help='NDJSON file for upload')
    return parser


if __name__ == '__main__':
    args = create_parser().parse_args()
    dataset_id = args.dataset_id
    table_id = args.table_id
    input_json = args.input_json
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.LoadJobConfig()
    # note: WRITE_APPEND is the default behavior, set explicitly for clarity
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    # API request
    with open(input_json, 'rb') as source_file:
        job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

    job.result()  # Waits for table load to complete.
    print("Job finished.")

    print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))

    destination_table = client.get_table(table_ref)
    print("{} rows in destination table.".format(destination_table.num_rows))

tests/data/issues_array_v1.1.2.json

Lines changed: 2 additions & 4 deletions

@@ -21,10 +21,8 @@
     "disease__treated: 'T' is not of type 'boolean'": [
         "BM01_16dpp_CTAAGCAGTGGT"
     ],
-    "disease__time_since_onset: '36|three|1' is not of type 'array'": [
-        "BM01_16dpp_CGGTAAACCATT"
-    ],
-    "disease__time_since_onset: 'zero' is not of type 'array'": [
+    "'disease__time_since_onset' is a dependency of 'disease__time_since_onset__unit'": [
+        "BM01_16dpp_CGGTAAACCATT",
         "BM01_16dpp_CCGAATTCACCG"
     ],
     "ethnicity: nan is not of type 'string'": [
Lines changed: 42 additions & 1 deletion

@@ -1 +1,42 @@
-{"error": {"type": {"organism_age: \"foo\" does not match expected type": ["BM01_16dpp_CGGTAAACCATT"]}, "convention": {"'ethnicity' is a dependency of 'ethnicity__ontology_label'": ["BM01_16dpp_AAGCAGTGGTAT", "BM01_16dpp_TAAGCAGTGGTA", "BM01_16dpp_CTAAGCAGTGGT", "BM01_16dpp_CGGTAAACCATT", "BM01_16dpp_AAGCAGTGGTAT"], "'sex' is a required property": ["BM01_16dpp_AAGCAGTGGTAT", "BM01_16dpp_TAAGCAGTGGTA", "BM01_16dpp_CTAAGCAGTGGT", "BM01_16dpp_CGGTAAACCATT", "BM01_16dpp_AAGCAGTGGTAT"], "is_living: nan is not one of ['yes', 'no', 'unknown']": ["BM01_16dpp_TAAGCAGTGGTA"], "is_living: nan is not of type 'string'": ["BM01_16dpp_TAAGCAGTGGTA"], "sample_type: 'direct from donr - fresh' is not one of ['cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - frozen', 'cultured primary cells']": ["BM01_16dpp_CTAAGCAGTGGT"], "organism_age: 'foo' is not of type 'number'": ["BM01_16dpp_CGGTAAACCATT"]}, "format": {"Duplicate CellID(s) in metadata file": ["BM01_16dpp_AAGCAGTGGTAT"]}}}
+{
+    "error": {
+        "type": {
+            "organism_age: \"foo\" does not match expected type": [
+                "BM01_16dpp_CGGTAAACCATT"
+            ]
+        },
+        "convention": {
+            "'ethnicity' is a dependency of 'ethnicity__ontology_label'": [
+                "BM01_16dpp_AAGCAGTGGTAT",
+                "BM01_16dpp_TAAGCAGTGGTA",
+                "BM01_16dpp_CTAAGCAGTGGT",
+                "BM01_16dpp_CGGTAAACCATT",
+                "BM01_16dpp_AAGCAGTGGTAT"
+            ],
+            "'sex' is a required property": [
+                "BM01_16dpp_AAGCAGTGGTAT",
+                "BM01_16dpp_TAAGCAGTGGTA",
+                "BM01_16dpp_CTAAGCAGTGGT",
+                "BM01_16dpp_CGGTAAACCATT",
+                "BM01_16dpp_AAGCAGTGGTAT"
+            ],
+            "is_living: nan is not one of ['yes', 'no', 'unknown']": [
+                "BM01_16dpp_TAAGCAGTGGTA"
+            ],
+            "is_living: nan is not of type 'string'": [
+                "BM01_16dpp_TAAGCAGTGGTA"
+            ],
+            "sample_type: 'direct from donr - fresh' is not one of ['cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - frozen', 'cultured primary cells']": [
+                "BM01_16dpp_CTAAGCAGTGGT"
+            ],
+            "'organism_age' is a dependency of 'organism_age__unit'": [
+                "BM01_16dpp_CGGTAAACCATT"
+            ]
+        },
+        "format": {
+            "Duplicate CellID(s) in metadata file": [
+                "BM01_16dpp_AAGCAGTGGTAT"
+            ]
+        }
+    }
+}

tests/test_validate_metadata.py

Lines changed: 1 addition & 1 deletion

@@ -84,7 +84,7 @@ def test_header_format(self):
 
     def test_convention_content(self):
         """Metadata convention should be valid jsonschema
-        """
+        """
 
         args = '../tests/data/AMC_invalid.json ../tests/data/valid_v1.1.1.tsv'
         metadata, convention = self.setup_metadata(args)

(whitespace-only change to the docstring's closing quotes)
