Commit 49abb2e

Merge pull request #85 from broadinstitute/ea-numeric-labels
Fix metadata of type "group" behaving as float post-ingest
2 parents: c06e835 + 7679297
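
For context on the fix below: group-type annotation values that look numeric can pick up a float representation during pandas parsing, so a label written as 1 surfaces as 1.0 after ingest. A minimal sketch of that failure mode, assuming an illustrative metadata file (the donor_id column and cell names are invented, not from this repo):

import io
import pandas as pd

# Two header rows (NAME/TYPE) followed by cells; one missing value in the group column.
tsv = "NAME\tdonor_id\nTYPE\tgroup\nCELL_0001\t1\nCELL_0002\t2\nCELL_0003\t\n"
df = pd.read_csv(io.StringIO(tsv), sep="\t", header=[0, 1])

# pandas infers the numeric-looking group column as float64 (the NaN forces the upcast),
# so the labels "1" and "2" come back as 1.0 and 2.0.
print(df[("donor_id", "group")].tolist())                        # [1.0, 2.0, nan]
print(df[("donor_id", "group")].dropna().astype(str).tolist())   # ['1.0', '2.0']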

5 files changed (+34 / -49 lines)

ingest/annotations.py

Lines changed: 6 additions & 3 deletions
@@ -8,9 +8,10 @@
 """
 
 import abc
+from collections import defaultdict
+
 import pandas as pd  # NOqa: F821
 from bson.objectid import ObjectId
-from collections import defaultdict
 
 try:
     # Used when importing internally and in tests
@@ -72,12 +73,14 @@ def merge_df(self, first_df, second_df):
 
     def preprocess(self):
         """Ensures that:
+        - Labels are treated as strings
         - Numeric columns are rounded to 3 decimals points
         - Group annotations are strings
        - 'NAME' in first header row is capitalized
        - 'TYPE' in second header row is capitalized
        """
-        headers = self.file.columns.get_level_values(0)
+        # Grab column names and convert to strings
+        headers = [str(header) for header in self.file.columns.get_level_values(0)]
         annot_types = self.file.columns.get_level_values(1)
         # Lowercase second level. Example: NUMeric -> numeric
         self.file.rename(
@@ -89,7 +92,7 @@ def preprocess(self):
         self.file.rename(columns={name: name.upper(), type: type.upper()}, inplace=True)
         # Make sure group annotations are treated as strings
         # only run this assignment if group annotations are present
-        if 'group' in list(annot_types):
+        if 'group' in annot_types:
             group_columns = self.file.xs(
                 "group", axis=1, level=1, drop_level=False
             ).columns.tolist()
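
The header list comprehension above is the heart of the change. A hedged sketch (column names invented, not from this repo) of why the str() cast matters when an annotation name itself was parsed as a number:

import pandas as pd

# An annotation literally named 2.5 ends up as a float in the header MultiIndex.
columns = pd.MultiIndex.from_tuples([("NAME", "TYPE"), (2.5, "group")])
df = pd.DataFrame([["CELL_0001", "A"]], columns=columns)

print(list(df.columns.get_level_values(0)))   # ['NAME', 2.5] -- mixed types

# The new preprocess() line normalizes every header to a string.
headers = [str(header) for header in df.columns.get_level_values(0)]
print(headers)                                # ['NAME', '2.5']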

ingest/ingest_files.py

Lines changed: 10 additions & 26 deletions
@@ -3,17 +3,18 @@
 DESCRIPTION
 Module provides extract capabilities for text, CSV, and TSV file types
 """
+import copy
 import csv
+import gzip
 import logging
 import mimetypes
 import os
 import re
-from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
 from dataclasses import dataclass
-import gzip
+from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
+
 import pandas as pd  # NOqa: F821
 from google.cloud import storage
-import copy
 
 # from google.cloud.logging.resource import Resource
 # import google.cloud.logging
@@ -65,7 +66,7 @@ def get_data_array(self):
         if len(self.values) > self.MAX_ENTRIES:
             values = self.values
             for idx, i in enumerate(range(0, len(self.values), self.MAX_ENTRIES)):
-                self.values = values[i : i + self.MAX_ENTRIES]
+                self.values = values[i: i + self.MAX_ENTRIES]
                 self.array_index = idx
                 yield copy.copy(self.__dict__)
         else:
@@ -75,7 +76,8 @@ def get_data_array(self):
 class IngestFiles:
     # General logger for class
     info_logger = setup_logger(__name__, "info.txt")
-    error_logger = setup_logger(__name__ + "_errors", "errors.txt", level=logging.ERROR)
+    error_logger = setup_logger(
+        __name__ + "_errors", "errors.txt", level=logging.ERROR)
 
     def __init__(self, file_path, allowed_file_types):
         self.file_path = file_path
@@ -270,30 +272,11 @@ def open_txt(self, open_file_object, **kwargs):
     def open_pandas(self, file_path, file_type, **kwargs):
         """Opens file as a dataframe """
         open_file_object = kwargs.pop('open_file_object')
-        if file_type == "text/tab-separated-values":
-            return pd.read_csv(
-                file_path,
-                sep="\t",
-                skipinitialspace=True,
-                quoting=csv.QUOTE_NONNUMERIC,
-                **kwargs,
-            )
-        elif file_type == "text/csv":
-            return pd.read_csv(
-                file_path,
-                sep=",",
-                quotechar='"',
-                quoting=csv.QUOTE_NONNUMERIC,
-                skipinitialspace=True,
-                escapechar='\\',
-                **kwargs,
-            )
-        elif file_type == 'text/plain':
+        if file_type in self.allowed_file_types:
             csv_dialect = csv.Sniffer().sniff(open_file_object.readline())
             csv_dialect.skipinitialspace = True
             open_file_object.seek(0)
             return pd.read_csv(file_path, dialect=csv_dialect, **kwargs)
-
         else:
             raise ValueError("File must be tab or comma delimited")
 
@@ -310,7 +293,8 @@ def open_csv(self, opened_file_object, **kwargs):
 
     def open_tsv(self, opened_file_object, **kwargs):
         """Opens tsv file"""
-        csv.register_dialect("tsvDialect", delimiter="\t", skipinitialspace=True)
+        csv.register_dialect("tsvDialect", delimiter="\t",
+                             skipinitialspace=True)
         return csv.reader(opened_file_object, dialect="tsvDialect")
 
     def extract_csv_or_tsv(self, file):
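
open_pandas() now leans on csv.Sniffer for every allowed file type instead of three hard-coded read_csv branches. A small sketch of that pattern under the same assumptions the method makes (a seekable handle whose first line reveals the delimiter); the sample content here is made up:

import csv
import io
import pandas as pd

sample = "NAME\tcluster\nTYPE\tgroup\nCELL_0001\tA\n"
handle = io.StringIO(sample)

csv_dialect = csv.Sniffer().sniff(handle.readline())  # detects "\t" as the delimiter
csv_dialect.skipinitialspace = True
handle.seek(0)                                         # rewind so pandas re-reads the header rows

df = pd.read_csv(handle, dialect=csv_dialect, header=[0, 1])
print(df.columns.tolist())                             # [('NAME', 'TYPE'), ('cluster', 'group')]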

ingest/ingest_pipeline.py

Lines changed: 10 additions & 16 deletions
@@ -33,25 +33,23 @@
 # Ingest mtx files
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name humans --matrix-file ../tests/data/matrix.mtx --matrix-file-type mtx --gene-file ../tests/data/genes.tsv --barcode-file ../tests/data/barcodes.tsv
 """
-from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
-from contextlib import nullcontext
-
-import sys
 import json
-import os
 import logging
+import os
 import re
-
-from pymongo import MongoClient, InsertOne
-from pymongo.errors import BulkWriteError
+import sys
+from contextlib import nullcontext
+from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
 
 # import google.cloud.logging
 from bson.objectid import ObjectId
 
 # For tracing
 from opencensus.ext.stackdriver.trace_exporter import StackdriverExporter
-from opencensus.trace.tracer import Tracer
 from opencensus.trace.samplers import AlwaysOnSampler
+from opencensus.trace.tracer import Tracer
+from pymongo import InsertOne, MongoClient
+from pymongo.errors import BulkWriteError
 
 # from google.cloud.logging.resource import Resource
 
@@ -230,13 +228,8 @@ def load(
                 self.insert_many('data_arrays', documents)
         except Exception as e:
             self.error_logger.error(e, extra=self.extra_log_params)
-            if hasattr(e, 'details') and e.details is not None:
+            if e.details is not None:
                 self.error_logger.error(e.details, extra=self.extra_log_params)
-            else:
-                self.error_logger.error(
-                    'Error loading data to MongoDB (no details available) - check MongoDB access',
-                    extra=self.extra_log_params,
-                )
             return 1
         return 0
 
@@ -286,7 +279,8 @@ def load_subsample(
             for model in set_data_array_fn(
                 (
                     key_value[0],  # NAMES, x, y, or z
-                    parent_data['name'],  # Cluster name provided from parent
+                    # Cluster name provided from parent
+                    parent_data['name'],
                     key_value[1],  # Subsampled data/values
                     ObjectId(self.study_file_id),
                     ObjectId(self.study_id),
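
The simplified error path above assumes the caught exception exposes .details, which holds for pymongo's bulk-write failures. A hedged sketch of where that attribute comes from (connection string and documents are illustrative only):

from pymongo import InsertOne, MongoClient
from pymongo.errors import BulkWriteError

client = MongoClient("mongodb://localhost:27017")
collection = client["test_db"]["data_arrays"]

# A duplicate _id makes the second insert fail and raises BulkWriteError.
requests = [InsertOne({"_id": 1}), InsertOne({"_id": 1})]
try:
    collection.bulk_write(requests, ordered=False)
except BulkWriteError as e:
    print(e.details["writeErrors"][0]["errmsg"])  # per-write error details, as logged by load()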

tests/data/test_1k_cluster_data.csv

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

tests/test_annotations.py

Lines changed: 7 additions & 3 deletions
@@ -4,6 +4,7 @@
 - Group type annotations that have numeric-like values are being treated as strings
 - Numeric columns are rounded to 3 decimals points
 - Filtering cell names (given from cluster file) in metadata correctly
+- Labels are treated as strings
 
 PREREQUISITES
 Spin up Python 3.6 virtualenv, install Python dependencies in requirements.txt
@@ -12,18 +13,18 @@
 differences in how the reference issues are serialized
 
 # Run all tests in a manner that shows report_issues output
-python3 test_ingest_files.py -s
+python3 test_annotations.py -s
 """
 
 
+import random
 import sys
 import unittest
 from decimal import Decimal
+
 import numpy as np
-import random
 
 sys.path.append("../ingest")
-
 from annotations import Annotations
 
 
@@ -54,6 +55,9 @@ def test_round(self):
     def test_group_annotations(self):
         self.df.preprocess()
         for column in self.df.file.columns:
+            # Ensure labels are strings
+            header = column[0]
+            assert isinstance(header, str)
             annot_type = column[1]
             if annot_type == 'group':
                 assert (
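
test_group_annotations now checks both halves of preprocess(): headers are strings, and group-typed values stay strings. A hedged sketch (frame invented for illustration) of the xs()-based selection the production code uses to find and stringify group columns:

import pandas as pd

columns = pd.MultiIndex.from_tuples(
    [("NAME", "TYPE"), ("donor_id", "group"), ("score", "numeric")]
)
df = pd.DataFrame([["CELL_0001", 1, 0.5]], columns=columns)

# Select only the group-typed columns, keeping the (name, type) level intact.
group_columns = df.xs("group", axis=1, level=1, drop_level=False).columns.tolist()
print(group_columns)                        # [('donor_id', 'group')]

df[group_columns] = df[group_columns].astype(str)
print(df[("donor_id", "group")].tolist())   # ['1']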
