Commit 49abb2e

Merge pull request #85 from broadinstitute/ea-numeric-labels
Fix metadata of type "group" behaving as float post-ingest
2 parents: c06e835 + 7679297
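
For context on the fix below: group-type annotation values that look numeric can pick up a float representation during pandas parsing, so a label written as 1 surfaces as 1.0 after ingest. A minimal sketch of that failure mode, assuming an illustrative metadata file (the donor_id column and cell names are invented, not from this repo):

import io
import pandas as pd

# Two header rows (NAME/TYPE) followed by cells; one missing value in the group column.
tsv = "NAME\tdonor_id\nTYPE\tgroup\nCELL_0001\t1\nCELL_0002\t2\nCELL_0003\t\n"
df = pd.read_csv(io.StringIO(tsv), sep="\t", header=[0, 1])

# pandas infers the numeric-looking group column as float64 (the NaN forces the upcast),
# so the labels "1" and "2" come back as 1.0 and 2.0.
print(df[("donor_id", "group")].tolist())                        # [1.0, 2.0, nan]
print(df[("donor_id", "group")].dropna().astype(str).tolist())   # ['1.0', '2.0']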

5 files changed (+34 / -49 lines)

ingest/annotations.py

Lines changed: 6 additions & 3 deletions
@@ -8,9 +8,10 @@
 """
 
 import abc
+from collections import defaultdict
+
 import pandas as pd  # NOqa: F821
 from bson.objectid import ObjectId
-from collections import defaultdict
 
 try:
     # Used when importing internally and in tests
@@ -72,12 +73,14 @@ def merge_df(self, first_df, second_df):
 
     def preprocess(self):
         """Ensures that:
+        - Labels are treated as strings
         - Numeric columns are rounded to 3 decimals points
         - Group annotations are strings
        - 'NAME' in first header row is capitalized
        - 'TYPE' in second header row is capitalized
        """
-        headers = self.file.columns.get_level_values(0)
+        # Grab column names and convert to strings
+        headers = [str(header) for header in self.file.columns.get_level_values(0)]
         annot_types = self.file.columns.get_level_values(1)
         # Lowercase second level. Example: NUMeric -> numeric
         self.file.rename(
@@ -89,7 +92,7 @@ def preprocess(self):
         self.file.rename(columns={name: name.upper(), type: type.upper()}, inplace=True)
         # Make sure group annotations are treated as strings
         # only run this assignment if group annotations are present
-        if 'group' in list(annot_types):
+        if 'group' in annot_types:
             group_columns = self.file.xs(
                 "group", axis=1, level=1, drop_level=False
             ).columns.tolist()
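
The header list comprehension above is the heart of the change. A hedged sketch (column names invented, not from this repo) of why the str() cast matters when an annotation name itself was parsed as a number:

import pandas as pd

# An annotation literally named 2.5 ends up as a float in the header MultiIndex.
columns = pd.MultiIndex.from_tuples([("NAME", "TYPE"), (2.5, "group")])
df = pd.DataFrame([["CELL_0001", "A"]], columns=columns)

print(list(df.columns.get_level_values(0)))   # ['NAME', 2.5] -- mixed types

# The new preprocess() line normalizes every header to a string.
headers = [str(header) for header in df.columns.get_level_values(0)]
print(headers)                                # ['NAME', '2.5']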

ingest/ingest_files.py

Lines changed: 10 additions & 26 deletions
@@ -3,17 +3,18 @@
 DESCRIPTION
 Module provides extract capabilities for text, CSV, and TSV file types
 """
+import copy
 import csv
+import gzip
 import logging
 import mimetypes
 import os
 import re
-from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
 from dataclasses import dataclass
-import gzip
+from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
+
 import pandas as pd  # NOqa: F821
 from google.cloud import storage
-import copy
 
 # from google.cloud.logging.resource import Resource
 # import google.cloud.logging
@@ -65,7 +66,7 @@ def get_data_array(self):
         if len(self.values) > self.MAX_ENTRIES:
             values = self.values
             for idx, i in enumerate(range(0, len(self.values), self.MAX_ENTRIES)):
-                self.values = values[i : i + self.MAX_ENTRIES]
+                self.values = values[i: i + self.MAX_ENTRIES]
                 self.array_index = idx
                 yield copy.copy(self.__dict__)
         else:
@@ -75,7 +76,8 @@ def get_data_array(self):
 class IngestFiles:
     # General logger for class
     info_logger = setup_logger(__name__, "info.txt")
-    error_logger = setup_logger(__name__ + "_errors", "errors.txt", level=logging.ERROR)
+    error_logger = setup_logger(
+        __name__ + "_errors", "errors.txt", level=logging.ERROR)
 
     def __init__(self, file_path, allowed_file_types):
         self.file_path = file_path
@@ -270,30 +272,11 @@ def open_txt(self, open_file_object, **kwargs):
     def open_pandas(self, file_path, file_type, **kwargs):
         """Opens file as a dataframe """
         open_file_object = kwargs.pop('open_file_object')
-        if file_type == "text/tab-separated-values":
-            return pd.read_csv(
-                file_path,
-                sep="\t",
-                skipinitialspace=True,
-                quoting=csv.QUOTE_NONNUMERIC,
-                **kwargs,
-            )
-        elif file_type == "text/csv":
-            return pd.read_csv(
-                file_path,
-                sep=",",
-                quotechar='"',
-                quoting=csv.QUOTE_NONNUMERIC,
-                skipinitialspace=True,
-                escapechar='\\',
-                **kwargs,
-            )
-        elif file_type == 'text/plain':
+        if file_type in self.allowed_file_types:
             csv_dialect = csv.Sniffer().sniff(open_file_object.readline())
             csv_dialect.skipinitialspace = True
             open_file_object.seek(0)
             return pd.read_csv(file_path, dialect=csv_dialect, **kwargs)
-
         else:
             raise ValueError("File must be tab or comma delimited")
 
@@ -310,7 +293,8 @@ def open_csv(self, opened_file_object, **kwargs):
 
     def open_tsv(self, opened_file_object, **kwargs):
         """Opens tsv file"""
-        csv.register_dialect("tsvDialect", delimiter="\t", skipinitialspace=True)
+        csv.register_dialect("tsvDialect", delimiter="\t",
+                             skipinitialspace=True)
         return csv.reader(opened_file_object, dialect="tsvDialect")
 
     def extract_csv_or_tsv(self, file):
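
open_pandas() now leans on csv.Sniffer for every allowed file type instead of three hard-coded read_csv branches. A small sketch of that pattern under the same assumptions the method makes (a seekable handle whose first line reveals the delimiter); the sample content here is made up:

import csv
import io
import pandas as pd

sample = "NAME\tcluster\nTYPE\tgroup\nCELL_0001\tA\n"
handle = io.StringIO(sample)

csv_dialect = csv.Sniffer().sniff(handle.readline())  # detects "\t" as the delimiter
csv_dialect.skipinitialspace = True
handle.seek(0)                                         # rewind so pandas re-reads the header rows

df = pd.read_csv(handle, dialect=csv_dialect, header=[0, 1])
print(df.columns.tolist())                             # [('NAME', 'TYPE'), ('cluster', 'group')]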

ingest/ingest_pipeline.py

Lines changed: 10 additions & 16 deletions
@@ -33,25 +33,23 @@
 # Ingest mtx files
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name humans --matrix-file ../tests/data/matrix.mtx --matrix-file-type mtx --gene-file ../tests/data/genes.tsv --barcode-file ../tests/data/barcodes.tsv
 """
-from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
-from contextlib import nullcontext
-
-import sys
 import json
-import os
 import logging
+import os
 import re
-
-from pymongo import MongoClient, InsertOne
-from pymongo.errors import BulkWriteError
+import sys
+from contextlib import nullcontext
+from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
 
 # import google.cloud.logging
 from bson.objectid import ObjectId
 
 # For tracing
 from opencensus.ext.stackdriver.trace_exporter import StackdriverExporter
-from opencensus.trace.tracer import Tracer
 from opencensus.trace.samplers import AlwaysOnSampler
+from opencensus.trace.tracer import Tracer
+from pymongo import InsertOne, MongoClient
+from pymongo.errors import BulkWriteError
 
 # from google.cloud.logging.resource import Resource
 
@@ -230,13 +228,8 @@ def load(
                 self.insert_many('data_arrays', documents)
         except Exception as e:
             self.error_logger.error(e, extra=self.extra_log_params)
-            if hasattr(e, 'details') and e.details is not None:
+            if e.details is not None:
                 self.error_logger.error(e.details, extra=self.extra_log_params)
-            else:
-                self.error_logger.error(
-                    'Error loading data to MongoDB (no details available) - check MongoDB access',
-                    extra=self.extra_log_params,
-                )
             return 1
         return 0
 
@@ -286,7 +279,8 @@ def load_subsample(
             for model in set_data_array_fn(
                 (
                     key_value[0],  # NAMES, x, y, or z
-                    parent_data['name'],  # Cluster name provided from parent
+                    # Cluster name provided from parent
+                    parent_data['name'],
                     key_value[1],  # Subsampled data/values
                     ObjectId(self.study_file_id),
                     ObjectId(self.study_id),
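
The simplified error path above assumes the caught exception exposes .details, which holds for pymongo's bulk-write failures. A hedged sketch of where that attribute comes from (connection string and documents are illustrative only):

from pymongo import InsertOne, MongoClient
from pymongo.errors import BulkWriteError

client = MongoClient("mongodb://localhost:27017")
collection = client["test_db"]["data_arrays"]

# A duplicate _id makes the second insert fail and raises BulkWriteError.
requests = [InsertOne({"_id": 1}), InsertOne({"_id": 1})]
try:
    collection.bulk_write(requests, ordered=False)
except BulkWriteError as e:
    print(e.details["writeErrors"][0]["errmsg"])  # per-write error details, as logged by load()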

tests/data/test_1k_cluster_data.csv

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

tests/test_annotations.py

Lines changed: 7 additions & 3 deletions
@@ -4,6 +4,7 @@
 - Group type annotations that have numeric-like values are being treated as strings
 - Numeric columns are rounded to 3 decimals points
 - Filtering cell names (given from cluster file) in metadata correctly
+- Labels are treated as strings
 
 PREREQUISITES
 Spin up Python 3.6 virtualenv, install Python dependencies in requirements.txt
@@ -12,18 +13,18 @@
 differences in how the reference issues are serialized
 
 # Run all tests in a manner that shows report_issues output
-python3 test_ingest_files.py -s
+python3 test_annotations.py -s
 """
 
 
+import random
 import sys
 import unittest
 from decimal import Decimal
+
 import numpy as np
-import random
 
 sys.path.append("../ingest")
-
 from annotations import Annotations
 
 
@@ -54,6 +55,9 @@ def test_round(self):
     def test_group_annotations(self):
         self.df.preprocess()
         for column in self.df.file.columns:
+            # Ensure labels are strings
+            header = column[0]
+            assert isinstance(header, str)
             annot_type = column[1]
             if annot_type == 'group':
                 assert (
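
test_group_annotations now checks both halves of preprocess(): headers are strings, and group-typed values stay strings. A hedged sketch (frame invented for illustration) of the xs()-based selection the production code uses to find and stringify group columns:

import pandas as pd

columns = pd.MultiIndex.from_tuples(
    [("NAME", "TYPE"), ("donor_id", "group"), ("score", "numeric")]
)
df = pd.DataFrame([["CELL_0001", 1, 0.5]], columns=columns)

# Select only the group-typed columns, keeping the (name, type) level intact.
group_columns = df.xs("group", axis=1, level=1, drop_level=False).columns.tolist()
print(group_columns)                        # [('donor_id', 'group')]

df[group_columns] = df[group_columns].astype(str)
print(df[("donor_id", "group")].tolist())   # ['1']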
