Skip to content

Commit a5ded63

Browse files
Merge pull request #92 from broadinstitute/ea-validate-annotation-files
Ea validate annotation files
2 parents f6bf48d + 0f2b689 commit a5ded63

14 files changed

+186
-50
lines changed

ingest/annotations.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,9 @@ def validate_against_header_count(self):
240240
return valid
241241

242242
def validate_format(self):
243-
"""Check all metadata file format criteria for file validity
243+
"""Check common format criteria for annotation files
244244
"""
245-
self.is_valid_file = all(
245+
return all(
246246
[
247247
self.validate_header_keyword(),
248248
self.validate_type_keyword(),
@@ -251,4 +251,3 @@ def validate_format(self):
251251
self.validate_against_header_count(),
252252
]
253253
)
254-
return self.is_valid_file

ingest/cell_metadata.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
PREREQUISITES
88
Must have python 3.6 or higher.
99
"""
10+
import collections
11+
import ntpath
1012
from collections import defaultdict
11-
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
1213
from dataclasses import dataclass
13-
from mypy_extensions import TypedDict
14+
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
15+
1416
from bson.objectid import ObjectId
15-
import ntpath
16-
import collections
17+
from mypy_extensions import TypedDict
1718

1819
try:
1920
# Used when importing internally and in tests
@@ -35,7 +36,7 @@ def __init__(
3536
study_id: ObjectId,
3637
study_file_id: ObjectId,
3738
*args,
38-
**kwargs
39+
**kwargs,
3940
):
4041

4142
self.study_accession = kwargs.pop("study_accession")
@@ -60,6 +61,33 @@ class Model(TypedDict):
6061
# unique values from "group" type annotations
6162
values: List
6263

64+
# Will evolve to do cross-file validation
65+
def validate(self):
    """Run every validation check for this file.

    Only the format check is run at present.

    :return: boolean True if every check passed, otherwise False
    """
    check_results = [self.is_valid_format()]
    return all(check_results)
69+
70+
def is_valid_format(self):
    """Validate the file format by running all format validation methods.

    Both checks are always executed, header coordinate check first, so the
    second one runs even when the first has already failed.

    :return: boolean True if both checks passed, otherwise False
    """
    coordinates_ok = self.validate_header_for_coordinate_values()
    format_ok = self.validate_format()
    return all((coordinates_ok, format_ok))
75+
76+
def validate_header_for_coordinate_values(self):
    """Check that no header is named like a coordinate.

    Cell metadata files should not have coordinates in the header, so any
    header equal to 'x', 'y' or 'z' (compared case-insensitively) makes the
    file invalid. On failure a format error is recorded through
    ``store_validation_issue``.

    :return: boolean True if coordinates are not in header, otherwise False
    """
    coordinate_names = {'x', 'y', 'z'}
    # A generator lets any() stop at the first offending header instead of
    # building a throwaway list of booleans first.
    if not any(header.lower() in coordinate_names for header in self.headers):
        return True
    msg = 'Header names can not be coordinate values x, y, or z (case insensitive)'
    self.store_validation_issue('error', 'format', msg)
    return False
90+
6391
def transform(self):
6492
""" Builds cell metadata model"""
6593
AnnotationModel = collections.namedtuple(

ingest/clusters.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
2-
from dataclasses import dataclass
3-
from mypy_extensions import TypedDict
41
import logging
2+
from dataclasses import dataclass
3+
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
4+
55
from bson.objectid import ObjectId
6+
from mypy_extensions import TypedDict
67

78
try:
89
from ingest_files import DataArray
@@ -75,6 +76,32 @@ def __init__(
7576
self.extra_log_params = {'study_id': self.study_id, 'duration': None}
7677
self.preprocess()
7778

79+
# Will evolve to do cross-file validation
80+
def validate(self):
    """Run every validation check for this file.

    Only the format check is run at present.

    :return: boolean True if every check passed, otherwise False
    """
    check_results = [self.is_valid_format()]
    return all(check_results)
84+
85+
def is_valid_format(self):
    """Validate the file format by running all format validation methods.

    Both checks are always executed, header coordinate check first, so the
    second one runs even when the first has already failed.

    :return: boolean True if both checks passed, otherwise False
    """
    coordinates_ok = self.validate_header_for_coordinate_values()
    format_ok = self.validate_format()
    return all((coordinates_ok, format_ok))
90+
91+
def validate_header_for_coordinate_values(self):
    """Check that the header contains both required coordinate columns.

    Cluster files must have coordinates 'x' and 'y' in the header; names
    are compared case-insensitively. If either is missing, a single format
    error is recorded through ``store_validation_issue``.

    :return: boolean True if coordinates are in header, otherwise False
    """
    headers_seen = {header.lower() for header in self.headers}
    if {'x', 'y'} <= headers_seen:
        return True
    msg = "Header must have coordinate values 'x' and 'y' (case insensitive)"
    self.store_validation_issue('error', 'format', msg)
    return False
104+
78105
def transform(self):
79106
""" Builds cluster data model"""
80107
# Array of Hash objects that describe all extra "annotation" columns

ingest/dense.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import collections
1212
from typing import List # noqa: F401
13+
1314
from bson.objectid import ObjectId
1415

1516
try:

ingest/ingest_files.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,9 @@ def open_pandas(self, file_path, file_type, **kwargs):
279279
delimiter = ","
280280
else:
281281
delimiter = None
282-
dialect = csv.Sniffer().sniff(open_file_object.readline(), delimiters=delimiter)
282+
dialect = csv.Sniffer().sniff(
283+
open_file_object.readline(), delimiters=delimiter
284+
)
283285
dialect.skipinitialspace = True
284286
open_file_object.seek(0)
285287
return pd.read_csv(file_path, dialect=dialect, **kwargs)

ingest/ingest_pipeline.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ def load(
233233
return 1
234234
return 0
235235

236+
# @profile
236237
def load_expression_file(self, models, is_gene_model=False):
237238
collection_name = self.matrix.COLLECTION_NAME
238239
# Creates operations to perform for bulk write
@@ -337,9 +338,8 @@ def upload_metadata_to_bq(self):
337338
return 1
338339
return 0
339340

340-
@trace
341-
@my_debug_logger()
342-
# @profile
341+
# @trace
342+
# @my_debug_logger()
343343
def ingest_expression(self) -> int:
344344
"""Ingests expression files.
345345
"""
@@ -405,7 +405,7 @@ def ingest_expression(self) -> int:
405405
# @my_debug_logger()
406406
def ingest_cell_metadata(self):
407407
"""Ingests cell metadata files into Firestore."""
408-
if self.cell_metadata.validate_format():
408+
if self.cell_metadata.validate():
409409
self.info_logger.info(
410410
f'Cell metadata file format valid', extra=self.extra_log_params
411411
)
@@ -451,7 +451,7 @@ def ingest_cell_metadata(self):
451451
@my_debug_logger()
452452
def ingest_cluster(self):
453453
"""Ingests cluster files."""
454-
if self.cluster.validate_format():
454+
if self.cluster.validate():
455455
annotation_model = self.cluster.transform()
456456
status = self.load(
457457
self.cluster.COLLECTION_NAME,

tests/data/cluster_bad.txt

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1 @@
1-
NAME X Y
2-
CELL_0001 34.472 32.211
3-
CELL_0002 15.975 10.043
4-
CELL_0003 -11.688 -53.645
5-
CELL_0004 30.04 31.138
6-
CELL_0005 23.862 33.092
7-
CELL_0006 -39.07 -14.64
8-
CELL_0007 40.039 27.206
9-
CELL_0008 28.755 27.187
10-
CELL_0009 -48.601 -13.512
11-
CELL_00010 14.653 27.832
12-
CELL_00011 20.603 32.071
13-
CELL_00012 -10.333 -51.733
14-
CELL_00013 -52.966 -12.484
15-
CELL_00014 38.513 26.969
16-
CELL_00015 12.838 13.047
1+
NAME XCELL_0001 34.472CELL_0002 15.975CELL_0003 -11.688CELL_0004 30.04CELL_0005 23.862CELL_0006 -39.07CELL_0007 40.039CELL_0008 28.755CELL_0009 -48.601CELL_00010 14.653CELL_00011 20.603CELL_00012 -10.333CELL_00013 -52.966CELL_00014 38.513CELL_00015 12.838

tests/data/cluster_bad_missing_coordinate.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
NAME XTYPE groupCELL_0001 34.472CELL_0002 15.975CELL_0003 -11.688CELL_0004 30.04CELL_0005 23.862CELL_0006 -39.07CELL_0007 40.039CELL_0008 28.755CELL_0009 -48.601CELL_00010 14.653CELL_00011 20.603CELL_00012 -10.333CELL_00013 -52.966CELL_00014 38.513CELL_00015 12.838

tests/data/metadata_bad.txt

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1 @@
1-
NAME CLUSTER SUB-CLUSTER
2-
CELL_0001 CLST_A CLST_A_1
3-
CELL_0002 CLST_A CLST_A_1
4-
CELL_0003 CLST_A CLST_A_1
5-
CELL_0004 CLST_A CLST_A_2
6-
CELL_0005 CLST_A CLST_A_2
7-
CELL_0006 CLST_B CLST_B_1
8-
CELL_0007 CLST_B CLST_B_1
9-
CELL_0008 CLST_B CLST_B_2
10-
CELL_0009 CLST_B CLST_B_2
11-
CELL_00010 CLST_B CLST_B_2
12-
CELL_00011 CLST_C CLST_C_1
13-
CELL_00012 CLST_C CLST_C_1
14-
CELL_00013 CLST_C CLST_C_1
15-
CELL_00014 CLST_C CLST_C_2
16-
CELL_00015 CLST_C CLST_C_2
1+
NAME CLUSTER SUB-CLUSTER XTYPE GROUP GROUP NUMERICCELL_0001 CLST_A CLST_A_1 1CELL_0002 CLST_A CLST_A_1 2CELL_0003 CLST_A CLST_A_1 3CELL_0004 CLST_A CLST_A_2 4CELL_0005 CLST_A CLST_A_2 5CELL_0006 CLST_B CLST_B_1 6CELL_0007 CLST_B CLST_B_1 7CELL_0008 CLST_B CLST_B_2 8CELL_0009 CLST_B CLST_B_2 9CELL_00010 CLST_B CLST_B_2 10CELL_00011 CLST_C CLST_C_1 11CELL_00012 CLST_C CLST_C_1 12CELL_00013 CLST_C CLST_C_1 13CELL_00014 CLST_C CLST_C_2 14CELL_00015 CLST_C CLST_C_2 15

tests/data/metadata_bad_contains_coordinates.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
NAME CLUSTER SUB-CLUSTER X TYPE group group numeric CELL_0001 CLST_A CLST_A_1 1 CELL_0002 CLST_A CLST_A_1 2 CELL_0003 CLST_A CLST_A_1 3 CELL_0004 CLST_A CLST_A_2 4 CELL_0005 CLST_A CLST_A_2 5 CELL_0006 CLST_B CLST_B_1 6 CELL_0007 CLST_B CLST_B_1 7 CELL_0008 CLST_B CLST_B_2 8 CELL_0009 CLST_B CLST_B_2 9 cCELL_00010 CLST_B CLST_B_2 10 CELL_00011 CLST_C CLST_C_1 11 CELL_00012 CLST_C CLST_C_1 12 CELL_00013 CLST_C CLST_C_1 13 CELL_00014 CLST_C CLST_C_2 14 CELL_00015 CLST_C CLST_C_2 15

0 commit comments

Comments
 (0)