Skip to content

Commit 3cf2577

Browse files
authored
Merge pull request #267 from broadinstitute/development
Release 1.20.0
2 parents c81634d + f37a8bc commit 3cf2577

File tree

10 files changed

+203
-22
lines changed

10 files changed

+203
-22
lines changed

ingest/cli_parser.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,20 @@ def create_parser():
331331
"--gene-file", help="Path to .genes.tsv file"
332332
)
333333

334+
# h5ad subparsers
335+
parser_h5ad = subparsers.add_parser(
336+
"ingest_h5ad", help="Indicates that h5ad file is being ingested"
337+
)
338+
339+
parser_h5ad.add_argument(
340+
"--ingest-h5ad",
341+
required=True,
342+
action="store_true",
343+
help="Indicates that ingest of h5ad file should be invoked",
344+
)
345+
346+
parser_h5ad.add_argument("--h5ad-file", required=True, help="Path to h5ad file")
347+
334348
return parser
335349

336350

ingest/de.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ def __init__(
5151
self.kwargs = kwargs
5252
self.accession = self.kwargs["study_accession"]
5353
self.annot_scope = self.kwargs["annotation_scope"]
54-
# only used in output filename, replacing non-alphanumeric with underscores
55-
self.cluster_name = re.sub(r'\W', '_', self.kwargs["name"])
5654
self.method = self.kwargs["method"]
5755

5856
if matrix_file_type == "mtx":
@@ -183,6 +181,11 @@ def subset_adata(adata, de_cells):
183181
def execute_de(self):
184182
print(f'dev_info: Starting DE for {self.accession}')
185183
try:
184+
# only used in output filename, replacing non-alphanumeric with underscores
185+
# except '+' replaced with 'pos'
186+
self.cluster_name = DifferentialExpression.sanitize_strings(
187+
self.kwargs["name"]
188+
)
186189
if self.matrix_file_type == "mtx":
187190
DifferentialExpression.de_logger.info("preparing DE on sparse matrix")
188191
self.run_scanpy_de(
@@ -389,13 +392,12 @@ def run_scanpy_de(
389392
DifferentialExpression.de_logger.info("Gathering DE annotation labels")
390393
groups = np.unique(adata.obs[annotation]).tolist()
391394
for group in groups:
392-
clean_group = re.sub(r'\W', '_', group)
393-
clean_annotation = re.sub(r'\W', '_', annotation)
395+
clean_group = DifferentialExpression.sanitize_strings(group)
396+
clean_annotation = DifferentialExpression.sanitize_strings(annotation)
394397
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
395398
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
396399
if DifferentialExpression.delimiter_in_gene_name(rank):
397400
DifferentialExpression.extract_gene_id_for_out_file(rank)
398-
399401
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
400402
# Round numbers to 4 significant digits while respecting fixed point
401403
# and scientific notation (note: trailing zeros are removed)
@@ -408,10 +410,23 @@ def run_scanpy_de(
408410

409411
DifferentialExpression.de_logger.info("DE processing complete")
410412

413+
@staticmethod
414+
def sanitize_strings(input_string):
    """
    Make a string safe for use in DE output filenames.

    Replace '+' with 'pos', then replace every remaining character that is
    not a letter, digit, or underscore with an underscore. Converting '+'
    first keeps labels that differ only in +/- distinct, e.g.
    "CD16+ monocyte" -> "CD16pos_monocyte" while
    "CD16- monocyte" -> "CD16__monocyte".
    """
    # Literal replacement; no regex (and no invalid '\+' string escape) needed
    plus_converted_string = input_string.replace('+', 'pos')
    # Each non-word character becomes its own underscore (runs are not collapsed)
    return re.sub(r'\W', '_', plus_converted_string)
421+
411422
@staticmethod
412423
def string_for_output_match(arguments):
413-
cleaned_cluster_name = re.sub(r'\W', '_', arguments["cluster_name"])
414-
cleaned_annotation_name = re.sub(r'\W', '_', arguments["annotation_name"])
424+
cleaned_cluster_name = DifferentialExpression.sanitize_strings(
425+
arguments["cluster_name"]
426+
)
427+
cleaned_annotation_name = DifferentialExpression.sanitize_strings(
428+
arguments["annotation_name"]
429+
)
415430
files_to_match = f"{cleaned_cluster_name}--{cleaned_annotation_name}*.tsv"
416431
return files_to_match
417432

ingest/h5ad.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
try:
2+
from ingest_files import IngestFiles
3+
from monitor import log_exception
4+
except ImportError:
5+
# Used when importing as external package, e.g. imports in single_cell_portal code
6+
from .ingest_files import IngestFiles
7+
from .monitor import log_exception
8+
9+
10+
class H5adIngestor(IngestFiles):
    """Ingestor for h5ad files; currently only checks that the file can be opened."""

    # MIME type that open_file dispatches on for h5ad files
    ALLOWED_FILE_TYPES = ['application/x-hdf5']

    def __init__(self, file_path, study_file_id, study_id, **kwargs):
        """
        Set up the underlying IngestFiles for an h5ad file.

        study_file_id, study_id and kwargs are accepted but not used here yet.
        """
        IngestFiles.__init__(
            self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
        )

    def obtain_adata(self):
        """
        Open the file, cache the result on self.adata, and return it.

        Raises ValueError (propagated from open_file) if the file cannot be opened.
        """
        # open_file returns a tuple; the first element is the opened object
        self.adata = self.open_file(self.file_path)[0]
        print(self.adata)
        IngestFiles.dev_logger.info(str(self.adata))
        return self.adata

    def validate(self):
        """
        Currently, file passes "basic validation" if file
        can be opened by scanpy

        Returns True when the file opens, False when opening raises ValueError.
        """
        try:
            # obtain_adata stores self.adata itself; do not reassign it here
            self.obtain_adata()
            return True
        except ValueError:
            return False
37+

ingest/ingest_files.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from dataclasses import dataclass
1313
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
1414
import warnings
15+
import scanpy as sc
1516

1617

1718
import pandas as pd # NOqa: F821
@@ -21,9 +22,9 @@
2122
# import google.cloud.logging
2223

2324
try:
24-
from monitor import setup_logger
25+
from monitor import setup_logger, log_exception
2526
except ImportError:
26-
from .monitor import setup_logger
27+
from .monitor import setup_logger, log_exception
2728

2829

2930
@dataclass
@@ -75,13 +76,16 @@ class IngestFiles:
7576
# General logger for class
7677
# Logger provides more details
7778
dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
79+
user_logger = setup_logger(__name__ + ".user_logger", "user_log.txt")
7880
# Filter out warnings about using end user credentials when running ingest_pipeline as dev
7981
warnings.filterwarnings(
8082
"ignore", "Your application has authenticated using end user credentials"
8183
)
8284

8385
def __init__(self, file_path, allowed_file_types):
8486
self.file_path = file_path
87+
# define filetype for h5ad file extension
88+
mimetypes.add_type('application/x-hdf5', '.h5ad')
8589
# File is remote (in GCS bucket) when running via PAPI,
8690
# and typically local when developing
8791
self.is_remote_file = IngestFiles.is_remote_file(file_path)
@@ -195,6 +199,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
195199
"text/plain": self.open_txt,
196200
"text/tab-separated-values": self.open_tsv,
197201
"dataframe": self.open_pandas,
202+
"application/x-hdf5": self.open_h5ad,
198203
}
199204

200205
if start_point != 0:
@@ -214,6 +219,11 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
214219
file_connections.get(file_type)(open_file, file_type, **kwargs),
215220
open_file,
216221
)
222+
elif file_type == "application/x-hdf5":
223+
return (
224+
file_connections.get(file_type)(file_path, **kwargs),
225+
open_file,
226+
)
217227
else:
218228
return (
219229
file_connections.get(file_type)(open_file, **kwargs),
@@ -227,9 +237,12 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
227237
open_file,
228238
)
229239
else:
230-
raise ValueError(
231-
f"Unsupported file format. Allowed file types are: {' '.join(self.allowed_file_types)}"
240+
msg = (
241+
f"Unsupported file format. Allowed file MIME types are: "
242+
f"{' '.join(self.allowed_file_types)}"
232243
)
244+
log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
245+
raise ValueError(msg)
233246

234247
# Inherited function
235248
def extract(self):
@@ -298,6 +311,15 @@ def open_pandas(self, file_path, file_type, **kwargs):
298311
else:
299312
raise ValueError("File must be tab or comma delimited")
300313

314+
def open_h5ad(self, file_path, **kwargs):
    """
    Opens file as AnnData object, using scanpy in read-only backed mode.

    kwargs are accepted but not used by this opener.
    Raises ValueError if scanpy raises OSError while reading the file.
    """
    try:
        return sc.read_h5ad(file_path, backed='r')
    except OSError as e:
        msg = f"Scanpy cannot read file, \"{file_path}\"."
        log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
        # Chain the OSError so the underlying cause stays in the traceback
        raise ValueError(msg) from e
322+
301323
def open_csv(self, opened_file_object, **kwargs):
302324
"""Opens csv file"""
303325
csv.register_dialect(

ingest/ingest_pipeline.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
# Ingest dense file
2727
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense
2828
29-
# Ingest loom file
30-
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --matrix-file ../tests/data/test_loom.loom --matrix-file-type loom --taxon-name 'Homo Sapiens' --taxon-common-name human
29+
# Ingest h5ad file
30+
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_h5ad --ingest-h5ad --h5ad-file ../tests/data/h5ad/test.h5ad
3131
3232
# Subsample cluster and metadata file
3333
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_Data.csv --name custer1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
@@ -82,6 +82,7 @@
8282
from clusters import Clusters
8383
from expression_files.mtx import MTXIngestor
8484
from expression_files.dense_ingestor import DenseIngestor
85+
from h5ad import H5adIngestor
8586
from monitor import setup_logger, log_exception
8687
from de import DifferentialExpression
8788

@@ -102,6 +103,7 @@
102103
from .clusters import Clusters
103104
from .expression_files.dense_ingestor import DenseIngestor
104105
from .expression_files.mtx import MTXIngestor
106+
from .h5ad import H5adIngestor
105107
from .cli_parser import create_parser, validate_arguments
106108
from .de import DifferentialExpression
107109

@@ -125,6 +127,7 @@ def __init__(
125127
matrix_file_type: str = None,
126128
cell_metadata_file: str = None,
127129
cluster_file: str = None,
130+
h5ad_file: str = None,
128131
subsample=False,
129132
ingest_cell_metadata=False,
130133
ingest_cluster=False,
@@ -144,6 +147,7 @@ def __init__(
144147
else:
145148
self.db = None
146149
self.cluster_file = cluster_file
150+
self.h5ad_file = h5ad_file
147151
self.kwargs = kwargs
148152
self.cell_metadata_file = cell_metadata_file
149153
self.props = {}
@@ -474,6 +478,20 @@ def subsample(self):
474478
return 1
475479
return 0
476480

481+
@custom_metric(config.get_metric_properties)
482+
def ingest_h5ad(self):
    """
    Ingests h5ad files.

    Validates the h5ad file and reports the outcome.
    Returns 0 when validation succeeds, 1 when it fails.
    """
    # Pass the IDs by keyword: H5adIngestor is declared as
    # (file_path, study_file_id, study_id), so passing
    # (study_id, study_file_id) positionally would swap them
    self.h5ad = H5adIngestor(
        self.h5ad_file,
        study_file_id=self.study_file_id,
        study_id=self.study_id,
        **self.kwargs,
    )
    if self.h5ad.validate():
        self.report_validation("success")
        return 0
    # validation failed, e.g. scanpy was unable to open the h5ad file
    self.report_validation("failure")
    return 1
494+
477495
def calculate_de(self):
478496
""" Run differential expression analysis """
479497
try:
@@ -523,6 +541,11 @@ def run_ingest(ingest, arguments, parsed_args):
523541
config.set_parent_event_name("ingest-pipeline:subsample:ingest")
524542
status_subsample = ingest.subsample()
525543
status.append(status_subsample)
544+
elif "ingest_h5ad" in arguments:
545+
if arguments["ingest_h5ad"]:
546+
config.set_parent_event_name("ingest-pipeline:h5ad:ingest")
547+
status_h5ad = ingest.ingest_h5ad()
548+
status.append(status_h5ad)
526549
elif "differential_expression" in arguments:
527550
config.set_parent_event_name("ingest-pipeline:differential-expression")
528551
status_de = ingest.calculate_de()

tests/data/h5ad/bad.h5

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
non-empty

tests/data/h5ad/bad.h5ad

235 KB
Binary file not shown.

tests/data/h5ad/test.h5ad

238 KB
Binary file not shown.

tests/test_de.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ def find_expected_files(labels, cluster_name, annotation, scope, method):
3535
""" Check that files were created for all expected annotation labels
3636
"""
3737
found = []
38-
sanitized_cluster_name = re.sub(r'\W', '_', cluster_name)
39-
sanitized_annotation = re.sub(r'\W', '_', annotation)
38+
sanitized_cluster_name = DifferentialExpression.sanitize_strings(cluster_name)
39+
sanitized_annotation = DifferentialExpression.sanitize_strings(annotation)
4040
for label in labels:
41-
sanitized_label = re.sub(r'\W', '_', label)
41+
sanitized_label = DifferentialExpression.sanitize_strings(label)
4242
expected_file = f"{sanitized_cluster_name}--{sanitized_annotation}--{sanitized_label}--{scope}--{method}.tsv"
4343
assert os.path.exists(expected_file)
4444
found.append(expected_file)
@@ -185,16 +185,26 @@ def test_delimiter_in_gene_name(self):
185185
def test_filename_sanitation(self):
186186
""" Bugfix (SCP-4459) so sanitization does not collapse adjacent non-alphanumeric characters to
187187
single underscores, see also SCP-4455 for manual fix
188+
189+
Bugfix (SCP-4533) convert '+' to 'pos' so labels differing in only +/-
190+
do not clobber and cause display of incorrect results for one of the labels.
188191
"""
192+
test_string = "foo++)"
193+
plus_converted_result = DifferentialExpression.sanitize_strings(test_string)
194+
self.assertEqual(
195+
plus_converted_result,
196+
"foopospos_",
197+
"unexpected result from sanitation sanitize_strings function",
198+
)
199+
189200
arguments = {
190-
"cluster_name": "UMAP, pre-QC all cells (complexity greater than or equal to 1000)",
201+
"cluster_name": "UMAP+, pre-QC all cells (complexity greater than or equal to 1000)",
191202
"annotation_name": "cell..type",
192203
}
193204
files_to_match = DifferentialExpression.string_for_output_match(arguments)
194-
print(files_to_match)
195205
self.assertEqual(
196206
files_to_match,
197-
"UMAP__pre_QC_all_cells__complexity_greater_than_or_equal_to_1000_--cell__type*.tsv",
207+
"UMAPpos__pre_QC_all_cells__complexity_greater_than_or_equal_to_1000_--cell__type*.tsv",
198208
"unexpected result from sanitation function",
199209
)
200210

@@ -472,9 +482,7 @@ def test_de_process_sanitize(self):
472482
f"expected five annotation labels for {test_annotation}",
473483
)
474484

475-
expected_file = (
476-
"UMAP__pre_QC--misc__cellaneous--cholinergic__neuron_--study--wilcoxon.tsv"
477-
)
485+
expected_file = "UMAP__pre_QC--miscposposcellaneous--cholinergic__neuron_--study--wilcoxon.tsv"
478486

479487
# confirm expected results filename was generated in found result files
480488
self.assertIn(

0 commit comments

Comments
 (0)