Skip to content

Commit 913bf2b

Browse files
authored
Merge pull request #266 from broadinstitute/jlc_add_h5_extensions
Accept additional file extensions for AnnData ingest (SCP-4557)
2 parents f37a8bc + 4d0d9b1 commit 913bf2b

File tree

11 files changed

+85
-60
lines changed

11 files changed

+85
-60
lines changed

ingest/h5ad.py renamed to ingest/anndata_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .monitor import log_exception
88

99

10-
class H5adIngestor(IngestFiles):
10+
class AnnDataIngestor(IngestFiles):
1111
ALLOWED_FILE_TYPES = ['application/x-hdf5']
1212

1313
def __init__(self, file_path, study_file_id, study_id, **kwargs):

ingest/cli_parser.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -331,19 +331,21 @@ def create_parser():
331331
"--gene-file", help="Path to .genes.tsv file"
332332
)
333333

334-
# h5ad subparsers
335-
parser_h5ad = subparsers.add_parser(
336-
"ingest_h5ad", help="Indicates that h5ad file is being ingested"
334+
# AnnData subparsers
335+
parser_anndata = subparsers.add_parser(
336+
"ingest_anndata", help="Indicates that AnnData file is being ingested"
337337
)
338338

339-
parser_h5ad.add_argument(
340-
"--ingest-h5ad",
339+
parser_anndata.add_argument(
340+
"--ingest-anndata",
341341
required=True,
342342
action="store_true",
343-
help="Indicates that ingest of h5ad file should be invoked",
343+
help="Indicates that ingest of AnnData file should be invoked",
344344
)
345345

346-
parser_h5ad.add_argument("--h5ad-file", required=True, help="Path to h5ad file")
346+
parser_anndata.add_argument(
347+
"--anndata-file", required=True, help="Path to AnnData file"
348+
)
347349

348350
return parser
349351

ingest/ingest_files.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,21 @@ class IngestFiles:
8282
"ignore", "Your application has authenticated using end user credentials"
8383
)
8484

85+
ALLOWED_FILE_EXTENSIONS = {
86+
"text/csv": [".csv"],
87+
"text/plain": [".txt"],
88+
"text/tab-separated-values": [".tsv"],
89+
"dataframe": [".tsv"],
90+
"application/x-hdf5": [".h5ad", ".h5", ".hdf5"],
91+
}
92+
8593
def __init__(self, file_path, allowed_file_types):
8694
self.file_path = file_path
87-
# define filetype for h5ad file extension
95+
# valid suffixes for AnnData ingest (expecting .h5ad)
96+
# including hdf5 file extensions - AnnData files should be valid hdf5
8897
mimetypes.add_type('application/x-hdf5', '.h5ad')
98+
mimetypes.add_type('application/x-hdf5', '.h5')
99+
mimetypes.add_type('application/x-hdf5', '.hdf5')
89100
# File is remote (in GCS bucket) when running via PAPI,
90101
# and typically local when developing
91102
self.is_remote_file = IngestFiles.is_remote_file(file_path)
@@ -199,7 +210,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
199210
"text/plain": self.open_txt,
200211
"text/tab-separated-values": self.open_tsv,
201212
"dataframe": self.open_pandas,
202-
"application/x-hdf5": self.open_h5ad,
213+
"application/x-hdf5": self.open_anndata,
203214
}
204215

205216
if start_point != 0:
@@ -237,10 +248,19 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
237248
open_file,
238249
)
239250
else:
240-
msg = (
241-
f"Unsupported file format. Allowed file MIME types are: "
242-
f"{' '.join(self.allowed_file_types)}"
243-
)
251+
expected_suffixes = []
252+
for t in self.allowed_file_types:
253+
expected_suffixes.extend(self.ALLOWED_FILE_EXTENSIONS[t])
254+
if file_type == None:
255+
msg = (
256+
f"File type not detected for {file_path}, expected file endings are: "
257+
f"{' '.join(expected_suffixes)}"
258+
)
259+
else:
260+
msg = (
261+
f"Unsupported file format {file_path}. Expected file suffix are: "
262+
f"{' '.join(expected_suffixes)}"
263+
)
244264
log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
245265
raise ValueError(msg)
246266

@@ -311,7 +331,7 @@ def open_pandas(self, file_path, file_type, **kwargs):
311331
else:
312332
raise ValueError("File must be tab or comma delimited")
313333

314-
def open_h5ad(self, file_path, **kwargs):
334+
def open_anndata(self, file_path, **kwargs):
315335
"""Opens file as AnnData object """
316336
try:
317337
return sc.read_h5ad(file_path, backed='r')

ingest/ingest_pipeline.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
# Ingest dense file
2727
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense
2828
29-
# Ingest h5ad file
30-
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_h5ad --h5ad-file ../tests/data/test.h5ad
29+
# Ingest AnnData file
30+
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --anndata-file ../tests/data/anndata/test.h5ad
3131
3232
# Subsample cluster and metadata file
3333
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_Data.csv --name custer1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
@@ -51,12 +51,8 @@
5151
from contextlib import nullcontext
5252
from typing import Dict, Generator, List, Tuple, Union
5353
from wsgiref.simple_server import WSGIRequestHandler # noqa: F401
54-
55-
5654
from bson.objectid import ObjectId
5755

58-
59-
# from google.cloud.logging.resource import Resource
6056
try:
6157
# Used when importing internally and in tests
6258
from ingest_files import IngestFiles
@@ -82,10 +78,13 @@
8278
from clusters import Clusters
8379
from expression_files.mtx import MTXIngestor
8480
from expression_files.dense_ingestor import DenseIngestor
85-
from h5ad import H5adIngestor
8681
from monitor import setup_logger, log_exception
8782
from de import DifferentialExpression
8883

84+
# scanpy uses anndata python package, disambiguate local anndata
85+
# using underscore https://peps.python.org/pep-0008/#naming-conventions
86+
from anndata_ import AnnDataIngestor
87+
8988
except ImportError:
9089
# Used when importing as external package, e.g. imports in single_cell_portal code
9190
from .ingest_files import IngestFiles
@@ -103,7 +102,7 @@
103102
from .clusters import Clusters
104103
from .expression_files.dense_ingestor import DenseIngestor
105104
from .expression_files.mtx import MTXIngestor
106-
from .h5ad import H5adIngestor
105+
from .anndata import AnnDataIngestor
107106
from .cli_parser import create_parser, validate_arguments
108107
from .de import DifferentialExpression
109108

@@ -127,7 +126,7 @@ def __init__(
127126
matrix_file_type: str = None,
128127
cell_metadata_file: str = None,
129128
cluster_file: str = None,
130-
h5ad_file: str = None,
129+
anndata_file: str = None,
131130
subsample=False,
132131
ingest_cell_metadata=False,
133132
ingest_cluster=False,
@@ -147,7 +146,7 @@ def __init__(
147146
else:
148147
self.db = None
149148
self.cluster_file = cluster_file
150-
self.h5ad_file = h5ad_file
149+
self.anndata_file = anndata_file
151150
self.kwargs = kwargs
152151
self.cell_metadata_file = cell_metadata_file
153152
self.props = {}
@@ -479,15 +478,15 @@ def subsample(self):
479478
return 0
480479

481480
@custom_metric(config.get_metric_properties)
482-
def ingest_h5ad(self):
483-
"""Ingests h5ad files."""
484-
self.h5ad = H5adIngestor(
485-
self.h5ad_file, self.study_id, self.study_file_id, **self.kwargs
481+
def ingest_anndata(self):
482+
"""Ingests anndata files."""
483+
self.anndata = AnnDataIngestor(
484+
self.anndata_file, self.study_id, self.study_file_id, **self.kwargs
486485
)
487-
if self.h5ad.validate():
486+
if self.anndata.validate():
488487
self.report_validation("success")
489488
return 0
490-
# scanpy unable to open h5ad file
489+
# scanpy unable to open AnnData file
491490
else:
492491
self.report_validation("failure")
493492
return 1
@@ -541,11 +540,11 @@ def run_ingest(ingest, arguments, parsed_args):
541540
config.set_parent_event_name("ingest-pipeline:subsample:ingest")
542541
status_subsample = ingest.subsample()
543542
status.append(status_subsample)
544-
elif "ingest_h5ad" in arguments:
545-
if arguments["ingest_h5ad"]:
546-
config.set_parent_event_name("ingest-pipeline:h5ad:ingest")
547-
status_h5ad = ingest.ingest_h5ad()
548-
status.append(status_h5ad)
543+
elif "ingest_anndata" in arguments:
544+
if arguments["ingest_anndata"]:
545+
config.set_parent_event_name("ingest-pipeline:anndata:ingest")
546+
status_anndata = ingest.ingest_anndata()
547+
status.append(status_anndata)
549548
elif "differential_expression" in arguments:
550549
config.set_parent_event_name("ingest-pipeline:differential-expression")
551550
status_de = ingest.calculate_de()

ingest/monitor.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,22 @@ def setup_logger(logger_name, log_file, level=logging.DEBUG, format="default"):
4444
return logger
4545

4646

47+
def bypass_mongo_writes():
    """Check if developer has set environment variable to bypass writing data to MongoDB.

    The opt-in is the environment variable BYPASS_MONGO_WRITES='yes'.

    Returns:
        bool: True only when BYPASS_MONGO_WRITES is set to exactly "yes";
        False when the variable is unset or holds any other value.
    """
    # Single lookup: .get() yields None when the variable is unset, and None
    # never compares equal to "yes", so no separate presence check is needed.
    return os.environ.get("BYPASS_MONGO_WRITES") == "yes"
56+
57+
4758
def log_exception(dev_logger, user_logger, exception):
    """Report an exception to both the user-facing and developer loggers.

    Args:
        dev_logger: logger whose ``exception`` method receives the exception.
        user_logger: logger whose ``critical`` method receives the message text.
        exception: the exception (or message) being reported.

    When bypass_mongo_writes() returns True, the message text is also
    printed to stdout.
    """
    message = str(exception)
    user_logger.critical(message)
    dev_logger.exception(exception)
    if not bypass_mongo_writes():
        return
    # Developer opted out of MongoDB writes: echo the error to the console too.
    print(message)
5063

5164

5265
# Modified from https://jdkandersson.com/2019/05/19/testing-decorated-python-functions/
File renamed without changes.

tests/data/anndata/bad.h5

235 KB
Binary file not shown.
File renamed without changes.

tests/data/h5ad/bad.h5

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/test_h5ad.py renamed to tests/test_anndata.py

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,28 @@
1-
""" test_de.py
2-
integration test to verify that de process generates expected output
1+
""" test_anndata.py
2+
verify basic AnnData validation works as expected
33
"""
44

55
import unittest
66
import sys
7-
import hashlib
8-
import os
9-
import glob
10-
from unittest.mock import patch
11-
import scanpy as sc
12-
137

148
sys.path.append("../ingest")
15-
from ingest_files import IngestFiles
16-
from h5ad import H5adIngestor
9+
from anndata_ import AnnDataIngestor
1710

1811

19-
class TestH5adIngestor(unittest.TestCase):
20-
def test_minimal_valid_h5ad(self):
21-
good_input = H5adIngestor(
22-
"../tests/data/h5ad/test.h5ad",
12+
class TestAnnDataIngestor(unittest.TestCase):
13+
def test_minimal_valid_anndata(self):
14+
good_input = AnnDataIngestor(
15+
"../tests/data/anndata/test.h5ad",
2316
"addedfeed000000000000000",
2417
"dec0dedfeed0000000000000",
2518
)
2619
self.assertTrue(
2720
good_input.validate(), "expect known good file to open with scanpy"
2821
)
2922

30-
def test_truncated_h5ad(self):
31-
truncated_input = H5adIngestor(
32-
"../tests/data/h5ad/bad.h5ad",
23+
def test_truncated_anndata(self):
24+
truncated_input = AnnDataIngestor(
25+
"../tests/data/anndata/bad.h5",
3326
"addedfeed000000000000000",
3427
"dec0dedfeed0000000000000",
3528
)
@@ -38,14 +31,14 @@ def test_truncated_h5ad(self):
3831
# an exception before assertRaises gets called
3932
self.assertRaisesRegex(
4033
ValueError,
41-
"Scanpy cannot read file, \"../tests/data/h5ad/bad.h5ad\".",
34+
"Scanpy cannot read file, \"../tests/data/anndata/bad.h5\".",
4235
lambda: truncated_input.obtain_adata(),
4336
)
4437
self.assertFalse(truncated_input.validate())
4538

46-
def test_input_not_h5ad(self):
47-
bad_input = H5adIngestor(
48-
"../tests/data/h5ad/bad.h5",
39+
def test_input_bad_suffix(self):
40+
bad_input = AnnDataIngestor(
41+
"../tests/data/anndata/bad.foo",
4942
"addedfeed000000000000000",
5043
"dec0dedfeed0000000000000",
5144
)
@@ -54,7 +47,7 @@ def test_input_not_h5ad(self):
5447
# an exception before assertRaises gets called
5548
self.assertRaisesRegex(
5649
ValueError,
57-
"Unsupported file format. Allowed file MIME types are: application/x-hdf5",
50+
"File type not detected for ../tests/data/anndata/bad.foo, expected file endings are: .h5ad .h5 .hdf5",
5851
lambda: bad_input.obtain_adata(),
5952
)
6053
self.assertFalse(bad_input.validate())

0 commit comments

Comments
 (0)