Skip to content

Commit a412c3b

Browse files
authored
Merge pull request #389 from broadinstitute/development
Release 1.41.0
2 parents b8bcf46 + 9c81680 commit a412c3b

19 files changed

+363
-89
lines changed

ingest/anndata_.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
3636
IngestFiles.__init__(
3737
self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
3838
)
39+
self.kwargs = kwargs
3940

4041
def obtain_adata(self):
4142
try:
@@ -58,6 +59,36 @@ def basic_validation(self):
5859
except ValueError:
5960
return False
6061

62+
def validate_raw_location(self):
    """Confirm the AnnData file has data at the requested raw_location.

    ``raw_location`` is read from ``self.kwargs``. The value ``".raw"``
    refers to the ``adata.raw`` slot; any other value is treated as a key
    into ``adata.layers``.

    Returns:
        bool: True when data is present at ``raw_location``. False when
        ``raw_location`` was not supplied or nothing is stored there; in
        both failure cases the reason is reported through ``log_exception``
        before returning.
    """
    adata = self.obtain_adata()
    raw_location = self.kwargs.get("raw_location")

    # Each branch either succeeds immediately or picks the failure message;
    # logging happens once below instead of raising and re-catching
    # ValueError purely for control flow.
    if raw_location is None:
        msg = 'Must specify location of raw counts in AnnData object'
    elif raw_location == ".raw":
        if adata.raw is not None:
            return True
        msg = 'No data found in .raw slot'
    else:
        if raw_location in adata.layers:
            return True
        msg = f'No data found at adata.layers["{raw_location}"]'

    log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
    return False
91+
6192
def create_cell_data_arrays(self):
6293
"""Extract cell name DataArray documents for raw data"""
6394
adata = self.obtain_adata()

ingest/cli_parser.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
"""Helper functions for ingest_pipeline.py
2-
"""
1+
"""Helper functions for ingest_pipeline.py"""
32

43
import argparse
54
import ast
@@ -451,6 +450,12 @@ def create_parser():
451450
help="Array of obsm key(s) to extract as cluster files",
452451
)
453452

453+
parser_anndata.add_argument(
454+
"--raw-location",
455+
help="location of raw counts. '.raw' for raw slot, "
456+
"else adata.layers key value or None if no raw counts",
457+
)
458+
454459
parser_anndata.add_argument(
455460
"--extract",
456461
type=ast.literal_eval,

ingest/de.py

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ def write_de_result(adata, group, annotation, rank_key, cluster_name, extra_para
367367
clean_group = DifferentialExpression.sanitize_string(group)
368368
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
369369
DifferentialExpression.de_logger.info(
370-
f"Writing DE output for {clean_group} vs rest"
370+
f"Writing DE output for {clean_group} vs rest"
371371
)
372372
elif de_type == "pairwise":
373373
# rank_genes_groups accepts a list. For SCP pairwise, should be a list with one item
@@ -403,6 +403,7 @@ def run_scanpy_de(
403403
):
404404
method = extra_params.get("method")
405405
de_type = extra_params.get("de_type")
406+
raw_location = extra_params.get("raw_location")
406407

407408
try:
408409
DifferentialExpression.assess_annotation(annotation, metadata, extra_params)
@@ -432,24 +433,50 @@ def run_scanpy_de(
432433
)
433434

434435
if matrix_file_type == "h5ad":
435-
if orig_adata.raw is not None:
436-
adata = AnnData(
437-
# using .copy() for the AnnData components is good practice
438-
# but we won't be using orig_adata for analyses
439-
# choosing to avoid .copy() for memory efficiency
440-
X=orig_adata.raw.X,
441-
obs=orig_adata.obs,
442-
var=orig_adata.var,
443-
)
436+
if raw_location == ".raw":
437+
if orig_adata.raw is not None:
438+
DifferentialExpression.de_logger.info(
439+
f"Performing DE on {raw_location} data"
440+
)
441+
adata = AnnData(
442+
# using .copy() for the AnnData components is good practice
443+
# but we won't be using orig_adata for analyses
444+
# choosing to avoid .copy() for memory efficiency
445+
X=orig_adata.raw.X,
446+
obs=orig_adata.obs,
447+
var=orig_adata.var,
448+
)
449+
else:
450+
msg = f'{matrix_file_path} does not have a .raw attribute'
451+
print(msg)
452+
log_exception(
453+
DifferentialExpression.dev_logger,
454+
DifferentialExpression.de_logger,
455+
msg,
456+
)
457+
raise ValueError(msg)
444458
else:
445-
msg = f'{matrix_file_path} does not have a .raw attribute'
446-
print(msg)
447-
log_exception(
448-
DifferentialExpression.dev_logger,
449-
DifferentialExpression.de_logger,
450-
msg,
451-
)
452-
raise ValueError(msg)
459+
if raw_location in orig_adata.layers.keys():
460+
DifferentialExpression.de_logger.info(
461+
f"Performing DE on adata.layers['{raw_location}'] data"
462+
)
463+
adata = AnnData(
464+
# using .copy() for the AnnData components is good practice
465+
# but we won't be using orig_adata for analyses
466+
# choosing to avoid .copy() for memory efficiency
467+
X=orig_adata.layers[raw_location],
468+
obs=orig_adata.obs,
469+
var=orig_adata.var,
470+
)
471+
else:
472+
msg = f'{matrix_file_path} does not have adata.layers["{raw_location}"]'
473+
print(msg)
474+
log_exception(
475+
DifferentialExpression.dev_logger,
476+
DifferentialExpression.de_logger,
477+
msg,
478+
)
479+
raise ValueError(msg)
453480
# AnnData expects gene x cell so dense and mtx matrices require transposition
454481
else:
455482
adata = adata.transpose()

ingest/ingest_pipeline.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['processed_expression']"
5656
5757
# Ingest AnnData - happy path raw count cell name only extraction
58-
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']"
58+
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['raw_counts']" --raw-location ".raw"
5959
6060
# Ingest AnnData - happy path cluster and metadata extraction
6161
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --extract "['cluster', 'metadata']" --obsm-keys "['X_umap','X_tsne']"
@@ -66,17 +66,20 @@
6666
# Differential expression analysis (sparse matrix)
6767
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
6868
69-
# Differential expression analysis (h5ad matrix)
70-
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression
69+
# Differential expression analysis (h5ad matrix, raw count in raw slot)
70+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type rest --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
71+
72+
# Differential expression analysis (h5ad matrix, raw count in adata.layers['counts'])
73+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location 'counts' --annotation-name cell_type__ontology_label --de-type rest --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver_layers_counts.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
7174
7275
# Pairwise differential expression analysis (dense matrix)
7376
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['cholinergic neuron']" --group2 "cranial somatomotor neuron" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type dense --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression
7477
7578
# Pairwise differential expression analysis (sparse matrix)
7679
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['endothelial cell']" --group2 "smooth muscle cell" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
7780
78-
# Pairwise differential expression analysis (h5ad matrix)
79-
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['mature B cell']" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
81+
# Pairwise differential expression analysis (h5ad matrix, raw count in raw slot)
82+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type pairwise --group1 "mature B cell" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression
8083
8184
"""
8285

@@ -559,7 +562,11 @@ def extract_from_anndata(self):
559562
if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get(
560563
'extract'
561564
):
562-
self.anndata.ingest_raw_cells()
565+
if self.anndata.validate_raw_location():
566+
self.anndata.ingest_raw_cells()
567+
else:
568+
self.report_validation("failure")
569+
return 1
563570
self.report_validation("success")
564571
return 0
565572
# scanpy unable to open AnnData file
270 Bytes
Binary file not shown.
621 Bytes
Binary file not shown.
3.88 KB
Binary file not shown.
3.45 KB
Binary file not shown.
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
1738072997 # validation cache key
2-
1+
1742404288 # validation cache key
4.97 MB
Binary file not shown.

0 commit comments

Comments
 (0)