Skip to content

Commit 1ac85e0

Browse files
authored
Merge pull request #252 from broadinstitute/development
Release 1.17.2
2 parents f331419 + 3baaabf commit 1ac85e0

File tree

7 files changed: 331 additions and 42 deletions

ingest/cli_parser.py

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,14 @@ def validate_arguments(parsed_args):
7474
raise ValueError(
7575
f" Invalid argument: unable to connect to a BigQuery table called {parsed_args.bq_table}."
7676
)
77+
if (
78+
"differential_expression" in parsed_args
79+
and parsed_args.annotation_type != "group"
80+
):
81+
raise ValueError(
82+
"Differential expression analysis restricted to group-type annotations,"
83+
f" cannot run on data of type \"{parsed_args.annotation_type}\"."
84+
)
7785

7886

7987
def create_parser():
@@ -268,15 +276,23 @@ def create_parser():
268276
)
269277

270278
parser_differential_expression.add_argument(
271-
"--annotation", required=True, help="Name of annotation for DE analysis"
279+
"--annotation-name", required=True, help="Name of annotation for DE analysis"
280+
)
281+
282+
parser_differential_expression.add_argument(
283+
"--annotation-type", required=True, help="Type of annotation for DE analysis"
284+
)
285+
286+
parser_differential_expression.add_argument(
287+
"--annotation-scope", required=True, help="Scope of annotation for DE analysis"
272288
)
273289

274290
parser_differential_expression.add_argument(
275291
"--method", default="wilcoxon", help="method for DE"
276292
)
277293

278294
parser_differential_expression.add_argument(
279-
"--name", required=True, help="study owner-specified cluster anem"
295+
"--cluster-name", required=True, help="study owner-specified cluster name"
280296
)
281297

282298
parser_differential_expression.add_argument(
@@ -286,9 +302,9 @@ def create_parser():
286302
)
287303

288304
parser_differential_expression.add_argument(
289-
"--cell-metadata-file",
305+
"--annotation-file",
290306
required=True,
291-
help="Absolute or relative path to cell metadata file.",
307+
help="Absolute or relative path to cell metadata or cluster file of annotations.",
292308
)
293309

294310
parser_differential_expression.add_argument(

ingest/de.py

Lines changed: 27 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from email.headerregistry import Group
12
import logging
23
import numpy as np
34
import pandas as pd
@@ -36,19 +37,20 @@ def __init__(
3637
cell_metadata,
3738
matrix_file_path,
3839
matrix_file_type,
39-
annotation,
40+
annotation_name,
4041
**kwargs,
4142
):
4243
DifferentialExpression.de_logger.info(
4344
"Initializing DifferentialExpression instance"
4445
)
4546
self.cluster = cluster
4647
self.metadata = cell_metadata
47-
self.annotation = annotation
48+
self.annotation = annotation_name
4849
self.matrix_file_path = matrix_file_path
4950
self.matrix_file_type = matrix_file_type
5051
self.kwargs = kwargs
5152
self.accession = self.kwargs["study_accession"]
53+
self.annot_scope = self.kwargs["annotation_scope"]
5254
# only used in output filename, replacing non-alphanumeric with underscores
5355
self.cluster_name = re.sub(r'\W+', '_', self.kwargs["name"])
5456
self.method = self.kwargs["method"]
@@ -175,6 +177,7 @@ def subset_adata(adata, de_cells):
175177
return adata
176178

177179
def execute_de(self):
180+
print(f'dev_info: Starting DE for {self.accession}')
178181
try:
179182
if self.matrix_file_type == "mtx":
180183
DifferentialExpression.de_logger.info("preparing DE on sparse matrix")
@@ -184,6 +187,7 @@ def execute_de(self):
184187
self.matrix_file_path,
185188
self.matrix_file_type,
186189
self.annotation,
190+
self.annot_scope,
187191
self.accession,
188192
self.cluster_name,
189193
self.method,
@@ -198,6 +202,7 @@ def execute_de(self):
198202
self.matrix_file_path,
199203
self.matrix_file_type,
200204
self.annotation,
205+
self.annot_scope,
201206
self.accession,
202207
self.cluster_name,
203208
self.method,
@@ -222,18 +227,17 @@ def get_genes(genes_path):
222227
"""
223228
genes_df = pd.read_csv(genes_path, sep="\t", header=None)
224229
if len(genes_df.columns) > 1:
230+
# unclear if falling back to gene_id is useful (SCP-4283)
231+
# print so we're aware of dups during dev testing
225232
if genes_df[1].count() == genes_df[1].nunique():
226-
return genes_df[1].tolist()
227-
elif genes_df[0].count() == genes_df[0].nunique():
228-
return genes_df[0].tolist()
233+
msg = "dev_info: Features file contains duplicate identifiers (col 2)"
234+
print(msg)
235+
return genes_df[1].tolist()
229236
else:
230-
msg = "Features file contains duplicate identifiers"
231-
print(msg)
232-
log_exception(
233-
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
234-
)
235-
raise ValueError(msg)
236-
return genes
237+
if genes_df[0].count() == genes_df[0].nunique():
238+
msg = "dev_info: Features file contains duplicate identifiers (col 1)"
239+
print(msg)
240+
return genes_df[0].tolist()
237241

238242
@staticmethod
239243
def get_barcodes(barcodes_path):
@@ -264,6 +268,7 @@ def run_scanpy_de(
264268
matrix_file_path,
265269
matrix_file_type,
266270
annotation,
271+
annot_scope,
267272
study_accession,
268273
cluster_name,
269274
method,
@@ -315,15 +320,23 @@ def run_scanpy_de(
315320
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
316321
)
317322
raise KeyError(msg)
323+
# ToDo - detection and handling of annotations with only one sample (SCP-4282)
324+
except ValueError as e:
325+
print(e)
326+
log_exception(
327+
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, e
328+
)
329+
raise KeyError(e)
318330

319331
DifferentialExpression.de_logger.info("Gathering DE annotation labels")
320332
groups = np.unique(adata.obs[annotation]).tolist()
321333
for group in groups:
322-
group_filename = re.sub(r'\W+', '_', group)
334+
clean_group = re.sub(r'\W+', '_', group)
335+
clean_annotation = re.sub(r'\W+', '_', annotation)
323336
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
324337
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
325338

326-
out_file = f'{cluster_name}--{annotation}--{group_filename}--{method}.tsv'
339+
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
327340
# Round numbers to 4 significant digits while respecting fixed point
328341
# and scientific notation (note: trailing zeros are removed)
329342
rank.to_csv(out_file, sep='\t', float_format='%.4g')

ingest/ingest_pipeline.py

Lines changed: 12 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -579,30 +579,24 @@ def main() -> None:
579579
parsed_args = create_parser().parse_args()
580580
validate_arguments(parsed_args)
581581
arguments = vars(parsed_args)
582-
status = 0
583-
status_cell_metadata = 0
584-
ingest = None
585-
582+
if "differential_expression" in arguments:
583+
# DE may use metadata or cluster file for annots BUT
584+
# IngestPipeline initialization assumes a "cell_metadata_file"
585+
arguments["cell_metadata_file"] = arguments["annotation_file"]
586+
# IngestPipeline initialization expects "name" and not "cluster_name"
587+
arguments["name"] = arguments["cluster_name"]
586588
# Initialize global variables for current ingest job
587589
config.init(
588590
arguments["study_id"],
589591
arguments["study_file_id"],
590592
arguments["user_metrics_uuid"],
591593
)
592-
593-
try:
594-
595-
ingest = IngestPipeline(**arguments)
596-
status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args)
597-
# Print metrics properties
598-
metrics_dump = config.get_metric_properties().get_properties()
599-
for key in metrics_dump.keys():
600-
print(f'{key}: {metrics_dump[key]}')
601-
602-
except Exception as e:
603-
config.set_parent_event_name("ingest-pipeline:unhandled-exception:ingest")
604-
log_exception(IngestPipeline.dev_logger, IngestPipeline.user_logger, e)
605-
status = 1
594+
ingest = IngestPipeline(**arguments)
595+
status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args)
596+
# Print metrics properties
597+
metrics_dump = config.get_metric_properties().get_properties()
598+
for key in metrics_dump.keys():
599+
print(f'{key}: {metrics_dump[key]}')
606600

607601
# Log Mixpanel events
608602
MetricsService.log(config.get_parent_event_name(), config.get_metric_properties())

0 commit comments

Comments
 (0)