Skip to content

Commit 3a9d12d

Browse files
authored
Merge pull request #239 from broadinstitute/development
Release 1.15.0
2 parents 52cb0ba + a68ecb5 commit 3a9d12d

11 files changed

+1371
-27
lines changed

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ RUN apt-get -y update && \
2222
apt -y install python3.7 && \
2323
apt -y install python3.7-dev
2424

25-
RUN python3.7 -m pip install pip
25+
RUN python3.7 -m pip install --upgrade pip
2626

2727
# Set cleaner defaults (`alias` fails)
2828
RUN ln -s /usr/bin/python3.7 /usr/bin/python && \
@@ -38,4 +38,4 @@ WORKDIR /scp-ingest-pipeline
3838
RUN python3.7 -m pip install -r requirements.txt
3939

4040
WORKDIR /scp-ingest-pipeline/ingest
41-
CMD ["python", "ingest_pipeline.py", "--help"]
41+
CMD ["python", "ingest_pipeline.py", "--help"]

ingest/annotations.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ def determine_coordinates_and_cell_names(self):
8585
@staticmethod
8686
def get_cell_names(df):
8787
cell_names_str: str = df[("NAME", "TYPE")].to_string(index=False)
88-
cell_names: list = cell_names_str.strip().splitlines()
88+
cell_names_list: list = cell_names_str.splitlines()
89+
cell_names: list = [i.strip() for i in cell_names_list]
8990
return cell_names
9091

9192
def merge_df(self, first_df, second_df):

ingest/cli_parser.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,73 @@ def create_parser():
248248
"--cell-metadata-file", help="Absolute or relative path to cell metadata file."
249249
)
250250

251+
# Differential expression subparsers
252+
parser_differential_expression = subparsers.add_parser(
253+
"differential_expression",
254+
help="Indicates differential expression analysis processing",
255+
)
256+
257+
parser_differential_expression.add_argument(
258+
"--differential-expression",
259+
required=True,
260+
action="store_true",
261+
help="Indicates that differential expression analysis should be invoked",
262+
)
263+
264+
parser_differential_expression.add_argument(
265+
"--study-accession",
266+
required=True,
267+
help="Single study accession associated with provided DE input files.",
268+
)
269+
270+
parser_differential_expression.add_argument(
271+
"--annotation", required=True, help="Name of annotation for DE analysis"
272+
)
273+
274+
parser_differential_expression.add_argument(
275+
"--method", default="wilcoxon", help="method for DE"
276+
)
277+
278+
parser_differential_expression.add_argument(
279+
"--name", required=True, help="study owner-specified cluster anem"
280+
)
281+
282+
parser_differential_expression.add_argument(
283+
"--cluster-file",
284+
required=True,
285+
help="Absolute or relative path to cluster file.",
286+
)
287+
288+
parser_differential_expression.add_argument(
289+
"--cell-metadata-file",
290+
required=True,
291+
help="Absolute or relative path to cell metadata file.",
292+
)
293+
294+
parser_differential_expression.add_argument(
295+
"--matrix-file-path",
296+
required=True,
297+
help="Absolute or relative path to "
298+
"expression file. For 10x data this is "
299+
"the .mtx file",
300+
)
301+
302+
parser_differential_expression.add_argument(
303+
"--matrix-file-type",
304+
choices=EXPRESSION_FILE_TYPES,
305+
type=str.lower,
306+
required=True,
307+
help=matrix_file_type_txt,
308+
)
309+
310+
# Gene and Barcode arguments for MTX bundle
311+
parser_differential_expression.add_argument(
312+
"--barcode-file", help="Path to .barcodes.tsv files"
313+
)
314+
parser_differential_expression.add_argument(
315+
"--gene-file", help="Path to .genes.tsv file"
316+
)
317+
251318
return parser
252319

253320

ingest/de.py

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
import logging
2+
import numpy as np
3+
import pandas as pd
4+
import scanpy as sc
5+
6+
try:
7+
from monitor import setup_logger, log_exception
8+
from annotations import Annotations
9+
from clusters import Clusters
10+
from cell_metadata import CellMetadata
11+
from ingest_files import IngestFiles
12+
except ImportError:
13+
# Used when importing as external package, e.g. imports in single_cell_portal code
14+
from .monitor import setup_logger, log_exception
15+
from .annotations import Annotations
16+
from .clusters import Clusters
17+
from .cell_metadata import CellMetadata
18+
from .ingest_files import IngestFiles
19+
20+
21+
class DifferentialExpression:
22+
ALLOWED_FILE_TYPES = ["text/csv", "text/plain", "text/tab-separated-values"]
23+
24+
dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
25+
de_logger = setup_logger(
26+
__name__ + ".de_logger",
27+
"de_log.txt",
28+
level=logging.INFO,
29+
format="support_configs",
30+
)
31+
32+
def __init__(
33+
self,
34+
cluster,
35+
cell_metadata,
36+
matrix_file_path,
37+
matrix_file_type,
38+
annotation,
39+
**kwargs,
40+
):
41+
self.cluster = cluster
42+
self.metadata = cell_metadata
43+
self.annotation = annotation
44+
self.matrix_file_path = matrix_file_path
45+
self.matrix_file_type = matrix_file_type
46+
self.kwargs = kwargs
47+
self.accession = self.kwargs["study_accession"]
48+
# only used in output filename, removing spaces
49+
self.cluster_name = self.kwargs["name"].replace(" ", "_")
50+
self.method = self.kwargs["method"]
51+
52+
if matrix_file_type == "mtx":
53+
self.genes_path = self.kwargs["gene_file"]
54+
genes_ingest_file = IngestFiles(self.genes_path, self.ALLOWED_FILE_TYPES)
55+
self.genes_file = genes_ingest_file.resolve_path(self.genes_path)[0]
56+
self.genes: List[str] = [
57+
g.strip().strip('"') for g in self.genes_file.readlines()
58+
]
59+
60+
self.barcodes_path = self.kwargs["barcode_file"]
61+
barcodes_ingest_file = IngestFiles(
62+
self.barcodes_path, self.ALLOWED_FILE_TYPES
63+
)
64+
self.barcodes_file = barcodes_ingest_file.resolve_path(self.barcodes_path)[
65+
0
66+
]
67+
self.barcodes: List[str] = [
68+
c.strip().strip('"') for c in self.barcodes_file.readlines()
69+
]
70+
DifferentialExpression.de_logger.info(f"DifferentialExpression initialized")
71+
72+
@staticmethod
73+
def get_cluster_cells(cluster_cells):
74+
""" ID cells in cluster file """
75+
# cluster_cells.tolist() yields a list of lists that needs to be flattened
76+
# using extend converts a single-value list to a plain value
77+
cluster_cell_values = cluster_cells.tolist()
78+
cluster_cell_list = []
79+
for value in cluster_cell_values:
80+
cluster_cell_list.extend(value)
81+
return cluster_cell_list
82+
83+
@staticmethod
84+
def determine_dtypes(headers, annot_types):
85+
""" use SCP TYPE data to coerce data to proper dtypes:
86+
numeric-like group annotations
87+
missing values in group annotations (to avoid NaN)
88+
"""
89+
dtype_info = dict(zip(headers, annot_types))
90+
dtypes = {}
91+
for header, type_info in dtype_info.items():
92+
if type_info == "group":
93+
dtypes[header] = "string"
94+
return dtypes
95+
96+
@staticmethod
97+
def load_raw_annots(metadata_file_path, allowed_file_types, headers, dtypes):
98+
""" using SCP metadata header lines
99+
create properly coerced pandas dataframe of all study metadata
100+
"""
101+
annot_redux = IngestFiles(metadata_file_path, allowed_file_types)
102+
annot_file_type = annot_redux.get_file_type(metadata_file_path)[0]
103+
annot_file_handle = annot_redux.open_file(metadata_file_path)[1]
104+
raw_annots = annot_redux.open_pandas(
105+
metadata_file_path,
106+
annot_file_type,
107+
open_file_object=annot_file_handle,
108+
names=headers,
109+
skiprows=2,
110+
index_col=0,
111+
dtype=dtypes,
112+
)
113+
return raw_annots
114+
115+
@staticmethod
116+
def subset_annots(metadata, de_cells):
117+
""" subset metadata based on cells in cluster
118+
"""
119+
DifferentialExpression.de_logger.info(
120+
f"subsetting metadata on cells in clustering"
121+
)
122+
dtypes = DifferentialExpression.determine_dtypes(
123+
metadata.headers, metadata.annot_types
124+
)
125+
raw_annots = DifferentialExpression.load_raw_annots(
126+
metadata.file_path, metadata.ALLOWED_FILE_TYPES, metadata.headers, dtypes
127+
)
128+
cluster_annots = raw_annots[raw_annots.index.isin(de_cells)]
129+
return cluster_annots
130+
131+
@staticmethod
132+
def order_annots(metadata, adata_cells):
133+
""" order metadata based on cells order in matrix
134+
"""
135+
matrix_cell_order = adata_cells.tolist()
136+
return metadata.reindex(matrix_cell_order)
137+
138+
@staticmethod
139+
def subset_adata(adata, de_cells):
140+
""" subset adata object based on cells in cluster
141+
"""
142+
DifferentialExpression.de_logger.info(
143+
f"subsetting matrix on cells in clustering"
144+
)
145+
matrix_subset_list = np.in1d(adata.obs_names, de_cells)
146+
adata = adata[matrix_subset_list]
147+
return adata
148+
149+
def execute_de(self):
150+
if self.matrix_file_type == "mtx":
151+
self.prepare_h5ad(
152+
self.cluster,
153+
self.metadata,
154+
self.matrix_file_path,
155+
self.matrix_file_type,
156+
self.annotation,
157+
self.accession,
158+
self.genes,
159+
self.barcodes,
160+
)
161+
DifferentialExpression.de_logger.info("preparing DE on sparse matrix")
162+
else:
163+
self.run_h5ad(
164+
self.cluster,
165+
self.metadata,
166+
self.matrix_file_path,
167+
self.matrix_file_type,
168+
self.annotation,
169+
self.accession,
170+
self.cluster_name,
171+
self.method,
172+
)
173+
DifferentialExpression.de_logger.info("preparing DE on dense matrix")
174+
175+
@staticmethod
176+
def run_h5ad(
177+
cluster,
178+
metadata,
179+
matrix_file_path,
180+
matrix_file_type,
181+
annotation,
182+
study_accession,
183+
cluster_name,
184+
method,
185+
genes=None,
186+
barcodes=None,
187+
):
188+
189+
de_cells = DifferentialExpression.get_cluster_cells(cluster.file['NAME'].values)
190+
de_annots = DifferentialExpression.subset_annots(metadata, de_cells)
191+
192+
if matrix_file_type == "dense":
193+
# will need try/except (SCP-4205)
194+
adata = sc.read(matrix_file_path)
195+
else:
196+
# MTX DE UNTESTED (SCP-4203)
197+
# will want try/except here to catch failed data object composition
198+
adata = sc.read_mtx(matrix_file_path)
199+
# For AnnData, obs are cells and vars are genes
200+
# BUT transpose needed for both dense and sparse
201+
# so transpose step is after this data object composition step
202+
# therefore the assignements below are the reverse of expected
203+
adata.obs_names = genes
204+
adata.var_names = barcodes
205+
206+
adata = adata.transpose()
207+
208+
adata = DifferentialExpression.subset_adata(adata, de_cells)
209+
210+
# will need try/except (SCP-4205)
211+
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
212+
213+
sc.pp.normalize_total(adata, target_sum=1e4)
214+
sc.pp.log1p(adata)
215+
rank_key = "rank." + annotation + "." + method
216+
DifferentialExpression.de_logger.info(f"calculating DE")
217+
try:
218+
sc.tl.rank_genes_groups(
219+
adata,
220+
annotation,
221+
key_added=rank_key,
222+
use_raw=False,
223+
method=method,
224+
pts=True,
225+
)
226+
except KeyError:
227+
msg = f"Missing expected annotation in metadata: {annotation}, unable to calculate DE."
228+
log_exception(
229+
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
230+
)
231+
raise KeyError(msg)
232+
233+
groups = np.unique(adata.obs[annotation]).tolist()
234+
for group in groups:
235+
group_filename = group.replace(" ", "_")
236+
DifferentialExpression.de_logger.info(f"Writing DE output for {str(group)}")
237+
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=str(group))
238+
239+
out_file = (
240+
f'{cluster_name}--{annotation}--{str(group_filename)}--{method}.tsv'
241+
)
242+
# Round numbers to 4 significant digits while respecting fixed point
243+
# and scientific notation (note: trailing zeros are removed)
244+
rank.to_csv(out_file, sep='\t', float_format='%.4g')
245+
246+
# Provide h5ad of DE analysis as reference computable object
247+
# DifferentialExpression.de_logger.info(f"Writing DE h5ad file")
248+
# file_name = f'{study_accession}_{cluster_name}_to_DE.h5ad'
249+
# adata.write_h5ad(file_name)
250+
251+
DifferentialExpression.de_logger.info(f"DE processing complete")
252+

0 commit comments

Comments
 (0)