Skip to content

Commit a9286cb

Browse files
authored
Merge pull request #37 from maxozo/main
Some modifications made to Hadge that were esential for running large scale processing of Onek1k data
2 parents 2ec7166 + 919079c commit a9286cb

72 files changed

Lines changed: 2887 additions & 3835 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.pre-commit-config.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
1+
fail_fast: false
2+
default_language_version:
3+
python: python3
4+
default_stages:
5+
- commit
6+
- push
7+
minimum_pre_commit_version: 2.16.0
18
repos:
29
- repo: https://github.com/pre-commit/mirrors-prettier
310
rev: v2.7.0
411
hooks:
512
- id: prettier
13+
- repo: https://github.com/astral-sh/ruff-pre-commit
14+
rev: v0.2.0
15+
hooks:
16+
- id: ruff
17+
args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes]
18+
- id: ruff-format

bin/bff.R

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,21 @@ library(DropletUtils)
44
library(Seurat)
55
library(ggplot2)
66
library(cowplot)
7+
if(!require("cellhashR")){
8+
devtools::install_github(repo = 'bimberlab/cellhashR', ref = 'master', dependencies = TRUE, upgrade = 'always')
9+
library("cellhashR")
10+
}
11+
712
library(cellhashR)
813
library(here)
914
library(dplyr)
1015
library(argparse)
11-
library(tidyverse)
16+
17+
if(!require("tidyverse")){
18+
install.packages("tidyverse")
19+
library("tidyverse")
20+
}
21+
1222

1323
# Create a parser
1424
parser <- ArgumentParser("Parameters for BFF")

bin/demuxem.py

Lines changed: 126 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -7,45 +7,110 @@
77
import pandas as pd
88
import pegasusio as io
99

10-
parser = argparse.ArgumentParser(description='Parser for DemuxEM - Demultiplexing')
11-
parser.add_argument('--rna_matrix_dir', help= 'cellranger output folder which contains raw RNA count matrix in mtx format.')
12-
parser.add_argument('--hto_matrix_dir', help= 'cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.')
13-
parser.add_argument('--randomState', help='Random seed set for reproducing results.', type=int, default=0)
14-
parser.add_argument('--min_signal', help='Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.', type=float, default=10.0)
15-
parser.add_argument('--min_num_genes', help='We only demultiplex cells/nuclei with at least <number> of expressed genes.', type=int, default=100)
16-
parser.add_argument('--min_num_umis', help='We only demultiplex cells/nuclei with at least <number> of UMIs.', type=int, default=100)
17-
parser.add_argument('--alpha', help='The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.', type=float, default=0.0)
18-
parser.add_argument('--alpha_noise', help='The Dirichlet prior concenration parameter on the background noise.', type=float, default=1.0)
19-
parser.add_argument('--tol', help='Threshold used for the EM convergence.', type=float, default=1e-6)
20-
parser.add_argument('--n_threads', help='Number of threads to use. Must be a positive integer.', type=int, default=1)
21-
parser.add_argument('--filter_demuxem', help='Use the filter for RNA, True or False', default='True')
22-
parser.add_argument('--generateGenderPlot', help='Generate violin plots using gender-specific genes (e.g. Xist). <gene> is a comma-separated list of gene names.', default='')
23-
parser.add_argument('--objectOutDemuxem', help='Output name of demultiplexing results. All outputs will use it as the prefix.', default="demuxem_res")
24-
parser.add_argument('--outputdir', help='Output directory')
10+
parser = argparse.ArgumentParser(description="Parser for DemuxEM - Demultiplexing")
11+
parser.add_argument(
12+
"--rna_matrix_dir",
13+
help="cellranger output folder which contains raw RNA count matrix in mtx format.",
14+
)
15+
parser.add_argument(
16+
"--hto_matrix_dir",
17+
help="cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.",
18+
)
19+
parser.add_argument(
20+
"--randomState",
21+
help="Random seed set for reproducing results.",
22+
type=int,
23+
default=0,
24+
)
25+
parser.add_argument(
26+
"--min_signal",
27+
help="Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.",
28+
type=float,
29+
default=10.0,
30+
)
31+
parser.add_argument(
32+
"--min_num_genes",
33+
help="We only demultiplex cells/nuclei with at least <number> of expressed genes.",
34+
type=int,
35+
default=100,
36+
)
37+
parser.add_argument(
38+
"--min_num_umis",
39+
help="We only demultiplex cells/nuclei with at least <number> of UMIs.",
40+
type=int,
41+
default=100,
42+
)
43+
parser.add_argument(
44+
"--alpha",
45+
help="The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.",
46+
type=float,
47+
default=0.0,
48+
)
49+
parser.add_argument(
50+
"--alpha_noise",
51+
help="The Dirichlet prior concenration parameter on the background noise.",
52+
type=float,
53+
default=1.0,
54+
)
55+
parser.add_argument(
56+
"--tol", help="Threshold used for the EM convergence.", type=float, default=1e-6
57+
)
58+
parser.add_argument(
59+
"--n_threads",
60+
help="Number of threads to use. Must be a positive integer.",
61+
type=int,
62+
default=1,
63+
)
64+
parser.add_argument(
65+
"--filter_demuxem", help="Use the filter for RNA, True or False", default="True"
66+
)
67+
parser.add_argument(
68+
"--generateGenderPlot",
69+
help="Generate violin plots using gender-specific genes (e.g. Xist). <gene> is a comma-separated list of gene names.",
70+
default="",
71+
)
72+
parser.add_argument(
73+
"--objectOutDemuxem",
74+
help="Output name of demultiplexing results. All outputs will use it as the prefix.",
75+
default="demuxem_res",
76+
)
77+
parser.add_argument("--outputdir", help="Output directory")
2578

2679
args = parser.parse_args()
27-
param_list = [['rna_matrix_dir', args.rna_matrix_dir], ['hto_matrix_dir', args.hto_matrix_dir], ['randomState', args.randomState], ['min_signal', args.min_signal], ['min_num_genes', args.min_num_genes], ['min_num_umis', args.min_num_umis], ['alpha', args.alpha], ['alpha_noise', args.alpha_noise], ['tol', args.tol], ['n_threads', args.n_threads], ['generateGenderPlot', args.generateGenderPlot]]
28-
29-
param_df = pd.DataFrame(param_list, columns=['Argument', 'Value'])
80+
param_list = [
81+
["rna_matrix_dir", args.rna_matrix_dir],
82+
["hto_matrix_dir", args.hto_matrix_dir],
83+
["randomState", args.randomState],
84+
["min_signal", args.min_signal],
85+
["min_num_genes", args.min_num_genes],
86+
["min_num_umis", args.min_num_umis],
87+
["alpha", args.alpha],
88+
["alpha_noise", args.alpha_noise],
89+
["tol", args.tol],
90+
["n_threads", args.n_threads],
91+
["generateGenderPlot", args.generateGenderPlot],
92+
]
3093

31-
if __name__ == '__main__':
94+
param_df = pd.DataFrame(param_list, columns=["Argument", "Value"])
95+
96+
if __name__ == "__main__":
3297
output_name = args.outputdir + "/" + args.objectOutDemuxem
3398
# load input rna data
3499
rna_data = sc.read_10x_mtx(args.rna_matrix_dir)
35-
hashing_data = sc.read_10x_mtx(args.hto_matrix_dir,gex_only=False)
100+
hashing_data = sc.read_10x_mtx(args.hto_matrix_dir, gex_only=False)
36101
rna = args.rna_matrix_dir
37102
filter = ""
38-
if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']:
103+
if args.filter_demuxem.lower() in ["true", "t", "yes", "y", "1"]:
39104
filter = True
40-
elif args.filter_demuxem.lower() in ['false', 'f', 'no', 'n', '0']:
105+
elif args.filter_demuxem.lower() in ["false", "f", "no", "n", "0"]:
41106
filter = False
42107
else:
43-
raise ValueError("Invalid boolean value: {}".format(value))
108+
raise ValueError(f"Invalid boolean value: {args.filter_demuxem.lower()}")
44109
# Filter the RNA matrix
45110
rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1)
46111
rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
47-
#data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
48-
if(filter):
112+
# data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
113+
if filter:
49114
print("Filtering RNA matrix")
50115
obs_index = np.logical_and.reduce(
51116
(
@@ -56,25 +121,42 @@
56121
rna_data._inplace_subset_obs(obs_index)
57122
# run demuxEM
58123
demuxEM.estimate_background_probs(hashing_data, random_state=args.randomState)
59-
demuxEM.demultiplex(rna_data, hashing_data, min_signal=args.min_signal, alpha=args.alpha, alpha_noise=args.alpha_noise, tol=args.tol, n_threads=args.n_threads)
124+
demuxEM.demultiplex(
125+
rna_data,
126+
hashing_data,
127+
min_signal=args.min_signal,
128+
alpha=args.alpha,
129+
alpha_noise=args.alpha_noise,
130+
tol=args.tol,
131+
n_threads=args.n_threads,
132+
)
60133
# annotate raw matrix with demuxEM results
61134
demux_results = demuxEM.attach_demux_results(args.rna_matrix_dir, rna_data)
62135
# generate plots
63-
demuxEM.plot_hto_hist(hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0)
64-
demuxEM.plot_bar(hashing_data.uns["background_probs"], hashing_data.var_names, "Sample ID",
65-
"Background probability", output_name + ".background_probabilities.bar.pdf",)
66-
demuxEM.plot_hto_hist(hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5)
136+
demuxEM.plot_hto_hist(
137+
hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0
138+
)
139+
demuxEM.plot_bar(
140+
hashing_data.uns["background_probs"],
141+
hashing_data.var_names,
142+
"Sample ID",
143+
"Background probability",
144+
output_name + ".background_probabilities.bar.pdf",
145+
)
146+
demuxEM.plot_hto_hist(
147+
hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5
148+
)
67149
demuxEM.plot_rna_hist(rna_data, hashing_data, output_name + ".rna_demux.hist.pdf")
68-
150+
69151
if len(args.generateGenderPlot) > 0:
70152
rna_data.matrices["raw.X"] = rna_data.X.copy()
71153
rna_data.as_float()
72154
scale = 1e5 / rna_data.X.sum(axis=1).A1
73-
rna_data.X.data *= np.repeat(scale, np.diff(data.X.indptr))
155+
rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr))
74156
rna_data.X.data = np.log1p(rna_data.X.data)
75157

76158
for gene_name in args.generateGenderPlot:
77-
plot_gene_violin(
159+
pg.violin(
78160
rna_data,
79161
gene_name,
80162
"{output_name}.{gene_name}.violin.pdf".format(
@@ -91,16 +173,21 @@
91173
print("total\t{}".format(rna_data.shape[0]))
92174
for name, value in rna_data.obs["demux_type"].value_counts().items():
93175
print("{}\t{}".format(name, value))
94-
summary = rna_data.obs["demux_type"].value_counts().rename_axis('classification').reset_index(name='counts')
176+
summary = (
177+
rna_data.obs["demux_type"]
178+
.value_counts()
179+
.rename_axis("classification")
180+
.reset_index(name="counts")
181+
)
95182
total = ["total", rna_data.shape[0]]
96183
summary.loc[len(summary)] = total
97184
summary.to_csv(output_name + "_summary.csv", index=False)
98-
param_df.fillna("None",inplace=True)
185+
param_df.fillna("None", inplace=True)
99186
param_df.to_csv(args.outputdir + "/params.csv", index=False)
100-
101-
rna_data.obs.assignment.replace(np.nan,'negative', inplace=True)
187+
188+
rna_data.obs.assignment.replace(np.nan, "negative", inplace=True)
102189
hashtags = hashing_data.var.index.tolist()
103190
hashtags = hashtags + ["negative"]
104-
toreplace = [ht for ht in rna_data.obs['assignment'].unique() if ht not in hashtags]
105-
rna_data.obs.assignment.replace(toreplace,'doublet', inplace=True)
191+
toreplace = [ht for ht in rna_data.obs["assignment"].unique() if ht not in hashtags]
192+
rna_data.obs.assignment.replace(toreplace, "doublet", inplace=True)
106193
rna_data.obs.to_csv(output_name + "_obs.csv")

0 commit comments

Comments
 (0)