|
7 | 7 | import pandas as pd |
8 | 8 | import pegasusio as io |
9 | 9 |
|
10 | | -parser = argparse.ArgumentParser(description='Parser for DemuxEM - Demultiplexing') |
11 | | -parser.add_argument('--rna_matrix_dir', help= 'cellranger output folder which contains raw RNA count matrix in mtx format.') |
12 | | -parser.add_argument('--hto_matrix_dir', help= 'cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.') |
13 | | -parser.add_argument('--randomState', help='Random seed set for reproducing results.', type=int, default=0) |
14 | | -parser.add_argument('--min_signal', help='Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.', type=float, default=10.0) |
15 | | -parser.add_argument('--min_num_genes', help='We only demultiplex cells/nuclei with at least <number> of expressed genes.', type=int, default=100) |
16 | | -parser.add_argument('--min_num_umis', help='We only demultiplex cells/nuclei with at least <number> of UMIs.', type=int, default=100) |
17 | | -parser.add_argument('--alpha', help='The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.', type=float, default=0.0) |
18 | | -parser.add_argument('--alpha_noise', help='The Dirichlet prior concenration parameter on the background noise.', type=float, default=1.0) |
19 | | -parser.add_argument('--tol', help='Threshold used for the EM convergence.', type=float, default=1e-6) |
20 | | -parser.add_argument('--n_threads', help='Number of threads to use. Must be a positive integer.', type=int, default=1) |
21 | | -parser.add_argument('--filter_demuxem', help='Use the filter for RNA, True or False', default='True') |
22 | | -parser.add_argument('--generateGenderPlot', help='Generate violin plots using gender-specific genes (e.g. Xist). <gene> is a comma-separated list of gene names.', default='') |
23 | | -parser.add_argument('--objectOutDemuxem', help='Output name of demultiplexing results. All outputs will use it as the prefix.', default="demuxem_res") |
24 | | -parser.add_argument('--outputdir', help='Output directory') |
| 10 | +parser = argparse.ArgumentParser(description="Parser for DemuxEM - Demultiplexing") |
| 11 | +parser.add_argument( |
| 12 | + "--rna_matrix_dir", |
| 13 | + help="cellranger output folder which contains raw RNA count matrix in mtx format.", |
| 14 | +) |
| 15 | +parser.add_argument( |
| 16 | + "--hto_matrix_dir", |
| 17 | + help="cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.", |
| 18 | +) |
| 19 | +parser.add_argument( |
| 20 | + "--randomState", |
| 21 | + help="Random seed set for reproducing results.", |
| 22 | + type=int, |
| 23 | + default=0, |
| 24 | +) |
| 25 | +parser.add_argument( |
| 26 | + "--min_signal", |
| 27 | + help="Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.", |
| 28 | + type=float, |
| 29 | + default=10.0, |
| 30 | +) |
| 31 | +parser.add_argument( |
| 32 | + "--min_num_genes", |
| 33 | + help="We only demultiplex cells/nuclei with at least <number> of expressed genes.", |
| 34 | + type=int, |
| 35 | + default=100, |
| 36 | +) |
| 37 | +parser.add_argument( |
| 38 | + "--min_num_umis", |
| 39 | + help="We only demultiplex cells/nuclei with at least <number> of UMIs.", |
| 40 | + type=int, |
| 41 | + default=100, |
| 42 | +) |
| 43 | +parser.add_argument( |
| 44 | + "--alpha", |
| 45 | + help="The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.", |
| 46 | + type=float, |
| 47 | + default=0.0, |
| 48 | +) |
| 49 | +parser.add_argument( |
| 50 | + "--alpha_noise", |
| 51 | + help="The Dirichlet prior concenration parameter on the background noise.", |
| 52 | + type=float, |
| 53 | + default=1.0, |
| 54 | +) |
| 55 | +parser.add_argument( |
| 56 | + "--tol", help="Threshold used for the EM convergence.", type=float, default=1e-6 |
| 57 | +) |
| 58 | +parser.add_argument( |
| 59 | + "--n_threads", |
| 60 | + help="Number of threads to use. Must be a positive integer.", |
| 61 | + type=int, |
| 62 | + default=1, |
| 63 | +) |
| 64 | +parser.add_argument( |
| 65 | + "--filter_demuxem", help="Use the filter for RNA, True or False", default="True" |
| 66 | +) |
| 67 | +parser.add_argument( |
| 68 | + "--generateGenderPlot", |
| 69 | + help="Generate violin plots using gender-specific genes (e.g. Xist). <gene> is a comma-separated list of gene names.", |
| 70 | + default="", |
| 71 | +) |
| 72 | +parser.add_argument( |
| 73 | + "--objectOutDemuxem", |
| 74 | + help="Output name of demultiplexing results. All outputs will use it as the prefix.", |
| 75 | + default="demuxem_res", |
| 76 | +) |
| 77 | +parser.add_argument("--outputdir", help="Output directory") |
25 | 78 |
|
26 | 79 | args = parser.parse_args() |
27 | | -param_list = [['rna_matrix_dir', args.rna_matrix_dir], ['hto_matrix_dir', args.hto_matrix_dir], ['randomState', args.randomState], ['min_signal', args.min_signal], ['min_num_genes', args.min_num_genes], ['min_num_umis', args.min_num_umis], ['alpha', args.alpha], ['alpha_noise', args.alpha_noise], ['tol', args.tol], ['n_threads', args.n_threads], ['generateGenderPlot', args.generateGenderPlot]] |
28 | | - |
29 | | -param_df = pd.DataFrame(param_list, columns=['Argument', 'Value']) |
| 80 | +param_list = [ |
| 81 | + ["rna_matrix_dir", args.rna_matrix_dir], |
| 82 | + ["hto_matrix_dir", args.hto_matrix_dir], |
| 83 | + ["randomState", args.randomState], |
| 84 | + ["min_signal", args.min_signal], |
| 85 | + ["min_num_genes", args.min_num_genes], |
| 86 | + ["min_num_umis", args.min_num_umis], |
| 87 | + ["alpha", args.alpha], |
| 88 | + ["alpha_noise", args.alpha_noise], |
| 89 | + ["tol", args.tol], |
| 90 | + ["n_threads", args.n_threads], |
| 91 | + ["generateGenderPlot", args.generateGenderPlot], |
| 92 | +] |
30 | 93 |
|
31 | | -if __name__ == '__main__': |
| 94 | +param_df = pd.DataFrame(param_list, columns=["Argument", "Value"]) |
| 95 | + |
| 96 | +if __name__ == "__main__": |
32 | 97 | output_name = args.outputdir + "/" + args.objectOutDemuxem |
33 | 98 | # load input rna data |
34 | 99 | rna_data = sc.read_10x_mtx(args.rna_matrix_dir) |
35 | | - hashing_data = sc.read_10x_mtx(args.hto_matrix_dir,gex_only=False) |
| 100 | + hashing_data = sc.read_10x_mtx(args.hto_matrix_dir, gex_only=False) |
36 | 101 | rna = args.rna_matrix_dir |
37 | 102 | filter = "" |
38 | | - if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']: |
| 103 | + if args.filter_demuxem.lower() in ["true", "t", "yes", "y", "1"]: |
39 | 104 | filter = True |
40 | | - elif args.filter_demuxem.lower() in ['false', 'f', 'no', 'n', '0']: |
| 105 | + elif args.filter_demuxem.lower() in ["false", "f", "no", "n", "0"]: |
41 | 106 | filter = False |
42 | 107 | else: |
43 | | - raise ValueError("Invalid boolean value: {}".format(value)) |
| 108 | + raise ValueError(f"Invalid boolean value: {args.filter_demuxem.lower()}") |
44 | 109 | # Filter the RNA matrix |
45 | 110 | rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1) |
46 | 111 | rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1 |
47 | | - #data.obs["n_counts"] = rna_data.X.sum(axis=1).A1 |
48 | | - if(filter): |
| 112 | + # data.obs["n_counts"] = rna_data.X.sum(axis=1).A1 |
| 113 | + if filter: |
49 | 114 | print("Filtering RNA matrix") |
50 | 115 | obs_index = np.logical_and.reduce( |
51 | 116 | ( |
|
56 | 121 | rna_data._inplace_subset_obs(obs_index) |
57 | 122 | # run demuxEM |
58 | 123 | demuxEM.estimate_background_probs(hashing_data, random_state=args.randomState) |
59 | | - demuxEM.demultiplex(rna_data, hashing_data, min_signal=args.min_signal, alpha=args.alpha, alpha_noise=args.alpha_noise, tol=args.tol, n_threads=args.n_threads) |
| 124 | + demuxEM.demultiplex( |
| 125 | + rna_data, |
| 126 | + hashing_data, |
| 127 | + min_signal=args.min_signal, |
| 128 | + alpha=args.alpha, |
| 129 | + alpha_noise=args.alpha_noise, |
| 130 | + tol=args.tol, |
| 131 | + n_threads=args.n_threads, |
| 132 | + ) |
60 | 133 | # annotate raw matrix with demuxEM results |
61 | 134 | demux_results = demuxEM.attach_demux_results(args.rna_matrix_dir, rna_data) |
62 | 135 | # generate plots |
63 | | - demuxEM.plot_hto_hist(hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0) |
64 | | - demuxEM.plot_bar(hashing_data.uns["background_probs"], hashing_data.var_names, "Sample ID", |
65 | | - "Background probability", output_name + ".background_probabilities.bar.pdf",) |
66 | | - demuxEM.plot_hto_hist(hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5) |
| 136 | + demuxEM.plot_hto_hist( |
| 137 | + hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0 |
| 138 | + ) |
| 139 | + demuxEM.plot_bar( |
| 140 | + hashing_data.uns["background_probs"], |
| 141 | + hashing_data.var_names, |
| 142 | + "Sample ID", |
| 143 | + "Background probability", |
| 144 | + output_name + ".background_probabilities.bar.pdf", |
| 145 | + ) |
| 146 | + demuxEM.plot_hto_hist( |
| 147 | + hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5 |
| 148 | + ) |
67 | 149 | demuxEM.plot_rna_hist(rna_data, hashing_data, output_name + ".rna_demux.hist.pdf") |
68 | | - |
| 150 | + |
69 | 151 | if len(args.generateGenderPlot) > 0: |
70 | 152 | rna_data.matrices["raw.X"] = rna_data.X.copy() |
71 | 153 | rna_data.as_float() |
72 | 154 | scale = 1e5 / rna_data.X.sum(axis=1).A1 |
73 | | - rna_data.X.data *= np.repeat(scale, np.diff(data.X.indptr)) |
| 155 | + rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr)) |
74 | 156 | rna_data.X.data = np.log1p(rna_data.X.data) |
75 | 157 |
|
76 | 158 | for gene_name in args.generateGenderPlot: |
77 | | - plot_gene_violin( |
| 159 | + pg.violin( |
78 | 160 | rna_data, |
79 | 161 | gene_name, |
80 | 162 | "{output_name}.{gene_name}.violin.pdf".format( |
|
91 | 173 | print("total\t{}".format(rna_data.shape[0])) |
92 | 174 | for name, value in rna_data.obs["demux_type"].value_counts().items(): |
93 | 175 | print("{}\t{}".format(name, value)) |
94 | | - summary = rna_data.obs["demux_type"].value_counts().rename_axis('classification').reset_index(name='counts') |
| 176 | + summary = ( |
| 177 | + rna_data.obs["demux_type"] |
| 178 | + .value_counts() |
| 179 | + .rename_axis("classification") |
| 180 | + .reset_index(name="counts") |
| 181 | + ) |
95 | 182 | total = ["total", rna_data.shape[0]] |
96 | 183 | summary.loc[len(summary)] = total |
97 | 184 | summary.to_csv(output_name + "_summary.csv", index=False) |
98 | | - param_df.fillna("None",inplace=True) |
| 185 | + param_df.fillna("None", inplace=True) |
99 | 186 | param_df.to_csv(args.outputdir + "/params.csv", index=False) |
100 | | - |
101 | | - rna_data.obs.assignment.replace(np.nan,'negative', inplace=True) |
| 187 | + |
| 188 | + rna_data.obs.assignment.replace(np.nan, "negative", inplace=True) |
102 | 189 | hashtags = hashing_data.var.index.tolist() |
103 | 190 | hashtags = hashtags + ["negative"] |
104 | | - toreplace = [ht for ht in rna_data.obs['assignment'].unique() if ht not in hashtags] |
105 | | - rna_data.obs.assignment.replace(toreplace,'doublet', inplace=True) |
| 191 | + toreplace = [ht for ht in rna_data.obs["assignment"].unique() if ht not in hashtags] |
| 192 | + rna_data.obs.assignment.replace(toreplace, "doublet", inplace=True) |
106 | 193 | rna_data.obs.to_csv(output_name + "_obs.csv") |
0 commit comments