|
| 1 | +import logging |
| 2 | +import numpy as np |
| 3 | +import pandas as pd |
| 4 | +import scanpy as sc |
| 5 | + |
| 6 | +try: |
| 7 | + from monitor import setup_logger, log_exception |
| 8 | + from annotations import Annotations |
| 9 | + from clusters import Clusters |
| 10 | + from cell_metadata import CellMetadata |
| 11 | + from ingest_files import IngestFiles |
| 12 | +except ImportError: |
| 13 | + # Used when importing as external package, e.g. imports in single_cell_portal code |
| 14 | + from .monitor import setup_logger, log_exception |
| 15 | + from .annotations import Annotations |
| 16 | + from .clusters import Clusters |
| 17 | + from .cell_metadata import CellMetadata |
| 18 | + from .ingest_files import IngestFiles |
| 19 | + |
| 20 | + |
| 21 | +class DifferentialExpression: |
| 22 | + ALLOWED_FILE_TYPES = ["text/csv", "text/plain", "text/tab-separated-values"] |
| 23 | + |
| 24 | + dev_logger = setup_logger(__name__, "log.txt", format="support_configs") |
| 25 | + de_logger = setup_logger( |
| 26 | + __name__ + ".de_logger", |
| 27 | + "de_log.txt", |
| 28 | + level=logging.INFO, |
| 29 | + format="support_configs", |
| 30 | + ) |
| 31 | + |
| 32 | + def __init__( |
| 33 | + self, |
| 34 | + cluster, |
| 35 | + cell_metadata, |
| 36 | + matrix_file_path, |
| 37 | + matrix_file_type, |
| 38 | + annotation, |
| 39 | + **kwargs, |
| 40 | + ): |
| 41 | + self.cluster = cluster |
| 42 | + self.metadata = cell_metadata |
| 43 | + self.annotation = annotation |
| 44 | + self.matrix_file_path = matrix_file_path |
| 45 | + self.matrix_file_type = matrix_file_type |
| 46 | + self.kwargs = kwargs |
| 47 | + self.accession = self.kwargs["study_accession"] |
| 48 | + # only used in output filename, removing spaces |
| 49 | + self.cluster_name = self.kwargs["name"].replace(" ", "_") |
| 50 | + self.method = self.kwargs["method"] |
| 51 | + |
| 52 | + if matrix_file_type == "mtx": |
| 53 | + self.genes_path = self.kwargs["gene_file"] |
| 54 | + genes_ingest_file = IngestFiles(self.genes_path, self.ALLOWED_FILE_TYPES) |
| 55 | + self.genes_file = genes_ingest_file.resolve_path(self.genes_path)[0] |
| 56 | + self.genes: List[str] = [ |
| 57 | + g.strip().strip('"') for g in self.genes_file.readlines() |
| 58 | + ] |
| 59 | + |
| 60 | + self.barcodes_path = self.kwargs["barcode_file"] |
| 61 | + barcodes_ingest_file = IngestFiles( |
| 62 | + self.barcodes_path, self.ALLOWED_FILE_TYPES |
| 63 | + ) |
| 64 | + self.barcodes_file = barcodes_ingest_file.resolve_path(self.barcodes_path)[ |
| 65 | + 0 |
| 66 | + ] |
| 67 | + self.barcodes: List[str] = [ |
| 68 | + c.strip().strip('"') for c in self.barcodes_file.readlines() |
| 69 | + ] |
| 70 | + DifferentialExpression.de_logger.info(f"DifferentialExpression initialized") |
| 71 | + |
| 72 | + @staticmethod |
| 73 | + def get_cluster_cells(cluster_cells): |
| 74 | + """ ID cells in cluster file """ |
| 75 | + # cluster_cells.tolist() yields a list of lists that needs to be flattened |
| 76 | + # using extend converts a single-value list to a plain value |
| 77 | + cluster_cell_values = cluster_cells.tolist() |
| 78 | + cluster_cell_list = [] |
| 79 | + for value in cluster_cell_values: |
| 80 | + cluster_cell_list.extend(value) |
| 81 | + return cluster_cell_list |
| 82 | + |
| 83 | + @staticmethod |
| 84 | + def determine_dtypes(headers, annot_types): |
| 85 | + """ use SCP TYPE data to coerce data to proper dtypes: |
| 86 | + numeric-like group annotations |
| 87 | + missing values in group annotations (to avoid NaN) |
| 88 | + """ |
| 89 | + dtype_info = dict(zip(headers, annot_types)) |
| 90 | + dtypes = {} |
| 91 | + for header, type_info in dtype_info.items(): |
| 92 | + if type_info == "group": |
| 93 | + dtypes[header] = "string" |
| 94 | + return dtypes |
| 95 | + |
| 96 | + @staticmethod |
| 97 | + def load_raw_annots(metadata_file_path, allowed_file_types, headers, dtypes): |
| 98 | + """ using SCP metadata header lines |
| 99 | + create properly coerced pandas dataframe of all study metadata |
| 100 | + """ |
| 101 | + annot_redux = IngestFiles(metadata_file_path, allowed_file_types) |
| 102 | + annot_file_type = annot_redux.get_file_type(metadata_file_path)[0] |
| 103 | + annot_file_handle = annot_redux.open_file(metadata_file_path)[1] |
| 104 | + raw_annots = annot_redux.open_pandas( |
| 105 | + metadata_file_path, |
| 106 | + annot_file_type, |
| 107 | + open_file_object=annot_file_handle, |
| 108 | + names=headers, |
| 109 | + skiprows=2, |
| 110 | + index_col=0, |
| 111 | + dtype=dtypes, |
| 112 | + ) |
| 113 | + return raw_annots |
| 114 | + |
| 115 | + @staticmethod |
| 116 | + def subset_annots(metadata, de_cells): |
| 117 | + """ subset metadata based on cells in cluster |
| 118 | + """ |
| 119 | + DifferentialExpression.de_logger.info( |
| 120 | + f"subsetting metadata on cells in clustering" |
| 121 | + ) |
| 122 | + dtypes = DifferentialExpression.determine_dtypes( |
| 123 | + metadata.headers, metadata.annot_types |
| 124 | + ) |
| 125 | + raw_annots = DifferentialExpression.load_raw_annots( |
| 126 | + metadata.file_path, metadata.ALLOWED_FILE_TYPES, metadata.headers, dtypes |
| 127 | + ) |
| 128 | + cluster_annots = raw_annots[raw_annots.index.isin(de_cells)] |
| 129 | + return cluster_annots |
| 130 | + |
| 131 | + @staticmethod |
| 132 | + def order_annots(metadata, adata_cells): |
| 133 | + """ order metadata based on cells order in matrix |
| 134 | + """ |
| 135 | + matrix_cell_order = adata_cells.tolist() |
| 136 | + return metadata.reindex(matrix_cell_order) |
| 137 | + |
| 138 | + @staticmethod |
| 139 | + def subset_adata(adata, de_cells): |
| 140 | + """ subset adata object based on cells in cluster |
| 141 | + """ |
| 142 | + DifferentialExpression.de_logger.info( |
| 143 | + f"subsetting matrix on cells in clustering" |
| 144 | + ) |
| 145 | + matrix_subset_list = np.in1d(adata.obs_names, de_cells) |
| 146 | + adata = adata[matrix_subset_list] |
| 147 | + return adata |
| 148 | + |
| 149 | + def execute_de(self): |
| 150 | + if self.matrix_file_type == "mtx": |
| 151 | + self.prepare_h5ad( |
| 152 | + self.cluster, |
| 153 | + self.metadata, |
| 154 | + self.matrix_file_path, |
| 155 | + self.matrix_file_type, |
| 156 | + self.annotation, |
| 157 | + self.accession, |
| 158 | + self.genes, |
| 159 | + self.barcodes, |
| 160 | + ) |
| 161 | + DifferentialExpression.de_logger.info("preparing DE on sparse matrix") |
| 162 | + else: |
| 163 | + self.run_h5ad( |
| 164 | + self.cluster, |
| 165 | + self.metadata, |
| 166 | + self.matrix_file_path, |
| 167 | + self.matrix_file_type, |
| 168 | + self.annotation, |
| 169 | + self.accession, |
| 170 | + self.cluster_name, |
| 171 | + self.method, |
| 172 | + ) |
| 173 | + DifferentialExpression.de_logger.info("preparing DE on dense matrix") |
| 174 | + |
| 175 | + @staticmethod |
| 176 | + def run_h5ad( |
| 177 | + cluster, |
| 178 | + metadata, |
| 179 | + matrix_file_path, |
| 180 | + matrix_file_type, |
| 181 | + annotation, |
| 182 | + study_accession, |
| 183 | + cluster_name, |
| 184 | + method, |
| 185 | + genes=None, |
| 186 | + barcodes=None, |
| 187 | + ): |
| 188 | + |
| 189 | + de_cells = DifferentialExpression.get_cluster_cells(cluster.file['NAME'].values) |
| 190 | + de_annots = DifferentialExpression.subset_annots(metadata, de_cells) |
| 191 | + |
| 192 | + if matrix_file_type == "dense": |
| 193 | + # will need try/except (SCP-4205) |
| 194 | + adata = sc.read(matrix_file_path) |
| 195 | + else: |
| 196 | + # MTX DE UNTESTED (SCP-4203) |
| 197 | + # will want try/except here to catch failed data object composition |
| 198 | + adata = sc.read_mtx(matrix_file_path) |
| 199 | + # For AnnData, obs are cells and vars are genes |
| 200 | + # BUT transpose needed for both dense and sparse |
| 201 | + # so transpose step is after this data object composition step |
| 202 | + # therefore the assignements below are the reverse of expected |
| 203 | + adata.obs_names = genes |
| 204 | + adata.var_names = barcodes |
| 205 | + |
| 206 | + adata = adata.transpose() |
| 207 | + |
| 208 | + adata = DifferentialExpression.subset_adata(adata, de_cells) |
| 209 | + |
| 210 | + # will need try/except (SCP-4205) |
| 211 | + adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names) |
| 212 | + |
| 213 | + sc.pp.normalize_total(adata, target_sum=1e4) |
| 214 | + sc.pp.log1p(adata) |
| 215 | + rank_key = "rank." + annotation + "." + method |
| 216 | + DifferentialExpression.de_logger.info(f"calculating DE") |
| 217 | + try: |
| 218 | + sc.tl.rank_genes_groups( |
| 219 | + adata, |
| 220 | + annotation, |
| 221 | + key_added=rank_key, |
| 222 | + use_raw=False, |
| 223 | + method=method, |
| 224 | + pts=True, |
| 225 | + ) |
| 226 | + except KeyError: |
| 227 | + msg = f"Missing expected annotation in metadata: {annotation}, unable to calculate DE." |
| 228 | + log_exception( |
| 229 | + DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg |
| 230 | + ) |
| 231 | + raise KeyError(msg) |
| 232 | + |
| 233 | + groups = np.unique(adata.obs[annotation]).tolist() |
| 234 | + for group in groups: |
| 235 | + group_filename = group.replace(" ", "_") |
| 236 | + DifferentialExpression.de_logger.info(f"Writing DE output for {str(group)}") |
| 237 | + rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=str(group)) |
| 238 | + |
| 239 | + out_file = ( |
| 240 | + f'{cluster_name}--{annotation}--{str(group_filename)}--{method}.tsv' |
| 241 | + ) |
| 242 | + # Round numbers to 4 significant digits while respecting fixed point |
| 243 | + # and scientific notation (note: trailing zeros are removed) |
| 244 | + rank.to_csv(out_file, sep='\t', float_format='%.4g') |
| 245 | + |
| 246 | + # Provide h5ad of DE analysis as reference computable object |
| 247 | + # DifferentialExpression.de_logger.info(f"Writing DE h5ad file") |
| 248 | + # file_name = f'{study_accession}_{cluster_name}_to_DE.h5ad' |
| 249 | + # adata.write_h5ad(file_name) |
| 250 | + |
| 251 | + DifferentialExpression.de_logger.info(f"DE processing complete") |
| 252 | + |
0 commit comments