 import pandas as pd  # noqa: F821
 import os
-import datetime
+import gzip
+import shutil
 import scanpy as sc
+import scipy
+from scipy.io.mmio import MMFile
+
+# scipy.io.mmwrite uses scientific notation by default
+# https://stackoverflow.com/questions/64748513
+class MMFileFixedFormat(MMFile):
+    def _field_template(self, field, precision):
+        # Override MMFile._field_template.
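+        # With precision=3 this writes fixed-point values, e.g. "0.500"
+        # instead of the default scientific notation "5.000e-01".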
+        return f'%.{precision}f\n'
+
 
 try:
     from ingest_files import IngestFiles
@@ -35,7 +46,7 @@ def obtain_adata(self):
         except ValueError as e:
             raise ValueError(e)
 
-    def validate(self):
+    def basic_validation(self):
         """
         Currently, file passes "basic validation" if file
         can be opened by scanpy
@@ -91,6 +102,7 @@ def generate_cluster_body(adata, clustering_name):
         pd.DataFrame(cluster_body).to_csv(
             filename, sep="\t", mode="a", header=None, index=False
         )
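+        # gzip the written cluster file for faster delocalization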
+        AnnDataIngestor.compress_file(filename)
 
     @staticmethod
     def set_clustering_filename(name):
@@ -114,17 +126,58 @@ def generate_metadata_file(adata, output_name):
             f.write('\t'.join(headers) + '\n')
             f.write('\t'.join(types) + '\n')
         adata.obs.to_csv(output_name, sep="\t", mode="a", header=None, index=True)
+        AnnDataIngestor.compress_file(output_name)
 
     @staticmethod
     def clusterings_to_delocalize(arguments):
         # ToDo - check if names using obsm_keys need sanitization
         cluster_file_names = []
         for name in arguments["obsm_keys"]:
-            cluster_file_names.append(AnnDataIngestor.set_clustering_filename(name))
+            compressed_file = AnnDataIngestor.set_clustering_filename(name) + ".gz"
+            cluster_file_names.append(compressed_file)
         return cluster_file_names
 
     @staticmethod
-    def delocalize_extracted_files(file_path, study_file_id, files_to_delocalize):
+    def compress_file(filename):
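+        # Stream the file through gzip with shutil.copyfileobj so large
+        # matrices are compressed chunk by chunk rather than read fully
+        # into memory; the uncompressed original is removed afterward.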
+        with open(filename, 'rb') as file_in:
+            compressed_file = filename + '.gz'
+            with gzip.open(compressed_file, 'wb') as file_gz:
+                shutil.copyfileobj(file_in, file_gz)
+        os.remove(filename)
+
+    @staticmethod
+    def generate_processed_matrix(adata):
+        """
+        Generate matrix files with the following file names:
+        h5ad_frag.matrix.processed.mtx
+        h5ad_frag.barcodes.processed.tsv
+        h5ad_frag.features.processed.tsv
+        Gzip files for faster delocalization
+        """
+        pd.DataFrame(adata.var.index).to_csv(
+            "h5ad_frag.features.processed.tsv.gz",
+            sep="\t",
+            index=False,
+            header=False,
+            compression="gzip",
+        )
+        pd.DataFrame(adata.obs.index).to_csv(
+            "h5ad_frag.barcodes.processed.tsv.gz",
+            sep="\t",
+            index=False,
+            header=False,
+            compression="gzip",
+        )
+        mtx_filename = "h5ad_frag.matrix.processed.mtx"
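+        # adata.X is cells x genes; transpose so the MTX is genes x cells,
+        # matching the features (rows) and barcodes (columns) files above.
+        # precision=3 mirrors the three-decimal rounding the removed
+        # transform() method applied to expression scores.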
+        MMFileFixedFormat().write(
+            mtx_filename, a=scipy.sparse.csr_matrix(adata.X.T), precision=3
+        )
+        AnnDataIngestor.compress_file(mtx_filename)
+
+    @staticmethod
+    def delocalize_extracted_files(
+        file_path, study_file_id, accession, files_to_delocalize
+    ):
         """Copy extracted files to study bucket"""
 
         for file in files_to_delocalize:
@@ -133,121 +186,5 @@ def delocalize_extracted_files(file_path, study_file_id, files_to_delocalize):
                 None,
                 file_path,
                 file,
-                f"_scp_internal/anndata_ingest/{study_file_id}/{file}",
-            )
-
-    @staticmethod
-    def check_valid(adata):
-        error_messages = []
-
-        try:
-            AnnDataIngestor.check_names_unique(adata.var_names, "Feature")
-        except ValueError as v:
-            error_messages.append(str(v))
-        try:
-            AnnDataIngestor.check_names_unique(adata.obs_names, "Obs")
-        except ValueError as v:
-            error_messages.append(str(v))
-        if len(error_messages) > 0:
-            raise ValueError("; ".join(error_messages))
-
-        return True
-
-    def process_matrix(self):
-        """Perform matrix processing"""
-        if self.check_valid(self.adata):
-            self.transform()
-
-    @staticmethod
-    def check_names_unique(names, name_type):
-        """Return True if names are unique, else false
-        Expected name_types: ["Feature", "Obs"]
-        """
-        # check feature_name and obs names, feature_id logic not included
-        # TODO (SCP-5105) non-happy path - add feature_id assessment
-        if len(names) == len(names.unique()):
-            return True
-        else:
-            dups = list_duplicates(names)
-            features_for_msg = 2
-            end = features_for_msg if len(dups) > features_for_msg else len(dups)
-            dup_list = dups[:end]
-            dup_string = " ".join(dup_list)
-
-            msg = (
-                f"{name_type} names must be unique within a file. "
-                f"{len(dups)} duplicates found, including: {dup_string}"
-            )
-            GeneExpression.log_for_mixpanel(
-                "error", "content:duplicate:values-within-file", msg
-            )
-            raise ValueError(msg)
-
-    def transform(self):
-        """Transforms matrix into gene data model."""
-        # initialize settings for mock data loads in tests
-        self.test_models = None
-        self.models_processed = 0
-
-        # derive file name from file path
-        file_name = os.path.basename(self.file_path)
-        start_time = datetime.datetime.now()
-        GeneExpression.dev_logger.info("Starting run at " + str(start_time))
-        num_processed = 0
-        gene_models = []
-        data_arrays = []
-        for all_cell_model in GeneExpression.create_data_arrays(
-            name=f"{file_name} Cells",
-            array_type="cells",
-            values=self.adata.obs.index.tolist(),
-            linear_data_type="Study",
-            linear_data_id=self.study_file_id,
-            **self.data_array_kwargs,
-        ):
-            data_arrays.append(all_cell_model)
-
-        # ASSUMPTION all_cell_model same for raw_count and processed_expression
-        # TODO (SCP-5103): if raw counts is indicated check that .raw slot is populated
-
-        # Iterate over feature names (for happy path)
-        for feature in self.adata.var_names.tolist():
-            print(f"processing feature: {feature}")
-            feature_expression_series = sc.get.obs_df(self.adata, keys=feature)
-            if feature_expression_series.hasnans:
-                msg = (
-                    f'Expected numeric expression score - '
-                    f'expression data has NaN values for feature "{feature}"'
-                )
-                GeneExpression.log_for_mixpanel(
-                    "error", "content:type:not-numeric", msg
-                )
-                raise ValueError(msg)
-            # capture sparse (only non zero values and their cell IDs)
-            # check mtx.py for all zero gene handling
-            filtered_expression_series = feature_expression_series[
-                feature_expression_series.values > 0
-            ]
-
-            exp_cells = filtered_expression_series.index.tolist()
-
-            untrimmed_exp_scores = filtered_expression_series.values.tolist()
-
-            # trim expression data to three significant digits
-            exp_scores = [round(float(value), 3) for value in untrimmed_exp_scores]
-            # TODO (SCP-5105) for None value below, replace with feature ID (string)
-            data_arrays, gene_models, num_processed = self.create_models(
-                exp_cells,
-                exp_scores,
-                feature,
-                None,
-                gene_models,
-                data_arrays,
-                num_processed,
-                False,
-            )
-        # Load any remaining models. This is necessary because the amount of
-        # models may be less than the batch size.
-        if len(gene_models) > 0 or len(data_arrays) > 0:
-            self.create_models(
-                [], [], None, None, gene_models, data_arrays, num_processed, True
+                f"_scp_internal/anndata_ingest/{accession}_{study_file_id}/{file}",
             )