Skip to content

Commit 275ae16

Browse files
authored
Merge pull request #255 from broadinstitute/development
Release 1.18.0
2 parents 1ac85e0 + 1a63a27 commit 275ae16

File tree

10 files changed

+611
-46
lines changed

10 files changed

+611
-46
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ jobs:
4747
command: |
4848
. venv/bin/activate
4949
cd tests
50-
pytest -k 'not test_genomes and not test_make_toy' --cov-report=xml --cov=../ingest/
50+
pytest -k 'not test_genomes and not test_make_toy and not test_delocalize_file' --cov-report=xml --cov=../ingest/
5151
5252
- codecov/upload:
5353
file: tests/coverage.xml

ingest/de.py

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from email.headerregistry import Group
21
import logging
32
import numpy as np
43
import pandas as pd
54
import scanpy as sc
65
import re
6+
import glob
77

88
try:
99
from monitor import setup_logger, log_exception
@@ -124,9 +124,11 @@ def process_annots(metadata_file_path, allowed_file_types, headers, dtypes):
124124
"""
125125
annot_redux = IngestFiles(metadata_file_path, allowed_file_types)
126126
annot_file_type = annot_redux.get_file_type(metadata_file_path)[0]
127-
annot_file_handle = annot_redux.open_file(metadata_file_path)[1]
127+
annot_file_handle, local_file_path = IngestFiles.resolve_path(
128+
annot_redux, metadata_file_path
129+
)
128130
annots = annot_redux.open_pandas(
129-
metadata_file_path,
131+
local_file_path,
130132
annot_file_type,
131133
open_file_object=annot_file_handle,
132134
names=headers,
@@ -225,17 +227,24 @@ def get_genes(genes_path):
225227
If two columns present, check if there are duplicates in 2nd col
226228
If no duplicates, use as var_names, else use 1st column
227229
"""
228-
genes_df = pd.read_csv(genes_path, sep="\t", header=None)
230+
genes_object = IngestFiles(genes_path, None)
231+
local_genes_path = genes_object.resolve_path(genes_path)[1]
232+
233+
genes_df = pd.read_csv(local_genes_path, sep="\t", header=None)
229234
if len(genes_df.columns) > 1:
230235
# unclear if falling back to gene_id is useful (SCP-4283)
231236
# print so we're aware of dups during dev testing
232-
if genes_df[1].count() == genes_df[1].nunique():
233-
msg = "dev_info: Features file contains duplicate identifiers (col 2)"
237+
if genes_df[1].count() != genes_df[1].nunique():
238+
msg = (
239+
"dev_info: Features file contains duplicate identifiers in column 2"
240+
)
234241
print(msg)
235242
return genes_df[1].tolist()
236243
else:
237-
if genes_df[0].count() == genes_df[0].nunique():
238-
msg = "dev_info: Features file contains duplicate identifiers (col 1)"
244+
if genes_df[0].count() != genes_df[0].nunique():
245+
msg = (
246+
"dev_info: Features file contains duplicate identifiers in column 1"
247+
)
239248
print(msg)
240249
return genes_df[0].tolist()
241250

@@ -252,13 +261,29 @@ def get_barcodes(barcodes_path):
252261
def adata_from_mtx(matrix_file_path, genes_path, barcodes_path):
253262
""" reconstitute AnnData object from matrix, genes, barcodes files
254263
"""
255-
adata = sc.read_mtx(matrix_file_path)
264+
# process smaller files before reading larger matrix file
265+
barcodes = DifferentialExpression.get_barcodes(barcodes_path)
266+
features = DifferentialExpression.get_genes(genes_path)
267+
matrix_object = IngestFiles(matrix_file_path, None)
268+
local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
269+
adata = sc.read_mtx(local_file_path)
256270
# For AnnData, obs are cells and vars are genes
257271
# BUT transpose needed for both dense and sparse
258272
# so transpose step is after this data object composition step
259273
# therefore the assignments below are the reverse of expected
260-
adata.var_names = DifferentialExpression.get_barcodes(barcodes_path)
261-
adata.obs_names = DifferentialExpression.get_genes(genes_path)
274+
adata.var_names = barcodes
275+
adata.obs_names = features
276+
return adata
277+
278+
@staticmethod
279+
def remove_single_sample_data(adata, annotation):
280+
""" identify and remove cells that would constitute an annotation label
281+
that has data with only a single sample
282+
"""
283+
counts = adata.obs[annotation].value_counts(dropna=False)
284+
for label, count in counts.iteritems():
285+
if count == 1:
286+
adata = adata[adata.obs[annotation] != label]
262287
return adata
263288

264289
@staticmethod
@@ -285,7 +310,9 @@ def run_scanpy_de(
285310

286311
if matrix_file_type == "dense":
287312
# will need try/except (SCP-4205)
288-
adata = sc.read(matrix_file_path)
313+
matrix_object = IngestFiles(matrix_file_path, None)
314+
local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
315+
adata = sc.read(local_file_path)
289316
else:
290317
# MTX reconstitution UNTESTED (SCP-4203)
291318
# will want try/except here to catch failed data object composition
@@ -300,6 +327,8 @@ def run_scanpy_de(
300327
# will need try/except (SCP-4205)
301328
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
302329

330+
adata = DifferentialExpression.remove_single_sample_data(adata, annotation)
331+
303332
sc.pp.normalize_total(adata, target_sum=1e4)
304333
sc.pp.log1p(adata)
305334
DifferentialExpression.de_logger.info("calculating DE")
@@ -348,3 +377,24 @@ def run_scanpy_de(
348377

349378
DifferentialExpression.de_logger.info("DE processing complete")
350379

380+
@staticmethod
381+
def string_for_output_match(arguments):
382+
cleaned_cluster_name = re.sub(r'\W+', '_', arguments["cluster_name"])
383+
cleaned_annotation_name = re.sub(r'\W+', '_', arguments["annotation_name"])
384+
files_to_match = f"{cleaned_cluster_name}--{cleaned_annotation_name}*.tsv"
385+
return files_to_match
386+
387+
@staticmethod
388+
def delocalize_de_files(file_path, study_file_id, files_to_match):
389+
""" Copy DE output files to study bucket
390+
"""
391+
392+
files = glob.glob(files_to_match)
393+
for file in files:
394+
IngestFiles.delocalize_file(
395+
study_file_id,
396+
None,
397+
file_path,
398+
file,
399+
f"_scp_internal/differential_expression/{file}",
400+
)

ingest/ingest_files.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import re
1212
from dataclasses import dataclass
1313
from typing import Dict, Generator, List, Tuple, Union # noqa: F401
14+
import warnings
15+
1416

1517
import pandas as pd # NOqa: F821
1618
from google.cloud import storage
@@ -73,6 +75,10 @@ class IngestFiles:
7375
# General logger for class
7476
# Logger provides more details
7577
dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
78+
# Filter out warnings about using end user credentials when running ingest_pipeline as dev
79+
warnings.filterwarnings(
80+
"ignore", "Your application has authenticated using end user credentials"
81+
)
7682

7783
def __init__(self, file_path, allowed_file_types):
7884
self.file_path = file_path

ingest/ingest_pipeline.py

Lines changed: 58 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,20 @@
3434
3535
# Ingest mtx files
3636
python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --matrix-file ../tests/data/mtx/matrix.mtx --matrix-file-type mtx --gene-file ../tests/data/genes.tsv --barcode-file ../tests/data/barcodes.tsv
37+
38+
# Differential Expression analysis (dense matrix)
39+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_integration.tsv --matrix-file-type dense --annotation-file ../tests/data/differential_expression/de_integration_unordered_metadata.tsv --cluster-file ../tests/data/differential_expression/de_integration_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression
40+
41+
# Differential Expression analysis (sparse matrix)
42+
python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --cell-metadata-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression
43+
3744
"""
3845
import json
3946
import logging
4047
import os
4148
import re
4249
import sys
50+
import re
4351
from contextlib import nullcontext
4452
from typing import Dict, Generator, List, Tuple, Union
4553
from wsgiref.simple_server import WSGIRequestHandler # noqa: F401
@@ -519,44 +527,66 @@ def run_ingest(ingest, arguments, parsed_args):
519527
config.set_parent_event_name("ingest-pipeline:differential-expression")
520528
status_de = ingest.calculate_de()
521529
status.append(status_de)
530+
print(f'STATUS post-DE {status}')
522531

523532
return status, status_cell_metadata
524533

525534

535+
def get_delocalization_info(arguments):
536+
""" extract info on study file for delocalization decision-making
537+
"""
538+
for argument in list(arguments.keys()):
539+
captured_argument = re.match("(\w*file)$", argument)
540+
if captured_argument is not None:
541+
study_file_id = arguments["study_file_id"]
542+
matched_argument = captured_argument.groups()[0]
543+
file_path = arguments[matched_argument]
544+
545+
# Need 1 argument that has a path to identify google bucket
546+
# Break after first argument
547+
break
548+
return file_path, study_file_id
549+
550+
526551
def exit_pipeline(ingest, status, status_cell_metadata, arguments):
527552
"""Logs any errors, then exits Ingest Pipeline with standard OS code
528553
"""
529554
if len(status) > 0:
530-
if all(i < 1 for i in status):
555+
# for successful DE jobs, need to delocalize results
556+
if "differential_expression" in arguments and all(i < 1 for i in status):
557+
file_path, study_file_id = get_delocalization_info(arguments)
558+
# append status?
559+
if IngestFiles.is_remote_file(file_path):
560+
files_to_match = DifferentialExpression.string_for_output_match(
561+
arguments
562+
)
563+
DifferentialExpression.delocalize_de_files(
564+
file_path, study_file_id, files_to_match
565+
)
566+
# all non-DE ingest jobs can exit on success
567+
elif all(i < 1 for i in status):
531568
sys.exit(os.EX_OK)
532569
else:
533-
# delocalize errors file
534-
for argument in list(arguments.keys()):
535-
captured_argument = re.match("(\w*file)$", argument)
536-
if captured_argument is not None:
537-
study_file_id = arguments["study_file_id"]
538-
matched_argument = captured_argument.groups()[0]
539-
file_path = arguments[matched_argument]
540-
if IngestFiles.is_remote_file(file_path):
541-
# Delocalize support log
542-
IngestFiles.delocalize_file(
543-
study_file_id,
544-
arguments["study_id"],
545-
file_path,
546-
"log.txt",
547-
f"parse_logs/{study_file_id}/log.txt",
548-
)
549-
# Delocalize user log
550-
IngestFiles.delocalize_file(
551-
study_file_id,
552-
arguments["study_id"],
553-
file_path,
554-
"user_log.txt",
555-
f"parse_logs/{study_file_id}/user_log.txt",
556-
)
557-
# Need 1 argument that has a path to identify google bucket
558-
# Break after first argument
559-
break
570+
file_path, study_file_id = get_delocalization_info(arguments)
571+
if IngestFiles.is_remote_file(file_path):
572+
if "differential_expression" in arguments:
573+
log_path = (
574+
f"parse_logs/differential_expression/{study_file_id}/log.txt"
575+
)
576+
else:
577+
log_path = f"parse_logs/{study_file_id}/log.txt"
578+
# Delocalize support log
579+
IngestFiles.delocalize_file(
580+
study_file_id, arguments["study_id"], file_path, "log.txt", log_path
581+
)
582+
# Delocalize user log
583+
IngestFiles.delocalize_file(
584+
study_file_id,
585+
arguments["study_id"],
586+
file_path,
587+
"user_log.txt",
588+
log_path,
589+
)
560590
if status_cell_metadata is not None:
561591
if status_cell_metadata > 0 and ingest.cell_metadata.is_remote_file:
562592
# PAPI jobs failing metadata validation against convention report

0 commit comments

Comments (0)