Skip to content

Commit

Permalink
Update dependencies and fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
LouisK92 committed Mar 27, 2024
1 parent 44a9888 commit c2e7226
Show file tree
Hide file tree
Showing 38 changed files with 7,902 additions and 10,464 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ __pycache__/
*.py[cod]
*$py.class

# macOS Finder metadata
*.DS_Store

# C extensions
*.so

Expand Down
569 changes: 315 additions & 254 deletions poetry.lock

Large diffs are not rendered by default.

61 changes: 42 additions & 19 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,51 @@ classifiers = [

[tool.poetry.dependencies]
python = ">=3.9,<3.11"
click = "^8.0.1"
click = ">=8.0.1"
rich = ">=10.1.0"
PyYAML = "^6.0.1"
Jinja2 = "^3.0.1"
numpy = "^1.21.1"
PyYAML = ">=6.0.1"
Jinja2 = ">=3.0.1"
numpy = ">=1.21.1"
pandas = "^1.3.0"
scanpy = "^1.8.1"
seaborn = "^0.11.1"
matplotlib = "^3.6.3"
leidenalg = "^0.8.7"
questionary = "^1.10.0"
xgboost = "^1.6.1"
pypi-latest = "^0.1.0"
"ruamel.yaml" = "^0.17.10"
jupyter-sphinx = "^0.3.2"
nox = "^2023.04.22"
nox-poetry = "^1.0.3"
pandoc = "^2.1"
bandit = "^1.7.6"
venndata = "^0.1.0"
Pillow = "^10.0.2"
scanpy = ">=1.9.8"
seaborn = ">=0.11.1"
matplotlib = ">=3.6.3"
leidenalg = ">=0.8.7"
questionary = ">=1.10.0"
xgboost = ">=1.6.1"
pypi-latest = ">=0.1.0"
"ruamel.yaml" = ">=0.17.10"
jupyter-sphinx = ">=0.3.2"
nox = ">=2023.04.22"
nox-poetry = ">=1.0.3"
pandoc = ">=2.1"
bandit = ">=1.7.6"
venndata = ">=0.1.0"
Pillow = ">=10.0.2"
UpSetPlot = ">=0.7.0"
#python = ">=3.9,<3.11"
#click = "^8.0.1"
#rich = ">=10.1.0"
#PyYAML = "^6.0.1"
#Jinja2 = "^3.0.1"
#numpy = "^1.21.1"
#pandas = "^1.3.0"
#scanpy = "^1.9.8"
#seaborn = ">=0.11.1"
#matplotlib = "^3.6.3"
#leidenalg = "^0.8.7"
#questionary = "^1.10.0"
#xgboost = "^1.6.1"
#pypi-latest = "^0.1.0"
#"ruamel.yaml" = "^0.17.10"
#jupyter-sphinx = "^0.3.2"
#nox = "^2023.04.22"
#nox-poetry = "^1.0.3"
#pandoc = "^2.1"
#bandit = "^1.7.6"
#venndata = "^0.1.0"
#Pillow = "^10.0.2"
#UpSetPlot = ">=0.7.0"

[tool.poetry.dev-dependencies]
pytest = ">=7.4.4"
Expand Down
11 changes: 8 additions & 3 deletions spapros/evaluation/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,7 @@ def __init__(
self.scheme = scheme
self.marker_list = marker_list
self.metrics_params = self._prepare_metrics_params(metrics_params)
assert metrics is not None, "metrics must be provided for custom scheme"
self.metrics: List[str] = metrics if (scheme == "custom") else self._get_metrics_of_scheme()
self.metrics = self._get_metrics_of_scheme(metrics)
self.ref_name = reference_name
self.ref_dir = reference_dir if (reference_dir is not None) else self._default_reference_dir()
self.verbosity = verbosity
Expand Down Expand Up @@ -598,10 +597,16 @@ def _prepare_metrics_params(self, new_params: Dict[str, Dict]) -> Dict[str, Dict

def _get_metrics_of_scheme(
self,
metrics,
) -> List[str]:
"""Get the metrics according to the chosen scheme."""

if self.scheme == "quick":
supported = ["quick", "full", "custom"]
assert self.scheme in supported, f"Invalid scheme {self.scheme}. Choose from 'quick', 'full', 'custom'."

if self.scheme == "custom":
assert metrics is not None, "metrics must be provided for custom scheme"
elif self.scheme == "quick":
metrics = ["knn_overlap", "forest_clfs", "gene_corr"]
elif self.scheme == "full":
metrics = ["cluster_similarity", "knn_overlap", "forest_clfs", "gene_corr"]
Expand Down
13 changes: 7 additions & 6 deletions spapros/evaluation/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,11 +968,12 @@ def xgboost_forest_classification(
celltypes = adata.obs[ct_key].unique().tolist()
# Filter out cell types with less cells than n_cells_min
cell_counts = adata.obs[ct_key].value_counts().loc[celltypes]
if (cell_counts < n_cells_min).any() and (verbosity > 0):
print(
f"[bold yellow]The following cell types are not included in forest classifications since they have fewer "
f"than {n_cells_min} cells: {cell_counts.loc[cell_counts < n_cells_min].index.tolist()}"
)
if (cell_counts < n_cells_min).any():
if verbosity > 0:
print(
f"[bold yellow]The following cell types are not included in forest classifications since they have "
f"fewer than {n_cells_min} cells: {cell_counts.loc[cell_counts < n_cells_min].index.tolist()}"
)
celltypes = [ct for ct in celltypes if (cell_counts.loc[ct] >= n_cells_min)]

# Get data
Expand Down Expand Up @@ -1025,7 +1026,7 @@ def xgboost_forest_classification(
sample_weight_train = compute_sample_weight("balanced", train_y)
sample_weight_test = compute_sample_weight("balanced", test_y)
# Fit the classifier
n_classes = len(np.unique(train_y))
n_classes = max(len(np.unique(train_y)), len(np.unique(test_y)))
clf = XGBClassifier(
max_depth=max_depth,
num_class=n_classes if n_classes > 2 else None,
Expand Down
2 changes: 1 addition & 1 deletion spapros/plotting/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,7 +1018,7 @@ def selection_histogram(
if selection_label in penalty_keys:
for _, penalty_key in enumerate(penalty_keys[selection_label]):

if penalty_key not in adata.var:
if penalty_key not in adata.var.columns:
raise ValueError(f"Can't plot {penalty_key} because it was not found in adata.var. ")

if penalty_key not in x_axis_keys:
Expand Down
20 changes: 16 additions & 4 deletions spapros/selection/selection_procedure.py
Original file line number Diff line number Diff line change
Expand Up @@ -1652,6 +1652,11 @@ def plot_coexpression(
probeset_mask = a.var_names.isin(probeset)
a = a[:, probeset_mask & selection_mask]

genes = a.var_names.copy()
sc.pp.filter_genes(a, min_cells=1)
if len(genes) > a.n_vars:
print(f"Exclude genes since they are not expressed: {set(genes) - set(a.var_names)}")

if a.shape[1] < 2:
print(f"No plot is drawn for {selection} because it contains less than 2 genes. ")
continue
Expand Down Expand Up @@ -1754,6 +1759,7 @@ def plot_clf_genes(
# prepare df
if celltypes is None:
celltypes = self.celltypes
celltypes = [c for c in celltypes if c in df.columns]
df["decision_celltypes"] = df[celltypes].apply(lambda row: list(row[row == True].index), axis=1) # noqa: E712
if add_marker_genes and (self.selection["marker"] is not None):
df["marker_celltypes"] = [self.selection["marker"]["celltype"][gene] for gene in df.index]
Expand Down Expand Up @@ -2005,7 +2011,7 @@ def info(self) -> None:
def select_reference_probesets(
adata: sc.AnnData,
n: int,
genes_key: str = "highly_variable",
genes_key: Optional[str] = "highly_variable",
obs_key: str = "celltype",
methods: Union[List[str], Dict[str, Dict]] = ["PCA", "DE", "HVG", "random"],
seeds: List[int] = [0],
Expand All @@ -2021,6 +2027,7 @@ def select_reference_probesets(
Number of selected genes.
genes_key:
adata.var key for subset of preselected genes to run the selections on (typically 'highly_variable_genes').
Set to None to not subset genes.
obs_key:
Only required for method 'DE'. Column name of `adata.obs` for which marker scores are calculated.
methods:
Expand Down Expand Up @@ -2058,6 +2065,8 @@ def select_reference_probesets(
# Reshape methods to dict with empty hyperparams if given as a list
if isinstance(methods, list):
methods = {method: {} for method in methods}
elif not isinstance(methods, dict):
raise ValueError(f"methods must be a list or dict. Got {type(methods)} instead.")
assert isinstance(methods, dict)

# Filter unsupported methods
Expand Down Expand Up @@ -2094,9 +2103,12 @@ def select_reference_probesets(
if verbosity > 1:
sel_task = progress.add_task(f"Selecting {s['name']} genes...", total=1, level=2)

probesets[s["name"]] = selection_fcts[s["method"]](
adata[:, adata.var[genes_key]], n, inplace=False, **s["params"]
)
if genes_key is None:
probesets[s["name"]] = selection_fcts[s["method"]](adata, n, inplace=False, **s["params"])
else:
probesets[s["name"]] = selection_fcts[s["method"]](
adata[:, adata.var[genes_key]], n, inplace=False, **s["params"]
)

if save_dir:
probesets[s["name"]].to_csv(os.path.join(save_dir, s["name"]))
Expand Down
32 changes: 32 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
## Testing

Run tests with `nox` or `pytest`:

```bash
nox
```

```bash
pytest <args>
```

## Regenerating the test data
Some test fixture files were generated once and are checked into the repository. These files shouldn't be changed. However,
there are cases in which they need to be recreated — for example, an `anndata` update can cause warnings when loading h5ads
saved with older versions of `anndata`. In such cases, the test data can be regenerated as follows:

1. For test data etc.:
Run the functions of the file `tests/_generate_test_files.py`. (not implemented yet)

2. For tests that compare their outputs to previously generated outputs (mainly plots):
- Run the according tests
- Find out the temp directory of pytest: e.g. from `python`:
```python
import tempfile
tempfile.gettempdir()
```
    The newest outputs should be in a folder like `<tempdir>/pytest-of-<user>/pytest-<num>/test_<testname>/` (where `<num>` is an incrementing run number)
- Copy the new outputs to the according test subfolder `tests/...` and overwrite the old ones.

3. Some tests don't save their outputs to files but compare them directly to some reference values. In this case, there
should be a comment in the test code that explains how to regenerate the reference values. E.g. function `test_knns_shared_comp` in `spapros/tests/evaluation/test_metrics.py`.
6 changes: 2 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def selector_with_marker(small_adata):
forest_hparams={"n_trees": 10, "subsample": 200, "test_subsample": 400},
verbosity=0,
save_dir=None,
marker_list="/big/st/strasserl/spapros/tests/selection/test_data/small_data_marker_list.csv",
marker_list="tests/evaluation/test_data/small_data_marker_list.csv",
)
raw_selector.select_probeset()
return raw_selector
Expand Down Expand Up @@ -143,9 +143,7 @@ def evaluator_4_sets(small_adata, marker_list):
results_dir="tests/evaluation/test_data/evaluation_results_4_sets",
marker_list=marker_list,
)
four_probesets = pd.read_csv(
"/big/st/strasserl/spapros/tests/evaluation/test_data/4_probesets_of_20.csv", index_col=0
)
four_probesets = pd.read_csv("tests/evaluation/test_data/4_probesets_of_20.csv", index_col=0)
for set_id in four_probesets:
evaluator.evaluate_probeset(set_id=set_id, genes=list(four_probesets[set_id]))
return evaluator
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
,cluster_similarity nmi_5_20,cluster_similarity nmi_21_60,knn_overlap mean_overlap_AUC,forest_clfs accuracy,forest_clfs perct acc > 0.8,gene_corr 1 - mean,gene_corr perct max < 0.8,marker_corr per marker,marker_corr per celltype,marker_corr per marker mean > 0.025
ref_random,0.0623647871374067,0.0927104946422492,0.0009030800347461,0.2759546038164502,0.1561642009862721,0.992915456785802,1.0,0.1432902447379042,0.1275892204195234,0.1432902447379042
ref_PCA,0.805877546541095,0.7593064103029448,0.5675193925865685,0.9119385995927008,0.9431187524990003,0.7606639919179323,0.7511645800130607,0.4385151503152752,0.5133844442314954,0.4385151503152752
ref_DE,0.6975060455506337,0.6764212270254311,0.3661317202740812,0.901557706693009,0.8580713265600309,0.8332276216127024,0.6135335909572877,0.5737896071421181,0.5994240338272974,0.5737896071421181
ref_random,0.062364787137406716,0.09271049464224926,0.0009030800347461037,0.27595460381645026,0.15616420098627212,0.992915456785802,1.0,0.14329024473790428,0.12758922041952342,0.14329024473790428
ref_PCA,0.805877546541095,0.7593064103029448,0.5675193925865685,0.9119385995927007,0.9431187524990005,0.7606639919179323,0.7511645800130607,0.4385151503152752,0.5133844442314954,0.4385151503152752
ref_DE,0.6975060455506337,0.6764212270254311,0.36613172027408125,0.901557706693009,0.8580713265600309,0.8332276216127024,0.6135335909572877,0.5737896071421181,0.5994240338272974,0.5737896071421181
spapros_selection,0.8105865177394072,0.7569520384925248,0.5389194419268526,0.9229008935605123,0.9895748367319737,0.7777080096564589,0.7608637851426177,0.4709711093592685,0.5348414461486936,0.4709711093592685
Loading

0 comments on commit c2e7226

Please sign in to comment.