Skip to content

Commit

Permalink
Update dependencies and fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
LouisK92 committed Mar 27, 2024
1 parent 44a9888 commit c2e7226
Show file tree
Hide file tree
Showing 38 changed files with 7,902 additions and 10,464 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ __pycache__/
*.py[cod]
*$py.class

# macOS Finder metadata
*.DS_Store

# C extensions
*.so

Expand Down
569 changes: 315 additions & 254 deletions poetry.lock

Large diffs are not rendered by default.

61 changes: 42 additions & 19 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,51 @@ classifiers = [

[tool.poetry.dependencies]
python = ">=3.9,<3.11"
click = "^8.0.1"
click = ">=8.0.1"
rich = ">=10.1.0"
PyYAML = "^6.0.1"
Jinja2 = "^3.0.1"
numpy = "^1.21.1"
PyYAML = ">=6.0.1"
Jinja2 = ">=3.0.1"
numpy = ">=1.21.1"
pandas = "^1.3.0"
scanpy = "^1.8.1"
seaborn = "^0.11.1"
matplotlib = "^3.6.3"
leidenalg = "^0.8.7"
questionary = "^1.10.0"
xgboost = "^1.6.1"
pypi-latest = "^0.1.0"
"ruamel.yaml" = "^0.17.10"
jupyter-sphinx = "^0.3.2"
nox = "^2023.04.22"
nox-poetry = "^1.0.3"
pandoc = "^2.1"
bandit = "^1.7.6"
venndata = "^0.1.0"
Pillow = "^10.0.2"
scanpy = ">=1.9.8"
seaborn = ">=0.11.1"
matplotlib = ">=3.6.3"
leidenalg = ">=0.8.7"
questionary = ">=1.10.0"
xgboost = ">=1.6.1"
pypi-latest = ">=0.1.0"
"ruamel.yaml" = ">=0.17.10"
jupyter-sphinx = ">=0.3.2"
nox = ">=2023.04.22"
nox-poetry = ">=1.0.3"
pandoc = ">=2.1"
bandit = ">=1.7.6"
venndata = ">=0.1.0"
Pillow = ">=10.0.2"
UpSetPlot = ">=0.7.0"
#python = ">=3.9,<3.11"
#click = "^8.0.1"
#rich = ">=10.1.0"
#PyYAML = "^6.0.1"
#Jinja2 = "^3.0.1"
#numpy = "^1.21.1"
#pandas = "^1.3.0"
#scanpy = "^1.9.8"
#seaborn = ">=0.11.1"
#matplotlib = "^3.6.3"
#leidenalg = "^0.8.7"
#questionary = "^1.10.0"
#xgboost = "^1.6.1"
#pypi-latest = "^0.1.0"
#"ruamel.yaml" = "^0.17.10"
#jupyter-sphinx = "^0.3.2"
#nox = "^2023.04.22"
#nox-poetry = "^1.0.3"
#pandoc = "^2.1"
#bandit = "^1.7.6"
#venndata = "^0.1.0"
#Pillow = "^10.0.2"
#UpSetPlot = ">=0.7.0"

[tool.poetry.dev-dependencies]
pytest = ">=7.4.4"
Expand Down
11 changes: 8 additions & 3 deletions spapros/evaluation/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,7 @@ def __init__(
self.scheme = scheme
self.marker_list = marker_list
self.metrics_params = self._prepare_metrics_params(metrics_params)
assert metrics is not None, "metrics must be provided for custom scheme"
self.metrics: List[str] = metrics if (scheme == "custom") else self._get_metrics_of_scheme()
self.metrics = self._get_metrics_of_scheme(metrics)
self.ref_name = reference_name
self.ref_dir = reference_dir if (reference_dir is not None) else self._default_reference_dir()
self.verbosity = verbosity
Expand Down Expand Up @@ -598,10 +597,16 @@ def _prepare_metrics_params(self, new_params: Dict[str, Dict]) -> Dict[str, Dict

def _get_metrics_of_scheme(
self,
metrics,
) -> List[str]:
"""Get the metrics according to the chosen scheme."""

if self.scheme == "quick":
supported = ["quick", "full", "custom"]
assert self.scheme in supported, f"Invalid scheme {self.scheme}. Choose from 'quick', 'full', 'custom'."

if self.scheme == "custom":
assert metrics is not None, "metrics must be provided for custom scheme"
elif self.scheme == "quick":
metrics = ["knn_overlap", "forest_clfs", "gene_corr"]
elif self.scheme == "full":
metrics = ["cluster_similarity", "knn_overlap", "forest_clfs", "gene_corr"]
Expand Down
13 changes: 7 additions & 6 deletions spapros/evaluation/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,11 +968,12 @@ def xgboost_forest_classification(
celltypes = adata.obs[ct_key].unique().tolist()
# Filter out cell types with less cells than n_cells_min
cell_counts = adata.obs[ct_key].value_counts().loc[celltypes]
if (cell_counts < n_cells_min).any() and (verbosity > 0):
print(
f"[bold yellow]The following cell types are not included in forest classifications since they have fewer "
f"than {n_cells_min} cells: {cell_counts.loc[cell_counts < n_cells_min].index.tolist()}"
)
if (cell_counts < n_cells_min).any():
if verbosity > 0:
print(
f"[bold yellow]The following cell types are not included in forest classifications since they have "
f"fewer than {n_cells_min} cells: {cell_counts.loc[cell_counts < n_cells_min].index.tolist()}"
)
celltypes = [ct for ct in celltypes if (cell_counts.loc[ct] >= n_cells_min)]

# Get data
Expand Down Expand Up @@ -1025,7 +1026,7 @@ def xgboost_forest_classification(
sample_weight_train = compute_sample_weight("balanced", train_y)
sample_weight_test = compute_sample_weight("balanced", test_y)
# Fit the classifier
n_classes = len(np.unique(train_y))
n_classes = max(len(np.unique(train_y)), len(np.unique(test_y)))
clf = XGBClassifier(
max_depth=max_depth,
num_class=n_classes if n_classes > 2 else None,
Expand Down
2 changes: 1 addition & 1 deletion spapros/plotting/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,7 +1018,7 @@ def selection_histogram(
if selection_label in penalty_keys:
for _, penalty_key in enumerate(penalty_keys[selection_label]):

if penalty_key not in adata.var:
if penalty_key not in adata.var.columns:
raise ValueError(f"Can't plot {penalty_key} because it was not found in adata.var. ")

if penalty_key not in x_axis_keys:
Expand Down
20 changes: 16 additions & 4 deletions spapros/selection/selection_procedure.py
Original file line number Diff line number Diff line change
Expand Up @@ -1652,6 +1652,11 @@ def plot_coexpression(
probeset_mask = a.var_names.isin(probeset)
a = a[:, probeset_mask & selection_mask]

genes = a.var_names.copy()
sc.pp.filter_genes(a, min_cells=1)
if len(genes) > a.n_vars:
print(f"Exclude genes since they are not expressed: {set(genes) - set(a.var_names)}")

if a.shape[1] < 2:
print(f"No plot is drawn for {selection} because it contains less than 2 genes. ")
continue
Expand Down Expand Up @@ -1754,6 +1759,7 @@ def plot_clf_genes(
# prepare df
if celltypes is None:
celltypes = self.celltypes
celltypes = [c for c in celltypes if c in df.columns]
df["decision_celltypes"] = df[celltypes].apply(lambda row: list(row[row == True].index), axis=1) # noqa: E712
if add_marker_genes and (self.selection["marker"] is not None):
df["marker_celltypes"] = [self.selection["marker"]["celltype"][gene] for gene in df.index]
Expand Down Expand Up @@ -2005,7 +2011,7 @@ def info(self) -> None:
def select_reference_probesets(
adata: sc.AnnData,
n: int,
genes_key: str = "highly_variable",
genes_key: Optional[str] = "highly_variable",
obs_key: str = "celltype",
methods: Union[List[str], Dict[str, Dict]] = ["PCA", "DE", "HVG", "random"],
seeds: List[int] = [0],
Expand All @@ -2021,6 +2027,7 @@ def select_reference_probesets(
Number of selected genes.
genes_key:
adata.var key for subset of preselected genes to run the selections on (typically 'highly_variable_genes').
Set to None to not subset genes.
obs_key:
Only required for method 'DE'. Column name of `adata.obs` for which marker scores are calculated.
methods:
Expand Down Expand Up @@ -2058,6 +2065,8 @@ def select_reference_probesets(
# Reshape methods to dict with empty hyperparams if given as a list
if isinstance(methods, list):
methods = {method: {} for method in methods}
elif not isinstance(methods, dict):
raise ValueError(f"methods must be a list or dict. Got {type(methods)} instead.")
assert isinstance(methods, dict)

# Filter unsupported methods
Expand Down Expand Up @@ -2094,9 +2103,12 @@ def select_reference_probesets(
if verbosity > 1:
sel_task = progress.add_task(f"Selecting {s['name']} genes...", total=1, level=2)

probesets[s["name"]] = selection_fcts[s["method"]](
adata[:, adata.var[genes_key]], n, inplace=False, **s["params"]
)
if genes_key is None:
probesets[s["name"]] = selection_fcts[s["method"]](adata, n, inplace=False, **s["params"])
else:
probesets[s["name"]] = selection_fcts[s["method"]](
adata[:, adata.var[genes_key]], n, inplace=False, **s["params"]
)

if save_dir:
probesets[s["name"]].to_csv(os.path.join(save_dir, s["name"]))
Expand Down
32 changes: 32 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
## Testing

Run tests with `nox` or `pytest`:

```bash
nox
```

```bash
pytest <args>
```

## Regenerating the test data
Some test fixture files were generated once and are checked into the repository. These files shouldn't be changed. However,
there are cases in which they need to be recreated — for example, an `anndata` update can cause warnings when loading h5ads
saved with older versions of `anndata`. In such cases, the test data can be regenerated as follows:

1. For test data etc.:
Run the functions of the file `tests/_generate_test_files.py`. (not implemented yet)

2. For tests that compare their outputs to previously generated outputs (mainly plots):
- Run the according tests
- Find out the temp directory of pytest: e.g. from `python`:
```python
import tempfile
tempfile.gettempdir()
```
    The newest outputs should be in a folder like `<tempdir>/pytest-of-<user>/pytest-<num>/test_<testname>/` (where `<num>` is an incrementing run number)
- Copy the new outputs to the according test subfolder `tests/...` and overwrite the old ones.

3. Some tests don't save their outputs to files but compare them directly to some reference values. In this case, there
should be a comment in the test code that explains how to regenerate the reference values. E.g. function `test_knns_shared_comp` in `spapros/tests/evaluation/test_metrics.py`.
6 changes: 2 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def selector_with_marker(small_adata):
forest_hparams={"n_trees": 10, "subsample": 200, "test_subsample": 400},
verbosity=0,
save_dir=None,
marker_list="/big/st/strasserl/spapros/tests/selection/test_data/small_data_marker_list.csv",
marker_list="tests/evaluation/test_data/small_data_marker_list.csv",
)
raw_selector.select_probeset()
return raw_selector
Expand Down Expand Up @@ -143,9 +143,7 @@ def evaluator_4_sets(small_adata, marker_list):
results_dir="tests/evaluation/test_data/evaluation_results_4_sets",
marker_list=marker_list,
)
four_probesets = pd.read_csv(
"/big/st/strasserl/spapros/tests/evaluation/test_data/4_probesets_of_20.csv", index_col=0
)
four_probesets = pd.read_csv("tests/evaluation/test_data/4_probesets_of_20.csv", index_col=0)
for set_id in four_probesets:
evaluator.evaluate_probeset(set_id=set_id, genes=list(four_probesets[set_id]))
return evaluator
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
,cluster_similarity nmi_5_20,cluster_similarity nmi_21_60,knn_overlap mean_overlap_AUC,forest_clfs accuracy,forest_clfs perct acc > 0.8,gene_corr 1 - mean,gene_corr perct max < 0.8,marker_corr per marker,marker_corr per celltype,marker_corr per marker mean > 0.025
ref_random,0.0623647871374067,0.0927104946422492,0.0009030800347461,0.2759546038164502,0.1561642009862721,0.992915456785802,1.0,0.1432902447379042,0.1275892204195234,0.1432902447379042
ref_PCA,0.805877546541095,0.7593064103029448,0.5675193925865685,0.9119385995927008,0.9431187524990003,0.7606639919179323,0.7511645800130607,0.4385151503152752,0.5133844442314954,0.4385151503152752
ref_DE,0.6975060455506337,0.6764212270254311,0.3661317202740812,0.901557706693009,0.8580713265600309,0.8332276216127024,0.6135335909572877,0.5737896071421181,0.5994240338272974,0.5737896071421181
ref_random,0.062364787137406716,0.09271049464224926,0.0009030800347461037,0.27595460381645026,0.15616420098627212,0.992915456785802,1.0,0.14329024473790428,0.12758922041952342,0.14329024473790428
ref_PCA,0.805877546541095,0.7593064103029448,0.5675193925865685,0.9119385995927007,0.9431187524990005,0.7606639919179323,0.7511645800130607,0.4385151503152752,0.5133844442314954,0.4385151503152752
ref_DE,0.6975060455506337,0.6764212270254311,0.36613172027408125,0.901557706693009,0.8580713265600309,0.8332276216127024,0.6135335909572877,0.5737896071421181,0.5994240338272974,0.5737896071421181
spapros_selection,0.8105865177394072,0.7569520384925248,0.5389194419268526,0.9229008935605123,0.9895748367319737,0.7777080096564589,0.7608637851426177,0.4709711093592685,0.5348414461486936,0.4709711093592685
Loading

0 comments on commit c2e7226

Please sign in to comment.