Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimise PC regression #408

Merged
merged 28 commits into main from optimise_pcr
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d0ee46d
add numpy linreg function and use in pcr
mumichae Apr 22, 2024
c2b3e1c
allow configuring linreg method for pcr methods
mumichae Apr 22, 2024
cecc9f0
simplify numpy linreg function
mumichae Apr 22, 2024
8c7df11
add tests for different linreg implementations
mumichae Apr 22, 2024
25fb1bf
set PCA recomputation to False by default
mumichae Apr 22, 2024
7651163
use multithreading for linear regression and add tqdm
mumichae Apr 22, 2024
f4649a6
Merge branch 'main' into optimise_pcr
mumichae Apr 22, 2024
5851f36
add durations to test command
mumichae Apr 22, 2024
559af31
Merge branch 'optimise_pcr' of github.com:theislab/scib into optimise…
mumichae Apr 22, 2024
1c0ec1d
add durations flag to pytest command
mumichae Apr 22, 2024
5b7af3d
add min duration flag
mumichae Apr 22, 2024
a3751f6
add comments for ThreadPoolExecutor run
mumichae Apr 26, 2024
c06da2a
allow to specify n_threads and linreg algorithm
mumichae Apr 27, 2024
344fb75
add keywords to cell cycle scoring
mumichae Apr 27, 2024
e4021fe
add multiple threading in pcr test
mumichae Apr 28, 2024
4197019
use nanmean
mumichae Aug 7, 2024
f8c6a91
test cell cycle score with numpy linreg
mumichae Aug 7, 2024
93aa312
use more memory-efficient implementation of ThreadPool
mumichae Aug 12, 2024
0739f8b
fix cases where there are no residuals
mumichae Sep 25, 2024
a2c3450
Merge branch 'main' into optimise_pcr
mumichae Dec 5, 2024
eb8e298
allow to configure svd_solver
mumichae Dec 10, 2024
b5ce504
use multiple linear regression and remove multithreading approach
mumichae Dec 10, 2024
7cd7848
use pcr score from numpy exact OLS implementation
mumichae Dec 10, 2024
6123cf1
remove testing code
mumichae Dec 10, 2024
504885e
include old pcr implementation
mumichae Dec 20, 2024
65ad588
Update scib/metrics/cell_cycle.py
mumichae Dec 20, 2024
e56c641
change default to numpy
mumichae Dec 20, 2024
c54ad5c
Merge branch 'optimise_pcr' of github.com:theislab/scib into optimise…
mumichae Dec 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
- name: Test with pytest
if: ${{ matrix.os != 'macos-latest'}}
run: |
pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv
pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_metrics_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"

- name: Upload coverage to GitHub Actions
Expand Down Expand Up @@ -98,7 +98,7 @@ jobs:

- name: Test with pytest
run: |
pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2
pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2 --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_rpy2_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"

- name: Upload coverage to GitHub Actions
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:

- name: Test with pytest
run: |
pytest --cov=scib --cov-report=xml -vv --tb=native -k integration
pytest --cov=scib --cov-report=xml -vv --tb=native -k integration --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_integration_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"

- name: Upload coverage to GitHub Actions
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ build-backend = "setuptools.build_meta"
log_cli = 'True'
log_cli_level = 'INFO'
addopts = '-p no:warnings'
durations = 0
50 changes: 35 additions & 15 deletions scib/metrics/cell_cycle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from tqdm import tqdm

from ..preprocessing import score_cell_cycle
from ..utils import check_adata
Expand All @@ -11,12 +12,14 @@ def cell_cycle(
adata_post,
batch_key,
embed=None,
agg_func=np.mean,
agg_func=np.nanmean,
organism="mouse",
n_comps=50,
recompute_cc=True,
precompute_pcr_key=None,
verbose=False,
linreg_method="numpy",
n_threads=1,
):
"""Cell cycle conservation score

Expand Down Expand Up @@ -44,6 +47,7 @@ def cell_cycle(
precomputed scores if available as 'S_score' and 'G2M_score' in ``adata_post.obs``
:param precompute_pcr_key: Key in adata_pre for precomputed PCR values for cell
cycle scores. Ignores cell cycle scores in adata_pre if present.
:param n_threads: Number of threads for linear regressions per principal component

:return:
A score between 1 and 0. The larger the score, the stronger the cell cycle
Expand All @@ -70,11 +74,6 @@ def cell_cycle(
if embed == "X_pca":
embed = None

batches = adata_pre.obs[batch_key].unique()
scores_final = []
scores_before = []
scores_after = []

recompute_cc = (
recompute_cc
or "S_score" not in adata_pre.obs_keys()
Expand All @@ -84,19 +83,26 @@ def cell_cycle(
precompute_pcr_key is None or precompute_pcr_key not in adata_pre.uns_keys()
)

for batch in batches:
batches = adata_pre.obs[batch_key].unique()
scores_before = []
scores_after = []
scores_final = []

for batch in tqdm(batches):
before, after = get_pcr_before_after(
adata_pre,
adata_post,
batch_key=batch_key,
batch=batch,
embed=embed,
organism=organism,
pcr_key=precompute_pcr_key,
recompute_cc=recompute_cc,
recompute_pcr=recompute_pcr,
pcr_key=precompute_pcr_key,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)

# scale result
Expand Down Expand Up @@ -140,11 +146,13 @@ def get_pcr_before_after(
batch,
embed,
organism,
recompute_cc,
recompute_pcr,
pcr_key,
n_comps,
verbose,
recompute_cc=False,
recompute_pcr=False,
n_comps=50,
verbose=True,
n_threads=1,
linreg_method="numpy",
):
"""
Principal component regression value on cell cycle scores for one batch
Expand Down Expand Up @@ -190,16 +198,28 @@ def get_pcr_before_after(
covariate = raw_sub.obs[["S_score", "G2M_score"]]

# PCR on adata before integration
if recompute_pcr:
if recompute_pcr: # TODO: does this work for precomputed values?
before = pc_regression(
raw_sub.X, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
raw_sub.X,
covariate,
pca_var=None,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)
else:
before = pd.Series(raw_sub.uns[pcr_key])

# PCR on adata after integration
after = pc_regression(
int_sub, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
int_sub,
covariate,
pca_var=None,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)

return before, after
Loading
Loading