Skip to content

Commit b2657b3

Browse files
Merge pull request #110 from theislab/dev
Dev v0.6.10
2 parents 0249161 + 1495eb2 commit b2657b3

File tree

9 files changed

+160
-258
lines changed

9 files changed

+160
-258
lines changed

diffxpy/fit/fit.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818

1919
def model(
20-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
20+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
2121
formula_loc: Union[None, str] = None,
2222
formula_scale: Union[None, str] = "~1",
2323
as_numeric: Union[List[str], Tuple[str], str] = (),
@@ -226,7 +226,7 @@ def model(
226226

227227

228228
def residuals(
229-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
229+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
230230
formula_loc: Union[None, str] = None,
231231
formula_scale: Union[None, str] = "~1",
232232
as_numeric: Union[List[str], Tuple[str], str] = (),
@@ -400,7 +400,7 @@ def residuals(
400400

401401

402402
def partition(
403-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
403+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
404404
parts: Union[str, np.ndarray, list],
405405
gene_names: Union[np.ndarray, list] = None,
406406
sample_description: pd.DataFrame = None,
@@ -454,7 +454,7 @@ class _Partition:
454454

455455
def __init__(
456456
self,
457-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
457+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
458458
parts: Union[str, np.ndarray, list],
459459
gene_names: Union[np.ndarray, list] = None,
460460
sample_description: pd.DataFrame = None,
@@ -481,7 +481,7 @@ def __init__(
481481
same order as in data or string-type column identifier of size-factor containing
482482
column in sample description.
483483
"""
484-
if isinstance(data, glm.typing.InputDataBaseTyping):
484+
if isinstance(data, glm.typing.InputDataBase):
485485
self.x = data.x
486486
elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
487487
self.x = data.X

diffxpy/stats/stats.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ def wald_test(
211211
if theta_mle.shape[0] != theta0.shape[0]:
212212
raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries')
213213

214+
theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf))
214215
wald_statistic = np.abs(np.divide(theta_mle - theta0, theta_sd))
215216
pvals = 2 * (1 - scipy.stats.norm(loc=0, scale=1).cdf(wald_statistic)) # two-tailed test
216217
return pvals
@@ -313,7 +314,10 @@ def two_coef_z_test(
313314
if theta_mle0.shape[0] != theta_sd0.shape[0]:
314315
raise ValueError('stats.two_coef_z_test(): theta_mle0 and theta_sd0 have to contain the same number of entries')
315316

316-
z_statistic = np.abs((theta_mle0 - theta_mle1) / np.sqrt(np.square(theta_sd0) + np.square(theta_sd1)))
317+
divisor = np.square(theta_sd0) + np.square(theta_sd1)
318+
divisor = np.nextafter(0, np.inf, out=divisor, where=divisor < np.nextafter(0, np.inf))
319+
divisor = np.sqrt(divisor)
320+
z_statistic = np.abs((theta_mle0 - theta_mle1)) / divisor
317321
pvals = 2 * (1 - scipy.stats.norm(loc=0, scale=1).cdf(z_statistic)) # two-tailed test
318322
return pvals
319323

diffxpy/testing/det.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1544,7 +1544,7 @@ def __init__(
15441544
super().__init__()
15451545
if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
15461546
data = data.X
1547-
elif isinstance(data, glm.typing.InputDataBaseTyping):
1547+
elif isinstance(data, glm.typing.InputDataBase):
15481548
data = data.x
15491549
self._x = data
15501550
self.sample_description = sample_description
@@ -1669,7 +1669,7 @@ def __init__(
16691669
super().__init__()
16701670
if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
16711671
data = data.X
1672-
elif isinstance(data, glm.typing.InputDataBaseTyping):
1672+
elif isinstance(data, glm.typing.InputDataBase):
16731673
data = data.x
16741674
self._x = data
16751675
self.sample_description = sample_description
@@ -2102,11 +2102,13 @@ def __init__(
21022102
self.grouping = grouping
21032103
self.groups = list(np.asarray(groups))
21042104

2105-
# values of parameter estimates: coefficients x genes array with one coefficient per group
2106-
self._theta_mle = model_estim.par_link_loc
2107-
# standard deviation of estimates: coefficients x genes array with one coefficient per group
2108-
# theta_sd = sqrt(diagonal(fisher_inv))
2109-
self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T
2105+
# Values of parameter estimates: coefficients x genes array with one coefficient per group
2106+
self._theta_mle = model_estim.a_var
2107+
# Standard deviation of estimates: coefficients x genes array with one coefficient per group
2108+
# Need .copy() here as nextafter needs mutabls copy.
2109+
theta_sd = np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1).T.copy()
2110+
theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf))
2111+
self._theta_sd = np.sqrt(theta_sd)
21102112
self._logfc = None
21112113

21122114
# Call tests in constructor.
@@ -2307,11 +2309,13 @@ def __init__(
23072309
else:
23082310
self.groups = groups.tolist()
23092311

2310-
# values of parameter estimates: coefficients x genes array with one coefficient per group
2312+
# Values of parameter estimates: coefficients x genes array with one coefficient per group
23112313
self._theta_mle = model_estim.a_var
2312-
# standard deviation of estimates: coefficients x genes array with one coefficient per group
2313-
# theta_sd = sqrt(diagonal(fisher_inv))
2314-
self._theta_sd = np.sqrt(np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1)).T
2314+
# Standard deviation of estimates: coefficients x genes array with one coefficient per group
2315+
# Need .copy() here as nextafter needs mutabls copy.
2316+
theta_sd = np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1).T.copy()
2317+
theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf))
2318+
self._theta_sd = np.sqrt(theta_sd)
23152319

23162320
def _correction(self, pvals, method="fdr_bh") -> np.ndarray:
23172321
"""
@@ -2349,7 +2353,6 @@ def _test(self, **kwargs):
23492353

23502354
def _test_pairs(self, groups0, groups1):
23512355
num_features = self.model_estim.x.shape[1]
2352-
23532356
pvals = np.tile(np.NaN, [len(groups0), len(groups1), num_features])
23542357

23552358
for i, g0 in enumerate(groups0):

diffxpy/testing/tests.py

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def _fit(
3939
quick_scale: bool = None,
4040
close_session=True,
4141
dtype="float64"
42-
) -> glm.typing.InputDataBaseTyping:
42+
) -> glm.typing.InputDataBase:
4343
"""
4444
:param noise_model: str, noise model to use in model-based unit_test. Possible options:
4545
@@ -186,7 +186,7 @@ def _fit(
186186

187187

188188
def lrt(
189-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
189+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
190190
full_formula_loc: str,
191191
reduced_formula_loc: str,
192192
full_formula_scale: str = "~1",
@@ -370,7 +370,7 @@ def lrt(
370370

371371

372372
def wald(
373-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
373+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
374374
factor_loc_totest: Union[str, List[str]] = None,
375375
coef_to_test: Union[str, List[str]] = None,
376376
formula_loc: Union[None, str] = None,
@@ -547,7 +547,7 @@ def wald(
547547
if isinstance(as_numeric, str):
548548
as_numeric = [as_numeric]
549549

550-
# # Parse input data formats:
550+
# Parse input data formats:
551551
gene_names = parse_gene_names(data, gene_names)
552552
if dmat_loc is None and dmat_scale is None:
553553
sample_description = parse_sample_description(data, sample_description)
@@ -644,7 +644,7 @@ def wald(
644644

645645

646646
def t_test(
647-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
647+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
648648
grouping,
649649
gene_names: Union[np.ndarray, list] = None,
650650
sample_description: pd.DataFrame = None,
@@ -686,7 +686,7 @@ def t_test(
686686

687687

688688
def rank_test(
689-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
689+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
690690
grouping: Union[str, np.ndarray, list],
691691
gene_names: Union[np.ndarray, list] = None,
692692
sample_description: pd.DataFrame = None,
@@ -728,7 +728,7 @@ def rank_test(
728728

729729

730730
def two_sample(
731-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
731+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
732732
grouping: Union[str, np.ndarray, list],
733733
as_numeric: Union[List[str], Tuple[str], str] = (),
734734
test: str = "t-test",
@@ -819,8 +819,8 @@ def two_sample(
819819
:param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
820820
"""
821821
if test in ['t-test', 'rank'] and noise_model is not None:
822-
raise ValueError('base.two_sample(): Do not specify `noise_model` if using test t-test or rank_test: ' +
823-
'The t-test is based on a gaussian noise model and wilcoxon is model free.')
822+
raise Warning('two_sample(): Do not specify `noise_model` if using test t-test or rank_test: ' +
823+
'The t-test is based on a gaussian noise model and the rank sum test is model free.')
824824

825825
gene_names = parse_gene_names(data, gene_names)
826826
grouping = parse_grouping(data, sample_description, grouping)
@@ -848,6 +848,8 @@ def two_sample(
848848
sample_description=sample_description,
849849
noise_model=noise_model,
850850
size_factors=size_factors,
851+
init_a="closed_form",
852+
init_b="closed_form",
851853
batch_size=batch_size,
852854
training_strategy=training_strategy,
853855
quick_scale=quick_scale,
@@ -872,6 +874,8 @@ def two_sample(
872874
sample_description=sample_description,
873875
noise_model=noise_model,
874876
size_factors=size_factors,
877+
init_a="closed_form",
878+
init_b="closed_form",
875879
batch_size=batch_size,
876880
training_strategy=training_strategy,
877881
quick_scale=quick_scale,
@@ -883,16 +887,14 @@ def two_sample(
883887
data=data,
884888
gene_names=gene_names,
885889
grouping=grouping,
886-
is_sig_zerovar=is_sig_zerovar,
887-
dtype=dtype
890+
is_sig_zerovar=is_sig_zerovar
888891
)
889892
elif test.lower() == 'rank':
890893
de_test = rank_test(
891894
data=data,
892895
gene_names=gene_names,
893896
grouping=grouping,
894-
is_sig_zerovar=is_sig_zerovar,
895-
dtype=dtype
897+
is_sig_zerovar=is_sig_zerovar
896898
)
897899
else:
898900
raise ValueError('two_sample(): Parameter `test="%s"` not recognized.' % test)
@@ -901,19 +903,19 @@ def two_sample(
901903

902904

903905
def pairwise(
904-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
906+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
905907
grouping: Union[str, np.ndarray, list],
906908
as_numeric: Union[List[str], Tuple[str], str] = (),
907-
test: str = 'z-test',
908-
lazy: bool = False,
909+
test: str = "z-test",
910+
lazy: bool = True,
909911
gene_names: Union[np.ndarray, list] = None,
910912
sample_description: pd.DataFrame = None,
911-
noise_model: str = None,
913+
noise_model: str = "nb",
912914
size_factors: np.ndarray = None,
913915
batch_size: int = None,
914916
training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
915917
is_sig_zerovar: bool = True,
916-
quick_scale: bool = None,
918+
quick_scale: bool = False,
917919
dtype="float64",
918920
pval_correction: str = "global",
919921
keep_full_test_objs: bool = False,
@@ -1036,6 +1038,8 @@ def pairwise(
10361038
design_scale=dmat,
10371039
gene_names=gene_names,
10381040
size_factors=size_factors,
1041+
init_a="closed_form",
1042+
init_b="closed_form",
10391043
batch_size=batch_size,
10401044
training_strategy=training_strategy,
10411045
quick_scale=quick_scale,
@@ -1058,6 +1062,10 @@ def pairwise(
10581062
correction_type=pval_correction
10591063
)
10601064
else:
1065+
if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
1066+
data = data.X
1067+
elif isinstance(data, glm.typing.InputDataBase):
1068+
data = data.x
10611069
groups = np.unique(grouping)
10621070
pvals = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]])
10631071
pvals[np.eye(pvals.shape[0]).astype(bool)] = 0
@@ -1073,16 +1081,19 @@ def pairwise(
10731081
for j, g2 in enumerate(groups[(i + 1):]):
10741082
j = j + i + 1
10751083

1076-
sel = (grouping == g1) | (grouping == g2)
1084+
idx = np.where(np.logical_or(
1085+
grouping == g1,
1086+
grouping == g2
1087+
))[0]
10771088
de_test_temp = two_sample(
1078-
data=data[sel],
1079-
grouping=grouping[sel],
1089+
data=data[idx, :],
1090+
grouping=grouping[idx],
10801091
as_numeric=as_numeric,
10811092
test=test,
10821093
gene_names=gene_names,
1083-
sample_description=sample_description.iloc[sel],
1094+
sample_description=sample_description.iloc[idx, :],
10841095
noise_model=noise_model,
1085-
size_factors=size_factors[sel] if size_factors is not None else None,
1096+
size_factors=size_factors[idx] if size_factors is not None else None,
10861097
batch_size=batch_size,
10871098
training_strategy=training_strategy,
10881099
quick_scale=quick_scale,
@@ -1093,7 +1104,7 @@ def pairwise(
10931104
pvals[i, j] = de_test_temp.pval
10941105
pvals[j, i] = pvals[i, j]
10951106
logfc[i, j] = de_test_temp.log_fold_change()
1096-
logfc[j, i] = - logfc[i, j]
1107+
logfc[j, i] = -logfc[i, j]
10971108
if keep_full_test_objs:
10981109
tests[i, j] = de_test_temp
10991110
tests[j, i] = de_test_temp
@@ -1112,7 +1123,7 @@ def pairwise(
11121123

11131124

11141125
def versus_rest(
1115-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
1126+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
11161127
grouping: Union[str, np.ndarray, list],
11171128
as_numeric: Union[List[str], Tuple[str], str] = (),
11181129
test: str = 'wald',
@@ -1274,7 +1285,7 @@ def versus_rest(
12741285

12751286

12761287
def partition(
1277-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
1288+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
12781289
parts: Union[str, np.ndarray, list],
12791290
gene_names: Union[np.ndarray, list] = None,
12801291
sample_description: pd.DataFrame = None
@@ -1317,7 +1328,7 @@ class _Partition:
13171328

13181329
def __init__(
13191330
self,
1320-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
1331+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
13211332
parts: Union[str, np.ndarray, list],
13221333
gene_names: Union[np.ndarray, list] = None,
13231334
sample_description: pd.DataFrame = None
@@ -1332,7 +1343,7 @@ def __init__(
13321343
:param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
13331344
:param sample_description: optional pandas.DataFrame containing sample annotations
13341345
"""
1335-
if isinstance(data, glm.typing.InputDataBaseTyping):
1346+
if isinstance(data, glm.typing.InputDataBase):
13361347
self.x = data.x
13371348
elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
13381349
self.x = data.X

diffxpy/testing/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818

1919

2020
def parse_gene_names(
21-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
21+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
2222
gene_names: Union[list, np.ndarray, None]
2323
):
2424
if gene_names is None:
2525
if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, Raw)):
2626
gene_names = data.var_names
27-
elif isinstance(data, glm.typing.InputDataBaseTyping):
27+
elif isinstance(data, glm.typing.InputDataBase):
2828
gene_names = data.features
2929
else:
3030
raise ValueError("Missing gene names")
@@ -33,7 +33,7 @@ def parse_gene_names(
3333

3434

3535
def parse_sample_description(
36-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
36+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
3737
sample_description: Union[pd.DataFrame, None]
3838
) -> pd.DataFrame:
3939
"""
@@ -57,7 +57,7 @@ def parse_sample_description(
5757
assert data.X.shape[0] == sample_description.shape[0], \
5858
"data matrix and sample description must contain same number of cells: %i, %i" % \
5959
(data.X.shape[0], sample_description.shape[0])
60-
elif isinstance(data, glm.typing.InputDataBaseTyping):
60+
elif isinstance(data, glm.typing.InputDataBase):
6161
assert data.x.shape[0] == sample_description.shape[0], \
6262
"data matrix and sample description must contain same number of cells: %i, %i" % \
6363
(data.x.shape[0], sample_description.shape[0])
@@ -70,7 +70,7 @@ def parse_sample_description(
7070

7171
def parse_size_factors(
7272
size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray],
73-
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBaseTyping],
73+
data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
7474
sample_description: pd.DataFrame
7575
) -> Union[np.ndarray, None]:
7676
"""

0 commit comments

Comments
 (0)