
API Updates for diffxpy #146

Merged · 93 commits · Sep 21, 2022
Commits
7db054a
[WIP] The initializer...does everything?
ilan-gold Mar 15, 2022
b48f54b
[WIP] Refactor to fit model, accuracy test passes.
ilan-gold Mar 16, 2022
b46c924
[WIP] Remove unnecessary tests.
ilan-gold Mar 16, 2022
3644676
[WIP] Starting poisson model
ilan-gold Mar 18, 2022
d397229
[WIP] Need to log likelihood
ilan-gold Mar 18, 2022
f713052
[WIP] Beginning to handle log likelihood
ilan-gold Mar 18, 2022
b9e1e40
Seems right?
ilan-gold Mar 18, 2022
aefd6d4
Fill in functions that might be needed for testing.
ilan-gold Mar 19, 2022
b156d21
Make tests more consistent.
ilan-gold Mar 20, 2022
c861438
Clean up model a bit.
ilan-gold Mar 20, 2022
83afb4a
[WIP] Fix log likelihood?
ilan-gold Mar 22, 2022
505e106
Remove unfinished methods
ilan-gold Mar 22, 2022
dcce334
[WIP] Tests run but some don't pass
ilan-gold Mar 27, 2022
2c05e8d
[WIP] Add error for scale model.
ilan-gold Mar 27, 2022
289e492
[WIP] Add normal scale modeling
ilan-gold Mar 31, 2022
9e3b800
[WIP] Fix normal model ll
ilan-gold Apr 7, 2022
fb51f49
Small fixes and clean up.
ilan-gold Apr 11, 2022
0e00d45
Fix Poisson Log-Likelihood
ilan-gold Apr 11, 2022
9cfbdf7
Fix missing export.
ilan-gold Apr 11, 2022
3c86ebb
Fix init_par function
ilan-gold Apr 11, 2022
9c1cbde
Fix init_par for scale.
ilan-gold Apr 11, 2022
1fc13f5
Add docs.
ilan-gold Apr 20, 2022
d3b559e
Add docs.
ilan-gold Apr 20, 2022
d6af35d
Move to completed.
ilan-gold Apr 20, 2022
f225560
Fix typing issue.
ilan-gold Apr 20, 2022
9c81ed4
Fix typing issue.
ilan-gold Apr 20, 2022
3cdbc04
Start implementing base class.
ilan-gold Apr 25, 2022
f2d5cbe
Small changes
ilan-gold Apr 25, 2022
a2fea8b
[WIP] More small changes for diffxpy
ilan-gold Apr 26, 2022
e1ec20a
[WIP] Package can now be imported and technically run in diffxpy
ilan-gold Apr 26, 2022
f7e221f
[WIP] Add feature names to generated data.
ilan-gold Apr 28, 2022
24a6142
Update API
ilan-gold May 2, 2022
9044037
Add/Remove more properties
ilan-gold May 23, 2022
6795d94
Make signature consistent.
ilan-gold May 23, 2022
919dcbd
Fix unused argument
ilan-gold May 27, 2022
dd2a63d
Fix categorical issue
ilan-gold Jun 1, 2022
8d79617
Line length
ilan-gold Jun 1, 2022
d6de38b
Return term_names as coef_names to match matrix behavior when None
ilan-gold Jun 7, 2022
7b5413b
Use columns for as_categorical.
ilan-gold Jun 7, 2022
804e063
Merge branch 'ig/normal_model' into ig/update_api
ilan-gold Jun 9, 2022
ad1ae50
Use new container structure.
ilan-gold Jun 9, 2022
a4d2e33
Fix normal model/model_container
ilan-gold Jun 9, 2022
c4c6c04
Add setter functions to generate_artificial_data
ilan-gold Jun 16, 2022
a233eb1
Begin fixing normal model
ilan-gold Jun 16, 2022
9748fb6
Update jacobian
ilan-gold Jun 19, 2022
02665be
Use correct linker for scale.
ilan-gold Jun 20, 2022
b21cb98
Use constrained matrix for normal location model.
ilan-gold Jun 26, 2022
f2386da
Fix FIM calculation
ilan-gold Jun 27, 2022
ecfaf88
Fix jacobian calculation (?)
ilan-gold Jul 3, 2022
b2b6a84
Fix size factors calculation
ilan-gold Jul 3, 2022
f396651
Refactor dask_compute
ilan-gold Jul 3, 2022
64c5e76
Fix sparse array issue with init_par.
ilan-gold Jul 4, 2022
f3b3f9e
Re pin deps.
ilan-gold Jul 4, 2022
7061b97
Re pin deps.
ilan-gold Jul 4, 2022
166e25e
Use generalized jacobian calculation.
ilan-gold Jul 5, 2022
343746c
Add blank line
ilan-gold Jul 13, 2022
1b900a1
Fix pre-commit
ilan-gold Jul 13, 2022
7436f4e
Try to satisfy mypy
ilan-gold Jul 13, 2022
5269821
Fix data tests.
ilan-gold Jul 13, 2022
f9717ed
Black again.
ilan-gold Jul 13, 2022
c041e21
Update lock.
ilan-gold Jul 13, 2022
d78327e
Fix pre-commit
ilan-gold Jul 13, 2022
90143eb
Fix jacobian scale
ilan-gold Jul 28, 2022
d08fb00
Black.
ilan-gold Jul 28, 2022
592f0a4
Small test cleanups.
ilan-gold Jul 28, 2022
ef5dd04
Move ll function.
ilan-gold Jul 28, 2022
8989469
Move ll function.
ilan-gold Jul 28, 2022
604298c
Ok remove!
ilan-gold Jul 28, 2022
e160745
Merge branch 'ig/normal_model' into ig/update_api
ilan-gold Jul 28, 2022
aae195e
Make small changes.
ilan-gold Jul 28, 2022
e2aea3a
Add comment.
ilan-gold Jul 28, 2022
7757d54
Fix pre-commit.
ilan-gold Jul 28, 2022
ee1797a
Remove bounds not needed.
ilan-gold Jul 28, 2022
abe2124
Add back in scale clipping
ilan-gold Jul 28, 2022
2513c05
Remove ll.
ilan-gold Aug 5, 2022
2601f9b
Remove dask_compute.
ilan-gold Aug 5, 2022
2435745
Merge branch 'ig/poisson' into ig/update_api
ilan-gold Aug 5, 2022
6cafa1d
Fix.
ilan-gold Aug 5, 2022
6d3c0e4
Remove dask_compute.
ilan-gold Aug 5, 2022
4e7159f
Fix container.
ilan-gold Aug 5, 2022
8f07947
Fix fim/hessian.
ilan-gold Aug 5, 2022
9f0886f
Merge branch 'ig/full_branch' into ig/update_api
ilan-gold Aug 8, 2022
cf4d1ff
Small poisson fixes.
ilan-gold Aug 8, 2022
adfa5f5
Try smaller average.
ilan-gold Aug 9, 2022
75ccdcc
Fix init linked.
ilan-gold Sep 19, 2022
ed4aac6
Merge branch 'development' into ig/update_api
ilan-gold Sep 21, 2022
ca12149
Formatting.
ilan-gold Sep 21, 2022
d82c696
try different lam.
ilan-gold Sep 21, 2022
4122186
Formatting.
ilan-gold Sep 21, 2022
e35197a
Try fixing lam issue on windows.
ilan-gold Sep 21, 2022
612bf8c
Actually fix problem.
ilan-gold Sep 21, 2022
29d3c1e
Still too big?
ilan-gold Sep 21, 2022
1777844
Revert
ilan-gold Sep 21, 2022
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -41,11 +41,11 @@ repos:
types: [text]
stages: [commit, push, manual]
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.3.0
rev: v2.7.1
hooks:
- id: prettier
- repo: https://github.com/pycqa/isort
rev: 5.8.0
rev: 5.10.1
hooks:
- id: isort
name: isort (python)
2 changes: 1 addition & 1 deletion batchglm/models/__init__.py
@@ -1 +1 @@
from . import glm_beta, glm_nb, glm_norm, glm_poisson
from . import base_glm, glm_beta, glm_nb, glm_norm, glm_poisson
4 changes: 2 additions & 2 deletions batchglm/models/base_glm/__init__.py
@@ -1,4 +1,4 @@
# from .estimator import _EstimatorGLM
from ...utils.input import InputDataGLM
from .model import _ModelGLM
from .utils import closedform_glm_mean, closedform_glm_scale, parse_design
from .model import ModelGLM
from .utils import closedform_glm_mean, closedform_glm_scale
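Read together with the _ModelGLM → ModelGLM rename in model.py below, this export change means downstream noise models now subclass the public name. A minimal sketch of the consumer-side import; the subclass itself is hypothetical:

```python
# Hypothetical consumer of the renamed base class; mirrors the pattern used by
# glm_nb/glm_norm/glm_beta further down in this PR. Not code from the diff.
import abc

from batchglm.models.base_glm import ModelGLM  # previously imported as _ModelGLM


class MyNoiseModel(ModelGLM, metaclass=abc.ABCMeta):
    """Custom noise model built on the now-public ModelGLM base class."""
```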
46 changes: 38 additions & 8 deletions batchglm/models/base_glm/model.py
@@ -1,19 +1,22 @@
import abc
import logging
import random
import string
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import dask.array
import numpy as np
import pandas as pd
import scipy

from ...utils.input import InputDataGLM
from .external import pkg_constants
from .utils import generate_sample_description, parse_constraints, parse_design
from .utils import generate_sample_description

logger = logging.getLogger(__name__)


class _ModelGLM(metaclass=abc.ABCMeta):
class ModelGLM(metaclass=abc.ABCMeta):
"""
Generalized Linear Model base class.

@@ -43,13 +46,15 @@ class _ModelGLM(metaclass=abc.ABCMeta):
_cast_dtype: str = "float32"
_chunk_size_cells: int
_chunk_size_genes: int
_sample_description: pd.DataFrame
_features: List[str]

def __init__(
self,
input_data: Optional[InputDataGLM] = None,
):
"""
Create a new _ModelGLM object.
Create a new ModelGLM object.

:param input_data: Input data for the model

@@ -72,9 +77,14 @@ def extract_input_data(self, input_data: InputDataGLM):
self._cast_dtype = input_data.cast_dtype
self._chunk_size_genes = input_data.chunk_size_genes
self._chunk_size_cells = input_data.chunk_size_cells
self._features = input_data.features
self._xh_loc = np.matmul(self.design_loc, self.constraints_loc)
self._xh_scale = np.matmul(self.design_scale, self.constraints_scale)

@property
def features(self) -> List[str]:
return self._features

@property
def chunk_size_cells(self) -> int:
return self._chunk_size_cells
@@ -87,6 +97,10 @@ def chunk_size_genes(self) -> int:
def cast_dtype(self) -> str:
return self._cast_dtype

@property
def sample_description(self) -> pd.DataFrame:
return self._sample_description

@property
def design_loc(self) -> Union[np.ndarray, dask.array.core.Array]:
"""location design matrix"""
@@ -356,7 +370,7 @@ def generate_params(
if rand_fn_scale is None:
rand_fn_scale = rand_fn

_design_loc, _design_scale, _ = generate_sample_description(**kwargs)
_design_loc, _design_scale, _sample_description = generate_sample_description(**kwargs)

self._theta_location = np.concatenate(
[
@@ -366,8 +380,9 @@
axis=0,
)
self._theta_scale = np.concatenate([rand_fn_scale((_design_scale.shape[1], n_vars))], axis=0)
self._sample_description = _sample_description

return _design_loc, _design_scale
return _design_loc, _design_scale, _sample_description

def generate_artificial_data(
self,
@@ -379,6 +394,8 @@
shuffle_assignments: bool = False,
sparse: bool = False,
as_dask: bool = True,
theta_location_setter: Optional[Callable] = None,
theta_scale_setter: Optional[Callable] = None,
**kwargs,
):
"""
@@ -391,9 +408,11 @@
:param shuffle_assignments: Depcreated. Does not do anything.
:param sparse: If True, the simulated data matrix is sparse.
:param as_dask: If True, use dask.
:param theta_location_setter: Override for parameter after generate_params, should return the parameter
:param theta_scale_setter: Override for parameter after generate_params, should return the parameter
:param kwargs: Additional kwargs passed to generate_params.
"""
_design_loc, _design_scale = self.generate_params(
_design_loc, _design_scale, _ = self.generate_params(
n_vars=n_vars,
num_observations=n_obs,
num_conditions=num_conditions,
@@ -402,6 +421,10 @@
shuffle_assignments=shuffle_assignments,
**kwargs,
)
if theta_location_setter is not None:
self._theta_location = theta_location_setter(self._theta_location)
if theta_scale_setter is not None:
self._theta_scale = theta_scale_setter(self._theta_scale)

# we need to do this explicitly here in order to generate data
self._constraints_loc = np.identity(n=_design_loc.shape[1])
@@ -413,8 +436,15 @@
data_matrix = self.generate_data().astype(self.cast_dtype)
if sparse:
data_matrix = scipy.sparse.csr_matrix(data_matrix)

input_data = InputDataGLM(data=data_matrix, design_loc=_design_loc, design_scale=_design_scale, as_dask=as_dask)
# generate random gene/feature names
feature_names = "".join("feature_" + str(i) for i in range(n_vars))
input_data = InputDataGLM(
data=data_matrix,
design_loc=_design_loc,
design_scale=_design_scale,
as_dask=as_dask,
feature_names=feature_names,
)
self.extract_input_data(input_data)

@abc.abstractmethod
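For orientation (not part of the diff): a minimal sketch of how the reworked simulation path might be driven. It assumes the negative binomial Model from batchglm/models/glm_nb/model.py is concrete enough to instantiate for simulation and is importable under that module path; the keyword names are taken from the hunks above.

```python
# Minimal sketch, not code from this PR. Assumes the negative binomial Model
# can be instantiated directly for data simulation.
import numpy as np

from batchglm.models.glm_nb import Model

model = Model()
model.generate_artificial_data(
    n_obs=200,
    n_vars=50,
    num_conditions=2,
    sparse=False,
    as_dask=True,
    # New in this PR: hooks to override the randomly generated parameters
    # after generate_params has produced them.
    theta_location_setter=lambda theta: np.clip(theta, -2.0, 2.0),
    theta_scale_setter=lambda theta: np.zeros_like(theta),
)

# Accessors added to ModelGLM in this PR:
print(model.features[:5])        # feature names forwarded from InputDataGLM
print(model.sample_description)  # simulated sample description kept by generate_params
```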
100 changes: 15 additions & 85 deletions batchglm/models/base_glm/utils.py
@@ -7,12 +7,22 @@
import pandas as pd
import patsy
import scipy.sparse
import sparse

from .external import groupwise_solve_lm

logger = logging.getLogger("batchglm")


def densify(arr):
if isinstance(arr, dask.array.core.Array):
arr = arr.compute()
if isinstance(arr, sparse.COO) or isinstance(arr, scipy.sparse.csr_matrix):
return arr.todense()
else:
return arr


def generate_sample_description(
num_observations: int,
num_conditions: int,
@@ -61,87 +71,6 @@ def generate_sample_description(
return sim_design_loc, sim_design_scale, sample_description


def parse_design(
design_matrix: Union[pd.DataFrame, patsy.design_info.DesignMatrix, dask.array.core.Array, np.ndarray],
param_names: List[str] = None,
) -> Tuple[np.ndarray, List[str]]:
r"""
Parser for design matrices.

:param design_matrix: Design matrix.
:param param_names:
Optional coefficient names for design_matrix.
Ignored if design_matrix is pd.DataFrame or patsy.design_info.DesignMatrix.
:return: Tuple[np.ndarray, List[str]] containing the design matrix and the parameter names.
:raise AssertionError: if the type of design_matrix is not understood.
:raise AssertionError: if length of provided param_names is not equal to number of coefficients in design_matrix.
:raise ValueError: if param_names is None when type of design_matrix is numpy.ndarray or dask.array.core.Array.
"""
if isinstance(design_matrix, (pd.DataFrame, patsy.design_info.DesignMatrix)) and param_names is not None:
logger.warning(f"The provided param_names are ignored as the design matrix is of type {type(design_matrix)}.")

if isinstance(design_matrix, patsy.design_info.DesignMatrix):
dmat = np.asarray(design_matrix)
params = design_matrix.design_info.column_names
elif isinstance(design_matrix, pd.DataFrame):
dmat = np.asarray(design_matrix)
params = design_matrix.columns.tolist()
elif isinstance(design_matrix, dask.array.core.Array):
dmat = design_matrix.compute()
params = param_names
elif isinstance(design_matrix, np.ndarray):
dmat = design_matrix
params = param_names
else:
raise AssertionError(f"Datatype for design_matrix not understood: {type(design_matrix)}")
if params is None:
raise ValueError("Provide names when passing design_matrix as np.ndarray or dask.array.core.Array!")
assert len(params) == dmat.shape[1], (
"Length of provided param_names is not equal to " "number of coefficients in design_matrix."
)
return dmat, params


def parse_constraints(
dmat: np.ndarray,
dmat_par_names: List[str],
constraints: Optional[Union[np.ndarray, dask.array.core.Array]] = None,
constraint_par_names: Optional[List[str]] = None,
) -> Tuple[np.ndarray, List[str]]:
r"""
Parser for constraint matrices.

:param dmat: Design matrix.
:param constraints: Constraint matrix.
:param constraint_par_names: Optional coefficient names for constraints.
:return: Tuple[np.ndarray, List[str]] containing the constraint matrix and the parameter names.
:raise AssertionError: if the type of given design / contraint matrix is not np.ndarray or dask.array.core.Array.
"""
assert isinstance(dmat, np.ndarray), "dmat must be provided as np.ndarray."
if constraints is None:
constraints = np.identity(n=dmat.shape[1])
constraint_params = dmat_par_names
else:
if isinstance(constraints, dask.array.core.Array):
constraints = constraints.compute()
assert isinstance(constraints, np.ndarray), "contraints must be np.ndarray or dask.array.core.Array."
# Cannot use all parameter names if constraint matrix is not identity: Make up new ones.
# Use variable names that can be mapped (unconstrained).
if constraint_par_names is not None:
assert len(constraint_params) == len(constraint_par_names)
constraint_params = constraint_par_names
else:
constraint_params = [
"var_" + str(i)
if np.sum(constraints[:, i] != 0) > 1
else dmat_par_names[np.where(constraints[:, i] != 0)[0][0]]
for i in range(constraints.shape[1])
]
assert constraints.shape[0] == dmat.shape[1], "constraint dimension mismatch"

return constraints, constraint_params


def closedform_glm_mean(
x: Union[np.ndarray, scipy.sparse.csr_matrix, dask.array.core.Array],
dmat: Union[np.ndarray, dask.array.core.Array],
Expand All @@ -168,8 +97,9 @@ def closedform_glm_mean(
x = np.divide(x, size_factors)

def apply_fun(grouping):

groupwise_means = np.asarray(
np.vstack([np.mean(x[np.where(grouping == g)[0], :], axis=0) for g in np.unique(grouping)])
np.vstack([np.mean(densify(x[np.where(grouping == g)[0], :]), axis=0) for g in np.unique(grouping)])
)
if link_fn is None:
return groupwise_means
@@ -218,7 +148,7 @@ def apply_fun(grouping):
# Calculate group-wise means if not supplied. These are required for variance and MME computation.
if provided_groupwise_means is None:
gw_means = np.asarray(
np.vstack([np.mean(x[np.where(grouping == g)[0], :], axis=0) for g in np.unique(grouping)])
np.vstack([np.mean(densify(x[np.where(grouping == g)[0], :]), axis=0) for g in np.unique(grouping)])
)
else:
gw_means = provided_groupwise_means
@@ -228,14 +158,14 @@
expect_xsq = np.asarray(
np.vstack(
[
np.asarray(np.mean(x[np.where(grouping == g)[0], :].power(2), axis=0))
np.asarray(np.mean(densify(x[np.where(grouping == g)[0], :]).power(2), axis=0))
for g in np.unique(grouping)
]
)
)
else:
expect_xsq = np.vstack(
[np.mean(np.square(x[np.where(grouping == g)[0], :]), axis=0) for g in np.unique(grouping)]
[np.mean(np.square(densify(x[np.where(grouping == g)[0], :])), axis=0) for g in np.unique(grouping)]
)
expect_x_sq = np.square(gw_means)
variance = expect_xsq - expect_x_sq
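For context (not part of the diff): the new densify helper is what lets the group-wise mean and variance computations above operate on sparse or dask-backed slices. A standalone sketch of its behavior, with the helper reproduced here only so the snippet runs on its own; the inputs are made up.

```python
# Standalone illustration; the real helper lives in batchglm/models/base_glm/utils.py
# after this PR.
import dask.array
import numpy as np
import scipy.sparse
import sparse


def densify(arr):
    # Bring dask-backed arrays into memory, then turn sparse containers into dense ones.
    if isinstance(arr, dask.array.core.Array):
        arr = arr.compute()
    if isinstance(arr, sparse.COO) or isinstance(arr, scipy.sparse.csr_matrix):
        return arr.todense()
    else:
        return arr


print(densify(scipy.sparse.csr_matrix(np.eye(3))))  # dense 3x3 identity
print(densify(dask.array.ones((2, 2))))             # plain numpy array of ones
print(densify(np.arange(4)))                        # numpy input passes through unchanged
```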
2 changes: 1 addition & 1 deletion batchglm/models/glm_beta/external.py
@@ -1,4 +1,4 @@
import batchglm.utils.data as data_utils
from batchglm import pkg_constants
from batchglm.models.base_glm import _ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.models.base_glm import ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.utils.linalg import groupwise_solve_lm
4 changes: 2 additions & 2 deletions batchglm/models/glm_beta/model.py
@@ -4,10 +4,10 @@
import dask
import numpy as np

from .external import _ModelGLM
from .external import ModelGLM


class Model(_ModelGLM, metaclass=abc.ABCMeta):
class Model(ModelGLM, metaclass=abc.ABCMeta):
"""
Generalized Linear Model (GLM) with beta distributed noise, logit link for location and log link for scale.
"""
2 changes: 1 addition & 1 deletion batchglm/models/glm_nb/external.py
@@ -1,4 +1,4 @@
import batchglm.utils.data as data_utils
from batchglm import pkg_constants
from batchglm.models.base_glm import _ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.models.base_glm import ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.utils.linalg import groupwise_solve_lm
4 changes: 2 additions & 2 deletions batchglm/models/glm_nb/model.py
@@ -4,10 +4,10 @@
import dask.array
import numpy as np

from .external import _ModelGLM
from .external import ModelGLM


class Model(_ModelGLM, metaclass=abc.ABCMeta):
class Model(ModelGLM, metaclass=abc.ABCMeta):
"""
Generalized Linear Model (GLM) with negative binomial noise.
"""
2 changes: 1 addition & 1 deletion batchglm/models/glm_norm/external.py
@@ -1,4 +1,4 @@
import batchglm.utils.data as data_utils
from batchglm import pkg_constants
from batchglm.models.base_glm import _ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.models.base_glm import ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.utils.linalg import groupwise_solve_lm
4 changes: 2 additions & 2 deletions batchglm/models/glm_norm/model.py
@@ -4,10 +4,10 @@
import dask
import numpy as np

from .external import _ModelGLM
from .external import ModelGLM


class Model(_ModelGLM, metaclass=abc.ABCMeta):
class Model(ModelGLM, metaclass=abc.ABCMeta):

"""Generalized Linear Model (GLM) with normal noise."""

3 changes: 1 addition & 2 deletions batchglm/models/glm_norm/utils.py
@@ -61,7 +61,6 @@ def init_par(model, init_location: str, init_scale: str) -> Tuple[np.ndarray, np
&= D \cdot x' = f^{-1}(\theta)
$$
"""

groupwise_means = None

init_location_str = init_location.lower()
@@ -79,7 +78,7 @@
elif init_location_str == "standard":
overall_means = np.mean(model.x, axis=0) # directly calculate the mean
init_theta_location = np.zeros([model.num_loc_params, model.num_features])
init_theta_location[0, :] = np.log(overall_means)
init_theta_location[0, :] = overall_means # identity linked.
else:
raise ValueError("init_location string %s not recognized" % init_location)

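The functional change in this hunk is that the "standard" location initializer now stores the raw per-feature means rather than their logs, because the normal model's location is identity-linked. A toy illustration (not code from the PR; the dimensions and values are made up):

```python
# Toy illustration of the "standard" initializer for an identity-linked location model.
import numpy as np

x = np.array([[1.0, 10.0],
              [3.0, 14.0]])                  # 2 observations x 2 features
overall_means = np.mean(x, axis=0)           # array([ 2., 12.])

num_loc_params, num_features = 3, 2          # hypothetical model dimensions
init_theta_location = np.zeros([num_loc_params, num_features])

# Identity link (glm_norm): the intercept row holds the mean itself.
init_theta_location[0, :] = overall_means
# A log-linked model (e.g. glm_nb or glm_poisson) would store np.log(overall_means) instead.
print(init_theta_location)
```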
2 changes: 1 addition & 1 deletion batchglm/models/glm_poisson/external.py
@@ -1,4 +1,4 @@
import batchglm.utils.data as data_utils
from batchglm import pkg_constants
from batchglm.models.base_glm import _ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.models.base_glm import ModelGLM, closedform_glm_mean, closedform_glm_scale
from batchglm.utils.linalg import groupwise_solve_lm