Skip to content

Add neighbors_from_distance for computing neighborhood graphs from precomputed distance matrices #3627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f76dc7b
Modularity score functions with comments
amalia-k510 Apr 25, 2025
f092469
typo fix
amalia-k510 Apr 25, 2025
7ffa1ec
Merge branch 'scverse:main' into main
amalia-k510 Apr 25, 2025
c0d0c52
Merge branch 'scverse:main' into main
amalia-k510 May 7, 2025
68652a7
modularity code updated and 6 tests written for modularity
amalia-k510 May 7, 2025
948319a
error fixing from pipelines
amalia-k510 May 7, 2025
6a64330
ruff error fix
amalia-k510 May 7, 2025
793351f
keywords variable fix
amalia-k510 May 7, 2025
92d8e26
neighbors from a precomputed distance matrix, still need to make sure…
amalia-k510 May 7, 2025
198c4fb
revert back
amalia-k510 May 7, 2025
e7fb67a
code only for the prexisting distance matrix
amalia-k510 May 7, 2025
14cb441
initial changes for the neighborhors
amalia-k510 May 8, 2025
0ce8c15
distances name switch and sparse array allowed
amalia-k510 May 12, 2025
914b87d
input fix
amalia-k510 May 12, 2025
d285203
variable input fixes
amalia-k510 May 12, 2025
50705b3
test added
amalia-k510 May 12, 2025
4730667
numpy issue fix for one line
amalia-k510 May 12, 2025
040b8b7
unify metadata assembly
flying-sheep May 16, 2025
c03b863
comments fix and release notes
amalia-k510 May 23, 2025
473a437
comments fix typo
amalia-k510 May 23, 2025
ec586df
Merge branch 'scverse:main' into matrix_exist
amalia-k510 May 25, 2025
43dcfc0
fix relnotes
flying-sheep May 26, 2025
8a3588c
make non-specified metric `None`
flying-sheep May 26, 2025
293f568
Merge branch 'main' into matrix_exist
flying-sheep May 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/scanpy/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from ._gearys_c import gearys_c
from ._metrics import confusion_matrix
from ._metrics import confusion_matrix, modularity, modularity_adata
from ._morans_i import morans_i

__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
__all__ = ["confusion_matrix", "gearys_c", "modularity", "modularity_adata", "morans_i"]
105 changes: 103 additions & 2 deletions src/scanpy/metrics/_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,22 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, cast

import numpy as np
import pandas as pd
from natsort import natsorted
from pandas.api.types import CategoricalDtype
from scipy.sparse import coo_matrix

from .._compat import CSRBase, SpBase

if TYPE_CHECKING:
from collections.abc import Sequence
from typing import Literal

from anndata import AnnData
from numpy.typing import ArrayLike


def confusion_matrix(
Expand Down Expand Up @@ -60,7 +67,9 @@ def confusion_matrix(
orig, new = pd.Series(orig), pd.Series(new)
assert len(orig) == len(new)

unique_labels = pd.unique(np.concatenate((orig.values, new.values)))
unique_labels = pd.unique(
np.concatenate((np.asarray(orig.values), np.asarray(new.values)))
)

# Compute
mtx = _confusion_matrix(orig, new, labels=unique_labels)
Expand Down Expand Up @@ -89,3 +98,95 @@ def confusion_matrix(
df = df.loc[np.array(orig_idx), np.array(new_idx)]

return df


def modularity(
connectivities: ArrayLike | SpBase,
labels: pd.Series | ArrayLike,
mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED",
) -> float:
# accepting both dense or spare matrices as the connectivity graph
# setting mode between directed and undirected
"""Compute the modularity of a graph given its connectivities and labels.

Parameters
----------
connectivities: array-like or sparse matrix
Weighted adjacency matrix representing the graph. Can be a dense NumPy array or a sparse CSR matrix.
labels: array-like or pandas.Series
Cluster labels for each node in the graph.
mode: str
The mode of the graph. Can be "UNDIRECTED" or "DIRECTED". Default is "UNDIRECTED".

Returns
-------
float
The modularity of the graph based on the provided clustering.
"""
try:
# try to import igraph in case the user wants to calculate modularity
# not in the main module to avoid import errors
import igraph as ig
except ImportError as e:
msg = "igraph is require for computing modularity"
raise ImportError(msg) from e
if isinstance(connectivities, CSRBase | SpBase):
# check if the connectivities is a sparse matrix
coo = coo_matrix(connectivities)
edges = list(zip(coo.row, coo.col, strict=False))
# converting to the coo format to extract the edges and weights
# storing only non-zero elements and their indices
weights = coo.data.tolist()
graph = ig.Graph(edges=edges, directed=mode == "DIRECTED")
graph.es["weight"] = weights
else:
# if the graph is dense, creates it directly using igraph's adjacency matrix
dense_array = np.asarray(connectivities)
igraph_mode = ig.ADJ_UNDIRECTED if mode == "UNDIRECTED" else ig.ADJ_DIRECTED
graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode)
# cluster labels to integer codes required by igraph
labels = pd.Categorical(np.asarray(labels)).codes

return graph.modularity(labels)


def modularity_adata(
    adata: AnnData,
    *,
    labels: str | ArrayLike = "leiden",
    obsp: str = "connectivities",
    mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED",
) -> float:
    """Compute modularity from an AnnData object using stored graph and clustering labels.

    Parameters
    ----------
    adata: AnnData
        The AnnData object containing the data.
    labels: str or array-like
        The key in adata.obs that contains the cluster labels,
        or the label array itself.
    obsp: str
        The key in adata.obsp that contains the connectivities.
    mode: str
        The mode of the graph, "UNDIRECTED" or "DIRECTED".

    Returns
    -------
    float
        The modularity of the graph based on the provided clustering.
    """
    # a string is looked up in adata.obs; anything else is taken as the
    # label array itself (defaults chosen because leiden is most common)
    label_array = adata.obs[labels] if isinstance(labels, str) else labels
    graph = adata.obsp[obsp]

    # densify anything that knows how to convert itself — CSR matrices and
    # other sparse containers all expose a callable `toarray`
    to_dense = getattr(graph, "toarray", None)
    if callable(to_dense):
        graph = to_dense()
    # DataFrames are unwrapped to their underlying ndarray
    if isinstance(graph, pd.DataFrame):
        graph = graph.values
    return modularity(cast("ArrayLike", graph), label_array, mode=mode)
108 changes: 108 additions & 0 deletions src/scanpy/neighbors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
from .._utils import NeighborsView, _doc_params, get_literal_vals
from . import _connectivity
from ._common import (
_get_indices_distances_from_dense_matrix,
_get_indices_distances_from_sparse_matrix,
_get_sparse_matrix_from_indices_distances,
)
from ._connectivity import umap
from ._doc import doc_n_pcs, doc_use_rep
from ._types import _KnownTransformer, _Method

Expand Down Expand Up @@ -74,6 +76,7 @@ def neighbors( # noqa: PLR0913
n_neighbors: int = 15,
n_pcs: int | None = None,
*,
distance_matrix: np.ndarray | None = None,
use_rep: str | None = None,
knn: bool = True,
method: _Method = "umap",
Expand Down Expand Up @@ -186,6 +189,15 @@ def neighbors( # noqa: PLR0913
:doc:`/how-to/knn-transformers`

"""
if distance_matrix is not None:
# Added this to support the new distance matrix function
# if a precomputed distance matrix is provided, skip the PCA and distance computation
return neighbors_from_distance(
adata,
distance_matrix,
n_neighbors=n_neighbors,
method=method,
)
start = logg.info("computing neighbors")
adata = adata.copy() if copy else adata
if adata.is_view: # we shouldn't need this here...
Expand Down Expand Up @@ -248,6 +260,102 @@ def neighbors( # noqa: PLR0913
return adata if copy else None


def neighbors_from_distance(
    adata: AnnData,
    distance_matrix: np.ndarray | SpBase,
    n_neighbors: int = 15,
    method: Literal["umap", "gauss"] = "umap",  # default to umap
    key_added: str | None = None,
) -> AnnData:
    """Compute neighbors from a precomputed distance matrix.

    Skips PCA and pairwise-distance computation and builds the neighborhood
    graph directly from `distance_matrix`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    distance_matrix
        Precomputed dense or sparse distance matrix.
    n_neighbors
        Number of nearest neighbors to use in the graph.
    method
        Method to use for computing the connectivities: 'umap' or 'gauss'.
    key_added
        Optional key under which to store the results. Default is 'neighbors'
        (with matrices stored as 'distances'/'connectivities' in ``adata.obsp``);
        otherwise ``{key_added}``, ``{key_added}_distances`` and
        ``{key_added}_connectivities``.

    Returns
    -------
    adata
        Annotated data with computed distances and connectivities.

    Raises
    ------
    NotImplementedError
        If `method` is neither 'umap' nor 'gauss'.
    """
    if isinstance(distance_matrix, SpBase):
        # sparse path: CSR is the efficient format for row-wise kNN extraction.
        # The diagonal (distance to self) must not affect umap or gauss, and
        # eliminate_zeros drops the resulting explicit zeros to save memory.
        distance_matrix = sparse.csr_matrix(distance_matrix)  # noqa: TID251
        distance_matrix.setdiag(0)
        distance_matrix.eliminate_zeros()
        # extracting for each observation the indices and distances of the
        # n_neighbors nearest neighbors, consumed below by umap or gauss
        knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix(
            distance_matrix, n_neighbors
        )
    else:
        # dense path: normalize to ndarray, zero the self-distances,
        # then extract knn indices and distances
        distance_matrix = np.asarray(distance_matrix)
        np.fill_diagonal(distance_matrix, 0)
        knn_indices, knn_distances = _get_indices_distances_from_dense_matrix(
            distance_matrix, n_neighbors
        )

    if method == "umap":
        # using umap to build connectivities from distances
        connectivities = umap(
            knn_indices,
            knn_distances,
            n_obs=adata.n_obs,
            n_neighbors=n_neighbors,
        )
    elif method == "gauss":
        # gauss operates on the (sparse) distance matrix itself
        connectivities = _connectivity.gauss(
            sparse.csr_matrix(distance_matrix),  # noqa: TID251
            n_neighbors,
            knn=True,
        )
    else:
        msg = f"Method {method} not implemented."
        raise NotImplementedError(msg)
    # defining where to store graph info
    key = "neighbors" if key_added is None else key_added
    dists_key = "distances" if key_added is None else key_added + "_distances"
    conns_key = "connectivities" if key_added is None else key_added + "_connectivities"
    # both matrices belong in .obsp — storing the distances in .uns (as an
    # earlier revision did) would hide them from every downstream consumer
    adata.obsp[dists_key] = sparse.csr_matrix(distance_matrix)  # noqa: TID251
    adata.obsp[conns_key] = connectivities
    # metadata describing how neighbors were computed: downstream functions
    # locate the .obsp matrices through .uns[key], so the *actual* keys must
    # be recorded here — hard-coded defaults would break when key_added is set
    adata.uns[key] = {
        "connectivities_key": conns_key,
        "distances_key": dists_key,
        "params": {
            "n_neighbors": n_neighbors,
            "method": method,
            "random_state": 0,
            # the metric of a precomputed matrix is unknown, so it is
            # recorded as None rather than a misleading "euclidean"
            "metric": None,
        },
    }
    return adata


class FlatTree(NamedTuple): # noqa: D101
hyperplanes: None
offsets: None
Expand Down
Loading
Loading