Skip to content

Add neighbors_from_distance for computing neighborhood graphs from precomputed distance matrices #3627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f76dc7b
Modularity score functions with comments
amalia-k510 Apr 25, 2025
f092469
typo fix
amalia-k510 Apr 25, 2025
7ffa1ec
Merge branch 'scverse:main' into main
amalia-k510 Apr 25, 2025
c0d0c52
Merge branch 'scverse:main' into main
amalia-k510 May 7, 2025
68652a7
modularity code updated and 6 tests written for modularity
amalia-k510 May 7, 2025
948319a
error fixing from pipelines
amalia-k510 May 7, 2025
6a64330
ruff error fix
amalia-k510 May 7, 2025
793351f
keywords variable fix
amalia-k510 May 7, 2025
92d8e26
neighbors from a precomputed distance matrix, still need to make sure…
amalia-k510 May 7, 2025
198c4fb
revert back
amalia-k510 May 7, 2025
e7fb67a
code only for the prexisting distance matrix
amalia-k510 May 7, 2025
14cb441
initial changes for the neighborhors
amalia-k510 May 8, 2025
0ce8c15
distances name switch and sparse array allowed
amalia-k510 May 12, 2025
914b87d
input fix
amalia-k510 May 12, 2025
d285203
variable input fixes
amalia-k510 May 12, 2025
50705b3
test added
amalia-k510 May 12, 2025
4730667
numpy issue fix for one line
amalia-k510 May 12, 2025
040b8b7
unify metadata assembly
flying-sheep May 16, 2025
c03b863
comments fix and release notes
amalia-k510 May 23, 2025
473a437
comments fix typo
amalia-k510 May 23, 2025
ec586df
Merge branch 'scverse:main' into matrix_exist
amalia-k510 May 25, 2025
43dcfc0
fix relnotes
flying-sheep May 26, 2025
8a3588c
make non-specified metric `None`
flying-sheep May 26, 2025
293f568
Merge branch 'main' into matrix_exist
flying-sheep May 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/scanpy/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from ._gearys_c import gearys_c
from ._metrics import confusion_matrix
from ._metrics import confusion_matrix, modularity, modularity_adata
from ._morans_i import morans_i

__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
__all__ = ["confusion_matrix", "gearys_c", "modularity", "modularity_adata", "morans_i"]
105 changes: 103 additions & 2 deletions src/scanpy/metrics/_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,22 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, cast

import numpy as np
import pandas as pd
from natsort import natsorted
from pandas.api.types import CategoricalDtype
from scipy.sparse import coo_matrix

from .._compat import CSRBase, SpBase

if TYPE_CHECKING:
from collections.abc import Sequence
from typing import Literal

from anndata import AnnData
from numpy.typing import ArrayLike


def confusion_matrix(
Expand Down Expand Up @@ -60,7 +67,9 @@ def confusion_matrix(
orig, new = pd.Series(orig), pd.Series(new)
assert len(orig) == len(new)

unique_labels = pd.unique(np.concatenate((orig.values, new.values)))
unique_labels = pd.unique(
np.concatenate((np.asarray(orig.values), np.asarray(new.values)))
)

# Compute
mtx = _confusion_matrix(orig, new, labels=unique_labels)
Expand Down Expand Up @@ -89,3 +98,95 @@ def confusion_matrix(
df = df.loc[np.array(orig_idx), np.array(new_idx)]

return df


def modularity(
connectivities: ArrayLike | SpBase,
labels: pd.Series | ArrayLike,
mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED",
) -> float:
# accepting both dense or spare matrices as the connectivity graph
# setting mode between directed and undirected
"""Compute the modularity of a graph given its connectivities and labels.

Parameters
----------
connectivities: array-like or sparse matrix
Weighted adjacency matrix representing the graph. Can be a dense NumPy array or a sparse CSR matrix.
labels: array-like or pandas.Series
Cluster labels for each node in the graph.
mode: str
The mode of the graph. Can be "UNDIRECTED" or "DIRECTED". Default is "UNDIRECTED".

Returns
-------
float
The modularity of the graph based on the provided clustering.
"""
try:
# try to import igraph in case the user wants to calculate modularity
# not in the main module to avoid import errors
import igraph as ig
except ImportError as e:
msg = "igraph is require for computing modularity"
raise ImportError(msg) from e
if isinstance(connectivities, CSRBase | SpBase):
# check if the connectivities is a sparse matrix
coo = coo_matrix(connectivities)
edges = list(zip(coo.row, coo.col, strict=False))
# converting to the coo format to extract the edges and weights
# storing only non-zero elements and their indices
weights = coo.data.tolist()
graph = ig.Graph(edges=edges, directed=mode == "DIRECTED")
graph.es["weight"] = weights
else:
# if the graph is dense, creates it directly using igraph's adjacency matrix
dense_array = np.asarray(connectivities)
igraph_mode = ig.ADJ_UNDIRECTED if mode == "UNDIRECTED" else ig.ADJ_DIRECTED
graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode)
# cluster labels to integer codes required by igraph
labels = pd.Categorical(np.asarray(labels)).codes

return graph.modularity(labels)


def modularity_adata(
    adata: AnnData,
    *,
    labels: str | ArrayLike = "leiden",
    obsp: str = "connectivities",
    mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED",
) -> float:
    """Compute modularity from an AnnData object using stored graph and clustering labels.

    Parameters
    ----------
    adata: AnnData
        The AnnData object containing the data.
    labels: str or array-like
        The key in adata.obs that contains the cluster labels,
        or the label array itself.
    obsp: str
        The key in adata.obsp that contains the connectivities.
    mode: str
        The mode of the graph, "UNDIRECTED" or "DIRECTED".

    Returns
    -------
    float
        The modularity of the graph based on the provided clustering.
    """
    # a string is looked up in adata.obs; anything else is taken as the
    # label array itself (defaults chosen because leiden is most common)
    label_array = adata.obs[labels] if isinstance(labels, str) else labels
    graph = adata.obsp[obsp]

    # densify anything that knows how to convert itself — CSR matrices and
    # other sparse containers all expose a callable `toarray`
    to_dense = getattr(graph, "toarray", None)
    if callable(to_dense):
        graph = to_dense()
    # DataFrames are unwrapped to their underlying ndarray
    if isinstance(graph, pd.DataFrame):
        graph = graph.values
    return modularity(cast("ArrayLike", graph), label_array, mode=mode)
108 changes: 108 additions & 0 deletions src/scanpy/neighbors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
from .._utils import NeighborsView, _doc_params, get_literal_vals
from . import _connectivity
from ._common import (
_get_indices_distances_from_dense_matrix,
_get_indices_distances_from_sparse_matrix,
_get_sparse_matrix_from_indices_distances,
)
from ._connectivity import umap
from ._doc import doc_n_pcs, doc_use_rep
from ._types import _KnownTransformer, _Method

Expand Down Expand Up @@ -74,6 +76,7 @@ def neighbors( # noqa: PLR0913
n_neighbors: int = 15,
n_pcs: int | None = None,
*,
distance_matrix: np.ndarray | None = None,
use_rep: str | None = None,
knn: bool = True,
method: _Method = "umap",
Expand Down Expand Up @@ -186,6 +189,15 @@ def neighbors( # noqa: PLR0913
:doc:`/how-to/knn-transformers`

"""
if distance_matrix is not None:
# Added this to support the new distance matrix function
# if a precomputed distance matrix is provided, skip the PCA and distance computation
return neighbors_from_distance(
adata,
distance_matrix,
n_neighbors=n_neighbors,
method=method,
)
start = logg.info("computing neighbors")
adata = adata.copy() if copy else adata
if adata.is_view: # we shouldn't need this here...
Expand Down Expand Up @@ -248,6 +260,102 @@ def neighbors( # noqa: PLR0913
return adata if copy else None


def neighbors_from_distance(
    adata: AnnData,
    distance_matrix: np.ndarray | SpBase,
    n_neighbors: int = 15,
    method: Literal["umap", "gauss"] = "umap",  # default to umap
    key_added: str | None = None,
) -> AnnData:
    """Compute neighbors from a precomputed distance matrix.

    Skips PCA and pairwise-distance computation and builds the neighborhood
    graph directly from `distance_matrix`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    distance_matrix
        Precomputed dense or sparse distance matrix.
    n_neighbors
        Number of nearest neighbors to use in the graph.
    method
        Method to use for computing the connectivities: 'umap' or 'gauss'.
    key_added
        Optional key under which to store the results. Default is 'neighbors'
        (with matrices stored as 'distances'/'connectivities' in ``adata.obsp``);
        otherwise ``{key_added}``, ``{key_added}_distances`` and
        ``{key_added}_connectivities``.

    Returns
    -------
    adata
        Annotated data with computed distances and connectivities.

    Raises
    ------
    NotImplementedError
        If `method` is neither 'umap' nor 'gauss'.
    """
    if isinstance(distance_matrix, SpBase):
        # sparse path: CSR is the efficient format for row-wise kNN extraction.
        # The diagonal (distance to self) must not affect umap or gauss, and
        # eliminate_zeros drops the resulting explicit zeros to save memory.
        distance_matrix = sparse.csr_matrix(distance_matrix)  # noqa: TID251
        distance_matrix.setdiag(0)
        distance_matrix.eliminate_zeros()
        # extracting for each observation the indices and distances of the
        # n_neighbors nearest neighbors, consumed below by umap or gauss
        knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix(
            distance_matrix, n_neighbors
        )
    else:
        # dense path: normalize to ndarray, zero the self-distances,
        # then extract knn indices and distances
        distance_matrix = np.asarray(distance_matrix)
        np.fill_diagonal(distance_matrix, 0)
        knn_indices, knn_distances = _get_indices_distances_from_dense_matrix(
            distance_matrix, n_neighbors
        )

    if method == "umap":
        # using umap to build connectivities from distances
        connectivities = umap(
            knn_indices,
            knn_distances,
            n_obs=adata.n_obs,
            n_neighbors=n_neighbors,
        )
    elif method == "gauss":
        # gauss operates on the (sparse) distance matrix itself
        connectivities = _connectivity.gauss(
            sparse.csr_matrix(distance_matrix),  # noqa: TID251
            n_neighbors,
            knn=True,
        )
    else:
        msg = f"Method {method} not implemented."
        raise NotImplementedError(msg)
    # defining where to store graph info
    key = "neighbors" if key_added is None else key_added
    dists_key = "distances" if key_added is None else key_added + "_distances"
    conns_key = "connectivities" if key_added is None else key_added + "_connectivities"
    # both matrices belong in .obsp — storing the distances in .uns (as an
    # earlier revision did) would hide them from every downstream consumer
    adata.obsp[dists_key] = sparse.csr_matrix(distance_matrix)  # noqa: TID251
    adata.obsp[conns_key] = connectivities
    # metadata describing how neighbors were computed: downstream functions
    # locate the .obsp matrices through .uns[key], so the *actual* keys must
    # be recorded here — hard-coded defaults would break when key_added is set
    adata.uns[key] = {
        "connectivities_key": conns_key,
        "distances_key": dists_key,
        "params": {
            "n_neighbors": n_neighbors,
            "method": method,
            "random_state": 0,
            # the metric of a precomputed matrix is unknown, so it is
            # recorded as None rather than a misleading "euclidean"
            "metric": None,
        },
    }
    return adata


class FlatTree(NamedTuple): # noqa: D101
hyperplanes: None
offsets: None
Expand Down
Loading
Loading