From a75e92190ac5a299c602b687e80665d0e337c64d Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 19 Nov 2024 17:14:14 -0600 Subject: [PATCH] Faster `shortest_path` (#18) Closes https://github.com/rapidsai/graph_dl/issues/631 For larger graphs, nearly all the time is spent creating dicts of lists of paths. This PR delays creating the lists of paths by duck-typing the return object and returning a Mapping that computes paths as needed. There is virtually no performance penalty for this, and a huge performance gain by delaying computation. Moved from here: https://github.com/rapidsai/cugraph/pull/4739 Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/nx-cugraph/pull/18 --- .../nx-cugraph/pytest-based/bench_algos.py | 4 +- .../algorithms/shortest_paths/unweighted.py | 98 ++++++++++++++----- .../algorithms/shortest_paths/weighted.py | 38 +++---- 3 files changed, 91 insertions(+), 49 deletions(-) diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py index 9615b43ea..612f69d05 100644 --- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py +++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from collections.abc import Mapping + import networkx as nx import pandas as pd import pytest @@ -496,7 +498,7 @@ def bench_shortest_path(benchmark, graph_obj, backend_wrapper): iterations=iterations, warmup_rounds=warmup_rounds, ) - assert type(result) is dict + assert isinstance(result, Mapping) # dict in nx, but we duck-type def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper): diff --git a/nx_cugraph/algorithms/shortest_paths/unweighted.py b/nx_cugraph/algorithms/shortest_paths/unweighted.py index a4a4474fa..e83800f5d 100644 --- a/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -10,6 +10,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import collections import itertools import cupy as cp @@ -19,7 +20,7 @@ from nx_cugraph import _nxver from nx_cugraph.convert import _to_graph -from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm +from nx_cugraph.utils import index_dtype, networkx_algorithm __all__ = [ "bidirectional_shortest_path", @@ -179,35 +180,82 @@ def _bfs( elif not reverse_path: paths.reverse() else: - if return_type == "path": - distances = distances[mask] - groups = _groupby(distances, [predecessors[mask], node_ids]) - - # `pred_node_iter` does the equivalent as these nested for loops: - # for length in range(1, len(groups)): - # preds, nodes = groups[length] - # for pred, node in zip(preds.tolist(), nodes.tolist()): - if G.key_to_id is None: - pred_node_iter = concat( - zip(*(x.tolist() for x in groups[length])) - for length in range(1, len(groups)) - ) - else: - pred_node_iter = concat( - zip(*(G._nodeiter_to_iter(x.tolist()) for x in groups[length])) - for length in range(1, len(groups)) - ) - # Consider making utility functions for creating paths - paths = {source: [source]} + # Computing paths to all nodes can be expensive, so let's delay + # computation until needed using `PathMapping`. + key_iter = node_ids.tolist() + pred_iter = predecessors[mask].tolist() + if G.key_to_id is not None: + key_iter = G._nodeiter_to_iter(key_iter) + pred_iter = G._nodeiter_to_iter(pred_iter) + key_to_pred = dict(zip(key_iter, pred_iter)) + key_to_pred[source] = None if reverse_path: - for pred, node in pred_node_iter: - paths[node] = [node, *paths[pred]] + paths = ReversePathMapping({source: [source]}, key_to_pred) else: - for pred, node in pred_node_iter: - paths[node] = [*paths[pred], node] + paths = PathMapping({source: [source]}, key_to_pred) if return_type == "path": return paths if return_type == "length": return lengths # return_type == "length-path" return lengths, paths + + +class PathMapping(collections.abc.Mapping): + """Compute path for nodes as needed using predecessors. + + The path for each node contains itself at the beginning of the path. + """ + + def __init__(self, data, key_to_pred): + self._data = data + self._key_to_pred = key_to_pred + + def __getitem__(self, key): + if key in self._data: + return self._data[key] + stack = [key] + key = self._key_to_pred[key] + while key not in self._data: + stack.append(key) + key = self._key_to_pred[key] + val = self._data[key] + for key in reversed(stack): + val = self._data[key] = [*val, key] # Switched in ReversePathMapping + return val + + def __iter__(self): + return iter(self._key_to_pred) + + def __len__(self): + return len(self._key_to_pred) + + +class ReversePathMapping(collections.abc.Mapping): + """Compute path for nodes as needed using predecessors. + + The path for each node contains itself at the end of the path. + """ + + def __init__(self, data, key_to_pred): + self._data = data + self._key_to_pred = key_to_pred + + def __getitem__(self, key): + if key in self._data: + return self._data[key] + stack = [key] + key = self._key_to_pred[key] + while key not in self._data: + stack.append(key) + key = self._key_to_pred[key] + val = self._data[key] + for key in reversed(stack): + val = self._data[key] = [key, *val] # Switched in PathMapping + return val + + def __iter__(self): + return iter(self._key_to_pred) + + def __len__(self): + return len(self._key_to_pred) diff --git a/nx_cugraph/algorithms/shortest_paths/weighted.py b/nx_cugraph/algorithms/shortest_paths/weighted.py index 032ef2c7f..5bb909131 100644 --- a/nx_cugraph/algorithms/shortest_paths/weighted.py +++ b/nx_cugraph/algorithms/shortest_paths/weighted.py @@ -15,14 +15,9 @@ import pylibcugraph as plc from nx_cugraph.convert import _to_graph -from nx_cugraph.utils import ( - _dtype_param, - _get_float_dtype, - _groupby, - networkx_algorithm, -) +from nx_cugraph.utils import _dtype_param, _get_float_dtype, networkx_algorithm -from .unweighted import _bfs +from .unweighted import PathMapping, ReversePathMapping, _bfs __all__ = [ "dijkstra_path", @@ -378,22 +373,19 @@ def _sssp( elif not reverse_path: paths.reverse() else: - groups = _groupby(predecessors[mask], node_ids) - if (id_to_key := G.id_to_key) is not None: - groups = {id_to_key[k]: v for k, v in groups.items() if k >= 0} - paths = {source: [source]} - preds = [source] - while preds: - pred = preds.pop() - pred_path = paths[pred] - nodes = G._nodearray_to_list(groups[pred]) - if reverse_path: - for node in nodes: - paths[node] = [node, *pred_path] - else: - for node in nodes: - paths[node] = [*pred_path, node] - preds.extend(nodes & groups.keys()) + # Computing paths to all nodes can be expensive, so let's delay + # computation until needed using `PathMapping`. + key_iter = node_ids.tolist() + pred_iter = predecessors[mask].tolist() + if G.key_to_id is not None: + key_iter = G._nodeiter_to_iter(key_iter) + pred_iter = G._nodeiter_to_iter(pred_iter) + key_to_pred = dict(zip(key_iter, pred_iter)) + key_to_pred[source] = None + if reverse_path: + paths = ReversePathMapping({source: [source]}, key_to_pred) + else: + paths = PathMapping({source: [source]}, key_to_pred) if return_type == "path": return paths if return_type == "length":