From a75e92190ac5a299c602b687e80665d0e337c64d Mon Sep 17 00:00:00 2001
From: Erik Welch <erik.n.welch@gmail.com>
Date: Tue, 19 Nov 2024 17:14:14 -0600
Subject: [PATCH] Faster `shortest_path` (#18)

Closes https://github.com/rapidsai/graph_dl/issues/631

For larger graphs, nearly all the time is spent creating dicts of lists of paths. This PR delays creating the lists of paths by duck-typing the return object and returning a Mapping that computes paths as needed. There is virtually no performance penalty for this, and a huge performance gain by delaying computation.

Moved from here: https://github.com/rapidsai/cugraph/pull/4739

Authors:
  - Erik Welch (https://github.com/eriknw)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/nx-cugraph/pull/18
---
 .../nx-cugraph/pytest-based/bench_algos.py    |  4 +-
 .../algorithms/shortest_paths/unweighted.py   | 98 ++++++++++++++-----
 .../algorithms/shortest_paths/weighted.py     | 38 +++----
 3 files changed, 91 insertions(+), 49 deletions(-)

diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
index 9615b43ea..612f69d05 100644
--- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py
+++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -11,6 +11,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections.abc import Mapping
+
 import networkx as nx
 import pandas as pd
 import pytest
@@ -496,7 +498,7 @@ def bench_shortest_path(benchmark, graph_obj, backend_wrapper):
         iterations=iterations,
         warmup_rounds=warmup_rounds,
     )
-    assert type(result) is dict
+    assert isinstance(result, Mapping)  # dict in nx, but we duck-type
 
 
 def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper):
diff --git a/nx_cugraph/algorithms/shortest_paths/unweighted.py b/nx_cugraph/algorithms/shortest_paths/unweighted.py
index a4a4474fa..e83800f5d 100644
--- a/nx_cugraph/algorithms/shortest_paths/unweighted.py
+++ b/nx_cugraph/algorithms/shortest_paths/unweighted.py
@@ -10,6 +10,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import collections
 import itertools
 
 import cupy as cp
@@ -19,7 +20,7 @@
 
 from nx_cugraph import _nxver
 from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm
+from nx_cugraph.utils import index_dtype, networkx_algorithm
 
 __all__ = [
     "bidirectional_shortest_path",
@@ -179,35 +180,82 @@ def _bfs(
             elif not reverse_path:
                 paths.reverse()
         else:
-            if return_type == "path":
-                distances = distances[mask]
-            groups = _groupby(distances, [predecessors[mask], node_ids])
-
-            # `pred_node_iter` does the equivalent as these nested for loops:
-            # for length in range(1, len(groups)):
-            #     preds, nodes = groups[length]
-            #     for pred, node in zip(preds.tolist(), nodes.tolist()):
-            if G.key_to_id is None:
-                pred_node_iter = concat(
-                    zip(*(x.tolist() for x in groups[length]))
-                    for length in range(1, len(groups))
-                )
-            else:
-                pred_node_iter = concat(
-                    zip(*(G._nodeiter_to_iter(x.tolist()) for x in groups[length]))
-                    for length in range(1, len(groups))
-                )
-            # Consider making utility functions for creating paths
-            paths = {source: [source]}
+            # Computing paths to all nodes can be expensive, so let's delay
+            # computation until needed using `PathMapping`.
+            key_iter = node_ids.tolist()
+            pred_iter = predecessors[mask].tolist()
+            if G.key_to_id is not None:
+                key_iter = G._nodeiter_to_iter(key_iter)
+                pred_iter = G._nodeiter_to_iter(pred_iter)
+            key_to_pred = dict(zip(key_iter, pred_iter))
+            key_to_pred[source] = None
             if reverse_path:
-                for pred, node in pred_node_iter:
-                    paths[node] = [node, *paths[pred]]
+                paths = ReversePathMapping({source: [source]}, key_to_pred)
             else:
-                for pred, node in pred_node_iter:
-                    paths[node] = [*paths[pred], node]
+                paths = PathMapping({source: [source]}, key_to_pred)
     if return_type == "path":
         return paths
     if return_type == "length":
         return lengths
     # return_type == "length-path"
     return lengths, paths
+
+
+class PathMapping(collections.abc.Mapping):
+    """Compute path for nodes as needed using predecessors.
+
+    The path for each node contains itself at the beginning of the path.
+    """
+
+    def __init__(self, data, key_to_pred):
+        self._data = data
+        self._key_to_pred = key_to_pred
+
+    def __getitem__(self, key):
+        if key in self._data:
+            return self._data[key]
+        stack = [key]
+        key = self._key_to_pred[key]
+        while key not in self._data:
+            stack.append(key)
+            key = self._key_to_pred[key]
+        val = self._data[key]
+        for key in reversed(stack):
+            val = self._data[key] = [*val, key]  # Switched in ReversePathMapping
+        return val
+
+    def __iter__(self):
+        return iter(self._key_to_pred)
+
+    def __len__(self):
+        return len(self._key_to_pred)
+
+
+class ReversePathMapping(collections.abc.Mapping):
+    """Compute path for nodes as needed using predecessors.
+
+    The path for each node contains itself at the end of the path.
+    """
+
+    def __init__(self, data, key_to_pred):
+        self._data = data
+        self._key_to_pred = key_to_pred
+
+    def __getitem__(self, key):
+        if key in self._data:
+            return self._data[key]
+        stack = [key]
+        key = self._key_to_pred[key]
+        while key not in self._data:
+            stack.append(key)
+            key = self._key_to_pred[key]
+        val = self._data[key]
+        for key in reversed(stack):
+            val = self._data[key] = [key, *val]  # Switched in PathMapping
+        return val
+
+    def __iter__(self):
+        return iter(self._key_to_pred)
+
+    def __len__(self):
+        return len(self._key_to_pred)
diff --git a/nx_cugraph/algorithms/shortest_paths/weighted.py b/nx_cugraph/algorithms/shortest_paths/weighted.py
index 032ef2c7f..5bb909131 100644
--- a/nx_cugraph/algorithms/shortest_paths/weighted.py
+++ b/nx_cugraph/algorithms/shortest_paths/weighted.py
@@ -15,14 +15,9 @@
 import pylibcugraph as plc
 
 from nx_cugraph.convert import _to_graph
-from nx_cugraph.utils import (
-    _dtype_param,
-    _get_float_dtype,
-    _groupby,
-    networkx_algorithm,
-)
+from nx_cugraph.utils import _dtype_param, _get_float_dtype, networkx_algorithm
 
-from .unweighted import _bfs
+from .unweighted import PathMapping, ReversePathMapping, _bfs
 
 __all__ = [
     "dijkstra_path",
@@ -378,22 +373,19 @@ def _sssp(
             elif not reverse_path:
                 paths.reverse()
         else:
-            groups = _groupby(predecessors[mask], node_ids)
-            if (id_to_key := G.id_to_key) is not None:
-                groups = {id_to_key[k]: v for k, v in groups.items() if k >= 0}
-            paths = {source: [source]}
-            preds = [source]
-            while preds:
-                pred = preds.pop()
-                pred_path = paths[pred]
-                nodes = G._nodearray_to_list(groups[pred])
-                if reverse_path:
-                    for node in nodes:
-                        paths[node] = [node, *pred_path]
-                else:
-                    for node in nodes:
-                        paths[node] = [*pred_path, node]
-                preds.extend(nodes & groups.keys())
+            # Computing paths to all nodes can be expensive, so let's delay
+            # computation until needed using `PathMapping`.
+            key_iter = node_ids.tolist()
+            pred_iter = predecessors[mask].tolist()
+            if G.key_to_id is not None:
+                key_iter = G._nodeiter_to_iter(key_iter)
+                pred_iter = G._nodeiter_to_iter(pred_iter)
+            key_to_pred = dict(zip(key_iter, pred_iter))
+            key_to_pred[source] = None
+            if reverse_path:
+                paths = ReversePathMapping({source: [source]}, key_to_pred)
+            else:
+                paths = PathMapping({source: [source]}, key_to_pred)
     if return_type == "path":
         return paths
     if return_type == "length":