Add conversion from TACO format

GraphBLAS · Sep 6, 2022 · 76cc4f7 · 76cc4f7
1 parent 3e926c0
commit 76cc4f7
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 23 deletions.
diff --git a/spz_python/notebooks/Example_Rank4.ipynb b/spz_python/notebooks/Example_Rank4.ipynb
@@ -1090,7 +1090,7 @@
        "```\n",
        "structure      = [sparse, sparse, doubly_compressed, sparse]\n",
        "\n",
-       "taco_structure = [compressed-nonunique, singleton, singleton, compressed]\n",
+       "taco_structure = [compressed, singleton, singleton, compressed]\n",
        "```"
       ],
       "text/plain": [
@@ -19640,7 +19640,6 @@
     "- [compressed, dense, singleton, singleton]\n",
     "- [compressed, singleton, compressed, singleton]\n",
     "- [compressed, singleton, dense, singleton]\n",
-    "- [compressed, singleton, singleton, compressed]\n",
     "- [compressed, singleton, singleton, singleton]\n",
     "- [compressed-nonunique, compressed, compressed, singleton]\n",
     "- [compressed-nonunique, compressed, dense, singleton]\n",
@@ -19653,6 +19652,7 @@
     "- [compressed-nonunique, dense, singleton, singleton]\n",
     "- [compressed-nonunique, singleton, compressed, singleton]\n",
     "- [compressed-nonunique, singleton, dense, singleton]\n",
+    "- [compressed-nonunique, singleton, singleton, compressed]\n",
     "- [dense, compressed, compressed, singleton]\n",
     "- [dense, compressed, dense, singleton]\n",
     "- [dense, compressed, singleton, singleton]\n",

diff --git a/spz_python/spz/_core.py b/spz_python/spz/_core.py
@@ -1,9 +1,10 @@
-from itertools import zip_longest
-
 import numpy as np
 import pandas as pd
 
-from .sparsetype import DC, C, S, abbreviate, unabbreviate
+from .sparsetype import DC, C, S, abbreviate
+from .sparsetype import from_taco as _from_taco
+from .sparsetype import to_taco as _to_taco
+from .sparsetype import unabbreviate
 
 
 def repeatrange(repeat, *args):
@@ -16,6 +17,12 @@ def issorted(array):
 
 
 class SPZ:
+    @classmethod
+    def from_taco(cls, arrays, shape=None, structure=None):  # Alternate constructor: same as cls(...), but `structure` is given as TACO level-format names.
+        if structure is not None:  # None is passed through to __init__ unchanged
+            structure = _from_taco(structure)  # sparsetype.from_taco: TACO names -> this package's structure types
+        return cls(arrays, shape=shape, structure=structure)
+
     def __init__(self, arrays, shape=None, structure=None):
         if not isinstance(arrays, (list, tuple)):
             raise TypeError("arrays argument must be a list or tuple of numpy arrays")
@@ -220,7 +227,7 @@ def _validate(self):
                 assert len(ptr) == len(set(ptr))
             else:  # pragma: no cover
                 raise AssertionError()
-        self.taco_structure
+        assert _from_taco(self.taco_structure) == structure
 
     def as_structure(self, structure):
         return SPZ(self.arrays, self.shape, structure)
@@ -271,22 +278,7 @@ def arrays(self):
 
     @property
     def taco_structure(self):
-        # I'm not 100% certain of the use of "singleton" and "compressed-nonunique"
-        rv = []
-        L = [DC] + self._structure
-        for prev, cur, nxt in zip_longest(L[:-1], L[1:], L[2:]):
-            if cur == C:
-                rv.append("dense")
-            elif prev == S and cur in {S, DC}:
-                rv.append("singleton")
-            elif prev in {C, DC} and cur == S and nxt in {S, C}:
-                # I'm not certain about `nxt == C` case here
-                rv.append("compressed-nonunique")
-            elif prev in {C, DC}:
-                rv.append("compressed")
-            else:
-                raise NotImplementedError()
-        return rv
+        return _to_taco(self._structure)  # conversion rules now live in sparsetype.to_taco
 
     def _repr_svg_(self):
         try:

diff --git a/spz_python/spz/sparsetype.py b/spz_python/spz/sparsetype.py
@@ -1,3 +1,6 @@
+from itertools import zip_longest
+
+
 class StructureType:
     def __repr__(self):
         return self.name
@@ -74,3 +77,72 @@ def unabbreviate(abbr):
         rv.append(DC)
     rv.pop()  # One extra DC
     return rv
+
+
+def to_taco(structure):  # Convert a structure (abbreviation str, or a sequence of levels) to a list of TACO level-format names.
+    if isinstance(structure, str):
+        structure = unabbreviate(structure)
+    else:
+        structure = unabbreviate(abbreviate(*structure))  # round-trip through the abbreviation; presumably normalizes the input -- confirm
+    rv = []
+    L = [DC] + structure  # the leading DC serves as "prev" for the first level
+    lookahead = S  # backwards-fill S values: nearest non-S level at or after the current one (S if there is none)
+    for prev, cur, nxt in reversed(list(zip_longest(L[:-1], L[1:], L[2:]))):  # last level first; nxt is None at the last level
+        # Heuristic rules, tried in order; the first match wins.  (Original author flagged them as unproven.)
+        if cur != S:
+            lookahead = cur
+        if cur == C:
+            rv.append("dense")
+        elif prev == S and cur in {S, DC}:
+            rv.append("singleton")
+        elif prev in {C, DC} and cur == S and lookahead in {S, C} and nxt is not None:
+            rv.append("compressed-nonunique")
+        elif prev in {C, DC}:
+            rv.append("compressed")
+        else:
+            # We should be able to always go to TACO
+            raise NotImplementedError(f"Unable to convert to TACO structure: {structure}")
+    rv.reverse()  # names were appended last-to-first; restore level order
+    return rv
+
+
+def from_taco(structure):  # Convert a sequence of TACO level-format names to a list of C/DC/S levels; raises ValueError if no rule matches.
+    compressed = "compressed"
+    dense = "dense"
+    nonunique = "compressed-nonunique"
+    singleton = "singleton"
+    rv = []
+    prev_nonS = structure[0]  # last non-singleton level before `cur` (seeded with the first level); NOTE(review): IndexError on empty input
+    # fmt: off
+    for prev, cur, nxt in zip_longest([None] + list(structure[:-1]), structure, structure[1:]):  # prev is None at the first level, nxt is None at the last
+        # These rules were developed via trial and error.  Fingers crossed!
+        # Let's try to come up with a clearer way to convert from TACO.
+        if cur == dense and nxt in {dense, nonunique, compressed}:
+            rv.append(C)
+        elif (  # reminder: `and` binds tighter than `or` in the condition below
+            prev in {None, dense, singleton, compressed} and cur == compressed
+            and nxt in {dense, nonunique, compressed}
+            or cur == singleton and (
+                prev == compressed and nxt in {dense, nonunique, compressed}
+                or prev == singleton and nxt == compressed
+                or prev == singleton and nxt in {dense, nonunique} and prev_nonS == compressed
+            ) and not (prev == singleton and nxt == compressed and prev_nonS == nonunique)
+        ):
+            rv.append(DC)
+        elif (  # reminder: `and` binds tighter than `or` in the condition below
+            cur == nonunique and nxt in {dense, singleton}
+            or cur == compressed and nxt in {None, singleton}
+            or cur == singleton and (
+                prev == nonunique and nxt in {None, dense, singleton}
+                or prev == compressed and nxt == singleton
+                or prev == singleton and nxt in {None, singleton}
+                or prev == singleton and nxt == dense and prev_nonS == nonunique
+            ) and not (nxt is None and prev_nonS == compressed)
+        ):
+            rv.append(S)
+        else:
+            raise ValueError(f"Unable to convert from TACO structure: {structure}")
+        if cur != "singleton":  # same string as the local `singleton`
+            prev_nonS = cur  # updated after the rules ran, so the rules saw the value from earlier levels
+    # fmt: on
+    return rv
diff --git a/spz_python/spz/tests/test_sparsetype.py b/spz_python/spz/tests/test_sparsetype.py
@@ -1,4 +1,8 @@
-from spz.sparsetype import DC, C, S, abbreviate, unabbreviate
+import itertools
+
+import pytest
+
+from spz.sparsetype import DC, C, S, abbreviate, from_taco, to_taco, unabbreviate
 
 
 def test_abbreviate():
@@ -14,3 +18,29 @@ def test_unabbreviate():
     assert unabbreviate("S-C-S-DC-C-S") == expected
     assert unabbreviate("SCSDCCS") == expected
     assert unabbreviate("SC-S-D-C-S") == expected
+
+
+@pytest.mark.parametrize("N", range(1, 9))
+def test_from_taco(N):  # Try every length-N TACO spec: from_taco must never map two specs to one structure, and to_taco must invert it.
+    compressed = "compressed"
+    dense = "dense"
+    nonunique = "compressed-nonunique"
+    singleton = "singleton"
+    options = [compressed, dense, nonunique, singleton]
+    results = {}  # converted structure (tuple) -> the TACO spec that produced it
+    for taco in itertools.product(*([options] * N)):  # all 4**N combinations of level formats
+        try:
+            structure = tuple(from_taco(taco))
+        except ValueError:  # from_taco found no matching rule for this spec; skip it
+            continue
+        if structure in results:  # pragma: no cover
+            print(structure)
+            print(" ", results[structure])
+            print(" ", taco)
+            raise AssertionError(
+                "Multiple TACO structures give the same structure: "
+                f"{taco} and {results[structure]} -> {structure}"
+            )
+        results[structure] = taco
+        assert tuple(to_taco(structure)) == taco  # round trip back to the original spec
+    assert len(results) == 3 ** (N - 1)  # exactly this many specs must convert; presumably one per valid structure -- confirm