From aedee990590f83218c8b872ebaecbb5b660d2fd3 Mon Sep 17 00:00:00 2001
From: Erik Welch <erik.n.welch@gmail.com>
Date: Sun, 21 Aug 2022 17:28:05 -0400
Subject: [PATCH] Create Python package for exploring n-dimensional sparse
 arrays with different structures

---
 .gitignore                       | 117 ++++++++++++++++
 spz_python/LICENSE               |  29 ++++
 spz_python/README.md             |   0
 spz_python/pyproject.toml        |   5 +
 spz_python/requirements.txt      |   2 +
 spz_python/setup.cfg             |  56 ++++++++
 spz_python/setup.py              |  46 ++++++
 spz_python/spz/__init__.py       |   2 +
 spz_python/spz/_core.py          | 233 +++++++++++++++++++++++++++++++
 spz_python/spz/sparsetype.py     |  75 ++++++++++
 spz_python/spz/tests/__init__.py |   0
 spz_python/spz/tests/conftest.py |  13 ++
 spz_python/spz/tests/test_spz.py |  28 ++++
 13 files changed, 606 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 spz_python/LICENSE
 create mode 100644 spz_python/README.md
 create mode 100644 spz_python/pyproject.toml
 create mode 100644 spz_python/requirements.txt
 create mode 100644 spz_python/setup.cfg
 create mode 100644 spz_python/setup.py
 create mode 100644 spz_python/spz/__init__.py
 create mode 100644 spz_python/spz/_core.py
 create mode 100644 spz_python/spz/sparsetype.py
 create mode 100644 spz_python/spz/tests/__init__.py
 create mode 100644 spz_python/spz/tests/conftest.py
 create mode 100644 spz_python/spz/tests/test_spz.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..07e76ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,117 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Generated C code
+*.c
+
+# C extensions
+*.so
+
+# Vi
+*.swp
+*.swo
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# PyCharm
+.idea
+
+# Mac
+.DS_Store
diff --git a/spz_python/LICENSE b/spz_python/LICENSE
new file mode 100644
index 0000000..a0cba38
--- /dev/null
+++ b/spz_python/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, GraphBLAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/spz_python/README.md b/spz_python/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/spz_python/pyproject.toml b/spz_python/pyproject.toml
new file mode 100644
index 0000000..90ccf9c
--- /dev/null
+++ b/spz_python/pyproject.toml
@@ -0,0 +1,5 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+
+[tool.black]
+line-length = 100
diff --git a/spz_python/requirements.txt b/spz_python/requirements.txt
new file mode 100644
index 0000000..5da331c
--- /dev/null
+++ b/spz_python/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+pandas
diff --git a/spz_python/setup.cfg b/spz_python/setup.cfg
new file mode 100644
index 0000000..7c3fc55
--- /dev/null
+++ b/spz_python/setup.cfg
@@ -0,0 +1,56 @@
+[aliases]
+test=pytest
+
+[flake8]
+max-line-length = 100
+inline-quotes = "
+exclude =
+    versioneer.py,
+# E203: whitespace before ':'
+# E231: missing whitespace after ','
+# W503: line break before binary operator
+ignore = E203, E231, W503
+per-file-ignores =
+    __init__.py:F401,
+    spz/_core.py:B020
+
+[isort]
+sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
+profile = black
+skip_gitignore = true
+float_to_top = true
+default_section = THIRDPARTY
+known_first_party = spz
+line_length = 100
+
+[versioneer]
+VCS = git
+style = pep440
+versionfile_source = spz/_version.py
+versionfile_build = spz/_version.py
+tag_prefix=
+parentdir_prefix=spz-
+
+[tool:pytest]
+testpaths = spz/tests
+markers:
+  slow: Skipped unless --runslow passed
+
+[coverage:run]
+source = spz
+omit =
+    spz/_version.py
+
+[coverage:report]
+# Regexes for lines to exclude from consideration
+exclude_lines =
+    pragma: no cover
+
+    raise AssertionError
+    raise NotImplementedError
+
+ignore_errors = True
+precision = 1
+fail_under = 0
+skip_covered = True
+skip_empty = True
diff --git a/spz_python/setup.py b/spz_python/setup.py
new file mode 100644
index 0000000..2bc07df
--- /dev/null
+++ b/spz_python/setup.py
@@ -0,0 +1,46 @@
+import versioneer
+from setuptools import find_packages, setup
+
+with open("requirements.txt") as f:
+    install_requires = f.read().strip().split("\n")
+extras_require = {"test": ["pytest"], "viz": ["sphinxcontrib-svgbob"]}
+# "complete" is the union of every optional extra above
+extras_require["complete"] = sorted({v for req in extras_require.values() for v in req})
+
+with open("README.md") as f:
+    long_description = f.read()
+
+setup(
+    name="spz",
+    version=versioneer.get_version(),
+    cmdclass=versioneer.get_cmdclass(),
+    description="Explore multidimensional sparse data structures",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author="Erik Welch",
+    author_email="erik.n.welch@gmail.com",
+    url="https://github.com/GraphBLAS/binsparse-specification",
+    packages=find_packages(),
+    license="BSD",
+    python_requires=">=3.8",
+    setup_requires=[],
+    install_requires=install_requires,
+    extras_require=extras_require,
+    include_package_data=True,
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3 :: Only",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "Topic :: Scientific/Engineering",
+        "Topic :: Scientific/Engineering :: Mathematics",
+    ],
+    zip_safe=False,
+)
diff --git a/spz_python/spz/__init__.py b/spz_python/spz/__init__.py
new file mode 100644
index 0000000..eef5bf8
--- /dev/null
+++ b/spz_python/spz/__init__.py
@@ -0,0 +1,2 @@
+from ._core import SPZ
+from .sparsetype import DC, C, S, compressed, doubly_compressed, sparse
diff --git a/spz_python/spz/_core.py b/spz_python/spz/_core.py
new file mode 100644
index 0000000..569c1c7
--- /dev/null
+++ b/spz_python/spz/_core.py
@@ -0,0 +1,233 @@
+import numpy as np
+import pandas as pd
+
+from .sparsetype import DC, C, S, abbreviate, unabbreviate
+
+
+def repeatrange(repeat, *args):
+    """Tile ``np.arange(*args)`` ``repeat`` times end to end, e.g., [0, 1, 2, 0, 1, 2]"""
+    return np.tile(np.arange(*args), repeat)
+
+
+class SPZ:
+    """N-dimensional sparse array: COO indices compressed level by level as S, C or DC."""
+
+    def __init__(self, arrays, shape=None, structure=None):
+        """Validate the COO ``arrays`` (one per dimension) and build per-level indices/pointers."""
+        if not isinstance(arrays, (list, tuple)):
+            raise TypeError("arrays argument must be a list or tuple of numpy arrays")
+        if not arrays:
+            raise ValueError("At least one array must be given")
+        arrays = [np.array(array) for array in arrays]
+        if not all(array.ndim == 1 for array in arrays):
+            raise ValueError("arrays must be a single dimension")
+        size = arrays[0].size
+        if not all(array.size == size for array in arrays):
+            raise ValueError("arrays must be the same size")
+        if not all(np.issubdtype(array.dtype, np.integer) for array in arrays):
+            raise ValueError("arrays must be integer dtype")
+        if not all((array >= 0).all() for array in arrays):
+            raise ValueError("array values must be non-negative")
+
+        if shape is not None:
+            self._shape = shape = tuple(shape)  # `shape` is indexed per level below
+            if not all(dimsize > 0 for dimsize in self._shape):
+                raise ValueError("Dimension sizes must be greater than 0")
+            if len(self._shape) != len(arrays):
+                raise ValueError("shape must be the same length as arrays")
+            if not all((array < dimsize).all() for array, dimsize in zip(arrays, self._shape)):
+                raise ValueError("index in array is out of bounds")
+        else:  # not given: infer each dimension as largest index + 1
+            self._shape = shape = tuple(int(array.max()) + 1 for array in arrays)
+
+        if structure is None:  # Assume CSF
+            self._structure = [DC] * (len(arrays) - 1) + [S]
+        elif isinstance(structure, str):
+            self._structure = unabbreviate(structure)
+        else:
+            self._structure = unabbreviate(abbreviate(*structure))
+        if len(self._structure) != len(arrays):
+            raise ValueError("structure must be the same length as arrays")
+        if self._structure[-1] != S:
+            # C as the final dimension means "dense"
+            raise ValueError("The final dimension must be sparse structural type")
+
+        # Now the fun part!  Generate the compressed structure from COO
+        df = pd.DataFrame(arrays).T.sort_values(list(range(self.ndim)))
+        if df.duplicated().any():
+            raise ValueError("Duplicate indices found!")
+
+        # First create indices
+        indices = []
+        cols = list(df.columns)
+        num_s_levels = 0
+        prev = None
+        for sparsity, level in zip(self._structure, range(df.shape[-1])):
+            if sparsity == S:
+                num_s_levels += 1
+            elif sparsity == DC:
+                subdf = df[cols[: level + 1]].drop_duplicates()
+                indices.extend(subdf.iloc[:, i].values for i in range(-num_s_levels - 1, 0))
+                num_s_levels = 0
+            elif sparsity == C:
+                if level == 0:
+                    indices.append(np.arange(shape[level]))
+                elif prev == DC:
+                    subdf = df[cols[:level]].drop_duplicates()
+                    indices.append(repeatrange(len(subdf), shape[level]))
+                elif prev == S:
+                    subdf = df[cols[:level]].drop_duplicates()
+                    subdf = subdf.join(
+                        pd.DataFrame({cols[level]: range(shape[level])}), how="cross"
+                    )
+                    indices.extend(subdf.iloc[:, i].values for i in range(-num_s_levels - 1, 0))
+                    num_s_levels = 0
+                else:  # prev == C
+                    indices.append(repeatrange(indices[-1].size, shape[level]))
+            prev = sparsity
+        indices.extend(df.iloc[:, i].values for i in range(-num_s_levels, 0))
+        self._indices = indices
+
+        # Now create pointers
+        pointers = []
+        for sparsity, level in zip(self._structure[:-1], range(df.shape[-1] - 1)):
+            if sparsity == S:
+                ptr = np.arange(indices[level].size + 1)
+            elif self._structure[level + 1] == C:
+                ptr = np.arange(len(indices[level]) + 1) * shape[level + 1]
+                if sparsity == C:
+                    # Update subdf to use later
+                    if level == 0:
+                        subdf = pd.DataFrame({cols[level]: range(shape[level])})
+                    elif self._structure[level - 1] == C:
+                        subdf = subdf.join(
+                            pd.DataFrame({cols[level]: range(shape[level])}), how="cross"
+                        )
+                    else:
+                        subdf = df[cols[:level]].drop_duplicates()
+                        subdf = subdf.join(
+                            pd.DataFrame({cols[level]: range(shape[level])}), how="cross"
+                        )
+            elif sparsity == DC:
+                if self._structure[level + 1] == DC:
+                    subdf = df[cols[: level + 2]].drop_duplicates()
+                else:  # sparsity[level + 1] == S
+                    # number of "S" immediately after this level
+                    nums = 0
+                    for item in self._structure[level + 1 :]:
+                        if item == S:
+                            nums += 1
+                        else:
+                            break
+                    subdf = df[cols[: level + nums + 1]].drop_duplicates()
+                    if len(self._structure) > level + nums + 1:
+                        if self._structure[level + nums + 1] == C:
+                            subdf = subdf.join(
+                                pd.DataFrame(
+                                    {cols[level + nums + 1]: range(shape[level + nums + 1])}
+                                ),
+                                how="cross",
+                            )
+                        elif self._structure[level + nums + 1] == DC:
+                            subdf = df[cols[: level + nums + 2]].drop_duplicates()
+                ptr = np.zeros(indices[level].size + 1, int)
+                ptr[1:] = subdf.groupby(cols[: level + 1])[cols[level + 1]].count().cumsum()
+            elif sparsity == C:
+                if level > 0:
+                    if self._structure[level - 1] == C:
+                        subdf1 = subdf
+                    else:
+                        subdf1 = df[cols[:level]].drop_duplicates()
+                subdf = pd.DataFrame({cols[level]: range(shape[level])})
+                if level > 0:
+                    subdf = subdf1.join(subdf, how="cross")
+                if self._structure[level + 1] == DC:
+                    subdf2 = df[cols[: level + 2]].drop_duplicates()
+                else:  # sparsity[level + 1] == S
+                    # number of "S" immediately after this level
+                    nums = 0
+                    for item in self._structure[level + 1 :]:
+                        if item == S:
+                            nums += 1
+                        else:
+                            break
+                    subdf2 = df[cols[: level + nums + 1]].drop_duplicates()
+                    if len(self._structure) > level + nums + 1:
+                        if self._structure[level + nums + 1] == C:
+                            subdf2 = subdf2.join(
+                                pd.DataFrame(
+                                    {cols[level + nums + 1]: range(shape[level + nums + 1])}
+                                ),
+                                how="cross",
+                            )
+                        elif self._structure[level + nums + 1] == DC:
+                            subdf2 = df[cols[: level + nums + 2]].drop_duplicates()
+                subdf3 = subdf.merge(subdf2, how="left")
+                subdf3[level + 1] = subdf3[level + 1].notnull()
+                ptr = np.zeros(indices[level].size + 1, int)
+                ptr[1:] = subdf3.groupby(cols[: level + 1])[level + 1].sum().cumsum()
+            pointers.append(ptr)
+        self._pointers = pointers
+        # TODO: can we detect and change sparsity type to be more efficient?
+        # For example, so we don't need to store a pointers or indices.
+
+    def as_structure(self, structure):
+        return SPZ(self.arrays, self.shape, structure)  # rebuild from COO with the new structure
+
+    def get_index(self, dim):
+        return self._indices[dim]
+
+    def get_pointers(self, dim):
+        return self._pointers[dim]
+
+    @property
+    def indices(self):
+        rv = list(self._indices)
+        for i, sparsity in enumerate(self._structure):
+            if sparsity == C:
+                rv[i] = None  # C-level indices are a plain 0..n-1 range; reported as None
+        return rv
+
+    @property
+    def pointers(self):
+        rv = list(self._pointers)
+        for i, sparsity in enumerate(self._structure[:-1]):
+            if sparsity == S:
+                rv[i] = None  # S-level pointers are just arange(n + 1)
+            elif sparsity == C and i > 0:
+                rv[i - 1] = None  # pointers into a C level are multiples of its dimension size
+        return rv
+
+    @property
+    def ndim(self):
+        return len(self._shape)
+
+    @property
+    def shape(self):
+        return self._shape
+
+    @property
+    def structure(self):
+        return self._structure
+
+    @property
+    def abbreviation(self):
+        return abbreviate(self._structure)
+
+    @property
+    def arrays(self):
+        return [np.array(array) for array in zip(*_to_coo(self._indices, self._pointers))]
+
+
+def _to_coo(indices, pointers, start=0, stop=None):
+    """Yield coordinate tuples for entries ``start:stop`` of the outermost level, depth-first."""
+    index, *inner_indices = indices
+    if stop is None:
+        stop = len(index)
+    if not inner_indices:  # innermost level: every stored index is one complete coordinate
+        yield from ((idx,) for idx in index[start:stop])
+        return
+    ptrs, *inner_pointers = pointers
+    for idx, lo, hi in zip(index[start:stop], ptrs[start:stop], ptrs[start + 1 : stop + 1]):
+        for tail in _to_coo(inner_indices, inner_pointers, lo, hi):
+            yield (idx,) + tail
diff --git a/spz_python/spz/sparsetype.py b/spz_python/spz/sparsetype.py
new file mode 100644
index 0000000..cc5c95b
--- /dev/null
+++ b/spz_python/spz/sparsetype.py
@@ -0,0 +1,75 @@
+class StructureType:  # base class of the sparse / compressed / doubly_compressed singletons
+    def __repr__(self):
+        return self.name
+
+    def __eq__(self, other):  # NOTE: raises for objects that to_type() cannot convert
+        return self.name == to_type(other).name  # `other` may be an instance or a string alias
+
+    def __hash__(self):
+        return hash(self.name)  # hash by name, consistent with __eq__
+
+    def __reduce__(self):
+        return self.name  # a str result makes pickle look up the module global of that name
+
+
+# Singletons: each class below is instantiated once and its name is rebound to that instance
+class sparse(StructureType):
+    name = "sparse"
+    abbreviation = "S"
+
+
+class compressed(StructureType):
+    name = "compressed"
+    abbreviation = "C"
+
+
+class doubly_compressed(StructureType):
+    name = "doubly_compressed"
+    abbreviation = "DC"
+
+
+S = sparse = sparse()  # from here on `sparse` names the instance, not the class
+C = compressed = compressed()
+DC = doubly_compressed = doubly_compressed()
+
+_STR_TO_TYPE = {  # lower-case string aliases accepted by to_type()
+    "s": S,
+    "sparse": S,
+    "singleton": S,
+    "c": C,
+    "compressed": C,
+    "dc": DC,
+    "d": DC,
+    "doubly_compressed": DC,
+    "doubly compressed": DC,
+    "doublycompressed": DC,
+}
+
+
+def to_type(x):
+    """Return the StructureType for ``x``; instances pass through, strings are looked up by alias"""
+    if not isinstance(x, StructureType):
+        x = _STR_TO_TYPE[x.lower()]
+    return x
+
+
+def to_str(x):  # full name ("sparse", "compressed", ...) of a type or string alias
+    return to_type(x).name
+
+
+def abbreviate(*types):
+    """Join the abbreviations of ``types`` (varargs or one iterable); "-"-separated if any is DC"""
+    if len(types) == 1 and not isinstance(types[0], (StructureType, str)):
+        types = types[0]
+    abbvs = [to_type(x).abbreviation for x in types]
+    return ("-" if "DC" in abbvs else "").join(abbvs)
+
+
+def unabbreviate(abbr):
+    """Parse an abbreviation such as "CCS" or "DC-DC-S" into a list of StructureType singletons"""
+    rv = []
+    for sub in abbr.replace("D-", "DC-").strip("-").split("DC"):
+        rv.extend(to_type(c) for c in sub if c != "-")  # "-" is only a separator, not a type
+        rv.append(DC)
+    rv.pop()  # One extra DC
+    return rv
diff --git a/spz_python/spz/tests/__init__.py b/spz_python/spz/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/spz_python/spz/tests/conftest.py b/spz_python/spz/tests/conftest.py
new file mode 100644
index 0000000..1ce35a2
--- /dev/null
+++ b/spz_python/spz/tests/conftest.py
@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.fixture(scope="session", autouse=True)
+def ic():
+    """Install icecream's `ic` for the whole test session to ease debugging; no-op if missing"""
+    try:
+        import icecream
+    except ImportError:
+        return
+    icecream.install()
+    # icecream.ic.disable()  # do ic.enable() to re-enable
+    return icecream.ic
diff --git a/spz_python/spz/tests/test_spz.py b/spz_python/spz/tests/test_spz.py
new file mode 100644
index 0000000..d2376ef
--- /dev/null
+++ b/spz_python/spz/tests/test_spz.py
@@ -0,0 +1,28 @@
+import itertools
+
+import pandas as pd
+import pytest
+
+from spz import SPZ
+
+
+@pytest.fixture
+def indices1():  # COO input for SPZ: one row per dimension, one column per stored element
+    return [
+        [0, 0, 0, 0, 0, 1, 1, 1],
+        [0, 0, 1, 1, 1, 1, 1, 1],
+        [0, 0, 0, 0, 1, 1, 1, 1],
+        [1, 2, 0, 2, 0, 0, 1, 2],
+    ]
+
+
+@pytest.mark.parametrize("shape", [[2, 2, 2, 3], [5, 6, 7, 8], [8, 7, 6, 5]])
+def test_4d(indices1, shape):  # round trip: COO -> each 4-D structure -> COO
+    df = pd.DataFrame(indices1).T.sort_values([0, 1, 2, 3])
+    sparsities = ["S", "C", "DC"]
+    for sparsity in itertools.product(sparsities, sparsities, sparsities):
+        structure = "".join(sparsity) + "S"  # SPZ requires the final level to be S
+        spz = SPZ(indices1, shape, structure)
+        # spz._validate()
+        df2 = pd.DataFrame(spz.arrays).T  # `arrays` rebuilds the COO index arrays
+        pd.testing.assert_frame_equal(df, df2)