From aedee990590f83218c8b872ebaecbb5b660d2fd3 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Sun, 21 Aug 2022 17:28:05 -0400 Subject: [PATCH] Create Python package for exploring n-dimensional sparse arrays with different structures --- .gitignore | 117 ++++++++++++++++ spz_python/LICENSE | 29 ++++ spz_python/README.md | 0 spz_python/pyproject.toml | 5 + spz_python/requirements.txt | 2 + spz_python/setup.cfg | 56 ++++++++ spz_python/setup.py | 46 ++++++ spz_python/spz/__init__.py | 2 + spz_python/spz/_core.py | 233 +++++++++++++++++++++++++++++++ spz_python/spz/sparsetype.py | 75 ++++++++++ spz_python/spz/tests/__init__.py | 0 spz_python/spz/tests/conftest.py | 13 ++ spz_python/spz/tests/test_spz.py | 28 ++++ 13 files changed, 606 insertions(+) create mode 100644 .gitignore create mode 100644 spz_python/LICENSE create mode 100644 spz_python/README.md create mode 100644 spz_python/pyproject.toml create mode 100644 spz_python/requirements.txt create mode 100644 spz_python/setup.cfg create mode 100644 spz_python/setup.py create mode 100644 spz_python/spz/__init__.py create mode 100644 spz_python/spz/_core.py create mode 100644 spz_python/spz/sparsetype.py create mode 100644 spz_python/spz/tests/__init__.py create mode 100644 spz_python/spz/tests/conftest.py create mode 100644 spz_python/spz/tests/test_spz.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..07e76ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,117 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Generated C code +*.c + +# C extensions +*.so + +# Vi +*.swp +*.swo + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# PyCharm +.idea + +# Mac +.DS_Store diff --git a/spz_python/LICENSE b/spz_python/LICENSE new file mode 100644 index 0000000..a0cba38 --- /dev/null +++ b/spz_python/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, GraphBLAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/spz_python/README.md b/spz_python/README.md new file mode 100644 index 0000000..e69de29 diff --git a/spz_python/pyproject.toml b/spz_python/pyproject.toml new file mode 100644 index 0000000..90ccf9c --- /dev/null +++ b/spz_python/pyproject.toml @@ -0,0 +1,5 @@ +[build-system] +requires = ["setuptools", "wheel"] + +[tool.black] +line-length = 100 diff --git a/spz_python/requirements.txt b/spz_python/requirements.txt new file mode 100644 index 0000000..5da331c --- /dev/null +++ b/spz_python/requirements.txt @@ -0,0 +1,2 @@ +numpy +pandas diff --git a/spz_python/setup.cfg b/spz_python/setup.cfg new file mode 100644 index 0000000..7c3fc55 --- /dev/null +++ b/spz_python/setup.cfg @@ -0,0 +1,56 @@ +[aliases] +test=pytest + +[flake8] +max-line-length = 100 +inline-quotes = " +exclude = + versioneer.py, +ignore = + E203, # whitespace before ':' + E231, # Multiple spaces around "," + W503, # line break before binary operator +per-file-ignores = + __init__.py:F401, + spz/_core.py:B020 + +[isort] +sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER +profile = black +skip_gitignore = true +float_to_top = true +default_section = THIRDPARTY +known_first_party = spz +line_length = 100 + +[versioneer] +VCS = 
git +style = pep440 +versionfile_source = spz/_version.py +versionfile_build = spz/_version.py +tag_prefix= +parentdir_prefix=spz- + +[tool:pytest] +testpaths = spz/tests +markers: + slow: Skipped unless --runslow passed + +[coverage:run] +source = spz +omit = + spz/_version.py + +[coverage:report] +# Regexes for lines to exclude from consideration +exclude_lines = + pragma: no cover + + raise AssertionError + raise NotImplementedError + +ignore_errors = True +precision = 1 +fail_under = 0 +skip_covered = True +skip_empty = True diff --git a/spz_python/setup.py b/spz_python/setup.py new file mode 100644 index 0000000..2bc07df --- /dev/null +++ b/spz_python/setup.py @@ -0,0 +1,46 @@ +import versioneer +from setuptools import find_packages, setup + +install_requires = open("requirements.txt").read().strip().split("\n") +extras_require = { + "test": ["pytest"], + "viz": ["sphinxcontrib-svgbob"], +} +extras_require["complete"] = sorted({v for req in extras_require.values() for v in req}) + +with open("README.md") as f: + long_description = f.read() + +setup( + name="spz", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + description="Explore multidimensional sparse data structures", + long_description=long_description, + long_description_content_type="text/markdown", + author="Erik Welch", + author_email="erik.n.welch@gmail.com", + url="https://github.com/GraphBLAS/binsparse-specification", + packages=find_packages(), + license="BSD", + python_requires=">=3.8", + setup_requires=[], + install_requires=install_requires, + extras_require=extras_require, + include_package_data=True, + classifiers=[ + "Development Status :: 3 - Alpha" "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: 
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Structure types (spz_python/spz/sparsetype.py in the patch).
# They are defined ahead of SPZ so this block is self-contained.
# ---------------------------------------------------------------------------


class StructureType:
    """Base class for the per-dimension structure markers.

    Subclasses provide ``name`` and ``abbreviation``.  One instance of each
    subclass is created below and used as a singleton.  Equality also accepts
    any string spelling understood by :func:`to_type`.
    """

    def __repr__(self):
        return self.name

    def __eq__(self, other):
        try:
            other = to_type(other)
        except (KeyError, AttributeError, TypeError):
            # Not a structure type and not a known spelling: defer to Python's
            # default comparison instead of raising from ``==`` / ``!=``.
            return NotImplemented
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)

    def __reduce__(self):
        # A string return value makes pickle store a reference to the module
        # global of that name, which is rebound to the singleton below.
        return self.name


# Singletons
class sparse(StructureType):
    name = "sparse"
    abbreviation = "S"


class compressed(StructureType):
    name = "compressed"
    abbreviation = "C"


class doubly_compressed(StructureType):
    name = "doubly_compressed"
    abbreviation = "DC"


# Each class name is deliberately rebound to its only instance.
S = sparse = sparse()
C = compressed = compressed()
DC = doubly_compressed = doubly_compressed()

# Accepted (lower-cased) spellings for each structure type.
_STR_TO_TYPE = {
    "s": S,
    "sparse": S,
    "singleton": S,
    "c": C,
    "compressed": C,
    "dc": DC,
    "d": DC,
    "doubly_compressed": DC,
    "doubly compressed": DC,
    "doublycompressed": DC,
}


def to_type(x):
    """Convert a string to a StructureType.

    A ``StructureType`` instance is returned unchanged.  Strings are matched
    case-insensitively against ``_STR_TO_TYPE``; an unknown string raises
    ``KeyError``.
    """
    if isinstance(x, StructureType):
        return x
    return _STR_TO_TYPE[x.lower()]


def to_str(x):
    """Return the full name (e.g. ``"doubly_compressed"``) of a structure type."""
    return to_type(x).name


def abbreviate(*types):
    """Join the abbreviations of ``types`` into a single string.

    Accepts either several structure types / strings, or one iterable of
    them.  The parts are separated by ``"-"`` when any of them is ``DC`` and
    are concatenated directly otherwise, e.g. ``"DC-DC-S"`` or ``"SCS"``.
    """
    if len(types) == 1 and not isinstance(types[0], (StructureType, str)):
        types = types[0]
    abbvs = [to_type(x).abbreviation for x in types]
    sep = "-" if "DC" in abbvs else ""
    return sep.join(abbvs)


def unabbreviate(abbr):
    """Parse an abbreviation string into a list of structure types.

    Both compact (``"SCDCS"``) and hyphenated (``"DC-DC-S"``, ``"D-D-S"``)
    spellings are accepted, so the output of :func:`abbreviate` round-trips.
    """
    # "D-" is shorthand for "DC-".  Once it is expanded the hyphens carry no
    # information, and leaving them in would send "-" to to_type().
    compact = abbr.replace("D-", "DC-").replace("-", "")
    rv = []
    for sub in compact.split("DC"):
        rv.extend(to_type(c) for c in sub)
        rv.append(DC)
    rv.pop()  # One extra DC
    return rv


# ---------------------------------------------------------------------------
# Core container (spz_python/spz/_core.py in the patch).
# ---------------------------------------------------------------------------


def repeatrange(repeat, *args):
    """Return ``np.arange(*args)`` laid end to end ``repeat`` times.

    e.g., ``repeatrange(2, 3)`` -> ``[0, 1, 2, 0, 1, 2]``
    """
    return np.tile(np.arange(*args), repeat)


def _child_frame(df, cols, shape, structure, level):
    """Return the frame whose rows are counted to build pointers for ``level``.

    Only called when ``structure[level + 1]`` is not ``C``.  If the next level
    is ``DC`` the unique prefixes through that level are returned.  Otherwise
    the next level starts a run of ``S`` levels: the unique prefixes through
    the end of that run are taken, then extended by the level after the run
    (cross-joined with its full range for ``C``, unique prefixes for ``DC``).
    """
    if structure[level + 1] == DC:
        return df[cols[: level + 2]].drop_duplicates()
    # number of "S" immediately after this level
    nums = 0
    for item in structure[level + 1 :]:
        if item == S:
            nums += 1
        else:
            break
    after = level + nums + 1  # first level past the run of S
    frame = df[cols[:after]].drop_duplicates()
    if len(structure) > after:
        if structure[after] == C:
            frame = frame.join(pd.DataFrame({cols[after]: range(shape[after])}), how="cross")
        elif structure[after] == DC:
            frame = df[cols[: after + 1]].drop_duplicates()
    return frame


class SPZ:
    """N-dimensional sparse array stored as per-level indices and pointers.

    The array is built from COO-style index arrays (one per dimension) and
    laid out according to a per-dimension structure of ``S`` (sparse),
    ``C`` (compressed) and ``DC`` (doubly compressed) levels.
    """

    def __init__(self, arrays, shape=None, structure=None):
        """Build the level structure from COO index arrays.

        Parameters
        ----------
        arrays : list or tuple
            One 1-D array of non-negative integers per dimension, all the
            same length.
        shape : sequence of int, optional
            Size of each dimension.  Defaults to ``max(index) + 1`` per
            dimension.
        structure : str or sequence, optional
            Abbreviation string (e.g. ``"DC-DC-S"`` or ``"CCS"``) or a
            sequence of structure types / names.  Defaults to ``DC`` for
            every dimension but the last and ``S`` for the last.

        Raises
        ------
        TypeError
            If ``arrays`` is not a list or tuple.
        ValueError
            For inconsistent or out-of-range input, duplicate coordinates,
            or a structure whose final level is not ``S``.
        """
        if not isinstance(arrays, (list, tuple)):
            raise TypeError("arrays argument must be a list or tuple of numpy arrays")
        if not arrays:
            raise ValueError("At least one array must be given")
        arrays = [np.array(array) for array in arrays]
        if not all(array.ndim == 1 for array in arrays):
            raise ValueError("arrays must be a single dimension")
        size = arrays[0].size
        if not all(array.size == size for array in arrays):
            raise ValueError("arrays must be the same size")
        if not all(np.issubdtype(array.dtype, np.integer) for array in arrays):
            raise ValueError("arrays must be integer dtype")
        if not all((array >= 0).all() for array in arrays):
            # Zero is a valid index, so the requirement is non-negativity.
            raise ValueError("array values must be non-negative")

        if shape is not None:
            self._shape = tuple(shape)
            if not all(dimsize > 0 for dimsize in self._shape):
                raise ValueError("Dimension sizes must be greater than 0")
            if len(self._shape) != len(arrays):
                raise ValueError("shape must be the same length as arrays")
            if not all((array < dimsize).all() for array, dimsize in zip(arrays, self._shape)):
                raise ValueError("index in array is out of bounds")
        else:
            self._shape = tuple(int(array.max()) + 1 for array in arrays)
        # Use the resolved shape from here on; the raw argument may be None.
        shape = self._shape

        if structure is None:  # Assume CSF
            self._structure = [DC] * (len(arrays) - 1) + [S]
        elif isinstance(structure, str):
            self._structure = unabbreviate(structure)
        else:
            self._structure = unabbreviate(abbreviate(*structure))
        if len(self._structure) != len(arrays):
            raise ValueError("structure must be the same length as arrays")
        if self._structure[-1] != S:
            # C as the final dimension means "dense"
            raise ValueError("The final dimension must be sparse structural type")

        # Now the fun part!  Generate the compressed structure from COO
        df = pd.DataFrame(arrays).T.sort_values(list(range(self.ndim)))
        if df.duplicated().any():
            raise ValueError("Duplicate indices found!")

        # First create indices
        indices = []
        cols = list(df.columns)
        num_s_levels = 0  # S levels seen since the last emitted level
        prev = None
        for sparsity, level in zip(self._structure, range(df.shape[-1])):
            if sparsity == S:
                num_s_levels += 1
            elif sparsity == DC:
                subdf = df[cols[: level + 1]].drop_duplicates()
                # Emit the pending S levels together with this DC level.
                for i in range(-num_s_levels - 1, 0):
                    indices.append(subdf.iloc[:, i].values)
                num_s_levels = 0
            elif sparsity == C:
                if level == 0:
                    indices.append(np.arange(shape[level]))
                elif prev == DC:
                    subdf = df[cols[:level]].drop_duplicates()
                    indices.append(repeatrange(len(subdf), shape[level]))
                elif prev == S:
                    subdf = df[cols[:level]].drop_duplicates()
                    subdf = subdf.join(
                        pd.DataFrame({cols[level]: range(shape[level])}), how="cross"
                    )
                    for i in range(-num_s_levels - 1, 0):
                        indices.append(subdf.iloc[:, i].values)
                    num_s_levels = 0
                else:  # prev == C
                    indices.append(repeatrange(indices[-1].size, shape[level]))
            prev = sparsity
        # Trailing run of S levels (the final level is always S).
        for i in range(-num_s_levels, 0):
            indices.append(df.iloc[:, i].values)
        self._indices = indices

        # Now create pointers
        pointers = []
        for sparsity, level in zip(self._structure[:-1], range(df.shape[-1] - 1)):
            if sparsity == S:
                ptr = np.arange(indices[level].size + 1)
            elif self._structure[level + 1] == C:
                ptr = np.arange(len(indices[level]) + 1) * shape[level + 1]
                if sparsity == C:
                    # Update subdf to use later
                    if level == 0:
                        subdf = pd.DataFrame({cols[level]: range(shape[level])})
                    elif self._structure[level - 1] == C:
                        subdf = subdf.join(
                            pd.DataFrame({cols[level]: range(shape[level])}), how="cross"
                        )
                    else:
                        subdf = df[cols[:level]].drop_duplicates()
                        subdf = subdf.join(
                            pd.DataFrame({cols[level]: range(shape[level])}), how="cross"
                        )
            elif sparsity == DC:
                subdf = _child_frame(df, cols, shape, self._structure, level)
                ptr = np.zeros(indices[level].size + 1, int)
                ptr[1:] = subdf.groupby(cols[: level + 1])[cols[level + 1]].count().cumsum()
            elif sparsity == C:
                if level > 0:
                    if self._structure[level - 1] == C:
                        subdf1 = subdf
                    else:
                        subdf1 = df[cols[:level]].drop_duplicates()
                subdf = pd.DataFrame({cols[level]: range(shape[level])})
                if level > 0:
                    subdf = subdf1.join(subdf, how="cross")
                subdf2 = _child_frame(df, cols, shape, self._structure, level)
                # The left merge keeps every slot of this C level; slots with
                # no children get NaN, which counts as zero below.
                subdf3 = subdf.merge(subdf2, how="left")
                subdf3[level + 1] = subdf3[level + 1].notnull()
                ptr = np.zeros(indices[level].size + 1, int)
                ptr[1:] = subdf3.groupby(cols[: level + 1])[level + 1].sum().cumsum()
            pointers.append(ptr)
        self._pointers = pointers
        # TODO: can we detect and change sparsity type to be more efficient?
        # For example, so we don't need to store a pointers or indices.

    def as_structure(self, structure):
        """Return a new ``SPZ`` with the same coordinates and shape but another structure."""
        return SPZ(self.arrays, self.shape, structure)

    def get_index(self, dim):
        """Return the stored index array for level ``dim``."""
        return self._indices[dim]

    def get_pointers(self, dim):
        """Return the stored pointer array for level ``dim``."""
        return self._pointers[dim]

    @property
    def indices(self):
        """Index arrays per level, with ``None`` in place of every ``C`` level."""
        rv = list(self._indices)
        for i, sparsity in enumerate(self._structure):
            if sparsity == C:
                rv[i] = None
        return rv

    @property
    def pointers(self):
        """Pointer arrays per level, with the implicit ones replaced by ``None``.

        Pointers of an ``S`` level are dropped, and so are the pointers of the
        level preceding any ``C`` level other than the first.
        """
        rv = list(self._pointers)
        for i, sparsity in enumerate(self._structure[:-1]):
            if sparsity == S:
                rv[i] = None
            elif sparsity == C and i > 0:
                rv[i - 1] = None
        return rv

    @property
    def ndim(self):
        """Number of dimensions."""
        return len(self._shape)

    @property
    def shape(self):
        """Tuple with the size of each dimension."""
        return self._shape

    @property
    def structure(self):
        """List of structure types, one per dimension."""
        return self._structure

    @property
    def abbreviation(self):
        """Abbreviated structure string, e.g. ``"DC-DC-S"``."""
        return abbreviate(self._structure)

    @property
    def arrays(self):
        """COO index arrays (one per dimension) rebuilt from the stored levels."""
        return [np.array(array) for array in zip(*_to_coo(self._indices, self._pointers))]


def _to_coo(indices, pointers, start=0, stop=None):
    """Yield coordinate tuples by walking the level structure depth-first.

    ``indices`` and ``pointers`` are the remaining per-level arrays, and
    ``start``/``stop`` bound the slice of the first index array to visit.
    """
    index, *indices = indices
    if stop is None:
        stop = len(index)
    if not indices:
        # Last level: every index in the slice is a leaf.
        for idx in index[start:stop]:
            yield (idx,)
        return
    ptrs, *pointers = pointers
    # ptrs[k]:ptrs[k + 1] is the child slice belonging to index[k].
    spans = zip(index[start:stop], ptrs[start:stop], ptrs[start + 1 : stop + 1])
    for idx, child_start, child_stop in spans:
        for indexes in _to_coo(indices, pointers, child_start, child_stop):
            yield (idx,) + indexes
import itertools

import pandas as pd
import pytest

from spz import SPZ

# ---------------------------------------------------------------------------
# spz_python/spz/tests/conftest.py
# ---------------------------------------------------------------------------


@pytest.fixture(scope="session", autouse=True)
def ic():
    """Make `ic` available everywhere for easier debugging.

    Returns ``icecream.ic`` after calling ``icecream.install()``, or ``None``
    when icecream cannot be imported.
    """
    try:
        import icecream
    except ImportError:
        return None
    icecream.install()
    # icecream.ic.disable()  # do ic.enable() to re-enable
    return icecream.ic


# ---------------------------------------------------------------------------
# spz_python/spz/tests/test_spz.py
# ---------------------------------------------------------------------------

_LEVEL_TYPES = ["S", "C", "DC"]
# Every combination for the first three dimensions, each followed by "S"
# for the fourth.
_STRUCTURES = ["".join(combo) + "S" for combo in itertools.product(_LEVEL_TYPES, repeat=3)]


@pytest.fixture
def indices1():
    """Eight 4-D coordinates, one list per dimension, already in sorted order."""
    return [
        [0, 0, 0, 0, 0, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1, 1],
        [1, 2, 0, 2, 0, 0, 1, 2],
    ]


# One test case per (shape, structure) pair, so a failure names the exact
# structure and does not hide the remaining combinations.
@pytest.mark.parametrize("structure", _STRUCTURES)
@pytest.mark.parametrize("shape", [[2, 2, 2, 3], [5, 6, 7, 8], [8, 7, 6, 5]])
def test_4d(indices1, shape, structure):
    """Coordinates must survive a COO -> SPZ -> COO round trip unchanged."""
    expected = pd.DataFrame(indices1).T.sort_values([0, 1, 2, 3])
    spz = SPZ(indices1, shape, structure)
    # spz._validate()
    roundtrip = pd.DataFrame(spz.arrays).T
    pd.testing.assert_frame_equal(expected, roundtrip)