From 5a3a1084c273cccd623400237a0a095fa65e2e96 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 6 Sep 2022 18:47:21 -0500 Subject: [PATCH] JSON keys must be strings (oops) --- design_docs/01_rankN_arrays.md | 6 +++-- minutes/2022-09-07-strawman.html | 44 +++++++++++++++---------------- minutes/2022-09-07-strawman.ipynb | 44 +++++++++++++++---------------- minutes/2022-09-07-strawman.md | 44 +++++++++++++++---------------- spz_python/spz/_core.py | 37 ++++++++++++++++++++++++-- 5 files changed, 105 insertions(+), 70 deletions(-) diff --git a/design_docs/01_rankN_arrays.md b/design_docs/01_rankN_arrays.md index c0dae16..af47f3f 100644 --- a/design_docs/01_rankN_arrays.md +++ b/design_docs/01_rankN_arrays.md @@ -94,7 +94,7 @@ CSR, CSC, DCSR, and DCSC are familiar and keep their original meanings. #### COO -SSR and SSC in the previous table are new and are instances of COO with sorted indices and no duplicates (according to our simplifying assumptions). SSR indices are lexicographically sorted by row then column, and a backronym could be "sorted sparse rows". Conversely, SSC indices are lexicographically sorted by column then row, and a backronym could be "sorted sparse columns". +SSR and SSC in the previous table are new and are instances of COO with sorted indices and no duplicates (according to our simplifying assumptions). SSR indices are lexicographically sorted by row then column, and a backronym could be "sorted sparse rows" (or "simple" or "standard" instead of "sorted"). Conversely, SSC indices are lexicographically sorted by column then row, and a backronym could be "sorted sparse columns". We can consider also keeping COO as a type of compression. COO does not need to be sorted, and metadata can indicate whether or not it may have duplicate indices. @@ -202,12 +202,14 @@ The main difference from our proposal is the use of `pointer_i`. In TACO and ML In TACO, DCSR is constructed using "compressed", "compressed" dimensions. 
This results in _two_ `pointers` arrays, which is not standard DCSR. Only one `pointers` array is necessary. In MLIR, there are two ways to specify DCSR arrays. Our proposal only has one way to specify DCSR, and it does not have an extra, unnecessary `pointers` array. -It is straightforward to convert from our specification to MLIR sparse tensor specification: +It is straightforward to convert from our specification to MLIR sparse tensor specification with only "compressed" and "dense" dimensions: 1. Increment `pointers` arrays, so e.g. `pointers_0` becomes `mlir_pointers_1` 2. If a dimension has both `mlir_pointers_i` and `indices_i`, then it is MLIR's "compressed" 3. If a dimension has only `indices_i`, then it is MLIR's "singleton" 4. If a dimension does not have `mlir_pointers_i` or `indices_i`, then it is MLIR's "dense" +Converting to TACO-style with "singleton" and "compressed-nonunique" is significantly more difficult. + ### Extensions #### Multigraph support diff --git a/minutes/2022-09-07-strawman.html b/minutes/2022-09-07-strawman.html index 928ef57..3a16e01 100755 --- a/minutes/2022-09-07-strawman.html +++ b/minutes/2022-09-07-strawman.html @@ -14822,7 +14822,7 @@

Version 2.0 "dim_types": ["compressed", "sparse"], "dim_order": [0, 1], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], @@ -14838,7 +14838,7 @@

Version 2.0 "dim_types": ["compressed", "sparse"], "dim_order": [1, 0], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [n, m], @@ -14858,8 +14858,8 @@

Version 2.0 "dim_order": [0, 1], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -14877,8 +14877,8 @@

Version 2.0 "dim_order": [1, 0], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -14899,8 +14899,8 @@

Version 2.0 "dim_order": [0, 1], "dim_properties": { - 0: {"is_ordered": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -14918,7 +14918,7 @@

Version 2.0 "dim_order": [0], "dim_properties": { - 0: {"is_ordered": true} + "0": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -14984,13 +14984,13 @@

dim_propertiesValue arrays&# "dim_types": ["compressed", "sparse"], "dim_order": [0, 1], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], @@ -15146,8 +15146,8 @@

Value arrays&# "dim_order": [1, 0], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -15188,7 +15188,7 @@

Value arrays&# "dim_types": ["compressed", "sparse"], "dim_order": [1, 0], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [n, m], @@ -15207,8 +15207,8 @@

Value arrays&# "dim_order": [0, 1], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -15249,7 +15249,7 @@

Value arrays&# "dim_types": ["sparse", "full"], "dim_order": [0, 1], - "dim_properties": {0: {"is_ordered": true}}, + "dim_properties": {"0": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], @@ -15266,8 +15266,8 @@

Value arrays&# "dim_order": [0, 1, 2], "dim_properties": { - 0: {"is_ordered": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -15304,7 +15304,7 @@

Value arrays&# "dim_order": [0, 1], "dim_properties": - {1: {"is_ordered": true, "is_unique": true} + {"1": {"is_ordered": true, "is_unique": true} }, "properties": {"has_duplicates": true}, @@ -15321,7 +15321,7 @@

Value arrays&# "dim_types": ["compressed", "sparse"], "dim_order": [0, 1], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], diff --git a/minutes/2022-09-07-strawman.ipynb b/minutes/2022-09-07-strawman.ipynb index f576ce3..449d70c 100644 --- a/minutes/2022-09-07-strawman.ipynb +++ b/minutes/2022-09-07-strawman.ipynb @@ -247,7 +247,7 @@ " \"dim_types\": [\"compressed\", \"sparse\"],\n", " \"dim_order\": [0, 1],\n", "\n", - " \"dim_properties\": {1: {\"is_ordered\": true}},\n", + " \"dim_properties\": {\"1\": {\"is_ordered\": true}},\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", " \"dim_sizes\": [m, n],\n", @@ -266,7 +266,7 @@ " \"dim_types\": [\"compressed\", \"sparse\"],\n", " \"dim_order\": [1, 0],\n", "\n", - " \"dim_properties\": {1: {\"is_ordered\": true}},\n", + " \"dim_properties\": {\"1\": {\"is_ordered\": true}},\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", " \"dim_sizes\": [n, m],\n", @@ -289,8 +289,8 @@ " \"dim_order\": [0, 1],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true, \"is_unique\": true},\n", - " 1: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true, \"is_unique\": true},\n", + " \"1\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", @@ -311,8 +311,8 @@ " \"dim_order\": [1, 0],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true, \"is_unique\": true},\n", - " 1: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true, \"is_unique\": true},\n", + " \"1\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", @@ -336,8 +336,8 @@ " \"dim_order\": [0, 1],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true},\n", - " 1: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true},\n", + " \"1\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": 
false},\n", "\n", @@ -358,7 +358,7 @@ " \"dim_order\": [0],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", @@ -410,11 +410,11 @@ " - Are the indices in `indices_i` in order for `\"sparse\"` or `\"doubly_compressed\"` dimensions?\n", " - Indicates whether `indices_0` is sorted\n", " - Indicates whether `indices_i` with the same previous indices (i.e., between `pointers_{i-1}` boundaries) is sorted\n", - " - TACO (paper) calls this `ordered`\n", + " - TACO (paper) _may_ call this `ordered`\n", " - SuiteSparse:GraphBLAS uses `jumbled`\n", " - `is_unique` property\n", " - Indicates whether `indices_i` has no duplicates for `\"doubly_compressed\"` dimensions\n", - " - TACO (paper) calls this `unique`\n", + " - TACO (paper) _may_ call this `unique`\n", " - If `false`, then this conceptually splits a single index tree into multiple trees\n", "\n", "### `properties`\n", @@ -571,7 +571,7 @@ " \"dim_types\": [\"compressed\", \"sparse\"],\n", " \"dim_order\": [0, 1],\n", "\n", - " \"dim_properties\": {1: {\"is_ordered\": true}},\n", + " \"dim_properties\": {\"1\": {\"is_ordered\": true}},\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", " \"dim_sizes\": [m, n],\n", @@ -596,8 +596,8 @@ " \"dim_order\": [1, 0],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true, \"is_unique\": true},\n", - " 1: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true, \"is_unique\": true},\n", + " \"1\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", @@ -647,7 +647,7 @@ " \"dim_types\": [\"compressed\", \"sparse\"],\n", " \"dim_order\": [1, 0],\n", "\n", - " \"dim_properties\": {1: {\"is_ordered\": true}},\n", + " \"dim_properties\": {\"1\": {\"is_ordered\": true}},\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", " \"dim_sizes\": [n, m],\n", @@ -672,8 +672,8 @@ " \"dim_order\": 
[0, 1],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true, \"is_unique\": true},\n", - " 1: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true, \"is_unique\": true},\n", + " \"1\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", @@ -722,7 +722,7 @@ " \"dim_types\": [\"sparse\", \"full\"],\n", " \"dim_order\": [0, 1],\n", "\n", - " \"dim_properties\": {0: {\"is_ordered\": true}},\n", + " \"dim_properties\": {\"0\": {\"is_ordered\": true}},\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", " \"dim_sizes\": [m, n],\n", @@ -745,8 +745,8 @@ " \"dim_order\": [0, 1, 2],\n", "\n", " \"dim_properties\": {\n", - " 0: {\"is_ordered\": true},\n", - " 1: {\"is_ordered\": true}\n", + " \"0\": {\"is_ordered\": true},\n", + " \"1\": {\"is_ordered\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", @@ -793,7 +793,7 @@ " \"dim_order\": [0, 1],\n", "\n", " \"dim_properties\":\n", - " {1: {\"is_ordered\": true, \"is_unique\": true}\n", + " {\"1\": {\"is_ordered\": true, \"is_unique\": true}\n", " },\n", " \"properties\": {\"has_duplicates\": true},\n", "\n", @@ -814,7 +814,7 @@ " \"dim_types\": [\"compressed\", \"sparse\"],\n", " \"dim_order\": [0, 1],\n", "\n", - " \"dim_properties\": {1: {\"is_ordered\": true}},\n", + " \"dim_properties\": {\"1\": {\"is_ordered\": true}},\n", " \"properties\": {\"has_duplicates\": false},\n", "\n", " \"dim_sizes\": [m, n],\n", diff --git a/minutes/2022-09-07-strawman.md b/minutes/2022-09-07-strawman.md index 02efd20..c4337a2 100755 --- a/minutes/2022-09-07-strawman.md +++ b/minutes/2022-09-07-strawman.md @@ -216,7 +216,7 @@ Data: "dim_types": ["compressed", "sparse"], "dim_order": [0, 1], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], @@ -235,7 +235,7 @@ Data: "dim_types": ["compressed", "sparse"], "dim_order": [1, 0], - 
"dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [n, m], @@ -258,8 +258,8 @@ Data: "dim_order": [0, 1], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -280,8 +280,8 @@ Data: "dim_order": [1, 0], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -305,8 +305,8 @@ Data: "dim_order": [0, 1], "dim_properties": { - 0: {"is_ordered": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -327,7 +327,7 @@ Data: "dim_order": [0], "dim_properties": { - 0: {"is_ordered": true} + "0": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -373,11 +373,11 @@ Data: - Are the indices in `indices_i` in order for `"sparse"` or `"doubly_compressed"` dimensions? 
- Indicates whether `indices_0` is sorted - Indicates whether `indices_i` with the same previous indices (i.e., between `pointers_{i-1}` boundaries) is sorted - - TACO (paper) calls this `ordered` + - TACO (paper) _may_ call this `ordered` - SuiteSparse:GraphBLAS uses `jumbled` - `is_unique` property - Indicates whether `indices_i` has no duplicates for `"doubly_compressed"` dimensions - - TACO (paper) calls this `unique` + - TACO (paper) _may_ call this `unique` - If `false`, then this conceptually splits a single index tree into multiple trees ### `properties` @@ -528,7 +528,7 @@ Data: "dim_types": ["compressed", "sparse"], "dim_order": [0, 1], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], @@ -553,8 +553,8 @@ Data: "dim_order": [1, 0], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -604,7 +604,7 @@ Data: "dim_types": ["compressed", "sparse"], "dim_order": [1, 0], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [n, m], @@ -629,8 +629,8 @@ Data: "dim_order": [0, 1], "dim_properties": { - 0: {"is_ordered": true, "is_unique": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true, "is_unique": true}, + "1": {"is_ordered": true} }, "properties": {"has_duplicates": false}, @@ -679,7 +679,7 @@ Data: "dim_types": ["sparse", "full"], "dim_order": [0, 1], - "dim_properties": {0: {"is_ordered": true}}, + "dim_properties": {"0": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], @@ -702,8 +702,8 @@ Data: "dim_order": [0, 1, 2], "dim_properties": { - 0: {"is_ordered": true}, - 1: {"is_ordered": true} + "0": {"is_ordered": true}, + "1": {"is_ordered": true} }, 
"properties": {"has_duplicates": false}, @@ -750,7 +750,7 @@ Data: "dim_order": [0, 1], "dim_properties": - {1: {"is_ordered": true, "is_unique": true} + {"1": {"is_ordered": true, "is_unique": true} }, "properties": {"has_duplicates": true}, @@ -771,7 +771,7 @@ Data: "dim_types": ["compressed", "sparse"], "dim_order": [0, 1], - "dim_properties": {1: {"is_ordered": true}}, + "dim_properties": {"1": {"is_ordered": true}}, "properties": {"has_duplicates": false}, "dim_sizes": [m, n], diff --git a/spz_python/spz/_core.py b/spz_python/spz/_core.py index ef8a2dd..be4a1e0 100644 --- a/spz_python/spz/_core.py +++ b/spz_python/spz/_core.py @@ -233,10 +233,43 @@ def as_structure(self, structure): return SPZ(self.arrays, self.shape, structure) def get_index(self, dim): - return self._indices[dim] + # Let's demonstrate how to compute indices that don't need to be stored + dim = range(self.ndim)[dim] # Make dim positive + if self._structure[dim] == C: + size = 1 + for cur in reversed(range(dim)): + if self._structure[cur] == C: + size *= self._shape[cur] + else: + size *= len(self._indices[cur]) + if self._structure[cur] == S: + size //= self._shape[cur + 1] + break + return repeatrange(size, self._shape[dim]) + else: + return self._indices[dim] def get_pointers(self, dim): - return self._pointers[dim] + # Let's demonstrate how to compute pointers that don't need to be stored + dim = range(self.ndim)[dim] # Make dim positive + if self._structure[dim] == S: + return np.arange(len(self._indices[dim]) + 1) + elif self._structure[dim + 1] == C: + if self._structure[dim] == DC: + size = len(self._indices[dim]) + else: + size = self._shape[dim] + for cur in reversed(range(dim)): + if self._structure[cur] == C: + size *= self._shape[cur] + else: + size *= len(self._indices[cur]) + if self._structure[cur] == S: + size //= self._shape[cur + 1] + break + return np.arange(size + 1) * self._shape[dim + 1] + else: + return self._pointers[dim] @property def indices(self):