Skip to content

1 test fails: ZeroDivisionError: division by zero #239

@yurivict

Description

@yurivict
========================================================================================= FAILURES =========================================================================================
_________________________________________________________________________ test_bitpacked_nn_descent_query_accuracy _________________________________________________________________________

nn_data = array([[0.52111531, 0.77647716, 0.93834037, 0.66185582, 0.19981062],
       [0.43632302, 0.16532886, 0.67949223, 0.587... 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

    def test_bitpacked_nn_descent_query_accuracy(nn_data):
        bitpacked_data = (nn_data * 256).astype(np.uint8)
        unpacked_data = np.zeros(
            (bitpacked_data.shape[0], bitpacked_data.shape[1] * 8), dtype=np.float32
        )
        for i in range(unpacked_data.shape[0]):
            for j in range(unpacked_data.shape[1]):
                unpacked_data[i, j] = (bitpacked_data[i, j // 8] & (1 << (j % 8))) > 0
    
>       nnd = NNDescent(
            bitpacked_data[200:], "bit_jaccard", n_neighbors=50, random_state=None
        )

pynndescent/tests/test_pynndescent_.py:207: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <pynndescent.pynndescent_.NNDescent object at 0x24f3964bdc10>
data = array([[ 91, 127, 169,   1,  27],
       [ 51, 148,  22, 235,   2],
       [170, 219,  25, 239, 176],
       ...,
       [ 44, 244,  81,  85, 237],
       [  0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0]], dtype=uint8)
metric = 'bit_jaccard', metric_kwds = {}, n_neighbors = 50, n_trees = 10, leaf_size = None, pruning_degree_multiplier = 1.5, diversify_prob = 1.0, n_search_trees = 1, tree_init = True
init_graph = None, init_dist = None, random_state = None, low_memory = True, max_candidates = None, max_rptree_depth = 200, n_iters = 10, delta = 0.001, n_jobs = None, compressed = False
parallel_batch_queries = False, verbose = False

    def __init__(
        self,
        data,
        metric="euclidean",
        metric_kwds=None,
        n_neighbors=30,
        n_trees=None,
        leaf_size=None,
        pruning_degree_multiplier=1.5,
        diversify_prob=1.0,
        n_search_trees=1,
        tree_init=True,
        init_graph=None,
        init_dist=None,
        random_state=None,
        low_memory=True,
        max_candidates=None,
        max_rptree_depth=200,
        n_iters=None,
        delta=0.001,
        n_jobs=None,
        compressed=False,
        parallel_batch_queries=False,
        verbose=False,
    ):
    
        if n_trees is None:
            n_trees = 5 + int(round((data.shape[0]) ** 0.25))
            n_trees = min(32, n_trees)  # Only so many trees are useful
        if n_iters is None:
            n_iters = max(5, int(round(np.log2(data.shape[0]))))
    
        self.n_trees = n_trees
        self.n_trees_after_update = max(1, int(np.round(self.n_trees / 3)))
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwds = metric_kwds
        self.leaf_size = leaf_size
        self.prune_degree_multiplier = pruning_degree_multiplier
        self.diversify_prob = diversify_prob
        self.n_search_trees = n_search_trees
        self.max_rptree_depth = max_rptree_depth
        self.max_candidates = max_candidates
        self.low_memory = low_memory
        self.n_iters = n_iters
        self.delta = delta
        self.dim = data.shape[1]
        self.n_jobs = n_jobs
        self.compressed = compressed
        self.parallel_batch_queries = parallel_batch_queries
        self.verbose = verbose
    
        if getattr(data, "dtype", None) == np.float32 and (
            issparse(data) or is_c_contiguous(data)
        ):
            copy_on_normalize = True
        else:
            copy_on_normalize = False
    
        if metric in ("bit_hamming", "bit_jaccard"):
            data = check_array(data, dtype=np.uint8, order="C")
            self._input_dtype = np.uint8
        else:
            data = check_array(data, dtype=np.float32, accept_sparse="csr", order="C")
            self._input_dtype = np.float32
    
        self._raw_data = data
    
        if not tree_init or n_trees == 0 or init_graph is not None:
            self.tree_init = False
        else:
            self.tree_init = True
    
        metric_kwds = metric_kwds or {}
        self._dist_args = tuple(metric_kwds.values())
    
        self.random_state = random_state
    
        current_random_state = check_random_state(self.random_state)
    
        self._distance_correction = None
    
        if callable(metric):
            _distance_func = metric
        elif metric in pynnd_dist.named_distances:
            if metric in pynnd_dist.fast_distance_alternatives:
                _distance_func = pynnd_dist.fast_distance_alternatives[metric]["dist"]
                self._distance_correction = pynnd_dist.fast_distance_alternatives[
                    metric
                ]["correction"]
            else:
                _distance_func = pynnd_dist.named_distances[metric]
        else:
            raise ValueError("Metric is neither callable, " + "nor a recognised string")
    
        # Create a partial function for distances with arguments
        if len(self._dist_args) > 0:
            dist_args = self._dist_args
    
            @numba.njit()
            def _partial_dist_func(x, y):
                return _distance_func(x, y, *dist_args)
    
            self._distance_func = _partial_dist_func
        else:
            self._distance_func = _distance_func
    
        if metric in (
            "cosine",
            "dot",
            "correlation",
            "dice",
            "jaccard",
            "hellinger",
            "hamming",
            "bit_hamming",
            "bit_jaccard",
        ):
            self._angular_trees = True
            if metric in ("bit_hamming", "bit_jaccard"):
                self._bit_trees = True
            else:
                self._bit_trees = False
        else:
            self._angular_trees = False
            self._bit_trees = False
    
        if metric == "dot":
            data = normalize(data, norm="l2", copy=copy_on_normalize)
            self._raw_data = data
    
        self.rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(
            np.int64
        )
        self.search_rng_state = current_random_state.randint(
            INT32_MIN, INT32_MAX, 3
        ).astype(np.int64)
        # Warm up the rng state
        for i in range(10):
            _ = tau_rand_int(self.search_rng_state)
    
        if self.tree_init:
            if verbose:
                print(ts(), "Building RP forest with", str(n_trees), "trees")
            self._rp_forest = make_forest(
                data,
                n_neighbors,
                n_trees,
                leaf_size,
                self.rng_state,
                current_random_state,
                self.n_jobs,
                self._angular_trees,
                self._bit_trees,
                max_depth=self.max_rptree_depth,
            )
            leaf_array = rptree_leaf_array(self._rp_forest)
        else:
            self._rp_forest = None
            leaf_array = np.array([[-1]])
    
        if self.max_candidates is None:
            effective_max_candidates = min(60, self.n_neighbors)
        else:
            effective_max_candidates = self.max_candidates
    
        # Set threading constraints
        self._original_num_threads = numba.get_num_threads()
        if self.n_jobs != -1 and self.n_jobs is not None:
            numba.set_num_threads(self.n_jobs)
    
        if isspmatrix_csr(self._raw_data):
    
            self._is_sparse = True
    
            if not self._raw_data.has_sorted_indices:
                self._raw_data.sort_indices()
    
            if metric in sparse.sparse_named_distances:
                if metric in sparse.sparse_fast_distance_alternatives:
                    _distance_func = sparse.sparse_fast_distance_alternatives[metric][
                        "dist"
                    ]
                    self._distance_correction = (
                        sparse.sparse_fast_distance_alternatives[metric]["correction"]
                    )
                else:
                    _distance_func = sparse.sparse_named_distances[metric]
            elif callable(metric):
                _distance_func = metric
            else:
                raise ValueError(
                    "Metric {} not supported for sparse data".format(metric)
                )
    
            if metric in sparse.sparse_need_n_features:
                metric_kwds["n_features"] = self._raw_data.shape[1]
            self._dist_args = tuple(metric_kwds.values())
    
            # Create a partial function for distances with arguments
            if len(self._dist_args) > 0:
    
                dist_args = self._dist_args
    
                @numba.njit()
                def _partial_dist_func(ind1, data1, ind2, data2):
                    return _distance_func(ind1, data1, ind2, data2, *dist_args)
    
                self._distance_func = _partial_dist_func
            else:
                self._distance_func = _distance_func
    
            if init_graph is None:
                _init_graph = EMPTY_GRAPH
            else:
                if init_graph.shape[0] != self._raw_data.shape[0]:
                    raise ValueError("Init graph size does not match dataset size!")
                _init_graph = make_heap(init_graph.shape[0], self.n_neighbors)
                _init_graph = sparse_initalize_heap_from_graph_indices(
                    _init_graph,
                    init_graph,
                    self._raw_data.indptr,
                    self._raw_data.indices,
                    self._raw_data.data,
                    self._distance_func,
                )
    
            if verbose:
                print(ts(), "metric NN descent for", str(n_iters), "iterations")
    
            self._neighbor_graph = sparse_nnd.nn_descent(
                self._raw_data.indices,
                self._raw_data.indptr,
                self._raw_data.data,
                self.n_neighbors,
                self.rng_state,
                max_candidates=effective_max_candidates,
                dist=self._distance_func,
                n_iters=self.n_iters,
                delta=self.delta,
                rp_tree_init=True,
                leaf_array=leaf_array,
                init_graph=_init_graph,
                low_memory=self.low_memory,
                verbose=verbose,
            )
    
        else:
    
            self._is_sparse = False
    
            if init_graph is None:
                _init_graph = EMPTY_GRAPH
            else:
                if init_graph.shape[0] != self._raw_data.shape[0]:
                    raise ValueError("Init graph size does not match dataset size!")
                _init_graph = make_heap(init_graph.shape[0], self.n_neighbors)
                if init_dist is None:
                    _init_graph = initalize_heap_from_graph_indices(
                        _init_graph, init_graph, data, self._distance_func
                    )
                elif init_graph.shape != init_dist.shape:
                    raise ValueError(
                        "The shapes of init graph and init distances do not match!"
                    )
                else:
                    _init_graph = initalize_heap_from_graph_indices_and_distances(
                        _init_graph, init_graph, init_dist
                    )
    
            if verbose:
                print(ts(), "NN descent for", str(n_iters), "iterations")
    
>           self._neighbor_graph = nn_descent(
                self._raw_data,
                self.n_neighbors,
                self.rng_state,
                effective_max_candidates,
                self._distance_func,
                self.n_iters,
                self.delta,
                low_memory=self.low_memory,
                rp_tree_init=True,
                init_graph=_init_graph,
                leaf_array=leaf_array,
                verbose=verbose,
            )
E           ZeroDivisionError: division by zero

pynndescent/pynndescent_.py:946: ZeroDivisionError
===================================================================================== warnings summary =====================================================================================
pynndescent/tests/test_distances.py::test_bit_jaccard
pynndescent/tests/test_pynndescent_.py::test_bitpacked_nn_descent_neighbor_accuracy
  /usr/local/lib/python3.9/site-packages/sklearn/metrics/pairwise.py:2317: DataConversionWarning: Data was converted to boolean for metric jaccard
    warnings.warn(msg, DataConversionWarning)

pynndescent/tests/test_pynndescent_.py::test_bitpacked_nn_descent_neighbor_accuracy
  /usr/ports/math/py-pynndescent/work-py39/pynndescent-0.5.12/pynndescent/pynndescent_.py:962: UserWarning: Failed to correctly find n_neighbors for some samples. Results may be less than ideal. Try re-running with different parameters.
    warn(

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
================================================================================= short test summary info ==================================================================================
SKIPPED [1] pynndescent/tests/test_distances.py:245: incorrect function in scipy<1.8
============================================================= 1 failed, 145 passed, 1 skipped, 3 warnings in 575.60s (0:09:35) =============================================================
*** Error code 1

pynndescent version: 0.5.12
Python version: 3.9
Operating system: FreeBSD 14.0

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions