Skip to content

Commit f02ff35

Browse files
Make extensions for euclidean distance more memory efficient (#19)
Uses more efficient data structures and returns pairs instead of a flattened, sparse distance matrix to improve deduplication.
1 parent 773a7a5 commit f02ff35

4 files changed

Lines changed: 164 additions & 26 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,5 @@ notebooks
3434

3535
# Extension artifacts
3636
*.c
37+
*.cpp
3738
*.so

perception/extensions.pyx

Lines changed: 155 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
# distutils: extra_compile_args=-fopenmp
22
# distutils: extra_link_args=-fopenmp
33
# cython: language_level=3
4+
# cython: language=c++
45

6+
import sys
57
import math
68
import numpy as np
79
import cython
810
from cython.parallel import prange, parallel
911
from libc.stdlib cimport abort, malloc, free
12+
from libcpp cimport bool as cppbool
13+
from libcpp.vector cimport vector
1014

1115
cimport numpy as np
1216
cdef extern from "limits.h":
@@ -16,7 +20,7 @@ ctypedef np.uint8_t uint8
1620

1721
@cython.boundscheck(False)
1822
@cython.wraparound(False)
19-
def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts: int[:] = None, compute_overlap=False):
23+
def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts: np.uint32_t[:] = None, compute_overlap=False):
2024
"""Find the pairwise overlap within an array of vectors, where there may be multiple
2125
vectors for the same file. This function is faster than using scipy.spatial.distance
2226
because it computes distances in parallel, avoids computing full distances when they're
@@ -48,12 +52,13 @@ def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts:
4852
- 50% of file 2 was in file 3 and 25% of file 3 was in file 2
4953
"""
5054
if counts is None:
51-
counts_arr = np.ones(X.shape[0], dtype=np.int32)
52-
counts = counts_arr
55+
counts = np.ones(X.shape[0], dtype=np.uint32)
5356
cdef Py_ssize_t n = X.shape[0]
5457
cdef Py_ssize_t m = counts.shape[0]
5558
cdef Py_ssize_t d = X.shape[1]
56-
cdef Py_ssize_t n_pairs = int(math.factorial(m)/(2*math.factorial(m-2)))
59+
n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2)))
60+
assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'
61+
cdef Py_ssize_t n_pairs = n_pairs_python
5762
cdef Py_ssize_t max_counts = np.max(counts)
5863
cdef int compute_overlap_int = 0
5964
if compute_overlap:
@@ -71,10 +76,11 @@ def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts:
7176
for i_1 in range(m):
7277
for i_i in range(i_1):
7378
offsets[i_1] += counts[i_i]
74-
cdef size_t local_buf_size = 4 # distance, flattened array offset, index_offset_1, index_offset_2
79+
# local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2
80+
cdef size_t local_buf_size = 4
7581
cdef float threshold2 = threshold ** 2
7682
with nogil, parallel():
77-
local_buf = <int *> malloc(sizeof(int) * local_buf_size)
83+
local_buf = <np.uint64_t *> malloc(sizeof(np.uint64_t) * local_buf_size)
7884

7985
# An array of flags indicating whether a vector in file 1 was
8086
# matched.
@@ -139,3 +145,146 @@ def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts:
139145
free(matched_1)
140146
free(matched_2)
141147
return duplicate_arr
148+
149+
150+
@cython.boundscheck(False)
151+
@cython.wraparound(False)
152+
def compute_euclidean_pairwise_duplicates_simple(int[:, :] X, float threshold, np.uint32_t[:] counts = None, float minimum_overlap = 0):
153+
"""Find the pairwise overlap within an array of vectors, where there may be multiple
154+
vectors for the same file. This function is similar to compute_euclidean_pairwise_duplicates
155+
but uses much less memory.
156+
157+
Args:
158+
X: The vectors with shape (N, D). Vectors for the same file need to be
159+
supplied sequentially so that we can use the counts argument
160+
to determine which vectors are for the same file.
161+
counts: For each of the M files, the number of sequential vectors in X.
162+
If not provided, each vector is assumed to be for a different file (i.e.,
163+
this is equivalent to `counts = np.ones(N)` which also implies M == N).
164+
Otherwise, assumed to have length M. The counts should add up to N.
165+
minimum_overlap: The minimum overlap between two groups of hashes to
166+
call it a match.
167+
168+
Returns:
169+
pairs: Pairs of indexes that met the matching criteria.
170+
"""
171+
if counts is None:
172+
counts_arr = np.ones(X.shape[0], dtype=np.uint32)
173+
counts = counts_arr
174+
cdef Py_ssize_t n = X.shape[0]
175+
cdef Py_ssize_t m = counts.shape[0]
176+
cdef Py_ssize_t d = X.shape[1]
177+
n_pairs_python = int(math.factorial(m)/(2*math.factorial(m-2)))
178+
assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'
179+
cdef Py_ssize_t n_pairs = n_pairs_python
180+
cdef Py_ssize_t max_counts = np.max(counts)
181+
# i_1 is the index of file1, i_2 is the index of file2, i_d is the
182+
# index of the vector dimension we're on, i_i is used to compute
183+
# the starting index in the flattened vector in the different threads.
184+
# i_1_subhash is the index of the hash on file1, i_2_subhash is
185+
# the index of the hash on file2.
186+
cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub
187+
cdef vector[cppbool] duplicate
188+
duplicate.resize(n_pairs)
189+
offsets_arr = np.zeros(m, dtype=np.uint64)
190+
cdef np.uint64_t[:] offsets = offsets_arr
191+
cdef np.int32_t expected_n = 0
192+
for i_1 in range(m):
193+
for i_i in range(i_1):
194+
offsets[i_1] += counts[i_i]
195+
expected_n += counts[i_1]
196+
assert expected_n == n, "Provided value for counts is inconsistent with X."
197+
# local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2
198+
cdef size_t local_buf_size = 4
199+
cdef float threshold2 = threshold ** 2
200+
with nogil, parallel():
201+
local_buf = <np.uint64_t *> malloc(sizeof(np.uint64_t) * local_buf_size)
202+
203+
# An array of flags indicating whether a vector in file 1 was
204+
# matched.
205+
matched_1 = <int *> malloc(sizeof(int) * max_counts)
206+
207+
# An array of flags indicating whether a vector in file 2 was
208+
# matched.
209+
matched_2 = <int *> malloc(sizeof(int) * max_counts)
210+
211+
# Pair overlap
212+
overlap = <float *> malloc(sizeof(float) * 2)
213+
214+
if local_buf is NULL or matched_1 is NULL or matched_2 is NULL or overlap is NULL:
215+
abort()
216+
# Iterate over all of the files.
217+
for i_1 in prange(m-1):
218+
local_buf[1] = 0
219+
local_buf[2] = offsets[i_1]
220+
# Compute the index of the output vector
221+
# where we will count the number of duplicates.
222+
for i_i in range(i_1):
223+
local_buf[1] += m - i_i - 1
224+
# Iterate over all the other files to compare.
225+
for i_2 in range(i_1 + 1, m):
226+
overlap[0] = 0
227+
overlap[1] = 0
228+
local_buf[3] = offsets[i_2]
229+
# Initialize all match flags to zero for
230+
# both file 1 and file 2.
231+
for i_1_sub in range(counts[i_1]):
232+
matched_1[i_1_sub] = 0
233+
for i_2_sub in range(counts[i_2]):
234+
matched_2[i_2_sub] = 0
235+
# Iterate over all the hashes in file1
236+
for i_1_sub in range(counts[i_1]):
237+
# Iterate over all the hashes in file2
238+
for i_2_sub in range(counts[i_2]):
239+
local_buf[0] = 0
240+
if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1:
241+
# Both the vectors in this pair have already been matched, so
242+
# there is nothing to gain from this comparison.
243+
continue
244+
for i_d in range(d):
245+
local_buf[0] += (X[local_buf[2] + i_1_sub, i_d] - X[local_buf[3] + i_2_sub, i_d]) ** 2
246+
if local_buf[0] > threshold2:
247+
# If we're already beyond the distance threshold,
248+
# we don't need to continue computing squared
249+
# distances.
250+
break
251+
if local_buf[0] < threshold2:
252+
# A match was found. Set flags for both vectors
253+
# to 1.
254+
matched_1[i_1_sub] = 1
255+
matched_2[i_2_sub] = 1
256+
# Add up the number of matches for file 1.
257+
for i_1_sub in range(counts[i_1]):
258+
overlap[0] += matched_1[i_1_sub]
259+
# Add up the number of matches for file 2.
260+
for i_2_sub in range(counts[i_2]):
261+
overlap[1] += matched_2[i_2_sub]
262+
# Divide by the total number of vectors for each file.
263+
overlap[0] /= <float> counts[i_1]
264+
overlap[1] /= <float> counts[i_2]
265+
if overlap[0] > minimum_overlap and overlap[1] > minimum_overlap:
266+
duplicate[local_buf[1]] = 1
267+
local_buf[1] += 1
268+
free(matched_1)
269+
free(matched_2)
270+
free(overlap)
271+
free(local_buf)
272+
cdef int n_duplicates = 0
273+
cdef Py_ssize_t i_offset = 0
274+
for i_offset in range(n_pairs):
275+
if duplicate[i_offset] > 0:
276+
n_duplicates += 1
277+
pairs_arr = np.zeros((n_duplicates, 2), dtype=np.int32)
278+
cdef np.int32_t[:, :] pairs = pairs_arr
279+
i_offset = 0
280+
cdef Py_ssize_t pair_offset = 0
281+
for i_1 in range(m-1):
282+
# Compute the index of the output vector
283+
# where we will count the number of duplicates.
284+
for i_2 in range(i_1 + 1, m):
285+
if duplicate[i_offset] > 0:
286+
pairs[pair_offset][0] = i_1
287+
pairs[pair_offset][1] = i_2
288+
pair_offset += 1
289+
i_offset += 1
290+
return pairs_arr

perception/tools.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def deduplicate_hashes(
131131
if multiple_hashes_per_id:
132132
counts = np.zeros(
133133
shape=len(set(
134-
hash_id for hash_id, _ in hashes))).astype('int32')
134+
hash_id for hash_id, _ in hashes))).astype('uint32')
135135
previous_hash_id = None
136136
counts_idx = 0
137137
files = [
@@ -146,23 +146,11 @@ def deduplicate_hashes(
146146
files = np.array(files)
147147
else:
148148
counts = None
149-
n_files = len(files)
150-
iterator = range(n_files)
151-
if progress is not None:
152-
iterator = progress(iterator, total=n_files, desc='Deduplicating.')
153-
duplicated = (extensions.compute_euclidean_pairwise_duplicates(
154-
vectors.astype('int32'), threshold=threshold,
155-
counts=counts).max(axis=1) > 0)
156-
for file_index in iterator:
157-
if end_idx is not None:
158-
start_idx = end_idx
159-
end_idx = start_idx + (n_files - file_index - 1)
160-
current_duplicated = duplicated[start_idx:end_idx]
161-
current_file = files[file_index]
162-
duplicated_files = files[file_index + 1:][current_duplicated]
163-
pairs.extend([(current_file, duplicated_file)
164-
for duplicated_file in duplicated_files
165-
if duplicated_file != current_file])
149+
pairs = [
150+
(files[idx1], files[idx2]) for idx1, idx2 in extensions.
151+
compute_euclidean_pairwise_duplicates_simple(
152+
vectors.astype('int32'), threshold=threshold, counts=counts)
153+
]
166154
return list(set(pairs))
167155

168156

tests/test_tools.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_compute_euclidean_pairwise_duplicates():
103103
actual = tools.extensions.compute_euclidean_pairwise_duplicates(
104104
X=X.astype('int32'),
105105
threshold=1,
106-
counts=counts.astype('int32'),
106+
counts=counts.astype('uint32'),
107107
compute_overlap=True)
108108
assert (expected == actual).all()
109109

@@ -112,7 +112,7 @@ def test_compute_euclidean_pairwise_duplicates():
112112
actual = tools.extensions.compute_euclidean_pairwise_duplicates(
113113
X=X.astype('int32'),
114114
threshold=1,
115-
counts=counts.astype('int32'),
115+
counts=counts.astype('uint32'),
116116
compute_overlap=False)
117117
assert (expected == actual).all()
118118

0 commit comments

Comments
 (0)