11# distutils: extra_compile_args=-fopenmp
22# distutils: extra_link_args=-fopenmp
33# cython: language_level=3
4+ # cython: language=c++
45
6+ import sys
57import math
68import numpy as np
79import cython
810from cython.parallel import prange, parallel
911from libc.stdlib cimport abort, malloc, free
12+ from libcpp cimport bool as cppbool
13+ from libcpp.vector cimport vector
1014
1115cimport numpy as np
1216cdef extern from " limits.h" :
@@ -16,7 +20,7 @@ ctypedef np.uint8_t uint8
1620
1721@ cython.boundscheck (False )
1822@ cython.wraparound (False )
19- def compute_euclidean_pairwise_duplicates (int[:, :] X , float threshold , counts: int [:] = None , compute_overlap = False ):
23+ def compute_euclidean_pairwise_duplicates (int[:, :] X , float threshold , counts: np.uint32_t [:] = None , compute_overlap = False ):
2024 """ Find the pairwise overlap within an array of vectors, where there may be multiple
2125 vectors for the same file. This function is faster than using scipy.spatial.distance
2226 because it computes distances in parallel, avoids computing full distances when they're
@@ -48,12 +52,13 @@ def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts:
4852 - 50% o f file 2 was in file 3 and 25% o f file 3 was in file 2
4953 """
5054 if counts is None :
51- counts_arr = np.ones(X.shape[0 ], dtype = np.int32)
52- counts = counts_arr
55+ counts = np.ones(X.shape[0 ], dtype = np.uint32)
5356 cdef Py_ssize_t n = X.shape[0 ]
5457 cdef Py_ssize_t m = counts.shape[0 ]
5558 cdef Py_ssize_t d = X.shape[1 ]
56- cdef Py_ssize_t n_pairs = int (math.factorial(m)/ (2 * math.factorial(m- 2 )))
59+ n_pairs_python = int (math.factorial(m)/ (2 * math.factorial(m- 2 )))
60+ assert n_pairs_python < sys.maxsize, ' Too many files were provided for deduplication.'
61+ cdef Py_ssize_t n_pairs = n_pairs_python
5762 cdef Py_ssize_t max_counts = np.max(counts)
5863 cdef int compute_overlap_int = 0
5964 if compute_overlap:
@@ -71,10 +76,11 @@ def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts:
7176 for i_1 in range (m):
7277 for i_i in range (i_1):
7378 offsets[i_1] += counts[i_i]
74- cdef size_t local_buf_size = 4 # distance, flattened array offset, index_offset_1, index_offset_2
79+ # local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2
80+ cdef size_t local_buf_size = 4
7581 cdef float threshold2 = threshold ** 2
7682 with nogil, parallel():
77- local_buf = < int * > malloc(sizeof(int ) * local_buf_size)
83+ local_buf = < np.uint64_t * > malloc(sizeof(np.uint64_t ) * local_buf_size)
7884
7985 # An array of flags indicating whether a vector in file 1 was
8086 # matched.
@@ -139,3 +145,146 @@ def compute_euclidean_pairwise_duplicates(int[:, :] X, float threshold, counts:
139145 free(matched_1)
140146 free(matched_2)
141147 return duplicate_arr
148+
149+
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_euclidean_pairwise_duplicates_simple(int[:, :] X, float threshold, np.uint32_t[:] counts=None, float minimum_overlap=0):
    """Find the pairwise overlap within an array of vectors, where there may be multiple
    vectors for the same file. This function is similar to compute_euclidean_pairwise_duplicates
    but uses much less memory.

    Args:
        X: The vectors with shape (N, D). Vectors for the same file need to be
            supplied sequentially so that we can use the counts argument
            to determine which vectors are for the same file.
        threshold: Two vectors are considered a match when their Euclidean
            distance is strictly below this value.
        counts: For each of the M files, the number of sequential vectors in X.
            If not provided, each vector is assumed to be for a different file (i.e.,
            this is equivalent to `counts = np.ones(N)` which also implies M == N).
            Otherwise, assumed to have length M. The counts should add up to N.
        minimum_overlap: The minimum (exclusive) fraction of matched vectors
            required on *both* files of a pair to call it a match.

    Returns:
        pairs: An (n_duplicates, 2) int32 array of file-index pairs that met
            the matching criteria, with the lower file index first.
    """
    if counts is None:
        counts_arr = np.ones(X.shape[0], dtype=np.uint32)
        counts = counts_arr
    cdef Py_ssize_t n = X.shape[0]
    cdef Py_ssize_t m = counts.shape[0]
    cdef Py_ssize_t d = X.shape[1]
    # m-choose-2 with exact integer arithmetic. The previous
    # factorial/float-division form loses precision once factorial(m)
    # exceeds 2**53 and raises ValueError for m < 2; this form is exact
    # and yields 0 for m in (0, 1).
    n_pairs_python = (m * (m - 1)) // 2
    assert n_pairs_python < sys.maxsize, 'Too many files were provided for deduplication.'
    cdef Py_ssize_t n_pairs = n_pairs_python
    cdef Py_ssize_t max_counts = np.max(counts)
    # i_1 is the index of file1, i_2 is the index of file2, i_d is the
    # index of the vector dimension we're on, i_i is used to compute
    # the starting index in the flattened vector in the different threads.
    # i_1_sub is the index of the hash on file1, i_2_sub is the index of
    # the hash on file2.
    cdef Py_ssize_t i_1, i_2, i_d, i_i, i_1_sub, i_2_sub
    # NOTE: vector[cppbool] (std::vector<bool>) is bit-packed, so flags for
    # eight different pairs share a single byte and concurrent writes from
    # different prange threads could race even though each thread writes
    # distinct indexes. vector[char] gives every flag its own byte, making
    # writes to distinct indexes race-free.
    cdef vector[char] duplicate
    duplicate.resize(n_pairs)
    offsets_arr = np.zeros(m, dtype=np.uint64)
    cdef np.uint64_t[:] offsets = offsets_arr
    # Py_ssize_t rather than int32 so the consistency check cannot
    # overflow when the total vector count exceeds 2**31 - 1.
    cdef Py_ssize_t expected_n = 0
    for i_1 in range(m):
        for i_i in range(i_1):
            offsets[i_1] += counts[i_i]
        expected_n += counts[i_1]
    assert expected_n == n, "Provided value for counts is inconsistent with X."
    # local_buf will contain distance, flattened array offset, index_offset_1, index_offset_2
    cdef size_t local_buf_size = 4
    # Compare squared distances against the squared threshold so the inner
    # loop never needs a sqrt.
    cdef float threshold2 = threshold ** 2
    with nogil, parallel():
        local_buf = <np.uint64_t*> malloc(sizeof(np.uint64_t) * local_buf_size)

        # An array of flags indicating whether a vector in file 1 was
        # matched.
        matched_1 = <int*> malloc(sizeof(int) * max_counts)

        # An array of flags indicating whether a vector in file 2 was
        # matched.
        matched_2 = <int*> malloc(sizeof(int) * max_counts)

        # Pair overlap
        overlap = <float*> malloc(sizeof(float) * 2)

        if local_buf is NULL or matched_1 is NULL or matched_2 is NULL or overlap is NULL:
            abort()
        # Iterate over all of the files.
        for i_1 in prange(m - 1):
            local_buf[1] = 0
            local_buf[2] = offsets[i_1]
            # Compute the index of the output vector
            # where we will count the number of duplicates.
            for i_i in range(i_1):
                local_buf[1] += m - i_i - 1
            # Iterate over all the other files to compare.
            for i_2 in range(i_1 + 1, m):
                overlap[0] = 0
                overlap[1] = 0
                local_buf[3] = offsets[i_2]
                # Initialize all match flags to zero for
                # both file 1 and file 2.
                for i_1_sub in range(counts[i_1]):
                    matched_1[i_1_sub] = 0
                for i_2_sub in range(counts[i_2]):
                    matched_2[i_2_sub] = 0
                # Iterate over all the hashes in file1
                for i_1_sub in range(counts[i_1]):
                    # Iterate over all the hashes in file2
                    for i_2_sub in range(counts[i_2]):
                        local_buf[0] = 0
                        if matched_1[i_1_sub] == 1 and matched_2[i_2_sub] == 1:
                            # Both the vectors in this pair have already been matched, so
                            # there is nothing to gain from this comparison.
                            continue
                        for i_d in range(d):
                            # Widen to 64 bits before squaring so large
                            # per-dimension differences cannot overflow
                            # 32-bit int arithmetic.
                            local_buf[0] += (<np.int64_t>X[local_buf[2] + i_1_sub, i_d] - <np.int64_t>X[local_buf[3] + i_2_sub, i_d]) ** 2
                            if local_buf[0] > threshold2:
                                # If we're already beyond the distance threshold,
                                # we don't need to continue computing squared
                                # distances.
                                break
                        if local_buf[0] < threshold2:
                            # A match was found. Set flags for both vectors
                            # to 1.
                            matched_1[i_1_sub] = 1
                            matched_2[i_2_sub] = 1
                # Add up the number of matches for file 1.
                for i_1_sub in range(counts[i_1]):
                    overlap[0] += matched_1[i_1_sub]
                # Add up the number of matches for file 2.
                for i_2_sub in range(counts[i_2]):
                    overlap[1] += matched_2[i_2_sub]
                # Divide by the total number of vectors for each file.
                overlap[0] /= <float>counts[i_1]
                overlap[1] /= <float>counts[i_2]
                if overlap[0] > minimum_overlap and overlap[1] > minimum_overlap:
                    duplicate[local_buf[1]] = 1
                local_buf[1] += 1
        free(matched_1)
        free(matched_2)
        free(overlap)
        free(local_buf)
    # Count the matches first so the output array is allocated exactly once.
    cdef Py_ssize_t n_duplicates = 0
    cdef Py_ssize_t i_offset = 0
    for i_offset in range(n_pairs):
        if duplicate[i_offset] > 0:
            n_duplicates += 1
    pairs_arr = np.zeros((n_duplicates, 2), dtype=np.int32)
    cdef np.int32_t[:, :] pairs = pairs_arr
    i_offset = 0
    cdef Py_ssize_t pair_offset = 0
    for i_1 in range(m - 1):
        # Walk the flattened pair index in the same (i_1, i_2) order used
        # in the parallel section so flags map back to the right file pair.
        for i_2 in range(i_1 + 1, m):
            if duplicate[i_offset] > 0:
                pairs[pair_offset, 0] = i_1
                pairs[pair_offset, 1] = i_2
                pair_offset += 1
            i_offset += 1
    return pairs_arr
0 commit comments