diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index d4cddfcdf..42bf12654 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -40,8 +40,8 @@ class BruteForceIndex : public VecSimIndexAbstract { size_t indexSize() const override; size_t indexCapacity() const override; std::unique_ptr getVectorsIterator() const; - DataType *getDataByInternalId(idType id) const { - return (DataType *)this->vectors->getElement(id); + const DataType *getDataByInternalId(idType id) const { + return reinterpret_cast(this->vectors->getElement(id)); } VecSimQueryReply *topKQuery(const void *queryBlob, size_t k, VecSimQueryParams *queryParams) const override; @@ -75,15 +75,41 @@ class BruteForceIndex : public VecSimIndexAbstract { virtual ~BruteForceIndex() = default; #ifdef BUILD_TESTS /** - * @brief Used for testing - store vector(s) data associated with a given label. This function - * copies the vector(s)' data buffer(s) and place it in the output vector + * @brief Used for testing - get only the vector elements associated with a given label. + * This function copies only the vector(s) elements into the output vector, + * without any additional metadata that might be stored with the vector(s). * - * @param label - * @param vectors_output empty vector to be modified, should store the blob(s) associated with - * the label. + * Important: This method returns ONLY the vector elements, even if the stored vector contains + * additional metadata. For example, with int8_t/uint8_t vectors using cosine similarity, + * this method will NOT return the norm that is stored with the vector. + * + * If you need the complete data including any metadata, use getStoredVectorDataByLabel() + * instead. + * + * @param label The label to retrieve vector(s) elements for + * @param vectors_output Empty vector to be filled with vector(s) */ virtual void getDataByLabel(labelType label, std::vector> &vectors_output) const = 0; + + /** + * @brief Used for testing - get the complete raw data associated with a given label. + * This function returns the ENTIRE vector(s) data as stored in the index, including any + * additional metadata that might be stored alongside the vector elements. + * + * For example: + * - For int8_t/uint8_t vectors with cosine similarity, this includes the norm stored at the end + * - For other vector types or future implementations, this will include any additional data + * that might be stored with the vector + * + * Use this method when you need access to the complete vector data as it is stored internally. + * + * @param label The label to retrieve data for + * @return A vector containing the complete vector data (elements + metadata) for the given + * label + */ + virtual std::vector> getStoredVectorDataByLabel(labelType label) const = 0; + void fitMemory() override { if (count == 0) { return; @@ -351,6 +377,8 @@ BruteForceIndex::newBatchIterator(const void *queryBlob, auto *queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + + // memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end. return newBatchIterator_Instance(queryBlobCopy, queryParams); diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h index 999f5ac8b..bb6dcc2d8 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h @@ -45,10 +45,30 @@ class BruteForceIndex_Multi : public BruteForceIndex { for (idType id : ids->second) { auto vec = std::vector(this->dim); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like + // the norm memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto ids = labelToIdsLookup.find(label); + + for (idType id : ids->second) { + // Get the data pointer - need to cast to char* for memcpy + const char *data = reinterpret_cast(this->getDataByInternalId(id)); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->getDataSize()); + memcpy(vec.data(), data, this->getDataSize()); + vectors_output.push_back(std::move(vec)); + } + + return vectors_output; + } + #endif private: // inline definitions diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h index 0c27615e9..96f6ef65d 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_single.h +++ b/src/VecSim/algorithms/brute_force/brute_force_single.h @@ -39,17 +39,33 @@ class BruteForceIndex_Single : public BruteForceIndex { // We call this when we KNOW that the label exists in the index. idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; } - +// #define BUILD_TESTS #ifdef BUILD_TESTS void getDataByLabel(labelType label, std::vector> &vectors_output) const override { - auto id = labelToIdLookup.at(label); auto vec = std::vector(this->dim); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the + // norm memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto id = labelToIdLookup.at(label); + + // Get the data pointer - need to cast to char* for memcpy + const char *data = reinterpret_cast(this->getDataByInternalId(id)); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->getDataSize()); + memcpy(vec.data(), data, this->getDataSize()); + vectors_output.push_back(std::move(vec)); + + return vectors_output; + } #endif protected: // inline definitions diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 61ca1fa2d..7510a66df 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -301,15 +301,41 @@ class HNSWIndex : public VecSimIndexAbstract, #ifdef BUILD_TESTS /** - * @brief Used for testing - store vector(s) data associated with a given label. This function - * copies the vector(s)' data buffer(s) and place it in the output vector + * @brief Used for testing - get only the vector elements associated with a given label. + * This function copies only the vector(s) elements into the output vector, + * without any additional metadata that might be stored with the vector. * - * @param label - * @param vectors_output empty vector to be modified, should store the blob(s) associated with - * the label. + * Important: This method returns ONLY the vector elements, even if the stored vector contains + * additional metadata. For example, with int8_t/uint8_t vectors using cosine similarity, + * this method will NOT return the norm that is stored with the vector(s). + * + * If you need the complete data including any metadata, use getStoredVectorDataByLabel() + * instead. + * + * @param label The label to retrieve vector(s) elements for + * @param vectors_output Empty vector to be filled with vector(s) */ virtual void getDataByLabel(labelType label, std::vector> &vectors_output) const = 0; + + /** + * @brief Used for testing - get the complete raw data associated with a given label. + * This function returns the ENTIRE vector(s) data as stored in the index, including any + * additional metadata that might be stored alongside the vector elements. + * + * For example: + * - For int8_t/uint8_t vectors with cosine similarity, this includes the norm stored at the end + * - For other vector types or future implementations, this will include any additional data + * that might be stored with the vector + * + * Use this method when you need access to the complete vector data as it is stored internally. + * + * @param label The label to retrieve data for + * @return A vector containing the complete vector data (elements + metadata) for the given + * label + */ + virtual std::vector> getStoredVectorDataByLabel(labelType label) const = 0; + void fitMemory() override { if (maxElements > 0) { idToMetaData.shrink_to_fit(); @@ -1559,7 +1585,7 @@ void HNSWIndex::insertElementToGraph(idType element_id, for (auto level = static_cast(max_common_level); level >= 0; level--) { candidatesMaxHeap top_candidates = searchLayer(curr_element, vector_data, level, efConstruction); - // If the entry point was marked deleted between iterations, we may recieve an empty + // If the entry point was marked deleted between iterations, we may receive an empty // candidates set. if (!top_candidates.empty()) { curr_element = mutuallyConnectNewElement(element_id, top_candidates, level); diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h index 972c981e4..9048e9738 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_multi.h +++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h @@ -72,10 +72,28 @@ class HNSWIndex_Multi : public HNSWIndex { for (idType id : ids->second) { auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like + // the norm + memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto ids = labelLookup.find(label); + + for (idType id : ids->second) { + const char *data = this->getDataByInternalId(id); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->dataSize); + memcpy(vec.data(), data, this->dataSize); + vectors_output.push_back(std::move(vec)); + } + + return vectors_output; + } #endif ~HNSWIndex_Multi() = default; @@ -201,7 +219,7 @@ HNSWIndex_Multi::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. return new (this->allocator) HNSWMulti_BatchIterator( diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h index c9ef19ead..2993f7c19 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_single.h +++ b/src/VecSim/algorithms/hnsw/hnsw_single.h @@ -48,9 +48,24 @@ class HNSWIndex_Single : public HNSWIndex { auto id = labelLookup.at(label); auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the + // norm + memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto id = labelLookup.at(label); + const char *data = this->getDataByInternalId(id); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->dataSize); + memcpy(vec.data(), data, this->dataSize); + vectors_output.push_back(std::move(vec)); + + return vectors_output; + } #endif ~HNSWIndex_Single() = default; @@ -161,7 +176,7 @@ HNSWIndex_Single::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. return new (this->allocator) HNSWSingle_BatchIterator( diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index f868cd16d..a537c35d9 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -63,6 +63,8 @@ struct HNSWRepairJob : public AsyncJob { template class TieredHNSWIndex : public VecSimTieredIndex { private: + size_t bg_vector_indexing_count; + size_t main_vector_indexing_count; /// Mappings from id/label to associated jobs, for invalidating and update ids if necessary. // In MULTI, we can have more than one insert job pending per label. // **This map is protected with the flat buffer lock** @@ -197,7 +199,7 @@ class TieredHNSWIndex : public VecSimTieredIndex { VecSimDebugInfoIterator *debugInfoIterator() const override; VecSimBatchIterator *newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const override { - size_t blobSize = this->frontendIndex->getDim() * sizeof(DataType); + size_t blobSize = this->frontendIndex->getDataSize(); void *queryBlobCopy = this->allocator->allocate(blobSize); memcpy(queryBlobCopy, queryBlob, blobSize); return new (this->allocator) @@ -233,6 +235,8 @@ class TieredHNSWIndex : public VecSimTieredIndex { #ifdef BUILD_TESTS void getDataByLabel(labelType label, std::vector> &vectors_output) const; + + std::vector> getStoredVectorDataByLabel(labelType label) const; #endif }; @@ -536,10 +540,10 @@ void TieredHNSWIndex::executeInsertJob(HNSWInsertJob *job) { HNSWIndex *hnsw_index = this->getHNSWIndex(); // Copy the vector blob from the flat buffer, so we can release the flat lock while we are // indexing the vector into HNSW index. - auto blob_copy = this->getAllocator()->allocate_unique(this->frontendIndex->getDataSize()); + size_t data_size = this->frontendIndex->getDataSize(); + auto blob_copy = this->getAllocator()->allocate_unique(data_size); - memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id), - this->frontendIndex->getDim() * sizeof(DataType)); + memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id), data_size); this->insertVectorToHNSW(hnsw_index, job->label, blob_copy.get()); @@ -633,9 +637,11 @@ TieredHNSWIndex::TieredHNSWIndex(HNSWIndex allocator) : VecSimTieredIndex(hnsw_index, bf_index, tiered_index_params, allocator), + bg_vector_indexing_count(0), main_vector_indexing_count(0), labelToInsertJobs(this->allocator), idToRepairJobs(this->allocator), idToSwapJob(this->allocator), invalidJobs(this->allocator), currInvalidJobId(0), readySwapJobs(0) { + // If the param for swapJobThreshold is 0 use the default value, if it exceeds the maximum // allowed, use the maximum value. this->pendingSwapJobsThreshold = @@ -699,7 +705,7 @@ size_t TieredHNSWIndex::indexLabelCount() const { this->mainIndexGuard.unlock(); return output.size(); } - +#include "VecSim/vec_sim_debug.h" // In the tiered index, we assume that the blobs are processed by the flat buffer // before being transferred to the HNSW index. // When inserting vectors directly into the HNSW index—such as in VecSim_WriteInPlace mode— or when @@ -709,8 +715,42 @@ template int TieredHNSWIndex::addVector(const void *blob, labelType label) { int ret = 1; auto hnsw_index = this->getHNSWIndex(); + // if (label == 100000) { + // std::string res("index connections: {"); + + // res += "Entry Point Label: "; + // res += std::to_string(hnsw_index->getEntryPointLabel()) + "\n"; + // TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + // "%s", res.c_str()); + // for (idType id = 0; id < this->backendIndex->indexSize(); id++) { + // std::string n_data("Node "); + // labelType label = hnsw_index->getExternalLabel(id); + // if (label == SIZE_MAX) + // continue; // The ID is not in the index + // int **neighbors_output; + // VecSimDebug_GetElementNeighborsInHNSWGraph(this->backendIndex, label, + // &neighbors_output); n_data += std::to_string(label) + ": "; for (size_t l = 0; + // neighbors_output[l]; l++) { + // n_data += "Level " + std::to_string(l) + " neighbors: "; + // auto &neighbours = neighbors_output[l]; + // auto neighbours_count = neighbours[0]; + // for (size_t j = 1; j <= neighbours_count; j++) { + // n_data += std::to_string(neighbours[j]) + ", "; + // } + // TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + // "%s", n_data.c_str()); + // } + // VecSimDebug_ReleaseElementNeighborsInHNSWGraph(neighbors_output); + // } + // } + if (label == 999999) { + TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + "bg_vec_count: %zu, main_thread_vector_count: %zu", + this->bg_vector_indexing_count, this->main_vector_indexing_count); + } + // writeMode is not protected since it is assumed to be called only from the "main thread" - // (that is the thread that is exculusively calling add/delete vector). + // (that is the thread that is exclusively calling add/delete vector). if (this->getWriteMode() == VecSim_WriteInPlace) { // First, check if we need to overwrite the vector in-place for single (from both indexes). if (!this->backendIndex->isMultiValue()) { @@ -733,6 +773,11 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l // This will do nothing (and return 0) if this label doesn't exist. Otherwise, it may // remove vector from the flat buffer and/or the HNSW index. ret -= this->deleteVector(label); + if (ret != 1) { + TIERED_LOG(VecSimCommonStrings::LOG_VERBOSE_STRING, + "removed a vector while trying to insert label %zu. ret: %d", label, + ret); + } } if (this->frontendIndex->indexSize() >= this->flatBufferLimit) { // We didn't remove a vector from flat buffer due to overwrite, insert the new vector @@ -742,11 +787,13 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l // index. auto storage_blob = this->frontendIndex->preprocessForStorage(blob); this->insertVectorToHNSW(hnsw_index, label, storage_blob.get()); + this->main_vector_indexing_count++; return ret; } // Otherwise, we fall back to the "regular" insertion into the flat buffer // (since it is not full anymore after removing the previous vector stored under the label). } + this->bg_vector_indexing_count++; this->flatIndexGuard.lock(); idType new_flat_id = this->frontendIndex->indexSize(); if (this->frontendIndex->isLabelExists(label) && !this->frontendIndex->isMultiValue()) { @@ -840,7 +887,7 @@ int TieredHNSWIndex::deleteVector(labelType label) { // Note that we may remove the same vector that has been removed from the flat index, if it was // being ingested at that time. // writeMode is not protected since it is assumed to be called only from the "main thread" - // (that is the thread that is exculusively calling add/delete vector). + // (that is the thread that is exclusively calling add/delete vector). if (this->getWriteMode() == VecSim_WriteAsync) { num_deleted_vectors += this->deleteLabelFromHNSW(label); // Apply ready swap jobs if number of deleted vectors reached the threshold @@ -1183,4 +1230,10 @@ void TieredHNSWIndex::getDataByLabel( labelType label, std::vector> &vectors_output) const { this->getHNSWIndex()->getDataByLabel(label, vectors_output); } + +template +std::vector> +TieredHNSWIndex::getStoredVectorDataByLabel(labelType label) const { + return this->getHNSWIndex()->getStoredVectorDataByLabel(label); +} #endif diff --git a/src/VecSim/algorithms/svs/svs.h b/src/VecSim/algorithms/svs/svs.h index 6210f6e83..0cdb1d7f3 100644 --- a/src/VecSim/algorithms/svs/svs.h +++ b/src/VecSim/algorithms/svs/svs.h @@ -146,7 +146,7 @@ class SVSIndex : public VecSimIndexAbstract, fl return MemoryUtils::unique_blob{const_cast(original_data), [](void *) {}}; } - const auto data_size = this->dim * sizeof(DataType) * n; + const auto data_size = this->getDataSize() * n; auto processed_blob = MemoryUtils::unique_blob{this->allocator->allocate(data_size), @@ -428,7 +428,7 @@ class SVSIndex : public VecSimIndexAbstract, fl VecSimQueryParams *queryParams) const override { auto *queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to VecSimBatchIterator that will free it at the end. if (indexSize() == 0) { diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h index f166667e7..559737117 100644 --- a/src/VecSim/vec_sim_tiered_index.h +++ b/src/VecSim/vec_sim_tiered_index.h @@ -101,6 +101,9 @@ class VecSimTieredIndex : public VecSimIndexInterface { inline VecSimIndexAbstract *getFlatBufferIndex() { return this->frontendIndex; } + inline BruteForceIndex *getFlatBufferIndexAsBruteForce() { + return this->frontendIndex; + } inline size_t getFlatBufferLimit() { return this->flatBufferLimit; } virtual void fitMemory() override { diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f248705d4..63e245c2d 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -79,4 +79,4 @@ if(SVS_SUPPORTED) add_executable(test_svs ../utils/mock_thread_pool.cpp test_svs.cpp unit_test_utils.cpp) target_link_libraries(test_svs PUBLIC gtest_main VectorSimilarity) gtest_discover_tests(test_svs) -endif() \ No newline at end of file +endif() diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 821af06a9..ac7db607b 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -80,7 +80,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_update_test) { ASSERT_EQ(bf_index->idToLabelMapping.size(), DEFAULT_BLOCK_SIZE); // Check update. - TEST_DATA_T *vector_data = bf_index->getDataByInternalId(0); + const TEST_DATA_T *vector_data = bf_index->getDataByInternalId(0); for (size_t i = 0; i < dim; ++i) { ASSERT_EQ(*vector_data, 2.0); ++vector_data; @@ -386,7 +386,7 @@ TYPED_TEST(BruteForceTest, test_delete_swap_block) { ASSERT_EQ(deleted_label_id_pair, bf_single_index->labelToIdLookup.end()); // The vector in index1 should hold id5 data. - TEST_DATA_T *vector_data = bf_index->getDataByInternalId(1); + const TEST_DATA_T *vector_data = bf_index->getDataByInternalId(1); for (size_t i = 0; i < dim; ++i) { ASSERT_EQ(*vector_data, 5); ++vector_data; diff --git a/tests/unit/test_int8.cpp b/tests/unit/test_int8.cpp index aaf0f5d51..840e34836 100644 --- a/tests/unit/test_int8.cpp +++ b/tests/unit/test_int8.cpp @@ -7,6 +7,7 @@ #include "VecSim/vec_sim_debug.h" #include "VecSim/spaces/L2/L2.h" #include "VecSim/spaces/IP/IP.h" +#include "VecSim/spaces/normalize/normalize_naive.h" class INT8Test : public ::testing::Test { protected: @@ -38,7 +39,9 @@ class INT8Test : public ::testing::Test { virtual HNSWIndex *CastToHNSW() { return CastIndex>(); } - void PopulateRandomVector(int8_t *out_vec) { test_utils::populate_int8_vec(out_vec, dim); } + void PopulateRandomVector(int8_t *out_vec) { + test_utils::populate_int8_vec(out_vec, dim, current_seed++); + } int PopulateRandomAndAddVector(size_t id, int8_t *out_vec) { PopulateRandomVector(out_vec); return VecSimIndex_AddVector(index, out_vec, id); @@ -92,6 +95,7 @@ class INT8Test : public ::testing::Test { VecSimIndex *index; size_t dim; + int current_seed{0}; }; class INT8HNSWTest : public INT8Test { @@ -173,7 +177,7 @@ class INT8TieredTest : public INT8Test { virtual void TearDown() override {} virtual const void *GetDataByInternalId(idType id) override { - return CastIndex>(CastToBruteForce()) + return CastIndex>(GetFlatBufferIndex()) ->getDataByInternalId(id); } @@ -186,11 +190,16 @@ class INT8TieredTest : public INT8Test { return CastIndex>(CastToHNSW()); } - VecSimIndexAbstract *CastToBruteForce() { + VecSimIndexAbstract *GetFlatBufferIndex() { auto tiered_index = dynamic_cast *>(index); return tiered_index->getFlatBufferIndex(); } + BruteForceIndex *CastToBruteForce() { + auto tiered_index = dynamic_cast *>(index); + return tiered_index->getFlatBufferIndexAsBruteForce(); + } + int GenerateRandomAndAddVector(size_t id) override { int8_t v[dim]; PopulateRandomVector(v); @@ -384,7 +393,7 @@ void INT8Test::metrics_test(params_t index_params) { double expected_score = 0; auto verify_res = [&](size_t id, double score, size_t index) { - ASSERT_EQ(score, expected_score) << "failed at vector id:" << id; + ASSERT_NEAR(score, expected_score, 1e-6f) << "failed at vector id:" << id; }; for (size_t i = 0; i < n; i++) { @@ -729,7 +738,7 @@ void INT8TieredTest::test_info(bool is_multi) { VecSimIndexDebugInfo info = INT8Test::test_info(hnsw_params); ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); - VecSimIndexDebugInfo frontendIndexInfo = CastToBruteForce()->debugInfo(); + VecSimIndexDebugInfo frontendIndexInfo = GetFlatBufferIndex()->debugInfo(); VecSimIndexDebugInfo backendIndexInfo = CastToHNSW()->debugInfo(); compareCommonInfo(info.tieredInfo.frontendCommonInfo, frontendIndexInfo.commonInfo); @@ -837,7 +846,7 @@ void INT8TieredTest::test_info_iterator(VecSimMetric metric) { SetUp(params); VecSimIndexDebugInfo info = VecSimIndex_DebugInfo(index); VecSimDebugInfoIterator *infoIter = VecSimIndex_DebugInfoIterator(index); - VecSimIndexDebugInfo frontendIndexInfo = CastToBruteForce()->debugInfo(); + VecSimIndexDebugInfo frontendIndexInfo = GetFlatBufferIndex()->debugInfo(); VecSimIndexDebugInfo backendIndexInfo = CastToHNSW()->debugInfo(); VecSimDebugInfoIterator_Free(infoIter); } @@ -875,7 +884,7 @@ void INT8HNSWTest::test_serialization(bool is_multi) { int8_t data[n * dim]; for (size_t i = 0; i < n * dim; i += dim) { - test_utils::populate_int8_vec(data + i, dim, i); + this->PopulateRandomVector(data + i); } for (size_t j = 0; j < n; ++j) { @@ -993,3 +1002,127 @@ TEST_F(INT8TieredTest, getElementNeighbors) { HNSWParams params = {.dim = 4, .M = 20}; get_element_neighbors(params); } + +/** + * Tests int8_t vectors with cosine similarity in a tiered index across three scenarios: + * 1. Verifies vector data correctness when stored in the flat buffer + * 2. Verifies vector data correctness when inserted directly into HNSW (when flat buffer is full) + * 3. Verifies vector data correctness after transfer from flat buffer to HNSW + * + * For each scenario, the test confirms: + * - Vector data matches the expected normalized vector + * - The norm is correctly stored at the end of the vector + * - Search operations (topK, range, batch) return the expected results + */ + +TEST_F(INT8TieredTest, CosineBlobCorrectness) { + // Create TieredHNSW index with cosine metric + constexpr size_t dim = 4; + HNSWParams hnsw_params = {.dim = dim, .metric = VecSimMetric_Cosine}; + // Create tiered index with buffer limit set to 1. + TieredIndexParams tiered_params = this->generate_tiered_params(hnsw_params, 1, 1); + SetUp(tiered_params); + + auto frontend_index = this->CastToBruteForce(); + auto hnsw_index = this->CastToHNSW(); + + int8_t vector[dim]; + PopulateRandomVector(vector); + float vector_norm = spaces::IntegralType_ComputeNorm(vector, dim); + + auto verify_norm = [&](const int8_t *input_vector, float expected_norm) { + float vectors_stored_norm = *(reinterpret_cast(input_vector + dim)); + ASSERT_EQ(vectors_stored_norm, expected_norm) << "wrong vector norm"; + }; + + int8_t normalized_vec[dim + sizeof(float)]; + memcpy(normalized_vec, vector, dim); + spaces::integer_normalizeVector(normalized_vec, dim); + ASSERT_NO_FATAL_FAILURE(verify_norm(normalized_vec, vector_norm)); + + int8_t query[dim + sizeof(float)]; + PopulateRandomVector(query); + float query_norm = spaces::IntegralType_ComputeNorm(query, dim); + + // Calculate the expected score manually. + int ip = 0; + for (size_t i = 0; i < dim; i++) { + ip += vector[i] * query[i]; + } + float expected_score = 1.0 - (float(ip) / (vector_norm * query_norm)); + + auto verify_res = [&](size_t label, double score, size_t result_rank) { + ASSERT_EQ(score, expected_score) << "label: " << label; + }; + + // ============== Scenario 1: + // blob correctness in the flat buffer + + // Add a vector to the flat buffer. + VecSimIndex_AddVector(index, vector, 0); + { + SCOPED_TRACE("Store in the flat buffer"); + // Get the stored vector data including the norm + auto stored_vec = frontend_index->getStoredVectorDataByLabel(0); + const int8_t *stored_vec_data = reinterpret_cast(stored_vec.at(0).data()); + // the vector should be normalized. + ASSERT_NO_FATAL_FAILURE(CompareVectors(stored_vec_data, normalized_vec, dim)); + // The norm should be stored in the last position. + verify_norm(stored_vec_data, vector_norm); + + ASSERT_NO_FATAL_FAILURE(runTopKSearchTest(index, query, 1, verify_res)); + ASSERT_NO_FATAL_FAILURE(runRangeQueryTest(index, query, 0.5, verify_res, 1, BY_SCORE)); + VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + ASSERT_NO_FATAL_FAILURE(runBatchIteratorSearchTest(batchIterator, 1, verify_res)); + VecSimBatchIterator_Free(batchIterator); + } + + // ============== Scenario 2: + // blob correctness when inserted directly to the hnsw + + // Add another vector and exceed the flat buffer capacity. The vector should be stored directly + // in the hnsw index + VecSimIndex_AddVector(index, vector, 1); + EXPECT_EQ(frontend_index->indexSize(), 1); + EXPECT_EQ(hnsw_index->indexSize(), 1); + { + SCOPED_TRACE("Full buffer; add vector directly to hnsw"); + auto stored_vec = hnsw_index->getStoredVectorDataByLabel(1); + const int8_t *stored_vec_data = reinterpret_cast(stored_vec.at(0).data()); + // the vector should be normalized. + ASSERT_NO_FATAL_FAILURE(CompareVectors(stored_vec_data, normalized_vec, dim)); + // The norm should be stored in the last position. + verify_norm(stored_vec_data, vector_norm); + + size_t k = 2; + ASSERT_NO_FATAL_FAILURE(runTopKSearchTest(index, query, k, verify_res)); + ASSERT_NO_FATAL_FAILURE(runRangeQueryTest(index, query, 100, verify_res, k, BY_SCORE)); + VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + ASSERT_NO_FATAL_FAILURE(runBatchIteratorSearchTest(batchIterator, k, verify_res)); + VecSimBatchIterator_Free(batchIterator); + } + + // ============== Scenario 3: + // blob correctness after transferred to the hnsw + + // Move the first vector to the hnsw index. + mock_thread_pool.thread_iteration(); + EXPECT_EQ(frontend_index->indexSize(), 0); + EXPECT_EQ(hnsw_index->indexSize(), 2); + { + SCOPED_TRACE("Execute insertion job"); + auto stored_vec = hnsw_index->getStoredVectorDataByLabel(0); + const int8_t *stored_vec_data = reinterpret_cast(stored_vec.at(0).data()); + // the vector should be normalized. + ASSERT_NO_FATAL_FAILURE(CompareVectors(stored_vec_data, normalized_vec, dim)); + // The norm should be stored in the last position. + verify_norm(stored_vec_data, vector_norm); + + size_t k = 2; + ASSERT_NO_FATAL_FAILURE(runTopKSearchTest(index, query, k, verify_res)); + ASSERT_NO_FATAL_FAILURE(runRangeQueryTest(index, query, 100, verify_res, k, BY_SCORE)); + VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + ASSERT_NO_FATAL_FAILURE(runBatchIteratorSearchTest(batchIterator, k, verify_res)); + VecSimBatchIterator_Free(batchIterator); + } +}