From 25f3c3abac8bcc47cd49f1a3c33f2e8540a58d4f Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 15 Apr 2025 11:55:59 +0000 Subject: [PATCH 1/5] add size_t bg_vector_indexing_count; size_t main_vector_indexing_count; --- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index f868cd16d..319f92a4f 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -63,6 +63,8 @@ struct HNSWRepairJob : public AsyncJob { template class TieredHNSWIndex : public VecSimTieredIndex { private: + size_t bg_vector_indexing_count; + size_t main_vector_indexing_count; /// Mappings from id/label to associated jobs, for invalidating and update ids if necessary. // In MULTI, we can have more than one insert job pending per label. // **This map is protected with the flat buffer lock** @@ -633,9 +635,11 @@ TieredHNSWIndex::TieredHNSWIndex(HNSWIndex allocator) : VecSimTieredIndex(hnsw_index, bf_index, tiered_index_params, allocator), + bg_vector_indexing_count(0), main_vector_indexing_count(0), labelToInsertJobs(this->allocator), idToRepairJobs(this->allocator), idToSwapJob(this->allocator), invalidJobs(this->allocator), currInvalidJobId(0), readySwapJobs(0) { + // If the param for swapJobThreshold is 0 use the default value, if it exceeds the maximum // allowed, use the maximum value. this->pendingSwapJobsThreshold = @@ -708,6 +712,10 @@ size_t TieredHNSWIndex::indexLabelCount() const { template int TieredHNSWIndex::addVector(const void *blob, labelType label) { int ret = 1; + if (label == 999999) { + TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + "bg_vec_count: %zu, main_thread_vector_count: %zu", this->bg_vector_indexing_count, this->main_vector_indexing_count); + } auto hnsw_index = this->getHNSWIndex(); // writeMode is not protected since it is assumed to be called only from the "main thread" // (that is the thread that is exculusively calling add/delete vector). @@ -733,6 +741,10 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l // This will do nothing (and return 0) if this label doesn't exist. Otherwise, it may // remove vector from the flat buffer and/or the HNSW index. ret -= this->deleteVector(label); + if (ret != 1) { + TIERED_LOG(VecSimCommonStrings::LOG_VERBOSE_STRING, + "removed a vector while trying to insert label %zu. ret: %d", label, ret); + } } if (this->frontendIndex->indexSize() >= this->flatBufferLimit) { // We didn't remove a vector from flat buffer due to overwrite, insert the new vector @@ -742,11 +754,13 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l // index. auto storage_blob = this->frontendIndex->preprocessForStorage(blob); this->insertVectorToHNSW(hnsw_index, label, storage_blob.get()); + this->main_vector_indexing_count++; return ret; } // Otherwise, we fall back to the "regular" insertion into the flat buffer // (since it is not full anymore after removing the previous vector stored under the label). } + this->bg_vector_indexing_count++; this->flatIndexGuard.lock(); idType new_flat_id = this->frontendIndex->indexSize(); if (this->frontendIndex->isLabelExists(label) && !this->frontendIndex->isMultiValue()) { From ffe9e81e6a8c7931cd3c739251284a26f1d6f6c2 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Fri, 18 Apr 2025 04:37:55 +0000 Subject: [PATCH 2/5] fix datasize in executeInsertJob --- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 36 ++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index 319f92a4f..1c94a8e91 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -538,10 +538,11 @@ void TieredHNSWIndex::executeInsertJob(HNSWInsertJob *job) { HNSWIndex *hnsw_index = this->getHNSWIndex(); // Copy the vector blob from the flat buffer, so we can release the flat lock while we are // indexing the vector into HNSW index. + size_t data_size = this->frontendIndex->getDataSize(); auto blob_copy = this->getAllocator()->allocate_unique(this->frontendIndex->getDataSize()); memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id), - this->frontendIndex->getDim() * sizeof(DataType)); + data_size); this->insertVectorToHNSW(hnsw_index, job->label, blob_copy.get()); @@ -703,7 +704,7 @@ size_t TieredHNSWIndex::indexLabelCount() const { this->mainIndexGuard.unlock(); return output.size(); } - +#include "VecSim/vec_sim_debug.h" // In the tiered index, we assume that the blobs are processed by the flat buffer // before being transferred to the HNSW index. // When inserting vectors directly into the HNSW index—such as in VecSim_WriteInPlace mode— or when @@ -712,11 +713,40 @@ size_t TieredHNSWIndex::indexLabelCount() const { template int TieredHNSWIndex::addVector(const void *blob, labelType label) { int ret = 1; + auto hnsw_index = this->getHNSWIndex(); + if (label == 100000) { + std::string res("index connections: {"); + + res += "Entry Point Label: "; + res += std::to_string(hnsw_index->getEntryPointLabel()) + "\n"; + TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + "%s", res.c_str()); + for (idType id = 0; id < this->backendIndex->indexSize(); id++) { + std::string n_data("Node "); + labelType label = hnsw_index->getExternalLabel(id); + if (label == SIZE_MAX) + continue; // The ID is not in the index + int **neighbors_output; + VecSimDebug_GetElementNeighborsInHNSWGraph(this->backendIndex, label, &neighbors_output); + n_data += std::to_string(label) + ": "; + for (size_t l = 0; neighbors_output[l]; l++) { + n_data += "Level " + std::to_string(l) + " neighbors: "; + auto &neighbours = neighbors_output[l]; + auto neighbours_count = neighbours[0]; + for (size_t j = 1; j <= neighbours_count; j++) { + n_data += std::to_string(neighbours[j]) + ", "; + } + TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + "%s", n_data.c_str()); + } + VecSimDebug_ReleaseElementNeighborsInHNSWGraph(neighbors_output); + } + } if (label == 999999) { TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, "bg_vec_count: %zu, main_thread_vector_count: %zu", this->bg_vector_indexing_count, this->main_vector_indexing_count); } - auto hnsw_index = this->getHNSWIndex(); + // writeMode is not protected since it is assumed to be called only from the "main thread" // (that is the thread that is exculusively calling add/delete vector). if (this->getWriteMode() == VecSim_WriteInPlace) { From 64b3d3fdb38dcdad5c0b68a61f0fd8a9baa8b16a Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 27 Apr 2025 11:38:59 +0000 Subject: [PATCH 3/5] disable neighbours print --- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 56 ++++++++++++------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index 1c94a8e91..44edcfe8c 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -714,34 +714,34 @@ template int TieredHNSWIndex::addVector(const void *blob, labelType label) { int ret = 1; auto hnsw_index = this->getHNSWIndex(); - if (label == 100000) { - std::string res("index connections: {"); - - res += "Entry Point Label: "; - res += std::to_string(hnsw_index->getEntryPointLabel()) + "\n"; - TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, - "%s", res.c_str()); - for (idType id = 0; id < this->backendIndex->indexSize(); id++) { - std::string n_data("Node "); - labelType label = hnsw_index->getExternalLabel(id); - if (label == SIZE_MAX) - continue; // The ID is not in the index - int **neighbors_output; - VecSimDebug_GetElementNeighborsInHNSWGraph(this->backendIndex, label, &neighbors_output); - n_data += std::to_string(label) + ": "; - for (size_t l = 0; neighbors_output[l]; l++) { - n_data += "Level " + std::to_string(l) + " neighbors: "; - auto &neighbours = neighbors_output[l]; - auto neighbours_count = neighbours[0]; - for (size_t j = 1; j <= neighbours_count; j++) { - n_data += std::to_string(neighbours[j]) + ", "; - } - TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, - "%s", n_data.c_str()); - } - VecSimDebug_ReleaseElementNeighborsInHNSWGraph(neighbors_output); - } - } + // if (label == 100000) { + // std::string res("index connections: {"); + + // res += "Entry Point Label: "; + // res += std::to_string(hnsw_index->getEntryPointLabel()) + "\n"; + // TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + // "%s", res.c_str()); + // for (idType id = 0; id < this->backendIndex->indexSize(); id++) { + // std::string n_data("Node "); + // labelType label = hnsw_index->getExternalLabel(id); + // if (label == SIZE_MAX) + // continue; // The ID is not in the index + // int **neighbors_output; + // VecSimDebug_GetElementNeighborsInHNSWGraph(this->backendIndex, label, &neighbors_output); + // n_data += std::to_string(label) + ": "; + // for (size_t l = 0; neighbors_output[l]; l++) { + // n_data += "Level " + std::to_string(l) + " neighbors: "; + // auto &neighbours = neighbors_output[l]; + // auto neighbours_count = neighbours[0]; + // for (size_t j = 1; j <= neighbours_count; j++) { + // n_data += std::to_string(neighbours[j]) + ", "; + // } + // TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, + // "%s", n_data.c_str()); + // } + // VecSimDebug_ReleaseElementNeighborsInHNSWGraph(neighbors_output); + // } + // } if (label == 999999) { TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, "bg_vec_count: %zu, main_thread_vector_count: %zu", this->bg_vector_indexing_count, this->main_vector_indexing_count); From 94e5789662c47e2d32c8c99d6218ce9294b27c5d Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 28 Apr 2025 14:07:15 +0000 Subject: [PATCH 4/5] replace manual blob size calcaulations format comment out new tests --- .../algorithms/brute_force/brute_force.h | 2 +- .../brute_force/brute_force_multi.h | 2 +- .../brute_force/brute_force_single.h | 2 +- src/VecSim/algorithms/hnsw/hnsw_multi.h | 2 +- src/VecSim/algorithms/hnsw/hnsw_single.h | 2 +- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 21 ++-- src/VecSim/algorithms/svs/svs.h | 4 +- tests/unit/CMakeLists.txt | 2 +- tests/unit/test_int8.cpp | 114 ++++++++++++++++++ 9 files changed, 133 insertions(+), 18 deletions(-) diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index d4cddfcdf..54dde3a4b 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -350,7 +350,7 @@ BruteForceIndex::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto *queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end. return newBatchIterator_Instance(queryBlobCopy, queryParams); diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h index 999f5ac8b..1d0534855 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h @@ -45,7 +45,7 @@ class BruteForceIndex_Multi : public BruteForceIndex { for (idType id : ids->second) { auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); + memcpy(vec.data(), this->getDataByInternalId(id), this->getDataSize()); vectors_output.push_back(vec); } } diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h index 0c27615e9..585c71f21 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_single.h +++ b/src/VecSim/algorithms/brute_force/brute_force_single.h @@ -47,7 +47,7 @@ class BruteForceIndex_Single : public BruteForceIndex { auto id = labelToIdLookup.at(label); auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); + memcpy(vec.data(), this->getDataByInternalId(id), this->getDataSize()); vectors_output.push_back(vec); } #endif diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h index 972c981e4..23922df90 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_multi.h +++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h @@ -201,7 +201,7 @@ HNSWIndex_Multi::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. return new (this->allocator) HNSWMulti_BatchIterator( diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h index c9ef19ead..b8cadd634 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_single.h +++ b/src/VecSim/algorithms/hnsw/hnsw_single.h @@ -161,7 +161,7 @@ HNSWIndex_Single::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. return new (this->allocator) HNSWSingle_BatchIterator( diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index 44edcfe8c..92cfc5937 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -199,7 +199,7 @@ class TieredHNSWIndex : public VecSimTieredIndex { VecSimDebugInfoIterator *debugInfoIterator() const override; VecSimBatchIterator *newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const override { - size_t blobSize = this->frontendIndex->getDim() * sizeof(DataType); + size_t blobSize = this->frontendIndex->getDataSize(); void *queryBlobCopy = this->allocator->allocate(blobSize); memcpy(queryBlobCopy, queryBlob, blobSize); return new (this->allocator) @@ -539,10 +539,9 @@ void TieredHNSWIndex::executeInsertJob(HNSWInsertJob *job) { // Copy the vector blob from the flat buffer, so we can release the flat lock while we are // indexing the vector into HNSW index. size_t data_size = this->frontendIndex->getDataSize(); - auto blob_copy = this->getAllocator()->allocate_unique(this->frontendIndex->getDataSize()); + auto blob_copy = this->getAllocator()->allocate_unique(data_size); - memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id), - data_size); + memcpy(blob_copy.get(), this->frontendIndex->getDataByInternalId(job->id), data_size); this->insertVectorToHNSW(hnsw_index, job->label, blob_copy.get()); @@ -636,7 +635,7 @@ TieredHNSWIndex::TieredHNSWIndex(HNSWIndex allocator) : VecSimTieredIndex(hnsw_index, bf_index, tiered_index_params, allocator), - bg_vector_indexing_count(0), main_vector_indexing_count(0), + bg_vector_indexing_count(0), main_vector_indexing_count(0), labelToInsertJobs(this->allocator), idToRepairJobs(this->allocator), idToSwapJob(this->allocator), invalidJobs(this->allocator), currInvalidJobId(0), readySwapJobs(0) { @@ -727,9 +726,9 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l // if (label == SIZE_MAX) // continue; // The ID is not in the index // int **neighbors_output; - // VecSimDebug_GetElementNeighborsInHNSWGraph(this->backendIndex, label, &neighbors_output); - // n_data += std::to_string(label) + ": "; - // for (size_t l = 0; neighbors_output[l]; l++) { + // VecSimDebug_GetElementNeighborsInHNSWGraph(this->backendIndex, label, + // &neighbors_output); n_data += std::to_string(label) + ": "; for (size_t l = 0; + // neighbors_output[l]; l++) { // n_data += "Level " + std::to_string(l) + " neighbors: "; // auto &neighbours = neighbors_output[l]; // auto neighbours_count = neighbours[0]; @@ -744,7 +743,8 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l // } if (label == 999999) { TIERED_LOG(VecSimCommonStrings::LOG_WARNING_STRING, - "bg_vec_count: %zu, main_thread_vector_count: %zu", this->bg_vector_indexing_count, this->main_vector_indexing_count); + "bg_vec_count: %zu, main_thread_vector_count: %zu", + this->bg_vector_indexing_count, this->main_vector_indexing_count); } // writeMode is not protected since it is assumed to be called only from the "main thread" @@ -773,7 +773,8 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l ret -= this->deleteVector(label); if (ret != 1) { TIERED_LOG(VecSimCommonStrings::LOG_VERBOSE_STRING, - "removed a vector while trying to insert label %zu. ret: %d", label, ret); + "removed a vector while trying to insert label %zu. ret: %d", label, + ret); } } if (this->frontendIndex->indexSize() >= this->flatBufferLimit) { diff --git a/src/VecSim/algorithms/svs/svs.h b/src/VecSim/algorithms/svs/svs.h index 6210f6e83..0cdb1d7f3 100644 --- a/src/VecSim/algorithms/svs/svs.h +++ b/src/VecSim/algorithms/svs/svs.h @@ -146,7 +146,7 @@ class SVSIndex : public VecSimIndexAbstract, fl return MemoryUtils::unique_blob{const_cast(original_data), [](void *) {}}; } - const auto data_size = this->dim * sizeof(DataType) * n; + const auto data_size = this->getDataSize() * n; auto processed_blob = MemoryUtils::unique_blob{this->allocator->allocate(data_size), @@ -428,7 +428,7 @@ class SVSIndex : public VecSimIndexAbstract, fl VecSimQueryParams *queryParams) const override { auto *queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to VecSimBatchIterator that will free it at the end. if (indexSize() == 0) { diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f248705d4..63e245c2d 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -79,4 +79,4 @@ if(SVS_SUPPORTED) add_executable(test_svs ../utils/mock_thread_pool.cpp test_svs.cpp unit_test_utils.cpp) target_link_libraries(test_svs PUBLIC gtest_main VectorSimilarity) gtest_discover_tests(test_svs) -endif() \ No newline at end of file +endif() diff --git a/tests/unit/test_int8.cpp b/tests/unit/test_int8.cpp index aaf0f5d51..c377515f7 100644 --- a/tests/unit/test_int8.cpp +++ b/tests/unit/test_int8.cpp @@ -993,3 +993,117 @@ TEST_F(INT8TieredTest, getElementNeighbors) { HNSWParams params = {.dim = 4, .M = 20}; get_element_neighbors(params); } + +// /** +// * This test specifically targets the bug where memcpy in hnsw_tiered.h uses manually computed +// size +// * (dim * sizeof(DataType)) instead of getDataSize(), which doesn't account for additional data +// * stored alongside the vector. In the int8 + cosine configuration, a float norm is appended to +// the +// * vector data, and using the manual size truncates this norm during the copy. +// */ +// TEST_F(INT8TieredTest, TestInt8CosineVectorCopy) { +// // Create a tiered HNSW index with int8 vectors and cosine similarity +// HNSWParams params = { +// .type = VecSimType_INT8, +// .dim = 4, +// .metric = VecSimMetric_Cosine, +// }; + +// // Create the tiered index with a small flat buffer limit to ensure vectors move to HNSW +// quickly TieredIndexParams tiered_params = generate_tiered_params(params, 1, 1); +// SetUp(tiered_params); + +// // Create a test vector +// int8_t vector[4]; +// test_utils::populate_int8_vec(vector, 4, 42); + +// // Add the vector to the index +// VecSimIndex_AddVector(index, vector, 1); + +// // Verify the vector is in the flat buffer +// auto *flat_index = reinterpret_cast*>(CastToBruteForce()); +// auto *hnsw_index = CastToHNSW(); +// ASSERT_EQ(flat_index->indexSize(), 1); +// ASSERT_EQ(hnsw_index->indexSize(), 0); + +// // Get the vector from the flat buffer +// const int8_t *flat_vector = reinterpret_cast(flat_index->getDataByInternalId(0)); float vector_norm = +// spaces::IntegralType_ComputeNorm(flat_vector, dim); ASSERT_EQ(index_vector_norm, +// vector_norm) << "wrong vector norm for vector id:" << i; + +// // Run a thread iteration to move the vector from flat to HNSW +// mock_thread_pool.thread_iteration(); + +// // Verify the vector has been moved to HNSW +// ASSERT_EQ(flat_index->indexSize(), 0); +// ASSERT_EQ(hnsw_index->indexSize(), 1); + +// // Get the vector from HNSW +// const int8_t *hnsw_vector = reinterpret_cast(hnsw_index->getDataByInternalId(0)); + +// // Verify the norm is correctly preserved in the HNSW vector +// float norm_in_hnsw = *(reinterpret_cast(hnsw_vector + params.dim)); + +// // This would fail with the bug, as the norm would be truncated +// ASSERT_NEAR(norm_in_flat, norm_in_hnsw, 1e-5); + +// // Calculate the distance from the vector to itself in HNSW +// // For cosine distance to itself, this should be close to 0 +// // We don't use ASSERT_NEAR here because the distance calculation might have some numerical +// issues +// // that are not related to the bug we're testing +// } + +// /** +// * This test specifically targets the bug fix where memcpy was using the wrong size. +// * It demonstrates the difference between using the correct size (getDataSize()) +// * and the incorrect size (dim * sizeof(DataType)). +// */ +// TEST_F(INT8TieredTest, TestInt8CosineBugFix) { +// // Create a tiered HNSW index with int8 vectors and cosine similarity +// HNSWParams params = { +// .type = VecSimType_INT8, +// .dim = 4, +// .metric = VecSimMetric_Cosine, +// .multi = false +// }; + +// // Create the tiered index +// // TieredIndexParams tiered_params = generate_tiered_params(params); +// SetUp(params); + +// // Create a test vector +// int8_t vector[4]; +// test_utils::populate_int8_vec(vector, 4, 42); + +// // Add the vector to the index +// ASSERT_EQ(this->GenerateAndAddVector(i, i), 1); +// // VecSimIndex_AddVector(index, vector, 1); + +// // Get the flat index and the vector from it +// auto *flat_index = CastToBruteForce(); +// const int8_t *flat_vector = reinterpret_cast(flat_index->getDataByInternalId(0)); + +// // Verify the norm is present in the correct copy +// float norm_in_flat = *(reinterpret_cast(flat_vector + params.dim)); + +// ASSERT_NEAR(norm_in_flat, norm_in_correct_copy, 1e-5); + +// // Run a thread iteration to move the vector from flat to HNSW +// mock_thread_pool.thread_iteration(); + +// // Get the HNSW index and the vector from it +// auto *hnsw_index = CastToHNSW(); +// const int8_t *hnsw_vector = reinterpret_cast(hnsw_index->getDataByInternalId(0)); + +// // Verify the norm is correctly preserved in the HNSW vector +// float norm_in_hnsw = *(reinterpret_cast(hnsw_vector + params.dim)); + +// // This would fail with the bug, as the norm would be truncated +// ASSERT_NEAR(norm_in_flat, norm_in_hnsw, 1e-5); +// } From 352456528a6c8ed054395617a9f5111ea3fb0925 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Wed, 30 Apr 2025 04:24:29 +0000 Subject: [PATCH 5/5] INT8Test::PopulateRandomVector increases seed every call so we will get different vectors. this required a change in metrics_test because now the score is not exactly 0. VecSimIndexAbstract CastToBruteForce()-> rename to GetFlatBufferIndex add BruteForceIndex CastToBruteForce() add getFlatBufferIndexAsBruteForce to tiered index to get BruteForceIndex functions added to hnsw and BF: Formalize getDataByLabel definition: populate the intput empty vector with the vectors data,not including any additional meta data that might present in the vector storage like the norm. add getStoredVectorDataByLabel returns a vector of char vectors. each vector contains the vector as it is stored in the vectors container, including the meta data if exists. --- .../algorithms/brute_force/brute_force.h | 44 ++- .../brute_force/brute_force_multi.h | 22 +- .../brute_force/brute_force_single.h | 22 +- src/VecSim/algorithms/hnsw/hnsw.h | 38 ++- src/VecSim/algorithms/hnsw/hnsw_multi.h | 20 +- src/VecSim/algorithms/hnsw/hnsw_single.h | 17 +- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 12 +- src/VecSim/vec_sim_tiered_index.h | 3 + tests/unit/test_bruteforce.cpp | 4 +- tests/unit/test_int8.cpp | 259 ++++++++++-------- 10 files changed, 297 insertions(+), 144 deletions(-) diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index 54dde3a4b..42bf12654 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -40,8 +40,8 @@ class BruteForceIndex : public VecSimIndexAbstract { size_t indexSize() const override; size_t indexCapacity() const override; std::unique_ptr getVectorsIterator() const; - DataType *getDataByInternalId(idType id) const { - return (DataType *)this->vectors->getElement(id); + const DataType *getDataByInternalId(idType id) const { + return reinterpret_cast(this->vectors->getElement(id)); } VecSimQueryReply *topKQuery(const void *queryBlob, size_t k, VecSimQueryParams *queryParams) const override; @@ -75,15 +75,41 @@ class BruteForceIndex : public VecSimIndexAbstract { virtual ~BruteForceIndex() = default; #ifdef BUILD_TESTS /** - * @brief Used for testing - store vector(s) data associated with a given label. This function - * copies the vector(s)' data buffer(s) and place it in the output vector + * @brief Used for testing - get only the vector elements associated with a given label. + * This function copies only the vector(s) elements into the output vector, + * without any additional metadata that might be stored with the vector(s). * - * @param label - * @param vectors_output empty vector to be modified, should store the blob(s) associated with - * the label. + * Important: This method returns ONLY the vector elements, even if the stored vector contains + * additional metadata. For example, with int8_t/uint8_t vectors using cosine similarity, + * this method will NOT return the norm that is stored with the vector. + * + * If you need the complete data including any metadata, use getStoredVectorDataByLabel() + * instead. + * + * @param label The label to retrieve vector(s) elements for + * @param vectors_output Empty vector to be filled with vector(s) */ virtual void getDataByLabel(labelType label, std::vector> &vectors_output) const = 0; + + /** + * @brief Used for testing - get the complete raw data associated with a given label. + * This function returns the ENTIRE vector(s) data as stored in the index, including any + * additional metadata that might be stored alongside the vector elements. + * + * For example: + * - For int8_t/uint8_t vectors with cosine similarity, this includes the norm stored at the end + * - For other vector types or future implementations, this will include any additional data + * that might be stored with the vector + * + * Use this method when you need access to the complete vector data as it is stored internally. + * + * @param label The label to retrieve data for + * @return A vector containing the complete vector data (elements + metadata) for the given + * label + */ + virtual std::vector> getStoredVectorDataByLabel(labelType label) const = 0; + void fitMemory() override { if (count == 0) { return; @@ -350,7 +376,9 @@ BruteForceIndex::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto *queryBlobCopy = this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->getDataSize()); + memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); + + // memcpy(queryBlobCopy, queryBlob, this->getDataSize()); this->preprocessQueryInPlace(queryBlobCopy); // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end. return newBatchIterator_Instance(queryBlobCopy, queryParams); diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h index 1d0534855..bb6dcc2d8 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h @@ -45,10 +45,30 @@ class BruteForceIndex_Multi : public BruteForceIndex { for (idType id : ids->second) { auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->getDataSize()); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like + // the norm + memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto ids = labelToIdsLookup.find(label); + + for (idType id : ids->second) { + // Get the data pointer - need to cast to char* for memcpy + const char *data = reinterpret_cast(this->getDataByInternalId(id)); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->getDataSize()); + memcpy(vec.data(), data, this->getDataSize()); + vectors_output.push_back(std::move(vec)); + } + + return vectors_output; + } + #endif private: // inline definitions diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h index 585c71f21..96f6ef65d 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_single.h +++ b/src/VecSim/algorithms/brute_force/brute_force_single.h @@ -39,17 +39,33 @@ class BruteForceIndex_Single : public BruteForceIndex { // We call this when we KNOW that the label exists in the index. idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; } - +// #define BUILD_TESTS #ifdef BUILD_TESTS void getDataByLabel(labelType label, std::vector> &vectors_output) const override { - auto id = labelToIdLookup.at(label); auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->getDataSize()); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the + // norm + memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto id = labelToIdLookup.at(label); + + // Get the data pointer - need to cast to char* for memcpy + const char *data = reinterpret_cast(this->getDataByInternalId(id)); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->getDataSize()); + memcpy(vec.data(), data, this->getDataSize()); + vectors_output.push_back(std::move(vec)); + + return vectors_output; + } #endif protected: // inline definitions diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 61ca1fa2d..7510a66df 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -301,15 +301,41 @@ class HNSWIndex : public VecSimIndexAbstract, #ifdef BUILD_TESTS /** - * @brief Used for testing - store vector(s) data associated with a given label. This function - * copies the vector(s)' data buffer(s) and place it in the output vector + * @brief Used for testing - get only the vector elements associated with a given label. + * This function copies only the vector(s) elements into the output vector, + * without any additional metadata that might be stored with the vector. * - * @param label - * @param vectors_output empty vector to be modified, should store the blob(s) associated with - * the label. + * Important: This method returns ONLY the vector elements, even if the stored vector contains + * additional metadata. For example, with int8_t/uint8_t vectors using cosine similarity, + * this method will NOT return the norm that is stored with the vector(s). + * + * If you need the complete data including any metadata, use getStoredVectorDataByLabel() + * instead. + * + * @param label The label to retrieve vector(s) elements for + * @param vectors_output Empty vector to be filled with vector(s) */ virtual void getDataByLabel(labelType label, std::vector> &vectors_output) const = 0; + + /** + * @brief Used for testing - get the complete raw data associated with a given label. + * This function returns the ENTIRE vector(s) data as stored in the index, including any + * additional metadata that might be stored alongside the vector elements. + * + * For example: + * - For int8_t/uint8_t vectors with cosine similarity, this includes the norm stored at the end + * - For other vector types or future implementations, this will include any additional data + * that might be stored with the vector + * + * Use this method when you need access to the complete vector data as it is stored internally. + * + * @param label The label to retrieve data for + * @return A vector containing the complete vector data (elements + metadata) for the given + * label + */ + virtual std::vector> getStoredVectorDataByLabel(labelType label) const = 0; + void fitMemory() override { if (maxElements > 0) { idToMetaData.shrink_to_fit(); @@ -1559,7 +1585,7 @@ void HNSWIndex::insertElementToGraph(idType element_id, for (auto level = static_cast(max_common_level); level >= 0; level--) { candidatesMaxHeap top_candidates = searchLayer(curr_element, vector_data, level, efConstruction); - // If the entry point was marked deleted between iterations, we may recieve an empty + // If the entry point was marked deleted between iterations, we may receive an empty // candidates set. if (!top_candidates.empty()) { curr_element = mutuallyConnectNewElement(element_id, top_candidates, level); diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h index 23922df90..9048e9738 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_multi.h +++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h @@ -72,10 +72,28 @@ class HNSWIndex_Multi : public HNSWIndex { for (idType id : ids->second) { auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like + // the norm + memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto ids = labelLookup.find(label); + + for (idType id : ids->second) { + const char *data = this->getDataByInternalId(id); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->dataSize); + memcpy(vec.data(), data, this->dataSize); + vectors_output.push_back(std::move(vec)); + } + + return vectors_output; + } #endif ~HNSWIndex_Multi() = default; diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h index b8cadd634..2993f7c19 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_single.h +++ b/src/VecSim/algorithms/hnsw/hnsw_single.h @@ -48,9 +48,24 @@ class HNSWIndex_Single : public HNSWIndex { auto id = labelLookup.at(label); auto vec = std::vector(this->dim); - memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize); + // Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the + // norm + memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType)); vectors_output.push_back(vec); } + + std::vector> getStoredVectorDataByLabel(labelType label) const override { + std::vector> vectors_output; + auto id = labelLookup.at(label); + const char *data = this->getDataByInternalId(id); + + // Create a vector with the full data (including any metadata like norms) + std::vector vec(this->dataSize); + memcpy(vec.data(), data, this->dataSize); + vectors_output.push_back(std::move(vec)); + + return vectors_output; + } #endif ~HNSWIndex_Single() = default; diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index 92cfc5937..a537c35d9 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -235,6 +235,8 @@ class TieredHNSWIndex : public VecSimTieredIndex { #ifdef BUILD_TESTS void getDataByLabel(labelType label, std::vector> &vectors_output) const; + + std::vector> getStoredVectorDataByLabel(labelType label) const; #endif }; @@ -748,7 +750,7 @@ int TieredHNSWIndex::addVector(const void *blob, labelType l } // writeMode is not protected since it is assumed to be called only from the "main thread" - // (that is the thread that is exculusively calling add/delete vector). + // (that is the thread that is exclusively calling add/delete vector). if (this->getWriteMode() == VecSim_WriteInPlace) { // First, check if we need to overwrite the vector in-place for single (from both indexes). if (!this->backendIndex->isMultiValue()) { @@ -885,7 +887,7 @@ int TieredHNSWIndex::deleteVector(labelType label) { // Note that we may remove the same vector that has been removed from the flat index, if it was // being ingested at that time. // writeMode is not protected since it is assumed to be called only from the "main thread" - // (that is the thread that is exculusively calling add/delete vector). + // (that is the thread that is exclusively calling add/delete vector). if (this->getWriteMode() == VecSim_WriteAsync) { num_deleted_vectors += this->deleteLabelFromHNSW(label); // Apply ready swap jobs if number of deleted vectors reached the threshold @@ -1228,4 +1230,10 @@ void TieredHNSWIndex::getDataByLabel( labelType label, std::vector> &vectors_output) const { this->getHNSWIndex()->getDataByLabel(label, vectors_output); } + +template +std::vector> +TieredHNSWIndex::getStoredVectorDataByLabel(labelType label) const { + return this->getHNSWIndex()->getStoredVectorDataByLabel(label); +} #endif diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h index f166667e7..559737117 100644 --- a/src/VecSim/vec_sim_tiered_index.h +++ b/src/VecSim/vec_sim_tiered_index.h @@ -101,6 +101,9 @@ class VecSimTieredIndex : public VecSimIndexInterface { inline VecSimIndexAbstract *getFlatBufferIndex() { return this->frontendIndex; } + inline BruteForceIndex *getFlatBufferIndexAsBruteForce() { + return this->frontendIndex; + } inline size_t getFlatBufferLimit() { return this->flatBufferLimit; } virtual void fitMemory() override { diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 821af06a9..ac7db607b 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -80,7 +80,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_update_test) { ASSERT_EQ(bf_index->idToLabelMapping.size(), DEFAULT_BLOCK_SIZE); // Check update. - TEST_DATA_T *vector_data = bf_index->getDataByInternalId(0); + const TEST_DATA_T *vector_data = bf_index->getDataByInternalId(0); for (size_t i = 0; i < dim; ++i) { ASSERT_EQ(*vector_data, 2.0); ++vector_data; @@ -386,7 +386,7 @@ TYPED_TEST(BruteForceTest, test_delete_swap_block) { ASSERT_EQ(deleted_label_id_pair, bf_single_index->labelToIdLookup.end()); // The vector in index1 should hold id5 data. - TEST_DATA_T *vector_data = bf_index->getDataByInternalId(1); + const TEST_DATA_T *vector_data = bf_index->getDataByInternalId(1); for (size_t i = 0; i < dim; ++i) { ASSERT_EQ(*vector_data, 5); ++vector_data; diff --git a/tests/unit/test_int8.cpp b/tests/unit/test_int8.cpp index c377515f7..840e34836 100644 --- a/tests/unit/test_int8.cpp +++ b/tests/unit/test_int8.cpp @@ -7,6 +7,7 @@ #include "VecSim/vec_sim_debug.h" #include "VecSim/spaces/L2/L2.h" #include "VecSim/spaces/IP/IP.h" +#include "VecSim/spaces/normalize/normalize_naive.h" class INT8Test : public ::testing::Test { protected: @@ -38,7 +39,9 @@ class INT8Test : public ::testing::Test { virtual HNSWIndex *CastToHNSW() { return CastIndex>(); } - void PopulateRandomVector(int8_t *out_vec) { test_utils::populate_int8_vec(out_vec, dim); } + void PopulateRandomVector(int8_t *out_vec) { + test_utils::populate_int8_vec(out_vec, dim, current_seed++); + } int PopulateRandomAndAddVector(size_t id, int8_t *out_vec) { PopulateRandomVector(out_vec); return VecSimIndex_AddVector(index, out_vec, id); @@ -92,6 +95,7 @@ class INT8Test : public ::testing::Test { VecSimIndex *index; size_t dim; + int current_seed{0}; }; class INT8HNSWTest : public INT8Test { @@ -173,7 +177,7 @@ class INT8TieredTest : public INT8Test { virtual void TearDown() override {} virtual const void *GetDataByInternalId(idType id) override { - return CastIndex>(CastToBruteForce()) + return CastIndex>(GetFlatBufferIndex()) ->getDataByInternalId(id); } @@ -186,11 +190,16 @@ class INT8TieredTest : public INT8Test { return CastIndex>(CastToHNSW()); } - VecSimIndexAbstract *CastToBruteForce() { + VecSimIndexAbstract *GetFlatBufferIndex() { auto tiered_index = dynamic_cast *>(index); return tiered_index->getFlatBufferIndex(); } + BruteForceIndex *CastToBruteForce() { + auto tiered_index = dynamic_cast *>(index); + return tiered_index->getFlatBufferIndexAsBruteForce(); + } + int GenerateRandomAndAddVector(size_t id) override { int8_t v[dim]; PopulateRandomVector(v); @@ -384,7 +393,7 @@ void INT8Test::metrics_test(params_t index_params) { double expected_score = 0; auto verify_res = [&](size_t id, double score, size_t index) { - ASSERT_EQ(score, expected_score) << "failed at vector id:" << id; + ASSERT_NEAR(score, expected_score, 1e-6f) << "failed at vector id:" << id; }; for (size_t i = 0; i < n; i++) { @@ -729,7 +738,7 @@ void INT8TieredTest::test_info(bool is_multi) { VecSimIndexDebugInfo info = INT8Test::test_info(hnsw_params); ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); - VecSimIndexDebugInfo frontendIndexInfo = CastToBruteForce()->debugInfo(); + VecSimIndexDebugInfo frontendIndexInfo = GetFlatBufferIndex()->debugInfo(); VecSimIndexDebugInfo backendIndexInfo = CastToHNSW()->debugInfo(); compareCommonInfo(info.tieredInfo.frontendCommonInfo, frontendIndexInfo.commonInfo); @@ -837,7 +846,7 @@ void INT8TieredTest::test_info_iterator(VecSimMetric metric) { SetUp(params); VecSimIndexDebugInfo info = VecSimIndex_DebugInfo(index); VecSimDebugInfoIterator *infoIter = VecSimIndex_DebugInfoIterator(index); - VecSimIndexDebugInfo frontendIndexInfo = CastToBruteForce()->debugInfo(); + VecSimIndexDebugInfo frontendIndexInfo = GetFlatBufferIndex()->debugInfo(); VecSimIndexDebugInfo backendIndexInfo = CastToHNSW()->debugInfo(); VecSimDebugInfoIterator_Free(infoIter); } @@ -875,7 +884,7 @@ void INT8HNSWTest::test_serialization(bool is_multi) { int8_t data[n * dim]; for (size_t i = 0; i < n * dim; i += dim) { - test_utils::populate_int8_vec(data + i, dim, i); + this->PopulateRandomVector(data + i); } for (size_t j = 0; j < n; ++j) { @@ -994,116 +1003,126 @@ TEST_F(INT8TieredTest, getElementNeighbors) { get_element_neighbors(params); } -// /** -// * This test specifically targets the bug where memcpy in hnsw_tiered.h uses manually computed -// size -// * (dim * sizeof(DataType)) instead of getDataSize(), which doesn't account for additional data -// * stored alongside the vector. In the int8 + cosine configuration, a float norm is appended to -// the -// * vector data, and using the manual size truncates this norm during the copy. -// */ -// TEST_F(INT8TieredTest, TestInt8CosineVectorCopy) { -// // Create a tiered HNSW index with int8 vectors and cosine similarity -// HNSWParams params = { -// .type = VecSimType_INT8, -// .dim = 4, -// .metric = VecSimMetric_Cosine, -// }; - -// // Create the tiered index with a small flat buffer limit to ensure vectors move to HNSW -// quickly TieredIndexParams tiered_params = generate_tiered_params(params, 1, 1); -// SetUp(tiered_params); - -// // Create a test vector -// int8_t vector[4]; -// test_utils::populate_int8_vec(vector, 4, 42); - -// // Add the vector to the index -// VecSimIndex_AddVector(index, vector, 1); - -// // Verify the vector is in the flat buffer -// auto *flat_index = reinterpret_cast*>(CastToBruteForce()); -// auto *hnsw_index = CastToHNSW(); -// ASSERT_EQ(flat_index->indexSize(), 1); -// ASSERT_EQ(hnsw_index->indexSize(), 0); - -// // Get the vector from the flat buffer -// const int8_t *flat_vector = reinterpret_cast(flat_index->getDataByInternalId(0)); float vector_norm = -// spaces::IntegralType_ComputeNorm(flat_vector, dim); ASSERT_EQ(index_vector_norm, -// vector_norm) << "wrong vector norm for vector id:" << i; - -// // Run a thread iteration to move the vector from flat to HNSW -// mock_thread_pool.thread_iteration(); - -// // Verify the vector has been moved to HNSW -// ASSERT_EQ(flat_index->indexSize(), 0); -// ASSERT_EQ(hnsw_index->indexSize(), 1); - -// // Get the vector from HNSW -// const int8_t *hnsw_vector = reinterpret_cast(hnsw_index->getDataByInternalId(0)); - -// // Verify the norm is correctly preserved in the HNSW vector -// float norm_in_hnsw = *(reinterpret_cast(hnsw_vector + params.dim)); - -// // This would fail with the bug, as the norm would be truncated -// ASSERT_NEAR(norm_in_flat, norm_in_hnsw, 1e-5); - -// // Calculate the distance from the vector to itself in HNSW -// // For cosine distance to itself, this should be close to 0 -// // We don't use ASSERT_NEAR here because the distance calculation might have some numerical -// issues -// // that are not related to the bug we're testing -// } - -// /** -// * This test specifically targets the bug fix where memcpy was using the wrong size. -// * It demonstrates the difference between using the correct size (getDataSize()) -// * and the incorrect size (dim * sizeof(DataType)). -// */ -// TEST_F(INT8TieredTest, TestInt8CosineBugFix) { -// // Create a tiered HNSW index with int8 vectors and cosine similarity -// HNSWParams params = { -// .type = VecSimType_INT8, -// .dim = 4, -// .metric = VecSimMetric_Cosine, -// .multi = false -// }; - -// // Create the tiered index -// // TieredIndexParams tiered_params = generate_tiered_params(params); -// SetUp(params); - -// // Create a test vector -// int8_t vector[4]; -// test_utils::populate_int8_vec(vector, 4, 42); - -// // Add the vector to the index -// ASSERT_EQ(this->GenerateAndAddVector(i, i), 1); -// // VecSimIndex_AddVector(index, vector, 1); - -// // Get the flat index and the vector from it -// auto *flat_index = CastToBruteForce(); -// const int8_t *flat_vector = reinterpret_cast(flat_index->getDataByInternalId(0)); - -// // Verify the norm is present in the correct copy -// float norm_in_flat = *(reinterpret_cast(flat_vector + params.dim)); - -// ASSERT_NEAR(norm_in_flat, norm_in_correct_copy, 1e-5); - -// // Run a thread iteration to move the vector from flat to HNSW -// mock_thread_pool.thread_iteration(); - -// // Get the HNSW index and the vector from it -// auto *hnsw_index = CastToHNSW(); -// const int8_t *hnsw_vector = reinterpret_cast(hnsw_index->getDataByInternalId(0)); - -// // Verify the norm is correctly preserved in the HNSW vector -// float norm_in_hnsw = *(reinterpret_cast(hnsw_vector + params.dim)); - -// // This would fail with the bug, as the norm would be truncated -// ASSERT_NEAR(norm_in_flat, norm_in_hnsw, 1e-5); -// } +/** + * Tests int8_t vectors with cosine similarity in a tiered index across three scenarios: + * 1. Verifies vector data correctness when stored in the flat buffer + * 2. Verifies vector data correctness when inserted directly into HNSW (when flat buffer is full) + * 3. Verifies vector data correctness after transfer from flat buffer to HNSW + * + * For each scenario, the test confirms: + * - Vector data matches the expected normalized vector + * - The norm is correctly stored at the end of the vector + * - Search operations (topK, range, batch) return the expected results + */ + +TEST_F(INT8TieredTest, CosineBlobCorrectness) { + // Create TieredHNSW index with cosine metric + constexpr size_t dim = 4; + HNSWParams hnsw_params = {.dim = dim, .metric = VecSimMetric_Cosine}; + // Create tiered index with buffer limit set to 1. + TieredIndexParams tiered_params = this->generate_tiered_params(hnsw_params, 1, 1); + SetUp(tiered_params); + + auto frontend_index = this->CastToBruteForce(); + auto hnsw_index = this->CastToHNSW(); + + int8_t vector[dim]; + PopulateRandomVector(vector); + float vector_norm = spaces::IntegralType_ComputeNorm(vector, dim); + + auto verify_norm = [&](const int8_t *input_vector, float expected_norm) { + float vectors_stored_norm = *(reinterpret_cast(input_vector + dim)); + ASSERT_EQ(vectors_stored_norm, expected_norm) << "wrong vector norm"; + }; + + int8_t normalized_vec[dim + sizeof(float)]; + memcpy(normalized_vec, vector, dim); + spaces::integer_normalizeVector(normalized_vec, dim); + ASSERT_NO_FATAL_FAILURE(verify_norm(normalized_vec, vector_norm)); + + int8_t query[dim + sizeof(float)]; + PopulateRandomVector(query); + float query_norm = spaces::IntegralType_ComputeNorm(query, dim); + + // Calculate the expected score manually. + int ip = 0; + for (size_t i = 0; i < dim; i++) { + ip += vector[i] * query[i]; + } + float expected_score = 1.0 - (float(ip) / (vector_norm * query_norm)); + + auto verify_res = [&](size_t label, double score, size_t result_rank) { + ASSERT_EQ(score, expected_score) << "label: " << label; + }; + + // ============== Scenario 1: + // blob correctness in the flat buffer + + // Add a vector to the flat buffer. + VecSimIndex_AddVector(index, vector, 0); + { + SCOPED_TRACE("Store in the flat buffer"); + // Get the stored vector data including the norm + auto stored_vec = frontend_index->getStoredVectorDataByLabel(0); + const int8_t *stored_vec_data = reinterpret_cast(stored_vec.at(0).data()); + // the vector should be normalized. + ASSERT_NO_FATAL_FAILURE(CompareVectors(stored_vec_data, normalized_vec, dim)); + // The norm should be stored in the last position. + verify_norm(stored_vec_data, vector_norm); + + ASSERT_NO_FATAL_FAILURE(runTopKSearchTest(index, query, 1, verify_res)); + ASSERT_NO_FATAL_FAILURE(runRangeQueryTest(index, query, 0.5, verify_res, 1, BY_SCORE)); + VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + ASSERT_NO_FATAL_FAILURE(runBatchIteratorSearchTest(batchIterator, 1, verify_res)); + VecSimBatchIterator_Free(batchIterator); + } + + // ============== Scenario 2: + // blob correctness when inserted directly to the hnsw + + // Add another vector and exceed the flat buffer capacity. The vector should be stored directly + // in the hnsw index + VecSimIndex_AddVector(index, vector, 1); + EXPECT_EQ(frontend_index->indexSize(), 1); + EXPECT_EQ(hnsw_index->indexSize(), 1); + { + SCOPED_TRACE("Full buffer; add vector directly to hnsw"); + auto stored_vec = hnsw_index->getStoredVectorDataByLabel(1); + const int8_t *stored_vec_data = reinterpret_cast(stored_vec.at(0).data()); + // the vector should be normalized. + ASSERT_NO_FATAL_FAILURE(CompareVectors(stored_vec_data, normalized_vec, dim)); + // The norm should be stored in the last position. + verify_norm(stored_vec_data, vector_norm); + + size_t k = 2; + ASSERT_NO_FATAL_FAILURE(runTopKSearchTest(index, query, k, verify_res)); + ASSERT_NO_FATAL_FAILURE(runRangeQueryTest(index, query, 100, verify_res, k, BY_SCORE)); + VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + ASSERT_NO_FATAL_FAILURE(runBatchIteratorSearchTest(batchIterator, k, verify_res)); + VecSimBatchIterator_Free(batchIterator); + } + + // ============== Scenario 3: + // blob correctness after transferred to the hnsw + + // Move the first vector to the hnsw index. + mock_thread_pool.thread_iteration(); + EXPECT_EQ(frontend_index->indexSize(), 0); + EXPECT_EQ(hnsw_index->indexSize(), 2); + { + SCOPED_TRACE("Execute insertion job"); + auto stored_vec = hnsw_index->getStoredVectorDataByLabel(0); + const int8_t *stored_vec_data = reinterpret_cast(stored_vec.at(0).data()); + // the vector should be normalized. + ASSERT_NO_FATAL_FAILURE(CompareVectors(stored_vec_data, normalized_vec, dim)); + // The norm should be stored in the last position. + verify_norm(stored_vec_data, vector_norm); + + size_t k = 2; + ASSERT_NO_FATAL_FAILURE(runTopKSearchTest(index, query, k, verify_res)); + ASSERT_NO_FATAL_FAILURE(runRangeQueryTest(index, query, 100, verify_res, k, BY_SCORE)); + VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + ASSERT_NO_FATAL_FAILURE(runBatchIteratorSearchTest(batchIterator, k, verify_res)); + VecSimBatchIterator_Free(batchIterator); + } +}