Skip to content

Meiravg_investigate_int8_anamaly #665

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 35 additions & 7 deletions src/VecSim/algorithms/brute_force/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
size_t indexSize() const override;
size_t indexCapacity() const override;
std::unique_ptr<RawDataContainer::Iterator> getVectorsIterator() const;
DataType *getDataByInternalId(idType id) const {
return (DataType *)this->vectors->getElement(id);
const DataType *getDataByInternalId(idType id) const {
return reinterpret_cast<const DataType *>(this->vectors->getElement(id));
}
VecSimQueryReply *topKQuery(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const override;
Expand Down Expand Up @@ -75,15 +75,41 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
virtual ~BruteForceIndex() = default;
#ifdef BUILD_TESTS
/**
* @brief Used for testing - store vector(s) data associated with a given label. This function
* copies the vector(s)' data buffer(s) and place it in the output vector
* @brief Used for testing - get only the vector elements associated with a given label.
* This function copies only the vector(s) elements into the output vector,
* without any additional metadata that might be stored with the vector(s).
*
* @param label
* @param vectors_output empty vector to be modified, should store the blob(s) associated with
* the label.
* Important: This method returns ONLY the vector elements, even if the stored vector contains
* additional metadata. For example, with int8_t/uint8_t vectors using cosine similarity,
* this method will NOT return the norm that is stored with the vector.
*
* If you need the complete data including any metadata, use getStoredVectorDataByLabel()
* instead.
*
* @param label The label to retrieve vector(s) elements for
* @param vectors_output Empty vector to be filled with vector(s)
*/
virtual void getDataByLabel(labelType label,
std::vector<std::vector<DataType>> &vectors_output) const = 0;

/**
* @brief Used for testing - get the complete raw data associated with a given label.
* This function returns the ENTIRE vector(s) data as stored in the index, including any
* additional metadata that might be stored alongside the vector elements. The data is
* returned as untyped bytes (char) rather than as DataType elements.
*
* For example:
* - For int8_t/uint8_t vectors with cosine similarity, this includes the norm stored at the end
* - For other vector types or future implementations, this will include any additional data
* that might be stored with the vector
*
* Use this method when you need access to the complete vector data as it is stored internally.
* To get only the vector elements, without metadata, use getDataByLabel() instead.
*
* NOTE(review): callers are expected to pass a label that exists in the index; the multi-value
* implementation dereferences the lookup result without checking for a missing label.
*
* @param label The label to retrieve data for
* @return One byte buffer per vector stored under the given label; each buffer is a copy of
* the complete stored data (elements + metadata) of that vector
*/
virtual std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const = 0;

void fitMemory() override {
if (count == 0) {
return;
Expand Down Expand Up @@ -351,6 +377,8 @@ BruteForceIndex<DataType, DistType>::newBatchIterator(const void *queryBlob,
auto *queryBlobCopy =
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));

// memcpy(queryBlobCopy, queryBlob, this->getDataSize());
this->preprocessQueryInPlace(queryBlobCopy);
// Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end.
return newBatchIterator_Instance(queryBlobCopy, queryParams);
Expand Down
20 changes: 20 additions & 0 deletions src/VecSim/algorithms/brute_force/brute_force_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,30 @@

for (idType id : ids->second) {
auto vec = std::vector<DataType>(this->dim);
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like
// the norm
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));
vectors_output.push_back(vec);
}
}

std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
std::vector<std::vector<char>> vectors_output;
auto ids = labelToIdsLookup.find(label);

Check warning on line 57 in src/VecSim/algorithms/brute_force/brute_force_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/brute_force/brute_force_multi.h#L55-L57

Added lines #L55 - L57 were not covered by tests

for (idType id : ids->second) {

Check warning on line 59 in src/VecSim/algorithms/brute_force/brute_force_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/brute_force/brute_force_multi.h#L59

Added line #L59 was not covered by tests
// Get the data pointer - need to cast to char* for memcpy
const char *data = reinterpret_cast<const char *>(this->getDataByInternalId(id));

Check warning on line 61 in src/VecSim/algorithms/brute_force/brute_force_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/brute_force/brute_force_multi.h#L61

Added line #L61 was not covered by tests

// Create a vector with the full data (including any metadata like norms)
std::vector<char> vec(this->getDataSize());
memcpy(vec.data(), data, this->getDataSize());
vectors_output.push_back(std::move(vec));

Check warning on line 66 in src/VecSim/algorithms/brute_force/brute_force_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/brute_force/brute_force_multi.h#L64-L66

Added lines #L64 - L66 were not covered by tests
}

return vectors_output;
}

Check warning on line 70 in src/VecSim/algorithms/brute_force/brute_force_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/brute_force/brute_force_multi.h#L69-L70

Added lines #L69 - L70 were not covered by tests

#endif
private:
// inline definitions
Expand Down
20 changes: 18 additions & 2 deletions src/VecSim/algorithms/brute_force/brute_force_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,33 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {

// Precondition: the caller KNOWS that the label exists in the index. The lookup result is
// dereferenced without being compared against end().
idType getIdOfLabel(labelType label) const {
    const auto entry = labelToIdLookup.find(label);
    return entry->second;
}

// #define BUILD_TESTS
#ifdef BUILD_TESTS
void getDataByLabel(labelType label,
                    std::vector<std::vector<DataType>> &vectors_output) const override {
    // Resolve the label first, so nothing is appended to the output if the lookup fails.
    const auto internal_id = labelToIdLookup.at(label);
    const void *stored_blob = this->getDataByInternalId(internal_id);

    // Take exactly dim elements from the stored blob. Anything stored after the elements
    // (e.g. a norm) is deliberately left out.
    std::vector<DataType> elements(this->dim);
    memcpy(elements.data(), stored_blob, this->dim * sizeof(DataType));
    vectors_output.push_back(elements);
}

std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
    const auto internal_id = labelToIdLookup.at(label);

    // View the stored blob as raw bytes so that it can be copied in full.
    const auto *stored_bytes =
        reinterpret_cast<const char *>(this->getDataByInternalId(internal_id));
    const auto blob_size = this->getDataSize();

    // A single-value index yields exactly one buffer: the full stored data, including any
    // metadata (like a norm) that follows the vector elements.
    std::vector<std::vector<char>> stored_blobs;
    stored_blobs.emplace_back(stored_bytes, stored_bytes + blob_size);
    return stored_blobs;
}
#endif
protected:
// inline definitions
Expand Down
38 changes: 32 additions & 6 deletions src/VecSim/algorithms/hnsw/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -301,15 +301,41 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,

#ifdef BUILD_TESTS
/**
* @brief Used for testing - store vector(s) data associated with a given label. This function
* copies the vector(s)' data buffer(s) and place it in the output vector
* @brief Used for testing - get only the vector elements associated with a given label.
* This function copies only the vector(s) elements into the output vector,
* without any additional metadata that might be stored with the vector.
*
* @param label
* @param vectors_output empty vector to be modified, should store the blob(s) associated with
* the label.
* Important: This method returns ONLY the vector elements, even if the stored vector contains
* additional metadata. For example, with int8_t/uint8_t vectors using cosine similarity,
* this method will NOT return the norm that is stored with the vector(s).
*
* If you need the complete data including any metadata, use getStoredVectorDataByLabel()
* instead.
*
* @param label The label to retrieve vector(s) elements for
* @param vectors_output Empty vector to be filled with vector(s)
*/
virtual void getDataByLabel(labelType label,
std::vector<std::vector<DataType>> &vectors_output) const = 0;

/**
* @brief Used for testing - get the complete raw data associated with a given label.
* This function returns the ENTIRE vector(s) data as stored in the index, including any
* additional metadata that might be stored alongside the vector elements. The data is
* returned as untyped bytes (char) rather than as DataType elements.
*
* For example:
* - For int8_t/uint8_t vectors with cosine similarity, this includes the norm stored at the end
* - For other vector types or future implementations, this will include any additional data
* that might be stored with the vector
*
* Use this method when you need access to the complete vector data as it is stored internally.
* To get only the vector elements, without metadata, use getDataByLabel() instead.
*
* NOTE(review): callers are expected to pass a label that exists in the index; the multi-value
* implementation dereferences the lookup result without checking for a missing label.
*
* @param label The label to retrieve data for
* @return One byte buffer per vector stored under the given label; each buffer is a copy of
* the complete stored data (elements + metadata) of that vector
*/
virtual std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const = 0;

void fitMemory() override {
if (maxElements > 0) {
idToMetaData.shrink_to_fit();
Expand Down Expand Up @@ -1559,7 +1585,7 @@ void HNSWIndex<DataType, DistType>::insertElementToGraph(idType element_id,
for (auto level = static_cast<int>(max_common_level); level >= 0; level--) {
candidatesMaxHeap<DistType> top_candidates =
searchLayer(curr_element, vector_data, level, efConstruction);
// If the entry point was marked deleted between iterations, we may recieve an empty
// If the entry point was marked deleted between iterations, we may receive an empty
// candidates set.
if (!top_candidates.empty()) {
curr_element = mutuallyConnectNewElement(element_id, top_candidates, level);
Expand Down
22 changes: 20 additions & 2 deletions src/VecSim/algorithms/hnsw/hnsw_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,28 @@

for (idType id : ids->second) {
auto vec = std::vector<DataType>(this->dim);
memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize);
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like
// the norm
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));

Check warning on line 77 in src/VecSim/algorithms/hnsw/hnsw_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/hnsw/hnsw_multi.h#L77

Added line #L77 was not covered by tests
vectors_output.push_back(vec);
}
}

std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
std::vector<std::vector<char>> vectors_output;
auto ids = labelLookup.find(label);

Check warning on line 84 in src/VecSim/algorithms/hnsw/hnsw_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/hnsw/hnsw_multi.h#L82-L84

Added lines #L82 - L84 were not covered by tests

for (idType id : ids->second) {
const char *data = this->getDataByInternalId(id);

Check warning on line 87 in src/VecSim/algorithms/hnsw/hnsw_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/hnsw/hnsw_multi.h#L86-L87

Added lines #L86 - L87 were not covered by tests

// Create a vector with the full data (including any metadata like norms)
std::vector<char> vec(this->dataSize);
memcpy(vec.data(), data, this->dataSize);
vectors_output.push_back(std::move(vec));

Check warning on line 92 in src/VecSim/algorithms/hnsw/hnsw_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/hnsw/hnsw_multi.h#L90-L92

Added lines #L90 - L92 were not covered by tests
}

return vectors_output;
}

Check warning on line 96 in src/VecSim/algorithms/hnsw/hnsw_multi.h

View check run for this annotation

Codecov / codecov/patch

src/VecSim/algorithms/hnsw/hnsw_multi.h#L95-L96

Added lines #L95 - L96 were not covered by tests
#endif
~HNSWIndex_Multi() = default;

Expand Down Expand Up @@ -201,7 +219,7 @@
VecSimQueryParams *queryParams) const {
auto queryBlobCopy =
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
memcpy(queryBlobCopy, queryBlob, this->getDataSize());
this->preprocessQueryInPlace(queryBlobCopy);
// Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
return new (this->allocator) HNSWMulti_BatchIterator<DataType, DistType>(
Expand Down
19 changes: 17 additions & 2 deletions src/VecSim/algorithms/hnsw/hnsw_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,24 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
auto id = labelLookup.at(label);

auto vec = std::vector<DataType>(this->dim);
memcpy(vec.data(), this->getDataByInternalId(id), this->dataSize);
// Only copy the vector data (dim * sizeof(DataType)), not any additional metadata like the
// norm
memcpy(vec.data(), this->getDataByInternalId(id), this->dim * sizeof(DataType));
vectors_output.push_back(vec);
}

std::vector<std::vector<char>> getStoredVectorDataByLabel(labelType label) const override {
    // Resolve the label to its internal id and locate the stored blob.
    const char *stored_bytes = this->getDataByInternalId(labelLookup.at(label));

    // A single-value index yields exactly one buffer: all dataSize bytes of the stored data,
    // including any metadata (like a norm) that follows the vector elements.
    std::vector<std::vector<char>> stored_blobs;
    stored_blobs.emplace_back(stored_bytes, stored_bytes + this->dataSize);
    return stored_blobs;
}
#endif
~HNSWIndex_Single() = default;

Expand Down Expand Up @@ -161,7 +176,7 @@ HNSWIndex_Single<DataType, DistType>::newBatchIterator(const void *queryBlob,
VecSimQueryParams *queryParams) const {
auto queryBlobCopy =
this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment());
memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
memcpy(queryBlobCopy, queryBlob, this->getDataSize());
this->preprocessQueryInPlace(queryBlobCopy);
// Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
return new (this->allocator) HNSWSingle_BatchIterator<DataType, DistType>(
Expand Down
Loading
Loading