Skip to content

Commit 3202aed

Browse files
author
Hilik Yochai
committed
Initial version: a suggestion for the graph data interface
1 parent d2660c0 commit 3202aed

File tree

3 files changed

+617
-0
lines changed

3 files changed

+617
-0
lines changed
+179
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#pragma once
2+
3+
#include <deque>
4+
#include <memory>
5+
#include <cassert>
6+
#include <climits>
7+
#include <queue>
8+
#include <random>
9+
#include <iostream>
10+
#include <algorithm>
11+
#include <unordered_map>
12+
#include <sys/resource.h>
13+
#include <fstream>
14+
#include <shared_mutex>
15+
16+
#include "visited_nodes_handler.h"
17+
#include "VecSim/spaces/spaces.h"
18+
#include "VecSim/memory/vecsim_malloc.h"
19+
#include "VecSim/utils/vecsim_stl.h"
20+
#include "VecSim/utils/vec_utils.h"
21+
#include "VecSim/utils/data_block.h"
22+
#include "VecSim/utils/vecsim_results_container.h"
23+
#include "VecSim/query_result_definitions.h"
24+
#include "VecSim/vec_sim_common.h"
25+
#include "VecSim/vec_sim_index.h"
26+
#include "VecSim/tombstone_interface.h"
27+
28+
#ifdef BUILD_TESTS
29+
#include "hnsw_serialization_utils.h"
30+
#include "VecSim/utils/serializer.h"
31+
#endif
32+
33+
using std::pair;
34+
// Address of a node inside the layered graph: the element's internal id
// together with the level it is being looked at.
// NOTE(review): `ushort` is not a standard C++ type name; it presumably comes
// from a platform header - confirm it is available on every target.
using graphNodeType = pair<idType, ushort>; // represented as: (element_id, level)
35+
36+
37+
38+
class absEdges {
39+
public:
40+
absEdges();
41+
virtual ~absEdges();
42+
43+
virtual void push(idType id) = 0;
44+
45+
virtual bool removeIdIfExists(idType element_id) = 0;
46+
virtual void removeId(idType element_id) = 0;
47+
48+
virtual std::pair<size_t, const idType *> Get() = 0;
49+
virtual void Set(std::pair<size_t, const idType *> inp) = 0;
50+
51+
virtual void save(std::ofstream &output) ;
52+
virtual void restore(std::ifstream &input);
53+
};
54+
55+
56+
// vector metadata contains all the metadata of the vector;
57+
// this is replacing the id->metadata table and the element graph data
58+
//
59+
60+
struct VectorMetaData
61+
{
62+
enum Flags {
63+
DELETE_MARK = 0x1, // element is logically deleted, but still exists in the graph
64+
IN_PROCESS = 0x2, // element is being inserted into the graph
65+
PERMANENT_DELETE = 0x3,
66+
};
67+
VectorMetaData(const labelType &label, uint8_t max_level) :
68+
label_(label), max_level_(max_level), flags_(0) {}
69+
70+
// mark methods
71+
void mark(Flags flag) {
72+
flags_ |= flag;
73+
}
74+
void unmark(Flags flag) {
75+
flags_ &= ~flag;
76+
}
77+
bool ismarked(Flags flag) const {
78+
return flags_ & flag;
79+
}
80+
81+
labelType label_;
82+
uint8_t max_level_;
83+
std::atomic<uint8_t> flags_ = 0;
84+
std::mutex NodeGuard;
85+
};
86+
87+
88+
class WriteBatch;
89+
class absGraphData {
90+
public:
91+
absGraphData() {}
92+
virtual ~absGraphData() {};
93+
94+
// vector methods
95+
virtual const char *
96+
getVectorByInternalId(idType internal_id) const = 0;
97+
98+
virtual void
99+
multiGetVectors(const std::vector<idType> &,
100+
std::vector<const char *> &results) const = 0;
101+
102+
virtual idType
103+
pushVector(const void *vector_data,
104+
int max_level,
105+
const labelType &label,
106+
WriteBatch *wb) = 0;
107+
108+
// premanently delete the vector and the edges "free" the id
109+
virtual void
110+
deleteVectorAndEdges(idType internalId,
111+
WriteBatch *wb) = 0;
112+
113+
114+
// vectorMetaData methods
115+
virtual const VectorMetaData &
116+
vectorMetaDataById(idType internal_id) const = 0;
117+
118+
119+
virtual VectorMetaData &
120+
vectorMetaDataById(idType internal_id,
121+
WriteBatch *wb);
122+
123+
124+
125+
126+
127+
// outgoing edges
128+
virtual const absEdges &
129+
GetLevelOutgoingEdges(const graphNodeType &) const = 0;
130+
131+
virtual absEdges &
132+
GetLevelOutgoingEdges(const graphNodeType &,
133+
WriteBatch *) = 0;
134+
135+
136+
// inomming edges
137+
// fetch incoming from the database
138+
virtual const absEdges &
139+
GetLevelIncomingEdges(const graphNodeType &) const = 0;
140+
virtual absEdges &
141+
GetLevelIncomingEdges(const graphNodeType &,
142+
WriteBatch *) = 0;
143+
144+
// support only simple updates (add / delete target) operations
145+
// may not fetch the data from the database
146+
virtual absEdges &
147+
GetLevelVirtualIncomingEdges(const graphNodeType &id,
148+
WriteBatch *) = 0;
149+
// helper methods
150+
151+
// scan the database for the first node after starting id that exist at level
152+
virtual idType
153+
getVectorIdByLevel(short level,
154+
idType startingId) const = 0;
155+
156+
// get a pair of candidates to swap for the gc
157+
// first is a location that is permanent deleted
158+
// second is a location that is valid
159+
// start points is the last pair returned in the prev scan
160+
virtual idType
161+
getGarbadgeCollectionTarget(idType startPoint) const = 0;
162+
163+
// new and commit wrire batch
164+
virtual WriteBatch *newWriteBatch() = 0;
165+
virtual void CommitWriteBatch(WriteBatch *wb) = 0;
166+
167+
168+
virtual void shrinkToFit() = 0;
169+
170+
public:
171+
virtual void save(std::ofstream &output) const = 0;
172+
virtual void restore(std::ifstream &input) = 0;
173+
174+
protected:
175+
176+
177+
178+
};
179+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
#include "graph_data_ram.h"
2+
3+
struct LevelDataOnRam {
4+
static size_t max_num_outgoing_links;
5+
LevelDataOnRam(std::shared_ptr<VecSimAllocator> allocator) :
6+
incomingEdges(allocator), outgoingEdges() {
7+
}
8+
static size_t
9+
GetAllocationSizeBytes() {
10+
return sizeof(incomingEdges) + sizeof(outgoingEdges) +
11+
max_num_outgoing_links * sizeof(idType);
12+
}
13+
// currently only one size of level data
14+
IncomingEdges incomingEdges;
15+
OutgoingEdges outgoingEdges; // must be last
16+
17+
18+
};
19+
size_t LevelDataOnRam::max_num_outgoing_links;
20+
21+
struct VectorGraphData {
22+
VectorGraphData(std::shared_ptr<VecSimAllocator> allocator,
23+
size_t num_levels) :
24+
level0_data(allocator) {
25+
if (num_levels == 0) {
26+
others = nullptr;
27+
} else {
28+
others = (char *)allocator->callocate(
29+
LevelDataOnRam::GetAllocationSizeBytes() * num_levels);
30+
}
31+
}
32+
33+
34+
LevelDataOnRam &getLevelData(size_t level_num) {
35+
if (level_num == 0) return level0_data;
36+
// else
37+
return *(LevelDataOnRam *)
38+
(others + (level_num-1) * LevelDataOnRam::GetAllocationSizeBytes());
39+
}
40+
static size_t
41+
GetAllocationSizeBytes() {
42+
return sizeof(char *) + LevelDataOnRam::GetAllocationSizeBytes();
43+
};
44+
45+
char *others;
46+
// since level0_data has an variable size it must be last
47+
LevelDataOnRam level0_data;
48+
};
49+
50+
51+
// Builds an empty RAM-backed graph store.
//   allocator              - allocator handed to every internal container.
//   block_size             - number of elements per data block.
//   max_num_outgoing_links - stored in LevelDataOnRam::max_num_outgoing_links
//                            (a static, so it is shared by all instances).
//   vector_size_bytes      - size of one stored vector.
//   initial_capacity       - number of elements to reserve room for up front
//                            (0 means reserve nothing).
//   vector_alignment       - alignment passed to the vector blocks.
RamGraphData::RamGraphData(std::shared_ptr<VecSimAllocator> allocator,
                           size_t block_size,
                           size_t max_num_outgoing_links,
                           size_t vector_size_bytes,
                           size_t initial_capacity,
                           size_t vector_alignment)
    : vectorBlocks_(allocator),
      graphDataBlocks_(allocator),
      idToMetaData_(allocator),
      allocator_(allocator),
      block_size_(block_size),
      vector_size_bytes_(vector_size_bytes),
      vector_alignment_(vector_alignment) {
    LevelDataOnRam::max_num_outgoing_links = max_num_outgoing_links;
    if (initial_capacity) {
        // The block count below divides by the block size.
        assert(block_size_ > 0);
        idToMetaData_.reserve(initial_capacity);
        // Round up: a partially filled last block still needs its own slot.
        // Plain truncating division reserved nothing when
        // initial_capacity < block_size_.
        const size_t initial_blocks = initial_capacity / block_size_ +
                                      (initial_capacity % block_size_ != 0 ? 1 : 0);
        vectorBlocks_.reserve(initial_blocks);
        graphDataBlocks_.reserve(initial_blocks);
    }
}
73+
74+
75+
const char *
76+
RamGraphData::getVectorByInternalId(idType internal_id) const {
77+
return vectorBlocks_[internal_id / block_size_].getElement(internal_id % block_size_);
78+
}
79+
80+
81+
82+
// Appends a new vector together with its metadata and graph data.
//   vector_data - the vector to copy into the last vector block.
//   max_level   - top level of the element; also the number of upper levels
//                 allocated for its graph data.
//   label       - external label stored in the metadata.
//   wb          - not used by this implementation.
// Returns the internal id of the new element, i.e. its zero-based position
// across all blocks, which is the index getVectorByInternalId() and
// getGraphDataByInternalId() expect.
idType RamGraphData::pushVector(const void *vector_data,
                                int max_level,
                                const labelType &label,
                                WriteBatch *wb) {
    // Open a new block when there is none yet or the last one is full.
    if (vectorBlocks_.size() == 0 ||
        vectorBlocks_.back().getLength() == block_size_) {
        growByBlock();
    }

    // All blocks before the last are full, so the new id is
    // (#blocks - 1) * block_size + (elements already in the last block).
    // Using #blocks here would shift every id by one whole block.
    const idType ret = (vectorBlocks_.size() - 1) * block_size_ +
                       vectorBlocks_.back().getLength();

    // The metadata table is indexed by id; check it is in sync *before* the
    // new entry is appended.
    assert(idToMetaData_.size() == ret);
    idToMetaData_.push_back(new VectorMetaData(label, max_level));

    vectorBlocks_.back().addElement(vector_data);

    VectorGraphData tmp(allocator_, max_level);
    graphDataBlocks_.back().addElement(&tmp);

    return ret;
}
105+
106+
// outgoing edges
107+
// Read-only access to the outgoing edges of node `gn`
// (gn.first = element id, gn.second = level).
const absEdges &RamGraphData::GetLevelOutgoingEdges(const graphNodeType &gn) const {
    VectorGraphData *element = getGraphDataByInternalId(gn.first);
    return element->getLevelData(gn.second).outgoingEdges;
}
112+
113+
// Mutable access to the outgoing edges of node `gn`
// (gn.first = element id, gn.second = level). The write batch argument is
// ignored here.
absEdges &RamGraphData::GetLevelOutgoingEdges(const graphNodeType &gn,
                                              WriteBatch *) {
    VectorGraphData *element = getGraphDataByInternalId(gn.first);
    return element->getLevelData(gn.second).outgoingEdges;
}
119+
// incoming edges
120+
// Read-only access to the incoming edges of node `gn`
// (gn.first = element id, gn.second = level).
const absEdges &RamGraphData::GetLevelIncomingEdges(const graphNodeType &gn) const {
    VectorGraphData *element = getGraphDataByInternalId(gn.first);
    return element->getLevelData(gn.second).incomingEdges;
}
125+
126+
// Mutable access to the incoming edges of node `gn`
// (gn.first = element id, gn.second = level). The write batch argument is
// ignored here.
absEdges &RamGraphData::GetLevelIncomingEdges(const graphNodeType &gn,
                                              WriteBatch *) {
    VectorGraphData *element = getGraphDataByInternalId(gn.first);
    return element->getLevelData(gn.second).incomingEdges;
}
132+
133+
// Finds an element whose max_level_ equals `level`.
// The scan starts at `startingId`, runs to the end of the metadata table and
// then wraps around to cover the ids before `startingId`.
// Returns the first matching id, or idType(-1) when no element matches.
idType RamGraphData::getVectorIdByLevel(short level,
                                        idType startingId) const {
    const size_t count = idToMetaData_.size();
    // Clamp the start to the table size: otherwise the wrap-around pass would
    // walk ids in [count, startingId) and read metadata that does not exist.
    const idType start = startingId < count ? startingId : static_cast<idType>(count);

    // First pass: [start, count).
    for (idType i = start; i < count; i++) {
        if (vectorMetaDataById(i).max_level_ == level) {
            return i;
        }
    }
    // Second pass (wrap-around): [0, start).
    for (idType i = 0; i < start; i++) {
        if (vectorMetaDataById(i).max_level_ == level) {
            return i;
        }
    }
    return idType(-1);
}
150+
151+
// Scans forward from `startingId` and returns the first id whose metadata
// tests positive for VectorMetaData::PERMANENT_DELETE, or idType(-1) when
// the end of the metadata table is reached without a match.
idType RamGraphData::getGarbadgeCollectionTarget(idType startingId) const {
    idType candidate = startingId;
    while (candidate < idToMetaData_.size()) {
        const auto &meta = vectorMetaDataById(candidate);
        if (meta.ismarked(VectorMetaData::PERMANENT_DELETE)) {
            return candidate;
        }
        candidate++;
    }
    return idType(-1);
}
161+
162+
163+
// Returns the graph data record of `internal_id`, located by block index
// (id / block size) and position inside the block (id % block size).
VectorGraphData *RamGraphData::getGraphDataByInternalId(idType internal_id) const {
    const auto block_index = internal_id / block_size_;
    const auto index_in_block = internal_id % block_size_;
    return (VectorGraphData *)graphDataBlocks_[block_index].getElement(index_in_block);
}
169+
170+
// Appends one empty block to both the vector blocks and the graph data
// blocks, keeping the two containers the same length.
void RamGraphData::growByBlock() {
    // Preconditions: both containers are in step, and a new block is only
    // opened when there is no block yet or the last one is full.
    assert(vectorBlocks_.size() == graphDataBlocks_.size());
    assert(vectorBlocks_.size() == 0 ||
           vectorBlocks_.back().getLength() == block_size_);

    vectorBlocks_.emplace_back(block_size_, vector_size_bytes_, allocator_, vector_alignment_);

    const size_t graph_element_bytes = VectorGraphData::GetAllocationSizeBytes();
    graphDataBlocks_.emplace_back(block_size_, graph_element_bytes, allocator_);
}
182+
183+
// Drops the last block of both containers.
void RamGraphData::shrinkByBlock() {
    // Preconditions: the containers are in step, at least one block exists
    // and the last vector block holds no elements.
    assert(vectorBlocks_.size() == graphDataBlocks_.size());
    assert(vectorBlocks_.size() > 0);
    assert(vectorBlocks_.back().getLength() == 0);

    vectorBlocks_.pop_back();
    graphDataBlocks_.pop_back();
}
191+
192+
void RamGraphData::shrinkToFit() {
193+
while (vectorBlocks_.size() && vectorBlocks_.back().getLength() == 0) {
194+
shrinkByBlock();
195+
}
196+
vectorBlocks_.shrink_to_fit();
197+
graphDataBlocks_.shrink_to_fit();
198+
idToMetaData_.shrink_to_fit();
199+
}
200+
201+
202+
// Persistence hook for the RAM graph data.
// Not implemented yet: the body is empty, so nothing is written to the
// stream.
void RamGraphData::save(std::ofstream &) const {}
204+
205+
// Placeholder for restoring the graph data from a stream; the stream is
// currently left untouched.
// NOTE(review): as written this defines a *free function* named
// RamGraphGrestore. The name looks like a typo for `RamGraphData::restore`
// (the counterpart of RamGraphData::save), which would leave the member
// without a definition - confirm and rename.
void RamGraphGrestore(std::ifstream &) {
    // TBD
}
208+
209+

0 commit comments

Comments
 (0)