Skip to content

Commit 69b6ea9

Browse files
committed
Merge pull request #80 from hendrik-cliqz/dictionary-merger
Dictionary merger optimizations
2 parents 6c4a787 + 74d2334 commit 69b6ea9

10 files changed

Lines changed: 134 additions & 38 deletions

File tree

keyvi/src/cpp/dictionary/dictionary_compiler.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,8 @@ class DictionaryCompiler
121121
void Add(const std::string& input_key, typename ValueStoreT::value_t value =
122122
ValueStoreT::no_value) {
123123

124-
sorter_.push(key_value_t(input_key, RegisterValue(value)));
125-
126124
size_of_keys_ += input_key.size();
125+
sorter_.push(key_value_t(std::move(input_key), RegisterValue(value)));
127126
}
128127

129128
#ifdef Py_PYTHON_H
@@ -157,7 +156,7 @@ class DictionaryCompiler
157156

158157
TRACE("adding to generator: %s", key_value.key.c_str());
159158

160-
generator_->Add(key_value.key, key_value.value);
159+
generator_->Add(std::move(key_value.key), key_value.value);
161160
++added_key_values_;
162161
if (progress_callback && (added_key_values_ % callback_trigger_ == 0)){
163162
progress_callback(added_key_values_, number_of_items_, user_data);

keyvi/src/cpp/dictionary/dictionary_merger.h

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,14 @@ final {
5050
int priority;
5151
bool operator<(const SegmentEntryForMerge& rhs) const
5252
{
53-
if (entry_iterator.GetKey() == rhs.entry_iterator.GetKey()) {
54-
return priority < rhs.priority;
55-
}
53+
// very important difference in semantics: we have to ensure that in case of equal key,
54+
// the iterator with the higher priority is taken
55+
56+
if (priority < rhs.priority) {
57+
return entry_iterator > rhs.entry_iterator;
58+
}
5659

57-
return entry_iterator > rhs.entry_iterator;
60+
return rhs.entry_iterator < entry_iterator;
5861
}
5962
};
6063

@@ -86,20 +89,23 @@ final {
8689

8790
fsa::Generator<PersistenceT, ValueStoreT> generator(1073741824, fsa::generator_param_t(), value_store);
8891

92+
std::string top_key;
93+
8994
while(!pqueue.empty()){
9095
auto entry_it = pqueue.top();
9196
pqueue.pop();
9297

93-
auto key = entry_it.entry_iterator.GetKey();
98+
top_key = entry_it.entry_iterator.GetKey();
9499

95100
// check for same keys and merge only the most recent one
96-
while (!pqueue.empty() and pqueue.top().entry_iterator.GetKey() == key) {
101+
while (!pqueue.empty() and pqueue.top().entry_iterator.operator==(top_key)) {
97102

98103
auto to_inc = pqueue.top();
99104
TRACE("removing element with prio %d (in favor of %d)", to_inc.priority, entry_it.priority);
100105

101106
pqueue.pop();
102107
if (++to_inc.entry_iterator != end_it) {
108+
TRACE("push iterator");
103109
pqueue.push(to_inc);
104110
}
105111
}
@@ -109,18 +115,21 @@ final {
109115

110116
// Todo: if inner weights are used update them
111117
//handle.weight = value_store_->GetWeightValue(value);
118+
handle.weight = 0;
112119

113120
handle.value_idx = value_store->GetValue(entry_it.entry_iterator.GetFsa()->GetValueStore()->GetValueStorePayload(),
114121
entry_it.entry_iterator.GetValueId(),
115122
handle.no_minimization);
116123

117-
TRACE("Add key: %s", key.c_str());
118-
generator.Add(key, handle);
124+
TRACE("Add key: %s", top_key.c_str());
125+
generator.Add(std::move(top_key), handle);
119126

120127
if (++entry_it.entry_iterator != end_it) {
121128
pqueue.push(entry_it);
122129
}
123130
}
131+
TRACE("finished iterating, do final compile.");
132+
124133
generator.CloseFeeding();
125134

126135
generator.WriteToFile(filename);

keyvi/src/cpp/dictionary/fsa/entry_iterator.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ class EntryIterator final{
4040

4141
public:
4242
EntryIterator()
43-
: current_state_(0),
44-
current_value_(0),
45-
fsa_(nullptr) {
43+
: fsa_(nullptr),
44+
current_state_(0),
45+
current_value_(0) {
4646
}
4747

4848
EntryIterator(automata_t f) : EntryIterator(f, f->GetStartState()) {}
@@ -107,6 +107,14 @@ class EntryIterator final{
107107
return !(operator==(other));
108108
}
109109

110+
bool operator==(const std::string& other_key) const {
111+
if (GetDepth() != other_key.size()) {
112+
return false;
113+
}
114+
115+
return std::memcmp(other_key.c_str(), (const char*) traversal_stack_.data(), other_key.size() == 0);
116+
}
117+
110118
bool operator<(const EntryIterator& other) const {
111119

112120
int compare_result = memcmp((const char*) other.traversal_stack_.data(),

keyvi/src/cpp/dictionary/fsa/generator.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ final {
236236
size_t commonPrefixLength = get_common_prefix_length(last_key_.c_str(), key);
237237

238238
if (commonPrefixLength == input_key.size() && last_key_.size() == input_key.size()) {
239-
last_key_ = key;
239+
last_key_ = std::move(input_key);
240240
return;
241241
}
242242

@@ -256,7 +256,7 @@ final {
256256
stack_->UpdateWeights(0, input_key.size() + 1, handle.weight);
257257
}
258258

259-
last_key_ = key;
259+
last_key_ = std::move(input_key);
260260
state_ = generator_state::FEEDING;
261261
}
262262

keyvi/src/cpp/dictionary/fsa/generator_adapter.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,11 @@ class GeneratorAdapter final: public GeneratorAdapterInterface<PersistenceT, Val
6161

6262
void Add(const std::string& input_key, typename ValueStoreT::value_t value =
6363
ValueStoreT::no_value) {
64-
generator_.Add(input_key, value);
64+
generator_.Add(std::move(input_key), value);
6565
}
6666

6767
void Add(const std::string& input_key, const fsa::ValueHandle& value) {
68-
generator_.Add(input_key, value);
68+
generator_.Add(std::move(input_key), value);
6969
}
7070

7171
size_t GetFsaSize() const {

keyvi/src/cpp/dictionary/fsa/internal/json_value_store.h

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -219,22 +219,47 @@ class JsonValueStore final : public IValueStoreWriter {
219219
friend class ::keyvi::dictionary::DictionaryMerger;
220220

221221
uint64_t GetValue(const char* payload, uint64_t fsa_value, bool& no_minimization){
222-
std::string packed_string = util::decodeVarintString(payload + fsa_value);
222+
size_t buffer_size;
223223

224-
return GetValue(util::DecodeJsonValue(packed_string), no_minimization);
224+
const char* full_buf = payload + fsa_value;
225+
const char* buf_ptr = util::decodeVarintString(full_buf, &buffer_size);
226+
227+
const RawPointerForCompare<MemoryMapManager> stp(buf_ptr, buffer_size,
228+
values_extern_);
229+
const RawPointer p = hash_.Get(stp);
230+
231+
if (!p.IsEmpty()) {
232+
// found the same value again, minimize
233+
TRACE("Minimized value");
234+
return p.GetOffset();
235+
} // else persist string value
236+
237+
no_minimization = true;
238+
TRACE("New unique value");
239+
++number_of_unique_values_;
240+
241+
uint64_t pt = static_cast<uint64_t>(values_buffer_size_);
242+
size_t full_buf_size = (buf_ptr-full_buf) + buffer_size;
243+
244+
values_extern_->Append((void*)full_buf,
245+
full_buf_size);
246+
values_buffer_size_ += full_buf_size;
247+
248+
hash_.Add(RawPointer(pt, stp.GetHashcode(), buffer_size));
249+
250+
return pt;
225251
}
226252

227253
uint64_t AddValue(){
228254
uint64_t pt = static_cast<uint64_t>(values_buffer_size_);
255+
size_t length;
229256

230-
dictionary::util::encodeVarint(string_buffer_.size(), *values_extern_);
231-
values_buffer_size_ += util::getVarintLength(string_buffer_.size());
232-
257+
dictionary::util::encodeVarint(string_buffer_.size(), *values_extern_, &length);
258+
values_buffer_size_ += length;
233259
values_extern_->Append((void*)string_buffer_.data(),
234260
string_buffer_.size());
235261
values_buffer_size_ += string_buffer_.size();
236262

237-
TRACE("add value to hash at %d, length %d", pt, string_buffer_.size());
238263
return pt;
239264
}
240265

keyvi/src/cpp/dictionary/fsa/internal/sparse_array_builder.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ class SparseArrayBuilder<SparseArrayPersistence<uint16_t>, OffsetTypeT, HashCode
362362

363363
// else overflow encoding
364364
uint16_t vshort_pointer[8];
365-
uint8_t vshort_size = 0;
365+
size_t vshort_size = 0;
366366

367367
util::encodeVarshort(transitionPointer_high, vshort_pointer, &vshort_size);
368368

@@ -441,7 +441,7 @@ class SparseArrayBuilder<SparseArrayPersistence<uint16_t>, OffsetTypeT, HashCode
441441
FINAL_OFFSET_CODE, 0);
442442

443443
uint16_t vshort_pointer[8];
444-
uint8_t vshort_size = 0;
444+
size_t vshort_size = 0;
445445

446446
util::encodeVarshort(value, vshort_pointer, &vshort_size);
447447

keyvi/src/cpp/dictionary/util/vint.h

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ namespace util {
4040
* @parma outputSizePtr A pointer to a single integer that is set to the number of bytes used in the output memory.
4141
*/
4242
template<typename int_t = uint64_t>
43-
void encodeVarint(int_t value, uint8_t* output, uint8_t* outputSizePtr) {
44-
uint8_t outputSize = 0;
43+
void encodeVarint(int_t value, uint8_t* output, size_t* outputSizePtr) {
44+
size_t outputSize = 0;
4545
//While more than 7 bits of data are left, occupy the last output byte
4646
// and set the next byte flag
4747
while (value > 127) {
@@ -63,8 +63,8 @@ void encodeVarint(int_t value, uint8_t* output, uint8_t* outputSizePtr) {
6363
* @parma outputSizePtr A pointer to a single integer that is set to the number of bytes used in the output memory.
6464
*/
6565
template<typename int_t = uint64_t>
66-
void encodeVarshort(int_t value, uint16_t* output, uint8_t* outputSizePtr) {
67-
uint8_t outputSize = 0;
66+
void encodeVarshort(int_t value, uint16_t* output, size_t* outputSizePtr) {
67+
size_t outputSize = 0;
6868
//While more than 15 bits of data are left, occupy the last output byte
6969
// and set the next byte flag
7070
while (value > 32767) {
@@ -115,16 +115,19 @@ size_t getVarshortLength(int_t value){
115115
* @param output the buffer to write to
116116
*/
117117
template<typename int_t = uint64_t, typename buffer_t>
118-
void encodeVarint(int_t value, buffer_t& output) {
118+
void encodeVarint(int_t value, buffer_t& output, size_t* written_bytes) {
119119
//While more than 7 bits of data are left, occupy the last output byte
120120
// and set the next byte flag
121+
size_t length = 0;
121122
while (value > 127) {
122123
//|128: Set the next byte flag
123124
output.push_back(((uint8_t) (value & 127)) | 128);
124125
//Remove the seven bits we just wrote
125126
value >>= 7;
127+
++length;
126128
}
127129
output.push_back(((uint8_t) value) & 127);
130+
*written_bytes = ++length;
128131
}
129132

130133
/**
@@ -178,7 +181,7 @@ inline size_t skipVarint(const char* input) {
178181

179182
inline std::string decodeVarintString(const char* input){
180183
uint64_t length = 0;
181-
uint8_t i = 0;
184+
size_t i = 0;
182185

183186
for (;; i++) {
184187
length |= (input[i] & 127) << (7 * i);
@@ -192,6 +195,24 @@ inline std::string decodeVarintString(const char* input){
192195
return std::string(input + i + 1, length);
193196
}
194197

198+
inline const char* decodeVarintString(const char* input, size_t* length_ptr) {
199+
size_t length = 0;
200+
size_t i = 0;
201+
202+
for (;; i++) {
203+
length |= (input[i] & 127) << (7 * i);
204+
205+
//If the next-byte flag is set
206+
if (!(input[i] & 128)) {
207+
break;
208+
}
209+
}
210+
211+
*length_ptr = length;
212+
return input + i + 1;
213+
214+
}
215+
195216

196217
} /* namespace util */
197218
} /* namespace dictionary */

keyvi/tests/cpp/dictionary/dictionary_merger_test.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ BOOST_AUTO_TEST_CASE ( MergeIntegerDictsValueMerge) {
136136

137137
std::vector<std::pair<std::string, uint32_t>> test_data2 = {
138138
{ "abc", 25 },
139+
{ "abbc", 42 },
139140
{ "abcd", 21 },
140141
{ "abbc", 30 },
141142
};
@@ -178,6 +179,8 @@ BOOST_AUTO_TEST_CASE ( MergeIntegerDictsValueMerge) {
178179

179180
BOOST_CHECK_EQUAL("22", d2->operator[]("abc").GetValueAsString());
180181
BOOST_CHECK_EQUAL("21", d2->operator[]("abcd").GetValueAsString());
182+
183+
// overwritten by 2nd
181184
BOOST_CHECK_EQUAL("24", d2->operator[]("abbc").GetValueAsString());
182185

183186
std::remove(filename.c_str());
@@ -218,6 +221,7 @@ BOOST_AUTO_TEST_CASE ( MergeStringDicts) {
218221

219222
std::vector<std::pair<std::string, std::string>> test_data2 = {
220223
{ "abbe", "d" },
224+
{ "abbc", "z" },
221225
{ "abcd", "a" },
222226
{ "bbacd", "f" },
223227
};
@@ -241,7 +245,9 @@ BOOST_AUTO_TEST_CASE ( MergeStringDicts) {
241245
BOOST_CHECK(d->Contains("bba"));
242246

243247
BOOST_CHECK_EQUAL("a", d->operator[]("abc").GetValueAsString());
244-
BOOST_CHECK_EQUAL("b", d->operator[]("abbc").GetValueAsString());
248+
249+
// overwritten by 2nd
250+
BOOST_CHECK_EQUAL("z", d->operator[]("abbc").GetValueAsString());
245251
BOOST_CHECK_EQUAL("c", d->operator[]("abbcd").GetValueAsString());
246252
BOOST_CHECK_EQUAL("a", d->operator[]("abcde").GetValueAsString());
247253
BOOST_CHECK_EQUAL("b", d->operator[]("abdd").GetValueAsString());
@@ -271,6 +277,7 @@ BOOST_AUTO_TEST_CASE ( MergeJsonDicts) {
271277

272278
std::vector<std::pair<std::string, std::string>> test_data2 = {
273279
{ "abbe", "{d:4}" },
280+
{ "abbc", "{b:3}" },
274281
{ "abcd", "{a:1}" },
275282
{ "bbacd", "{f:5}" },
276283
};
@@ -294,7 +301,9 @@ BOOST_AUTO_TEST_CASE ( MergeJsonDicts) {
294301
BOOST_CHECK(d->Contains("bba"));
295302

296303
BOOST_CHECK_EQUAL("\"{a:1}\"", d->operator[]("abc").GetValueAsString());
297-
BOOST_CHECK_EQUAL("\"{b:2}\"", d->operator[]("abbc").GetValueAsString());
304+
305+
// overwritten by 2nd
306+
BOOST_CHECK_EQUAL("\"{b:3}\"", d->operator[]("abbc").GetValueAsString());
298307
BOOST_CHECK_EQUAL("\"{c:3}\"", d->operator[]("abbcd").GetValueAsString());
299308
BOOST_CHECK_EQUAL("\"{a:1}\"", d->operator[]("abcde").GetValueAsString());
300309
BOOST_CHECK_EQUAL("\"{b:2}\"", d->operator[]("abdd").GetValueAsString());

0 commit comments

Comments
 (0)