Skip to content

Commit ed4491f

Browse files
committed
Unpacker: key_cache option.
Heavily inspired by ruby/json#675.

When parsing documents with the same structure repeatedly, a lot of time can be saved by keeping a small cache of the map keys encountered.

Using the existing `bench/bench.rb`, comparing with and without the cache shows a 30% performance improvement:

```
Calculating -------------------------------------
unpack-pooled 960.380k (± 1.4%) i/s - 4.865M in 5.066600s
unpack-key-cache 1.245M (± 1.6%) i/s - 6.232M in 5.009060s

Comparison:
unpack-pooled: 960379.8 i/s
unpack-key-cache: 1244517.6 i/s - 1.30x (± 0.00) faster
```

However, on the same benchmark, but with the cache filled with other keys, the performance is notably degraded:

```
Calculating -------------------------------------
unpack-pooled 926.849k (± 2.1%) i/s - 4.639M in 5.007333s
unpack-key-cache 822.266k (± 2.4%) i/s - 4.113M in 5.004645s

Comparison:
unpack-pooled: 926849.2 i/s
unpack-key-cache: 822265.6 i/s - 1.13x (± 0.00) slower
```

So this feature is powerful but situational.
1 parent 83a2600 commit ed4491f

File tree

5 files changed

+176
-8
lines changed

5 files changed

+176
-8
lines changed

doclib/msgpack/unpacker.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ class Unpacker
1919
# Supported options:
2020
#
2121
# * *:symbolize_keys* deserialize keys of Hash objects as Symbol instead of String
22+
# * *:freeze* freeze the deserialized objects. Can allow string deduplication and some allocation elision.
23+
# * *:key_cache* enable caching of map keys. This can improve performance significantly if the same map keys are frequently encountered, but can also degrade performance if that's not the case.
2224
# * *:allow_unknown_ext* allow to deserialize ext type object with unknown type id as ExtensionValue instance. Otherwise (by default), unpacker throws UnknownExtTypeError.
2325
#
2426
# See also Buffer#initialize for other options.

ext/msgpack/buffer.h

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,4 +473,131 @@ static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_
473473
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
474474
}
475475

476+
// Hash keys are likely to be repeated, and are frozen.
477+
// As such we can re-use them if we keep a cache of the ones we've seen so far,
478+
// and save much more expensive lookups into the global fstring table.
479+
// This cache implementation is deliberately simple, as we're optimizing for compactness,
480+
// so that it can easily be embedded inside msgpack_unpacker_t.
481+
// As such, binary search into a sorted array gives a good tradeoff between compactness and
482+
// performance.
483+
#define MSGPACK_KEY_CACHE_CAPACITY 63
484+
485+
typedef struct msgpack_key_cache_t msgpack_key_cache_t;
486+
struct msgpack_key_cache_t {
487+
int length;
488+
VALUE entries[MSGPACK_KEY_CACHE_CAPACITY];
489+
};
490+
491+
static inline VALUE build_interned_string(const char *str, const long length)
492+
{
493+
# ifdef HAVE_RB_ENC_INTERNED_STR
494+
return rb_enc_interned_str(str, length, rb_utf8_encoding());
495+
# else
496+
VALUE rstring = rb_utf8_str_new(str, length);
497+
return rb_funcall(rb_str_freeze(rstring), s_uminus, 0);
498+
# endif
499+
}
500+
501+
// Builds the Symbol named by the first `length` bytes of `str`.
// The name goes through build_interned_string() first and is then handed
// to rb_str_intern().
static inline VALUE build_symbol(const char *str, const long length)
{
    VALUE name = build_interned_string(str, length);
    return rb_str_intern(name);
}
505+
506+
// Inserts `rstring` at position `index` of the cache, shifting the entries
// in [index, length) one slot to the right and bumping `length`.
//
// No capacity check is performed here: the caller must guarantee that
// cache->length < MSGPACK_KEY_CACHE_CAPACITY before calling.
static void rvalue_cache_insert_at(msgpack_key_cache_t *cache, int index, VALUE rstring)
{
    const int tail_count = cache->length - index;
    VALUE *slot = &cache->entries[index];

    // Regions overlap, hence MEMMOVE rather than a plain copy.
    MEMMOVE(slot + 1, slot, VALUE, tail_count);
    *slot = rstring;
    cache->length++;
}
512+
513+
// Three-way comparison between a raw byte span and a Ruby string, used to
// order and search the key cache.
//
// Ordering is by length first; spans of equal length are ordered by memcmp.
//
// str     - pointer to the raw key bytes.
// length  - number of bytes in `str`.
// rstring - Ruby string to compare against.
//
// Returns 0 when both have the same length and bytes, a negative value when
// the span sorts before `rstring`, and a positive value when it sorts after.
static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
    const long rstring_length = RSTRING_LEN(rstring);

    if (length != rstring_length) {
        // Only report the sign. Narrowing the `long` difference to `int`
        // could truncate it to 0 (a false match) or flip its sign when the
        // lengths differ by 2^31 or more.
        return (length < rstring_length) ? -1 : 1;
    }

    return memcmp(str, RSTRING_PTR(rstring), length);
}
522+
523+
// Returns the string key matching the first `length` bytes of `str`.
//
// The used part of cache->entries is binary searched with
// rstring_cache_cmp(). On a hit the cached entry is returned. On a miss a
// new string is built with build_interned_string() and, if the cache still
// has a free slot, inserted at its sorted position; when the cache is full
// the new string is returned without being cached.
static VALUE rstring_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
    int lo = 0;
    int hi = cache->length - 1;

    while (lo <= hi) {
        const int probe = (hi + lo) >> 1;
        VALUE candidate = cache->entries[probe];
        const int order = rstring_cache_cmp(str, length, candidate);

        if (order == 0) {
            return candidate;
        }

        if (order > 0) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    // Miss: build the key, and remember it if there is room left.
    VALUE rstring = build_interned_string(str, length);

    if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
        // When the search ends without a match, `lo` is the index of the
        // first entry that sorts after the key, i.e. the insertion point.
        rvalue_cache_insert_at(cache, lo, rstring);
    }

    return rstring;
}
555+
556+
// Returns the Symbol key matching the first `length` bytes of `str`.
//
// Same scheme as the string variant, except that the cache holds Symbols:
// each probed entry is converted with rb_sym2str() before being compared
// with rstring_cache_cmp(). On a miss the Symbol is built with
// build_symbol() and, if the cache still has a free slot, inserted at its
// sorted position; when the cache is full it is returned without being cached.
static VALUE rsymbol_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
    int lo = 0;
    int hi = cache->length - 1;

    while (lo <= hi) {
        const int probe = (hi + lo) >> 1;
        VALUE candidate = cache->entries[probe];
        const int order = rstring_cache_cmp(str, length, rb_sym2str(candidate));

        if (order == 0) {
            return candidate;
        }

        if (order > 0) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    // Miss: build the Symbol, and remember it if there is room left.
    VALUE rsymbol = build_symbol(str, length);

    if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
        // When the search ends without a match, `lo` is the index of the
        // first entry that sorts after the key, i.e. the insertion point.
        rvalue_cache_insert_at(cache, lo, rsymbol);
    }

    return rsymbol;
}
588+
589+
// Reads `length` bytes from the current read position of `b` as a Symbol
// map key, resolving it through `cache`, then marks those bytes as consumed.
//
// No bounds check is done here: the caller must ensure that `length` bytes
// are readable at b->read_buffer.
static inline VALUE msgpack_buffer_read_top_as_interned_symbol(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
    const char *head = b->read_buffer;
    VALUE symbol = rsymbol_cache_fetch(cache, head, length);

    _msgpack_buffer_consumed(b, length);
    return symbol;
}
595+
596+
// Reads `length` bytes from the current read position of `b` as a String
// map key, resolving it through `cache`, then marks those bytes as consumed.
//
// No bounds check is done here: the caller must ensure that `length` bytes
// are readable at b->read_buffer.
static inline VALUE msgpack_buffer_read_top_as_interned_string(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
    const char *head = b->read_buffer;
    VALUE string = rstring_cache_fetch(cache, head, length);

    _msgpack_buffer_consumed(b, length);
    return string;
}
602+
476603
#endif

ext/msgpack/unpacker.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,20 @@ void msgpack_unpacker_mark_stack(msgpack_unpacker_stack_t* stack)
130130
}
131131
}
132132

133+
/* GC mark helper for the key cache: calls rb_gc_mark() on each of the
 * `cache->length` entries currently in use. Unused slots are not visited. */
void msgpack_unpacker_mark_key_cache(msgpack_key_cache_t *cache)
{
    const VALUE *cursor = cache->entries;
    const VALUE *const end = cursor + cache->length;

    while (cursor < end) {
        rb_gc_mark(*cursor);
        cursor++;
    }
}
140+
133141
void msgpack_unpacker_mark(msgpack_unpacker_t* uk)
134142
{
135143
rb_gc_mark(uk->last_object);
136144
rb_gc_mark(uk->reading_raw);
137145
msgpack_unpacker_mark_stack(&uk->stack);
146+
msgpack_unpacker_mark_key_cache(&uk->key_cache);
138147
/* See MessagePack_Buffer_wrap */
139148
/* msgpack_buffer_mark(UNPACKER_BUFFER_(uk)); */
140149
rb_gc_mark(uk->buffer_ref);
@@ -374,15 +383,32 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
374383
size_t length = uk->reading_raw_remaining;
375384
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
376385
int ret;
377-
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
386+
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type)) {
378387
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY);
379388
ret = object_complete_symbol(uk, symbol);
389+
} else if (is_reading_map_key(uk) && raw_type == RAW_TYPE_STRING) {
390+
/* don't use zerocopy for hash keys but get a frozen string directly
391+
* because rb_hash_aset freezes keys and it causes copying */
392+
VALUE key;
393+
if (uk->symbolize_keys) {
394+
if (uk->use_key_cache) {
395+
key = msgpack_buffer_read_top_as_interned_symbol(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
396+
} else {
397+
key = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, true);
398+
}
399+
ret = object_complete_symbol(uk, key);
400+
} else {
401+
if (uk->use_key_cache) {
402+
key = msgpack_buffer_read_top_as_interned_string(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
403+
} else {
404+
key = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, true, true);
405+
}
406+
407+
ret = object_complete(uk, key);
408+
}
380409
} else {
381410
bool will_freeze = uk->freeze;
382411
if(raw_type == RAW_TYPE_STRING || raw_type == RAW_TYPE_BINARY) {
383-
/* don't use zerocopy for hash keys but get a frozen string directly
384-
* because rb_hash_aset freezes keys and it causes copying */
385-
will_freeze = will_freeze || is_reading_map_key(uk);
386412
VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, will_freeze, raw_type == RAW_TYPE_STRING);
387413
ret = object_complete(uk, string);
388414
} else {

ext/msgpack/unpacker.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct msgpack_unpacker_stack_t {
5050
struct msgpack_unpacker_t {
5151
msgpack_buffer_t buffer;
5252
msgpack_unpacker_stack_t stack;
53+
msgpack_key_cache_t key_cache;
5354

5455
VALUE self;
5556
VALUE last_object;
@@ -66,10 +67,12 @@ struct msgpack_unpacker_t {
6667

6768
/* options */
6869
int symbol_ext_type;
69-
bool symbolize_keys;
70-
bool freeze;
71-
bool allow_unknown_ext;
72-
bool optimized_symbol_ext_type;
70+
71+
bool use_key_cache: 1;
72+
bool symbolize_keys: 1;
73+
bool freeze: 1;
74+
bool allow_unknown_ext: 1;
75+
bool optimized_symbol_ext_type: 1;
7376
};
7477

7578
#define UNPACKER_BUFFER_(uk) (&(uk)->buffer)
@@ -101,6 +104,11 @@ static inline void msgpack_unpacker_set_symbolized_keys(msgpack_unpacker_t* uk,
101104
uk->symbolize_keys = enable;
102105
}
103106

107+
/* Option setter: records in `uk->use_key_cache` whether map keys should be
 * resolved through the unpacker's key cache (`enable` true) or not. */
static inline void msgpack_unpacker_set_key_cache(msgpack_unpacker_t* uk, bool enable)
{
    uk->use_key_cache = enable;
}
111+
104112
static inline void msgpack_unpacker_set_freeze(msgpack_unpacker_t* uk, bool enable)
105113
{
106114
uk->freeze = enable;

ext/msgpack/unpacker_class.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ static VALUE eUnknownExtTypeError;
3434
static VALUE mTypeError; // obsoleted. only for backward compatibility. See #86.
3535

3636
static VALUE sym_symbolize_keys;
37+
static VALUE sym_key_cache;
3738
static VALUE sym_freeze;
3839
static VALUE sym_allow_unknown_ext;
3940

@@ -128,6 +129,9 @@ VALUE MessagePack_Unpacker_initialize(int argc, VALUE* argv, VALUE self)
128129
if(options != Qnil) {
129130
VALUE v;
130131

132+
v = rb_hash_aref(options, sym_key_cache);
133+
msgpack_unpacker_set_key_cache(uk, RTEST(v));
134+
131135
v = rb_hash_aref(options, sym_symbolize_keys);
132136
msgpack_unpacker_set_symbolized_keys(uk, RTEST(v));
133137

@@ -413,6 +417,7 @@ void MessagePack_Unpacker_module_init(VALUE mMessagePack)
413417
eUnknownExtTypeError = rb_define_class_under(mMessagePack, "UnknownExtTypeError", eUnpackError);
414418

415419
sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
420+
sym_key_cache = ID2SYM(rb_intern("key_cache"));
416421
sym_freeze = ID2SYM(rb_intern("freeze"));
417422
sym_allow_unknown_ext = ID2SYM(rb_intern("allow_unknown_ext"));
418423

0 commit comments

Comments
 (0)