Skip to content

Unpacker: key_cache option. #378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doclib/msgpack/unpacker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Unpacker
# Supported options:
#
# * *:symbolize_keys* deserialize keys of Hash objects as Symbol instead of String
# * *:freeze* freeze the deserialized objects. Can allow string deduplication and some allocation elision.
# * *:key_cache* Enable caching of map keys, this can improve performance significantly if the same map keys are frequently encountered, but also degrade performance if that's not the case.
# * *:allow_unknown_ext* allow to deserialize ext type object with unknown type id as ExtensionValue instance. Otherwise (by default), unpacker throws UnknownExtTypeError.
#
# See also Buffer#initialize for other options.
Expand Down
127 changes: 127 additions & 0 deletions ext/msgpack/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,4 +473,131 @@ static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
}

// Hash keys are likely to be repeated, and are frozen.
// As such we can re-use them if we keep a cache of the ones we've seen so far,
// and save much more expensive lookups into the global fstring table.
// This cache implementation is deliberately simple, as we're optimizing for compactness,
// to be able to fit easily embeded inside msgpack_unpacker_t.
// As such, binary search into a sorted array gives a good tradeoff between compactness and
// performance.
#define MSGPACK_KEY_CACHE_CAPACITY 63

typedef struct msgpack_key_cache_t msgpack_key_cache_t;
struct msgpack_key_cache_t {
int length;
VALUE entries[MSGPACK_KEY_CACHE_CAPACITY];
};

static inline VALUE build_interned_string(const char *str, const long length)
{
# ifdef HAVE_RB_ENC_INTERNED_STR
return rb_enc_interned_str(str, length, rb_utf8_encoding());
# else
VALUE rstring = rb_utf8_str_new(str, length);
return rb_funcall(rb_str_freeze(rstring), s_uminus, 0);
# endif
}

static inline VALUE build_symbol(const char *str, const long length)
{
return rb_str_intern(build_interned_string(str, length));
}

static void rvalue_cache_insert_at(msgpack_key_cache_t *cache, int index, VALUE rstring)
{
MEMMOVE(&cache->entries[index + 1], &cache->entries[index], VALUE, cache->length - index);
cache->length++;
cache->entries[index] = rstring;
}

static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
long rstring_length = RSTRING_LEN(rstring);
if (length == rstring_length) {
return memcmp(str, RSTRING_PTR(rstring), length);
} else {
return (int)(length - rstring_length);
}
}

static VALUE rstring_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
int low = 0;
int high = cache->length - 1;
int mid = 0;
int last_cmp = 0;

while (low <= high) {
mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
last_cmp = rstring_cache_cmp(str, length, entry);

if (last_cmp == 0) {
return entry;
} else if (last_cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}

VALUE rstring = build_interned_string(str, length);

if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
if (last_cmp > 0) {
mid += 1;
}

rvalue_cache_insert_at(cache, mid, rstring);
}
return rstring;
}

static VALUE rsymbol_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
int low = 0;
int high = cache->length - 1;
int mid = 0;
int last_cmp = 0;

while (low <= high) {
mid = (high + low) >> 1;
VALUE entry = cache->entries[mid];
last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));

if (last_cmp == 0) {
return entry;
} else if (last_cmp > 0) {
low = mid + 1;
} else {
high = mid - 1;
}
}

VALUE rsymbol = build_symbol(str, length);

if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
if (last_cmp > 0) {
mid += 1;
}

rvalue_cache_insert_at(cache, mid, rsymbol);
}
return rsymbol;
}

static inline VALUE msgpack_buffer_read_top_as_interned_symbol(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
VALUE result = rsymbol_cache_fetch(cache, b->read_buffer, length);
_msgpack_buffer_consumed(b, length);
return result;
}

static inline VALUE msgpack_buffer_read_top_as_interned_string(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
VALUE result = rstring_cache_fetch(cache, b->read_buffer, length);
_msgpack_buffer_consumed(b, length);
return result;
}

#endif
1 change: 1 addition & 0 deletions ext/msgpack/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
have_func("rb_enc_interned_str", "ruby.h") # Ruby 3.0+
have_func("rb_hash_new_capa", "ruby.h") # Ruby 3.2+
have_func("rb_proc_call_with_block", "ruby.h") # CRuby (TruffleRuby doesn't have it)
have_func("rb_gc_mark_locations", "ruby.h") # Missing on TruffleRuby

append_cflags([
"-fvisibility=hidden",
Expand Down
45 changes: 41 additions & 4 deletions ext/msgpack/unpacker.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@
#define rb_proc_call_with_block(recv, argc, argv, block) rb_funcallv(recv, rb_intern("call"), argc, argv)
#endif

#ifndef HAVE_RB_GC_MARK_LOCATIONS
// For TruffleRuby
void rb_gc_mark_locations(const VALUE *start, const VALUE *end)
{
VALUE *value = start;

while (value < end) {
rb_gc_mark(*value);
value++;
}
}
#endif

struct protected_proc_call_args {
VALUE proc;
int argc;
Expand Down Expand Up @@ -130,11 +143,18 @@ void msgpack_unpacker_mark_stack(msgpack_unpacker_stack_t* stack)
}
}

void msgpack_unpacker_mark_key_cache(msgpack_key_cache_t *cache)
{
const VALUE *entries = &cache->entries[0];
rb_gc_mark_locations(entries, entries + cache->length);
}

void msgpack_unpacker_mark(msgpack_unpacker_t* uk)
{
rb_gc_mark(uk->last_object);
rb_gc_mark(uk->reading_raw);
msgpack_unpacker_mark_stack(&uk->stack);
msgpack_unpacker_mark_key_cache(&uk->key_cache);
/* See MessagePack_Buffer_wrap */
/* msgpack_buffer_mark(UNPACKER_BUFFER_(uk)); */
rb_gc_mark(uk->buffer_ref);
Expand Down Expand Up @@ -374,15 +394,32 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
size_t length = uk->reading_raw_remaining;
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
int ret;
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type)) {
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY);
ret = object_complete_symbol(uk, symbol);
} else if (is_reading_map_key(uk) && raw_type == RAW_TYPE_STRING) {
/* don't use zerocopy for hash keys but get a frozen string directly
* because rb_hash_aset freezes keys and it causes copying */
VALUE key;
if (uk->symbolize_keys) {
if (uk->use_key_cache) {
key = msgpack_buffer_read_top_as_interned_symbol(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
} else {
key = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, true);
}
ret = object_complete_symbol(uk, key);
} else {
if (uk->use_key_cache) {
key = msgpack_buffer_read_top_as_interned_string(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
} else {
key = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, true, true);
}

ret = object_complete(uk, key);
}
} else {
bool will_freeze = uk->freeze;
if(raw_type == RAW_TYPE_STRING || raw_type == RAW_TYPE_BINARY) {
/* don't use zerocopy for hash keys but get a frozen string directly
* because rb_hash_aset freezes keys and it causes copying */
will_freeze = will_freeze || is_reading_map_key(uk);
VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, will_freeze, raw_type == RAW_TYPE_STRING);
ret = object_complete(uk, string);
} else {
Expand Down
16 changes: 12 additions & 4 deletions ext/msgpack/unpacker.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct msgpack_unpacker_stack_t {
struct msgpack_unpacker_t {
msgpack_buffer_t buffer;
msgpack_unpacker_stack_t stack;
msgpack_key_cache_t key_cache;

VALUE self;
VALUE last_object;
Expand All @@ -66,10 +67,12 @@ struct msgpack_unpacker_t {

/* options */
int symbol_ext_type;
bool symbolize_keys;
bool freeze;
bool allow_unknown_ext;
bool optimized_symbol_ext_type;

bool use_key_cache: 1;
bool symbolize_keys: 1;
bool freeze: 1;
bool allow_unknown_ext: 1;
bool optimized_symbol_ext_type: 1;
};

#define UNPACKER_BUFFER_(uk) (&(uk)->buffer)
Expand Down Expand Up @@ -101,6 +104,11 @@ static inline void msgpack_unpacker_set_symbolized_keys(msgpack_unpacker_t* uk,
uk->symbolize_keys = enable;
}

static inline void msgpack_unpacker_set_key_cache(msgpack_unpacker_t* uk, bool enable)
{
uk->use_key_cache = enable;
}

static inline void msgpack_unpacker_set_freeze(msgpack_unpacker_t* uk, bool enable)
{
uk->freeze = enable;
Expand Down
5 changes: 5 additions & 0 deletions ext/msgpack/unpacker_class.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ static VALUE eUnknownExtTypeError;
static VALUE mTypeError; // obsoleted. only for backward compatibility. See #86.

static VALUE sym_symbolize_keys;
static VALUE sym_key_cache;
static VALUE sym_freeze;
static VALUE sym_allow_unknown_ext;

Expand Down Expand Up @@ -128,6 +129,9 @@ VALUE MessagePack_Unpacker_initialize(int argc, VALUE* argv, VALUE self)
if(options != Qnil) {
VALUE v;

v = rb_hash_aref(options, sym_key_cache);
msgpack_unpacker_set_key_cache(uk, RTEST(v));

v = rb_hash_aref(options, sym_symbolize_keys);
msgpack_unpacker_set_symbolized_keys(uk, RTEST(v));

Expand Down Expand Up @@ -413,6 +417,7 @@ void MessagePack_Unpacker_module_init(VALUE mMessagePack)
eUnknownExtTypeError = rb_define_class_under(mMessagePack, "UnknownExtTypeError", eUnpackError);

sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
sym_key_cache = ID2SYM(rb_intern("key_cache"));
sym_freeze = ID2SYM(rb_intern("freeze"));
sym_allow_unknown_ext = ID2SYM(rb_intern("allow_unknown_ext"));

Expand Down