Skip to content

Commit

Permalink
Raise on invalid unicode in rails mode mimmicry
Browse files Browse the repository at this point in the history
Activesupport & JSON gem will raise an exception when trying to an
encode an object containing a string with invalid byte sequences for the
string's encoding. Oj correctly raises if escaspe_html_entites_in_json
is enabled, but if that's disabled, the invalid byte sequence is copied
directly to the output.

Use the same logic to validate unicode in that case as well.
  • Loading branch information
KJ Tsanaktsidis committed Feb 1, 2024
1 parent 3b3e91e commit ad29f92
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 59 deletions.
39 changes: 30 additions & 9 deletions ext/oj/dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,18 @@ inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) {
}

inline static size_t rails_friendly_size(const uint8_t *str, size_t len) {
return calculate_string_size(str, len, rails_friendly_chars);
long size = 0;
size_t i = len;
uint8_t hi = 0;

for (; 0 < i; str++, i--) {
size += rails_friendly_chars[*str];
hi |= *str & 0x80;
}
if (0 == hi) {
return size - len * (size_t)'0';
}
return -(size - len * (size_t)'0');
}

const char *oj_nan_str(VALUE obj, int opt, int mode, bool plus, int *lenp) {
Expand Down Expand Up @@ -752,6 +763,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
char *cmap;
const char *orig = str;
bool has_hi = false;
bool do_unicode_validation = false;

switch (out->opts->escape_mode) {
case NLEsc:
Expand All @@ -774,6 +786,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
case JXEsc:
cmap = hixss_friendly_chars;
size = hixss_friendly_size((uint8_t *)str, cnt);
do_unicode_validation = true;
break;
case RailsXEsc: {
long sz;
Expand All @@ -786,12 +799,22 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
} else {
size = (size_t)sz;
}
do_unicode_validation = true;
break;
}
case RailsEsc:
case RailsEsc: {
long sz;
cmap = rails_friendly_chars;
size = rails_friendly_size((uint8_t *)str, cnt);
sz = rails_friendly_size((uint8_t *)str, cnt);
if (sz < 0) {
has_hi = true;
size = (size_t)-sz;
} else {
size = (size_t)sz;
}
do_unicode_validation = true;
break;
}
case JSONEsc:
default: cmap = hibit_friendly_chars; size = hibit_friendly_size((uint8_t *)str, cnt);
}
Expand Down Expand Up @@ -822,7 +845,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
for (; str < end; str++) {
switch (cmap[(uint8_t)*str]) {
case '1':
if ((JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && check_start <= str) {
if (do_unicode_validation && check_start <= str) {
if (0 != (0x80 & (uint8_t)*str)) {
if (0xC0 == (0xC0 & (uint8_t)*str)) {
check_start = check_unicode(str, end, orig);
Expand All @@ -846,8 +869,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
}
break;
case '3': // Unicode
if (0xe2 == (uint8_t)*str && (JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) &&
2 <= end - str) {
if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
str = dump_unicode(str, end, out, orig);
} else {
Expand All @@ -866,8 +888,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
APPEND_CHARS(out->cur, "\\u00", 4);
dump_hex((uint8_t)*str, out);
} else {
if (0xe2 == (uint8_t)*str &&
(JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && 2 <= end - str) {
if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
str = dump_unicode(str, end, out, orig);
} else {
Expand All @@ -884,7 +905,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
}
*out->cur++ = '"';
}
if ((JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && 0 < str - orig &&
if (do_unicode_validation && 0 < str - orig &&
0 != (0x80 & *(str - 1))) {
uint8_t c = (uint8_t) * (str - 1);
int i;
Expand Down
91 changes: 63 additions & 28 deletions test/activesupport6/encoding_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,42 +74,77 @@ def test_hash_keys_encoding
ActiveSupport.escape_html_entities_in_json = false
end

def test_utf8_string_encoded_properly
# The original test seems to expect that
# ActiveSupport.escape_html_entities_in_json reverts to true even after
# being set to false. I haven't been able to figure that out so the value is
# set to true, the default, before running the test. This might be wrong but
# for now it will have to do.
ActiveSupport.escape_html_entities_in_json = true
result = ActiveSupport::JSON.encode("€2.99")
assert_equal '"€2.99"', result
assert_equal(Encoding::UTF_8, result.encoding)

result = ActiveSupport::JSON.encode("✎☺")
assert_equal '"✎☺"', result
assert_equal(Encoding::UTF_8, result.encoding)
def test_hash_keys_encoding_without_escaping
assert_equal "{\"<>\":\"<>\"}", ActiveSupport::JSON.encode("<>" => "<>")
end

def test_non_utf8_string_transcodes
s = "二".encode("Shift_JIS")
result = ActiveSupport::JSON.encode(s)
assert_equal '"二"', result
assert_equal Encoding::UTF_8, result.encoding
module UnicodeTests
def test_utf8_string_encoded_properly
result = ActiveSupport::JSON.encode("€2.99")
assert_equal '"€2.99"', result
assert_equal(Encoding::UTF_8, result.encoding)

result = ActiveSupport::JSON.encode("✎☺")
assert_equal '"✎☺"', result
assert_equal(Encoding::UTF_8, result.encoding)
end

def test_non_utf8_string_transcodes
s = "二".encode("Shift_JIS")
result = ActiveSupport::JSON.encode(s)
assert_equal '"二"', result
assert_equal Encoding::UTF_8, result.encoding
end

def test_wide_utf8_chars
w = "𠜎"
result = ActiveSupport::JSON.encode(w)
assert_equal '"𠜎"', result
end

def test_wide_utf8_roundtrip
hash = { string: "𐒑" }
json = ActiveSupport::JSON.encode(hash)
decoded_hash = ActiveSupport::JSON.decode(json)
assert_equal "𐒑", decoded_hash["string"]
end

def test_invalid_encoding_raises
s = "\xAE\xFF\x9F"
refute s.valid_encoding?

# n.b. this raises EncodingError, because we didn't call Oj.mimic_JSON in the test setup; but,
# if you do that (even indirectly through Oj.optimize_rails), then this raises a
# JSON::GeneratorError instead of an EncodingError.
assert_raises(EncodingError) do
ActiveSupport::JSON.encode([s])
end
end
end

def test_wide_utf8_chars
w = "𠜎"
result = ActiveSupport::JSON.encode(w)
assert_equal '"𠜎"', result
module UnicodeTestsWithEscapingOn
def setup
ActiveSupport.escape_html_entities_in_json = true
end

def teardown
ActiveSupport.escape_html_entities_in_json = false
end

include UnicodeTests
end

def test_wide_utf8_roundtrip
hash = { string: "𐒑" }
json = ActiveSupport::JSON.encode(hash)
decoded_hash = ActiveSupport::JSON.decode(json)
assert_equal "𐒑", decoded_hash["string"]
module UnicodeTestsWithEscapingOff
def setup
ActiveSupport.escape_html_entities_in_json = false
end

include UnicodeTests
end

include UnicodeTestsWithEscapingOn
include UnicodeTestsWithEscapingOff

def test_hash_key_identifiers_are_always_quoted
values = { 0 => 0, 1 => 1, :_ => :_, "$" => "$", "a" => "a", :A => :A, :A0 => :A0, "A0B" => "A0B" }
assert_equal %w( "$" "A" "A0" "A0B" "_" "a" "0" "1" ).sort, object_keys(ActiveSupport::JSON.encode(values))
Expand Down
85 changes: 63 additions & 22 deletions test/activesupport7/encoding_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,36 +70,77 @@ def test_hash_keys_encoding
ActiveSupport.escape_html_entities_in_json = false
end

def test_utf8_string_encoded_properly
result = ActiveSupport::JSON.encode("€2.99")
assert_equal '"€2.99"', result
assert_equal(Encoding::UTF_8, result.encoding)

result = ActiveSupport::JSON.encode("✎☺")
assert_equal '"✎☺"', result
assert_equal(Encoding::UTF_8, result.encoding)
def test_hash_keys_encoding_without_escaping
assert_equal "{\"<>\":\"<>\"}", ActiveSupport::JSON.encode("<>" => "<>")
end

def test_non_utf8_string_transcodes
s = "二".encode("Shift_JIS")
result = ActiveSupport::JSON.encode(s)
assert_equal '"二"', result
assert_equal Encoding::UTF_8, result.encoding
module UnicodeTests
def test_utf8_string_encoded_properly
result = ActiveSupport::JSON.encode("€2.99")
assert_equal '"€2.99"', result
assert_equal(Encoding::UTF_8, result.encoding)

result = ActiveSupport::JSON.encode("✎☺")
assert_equal '"✎☺"', result
assert_equal(Encoding::UTF_8, result.encoding)
end

def test_non_utf8_string_transcodes
s = "二".encode("Shift_JIS")
result = ActiveSupport::JSON.encode(s)
assert_equal '"二"', result
assert_equal Encoding::UTF_8, result.encoding
end

def test_wide_utf8_chars
w = "𠜎"
result = ActiveSupport::JSON.encode(w)
assert_equal '"𠜎"', result
end

def test_wide_utf8_roundtrip
hash = { string: "𐒑" }
json = ActiveSupport::JSON.encode(hash)
decoded_hash = ActiveSupport::JSON.decode(json)
assert_equal "𐒑", decoded_hash["string"]
end

def test_invalid_encoding_raises
s = "\xAE\xFF\x9F"
refute s.valid_encoding?

# n.b. this raises EncodingError, because we didn't call Oj.mimic_JSON in the test setup; but,
# if you do that (even indirectly through Oj.optimize_rails), then this raises a
# JSON::GeneratorError instead of an EncodingError.
assert_raises(EncodingError) do
ActiveSupport::JSON.encode([s])
end
end
end

def test_wide_utf8_chars
w = "𠜎"
result = ActiveSupport::JSON.encode(w)
assert_equal '"𠜎"', result
module UnicodeTestsWithEscapingOn
def setup
ActiveSupport.escape_html_entities_in_json = true
end

def teardown
ActiveSupport.escape_html_entities_in_json = false
end

include UnicodeTests
end

def test_wide_utf8_roundtrip
hash = { string: "𐒑" }
json = ActiveSupport::JSON.encode(hash)
decoded_hash = ActiveSupport::JSON.decode(json)
assert_equal "𐒑", decoded_hash["string"]
module UnicodeTestsWithEscapingOff
def setup
ActiveSupport.escape_html_entities_in_json = false
end

include UnicodeTests
end

include UnicodeTestsWithEscapingOn
include UnicodeTestsWithEscapingOff

def test_hash_key_identifiers_are_always_quoted
values = { 0 => 0, 1 => 1, :_ => :_, "$" => "$", "a" => "a", :A => :A, :A0 => :A0, "A0B" => "A0B" }
assert_equal %w( "$" "A" "A0" "A0B" "_" "a" "0" "1" ).sort, object_keys(ActiveSupport::JSON.encode(values))
Expand Down

0 comments on commit ad29f92

Please sign in to comment.