Skip to content

Commit 4d47aae

Browse files
authored
Write 4-byte characters (surrogate pairs) instead of escapes (#1335)
1 parent 5c76113 commit 4d47aae

File tree

6 files changed

+111
-30
lines changed

6 files changed

+111
-30
lines changed

release-notes/CREDITS-2.x

+9
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,15 @@ Antonin Janec (@xtonic)
435435
* Contributed #1218: Simplify Unicode surrogate pair conversion for generation
436436
(2.17.0)
437437

438+
Ian Roberts (@ianroberts)
439+
* Reported #223: `UTF8JsonGenerator` writes supplementary characters as a
440+
surrogate pair: should use 4-byte encoding
441+
(2.18.0)
442+
443+
Radovan Netuka (@rnetuka)
444+
* Contributed fix for #223: `UTF8JsonGenerator` writes supplementary characters as a
445+
surrogate pair: should use 4-byte encoding
446+
438447
Jared Stehler (@jaredstehler)
439448
* Reported, contributed fix for #1274: `NUL`-corrupted keys, values on JSON serialization
440449
(2.18.0)

release-notes/VERSION-2.x

+4
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ a pure JSON library.
1616

1717
2.18.0 (not yet released)
1818

19+
#223: `UTF8JsonGenerator` writes supplementary characters as a surrogate pair:
20+
should use 4-byte encoding
21+
(reported by Ian R)
22+
(fix contributed by Radovan N)
1923
#1230: Improve performance of `float` and `double` parsing from `TextBuffer`
2024
(implemented by @pjfanning)
2125
#1251: `InternCache` replace synchronized with `ReentrantLock` - the cache

src/main/java/com/fasterxml/jackson/core/JsonGenerator.java

+11-4
Original file line numberDiff line numberDiff line change
@@ -269,13 +269,20 @@ public enum Feature {
269269
WRITE_HEX_UPPER_CASE(true),
270270

271271
/**
272-
* Feature that specifies whether {@link JsonGenerator} should escape forward slashes.
273-
* <p>
274-
* Feature is disabled by default for Jackson 2.x version, and enabled by default in Jackson 3.0.
272+
* See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#ESCAPE_FORWARD_SLASHES}.
275273
*
276274
* @since 2.17
277275
*/
278-
ESCAPE_FORWARD_SLASHES(false);
276+
ESCAPE_FORWARD_SLASHES(false),
277+
278+
/**
279+
* See {@link com.fasterxml.jackson.core.json.JsonWriteFeature#COMBINE_UNICODE_SURROGATES_IN_UTF8}.
280+
*
281+
* @since 2.18
282+
*/
283+
COMBINE_UNICODE_SURROGATES_IN_UTF8(false),
284+
285+
;
279286

280287
private final boolean _defaultState;
281288
private final int _mask;

src/main/java/com/fasterxml/jackson/core/json/JsonWriteFeature.java

+23-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public enum JsonWriteFeature
1212
{
1313
// // // Support for non-standard data format constructs: comments
1414

15-
// // Quoting/ecsaping-related features
15+
// // Quoting/escaping-related features
1616

1717
/**
1818
* Feature that determines whether JSON Object field names are
@@ -117,6 +117,28 @@ public enum JsonWriteFeature
117117
*/
118118
ESCAPE_FORWARD_SLASHES(false, JsonGenerator.Feature.ESCAPE_FORWARD_SLASHES),
119119

120+
/**
121+
* Feature that specifies how characters outside "Basic Multilingual Plane" (BMP) -- ones encoded
122+
* as 4-byte UTF-8 sequences but represented in JVM memory as 2 16-bit "surrogate" {@code chars} --
123+
* should be encoded as UTF-8 by {@link JsonGenerator}.
124+
* If enabled, surrogate pairs are combined and flushed as a
125+
* single, 4-byte UTF-8 character.
126+
* If disabled, each {@code char} of pair is written separately: that is, as 2
127+
* separate 6-byte JSON Unicode escape sequences with values in Surrogate character ranges
128+
* ({@code 0xD800} - {@code 0xDBFF} and {@code 0xDC00} - {@code 0xDFFF}).
129+
* <p>
130+
* Note that this feature only has effect for {@link JsonGenerator}s that directly encode
131+
* {@code byte}-based output, as UTF-8 (target {@link java.io.OutputStream}, {@code byte[]}
132+
* and so on); it will not (can not) change handling of
133+
* {@code char}-based output (like {@link java.io.Writer} or {@link java.lang.String}).
134+
* <p>
135+
* Feature is disabled by default in 2.x for backwards-compatibility (will be enabled
136+
* in 3.0).
137+
*
138+
* @since 2.18
139+
*/
140+
COMBINE_UNICODE_SURROGATES_IN_UTF8(false, JsonGenerator.Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8),
141+
120142
;
121143

122144
final private boolean _defaultState;

src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java

+38
Original file line numberDiff line numberDiff line change
@@ -1510,6 +1510,16 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
15101510
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
15111511
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
15121512
} else {
1513+
// 3- or 4-byte character
1514+
if (_isSurrogateChar(ch)) {
1515+
final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
1516+
if (combineSurrogates && offset < end) {
1517+
char highSurrogate = (char) ch;
1518+
char lowSurrogate = cbuf[offset++];
1519+
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
1520+
continue;
1521+
}
1522+
}
15131523
outputPtr = _outputMultiByteChar(ch, outputPtr);
15141524
}
15151525
}
@@ -1548,6 +1558,16 @@ private final void _writeStringSegment2(final String text, int offset, final int
15481558
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
15491559
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
15501560
} else {
1561+
// 3- or 4-byte character
1562+
if (_isSurrogateChar(ch)) {
1563+
final boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_features);
1564+
if (combineSurrogates && offset < end) {
1565+
char highSurrogate = (char) ch;
1566+
char lowSurrogate = text.charAt(offset++);
1567+
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
1568+
continue;
1569+
}
1570+
}
15511571
outputPtr = _outputMultiByteChar(ch, outputPtr);
15521572
}
15531573
}
@@ -2133,6 +2153,19 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException
21332153
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
21342154
}
21352155

2156+
// @since 2.18
2157+
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
2158+
final int unicode = 0x10000 + ((highSurrogate & 0x03FF) << 10)
2159+
+ (lowSurrogate & 0x03FF);
2160+
2161+
_outputBuffer[outputPtr++] = (byte) (0xF0 + ((unicode >> 18) & 0x07));
2162+
_outputBuffer[outputPtr++] = (byte) (0x80 + ((unicode >> 12) & 0x3F));
2163+
_outputBuffer[outputPtr++] = (byte) (0x80 + ((unicode >> 6) & 0x3F));
2164+
_outputBuffer[outputPtr++] = (byte) (0x80 + (unicode & 0x3F));
2165+
2166+
return outputPtr;
2167+
}
2168+
21362169
/**
21372170
*
21382171
* @param ch
@@ -2214,5 +2247,10 @@ protected final void _flushBuffer() throws IOException
22142247
private byte[] getHexBytes() {
22152248
return _cfgWriteHexUppercase ? HEX_BYTES_UPPER : HEX_BYTES_LOWER;
22162249
}
2250+
2251+
// @since 2.18
2252+
private boolean _isSurrogateChar(int ch) {
2253+
return (ch & 0xD800) == 0xD800;
2254+
}
22172255
}
22182256

Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.fasterxml.jackson.failing;
1+
package com.fasterxml.jackson.core.json;
22

33
import java.io.ByteArrayOutputStream;
44
import java.io.StringWriter;
@@ -9,10 +9,18 @@
99
import org.junit.jupiter.api.Test;
1010

1111
import static org.junit.jupiter.api.Assertions.assertEquals;
12+
import static org.junit.jupiter.api.Assertions.assertFalse;
1213

1314
class Surrogate223Test extends JUnit5TestBase
1415
{
15-
private final JsonFactory JSON_F = new JsonFactory();
16+
private final JsonFactory DEFAULT_JSON_F = newStreamFactory();
17+
18+
// for [core#223]
19+
@Test
20+
void surrogatesDefaultSetting() throws Exception {
21+
// default in 2.x should be disabled:
22+
assertFalse(DEFAULT_JSON_F.isEnabled(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.mappedFeature()));
23+
}
1624

1725
// for [core#223]
1826
@Test
@@ -23,36 +31,41 @@ void surrogatesByteBacked() throws Exception
2331
final String toQuote = new String(Character.toChars(0x1F602));
2432
assertEquals(2, toQuote.length()); // just sanity check
2533

26-
// default should be disabled:
27-
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
28-
2934
out = new ByteArrayOutputStream();
30-
g = JSON_F.createGenerator(out);
35+
36+
JsonFactory f = JsonFactory.builder()
37+
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
38+
.build();
39+
g = f.createGenerator(out);
3140
g.writeStartArray();
3241
g.writeString(toQuote);
3342
g.writeEndArray();
3443
g.close();
3544
assertEquals(2 + 2 + 4, out.size()); // brackets, quotes, 4-byte encoding
3645

3746
// Also parse back to ensure correctness
38-
JsonParser p = JSON_F.createParser(out.toByteArray());
47+
JsonParser p = f.createParser(out.toByteArray());
3948
assertToken(JsonToken.START_ARRAY, p.nextToken());
4049
assertToken(JsonToken.VALUE_STRING, p.nextToken());
50+
assertEquals(toQuote, p.getText());
4151
assertToken(JsonToken.END_ARRAY, p.nextToken());
4252
p.close();
4353

4454
// but may revert back to original behavior
4555
out = new ByteArrayOutputStream();
46-
g = JSON_F.createGenerator(out);
47-
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
56+
f = JsonFactory.builder()
57+
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
58+
.build();
59+
60+
g = f.createGenerator(out);
4861
g.writeStartArray();
4962
g.writeString(toQuote);
5063
g.writeEndArray();
5164
g.close();
5265
assertEquals(2 + 2 + 12, out.size()); // brackets, quotes, 2 x 6 byte JSON escape
5366
}
5467

55-
// for [core#223]
68+
// for [core#223]: no change for character-backed (cannot do anything)
5669
@Test
5770
void surrogatesCharBacked() throws Exception
5871
{
@@ -61,32 +74,20 @@ void surrogatesCharBacked() throws Exception
6174
final String toQuote = new String(Character.toChars(0x1F602));
6275
assertEquals(2, toQuote.length()); // just sanity check
6376

64-
// default should be disabled:
65-
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
66-
6777
out = new StringWriter();
68-
g = JSON_F.createGenerator(out);
78+
g = DEFAULT_JSON_F.createGenerator(out);
6979
g.writeStartArray();
7080
g.writeString(toQuote);
7181
g.writeEndArray();
7282
g.close();
7383
assertEquals(2 + 2 + 2, out.toString().length()); // brackets, quotes, 2 chars as is
7484

7585
// Also parse back to ensure correctness
76-
JsonParser p = JSON_F.createParser(out.toString());
86+
JsonParser p = DEFAULT_JSON_F.createParser(out.toString());
7787
assertToken(JsonToken.START_ARRAY, p.nextToken());
7888
assertToken(JsonToken.VALUE_STRING, p.nextToken());
89+
assertEquals(toQuote, p.getText());
7990
assertToken(JsonToken.END_ARRAY, p.nextToken());
8091
p.close();
81-
82-
// but may revert back to original behavior
83-
out = new StringWriter();
84-
g = JSON_F.createGenerator(out);
85-
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
86-
g.writeStartArray();
87-
g.writeString(toQuote);
88-
g.writeEndArray();
89-
g.close();
90-
assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
9192
}
9293
}

0 commit comments

Comments
 (0)