Skip to content

Commit bd3e8db

Browse files
committed
Write 4-byte characters (surrogate pairs) instead of escapes
1 parent 89b2381 commit bd3e8db

File tree

3 files changed

+48
-7
lines changed

3 files changed

+48
-7
lines changed

src/main/java/com/fasterxml/jackson/core/JsonGenerator.java

+12-1
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,18 @@ public enum Feature {
275275
*
276276
* @since 2.17
277277
*/
278-
ESCAPE_FORWARD_SLASHES(false);
278+
ESCAPE_FORWARD_SLASHES(false),
279+
280+
/**
281+
* Feature that specifies how 4-byte characters should be handled in {@link JsonGenerator}. If enabled,
282+
* 4-byte characters made by surrogate pairs are combined and flushed as a single character encoded in UTF-8.
283+
* If disabled, each pair is written as a UTF-16 escape sequence.
284+
* <p>
285+
* Feature is disabled by default.
286+
*
287+
* @since 2.18
288+
*/
289+
COMBINE_UNICODE_SURROGATES(false);
279290

280291
private final boolean _defaultState;
281292
private final int _mask;

src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java

+32-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.io.*;
44
import java.math.BigDecimal;
55
import java.math.BigInteger;
6+
import java.nio.charset.StandardCharsets;
67

78
import com.fasterxml.jackson.core.*;
89
import com.fasterxml.jackson.core.io.CharTypes;
@@ -659,6 +660,10 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException
659660
_outputBuffer[_outputTail++] = _quoteChar;
660661
}
661662

663+
private boolean isSurrogatePair(char ch) {
664+
return (ch & 0xD800) == 0xD800;
665+
}
666+
662667
/*
663668
/**********************************************************
664669
/* Output method implementations, unprocessed ("raw")
@@ -1489,6 +1494,8 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
14891494
final byte[] outputBuffer = _outputBuffer;
14901495
final int[] escCodes = _outputEscapes;
14911496

1497+
boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES.enabledIn(_features);
1498+
14921499
while (offset < end) {
14931500
int ch = cbuf[offset++];
14941501
if (ch <= 0x7F) {
@@ -1510,7 +1517,14 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
15101517
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
15111518
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
15121519
} else {
1513-
outputPtr = _outputMultiByteChar(ch, outputPtr);
1520+
// multibyte character
1521+
if (combineSurrogates && isSurrogatePair((char) ch) && offset < end) {
1522+
char highSurrogate = (char) ch;
1523+
char lowSurrogate = cbuf[offset++];
1524+
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
1525+
} else {
1526+
outputPtr = _outputMultiByteChar(ch, outputPtr);
1527+
}
15141528
}
15151529
}
15161530
_outputTail = outputPtr;
@@ -1527,6 +1541,8 @@ private final void _writeStringSegment2(final String text, int offset, final int
15271541
final byte[] outputBuffer = _outputBuffer;
15281542
final int[] escCodes = _outputEscapes;
15291543

1544+
boolean combineSurrogates = Feature.COMBINE_UNICODE_SURROGATES.enabledIn(_features);
1545+
15301546
while (offset < end) {
15311547
int ch = text.charAt(offset++);
15321548
if (ch <= 0x7F) {
@@ -1548,7 +1564,14 @@ private final void _writeStringSegment2(final String text, int offset, final int
15481564
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
15491565
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
15501566
} else {
1551-
outputPtr = _outputMultiByteChar(ch, outputPtr);
1567+
// multibyte character
1568+
if (combineSurrogates && isSurrogatePair((char) ch) && offset < end) {
1569+
char highSurrogate = (char) ch;
1570+
char lowSurrogate = text.charAt(offset++);
1571+
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
1572+
} else {
1573+
outputPtr = _outputMultiByteChar(ch, outputPtr);
1574+
}
15521575
}
15531576
}
15541577
_outputTail = outputPtr;
@@ -2133,6 +2156,13 @@ protected final void _outputSurrogates(int surr1, int surr2) throws IOException
21332156
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
21342157
}
21352158

2159+
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
2160+
String s = String.valueOf(highSurrogate) + lowSurrogate;
2161+
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
2162+
System.arraycopy(bytes, 0, _outputBuffer, outputPtr, bytes.length);
2163+
return outputPtr + bytes.length;
2164+
}
2165+
21362166
/**
21372167
*
21382168
* @param ch

src/test/java/com/fasterxml/jackson/failing/Surrogate223Test.java renamed to src/test/java/com/fasterxml/jackson/core/json/Surrogate223Test.java

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.fasterxml.jackson.failing;
1+
package com.fasterxml.jackson.core.json;
22

33
import java.io.ByteArrayOutputStream;
44
import java.io.StringWriter;
@@ -8,6 +8,7 @@
88

99
import org.junit.jupiter.api.Test;
1010

11+
import static com.fasterxml.jackson.core.JsonGenerator.Feature;
1112
import static org.junit.jupiter.api.Assertions.assertEquals;
1213

1314
class Surrogate223Test extends JUnit5TestBase
@@ -27,7 +28,7 @@ void surrogatesByteBacked() throws Exception
2728
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
2829

2930
out = new ByteArrayOutputStream();
30-
g = JSON_F.createGenerator(out);
31+
g = JSON_F.createGenerator(out).enable(Feature.COMBINE_UNICODE_SURROGATES);
3132
g.writeStartArray();
3233
g.writeString(toQuote);
3334
g.writeEndArray();
@@ -43,8 +44,7 @@ void surrogatesByteBacked() throws Exception
4344

4445
// but may revert back to original behavior
4546
out = new ByteArrayOutputStream();
46-
g = JSON_F.createGenerator(out);
47-
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
47+
g = JSON_F.createGenerator(out).disable(Feature.COMBINE_UNICODE_SURROGATES);
4848
g.writeStartArray();
4949
g.writeString(toQuote);
5050
g.writeEndArray();

0 commit comments

Comments
 (0)