Skip to content

Commit 8c7f964

Browse files
committed
Account for bytes processed by encoding detection
UTF8StreamJsonParser tracks the read pointer (offset) and the number of bytes processed separately, and uses both to generate JsonLocation. When the byte payload starts with a UTF BOM, ByteSourceJsonBootstrapper consumes a few bytes ahead of the parser, advances the offset, and passes the newly computed offset to the parser without telling it that some bytes have already been pre-processed. With this change, the number of bytes pre-processed for encoding detection is passed to the parser, so JsonLocation instances returned by the parser now point to the correct byte offset when the payload has a BOM. Issue: #533
1 parent a8fbb07 commit 8c7f964

File tree

5 files changed

+130
-19
lines changed

5 files changed

+130
-19
lines changed

src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,9 @@ public JsonParser constructParser(ObjectReadContext readCtxt,
242242
ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,
243243
int factoryFeatures) throws IOException
244244
{
245+
int prevInputPtr = _inputPtr;
245246
JsonEncoding enc = detectEncoding();
247+
int bytesProcessed = _inputPtr - prevInputPtr;
246248

247249
if (enc == JsonEncoding.UTF8) {
248250
/* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
@@ -252,7 +254,7 @@ public JsonParser constructParser(ObjectReadContext readCtxt,
252254
ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
253255
return new UTF8StreamJsonParser(readCtxt, _context,
254256
streamReadFeatures, formatReadFeatures, _in, can,
255-
_inputBuffer, _inputPtr, _inputEnd, _bufferRecyclable);
257+
_inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
256258
}
257259
}
258260
return new ReaderBasedJsonParser(readCtxt, _context, streamReadFeatures, formatReadFeatures,

src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
122122
int stdFeatures, int formatReadFeatures,
123123
InputStream in,
124124
ByteQuadsCanonicalizer sym,
125-
byte[] inputBuffer, int start, int end,
125+
byte[] inputBuffer, int start, int end, int bytesPreProcessed,
126126
boolean bufferRecyclable)
127127
{
128128
super(readCtxt, ctxt, stdFeatures, formatReadFeatures);
@@ -131,9 +131,9 @@ public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
131131
_inputBuffer = inputBuffer;
132132
_inputPtr = start;
133133
_inputEnd = end;
134-
_currInputRowStart = start;
134+
_currInputRowStart = start - bytesPreProcessed;
135135
// If we have offset, need to omit that from byte offset, so:
136-
_currInputProcessed = -start;
136+
_currInputProcessed = -start + bytesPreProcessed;
137137
_bufferRecyclable = bufferRecyclable;
138138
}
139139

src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java

+120-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ public void testSimpleInitialOffsets() throws Exception
2323
assertEquals(0L, loc.getCharOffset());
2424
assertEquals(1, loc.getLineNr());
2525
assertEquals(1, loc.getColumnNr());
26-
26+
2727
loc = p.getCurrentLocation();
2828
assertEquals(-1L, loc.getByteOffset());
2929
assertEquals(1L, loc.getCharOffset());
@@ -33,7 +33,7 @@ public void testSimpleInitialOffsets() throws Exception
3333
p.close();
3434

3535
// then byte-based
36-
36+
3737
p = JSON_F.createParser(ObjectReadContext.empty(), DOC.getBytes("UTF-8"));
3838
assertToken(JsonToken.START_OBJECT, p.nextToken());
3939

@@ -42,7 +42,7 @@ public void testSimpleInitialOffsets() throws Exception
4242
assertEquals(-1L, loc.getCharOffset());
4343
assertEquals(1, loc.getLineNr());
4444
assertEquals(1, loc.getColumnNr());
45-
45+
4646
loc = p.getCurrentLocation();
4747
assertEquals(1L, loc.getByteOffset());
4848
assertEquals(-1L, loc.getCharOffset());
@@ -61,15 +61,15 @@ public void testOffsetWithInputOffset() throws Exception
6161
byte[] b = "   { }  ".getBytes("UTF-8");
6262

6363
// and then peel them off
64-
p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length-5);
64+
p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length - 5);
6565
assertToken(JsonToken.START_OBJECT, p.nextToken());
6666

6767
loc = p.getTokenLocation();
6868
assertEquals(0L, loc.getByteOffset());
6969
assertEquals(-1L, loc.getCharOffset());
7070
assertEquals(1, loc.getLineNr());
7171
assertEquals(1, loc.getColumnNr());
72-
72+
7373
loc = p.getCurrentLocation();
7474
assertEquals(1L, loc.getByteOffset());
7575
assertEquals(-1L, loc.getCharOffset());
@@ -78,4 +78,119 @@ public void testOffsetWithInputOffset() throws Exception
7878

7979
p.close();
8080
}
81+
82+
public void testOffsetWithoutInputOffset() throws Exception
83+
{
84+
JsonLocation loc;
85+
JsonParser p;
86+
// 3 spaces before, 2 after, just for padding
87+
byte[] b = "   { }  ".getBytes("UTF-8");
88+
89+
// no input offset this time: the leading padding counts toward the reported offsets
90+
p = JSON_F.createParser(ObjectReadContext.empty(), b);
91+
assertToken(JsonToken.START_OBJECT, p.nextToken());
92+
93+
loc = p.getTokenLocation();
94+
assertEquals(3L, loc.getByteOffset());
95+
assertEquals(-1L, loc.getCharOffset());
96+
assertEquals(1, loc.getLineNr());
97+
assertEquals(4, loc.getColumnNr());
98+
99+
loc = p.getCurrentLocation();
100+
assertEquals(4L, loc.getByteOffset());
101+
assertEquals(-1L, loc.getCharOffset());
102+
assertEquals(1, loc.getLineNr());
103+
assertEquals(5, loc.getColumnNr());
104+
105+
p.close();
106+
}
107+
108+
// for [core#533]
109+
public void testUtf8Bom() throws Exception
110+
{
111+
JsonLocation loc;
112+
JsonParser p;
113+
114+
byte[] b = withUtf8Bom("{ }".getBytes());
115+
116+
// parse from the start of the array: the 3-byte BOM counts toward the reported offsets
117+
p = JSON_F.createParser(ObjectReadContext.empty(), b);
118+
assertToken(JsonToken.START_OBJECT, p.nextToken());
119+
120+
loc = p.getTokenLocation();
121+
assertEquals(3L, loc.getByteOffset());
122+
assertEquals(-1L, loc.getCharOffset());
123+
assertEquals(1, loc.getLineNr());
124+
assertEquals(4, loc.getColumnNr());
125+
126+
loc = p.getCurrentLocation();
127+
assertEquals(4L, loc.getByteOffset());
128+
assertEquals(-1L, loc.getCharOffset());
129+
assertEquals(1, loc.getLineNr());
130+
assertEquals(5, loc.getColumnNr());
131+
132+
p.close();
133+
}
134+
135+
public void testUtf8BomWithPadding() throws Exception
136+
{
137+
JsonLocation loc;
138+
JsonParser p;
139+
140+
byte[] b = withUtf8Bom("   { }".getBytes());
141+
142+
// parse from the start of the array: both the BOM and the padding count toward the reported offsets
143+
p = JSON_F.createParser(ObjectReadContext.empty(), b);
144+
assertToken(JsonToken.START_OBJECT, p.nextToken());
145+
146+
loc = p.getTokenLocation();
147+
assertEquals(6L, loc.getByteOffset());
148+
assertEquals(-1L, loc.getCharOffset());
149+
assertEquals(1, loc.getLineNr());
150+
assertEquals(7, loc.getColumnNr());
151+
152+
loc = p.getCurrentLocation();
153+
assertEquals(7L, loc.getByteOffset());
154+
assertEquals(-1L, loc.getCharOffset());
155+
assertEquals(1, loc.getLineNr());
156+
assertEquals(8, loc.getColumnNr());
157+
158+
p.close();
159+
}
160+
161+
public void testUtf8BomWithInputOffset() throws Exception
162+
{
163+
JsonLocation loc;
164+
JsonParser p;
165+
166+
byte[] b = withUtf8Bom("   { }".getBytes());
167+
168+
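// NOTE(review): despite the test name, no input offset is passed to createParser here, so this
// currently exercises the same path as testUtf8BomWithPadding -- confirm whether an offset/length overload was intended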
169+
p = JSON_F.createParser(ObjectReadContext.empty(), b);
170+
assertToken(JsonToken.START_OBJECT, p.nextToken());
171+
172+
loc = p.getTokenLocation();
173+
assertEquals(6L, loc.getByteOffset());
174+
assertEquals(-1L, loc.getCharOffset());
175+
assertEquals(1, loc.getLineNr());
176+
assertEquals(7, loc.getColumnNr());
177+
178+
loc = p.getCurrentLocation();
179+
assertEquals(7L, loc.getByteOffset());
180+
assertEquals(-1L, loc.getCharOffset());
181+
assertEquals(1, loc.getLineNr());
182+
assertEquals(8, loc.getColumnNr());
183+
184+
p.close();
185+
}
186+
187+
private byte[] withUtf8Bom(byte[] bytes) {
188+
byte[] arr = new byte[bytes.length + 3];
189+
// write UTF-8 BOM
190+
arr[0] = (byte) 0xEF;
191+
arr[1] = (byte) 0xBB;
192+
arr[2] = (byte) 0xBF;
193+
System.arraycopy(bytes, 0, arr, 3, bytes.length);
194+
return arr;
195+
}
81196
}

src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java

+3-9
Original file line numberDiff line numberDiff line change
@@ -432,15 +432,9 @@ public void testUtf8BOMHandling() throws Exception
432432

433433
JsonParser p = JSON_FACTORY.createParser(ObjectReadContext.empty(), input);
434434
assertEquals(JsonToken.START_ARRAY, p.nextToken());
435-
// should also have skipped first 3 bytes of BOM; but do we have offset available?
436-
/* 08-Oct-2013, tatu: Alas, due to [core#111], we have to omit BOM in calculations
437-
* as we do not know what the offset is due to -- may need to revisit, if this
438-
* discrepancy becomes an issue. For now it just means that BOM is considered
439-
* "out of stream" (not part of input).
440-
*/
435+
441436
JsonLocation loc = p.getTokenLocation();
442-
// so if BOM was consider in-stream (part of input), this should expect 3:
443-
assertEquals(0, loc.getByteOffset());
437+
assertEquals(3, loc.getByteOffset());
444438
assertEquals(-1, loc.getCharOffset());
445439
assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken());
446440
assertEquals(JsonToken.END_ARRAY, p.nextToken());
@@ -449,7 +443,7 @@ public void testUtf8BOMHandling() throws Exception
449443
p = JSON_FACTORY.createParser(ObjectReadContext.empty(),
450444
new MockDataInput(input));
451445
assertEquals(JsonToken.START_ARRAY, p.nextToken());
452-
// same BOM, but DataInput is more restrctive so can skip but offsets
446+
// same BOM, but DataInput is more restrictive so can skip but offsets
453447
// are not reliable...
454448
loc = p.getTokenLocation();
455449
assertNotNull(loc);

src/test/java/com/fasterxml/jackson/core/util/JsonParserSequenceTest.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public void testSkipChildren() throws IOException {
4949
UTF8StreamJsonParser uTF8StreamJsonParser = new UTF8StreamJsonParser(ObjectReadContext.empty(),
5050
ioContext,
5151
0, 0, byteArrayInputStream, ByteQuadsCanonicalizer.createRoot(),
52-
byteArray, -1, (byte) 9, true);
52+
byteArray, -1, (byte) 9, 0, true);
5353
JsonParserDelegate jsonParserDelegate = new JsonParserDelegate(jsonParserArray[0]);
5454
JsonParserSequence jsonParserSequence = JsonParserSequence.createFlattened(true, uTF8StreamJsonParser, jsonParserDelegate);
5555
JsonParserSequence jsonParserSequenceTwo = (JsonParserSequence) jsonParserSequence.skipChildren();

0 commit comments

Comments
 (0)