@@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
22812281 }
22822282 return ;
22832283 }
2284- // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2285- // the longest individual unit is 4 bytes (surrogate pair) so we
2286- // actually need len+3 bytes to avoid bounds checks
22872284 // 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
22882285 // like Integer.MAX_VALUE
2289- final int needed = Math .max (len , len + 3 );
22902286 final int available = _inputEnd - _inputPtr ;
22912287
2292- if ((available >= needed )
2288+ if ((available >= len )
22932289 // if not, could we read? NOTE: we do not require it, just attempt to read
2294- || ((_inputBuffer .length >= needed )
2295- && _tryToLoadToHaveAtLeast (needed ))) {
2290+ || ((_inputBuffer .length >= len )
2291+ && _tryToLoadToHaveAtLeast (len ))) {
22962292 _finishShortText (len );
22972293 return ;
22982294 }
@@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
23262322 _finishChunkedText ();
23272323 return _textBuffer .contentsAsString ();
23282324 }
2329- // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2330- // the longest individual unit is 4 bytes (surrogate pair) so we
2331- // actually need len+3 bytes to avoid bounds checks
23322325
23332326 // 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
23342327 // we get is Integer.MAX_VALUE, leading to overflow. Could change values
23352328 // to longs but simpler to truncate "needed" (will never pass following test
23362329 // due to inputBuffer never being even close to that big).
23372330
2338- final int needed = Math .max (len + 3 , len );
23392331 final int available = _inputEnd - _inputPtr ;
23402332
2341- if ((available >= needed )
2333+ if ((available >= len )
23422334 // if not, could we read? NOTE: we do not require it, just attempt to read
2343- || ((_inputBuffer .length >= needed )
2344- && _tryToLoadToHaveAtLeast (needed ))) {
2335+ || ((_inputBuffer .length >= len )
2336+ && _tryToLoadToHaveAtLeast (len ))) {
23452337 return _finishShortText (len );
23462338 }
23472339 // If not enough space, need handling similar to chunked
@@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
23692361 final byte [] inputBuf = _inputBuffer ;
23702362
23712363 // Let's actually do a tight loop for ASCII first:
2372- final int end = inPtr + len ;
2364+ final int end = _inputPtr ;
23732365
23742366 int i ;
23752367 while ((i = inputBuf [inPtr ]) >= 0 ) {
@@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
23862378 final int [] codes = UTF8_UNIT_CODES ;
23872379 do {
23882380 i = inputBuf [inPtr ++] & 0xFF ;
2389- switch (codes [i ]) {
2390- case 0 :
2391- break ;
2392- case 1 :
2393- {
2394- final int c2 = inputBuf [inPtr ++];
2395- if ((c2 & 0xC0 ) != 0x080 ) {
2396- _reportInvalidOther (c2 & 0xFF , inPtr );
2397- }
2398- i = ((i & 0x1F ) << 6 ) | (c2 & 0x3F );
2381+ int code = codes [i ];
2382+ if (code != 0 ) {
2383+ // 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
2384+ // be careful wrt end-of-buffer truncated codepoints
2385+ if ((inPtr + code ) > end ) {
2386+ final int firstCharOffset = len - (end - inPtr ) - 1 ;
2387+ _reportTruncatedUTF8InString (len , firstCharOffset , i , code );
23992388 }
2400- break ;
2401- case 2 :
2402- {
2403- final int c2 = inputBuf [inPtr ++];
2404- if ((c2 & 0xC0 ) != 0x080 ) {
2405- _reportInvalidOther (c2 & 0xFF , inPtr );
2389+
2390+ switch (code ) {
2391+ case 1 : {
2392+ final int c2 = inputBuf [inPtr ++];
2393+ if ((c2 & 0xC0 ) != 0x080 ) {
2394+ _reportInvalidOther (c2 & 0xFF , inPtr );
2395+ }
2396+ i = ((i & 0x1F ) << 6 ) | (c2 & 0x3F );
24062397 }
2407- final int c3 = inputBuf [inPtr ++];
2408- if ((c3 & 0xC0 ) != 0x080 ) {
2409- _reportInvalidOther (c3 & 0xFF , inPtr );
2398+ break ;
2399+ case 2 : {
2400+ final int c2 = inputBuf [inPtr ++];
2401+ if ((c2 & 0xC0 ) != 0x080 ) {
2402+ _reportInvalidOther (c2 & 0xFF , inPtr );
2403+ }
2404+ final int c3 = inputBuf [inPtr ++];
2405+ if ((c3 & 0xC0 ) != 0x080 ) {
2406+ _reportInvalidOther (c3 & 0xFF , inPtr );
2407+ }
2408+ i = ((i & 0x0F ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
24102409 }
2411- i = ((i & 0x0F ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
2410+ break ;
2411+ case 3 :
2412+ // 30-Jan-2021, tatu: TODO - validate these too?
2413+ i = ((i & 0x07 ) << 18 )
2414+ | ((inputBuf [inPtr ++] & 0x3F ) << 12 )
2415+ | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2416+ | (inputBuf [inPtr ++] & 0x3F );
2417+ // note: this is the codepoint value; need to split, too
2418+ i -= 0x10000 ;
2419+ outBuf [outPtr ++] = (char ) (0xD800 | (i >> 10 ));
2420+ i = 0xDC00 | (i & 0x3FF );
2421+ break ;
2422+ default : // invalid
2423+ _reportInvalidInitial (i );
24122424 }
2413- break ;
2414- case 3 :
2415- // 30-Jan-2021, tatu: TODO - validate these too?
2416- i = ((i & 0x07 ) << 18 )
2417- | ((inputBuf [inPtr ++] & 0x3F ) << 12 )
2418- | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2419- | (inputBuf [inPtr ++] & 0x3F );
2420- // note: this is the codepoint value; need to split, too
2421- i -= 0x10000 ;
2422- outBuf [outPtr ++] = (char ) (0xD800 | (i >> 10 ));
2423- i = 0xDC00 | (i & 0x3FF );
2424- break ;
2425- default : // invalid
2426- _reportInvalidInitial (i );
24272425 }
24282426 outBuf [outPtr ++] = (char ) i ;
24292427 } while (inPtr < end );
@@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce
38503848 expLen , actLen ), _currToken );
38513849 }
38523850
3853- // @since 2.13
3854- /*
3851+ // @since 2.18.1
38553852 private String _reportTruncatedUTF8InString (int strLenBytes , int truncatedCharOffset ,
38563853 int firstUTFByteValue , int bytesExpected )
38573854 throws IOException
38583855 {
38593856 throw _constructError (String .format (
3860- "Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
3857+ "Truncated UTF-8 character in Unicode String value (%d bytes): "
38613858+"byte 0x%02X at offset #%d indicated %d more bytes needed" ,
38623859strLenBytes , firstUTFByteValue , truncatedCharOffset , bytesExpected ));
38633860 }
3864- */
38653861
38663862 // @since 2.13
38673863 private String _reportTruncatedUTF8InName (int strLenBytes , int truncatedCharOffset ,
0 commit comments