Fix #236

cowtowncoder · cowtowncoder · commit ab746939fdf8 · 2021-01-29T19:04:27.000-08:00
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java
@@ -89,7 +89,7 @@ private Feature(boolean defaultState) {
      * I/O context for this reader. It handles buffer allocation
      * for the reader.
      */
-    final protected IOContext _ioContext;
+    protected final IOContext _ioContext;
 
     /**
      * Flag that indicates whether parser is closed or not. Gets
@@ -2157,17 +2157,21 @@ protected String _finishTextToken(int ch) throws IOException
             _finishChunkedText();
             return _textBuffer.contentsAsString();
         }
-        if (len > (_inputEnd - _inputPtr)) {
-            // or if not, could we read?
-            if (len >= _inputBuffer.length) {
-                // If not enough space, need handling similar to chunked
-                _finishLongText(len);
-                return _textBuffer.contentsAsString();
-            }
-            _loadToHaveAtLeast(len);
+        // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
+        //    the longest individual unit is 4 bytes (surrogate pair) so we
+        //    actually need len+3 bytes to avoid bounds checks
+        final int needed = len + 3;
+        final int available = _inputEnd - _inputPtr;
+
+        if ((available >= needed)
+            // if not, could we read? NOTE: we do not require it, just attempt to read
+                || ((_inputBuffer.length >= needed)
+                        && _tryToLoadToHaveAtLeast(needed))) {
+            return _finishShortText(len);
         }
-        // offline for better optimization
-        return _finishShortText(len);
+        // If not enough space, need handling similar to chunked
+        _finishLongText(len);
+        return _textBuffer.contentsAsString();
     }
 
     private final String _finishShortText(int len) throws IOException
@@ -2240,7 +2244,7 @@ private final void _finishLongText(int len) throws IOException
                 continue;
             }
             if ((len -= code) < 0) { // may need to improve error here but...
-                throw _constructError("Malformed UTF-8 character at end of long (non-chunked) text segment");
+                throw _constructError("Malformed UTF-8 character at the end of a (non-chunked) text segment");
             }
             
             switch (code) {
@@ -2260,13 +2264,13 @@ private final void _finishLongText(int len) throws IOException
                 break;
             case 3: // 4-byte UTF
                 c = _decodeUTF8_4(c);
-                // Let's add first part right away:
-                outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
                 if (outPtr >= outBuf.length) {
                     outBuf = _textBuffer.finishCurrentSegment();
                     outPtr = 0;
                     outEnd = outBuf.length;
                 }
+                // Let's add first part right away:
+                outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
                 c = 0xDC00 | (c & 0x3FF);
                 // And let the other char output down below
                 break;
@@ -3367,6 +3371,37 @@ protected final void _loadToHaveAtLeast(int minAvailable) throws IOException
         }
     }
 
+    // @since 2.12.2
+    protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOException
+    {
+        // No input stream, no leading (either we are closed, or have non-stream input source)
+        if (_inputStream == null) {
+            return false;
+        }
+        // Need to move remaining data in front?
+        int amount = _inputEnd - _inputPtr;
+        if (amount > 0 && _inputPtr > 0) {
+            //_currInputRowStart -= _inputPtr;
+            System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, amount);
+            _inputEnd = amount;
+        } else {
+            _inputEnd = 0;
+        }
+        // Needs to be done here, as per [dataformats-binary#178]
+        _currInputProcessed += _inputPtr;
+        _inputPtr = 0;
+        while (_inputEnd < minAvailable) {
+            int count = _inputStream.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd);
+            if (count < 1) {
+                // End of input; not ideal but we'll accept it here
+                _closeInput();
+                return false;
+            }
+            _inputEnd += count;
+        }
+        return true;
+    }
+
     protected ByteArrayBuilder _getByteArrayBuilder() {
         if (_byteArrayBuilder == null) {
             _byteArrayBuilder = new ByteArrayBuilder();
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/failing/ParseInvalidUTF8String236Test.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/failing/ParseInvalidUTF8String236Test.java
@@ -1,20 +1,57 @@
 package com.fasterxml.jackson.dataformat.cbor.failing;
 
-import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.*;
+import com.fasterxml.jackson.core.exc.StreamReadException;
 import com.fasterxml.jackson.dataformat.cbor.CBORParser;
 import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
 
 public class ParseInvalidUTF8String236Test extends CBORTestBase
 {
-    // [dataformats-binary#236]: Ends with the first byte of alleged 2-byte
-    // UTF-8 character; parser trying to access second byte beyond end.
-    public void testArrayIssue236() throws Exception
+    // [dataformats-binary#236]: Original version; broken UTF-8 all around.
+    // but gets hit by end-of-input only (since content not validated)
+    public void testShortString236Original() throws Exception
     {
         final byte[] input = {0x66, (byte) 0xef, 0x7d, 0x7d, 0xa, 0x2d, (byte) 0xda};
         try (CBORParser p = cborParser(input)) {
             assertToken(JsonToken.VALUE_STRING, p.nextToken());
-            assertEquals("foobar", p.getText());
-            assertNull(p.nextToken());
+            try {
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Invalid UTF-8 middle byte 0x7d");
+            }
+        }
+    }
+
+    // Variant where the length would be valid, but the last byte is partial UTF-8
+    // code point
+    public void testShortString236EndsWithPartialUTF8() throws Exception
+    {
+        final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
+        try (CBORParser p = cborParser(input)) {
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            try {
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Malformed UTF-8 character at the end of");
+            }
+        }
+    }
+
+    // Variant where the length itself exceeds buffer
+    public void testShortString236TruncatedString() throws Exception
+    {
+        // String with length of 6 bytes claimed; only 5 provided
+        final byte[] input = {0x63, 0x41, 0x2d };
+        try (CBORParser p = cborParser(input)) {
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            try {
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Unexpected end-of-input in VALUE_STRING");
+            }
         }
     }
 }
diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x
@@ -157,3 +157,9 @@ Josh Barr (jobarr-amzn@github)
 
 * Contributed #232: (ion) Allow disabling native type ids in IonMapper
  (2.12.1)
+
+Fabian Meumertzheim (fmeum@github)
+
+* Reported #236: `ArrayIndexOutOfBoundsException` in `CBORParser` for invalid UTF-8 String
+ (2.12.2)
+
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
@@ -9,6 +9,11 @@ Project: jackson-datatypes-binaryModules:
 === Releases ===
 ------------------------------------------------------------------------
 
+2.12.2 (not yet released)
+
+#236: `ArrayIndexOutOfBoundsException` in `CBORParser` for invalid UTF-8 String
+ (reported by Fabian M)
+
 2.12.1 (08-Jan-2021)
 
 #232: (ion) Allow disabling native type ids in IonMapper