Fix UTF-8 decoding of lazy bytestrings

Lysxia · Lysxia · commit 48b2d2d1abef · 2021-05-08T17:04:30.000-04:00
At the beginning of a new chunk we may be trying to complete a UTF-8
sequence started in the previous chunk (contained in the `undecode0`
buffer). If it turns out to be invalid, we must apply the `onErr`
handler to every character in that buffer.

When we reach the end of the chunk, we must also be more careful
about when to keep the previous buffer: a UTF-8 sequence (up to 4 bytes)
can span more than two chunks, when those chunks are very short
(of length 0, 1, or 2).
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
@@ -70,6 +70,7 @@ import Control.Monad.ST (runST)
 import Data.Bits ((.&.))
 import Data.ByteString as B
 import qualified Data.ByteString.Internal as B
+import Data.Foldable (traverse_)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal (Text(..), safe, text)
 import Data.Text.Internal.Functions
@@ -285,19 +286,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
 streamDecodeUtf8 :: ByteString -> Decoding
 streamDecodeUtf8 = streamDecodeUtf8With strictDecode
 
--- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
+-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
 -- encoded text.
 --
 -- @since 1.0.0.0
 streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
 streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
  where
   -- We create a slightly larger than necessary buffer to accommodate a
-  -- potential surrogate pair started in the last buffer
+  -- potential surrogate pair started in the last buffer (@undecoded0@), or
+  -- replacement characters for each byte in @undecoded0@ if the
+  -- sequence turns out to be invalid. There can be up to three bytes there,
+  -- hence we allocate @len+3@ 16-bit words.
   decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
               -> Decoding
   decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
-    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+    aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
        where
         decodeChunkToBuffer :: A.MArray s -> IO Decoding
         decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
@@ -307,23 +311,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
           with nullPtr $ \curPtrPtr ->
             let end = ptr `plusPtr` len
                 loop curPtr = do
+                  prevState <- peek statePtr
                   poke curPtrPtr curPtr
-                  curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+                  lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
                              curPtrPtr end codepointPtr statePtr
                   state <- peek statePtr
                   case state of
                     UTF8_REJECT -> do
                       -- We encountered an encoding error
-                      x <- peek curPtr'
                       poke statePtr 0
-                      case onErr desc (Just x) of
-                        Nothing -> loop $ curPtr' `plusPtr` 1
-                        Just c -> do
-                          destOff <- peek destOffPtr
-                          w <- unsafeSTToIO $
-                               unsafeWrite dest (fromIntegral destOff) (safe c)
-                          poke destOffPtr (destOff + fromIntegral w)
-                          loop $ curPtr' `plusPtr` 1
+                      let skipByte x = case onErr desc (Just x) of
+                            Nothing -> return ()
+                            Just c -> do
+                              destOff <- peek destOffPtr
+                              w <- unsafeSTToIO $
+                                   unsafeWrite dest (fromIntegral destOff) (safe c)
+                              poke destOffPtr (destOff + fromIntegral w)
+                      if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
+                        -- If we can't complete the sequence @undecoded0@ from
+                        -- the previous chunk, we invalidate the bytes from
+                        -- @undecoded0@ and retry decoding the current chunk from
+                        -- the initial state.
+                        traverse_ skipByte (B.unpack undecoded0 )
+                        loop lastPtr
+                      else do
+                        peek lastPtr >>= skipByte
+                        loop (lastPtr `plusPtr` 1)
 
                     _ -> do
                       -- We encountered the end of the buffer while decoding
@@ -332,11 +345,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
                       chunkText <- unsafeSTToIO $ do
                           arr <- A.unsafeFreeze dest
                           return $! text arr 0 (fromIntegral n)
-                      lastPtr <- peek curPtrPtr
-                      let left = lastPtr `minusPtr` curPtr
+                      let left = lastPtr `minusPtr` ptr
                           !undecoded = case state of
                             UTF8_ACCEPT -> B.empty
-                            _           -> B.append undecoded0 (B.drop left bs)
+                            _ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
+                              | otherwise -> B.drop left bs
                       return $ Some chunkText undecoded
                                (decodeChunk undecoded codepoint state)
             in loop ptr
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
@@ -102,9 +102,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
         TE.Some t l f -> chunk t (go f l bs)
     go _ l _
       | S.null l  = empty
-      | otherwise = case onErr desc (Just (B.unsafeHead l)) of
-                      Nothing -> empty
-                      Just c  -> Chunk (T.singleton c) Empty
+      | otherwise =
+        let !t = T.pack (skipBytes l)
+            skipBytes = S.foldr (\x s' ->
+                  case onErr desc (Just x) of
+                    Just c -> c : s'
+                    Nothing -> s') [] in
+        Chunk t Empty
     desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
 decodeUtf8With _ _ = empty
 
diff --git a/tests/Tests/Properties/Transcoding.hs b/tests/Tests/Properties/Transcoding.hs
@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as EL
 
@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
       k <- choose (0,n)
       vectorOf k gen
 
+decodeLL :: BL.ByteString -> TL.Text
+decodeLL = EL.decodeUtf8With E.lenientDecode
+
+decodeL :: B.ByteString -> T.Text
+decodeL = E.decodeUtf8With E.lenientDecode
+
+-- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
+-- and it should behave the same as decoding of strict bytestrings.
+t_decode_utf8_lenient :: Property
+t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
+    decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
+
 -- See http://unicode.org/faq/utf_bom.html#gen8
 -- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
 -- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -206,6 +219,7 @@ testTranscoding =
       testProperty "t_utf8_err'" t_utf8_err'
     ],
     testGroup "error recovery" [
+      testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
       testProperty "t_decode_with_error2" t_decode_with_error2,
       testProperty "t_decode_with_error3" t_decode_with_error3,
       testProperty "t_decode_with_error4" t_decode_with_error4,
diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs
@@ -52,6 +52,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
 import Test.QuickCheck.Unicode (string)
 import Tests.Utils
 import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding.Error as T
 import qualified Data.Text.Internal.Fusion as TF
@@ -69,6 +70,9 @@ import Data.Word (Word, Word64)
 genUnicode :: IsString a => Gen a
 genUnicode = fromString <$> string
 
+genWord8 :: Gen Word8
+genWord8 = chooseAny
+
 instance Random I16 where
     randomR = integralRandomR
     random  = randomR (minBound,maxBound)
@@ -78,9 +82,23 @@ instance Arbitrary I16 where
     shrink        = shrinkIntegral
 
 instance Arbitrary B.ByteString where
-    arbitrary     = B.pack `fmap` arbitrary
+    arbitrary     = B.pack `fmap` listOf genWord8
     shrink        = map B.pack . shrink . B.unpack
 
+instance Arbitrary BL.ByteString where
+    arbitrary = oneof
+      [ BL.fromChunks <$> arbitrary
+      -- so that a single utf8 code point could appear split over up to 4 chunks
+      , BL.fromChunks . map B.singleton <$> listOf genWord8
+      -- so that a code point with 4 byte long utf8 representation
+      -- could appear split over 3 non-singleton chunks
+      , (\a b c -> BL.fromChunks [a, b, c])
+        <$> arbitrary
+        <*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
+        <*> arbitrary
+      ]
+    shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
+
 #if !MIN_VERSION_base(4,4,0)
 instance Random Int64 where
     randomR = integralRandomR
diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs
@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
 import qualified Data.Text as T
 import qualified Data.Text.Array as TA
 import qualified Data.Text.Encoding as TE
+import qualified Data.Text.Encoding.Error as E
 import qualified Data.Text.Internal as T
 import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as LT
@@ -136,6 +137,13 @@ t301 = do
     original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
     T.Text newArr _off _len = T.take 1 $ T.drop 1 original
 
+t330 :: IO ()
+t330 = do
+  let decodeL = LE.decodeUtf8With E.lenientDecode
+  assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
+    (decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
+    (decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
+
 tests :: F.TestTree
 tests = F.testGroup "Regressions"
     [ F.testCase "hGetContents_crash" hGetContents_crash
@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
     , F.testCase "t280/fromString" t280_fromString
     , F.testCase "t280/singleton" t280_singleton
     , F.testCase "t301" t301
+    , F.testCase "t330" t330
     ]