@@ -70,6 +70,7 @@ import Control.Monad.ST (runST)
7070import Data.Bits ((.&.) )
7171import Data.ByteString as B
7272import qualified Data.ByteString.Internal as B
73+ import Data.Foldable (traverse_ )
7374import Data.Text.Encoding.Error (OnDecodeError , UnicodeException , strictDecode )
7475import Data.Text.Internal (Text (.. ), safe , text )
7576import Data.Text.Internal.Functions
@@ -285,19 +286,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
285286streamDecodeUtf8 :: ByteString -> Decoding
286287streamDecodeUtf8 = streamDecodeUtf8With strictDecode
287288
288- -- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
289+ -- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
289290-- encoded text.
290291--
291292-- @since 1.0.0.0
292293streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
293294streamDecodeUtf8With onErr = decodeChunk B. empty 0 0
294295 where
295296 -- We create a slightly larger than necessary buffer to accommodate a
296- -- potential surrogate pair started in the last buffer
297+ -- potential surrogate pair started in the last buffer (@undecoded0@), or
298+ -- replacement characters for each byte in @undecoded0@ if the
299+ -- sequence turns out to be invalid. There can be up to three bytes there,
300+ -- hence we allocate @len+3@ 16-bit words.
297301 decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
298302 -> Decoding
299303 decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
300- aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A. new (len+ 1 )
304+ aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A. new (len+ 3 )
301305 where
302306 decodeChunkToBuffer :: A. MArray s -> IO Decoding
303307 decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ ptr ->
@@ -307,23 +311,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
307311 with nullPtr $ \ curPtrPtr ->
308312 let end = ptr `plusPtr` len
309313 loop curPtr = do
314+ prevState <- peek statePtr
310315 poke curPtrPtr curPtr
311- curPtr' <- c_decode_utf8_with_state (A. maBA dest) destOffPtr
316+ lastPtr <- c_decode_utf8_with_state (A. maBA dest) destOffPtr
312317 curPtrPtr end codepointPtr statePtr
313318 state <- peek statePtr
314319 case state of
315320 UTF8_REJECT -> do
316321 -- We encountered an encoding error
317- x <- peek curPtr'
318322 poke statePtr 0
319- case onErr desc (Just x) of
320- Nothing -> loop $ curPtr' `plusPtr` 1
321- Just c -> do
322- destOff <- peek destOffPtr
323- w <- unsafeSTToIO $
324- unsafeWrite dest (fromIntegral destOff) (safe c)
325- poke destOffPtr (destOff + fromIntegral w)
326- loop $ curPtr' `plusPtr` 1
323+ let skipByte x = case onErr desc (Just x) of
324+ Nothing -> return ()
325+ Just c -> do
326+ destOff <- peek destOffPtr
327+ w <- unsafeSTToIO $
328+ unsafeWrite dest (fromIntegral destOff) (safe c)
329+ poke destOffPtr (destOff + fromIntegral w)
330+ if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
331+ -- If we can't complete the sequence @undecoded0@ from
332+ -- the previous chunk, we invalidate the bytes from
333+ -- @undecoded0@ and retry decoding the current chunk from
334+ -- the initial state.
335+ traverse_ skipByte (B. unpack undecoded0 )
336+ loop lastPtr
337+ else do
338+ peek lastPtr >>= skipByte
339+ loop (lastPtr `plusPtr` 1 )
327340
328341 _ -> do
329342 -- We encountered the end of the buffer while decoding
@@ -332,11 +345,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
332345 chunkText <- unsafeSTToIO $ do
333346 arr <- A. unsafeFreeze dest
334347 return $! text arr 0 (fromIntegral n)
335- lastPtr <- peek curPtrPtr
336- let left = lastPtr `minusPtr` curPtr
348+ let left = lastPtr `minusPtr` ptr
337349 ! undecoded = case state of
338350 UTF8_ACCEPT -> B. empty
339- _ -> B. append undecoded0 (B. drop left bs)
351+ _ | left == 0 && prevState /= UTF8_ACCEPT -> B. append undecoded0 bs
352+ | otherwise -> B. drop left bs
340353 return $ Some chunkText undecoded
341354 (decodeChunk undecoded codepoint state)
342355 in loop ptr
0 commit comments