Skip to content

Commit 48b2d2d

Browse files
committed
Fix UTF-8 decoding of lazy bytestrings
At the beginning of a new chunk we may be trying to complete a UTF-8 sequence started in the previous chunk (contained in the `undecode0` buffer). If it turns out to be invalid, we must apply the `onErr` handler to every character in that buffer. When we reach the end of the chunk, we must also be more careful about when to keep the previous buffer: a UTF-8 sequence (up to 4 bytes) can span more than two chunks, when those chunks are very short (of length 0, 1, or 2).
1 parent 35d53cd commit 48b2d2d

File tree

5 files changed

+78
-20
lines changed

5 files changed

+78
-20
lines changed

src/Data/Text/Encoding.hs

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ import Control.Monad.ST (runST)
7070
import Data.Bits ((.&.))
7171
import Data.ByteString as B
7272
import qualified Data.ByteString.Internal as B
73+
import Data.Foldable (traverse_)
7374
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
7475
import Data.Text.Internal (Text(..), safe, text)
7576
import Data.Text.Internal.Functions
@@ -285,19 +286,22 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
285286
streamDecodeUtf8 :: ByteString -> Decoding
286287
streamDecodeUtf8 = streamDecodeUtf8With strictDecode
287288

288-
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
289+
-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
289290
-- encoded text.
290291
--
291292
-- @since 1.0.0.0
292293
streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
293294
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
294295
where
295296
-- We create a slightly larger than necessary buffer to accommodate a
296-
-- potential surrogate pair started in the last buffer
297+
-- potential surrogate pair started in the last buffer (@undecoded0@), or
298+
-- replacement characters for each byte in @undecoded0@ if the
299+
-- sequence turns out to be invalid. There can be up to three bytes there,
300+
-- hence we allocate @len+3@ 16-bit words.
297301
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
298302
-> Decoding
299303
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
300-
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
304+
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
301305
where
302306
decodeChunkToBuffer :: A.MArray s -> IO Decoding
303307
decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
@@ -307,23 +311,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
307311
with nullPtr $ \curPtrPtr ->
308312
let end = ptr `plusPtr` len
309313
loop curPtr = do
314+
prevState <- peek statePtr
310315
poke curPtrPtr curPtr
311-
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
316+
lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
312317
curPtrPtr end codepointPtr statePtr
313318
state <- peek statePtr
314319
case state of
315320
UTF8_REJECT -> do
316321
-- We encountered an encoding error
317-
x <- peek curPtr'
318322
poke statePtr 0
319-
case onErr desc (Just x) of
320-
Nothing -> loop $ curPtr' `plusPtr` 1
321-
Just c -> do
322-
destOff <- peek destOffPtr
323-
w <- unsafeSTToIO $
324-
unsafeWrite dest (fromIntegral destOff) (safe c)
325-
poke destOffPtr (destOff + fromIntegral w)
326-
loop $ curPtr' `plusPtr` 1
323+
let skipByte x = case onErr desc (Just x) of
324+
Nothing -> return ()
325+
Just c -> do
326+
destOff <- peek destOffPtr
327+
w <- unsafeSTToIO $
328+
unsafeWrite dest (fromIntegral destOff) (safe c)
329+
poke destOffPtr (destOff + fromIntegral w)
330+
if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
331+
-- If we can't complete the sequence @undecoded0@ from
332+
-- the previous chunk, we invalidate the bytes from
333+
-- @undecoded0@ and retry decoding the current chunk from
334+
-- the initial state.
335+
traverse_ skipByte (B.unpack undecoded0 )
336+
loop lastPtr
337+
else do
338+
peek lastPtr >>= skipByte
339+
loop (lastPtr `plusPtr` 1)
327340

328341
_ -> do
329342
-- We encountered the end of the buffer while decoding
@@ -332,11 +345,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
332345
chunkText <- unsafeSTToIO $ do
333346
arr <- A.unsafeFreeze dest
334347
return $! text arr 0 (fromIntegral n)
335-
lastPtr <- peek curPtrPtr
336-
let left = lastPtr `minusPtr` curPtr
348+
let left = lastPtr `minusPtr` ptr
337349
!undecoded = case state of
338350
UTF8_ACCEPT -> B.empty
339-
_ -> B.append undecoded0 (B.drop left bs)
351+
_ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
352+
| otherwise -> B.drop left bs
340353
return $ Some chunkText undecoded
341354
(decodeChunk undecoded codepoint state)
342355
in loop ptr

src/Data/Text/Lazy/Encoding.hs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
102102
TE.Some t l f -> chunk t (go f l bs)
103103
go _ l _
104104
| S.null l = empty
105-
| otherwise = case onErr desc (Just (B.unsafeHead l)) of
106-
Nothing -> empty
107-
Just c -> Chunk (T.singleton c) Empty
105+
| otherwise =
106+
let !t = T.pack (skipBytes l)
107+
skipBytes = S.foldr (\x s' ->
108+
case onErr desc (Just x) of
109+
Just c -> c : s'
110+
Nothing -> s') [] in
111+
Chunk t Empty
108112
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
109113
decodeUtf8With _ _ = empty
110114

tests/Tests/Properties/Transcoding.hs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
2424
import qualified Data.ByteString.Lazy as BL
2525
import qualified Data.Text as T
2626
import qualified Data.Text.Encoding as E
27+
import qualified Data.Text.Encoding.Error as E
2728
import qualified Data.Text.Lazy as TL
2829
import qualified Data.Text.Lazy.Encoding as EL
2930

@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
152153
k <- choose (0,n)
153154
vectorOf k gen
154155

156+
decodeLL :: BL.ByteString -> TL.Text
157+
decodeLL = EL.decodeUtf8With E.lenientDecode
158+
159+
decodeL :: B.ByteString -> T.Text
160+
decodeL = E.decodeUtf8With E.lenientDecode
161+
162+
-- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
163+
-- and it should behave the same as decoding of strict bytestrings.
164+
t_decode_utf8_lenient :: Property
165+
t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
166+
decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
167+
155168
-- See http://unicode.org/faq/utf_bom.html#gen8
156169
-- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
157170
-- When faced with this illegal byte sequence ... a UTF-8 conformant process
@@ -206,6 +219,7 @@ testTranscoding =
206219
testProperty "t_utf8_err'" t_utf8_err'
207220
],
208221
testGroup "error recovery" [
222+
testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
209223
testProperty "t_decode_with_error2" t_decode_with_error2,
210224
testProperty "t_decode_with_error3" t_decode_with_error3,
211225
testProperty "t_decode_with_error4" t_decode_with_error4,

tests/Tests/QuickCheckUtils.hs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
5252
import Test.QuickCheck.Unicode (string)
5353
import Tests.Utils
5454
import qualified Data.ByteString as B
55+
import qualified Data.ByteString.Lazy as BL
5556
import qualified Data.Text as T
5657
import qualified Data.Text.Encoding.Error as T
5758
import qualified Data.Text.Internal.Fusion as TF
@@ -69,6 +70,9 @@ import Data.Word (Word, Word64)
6970
genUnicode :: IsString a => Gen a
7071
genUnicode = fromString <$> string
7172

73+
genWord8 :: Gen Word8
74+
genWord8 = chooseAny
75+
7276
instance Random I16 where
7377
randomR = integralRandomR
7478
random = randomR (minBound,maxBound)
@@ -78,9 +82,23 @@ instance Arbitrary I16 where
7882
shrink = shrinkIntegral
7983

8084
instance Arbitrary B.ByteString where
81-
arbitrary = B.pack `fmap` arbitrary
85+
arbitrary = B.pack `fmap` listOf genWord8
8286
shrink = map B.pack . shrink . B.unpack
8387

88+
instance Arbitrary BL.ByteString where
89+
arbitrary = oneof
90+
[ BL.fromChunks <$> arbitrary
91+
-- so that a single utf8 code point could appear split over up to 4 chunks
92+
, BL.fromChunks . map B.singleton <$> listOf genWord8
93+
-- so that a code point with 4 byte long utf8 representation
94+
-- could appear split over 3 non-singleton chunks
95+
, (\a b c -> BL.fromChunks [a, b, c])
96+
<$> arbitrary
97+
<*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
98+
<*> arbitrary
99+
]
100+
shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
101+
84102
#if !MIN_VERSION_base(4,4,0)
85103
instance Random Int64 where
86104
randomR = integralRandomR

tests/Tests/Regressions.hs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
2020
import qualified Data.Text as T
2121
import qualified Data.Text.Array as TA
2222
import qualified Data.Text.Encoding as TE
23+
import qualified Data.Text.Encoding.Error as E
2324
import qualified Data.Text.Internal as T
2425
import qualified Data.Text.IO as T
2526
import qualified Data.Text.Lazy as LT
@@ -136,6 +137,13 @@ t301 = do
136137
original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
137138
T.Text newArr _off _len = T.take 1 $ T.drop 1 original
138139

140+
t330 :: IO ()
141+
t330 = do
142+
let decodeL = LE.decodeUtf8With E.lenientDecode
143+
assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
144+
(decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
145+
(decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
146+
139147
tests :: F.TestTree
140148
tests = F.testGroup "Regressions"
141149
[ F.testCase "hGetContents_crash" hGetContents_crash
@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
149157
, F.testCase "t280/fromString" t280_fromString
150158
, F.testCase "t280/singleton" t280_singleton
151159
, F.testCase "t301" t301
160+
, F.testCase "t330" t330
152161
]

0 commit comments

Comments
 (0)