diff --git a/Data/Csv.hs b/Data/Csv.hs
index 86a4e76..f38f1d6 100644
--- a/Data/Csv.hs
+++ b/Data/Csv.hs
@@ -32,10 +32,12 @@ module Data.Csv
     -- $options
     , DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , decodeWith
     , decodeByNameWith
     , EncodeOptions(..)
     , defaultEncodeOptions
+    , spaceeEncodeOptions
     , encodeWith
     , encodeByNameWith
diff --git a/Data/Csv/Encoding.hs b/Data/Csv/Encoding.hs
index b381746..5857c33 100644
--- a/Data/Csv/Encoding.hs
+++ b/Data/Csv/Encoding.hs
@@ -21,10 +21,12 @@ module Data.Csv.Encoding
     -- ** Encoding and decoding options
     , DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
    , decodeWith
     , decodeByNameWith
     , EncodeOptions(..)
     , defaultEncodeOptions
+    , spaceEncodeOptions
     , encodeWith
     , encodeByNameWith
     ) where
@@ -123,7 +125,7 @@ decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> HasHeader
             -> BL8.ByteString -> Either String a
 decodeWithC p !opts hasHeader = decodeWithP parser
   where parser = case hasHeader of
-          HasHeader -> header (decDelimiter opts) *> p opts
+          HasHeader -> header opts *> p opts
           NoHeader  -> p opts
 {-# INLINE decodeWithC #-}
@@ -151,10 +153,16 @@ data EncodeOptions = EncodeOptions
       encDelimiter :: {-# UNPACK #-} !Word8
     } deriving (Eq, Show)
 
--- | Encoding options for CSV files.
+-- | Encoding options for CSV files. Comma is used as the separator.
 defaultEncodeOptions :: EncodeOptions
 defaultEncodeOptions = EncodeOptions
-    { encDelimiter = 44  -- comma
+    { encDelimiter = 44  -- comma
+    }
+
+-- | Encoding options for space-delimited files. Tab is used as the separator.
+spaceEncodeOptions :: EncodeOptions
+spaceEncodeOptions = EncodeOptions
+    { encDelimiter = 9  -- tab
     }
 
 -- | Like 'encode', but lets you customize how the CSV data is
@@ -167,13 +175,13 @@ encodeWith opts = toLazyByteString
 encodeRecord :: Word8 -> Record -> Builder
 encodeRecord delim = mconcat . intersperse (fromWord8 delim)
-    . map fromByteString . map escape . V.toList
+    . map fromByteString . map (escape delim) . V.toList
 {-# INLINE encodeRecord #-}
 
 -- TODO: Optimize
-escape :: B.ByteString -> B.ByteString
-escape s
-    | B.any (\ b -> b == dquote || b == comma || b == nl || b == cr || b == sp)
+escape :: Word8 -> B.ByteString -> B.ByteString
+escape delim s
+    | B.any (\ b -> b == dquote || b == delim || b == nl || b == cr || b == sp)
       s = toByteString $
         fromWord8 dquote
         <> B.foldl
@@ -185,11 +193,11 @@ escape s
         <> fromWord8 dquote
     | otherwise = s
   where
+    sp = 32
     dquote = 34
-    comma = 44
     nl = 10
     cr = 13
-    sp = 32
+
 
 -- | Like 'encodeByName', but lets you customize how the CSV data is
 -- encoded.
@@ -262,7 +270,7 @@ csv !opts = do
     return $! V.fromList vals
   where
     records = do
-        !r <- record (decDelimiter opts)
+        !r <- record opts
         if blankLine r
             then (endOfLine *> records) <|> pure []
            else case runParser (parseRecord r) of
@@ -276,7 +284,7 @@ csv !opts = do
 csvWithHeader :: FromNamedRecord a => DecodeOptions
               -> AL.Parser (Header, V.Vector a)
 csvWithHeader !opts = do
-    !hdr <- header (decDelimiter opts)
+    !hdr <- header opts
     vals <- records hdr
     _ <- optional endOfLine
     endOfInput
@@ -284,7 +292,7 @@ csv !opts = do
     return (hdr, v)
   where
     records hdr = do
-        !r <- record (decDelimiter opts)
+        !r <- record opts
         if blankLine r
             then (endOfLine *> records hdr) <|> pure []
            else case runParser (convert hdr r) of
diff --git a/Data/Csv/Incremental.hs b/Data/Csv/Incremental.hs
index c03aefc..c185bdb 100644
--- a/Data/Csv/Incremental.hs
+++ b/Data/Csv/Incremental.hs
@@ -133,7 +133,7 @@ decodeHeader = decodeHeaderWith defaultDecodeOptions
 decodeHeaderWith :: DecodeOptions -> HeaderParser B.ByteString
 decodeHeaderWith !opts = PartialH (go . parser)
   where
-    parser = A.parse (header $ decDelimiter opts)
+    parser = A.parse (header opts)
 
     go (A.Fail rest _ msg) = FailH rest err
       where err = "parse error (" ++ msg ++ ")"
@@ -290,7 +290,7 @@ decodeWithP p !opts = go Incomplete [] . parser
       acc' | blankLine r = acc
            | otherwise = let !r' = convert r in r' : acc
 
-    parser = A.parse (record (decDelimiter opts) <* (endOfLine <|> endOfInput))
+    parser = A.parse (record opts <* (endOfLine <|> endOfInput))
     convert = runParser . p
 {-# INLINE decodeWithP #-}
diff --git a/Data/Csv/Parser.hs b/Data/Csv/Parser.hs
index 0a8c80e..7002dc6 100644
--- a/Data/Csv/Parser.hs
+++ b/Data/Csv/Parser.hs
@@ -16,6 +16,7 @@ module Data.Csv.Parser
     ( DecodeOptions(..)
     , defaultDecodeOptions
+    , spaceDecodeOptions
     , csv
     , csvWithHeader
     , header
@@ -26,8 +27,8 @@ module Data.Csv.Parser
 import Blaze.ByteString.Builder (fromByteString, toByteString)
 import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Control.Applicative (Alternative, (*>), (<$>), (<*), (<|>), optional,
-                            pure)
+import Control.Applicative (Alternative, (*>), (<$), (<$>), (<*), (<|>),
+                            optional, pure)
 import Data.Attoparsec.Char8 (char, endOfInput, endOfLine)
 import qualified Data.Attoparsec as A
 import qualified Data.Attoparsec.Lazy as AL
@@ -55,19 +56,51 @@ import Data.Csv.Util ((<$!>), blankLine)
 -- > }
 data DecodeOptions = DecodeOptions
     { -- | Field delimiter.
-      decDelimiter :: {-# UNPACK #-} !Word8
-    } deriving (Eq, Show)
+      decDelimiter :: Word8 -> Bool
 
--- | Decoding options for parsing CSV files.
+      -- | Runs of consecutive delimiters are regarded as a single
+      -- delimiter. This is useful e.g. when parsing whitespace
+      -- separated data.
+    , decMergeDelimiters :: !Bool
+
+      -- | Trim leading and trailing whitespace at the beginning and
+      -- end of each record (but not at the beginning and end of each
+      -- field).
+    , decTrimRecordSpace :: !Bool
+    }
+
+-- | Decoding options for parsing CSV files. Fields' values are set to:
+--
+-- [@'decDelimiter'@] comma
+--
+-- [@'decMergeDelimiters'@] false
+--
+-- [@'decTrimRecordSpace'@] false
 defaultDecodeOptions :: DecodeOptions
 defaultDecodeOptions = DecodeOptions
-    { decDelimiter = 44  -- comma
+    { decDelimiter = (==44)  -- comma
+    , decMergeDelimiters = False
+    , decTrimRecordSpace = False
+    }
+
+-- | Decoding options for parsing space-delimited files. Fields' values are set to:
+--
+-- [@'decDelimiter'@] space or tab character.
+--
+-- [@'decMergeDelimiters'@] true
+--
+-- [@'decTrimRecordSpace'@] true
+spaceDecodeOptions :: DecodeOptions
+spaceDecodeOptions = DecodeOptions
+    { decDelimiter = \c -> c == space || c == tab
+    , decMergeDelimiters = True
+    , decTrimRecordSpace = True
     }
 
 -- | Parse a CSV file that does not include a header.
 csv :: DecodeOptions -> AL.Parser Csv
 csv !opts = do
-    vals <- record (decDelimiter opts) `sepBy1'` endOfLine
+    vals <- record opts `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let nonEmpty = removeBlankLines vals
@@ -94,23 +127,24 @@ sepBy1' p s = go
 -- | Parse a CSV file that includes a header.
 csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
 csvWithHeader !opts = do
-    !hdr <- header (decDelimiter opts)
+    !hdr <- header opts
     vals <- map (toNamedRecord hdr) . removeBlankLines <$>
-            (record (decDelimiter opts)) `sepBy1'` endOfLine
+            (record opts) `sepBy1'` endOfLine
     _ <- optional endOfLine
     endOfInput
     let !v = V.fromList vals
     return (hdr, v)
 
 -- | Parse a header, including the terminating line separator.
-header :: Word8  -- ^ Field delimiter
+header :: DecodeOptions  -- ^ Decoding options
        -> AL.Parser Header
-header !delim = V.fromList <$!> name delim `sepBy1'` (A.word8 delim) <* endOfLine
+header = record
+{-# INLINE header #-}
 
 -- | Parse a header name. Header names have the same format as regular
 -- 'field's.
-name :: Word8 -> AL.Parser Name
-name !delim = field delim
+name :: DecodeOptions -> AL.Parser Name
+name = field
 
 removeBlankLines :: [Record] -> [Record]
 removeBlankLines = filter (not . blankLine)
@@ -120,23 +154,42 @@ removeBlankLines = filter (not . blankLine)
 -- CSV file is allowed to not have a terminating line separator. You
 -- most likely want to use the 'endOfLine' parser in combination with
 -- this parser.
-record :: Word8  -- ^ Field delimiter
-       -> AL.Parser Record
-record !delim = do
-    fs <- field delim `sepBy1'` (A.word8 delim)
-    return $! V.fromList fs
+record :: DecodeOptions -> AL.Parser Record
+record !opts
+    -- If we need to trim spaces from the line, the only robust way to
+    -- do so is to read the whole line, strip the spaces and run the
+    -- record parser on the trimmed line. For example:
+    --
+    -- + "a,b,c " will be parsed as ["a","b","c "] since spaces are
+    --   allowed in a field
+    -- + "a b c " will be parsed as ["a","b","c",""] if we use space
+    --   as the separator.
+    | decTrimRecordSpace opts = do
+        AL.skipMany $ AL.satisfy isSpace
+        line <- AL.takeWhile $ \c -> c /= newline && c /= cr
+        let (dat,_) = S.spanEnd isSpace line
+        case AL.parseOnly parser dat of
+          Left e  -> fail e
+          Right x -> return x
+    | otherwise = parser
+  where
+    delim = decDelimiter opts
+    delimiter | decMergeDelimiters opts = A.skipMany1 (A.satisfy delim)
+              | otherwise = () <$ A.satisfy delim
+    parser = do fs <- field opts `sepBy1'` delimiter
+                return $! V.fromList fs
 {-# INLINE record #-}
 
 -- | Parse a field. The field may be in either the escaped or
 -- non-escaped format. The return value is unescaped.
-field :: Word8 -> AL.Parser Field
-field !delim = do
+field :: DecodeOptions -> AL.Parser Field
+field !opt = do
     mb <- A.peekWord8
     -- We purposely don't use <|> as we want to commit to the first
     -- choice if we see a double quote.
     case mb of
         Just b | b == doubleQuote -> escapedField
-        _                         -> unescapedField delim
+        _                         -> unescapedField opt
 {-# INLINE field #-}
 
 escapedField :: AL.Parser S.ByteString
@@ -154,11 +207,14 @@ escapedField = do
             Left err -> fail err
         else return s
 
-unescapedField :: Word8 -> AL.Parser S.ByteString
-unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
-                                            c /= newline &&
-                                            c /= delim &&
-                                            c /= cr)
+unescapedField :: DecodeOptions -> AL.Parser S.ByteString
+unescapedField !opt = A.takeWhile (\ c -> c /= doubleQuote &&
+                                          c /= newline &&
+                                          c /= cr &&
+                                          not (delim c))
+  where
+    delim = decDelimiter opt
+{-# INLINE unescapedField #-}
 
 dquote :: AL.Parser Char
 dquote = char '"'
@@ -178,7 +234,13 @@ unescape = toByteString <$!> go mempty where
                     then return (acc `mappend` fromByteString h)
                     else rest
 
-doubleQuote, newline, cr :: Word8
+doubleQuote, newline, cr, space, tab :: Word8
 doubleQuote = 34
-newline = 10
-cr = 13
+newline = 10
+cr = 13
+space = 32
+tab = 9
+
+isSpace :: Word8 -> Bool
+isSpace c = c == space || c == tab
+{-# INLINE isSpace #-}
diff --git a/tests/UnitTests.hs b/tests/UnitTests.hs
index 4089b07..5523d29 100644
--- a/tests/UnitTests.hs
+++ b/tests/UnitTests.hs
@@ -23,6 +23,7 @@ import Test.Framework.Providers.QuickCheck2 as TF
 import Data.Csv hiding (record)
 import qualified Data.Csv.Streaming as S
 
+
 ------------------------------------------------------------------------
 -- Parse tests
@@ -114,6 +115,14 @@ positionalTests =
       [ testCase "tab-delim" $ encodesWithAs (defEnc { encDelimiter = 9 })
        [["1", "2"]] "1\t2\r\n"
       ]
+    , testGroup "encodeSpace" $ map (\(n,a,b) -> testCase n $ encodesWithAs spaceEncodeOptions a b)
+      [ ("simple", [["abc"]], "abc\r\n")
+      , ("leadingSpace", [[" abc"]], "\" abc\"\r\n")
+      , ("comma", [["abc,def"]], "abc,def\r\n")
+      , ("space", [["abc def"]], "\"abc def\"\r\n")
+      , ("tab", [["abc\tdef"]], "\"abc\tdef\"\r\n")
+      , ("twoFields", [["abc","def"]], "abc\tdef\r\n")
+      ]
     , testGroup "decode" $ map decodeTest decodeTests
     , testGroup "decodeWith" $ map decodeWithTest decodeWithTests
     , testGroup "streaming"
@@ -142,7 +151,12 @@ positionalTests =
        , ("rfc4180", rfc4180Input, rfc4180Output)
        ]
     decodeWithTests =
-        [ ("tab-delim", defDec { decDelimiter = 9 }, "1\t2", [["1", "2"]])
+        [ ("tab-delim", defDec { decDelimiter = (==9) }, "1\t2", [["1", "2"]])
+        , ("mixed-space", spaceDec, " 88 c \t 0.4 ", [["88", "c", "0.4"]])
+        , ("multiline-space", spaceDec, " 11 22 \n 11 22", [ ["11","22"]
+                                                           , ["11","22"]])
+        , ("blankLine-space", spaceDec, "1 2\n\n3 4\n", [ ["1","2"]
+                                                        , ["3","4"]])
        ]
     encodeTest (name, input, expected) =
@@ -155,8 +169,9 @@ positionalTests =
        testCase name $ input `decodesStreamingAs` expected
     streamingDecodeWithTest (name, opts, input, expected) =
        testCase name $ decodesWithStreamingAs opts input expected
-    defEnc = defaultEncodeOptions
-    defDec = defaultDecodeOptions
+    defEnc   = defaultEncodeOptions
+    defDec   = defaultDecodeOptions
+    spaceDec = spaceDecodeOptions
 
 nameBasedTests :: [TF.Test]
 nameBasedTests =
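
For reference, a minimal sketch of how the new options might be used from client code. It is not part of the patch and assumes that Data.Csv re-exports spaceDecodeOptions as added above and that decodeWith keeps the DecodeOptions -> HasHeader -> ByteString shape implied by decodeWithC in Data.Csv.Encoding; the sample input and the tuple record type are invented for illustration.

module Main (main) where

import qualified Data.ByteString.Lazy.Char8 as BL8
import qualified Data.Vector as V
import Data.Csv

main :: IO ()
main =
    case decoded of
        Left err -> putStrLn ("decode failed: " ++ err)
        Right rs -> mapM_ print (V.toList rs)
  where
    -- With spaceDecodeOptions, runs of spaces and tabs act as a single
    -- delimiter and whitespace around each record is trimmed, so both
    -- rows below decode to three fields.
    decoded :: Either String (V.Vector (String, String, Double))
    decoded = decodeWith spaceDecodeOptions NoHeader input

    input = BL8.pack " 88  c \t 0.4 \n 11\t22  0.5\n"

With the patch applied this should print ("88","c",0.4) and ("11","22",0.5), mirroring the "mixed-space" case in decodeWithTests above.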