diff --git a/.gitignore b/.gitignore index 503a5d38..8f23dcb8 100644 --- a/.gitignore +++ b/.gitignore @@ -83,7 +83,7 @@ Icon .AppleDB .AppleDesktop Network\ Trash\ Folder -Temporary Items +Temporary\ Items .apdisk # intelliJ idea And other editors diff --git a/Gemfile.lock b/Gemfile.lock index b10398a9..d0febc32 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -122,6 +122,7 @@ GEM execjs (>= 0.3.0, < 3) PLATFORMS + ruby x86_64-linux DEPENDENCIES diff --git a/_data/contents.yml b/_data/contents.yml index 260d02a9..46e8e31b 100644 --- a/_data/contents.yml +++ b/_data/contents.yml @@ -10,3 +10,11 @@ basics: - control-structures - pattern-matching - containers +parsers: + - introduction + - readp + - parsec + - megaparsec + - attoparsec + - appar + - parsley diff --git a/_data/locales/en.yml b/_data/locales/en.yml index 66becfa5..a1dd348c 100644 --- a/_data/locales/en.yml +++ b/_data/locales/en.yml @@ -11,6 +11,7 @@ sections: blog: Blog contributors: Contributors translation_report: Translation Report + parsers: Parsers combinators version_messages: outdated: "Some contents of this translation may be outdated." diff --git a/en/lessons/parsers/Introduction.hs b/en/lessons/parsers/Introduction.hs new file mode 100644 index 00000000..ca4c8a2b --- /dev/null +++ b/en/lessons/parsers/Introduction.hs @@ -0,0 +1,18 @@ +module Introduction where + +import Text.ParserCombinators.ReadP + +tuples :: ReadP [(String, String)] +tuples = many tuple + +tuple :: ReadP (String, String) +tuple = + between (char '(') (char ')') $ do + left <- munch1 (/= ',') + char ',' + skipSpaces + right <- munch1 (/= ')') + return (left, right) + +example :: [([(String, String)], String)] +example = readP_to_S tuples "(abc,def)(gh, ij)(k,l)" diff --git a/en/lessons/parsers/Parsec.hs b/en/lessons/parsers/Parsec.hs new file mode 100644 index 00000000..47c27d41 --- /dev/null +++ b/en/lessons/parsers/Parsec.hs @@ -0,0 +1,26 @@ +module Parsec where + +import Control.Monad +import Data.Functor.Identity +import Text.Parsec +import Text.Parsec.Char +import Text.Parsec.Language +import Text.Parsec.Token + +tokenParser :: GenTokenParser String u Identity +tokenParser = makeTokenParser emptyDef + +numParser :: Parsec String Integer Integer +numParser = do + n <- integer tokenParser + previous <- getState + guard $ n > previous + putState n + return n + +increasingNumbersParser :: Parsec String Integer [Integer] +increasingNumbersParser = + numParser `sepBy` spaces + +increasingNumbers :: String -> Either ParseError [Integer] +increasingNumbers = runParser increasingNumbersParser (-1) "" diff --git a/en/lessons/parsers/appar.md b/en/lessons/parsers/appar.md new file mode 100644 index 00000000..e249a3bf --- /dev/null +++ b/en/lessons/parsers/appar.md @@ -0,0 +1,112 @@ +--- +version: 1.0.0 +title: Appar +--- + +{% include toc.html %} + +[Appar](https://hackage.haskell.org/package/appar) is different from the other +libraries such that, they are monadic, while `appar` is applicative. +See [this](https://stackoverflow.com/a/7863380/1599054) to have a good explanation +between the two. + +# Definition + +Let's start with the definition: + +```haskell +data MkParser inp a = P { + runParser :: inp -> (Maybe a, inp) + } +``` + +It really looks like `attoparsec`'s `ZeptoT`, except that the `Monad` is inside +the result. + +Let's see the instances: + +```haskell +instance Functor (MkParser inp) where + f `fmap` p = return f <*> p + +instance Applicative (MkParser inp) where + pure a = P $ \bs -> (Just a, bs) + (<*>) = ap + +instance Alternative (MkParser inp) where + empty = mzero + (<|>) = mplus + +instance Monad (MkParser inp) where + return = pure + p >>= f = P $ \bs -> case runParser p bs of + (Nothing, bs') -> (Nothing, bs') + (Just a, bs') -> runParser (f a) bs' + +instance MonadPlus (MkParser inp) where + mzero = P $ \bs -> (Nothing, bs) + p `mplus` q = P $ \bs -> case runParser p bs of + (Nothing, bs') -> runParser q bs' + (Just a, bs') -> (Just a, bs') +``` + +One thing surprising is the `Monad` instance for an applicative parser, we can +conclude that the '`Applicative`ness' of the library cames from the lack of +shortcut, more than the implemented solution. + +# Construction + +How are parsers built: + +```haskell +class Eq inp => Input inp where + -- | The head function for input + car :: inp -> Char + -- | The tail function for input + cdr :: inp -> inp + -- | The end of input + nil :: inp + -- | The function to check the end of input + isNil :: inp -> Bool + +satisfy :: Input inp => (Char -> Bool) -> MkParser inp Char +satisfy predicate = P sat + where + sat bs + | isNil bs = (Nothing, nil) + | predicate b = (Just b, bs') + | otherwise = (Nothing, bs) + where + b = car bs + bs' = cdr bs +``` + +A bit abstract (and a bit lisp-like), but without surprises. + +Anothor interesting one: + +```haskell +try :: MkParser inp a -> MkParser inp a +try p = P $ \bs -> case runParser p bs of + (Nothing, _ ) -> (Nothing, bs) + (Just a, bs') -> (Just a, bs') +``` + +As expected: the parser is ran, if it succeeds, the input is consumer, or the +original input is returned. + +# Running the parser + +As we can expect, running the pparser will only consist in getting the computed +result: + +```haskell +parse :: Input inp => MkParser inp a -> inp -> Maybe a +parse p bs = fst (runParser p bs) +``` + +# Conclusion + +`appar` is really simple, but it is unique in the sense that it prevent +context-dependant parsing. + diff --git a/en/lessons/parsers/attoparsec.md b/en/lessons/parsers/attoparsec.md new file mode 100644 index 00000000..58f3a7c4 --- /dev/null +++ b/en/lessons/parsers/attoparsec.md @@ -0,0 +1,221 @@ +--- +version: 1.0.0 +title: Attoparsec +--- + +{% include toc.html %} + +[`attoparsec`](https://hackage.haskell.org/package/attoparsec) is a well known +parser combinators library, especially for its performances. + +# Definition + +Having a look at its definition: + +```haskell +newtype Parser i a = Parser { + runParser :: forall r. + State i -> Pos -> More + -> Failure i (State i) r + -> Success i (State i) a r + -> IResult i r + } + +type family State i +type instance State ByteString = B.Buffer +type instance State Text = T.Buffer + +type Failure i t r = t -> Pos -> More -> [String] -> String + -> IResult i r +type Success i t a r = t -> Pos -> More -> a -> IResult i r + +-- | Have we read all available input? +data More = Complete | Incomplete + +newtype Pos = Pos { fromPos :: Int } + deriving (Eq, Ord, Show, Num) + +-- | The result of a parse. This is parameterised over the type @i@ +-- of string that was processed. +-- +-- This type is an instance of 'Functor', where 'fmap' transforms the +-- value in a 'Done' result. +data IResult i r = + Fail i [String] String + -- ^ The parse failed. The @i@ parameter is the input that had + -- not yet been consumed when the failure occurred. The + -- @[@'String'@]@ is a list of contexts in which the error + -- occurred. The 'String' is the message describing the error, if + -- any. + | Partial (i -> IResult i r) + -- ^ Supply this continuation with more input so that the parser + -- can resume. To indicate that no more input is available, pass + -- an empty string to the continuation. + -- + -- __Note__: if you get a 'Partial' result, do not call its + -- continuation more than once. + | Done i r + -- ^ The parse succeeded. The @i@ parameter is the input that had + -- not yet been consumed (if any) when the parse succeeded. +``` + +It looks like `Megaparsec`, but with a far simpler (and more specialized) `State`. + +# Running the Parser + +Let see how to run a `Parser` (for `ByteString`): + +```haskell +parse :: Parser a -> ByteString -> Result a +parse m s = T.runParser m (buffer s) (Pos 0) Incomplete failK successK + +failK :: Failure a +failK t (Pos pos) _more stack msg = Fail (Buf.unsafeDrop pos t) stack msg + +successK :: Success a a +successK t (Pos pos) _more a = Done (Buf.unsafeDrop pos t) a + +buffer :: ByteString -> Buffer +``` + +`T.runParser` is the `Parser`'s value. + +Very simple, actually, error handling is limited to the position (`Pos`) and +the current processing state, are given directly. + +# Combinators + +Let's have a look at some combinators, starting by the usual instances: + +```haskell +instance Applicative (Parser i) where + pure v = Parser $ \t !pos more _lose succ -> succ t pos more v + +instance Alternative (Parser i) where + f <|> g = Parser $ \t pos more lose succ -> + let lose' t' _pos' more' _ctx _msg = runParser g t' pos more' lose succ + in runParser f t pos more lose' succ + +instance Monad (Parser i) where + m >>= k = Parser $ \t !pos more lose succ -> + let succ' t' !pos' more' a = runParser (k a) t' pos' more' lose succ + in runParser m t pos more lose succ' +``` + +Straightforward, maybe some combinators would be more interesting: + +```haskell +satisfy :: (Word8 -> Bool) -> Parser Word8 +satisfy p = do + h <- peekWord8' + if p h + then advance 1 >> return h + else fail "satisfy" + +-- | Match any byte, to perform lookahead. Does not consume any +-- input, but will fail if end of input has been reached. +peekWord8' :: Parser Word8 +peekWord8' = T.Parser $ \t pos more lose succ -> + if lengthAtLeast pos 1 t + then succ t pos more (Buf.unsafeIndex t (fromPos pos)) + else let succ' t' pos' more' bs' = succ t' pos' more' $! B.unsafeHead bs' + in ensureSuspended 1 t pos more lose succ' + +advance :: Int -> Parser () +advance n = T.Parser $ \t pos more _lose succ -> + succ t (pos + Pos n) more () +``` + +Here comes the interesting part: we check the head, if it works, we increment +the position, and left the input untouched. + +We can easily unterstand the reason behind `attoparsec`'s speed: a basic error +handling, and a small state. + +# Zepto + +`attoparsec` comes with another parser combinators type: [`Zepto`](https://hackage.haskell.org/package/attoparsec-0.14.3/docs/Data-Attoparsec-Zepto.html): + +> A tiny, highly specialized combinator parser for `ByteString` strings. +> +> While the main attoparsec module generally performs well, this module is particularly fast for simple non-recursive loops that should not normally result in failed parses. + +Let's have a look: + +```haskell +-- | A simple parser. +-- +-- This monad is strict in its state, and the monadic bind operator +-- ('>>=') evaluates each result to weak head normal form before +-- passing it along. +newtype ZeptoT m a = Parser { + runParser :: S -> m (Result a) + } + +type Parser a = ZeptoT Identity a + +newtype S = S { + input :: ByteString + } + +data Result a = Fail String + | OK !a S +``` + +Definitively the simplest parser combinator you can come up with. + +```haskell +instance (Monad m) => Applicative (ZeptoT m) where + pure a = Parser $ \s -> return (OK a s) + +instance Monad m => Alternative (ZeptoT m) where + empty = fail "empty" + + a <|> b = Parser $ \s -> do + result <- runParser a s + case result of + ok@(OK _ _) -> return ok + _ -> runParser b s + +instance Monad m => Monad (ZeptoT m) where + m >>= k = Parser $ \s -> do + result <- runParser m s + case result of + OK a s' -> runParser (k a) s' + Fail err -> return (Fail err) +``` + +It looks tedious because you deal with `Result`, but, in the end, you have all +the boilerplate needed to create a function stored in `Parser`. + +```haskell +parseT :: Monad m => ZeptoT m a -> ByteString -> m (Either String a) +parseT p bs = do + result <- runParser p (S bs) + case result of + OK a _ -> return (Right a) + Fail err -> return (Left err) +``` + +It comes with the following primitives: + +```haskell +gets :: Monad m => (S -> a) -> ZeptoT m a +gets f = Parser $ \s -> return (OK (f s) s) + +put :: Monad m => S -> ZeptoT m () +put s = Parser $ \_ -> return (OK () s) +``` + +With that, are defined the only available functions: `takeWhile`, `take`, `string`, `atEnd`. + +# Conclusion + +`attoparsec` clearly focuses on performances, saying that, it brings two important points: + +* It reaches performances by having the simplest design possible, sticking to its internals (mostly `ByteString`) +* It comes with a second abstraction, which push these principles to the extreme + +Interestingly, we could assume that, in order to acheive good performances, +the code would be more cryptic, while it's the simplest implementation we have +seen so far. diff --git a/en/lessons/parsers/introduction.md b/en/lessons/parsers/introduction.md new file mode 100644 index 00000000..15fa2358 --- /dev/null +++ b/en/lessons/parsers/introduction.md @@ -0,0 +1,68 @@ +--- +version: 1.0.0 +title: Introduction to Parser Combinators +--- + +{% include toc.html %} + +Parsers are ubiquitous in educational Haskell literature because parser combinators can be used to illustrate the power of composition. + +Parsers, while being mostly associated with compilers and interpreters, can be found everywhere. + +Their goal is to take input data (usually text or a stream), and produce structured output (usually a data structure, such as an [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree)). See [this article](https://lexi-lambda.github.io/blog/2019/11/05/parse-don-t-validate/) for day to day application. + +Parser Combinators are a way to build parsers by composition. + +Let's say we want to extract all the tuples from some text. We can start by defining our tuple parser: + +```haskell +import Text.ParserCombinators.ReadP + +tuple :: ReadP (String, String) +tuple = + between (char '(') (char ')') $ do + left <- munch1 (/= ',') + char ',' + skipSpaces + right <- munch1 (/= ')') + return (left, right) +``` + +As you can see, relying on `Monad` allows us to focus on the structure of the input, instead of the structure of the parser. + +we can then define our main parser through composition: + +```haskell +tuples :: ReadP [(String, String)] +tuples = many tuple +``` + +This is the power of parser combinators: since `ReadP` implements `Alternative`, any parser can be reused and extended (here via [`many`](https://hackage.haskell.org/package/base/docs/Control-Applicative.html#v:many)). + +We can have a toy example: + +```haskell +example :: [([(String, String)], String)] +example = readP_to_S tuples "(abc,def)(gh, ij)(k,l)" +``` + +which gives us: + +```haskell +[ + ([], "(abc,def)(gh, ij)(k,l)") +, ([("abc", "def") ], "(gh, ij)(k,l)") +, ([("abc", "def"), ("gh", "ij")], "(k,l)") +, ([( "abc", "def"), ("gh", "ij"), ("k", "l")] , "") +] +``` + +As you can see, running `ReadP` gives all possible parsed results, which can lead to performances issues. + +Going further: [Jake Wheat's Intro to Parsing with Parsec in Haskell](http://jakewheat.github.io/intro_to_parsing/) + +## Targeted audience + +The aim of this topic is to explore Haskell's idioms through the study of different designs. + +A minimal understanding of Haskell and it's main abstractions (eg. Functor, Applicative, Monad) is expected and can be gathered in the previous courses. diff --git a/en/lessons/parsers/megaparsec.md b/en/lessons/parsers/megaparsec.md new file mode 100644 index 00000000..62ac9f65 --- /dev/null +++ b/en/lessons/parsers/megaparsec.md @@ -0,0 +1,346 @@ +--- +version: 1.0.0 +title: Megaparsec +--- + +{% include toc.html %} + +[Megaparsec](https://hackage.haskell.org/package/megaparsec) is the direct fork of *Parsec*. + +We'll see what changed and what are the impacts. + +# Definition + +*Megaparsec* also has a type named `ParsecT`, but it's definition changed a bit from *Parsec*: + +```haskell +-- | @'ParsecT' e s m a@ is a parser with custom data component of error +-- @e@, stream type @s@, underlying monad @m@ and return type @a@. +newtype ParsecT e s m a = ParsecT + { unParser :: + forall b. + State s e -> + (a -> State s e -> Hints (Token s) -> m b) -> -- consumed-OK + (ParseError s e -> State s e -> m b) -> -- consumed-error + (a -> State s e -> Hints (Token s) -> m b) -> -- empty-OK + (ParseError s e -> State s e -> m b) -> -- empty-error + m b + } +``` + +We can see three big changes: + + * The user state has been dropped (we can guess that it should be part of `m` if we want to have one) + * `ParseError` has been parameterize (and the error type too) + * A `Hints` type was introduced over `Token`, which is an associated type over `Stream` + +We can start enquireing about `Hints`: + +```haskell +-- | 'Hints' represent a collection of 'ErrorItem's to be included into +-- 'ParseError' (when it's a 'TrivialError') as “expected” message items +-- when a parser fails without consuming input right after successful parser +-- that produced the hints. +-- +-- For example, without hints you could get: +-- +-- >>> parseTest (many (char 'r') <* eof) "ra" +-- 1:2: +-- unexpected 'a' +-- expecting end of input +-- +-- We're getting better error messages with the help of hints: +-- +-- >>> parseTest (many (char 'r') <* eof) "ra" +-- 1:2: +-- unexpected 'a' +-- expecting 'r' or end of input +newtype Hints t = Hints [Set (ErrorItem t)] +``` + +Very interesting, that's another reason to have a look at `ParseError`: + +```haskell +data ParseError s e + = -- | Trivial errors, generated by the Megaparsec's machinery. The data + -- constructor includes the offset of error, unexpected token (if any), + -- and expected tokens. + TrivialError Int (Maybe (ErrorItem (Token s))) (Set (ErrorItem (Token s))) + | -- | Fancy, custom errors. + FancyError Int (Set (ErrorFancy e)) +``` + +There's clearly an emphasis on error handling. + +Even `State` is parameterizing with the error type: + +```haskell +data State s e = State + { -- | The rest of input to process + stateInput :: s, + -- | Number of processed tokens so far + stateOffset :: !Int, + -- | State that is used for line\/column calculation + statePosState :: PosState s, + -- | Collection of “delayed” 'ParseError's in reverse order. This means + -- that the last registered error is the first element of the list. + stateParseErrors :: [ParseError s e] + } + deriving (Typeable, Generic) + +-- | A special kind of state that is used to calculate line\/column +-- positions on demand. +-- +data PosState s = PosState + { -- | The rest of input to process + pstateInput :: s, + -- | Offset corresponding to beginning of 'pstateInput' + pstateOffset :: !Int, + -- | Source position corresponding to beginning of 'pstateInput' + pstateSourcePos :: !SourcePos, + -- | Tab width to use for column calculation + pstateTabWidth :: Pos, + -- | Prefix to prepend to offending line + pstateLinePrefix :: String + } +``` + +Ok, I'm not sure why there's `stateInput` and `pstateInput`. + +let see how there're used is the code: + +```haskell +initialState :: String -> s -> State s e +initialState name s = + State + { stateInput = s, + stateOffset = 0, + statePosState = + PosState + { pstateInput = s, + pstateOffset = 0, + pstateSourcePos = initialPos name, + pstateTabWidth = defaultTabWidth, + pstateLinePrefix = "" + }, + stateParseErrors = [] + } +``` + +No surprise here, at initialization they point out to the value. + +Next one: + +```haskell +-- | Return the current input. +getInput :: MonadParsec e s m => m s +getInput = stateInput <$> getParserState + +-- | @'setInput' input@ continues parsing with @input@. +setInput :: MonadParsec e s m => s -> m () +setInput s = updateParserState (\(State _ o pst de) -> State s o pst de) +``` + +Like in `Parsec`, `stateInput` holds the current position. + +Then, the only other mention of `pstateInput` or `stateInput` is: + +```haskell +-- | A helper definition to facilitate defining 'reachOffset' for various +-- stream types. +reachOffset' :: + forall s. + Stream s => + -- | How to split input stream at given offset + (Int -> s -> (Tokens s, s)) -> + -- | How to fold over input stream + (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) -> + -- | How to convert chunk of input stream into a 'String' + (Tokens s -> String) -> + -- | How to convert a token into a 'Char' + (Token s -> Char) -> + -- | Newline token and tab token + (Token s, Token s) -> + -- | Offset to reach + Int -> + -- | Initial 'PosState' to use + PosState s -> + -- | Line at which 'SourcePos' is located, updated 'PosState' + (Maybe String, PosState s) +reachOffset' + splitAt' + foldl'' + fromToks + fromTok + (newlineTok, tabTok) + o + PosState {..} = + ( Just $ case expandTab pstateTabWidth + . addPrefix + . f + . fromToks + . fst + $ takeWhile_ (/= newlineTok) post of + "" -> "" + xs -> xs, + PosState + { pstateInput = post, + pstateOffset = max pstateOffset o, + pstateSourcePos = spos, + pstateTabWidth = pstateTabWidth, + pstateLinePrefix = + if sameLine + then -- NOTE We don't use difference lists here because it's + -- desirable for 'PosState' to be an instance of 'Eq' and + -- 'Show'. So we just do appending here. Fortunately several + -- parse errors on the same line should be relatively rare. + pstateLinePrefix ++ f "" + else f "" + } + ) + where + addPrefix xs = + if sameLine + then pstateLinePrefix ++ xs + else xs + sameLine = sourceLine spos == sourceLine pstateSourcePos + (pre, post) = splitAt' (o - pstateOffset) pstateInput + St spos f = foldl'' go (St pstateSourcePos id) pre + go (St apos g) ch = + let SourcePos n l c = apos + c' = unPos c + w = unPos pstateTabWidth + in if + | ch == newlineTok -> + St + (SourcePos n (l <> pos1) pos1) + id + | ch == tabTok -> + St + (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))) + (g . (fromTok ch :)) + | otherwise -> + St + (SourcePos n l (c <> pos1)) + (g . (fromTok ch :)) +``` + +Let's see what is reachOffset is used for: + +```haskell +-- | Type class for inputs that can also be used for error reporting. +-- +-- @since 9.0.0 +class Stream s => TraversableStream s where + {-# MINIMAL reachOffset | reachOffsetNoLine #-} + + -- | Given an offset @o@ and initial 'PosState', adjust the state in such + -- a way that it starts at the offset. + -- + -- Return two values (in order): + -- + -- * 'Maybe' 'String' representing the line on which the given offset + -- @o@ is located. It can be omitted (i.e. 'Nothing'); in that case + -- error reporting functions will not show offending lines. If + -- returned, the line should satisfy a number of conditions that are + -- described below. + -- * The updated 'PosState' which can be in turn used to locate + -- another offset @o'@ given that @o' >= o@. + -- + -- The 'String' representing the offending line in input stream should + -- satisfy the following: + -- + -- * It should adequately represent location of token at the offset of + -- interest, that is, character at 'sourceColumn' of the returned + -- 'SourcePos' should correspond to the token at the offset @o@. + -- * It should not include the newline at the end. + -- * It should not be empty, if the line happens to be empty, it + -- should be replaced with the string @\"\\"@. + -- * Tab characters should be replaced by appropriate number of + -- spaces, which is determined by the 'pstateTabWidth' field of + -- 'PosState'. + -- + -- __Note__: type signature of the function was changed in the version + -- /9.0.0/. + -- + -- @since 7.0.0 + reachOffset :: + -- | Offset to reach + Int -> + -- | Initial 'PosState' to use + PosState s -> + -- | See the description of the function + (Maybe String, PosState s) +``` + +Ok, `reachOffset` is a way to update `PosState`, let's how it's used: + +```haskell +-- | Return the current source position. This function /is not cheap/, do +-- not call it e.g. on matching of every token, that's a bad idea. Still you +-- can use it to get 'SourcePos' to attach to things that you parse. +-- +-- The function works under the assumption that we move in the input stream +-- only forwards and never backwards, which is always true unless the user +-- abuses the library. +-- +-- @since 7.0.0 +getSourcePos :: (TraversableStream s, MonadParsec e s m) => m SourcePos +getSourcePos = do + st <- getParserState + let pst = reachOffsetNoLine (stateOffset st) (statePosState st) + setParserState st {statePosState = pst} + return (pstateSourcePos pst) + +-- | Pretty-print a 'ParseErrorBundle'. All 'ParseError's in the bundle will +-- be pretty-printed in order together with the corresponding offending +-- lines by doing a single pass over the input stream. The rendered 'String' +-- always ends with a newline. +-- +-- @since 7.0.0 +errorBundlePretty :: + forall s e. + ( VisualStream s, + TraversableStream s, + ShowErrorComponent e + ) => + -- | Parse error bundle to display + ParseErrorBundle s e -> + -- | Textual rendition of the bundle + String +errorBundlePretty ParseErrorBundle {..} = ... +``` + +So it's only used for debugging purposes. + +While `stateInput` is used in base functions: + +```haskell +pToken :: + forall e s m a. + Stream s => + (Token s -> Maybe a) -> + Set (ErrorItem (Token s)) -> + ParsecT e s m a +pToken test ps = ParsecT $ \s@(State input o pst de) cok _ _ eerr -> + case take1_ input of + Nothing -> + let us = pure EndOfInput + in eerr (TrivialError o us ps) s + Just (c, cs) -> + case test c of + Nothing -> + let us = (Just . Tokens . nes) c + in eerr + (TrivialError o us ps) + (State input o pst de) + Just x -> + cok x (State cs (o + 1) pst de) mempty +``` + +# Conclusion + +While `Megaparsec` is a fork of `Parsec`, it diverged somehow: + + * It uses more structures underneath + * There's a more detailed structure for error handling diff --git a/en/lessons/parsers/parsec.md b/en/lessons/parsers/parsec.md new file mode 100644 index 00000000..e0bc84ec --- /dev/null +++ b/en/lessons/parsers/parsec.md @@ -0,0 +1,355 @@ +--- +version: 1.0.0 +title: Parsec +--- + +{% include toc.html %} + +[Parsec](https://hackage.haskell.org/package/parsec) is one of most famous haskell library, it embeds the concept of parser combinators. + +# Definition + +The main type if defined as follows: + +```haskell +newtype ParsecT s u m a + = ParsecT {unParser :: forall b . + State s u + -> (a -> State s u -> ParseError -> m b) -- consumed ok + -> (ParseError -> m b) -- consumed err + -> (a -> State s u -> ParseError -> m b) -- empty ok + -> (ParseError -> m b) -- empty err + -> m b + } +``` + +Whith the frontend type: + +```haskell +type Parsec s u = ParsecT s u Identity +``` + +And the following support types: + +```haskell +data Consumed a = Consumed a + | Empty !a + deriving ( Typeable ) + +data Reply s u a = Ok a !(State s u) ParseError + | Error ParseError + deriving ( Typeable ) + +data State s u = State { + stateInput :: s, + statePos :: !SourcePos, + stateUser :: !u + } + deriving ( Typeable ) + +data ParseError = ParseError !SourcePos [Message] + deriving ( Typeable ) +``` + +It also have a a bunch of `instance`s: + + * `MonadState` + * `MonadReader` + * `MonadError` + * `MonadTrans` + * `Monad` + * `Functor` + * `MonadFail` + * `Applicative` + * `MonadIO` + * `Alternative` + * `MonadPlus` + * `MonadCont` + * `Semigroup` + * `Monoid` + +# Construction + +That's a lot, let's take a combinator to figure out each types' usage: + +```haskell +between :: Stream s m t => ParsecT s u m open -> ParsecT s u m close -> ParsecT s u m a -> ParsecT s u m a +``` + +We start to see better: + + * `s` is the `Stream` type (eg. `Text`, `ByteString`), the input + * `u` is the user's state + * `m` is the underlying `Monad` (eg. `Identity`, `IO`, etc.) + * `a` is the produced value (`ParsecT` is a `Monad`) + +To go a bit deeper with the user's state, we can have a look at the related functions: + +```haskell +getState :: Monad m => ParsecT s u m u +putState :: Monad m => u -> ParsecT s u m () +modifyState :: Monad m => (u -> u) -> ParsecT s u m () +``` + +# First example + +If we want to play with it, we can have a parser working only on increasing numbers sequences: + +```haskell +import Control.Monad +import Data.Functor.Identity +import Text.Parsec +import Text.Parsec.Char +import Text.Parsec.Language +import Text.Parsec.Token + +tokenParser :: GenTokenParser String u Identity +tokenParser = makeTokenParser emptyDef + +numParser :: Parsec String Integer Integer +numParser = do + n <- integer tokenParser + previous <- getState + guard $ n > previous + putState n + return n + +increasingNumbersParser :: Parsec String Integer [Integer] +increasingNumbersParser = + numParser `sepBy` spaces + +increasingNumbers :: String -> Either ParseError [Integer] +increasingNumbers = runParser increasingNumbersParser (-1) "" +``` + +We have the following elements: + + * `tokenParser` which gives us a way to parse `Integer`s in `numParser` + * `numParser`, it parses an `Integer`, check the parsed number against the state and update the state + * `increasingNumberParser` which define the way numbers are chained + * `increasingNumbers` which hides `Parsec` details + +If we play a bit with it, here's what we get: + +``` +> increasingNumbers "2 9 12" +Right + [ 2 + , 9 + , 12 + ] +> increasingNumbers "2 1 12" +Left "" + ( line 1 + , column 5 + ) : + unexpected "1" +``` + +# Key operators +In order to have a better understanding of the whole function, we have to dig +into the basic functions, to see how it's build and used. + +We can start by `satisfy` which, given a predicate over a `Char`, accept it or not: + +```haskell +satisfy :: (Stream s m Char) => (Char -> Bool) -> ParsecT s u m Char +satisfy f = tokenPrim (\c -> show [c]) + (\pos c _cs -> updatePosChar pos c) + (\c -> if f c then Just c else Nothing) +``` + +Ok, it seems that we have to provide three functions to provide: + + * One for debugging + * One to update position + * One to actually do the parsing + +Since it's quite different of `ParsecT`, we should had a look at `tokenPrim`: + +```haskell +-- | The parser @tokenPrim showTok nextPos testTok@ accepts a token @t@ +-- with result @x@ when the function @testTok t@ returns @'Just' x@. The +-- token can be shown using @showTok t@. The position of the /next/ +-- token should be returned when @nextPos@ is called with the current +-- source position @pos@, the current token @t@ and the rest of the +-- tokens @toks@, @nextPos pos t toks@. +-- +-- This is the most primitive combinator for accepting tokens. For +-- example, the 'Text.Parsec.Char.char' parser could be implemented as: +-- +-- > char c +-- > = tokenPrim showChar nextPos testChar +-- > where +-- > showChar x = "'" ++ x ++ "'" +-- > testChar x = if x == c then Just x else Nothing +-- > nextPos pos x xs = updatePosChar pos x + +tokenPrim :: (Stream s m t) + => (t -> String) -- ^ Token pretty-printing function. + -> (SourcePos -> t -> s -> SourcePos) -- ^ Next position calculating function. + -> (t -> Maybe a) -- ^ Matching function for the token to parse. + -> ParsecT s u m a +tokenPrim showToken nextpos test + = ParsecT $ \(State input pos user) cok _cerr _eok eerr -> do + r <- uncons input + case r of + Nothing -> eerr $ unexpectError "" pos + Just (c,cs) + -> case test c of + Just x -> let newpos = nextpos pos c cs + newstate = State cs newpos user + in seq newpos $ seq newstate $ + cok x newstate (newErrorUnknown newpos) + Nothing -> eerr $ unexpectError (showToken c) pos +``` + +So far so good our deductions were correct, let see what's going on: + + * We consume the first element of `State`'s `input` + * We return an empty error if there's no element + * We test the predicate + * If it works: + * We compute the new position + * We create the new state (`input`'s remainings, new position, old user state) + * We strictly evaluate the new position and state, then we pass it all to the successful function + * If it fails, we use the empty error + +It's clearer, now, we can see which functions are given when the parser is run: + +```haskell + +runParserT :: (Stream s m t) + => ParsecT s u m a -> u -> SourceName -> s -> m (Either ParseError a) +runParserT p u name s + = do res <- runParsecT p (State s (initialPos name) u) + r <- parserReply res + case r of + Ok x _ _ -> return (Right x) + Error err -> return (Left err) + where + parserReply res + = case res of + Consumed r -> r + Empty r -> r +``` + +Nothing fancy here: + + * We run the parser with an initial state (`initialPos` starts at line 1 an column 1) + * Transforme the reply (`Consumed`) in `Either` + +We have to dig into `runParsecT` to see what's going on: + +```haskell +runParsecT :: Monad m => ParsecT s u m a -> State s u -> m (Consumed (m (Reply s u a))) +runParsecT p s = unParser p s cok cerr eok eerr + where cok a s' err = return . Consumed . return $ Ok a s' err + cerr err = return . Consumed . return $ Error err + eok a s' err = return . Empty . return $ Ok a s' err + eerr err = return . Empty . return $ Error err +``` + +No surprise neither, just some constructor calling. + +# Combinators + +Since parser combinators are made from composition, we can see the `Applicative`/`Alternative`/`Monad` instances: + +```haskell +instance Applicative.Applicative (ParsecT s u m) where + pure = parserReturn + (<*>) = ap + +instance Applicative.Alternative (ParsecT s u m) where + empty = mzero + (<|>) = mplus + +instance Monad (ParsecT s u m) where + return = Applicative.pure + p >>= f = parserBind p f +``` + +We'll have a look at `MonadPlus` later, here are `parserReturn` and `parserBind`: + +```haskell +parserReturn :: a -> ParsecT s u m a +parserReturn x + = ParsecT $ \s _ _ eok _ -> + eok x s (unknownError s) + +parserBind :: ParsecT s u m a -> (a -> ParsecT s u m b) -> ParsecT s u m b +{-# INLINE parserBind #-} +parserBind m k + = ParsecT $ \s cok cerr eok eerr -> + let + -- consumed-okay case for m + mcok x s err + | errorIsUnknown err = unParser (k x) s cok cerr cok cerr + | otherwise = + let + -- if (k x) consumes, those go straight up + pcok = cok + pcerr = cerr + + -- if (k x) doesn't consume input, but is okay, + -- we still return in the consumed continuation + peok x s err' = cok x s (mergeError err err') + + -- if (k x) doesn't consume input, but errors, + -- we return the error in the 'consumed-error' + -- continuation + peerr err' = cerr (mergeError err err') + in unParser (k x) s pcok pcerr peok peerr + + -- empty-ok case for m + meok x s err + | errorIsUnknown err = unParser (k x) s cok cerr eok eerr + | otherwise = + let + -- in these cases, (k x) can return as empty + pcok = cok + peok x s err' = eok x s (mergeError err err') + pcerr = cerr + peerr err' = eerr (mergeError err err') + in unParser (k x) s pcok pcerr peok peerr + -- consumed-error case for m + mcerr = cerr + + -- empty-error case for m + meerr = eerr + + in unParser m s mcok mcerr meok meerr +``` + +`parserReturn` is fairly straight to the point: pass the value to the successful function. + +While `parserBind` is way more "worked". + +We can focus on `MonadPlus`: + +```haskell + +instance MonadPlus (ParsecT s u m) where + -- | @mzero@ always fails without consuming any input. + mzero + = ParsecT $ \s _ _ _ eerr -> + eerr $ unknownError s + + mplus m n + = ParsecT $ \s cok cerr eok eerr -> + let + meerr err = + let + neok y s' err' = eok y s' (mergeError err err') + neerr err' = eerr $ mergeError err err' + in unParser n s cok cerr neok neerr + in unParser m s cok cerr eok meerr +``` + +Despite the appearant complexity of `plus`, the idea is dead simple: + + * Override first parser's empty error function, such as is calls the second one + * The second one see its errors merged with the first one + +Note that they both operate on the `State`. diff --git a/en/lessons/parsers/parsley.md b/en/lessons/parsers/parsley.md new file mode 100644 index 00000000..5a45f370 --- /dev/null +++ b/en/lessons/parsers/parsley.md @@ -0,0 +1,132 @@ +--- +version: 1.0.0 +title: Parsley +--- + +{% include toc.html %} + +From the documentation: + +> [parsley](https://hackage.haskell.org/package/parsley) is a staged selective +> parser combinator library, which means it does not support monadic operations, +> and relies on Typed Template Haskell to generate very fast code. + +# Definition + +Let's seen how it's defined: + +```haskell +newtype Parser a = Parser {unParser :: Fix (Combinator :+: ScopeRegister) a} + +-- Core datatype +data Combinator (k :: Type -> Type) (a :: Type) where + Pure :: Defunc a -> Combinator k a + Satisfy :: Defunc (Char -> Bool) -> Combinator k Char + (:<*>:) :: k (a -> b) -> k a -> Combinator k b + (:*>:) :: k a -> k b -> Combinator k b + (:<*:) :: k a -> k b -> Combinator k a + (:<|>:) :: k a -> k a -> Combinator k a + Empty :: Combinator k a + Try :: k a -> Combinator k a + LookAhead :: k a -> Combinator k a + Let :: Bool -> MVar a -> Combinator k a + NotFollowedBy :: k a -> Combinator k () + Branch :: k (Either a b) -> k (a -> c) -> k (b -> c) -> Combinator k c + Match :: k a -> [Defunc (a -> Bool)] -> [k b] -> k b -> Combinator k b + Loop :: k () -> k a -> Combinator k a + MakeRegister :: ΣVar a -> k a -> k b -> Combinator k b + GetRegister :: ΣVar a -> Combinator k a + PutRegister :: ΣVar a -> k a -> Combinator k () + Position :: PosSelector -> Combinator k Int + Debug :: String -> k a -> Combinator k a + MetaCombinator :: MetaCombinator -> k a -> Combinator k a +``` + +Clearly, that's the complete opposite approach of everything we have seen so +far, each operation has a constructor. + +We can also have a look at their associated types: + +```haskell + +data ScopeRegister (k :: Type -> Type) (a :: Type) where + ScopeRegister :: k a -> (forall r. Reg r a -> k b) -> ScopeRegister k b + +data PosSelector where + Line :: PosSelector + Col :: PosSelector + +{-| +This is an opaque representation of a parsing register. +It is the abstracted representation of a runtime storage location. +-} +newtype Reg (r :: Type) a = Reg (ΣVar a) + +data MetaCombinator where + -- | After this combinator exits, a cut has happened + Cut :: MetaCombinator + -- | This combinator requires a cut from below to respect parsec semantics + RequiresCut :: MetaCombinator + -- | This combinator denotes that within its scope, cut semantics are not enforced + CutImmune :: MetaCombinator + +{-| +An identifier representing concrete registers and mutable state. +-} +newtype ΣVar (a :: Type) = ΣVar IΣVar + +{-| +Underlying untyped identifier, which is numeric but otherwise opaque. +-} +newtype IΣVar = IΣVar Word64 deriving newtype (Ord, Eq, Num, Enum, Show, Ix) + +{-| +This datatype is useful for providing an /inspectable/ representation of common Haskell functions. +-} +data Defunc a -- complex implementation + +``` + +Not particularly interesting, but it is very detailed. + +# Running the parser + +We can have a look at how everything is used, from the example: + +```haskell +parseOut :: ByteString -> Maybe [Out] +parseOut = $$(Parsley.parse myParser) +``` + +Then we expect the `Template` work to be here: + +```haskell +parse :: (Trace, Input input) => Parser a -> Code (input -> Maybe a) +parse p = [||\input -> $$(eval [||input||] (compile (try p) codeGen))||] +``` + +The implementation is quite complex, but it'll hopefully help to get a better picture: + +```haskell +{-| +Translates a parser represented with combinators into its machine representation. +-} +{-# INLINEABLE codeGen #-} +codeGen :: Trace + => Maybe (MVar x) -- ^ The name of the parser, if it exists. + -> Fix Combinator x -- ^ The definition of the parser. + -> Set SomeΣVar -- ^ The free registers it requires to run. + -> IMVar -- ^ The binding identifier to start name generation from. + -> LetBinding o a x + +eval :: forall o a. (Trace, Ops o) => Code (InputDependant o) -> LetBinding o a a -> DMap MVar (LetBinding o a) -> Code (Maybe a) + +compile :: forall compiled a. Trace => Parser a -> (forall x. Maybe (MVar x) -> Fix Combinator x -> Set IΣVar -> IMVar -> IΣVar -> compiled x) -> (compiled a, DMap MVar compiled) +``` + +The idea is to compile `Combinator` to `Template`'s `Code` through a complete machinery. + +# Conclusion + +The special feature of `parsley` is to be mostly working at compile-time, +making the implementation far more complex. diff --git a/en/lessons/parsers/readp.md b/en/lessons/parsers/readp.md new file mode 100644 index 00000000..c7a7800f --- /dev/null +++ b/en/lessons/parsers/readp.md @@ -0,0 +1,127 @@ +--- +version: 1.0.0 +title: ReadP +--- + +{% include toc.html %} + +`base` embeds a parser combinators library ([`ReadP`](https://hackage.haskell.org/package/base-4.16.0.0/docs/Text-ParserCombinators-ReadP.html)) +as seen in the introduction. + +# Definition + +Right from the introduction: + +> It parses all alternatives in parallel, so it never keeps hold of the beginning of the input string, a common source of space leaks with other parsers. + +Let's proceed with `ReadP` definition: + +```haskell +newtype ReadP a = R (forall b . (a -> P b) -> P b) + +instance Functor ReadP where + fmap h (R f) = R (\k -> f (k . h)) + +instance Applicative ReadP where + pure x = R (\k -> k x) + (<*>) = ap + +instance Monad ReadP where + R m >>= f = R (\k -> m (\a -> let R m' = f a in m' k)) + +instance Alternative ReadP where + empty = R (\_ -> Fail) + R f1 <|> R f2 = R (\k -> f1 k <|> f2 k) +``` + +That's minimalist, let see how `P` is defined: + +```haskell +data P a + = Get (Char -> P a) + | Look (String -> P a) + | Fail + | Result a (P a) + | Final (NonEmpty (a,String)) +``` + +It looks like `P` is the current parser's instruction. + +Now, we can have a look at the associated instances: + +```haskell +instance Applicative P where + pure x = Result x Fail + (<*>) = ap + +instance Monad P where + (Get f) >>= k = Get (\c -> f c >>= k) + (Look f) >>= k = Look (\s -> f s >>= k) + Fail >>= _ = Fail + (Result x p) >>= k = k x <|> (p >>= k) + (Final (r:|rs)) >>= k = final [ys' | (x,s) <- (r:rs), ys' <- run (k x) s] + +instance Alternative P where + empty = Fail + + -- most common case: two gets are combined + Get f1 <|> Get f2 = Get (\c -> f1 c <|> f2 c) + + -- results are delivered as soon as possible + Result x p <|> q = Result x (p <|> q) + p <|> Result x q = Result x (p <|> q) + + -- fail disappears + Fail <|> p = p + p <|> Fail = p + + -- two finals are combined + -- final + look becomes one look and one final (=optimization) + -- final + sthg else becomes one look and one final + Final r <|> Final t = Final (r <> t) + Final (r:|rs) <|> Look f = Look (\s -> Final (r:|(rs ++ run (f s) s))) + Final (r:|rs) <|> p = Look (\s -> Final (r:|(rs ++ run p s))) + Look f <|> Final r = Look (\s -> Final (case run (f s) s of + [] -> r + (x:xs) -> (x:|xs) <> r)) + p <|> Final r = Look (\s -> Final (case run p s of + [] -> r + (x:xs) -> (x:|xs) <> r)) + + -- two looks are combined (=optimization) + -- look + sthg else floats upwards + Look f <|> Look g = Look (\s -> f s <|> g s) + Look f <|> p = Look (\s -> f s <|> p) + p <|> Look f = Look (\s -> p <|> f s) +``` + +All the logic is here, it makes more sense. + +# Running the parser + +There's no direct to get a result from a `ReadP`, instead we have: + +```haskell +type ReadS a = String -> [(a, String)] + +readP_to_S :: ReadP a -> ReadS a +readP_to_S (R f) = run (f return) + +run :: P a -> ReadS a +run (Get f) (c:s) = run (f c) s +run (Look f) s = run (f s) s +run (Result x p) s = (x,s) : run p s +run (Final (r:|rs)) _ = (r:rs) +run _ _ = [] +``` + +Simple, looks like a basic expression interpreter, and it explains why we +might get multiple results. + +We also see that error handling is implicit, you only know that parsing failed, +because there's still characters to consume. + +# Conclusion + +`ReadP` is a really simple parser combinators library, it's a trade off which +costs error handling and performance.