From 2f25c08651beee10b80242fd9c6a1c66328148d8 Mon Sep 17 00:00:00 2001 From: Jonathan Coates Date: Mon, 5 Jun 2023 00:22:06 +0100 Subject: [PATCH] Store compiled regexes in RE (#166) This changes the RE type to (lazily) compile the regex when constructed, rather than in the tokenizer. This allows us to avoid re-compiling regexes for each separate tokenize call, instead sharing them globally. We try to hide the internals of this, exposing the previous interfae (RE { reString, reCaseSensitive }) with pattern synonyms. --- skylighting-core/src/Skylighting/Regex.hs | 78 +++++++++++++++++-- skylighting-core/src/Skylighting/Tokenizer.hs | 19 ++--- 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/skylighting-core/src/Skylighting/Regex.hs b/skylighting-core/src/Skylighting/Regex.hs index a6f62075d..1d1357b9b 100644 --- a/skylighting-core/src/Skylighting/Regex.hs +++ b/skylighting-core/src/Skylighting/Regex.hs @@ -2,11 +2,14 @@ {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternSynonyms #-} {-# LANGUAGE ScopedTypeVariables #-} module Skylighting.Regex ( Regex(..) - , RE(..) + , RE + , pattern RE, reCaseSensitive, reString + , compileRE , compileRegex , matchRegex , testRegex @@ -14,25 +17,81 @@ module Skylighting.Regex ( ) where import Data.Aeson -import Data.Binary (Binary) +import Data.Binary (Binary(..)) import qualified Data.ByteString.Base64 as Base64 import qualified Data.ByteString.Char8 as BS import Data.Data import qualified Data.Text as Text import qualified Data.Text.Encoding as TE -import GHC.Generics (Generic) #if !MIN_VERSION_base(4,13,0) import Control.Monad.Fail (MonadFail) #endif import Regex.KDE +import Text.Read hiding (get) + -- | A representation of a regular expression. -data RE = RE{ - reString :: BS.ByteString - , reCaseSensitive :: Bool -} deriving (Show, Read, Ord, Eq, Data, Typeable, Generic) +data RE = RE'{ + _reString :: BS.ByteString + , _reCaseSensitive :: Bool + , _reCompiled :: Either String Regex +} deriving Typeable + +-- We define a smart constructor which also holds the compiled regex, to avoid +-- recompiling each time we tokenize. + +{-# COMPLETE RE #-} +pattern RE :: BS.ByteString -> Bool -> RE +pattern RE {reString, reCaseSensitive} <- RE' reString reCaseSensitive _ where + RE str caseSensitive = RE' str caseSensitive (compileRegex caseSensitive str) + +-- Unfortunately this means we need to derive all the instances ourselves. + +instance Show RE where + showsPrec d (RE str caseSensitive) = showParen (d > 10) + $ showString "RE {reString = " + . showsPrec 11 str + . showString ", reCaseSensitive = " + . showsPrec 11 caseSensitive + . showString "}" + +instance Read RE where + readPrec = parens . prec 10 $ do + Ident "RE" <- lexP + Punc "{" <- lexP + Ident "reString" <- lexP + Punc "=" <- lexP + str <- readPrec + Punc "," <- lexP + Ident "reCaseSensitive" <- lexP + Punc "=" <- lexP + caseSensitive <- readPrec + Punc "}" <- lexP + pure (RE str caseSensitive) -instance Binary RE +toComparisonKey :: RE -> (BS.ByteString, Bool) +toComparisonKey (RE x y) = (x, y) + +instance Eq RE where + x == y = toComparisonKey x == toComparisonKey y + +instance Ord RE where + x `compare` y = toComparisonKey x `compare` toComparisonKey y + +conRE :: Constr +conRE = mkConstr tyRE "RE" [] Prefix +tyRE :: DataType +tyRE = mkDataType "Skylighting.Regex.RE" [conRE] + +instance Data RE where + gfoldl k z (RE s c) = z RE `k` s `k` c + gunfold k z _ = k (k (z RE)) + toConstr _ = conRE + dataTypeOf _ = tyRE + +instance Binary RE where + put (RE x y) = put x >> put y + get = RE <$> get <*> get instance ToJSON RE where toJSON re = object [ "reString" .= encodeToText (reString re) @@ -49,3 +108,6 @@ encodeToText = TE.decodeUtf8 . Base64.encode decodeFromText :: (Monad m, MonadFail m) => Text.Text -> m BS.ByteString decodeFromText = either fail return . Base64.decode . TE.encodeUtf8 + +compileRE :: RE -> Either String Regex +compileRE = _reCompiled diff --git a/skylighting-core/src/Skylighting/Tokenizer.hs b/skylighting-core/src/Skylighting/Tokenizer.hs index 42c9b8376..27119a4c8 100644 --- a/skylighting-core/src/Skylighting/Tokenizer.hs +++ b/skylighting-core/src/Skylighting/Tokenizer.hs @@ -56,7 +56,6 @@ data TokenizerState = TokenizerState{ , column :: Int , lineContinuation :: Bool , firstNonspaceColumn :: Maybe Int - , compiledRegexes :: Map.Map RE Regex } -- | Configuration options for 'tokenize'. @@ -163,7 +162,6 @@ tokenize config syntax inp = , column = 0 , lineContinuation = False , firstNonspaceColumn = Nothing - , compiledRegexes = Map.empty } info :: String -> TokenizerM () @@ -552,18 +550,11 @@ regExpr dynamic re inp = do -- return $! traceShowId $! (reStr, inp) let reStr = reString re when (BS.take 2 reStr == "\\b") $ wordBoundary inp - compiledREs <- gets compiledRegexes - regex <- case Map.lookup re compiledREs of - Nothing -> do - cre <- case compileRegex (reCaseSensitive re) reStr of - Right r -> return r - Left e -> throwError $ - "Error compiling regex " ++ - UTF8.toString reStr ++ ": " ++ e - modify $ \st -> st{ compiledRegexes = - Map.insert re cre (compiledRegexes st) } - return cre - Just cre -> return cre + regex <- case compileRE re of + Right r -> return r + Left e -> throwError $ + "Error compiling regex " ++ + UTF8.toString reStr ++ ": " ++ e regex' <- if dynamic then subDynamic regex else return regex