Skip to content

Commit

Permalink
Store compiled regexes in RE (#166)
Browse files Browse the repository at this point in the history
This changes the RE type to (lazily) compile the regex when constructed,
rather than in the tokenizer. This allows us to avoid re-compiling
regexes for each separate tokenize call, instead sharing them globally.

We try to hide the internals of this, exposing the previous interfae
(RE { reString, reCaseSensitive }) with pattern synonyms.
  • Loading branch information
SquidDev authored Jun 4, 2023
1 parent da282a2 commit 2f25c08
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 22 deletions.
78 changes: 70 additions & 8 deletions skylighting-core/src/Skylighting/Regex.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,96 @@
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Skylighting.Regex (
Regex(..)
, RE(..)
, RE
, pattern RE, reCaseSensitive, reString
, compileRE
, compileRegex
, matchRegex
, testRegex
, isWordChar
) where

import Data.Aeson
import Data.Binary (Binary)
import Data.Binary (Binary(..))
import qualified Data.ByteString.Base64 as Base64
import qualified Data.ByteString.Char8 as BS
import Data.Data
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import GHC.Generics (Generic)
#if !MIN_VERSION_base(4,13,0)
import Control.Monad.Fail (MonadFail)
#endif
import Regex.KDE

import Text.Read hiding (get)

-- | A representation of a regular expression.
data RE = RE{
reString :: BS.ByteString
, reCaseSensitive :: Bool
} deriving (Show, Read, Ord, Eq, Data, Typeable, Generic)
data RE = RE'{
_reString :: BS.ByteString
, _reCaseSensitive :: Bool
, _reCompiled :: Either String Regex
} deriving Typeable

-- We define a smart constructor which also holds the compiled regex, to avoid
-- recompiling each time we tokenize.

{-# COMPLETE RE #-}
pattern RE :: BS.ByteString -> Bool -> RE
pattern RE {reString, reCaseSensitive} <- RE' reString reCaseSensitive _ where
RE str caseSensitive = RE' str caseSensitive (compileRegex caseSensitive str)

-- Unfortunately this means we need to derive all the instances ourselves.

instance Show RE where
showsPrec d (RE str caseSensitive) = showParen (d > 10)
$ showString "RE {reString = "
. showsPrec 11 str
. showString ", reCaseSensitive = "
. showsPrec 11 caseSensitive
. showString "}"

instance Read RE where
readPrec = parens . prec 10 $ do
Ident "RE" <- lexP
Punc "{" <- lexP
Ident "reString" <- lexP
Punc "=" <- lexP
str <- readPrec
Punc "," <- lexP
Ident "reCaseSensitive" <- lexP
Punc "=" <- lexP
caseSensitive <- readPrec
Punc "}" <- lexP
pure (RE str caseSensitive)

instance Binary RE
toComparisonKey :: RE -> (BS.ByteString, Bool)
toComparisonKey (RE x y) = (x, y)

instance Eq RE where
x == y = toComparisonKey x == toComparisonKey y

instance Ord RE where
x `compare` y = toComparisonKey x `compare` toComparisonKey y

conRE :: Constr
conRE = mkConstr tyRE "RE" [] Prefix
tyRE :: DataType
tyRE = mkDataType "Skylighting.Regex.RE" [conRE]

instance Data RE where
gfoldl k z (RE s c) = z RE `k` s `k` c
gunfold k z _ = k (k (z RE))
toConstr _ = conRE
dataTypeOf _ = tyRE

instance Binary RE where
put (RE x y) = put x >> put y
get = RE <$> get <*> get

instance ToJSON RE where
toJSON re = object [ "reString" .= encodeToText (reString re)
Expand All @@ -49,3 +108,6 @@ encodeToText = TE.decodeUtf8 . Base64.encode

decodeFromText :: (Monad m, MonadFail m) => Text.Text -> m BS.ByteString
decodeFromText = either fail return . Base64.decode . TE.encodeUtf8

compileRE :: RE -> Either String Regex
compileRE = _reCompiled
19 changes: 5 additions & 14 deletions skylighting-core/src/Skylighting/Tokenizer.hs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ data TokenizerState = TokenizerState{
, column :: Int
, lineContinuation :: Bool
, firstNonspaceColumn :: Maybe Int
, compiledRegexes :: Map.Map RE Regex
}

-- | Configuration options for 'tokenize'.
Expand Down Expand Up @@ -163,7 +162,6 @@ tokenize config syntax inp =
, column = 0
, lineContinuation = False
, firstNonspaceColumn = Nothing
, compiledRegexes = Map.empty
}

info :: String -> TokenizerM ()
Expand Down Expand Up @@ -552,18 +550,11 @@ regExpr dynamic re inp = do
-- return $! traceShowId $! (reStr, inp)
let reStr = reString re
when (BS.take 2 reStr == "\\b") $ wordBoundary inp
compiledREs <- gets compiledRegexes
regex <- case Map.lookup re compiledREs of
Nothing -> do
cre <- case compileRegex (reCaseSensitive re) reStr of
Right r -> return r
Left e -> throwError $
"Error compiling regex " ++
UTF8.toString reStr ++ ": " ++ e
modify $ \st -> st{ compiledRegexes =
Map.insert re cre (compiledRegexes st) }
return cre
Just cre -> return cre
regex <- case compileRE re of
Right r -> return r
Left e -> throwError $
"Error compiling regex " ++
UTF8.toString reStr ++ ": " ++ e
regex' <- if dynamic
then subDynamic regex
else return regex
Expand Down

0 comments on commit 2f25c08

Please sign in to comment.