{-# LANGUAGE OverloadedStrings #-} module Lexer ( Token (..), lexer, emphasis, ) where import Control.Monad (mfilter) import Data.Char (isAlphaNum, isPrint) import Data.Functor (($>)) import Data.Text (Text) import Data.Text qualified as Text import GHC.Stack (HasCallStack) import ParserMonad (Parser, initialParserState) import Text.Parsec import Text.Parsec qualified as Parsec import Text.Parsec.Pos (updatePosChar) type LocatedToken = (SourcePos, Token) type Lexer = Parser [LocatedToken] data Token = Token Text | Anchor | AngleOpen | AngleClose | BoldOpen | BoldClose | BracketOpen | BracketClose | EmphasisOpen | EmphasisClose | MonospaceOpen | MonospaceClose | Newline | ParenOpen | ParenClose | QuoteOpen | QuoteClose | Space | EOF deriving (Eq, Show) lexer :: String -> Either ParseError [LocatedToken] lexer = Parsec.runParser lexText initialParserState "input" . Text.pack lexText :: (HasCallStack) => Parser [LocatedToken] lexText = go where go = do Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do toks <- choice [ newlineToken , spaceToken , textElement , identifier , other ] rest <- go pure (toks <> rest) match :: Parser a -> Parser (Text, a) match p = do input <- getInput result <- p input' <- getInput let !consumed = Text.take (Text.length input - Text.length input') input pure (consumed, result) -- Tokens textElement :: Parser [LocatedToken] textElement = choice $ Parsec.try <$> [ emphasis , bold , monospace , parens , brackets , angles ] delimited :: String -> String -> Token -> Token -> Parser [LocatedToken] delimited c1 c2 ot ct = do (_, content) <- match $ between op cl any' pos <- getPosition let openTok :: LocatedToken = (pos, ot) closeTok :: LocatedToken = (pos, ct) res :: LocatedToken = (pos, Token content) pure [openTok, res, closeTok] where op = string c1 cl = string c2 any' = Text.pack <$> manyTill anyChar (lookAhead cl) delimited' :: String -> Token -> Token -> Parser [LocatedToken] delimited' s t1 t2 = delimited s s t1 t2 emphasis :: Lexer emphasis = delimited' "/" EmphasisOpen EmphasisClose bold :: Lexer bold = delimited' "__" BoldOpen BoldClose monospace :: Lexer monospace = delimited' "@" MonospaceOpen MonospaceClose parens :: Parser [LocatedToken] parens = delimited "(" ")" ParenOpen ParenClose brackets :: Lexer brackets = delimited "[" "]" ParenOpen ParenClose angles :: Parser [LocatedToken] angles = delimited "<" ">" AngleOpen AngleClose other :: Lexer other = do pos <- getPosition c <- takeWhile1_ isPrint pure . pure $ (pos, Token c) spaceToken :: Lexer spaceToken = do pos <- getPosition _ <- many1 (char ' ') pure . pure $ (pos, Space) newlineToken :: Lexer newlineToken = do pos <- getPosition _ <- newline pure . pure $ (pos, Newline) identifier :: Lexer identifier = do pos <- getPosition txt <- takeWhile1_ isAlphaNum pure . pure $ (pos, Token txt) ------- -- Helpers ------- -- | Like `takeWhile`, but unconditionally take escaped characters. takeWhile_ :: (Char -> Bool) -> Parser Text takeWhile_ p = scan p_ False where p_ escaped c | escaped = Just False | not $ p c = Nothing | otherwise = Just (c == '\\') -- | Like 'takeWhile1', but unconditionally take escaped characters. takeWhile1_ :: (Char -> Bool) -> Parser Text takeWhile1_ = mfilter (not . Text.null) . takeWhile_ {- | Scan the input text, accumulating characters as long as the scanning function returns true. -} scan :: -- | scan function (state -> Char -> Maybe state) -> -- | initial state state -> Parser Text scan f initState = do parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState (remaining, finalPos, ct) <- go input initState pos 0 let newState = parserState{stateInput = remaining, statePos = finalPos} Parsec.setParserState newState $> Text.take ct input where go !input' !st !posAccum !count' = case Text.uncons input' of Nothing -> pure (input', posAccum, count') Just (char', input'') -> case f st char' of Nothing -> pure (input', posAccum, count') Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)