{-# LANGUAGE OverloadedStrings #-} module Lexer ( Token (..), lexer, emphasis, ) where import Control.Monad (mfilter, void) import Data.Functor (($>)) import Data.Text (Text, intercalate) import Data.Text qualified as Text import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper) import ParserMonad (Parser, initialParserState) import Text.Parsec import Text.Parsec qualified as Parsec import Text.Parsec.Pos (updatePosChar) type LocatedToken = (SourcePos, Token) type Lexer = Parser [LocatedToken] data Level = One | Two | Three | Four | Five | Six deriving (Eq, Show) data Token = Token Text | Anchor | BirdTrack | BoldOpen | BoldClose | Escape | EmphasisOpen | EmphasisClose | Header Level | MonospaceOpen | MonospaceClose | Newline | LinkOpen | LinkClose | LabeledLinkOpen | LabeledLinkClose | ParenOpen | ParenClose | BracketOpen | BracketClose | MathsParenOpen | MathsParenClose | MathsBracketOpen | MathsBracketClose | Module | QuoteOpen | QuoteClose | Space | EOF deriving (Eq, Show) lexer :: String -> Either ParseError [LocatedToken] lexer = Parsec.runParser lexText initialParserState "input" . Text.pack lexText :: Parser [LocatedToken] lexText = go where go = do Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do toks <- choice $ Parsec.try <$> [ mathsBracket , mathsParens , escape -- maths go before escape to avoid mismatch , headers , newlineToken , spaceToken , link , labeledLink , modules , anchors , textElement , quotes , birdTrack , other ] rest <- go pure (toks <> rest) -- Tokens textElement :: Parser [LocatedToken] textElement = choice $ Parsec.try <$> [ emphasis , bold , monospace ] headers :: Parser [LocatedToken] headers = choice $ Parsec.try <$> [ header1 , header2 , header3 , header4 , header5 , header6 ] delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken] delimitedMaybe openMark closeMark openToken closeToken = do openPos <- getPosition void openMark tokenPos <- getPosition content <- anyUntil closeMark closePos <- getPosition void closeMark let openTok :: LocatedToken = (openPos, openToken) res :: LocatedToken = (tokenPos, Token content) closeToks :: [LocatedToken] = case closeToken of Just close -> [(closePos, close)] Nothing -> [] pure $ [openTok, res] <> closeToks anyUntil :: Parser a -> Parser Text anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken] delimited a b c d = delimitedMaybe a b c (Just d) delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric s t1 t2 = delimited s s t1 t2 eol :: Parser () eol = void "\n" <|> Parsec.eof header1 :: Lexer header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing header2 :: Lexer header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing header3 :: Lexer header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing header4 :: Lexer header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing header5 :: Lexer header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing header6 :: Lexer header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing -- #anchors# anchors :: Lexer anchors = do pos <- getPosition void $ try anchor' pos' <- getPosition txt <- anyUntil anchor' pos'' <- getPosition void $ try anchor' pure [ (pos, Anchor) , (pos', Token txt) , (pos'', Anchor) ] where anchor' = (string "#" <|> string "\\#") -- "Module.Name" -- "Module.Name#anchor" -- "Module.Name#anchor" modules :: Lexer modules = do pos <- getPosition void $ char '"' pos' <- getPosition modName <- modId anch <- option [] do pos'' <- getPosition void $ try (string "#" <|> string "\\#") pos''' <- getPosition a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) pure [(pos'', Anchor), (pos''', Token a)] void $ char '"' pure $ [(pos, Module), (pos', Token modName)] <> anch where modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) conId :: Parser String conId = (:) <$> satisfy (\c -> isAlpha c && isUpper c) <*> many1 conChar conChar :: Parser Char conChar = satisfy (\c -> isAlphaNum c || c == '_') linkRaw :: Lexer linkRaw = do pos1 <- getPosition void $ string "[" pos2 <- getPosition text <- anyUntil $ Text.pack <$> string "]" pos3 <- getPosition void $ "]" pos4 <- getPosition void $ "(" pos5 <- getPosition link' <- anyUntil $ Text.pack <$> string ")" pos6 <- getPosition void $ ")" pure $ [ (pos1, BracketOpen) , (pos2, Token text) , (pos3, BracketClose) , (pos4, ParenOpen) , (pos5, Token link') , (pos6, ParenClose) ] link :: Lexer link = do pos <- getPosition l <- linkRaw -- "unconsume" the last token pos' <- flip incSourceColumn (-1) <$> getPosition pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] labeledLink :: Lexer labeledLink = do pos <- getPosition void $ string "<" link' <- linkRaw pos7 <- getPosition label' <- anyUntil $ string ">" pos8 <- getPosition void $ ">" pure $ (pos, LabeledLinkOpen) : link' <> [ (pos7, Token label') , (pos8, LabeledLinkClose) ] mathsBracket :: Lexer mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose mathsParens :: Lexer mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose birdTrack :: Lexer birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing escape :: Lexer escape = delimitedMaybe (void "\\") eol Escape Nothing quotes :: Lexer quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose emphasis :: Lexer emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose bold :: Lexer bold = delimitedSymmetric "__" BoldOpen BoldClose monospace :: Lexer monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose other :: Lexer other = do pos <- getPosition c <- takeWhile1_ isUnicodeAlphaNum pure . pure $ (pos, Token c) where isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c) spaceToken :: Lexer spaceToken = do pos <- getPosition _ <- many1 (char ' ') pure . pure $ (pos, Space) newlineToken :: Lexer newlineToken = do pos <- getPosition _ <- newline pure . pure $ (pos, Newline) ------- -- Helpers ------- -- | Like `takeWhile`, but unconditionally take escaped characters. takeWhile_ :: (Char -> Bool) -> Parser Text takeWhile_ p = scan p_ False where p_ escaped c | escaped = Just False | not $ p c = Nothing | otherwise = Just (c == '\\') -- | Like 'takeWhile1', but unconditionally take escaped characters. takeWhile1_ :: (Char -> Bool) -> Parser Text takeWhile1_ = mfilter (not . Text.null) . takeWhile_ {- | Scan the input text, accumulating characters as long as the scanning function returns true. -} scan :: -- | scan function (state -> Char -> Maybe state) -> -- | initial state state -> Parser Text scan f initState = do parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState (remaining, finalPos, ct) <- go input initState pos 0 let newState = parserState{stateInput = remaining, statePos = finalPos} Parsec.setParserState newState $> Text.take ct input where go !input' !st !posAccum !count' = case Text.uncons input' of Nothing -> pure (input', posAccum, count') Just (char', input'') -> case f st char' of Nothing -> pure (input', posAccum, count') Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)