{-# LANGUAGE OverloadedStrings #-} module Lexer ( Token (..), lexer, emphasis, ) where import Control.Monad (mfilter, void) import Data.Functor (($>)) import Data.Text (Text, intercalate) import Data.Text qualified as Text import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper) import ParserMonad (Parser, initialParserState) import Text.Parsec import Text.Parsec qualified as Parsec import Text.Parsec.Pos (updatePosChar) type Located a = (SourcePos, a) type LocatedToken = (SourcePos, Token) type Lexer = Parser [LocatedToken] data Level = One | Two | Three | Four | Five | Six deriving (Eq, Show) data Token = Token Text | Anchor Text | BirdTrack | BoldOpen | BoldClose | Escape | EmphasisOpen | EmphasisClose | Header Level | MonospaceOpen | MonospaceClose | Newline | LinkOpen | LinkClose | LabeledLinkOpen | LabeledLinkClose | ParenOpen | ParenClose | BracketOpen | BracketClose | MathsParenOpen | MathsParenClose | MathsBracketOpen | MathsBracketClose | NumericEntity Int | Module Text | QuoteOpen | QuoteClose | Space | EOF deriving (Eq, Show) located :: Parser a -> Parser (SourcePos, a) located p = (,) <$> getPosition <*> p startPosition :: Parser a -> Parser SourcePos startPosition = fmap fst . located tokenise :: [Parser a] -> Parser [(SourcePos, a)] tokenise = sequence . map located lexer :: String -> Either ParseError [LocatedToken] lexer = Parsec.runParser lexText initialParserState "input" . Text.pack lexText :: Parser [LocatedToken] lexText = go where go = do Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do toks <- choice $ Parsec.try <$> [ mathsBracket , mathsParens , escape -- maths go before escape to avoid mismatch , headers , newlineToken , spaceToken , link , labeledLink , modules , anchors , textElement , quotes , birdTrack , other ] rest <- go pure (toks <> rest) -- Tokens textElement :: Parser [LocatedToken] textElement = choice $ Parsec.try <$> [ emphasis , bold , monospace ] headers :: Parser [LocatedToken] headers = choice $ Parsec.try <$> [ header1 , header2 , header3 , header4 , header5 , header6 ] anyUntil :: Parser a -> Parser Text anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close) delimitedAsTuple openP closeP = (,,) <$> located openP <*> located (Token <$> anyUntil closeP) <*> located closeP delimited :: Parser a -> Parser b -> Token -> Token -> Parser [LocatedToken] delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP) where asList (a, tok, b) = [a, tok, b] delimitedNoTrailing :: Parser a -> Parser b -> Token -> Parser [LocatedToken] delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP) where asList (a, tok, _) = [a, tok] delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric s t1 t2 = delimited s s t1 t2 eol :: Parser () eol = void "\n" <|> void "\r\n" <|> Parsec.eof anchorHash :: Parser Text anchorHash = "#" <|> try "\\#" header1 :: Lexer header1 = delimitedNoTrailing "= " eol (Header One) header2 :: Lexer header2 = delimitedNoTrailing "== " eol (Header Two) header3 :: Lexer header3 = delimitedNoTrailing "=== " eol (Header Three) header4 :: Lexer header4 = delimitedNoTrailing "==== " eol (Header Four) header5 :: Lexer header5 = delimitedNoTrailing "===== " eol (Header Five) header6 :: Lexer header6 = delimitedNoTrailing "====== " eol (Header Six) -- #anchors# anchors :: Lexer anchors = tokenise [ between anchorHash anchorHash (Anchor <$> anyUntil anchorHash) ] moduleNames :: Parser Text moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.' upperId :: Parser String upperId = (:) <$> satisfy isUpper <*> many1 identifierChar identifierChar :: Parser Char identifierChar = satisfy (\c -> isAlphaNum c || c == '_') -- "Module.Name" -- "Module.Name#anchor" -- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben modules :: Lexer modules = between (char '"') (char '"') inner where inner = do module_ <- located $ Module <$> moduleNames mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText)) pure $ case mAnchor of Just anchor -> [module_, anchor] Nothing -> [module_] anchorText :: Parser Text anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) linkRaw :: Lexer linkRaw = tokenise [ BracketOpen <$ "[" , Token <$> anyUntil "]" , BracketClose <$ "]" , ParenOpen <$ "(" , Token <$> anyUntil ")" , ParenClose <$ ")" ] link :: Lexer link = do pos <- getPosition l <- linkRaw -- "unconsume" the last token pos' <- flip incSourceColumn (-1) <$> getPosition pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] labeledLink :: Lexer labeledLink = do pos <- getPosition void $ string "<" link' <- linkRaw pos7 <- getPosition label' <- anyUntil $ string ">" pos8 <- getPosition void $ ">" pure $ (pos, LabeledLinkOpen) : link' <> [ (pos7, Token label') , (pos8, LabeledLinkClose) ] mathsBracket :: Lexer mathsBracket = delimited "\\[" "\\]" MathsBracketOpen MathsBracketClose mathsParens :: Lexer mathsParens = delimited "\\(" "\\)" MathsParenOpen MathsParenClose birdTrack :: Lexer birdTrack = delimitedNoTrailing ">> " eol BirdTrack escape :: Lexer escape = delimitedNoTrailing "\\" eol Escape quotes :: Lexer quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose emphasis :: Lexer emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose bold :: Lexer bold = delimitedSymmetric "__" BoldOpen BoldClose monospace :: Lexer monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose other :: Lexer other = do pos <- getPosition c <- takeWhile1_ isUnicodeAlphaNum pure . pure $ (pos, Token c) where isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c) spaceToken :: Lexer spaceToken = do pos <- getPosition _ <- many1 (char ' ') pure . pure $ (pos, Space) newlineToken :: Lexer newlineToken = do pos <- getPosition _ <- newline pure . pure $ (pos, Newline) ------- -- Helpers ------- -- | Like `takeWhile`, but unconditionally take escaped characters. takeWhile_ :: (Char -> Bool) -> Parser Text takeWhile_ p = scan p_ False where p_ escaped c | escaped = Just False | not $ p c = Nothing | otherwise = Just (c == '\\') -- | Like 'takeWhile1', but unconditionally take escaped characters. takeWhile1_ :: (Char -> Bool) -> Parser Text takeWhile1_ = mfilter (not . Text.null) . takeWhile_ {- | Scan the input text, accumulating characters as long as the scanning function returns true. -} scan :: -- | scan function (state -> Char -> Maybe state) -> -- | initial state state -> Parser Text scan f initState = do parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState (remaining, finalPos, ct) <- go input initState pos 0 let newState = parserState{stateInput = remaining, statePos = finalPos} Parsec.setParserState newState $> Text.take ct input where go !input' !st !posAccum !count' = case Text.uncons input' of Nothing -> pure (input', posAccum, count') Just (char', input'') -> case f st char' of Nothing -> pure (input', posAccum, count') Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)