{-# LANGUAGE OverloadedStrings #-} module Lexer ( Token (..), lexer, emphasis, ) where import Control.Monad (mfilter, void) import Data.Char (ord, toLower) import Data.Functor (($>)) import Data.Text (Text, intercalate) import Data.Text qualified as Text import GHC.Unicode (isAlphaNum, isControl, isDigit, isPrint, isSpace, isUpper) import ParserMonad (Parser, initialParserState) import Text.Parsec import Text.Parsec qualified as Parsec import Text.Parsec.Pos (updatePosChar) type Located a = (SourcePos, a) type LocatedToken = (SourcePos, Token) type Lexer = Parser [LocatedToken] data Level = One | Two | Three | Four | Five | Six deriving (Eq, Show) data Token = Token Text | Anchor Text | BirdTrack | BoldOpen | BoldClose | Escape | EmphasisOpen | EmphasisClose | Expression | Header Level | MonospaceOpen | MonospaceClose | Newline | LinkOpen | LinkClose | LabeledLinkOpen | LabeledLinkClose | ParenOpen | ParenClose | BracketOpen | BracketClose | MathInlineOpen | MathInlineClose | MathMultilineOpen | MathMultilineClose | NumericEntity Int | Module Text | QuoteOpen | QuoteClose | Space | EOF deriving (Eq, Show) located :: Parser a -> Parser (SourcePos, a) located p = (,) <$> getPosition <*> p tokenise :: [Parser a] -> Parser [(SourcePos, a)] tokenise = mapM located lexer :: String -> Either ParseError [LocatedToken] lexer = Parsec.runParser lexText initialParserState "input" . Text.pack lexText :: Parser [LocatedToken] lexText = go where go = do Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do toks <- topLevel rest <- go pure (toks <> rest) {- FOURMOLU_DISABLE -} topLevel = -- backtracking here so we always have a chance to try "other", the "catch-all-leave-to-parser-to-deal-with" choice -- TODO: is this desirable? do we throw lexer error at all? try ( choice -- Sorted in -- - longest to shortest parse path -- - highest frequency to lowest frequency (for performance?) -- - more exact to more freeform (the latter can be the former but not vice versa) [ spaceToken , newlineToken , try module_ , quotes , try expression , birdTrack -- starts with "\" , try mathMultiline , try mathInline , escape , headers , labeledLink , link , anchor , numericEntity , textElement ] ) <|> other {- FOURMOLU_ENABLE -} -- Tokens textElement :: Parser [LocatedToken] textElement = choice $ Parsec.try <$> [ emphasis , bold , monospace ] headers :: Parser [LocatedToken] headers = choice $ Parsec.try <$> [ header1 , header2 , header3 , header4 , header5 , header6 ] anyUntil :: Parser a -> Parser Text anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close) delimitedAsTuple openP closeP = (,,) <$> located openP <*> located (Token <$> anyUntil closeP) <*> located closeP delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken] delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP) where asList (a, tok, b) = [a, tok, b] delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken] delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP) where asList (a, tok, _) = [a, tok] delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric s = delimited s s --- End of line // end of file eol :: Parser () eol = void "\n" <|> void "\r\n" <|> Parsec.eof -- Start of line // start of file sol :: Parser () sol = do p <- getPosition if sourceColumn p == 1 then pure () else fail "Not at start of line/document" header1 :: Lexer header1 = delimitedNoTrailing "= " eol (Header One) header2 :: Lexer header2 = delimitedNoTrailing "== " eol (Header Two) header3 :: Lexer header3 = delimitedNoTrailing "=== " eol (Header Three) header4 :: Lexer header4 = delimitedNoTrailing "==== " eol (Header Four) header5 :: Lexer header5 = delimitedNoTrailing "===== " eol (Header Five) header6 :: Lexer header6 = delimitedNoTrailing "====== " eol (Header Six) -- #anchors# anchor :: Lexer anchor = do x <- located $ between "#" "#" (Anchor <$> anyUntil "#") pure [x] moduleNames :: Parser Text moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.' upperId :: Parser String upperId = (:) <$> satisfy isUpper <*> many1 identifierChar identifierChar :: Parser Char identifierChar = satisfy (\c -> isAlphaNum c || c == '_') -- "Module.Name" -- "Module.Name#anchor" -- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben module_ :: Lexer module_ = between (char '"') (char '"') inner where inner = do m <- located $ Module <$> moduleNames mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText)) pure $ case mAnchor of Just anc -> [m, anc] Nothing -> [m] anchorHash :: Parser Text anchorHash = "#" <|> try "\\#" anchorText :: Parser Text anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) linkRaw :: Lexer linkRaw = tokenise [ BracketOpen <$ "[" , Token <$> anyUntil "]" , BracketClose <$ "]" , ParenOpen <$ "(" , Token <$> anyUntil ")" , ParenClose <$ ")" ] link :: Lexer link = do pos <- getPosition l <- linkRaw -- register the position of the last token pos' <- flip incSourceColumn (-1) <$> getPosition pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] labeledLink :: Lexer labeledLink = do open <- located $ LabeledLinkOpen <$ "<" linkRes <- linkRaw labelRes <- located $ Token <$> anyUntil ">" close <- located $ LabeledLinkClose <$ ">" pure $ open : linkRes <> [labelRes, close] mathMultiline :: Lexer mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose mathInline :: Lexer mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose -- TODO: make sure this starts at column 0? birdTrack :: Lexer birdTrack = delimitedNoTrailing (sol <* "> ") eol BirdTrack -- TODO: also match following lines iff: -- they start with alphanum -- they're not empty expression :: Lexer expression = delimitedNoTrailing (sol <* ">>> ") eol Expression escape :: Lexer escape = delimitedNoTrailing "\\" eol Escape quotes :: Lexer quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose emphasis :: Lexer emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose bold :: Lexer bold = delimitedSymmetric "__" BoldOpen BoldClose monospace :: Lexer monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose decimal :: Parser Int decimal = read . Text.unpack <$> takeWhile1_ isDigit hexadecimal :: Parser Int hexadecimal = "x" *> (convert 0 . fmap (normalise . toLower) <$> many1 hexDigit) where normalise :: Char -> Int normalise c | ord '0' <= n && n <= ord '9' = n - ord '0' | ord 'A' <= n && n <= ord 'F' = n - ord 'A' + 10 | ord 'a' <= n && n <= ord 'f' = n - ord 'a' + 10 | otherwise = error "unexpected: invalid hex number" where n = ord c convert :: Int -> [Int] -> Int convert acc [] = acc convert acc (x : xs) = convert (acc * 16 + x) xs numericEntity :: Lexer numericEntity = do x <- located $ between "&#" ";" (NumericEntity <$> (hexadecimal <|> decimal)) pure [x] other :: Lexer other = do pos <- getPosition c <- takeWhile1_ isUnicodeAlphaNum pure . pure $ (pos, Token c) where isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c) spaceToken :: Lexer spaceToken = do pos <- getPosition _ <- many1 (char ' ') pure . pure $ (pos, Space) newlineToken :: Lexer newlineToken = do pos <- getPosition _ <- newline pure . pure $ (pos, Newline) ------- -- Helpers ------- -- | Like `takeWhile`, but unconditionally take escaped characters. takeWhile_ :: (Char -> Bool) -> Parser Text takeWhile_ p = scan p_ False where p_ escaped c | escaped = Just False | not $ p c = Nothing | otherwise = Just (c == '\\') -- | Like 'takeWhile1', but unconditionally take escaped characters. takeWhile1_ :: (Char -> Bool) -> Parser Text takeWhile1_ = mfilter (not . Text.null) . takeWhile_ {- | Scan the input text, accumulating characters as long as the scanning function returns true. -} scan :: -- | scan function (state -> Char -> Maybe state) -> -- | initial state state -> Parser Text scan f initState = do parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState (remaining, finalPos, ct) <- go input initState pos 0 let newState = parserState{stateInput = remaining, statePos = finalPos} Parsec.setParserState newState $> Text.take ct input where go !input' !st !posAccum !count' = case Text.uncons input' of Nothing -> pure (input', posAccum, count') Just (char', input'') -> case f st char' of Nothing -> pure (input', posAccum, count') Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)