I think it is clearer to phrase it this way so it is clear that we are not unconsuming (i.e. changing the state of the parser).
325 lines
8.5 KiB
Haskell
325 lines
8.5 KiB
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Lexer (
|
|
Token (..),
|
|
lexer,
|
|
emphasis,
|
|
) where
|
|
|
|
import Control.Monad (mfilter, void)
|
|
import Data.Functor (($>))
|
|
import Data.Text (Text, intercalate)
|
|
import Data.Text qualified as Text
|
|
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper)
|
|
import ParserMonad (Parser, initialParserState)
|
|
import Text.Parsec
|
|
import Text.Parsec qualified as Parsec
|
|
import Text.Parsec.Pos (updatePosChar)
|
|
|
|
type Located a = (SourcePos, a)
|
|
type LocatedToken = (SourcePos, Token)
|
|
|
|
type Lexer = Parser [LocatedToken]
|
|
|
|
data Level
|
|
= One
|
|
| Two
|
|
| Three
|
|
| Four
|
|
| Five
|
|
| Six
|
|
deriving (Eq, Show)
|
|
|
|
data Token
|
|
= Token Text
|
|
| Anchor Text
|
|
| BirdTrack
|
|
| BoldOpen
|
|
| BoldClose
|
|
| Escape
|
|
| EmphasisOpen
|
|
| EmphasisClose
|
|
| Header Level
|
|
| MonospaceOpen
|
|
| MonospaceClose
|
|
| Newline
|
|
| LinkOpen
|
|
| LinkClose
|
|
| LabeledLinkOpen
|
|
| LabeledLinkClose
|
|
| ParenOpen
|
|
| ParenClose
|
|
| BracketOpen
|
|
| BracketClose
|
|
| MathsParenOpen
|
|
| MathsParenClose
|
|
| MathsBracketOpen
|
|
| MathsBracketClose
|
|
| NumericEntity Int
|
|
| Module Text
|
|
| QuoteOpen
|
|
| QuoteClose
|
|
| Space
|
|
| EOF
|
|
deriving (Eq, Show)
|
|
|
|
located :: Parser a -> Parser (SourcePos, a)
|
|
located p = (,) <$> getPosition <*> p
|
|
|
|
startPosition :: Parser a -> Parser SourcePos
|
|
startPosition = fmap fst . located
|
|
|
|
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
|
|
tokenise = sequence . map located
|
|
|
|
lexer :: String -> Either ParseError [LocatedToken]
|
|
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
|
|
|
|
lexText :: Parser [LocatedToken]
|
|
lexText = go
|
|
where
|
|
go = do
|
|
Parsec.optionMaybe Parsec.eof >>= \case
|
|
Just _ -> pure []
|
|
Nothing -> do
|
|
toks <-
|
|
choice $
|
|
Parsec.try
|
|
<$> [ mathsBracket
|
|
, mathsParens
|
|
, escape -- maths go before escape to avoid mismatch
|
|
, headers
|
|
, newlineToken
|
|
, spaceToken
|
|
, link
|
|
, labeledLink
|
|
, modules
|
|
, anchors
|
|
, textElement
|
|
, quotes
|
|
, birdTrack
|
|
, other
|
|
]
|
|
rest <- go
|
|
pure (toks <> rest)
|
|
|
|
-- Tokens
|
|
|
|
textElement :: Parser [LocatedToken]
|
|
textElement =
|
|
choice $
|
|
Parsec.try
|
|
<$> [ emphasis
|
|
, bold
|
|
, monospace
|
|
]
|
|
|
|
headers :: Parser [LocatedToken]
|
|
headers =
|
|
choice $
|
|
Parsec.try
|
|
<$> [ header1
|
|
, header2
|
|
, header3
|
|
, header4
|
|
, header5
|
|
, header6
|
|
]
|
|
|
|
anyUntil :: Parser a -> Parser Text
|
|
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
|
|
|
|
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
|
|
delimitedAsTuple openP closeP =
|
|
(,,)
|
|
<$> located openP
|
|
<*> located (Token <$> anyUntil closeP)
|
|
<*> located closeP
|
|
|
|
delimited :: Parser a -> Parser b -> Token -> Token -> Parser [LocatedToken]
|
|
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
|
|
where
|
|
asList (a, tok, b) = [a, tok, b]
|
|
|
|
delimitedNoTrailing :: Parser a -> Parser b -> Token -> Parser [LocatedToken]
|
|
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
|
|
where
|
|
asList (a, tok, _) = [a, tok]
|
|
|
|
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
|
|
delimitedSymmetric s t1 t2 = delimited s s t1 t2
|
|
|
|
eol :: Parser ()
|
|
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
|
|
|
|
anchorHash :: Parser Text
|
|
anchorHash = "#" <|> try "\\#"
|
|
|
|
header1 :: Lexer
|
|
header1 = delimitedNoTrailing "= " eol (Header One)
|
|
|
|
header2 :: Lexer
|
|
header2 = delimitedNoTrailing "== " eol (Header Two)
|
|
|
|
header3 :: Lexer
|
|
header3 = delimitedNoTrailing "=== " eol (Header Three)
|
|
|
|
header4 :: Lexer
|
|
header4 = delimitedNoTrailing "==== " eol (Header Four)
|
|
|
|
header5 :: Lexer
|
|
header5 = delimitedNoTrailing "===== " eol (Header Five)
|
|
|
|
header6 :: Lexer
|
|
header6 = delimitedNoTrailing "====== " eol (Header Six)
|
|
|
|
-- #anchors#
|
|
anchors :: Lexer
|
|
anchors =
|
|
tokenise
|
|
[ between anchorHash anchorHash (Anchor <$> anyUntil anchorHash)
|
|
]
|
|
|
|
|
|
moduleNames :: Parser Text
|
|
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
|
|
|
|
upperId :: Parser String
|
|
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
|
|
|
|
identifierChar :: Parser Char
|
|
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
|
|
|
|
-- "Module.Name"
|
|
-- "Module.Name#anchor"
|
|
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
|
|
modules :: Lexer
|
|
modules = between (char '"') (char '"') inner
|
|
where
|
|
inner = do
|
|
module_ <- located $ Module <$> moduleNames
|
|
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
|
|
pure $ case mAnchor of
|
|
Just anchor -> [module_, anchor]
|
|
Nothing -> [module_]
|
|
|
|
anchorText :: Parser Text
|
|
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
|
|
|
|
linkRaw :: Lexer
|
|
linkRaw =
|
|
tokenise
|
|
[ BracketOpen <$ "["
|
|
, Token <$> anyUntil "]"
|
|
, BracketClose <$ "]"
|
|
, ParenOpen <$ "("
|
|
, Token <$> anyUntil ")"
|
|
, ParenClose <$ ")"
|
|
]
|
|
|
|
link :: Lexer
|
|
link = do
|
|
pos <- getPosition
|
|
l <- linkRaw
|
|
-- register the position of the last token
|
|
pos' <- flip incSourceColumn (-1) <$> getPosition
|
|
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
|
|
|
|
labeledLink :: Lexer
|
|
labeledLink = do
|
|
pos <- getPosition
|
|
void $ string "<"
|
|
link' <- linkRaw
|
|
pos7 <- getPosition
|
|
label' <- anyUntil $ string ">"
|
|
pos8 <- getPosition
|
|
void $ ">"
|
|
|
|
pure $
|
|
(pos, LabeledLinkOpen)
|
|
: link'
|
|
<> [ (pos7, Token label')
|
|
, (pos8, LabeledLinkClose)
|
|
]
|
|
|
|
mathsBracket :: Lexer
|
|
mathsBracket = delimited "\\[" "\\]" MathsBracketOpen MathsBracketClose
|
|
|
|
mathsParens :: Lexer
|
|
mathsParens = delimited "\\(" "\\)" MathsParenOpen MathsParenClose
|
|
|
|
birdTrack :: Lexer
|
|
birdTrack = delimitedNoTrailing ">> " eol BirdTrack
|
|
|
|
escape :: Lexer
|
|
escape = delimitedNoTrailing "\\" eol Escape
|
|
|
|
quotes :: Lexer
|
|
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
|
|
|
|
emphasis :: Lexer
|
|
emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose
|
|
|
|
bold :: Lexer
|
|
bold = delimitedSymmetric "__" BoldOpen BoldClose
|
|
|
|
monospace :: Lexer
|
|
monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
|
|
|
|
other :: Lexer
|
|
other = do
|
|
pos <- getPosition
|
|
c <- takeWhile1_ isUnicodeAlphaNum
|
|
pure . pure $ (pos, Token c)
|
|
where
|
|
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
|
|
|
|
spaceToken :: Lexer
|
|
spaceToken = do
|
|
pos <- getPosition
|
|
_ <- many1 (char ' ')
|
|
pure . pure $ (pos, Space)
|
|
|
|
newlineToken :: Lexer
|
|
newlineToken = do
|
|
pos <- getPosition
|
|
_ <- newline
|
|
pure . pure $ (pos, Newline)
|
|
|
|
-------
|
|
-- Helpers
|
|
-------
|
|
|
|
-- | Like `takeWhile`, but unconditionally take escaped characters.
|
|
takeWhile_ :: (Char -> Bool) -> Parser Text
|
|
takeWhile_ p = scan p_ False
|
|
where
|
|
p_ escaped c
|
|
| escaped = Just False
|
|
| not $ p c = Nothing
|
|
| otherwise = Just (c == '\\')
|
|
|
|
-- | Like 'takeWhile1', but unconditionally take escaped characters.
|
|
takeWhile1_ :: (Char -> Bool) -> Parser Text
|
|
takeWhile1_ = mfilter (not . Text.null) . takeWhile_
|
|
|
|
{- | Scan the input text, accumulating characters as long as the scanning
|
|
function returns true.
|
|
-}
|
|
scan ::
|
|
-- | scan function
|
|
(state -> Char -> Maybe state) ->
|
|
-- | initial state
|
|
state ->
|
|
Parser Text
|
|
scan f initState = do
|
|
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
|
|
(remaining, finalPos, ct) <- go input initState pos 0
|
|
let newState = parserState{stateInput = remaining, statePos = finalPos}
|
|
Parsec.setParserState newState $> Text.take ct input
|
|
where
|
|
go !input' !st !posAccum !count' = case Text.uncons input' of
|
|
Nothing -> pure (input', posAccum, count')
|
|
Just (char', input'') -> case f st char' of
|
|
Nothing -> pure (input', posAccum, count')
|
|
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
|