325 lines
8.5 KiB
Haskell
325 lines
8.5 KiB
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Lexer (
|
|
Token (..),
|
|
lexer,
|
|
emphasis,
|
|
) where
|
|
|
|
import Control.Monad (mfilter, void)
|
|
import Data.Functor (($>))
|
|
import Data.Text (Text, intercalate)
|
|
import Data.Text qualified as Text
|
|
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper)
|
|
import ParserMonad (Parser, initialParserState)
|
|
import Text.Parsec
|
|
import Text.Parsec qualified as Parsec
|
|
import Text.Parsec.Pos (updatePosChar)
|
|
|
|
type Located a = (SourcePos, a)
|
|
type LocatedToken = (SourcePos, Token)
|
|
|
|
type Lexer = Parser [LocatedToken]
|
|
|
|
data Level
|
|
= One
|
|
| Two
|
|
| Three
|
|
| Four
|
|
| Five
|
|
| Six
|
|
deriving (Eq, Show)
|
|
|
|
data Token
|
|
= Token Text
|
|
| Anchor Text
|
|
| BirdTrack
|
|
| BoldOpen
|
|
| BoldClose
|
|
| Escape
|
|
| EmphasisOpen
|
|
| EmphasisClose
|
|
| Header Level
|
|
| MonospaceOpen
|
|
| MonospaceClose
|
|
| Newline
|
|
| LinkOpen
|
|
| LinkClose
|
|
| LabeledLinkOpen
|
|
| LabeledLinkClose
|
|
| ParenOpen
|
|
| ParenClose
|
|
| BracketOpen
|
|
| BracketClose
|
|
| MathsParenOpen
|
|
| MathsParenClose
|
|
| MathsBracketOpen
|
|
| MathsBracketClose
|
|
| NumericEntity Int
|
|
| Module Text
|
|
| QuoteOpen
|
|
| QuoteClose
|
|
| Space
|
|
| EOF
|
|
deriving (Eq, Show)
|
|
|
|
located :: Parser a -> Parser (SourcePos, a)
|
|
located p = (,) <$> getPosition <*> p
|
|
|
|
startPosition :: Parser a -> Parser SourcePos
|
|
startPosition = fmap fst . located
|
|
|
|
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
|
|
tokenise = sequence . map located
|
|
|
|
lexer :: String -> Either ParseError [LocatedToken]
|
|
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
|
|
|
|
lexText :: Parser [LocatedToken]
|
|
lexText = go
|
|
where
|
|
go = do
|
|
Parsec.optionMaybe Parsec.eof >>= \case
|
|
Just _ -> pure []
|
|
Nothing -> do
|
|
toks <-
|
|
choice $
|
|
Parsec.try
|
|
<$> [ mathsBracket
|
|
, mathsParens
|
|
, escape -- maths go before escape to avoid mismatch
|
|
, headers
|
|
, newlineToken
|
|
, spaceToken
|
|
, link
|
|
, labeledLink
|
|
, modules
|
|
, anchors
|
|
, textElement
|
|
, quotes
|
|
, birdTrack
|
|
, other
|
|
]
|
|
rest <- go
|
|
pure (toks <> rest)
|
|
|
|
-- Tokens
|
|
|
|
textElement :: Parser [LocatedToken]
|
|
textElement =
|
|
choice $
|
|
Parsec.try
|
|
<$> [ emphasis
|
|
, bold
|
|
, monospace
|
|
]
|
|
|
|
headers :: Parser [LocatedToken]
|
|
headers =
|
|
choice $
|
|
Parsec.try
|
|
<$> [ header1
|
|
, header2
|
|
, header3
|
|
, header4
|
|
, header5
|
|
, header6
|
|
]
|
|
|
|
anyUntil :: Parser a -> Parser Text
|
|
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
|
|
|
|
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
|
|
delimitedAsTuple openP closeP =
|
|
(,,)
|
|
<$> located openP
|
|
<*> located (Token <$> anyUntil closeP)
|
|
<*> located closeP
|
|
|
|
delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
|
|
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
|
|
where
|
|
asList (a, tok, b) = [a, tok, b]
|
|
|
|
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
|
|
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
|
|
where
|
|
asList (a, tok, _) = [a, tok]
|
|
|
|
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
|
|
delimitedSymmetric s t1 t2 = delimited s s t1 t2
|
|
|
|
eol :: Parser ()
|
|
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
|
|
|
|
anchorHash :: Parser Text
|
|
anchorHash = "#" <|> try "\\#"
|
|
|
|
header1 :: Lexer
|
|
header1 = delimitedNoTrailing "= " eol (Header One)
|
|
|
|
header2 :: Lexer
|
|
header2 = delimitedNoTrailing "== " eol (Header Two)
|
|
|
|
header3 :: Lexer
|
|
header3 = delimitedNoTrailing "=== " eol (Header Three)
|
|
|
|
header4 :: Lexer
|
|
header4 = delimitedNoTrailing "==== " eol (Header Four)
|
|
|
|
header5 :: Lexer
|
|
header5 = delimitedNoTrailing "===== " eol (Header Five)
|
|
|
|
header6 :: Lexer
|
|
header6 = delimitedNoTrailing "====== " eol (Header Six)
|
|
|
|
-- #anchors#
|
|
anchors :: Lexer
|
|
anchors =
|
|
tokenise
|
|
[ between anchorHash anchorHash (Anchor <$> anyUntil anchorHash)
|
|
]
|
|
|
|
|
|
moduleNames :: Parser Text
|
|
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
|
|
|
|
upperId :: Parser String
|
|
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
|
|
|
|
identifierChar :: Parser Char
|
|
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
|
|
|
|
-- "Module.Name"
|
|
-- "Module.Name#anchor"
|
|
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
|
|
modules :: Lexer
|
|
modules = between (char '"') (char '"') inner
|
|
where
|
|
inner = do
|
|
module_ <- located $ Module <$> moduleNames
|
|
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
|
|
pure $ case mAnchor of
|
|
Just anchor -> [module_, anchor]
|
|
Nothing -> [module_]
|
|
|
|
anchorText :: Parser Text
|
|
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
|
|
|
|
linkRaw :: Lexer
|
|
linkRaw =
|
|
tokenise
|
|
[ BracketOpen <$ "["
|
|
, Token <$> anyUntil "]"
|
|
, BracketClose <$ "]"
|
|
, ParenOpen <$ "("
|
|
, Token <$> anyUntil ")"
|
|
, ParenClose <$ ")"
|
|
]
|
|
|
|
link :: Lexer
|
|
link = do
|
|
pos <- getPosition
|
|
l <- linkRaw
|
|
-- register the position of the last token
|
|
pos' <- flip incSourceColumn (-1) <$> getPosition
|
|
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
|
|
|
|
labeledLink :: Lexer
|
|
labeledLink = do
|
|
pos <- getPosition
|
|
void $ string "<"
|
|
link' <- linkRaw
|
|
pos7 <- getPosition
|
|
label' <- anyUntil $ string ">"
|
|
pos8 <- getPosition
|
|
void $ ">"
|
|
|
|
pure $
|
|
(pos, LabeledLinkOpen)
|
|
: link'
|
|
<> [ (pos7, Token label')
|
|
, (pos8, LabeledLinkClose)
|
|
]
|
|
|
|
mathsBracket :: Lexer
|
|
mathsBracket = delimited "\\[" "\\]" MathsBracketOpen MathsBracketClose
|
|
|
|
mathsParens :: Lexer
|
|
mathsParens = delimited "\\(" "\\)" MathsParenOpen MathsParenClose
|
|
|
|
birdTrack :: Lexer
|
|
birdTrack = delimitedNoTrailing ">> " eol BirdTrack
|
|
|
|
escape :: Lexer
|
|
escape = delimitedNoTrailing "\\" eol Escape
|
|
|
|
quotes :: Lexer
|
|
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
|
|
|
|
emphasis :: Lexer
|
|
emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose
|
|
|
|
bold :: Lexer
|
|
bold = delimitedSymmetric "__" BoldOpen BoldClose
|
|
|
|
monospace :: Lexer
|
|
monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
|
|
|
|
other :: Lexer
|
|
other = do
|
|
pos <- getPosition
|
|
c <- takeWhile1_ isUnicodeAlphaNum
|
|
pure . pure $ (pos, Token c)
|
|
where
|
|
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
|
|
|
|
spaceToken :: Lexer
|
|
spaceToken = do
|
|
pos <- getPosition
|
|
_ <- many1 (char ' ')
|
|
pure . pure $ (pos, Space)
|
|
|
|
newlineToken :: Lexer
|
|
newlineToken = do
|
|
pos <- getPosition
|
|
_ <- newline
|
|
pure . pure $ (pos, Newline)
|
|
|
|
-------
|
|
-- Helpers
|
|
-------
|
|
|
|
-- | Like `takeWhile`, but unconditionally take escaped characters.
|
|
takeWhile_ :: (Char -> Bool) -> Parser Text
|
|
takeWhile_ p = scan p_ False
|
|
where
|
|
p_ escaped c
|
|
| escaped = Just False
|
|
| not $ p c = Nothing
|
|
| otherwise = Just (c == '\\')
|
|
|
|
-- | Like 'takeWhile1', but unconditionally take escaped characters.
|
|
takeWhile1_ :: (Char -> Bool) -> Parser Text
|
|
takeWhile1_ = mfilter (not . Text.null) . takeWhile_
|
|
|
|
{- | Scan the input text, accumulating characters as long as the scanning
|
|
function returns true.
|
|
-}
|
|
scan ::
|
|
-- | scan function
|
|
(state -> Char -> Maybe state) ->
|
|
-- | initial state
|
|
state ->
|
|
Parser Text
|
|
scan f initState = do
|
|
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
|
|
(remaining, finalPos, ct) <- go input initState pos 0
|
|
let newState = parserState{stateInput = remaining, statePos = finalPos}
|
|
Parsec.setParserState newState $> Text.take ct input
|
|
where
|
|
go !input' !st !posAccum !count' = case Text.uncons input' of
|
|
Nothing -> pure (input', posAccum, count')
|
|
Just (char', input'') -> case f st char' of
|
|
Nothing -> pure (input', posAccum, count')
|
|
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
|