forked from elland/haddock2
184 lines
4.3 KiB
Haskell
184 lines
4.3 KiB
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Lexer (
|
|
Token (..),
|
|
lexer,
|
|
emphasis,
|
|
) where
|
|
|
|
import Control.Monad (mfilter)
|
|
import Data.Char (isAlphaNum, isPrint)
|
|
import Data.Functor (($>))
|
|
import Data.Text (Text)
|
|
import Data.Text qualified as Text
|
|
import GHC.Stack (HasCallStack)
|
|
import ParserMonad (Parser, initialParserState)
|
|
import Text.Parsec
|
|
import Text.Parsec qualified as Parsec
|
|
import Text.Parsec.Pos (updatePosChar)
|
|
|
|
type LocatedToken = (SourcePos, Token)
|
|
|
|
type Lexer = Parser [LocatedToken]
|
|
|
|
data Token
|
|
= Token Text
|
|
| Anchor
|
|
| AngleOpen
|
|
| AngleClose
|
|
| BoldOpen
|
|
| BoldClose
|
|
| BracketOpen
|
|
| BracketClose
|
|
| EmphasisOpen
|
|
| EmphasisClose
|
|
| MonospaceOpen
|
|
| MonospaceClose
|
|
| Newline
|
|
| ParenOpen
|
|
| ParenClose
|
|
| QuoteOpen
|
|
| QuoteClose
|
|
| Space
|
|
| EOF
|
|
deriving (Eq, Show)
|
|
|
|
lexer :: String -> Either ParseError [LocatedToken]
|
|
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
|
|
|
|
lexText :: (HasCallStack) => Parser [LocatedToken]
|
|
lexText = go
|
|
where
|
|
go = do
|
|
Parsec.optionMaybe Parsec.eof >>= \case
|
|
Just _ -> pure []
|
|
Nothing -> do
|
|
toks <-
|
|
choice
|
|
[ newlineToken
|
|
, spaceToken
|
|
, textElement
|
|
, identifier
|
|
, other
|
|
]
|
|
rest <- go
|
|
pure (toks <> rest)
|
|
|
|
match :: Parser a -> Parser (Text, a)
|
|
match p = do
|
|
input <- getInput
|
|
result <- p
|
|
input' <- getInput
|
|
let !consumed = Text.take (Text.length input - Text.length input') input
|
|
pure (consumed, result)
|
|
|
|
-- Tokens
|
|
|
|
textElement :: Parser [LocatedToken]
|
|
textElement =
|
|
choice $
|
|
Parsec.try
|
|
<$> [ emphasis
|
|
, bold
|
|
, monospace
|
|
, parens
|
|
, brackets
|
|
, angles
|
|
]
|
|
|
|
delimited :: String -> String -> Token -> Token -> Parser [LocatedToken]
|
|
delimited c1 c2 ot ct = do
|
|
(_, content) <- match $ between op cl any'
|
|
pos <- getPosition
|
|
let openTok :: LocatedToken = (pos, ot)
|
|
closeTok :: LocatedToken = (pos, ct)
|
|
res :: LocatedToken = (pos, Token content)
|
|
|
|
pure [openTok, res, closeTok]
|
|
where
|
|
op = string c1
|
|
cl = string c2
|
|
any' = Text.pack <$> manyTill anyChar (lookAhead cl)
|
|
|
|
delimited' :: String -> Token -> Token -> Parser [LocatedToken]
|
|
delimited' s t1 t2 = delimited s s t1 t2
|
|
|
|
emphasis :: Lexer
|
|
emphasis = delimited' "/" EmphasisOpen EmphasisClose
|
|
|
|
bold :: Lexer
|
|
bold = delimited' "__" BoldOpen BoldClose
|
|
|
|
monospace :: Lexer
|
|
monospace = delimited' "@" MonospaceOpen MonospaceClose
|
|
|
|
parens :: Parser [LocatedToken]
|
|
parens = delimited "(" ")" ParenOpen ParenClose
|
|
|
|
brackets :: Lexer
|
|
brackets = delimited "[" "]" ParenOpen ParenClose
|
|
|
|
angles :: Parser [LocatedToken]
|
|
angles = delimited "<" ">" AngleOpen AngleClose
|
|
|
|
other :: Lexer
|
|
other = do
|
|
pos <- getPosition
|
|
c <- takeWhile1_ isPrint
|
|
pure . pure $ (pos, Token c)
|
|
|
|
spaceToken :: Lexer
|
|
spaceToken = do
|
|
pos <- getPosition
|
|
_ <- many1 (char ' ')
|
|
pure . pure $ (pos, Space)
|
|
|
|
newlineToken :: Lexer
|
|
newlineToken = do
|
|
pos <- getPosition
|
|
_ <- newline
|
|
pure . pure $ (pos, Newline)
|
|
|
|
identifier :: Lexer
|
|
identifier = do
|
|
pos <- getPosition
|
|
txt <- takeWhile1_ isAlphaNum
|
|
pure . pure $ (pos, Token txt)
|
|
|
|
-------
|
|
-- Helpers
|
|
-------
|
|
|
|
-- | Like `takeWhile`, but unconditionally take escaped characters.
|
|
takeWhile_ :: (Char -> Bool) -> Parser Text
|
|
takeWhile_ p = scan p_ False
|
|
where
|
|
p_ escaped c
|
|
| escaped = Just False
|
|
| not $ p c = Nothing
|
|
| otherwise = Just (c == '\\')
|
|
|
|
-- | Like 'takeWhile1', but unconditionally take escaped characters.
|
|
takeWhile1_ :: (Char -> Bool) -> Parser Text
|
|
takeWhile1_ = mfilter (not . Text.null) . takeWhile_
|
|
|
|
{- | Scan the input text, accumulating characters as long as the scanning
|
|
function returns true.
|
|
-}
|
|
scan ::
|
|
-- | scan function
|
|
(state -> Char -> Maybe state) ->
|
|
-- | initial state
|
|
state ->
|
|
Parser Text
|
|
scan f initState = do
|
|
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
|
|
(remaining, finalPos, ct) <- go input initState pos 0
|
|
let newState = parserState{stateInput = remaining, statePos = finalPos}
|
|
Parsec.setParserState newState $> Text.take ct input
|
|
where
|
|
go !input' !st !posAccum !count' = case Text.uncons input' of
|
|
Nothing -> pure (input', posAccum, count')
|
|
Just (char', input'') -> case f st char' of
|
|
Nothing -> pure (input', posAccum, count')
|
|
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
|