haddock2/src/Lexer.hs

{-# LANGUAGE OverloadedStrings #-}

module Lexer (
  Token (..),
  lexer,
  emphasis,
)
where

import Control.Monad (mfilter, void)
import Data.Char (ord, toLower)
import Data.Functor (($>))
import Data.Text (Text, intercalate)
import Data.Text qualified as Text
import GHC.Unicode (isAlphaNum, isControl, isDigit, isPrint, isSpace, isUpper)
import ParserMonad (Parser, initialParserState)
import Text.Parsec
import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]

data Level
  = One
  | Two
  | Three
  | Four
  | Five
  | Six
  deriving (Eq, Show)

data Token
  = Token Text
  | Anchor Text
  | BirdTrack
  | BoldOpen
  | BoldClose
  | Escape
  | EmphasisOpen
  | EmphasisClose
  | Expression
  | Header Level
  | MonospaceOpen
  | MonospaceClose
  | Newline
  | LinkOpen
  | LinkClose
  | LabeledLinkOpen
  | LabeledLinkClose
  | ParenOpen
  | ParenClose
  | BracketOpen
  | BracketClose
  | MathInlineOpen
  | MathInlineClose
  | MathMultilineOpen
  | MathMultilineClose
  | NumericEntity Int
  | Module Text
  | QuoteOpen
  | QuoteClose
  | Space
  | EOF
  deriving (Eq, Show)

located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
tokenise = mapM located
lexer :: String -> Either ParseError [LocatedToken]
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
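
-- A minimal usage sketch (e.g. from GHCi or a test); the 'SourcePos' values in
-- the result depend on the input and are elided here:
--
--   case lexer "/emphasised/ text" of
--     Left err -> print err
--     Right toks -> mapM_ print toks
--     -- tokens, roughly: EmphasisOpen, Token "emphasised", EmphasisClose, Space, Token "text"
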
lexText :: Parser [LocatedToken]
lexText = go
  where
    go = do
      Parsec.optionMaybe Parsec.eof >>= \case
        Just _ -> pure []
        Nothing -> do
          toks <- topLevel
          rest <- go
          pure (toks <> rest)

    {- FOURMOLU_DISABLE -}
    topLevel =
      -- Backtracking here so we always get a chance to try "other", the catch-all
      -- choice that leaves unrecognised input for the parser to deal with.
      -- TODO: is this desirable? do we throw a lexer error at all?
      try
        ( choice
            -- Sorted by
            -- - longest to shortest parse path
            -- - highest to lowest expected frequency (for performance?)
            -- - more exact to more freeform (the latter can subsume the former, but not vice versa)
            [ spaceToken
            , newlineToken
            , try module_
            , quotes
            , try expression
            , birdTrack
            -- the next three start with "\"
            , try mathMultiline
            , try mathInline
            , escape
            , headers
            , labeledLink
            , link
            , anchor
            , numericEntity
            , textElement
            ]
        )
        <|> other
    {- FOURMOLU_ENABLE -}

-- Tokens
textElement :: Parser [LocatedToken]
textElement =
  choice $
    Parsec.try
      <$> [ emphasis
          , bold
          , monospace
          ]

headers :: Parser [LocatedToken]
headers =
  choice $
    Parsec.try
      <$> [ header1
          , header2
          , header3
          , header4
          , header5
          , header6
          ]

anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimitedAsTuple openP closeP =
  (,,)
    <$> located openP
    <*> located (Token <$> anyUntil closeP)
    <*> located closeP

delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
  where
    asList (a, tok, b) = [a, tok, b]

-- | Like 'delimited', but the closing delimiter is consumed without producing a token.
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok =
  asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
  where
    asList (a, tok, _) = [a, tok]

delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s = delimited s s

--- End of line // end of file
eol :: Parser ()
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
-- Start of line // start of file
sol :: Parser ()
sol = do
  p <- getPosition
  if sourceColumn p == 1
    then pure ()
    else fail "Not at start of line/document"
header1 :: Lexer
header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer
header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer
header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer
header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer
header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer
header6 = delimitedNoTrailing "====== " eol (Header Six)
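
-- For example, the header lexers above produce (a sketch; positions elided,
-- and the terminating newline is consumed by 'eol' without producing a token):
--
--   "== Some title\n"  ~>  [Header Two, Token "Some title"]
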
-- #anchors#
anchor :: Lexer
anchor = do
  x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
  pure [x]
moduleNames :: Parser Text
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name"
-- "Module.Name#anchor"
-- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
module_ :: Lexer
module_ = between (char '"') (char '"') inner
  where
    inner = do
      m <- located $ Module <$> moduleNames
      mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
      pure $ case mAnchor of
        Just anc -> [m, anc]
        Nothing -> [m]
anchorHash :: Parser Text
anchorHash = "#" <|> try "\\#"
anchorText :: Parser Text
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
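
-- For example, 'module_' above produces (a sketch; positions elided):
--
--   "\"Data.Text\""         ~>  [Module "Data.Text"]
--   "\"Data.Text#anchor\""  ~>  [Module "Data.Text", Anchor "anchor"]
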
linkRaw :: Lexer
linkRaw =
  tokenise
    [ BracketOpen <$ "["
    , Token <$> anyUntil "]"
    , BracketClose <$ "]"
    , ParenOpen <$ "("
    , Token <$> anyUntil ")"
    , ParenClose <$ ")"
    ]
link :: Lexer
link = do
  pos <- getPosition
  l <- linkRaw
  -- take the position of the closing delimiter (one column back from the current position)
  pos' <- flip incSourceColumn (-1) <$> getPosition
  pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
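
-- For example, 'link' above produces (a sketch; positions elided):
--
--   "[label](http://example.com)"
--     ~>  [ LinkOpen
--         , BracketOpen, Token "label", BracketClose
--         , ParenOpen, Token "http://example.com", ParenClose
--         , LinkClose
--         ]
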
labeledLink :: Lexer
labeledLink = do
  open <- located $ LabeledLinkOpen <$ "<"
  linkRes <- linkRaw
  labelRes <- located $ Token <$> anyUntil ">"
  close <- located $ LabeledLinkClose <$ ">"
  pure $
    open : linkRes <> [labelRes, close]
mathMultiline :: Lexer
mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
mathInline :: Lexer
mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
-- Must start at the beginning of a line (enforced by 'sol').
birdTrack :: Lexer
birdTrack = delimitedNoTrailing (sol <* "> ") eol BirdTrack
-- TODO: also match following lines iff:
-- they start with alphanum
-- they're not empty
expression :: Lexer
expression = delimitedNoTrailing (sol <* ">>> ") eol Expression
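
-- For example, 'birdTrack' and 'expression' above produce (a sketch; positions
-- elided, and the trailing newline is consumed by 'eol' without producing a token):
--
--   "> example code\n"  ~>  [BirdTrack, Token "example code"]
--   ">>> 1 + 1\n"        ~>  [Expression, Token "1 + 1"]
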
escape :: Lexer
escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
emphasis :: Lexer
emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose
bold :: Lexer
bold = delimitedSymmetric "__" BoldOpen BoldClose
monospace :: Lexer
monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
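
-- For example, the emphasis/bold/monospace lexers above produce (a sketch; positions elided):
--
--   "__bold__"  ~>  [BoldOpen, Token "bold", BoldClose]
--   "@code@"    ~>  [MonospaceOpen, Token "code", MonospaceClose]
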
decimal :: Parser Int
decimal = read . Text.unpack <$> takeWhile1_ isDigit
hexadecimal :: Parser Int
hexadecimal = "x" *> (convert 0 . fmap (normalise . toLower) <$> many1 hexDigit)
  where
    normalise :: Char -> Int
    normalise c
      | ord '0' <= n && n <= ord '9' = n - ord '0'
      | ord 'A' <= n && n <= ord 'F' = n - ord 'A' + 10
      | ord 'a' <= n && n <= ord 'f' = n - ord 'a' + 10
      | otherwise = error "unexpected: invalid hex number"
      where
        n = ord c

    -- fold the hex digit values into a single number, most significant digit first
    convert :: Int -> [Int] -> Int
    convert acc [] = acc
    convert acc (x : xs) = convert (acc * 16 + x) xs
numericEntity :: Lexer
numericEntity = do
  x <- located $ between "&#" ";" (NumericEntity <$> (hexadecimal <|> decimal))
  pure [x]
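
-- For example, 'numericEntity' above produces (a sketch; positions elided):
--
--   "&#65;"    ~>  [NumericEntity 65]
--   "&#x3BB;"  ~>  [NumericEntity 955]
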
other :: Lexer
other = do
  pos <- getPosition
  c <- takeWhile1_ isUnicodeAlphaNum
  pure . pure $ (pos, Token c)
  where
    -- despite the name, this accepts any printable, non-space character
    isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
spaceToken :: Lexer
spaceToken = do
  pos <- getPosition
  _ <- many1 (char ' ')
  pure . pure $ (pos, Space)

newlineToken :: Lexer
newlineToken = do
  pos <- getPosition
  _ <- newline
  pure . pure $ (pos, Newline)
-------
-- Helpers
-------
-- | Like 'takeWhile', but unconditionally take escaped characters.
takeWhile_ :: (Char -> Bool) -> Parser Text
takeWhile_ p = scan p_ False
  where
    p_ escaped c
      | escaped = Just False
      | not $ p c = Nothing
      | otherwise = Just (c == '\\')
-- | Like 'takeWhile1', but unconditionally take escaped characters.
takeWhile1_ :: (Char -> Bool) -> Parser Text
takeWhile1_ = mfilter (not . Text.null) . takeWhile_
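
-- For example (a sketch), a backslash keeps the following character even when
-- the predicate rejects it:
--
--   takeWhile_ (/= '/') applied to "a\\/b/c" (the characters a \ / b / c)
--   consumes "a\\/b" and leaves "/c" unconsumed
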
{- | Scan the input text, accumulating characters for as long as the scanning
function returns a new state ('Just'); 'Nothing' ends the scan.
-}
scan ::
  -- | scan function
  (state -> Char -> Maybe state) ->
  -- | initial state
  state ->
  Parser Text
scan f initState = do
  parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
  (remaining, finalPos, ct) <- go input initState pos 0
  let newState = parserState{stateInput = remaining, statePos = finalPos}
  Parsec.setParserState newState $> Text.take ct input
  where
    go !input' !st !posAccum !count' = case Text.uncons input' of
      Nothing -> pure (input', posAccum, count')
      Just (char', input'') -> case f st char' of
        Nothing -> pure (input', posAccum, count')
        Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
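
-- With a stateless scan function this degenerates to a plain takeWhile, e.g.
-- (a sketch):
--
--   scan (\() c -> if isDigit c then Just () else Nothing) ()
--
-- consumes a possibly empty run of digits.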