This commit is contained in:
Igor Ranieri 2025-09-21 08:19:01 +02:00
commit c9f61c4e06
16 changed files with 1366 additions and 0 deletions

190
src/Lexer.hs Normal file
View file

@ -0,0 +1,190 @@
{-# LANGUAGE OverloadedStrings #-}
module Lexer (
Token (..),
lexer,
emphasis,
) where
import Control.Monad (mfilter)
import Data.Char (isAlphaNum, isPrint)
import Data.Functor (($>))
import Data.Text (Text)
import Data.Text qualified as Text
import GHC.Stack (HasCallStack)
import ParserMonad (Parser, initialParserState)
import Text.Parsec
import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar)
type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]
data Token
= Token Text
| Anchor
| AngleOpen
| AngleClose
| BoldOpen
| BoldClose
| BracketOpen
| BracketClose
| EmphasisOpen
| EmphasisClose
| MonospaceOpen
| MonospaceClose
| Newline
| ParenOpen
| ParenClose
| QuoteOpen
| QuoteClose
| Space
| EOF
deriving (Eq, Show)
lexer :: String -> Either ParseError [LocatedToken]
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
lexText :: (HasCallStack) => Parser [LocatedToken]
lexText = go
where
go = do
Parsec.optionMaybe Parsec.eof >>= \case
Just _ -> pure []
Nothing -> do
toks <-
choice
[ newlineToken
, spaceToken
, textElement
, identifier
, other
]
rest <- go
pure (toks <> rest)
match :: Parser a -> Parser (Text, a)
match p = do
input <- getInput
result <- p
input' <- getInput
let !consumed = Text.take (Text.length input - Text.length input') input
pure (consumed, result)
-- Tokens
textElement :: Parser [LocatedToken]
textElement =
choice $
Parsec.try
<$> [ emphasis
, bold
, monospace
, parens
, brackets
, angles
]
delimited :: String -> String -> Token -> Token -> Parser [LocatedToken]
delimited c1 c2 ot ct = do
pos <- getPosition
(_, content) <- match $ between op cl any'
innerToks <- case lexer $ Text.unpack content of
Left _ -> do
pos' <- getPosition
pure $ [(pos', Token content)]
Right toks -> pure toks
let openTok :: LocatedToken = (pos, ot)
closeTok :: LocatedToken = (pos, ct)
pure $ openTok : innerToks <> [closeTok]
where
op = string c1
cl = string c2
any' = Text.pack <$> manyTill anyChar (lookAhead cl)
delimited' :: String -> Token -> Token -> Parser [LocatedToken]
delimited' s t1 t2 = delimited s s t1 t2
emphasis :: Lexer
emphasis = delimited' "/" EmphasisOpen EmphasisClose
bold :: Lexer
bold = delimited' "__" BoldOpen BoldClose
monospace :: Lexer
monospace = delimited' "@" MonospaceOpen MonospaceClose
parens :: Parser [LocatedToken]
parens = delimited "(" ")" ParenOpen ParenClose
brackets :: Lexer
brackets = delimited "[" "]" ParenOpen ParenClose
angles :: Parser [LocatedToken]
angles = delimited "<" ">" AngleOpen AngleClose
other :: Lexer
other = do
pos <- getPosition
c <- takeWhile1_ isPrint
pure . pure $ (pos, Token c)
spaceToken :: Lexer
spaceToken = do
pos <- getPosition
_ <- many1 (char ' ')
pure . pure $ (pos, Space)
newlineToken :: Lexer
newlineToken = do
pos <- getPosition
_ <- newline
pure . pure $ (pos, Newline)
identifier :: Lexer
identifier = do
pos <- getPosition
txt <- takeWhile1_ isAlphaNum
pure . pure $ (pos, Token txt)
-------
-- Helpers
-------
-- | Like `takeWhile`, but unconditionally take escaped characters.
takeWhile_ :: (Char -> Bool) -> Parser Text
takeWhile_ p = scan p_ False
where
p_ escaped c
| escaped = Just False
| not $ p c = Nothing
| otherwise = Just (c == '\\')
-- | Like 'takeWhile1', but unconditionally take escaped characters.
takeWhile1_ :: (Char -> Bool) -> Parser Text
takeWhile1_ = mfilter (not . Text.null) . takeWhile_
{- | Scan the input text, accumulating characters as long as the scanning
function returns true.
-}
scan ::
-- | scan function
(state -> Char -> Maybe state) ->
-- | initial state
state ->
Parser Text
scan f initState = do
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
(remaining, finalPos, ct) <- go input initState pos 0
let newState = parserState{stateInput = remaining, statePos = finalPos}
Parsec.setParserState newState $> Text.take ct input
where
go !input' !st !posAccum !count' = case Text.uncons input' of
Nothing -> pure (input', posAccum, count')
Just (char', input'') -> case f st char' of
Nothing -> pure (input', posAccum, count')
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)