Embed Anchor content inside token

This commit is contained in:
Igor Ranieri 2025-09-24 09:48:01 +02:00
parent 82e1c76fe7
commit 8887476626
3 changed files with 311 additions and 313 deletions

View file

@ -1,9 +1,9 @@
{-# LANGUAGE OverloadedStrings #-}
module Lexer (
Token (..),
lexer,
emphasis,
Token (..),
lexer,
emphasis,
) where
import Control.Monad (mfilter, void)
@ -21,116 +21,117 @@ type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]
data Level
= One
| Two
| Three
| Four
| Five
| Six
deriving (Eq, Show)
= One
| Two
| Three
| Four
| Five
| Six
deriving (Eq, Show)
data Token
= Token Text
| Anchor
| BirdTrack
| BoldOpen
| BoldClose
| Escape
| EmphasisOpen
| EmphasisClose
| Header Level
| MonospaceOpen
| MonospaceClose
| Newline
| LinkOpen
| LinkClose
| LabeledLinkOpen
| LabeledLinkClose
| ParenOpen
| ParenClose
| BracketOpen
| BracketClose
| MathsParenOpen
| MathsParenClose
| MathsBracketOpen
| MathsBracketClose
| Module
| QuoteOpen
| QuoteClose
| Space
| EOF
deriving (Eq, Show)
= Token Text
| Anchor Text
| BirdTrack
| BoldOpen
| BoldClose
| Escape
| EmphasisOpen
| EmphasisClose
| Header Level
| MonospaceOpen
| MonospaceClose
| Newline
| LinkOpen
| LinkClose
| LabeledLinkOpen
| LabeledLinkClose
| ParenOpen
| ParenClose
| BracketOpen
| BracketClose
| MathsParenOpen
| MathsParenClose
| MathsBracketOpen
| MathsBracketClose
| NumericEntity Int
| Module
| QuoteOpen
| QuoteClose
| Space
| EOF
deriving (Eq, Show)
lexer :: String -> Either ParseError [LocatedToken]
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
lexText :: Parser [LocatedToken]
lexText = go
where
go = do
Parsec.optionMaybe Parsec.eof >>= \case
Just _ -> pure []
Nothing -> do
toks <-
choice $
Parsec.try
<$> [ mathsBracket
, mathsParens
, escape -- maths go before escape to avoid mismatch
, headers
, newlineToken
, spaceToken
, link
, labeledLink
, modules
, anchors
, textElement
, quotes
, birdTrack
, other
]
rest <- go
pure (toks <> rest)
where
go = do
Parsec.optionMaybe Parsec.eof >>= \case
Just _ -> pure []
Nothing -> do
toks <-
choice $
Parsec.try
<$> [ mathsBracket
, mathsParens
, escape -- maths go before escape to avoid mismatch
, headers
, newlineToken
, spaceToken
, link
, labeledLink
, modules
, anchors
, textElement
, quotes
, birdTrack
, other
]
rest <- go
pure (toks <> rest)
-- Tokens
textElement :: Parser [LocatedToken]
textElement =
choice $
Parsec.try
<$> [ emphasis
, bold
, monospace
]
choice $
Parsec.try
<$> [ emphasis
, bold
, monospace
]
headers :: Parser [LocatedToken]
headers =
choice $
Parsec.try
<$> [ header1
, header2
, header3
, header4
, header5
, header6
]
choice $
Parsec.try
<$> [ header1
, header2
, header3
, header4
, header5
, header6
]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
@ -165,99 +166,91 @@ header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
-- #anchors#
anchors :: Lexer
anchors = do
pos <- getPosition
void $ try anchor'
pos' <- getPosition
txt <- anyUntil anchor'
pos'' <- getPosition
void $ try anchor'
pos <- getPosition
void $ try anchor'
txt <- anyUntil anchor'
void $ try anchor'
pure
[ (pos, Anchor)
, (pos', Token txt)
, (pos'', Anchor)
]
where
anchor' = (string "#" <|> string "\\#")
pure [(pos, Anchor txt)]
where
anchor' = (string "#" <|> string "\\#")
-- "Module.Name"
-- "Module.Name#anchor"
-- "Module.Name#anchor"
modules :: Lexer
modules = do
pos <- getPosition
void $ char '"'
pos' <- getPosition
modName <- modId
anch <- option [] do
pos'' <- getPosition
void $ try (string "#" <|> string "\\#")
pos''' <- getPosition
a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(pos'', Anchor), (pos''', Token a)]
pos <- getPosition
void $ char '"'
pos' <- getPosition
modName <- modId
anch <- option [] do
pos'' <- getPosition
void $ try (string "#" <|> string "\\#")
a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(pos'', Anchor a)]
void $ char '"'
pure $ [(pos, Module), (pos', Token modName)] <> anch
where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
void $ char '"'
pure $ [(pos, Module), (pos', Token modName)] <> anch
where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
conId :: Parser String
conId =
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conId :: Parser String
conId =
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
linkRaw :: Lexer
linkRaw = do
pos1 <- getPosition
void $ string "["
pos2 <- getPosition
text <- anyUntil $ Text.pack <$> string "]"
pos3 <- getPosition
void $ "]"
pos4 <- getPosition
void $ "("
pos5 <- getPosition
link' <- anyUntil $ Text.pack <$> string ")"
pos6 <- getPosition
void $ ")"
pos1 <- getPosition
void $ string "["
pos2 <- getPosition
text <- anyUntil $ Text.pack <$> string "]"
pos3 <- getPosition
void $ "]"
pos4 <- getPosition
void $ "("
pos5 <- getPosition
link' <- anyUntil $ Text.pack <$> string ")"
pos6 <- getPosition
void $ ")"
pure $
[ (pos1, BracketOpen)
, (pos2, Token text)
, (pos3, BracketClose)
, (pos4, ParenOpen)
, (pos5, Token link')
, (pos6, ParenClose)
]
pure $
[ (pos1, BracketOpen)
, (pos2, Token text)
, (pos3, BracketClose)
, (pos4, ParenOpen)
, (pos5, Token link')
, (pos6, ParenClose)
]
link :: Lexer
link = do
pos <- getPosition
l <- linkRaw
-- "unconsume" the last token
pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
pos <- getPosition
l <- linkRaw
-- "unconsume" the last token
pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer
labeledLink = do
pos <- getPosition
void $ string "<"
link' <- linkRaw
pos7 <- getPosition
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
pos <- getPosition
void $ string "<"
link' <- linkRaw
pos7 <- getPosition
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
pure $
(pos, LabeledLinkOpen)
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
pure $
(pos, LabeledLinkOpen)
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
mathsBracket :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
@ -285,23 +278,23 @@ monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
other :: Lexer
other = do
pos <- getPosition
c <- takeWhile1_ isUnicodeAlphaNum
pure . pure $ (pos, Token c)
where
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
pos <- getPosition
c <- takeWhile1_ isUnicodeAlphaNum
pure . pure $ (pos, Token c)
where
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
spaceToken :: Lexer
spaceToken = do
pos <- getPosition
_ <- many1 (char ' ')
pure . pure $ (pos, Space)
pos <- getPosition
_ <- many1 (char ' ')
pure . pure $ (pos, Space)
newlineToken :: Lexer
newlineToken = do
pos <- getPosition
_ <- newline
pure . pure $ (pos, Newline)
pos <- getPosition
_ <- newline
pure . pure $ (pos, Newline)
-------
-- Helpers
@ -310,11 +303,11 @@ newlineToken = do
-- | Like `takeWhile`, but unconditionally take escaped characters.
takeWhile_ :: (Char -> Bool) -> Parser Text
takeWhile_ p = scan p_ False
where
p_ escaped c
| escaped = Just False
| not $ p c = Nothing
| otherwise = Just (c == '\\')
where
p_ escaped c
| escaped = Just False
| not $ p c = Nothing
| otherwise = Just (c == '\\')
-- | Like 'takeWhile1', but unconditionally take escaped characters.
takeWhile1_ :: (Char -> Bool) -> Parser Text
@ -324,19 +317,19 @@ takeWhile1_ = mfilter (not . Text.null) . takeWhile_
function returns true.
-}
scan ::
-- | scan function
(state -> Char -> Maybe state) ->
-- | initial state
state ->
Parser Text
-- | scan function
(state -> Char -> Maybe state) ->
-- | initial state
state ->
Parser Text
scan f initState = do
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
(remaining, finalPos, ct) <- go input initState pos 0
let newState = parserState{stateInput = remaining, statePos = finalPos}
Parsec.setParserState newState $> Text.take ct input
where
go !input' !st !posAccum !count' = case Text.uncons input' of
Nothing -> pure (input', posAccum, count')
Just (char', input'') -> case f st char' of
Nothing -> pure (input', posAccum, count')
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
(remaining, finalPos, ct) <- go input initState pos 0
let newState = parserState{stateInput = remaining, statePos = finalPos}
Parsec.setParserState newState $> Text.take ct input
where
go !input' !st !posAccum !count' = case Text.uncons input' of
Nothing -> pure (input', posAccum, count')
Just (char', input'') -> case f st char' of
Nothing -> pure (input', posAccum, count')
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)