refactor lexer #2

Merged
primrose merged 13 commits from primrose/haddock2:leana8959 into dev 2025-09-26 14:44:47 +00:00
3 changed files with 91 additions and 106 deletions

View file

@ -12,7 +12,7 @@ bold ::= '__' text_no_newline '__'
monospace ::= '@' text_content '@'
link ::= module_link | hyperlink | markdown_link
module_link ::= '"' module_name ( '#' anchor_name )? '"'
module_link ::= '"' module_name ( ('#' | '\#') anchor_name )? '"'
hyperlink ::= '<' url ( ' ' link_text )? '>'
markdown_link ::= '[' link_text '](' ( url | module_link ) ')'

View file

@ -10,12 +10,13 @@ import Control.Monad (mfilter, void)
import Data.Functor (($>))
import Data.Text (Text, intercalate)
import Data.Text qualified as Text
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper)
import GHC.Unicode (isAlphaNum, isControl, isPrint, isSpace, isUpper)
import ParserMonad (Parser, initialParserState)
import Text.Parsec
import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]
@ -50,12 +51,12 @@ data Token
| ParenClose
| BracketOpen
| BracketClose
| MathsParenOpen
| MathsParenClose
| MathsBracketOpen
| MathsBracketClose
| MathInlineOpen
| MathInlineClose
| MathMultilineOpen
| MathMultilineClose
| NumericEntity Int
| Module
| Module Text
| QuoteOpen
| QuoteClose
| Space
@ -65,9 +66,6 @@ data Token
located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p
startPosition :: Parser a -> Parser SourcePos
startPosition = fmap fst . located
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
tokenise = sequence . map located
@ -84,16 +82,16 @@ lexText = go
toks <-
choice $
Parsec.try
<$> [ mathsBracket
, mathsParens
<$> [ mathMultiline
, mathInline
, escape -- maths go before escape to avoid mismatch
, headers
, newlineToken
, spaceToken
, link
, labeledLink
, modules
, anchors
, module_
, anchor
, textElement
, quotes
, birdTrack
@ -125,90 +123,84 @@ headers =
, header6
]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken]
delimited a b c d = delimitedMaybe a b c (Just d)
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimitedAsTuple openP closeP =
(,,)
<$> located openP
<*> located (Token <$> anyUntil closeP)
<*> located closeP
delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
where
asList (a, tok, b) = [a, tok, b]

nice! do you want to do the same nice naming as delimitedAsTouple with open/close instead of a b?

nice! do you want to do the same nice naming as delimitedAsTouple with open/close instead of a b?

That's a good idea, will do

That's a good idea, will do
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
where
asList (a, tok, _) = [a, tok]
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s t1 t2 = delimited s s t1 t2
eol :: Parser ()
eol = void "\n" <|> Parsec.eof
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
header1 :: Lexer
header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing
header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer
header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing
header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer
header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing
header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer
header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing
header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer
header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing
header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer
header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
header6 = delimitedNoTrailing "====== " eol (Header Six)
-- #anchors#
anchors :: Lexer
anchors = do
pos <- getPosition
void $ try anchor'
txt <- anyUntil anchor'
void $ try anchor'
anchor :: Lexer
anchor = do
x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
pure [x]
pure [(pos, Anchor txt)]
where
anchor' = (string "#" <|> string "\\#")
moduleNames :: Parser Text
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name"
-- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
modules :: Lexer
modules = do
startPos <- startPosition $ char '"'
(modPos, modName) <- located modId
anch <- option [] do
anchPos <- startPosition (string "#" <|> string' "\\#")
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(anchPos, Anchor txt)]
void $ char '"'
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
-- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
module_ :: Lexer
module_ = between (char '"') (char '"') inner
where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
inner = do
m <- located $ Module <$> moduleNames
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
pure $ case mAnchor of
Just anc -> [m, anc]
Nothing -> [m]
conId :: Parser String
conId =
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
anchorHash :: Parser Text
anchorHash = "#" <|> try "\\#"
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
anchorText :: Parser Text
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
linkRaw :: Lexer
linkRaw =
@ -225,38 +217,30 @@ link :: Lexer
link = do
pos <- getPosition
l <- linkRaw
-- "unconsume" the last token
-- register the position of the last token
pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer
labeledLink = do
pos <- getPosition
void $ string "<"
link' <- linkRaw
pos7 <- getPosition
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
open <- located $ LabeledLinkOpen <$ "<"
linkRes <- linkRaw
labelRes <- located $ Token <$> anyUntil ">"
close <- located $ LabeledLinkClose <$ ">"
pure $
(pos, LabeledLinkOpen)
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
open : linkRes <> [ labelRes , close ]
mathsBracket :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
mathMultiline :: Lexer
mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
mathsParens :: Lexer
mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose
mathInline :: Lexer
mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
birdTrack :: Lexer
birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing
birdTrack = delimitedNoTrailing ">> " eol BirdTrack
escape :: Lexer
escape = delimitedMaybe (void "\\") eol Escape Nothing
escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose

View file

@ -19,8 +19,8 @@ main = hspec $ do
describe "minimal" do
it "handles unicode" unicode
it "escapes" escaping
it "maths" maths
it "anchors" anchors
it "maths" math
it "anchors" anchor
it "space chars" space
it "bare string" someString
it "emphasis" emphatic
@ -45,19 +45,16 @@ main = hspec $ do
modules :: Expectation
modules = do
"\"MyModule.Name\""

Now here I am a bit unsure about what we want. Should the position match the inner value or the delimiters?

Now here I am a bit unsure about what we _want_. Should the position match the inner value or the delimiters?

I actually don't know, should we even keep the quotes?

I actually don't know, should we even keep the quotes?

I wouldn’t keep it bc we for some representations we don’t need it and we can recreate for the ones we do.

I wouldn’t keep it bc we for some representations we don’t need it and we can recreate for the ones we do.
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "MyModule.Name")
`shouldLexTo` [ (1, 2, Module "MyModule.Name")
]
"\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
`shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
"\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
`shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
link :: Expectation
@ -87,31 +84,35 @@ labeledLink =
, (1, 35, LabeledLinkClose)
]
anchors :: Expectation
anchors =
anchor :: Expectation
anchor =
"#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor "myAnchor")
]
maths :: IO ()
maths = do
math :: IO ()
math = do
"\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen)
`shouldLexTo` [ (1, 1, MathMultilineOpen)
, (1, 3, Token "some math")
, (1, 12, MathsBracketClose)
, (1, 12, MathMultilineClose)
]
"\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen)
`shouldLexTo` [ (1, 1, MathInlineOpen)
, (1, 3, Token "other maths")
, (1, 14, MathsParenClose)
, (1, 14, MathInlineClose)
]
escaping :: Expectation
escaping =
escaping = do
"\\("
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
"\\(\r\n"
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
unicode :: Expectation
unicode =