refactor lexer (#2)

Reduce manual usage of getPosition and setting it and improve some helper functions.

Reviewed-on: elland/haddock2#2
Reviewed-by: elland <igor@elland.me>
Co-authored-by: Léana 江 <leana.jiang+git@icloud.com>
Co-committed-by: Léana 江 <leana.jiang+git@icloud.com>
This commit is contained in:
Primrose 2025-09-26 14:44:46 +00:00 committed by Primrose
parent 39cfe2035d
commit 6064afd0b9
3 changed files with 91 additions and 106 deletions

View file

@ -12,7 +12,7 @@ bold ::= '__' text_no_newline '__'
monospace ::= '@' text_content '@' monospace ::= '@' text_content '@'
link ::= module_link | hyperlink | markdown_link link ::= module_link | hyperlink | markdown_link
module_link ::= '"' module_name ( '#' anchor_name )? '"' module_link ::= '"' module_name ( ('#' | '\#') anchor_name )? '"'
hyperlink ::= '<' url ( ' ' link_text )? '>' hyperlink ::= '<' url ( ' ' link_text )? '>'
markdown_link ::= '[' link_text '](' ( url | module_link ) ')' markdown_link ::= '[' link_text '](' ( url | module_link ) ')'

View file

@ -10,12 +10,13 @@ import Control.Monad (mfilter, void)
import Data.Functor (($>)) import Data.Functor (($>))
import Data.Text (Text, intercalate) import Data.Text (Text, intercalate)
import Data.Text qualified as Text import Data.Text qualified as Text
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper) import GHC.Unicode (isAlphaNum, isControl, isPrint, isSpace, isUpper)
import ParserMonad (Parser, initialParserState) import ParserMonad (Parser, initialParserState)
import Text.Parsec import Text.Parsec
import Text.Parsec qualified as Parsec import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar) import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token) type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken] type Lexer = Parser [LocatedToken]
@ -50,12 +51,12 @@ data Token
| ParenClose | ParenClose
| BracketOpen | BracketOpen
| BracketClose | BracketClose
| MathsParenOpen | MathInlineOpen
| MathsParenClose | MathInlineClose
| MathsBracketOpen | MathMultilineOpen
| MathsBracketClose | MathMultilineClose
| NumericEntity Int | NumericEntity Int
| Module | Module Text
| QuoteOpen | QuoteOpen
| QuoteClose | QuoteClose
| Space | Space
@ -65,9 +66,6 @@ data Token
located :: Parser a -> Parser (SourcePos, a) located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p located p = (,) <$> getPosition <*> p
startPosition :: Parser a -> Parser SourcePos
startPosition = fmap fst . located
tokenise :: [Parser a] -> Parser [(SourcePos, a)] tokenise :: [Parser a] -> Parser [(SourcePos, a)]
tokenise = sequence . map located tokenise = sequence . map located
@ -84,16 +82,16 @@ lexText = go
toks <- toks <-
choice $ choice $
Parsec.try Parsec.try
<$> [ mathsBracket <$> [ mathMultiline
, mathsParens , mathInline
, escape -- maths go before escape to avoid mismatch , escape -- maths go before escape to avoid mismatch
, headers , headers
, newlineToken , newlineToken
, spaceToken , spaceToken
, link , link
, labeledLink , labeledLink
, modules , module_
, anchors , anchor
, textElement , textElement
, quotes , quotes
, birdTrack , birdTrack
@ -125,90 +123,84 @@ headers =
, header6 , header6
] ]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken] delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimited a b c d = delimitedMaybe a b c (Just d) delimitedAsTuple openP closeP =
(,,)
<$> located openP
<*> located (Token <$> anyUntil closeP)
<*> located closeP
delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
where
asList (a, tok, b) = [a, tok, b]
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
where
asList (a, tok, _) = [a, tok]
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s t1 t2 = delimited s s t1 t2 delimitedSymmetric s t1 t2 = delimited s s t1 t2
eol :: Parser () eol :: Parser ()
eol = void "\n" <|> Parsec.eof eol = void "\n" <|> void "\r\n" <|> Parsec.eof
header1 :: Lexer header1 :: Lexer
header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer header2 :: Lexer
header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer header3 :: Lexer
header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer header4 :: Lexer
header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer header5 :: Lexer
header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer header6 :: Lexer
header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing header6 = delimitedNoTrailing "====== " eol (Header Six)
-- #anchors# -- #anchors#
anchors :: Lexer anchor :: Lexer
anchors = do anchor = do
pos <- getPosition x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
void $ try anchor' pure [x]
txt <- anyUntil anchor'
void $ try anchor'
pure [(pos, Anchor txt)]
where moduleNames :: Parser Text
anchor' = (string "#" <|> string "\\#") moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name" -- "Module.Name"
-- "Module.Name#anchor" -- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben -- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
modules :: Lexer module_ :: Lexer
modules = do module_ = between (char '"') (char '"') inner
startPos <- startPosition $ char '"'
(modPos, modName) <- located modId
anch <- option [] do
anchPos <- startPosition (string "#" <|> string' "\\#")
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(anchPos, Anchor txt)]
void $ char '"'
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
where where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) inner = do
m <- located $ Module <$> moduleNames
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
pure $ case mAnchor of
Just anc -> [m, anc]
Nothing -> [m]
conId :: Parser String anchorHash :: Parser Text
conId = anchorHash = "#" <|> try "\\#"
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conChar :: Parser Char anchorText :: Parser Text
conChar = satisfy (\c -> isAlphaNum c || c == '_') anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
linkRaw :: Lexer linkRaw :: Lexer
linkRaw = linkRaw =
@ -225,38 +217,30 @@ link :: Lexer
link = do link = do
pos <- getPosition pos <- getPosition
l <- linkRaw l <- linkRaw
-- "unconsume" the last token -- register the position of the last token
pos' <- flip incSourceColumn (-1) <$> getPosition pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer labeledLink :: Lexer
labeledLink = do labeledLink = do
pos <- getPosition open <- located $ LabeledLinkOpen <$ "<"
void $ string "<" linkRes <- linkRaw
link' <- linkRaw labelRes <- located $ Token <$> anyUntil ">"
pos7 <- getPosition close <- located $ LabeledLinkClose <$ ">"
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
pure $ pure $
(pos, LabeledLinkOpen) open : linkRes <> [ labelRes , close ]
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
mathsBracket :: Lexer mathMultiline :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
mathsParens :: Lexer mathInline :: Lexer
mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
birdTrack :: Lexer birdTrack :: Lexer
birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing birdTrack = delimitedNoTrailing ">> " eol BirdTrack
escape :: Lexer escape :: Lexer
escape = delimitedMaybe (void "\\") eol Escape Nothing escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose

View file

@ -19,8 +19,8 @@ main = hspec $ do
describe "minimal" do describe "minimal" do
it "handles unicode" unicode it "handles unicode" unicode
it "escapes" escaping it "escapes" escaping
it "maths" maths it "maths" math
it "anchors" anchors it "anchors" anchor
it "space chars" space it "space chars" space
it "bare string" someString it "bare string" someString
it "emphasis" emphatic it "emphasis" emphatic
@ -45,19 +45,16 @@ main = hspec $ do
modules :: Expectation modules :: Expectation
modules = do modules = do
"\"MyModule.Name\"" "\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "MyModule.Name")
, (1, 2, Token "MyModule.Name")
] ]
"\"OtherModule.Name#myAnchor\"" "\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor") , (1, 18, Anchor "myAnchor")
] ]
"\"OtherModule.Name\\#myAnchor\"" "\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor") , (1, 18, Anchor "myAnchor")
] ]
link :: Expectation link :: Expectation
@ -87,31 +84,35 @@ labeledLink =
, (1, 35, LabeledLinkClose) , (1, 35, LabeledLinkClose)
] ]
anchors :: Expectation anchor :: Expectation
anchors = anchor =
"#myAnchor#" "#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor "myAnchor") `shouldLexTo` [ (1, 1, Anchor "myAnchor")
] ]
maths :: IO () math :: IO ()
maths = do math = do
"\\[some math\\]" "\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen) `shouldLexTo` [ (1, 1, MathMultilineOpen)
, (1, 3, Token "some math") , (1, 3, Token "some math")
, (1, 12, MathsBracketClose) , (1, 12, MathMultilineClose)
] ]
"\\(other maths\\)" "\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen) `shouldLexTo` [ (1, 1, MathInlineOpen)
, (1, 3, Token "other maths") , (1, 3, Token "other maths")
, (1, 14, MathsParenClose) , (1, 14, MathInlineClose)
] ]
escaping :: Expectation escaping :: Expectation
escaping = escaping = do
"\\(" "\\("
`shouldLexTo` [ (1, 1, Escape) `shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(") , (1, 2, Token "(")
] ]
"\\(\r\n"
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
unicode :: Expectation unicode :: Expectation
unicode = unicode =