Compare commits

...
Sign in to create a new pull request.

13 commits

Author SHA1 Message Date
970b658926
chore(lexer): clean up 2025-09-24 22:32:52 +08:00
2597e693f1
ref(lexer): simplify labeledLink 2025-09-24 22:31:59 +08:00
29c015b793
style(lexer): make binding naming consistent 2025-09-24 22:25:45 +08:00
326c7b681c
fix(lexer): old anchor is only used in moduleName 2025-09-24 22:20:32 +08:00
c4d59d3236
ref(lexer): rename MathsBracket -> MathMultiline 2025-09-24 21:35:21 +08:00
6ec47dad04
ref(lexer): rename MathParen -> MathInline 2025-09-24 21:31:26 +08:00
f3b3b08919
style(lexer): use "open" "close" in the type 2025-09-24 21:28:17 +08:00
6c0b4a4288
doc(lexer): explain the use of incSourceColumn
I think it is clearer to phrase it this way so it is clear that we are
not unconsuming (i.e. changing the state of the parser).
2025-09-24 21:25:46 +08:00
75c4817166
style(lexer): pluralize moduleNames parser 2025-09-24 21:25:46 +08:00
7ceb9b0277
ref(moduleName): break into multiple smaller functions
upperId has been changed to only use isUpper because an non alphabetical
character would be false anyway
2025-09-24 21:25:46 +08:00
368e5bc9a0
ref(lexer): simplify anchor 2025-09-24 21:25:46 +08:00
d6087ec3d6
ref(lexer): simplify delimited logic 2025-09-24 21:25:46 +08:00
fdb9070e99
fix(lexer): handle crlf in newline
Do we support windows 🤔
2025-09-24 21:25:46 +08:00
3 changed files with 91 additions and 106 deletions

View file

@ -12,7 +12,7 @@ bold ::= '__' text_no_newline '__'
monospace ::= '@' text_content '@'
link ::= module_link | hyperlink | markdown_link
module_link ::= '"' module_name ( '#' anchor_name )? '"'
module_link ::= '"' module_name ( ('#' | '\#') anchor_name )? '"'
hyperlink ::= '<' url ( ' ' link_text )? '>'
markdown_link ::= '[' link_text '](' ( url | module_link ) ')'

View file

@ -10,12 +10,13 @@ import Control.Monad (mfilter, void)
import Data.Functor (($>))
import Data.Text (Text, intercalate)
import Data.Text qualified as Text
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper)
import GHC.Unicode (isAlphaNum, isControl, isPrint, isSpace, isUpper)
import ParserMonad (Parser, initialParserState)
import Text.Parsec
import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]
@ -50,12 +51,12 @@ data Token
| ParenClose
| BracketOpen
| BracketClose
| MathsParenOpen
| MathsParenClose
| MathsBracketOpen
| MathsBracketClose
| MathInlineOpen
| MathInlineClose
| MathMultilineOpen
| MathMultilineClose
| NumericEntity Int
| Module
| Module Text
| QuoteOpen
| QuoteClose
| Space
@ -65,9 +66,6 @@ data Token
located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p
startPosition :: Parser a -> Parser SourcePos
startPosition = fmap fst . located
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
tokenise = sequence . map located
@ -84,16 +82,16 @@ lexText = go
toks <-
choice $
Parsec.try
<$> [ mathsBracket
, mathsParens
<$> [ mathMultiline
, mathInline
, escape -- maths go before escape to avoid mismatch
, headers
, newlineToken
, spaceToken
, link
, labeledLink
, modules
, anchors
, module_
, anchor
, textElement
, quotes
, birdTrack
@ -125,90 +123,84 @@ headers =
, header6
]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken]
delimited a b c d = delimitedMaybe a b c (Just d)
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimitedAsTuple openP closeP =
(,,)
<$> located openP
<*> located (Token <$> anyUntil closeP)
<*> located closeP
delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
where
asList (a, tok, b) = [a, tok, b]
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
where
asList (a, tok, _) = [a, tok]
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s t1 t2 = delimited s s t1 t2
eol :: Parser ()
eol = void "\n" <|> Parsec.eof
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
header1 :: Lexer
header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing
header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer
header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing
header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer
header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing
header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer
header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing
header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer
header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing
header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer
header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
header6 = delimitedNoTrailing "====== " eol (Header Six)
-- #anchors#
anchors :: Lexer
anchors = do
pos <- getPosition
void $ try anchor'
txt <- anyUntil anchor'
void $ try anchor'
anchor :: Lexer
anchor = do
x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
pure [x]
pure [(pos, Anchor txt)]
where
anchor' = (string "#" <|> string "\\#")
moduleNames :: Parser Text
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name"
-- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
modules :: Lexer
modules = do
startPos <- startPosition $ char '"'
(modPos, modName) <- located modId
anch <- option [] do
anchPos <- startPosition (string "#" <|> string' "\\#")
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(anchPos, Anchor txt)]
void $ char '"'
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
-- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
module_ :: Lexer
module_ = between (char '"') (char '"') inner
where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
inner = do
m <- located $ Module <$> moduleNames
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
pure $ case mAnchor of
Just anc -> [m, anc]
Nothing -> [m]
conId :: Parser String
conId =
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
anchorHash :: Parser Text
anchorHash = "#" <|> try "\\#"
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
anchorText :: Parser Text
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
linkRaw :: Lexer
linkRaw =
@ -225,38 +217,30 @@ link :: Lexer
link = do
pos <- getPosition
l <- linkRaw
-- "unconsume" the last token
-- register the position of the last token
pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer
labeledLink = do
pos <- getPosition
void $ string "<"
link' <- linkRaw
pos7 <- getPosition
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
open <- located $ LabeledLinkOpen <$ "<"
linkRes <- linkRaw
labelRes <- located $ Token <$> anyUntil ">"
close <- located $ LabeledLinkClose <$ ">"
pure $
(pos, LabeledLinkOpen)
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
open : linkRes <> [ labelRes , close ]
mathsBracket :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
mathMultiline :: Lexer
mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
mathsParens :: Lexer
mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose
mathInline :: Lexer
mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
birdTrack :: Lexer
birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing
birdTrack = delimitedNoTrailing ">> " eol BirdTrack
escape :: Lexer
escape = delimitedMaybe (void "\\") eol Escape Nothing
escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose

View file

@ -19,8 +19,8 @@ main = hspec $ do
describe "minimal" do
it "handles unicode" unicode
it "escapes" escaping
it "maths" maths
it "anchors" anchors
it "maths" math
it "anchors" anchor
it "space chars" space
it "bare string" someString
it "emphasis" emphatic
@ -45,19 +45,16 @@ main = hspec $ do
modules :: Expectation
modules = do
"\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "MyModule.Name")
`shouldLexTo` [ (1, 2, Module "MyModule.Name")
]
"\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
`shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
"\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
`shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
link :: Expectation
@ -87,31 +84,35 @@ labeledLink =
, (1, 35, LabeledLinkClose)
]
anchors :: Expectation
anchors =
anchor :: Expectation
anchor =
"#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor "myAnchor")
]
maths :: IO ()
maths = do
math :: IO ()
math = do
"\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen)
`shouldLexTo` [ (1, 1, MathMultilineOpen)
, (1, 3, Token "some math")
, (1, 12, MathsBracketClose)
, (1, 12, MathMultilineClose)
]
"\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen)
`shouldLexTo` [ (1, 1, MathInlineOpen)
, (1, 3, Token "other maths")
, (1, 14, MathsParenClose)
, (1, 14, MathInlineClose)
]
escaping :: Expectation
escaping =
escaping = do
"\\("
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
"\\(\r\n"
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
unicode :: Expectation
unicode =