Compare commits

...
Sign in to create a new pull request.

13 commits

Author SHA1 Message Date
970b658926
chore(lexer): clean up 2025-09-24 22:32:52 +08:00
2597e693f1
ref(lexer): simplify labeledLink 2025-09-24 22:31:59 +08:00
29c015b793
style(lexer): make binding naming consistent 2025-09-24 22:25:45 +08:00
326c7b681c
fix(lexer): old anchor is only used in moduleName 2025-09-24 22:20:32 +08:00
c4d59d3236
ref(lexer): rename MathsBracket -> MathMultiline 2025-09-24 21:35:21 +08:00
6ec47dad04
ref(lexer): rename MathParen -> MathInline 2025-09-24 21:31:26 +08:00
f3b3b08919
style(lexer): use "open" "close" in the type 2025-09-24 21:28:17 +08:00
6c0b4a4288
doc(lexer): explain the use of incSourceColumn
I think it is clearer to phrase it this way so it is clear that we are
not unconsuming (i.e. changing the state of the parser).
2025-09-24 21:25:46 +08:00
75c4817166
style(lexer): pluralize moduleNames parser 2025-09-24 21:25:46 +08:00
7ceb9b0277
ref(moduleName): break into multiple smaller functions
upperId has been changed to only use isUpper because an non alphabetical
character would be false anyway
2025-09-24 21:25:46 +08:00
368e5bc9a0
ref(lexer): simplify anchor 2025-09-24 21:25:46 +08:00
d6087ec3d6
ref(lexer): simplify delimited logic 2025-09-24 21:25:46 +08:00
fdb9070e99
fix(lexer): handle crlf in newline
Do we support windows 🤔
2025-09-24 21:25:46 +08:00
3 changed files with 91 additions and 106 deletions

View file

@ -12,7 +12,7 @@ bold ::= '__' text_no_newline '__'
monospace ::= '@' text_content '@' monospace ::= '@' text_content '@'
link ::= module_link | hyperlink | markdown_link link ::= module_link | hyperlink | markdown_link
module_link ::= '"' module_name ( '#' anchor_name )? '"' module_link ::= '"' module_name ( ('#' | '\#') anchor_name )? '"'
hyperlink ::= '<' url ( ' ' link_text )? '>' hyperlink ::= '<' url ( ' ' link_text )? '>'
markdown_link ::= '[' link_text '](' ( url | module_link ) ')' markdown_link ::= '[' link_text '](' ( url | module_link ) ')'

View file

@ -10,12 +10,13 @@ import Control.Monad (mfilter, void)
import Data.Functor (($>)) import Data.Functor (($>))
import Data.Text (Text, intercalate) import Data.Text (Text, intercalate)
import Data.Text qualified as Text import Data.Text qualified as Text
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper) import GHC.Unicode (isAlphaNum, isControl, isPrint, isSpace, isUpper)
import ParserMonad (Parser, initialParserState) import ParserMonad (Parser, initialParserState)
import Text.Parsec import Text.Parsec
import Text.Parsec qualified as Parsec import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar) import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token) type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken] type Lexer = Parser [LocatedToken]
@ -50,12 +51,12 @@ data Token
| ParenClose | ParenClose
| BracketOpen | BracketOpen
| BracketClose | BracketClose
| MathsParenOpen | MathInlineOpen
| MathsParenClose | MathInlineClose
| MathsBracketOpen | MathMultilineOpen
| MathsBracketClose | MathMultilineClose
| NumericEntity Int | NumericEntity Int
| Module | Module Text
| QuoteOpen | QuoteOpen
| QuoteClose | QuoteClose
| Space | Space
@ -65,9 +66,6 @@ data Token
located :: Parser a -> Parser (SourcePos, a) located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p located p = (,) <$> getPosition <*> p
startPosition :: Parser a -> Parser SourcePos
startPosition = fmap fst . located
tokenise :: [Parser a] -> Parser [(SourcePos, a)] tokenise :: [Parser a] -> Parser [(SourcePos, a)]
tokenise = sequence . map located tokenise = sequence . map located
@ -84,16 +82,16 @@ lexText = go
toks <- toks <-
choice $ choice $
Parsec.try Parsec.try
<$> [ mathsBracket <$> [ mathMultiline
, mathsParens , mathInline
, escape -- maths go before escape to avoid mismatch , escape -- maths go before escape to avoid mismatch
, headers , headers
, newlineToken , newlineToken
, spaceToken , spaceToken
, link , link
, labeledLink , labeledLink
, modules , module_
, anchors , anchor
, textElement , textElement
, quotes , quotes
, birdTrack , birdTrack
@ -125,90 +123,84 @@ headers =
, header6 , header6
] ]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken] delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimited a b c d = delimitedMaybe a b c (Just d) delimitedAsTuple openP closeP =
(,,)
<$> located openP
<*> located (Token <$> anyUntil closeP)
<*> located closeP
delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
where
asList (a, tok, b) = [a, tok, b]
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
where
asList (a, tok, _) = [a, tok]
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s t1 t2 = delimited s s t1 t2 delimitedSymmetric s t1 t2 = delimited s s t1 t2
eol :: Parser () eol :: Parser ()
eol = void "\n" <|> Parsec.eof eol = void "\n" <|> void "\r\n" <|> Parsec.eof
header1 :: Lexer header1 :: Lexer
header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer header2 :: Lexer
header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer header3 :: Lexer
header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer header4 :: Lexer
header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer header5 :: Lexer
header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer header6 :: Lexer
header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing header6 = delimitedNoTrailing "====== " eol (Header Six)
-- #anchors# -- #anchors#
anchors :: Lexer anchor :: Lexer
anchors = do anchor = do
pos <- getPosition x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
void $ try anchor' pure [x]
txt <- anyUntil anchor'
void $ try anchor'
pure [(pos, Anchor txt)]
where moduleNames :: Parser Text
anchor' = (string "#" <|> string "\\#") moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name" -- "Module.Name"
-- "Module.Name#anchor" -- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben -- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
modules :: Lexer module_ :: Lexer
modules = do module_ = between (char '"') (char '"') inner
startPos <- startPosition $ char '"'
(modPos, modName) <- located modId
anch <- option [] do
anchPos <- startPosition (string "#" <|> string' "\\#")
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(anchPos, Anchor txt)]
void $ char '"'
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
where where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) inner = do
m <- located $ Module <$> moduleNames
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
pure $ case mAnchor of
Just anc -> [m, anc]
Nothing -> [m]
conId :: Parser String anchorHash :: Parser Text
conId = anchorHash = "#" <|> try "\\#"
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conChar :: Parser Char anchorText :: Parser Text
conChar = satisfy (\c -> isAlphaNum c || c == '_') anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
linkRaw :: Lexer linkRaw :: Lexer
linkRaw = linkRaw =
@ -225,38 +217,30 @@ link :: Lexer
link = do link = do
pos <- getPosition pos <- getPosition
l <- linkRaw l <- linkRaw
-- "unconsume" the last token -- register the position of the last token
pos' <- flip incSourceColumn (-1) <$> getPosition pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer labeledLink :: Lexer
labeledLink = do labeledLink = do
pos <- getPosition open <- located $ LabeledLinkOpen <$ "<"
void $ string "<" linkRes <- linkRaw
link' <- linkRaw labelRes <- located $ Token <$> anyUntil ">"
pos7 <- getPosition close <- located $ LabeledLinkClose <$ ">"
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
pure $ pure $
(pos, LabeledLinkOpen) open : linkRes <> [ labelRes , close ]
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
mathsBracket :: Lexer mathMultiline :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
mathsParens :: Lexer mathInline :: Lexer
mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
birdTrack :: Lexer birdTrack :: Lexer
birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing birdTrack = delimitedNoTrailing ">> " eol BirdTrack
escape :: Lexer escape :: Lexer
escape = delimitedMaybe (void "\\") eol Escape Nothing escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose

View file

@ -19,8 +19,8 @@ main = hspec $ do
describe "minimal" do describe "minimal" do
it "handles unicode" unicode it "handles unicode" unicode
it "escapes" escaping it "escapes" escaping
it "maths" maths it "maths" math
it "anchors" anchors it "anchors" anchor
it "space chars" space it "space chars" space
it "bare string" someString it "bare string" someString
it "emphasis" emphatic it "emphasis" emphatic
@ -45,19 +45,16 @@ main = hspec $ do
modules :: Expectation modules :: Expectation
modules = do modules = do
"\"MyModule.Name\"" "\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "MyModule.Name")
, (1, 2, Token "MyModule.Name")
] ]
"\"OtherModule.Name#myAnchor\"" "\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor") , (1, 18, Anchor "myAnchor")
] ]
"\"OtherModule.Name\\#myAnchor\"" "\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor") , (1, 18, Anchor "myAnchor")
] ]
link :: Expectation link :: Expectation
@ -87,31 +84,35 @@ labeledLink =
, (1, 35, LabeledLinkClose) , (1, 35, LabeledLinkClose)
] ]
anchors :: Expectation anchor :: Expectation
anchors = anchor =
"#myAnchor#" "#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor "myAnchor") `shouldLexTo` [ (1, 1, Anchor "myAnchor")
] ]
maths :: IO () math :: IO ()
maths = do math = do
"\\[some math\\]" "\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen) `shouldLexTo` [ (1, 1, MathMultilineOpen)
, (1, 3, Token "some math") , (1, 3, Token "some math")
, (1, 12, MathsBracketClose) , (1, 12, MathMultilineClose)
] ]
"\\(other maths\\)" "\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen) `shouldLexTo` [ (1, 1, MathInlineOpen)
, (1, 3, Token "other maths") , (1, 3, Token "other maths")
, (1, 14, MathsParenClose) , (1, 14, MathInlineClose)
] ]
escaping :: Expectation escaping :: Expectation
escaping = escaping = do
"\\(" "\\("
`shouldLexTo` [ (1, 1, Escape) `shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(") , (1, 2, Token "(")
] ]
"\\(\r\n"
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
unicode :: Expectation unicode :: Expectation
unicode = unicode =