2025-09-26 14:44:47 +00:00 · 2025-09-24 13:05:36 +00:00 · 2025-09-24 13:28:04 +00:00 · 2025-09-24 13:14:30 +00:00 · 2025-09-24 13:29:13 +00:00 · 2025-09-24 16:03:34 +00:00
3 changed files with 91 additions and 106 deletions
--- a/Grammar.ebnf
+++ b/Grammar.ebnf
@ -12,7 +12,7 @@ bold ::= '__' text_no_newline '__'
 monospace ::= '@' text_content '@'

 link ::= module_link | hyperlink | markdown_link
-module_link ::= '"' module_name ( '#' anchor_name )? '"'
+module_link ::= '"' module_name ( ('#' | '\#')  anchor_name )? '"'
 hyperlink ::= '<' url ( ' ' link_text )? '>'
 markdown_link ::= '[' link_text '](' ( url | module_link ) ')'

--- a/src/Lexer.hs
+++ b/src/Lexer.hs
@ -10,12 +10,13 @@ import Control.Monad (mfilter, void)
 import Data.Functor (($>))
 import Data.Text (Text, intercalate)
 import Data.Text qualified as Text
-import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper)
+import GHC.Unicode (isAlphaNum, isControl, isPrint, isSpace, isUpper)
 import ParserMonad (Parser, initialParserState)
 import Text.Parsec
 import Text.Parsec qualified as Parsec
 import Text.Parsec.Pos (updatePosChar)

+type Located a = (SourcePos, a)
 type LocatedToken = (SourcePos, Token)

 type Lexer = Parser [LocatedToken]
@ -50,12 +51,12 @@ data Token
    | ParenClose
    | BracketOpen
    | BracketClose
-    | MathsParenOpen
-    | MathsParenClose
-    | MathsBracketOpen
-    | MathsBracketClose
+    | MathInlineOpen
+    | MathInlineClose
+    | MathMultilineOpen
+    | MathMultilineClose
    | NumericEntity Int
-    | Module
+    | Module Text
    | QuoteOpen
    | QuoteClose
    | Space
@ -65,9 +66,6 @@ data Token
 located :: Parser a -> Parser (SourcePos, a)
 located p = (,) <$> getPosition <*> p

-startPosition :: Parser a -> Parser SourcePos
-startPosition = fmap fst . located
-
 tokenise :: [Parser a] -> Parser [(SourcePos, a)]
 tokenise = sequence . map located

@ -84,16 +82,16 @@ lexText = go
                toks <-
                    choice $
                        Parsec.try
-                            <$> [ mathsBracket
-                                , mathsParens
+                            <$> [ mathMultiline
+                                , mathInline
                                , escape -- maths go before escape to avoid mismatch
                                , headers
                                , newlineToken
                                , spaceToken
                                , link
                                , labeledLink
-                                , modules
-                                , anchors
+                                , module_
+                                , anchor
                                , textElement
                                , quotes
                                , birdTrack
@ -125,90 +123,84 @@ headers =
                , header6
                ]

-delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
-delimitedMaybe openMark closeMark openToken closeToken = do
-    openPos <- getPosition
-    void openMark
-    tokenPos <- getPosition
-    content <- anyUntil closeMark
-    closePos <- getPosition
-    void closeMark
-
-    let openTok :: LocatedToken = (openPos, openToken)
-        res :: LocatedToken = (tokenPos, Token content)
-        closeToks :: [LocatedToken] = case closeToken of
-            Just close -> [(closePos, close)]
-            Nothing -> []
-
-    pure $ [openTok, res] <> closeToks
-
 anyUntil :: Parser a -> Parser Text
 anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)

-delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken]
-delimited a b c d = delimitedMaybe a b c (Just d)
+delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
+delimitedAsTuple openP closeP =
+    (,,)
+        <$> located openP
+        <*> located (Token <$> anyUntil closeP)
+        <*> located closeP
+
+delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
+delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
+    where
+        asList (a, tok, b) = [a, tok, b]
+
+delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
+delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
+    where
+        asList (a, tok, _) = [a, tok]

 delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
 delimitedSymmetric s t1 t2 = delimited s s t1 t2

 eol :: Parser ()
-eol = void "\n" <|> Parsec.eof
+eol = void "\n" <|> void "\r\n" <|> Parsec.eof

 header1 :: Lexer
-header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing
+header1 = delimitedNoTrailing "= " eol (Header One)

 header2 :: Lexer
-header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing
+header2 = delimitedNoTrailing "== " eol (Header Two)

 header3 :: Lexer
-header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing
+header3 = delimitedNoTrailing "=== " eol (Header Three)

 header4 :: Lexer
-header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing
+header4 = delimitedNoTrailing "==== " eol (Header Four)

 header5 :: Lexer
-header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing
+header5 = delimitedNoTrailing "===== " eol (Header Five)

 header6 :: Lexer
-header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
+header6 = delimitedNoTrailing "====== " eol (Header Six)

 -- #anchors#
-anchors :: Lexer
-anchors = do
-    pos <- getPosition
-    void $ try anchor'
-    txt <- anyUntil anchor'
-    void $ try anchor'
+anchor :: Lexer
+anchor = do
+    x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
+    pure [x]

-    pure [(pos, Anchor txt)]
-  where
-    anchor' = (string "#" <|> string "\\#")
+
+moduleNames :: Parser Text
+moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
+
+upperId :: Parser String
+upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
+
+identifierChar :: Parser Char
+identifierChar = satisfy (\c -> isAlphaNum c || c == '_')

 -- "Module.Name"
 -- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
-modules :: Lexer
-modules = do
-    startPos <- startPosition $ char '"'
-    (modPos, modName) <- located modId
-    anch <- option [] do
-        anchPos <- startPosition (string "#" <|> string' "\\#")
-        txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
-        pure [(anchPos, Anchor txt)]
-
-    void $ char '"'
-    pure $ [(startPos, Module), (modPos, Token modName)] <> anch
+-- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
+module_ :: Lexer
+module_ = between (char '"') (char '"') inner
  where
-    modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
+    inner = do
+        m <- located $ Module <$> moduleNames
+        mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
+        pure $ case mAnchor of
+            Just anc -> [m, anc]
+            Nothing -> [m]

-    conId :: Parser String
-    conId =
-        (:)
-            <$> satisfy (\c -> isAlpha c && isUpper c)
-            <*> many1 conChar
+    anchorHash :: Parser Text
+    anchorHash = "#" <|> try "\\#"

-    conChar :: Parser Char
-    conChar = satisfy (\c -> isAlphaNum c || c == '_')
+    anchorText :: Parser Text
+    anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))

 linkRaw :: Lexer
 linkRaw =
@ -225,38 +217,30 @@ link :: Lexer
 link = do
    pos <- getPosition
    l <- linkRaw
-    -- "unconsume" the last token
+    -- register the position of the last token
    pos' <- flip incSourceColumn (-1) <$> getPosition
    pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]

 labeledLink :: Lexer
 labeledLink = do
-    pos <- getPosition
-    void $ string "<"
-    link' <- linkRaw
-    pos7 <- getPosition
-    label' <- anyUntil $ string ">"
-    pos8 <- getPosition
-    void $ ">"
-
+    open <- located $ LabeledLinkOpen <$ "<"
+    linkRes <- linkRaw
+    labelRes <- located $ Token <$> anyUntil ">"
+    close <- located $ LabeledLinkClose <$ ">"
    pure $
-        (pos, LabeledLinkOpen)
-            : link'
-                <> [ (pos7, Token label')
-                   , (pos8, LabeledLinkClose)
-                   ]
+        open : linkRes <> [ labelRes , close ]

-mathsBracket :: Lexer
-mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
+mathMultiline :: Lexer
+mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose

-mathsParens :: Lexer
-mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose
+mathInline :: Lexer
+mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose

 birdTrack :: Lexer
-birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing
+birdTrack = delimitedNoTrailing ">> " eol BirdTrack

 escape :: Lexer
-escape = delimitedMaybe (void "\\") eol Escape Nothing
+escape = delimitedNoTrailing "\\" eol Escape

 quotes :: Lexer
 quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -19,8 +19,8 @@ main = hspec $ do
        describe "minimal" do
            it "handles unicode" unicode
            it "escapes" escaping
-            it "maths" maths
-            it "anchors" anchors
+            it "maths" math
+            it "anchors" anchor
            it "space chars" space
            it "bare string" someString
            it "emphasis" emphatic
@ -45,19 +45,16 @@ main = hspec $ do
 modules :: Expectation
 modules = do
    "\"MyModule.Name\""
-        `shouldLexTo` [ (1, 1, Module)
-                      , (1, 2, Token "MyModule.Name")
+        `shouldLexTo` [ (1, 2, Module "MyModule.Name")
                      ]

    "\"OtherModule.Name#myAnchor\""
-        `shouldLexTo` [ (1, 1, Module)
-                      , (1, 2, Token "OtherModule.Name")
+        `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
                      , (1, 18, Anchor "myAnchor")
                      ]

    "\"OtherModule.Name\\#myAnchor\""
-        `shouldLexTo` [ (1, 1, Module)
-                      , (1, 2, Token "OtherModule.Name")
+        `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
                      , (1, 18, Anchor "myAnchor")
                      ]
 link :: Expectation
@ -87,31 +84,35 @@ labeledLink =
                      , (1, 35, LabeledLinkClose)
                      ]

-anchors :: Expectation
-anchors =
+anchor :: Expectation
+anchor =
    "#myAnchor#"
        `shouldLexTo` [ (1, 1, Anchor "myAnchor")
                      ]

-maths :: IO ()
-maths = do
+math :: IO ()
+math = do
    "\\[some math\\]"
-        `shouldLexTo` [ (1, 1, MathsBracketOpen)
+        `shouldLexTo` [ (1, 1, MathMultilineOpen)
                      , (1, 3, Token "some math")
-                      , (1, 12, MathsBracketClose)
+                      , (1, 12, MathMultilineClose)
                      ]
    "\\(other maths\\)"
-        `shouldLexTo` [ (1, 1, MathsParenOpen)
+        `shouldLexTo` [ (1, 1, MathInlineOpen)
                      , (1, 3, Token "other maths")
-                      , (1, 14, MathsParenClose)
+                      , (1, 14, MathInlineClose)
                      ]

 escaping :: Expectation
-escaping =
+escaping = do
    "\\("
        `shouldLexTo` [ (1, 1, Escape)
                      , (1, 2, Token "(")
                      ]
+    "\\(\r\n"
+        `shouldLexTo` [ (1, 1, Escape)
+                      , (1, 2, Token "(")
+                      ]

 unicode :: Expectation
 unicode =