From 8887476626fc9debe388a8382fcf52a9b01ce042 Mon Sep 17 00:00:00 2001 From: Igor Ranieri Date: Wed, 24 Sep 2025 09:48:01 +0200 Subject: [PATCH 1/2] Embed Anchor content inside token --- src/Lexer.hs | 385 ++++++++++++++++++++++++------------------------- test/Spec.hs | 227 ++++++++++++++--------------- test/markup.md | 12 +- 3 files changed, 311 insertions(+), 313 deletions(-) diff --git a/src/Lexer.hs b/src/Lexer.hs index c299d41..5b29aa8 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -1,9 +1,9 @@ {-# LANGUAGE OverloadedStrings #-} module Lexer ( - Token (..), - lexer, - emphasis, + Token (..), + lexer, + emphasis, ) where import Control.Monad (mfilter, void) @@ -21,116 +21,117 @@ type LocatedToken = (SourcePos, Token) type Lexer = Parser [LocatedToken] data Level - = One - | Two - | Three - | Four - | Five - | Six - deriving (Eq, Show) + = One + | Two + | Three + | Four + | Five + | Six + deriving (Eq, Show) data Token - = Token Text - | Anchor - | BirdTrack - | BoldOpen - | BoldClose - | Escape - | EmphasisOpen - | EmphasisClose - | Header Level - | MonospaceOpen - | MonospaceClose - | Newline - | LinkOpen - | LinkClose - | LabeledLinkOpen - | LabeledLinkClose - | ParenOpen - | ParenClose - | BracketOpen - | BracketClose - | MathsParenOpen - | MathsParenClose - | MathsBracketOpen - | MathsBracketClose - | Module - | QuoteOpen - | QuoteClose - | Space - | EOF - deriving (Eq, Show) + = Token Text + | Anchor Text + | BirdTrack + | BoldOpen + | BoldClose + | Escape + | EmphasisOpen + | EmphasisClose + | Header Level + | MonospaceOpen + | MonospaceClose + | Newline + | LinkOpen + | LinkClose + | LabeledLinkOpen + | LabeledLinkClose + | ParenOpen + | ParenClose + | BracketOpen + | BracketClose + | MathsParenOpen + | MathsParenClose + | MathsBracketOpen + | MathsBracketClose + | NumericEntity Int + | Module + | QuoteOpen + | QuoteClose + | Space + | EOF + deriving (Eq, Show) lexer :: String -> Either ParseError [LocatedToken] lexer = Parsec.runParser lexText initialParserState "input" . Text.pack lexText :: Parser [LocatedToken] lexText = go - where - go = do - Parsec.optionMaybe Parsec.eof >>= \case - Just _ -> pure [] - Nothing -> do - toks <- - choice $ - Parsec.try - <$> [ mathsBracket - , mathsParens - , escape -- maths go before escape to avoid mismatch - , headers - , newlineToken - , spaceToken - , link - , labeledLink - , modules - , anchors - , textElement - , quotes - , birdTrack - , other - ] - rest <- go - pure (toks <> rest) + where + go = do + Parsec.optionMaybe Parsec.eof >>= \case + Just _ -> pure [] + Nothing -> do + toks <- + choice $ + Parsec.try + <$> [ mathsBracket + , mathsParens + , escape -- maths go before escape to avoid mismatch + , headers + , newlineToken + , spaceToken + , link + , labeledLink + , modules + , anchors + , textElement + , quotes + , birdTrack + , other + ] + rest <- go + pure (toks <> rest) -- Tokens textElement :: Parser [LocatedToken] textElement = - choice $ - Parsec.try - <$> [ emphasis - , bold - , monospace - ] + choice $ + Parsec.try + <$> [ emphasis + , bold + , monospace + ] headers :: Parser [LocatedToken] headers = - choice $ - Parsec.try - <$> [ header1 - , header2 - , header3 - , header4 - , header5 - , header6 - ] + choice $ + Parsec.try + <$> [ header1 + , header2 + , header3 + , header4 + , header5 + , header6 + ] delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken] delimitedMaybe openMark closeMark openToken closeToken = do - openPos <- getPosition - void openMark - tokenPos <- getPosition - content <- anyUntil closeMark - closePos <- getPosition - void closeMark + openPos <- getPosition + void openMark + tokenPos <- getPosition + content <- anyUntil closeMark + closePos <- getPosition + void closeMark - let openTok :: LocatedToken = (openPos, openToken) - res :: LocatedToken = (tokenPos, Token content) - closeToks :: [LocatedToken] = case closeToken of - Just close -> [(closePos, close)] - Nothing -> [] + let openTok :: LocatedToken = (openPos, openToken) + res :: LocatedToken = (tokenPos, Token content) + closeToks :: [LocatedToken] = case closeToken of + Just close -> [(closePos, close)] + Nothing -> [] - pure $ [openTok, res] <> closeToks + pure $ [openTok, res] <> closeToks anyUntil :: Parser a -> Parser Text anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) @@ -165,99 +166,91 @@ header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing -- #anchors# anchors :: Lexer anchors = do - pos <- getPosition - void $ try anchor' - pos' <- getPosition - txt <- anyUntil anchor' - pos'' <- getPosition - void $ try anchor' + pos <- getPosition + void $ try anchor' + txt <- anyUntil anchor' + void $ try anchor' - pure - [ (pos, Anchor) - , (pos', Token txt) - , (pos'', Anchor) - ] - where - anchor' = (string "#" <|> string "\\#") + pure [(pos, Anchor txt)] + where + anchor' = (string "#" <|> string "\\#") -- "Module.Name" -- "Module.Name#anchor" --- "Module.Name#anchor" modules :: Lexer modules = do - pos <- getPosition - void $ char '"' - pos' <- getPosition - modName <- modId - anch <- option [] do - pos'' <- getPosition - void $ try (string "#" <|> string "\\#") - pos''' <- getPosition - a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) - pure [(pos'', Anchor), (pos''', Token a)] + pos <- getPosition + void $ char '"' + pos' <- getPosition + modName <- modId + anch <- option [] do + pos'' <- getPosition + void $ try (string "#" <|> string "\\#") + a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) + pure [(pos'', Anchor a)] - void $ char '"' - pure $ [(pos, Module), (pos', Token modName)] <> anch - where - modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) + void $ char '"' + pure $ [(pos, Module), (pos', Token modName)] <> anch + where + modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) - conId :: Parser String - conId = - (:) - <$> satisfy (\c -> isAlpha c && isUpper c) - <*> many1 conChar + conId :: Parser String + conId = + (:) + <$> satisfy (\c -> isAlpha c && isUpper c) + <*> many1 conChar - conChar :: Parser Char - conChar = satisfy (\c -> isAlphaNum c || c == '_') + conChar :: Parser Char + conChar = satisfy (\c -> isAlphaNum c || c == '_') linkRaw :: Lexer linkRaw = do - pos1 <- getPosition - void $ string "[" - pos2 <- getPosition - text <- anyUntil $ Text.pack <$> string "]" - pos3 <- getPosition - void $ "]" - pos4 <- getPosition - void $ "(" - pos5 <- getPosition - link' <- anyUntil $ Text.pack <$> string ")" - pos6 <- getPosition - void $ ")" + pos1 <- getPosition + void $ string "[" + pos2 <- getPosition + text <- anyUntil $ Text.pack <$> string "]" + pos3 <- getPosition + void $ "]" + pos4 <- getPosition + void $ "(" + pos5 <- getPosition + link' <- anyUntil $ Text.pack <$> string ")" + pos6 <- getPosition + void $ ")" - pure $ - [ (pos1, BracketOpen) - , (pos2, Token text) - , (pos3, BracketClose) - , (pos4, ParenOpen) - , (pos5, Token link') - , (pos6, ParenClose) - ] + pure $ + [ (pos1, BracketOpen) + , (pos2, Token text) + , (pos3, BracketClose) + , (pos4, ParenOpen) + , (pos5, Token link') + , (pos6, ParenClose) + ] link :: Lexer link = do - pos <- getPosition - l <- linkRaw - -- "unconsume" the last token - pos' <- flip incSourceColumn (-1) <$> getPosition - pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] + pos <- getPosition + l <- linkRaw + -- "unconsume" the last token + pos' <- flip incSourceColumn (-1) <$> getPosition + pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] labeledLink :: Lexer labeledLink = do - pos <- getPosition - void $ string "<" - link' <- linkRaw - pos7 <- getPosition - label' <- anyUntil $ string ">" - pos8 <- getPosition - void $ ">" + pos <- getPosition + void $ string "<" + link' <- linkRaw + pos7 <- getPosition + label' <- anyUntil $ string ">" + pos8 <- getPosition + void $ ">" - pure $ - (pos, LabeledLinkOpen) - : link' - <> [ (pos7, Token label') - , (pos8, LabeledLinkClose) - ] + pure $ + (pos, LabeledLinkOpen) + : link' + <> [ (pos7, Token label') + , (pos8, LabeledLinkClose) + ] mathsBracket :: Lexer mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose @@ -285,23 +278,23 @@ monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose other :: Lexer other = do - pos <- getPosition - c <- takeWhile1_ isUnicodeAlphaNum - pure . pure $ (pos, Token c) - where - isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c) + pos <- getPosition + c <- takeWhile1_ isUnicodeAlphaNum + pure . pure $ (pos, Token c) + where + isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c) spaceToken :: Lexer spaceToken = do - pos <- getPosition - _ <- many1 (char ' ') - pure . pure $ (pos, Space) + pos <- getPosition + _ <- many1 (char ' ') + pure . pure $ (pos, Space) newlineToken :: Lexer newlineToken = do - pos <- getPosition - _ <- newline - pure . pure $ (pos, Newline) + pos <- getPosition + _ <- newline + pure . pure $ (pos, Newline) ------- -- Helpers @@ -310,11 +303,11 @@ newlineToken = do -- | Like `takeWhile`, but unconditionally take escaped characters. takeWhile_ :: (Char -> Bool) -> Parser Text takeWhile_ p = scan p_ False - where - p_ escaped c - | escaped = Just False - | not $ p c = Nothing - | otherwise = Just (c == '\\') + where + p_ escaped c + | escaped = Just False + | not $ p c = Nothing + | otherwise = Just (c == '\\') -- | Like 'takeWhile1', but unconditionally take escaped characters. takeWhile1_ :: (Char -> Bool) -> Parser Text @@ -324,19 +317,19 @@ takeWhile1_ = mfilter (not . Text.null) . takeWhile_ function returns true. -} scan :: - -- | scan function - (state -> Char -> Maybe state) -> - -- | initial state - state -> - Parser Text + -- | scan function + (state -> Char -> Maybe state) -> + -- | initial state + state -> + Parser Text scan f initState = do - parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState - (remaining, finalPos, ct) <- go input initState pos 0 - let newState = parserState{stateInput = remaining, statePos = finalPos} - Parsec.setParserState newState $> Text.take ct input - where - go !input' !st !posAccum !count' = case Text.uncons input' of - Nothing -> pure (input', posAccum, count') - Just (char', input'') -> case f st char' of - Nothing -> pure (input', posAccum, count') - Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1) + parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState + (remaining, finalPos, ct) <- go input initState pos 0 + let newState = parserState{stateInput = remaining, statePos = finalPos} + Parsec.setParserState newState $> Text.take ct input + where + go !input' !st !posAccum !count' = case Text.uncons input' of + Nothing -> pure (input', posAccum, count') + Just (char', input'') -> case f st char' of + Nothing -> pure (input', posAccum, count') + Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1) diff --git a/test/Spec.hs b/test/Spec.hs index 0b61570..9f76b8b 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -14,28 +14,28 @@ import Text.Parsec.Pos main :: IO () main = hspec $ do - describe "Lexer" do - describe "minimal" do - it "handles unicode" unicode - it "escapes" escaping - it "maths" maths - it "anchors" anchors - it "space chars" space - it "bare string" someString - it "emphasis" emphatic - it "monospace" monospace - it "labeled link" labeledLink - it "markdown link" link - it "bird tracks" birdTracks - it "module names" modules - it "quotes" quotes - it "ignores nesting" ignoreNesting + describe "Lexer" do + describe "minimal" do + it "handles unicode" unicode + it "escapes" escaping + it "maths" maths + it "anchors" anchors + it "space chars" space + it "bare string" someString + it "emphasis" emphatic + it "monospace" monospace + it "labeled link" labeledLink + it "markdown link" link + it "bird tracks" birdTracks + it "module names" modules + it "quotes" quotes + it "ignores nesting" ignoreNesting - describe "Parser" do - it "Bold" do - "__bold__" `shouldParseTo` (DocBold (DocString "bold")) - it "Emphasis" do - "/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis")) + describe "Parser" do + it "Bold" do + "__bold__" `shouldParseTo` (DocBold (DocString "bold")) + it "Emphasis" do + "/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis")) ------------ -- Tests @@ -43,134 +43,131 @@ main = hspec $ do modules :: Expectation modules = do - "\"MyModule.Name\"" - `shouldLexTo` [ (1, 1, Module) - , (1, 2, Token "MyModule.Name") - ] + "\"MyModule.Name\"" + `shouldLexTo` [ (1, 1, Module) + , (1, 2, Token "MyModule.Name") + ] - "\"OtherModule.Name#myAnchor\"" - `shouldLexTo` [ (1, 1, Module) - , (1, 2, Token "OtherModule.Name") - , (1, 18, Anchor) - , (1, 19, Token "myAnchor") - ] + "\"OtherModule.Name#myAnchor\"" + `shouldLexTo` [ (1, 1, Module) + , (1, 2, Token "OtherModule.Name") + , (1, 18, Anchor "myAnchor") + ] link :: Expectation link = - "[link to](http://some.website)" - `shouldLexTo` [ (1, 1, LinkOpen) - , (1, 1, BracketOpen) - , (1, 2, Token "link to") - , (1, 9, BracketClose) - , (1, 10, ParenOpen) - , (1, 11, Token "http://some.website") - , (1, 30, ParenClose) - , (1, 30, LinkClose) - ] + "[link to](http://some.website)" + `shouldLexTo` [ (1, 1, LinkOpen) + , (1, 1, BracketOpen) + , (1, 2, Token "link to") + , (1, 9, BracketClose) + , (1, 10, ParenOpen) + , (1, 11, Token "http://some.website") + , (1, 30, ParenClose) + , (1, 30, LinkClose) + ] labeledLink :: Expectation labeledLink = - "<[link here](http://to.here) label>" - `shouldLexTo` [ (1, 1, LabeledLinkOpen) - , (1, 2, BracketOpen) - , (1, 3, Token "link here") - , (1, 12, BracketClose) - , (1, 13, ParenOpen) - , (1, 14, Token "http://to.here") - , (1, 28, ParenClose) - , (1, 29, Token " label") - , (1, 35, LabeledLinkClose) - ] + "<[link here](http://to.here) label>" + `shouldLexTo` [ (1, 1, LabeledLinkOpen) + , (1, 2, BracketOpen) + , (1, 3, Token "link here") + , (1, 12, BracketClose) + , (1, 13, ParenOpen) + , (1, 14, Token "http://to.here") + , (1, 28, ParenClose) + , (1, 29, Token " label") + , (1, 35, LabeledLinkClose) + ] anchors :: Expectation anchors = - "#myAnchor#" - `shouldLexTo` [ (1, 1, Anchor) - , (1, 2, Token "myAnchor") - , (1, 10, Anchor) - ] + "#myAnchor#" + `shouldLexTo` [ (1, 1, Anchor "myAnchor") + ] maths :: IO () maths = do - "\\[some math\\]" - `shouldLexTo` [ (1, 1, MathsBracketOpen) - , (1, 3, Token "some math") - , (1, 12, MathsBracketClose) - ] - "\\(other maths\\)" - `shouldLexTo` [ (1, 1, MathsParenOpen) - , (1, 3, Token "other maths") - , (1, 14, MathsParenClose) - ] + "\\[some math\\]" + `shouldLexTo` [ (1, 1, MathsBracketOpen) + , (1, 3, Token "some math") + , (1, 12, MathsBracketClose) + ] + "\\(other maths\\)" + `shouldLexTo` [ (1, 1, MathsParenOpen) + , (1, 3, Token "other maths") + , (1, 14, MathsParenClose) + ] escaping :: Expectation escaping = - "\\(" - `shouldLexTo` [ (1, 1, Escape) - , (1, 2, Token "(") - ] + "\\(" + `shouldLexTo` [ (1, 1, Escape) + , (1, 2, Token "(") + ] unicode :: Expectation unicode = - "ドラゴンクエストの冒険者🐉" - `shouldLexTo` [ (1, 1, Token "ドラゴンクエストの冒険者🐉") - ] + "ドラゴンクエストの冒険者🐉" + `shouldLexTo` [ (1, 1, Token "ドラゴンクエストの冒険者🐉") + ] ignoreNesting :: Expectation ignoreNesting = - ">/foo/" - `shouldLexTo` [ (1, 1, Token ">/foo/") - ] + ">/foo/" + `shouldLexTo` [ (1, 1, Token ">/foo/") + ] birdTracks :: Expectation birdTracks = - ">> code" - `shouldLexTo` [ (1, 1, BirdTrack) - , (1, 4, Token "code") - ] + ">> code" + `shouldLexTo` [ (1, 1, BirdTrack) + , (1, 4, Token "code") + ] quotes :: Expectation quotes = - "\"quoted\"" - `shouldLexTo` [ (1, 1, QuoteOpen) - , (1, 2, Token "quoted") - , (1, 8, QuoteClose) - ] + "\"quoted\"" + `shouldLexTo` [ (1, 1, QuoteOpen) + , (1, 2, Token "quoted") + , (1, 8, QuoteClose) + ] space :: Expectation space = do - "\n " - `shouldLexTo` [ (1, 1, Newline) - , (2, 1, Space) - ] - " \n" - `shouldLexTo` [ (1, 1, Space) - , (1, 2, Newline) - ] + "\n " + `shouldLexTo` [ (1, 1, Newline) + , (2, 1, Space) + ] + " \n" + `shouldLexTo` [ (1, 1, Space) + , (1, 2, Newline) + ] monospace :: Expectation monospace = - "@mono@" - `shouldLexTo` [ (1, 1, MonospaceOpen) - , (1, 2, Token "mono") - , (1, 6, MonospaceClose) - ] + "@mono@" + `shouldLexTo` [ (1, 1, MonospaceOpen) + , (1, 2, Token "mono") + , (1, 6, MonospaceClose) + ] emphatic :: Expectation emphatic = - "/emphatic/" - `shouldLexTo` [ (1, 1, EmphasisOpen) - , (1, 2, Token "emphatic") - , (1, 10, EmphasisClose) - ] + "/emphatic/" + `shouldLexTo` [ (1, 1, EmphasisOpen) + , (1, 2, Token "emphatic") + , (1, 10, EmphasisClose) + ] someString :: Expectation someString = - "some string" - `shouldLexTo` [ (1, 1, Token "some") - , (1, 5, Space) - , (1, 6, Token "string") - ] + "some string" + `shouldLexTo` [ (1, 1, Token "some") + , (1, 5, Space) + , (1, 6, Token "string") + ] -------------- -- Helpers @@ -179,15 +176,15 @@ someString = type Doc id = DocMarkup () id instance IsString (Doc String) where - fromString = DocString + fromString = DocString shouldLexTo :: String -> [(Int, Int, Token)] -> Expectation shouldLexTo input expected = - case lexer input of - Right tokens -> do - let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens - actual `shouldBe` expected - Left err -> expectationFailure $ "Parse error: " <> show err + case lexer input of + Right tokens -> do + let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens + actual `shouldBe` expected + Left err -> expectationFailure $ "Parse error: " <> show err shouldParseTo :: Text -> DocMarkup mod Identifier -> Expectation shouldParseTo input ast = parseText input `shouldBe` ast diff --git a/test/markup.md b/test/markup.md index befd3ed..187a525 100644 --- a/test/markup.md +++ b/test/markup.md @@ -27,8 +27,16 @@ ftp\://example.com ![alt text](image.png) -\(mathematical expression\) -\[mathematical expression\] +\(mathematical 1+3 expression\) + +\[mathematical + expression + accross lines with + addition and such +\] + +{ +e +¥ @ code block content From 7ae868932d5aa11df631a04b86e9293010610cec Mon Sep 17 00:00:00 2001 From: Igor Ranieri Date: Wed, 24 Sep 2025 10:13:16 +0200 Subject: [PATCH 2/2] Add located combinator; improved anchor matches with tests --- src/Lexer.hs | 22 +++++++++++++--------- test/Spec.hs | 5 +++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/Lexer.hs b/src/Lexer.hs index 5b29aa8..abced93 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -175,22 +175,26 @@ anchors = do where anchor' = (string "#" <|> string "\\#") +located :: Parser a -> Parser (SourcePos, a) +located p = (,) <$> getPosition <*> p + +startPosition :: Parser a -> Parser SourcePos +startPosition = fmap fst . located + -- "Module.Name" -- "Module.Name#anchor" +-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben modules :: Lexer modules = do - pos <- getPosition - void $ char '"' - pos' <- getPosition - modName <- modId + startPos <- startPosition $ char '"' + (modPos, modName) <- located modId anch <- option [] do - pos'' <- getPosition - void $ try (string "#" <|> string "\\#") - a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) - pure [(pos'', Anchor a)] + anchPos <- startPosition (string "#" <|> string' "\\#") + txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c))) + pure [(anchPos, Anchor txt)] void $ char '"' - pure $ [(pos, Module), (pos', Token modName)] <> anch + pure $ [(startPos, Module), (modPos, Token modName)] <> anch where modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) diff --git a/test/Spec.hs b/test/Spec.hs index 9f76b8b..d202ac2 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -54,6 +54,11 @@ modules = do , (1, 18, Anchor "myAnchor") ] + "\"OtherModule.Name\\#myAnchor\"" + `shouldLexTo` [ (1, 1, Module) + , (1, 2, Token "OtherModule.Name") + , (1, 18, Anchor "myAnchor") + ] link :: Expectation link = "[link to](http://some.website)"