From e18082db1b9a3a440137e65650bfc3da30204fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9ana=20=E6=B1=9F?= Date: Sun, 28 Sep 2025 12:35:56 +0800 Subject: [PATCH 1/2] ref(lexer): attempt to not try on every token ...for a better error message and better perf --- src/Lexer.hs | 54 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/src/Lexer.hs b/src/Lexer.hs index 77fc84a..0d46560 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -82,28 +82,43 @@ lexText = go Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do - toks <- - choice $ - Parsec.try - <$> [ mathMultiline - , mathInline - , escape -- maths go before escape to avoid mismatch - , headers - , newlineToken - , spaceToken - , link - , labeledLink - , module_ - , anchor - , numericEntity - , textElement - , quotes - , birdTrack - , other - ] + toks <- topLevel rest <- go pure (toks <> rest) +{- FOURMOLU_DISABLE -} + topLevel = + -- backtracking here so we always have a chance to try "other", the "catch-all-leave-to-parser-to-deal-with" choice + -- TODO: is this desirable? do we throw lexer error at all? + try + ( choice + -- Sorted in + -- - longest to shortest parse path + -- - highest frequency to lowest frequency (for performance?) + -- - more exact to more freeform (the latter can be the former but not vice versa) + [ spaceToken + , newlineToken + + -- starts with "\" + , try mathMultiline + , try mathInline + + , try module_ + , quotes + , birdTrack + + , escape + , headers + , labeledLink + , link + , anchor + , numericEntity + , textElement + ] + ) + <|> other +{- FOURMOLU_ENABLE -} + -- Tokens textElement :: Parser [LocatedToken] @@ -239,6 +254,7 @@ mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose mathInline :: Lexer mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose +-- TODO: make sure this starts at column 0? 
birdTrack :: Lexer birdTrack = delimitedNoTrailing ">> " eol BirdTrack From 449b7c8ca73018b738a81069f219af38b8270b87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9ana=20=E6=B1=9F?= Date: Sun, 28 Sep 2025 12:35:56 +0800 Subject: [PATCH 2/2] ref(lexer): attempt to not try on every token ...for a better error message and better perf --- src/Lexer.hs | 54 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/src/Lexer.hs b/src/Lexer.hs index 77fc84a..84e38cc 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -82,28 +82,43 @@ lexText = go Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do - toks <- - choice $ - Parsec.try - <$> [ mathMultiline - , mathInline - , escape -- maths go before escape to avoid mismatch - , headers - , newlineToken - , spaceToken - , link - , labeledLink - , module_ - , anchor - , numericEntity - , textElement - , quotes - , birdTrack - , other - ] + toks <- topLevel rest <- go pure (toks <> rest) +{- FOURMOLU_DISABLE -} + topLevel = + -- backtracking here so we always have a chance to try "other", the "catch-all-leave-to-parser-to-deal-with" choice + -- TODO: is this desirable? do we throw lexer error at all? + try + ( choice + -- Sorted in + -- - longest to shortest parse path + -- - highest frequency to lowest frequency (for performance?) 
+ -- - more exact to more freeform (a freeform rule also accepts what an exact rule accepts, but not vice versa) + [ spaceToken + , newlineToken + + , try module_ + , quotes + , birdTrack + + -- starts with "\" + , try mathMultiline + , try mathInline + , escape + + , headers + , labeledLink + , link + , anchor + , numericEntity + , textElement + ] + ) + <|> other +{- FOURMOLU_ENABLE -} + -- Tokens textElement :: Parser [LocatedToken] @@ -239,6 +254,7 @@ mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose mathInline :: Lexer mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose +-- TODO: make sure this starts at column 0? birdTrack :: Lexer birdTrack = delimitedNoTrailing ">> " eol BirdTrack