diff --git a/src/Lexer.hs b/src/Lexer.hs index 77fc84a..3b04ecc 100644 --- a/src/Lexer.hs +++ b/src/Lexer.hs @@ -42,6 +42,7 @@ data Token | Escape | EmphasisOpen | EmphasisClose + | Expression | Header Level | MonospaceOpen | MonospaceClose @@ -82,28 +83,44 @@ lexText = go Parsec.optionMaybe Parsec.eof >>= \case Just _ -> pure [] Nothing -> do - toks <- - choice $ - Parsec.try - <$> [ mathMultiline - , mathInline - , escape -- maths go before escape to avoid mismatch - , headers - , newlineToken - , spaceToken - , link - , labeledLink - , module_ - , anchor - , numericEntity - , textElement - , quotes - , birdTrack - , other - ] + toks <- topLevel rest <- go pure (toks <> rest) +{- FOURMOLU_DISABLE -} + topLevel = + -- backtracking here so we always have a chance to try "other", the "catch-all-leave-to-parser-to-deal-with" choice + -- TODO: is this desirable? do we throw lexer error at all? + try + ( choice + -- Sorted in + -- - longest to shortest parse path + -- - highest frequency to lowest frequency (for performance?) + -- - more exact to more freeform (the latter can be the former but not vice versa) + [ spaceToken + , newlineToken + + , try module_ + , quotes + , try expression + , birdTrack + + -- starts with "\" + , try mathMultiline + , try mathInline + , escape + + , headers + , labeledLink + , link + , anchor + , numericEntity + , textElement + ] + ) + <|> other +{- FOURMOLU_ENABLE -} + -- Tokens textElement :: Parser [LocatedToken] @@ -143,16 +160,26 @@ delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok < asList (a, tok, b) = [a, tok, b] delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken] -delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP) +delimitedNoTrailing openP closeP openTok = + asList <$> delimitedAsTuple (openTok <$ openP) (void closeP) where asList (a, tok, _) = [a, tok] delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric s = delimited s s +--- End of line // end of file eol :: Parser () eol = void "\n" <|> void "\r\n" <|> Parsec.eof +-- Start of line // start of file +sol :: Parser () +sol = do + p <- getPosition + if sourceColumn p == 1 + then pure () + else fail "Not at start of line/document" + header1 :: Lexer header1 = delimitedNoTrailing "= " eol (Header One) @@ -239,8 +266,15 @@ mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose mathInline :: Lexer mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose +-- TODO: make sure this starts at column 0? birdTrack :: Lexer -birdTrack = delimitedNoTrailing ">> " eol BirdTrack +birdTrack = delimitedNoTrailing (sol <* "> ") eol BirdTrack + +-- TODO: also match following lines iff: +-- they start with alphanum +-- they're not empty +expression :: Lexer +expression = delimitedNoTrailing (sol <* ">>> ") eol Expression escape :: Lexer escape = delimitedNoTrailing "\\" eol Escape diff --git a/test/Spec.hs b/test/Spec.hs index 2040e2f..3db44f1 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -18,7 +18,7 @@ main = hspec $ do describe "minimal" do it "handles unicode" unicode it "escapes" escaping - it "maths" math + it "maths" maths it "anchors" anchor it "space chars" space it "bare string" someString @@ -29,6 +29,7 @@ main = hspec $ do it "bird tracks" birdTracks it "module names" modules it "quotes" quotes + it "expressions" expressions it "numeric entity" numericEntity it "ignores nesting" ignoreNesting @@ -91,8 +92,8 @@ anchor = `shouldLexTo` [ (1, 1, Anchor "myAnchor") ] -math :: IO () -math = do +maths :: IO () +maths = do "\\[some math\\]" `shouldLexTo` [ (1, 1, MathMultilineOpen) , (1, 3, Token "some math") @@ -128,10 +129,35 @@ ignoreNesting = ] birdTracks :: Expectation -birdTracks = - ">> code" +birdTracks = do + "> code line" `shouldLexTo` [ (1, 1, BirdTrack) - , (1, 4, Token "code") + , (1, 3, Token "code line") + ] + " > not code" + `shouldLexTo` [ (1, 1, Space) + , (1, 2, Token ">") + , (1, 3, Space) + , (1, 4, Token "not") + , (1, 7, Space) + , (1, 8, Token "code") + ] + +expressions :: Expectation +expressions = do + ">>> eval this" + `shouldLexTo` [ (1, 1, Expression) + , (1, 5, Token "eval this") + ] + " >>> not eval this" + `shouldLexTo` [ (1, 1, Space) + , (1, 2, Token ">>>") + , (1, 5, Space) + , (1, 6, Token "not") + , (1, 9, Space) + , (1, 10, Token "eval") + , (1, 14, Space) + , (1, 15, Token "this") ] quotes :: Expectation