Compare commits

..

2 commits

3 changed files with 320 additions and 313 deletions

View file

@ -1,9 +1,9 @@
{-# LANGUAGE OverloadedStrings #-}
module Lexer (
Token (..),
lexer,
emphasis,
Token (..),
lexer,
emphasis,
) where
import Control.Monad (mfilter, void)
@ -21,116 +21,117 @@ type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]
data Level
= One
| Two
| Three
| Four
| Five
| Six
deriving (Eq, Show)
= One
| Two
| Three
| Four
| Five
| Six
deriving (Eq, Show)
data Token
= Token Text
| Anchor
| BirdTrack
| BoldOpen
| BoldClose
| Escape
| EmphasisOpen
| EmphasisClose
| Header Level
| MonospaceOpen
| MonospaceClose
| Newline
| LinkOpen
| LinkClose
| LabeledLinkOpen
| LabeledLinkClose
| ParenOpen
| ParenClose
| BracketOpen
| BracketClose
| MathsParenOpen
| MathsParenClose
| MathsBracketOpen
| MathsBracketClose
| Module
| QuoteOpen
| QuoteClose
| Space
| EOF
deriving (Eq, Show)
= Token Text
| Anchor Text
| BirdTrack
| BoldOpen
| BoldClose
| Escape
| EmphasisOpen
| EmphasisClose
| Header Level
| MonospaceOpen
| MonospaceClose
| Newline
| LinkOpen
| LinkClose
| LabeledLinkOpen
| LabeledLinkClose
| ParenOpen
| ParenClose
| BracketOpen
| BracketClose
| MathsParenOpen
| MathsParenClose
| MathsBracketOpen
| MathsBracketClose
| NumericEntity Int
| Module
| QuoteOpen
| QuoteClose
| Space
| EOF
deriving (Eq, Show)
lexer :: String -> Either ParseError [LocatedToken]
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
lexText :: Parser [LocatedToken]
lexText = go
where
go = do
Parsec.optionMaybe Parsec.eof >>= \case
Just _ -> pure []
Nothing -> do
toks <-
choice $
Parsec.try
<$> [ mathsBracket
, mathsParens
, escape -- maths go before escape to avoid mismatch
, headers
, newlineToken
, spaceToken
, link
, labeledLink
, modules
, anchors
, textElement
, quotes
, birdTrack
, other
]
rest <- go
pure (toks <> rest)
where
go = do
Parsec.optionMaybe Parsec.eof >>= \case
Just _ -> pure []
Nothing -> do
toks <-
choice $
Parsec.try
<$> [ mathsBracket
, mathsParens
, escape -- maths go before escape to avoid mismatch
, headers
, newlineToken
, spaceToken
, link
, labeledLink
, modules
, anchors
, textElement
, quotes
, birdTrack
, other
]
rest <- go
pure (toks <> rest)
-- Tokens
textElement :: Parser [LocatedToken]
textElement =
choice $
Parsec.try
<$> [ emphasis
, bold
, monospace
]
choice $
Parsec.try
<$> [ emphasis
, bold
, monospace
]
headers :: Parser [LocatedToken]
headers =
choice $
Parsec.try
<$> [ header1
, header2
, header3
, header4
, header5
, header6
]
choice $
Parsec.try
<$> [ header1
, header2
, header3
, header4
, header5
, header6
]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
@ -165,99 +166,95 @@ header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
-- #anchors#
anchors :: Lexer
anchors = do
pos <- getPosition
void $ try anchor'
pos' <- getPosition
txt <- anyUntil anchor'
pos'' <- getPosition
void $ try anchor'
pos <- getPosition
void $ try anchor'
txt <- anyUntil anchor'
void $ try anchor'
pure
[ (pos, Anchor)
, (pos', Token txt)
, (pos'', Anchor)
]
where
anchor' = (string "#" <|> string "\\#")
pure [(pos, Anchor txt)]
where
anchor' = (string "#" <|> string "\\#")
located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p
startPosition :: Parser a -> Parser SourcePos
startPosition = fmap fst . located
-- "Module.Name"
-- "Module.Name#anchor"
-- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
modules :: Lexer
modules = do
pos <- getPosition
void $ char '"'
pos' <- getPosition
modName <- modId
anch <- option [] do
pos'' <- getPosition
void $ try (string "#" <|> string "\\#")
pos''' <- getPosition
a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(pos'', Anchor), (pos''', Token a)]
startPos <- startPosition $ char '"'
(modPos, modName) <- located modId
anch <- option [] do
anchPos <- startPosition (string "#" <|> string' "\\#")
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(anchPos, Anchor txt)]
void $ char '"'
pure $ [(pos, Module), (pos', Token modName)] <> anch
where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
void $ char '"'
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
conId :: Parser String
conId =
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conId :: Parser String
conId =
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
linkRaw :: Lexer
linkRaw = do
pos1 <- getPosition
void $ string "["
pos2 <- getPosition
text <- anyUntil $ Text.pack <$> string "]"
pos3 <- getPosition
void $ "]"
pos4 <- getPosition
void $ "("
pos5 <- getPosition
link' <- anyUntil $ Text.pack <$> string ")"
pos6 <- getPosition
void $ ")"
pos1 <- getPosition
void $ string "["
pos2 <- getPosition
text <- anyUntil $ Text.pack <$> string "]"
pos3 <- getPosition
void $ "]"
pos4 <- getPosition
void $ "("
pos5 <- getPosition
link' <- anyUntil $ Text.pack <$> string ")"
pos6 <- getPosition
void $ ")"
pure $
[ (pos1, BracketOpen)
, (pos2, Token text)
, (pos3, BracketClose)
, (pos4, ParenOpen)
, (pos5, Token link')
, (pos6, ParenClose)
]
pure $
[ (pos1, BracketOpen)
, (pos2, Token text)
, (pos3, BracketClose)
, (pos4, ParenOpen)
, (pos5, Token link')
, (pos6, ParenClose)
]
link :: Lexer
link = do
pos <- getPosition
l <- linkRaw
-- "unconsume" the last token
pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
pos <- getPosition
l <- linkRaw
-- "unconsume" the last token
pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer
labeledLink = do
pos <- getPosition
void $ string "<"
link' <- linkRaw
pos7 <- getPosition
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
pos <- getPosition
void $ string "<"
link' <- linkRaw
pos7 <- getPosition
label' <- anyUntil $ string ">"
pos8 <- getPosition
void $ ">"
pure $
(pos, LabeledLinkOpen)
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
pure $
(pos, LabeledLinkOpen)
: link'
<> [ (pos7, Token label')
, (pos8, LabeledLinkClose)
]
mathsBracket :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
@ -285,23 +282,23 @@ monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
other :: Lexer
other = do
pos <- getPosition
c <- takeWhile1_ isUnicodeAlphaNum
pure . pure $ (pos, Token c)
where
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
pos <- getPosition
c <- takeWhile1_ isUnicodeAlphaNum
pure . pure $ (pos, Token c)
where
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
spaceToken :: Lexer
spaceToken = do
pos <- getPosition
_ <- many1 (char ' ')
pure . pure $ (pos, Space)
pos <- getPosition
_ <- many1 (char ' ')
pure . pure $ (pos, Space)
newlineToken :: Lexer
newlineToken = do
pos <- getPosition
_ <- newline
pure . pure $ (pos, Newline)
pos <- getPosition
_ <- newline
pure . pure $ (pos, Newline)
-------
-- Helpers
@ -310,11 +307,11 @@ newlineToken = do
-- | Like `takeWhile`, but unconditionally take escaped characters.
takeWhile_ :: (Char -> Bool) -> Parser Text
takeWhile_ p = scan p_ False
where
p_ escaped c
| escaped = Just False
| not $ p c = Nothing
| otherwise = Just (c == '\\')
where
p_ escaped c
| escaped = Just False
| not $ p c = Nothing
| otherwise = Just (c == '\\')
-- | Like 'takeWhile1', but unconditionally take escaped characters.
takeWhile1_ :: (Char -> Bool) -> Parser Text
@ -324,19 +321,19 @@ takeWhile1_ = mfilter (not . Text.null) . takeWhile_
function returns true.
-}
scan ::
-- | scan function
(state -> Char -> Maybe state) ->
-- | initial state
state ->
Parser Text
-- | scan function
(state -> Char -> Maybe state) ->
-- | initial state
state ->
Parser Text
scan f initState = do
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
(remaining, finalPos, ct) <- go input initState pos 0
let newState = parserState{stateInput = remaining, statePos = finalPos}
Parsec.setParserState newState $> Text.take ct input
where
go !input' !st !posAccum !count' = case Text.uncons input' of
Nothing -> pure (input', posAccum, count')
Just (char', input'') -> case f st char' of
Nothing -> pure (input', posAccum, count')
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
(remaining, finalPos, ct) <- go input initState pos 0
let newState = parserState{stateInput = remaining, statePos = finalPos}
Parsec.setParserState newState $> Text.take ct input
where
go !input' !st !posAccum !count' = case Text.uncons input' of
Nothing -> pure (input', posAccum, count')
Just (char', input'') -> case f st char' of
Nothing -> pure (input', posAccum, count')
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)

View file

@ -14,28 +14,28 @@ import Text.Parsec.Pos
main :: IO ()
main = hspec $ do
describe "Lexer" do
describe "minimal" do
it "handles unicode" unicode
it "escapes" escaping
it "maths" maths
it "anchors" anchors
it "space chars" space
it "bare string" someString
it "emphasis" emphatic
it "monospace" monospace
it "labeled link" labeledLink
it "markdown link" link
it "bird tracks" birdTracks
it "module names" modules
it "quotes" quotes
it "ignores nesting" ignoreNesting
describe "Lexer" do
describe "minimal" do
it "handles unicode" unicode
it "escapes" escaping
it "maths" maths
it "anchors" anchors
it "space chars" space
it "bare string" someString
it "emphasis" emphatic
it "monospace" monospace
it "labeled link" labeledLink
it "markdown link" link
it "bird tracks" birdTracks
it "module names" modules
it "quotes" quotes
it "ignores nesting" ignoreNesting
describe "Parser" do
it "Bold" do
"__bold__" `shouldParseTo` (DocBold (DocString "bold"))
it "Emphasis" do
"/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis"))
describe "Parser" do
it "Bold" do
"__bold__" `shouldParseTo` (DocBold (DocString "bold"))
it "Emphasis" do
"/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis"))
------------
-- Tests
@ -43,134 +43,136 @@ main = hspec $ do
modules :: Expectation
modules = do
"\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "MyModule.Name")
]
"\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "MyModule.Name")
]
"\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor)
, (1, 19, Token "myAnchor")
]
"\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
"\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
link :: Expectation
link =
"[link to](http://some.website)"
`shouldLexTo` [ (1, 1, LinkOpen)
, (1, 1, BracketOpen)
, (1, 2, Token "link to")
, (1, 9, BracketClose)
, (1, 10, ParenOpen)
, (1, 11, Token "http://some.website")
, (1, 30, ParenClose)
, (1, 30, LinkClose)
]
"[link to](http://some.website)"
`shouldLexTo` [ (1, 1, LinkOpen)
, (1, 1, BracketOpen)
, (1, 2, Token "link to")
, (1, 9, BracketClose)
, (1, 10, ParenOpen)
, (1, 11, Token "http://some.website")
, (1, 30, ParenClose)
, (1, 30, LinkClose)
]
labeledLink :: Expectation
labeledLink =
"<[link here](http://to.here) label>"
`shouldLexTo` [ (1, 1, LabeledLinkOpen)
, (1, 2, BracketOpen)
, (1, 3, Token "link here")
, (1, 12, BracketClose)
, (1, 13, ParenOpen)
, (1, 14, Token "http://to.here")
, (1, 28, ParenClose)
, (1, 29, Token " label")
, (1, 35, LabeledLinkClose)
]
"<[link here](http://to.here) label>"
`shouldLexTo` [ (1, 1, LabeledLinkOpen)
, (1, 2, BracketOpen)
, (1, 3, Token "link here")
, (1, 12, BracketClose)
, (1, 13, ParenOpen)
, (1, 14, Token "http://to.here")
, (1, 28, ParenClose)
, (1, 29, Token " label")
, (1, 35, LabeledLinkClose)
]
anchors :: Expectation
anchors =
"#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor)
, (1, 2, Token "myAnchor")
, (1, 10, Anchor)
]
"#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor "myAnchor")
]
maths :: IO ()
maths = do
"\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen)
, (1, 3, Token "some math")
, (1, 12, MathsBracketClose)
]
"\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen)
, (1, 3, Token "other maths")
, (1, 14, MathsParenClose)
]
"\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen)
, (1, 3, Token "some math")
, (1, 12, MathsBracketClose)
]
"\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen)
, (1, 3, Token "other maths")
, (1, 14, MathsParenClose)
]
escaping :: Expectation
escaping =
"\\("
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
"\\("
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
unicode :: Expectation
unicode =
"ドラゴンクエストの冒険者🐉"
`shouldLexTo` [ (1, 1, Token "ドラゴンクエストの冒険者🐉")
]
"ドラゴンクエストの冒険者🐉"
`shouldLexTo` [ (1, 1, Token "ドラゴンクエストの冒険者🐉")
]
ignoreNesting :: Expectation
ignoreNesting =
">/foo/"
`shouldLexTo` [ (1, 1, Token ">/foo/")
]
">/foo/"
`shouldLexTo` [ (1, 1, Token ">/foo/")
]
birdTracks :: Expectation
birdTracks =
">> code"
`shouldLexTo` [ (1, 1, BirdTrack)
, (1, 4, Token "code")
]
">> code"
`shouldLexTo` [ (1, 1, BirdTrack)
, (1, 4, Token "code")
]
quotes :: Expectation
quotes =
"\"quoted\""
`shouldLexTo` [ (1, 1, QuoteOpen)
, (1, 2, Token "quoted")
, (1, 8, QuoteClose)
]
"\"quoted\""
`shouldLexTo` [ (1, 1, QuoteOpen)
, (1, 2, Token "quoted")
, (1, 8, QuoteClose)
]
space :: Expectation
space = do
"\n "
`shouldLexTo` [ (1, 1, Newline)
, (2, 1, Space)
]
" \n"
`shouldLexTo` [ (1, 1, Space)
, (1, 2, Newline)
]
"\n "
`shouldLexTo` [ (1, 1, Newline)
, (2, 1, Space)
]
" \n"
`shouldLexTo` [ (1, 1, Space)
, (1, 2, Newline)
]
monospace :: Expectation
monospace =
"@mono@"
`shouldLexTo` [ (1, 1, MonospaceOpen)
, (1, 2, Token "mono")
, (1, 6, MonospaceClose)
]
"@mono@"
`shouldLexTo` [ (1, 1, MonospaceOpen)
, (1, 2, Token "mono")
, (1, 6, MonospaceClose)
]
emphatic :: Expectation
emphatic =
"/emphatic/"
`shouldLexTo` [ (1, 1, EmphasisOpen)
, (1, 2, Token "emphatic")
, (1, 10, EmphasisClose)
]
"/emphatic/"
`shouldLexTo` [ (1, 1, EmphasisOpen)
, (1, 2, Token "emphatic")
, (1, 10, EmphasisClose)
]
someString :: Expectation
someString =
"some string"
`shouldLexTo` [ (1, 1, Token "some")
, (1, 5, Space)
, (1, 6, Token "string")
]
"some string"
`shouldLexTo` [ (1, 1, Token "some")
, (1, 5, Space)
, (1, 6, Token "string")
]
--------------
-- Helpers
@ -179,15 +181,15 @@ someString =
type Doc id = DocMarkup () id
instance IsString (Doc String) where
fromString = DocString
fromString = DocString
shouldLexTo :: String -> [(Int, Int, Token)] -> Expectation
shouldLexTo input expected =
case lexer input of
Right tokens -> do
let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens
actual `shouldBe` expected
Left err -> expectationFailure $ "Parse error: " <> show err
case lexer input of
Right tokens -> do
let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens
actual `shouldBe` expected
Left err -> expectationFailure $ "Parse error: " <> show err
shouldParseTo :: Text -> DocMarkup mod Identifier -> Expectation
shouldParseTo input ast = parseText input `shouldBe` ast

View file

@ -27,8 +27,16 @@ ftp\://example.com
![alt text](image.png)
\(mathematical expression\)
\[mathematical expression\]
\(mathematical 1+3 expression\)
\[mathematical
expression
accross lines with + addition and such
\]
&#123
&#x65
&#165
@
code block content