Compare commits

..

2 commits

3 changed files with 320 additions and 313 deletions

View file

@ -1,9 +1,9 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Lexer ( module Lexer (
Token (..), Token (..),
lexer, lexer,
emphasis, emphasis,
) where ) where
import Control.Monad (mfilter, void) import Control.Monad (mfilter, void)
@ -21,116 +21,117 @@ type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken] type Lexer = Parser [LocatedToken]
data Level data Level
= One = One
| Two | Two
| Three | Three
| Four | Four
| Five | Five
| Six | Six
deriving (Eq, Show) deriving (Eq, Show)
data Token data Token
= Token Text = Token Text
| Anchor | Anchor Text
| BirdTrack | BirdTrack
| BoldOpen | BoldOpen
| BoldClose | BoldClose
| Escape | Escape
| EmphasisOpen | EmphasisOpen
| EmphasisClose | EmphasisClose
| Header Level | Header Level
| MonospaceOpen | MonospaceOpen
| MonospaceClose | MonospaceClose
| Newline | Newline
| LinkOpen | LinkOpen
| LinkClose | LinkClose
| LabeledLinkOpen | LabeledLinkOpen
| LabeledLinkClose | LabeledLinkClose
| ParenOpen | ParenOpen
| ParenClose | ParenClose
| BracketOpen | BracketOpen
| BracketClose | BracketClose
| MathsParenOpen | MathsParenOpen
| MathsParenClose | MathsParenClose
| MathsBracketOpen | MathsBracketOpen
| MathsBracketClose | MathsBracketClose
| Module | NumericEntity Int
| QuoteOpen | Module
| QuoteClose | QuoteOpen
| Space | QuoteClose
| EOF | Space
deriving (Eq, Show) | EOF
deriving (Eq, Show)
lexer :: String -> Either ParseError [LocatedToken] lexer :: String -> Either ParseError [LocatedToken]
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
lexText :: Parser [LocatedToken] lexText :: Parser [LocatedToken]
lexText = go lexText = go
where where
go = do go = do
Parsec.optionMaybe Parsec.eof >>= \case Parsec.optionMaybe Parsec.eof >>= \case
Just _ -> pure [] Just _ -> pure []
Nothing -> do Nothing -> do
toks <- toks <-
choice $ choice $
Parsec.try Parsec.try
<$> [ mathsBracket <$> [ mathsBracket
, mathsParens , mathsParens
, escape -- maths go before escape to avoid mismatch , escape -- maths go before escape to avoid mismatch
, headers , headers
, newlineToken , newlineToken
, spaceToken , spaceToken
, link , link
, labeledLink , labeledLink
, modules , modules
, anchors , anchors
, textElement , textElement
, quotes , quotes
, birdTrack , birdTrack
, other , other
] ]
rest <- go rest <- go
pure (toks <> rest) pure (toks <> rest)
-- Tokens -- Tokens
textElement :: Parser [LocatedToken] textElement :: Parser [LocatedToken]
textElement = textElement =
choice $ choice $
Parsec.try Parsec.try
<$> [ emphasis <$> [ emphasis
, bold , bold
, monospace , monospace
] ]
headers :: Parser [LocatedToken] headers :: Parser [LocatedToken]
headers = headers =
choice $ choice $
Parsec.try Parsec.try
<$> [ header1 <$> [ header1
, header2 , header2
, header3 , header3
, header4 , header4
, header5 , header5
, header6 , header6
] ]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken] delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition openPos <- getPosition
void openMark void openMark
tokenPos <- getPosition tokenPos <- getPosition
content <- anyUntil closeMark content <- anyUntil closeMark
closePos <- getPosition closePos <- getPosition
void closeMark void closeMark
let openTok :: LocatedToken = (openPos, openToken) let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content) res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)] Just close -> [(closePos, close)]
Nothing -> [] Nothing -> []
pure $ [openTok, res] <> closeToks pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
@ -165,99 +166,95 @@ header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
-- #anchors# -- #anchors#
anchors :: Lexer anchors :: Lexer
anchors = do anchors = do
pos <- getPosition pos <- getPosition
void $ try anchor' void $ try anchor'
pos' <- getPosition txt <- anyUntil anchor'
txt <- anyUntil anchor' void $ try anchor'
pos'' <- getPosition
void $ try anchor'
pure pure [(pos, Anchor txt)]
[ (pos, Anchor) where
, (pos', Token txt) anchor' = (string "#" <|> string "\\#")
, (pos'', Anchor)
] located :: Parser a -> Parser (SourcePos, a)
where located p = (,) <$> getPosition <*> p
anchor' = (string "#" <|> string "\\#")
startPosition :: Parser a -> Parser SourcePos
startPosition = fmap fst . located
-- "Module.Name" -- "Module.Name"
-- "Module.Name#anchor" -- "Module.Name#anchor"
-- "Module.Name#anchor" -- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
modules :: Lexer modules :: Lexer
modules = do modules = do
pos <- getPosition startPos <- startPosition $ char '"'
void $ char '"' (modPos, modName) <- located modId
pos' <- getPosition anch <- option [] do
modName <- modId anchPos <- startPosition (string "#" <|> string' "\\#")
anch <- option [] do txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pos'' <- getPosition pure [(anchPos, Anchor txt)]
void $ try (string "#" <|> string "\\#")
pos''' <- getPosition
a <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(pos'', Anchor), (pos''', Token a)]
void $ char '"' void $ char '"'
pure $ [(pos, Module), (pos', Token modName)] <> anch pure $ [(startPos, Module), (modPos, Token modName)] <> anch
where where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
conId :: Parser String conId :: Parser String
conId = conId =
(:) (:)
<$> satisfy (\c -> isAlpha c && isUpper c) <$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar <*> many1 conChar
conChar :: Parser Char conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_') conChar = satisfy (\c -> isAlphaNum c || c == '_')
linkRaw :: Lexer linkRaw :: Lexer
linkRaw = do linkRaw = do
pos1 <- getPosition pos1 <- getPosition
void $ string "[" void $ string "["
pos2 <- getPosition pos2 <- getPosition
text <- anyUntil $ Text.pack <$> string "]" text <- anyUntil $ Text.pack <$> string "]"
pos3 <- getPosition pos3 <- getPosition
void $ "]" void $ "]"
pos4 <- getPosition pos4 <- getPosition
void $ "(" void $ "("
pos5 <- getPosition pos5 <- getPosition
link' <- anyUntil $ Text.pack <$> string ")" link' <- anyUntil $ Text.pack <$> string ")"
pos6 <- getPosition pos6 <- getPosition
void $ ")" void $ ")"
pure $ pure $
[ (pos1, BracketOpen) [ (pos1, BracketOpen)
, (pos2, Token text) , (pos2, Token text)
, (pos3, BracketClose) , (pos3, BracketClose)
, (pos4, ParenOpen) , (pos4, ParenOpen)
, (pos5, Token link') , (pos5, Token link')
, (pos6, ParenClose) , (pos6, ParenClose)
] ]
link :: Lexer link :: Lexer
link = do link = do
pos <- getPosition pos <- getPosition
l <- linkRaw l <- linkRaw
-- "unconsume" the last token -- "unconsume" the last token
pos' <- flip incSourceColumn (-1) <$> getPosition pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
labeledLink :: Lexer labeledLink :: Lexer
labeledLink = do labeledLink = do
pos <- getPosition pos <- getPosition
void $ string "<" void $ string "<"
link' <- linkRaw link' <- linkRaw
pos7 <- getPosition pos7 <- getPosition
label' <- anyUntil $ string ">" label' <- anyUntil $ string ">"
pos8 <- getPosition pos8 <- getPosition
void $ ">" void $ ">"
pure $ pure $
(pos, LabeledLinkOpen) (pos, LabeledLinkOpen)
: link' : link'
<> [ (pos7, Token label') <> [ (pos7, Token label')
, (pos8, LabeledLinkClose) , (pos8, LabeledLinkClose)
] ]
mathsBracket :: Lexer mathsBracket :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
@ -285,23 +282,23 @@ monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
other :: Lexer other :: Lexer
other = do other = do
pos <- getPosition pos <- getPosition
c <- takeWhile1_ isUnicodeAlphaNum c <- takeWhile1_ isUnicodeAlphaNum
pure . pure $ (pos, Token c) pure . pure $ (pos, Token c)
where where
isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c) isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
spaceToken :: Lexer spaceToken :: Lexer
spaceToken = do spaceToken = do
pos <- getPosition pos <- getPosition
_ <- many1 (char ' ') _ <- many1 (char ' ')
pure . pure $ (pos, Space) pure . pure $ (pos, Space)
newlineToken :: Lexer newlineToken :: Lexer
newlineToken = do newlineToken = do
pos <- getPosition pos <- getPosition
_ <- newline _ <- newline
pure . pure $ (pos, Newline) pure . pure $ (pos, Newline)
------- -------
-- Helpers -- Helpers
@ -310,11 +307,11 @@ newlineToken = do
-- | Like `takeWhile`, but unconditionally take escaped characters. -- | Like `takeWhile`, but unconditionally take escaped characters.
takeWhile_ :: (Char -> Bool) -> Parser Text takeWhile_ :: (Char -> Bool) -> Parser Text
takeWhile_ p = scan p_ False takeWhile_ p = scan p_ False
where where
p_ escaped c p_ escaped c
| escaped = Just False | escaped = Just False
| not $ p c = Nothing | not $ p c = Nothing
| otherwise = Just (c == '\\') | otherwise = Just (c == '\\')
-- | Like 'takeWhile1', but unconditionally take escaped characters. -- | Like 'takeWhile1', but unconditionally take escaped characters.
takeWhile1_ :: (Char -> Bool) -> Parser Text takeWhile1_ :: (Char -> Bool) -> Parser Text
@ -324,19 +321,19 @@ takeWhile1_ = mfilter (not . Text.null) . takeWhile_
function returns true. function returns true.
-} -}
scan :: scan ::
-- | scan function -- | scan function
(state -> Char -> Maybe state) -> (state -> Char -> Maybe state) ->
-- | initial state -- | initial state
state -> state ->
Parser Text Parser Text
scan f initState = do scan f initState = do
parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
(remaining, finalPos, ct) <- go input initState pos 0 (remaining, finalPos, ct) <- go input initState pos 0
let newState = parserState{stateInput = remaining, statePos = finalPos} let newState = parserState{stateInput = remaining, statePos = finalPos}
Parsec.setParserState newState $> Text.take ct input Parsec.setParserState newState $> Text.take ct input
where where
go !input' !st !posAccum !count' = case Text.uncons input' of go !input' !st !posAccum !count' = case Text.uncons input' of
Nothing -> pure (input', posAccum, count') Nothing -> pure (input', posAccum, count')
Just (char', input'') -> case f st char' of Just (char', input'') -> case f st char' of
Nothing -> pure (input', posAccum, count') Nothing -> pure (input', posAccum, count')
Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1) Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)

View file

@ -14,28 +14,28 @@ import Text.Parsec.Pos
main :: IO () main :: IO ()
main = hspec $ do main = hspec $ do
describe "Lexer" do describe "Lexer" do
describe "minimal" do describe "minimal" do
it "handles unicode" unicode it "handles unicode" unicode
it "escapes" escaping it "escapes" escaping
it "maths" maths it "maths" maths
it "anchors" anchors it "anchors" anchors
it "space chars" space it "space chars" space
it "bare string" someString it "bare string" someString
it "emphasis" emphatic it "emphasis" emphatic
it "monospace" monospace it "monospace" monospace
it "labeled link" labeledLink it "labeled link" labeledLink
it "markdown link" link it "markdown link" link
it "bird tracks" birdTracks it "bird tracks" birdTracks
it "module names" modules it "module names" modules
it "quotes" quotes it "quotes" quotes
it "ignores nesting" ignoreNesting it "ignores nesting" ignoreNesting
describe "Parser" do describe "Parser" do
it "Bold" do it "Bold" do
"__bold__" `shouldParseTo` (DocBold (DocString "bold")) "__bold__" `shouldParseTo` (DocBold (DocString "bold"))
it "Emphasis" do it "Emphasis" do
"/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis")) "/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis"))
------------ ------------
-- Tests -- Tests
@ -43,134 +43,136 @@ main = hspec $ do
modules :: Expectation modules :: Expectation
modules = do modules = do
"\"MyModule.Name\"" "\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "MyModule.Name") , (1, 2, Token "MyModule.Name")
] ]
"\"OtherModule.Name#myAnchor\"" "\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name") , (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor) , (1, 18, Anchor "myAnchor")
, (1, 19, Token "myAnchor") ]
]
"\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module)
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor")
]
link :: Expectation link :: Expectation
link = link =
"[link to](http://some.website)" "[link to](http://some.website)"
`shouldLexTo` [ (1, 1, LinkOpen) `shouldLexTo` [ (1, 1, LinkOpen)
, (1, 1, BracketOpen) , (1, 1, BracketOpen)
, (1, 2, Token "link to") , (1, 2, Token "link to")
, (1, 9, BracketClose) , (1, 9, BracketClose)
, (1, 10, ParenOpen) , (1, 10, ParenOpen)
, (1, 11, Token "http://some.website") , (1, 11, Token "http://some.website")
, (1, 30, ParenClose) , (1, 30, ParenClose)
, (1, 30, LinkClose) , (1, 30, LinkClose)
] ]
labeledLink :: Expectation labeledLink :: Expectation
labeledLink = labeledLink =
"<[link here](http://to.here) label>" "<[link here](http://to.here) label>"
`shouldLexTo` [ (1, 1, LabeledLinkOpen) `shouldLexTo` [ (1, 1, LabeledLinkOpen)
, (1, 2, BracketOpen) , (1, 2, BracketOpen)
, (1, 3, Token "link here") , (1, 3, Token "link here")
, (1, 12, BracketClose) , (1, 12, BracketClose)
, (1, 13, ParenOpen) , (1, 13, ParenOpen)
, (1, 14, Token "http://to.here") , (1, 14, Token "http://to.here")
, (1, 28, ParenClose) , (1, 28, ParenClose)
, (1, 29, Token " label") , (1, 29, Token " label")
, (1, 35, LabeledLinkClose) , (1, 35, LabeledLinkClose)
] ]
anchors :: Expectation anchors :: Expectation
anchors = anchors =
"#myAnchor#" "#myAnchor#"
`shouldLexTo` [ (1, 1, Anchor) `shouldLexTo` [ (1, 1, Anchor "myAnchor")
, (1, 2, Token "myAnchor") ]
, (1, 10, Anchor)
]
maths :: IO () maths :: IO ()
maths = do maths = do
"\\[some math\\]" "\\[some math\\]"
`shouldLexTo` [ (1, 1, MathsBracketOpen) `shouldLexTo` [ (1, 1, MathsBracketOpen)
, (1, 3, Token "some math") , (1, 3, Token "some math")
, (1, 12, MathsBracketClose) , (1, 12, MathsBracketClose)
] ]
"\\(other maths\\)" "\\(other maths\\)"
`shouldLexTo` [ (1, 1, MathsParenOpen) `shouldLexTo` [ (1, 1, MathsParenOpen)
, (1, 3, Token "other maths") , (1, 3, Token "other maths")
, (1, 14, MathsParenClose) , (1, 14, MathsParenClose)
] ]
escaping :: Expectation escaping :: Expectation
escaping = escaping =
"\\(" "\\("
`shouldLexTo` [ (1, 1, Escape) `shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(") , (1, 2, Token "(")
] ]
unicode :: Expectation unicode :: Expectation
unicode = unicode =
"ドラゴンクエストの冒険者🐉" "ドラゴンクエストの冒険者🐉"
`shouldLexTo` [ (1, 1, Token "ドラゴンクエストの冒険者🐉") `shouldLexTo` [ (1, 1, Token "ドラゴンクエストの冒険者🐉")
] ]
ignoreNesting :: Expectation ignoreNesting :: Expectation
ignoreNesting = ignoreNesting =
">/foo/" ">/foo/"
`shouldLexTo` [ (1, 1, Token ">/foo/") `shouldLexTo` [ (1, 1, Token ">/foo/")
] ]
birdTracks :: Expectation birdTracks :: Expectation
birdTracks = birdTracks =
">> code" ">> code"
`shouldLexTo` [ (1, 1, BirdTrack) `shouldLexTo` [ (1, 1, BirdTrack)
, (1, 4, Token "code") , (1, 4, Token "code")
] ]
quotes :: Expectation quotes :: Expectation
quotes = quotes =
"\"quoted\"" "\"quoted\""
`shouldLexTo` [ (1, 1, QuoteOpen) `shouldLexTo` [ (1, 1, QuoteOpen)
, (1, 2, Token "quoted") , (1, 2, Token "quoted")
, (1, 8, QuoteClose) , (1, 8, QuoteClose)
] ]
space :: Expectation space :: Expectation
space = do space = do
"\n " "\n "
`shouldLexTo` [ (1, 1, Newline) `shouldLexTo` [ (1, 1, Newline)
, (2, 1, Space) , (2, 1, Space)
] ]
" \n" " \n"
`shouldLexTo` [ (1, 1, Space) `shouldLexTo` [ (1, 1, Space)
, (1, 2, Newline) , (1, 2, Newline)
] ]
monospace :: Expectation monospace :: Expectation
monospace = monospace =
"@mono@" "@mono@"
`shouldLexTo` [ (1, 1, MonospaceOpen) `shouldLexTo` [ (1, 1, MonospaceOpen)
, (1, 2, Token "mono") , (1, 2, Token "mono")
, (1, 6, MonospaceClose) , (1, 6, MonospaceClose)
] ]
emphatic :: Expectation emphatic :: Expectation
emphatic = emphatic =
"/emphatic/" "/emphatic/"
`shouldLexTo` [ (1, 1, EmphasisOpen) `shouldLexTo` [ (1, 1, EmphasisOpen)
, (1, 2, Token "emphatic") , (1, 2, Token "emphatic")
, (1, 10, EmphasisClose) , (1, 10, EmphasisClose)
] ]
someString :: Expectation someString :: Expectation
someString = someString =
"some string" "some string"
`shouldLexTo` [ (1, 1, Token "some") `shouldLexTo` [ (1, 1, Token "some")
, (1, 5, Space) , (1, 5, Space)
, (1, 6, Token "string") , (1, 6, Token "string")
] ]
-------------- --------------
-- Helpers -- Helpers
@ -179,15 +181,15 @@ someString =
type Doc id = DocMarkup () id type Doc id = DocMarkup () id
instance IsString (Doc String) where instance IsString (Doc String) where
fromString = DocString fromString = DocString
shouldLexTo :: String -> [(Int, Int, Token)] -> Expectation shouldLexTo :: String -> [(Int, Int, Token)] -> Expectation
shouldLexTo input expected = shouldLexTo input expected =
case lexer input of case lexer input of
Right tokens -> do Right tokens -> do
let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens
actual `shouldBe` expected actual `shouldBe` expected
Left err -> expectationFailure $ "Parse error: " <> show err Left err -> expectationFailure $ "Parse error: " <> show err
shouldParseTo :: Text -> DocMarkup mod Identifier -> Expectation shouldParseTo :: Text -> DocMarkup mod Identifier -> Expectation
shouldParseTo input ast = parseText input `shouldBe` ast shouldParseTo input ast = parseText input `shouldBe` ast

View file

@ -27,8 +27,16 @@ ftp\://example.com
![alt text](image.png) ![alt text](image.png)
\(mathematical expression\) \(mathematical 1+3 expression\)
\[mathematical expression\]
\[mathematical
expression
accross lines with + addition and such
\]
&#123
&#x65
&#165
@ @
code block content code block content