forked from elland/haddock2
refactor lexer (#2)
Reduce manual usage of getPosition and setting it and improve some helper functions. Reviewed-on: elland/haddock2#2 Reviewed-by: elland <igor@elland.me> Co-authored-by: Léana 江 <leana.jiang+git@icloud.com> Co-committed-by: Léana 江 <leana.jiang+git@icloud.com>
This commit is contained in:
parent
39cfe2035d
commit
6064afd0b9
3 changed files with 91 additions and 106 deletions
|
|
@ -12,7 +12,7 @@ bold ::= '__' text_no_newline '__'
|
||||||
monospace ::= '@' text_content '@'
|
monospace ::= '@' text_content '@'
|
||||||
|
|
||||||
link ::= module_link | hyperlink | markdown_link
|
link ::= module_link | hyperlink | markdown_link
|
||||||
module_link ::= '"' module_name ( '#' anchor_name )? '"'
|
module_link ::= '"' module_name ( ('#' | '\#') anchor_name )? '"'
|
||||||
hyperlink ::= '<' url ( ' ' link_text )? '>'
|
hyperlink ::= '<' url ( ' ' link_text )? '>'
|
||||||
markdown_link ::= '[' link_text '](' ( url | module_link ) ')'
|
markdown_link ::= '[' link_text '](' ( url | module_link ) ')'
|
||||||
|
|
||||||
|
|
|
||||||
160
src/Lexer.hs
160
src/Lexer.hs
|
|
@ -10,12 +10,13 @@ import Control.Monad (mfilter, void)
|
||||||
import Data.Functor (($>))
|
import Data.Functor (($>))
|
||||||
import Data.Text (Text, intercalate)
|
import Data.Text (Text, intercalate)
|
||||||
import Data.Text qualified as Text
|
import Data.Text qualified as Text
|
||||||
import GHC.Unicode (isAlpha, isAlphaNum, isControl, isPrint, isSpace, isUpper)
|
import GHC.Unicode (isAlphaNum, isControl, isPrint, isSpace, isUpper)
|
||||||
import ParserMonad (Parser, initialParserState)
|
import ParserMonad (Parser, initialParserState)
|
||||||
import Text.Parsec
|
import Text.Parsec
|
||||||
import Text.Parsec qualified as Parsec
|
import Text.Parsec qualified as Parsec
|
||||||
import Text.Parsec.Pos (updatePosChar)
|
import Text.Parsec.Pos (updatePosChar)
|
||||||
|
|
||||||
|
type Located a = (SourcePos, a)
|
||||||
type LocatedToken = (SourcePos, Token)
|
type LocatedToken = (SourcePos, Token)
|
||||||
|
|
||||||
type Lexer = Parser [LocatedToken]
|
type Lexer = Parser [LocatedToken]
|
||||||
|
|
@ -50,12 +51,12 @@ data Token
|
||||||
| ParenClose
|
| ParenClose
|
||||||
| BracketOpen
|
| BracketOpen
|
||||||
| BracketClose
|
| BracketClose
|
||||||
| MathsParenOpen
|
| MathInlineOpen
|
||||||
| MathsParenClose
|
| MathInlineClose
|
||||||
| MathsBracketOpen
|
| MathMultilineOpen
|
||||||
| MathsBracketClose
|
| MathMultilineClose
|
||||||
| NumericEntity Int
|
| NumericEntity Int
|
||||||
| Module
|
| Module Text
|
||||||
| QuoteOpen
|
| QuoteOpen
|
||||||
| QuoteClose
|
| QuoteClose
|
||||||
| Space
|
| Space
|
||||||
|
|
@ -65,9 +66,6 @@ data Token
|
||||||
located :: Parser a -> Parser (SourcePos, a)
|
located :: Parser a -> Parser (SourcePos, a)
|
||||||
located p = (,) <$> getPosition <*> p
|
located p = (,) <$> getPosition <*> p
|
||||||
|
|
||||||
startPosition :: Parser a -> Parser SourcePos
|
|
||||||
startPosition = fmap fst . located
|
|
||||||
|
|
||||||
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
|
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
|
||||||
tokenise = sequence . map located
|
tokenise = sequence . map located
|
||||||
|
|
||||||
|
|
@ -84,16 +82,16 @@ lexText = go
|
||||||
toks <-
|
toks <-
|
||||||
choice $
|
choice $
|
||||||
Parsec.try
|
Parsec.try
|
||||||
<$> [ mathsBracket
|
<$> [ mathMultiline
|
||||||
, mathsParens
|
, mathInline
|
||||||
, escape -- maths go before escape to avoid mismatch
|
, escape -- maths go before escape to avoid mismatch
|
||||||
, headers
|
, headers
|
||||||
, newlineToken
|
, newlineToken
|
||||||
, spaceToken
|
, spaceToken
|
||||||
, link
|
, link
|
||||||
, labeledLink
|
, labeledLink
|
||||||
, modules
|
, module_
|
||||||
, anchors
|
, anchor
|
||||||
, textElement
|
, textElement
|
||||||
, quotes
|
, quotes
|
||||||
, birdTrack
|
, birdTrack
|
||||||
|
|
@ -125,90 +123,84 @@ headers =
|
||||||
, header6
|
, header6
|
||||||
]
|
]
|
||||||
|
|
||||||
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
|
|
||||||
delimitedMaybe openMark closeMark openToken closeToken = do
|
|
||||||
openPos <- getPosition
|
|
||||||
void openMark
|
|
||||||
tokenPos <- getPosition
|
|
||||||
content <- anyUntil closeMark
|
|
||||||
closePos <- getPosition
|
|
||||||
void closeMark
|
|
||||||
|
|
||||||
let openTok :: LocatedToken = (openPos, openToken)
|
|
||||||
res :: LocatedToken = (tokenPos, Token content)
|
|
||||||
closeToks :: [LocatedToken] = case closeToken of
|
|
||||||
Just close -> [(closePos, close)]
|
|
||||||
Nothing -> []
|
|
||||||
|
|
||||||
pure $ [openTok, res] <> closeToks
|
|
||||||
|
|
||||||
anyUntil :: Parser a -> Parser Text
|
anyUntil :: Parser a -> Parser Text
|
||||||
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
|
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
|
||||||
|
|
||||||
delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken]
|
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
|
||||||
delimited a b c d = delimitedMaybe a b c (Just d)
|
delimitedAsTuple openP closeP =
|
||||||
|
(,,)
|
||||||
|
<$> located openP
|
||||||
|
<*> located (Token <$> anyUntil closeP)
|
||||||
|
<*> located closeP
|
||||||
|
|
||||||
|
delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
|
||||||
|
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
|
||||||
|
where
|
||||||
|
asList (a, tok, b) = [a, tok, b]
|
||||||
|
|
||||||
|
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
|
||||||
|
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
|
||||||
|
where
|
||||||
|
asList (a, tok, _) = [a, tok]
|
||||||
|
|
||||||
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
|
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
|
||||||
delimitedSymmetric s t1 t2 = delimited s s t1 t2
|
delimitedSymmetric s t1 t2 = delimited s s t1 t2
|
||||||
|
|
||||||
eol :: Parser ()
|
eol :: Parser ()
|
||||||
eol = void "\n" <|> Parsec.eof
|
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
|
||||||
|
|
||||||
header1 :: Lexer
|
header1 :: Lexer
|
||||||
header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing
|
header1 = delimitedNoTrailing "= " eol (Header One)
|
||||||
|
|
||||||
header2 :: Lexer
|
header2 :: Lexer
|
||||||
header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing
|
header2 = delimitedNoTrailing "== " eol (Header Two)
|
||||||
|
|
||||||
header3 :: Lexer
|
header3 :: Lexer
|
||||||
header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing
|
header3 = delimitedNoTrailing "=== " eol (Header Three)
|
||||||
|
|
||||||
header4 :: Lexer
|
header4 :: Lexer
|
||||||
header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing
|
header4 = delimitedNoTrailing "==== " eol (Header Four)
|
||||||
|
|
||||||
header5 :: Lexer
|
header5 :: Lexer
|
||||||
header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing
|
header5 = delimitedNoTrailing "===== " eol (Header Five)
|
||||||
|
|
||||||
header6 :: Lexer
|
header6 :: Lexer
|
||||||
header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing
|
header6 = delimitedNoTrailing "====== " eol (Header Six)
|
||||||
|
|
||||||
-- #anchors#
|
-- #anchors#
|
||||||
anchors :: Lexer
|
anchor :: Lexer
|
||||||
anchors = do
|
anchor = do
|
||||||
pos <- getPosition
|
x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
|
||||||
void $ try anchor'
|
pure [x]
|
||||||
txt <- anyUntil anchor'
|
|
||||||
void $ try anchor'
|
|
||||||
|
|
||||||
pure [(pos, Anchor txt)]
|
|
||||||
where
|
moduleNames :: Parser Text
|
||||||
anchor' = (string "#" <|> string "\\#")
|
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
|
||||||
|
|
||||||
|
upperId :: Parser String
|
||||||
|
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
|
||||||
|
|
||||||
|
identifierChar :: Parser Char
|
||||||
|
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
|
||||||
|
|
||||||
-- "Module.Name"
|
-- "Module.Name"
|
||||||
-- "Module.Name#anchor"
|
-- "Module.Name#anchor"
|
||||||
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
|
-- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
|
||||||
modules :: Lexer
|
module_ :: Lexer
|
||||||
modules = do
|
module_ = between (char '"') (char '"') inner
|
||||||
startPos <- startPosition $ char '"'
|
|
||||||
(modPos, modName) <- located modId
|
|
||||||
anch <- option [] do
|
|
||||||
anchPos <- startPosition (string "#" <|> string' "\\#")
|
|
||||||
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
|
|
||||||
pure [(anchPos, Anchor txt)]
|
|
||||||
|
|
||||||
void $ char '"'
|
|
||||||
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
|
|
||||||
where
|
where
|
||||||
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.')))
|
inner = do
|
||||||
|
m <- located $ Module <$> moduleNames
|
||||||
|
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
|
||||||
|
pure $ case mAnchor of
|
||||||
|
Just anc -> [m, anc]
|
||||||
|
Nothing -> [m]
|
||||||
|
|
||||||
conId :: Parser String
|
anchorHash :: Parser Text
|
||||||
conId =
|
anchorHash = "#" <|> try "\\#"
|
||||||
(:)
|
|
||||||
<$> satisfy (\c -> isAlpha c && isUpper c)
|
|
||||||
<*> many1 conChar
|
|
||||||
|
|
||||||
conChar :: Parser Char
|
anchorText :: Parser Text
|
||||||
conChar = satisfy (\c -> isAlphaNum c || c == '_')
|
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
|
||||||
|
|
||||||
linkRaw :: Lexer
|
linkRaw :: Lexer
|
||||||
linkRaw =
|
linkRaw =
|
||||||
|
|
@ -225,38 +217,30 @@ link :: Lexer
|
||||||
link = do
|
link = do
|
||||||
pos <- getPosition
|
pos <- getPosition
|
||||||
l <- linkRaw
|
l <- linkRaw
|
||||||
-- "unconsume" the last token
|
-- register the position of the last token
|
||||||
pos' <- flip incSourceColumn (-1) <$> getPosition
|
pos' <- flip incSourceColumn (-1) <$> getPosition
|
||||||
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
|
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
|
||||||
|
|
||||||
labeledLink :: Lexer
|
labeledLink :: Lexer
|
||||||
labeledLink = do
|
labeledLink = do
|
||||||
pos <- getPosition
|
open <- located $ LabeledLinkOpen <$ "<"
|
||||||
void $ string "<"
|
linkRes <- linkRaw
|
||||||
link' <- linkRaw
|
labelRes <- located $ Token <$> anyUntil ">"
|
||||||
pos7 <- getPosition
|
close <- located $ LabeledLinkClose <$ ">"
|
||||||
label' <- anyUntil $ string ">"
|
|
||||||
pos8 <- getPosition
|
|
||||||
void $ ">"
|
|
||||||
|
|
||||||
pure $
|
pure $
|
||||||
(pos, LabeledLinkOpen)
|
open : linkRes <> [ labelRes , close ]
|
||||||
: link'
|
|
||||||
<> [ (pos7, Token label')
|
|
||||||
, (pos8, LabeledLinkClose)
|
|
||||||
]
|
|
||||||
|
|
||||||
mathsBracket :: Lexer
|
mathMultiline :: Lexer
|
||||||
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose
|
mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
|
||||||
|
|
||||||
mathsParens :: Lexer
|
mathInline :: Lexer
|
||||||
mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose
|
mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
|
||||||
|
|
||||||
birdTrack :: Lexer
|
birdTrack :: Lexer
|
||||||
birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing
|
birdTrack = delimitedNoTrailing ">> " eol BirdTrack
|
||||||
|
|
||||||
escape :: Lexer
|
escape :: Lexer
|
||||||
escape = delimitedMaybe (void "\\") eol Escape Nothing
|
escape = delimitedNoTrailing "\\" eol Escape
|
||||||
|
|
||||||
quotes :: Lexer
|
quotes :: Lexer
|
||||||
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
|
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
|
||||||
|
|
|
||||||
35
test/Spec.hs
35
test/Spec.hs
|
|
@ -19,8 +19,8 @@ main = hspec $ do
|
||||||
describe "minimal" do
|
describe "minimal" do
|
||||||
it "handles unicode" unicode
|
it "handles unicode" unicode
|
||||||
it "escapes" escaping
|
it "escapes" escaping
|
||||||
it "maths" maths
|
it "maths" math
|
||||||
it "anchors" anchors
|
it "anchors" anchor
|
||||||
it "space chars" space
|
it "space chars" space
|
||||||
it "bare string" someString
|
it "bare string" someString
|
||||||
it "emphasis" emphatic
|
it "emphasis" emphatic
|
||||||
|
|
@ -45,19 +45,16 @@ main = hspec $ do
|
||||||
modules :: Expectation
|
modules :: Expectation
|
||||||
modules = do
|
modules = do
|
||||||
"\"MyModule.Name\""
|
"\"MyModule.Name\""
|
||||||
`shouldLexTo` [ (1, 1, Module)
|
`shouldLexTo` [ (1, 2, Module "MyModule.Name")
|
||||||
, (1, 2, Token "MyModule.Name")
|
|
||||||
]
|
]
|
||||||
|
|
||||||
"\"OtherModule.Name#myAnchor\""
|
"\"OtherModule.Name#myAnchor\""
|
||||||
`shouldLexTo` [ (1, 1, Module)
|
`shouldLexTo` [ (1, 2, Module "OtherModule.Name")
|
||||||
, (1, 2, Token "OtherModule.Name")
|
|
||||||
, (1, 18, Anchor "myAnchor")
|
, (1, 18, Anchor "myAnchor")
|
||||||
]
|
]
|
||||||
|
|
||||||
"\"OtherModule.Name\\#myAnchor\""
|
"\"OtherModule.Name\\#myAnchor\""
|
||||||
`shouldLexTo` [ (1, 1, Module)
|
`shouldLexTo` [ (1, 2, Module "OtherModule.Name")
|
||||||
, (1, 2, Token "OtherModule.Name")
|
|
||||||
, (1, 18, Anchor "myAnchor")
|
, (1, 18, Anchor "myAnchor")
|
||||||
]
|
]
|
||||||
link :: Expectation
|
link :: Expectation
|
||||||
|
|
@ -87,31 +84,35 @@ labeledLink =
|
||||||
, (1, 35, LabeledLinkClose)
|
, (1, 35, LabeledLinkClose)
|
||||||
]
|
]
|
||||||
|
|
||||||
anchors :: Expectation
|
anchor :: Expectation
|
||||||
anchors =
|
anchor =
|
||||||
"#myAnchor#"
|
"#myAnchor#"
|
||||||
`shouldLexTo` [ (1, 1, Anchor "myAnchor")
|
`shouldLexTo` [ (1, 1, Anchor "myAnchor")
|
||||||
]
|
]
|
||||||
|
|
||||||
maths :: IO ()
|
math :: IO ()
|
||||||
maths = do
|
math = do
|
||||||
"\\[some math\\]"
|
"\\[some math\\]"
|
||||||
`shouldLexTo` [ (1, 1, MathsBracketOpen)
|
`shouldLexTo` [ (1, 1, MathMultilineOpen)
|
||||||
, (1, 3, Token "some math")
|
, (1, 3, Token "some math")
|
||||||
, (1, 12, MathsBracketClose)
|
, (1, 12, MathMultilineClose)
|
||||||
]
|
]
|
||||||
"\\(other maths\\)"
|
"\\(other maths\\)"
|
||||||
`shouldLexTo` [ (1, 1, MathsParenOpen)
|
`shouldLexTo` [ (1, 1, MathInlineOpen)
|
||||||
, (1, 3, Token "other maths")
|
, (1, 3, Token "other maths")
|
||||||
, (1, 14, MathsParenClose)
|
, (1, 14, MathInlineClose)
|
||||||
]
|
]
|
||||||
|
|
||||||
escaping :: Expectation
|
escaping :: Expectation
|
||||||
escaping =
|
escaping = do
|
||||||
"\\("
|
"\\("
|
||||||
`shouldLexTo` [ (1, 1, Escape)
|
`shouldLexTo` [ (1, 1, Escape)
|
||||||
, (1, 2, Token "(")
|
, (1, 2, Token "(")
|
||||||
]
|
]
|
||||||
|
"\\(\r\n"
|
||||||
|
`shouldLexTo` [ (1, 1, Escape)
|
||||||
|
, (1, 2, Token "(")
|
||||||
|
]
|
||||||
|
|
||||||
unicode :: Expectation
|
unicode :: Expectation
|
||||||
unicode =
|
unicode =
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue