Compare commits

...

7 commits

Author SHA1 Message Date
6c0b4a4288
doc(lexer): explain the use of incSourceColumn
I think it is clearer to phrase it this way so it is clear that we are
not unconsuming (i.e. changing the state of the parser).
2025-09-24 21:25:46 +08:00
75c4817166
style(lexer): pluralize moduleNames parser 2025-09-24 21:25:46 +08:00
7ceb9b0277
ref(moduleName): break into multiple smaller functions
upperId has been changed to only use isUpper because an non alphabetical
character would be false anyway
2025-09-24 21:25:46 +08:00
368e5bc9a0
ref(lexer): simplify anchor 2025-09-24 21:25:46 +08:00
d6087ec3d6
ref(lexer): simplify delimited logic 2025-09-24 21:25:46 +08:00
fdb9070e99
fix(lexer): handle crlf in newline
Do we support windows 🤔
2025-09-24 21:25:46 +08:00
39cfe2035d test: freeze the callstack for better failure messages (#1)
This would make the function `shouldLexTo` be transparent in the error reporting! Before all errors would point to this line.

Reviewed-on: #1
Co-authored-by: Léana 江 <leana.jiang+git@icloud.com>
Co-committed-by: Léana 江 <leana.jiang+git@icloud.com>
2025-09-24 11:01:52 +00:00
2 changed files with 65 additions and 66 deletions

View file

@ -16,6 +16,7 @@ import Text.Parsec
import Text.Parsec qualified as Parsec import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar) import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token) type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken] type Lexer = Parser [LocatedToken]
@ -55,7 +56,7 @@ data Token
| MathsBracketOpen | MathsBracketOpen
| MathsBracketClose | MathsBracketClose
| NumericEntity Int | NumericEntity Int
| Module | Module Text
| QuoteOpen | QuoteOpen
| QuoteClose | QuoteClose
| Space | Space
@ -125,90 +126,85 @@ headers =
, header6 , header6
] ]
delimitedMaybe :: Parser a -> Parser a -> Token -> Maybe Token -> Parser [LocatedToken]
delimitedMaybe openMark closeMark openToken closeToken = do
openPos <- getPosition
void openMark
tokenPos <- getPosition
content <- anyUntil closeMark
closePos <- getPosition
void closeMark
let openTok :: LocatedToken = (openPos, openToken)
res :: LocatedToken = (tokenPos, Token content)
closeToks :: [LocatedToken] = case closeToken of
Just close -> [(closePos, close)]
Nothing -> []
pure $ [openTok, res] <> closeToks
anyUntil :: Parser a -> Parser Text anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p) anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimited :: Parser a -> Parser a -> Token -> Token -> Parser [LocatedToken] delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimited a b c d = delimitedMaybe a b c (Just d) delimitedAsTuple openP closeP =
(,,)
<$> located openP
<*> located (Token <$> anyUntil closeP)
<*> located closeP
delimited :: Parser a -> Parser b -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
where
asList (a, tok, b) = [a, tok, b]
delimitedNoTrailing :: Parser a -> Parser b -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok = asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
where
asList (a, tok, _) = [a, tok]
delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken] delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s t1 t2 = delimited s s t1 t2 delimitedSymmetric s t1 t2 = delimited s s t1 t2
eol :: Parser () eol :: Parser ()
eol = void "\n" <|> Parsec.eof eol = void "\n" <|> void "\r\n" <|> Parsec.eof
anchorHash :: Parser Text
anchorHash = "#" <|> try "\\#"
header1 :: Lexer header1 :: Lexer
header1 = delimitedMaybe (void $ "= ") eol (Header One) Nothing header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer header2 :: Lexer
header2 = delimitedMaybe (void $ "== ") eol (Header Two) Nothing header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer header3 :: Lexer
header3 = delimitedMaybe (void $ "=== ") eol (Header Three) Nothing header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer header4 :: Lexer
header4 = delimitedMaybe (void $ "==== ") eol (Header Four) Nothing header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer header5 :: Lexer
header5 = delimitedMaybe (void $ "===== ") eol (Header Five) Nothing header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer header6 :: Lexer
header6 = delimitedMaybe (void $ "====== ") eol (Header Six) Nothing header6 = delimitedNoTrailing "====== " eol (Header Six)
-- #anchors# -- #anchors#
anchors :: Lexer anchors :: Lexer
anchors = do anchors =
pos <- getPosition tokenise
void $ try anchor' [ between anchorHash anchorHash (Anchor <$> anyUntil anchorHash)
txt <- anyUntil anchor' ]
void $ try anchor'
pure [(pos, Anchor txt)]
where moduleNames :: Parser Text
anchor' = (string "#" <|> string "\\#") moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name" -- "Module.Name"
-- "Module.Name#anchor" -- "Module.Name#anchor"
-- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben -- "Module.Name\#anchor" -- this has been deprecated for 9 years, thanks Ben
modules :: Lexer modules :: Lexer
modules = do modules = between (char '"') (char '"') inner
startPos <- startPosition $ char '"'
(modPos, modName) <- located modId
anch <- option [] do
anchPos <- startPosition (string "#" <|> string' "\\#")
txt <- Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
pure [(anchPos, Anchor txt)]
void $ char '"'
pure $ [(startPos, Module), (modPos, Token modName)] <> anch
where where
modId = intercalate "." <$> (fmap Text.pack <$> (conId `sepBy1` (char '.'))) inner = do
module_ <- located $ Module <$> moduleNames
mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
pure $ case mAnchor of
Just anchor -> [module_, anchor]
Nothing -> [module_]
conId :: Parser String anchorText :: Parser Text
conId = anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
(:)
<$> satisfy (\c -> isAlpha c && isUpper c)
<*> many1 conChar
conChar :: Parser Char
conChar = satisfy (\c -> isAlphaNum c || c == '_')
linkRaw :: Lexer linkRaw :: Lexer
linkRaw = linkRaw =
@ -225,7 +221,7 @@ link :: Lexer
link = do link = do
pos <- getPosition pos <- getPosition
l <- linkRaw l <- linkRaw
-- "unconsume" the last token -- register the position of the last token
pos' <- flip incSourceColumn (-1) <$> getPosition pos' <- flip incSourceColumn (-1) <$> getPosition
pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)] pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
@ -247,16 +243,16 @@ labeledLink = do
] ]
mathsBracket :: Lexer mathsBracket :: Lexer
mathsBracket = delimited (void $ "\\[") (void "\\]") MathsBracketOpen MathsBracketClose mathsBracket = delimited "\\[" "\\]" MathsBracketOpen MathsBracketClose
mathsParens :: Lexer mathsParens :: Lexer
mathsParens = delimited (void $ "\\(") (void "\\)") MathsParenOpen MathsParenClose mathsParens = delimited "\\(" "\\)" MathsParenOpen MathsParenClose
birdTrack :: Lexer birdTrack :: Lexer
birdTrack = delimitedMaybe (void ">> ") eol BirdTrack Nothing birdTrack = delimitedNoTrailing ">> " eol BirdTrack
escape :: Lexer escape :: Lexer
escape = delimitedMaybe (void "\\") eol Escape Nothing escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose

View file

@ -11,6 +11,7 @@ import Types
import Data.String (IsString (..)) import Data.String (IsString (..))
import Data.Text (Text) import Data.Text (Text)
import Text.Parsec.Pos import Text.Parsec.Pos
import GHC.Stack
main :: IO () main :: IO ()
main = hspec $ do main = hspec $ do
@ -44,19 +45,16 @@ main = hspec $ do
modules :: Expectation modules :: Expectation
modules = do modules = do
"\"MyModule.Name\"" "\"MyModule.Name\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "MyModule.Name")
, (1, 2, Token "MyModule.Name")
] ]
"\"OtherModule.Name#myAnchor\"" "\"OtherModule.Name#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor") , (1, 18, Anchor "myAnchor")
] ]
"\"OtherModule.Name\\#myAnchor\"" "\"OtherModule.Name\\#myAnchor\""
`shouldLexTo` [ (1, 1, Module) `shouldLexTo` [ (1, 2, Module "OtherModule.Name")
, (1, 2, Token "OtherModule.Name")
, (1, 18, Anchor "myAnchor") , (1, 18, Anchor "myAnchor")
] ]
link :: Expectation link :: Expectation
@ -106,11 +104,15 @@ maths = do
] ]
escaping :: Expectation escaping :: Expectation
escaping = escaping = do
"\\(" "\\("
`shouldLexTo` [ (1, 1, Escape) `shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(") , (1, 2, Token "(")
] ]
"\\(\r\n"
`shouldLexTo` [ (1, 1, Escape)
, (1, 2, Token "(")
]
unicode :: Expectation unicode :: Expectation
unicode = unicode =
@ -185,6 +187,7 @@ instance IsString (Doc String) where
shouldLexTo :: String -> [(Int, Int, Token)] -> Expectation shouldLexTo :: String -> [(Int, Int, Token)] -> Expectation
shouldLexTo input expected = shouldLexTo input expected =
withFrozenCallStack $
case lexer input of case lexer input of
Right tokens -> do Right tokens -> do
let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens let actual = map (\(pos, tok) -> (sourceLine pos, sourceColumn pos, tok)) tokens