haddock2/src/Lexer.hs

{-# LANGUAGE OverloadedStrings #-}

module Lexer (
  Token (..),
  lexer,
  emphasis,
)
where

import Control.Monad (mfilter, void)
import Data.Char (ord, toLower)
import Data.Functor (($>))
import Data.Text (Text, intercalate)
import Data.Text qualified as Text
import GHC.Unicode (isAlphaNum, isControl, isDigit, isPrint, isSpace, isUpper)
import ParserMonad (Parser, initialParserState)
import Text.Parsec
import Text.Parsec qualified as Parsec
import Text.Parsec.Pos (updatePosChar)
type Located a = (SourcePos, a)
type LocatedToken = (SourcePos, Token)
type Lexer = Parser [LocatedToken]

data Level
  = One
  | Two
  | Three
  | Four
  | Five
  | Six
  deriving (Eq, Show)

data Token
  = Token Text
  | Anchor Text
  | BirdTrack
  | BoldOpen
  | BoldClose
  | Escape
  | EmphasisOpen
  | EmphasisClose
  | Expression
  | Header Level
  | MonospaceOpen
  | MonospaceClose
  | Newline
  | LinkOpen
  | LinkClose
  | LabeledLinkOpen
  | LabeledLinkClose
  | ParenOpen
  | ParenClose
  | BracketOpen
  | BracketClose
  | MathInlineOpen
  | MathInlineClose
  | MathMultilineOpen
  | MathMultilineClose
  | NumericEntity Int
  | Module Text
  | QuoteOpen
  | QuoteClose
  | Space
  | EOF
  deriving (Eq, Show)

located :: Parser a -> Parser (SourcePos, a)
located p = (,) <$> getPosition <*> p
tokenise :: [Parser a] -> Parser [(SourcePos, a)]
tokenise = mapM located
lexer :: String -> Either ParseError [LocatedToken]
lexer = Parsec.runParser lexText initialParserState "input" . Text.pack
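
-- A minimal usage sketch (e.g. from GHCi or a test); the 'SourcePos' values in
-- the result depend on the input and are elided here:
--
--   case lexer "/emphasised/ text" of
--     Left err -> print err
--     Right toks -> mapM_ print toks
--     -- tokens, roughly: EmphasisOpen, Token "emphasised", EmphasisClose, Space, Token "text"
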
lexText :: Parser [LocatedToken]
lexText = go
  where
    go = do
      Parsec.optionMaybe Parsec.eof >>= \case
        Just _ -> pure []
        Nothing -> do
          toks <- topLevel
          rest <- go
          pure (toks <> rest)

    {- FOURMOLU_DISABLE -}
    topLevel =
      -- Backtracking here so we always get a chance to try "other", the catch-all
      -- choice that leaves unrecognised input for the parser to deal with.
      -- TODO: is this desirable? do we throw a lexer error at all?
      try
        ( choice
            -- Sorted by
            -- - longest to shortest parse path
            -- - highest to lowest expected frequency (for performance?)
            -- - more exact to more freeform (the latter can subsume the former, but not vice versa)
            [ spaceToken
            , newlineToken
            , try module_
            , quotes
            , try expression
            , birdTrack
            -- the next three start with "\"
            , try mathMultiline
            , try mathInline
            , escape
            , headers
            , labeledLink
            , link
            , anchor
            , numericEntity
            , textElement
            ]
        )
        <|> other
    {- FOURMOLU_ENABLE -}

-- Tokens
textElement :: Parser [LocatedToken]
textElement =
  choice $
    Parsec.try
      <$> [ emphasis
          , bold
          , monospace
          ]

headers :: Parser [LocatedToken]
headers =
  choice $
    Parsec.try
      <$> [ header1
          , header2
          , header3
          , header4
          , header5
          , header6
          ]

anyUntil :: Parser a -> Parser Text
anyUntil p = Text.pack <$> manyTill anyChar (lookAhead p)
delimitedAsTuple :: Parser open -> Parser close -> Parser (Located open, LocatedToken, Located close)
delimitedAsTuple openP closeP =
  (,,)
    <$> located openP
    <*> located (Token <$> anyUntil closeP)
    <*> located closeP

delimited :: Parser open -> Parser close -> Token -> Token -> Parser [LocatedToken]
delimited openP closeP openTok closeTok = asList <$> delimitedAsTuple (openTok <$ openP) (closeTok <$ closeP)
  where
    asList (a, tok, b) = [a, tok, b]

-- | Like 'delimited', but the closing delimiter is consumed without producing a token.
delimitedNoTrailing :: Parser open -> Parser close -> Token -> Parser [LocatedToken]
delimitedNoTrailing openP closeP openTok =
  asList <$> delimitedAsTuple (openTok <$ openP) (void closeP)
  where
    asList (a, tok, _) = [a, tok]

delimitedSymmetric :: Parser a -> Token -> Token -> Parser [LocatedToken]
delimitedSymmetric s = delimited s s

--- End of line // end of file
eol :: Parser ()
eol = void "\n" <|> void "\r\n" <|> Parsec.eof
-- Start of line // start of file
sol :: Parser ()
sol = do
  p <- getPosition
  if sourceColumn p == 1
    then pure ()
    else fail "Not at start of line/document"
header1 :: Lexer
header1 = delimitedNoTrailing "= " eol (Header One)
header2 :: Lexer
header2 = delimitedNoTrailing "== " eol (Header Two)
header3 :: Lexer
header3 = delimitedNoTrailing "=== " eol (Header Three)
header4 :: Lexer
header4 = delimitedNoTrailing "==== " eol (Header Four)
header5 :: Lexer
header5 = delimitedNoTrailing "===== " eol (Header Five)
header6 :: Lexer
header6 = delimitedNoTrailing "====== " eol (Header Six)
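
-- For example, the header lexers above produce (a sketch; positions elided,
-- and the terminating newline is consumed by 'eol' without producing a token):
--
--   "== Some title\n"  ~>  [Header Two, Token "Some title"]
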
-- #anchors#
anchor :: Lexer
anchor = do
  x <- located $ between "#" "#" (Anchor <$> anyUntil "#")
  pure [x]
moduleNames :: Parser Text
moduleNames = intercalate "." . fmap Text.pack <$> upperId `sepBy1` char '.'
upperId :: Parser String
upperId = (:) <$> satisfy isUpper <*> many1 identifierChar
identifierChar :: Parser Char
identifierChar = satisfy (\c -> isAlphaNum c || c == '_')
-- "Module.Name"
-- "Module.Name#anchor"
-- "Module.Name\#anchor" -- known as "old anchor". this has been deprecated for 9 years, thanks Ben
module_ :: Lexer
module_ = between (char '"') (char '"') inner
  where
    inner = do
      m <- located $ Module <$> moduleNames
      mAnchor <- optionMaybe (located $ anchorHash *> (Anchor <$> anchorText))
      pure $ case mAnchor of
        Just anc -> [m, anc]
        Nothing -> [m]
anchorHash :: Parser Text
anchorHash = "#" <|> try "\\#"
anchorText :: Parser Text
anchorText = Text.pack <$> many (satisfy (\c -> c /= '"' && not (isSpace c)))
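
-- For example, 'module_' above produces (a sketch; positions elided):
--
--   "\"Data.Text\""         ~>  [Module "Data.Text"]
--   "\"Data.Text#anchor\""  ~>  [Module "Data.Text", Anchor "anchor"]
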
linkRaw :: Lexer
linkRaw =
  tokenise
    [ BracketOpen <$ "["
    , Token <$> anyUntil "]"
    , BracketClose <$ "]"
    , ParenOpen <$ "("
    , Token <$> anyUntil ")"
    , ParenClose <$ ")"
    ]
link :: Lexer
link = do
  pos <- getPosition
  l <- linkRaw
  -- take the position of the closing delimiter (one column back from the current position)
  pos' <- flip incSourceColumn (-1) <$> getPosition
  pure $ (pos, LinkOpen) : l <> [(pos', LinkClose)]
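
-- For example, 'link' above produces (a sketch; positions elided):
--
--   "[label](http://example.com)"
--     ~>  [ LinkOpen
--         , BracketOpen, Token "label", BracketClose
--         , ParenOpen, Token "http://example.com", ParenClose
--         , LinkClose
--         ]
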
labeledLink :: Lexer
labeledLink = do
  open <- located $ LabeledLinkOpen <$ "<"
  linkRes <- linkRaw
  labelRes <- located $ Token <$> anyUntil ">"
  close <- located $ LabeledLinkClose <$ ">"
  pure $
    open : linkRes <> [labelRes, close]
mathMultiline :: Lexer
mathMultiline = delimited "\\[" "\\]" MathMultilineOpen MathMultilineClose
mathInline :: Lexer
mathInline = delimited "\\(" "\\)" MathInlineOpen MathInlineClose
-- Must start at the beginning of a line (enforced by 'sol').
birdTrack :: Lexer
birdTrack = delimitedNoTrailing (sol <* "> ") eol BirdTrack
-- TODO: also match following lines iff:
-- they start with alphanum
-- they're not empty
expression :: Lexer
expression = delimitedNoTrailing (sol <* ">>> ") eol Expression
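
-- For example, 'birdTrack' and 'expression' above produce (a sketch; positions
-- elided, and the trailing newline is consumed by 'eol' without producing a token):
--
--   "> example code\n"  ~>  [BirdTrack, Token "example code"]
--   ">>> 1 + 1\n"        ~>  [Expression, Token "1 + 1"]
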
escape :: Lexer
escape = delimitedNoTrailing "\\" eol Escape
quotes :: Lexer
quotes = delimitedSymmetric "\"" QuoteOpen QuoteClose
emphasis :: Lexer
emphasis = delimitedSymmetric "/" EmphasisOpen EmphasisClose
bold :: Lexer
bold = delimitedSymmetric "__" BoldOpen BoldClose
monospace :: Lexer
monospace = delimitedSymmetric "@" MonospaceOpen MonospaceClose
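
-- For example, the emphasis/bold/monospace lexers above produce (a sketch; positions elided):
--
--   "__bold__"  ~>  [BoldOpen, Token "bold", BoldClose]
--   "@code@"    ~>  [MonospaceOpen, Token "code", MonospaceClose]
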
decimal :: Parser Int
decimal = read . Text.unpack <$> takeWhile1_ isDigit
hexadecimal :: Parser Int
hexadecimal = "x" *> (convert 0 . fmap (normalise . toLower) <$> many1 hexDigit)
  where
    normalise :: Char -> Int
    normalise c
      | ord '0' <= n && n <= ord '9' = n - ord '0'
      | ord 'A' <= n && n <= ord 'F' = n - ord 'A' + 10
      | ord 'a' <= n && n <= ord 'f' = n - ord 'a' + 10
      | otherwise = error "unexpected: invalid hex number"
      where
        n = ord c

    -- fold the hex digit values into a single number, most significant digit first
    convert :: Int -> [Int] -> Int
    convert acc [] = acc
    convert acc (x : xs) = convert (acc * 16 + x) xs
numericEntity :: Lexer
numericEntity = do
  x <- located $ between "&#" ";" (NumericEntity <$> (hexadecimal <|> decimal))
  pure [x]
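
-- For example, 'numericEntity' above produces (a sketch; positions elided):
--
--   "&#65;"    ~>  [NumericEntity 65]
--   "&#x3BB;"  ~>  [NumericEntity 955]
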
other :: Lexer
other = do
  pos <- getPosition
  c <- takeWhile1_ isUnicodeAlphaNum
  pure . pure $ (pos, Token c)
  where
    -- despite the name, this accepts any printable, non-space character
    isUnicodeAlphaNum c = isPrint c && not (isControl c) && not (isSpace c)
spaceToken :: Lexer
spaceToken = do
  pos <- getPosition
  _ <- many1 (char ' ')
  pure . pure $ (pos, Space)

newlineToken :: Lexer
newlineToken = do
  pos <- getPosition
  _ <- newline
  pure . pure $ (pos, Newline)
-------
-- Helpers
-------
-- | Like 'takeWhile', but unconditionally take escaped characters.
takeWhile_ :: (Char -> Bool) -> Parser Text
takeWhile_ p = scan p_ False
  where
    p_ escaped c
      | escaped = Just False
      | not $ p c = Nothing
      | otherwise = Just (c == '\\')
-- | Like 'takeWhile1', but unconditionally take escaped characters.
takeWhile1_ :: (Char -> Bool) -> Parser Text
takeWhile1_ = mfilter (not . Text.null) . takeWhile_
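
-- For example (a sketch), a backslash keeps the following character even when
-- the predicate rejects it:
--
--   takeWhile_ (/= '/') applied to "a\\/b/c" (the characters a \ / b / c)
--   consumes "a\\/b" and leaves "/c" unconsumed
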
{- | Scan the input text, accumulating characters for as long as the scanning
function returns a new state ('Just'); 'Nothing' ends the scan.
-}
scan ::
  -- | scan function
  (state -> Char -> Maybe state) ->
  -- | initial state
  state ->
  Parser Text
scan f initState = do
  parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState
  (remaining, finalPos, ct) <- go input initState pos 0
  let newState = parserState{stateInput = remaining, statePos = finalPos}
  Parsec.setParserState newState $> Text.take ct input
  where
    go !input' !st !posAccum !count' = case Text.uncons input' of
      Nothing -> pure (input', posAccum, count')
      Just (char', input'') -> case f st char' of
        Nothing -> pure (input', posAccum, count')
        Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1)
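
-- With a stateless scan function this degenerates to a plain takeWhile, e.g.
-- (a sketch):
--
--   scan (\() c -> if isDigit c then Just () else Nothing) ()
--
-- consumes a possibly empty run of digits.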