haddock2/src/Parser.hs

{-# LANGUAGE OverloadedStrings #-}

module Parser (
  parse,
  parseText,
) where

import Identifier
import ParserMonad
import Types

import Control.Applicative
import Control.Monad
import Data.Char qualified as Char
import Data.Functor (($>))
import Data.List (intercalate)
import Data.Text (Text)
import Data.Text qualified as Text
import Parser.Util
import Text.Parsec qualified as Parsec

parse :: Parser a -> Text -> (ParserState, a)
parse parser = either err id . parse' (parser <* Parsec.eof)
 where
  err = error . ("Haddock.Parser.parse: " ++)

--------------------
-- Markup
--------------------

{- | Skips a single special character and treats it as a plain string.
This is done to skip over any special characters belonging to other
elements but which were not deemed meaningful at their positions.
-}
skipSpecialChar :: Parser (DocMarkup mod a)
skipSpecialChar = DocString . pure <$> Parsec.oneOf specialChar

{- | Plain, regular parser for text. Called as one of the last parsers
to ensure that we have already given a chance to more meaningful parsers
before capturing their characters.
-}
string' :: Parser (DocMarkup mod a)
string' =
  DocString
    -- After the first character, stop for @\(@ or @\[@ math starters. (The
    -- first character won't start a valid math string because this parser
    -- should follow math parsers. But this parser is expected to accept at
    -- least one character from all inputs that don't start with special
    -- characters, so the first character parser can't have the @"(["@
    -- restriction.)
    <$> ((:) <$> rawOrEscChar "" <*> many (rawOrEscChar "(["))
 where
  -- \| Parse a single logical character, either raw or escaped. Don't accept
  -- escaped characters from the argument string.
  rawOrEscChar :: [Char] -> Parser Char
  rawOrEscChar restrictedEscapes =
    Parsec.try $
      Parsec.noneOf specialChar >>= \case
        -- Handle backslashes:
        --   - Fail on forbidden escape characters.
        --   - Non-forbidden characters: simply unescape, e.g. parse "\b" as 'b',
        --   - Trailing backslash: treat it as a raw backslash, not an escape
        --     sequence. (This is the logic that this parser followed when this
        --     comment was written; it is not necessarily intentional but now I
        --     don't want to break anything relying on it.)
        '\\' -> Parsec.noneOf restrictedEscapes <|> Parsec.eof $> '\\'
        c -> pure c

{- | Emphasis parser.

>>> parseString "/Hello world/"
DocEmphasis (DocString "Hello world")
-}
emphasis :: Parser (DocMarkup mod Identifier)
emphasis =
  DocEmphasis . parseAll
    <$> disallowNewline ("/" *> takeWhile1_ (/= '/') <* "/")

{- | Bold parser.

>>> parseString "__Hello world__"
DocBold (DocString "Hello world")
-}
bold :: Parser (DocMarkup mod Identifier)
bold = DocBold . parseAll <$> disallowNewline ("__" *> takeUntil "__")

{- | Monospaced strings.

>>> parseString "@cruel@"
DocMonospaced (DocString "cruel")
-}
monospace :: Parser (DocMarkup mod Identifier)
monospace =
  DocMonospace . parseAll
    <$> ("@" *> takeWhile1_ (/= '@') <* "@")

{- | Text anchors to allow for jumping around the generated documentation.

>>> parseString "#Hello world#"
DocAName "Hello world"
-}
anchor :: Parser (DocMarkup mod a)
anchor =
  DocAnchor . Text.unpack
    <$> ("#" *> takeWhile1_ (\x -> x /= '#' && not (Char.isSpace x)) <* "#")

-- | Parses identifiers with help of 'parseValid'.
identifier :: Parser (DocMarkup mod Identifier)
identifier = DocIdentifier <$> parseValid

{- | Module names.

Note that we allow '#' and '\' to support anchors (old style anchors are of
the form "SomeModule\#anchor").
-}
moduleName :: Parser (DocMarkup mod a)
moduleName = DocModule . flip ModuleLink Nothing <$> ("\"" *> moduleNameString <* "\"")

-- | A module name, optionally with an anchor
moduleNameString :: Parser String
moduleNameString = moduleId `maybeFollowedBy` anchor_
 where
  moduleId = intercalate "." <$> conid `Parsec.sepBy1` "."
  anchor_ =
    (++)
      <$> (Parsec.string "#" <|> Parsec.string "\\#")
      <*> many (Parsec.satisfy (\c -> c /= '"' && not (Char.isSpace c)))

  maybeFollowedBy pre suf = (\x -> maybe x (x ++)) <$> pre <*> optional suf

  conid =
    (:)
      <$> Parsec.satisfy (\c -> Char.isAlpha c && Char.isUpper c)
      <*> many conChar

  conChar = Parsec.alphaNum <|> Parsec.char '_'

------------------------
-- Markup components
------------------------

{- | List of characters that we use to delimit any special markup.
Once we have checked for any of these and tried to parse the
relevant markup, we can assume they are used as regular text.
-}
specialChar :: [Char]
specialChar = "_/<@\"&'`#[ "

------------------------
--  Helpers
------------------------

parse' :: Parser a -> Text -> Either String (ParserState, a)
parse' parser t =
  let parser' = (,) <$> parser <*> Parsec.getState
   in case Parsec.runParser parser' initialParserState "<haddock>" t of
        Left e -> Left (show e)
        Right (x, s) -> Right (s, x)

docConcat :: [DocMarkup mod id] -> DocMarkup mod id
docConcat = foldr docAppend DocEmpty
 where
  -- Prevent doc append from becoming too nested
  docAppend (DocDefinitionList ds1) (DocDefinitionList ds2) = DocDefinitionList (ds1 <> ds2)
  docAppend (DocDefinitionList ds1) (DocAppend (DocDefinitionList ds2) d) = DocAppend (DocDefinitionList (ds1 <> ds2)) d
  docAppend (DocOrderedList ds1) (DocOrderedList ds2) = DocOrderedList (ds1 <> ds2)
  docAppend (DocOrderedList ds1) (DocAppend (DocOrderedList ds2) d) = DocAppend (DocOrderedList (ds1 <> ds2)) d
  docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) = DocUnorderedList (ds1 <> ds2)
  docAppend (DocUnorderedList ds1) (DocAppend (DocUnorderedList ds2) d) = DocAppend (DocUnorderedList (ds1 <> ds2)) d
  docAppend DocEmpty d = d
  docAppend d DocEmpty = d
  docAppend (DocString s1) (DocString s2) = DocString (s1 <> s2)
  docAppend (DocAppend d (DocString s1)) (DocString s2) = DocAppend d (DocString (s1 <> s2))
  docAppend (DocString s1) (DocAppend (DocString s2) d) = DocAppend (DocString (s1 <> s2)) d
  docAppend d1 d2 = DocAppend d1 d2

{- | Parse a text paragraph. Actually just a wrapper over 'parseAll' which
drops leading whitespace.
-}
parseText :: Text -> DocMarkup mod Identifier
parseText = parseAll . Text.dropWhile Char.isSpace . Text.filter (/= '\r')

parseAll :: Text -> DocMarkup mod Identifier
parseAll = snd . parse myParser
 where
  -- docConcat
  --   <$> many
  --     ( choice'
  --         [ monospace
  --         , anchor
  --         , identifier
  --         , moduleName
  --         , picture
  --         , mathDisplay
  --         , mathInline
  --         , markdownImage
  --         , markdownLink
  --         , hyperlink
  --         , bold
  --         , emphasis
  --         , encodedChar
  --         , string'
  --         , skipSpecialChar
  --         ]
  --     )
  myParser :: Parser (DocMarkup mod Identifier)
  myParser =
    docConcat
      <$> many
        ( choice'
            [ monospace
            , anchor
            , identifier
            , moduleName
            , bold
            , emphasis
            , string'
            , skipSpecialChar
            ]
        )

choice' :: [Parser a] -> Parser a
choice' [] = empty
choice' [p] = p
choice' (p : ps) = Parsec.try p <|> choice' ps

disallowNewline :: Parser Text -> Parser Text
disallowNewline = mfilter (Text.all (/= '\n'))