Init

2025-09-21 08:19:01 +02:00 · 2025-09-21 08:19:01 +02:00 · c9f61c4e06
commit c9f61c4e06
16 changed files with 1366 additions and 0 deletions
--- a/src/Parser.hs
+++ b/src/Parser.hs
@ -0,0 +1,228 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Parser (
+  parse,
+  parseText,
+) where
+
+import Identifier
+import ParserMonad
+import Types
+
+import Control.Applicative
+import Control.Monad
+import Data.Char qualified as Char
+import Data.Functor (($>))
+import Data.List (intercalate)
+import Data.Text (Text)
+import Data.Text qualified as Text
+import Parser.Util
+import Text.Parsec qualified as Parsec
+
+parse :: Parser a -> Text -> (ParserState, a)
+parse parser = either err id . parse' (parser <* Parsec.eof)
+ where
+  err = error . ("Haddock.Parser.parse: " ++)
+
+--------------------
+-- Markup
+--------------------
+
+{- | Skips a single special character and treats it as a plain string.
+This is done to skip over any special characters belonging to other
+elements but which were not deemed meaningful at their positions.
+-}
+skipSpecialChar :: Parser (DocMarkup mod a)
+skipSpecialChar = DocString . pure <$> Parsec.oneOf specialChar
+
+{- | Plain, regular parser for text. Called as one of the last parsers
+to ensure that we have already given a chance to more meaningful parsers
+before capturing their characters.
+-}
+string' :: Parser (DocMarkup mod a)
+string' =
+  DocString
+    -- After the first character, stop for @\(@ or @\[@ math starters. (The
+    -- first character won't start a valid math string because this parser
+    -- should follow math parsers. But this parser is expected to accept at
+    -- least one character from all inputs that don't start with special
+    -- characters, so the first character parser can't have the @"(["@
+    -- restriction.)
+    <$> ((:) <$> rawOrEscChar "" <*> many (rawOrEscChar "(["))
+ where
+  -- \| Parse a single logical character, either raw or escaped. Don't accept
+  -- escaped characters from the argument string.
+  rawOrEscChar :: [Char] -> Parser Char
+  rawOrEscChar restrictedEscapes =
+    Parsec.try $
+      Parsec.noneOf specialChar >>= \case
+        -- Handle backslashes:
+        --   - Fail on forbidden escape characters.
+        --   - Non-forbidden characters: simply unescape, e.g. parse "\b" as 'b',
+        --   - Trailing backslash: treat it as a raw backslash, not an escape
+        --     sequence. (This is the logic that this parser followed when this
+        --     comment was written; it is not necessarily intentional but now I
+        --     don't want to break anything relying on it.)
+        '\\' -> Parsec.noneOf restrictedEscapes <|> Parsec.eof $> '\\'
+        c -> pure c
+
+{- | Emphasis parser.
+
+>>> parseString "/Hello world/"
+DocEmphasis (DocString "Hello world")
+-}
+emphasis :: Parser (DocMarkup mod Identifier)
+emphasis =
+  DocEmphasis . parseAll
+    <$> disallowNewline ("/" *> takeWhile1_ (/= '/') <* "/")
+
+{- | Bold parser.
+
+>>> parseString "__Hello world__"
+DocBold (DocString "Hello world")
+-}
+bold :: Parser (DocMarkup mod Identifier)
+bold = DocBold . parseAll <$> disallowNewline ("__" *> takeUntil "__")
+
+{- | Monospaced strings.
+
+>>> parseString "@cruel@"
+DocMonospaced (DocString "cruel")
+-}
+monospace :: Parser (DocMarkup mod Identifier)
+monospace =
+  DocMonospace . parseAll
+    <$> ("@" *> takeWhile1_ (/= '@') <* "@")
+
+{- | Text anchors to allow for jumping around the generated documentation.
+
+>>> parseString "#Hello world#"
+DocAName "Hello world"
+-}
+anchor :: Parser (DocMarkup mod a)
+anchor =
+  DocAnchor . Text.unpack
+    <$> ("#" *> takeWhile1_ (\x -> x /= '#' && not (Char.isSpace x)) <* "#")
+
+-- | Parses identifiers with help of 'parseValid'.
+identifier :: Parser (DocMarkup mod Identifier)
+identifier = DocIdentifier <$> parseValid
+
+{- | Module names.
+
+Note that we allow '#' and '\' to support anchors (old style anchors are of
+the form "SomeModule\#anchor").
+-}
+moduleName :: Parser (DocMarkup mod a)
+moduleName = DocModule . flip ModuleLink Nothing <$> ("\"" *> moduleNameString <* "\"")
+
+-- | A module name, optionally with an anchor
+moduleNameString :: Parser String
+moduleNameString = moduleId `maybeFollowedBy` anchor_
+ where
+  moduleId = intercalate "." <$> conid `Parsec.sepBy1` "."
+  anchor_ =
+    (++)
+      <$> (Parsec.string "#" <|> Parsec.string "\\#")
+      <*> many (Parsec.satisfy (\c -> c /= '"' && not (Char.isSpace c)))
+
+  maybeFollowedBy pre suf = (\x -> maybe x (x ++)) <$> pre <*> optional suf
+
+  conid =
+    (:)
+      <$> Parsec.satisfy (\c -> Char.isAlpha c && Char.isUpper c)
+      <*> many conChar
+
+  conChar = Parsec.alphaNum <|> Parsec.char '_'
+
+------------------------
+-- Markup components
+------------------------
+
+{- | List of characters that we use to delimit any special markup.
+Once we have checked for any of these and tried to parse the
+relevant markup, we can assume they are used as regular text.
+-}
+specialChar :: [Char]
+specialChar = "_/<@\"&'`#[ "
+
+------------------------
+--  Helpers
+------------------------
+
+parse' :: Parser a -> Text -> Either String (ParserState, a)
+parse' parser t =
+  let parser' = (,) <$> parser <*> Parsec.getState
+   in case Parsec.runParser parser' initialParserState "<haddock>" t of
+        Left e -> Left (show e)
+        Right (x, s) -> Right (s, x)
+
+docConcat :: [DocMarkup mod id] -> DocMarkup mod id
+docConcat = foldr docAppend DocEmpty
+ where
+  -- Prevent doc append from becoming too nested
+  docAppend (DocDefinitionList ds1) (DocDefinitionList ds2) = DocDefinitionList (ds1 <> ds2)
+  docAppend (DocDefinitionList ds1) (DocAppend (DocDefinitionList ds2) d) = DocAppend (DocDefinitionList (ds1 <> ds2)) d
+  docAppend (DocOrderedList ds1) (DocOrderedList ds2) = DocOrderedList (ds1 <> ds2)
+  docAppend (DocOrderedList ds1) (DocAppend (DocOrderedList ds2) d) = DocAppend (DocOrderedList (ds1 <> ds2)) d
+  docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) = DocUnorderedList (ds1 <> ds2)
+  docAppend (DocUnorderedList ds1) (DocAppend (DocUnorderedList ds2) d) = DocAppend (DocUnorderedList (ds1 <> ds2)) d
+  docAppend DocEmpty d = d
+  docAppend d DocEmpty = d
+  docAppend (DocString s1) (DocString s2) = DocString (s1 <> s2)
+  docAppend (DocAppend d (DocString s1)) (DocString s2) = DocAppend d (DocString (s1 <> s2))
+  docAppend (DocString s1) (DocAppend (DocString s2) d) = DocAppend (DocString (s1 <> s2)) d
+  docAppend d1 d2 = DocAppend d1 d2
+
+{- | Parse a text paragraph. Actually just a wrapper over 'parseAll' which
+drops leading whitespace.
+-}
+parseText :: Text -> DocMarkup mod Identifier
+parseText = parseAll . Text.dropWhile Char.isSpace . Text.filter (/= '\r')
+
+parseAll :: Text -> DocMarkup mod Identifier
+parseAll = snd . parse myParser
+ where
+  -- docConcat
+  --   <$> many
+  --     ( choice'
+  --         [ monospace
+  --         , anchor
+  --         , identifier
+  --         , moduleName
+  --         , picture
+  --         , mathDisplay
+  --         , mathInline
+  --         , markdownImage
+  --         , markdownLink
+  --         , hyperlink
+  --         , bold
+  --         , emphasis
+  --         , encodedChar
+  --         , string'
+  --         , skipSpecialChar
+  --         ]
+  --     )
+  myParser :: Parser (DocMarkup mod Identifier)
+  myParser =
+    docConcat
+      <$> many
+        ( choice'
+            [ monospace
+            , anchor
+            , identifier
+            , moduleName
+            , bold
+            , emphasis
+            , string'
+            , skipSpecialChar
+            ]
+        )
+
+choice' :: [Parser a] -> Parser a
+choice' [] = empty
+choice' [p] = p
+choice' (p : ps) = Parsec.try p <|> choice' ps
+
+disallowNewline :: Parser Text -> Parser Text
+disallowNewline = mfilter (Text.all (/= '\n'))