commit c9f61c4e0610d606949a01a948539df63921a138 Author: Igor Ranieri Date: Sun Sep 21 08:19:01 2025 +0200 Init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..48a004c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +dist-newstyle diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ab4e1dc --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for haddock2 + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/Grammar.ebnf b/Grammar.ebnf new file mode 100644 index 0000000..bff331d --- /dev/null +++ b/Grammar.ebnf @@ -0,0 +1,88 @@ +document ::= paragraph* + +paragraph ::= ( text_paragraph | code_block | header | list | table ) newline? + +text_paragraph ::= text_element+ + +text_element ::= emphasis | bold | monospace | link | anchor | identifier | + inline_math | image | plain_text | escaped_char + +emphasis ::= '/' text_no_newline '/' +bold ::= '__' text_no_newline '__' +monospace ::= '@' text_content '@' + +link ::= module_link | hyperlink | markdown_link +module_link ::= '"' module_name ( '#' anchor_name )? '"' +hyperlink ::= '<' url ( ' ' link_text )? '>' +markdown_link ::= '[' link_text '](' ( url | module_link ) ')' + +anchor ::= '#' anchor_name '#' +identifier ::= "'" haskell_id "'" + +inline_math ::= '\(' math_expr '\)' +image ::= '<<' image_path ( ' ' image_title )? '>>' | + '![' alt_text '](' image_path ')' + +code_block ::= at_block | bird_tracks | example_block | property_block +at_block ::= '@' newline code_content newline '@' +bird_tracks ::= ( '>' ' '? code_line newline )+ +example_block ::= ( '>>>' ' ' expression newline result_line* )+ +property_block ::= 'prop>' ' ' property_desc newline + +header ::= header_marker ' ' header_text newline +header_marker ::= '=' | '==' | '===' | '====' | '=====' | '======' + +list ::= unordered_list | ordered_list | definition_list +unordered_list ::= ( list_marker ' ' list_content )+ +list_marker ::= '*' | '-' +ordered_list ::= ( number_marker ' ' list_content )+ +number_marker ::= digit+ '.' | '(' digit+ ')' +definition_list ::= ( '[' term ']' ':'? ' ' definition_content )+ + +table ::= table_border header_row header_sep data_row* table_border +table_border ::= '+' ( '-' | '+' )* newline +header_row ::= '|' ( table_cell '|' )* newline +header_sep ::= '+' ( '=' | '+' )* newline +data_row ::= '|' ( table_cell '|' )* newline + +plain_text ::= text_char+ +text_char ::= letter | digit | ' ' | punctuation +text_no_newline ::= ( letter | digit | ' ' | safe_punctuation )+ +text_content ::= ( letter | digit | ' ' | newline | punctuation )* +code_content ::= code_char* +code_char ::= letter | digit | ' ' | newline | punctuation +code_line ::= ( letter | digit | ' ' | punctuation )* +list_content ::= text_element* ( newline ' ' text_element* )? +table_cell ::= ' ' table_char* ' ' +table_char ::= letter | digit | ' ' | safe_punctuation +header_text ::= ( letter | digit | ' ' | punctuation )+ +link_text ::= link_char+ +link_char ::= letter | digit | ' ' | safe_punctuation +alt_text ::= alt_char+ +alt_char ::= letter | digit | ' ' | safe_punctuation +image_path ::= ( letter | digit | '.' | '/' | '-' | '_' )+ +image_title ::= image_title_char+ +image_title_char ::= letter | digit | ' ' | safe_punctuation +anchor_name ::= letter ( letter | digit | '-' )* +haskell_id ::= letter ( letter | digit | '_' | "'" )* +module_name ::= haskell_id ( '.' haskell_id )* +url ::= ( 'http://' | 'https://' | 'ftp://' ) url_char+ +url_char ::= letter | digit | '/' | '.' | ':' | '-' | '_' | '?' | '&' | '=' +math_expr ::= math_char+ +math_char ::= letter | digit | ' ' | math_punctuation +property_desc ::= ( letter | digit | ' ' | punctuation )+ +expression ::= ( letter | digit | ' ' | punctuation )+ +result_line ::= result_char* newline +result_char ::= letter | digit | ' ' | safe_punctuation +term ::= term_char+ +term_char ::= letter | digit | ' ' | safe_punctuation +definition_content ::= text_element+ +escaped_char ::= '\' special_char + +special_char ::= '/' | '*' | '@' | "'" | '"' | '#' | '<' | '>' | '[' | ']' | '(' | ')' | '|' | '=' | '-' | '+' +safe_punctuation ::= '!' | '$' | '%' | '^' | '&' | '(' | ')' | '_' | '+' | '{' | '}' | ';' | ':' | ',' | '.' | '`' | '~' +math_punctuation ::= '+' | '-' | '*' | '/' | '^' | '_' | '=' | '(' | ')' | '{' | '}' | '[' | ']' +punctuation ::= '!' | '@' | '#' | '$' | '%' | '^' | '&' | '*' | '(' | ')' | '-' | '_' | '+' | '=' | '{' | '}' | '[' | ']' | '|' | '\' | ':' | ';' | '"' | "'" | '<' | '>' | '?' | ',' | '.' | '/' | '`' | '~' +letter ::= 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' | 'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' +digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' +newline ::= #xA | #xD #xA | #xD diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2225119 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +Copyright (c) 2025, Igor Ranieri + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/app/Main.hs b/app/Main.hs new file mode 100644 index 0000000..65ae4a0 --- /dev/null +++ b/app/Main.hs @@ -0,0 +1,4 @@ +module Main where + +main :: IO () +main = putStrLn "Hello, Haskell!" diff --git a/cabal.project.local b/cabal.project.local new file mode 100644 index 0000000..8909668 --- /dev/null +++ b/cabal.project.local @@ -0,0 +1 @@ +tests: True diff --git a/haddock2.cabal b/haddock2.cabal new file mode 100644 index 0000000..9ba514f --- /dev/null +++ b/haddock2.cabal @@ -0,0 +1,55 @@ +cabal-version: 3.4 +name: haddock2 +version: 0.1.0.0 +license: BSD-3-Clause +license-file: LICENSE +author: Igor Ranieri +maintainer: igor@elland.me +build-type: Simple +extra-doc-files: CHANGELOG.md + +common common + ghc-options: -Wall + default-extensions: + BlockArguments + DuplicateRecordFields + NoFieldSelectors + +executable haddock2 + import: common + main-is: Main.hs + build-depends: base >=4.20.1.0 + hs-source-dirs: app + default-language: GHC2024 + +library haddock2-lib + import: common + build-depends: + base >=4.20.1.0, + parsec ^>=3.1.18.0, + text ^>=2.1.2, + + hs-source-dirs: src + -- cabal-gild: discover src + exposed-modules: + Identifier + Lexer + Parser + Parser.Util + ParserMonad + Types + + default-language: GHC2024 + +test-suite haddock2-test + import: common + type: exitcode-stdio-1.0 + main-is: Spec.hs + build-depends: + base >=4.20.1.0, + haddock2:{haddock2-lib}, + hspec ^>=2.11.0, + text ^>=2.1.2, + + hs-source-dirs: test + default-language: GHC2024 diff --git a/markup.md b/markup.md new file mode 100644 index 0000000..48efafe --- /dev/null +++ b/markup.md @@ -0,0 +1,198 @@ +# Haddock Markup Language Examples + +## Text Formatting + +### Emphasis +``` +/emphasized text/ +``` +Single line only, no newlines allowed. + +### Bold +``` +__bold text__ +``` +Single line only, no newlines allowed. + +### Monospace/Code +``` +@monospace text@ +``` +Can span multiple lines. + +## Links and References + +### Module Links +``` +"Module.Name" +"Module.Name#anchor" +"Module.Name\#anchor" +``` + +### Hyperlinks +``` + + +``` +Auto-detected URLs: +``` +http://example.com +https://example.com +ftp://example.com +``` + +### Markdown-style Links +``` +[link text](http://example.com) +[link text]("Module.Name") +``` + +### Anchors +``` +#anchor-name# +``` +No spaces allowed in anchor names. + +### Identifiers +``` +'identifier' +``` +Links to Haskell identifiers. + +## Images + +### Basic Images +``` +<> +<> +``` + +### Markdown Images +``` +![alt text](image.png) +``` + +## Math + +### Inline Math +``` +\(mathematical expression\) +``` +Single line only. + +### Display Math +``` +\[mathematical expression\] +``` +Can span multiple lines. + +## Code and Examples + +### Code Blocks +``` +@ +code block content +with multiple lines +@ +``` + +### Bird Tracks (Code) +``` +> code line 1 +> code line 2 +``` +Each line starts with `>` followed by optional space. + +### Examples +``` +>>> expression +result line 1 +result line 2 + +>>> another expression +result +``` + +### Properties +``` +prop> property description +``` + +## Lists + +### Unordered Lists +``` +* item 1 +* item 2 + continued content + +- item 1 +- item 2 +``` + +### Ordered Lists +``` +1. item 1 +2. item 2 + +(1) item 1 +(2) item 2 +``` + +### Definition Lists +``` +[term] definition content +[another term] more definition content +``` +Optional colon after closing bracket. + +## Tables + +### Grid Tables +``` ++----------+----------+ +| Header 1 | Header 2 | ++==========+==========+ +| Cell 1 | Cell 2 | ++----------+----------+ +| Cell 3 | Cell 4 | ++----------+----------+ +``` +- First row determines table width +- Header separator uses `=` characters +- Regular separators use `-` +- Edges can be `+` or `|` + +## Headers +``` += Level 1 Header +== Level 2 Header +=== Level 3 Header +==== Level 4 Header +===== Level 5 Header +====== Level 6 Header +``` +Up to 6 levels deep. + +## Special Elements + +### Since Annotations +``` +@since package-name-1.2.3 +@since 1.2.3 +``` + +### Numeric Character References +``` +A (decimal) +A (hexadecimal) +A (hexadecimal) +``` + +## Escaping +Use backslash `\` to escape special characters. Trailing backslash without following character is treated as literal backslash. + +## Structure +- Paragraphs separated by blank lines +- 4-space indentation for nested content +- Whitespace handling varies by context diff --git a/src/Identifier.hs b/src/Identifier.hs new file mode 100644 index 0000000..ca1f3bc --- /dev/null +++ b/src/Identifier.hs @@ -0,0 +1,157 @@ +module Identifier ( + Identifier (..), + Namespace (..), + parseValid, +) where + +import Data.Text qualified as Text +import Text.Parsec qualified as Parsec + +import Control.Monad (guard) +import Data.Char (isAlpha, isAlphaNum) +import Data.Functor (($>)) +import Data.Maybe (listToMaybe, maybeToList) +import Data.Text (Text) +import ParserMonad +import Text.Parsec (State (..)) +import Text.Parsec.Pos (updatePosChar) +import Text.Read.Lex (isSymbolChar) + +-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks. +data Identifier = Identifier !Namespace !Char String !Char + deriving (Show, Eq) + +-- | The namespace qualification for an identifier. +data Namespace = Value | Type | None deriving (Eq, Ord, Enum, Show) + +parseValid :: Parser Identifier +parseValid = do + state@State{stateInput = input, statePos = pos} <- Parsec.getParserState + + case takeIdentifier input of + Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier" + Just (namespace, op, ident, cl, input') -> + let posOp = updatePosChar pos op + posIdent = Text.foldl updatePosChar posOp ident + posCl = updatePosChar posIdent cl + newState = state{stateInput = input', statePos = posCl} + in Parsec.setParserState newState $> Identifier namespace op (Text.unpack ident) cl + +{- | Try to parse a delimited identifier off the front of the given input. + +This tries to match as many valid Haskell identifiers/operators as possible, +to the point of sometimes accepting invalid things (ex: keywords). Some +considerations: + + - operators and identifiers can have module qualifications + - operators can be wrapped in parens (for prefix) + - identifiers can be wrapped in backticks (for infix) + - delimiters are backticks or regular ticks + - since regular ticks are also valid in identifiers, we opt for the + longest successful parse + +This function should make /O(1)/ allocations +-} +takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text) +takeIdentifier input = listToMaybe $ do + -- Optional namespace + let (namespace, input') = case Text.uncons input of + Just ('v', i) -> (Value, i) + Just ('t', i) -> (Type, i) + _ -> (None, input) + -- Opening tick + (op, input'') <- maybeToList (Text.uncons input') + guard (op == '\'' || op == '`') + + -- Identifier/operator + (ident, input''') <- wrapped input'' + + -- Closing tick + (cl, input'''') <- maybeToList (Text.uncons input''') + guard (cl == '\'' || cl == '`') + + pure (namespace, op, ident, cl, input'''') + where + -- \| Parse out a wrapped, possibly qualified, operator or identifier + wrapped t = do + (c, t') <- maybeToList (Text.uncons t) + -- Tuples + case c of + '(' + | Just (c', _) <- Text.uncons t' + , c' == ',' || c' == ')' -> + do + let (commas, t'') = Text.span (== ',') t' + (')', t''') <- maybeToList (Text.uncons t'') + pure (Text.take (Text.length commas + 2) t, t''') + + -- Parenthesized + '(' -> do + (n, t'') <- general False 0 [] t' + (')', t''') <- maybeToList (Text.uncons t'') + pure (Text.take (n + 2) t, t''') + + -- Backticked + '`' -> do + (n, t'') <- general False 0 [] t' + ('`', t''') <- maybeToList (Text.uncons t'') + pure (Text.take (n + 2) t, t''') + + -- Unadorned + _ -> do + (n, t'') <- general False 0 [] t + pure (Text.take n t, t'') + + -- \| Parse out a possibly qualified operator or identifier + general :: + Bool -> + -- \^ refuse inputs starting with operators + Int -> + -- \^ total characters \"consumed\" so far + [(Int, Text)] -> + -- \^ accumulated results + Text -> + -- \^ current input + [(Int, Text)] + -- \^ total characters parsed & what remains + general !identOnly !i acc t + -- Starts with an identifier (either just an identifier, or a module qual) + | Just (n, rest) <- identLike t = + if Text.null rest + then acc + else case Text.head rest of + '`' -> (n + i, rest) : acc + ')' -> (n + i, rest) : acc + '.' -> general False (n + i + 1) acc (Text.tail rest) + '\'' -> + let (m, rest') = quotes rest + in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (Text.tail rest') + _ -> acc + -- An operator + | Just (n, rest) <- optr t + , not identOnly = + (n + i, rest) : acc + -- Anything else + | otherwise = + acc + + -- \| Parse an identifier off the front of the input + identLike t + | Text.null t = Nothing + | isAlpha (Text.head t) || '_' == Text.head t = + let !(idt, rest) = Text.span (\c -> isAlphaNum c || c == '_') t + !(octos, rest') = Text.span (== '#') rest + in Just (Text.length idt + Text.length octos, rest') + | otherwise = Nothing + + -- \| Parse all but the last quote off the front of the input + -- PRECONDITION: T.head t `elem` ['\'', '`'] + quotes :: Text -> (Int, Text) + quotes t = + let !n = Text.length (Text.takeWhile (`elem` ['\'', '`']) t) - 1 + in (n, Text.drop n t) + + -- \| Parse an operator off the front of the input + optr t = + let !(op, rest) = Text.span isSymbolChar t + in if Text.null op then Nothing else Just (Text.length op, rest) diff --git a/src/Lexer.hs b/src/Lexer.hs new file mode 100644 index 0000000..2029ad6 --- /dev/null +++ b/src/Lexer.hs @@ -0,0 +1,190 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Lexer ( + Token (..), + lexer, + emphasis, +) where + +import Control.Monad (mfilter) +import Data.Char (isAlphaNum, isPrint) +import Data.Functor (($>)) +import Data.Text (Text) +import Data.Text qualified as Text +import GHC.Stack (HasCallStack) +import ParserMonad (Parser, initialParserState) +import Text.Parsec +import Text.Parsec qualified as Parsec +import Text.Parsec.Pos (updatePosChar) + +type LocatedToken = (SourcePos, Token) + +type Lexer = Parser [LocatedToken] + +data Token + = Token Text + | Anchor + | AngleOpen + | AngleClose + | BoldOpen + | BoldClose + | BracketOpen + | BracketClose + | EmphasisOpen + | EmphasisClose + | MonospaceOpen + | MonospaceClose + | Newline + | ParenOpen + | ParenClose + | QuoteOpen + | QuoteClose + | Space + | EOF + deriving (Eq, Show) + +lexer :: String -> Either ParseError [LocatedToken] +lexer = Parsec.runParser lexText initialParserState "input" . Text.pack + +lexText :: (HasCallStack) => Parser [LocatedToken] +lexText = go + where + go = do + Parsec.optionMaybe Parsec.eof >>= \case + Just _ -> pure [] + Nothing -> do + toks <- + choice + [ newlineToken + , spaceToken + , textElement + , identifier + , other + ] + rest <- go + pure (toks <> rest) + +match :: Parser a -> Parser (Text, a) +match p = do + input <- getInput + result <- p + input' <- getInput + let !consumed = Text.take (Text.length input - Text.length input') input + pure (consumed, result) + +-- Tokens + +textElement :: Parser [LocatedToken] +textElement = + choice $ + Parsec.try + <$> [ emphasis + , bold + , monospace + , parens + , brackets + , angles + ] + +delimited :: String -> String -> Token -> Token -> Parser [LocatedToken] +delimited c1 c2 ot ct = do + pos <- getPosition + (_, content) <- match $ between op cl any' + + innerToks <- case lexer $ Text.unpack content of + Left _ -> do + pos' <- getPosition + pure $ [(pos', Token content)] + Right toks -> pure toks + + let openTok :: LocatedToken = (pos, ot) + closeTok :: LocatedToken = (pos, ct) + + pure $ openTok : innerToks <> [closeTok] + where + op = string c1 + cl = string c2 + any' = Text.pack <$> manyTill anyChar (lookAhead cl) + +delimited' :: String -> Token -> Token -> Parser [LocatedToken] +delimited' s t1 t2 = delimited s s t1 t2 + +emphasis :: Lexer +emphasis = delimited' "/" EmphasisOpen EmphasisClose + +bold :: Lexer +bold = delimited' "__" BoldOpen BoldClose + +monospace :: Lexer +monospace = delimited' "@" MonospaceOpen MonospaceClose + +parens :: Parser [LocatedToken] +parens = delimited "(" ")" ParenOpen ParenClose + +brackets :: Lexer +brackets = delimited "[" "]" ParenOpen ParenClose + +angles :: Parser [LocatedToken] +angles = delimited "<" ">" AngleOpen AngleClose + +other :: Lexer +other = do + pos <- getPosition + c <- takeWhile1_ isPrint + pure . pure $ (pos, Token c) + +spaceToken :: Lexer +spaceToken = do + pos <- getPosition + _ <- many1 (char ' ') + pure . pure $ (pos, Space) + +newlineToken :: Lexer +newlineToken = do + pos <- getPosition + _ <- newline + pure . pure $ (pos, Newline) + +identifier :: Lexer +identifier = do + pos <- getPosition + txt <- takeWhile1_ isAlphaNum + pure . pure $ (pos, Token txt) + +------- +-- Helpers +------- + +-- | Like `takeWhile`, but unconditionally take escaped characters. +takeWhile_ :: (Char -> Bool) -> Parser Text +takeWhile_ p = scan p_ False + where + p_ escaped c + | escaped = Just False + | not $ p c = Nothing + | otherwise = Just (c == '\\') + +-- | Like 'takeWhile1', but unconditionally take escaped characters. +takeWhile1_ :: (Char -> Bool) -> Parser Text +takeWhile1_ = mfilter (not . Text.null) . takeWhile_ + +{- | Scan the input text, accumulating characters as long as the scanning +function returns true. +-} +scan :: + -- | scan function + (state -> Char -> Maybe state) -> + -- | initial state + state -> + Parser Text +scan f initState = do + parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState + (remaining, finalPos, ct) <- go input initState pos 0 + let newState = parserState{stateInput = remaining, statePos = finalPos} + Parsec.setParserState newState $> Text.take ct input + where + go !input' !st !posAccum !count' = case Text.uncons input' of + Nothing -> pure (input', posAccum, count') + Just (char', input'') -> case f st char' of + Nothing -> pure (input', posAccum, count') + Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1) diff --git a/src/Parser.hs b/src/Parser.hs new file mode 100644 index 0000000..927c918 --- /dev/null +++ b/src/Parser.hs @@ -0,0 +1,228 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Parser ( + parse, + parseText, +) where + +import Identifier +import ParserMonad +import Types + +import Control.Applicative +import Control.Monad +import Data.Char qualified as Char +import Data.Functor (($>)) +import Data.List (intercalate) +import Data.Text (Text) +import Data.Text qualified as Text +import Parser.Util +import Text.Parsec qualified as Parsec + +parse :: Parser a -> Text -> (ParserState, a) +parse parser = either err id . parse' (parser <* Parsec.eof) + where + err = error . ("Haddock.Parser.parse: " ++) + +-------------------- +-- Markup +-------------------- + +{- | Skips a single special character and treats it as a plain string. +This is done to skip over any special characters belonging to other +elements but which were not deemed meaningful at their positions. +-} +skipSpecialChar :: Parser (DocMarkup mod a) +skipSpecialChar = DocString . pure <$> Parsec.oneOf specialChar + +{- | Plain, regular parser for text. Called as one of the last parsers +to ensure that we have already given a chance to more meaningful parsers +before capturing their characters. +-} +string' :: Parser (DocMarkup mod a) +string' = + DocString + -- After the first character, stop for @\(@ or @\[@ math starters. (The + -- first character won't start a valid math string because this parser + -- should follow math parsers. But this parser is expected to accept at + -- least one character from all inputs that don't start with special + -- characters, so the first character parser can't have the @"(["@ + -- restriction.) + <$> ((:) <$> rawOrEscChar "" <*> many (rawOrEscChar "([")) + where + -- \| Parse a single logical character, either raw or escaped. Don't accept + -- escaped characters from the argument string. + rawOrEscChar :: [Char] -> Parser Char + rawOrEscChar restrictedEscapes = + Parsec.try $ + Parsec.noneOf specialChar >>= \case + -- Handle backslashes: + -- - Fail on forbidden escape characters. + -- - Non-forbidden characters: simply unescape, e.g. parse "\b" as 'b', + -- - Trailing backslash: treat it as a raw backslash, not an escape + -- sequence. (This is the logic that this parser followed when this + -- comment was written; it is not necessarily intentional but now I + -- don't want to break anything relying on it.) + '\\' -> Parsec.noneOf restrictedEscapes <|> Parsec.eof $> '\\' + c -> pure c + +{- | Emphasis parser. + +>>> parseString "/Hello world/" +DocEmphasis (DocString "Hello world") +-} +emphasis :: Parser (DocMarkup mod Identifier) +emphasis = + DocEmphasis . parseAll + <$> disallowNewline ("/" *> takeWhile1_ (/= '/') <* "/") + +{- | Bold parser. + +>>> parseString "__Hello world__" +DocBold (DocString "Hello world") +-} +bold :: Parser (DocMarkup mod Identifier) +bold = DocBold . parseAll <$> disallowNewline ("__" *> takeUntil "__") + +{- | Monospaced strings. + +>>> parseString "@cruel@" +DocMonospaced (DocString "cruel") +-} +monospace :: Parser (DocMarkup mod Identifier) +monospace = + DocMonospace . parseAll + <$> ("@" *> takeWhile1_ (/= '@') <* "@") + +{- | Text anchors to allow for jumping around the generated documentation. + +>>> parseString "#Hello world#" +DocAName "Hello world" +-} +anchor :: Parser (DocMarkup mod a) +anchor = + DocAnchor . Text.unpack + <$> ("#" *> takeWhile1_ (\x -> x /= '#' && not (Char.isSpace x)) <* "#") + +-- | Parses identifiers with help of 'parseValid'. +identifier :: Parser (DocMarkup mod Identifier) +identifier = DocIdentifier <$> parseValid + +{- | Module names. + +Note that we allow '#' and '\' to support anchors (old style anchors are of +the form "SomeModule\#anchor"). +-} +moduleName :: Parser (DocMarkup mod a) +moduleName = DocModule . flip ModuleLink Nothing <$> ("\"" *> moduleNameString <* "\"") + +-- | A module name, optionally with an anchor +moduleNameString :: Parser String +moduleNameString = moduleId `maybeFollowedBy` anchor_ + where + moduleId = intercalate "." <$> conid `Parsec.sepBy1` "." + anchor_ = + (++) + <$> (Parsec.string "#" <|> Parsec.string "\\#") + <*> many (Parsec.satisfy (\c -> c /= '"' && not (Char.isSpace c))) + + maybeFollowedBy pre suf = (\x -> maybe x (x ++)) <$> pre <*> optional suf + + conid = + (:) + <$> Parsec.satisfy (\c -> Char.isAlpha c && Char.isUpper c) + <*> many conChar + + conChar = Parsec.alphaNum <|> Parsec.char '_' + +------------------------ +-- Markup components +------------------------ + +{- | List of characters that we use to delimit any special markup. +Once we have checked for any of these and tried to parse the +relevant markup, we can assume they are used as regular text. +-} +specialChar :: [Char] +specialChar = "_/<@\"&'`#[ " + +------------------------ +-- Helpers +------------------------ + +parse' :: Parser a -> Text -> Either String (ParserState, a) +parse' parser t = + let parser' = (,) <$> parser <*> Parsec.getState + in case Parsec.runParser parser' initialParserState "" t of + Left e -> Left (show e) + Right (x, s) -> Right (s, x) + +docConcat :: [DocMarkup mod id] -> DocMarkup mod id +docConcat = foldr docAppend DocEmpty + where + -- Prevent doc append from becoming too nested + docAppend (DocDefinitionList ds1) (DocDefinitionList ds2) = DocDefinitionList (ds1 <> ds2) + docAppend (DocDefinitionList ds1) (DocAppend (DocDefinitionList ds2) d) = DocAppend (DocDefinitionList (ds1 <> ds2)) d + docAppend (DocOrderedList ds1) (DocOrderedList ds2) = DocOrderedList (ds1 <> ds2) + docAppend (DocOrderedList ds1) (DocAppend (DocOrderedList ds2) d) = DocAppend (DocOrderedList (ds1 <> ds2)) d + docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) = DocUnorderedList (ds1 <> ds2) + docAppend (DocUnorderedList ds1) (DocAppend (DocUnorderedList ds2) d) = DocAppend (DocUnorderedList (ds1 <> ds2)) d + docAppend DocEmpty d = d + docAppend d DocEmpty = d + docAppend (DocString s1) (DocString s2) = DocString (s1 <> s2) + docAppend (DocAppend d (DocString s1)) (DocString s2) = DocAppend d (DocString (s1 <> s2)) + docAppend (DocString s1) (DocAppend (DocString s2) d) = DocAppend (DocString (s1 <> s2)) d + docAppend d1 d2 = DocAppend d1 d2 + +{- | Parse a text paragraph. Actually just a wrapper over 'parseAll' which +drops leading whitespace. +-} +parseText :: Text -> DocMarkup mod Identifier +parseText = parseAll . Text.dropWhile Char.isSpace . Text.filter (/= '\r') + +parseAll :: Text -> DocMarkup mod Identifier +parseAll = snd . parse myParser + where + -- docConcat + -- <$> many + -- ( choice' + -- [ monospace + -- , anchor + -- , identifier + -- , moduleName + -- , picture + -- , mathDisplay + -- , mathInline + -- , markdownImage + -- , markdownLink + -- , hyperlink + -- , bold + -- , emphasis + -- , encodedChar + -- , string' + -- , skipSpecialChar + -- ] + -- ) + myParser :: Parser (DocMarkup mod Identifier) + myParser = + docConcat + <$> many + ( choice' + [ monospace + , anchor + , identifier + , moduleName + , bold + , emphasis + , string' + , skipSpecialChar + ] + ) + +choice' :: [Parser a] -> Parser a +choice' [] = empty +choice' [p] = p +choice' (p : ps) = Parsec.try p <|> choice' ps + +disallowNewline :: Parser Text -> Parser Text +disallowNewline = mfilter (Text.all (/= '\n')) diff --git a/src/Parser/Util.hs b/src/Parser/Util.hs new file mode 100644 index 0000000..4cf96cb --- /dev/null +++ b/src/Parser/Util.hs @@ -0,0 +1,65 @@ +module Parser.Util where + +import Control.Monad (mfilter) +import Data.Functor (($>)) +import Data.Text (Text) +import Data.Text qualified as Text +import ParserMonad (Parser) +import Text.Parsec (State (..)) +import Text.Parsec qualified as Parsec +import Text.Parsec.Pos (updatePosChar) + +{- | Consume characters from the input up to and including the given pattern. +Return everything consumed except for the end pattern itself. +-} +takeUntil :: Text -> Parser Text +takeUntil end_ = Text.dropEnd (Text.length end_) <$> requireEnd (scan p (False, end)) >>= gotSome + where + end = Text.unpack end_ + + p :: (Bool, String) -> Char -> Maybe (Bool, String) + p acc c = case acc of + (True, _) -> Just (False, end) + (_, []) -> Nothing + (_, x : xs) | x == c -> Just (False, xs) + _ -> Just (c == '\\', end) + + requireEnd = mfilter (Text.isSuffixOf end_) + + gotSome xs + | Text.null xs = fail "didn't get any content" + | otherwise = return xs + +-- | Like `takeWhile`, but unconditionally take escaped characters. +takeWhile_ :: (Char -> Bool) -> Parser Text +takeWhile_ p = scan p_ False + where + p_ escaped c + | escaped = Just False + | not $ p c = Nothing + | otherwise = Just (c == '\\') + +-- | Like 'takeWhile1', but unconditionally take escaped characters. +takeWhile1_ :: (Char -> Bool) -> Parser Text +takeWhile1_ = mfilter (not . Text.null) . takeWhile_ + +{- | Scan the input text, accumulating characters as long as the scanning +function returns true. +-} +scan :: + -- | scan function + (state -> Char -> Maybe state) -> + -- | initial state + state -> + Parser Text +scan f initState = do + parserState@State{stateInput = input, statePos = pos} <- Parsec.getParserState + (remaining, finalPos, ct) <- go input initState pos 0 + let newState = parserState{stateInput = remaining, statePos = finalPos} + Parsec.setParserState newState $> Text.take ct input + where + go !input' !st !posAccum !count' = case Text.uncons input' of + Nothing -> pure (input', posAccum, count') + Just (char', input'') -> case f st char' of + Nothing -> pure (input', posAccum, count') + Just st' -> go input'' st' (updatePosChar posAccum char') (count' + 1) diff --git a/src/ParserMonad.hs b/src/ParserMonad.hs new file mode 100644 index 0000000..a76e3b8 --- /dev/null +++ b/src/ParserMonad.hs @@ -0,0 +1,26 @@ +module ParserMonad where + +import Data.String +import Data.Text (Text) +import Data.Text qualified as Text +import Text.Parsec (Parsec) +import Text.Parsec qualified as Parsec + +import Types + +type Parser = Parsec Text ParserState + +instance (a ~ Text) => IsString (Parser a) where + fromString = fmap Text.pack . Parsec.string + +{- | The only bit of information we really care about trudging along with us +through parsing is the version attached to a @\@since@ annotation - if +the doc even contained one. +-} +newtype ParserState = ParserState + { since :: Maybe Since + } + deriving (Eq, Show) + +initialParserState :: ParserState +initialParserState = ParserState Nothing diff --git a/src/Types.hs b/src/Types.hs new file mode 100644 index 0000000..615c401 --- /dev/null +++ b/src/Types.hs @@ -0,0 +1,196 @@ +module Types ( + DocMarkup (..), + Document (..), + Meta (..), + ModuleLink (..), + Package, + Since (..), + Version, +) +where + +newtype Document = Document + { meta :: Meta + } + deriving (Eq, Show) + +newtype Meta = Meta + { since :: Maybe Since + } + deriving (Eq, Show) + +data Since = Since + { package :: Maybe Package + -- ^ optional package qualification + , version :: Version + } + deriving (Eq, Show) + +-- Could have a better type? +type Version = [Int] +type Package = String + +data DocMarkup mod id + = DocEmpty + | -- | This is not represented in the markup language, this is for internal use + DocAppend (DocMarkup mod id) (DocMarkup mod id) + | -- | Any text that doesn't match any rules is a bare string + DocString String + | -- | Paragraphs are demarcated by blank lines + DocParagraph (DocMarkup mod id) + | -- | A haskell identifier + DocIdentifier id + | -- | A qualified identifier that couldn't be resolved. + DocIdentifierUnchecked + | -- | A link to a module, might include a label + DocModule (ModuleLink (DocMarkup mod id)) + | -- | Emphasis /italics/ + DocEmphasis (DocMarkup mod id) + | -- | Monospaced @source code@ + DocMonospace (DocMarkup mod id) + | -- | Bold __bold text__ + DocBold (DocMarkup mod id) + | {- | Unordered lists + * this + or + - this + -} + DocUnorderedList [DocMarkup mod id] + | {- | Ordered lists + 1. this + or + (1) this + -} + DocOrderedList [(Int, DocMarkup mod id)] + | {- | Definition lists + [term] a term + [another term] another definition + -} + DocDefinitionList [(DocMarkup mod id, DocMarkup mod id)] + | {- | Code blocks + @ + a code block in here + with multiple lines + @ + + Or with bird tracks: + > some code + > goes here + -} + DocCodeBlock (DocMarkup mod id) + | {- | Hyperlinks + __marked__: + + + __Auto-detected URLs__: + http://example.com + https://example.com + ftp://example.com + __Markdown style__ + [link text](http://example.com) + [link text]("Module.Name") + -} + DocHyperlink (Hyperlink (DocMarkup mod id)) + | {- | Pictures + <> + <> + + __Markdown Images__ + + ![alt text](image.png) + -} + DocPicture Picture + | {- | Inline math expressions + \(mathematical expression\) + -} + DocMathInline String + | {- | Math multiline display + \[ + mathematical expression + in multiple lines + \] + -} + DocMathDisplay String + | {- | Anchors, no spaces allowed + #anchor-name# + -} + DocAnchor String + | {- | Property descriptions + prop> property description + -} + DocProperty String + | {- | Examples + >>> expression + result line 1 + result line 2 + -} + DocExamples [Example] + | -- | Header + DocHeader (Header (DocMarkup mod id)) + | -- Table + DocTable (Table (DocMarkup mod id)) + deriving (Eq, Show) + +instance Semigroup (DocMarkup mod id) where + (<>) = DocAppend + +instance Monoid (DocMarkup mod id) where + mempty = DocEmpty + mconcat = foldr (<>) mempty + +data ModuleLink id = ModuleLink + { name :: String + , label :: Maybe id + } + deriving (Eq, Show) + +data Picture = Picture + { uri :: String + , title :: Maybe String + } + deriving (Eq, Show) + +data Hyperlink id = Hyperlink + { url :: String + , label :: Maybe id + } + deriving (Eq, Show, Functor, Foldable, Traversable) + +data TableCell id = TableCell + { col :: Int + , row :: Int + , content :: id + } + deriving (Eq, Show, Functor, Foldable, Traversable) + +newtype TableRow id = TableRow + { rows :: [TableCell id] + } + deriving (Eq, Show, Functor, Foldable, Traversable) + +data Table id = Table + { headerRows :: [TableRow id] + , bodyRows :: [TableRow id] + } + deriving (Eq, Show, Functor, Foldable, Traversable) + +data Example = Example + { exampleExpression :: String + , exampleResult :: [String] + } + deriving (Eq, Show) + +data Header id = Header + { level :: HeaderLevel + , title :: id + } + deriving (Eq, Show) + +data HeaderLevel + = H1 + | H2 + | H3 + | H4 + | H5 + | H6 + deriving (Eq, Show, Bounded, Enum) diff --git a/test/Spec.hs b/test/Spec.hs new file mode 100644 index 0000000..21ff624 --- /dev/null +++ b/test/Spec.hs @@ -0,0 +1,34 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# OPTIONS_GHC -Wno-orphans #-} + +import Test.Hspec + +import Data.String (IsString (..)) +import Data.Text (Text) + +import Identifier (Identifier) +import Lexer +import Parser +import Types + +main :: IO () +main = hspec $ do + describe "Lexer" do + it "lexes" do + lexer "This is string" `shouldBe` undefined + describe "Parser" do + it "Bold" do + "__bold__" `shouldParseTo` (DocBold (DocString "bold")) + it "Emphasis" do + "/emphasis/" `shouldParseTo` (DocEmphasis (DocString "emphasis")) + +shouldParseTo :: Text -> DocMarkup mod Identifier -> Expectation +shouldParseTo input ast = parseText input `shouldBe` ast + +type Doc id = DocMarkup () id + +instance IsString (Doc String) where + fromString = DocString + +file :: IO String +file = readFile "test/markup.md" diff --git a/test/markup.md b/test/markup.md new file mode 100644 index 0000000..befd3ed --- /dev/null +++ b/test/markup.md @@ -0,0 +1,89 @@ +/emphasized text/ + +__bold text__ + +@monospace text@ + +"Module.Name" +"Module.Name#anchor" +"Module.Name#anchor" + +[http://example.com](http://example.com) +<[http://example.com](http://example.com) label text> + +[http://example.com](http://example.com) +[https://example.com](https://example.com) +ftp\://example.com + +[link text](http://example.com) +[link text]("Module.Name") + +#anchor-name# + +'identifier' + +<> +<> + +![alt text](image.png) + +\(mathematical expression\) +\[mathematical expression\] + +@ +code block content +with multiple lines +@ + +> code line 1 +> code line 2 + +>>> expression +result line 1 +result line 2 + +>>> another expression +result + +prop> property description + +* item 1 +* item 2 + continued content + +- item 1 +- item 2 + +1. item 1 +2. item 2 + +(1) item 1 +(2) item 2 + +[term] definition content +[another term] more definition content + ++----------+----------+ +| Header 1 | Header 2 | ++==========+==========+ +| Cell 1 | Cell 2 | ++----------+----------+ +| Cell 3 | Cell 4 | ++----------+----------+ + += Level 1 Header +== Level 2 Header +=== Level 3 Header +==== Level 4 Header +===== Level 5 Header +====== Level 6 Header + +@since package-name-1.2.3 +@since 1.2.3 + +A (decimal) +A (hexadecimal) +A (hexadecimal) + + +This \@escapes\@ at signs