haddock2/src/Identifier.hs

module Identifier (
  Identifier (..),
  Namespace (..),
  parseValid,
) where

import Data.Text qualified as Text
import Text.Parsec qualified as Parsec

import Control.Monad (guard)
import Data.Char (isAlpha, isAlphaNum)
import Data.Functor (($>))
import Data.Maybe (listToMaybe, maybeToList)
import Data.Text (Text)
import ParserMonad
import Text.Parsec (State (..))
import Text.Parsec.Pos (updatePosChar)
import Text.Read.Lex (isSymbolChar)

-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks.
data Identifier = Identifier !Namespace !Char String !Char
  deriving (Show, Eq)

-- | The namespace qualification for an identifier.
data Namespace = Value | Type | None deriving (Eq, Ord, Enum, Show)

parseValid :: Parser Identifier
parseValid = do
  state@State{stateInput = input, statePos = pos} <- Parsec.getParserState

  case takeIdentifier input of
    Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier"
    Just (namespace, op, ident, cl, input') ->
      let posOp = updatePosChar pos op
          posIdent = Text.foldl updatePosChar posOp ident
          posCl = updatePosChar posIdent cl
          newState = state{stateInput = input', statePos = posCl}
       in Parsec.setParserState newState $> Identifier namespace op (Text.unpack ident) cl

{- | Try to parse a delimited identifier off the front of the given input.

This tries to match as many valid Haskell identifiers/operators as possible,
to the point of sometimes accepting invalid things (ex: keywords). Some
considerations:

  - operators and identifiers can have module qualifications
  - operators can be wrapped in parens (for prefix)
  - identifiers can be wrapped in backticks (for infix)
  - delimiters are backticks or regular ticks
  - since regular ticks are also valid in identifiers, we opt for the
    longest successful parse

This function should make /O(1)/ allocations
-}
takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text)
takeIdentifier input = listToMaybe $ do
  -- Optional namespace
  let (namespace, input') = case Text.uncons input of
        Just ('v', i) -> (Value, i)
        Just ('t', i) -> (Type, i)
        _ -> (None, input)
  -- Opening tick
  (op, input'') <- maybeToList (Text.uncons input')
  guard (op == '\'' || op == '`')

  -- Identifier/operator
  (ident, input''') <- wrapped input''

  -- Closing tick
  (cl, input'''') <- maybeToList (Text.uncons input''')
  guard (cl == '\'' || cl == '`')

  pure (namespace, op, ident, cl, input'''')
 where
  -- \| Parse out a wrapped, possibly qualified, operator or identifier
  wrapped t = do
    (c, t') <- maybeToList (Text.uncons t)
    -- Tuples
    case c of
      '('
        | Just (c', _) <- Text.uncons t'
        , c' == ',' || c' == ')' ->
            do
              let (commas, t'') = Text.span (== ',') t'
              (')', t''') <- maybeToList (Text.uncons t'')
              pure (Text.take (Text.length commas + 2) t, t''')

      -- Parenthesized
      '(' -> do
        (n, t'') <- general False 0 [] t'
        (')', t''') <- maybeToList (Text.uncons t'')
        pure (Text.take (n + 2) t, t''')

      -- Backticked
      '`' -> do
        (n, t'') <- general False 0 [] t'
        ('`', t''') <- maybeToList (Text.uncons t'')
        pure (Text.take (n + 2) t, t''')

      -- Unadorned
      _ -> do
        (n, t'') <- general False 0 [] t
        pure (Text.take n t, t'')

  -- \| Parse out a possibly qualified operator or identifier
  general ::
    Bool ->
    -- \^ refuse inputs starting with operators
    Int ->
    -- \^ total characters \"consumed\" so far
    [(Int, Text)] ->
    -- \^ accumulated results
    Text ->
    -- \^ current input
    [(Int, Text)]
  -- \^ total characters parsed & what remains
  general !identOnly !i acc t
    -- Starts with an identifier (either just an identifier, or a module qual)
    | Just (n, rest) <- identLike t =
        if Text.null rest
          then acc
          else case Text.head rest of
            '`' -> (n + i, rest) : acc
            ')' -> (n + i, rest) : acc
            '.' -> general False (n + i + 1) acc (Text.tail rest)
            '\'' ->
              let (m, rest') = quotes rest
               in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (Text.tail rest')
            _ -> acc
    -- An operator
    | Just (n, rest) <- optr t
    , not identOnly =
        (n + i, rest) : acc
    -- Anything else
    | otherwise =
        acc

  -- \| Parse an identifier off the front of the input
  identLike t
    | Text.null t = Nothing
    | isAlpha (Text.head t) || '_' == Text.head t =
        let !(idt, rest) = Text.span (\c -> isAlphaNum c || c == '_') t
            !(octos, rest') = Text.span (== '#') rest
         in Just (Text.length idt + Text.length octos, rest')
    | otherwise = Nothing

  -- \| Parse all but the last quote off the front of the input
  -- PRECONDITION: T.head t `elem` ['\'', '`']
  quotes :: Text -> (Int, Text)
  quotes t =
    let !n = Text.length (Text.takeWhile (`elem` ['\'', '`']) t) - 1
     in (n, Text.drop n t)

  -- \| Parse an operator off the front of the input
  optr t =
    let !(op, rest) = Text.span isSymbolChar t
     in if Text.null op then Nothing else Just (Text.length op, rest)