Init

2025-09-21 08:19:01 +02:00 · 2025-09-21 08:19:01 +02:00 · c9f61c4e06
commit c9f61c4e06
16 changed files with 1366 additions and 0 deletions
--- a/src/Identifier.hs
+++ b/src/Identifier.hs
@ -0,0 +1,157 @@
+module Identifier (
+  Identifier (..),
+  Namespace (..),
+  parseValid,
+) where
+
+import Data.Text qualified as Text
+import Text.Parsec qualified as Parsec
+
+import Control.Monad (guard)
+import Data.Char (isAlpha, isAlphaNum)
+import Data.Functor (($>))
+import Data.Maybe (listToMaybe, maybeToList)
+import Data.Text (Text)
+import ParserMonad
+import Text.Parsec (State (..))
+import Text.Parsec.Pos (updatePosChar)
+import Text.Read.Lex (isSymbolChar)
+
+-- | Identifier string surrounded with namespace, opening, and closing quotes/backticks.
+data Identifier = Identifier !Namespace !Char String !Char
+  deriving (Show, Eq)
+
+-- | The namespace qualification for an identifier.
+data Namespace = Value | Type | None deriving (Eq, Ord, Enum, Show)
+
+parseValid :: Parser Identifier
+parseValid = do
+  state@State{stateInput = input, statePos = pos} <- Parsec.getParserState
+
+  case takeIdentifier input of
+    Nothing -> Parsec.parserFail "parseValid: Failed to match a valid identifier"
+    Just (namespace, op, ident, cl, input') ->
+      let posOp = updatePosChar pos op
+          posIdent = Text.foldl updatePosChar posOp ident
+          posCl = updatePosChar posIdent cl
+          newState = state{stateInput = input', statePos = posCl}
+       in Parsec.setParserState newState $> Identifier namespace op (Text.unpack ident) cl
+
+{- | Try to parse a delimited identifier off the front of the given input.
+
+This tries to match as many valid Haskell identifiers/operators as possible,
+to the point of sometimes accepting invalid things (ex: keywords). Some
+considerations:
+
+  - operators and identifiers can have module qualifications
+  - operators can be wrapped in parens (for prefix)
+  - identifiers can be wrapped in backticks (for infix)
+  - delimiters are backticks or regular ticks
+  - since regular ticks are also valid in identifiers, we opt for the
+    longest successful parse
+
+This function should make /O(1)/ allocations
+-}
+takeIdentifier :: Text -> Maybe (Namespace, Char, Text, Char, Text)
+takeIdentifier input = listToMaybe $ do
+  -- Optional namespace
+  let (namespace, input') = case Text.uncons input of
+        Just ('v', i) -> (Value, i)
+        Just ('t', i) -> (Type, i)
+        _ -> (None, input)
+  -- Opening tick
+  (op, input'') <- maybeToList (Text.uncons input')
+  guard (op == '\'' || op == '`')
+
+  -- Identifier/operator
+  (ident, input''') <- wrapped input''
+
+  -- Closing tick
+  (cl, input'''') <- maybeToList (Text.uncons input''')
+  guard (cl == '\'' || cl == '`')
+
+  pure (namespace, op, ident, cl, input'''')
+ where
+  -- \| Parse out a wrapped, possibly qualified, operator or identifier
+  wrapped t = do
+    (c, t') <- maybeToList (Text.uncons t)
+    -- Tuples
+    case c of
+      '('
+        | Just (c', _) <- Text.uncons t'
+        , c' == ',' || c' == ')' ->
+            do
+              let (commas, t'') = Text.span (== ',') t'
+              (')', t''') <- maybeToList (Text.uncons t'')
+              pure (Text.take (Text.length commas + 2) t, t''')
+
+      -- Parenthesized
+      '(' -> do
+        (n, t'') <- general False 0 [] t'
+        (')', t''') <- maybeToList (Text.uncons t'')
+        pure (Text.take (n + 2) t, t''')
+
+      -- Backticked
+      '`' -> do
+        (n, t'') <- general False 0 [] t'
+        ('`', t''') <- maybeToList (Text.uncons t'')
+        pure (Text.take (n + 2) t, t''')
+
+      -- Unadorned
+      _ -> do
+        (n, t'') <- general False 0 [] t
+        pure (Text.take n t, t'')
+
+  -- \| Parse out a possibly qualified operator or identifier
+  general ::
+    Bool ->
+    -- \^ refuse inputs starting with operators
+    Int ->
+    -- \^ total characters \"consumed\" so far
+    [(Int, Text)] ->
+    -- \^ accumulated results
+    Text ->
+    -- \^ current input
+    [(Int, Text)]
+  -- \^ total characters parsed & what remains
+  general !identOnly !i acc t
+    -- Starts with an identifier (either just an identifier, or a module qual)
+    | Just (n, rest) <- identLike t =
+        if Text.null rest
+          then acc
+          else case Text.head rest of
+            '`' -> (n + i, rest) : acc
+            ')' -> (n + i, rest) : acc
+            '.' -> general False (n + i + 1) acc (Text.tail rest)
+            '\'' ->
+              let (m, rest') = quotes rest
+               in general True (n + m + 1 + i) ((n + m + i, rest') : acc) (Text.tail rest')
+            _ -> acc
+    -- An operator
+    | Just (n, rest) <- optr t
+    , not identOnly =
+        (n + i, rest) : acc
+    -- Anything else
+    | otherwise =
+        acc
+
+  -- \| Parse an identifier off the front of the input
+  identLike t
+    | Text.null t = Nothing
+    | isAlpha (Text.head t) || '_' == Text.head t =
+        let !(idt, rest) = Text.span (\c -> isAlphaNum c || c == '_') t
+            !(octos, rest') = Text.span (== '#') rest
+         in Just (Text.length idt + Text.length octos, rest')
+    | otherwise = Nothing
+
+  -- \| Parse all but the last quote off the front of the input
+  -- PRECONDITION: T.head t `elem` ['\'', '`']
+  quotes :: Text -> (Int, Text)
+  quotes t =
+    let !n = Text.length (Text.takeWhile (`elem` ['\'', '`']) t) - 1
+     in (n, Text.drop n t)
+
+  -- \| Parse an operator off the front of the input
+  optr t =
+    let !(op, rest) = Text.span isSymbolChar t
+     in if Text.null op then Nothing else Just (Text.length op, rest)