184 lines
5 KiB
Haskell
184 lines
5 KiB
Haskell
{-# LANGUAGE DeriveGeneric #-}
|
|
{-# LANGUAGE FlexibleInstances #-}
|
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Language.Yanais.Parser (
|
|
Text,
|
|
Ident(..),
|
|
Parser(..),
|
|
Parser',
|
|
ParserEnv(..),
|
|
Span(..),
|
|
HandleTree(..),
|
|
makeParseEnv,
|
|
runParser,
|
|
takeUntil,
|
|
takeWhile,
|
|
takeIdent,
|
|
takeWithProperty,
|
|
skipWhiteSpace,
|
|
tryOne,
|
|
eats,
|
|
recordSpan,
|
|
) where
|
|
|
|
import Prelude hiding (takeWhile, span, splitAt)
|
|
import Control.Applicative ((<|>))
|
|
import Control.DeepSeq (NFData)
|
|
import Control.Monad.State.Strict
|
|
import Data.Hashable (Hashable)
|
|
import qualified Data.ByteString.Char8 as C
|
|
import qualified Data.ByteString.UTF8 as B
|
|
import qualified Data.HashMap.Strict as H
|
|
import qualified Data.String.UTF8 as U
|
|
import qualified Data.Text as T
|
|
import qualified Data.Text.ICU.Char as IC
|
|
import qualified Data.Text.ICU.Normalize2 as IN
|
|
-- import Language.Yanais.Fusion
|
|
import Generic.Data
|
|
|
|
-- A 0-indexed, half-open interval of integers, defined by start & end indices
|
|
data Span = Span
|
|
{ start :: {-# UNPACK #-} !Int
|
|
, end :: {-# UNPACK #-} !Int
|
|
}
|
|
deriving (Eq, Generic, Ord, Show)
|
|
|
|
instance Hashable Span
|
|
instance NFData Span
|
|
|
|
instance Semigroup Span where
|
|
Span start1 end1 <> Span start2 end2 = Span (min start1 start2) (max end1 end2)
|
|
|
|
-- a generic parser monad
|
|
newtype Parser s e a = Parser { runP :: StateT s (Either e) a }
|
|
|
|
instance Functor (Parser s e) where
|
|
fmap f = Parser . (fmap f) . runP
|
|
{-# INLINE fmap #-}
|
|
|
|
instance Applicative (Parser s e) where
|
|
pure = Parser . pure
|
|
{-# INLINE pure #-}
|
|
Parser a <*> Parser b = Parser $ a <*> b
|
|
{-# INLINE (<*>) #-}
|
|
|
|
instance Monad (Parser s e) where
|
|
return = pure
|
|
Parser a >>= fb = Parser $ a >>= (runP . fb)
|
|
{-# INLINE (>>=) #-}
|
|
|
|
instance MonadState s (Parser s e) where
|
|
state = Parser . state
|
|
{-# INLINE state #-}
|
|
|
|
type Text = U.UTF8 B.ByteString
|
|
|
|
data Ident = Ident !Span !Text
|
|
|
|
instance Show Ident where
|
|
show (Ident _ t) = show t
|
|
|
|
data ParserEnv = ParserEnv {
|
|
peOffset :: !Int,
|
|
peText :: Text
|
|
}
|
|
|
|
type Parser' e a = Parser ParserEnv e a
|
|
|
|
makeParseEnv :: B.ByteString -> ParserEnv
|
|
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
|
|
|
|
runParser :: s -> Parser s e a -> Either e (a, s)
|
|
runParser st (Parser p) = runStateT p st
|
|
|
|
-- some simple parser combinators
|
|
|
|
shiftEnv :: (Int, Text) -> Parser' e ()
|
|
shiftEnv (ll, r) = Parser $ modify go
|
|
where
|
|
go :: ParserEnv -> ParserEnv
|
|
go st = st { peOffset = (peOffset st) + ll, peText = r }
|
|
|
|
slen :: Text -> Int
|
|
slen = C.length . U.toRep
|
|
|
|
takeUntil :: (Char -> Bool) -> Parser' e Text
|
|
takeUntil f = takeWhile (not . f)
|
|
|
|
takeWhile :: (Char -> Bool) -> Parser' e Text
|
|
takeWhile f = do
|
|
env <- get
|
|
let (l, r) = U.span f (peText env)
|
|
shiftEnv (slen l, r)
|
|
return l
|
|
|
|
takeWithProperty :: IC.Bool_ -> Parser' e Text
|
|
takeWithProperty p = takeWhile $ IC.property p
|
|
|
|
takeIdent :: Parser' e (Maybe Ident)
|
|
takeIdent = do
|
|
env <- get
|
|
let start_ = peOffset env
|
|
let (l, r) = U.splitAt 1 (peText env)
|
|
case U.uncons l of
|
|
Nothing -> return Nothing
|
|
Just (fi, r2) -> (
|
|
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
|
|
`seq`
|
|
(if IC.property IC.XidStart fi then (
|
|
do
|
|
shiftEnv (slen l, r)
|
|
rest <- takeWithProperty IC.XidContinue
|
|
end_ <- gets peOffset
|
|
-- I hate this...
|
|
return (Just . (\s -> Ident (Span { start = start_, end = end_ }) s) . U.fromRep . B.fromString . T.unpack . IN.nfc . T.pack $ fi:(B.toString $ U.toRep rest))
|
|
) else return Nothing))
|
|
|
|
skipWhiteSpace :: Parser' e ()
|
|
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
|
|
|
|
-- | try to parse the first character as a token
|
|
tryOne :: (Char -> Maybe tok) -> Parser' e (Maybe tok)
|
|
tryOne f = do
|
|
env <- get
|
|
let (l, r) = U.splitAt 1 (peText env)
|
|
case U.uncons l of
|
|
Nothing -> return Nothing
|
|
Just (fi, r2) -> (
|
|
(if not ((slen r2) == 0) then error "tryOne : unable to iterator over characters" else ())
|
|
`seq`
|
|
(shiftEnv (slen l, r) >> return (f fi)))
|
|
|
|
-- | A case tree for `eats`
|
|
data HandleTree x =
|
|
HandleTree (H.HashMap Char (HandleTree x)) (Maybe x)
|
|
| Htleaf !x
|
|
|
|
-- | try to recognize specific tokens
|
|
eats :: HandleTree x -> Parser' e (Maybe x)
|
|
eats ht = Parser . StateT $ \st ->
|
|
-- do backtracking in case of failure
|
|
let Parser (StateT eatsfun) = eats_ ht in
|
|
-- fmap @ Either
|
|
fmap (\val -> case val of
|
|
(Nothing, _) -> (Nothing, st)
|
|
(Just x, st2) -> (Just x, st2)
|
|
) $ eatsfun st
|
|
|
|
where
|
|
eatOne hm xc = tryOne $ \c -> (H.lookup c hm) <|> (fmap Htleaf xc)
|
|
eats_ :: HandleTree x -> Parser' e (Maybe x)
|
|
eats_ (Htleaf x) = pure (Just x)
|
|
eats_ (HandleTree hm xc) = (eatOne hm xc) >>= \x -> case x of
|
|
Nothing -> pure Nothing
|
|
Just y -> eats_ y
|
|
|
|
-- | record the span of the inner parser
|
|
recordSpan :: Parser' e a -> Parser' e (Span, a)
|
|
recordSpan inner = do
|
|
start_ <- gets peOffset
|
|
x <- inner
|
|
end_ <- gets peOffset
|
|
pure (Span { start = start_, end = end_ }, x)
|