haskell: split generic parser into generic and bytestring-specific parts

This commit is contained in:
Alain Zscheile 2023-10-26 01:18:03 +02:00
parent 926176eafd
commit ac71ebd9b9
7 changed files with 272 additions and 220 deletions

View file

@ -1,7 +1,7 @@
module Main where
import System.Environment
import qualified Language.Yanais.Parser as P
import qualified Language.Yanais.Parser.Lex as PL
import qualified Language.Yanais.Nfker7h.Parser.Lex as L
main :: IO ()
@ -10,7 +10,7 @@ main = do
mainA args
mainA :: [String] -> IO ()
mainA [x] = P.parseFile x L.lexe >>= putStrLn . (\y -> case y of
mainA [x] = PL.parseFile x L.lexe >>= putStrLn . (\y -> case y of
Right z -> fmtList z
Left e -> show e)
mainA _ = putStrLn "Hello World!"

View file

@ -1,6 +1,7 @@
module Language.Yanais.Nfker7h.Parser.Lex (
Token,
TokenKind(..),
TokParser,
Keyword(..),
lexe,
) where
@ -12,6 +13,7 @@ import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.State.Strict
import qualified Data.HashMap.Strict as H
import Language.Yanais.Parser
import Language.Yanais.Parser.Lex
import Language.Yanais.Nfker7h.Parser.Types
import Numeric.Natural
@ -61,6 +63,8 @@ handleIdentsI pfx = do
tmp Nothing (Just x) _ = Just $ TIdent x
tmp Nothing Nothing _ = Nothing
type TokParser e a = Parser [Token] e a
lexe :: Parser' Error [Token]
lexe = do
lexeComments 0

View file

@ -9,6 +9,7 @@ import Data.Hashable (Hashable)
import Data.Text.Encoding.Error (UnicodeException)
import Generic.Data (Generic)
import qualified Language.Yanais.Parser as P
import qualified Language.Yanais.Parser.Lex as PL
data Context = CtxModule deriving (Eq, Generic)
instance Hashable Context
@ -17,11 +18,11 @@ data Error = UnexpectedEof Context
| UnexpectedChar !Int !Char
| AlternativeErr !Int
| DteUnicodeExc UnicodeException
| UnknownIdent !P.Ident
| UnknownIdent !PL.Ident
instance P.ParserError Error where
type PErrEnv Error = P.ParserEnv
perrUnknown st = AlternativeErr (P.peOffset st)
type PErrEnv Error = PL.ParserEnv
perrUnknown st = AlternativeErr (PL.peOffset st)
instance Show Context where
show CtxModule = "module level"

View file

@ -0,0 +1,29 @@
module Language.Yanais.Nfker7h.Pattern (
Pattern(..),
Pattern',
patAllocSlots,
parsePattern,
) where
import Data.Void (Void, absurd)
import qualified Data.ByteString as B
import qualified Data.String.UTF8 as US
import Numeric.Natural
import qualified Language.Yanais.Parser as P
import qualified Language.Yanais.Parser.Lex as PL
import qualified Language.Yanais.Nfker7h.Parser.Lex as PNL
import qualified Language.Yanais.Nfker7h.Parser.Types as PT
data Pattern fall = PatName PL.Ident | PatFall fall
-- TODO: records
-- | infallible pattern
type Pattern' = Pattern Void
patAllocSlots :: Pattern' -> Int
patAllocSlots (PatName (PL.Ident _ nam)) = if B.null $ US.toRep nam then 0 else 1
patAllocSlots (PatFall fl) = absurd fl
parsePattern :: PNL.TokParser PT.Error (Maybe fall) -> PNL.TokParser PT.Error (Pattern fall)
parsePattern = error "not implemented"

View file

@ -4,70 +4,22 @@
#-}
module Language.Yanais.Parser (
Text,
Ident(..),
Parser(..),
Parser',
ParserEnv(..),
ParserError(..),
Span(..),
HandleTree(..),
emptyIdent,
-- creating and running parsers
makeParseEnv,
runParser,
makeParser,
parseFile,
-- parser combinators
flatMaybe,
flatEither,
takeUntil,
takeWhile,
takeIdent,
takeNatural,
takeWithProperty,
skipWhiteSpace,
tryOne,
eats,
recordSpan,
) where
import Prelude hiding (takeWhile, span, splitAt)
import Control.Applicative (Alternative(..), (<|>))
import Control.DeepSeq (NFData)
import Control.Monad (guard)
import Control.Monad.Error.Class (MonadError(..))
import Control.Monad.State.Strict
import Data.Char (isDigit, isHexDigit, digitToInt)
import Data.Hashable (Hashable)
import Data.Kind (Type)
import qualified Data.ByteString as Bb
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.UTF8 as B
import qualified Data.HashMap.Strict as H
import qualified Data.String.UTF8 as U
import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.ICU.Char as IC
import qualified Data.Text.ICU.Normalize2 as IN
-- import Language.Yanais.Fusion
import Generic.Data
import Numeric.Natural (Natural)
-- A 0-indexed, half-open interval of integers, defined by start & end indices
data Span = Span
{ start :: {-# UNPACK #-} !Int
, end :: {-# UNPACK #-} !Int
}
deriving (Eq, Generic, Ord, Show)
instance Hashable Span
instance NFData Span
instance Semigroup Span where
Span start1 end1 <> Span start2 end2 = Span (min start1 start2) (max end1 end2)
-- misc failure conditions
class Show e => ParserError e where
@ -122,34 +74,9 @@ instance MonadError e (Parser s e) where
Left err -> let (Parser (StateT ehp)) = eh err in ehp st
Right x -> Right x
type Text = U.UTF8 B.ByteString
data Ident = Ident !Span !Text
emptyIdent :: Int -> Ident
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
instance Show Ident where
show (Ident _ t) = B.toString $ U.toRep t
data ParserEnv = ParserEnv {
peOffset :: !Int,
peText :: Text
}
type Parser' e a = Parser ParserEnv e a
makeParseEnv :: B.ByteString -> ParserEnv
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
runParser :: s -> Parser s e a -> Either e (a, s)
runParser st (Parser p) = runStateT p st
parseFile :: String -> Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
-- some simple parser combinators
-- | shave an maybe from the parser result and throw an unknown error if it got Nothing
@ -161,145 +88,3 @@ flatMaybe mb = Parser . StateT $ case mb of
-- | shave an either from the parser result and possibly throw errors into the parser itself
flatEither :: Either e a -> Parser s e a
flatEither eith = Parser . StateT $ \st -> fmap (\y -> (y, st)) eith
shiftEnv :: (Int, Text) -> Parser' e ()
shiftEnv (ll, r) = Parser $ modify go
where
go :: ParserEnv -> ParserEnv
go st = st { peOffset = (peOffset st) + ll, peText = r }
slen :: Text -> Int
slen = C.length . U.toRep
takeUntil :: (Char -> Bool) -> Parser' e Text
takeUntil f = takeWhile (not . f)
takeWhile :: (Char -> Bool) -> Parser' e Text
takeWhile f = do
env <- get
let (l, r) = U.span f (peText env)
shiftEnv (slen l, r)
return l
takeWithProperty :: IC.Bool_ -> Parser' e Text
takeWithProperty p = takeWhile $ IC.property p
takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
takeIdent = do
env <- get
let start_ = peOffset env
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> okNone
Just (fi, r2) -> (
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
`seq`
(if IC.property IC.XidStart fi then (
do
shiftEnv (slen l, r)
rest <- takeWithProperty IC.XidContinue
end_ <- gets peOffset
let thlw = theHorrorsLieWithin fi rest
let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
return (fmap (Just . mkident) thlw)
) else okNone))
where
okNone :: Parser' e (Either e2 (Maybe dt))
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
-- I hate this...
theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
theHorrorsLieWithin fi rest =
-- make a byte string from the parts of the identifyer
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
-- finish a UTF-8 decoded identifier
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
-- handle decoding
fmap fini $ TE.decodeUtf8' iabs
skipWhiteSpace :: Parser' e ()
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
-- | try to parse the first character as a token
tryOne :: (Char -> Maybe tok) -> Parser' e (Maybe tok)
tryOne f = do
env <- get
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> return Nothing
Just (fi, r2) ->
(if not ((slen r2) == 0) then error "tryOne : unable to iterator over characters" else ())
`seq`
case f fi of
Nothing -> return Nothing
Just x -> do
shiftEnv (slen l, r)
return . Just $ x
-- | A case tree for `eats`
data HandleTree x =
HandleTree (H.HashMap Char (HandleTree x)) (Maybe x)
| Htleaf !x
-- | try to recognize specific tokens
eats :: HandleTree x -> Parser' e (Maybe x)
eats ht = Parser . StateT $ \st ->
-- do backtracking in case of failure
let Parser (StateT eatsfun) = eats_ ht in
-- fmap @ Either
fmap (\val -> case val of
(Nothing, _) -> (Nothing, st)
(Just x, st2) -> (Just x, st2)
) $ eatsfun st
where
eatOne hm xc = tryOne $ \c -> (H.lookup c hm) <|> (fmap Htleaf xc)
eats_ :: HandleTree x -> Parser' e (Maybe x)
eats_ (Htleaf x) = pure (Just x)
eats_ (HandleTree hm xc) = (eatOne hm xc) >>= \x -> case x of
Nothing -> pure Nothing
Just y -> eats_ y
-- | record the span of the inner parser
recordSpan :: Parser' e a -> Parser' e (Span, a)
recordSpan inner = do
start_ <- gets peOffset
x <- inner
end_ <- gets peOffset
pure (Span { start = start_, end = end_ }, x)
takeNatural :: (ParserError e, PErrEnv e ~ ParserEnv) => Parser' e (Maybe (Span, Natural))
takeNatural = (fmap (\(sp, i) -> fmap (\j -> (sp, j)) i)) . recordSpan $ do
fi' <- tryOne $ chkThenNat isDigit
case fi' of
Just 0 ->
fmap Just $ handleHex <|> do
se' <- tryOne $ chkThenNat isDigit
-- invalid number format
guard (se' == Nothing)
return 0
Just d -> fmap Just $ handleDigits 10 isDigit d
Nothing -> return Nothing
where
charToNat :: Char -> Natural
charToNat = fromInteger . toInteger . digitToInt
chkThenNat :: (Char -> Bool) -> Char -> Maybe Natural
chkThenNat chk c = if chk c then Just $ charToNat c else Nothing
-- this parser can't fail
handleDigits :: Natural -> (Char -> Bool) -> Natural -> Parser' e Natural
handleDigits mult chk acc' = go acc'
where
go acc = do
hd' <- tryOne $ chkThenNat chk
case hd' of
Just hd -> go (mult * acc + hd)
Nothing -> pure acc
handleHex = (tryOne $ \c -> if c == 'x' then Just () else Nothing) >>= (\x ->
(guard (x == Just ())) >> (handleDigits 16 isHexDigit 0)
)

View file

@ -0,0 +1,231 @@
{-# LANGUAGE
DeriveGeneric, FlexibleInstances, MultiParamTypeClasses,
OverloadedStrings, TypeFamilies
#-}
module Language.Yanais.Parser.Lex (
Text,
Ident(..),
Parser',
ParserEnv(..),
Span(..),
HandleTree(..),
emptyIdent,
-- creating and running parsers
makeParseEnv,
parseFile,
-- parser combinators
takeUntil,
takeWhile,
takeIdent,
takeNatural,
takeWithProperty,
skipWhiteSpace,
tryOne,
eats,
recordSpan,
) where
import Prelude hiding (takeWhile, span, splitAt)
import Control.Applicative (Alternative(..), (<|>))
import Control.DeepSeq (NFData)
import Control.Monad (guard)
import Control.Monad.State.Strict
import Data.Char (isDigit, isHexDigit, digitToInt)
import Data.Hashable (Hashable)
import qualified Data.ByteString as Bb
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.UTF8 as B
import qualified Data.HashMap.Strict as H
import qualified Data.String.UTF8 as U
import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.ICU.Char as IC
import qualified Data.Text.ICU.Normalize2 as IN
import Language.Yanais.Parser
import Generic.Data
import Numeric.Natural (Natural)
-- A 0-indexed, half-open interval of integers, defined by start & end indices
data Span = Span
{ start :: {-# UNPACK #-} !Int
, end :: {-# UNPACK #-} !Int
}
deriving (Eq, Generic, Ord, Show)
instance Hashable Span
instance NFData Span
instance Semigroup Span where
Span start1 end1 <> Span start2 end2 = Span (min start1 start2) (max end1 end2)
type Text = U.UTF8 B.ByteString
data Ident = Ident !Span !Text
emptyIdent :: Int -> Ident
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
instance Show Ident where
show (Ident _ t) = B.toString $ U.toRep t
data ParserEnv = ParserEnv {
peOffset :: !Int,
peText :: Text
}
type Parser' e a = Parser ParserEnv e a
makeParseEnv :: B.ByteString -> ParserEnv
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
parseFile :: String -> Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
-- some simple parser combinators
shiftEnv :: (Int, Text) -> Parser' e ()
shiftEnv (ll, r) = Parser $ modify go
where
go :: ParserEnv -> ParserEnv
go st = st { peOffset = (peOffset st) + ll, peText = r }
slen :: Text -> Int
slen = C.length . U.toRep
takeUntil :: (Char -> Bool) -> Parser' e Text
takeUntil f = takeWhile (not . f)
takeWhile :: (Char -> Bool) -> Parser' e Text
takeWhile f = do
env <- get
let (l, r) = U.span f (peText env)
shiftEnv (slen l, r)
return l
takeWithProperty :: IC.Bool_ -> Parser' e Text
takeWithProperty p = takeWhile $ IC.property p
takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
takeIdent = do
env <- get
let start_ = peOffset env
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> okNone
Just (fi, r2) -> (
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
`seq`
(if IC.property IC.XidStart fi then (
do
shiftEnv (slen l, r)
rest <- takeWithProperty IC.XidContinue
end_ <- gets peOffset
let thlw = theHorrorsLieWithin fi rest
let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
return (fmap (Just . mkident) thlw)
) else okNone))
where
okNone :: Parser' e (Either e2 (Maybe dt))
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
-- I hate this...
theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
theHorrorsLieWithin fi rest =
-- make a byte string from the parts of the identifyer
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
-- finish a UTF-8 decoded identifier
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
-- handle decoding
fmap fini $ TE.decodeUtf8' iabs
skipWhiteSpace :: Parser' e ()
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
-- | try to parse the first character as a token
tryOne :: (Char -> Maybe tok) -> Parser' e (Maybe tok)
tryOne f = do
env <- get
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> return Nothing
Just (fi, r2) ->
(if not ((slen r2) == 0) then error "tryOne : unable to iterator over characters" else ())
`seq`
case f fi of
Nothing -> return Nothing
Just x -> do
shiftEnv (slen l, r)
return . Just $ x
-- | A case tree for `eats`
data HandleTree x =
HandleTree (H.HashMap Char (HandleTree x)) (Maybe x)
| Htleaf !x
-- | try to recognize specific tokens
eats :: HandleTree x -> Parser' e (Maybe x)
eats ht = Parser . StateT $ \st ->
-- do backtracking in case of failure
let Parser (StateT eatsfun) = eats_ ht in
-- fmap @ Either
fmap (\val -> case val of
(Nothing, _) -> (Nothing, st)
(Just x, st2) -> (Just x, st2)
) $ eatsfun st
where
eatOne hm xc = tryOne $ \c -> (H.lookup c hm) <|> (fmap Htleaf xc)
eats_ :: HandleTree x -> Parser' e (Maybe x)
eats_ (Htleaf x) = pure (Just x)
eats_ (HandleTree hm xc) = (eatOne hm xc) >>= \x -> case x of
Nothing -> pure Nothing
Just y -> eats_ y
-- | record the span of the inner parser
recordSpan :: Parser' e a -> Parser' e (Span, a)
recordSpan inner = do
start_ <- gets peOffset
x <- inner
end_ <- gets peOffset
pure (Span { start = start_, end = end_ }, x)
takeNatural :: (ParserError e, PErrEnv e ~ ParserEnv) => Parser' e (Maybe (Span, Natural))
takeNatural = (fmap (\(sp, i) -> fmap (\j -> (sp, j)) i)) . recordSpan $ do
fi' <- tryOne $ chkThenNat isDigit
case fi' of
Just 0 ->
fmap Just $ handleHex <|> do
se' <- tryOne $ chkThenNat isDigit
-- invalid number format
guard (se' == Nothing)
return 0
Just d -> fmap Just $ handleDigits 10 isDigit d
Nothing -> return Nothing
where
charToNat :: Char -> Natural
charToNat = fromInteger . toInteger . digitToInt
chkThenNat :: (Char -> Bool) -> Char -> Maybe Natural
chkThenNat chk c = if chk c then Just $ charToNat c else Nothing
-- this parser can't fail
handleDigits :: Natural -> (Char -> Bool) -> Natural -> Parser' e Natural
handleDigits mult chk acc' = go acc'
where
go acc = do
hd' <- tryOne $ chkThenNat chk
case hd' of
Just hd -> go (mult * acc + hd)
Nothing -> pure acc
handleHex = (tryOne $ \c -> if c == 'x' then Just () else Nothing) >>= (\x ->
(guard (x == Just ())) >> (handleDigits 16 isHexDigit 0)
)

View file

@ -62,6 +62,8 @@ library
exposed-modules:
Language.Yanais.Fusion
, Language.Yanais.Parser
, Language.Yanais.Parser.Lex
, Language.Yanais.Nfker7h.Pattern
, Language.Yanais.Nfker7h.Parser.Types
, Language.Yanais.Nfker7h.Parser.Lex