haskell: handle unicode decoding errors properly

This commit is contained in:
Alain Emilia Anna Zscheile 2023-10-25 20:28:55 +02:00
parent 9785fdd5f4
commit 41db2b8139
6 changed files with 47 additions and 32 deletions

View file

@ -2,7 +2,6 @@ module Main where
import System.Environment import System.Environment
import qualified Language.Yanais.Parser as P import qualified Language.Yanais.Parser as P
import qualified Language.Yanais.Parser.IO as Pio
import qualified Language.Yanais.Nfker7h.Parser.Lex as L import qualified Language.Yanais.Nfker7h.Parser.Lex as L
main :: IO () main :: IO ()
@ -11,8 +10,8 @@ main = do
mainA args mainA args
mainA :: [String] -> IO () mainA :: [String] -> IO ()
mainA [x] = Pio.parseFile x L.lexe >>= putStrLn . (\x -> case x of mainA [x] = P.parseFile x L.lexe >>= putStrLn . (\y -> case y of
Right y -> fmtList y Right z -> fmtList z
Left e -> show e) Left e -> show e)
mainA _ = putStrLn "Hello World!" mainA _ = putStrLn "Hello World!"

View file

@ -1,5 +1,3 @@
{-# LANGUAGE OverloadedStrings #-}
module Language.Yanais.Nfker7h.Parser.Lex ( module Language.Yanais.Nfker7h.Parser.Lex (
Token, Token,
TokenKind(..), TokenKind(..),
@ -47,15 +45,14 @@ lexeIdentPrefixTree = HandleTree (H.fromList l1) Nothing
where where
l1 = [('.', Htleaf IpDot), ('$', Htleaf IpPatOut)] l1 = [('.', Htleaf IpDot), ('$', Htleaf IpPatOut)]
eitherToParser :: Either Error x -> Parser' Error x
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
handleIdentsI :: Parser' Error (Maybe TokenKind) handleIdentsI :: Parser' Error (Maybe TokenKind)
handleIdentsI = do handleIdentsI = do
pfx <- eats lexeIdentPrefixTree pfx <- eats lexeIdentPrefixTree
marker <- gets peOffset marker <- gets peOffset
ident <- takeIdent ident' <- takeIdent >>= (\ident -> eitherToParser $ case ident of
case tmp pfx ident marker of Left exc -> Left $ DteUnicodeExc exc
Right x -> Right x)
case tmp pfx ident' marker of
Just x -> pure . Just $ x Just x -> pure . Just $ x
Nothing -> do Nothing -> do
gtc <- tryOne Just gtc <- tryOne Just
@ -67,7 +64,7 @@ handleIdentsI = do
where where
tmp (Just IpDot) (Just x) _ = Just $ TDotIdent x tmp (Just IpDot) (Just x) _ = Just $ TDotIdent x
tmp (Just IpDot) Nothing _ = Just $ TDot tmp (Just IpDot) Nothing _ = Just $ TDot
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (Ident (Span {start = m, end = m}) "") (\z -> z) x) tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (emptyIdent m) (\z -> z) x)
tmp Nothing (Just x) _ = Just $ TIdent x tmp Nothing (Just x) _ = Just $ TIdent x
tmp Nothing Nothing _ = Nothing tmp Nothing Nothing _ = Nothing

View file

@ -7,6 +7,7 @@ module Language.Yanais.Nfker7h.Parser.Types (
MaybeGetSpan(..) MaybeGetSpan(..)
) where ) where
import Data.Text.Encoding.Error (UnicodeException)
import qualified Language.Yanais.Parser as P import qualified Language.Yanais.Parser as P
-- a "classic" tristate result -- a "classic" tristate result
@ -16,6 +17,7 @@ data Context = CtxModule
data Error = UnexpectedEof Context data Error = UnexpectedEof Context
| UnexpectedChar !Int !Char | UnexpectedChar !Int !Char
| DteUnicodeExc UnicodeException
| UnknownIdent !P.Ident | UnknownIdent !P.Ident
instance Functor Result where instance Functor Result where
@ -52,3 +54,4 @@ instance Show Error where
show (UnexpectedEof ctx) = "end of file encountered inside " <> show ctx show (UnexpectedEof ctx) = "end of file encountered inside " <> show ctx
show (UnknownIdent x) = "unknown identifier: " <> show x show (UnknownIdent x) = "unknown identifier: " <> show x
show (UnexpectedChar pos c) = "unexpected / unknown character at " <> show pos <> ": " <> show c <> " (" <> (show $ fromEnum c) <> ")" show (UnexpectedChar pos c) = "unexpected / unknown character at " <> show pos <> ": " <> show c <> " (" <> (show $ fromEnum c) <> ")"
show (DteUnicodeExc exc) = "unicode decoding error: " <> show exc

View file

@ -11,8 +11,15 @@ module Language.Yanais.Parser (
ParserEnv(..), ParserEnv(..),
Span(..), Span(..),
HandleTree(..), HandleTree(..),
emptyIdent,
eitherToParser,
-- creating and running parsers
makeParseEnv, makeParseEnv,
runParser, runParser,
parseFile,
-- parser combinators
takeUntil, takeUntil,
takeWhile, takeWhile,
takeIdent, takeIdent,
@ -33,8 +40,8 @@ import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.UTF8 as B import qualified Data.ByteString.UTF8 as B
import qualified Data.HashMap.Strict as H import qualified Data.HashMap.Strict as H
import qualified Data.String.UTF8 as U import qualified Data.String.UTF8 as U
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.ICU.Char as IC import qualified Data.Text.ICU.Char as IC
import qualified Data.Text.ICU.Normalize2 as IN import qualified Data.Text.ICU.Normalize2 as IN
-- import Language.Yanais.Fusion -- import Language.Yanais.Fusion
@ -79,6 +86,9 @@ type Text = U.UTF8 B.ByteString
data Ident = Ident !Span !Text data Ident = Ident !Span !Text
emptyIdent :: Int -> Ident
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
instance Show Ident where instance Show Ident where
show (Ident _ t) = B.toString $ U.toRep t show (Ident _ t) = B.toString $ U.toRep t
@ -89,12 +99,21 @@ data ParserEnv = ParserEnv {
type Parser' e a = Parser ParserEnv e a type Parser' e a = Parser ParserEnv e a
-- | injects an `Either` into the parser (used to conditionally throw errors)
eitherToParser :: Either e a -> Parser s e a
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
makeParseEnv :: B.ByteString -> ParserEnv makeParseEnv :: B.ByteString -> ParserEnv
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs } makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
runParser :: s -> Parser s e a -> Either e (a, s) runParser :: s -> Parser s e a -> Either e (a, s)
runParser st (Parser p) = runStateT p st runParser st (Parser p) = runStateT p st
parseFile :: String -> Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
-- some simple parser combinators -- some simple parser combinators
shiftEnv :: (Int, Text) -> Parser' e () shiftEnv :: (Int, Text) -> Parser' e ()
@ -119,13 +138,13 @@ takeWhile f = do
takeWithProperty :: IC.Bool_ -> Parser' e Text takeWithProperty :: IC.Bool_ -> Parser' e Text
takeWithProperty p = takeWhile $ IC.property p takeWithProperty p = takeWhile $ IC.property p
takeIdent :: Parser' e (Maybe Ident) takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
takeIdent = do takeIdent = do
env <- get env <- get
let start_ = peOffset env let start_ = peOffset env
let (l, r) = U.splitAt 1 (peText env) let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of case U.uncons l of
Nothing -> return Nothing Nothing -> okNone
Just (fi, r2) -> ( Just (fi, r2) -> (
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ()) (if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
`seq` `seq`
@ -134,12 +153,24 @@ takeIdent = do
shiftEnv (slen l, r) shiftEnv (slen l, r)
rest <- takeWithProperty IC.XidContinue rest <- takeWithProperty IC.XidContinue
end_ <- gets peOffset end_ <- gets peOffset
return (Just . (\s -> Ident (Span { start = start_, end = end_ }) s) $ theHorrorsLieWithin fi rest) let thlw = theHorrorsLieWithin fi rest
) else return Nothing)) let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
return (fmap (Just . mkident) thlw)
) else okNone))
where where
okNone :: Parser' e (Either e2 (Maybe dt))
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
-- I hate this... -- I hate this...
theHorrorsLieWithin fi rest = U.fromRep . TE.encodeUtf8 . IN.nfc . TE.decodeUtf8Lenient $ (Bb.append (B.fromChar fi) (U.toRep rest)) theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
theHorrorsLieWithin fi rest =
-- make a byte string from the parts of the identifyer
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
-- finish a UTF-8 decoded identifier
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
-- handle decoding
fmap fini $ TE.decodeUtf8' iabs
skipWhiteSpace :: Parser' e () skipWhiteSpace :: Parser' e ()
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure () skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()

View file

@ -1,13 +0,0 @@
{-# LANGUAGE ScopedTypeVariables #-}
module Language.Yanais.Parser.IO (
parseFile
) where
import qualified Data.ByteString.Char8 as C
import qualified Language.Yanais.Parser as P
parseFile :: String -> P.Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
let pe = P.makeParseEnv contents
return (fmap (\(r, _) -> r) $ P.runParser pe p)

View file

@ -62,7 +62,6 @@ library
exposed-modules: exposed-modules:
Language.Yanais.Fusion Language.Yanais.Fusion
, Language.Yanais.Parser , Language.Yanais.Parser
, Language.Yanais.Parser.IO
, Language.Yanais.Nfker7h.Parser.Types , Language.Yanais.Nfker7h.Parser.Types
, Language.Yanais.Nfker7h.Parser.Lex , Language.Yanais.Nfker7h.Parser.Lex
@ -77,7 +76,6 @@ library
MultiParamTypeClasses MultiParamTypeClasses
OverloadedStrings OverloadedStrings
RankNTypes RankNTypes
ScopedTypeVariables
-- Other library packages from which modules are imported. -- Other library packages from which modules are imported.
build-depends: build-depends: