haskell: handle unicode decoding errors properly

This commit is contained in:
Alain Zscheile 2023-10-25 20:28:55 +02:00
parent 9785fdd5f4
commit 41db2b8139
6 changed files with 47 additions and 32 deletions

View file

@ -2,7 +2,6 @@ module Main where
import System.Environment
import qualified Language.Yanais.Parser as P
import qualified Language.Yanais.Parser.IO as Pio
import qualified Language.Yanais.Nfker7h.Parser.Lex as L
main :: IO ()
@ -11,8 +10,8 @@ main = do
mainA args
mainA :: [String] -> IO ()
mainA [x] = Pio.parseFile x L.lexe >>= putStrLn . (\x -> case x of
Right y -> fmtList y
mainA [x] = P.parseFile x L.lexe >>= putStrLn . (\y -> case y of
Right z -> fmtList z
Left e -> show e)
mainA _ = putStrLn "Hello World!"

View file

@ -1,5 +1,3 @@
{-# LANGUAGE OverloadedStrings #-}
module Language.Yanais.Nfker7h.Parser.Lex (
Token,
TokenKind(..),
@ -47,15 +45,14 @@ lexeIdentPrefixTree = HandleTree (H.fromList l1) Nothing
where
l1 = [('.', Htleaf IpDot), ('$', Htleaf IpPatOut)]
eitherToParser :: Either Error x -> Parser' Error x
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
handleIdentsI :: Parser' Error (Maybe TokenKind)
handleIdentsI = do
pfx <- eats lexeIdentPrefixTree
marker <- gets peOffset
ident <- takeIdent
case tmp pfx ident marker of
ident' <- takeIdent >>= (\ident -> eitherToParser $ case ident of
Left exc -> Left $ DteUnicodeExc exc
Right x -> Right x)
case tmp pfx ident' marker of
Just x -> pure . Just $ x
Nothing -> do
gtc <- tryOne Just
@ -67,7 +64,7 @@ handleIdentsI = do
where
tmp (Just IpDot) (Just x) _ = Just $ TDotIdent x
tmp (Just IpDot) Nothing _ = Just $ TDot
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (Ident (Span {start = m, end = m}) "") (\z -> z) x)
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (emptyIdent m) (\z -> z) x)
tmp Nothing (Just x) _ = Just $ TIdent x
tmp Nothing Nothing _ = Nothing

View file

@ -7,6 +7,7 @@ module Language.Yanais.Nfker7h.Parser.Types (
MaybeGetSpan(..)
) where
import Data.Text.Encoding.Error (UnicodeException)
import qualified Language.Yanais.Parser as P
-- a "classic" tristate result
@ -16,6 +17,7 @@ data Context = CtxModule
data Error = UnexpectedEof Context
| UnexpectedChar !Int !Char
| DteUnicodeExc UnicodeException
| UnknownIdent !P.Ident
instance Functor Result where
@ -52,3 +54,4 @@ instance Show Error where
show (UnexpectedEof ctx) = "end of file encountered inside " <> show ctx
show (UnknownIdent x) = "unknown identifier: " <> show x
show (UnexpectedChar pos c) = "unexpected / unknown character at " <> show pos <> ": " <> show c <> " (" <> (show $ fromEnum c) <> ")"
show (DteUnicodeExc exc) = "unicode decoding error: " <> show exc

View file

@ -11,8 +11,15 @@ module Language.Yanais.Parser (
ParserEnv(..),
Span(..),
HandleTree(..),
emptyIdent,
eitherToParser,
-- creating and running parsers
makeParseEnv,
runParser,
parseFile,
-- parser combinators
takeUntil,
takeWhile,
takeIdent,
@ -33,8 +40,8 @@ import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.UTF8 as B
import qualified Data.HashMap.Strict as H
import qualified Data.String.UTF8 as U
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.ICU.Char as IC
import qualified Data.Text.ICU.Normalize2 as IN
-- import Language.Yanais.Fusion
@ -79,6 +86,9 @@ type Text = U.UTF8 B.ByteString
data Ident = Ident !Span !Text
emptyIdent :: Int -> Ident
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
instance Show Ident where
show (Ident _ t) = B.toString $ U.toRep t
@ -89,12 +99,21 @@ data ParserEnv = ParserEnv {
type Parser' e a = Parser ParserEnv e a
-- | injects an `Either` into the parser (used to conditionally throw errors)
eitherToParser :: Either e a -> Parser s e a
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
makeParseEnv :: B.ByteString -> ParserEnv
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
runParser :: s -> Parser s e a -> Either e (a, s)
runParser st (Parser p) = runStateT p st
parseFile :: String -> Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
-- some simple parser combinators
shiftEnv :: (Int, Text) -> Parser' e ()
@ -119,13 +138,13 @@ takeWhile f = do
takeWithProperty :: IC.Bool_ -> Parser' e Text
takeWithProperty p = takeWhile $ IC.property p
takeIdent :: Parser' e (Maybe Ident)
takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
takeIdent = do
env <- get
let start_ = peOffset env
let (l, r) = U.splitAt 1 (peText env)
case U.uncons l of
Nothing -> return Nothing
Nothing -> okNone
Just (fi, r2) -> (
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
`seq`
@ -134,12 +153,24 @@ takeIdent = do
shiftEnv (slen l, r)
rest <- takeWithProperty IC.XidContinue
end_ <- gets peOffset
return (Just . (\s -> Ident (Span { start = start_, end = end_ }) s) $ theHorrorsLieWithin fi rest)
) else return Nothing))
let thlw = theHorrorsLieWithin fi rest
let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
return (fmap (Just . mkident) thlw)
) else okNone))
where
okNone :: Parser' e (Either e2 (Maybe dt))
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
-- I hate this...
theHorrorsLieWithin fi rest = U.fromRep . TE.encodeUtf8 . IN.nfc . TE.decodeUtf8Lenient $ (Bb.append (B.fromChar fi) (U.toRep rest))
theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
theHorrorsLieWithin fi rest =
-- make a byte string from the parts of the identifyer
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
-- finish a UTF-8 decoded identifier
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
-- handle decoding
fmap fini $ TE.decodeUtf8' iabs
skipWhiteSpace :: Parser' e ()
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()

View file

@ -1,13 +0,0 @@
{-# LANGUAGE ScopedTypeVariables #-}
module Language.Yanais.Parser.IO (
parseFile
) where
import qualified Data.ByteString.Char8 as C
import qualified Language.Yanais.Parser as P
parseFile :: String -> P.Parser' e a -> IO (Either e a)
parseFile f p = do
contents <- C.readFile f
let pe = P.makeParseEnv contents
return (fmap (\(r, _) -> r) $ P.runParser pe p)

View file

@ -62,7 +62,6 @@ library
exposed-modules:
Language.Yanais.Fusion
, Language.Yanais.Parser
, Language.Yanais.Parser.IO
, Language.Yanais.Nfker7h.Parser.Types
, Language.Yanais.Nfker7h.Parser.Lex
@ -77,7 +76,6 @@ library
MultiParamTypeClasses
OverloadedStrings
RankNTypes
ScopedTypeVariables
-- Other library packages from which modules are imported.
build-depends: