haskell: handle unicode decoding errors properly
This commit is contained in:
parent
9785fdd5f4
commit
41db2b8139
6 changed files with 47 additions and 32 deletions
|
@ -2,7 +2,6 @@ module Main where
|
|||
|
||||
import System.Environment
|
||||
import qualified Language.Yanais.Parser as P
|
||||
import qualified Language.Yanais.Parser.IO as Pio
|
||||
import qualified Language.Yanais.Nfker7h.Parser.Lex as L
|
||||
|
||||
main :: IO ()
|
||||
|
@ -11,8 +10,8 @@ main = do
|
|||
mainA args
|
||||
|
||||
mainA :: [String] -> IO ()
|
||||
mainA [x] = Pio.parseFile x L.lexe >>= putStrLn . (\x -> case x of
|
||||
Right y -> fmtList y
|
||||
mainA [x] = P.parseFile x L.lexe >>= putStrLn . (\y -> case y of
|
||||
Right z -> fmtList z
|
||||
Left e -> show e)
|
||||
mainA _ = putStrLn "Hello World!"
|
||||
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Language.Yanais.Nfker7h.Parser.Lex (
|
||||
Token,
|
||||
TokenKind(..),
|
||||
|
@ -47,15 +45,14 @@ lexeIdentPrefixTree = HandleTree (H.fromList l1) Nothing
|
|||
where
|
||||
l1 = [('.', Htleaf IpDot), ('$', Htleaf IpPatOut)]
|
||||
|
||||
eitherToParser :: Either Error x -> Parser' Error x
|
||||
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
|
||||
|
||||
handleIdentsI :: Parser' Error (Maybe TokenKind)
|
||||
handleIdentsI = do
|
||||
pfx <- eats lexeIdentPrefixTree
|
||||
marker <- gets peOffset
|
||||
ident <- takeIdent
|
||||
case tmp pfx ident marker of
|
||||
ident' <- takeIdent >>= (\ident -> eitherToParser $ case ident of
|
||||
Left exc -> Left $ DteUnicodeExc exc
|
||||
Right x -> Right x)
|
||||
case tmp pfx ident' marker of
|
||||
Just x -> pure . Just $ x
|
||||
Nothing -> do
|
||||
gtc <- tryOne Just
|
||||
|
@ -67,7 +64,7 @@ handleIdentsI = do
|
|||
where
|
||||
tmp (Just IpDot) (Just x) _ = Just $ TDotIdent x
|
||||
tmp (Just IpDot) Nothing _ = Just $ TDot
|
||||
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (Ident (Span {start = m, end = m}) "") (\z -> z) x)
|
||||
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (emptyIdent m) (\z -> z) x)
|
||||
tmp Nothing (Just x) _ = Just $ TIdent x
|
||||
tmp Nothing Nothing _ = Nothing
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ module Language.Yanais.Nfker7h.Parser.Types (
|
|||
MaybeGetSpan(..)
|
||||
) where
|
||||
|
||||
import Data.Text.Encoding.Error (UnicodeException)
|
||||
import qualified Language.Yanais.Parser as P
|
||||
|
||||
-- a "classic" tristate result
|
||||
|
@ -16,6 +17,7 @@ data Context = CtxModule
|
|||
|
||||
data Error = UnexpectedEof Context
|
||||
| UnexpectedChar !Int !Char
|
||||
| DteUnicodeExc UnicodeException
|
||||
| UnknownIdent !P.Ident
|
||||
|
||||
instance Functor Result where
|
||||
|
@ -52,3 +54,4 @@ instance Show Error where
|
|||
show (UnexpectedEof ctx) = "end of file encountered inside " <> show ctx
|
||||
show (UnknownIdent x) = "unknown identifier: " <> show x
|
||||
show (UnexpectedChar pos c) = "unexpected / unknown character at " <> show pos <> ": " <> show c <> " (" <> (show $ fromEnum c) <> ")"
|
||||
show (DteUnicodeExc exc) = "unicode decoding error: " <> show exc
|
||||
|
|
|
@ -11,8 +11,15 @@ module Language.Yanais.Parser (
|
|||
ParserEnv(..),
|
||||
Span(..),
|
||||
HandleTree(..),
|
||||
emptyIdent,
|
||||
eitherToParser,
|
||||
|
||||
-- creating and running parsers
|
||||
makeParseEnv,
|
||||
runParser,
|
||||
parseFile,
|
||||
|
||||
-- parser combinators
|
||||
takeUntil,
|
||||
takeWhile,
|
||||
takeIdent,
|
||||
|
@ -33,8 +40,8 @@ import qualified Data.ByteString.Char8 as C
|
|||
import qualified Data.ByteString.UTF8 as B
|
||||
import qualified Data.HashMap.Strict as H
|
||||
import qualified Data.String.UTF8 as U
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Text.Encoding as TE
|
||||
import Data.Text.Encoding.Error (UnicodeException)
|
||||
import qualified Data.Text.ICU.Char as IC
|
||||
import qualified Data.Text.ICU.Normalize2 as IN
|
||||
-- import Language.Yanais.Fusion
|
||||
|
@ -79,6 +86,9 @@ type Text = U.UTF8 B.ByteString
|
|||
|
||||
data Ident = Ident !Span !Text
|
||||
|
||||
emptyIdent :: Int -> Ident
|
||||
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
|
||||
|
||||
instance Show Ident where
|
||||
show (Ident _ t) = B.toString $ U.toRep t
|
||||
|
||||
|
@ -89,12 +99,21 @@ data ParserEnv = ParserEnv {
|
|||
|
||||
type Parser' e a = Parser ParserEnv e a
|
||||
|
||||
-- | injects an `Either` into the parser (used to conditionally throw errors)
|
||||
eitherToParser :: Either e a -> Parser s e a
|
||||
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
|
||||
|
||||
makeParseEnv :: B.ByteString -> ParserEnv
|
||||
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
|
||||
|
||||
runParser :: s -> Parser s e a -> Either e (a, s)
|
||||
runParser st (Parser p) = runStateT p st
|
||||
|
||||
parseFile :: String -> Parser' e a -> IO (Either e a)
|
||||
parseFile f p = do
|
||||
contents <- C.readFile f
|
||||
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
|
||||
|
||||
-- some simple parser combinators
|
||||
|
||||
shiftEnv :: (Int, Text) -> Parser' e ()
|
||||
|
@ -119,13 +138,13 @@ takeWhile f = do
|
|||
takeWithProperty :: IC.Bool_ -> Parser' e Text
|
||||
takeWithProperty p = takeWhile $ IC.property p
|
||||
|
||||
takeIdent :: Parser' e (Maybe Ident)
|
||||
takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
|
||||
takeIdent = do
|
||||
env <- get
|
||||
let start_ = peOffset env
|
||||
let (l, r) = U.splitAt 1 (peText env)
|
||||
case U.uncons l of
|
||||
Nothing -> return Nothing
|
||||
Nothing -> okNone
|
||||
Just (fi, r2) -> (
|
||||
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
|
||||
`seq`
|
||||
|
@ -134,12 +153,24 @@ takeIdent = do
|
|||
shiftEnv (slen l, r)
|
||||
rest <- takeWithProperty IC.XidContinue
|
||||
end_ <- gets peOffset
|
||||
return (Just . (\s -> Ident (Span { start = start_, end = end_ }) s) $ theHorrorsLieWithin fi rest)
|
||||
) else return Nothing))
|
||||
let thlw = theHorrorsLieWithin fi rest
|
||||
let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
|
||||
return (fmap (Just . mkident) thlw)
|
||||
) else okNone))
|
||||
|
||||
where
|
||||
okNone :: Parser' e (Either e2 (Maybe dt))
|
||||
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
|
||||
|
||||
-- I hate this...
|
||||
theHorrorsLieWithin fi rest = U.fromRep . TE.encodeUtf8 . IN.nfc . TE.decodeUtf8Lenient $ (Bb.append (B.fromChar fi) (U.toRep rest))
|
||||
theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
|
||||
theHorrorsLieWithin fi rest =
|
||||
-- make a byte string from the parts of the identifyer
|
||||
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
|
||||
-- finish a UTF-8 decoded identifier
|
||||
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
|
||||
-- handle decoding
|
||||
fmap fini $ TE.decodeUtf8' iabs
|
||||
|
||||
skipWhiteSpace :: Parser' e ()
|
||||
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
|
||||
|
|
|
@ -1,13 +0,0 @@
|
|||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
module Language.Yanais.Parser.IO (
|
||||
parseFile
|
||||
) where
|
||||
|
||||
import qualified Data.ByteString.Char8 as C
|
||||
import qualified Language.Yanais.Parser as P
|
||||
|
||||
parseFile :: String -> P.Parser' e a -> IO (Either e a)
|
||||
parseFile f p = do
|
||||
contents <- C.readFile f
|
||||
let pe = P.makeParseEnv contents
|
||||
return (fmap (\(r, _) -> r) $ P.runParser pe p)
|
|
@ -62,7 +62,6 @@ library
|
|||
exposed-modules:
|
||||
Language.Yanais.Fusion
|
||||
, Language.Yanais.Parser
|
||||
, Language.Yanais.Parser.IO
|
||||
, Language.Yanais.Nfker7h.Parser.Types
|
||||
, Language.Yanais.Nfker7h.Parser.Lex
|
||||
|
||||
|
@ -77,7 +76,6 @@ library
|
|||
MultiParamTypeClasses
|
||||
OverloadedStrings
|
||||
RankNTypes
|
||||
ScopedTypeVariables
|
||||
|
||||
-- Other library packages from which modules are imported.
|
||||
build-depends:
|
||||
|
|
Loading…
Reference in a new issue