haskell: handle unicode decoding errors properly
This commit is contained in:
parent
9785fdd5f4
commit
41db2b8139
6 changed files with 47 additions and 32 deletions
|
@ -2,7 +2,6 @@ module Main where
|
||||||
|
|
||||||
import System.Environment
|
import System.Environment
|
||||||
import qualified Language.Yanais.Parser as P
|
import qualified Language.Yanais.Parser as P
|
||||||
import qualified Language.Yanais.Parser.IO as Pio
|
|
||||||
import qualified Language.Yanais.Nfker7h.Parser.Lex as L
|
import qualified Language.Yanais.Nfker7h.Parser.Lex as L
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
|
@ -11,8 +10,8 @@ main = do
|
||||||
mainA args
|
mainA args
|
||||||
|
|
||||||
mainA :: [String] -> IO ()
|
mainA :: [String] -> IO ()
|
||||||
mainA [x] = Pio.parseFile x L.lexe >>= putStrLn . (\x -> case x of
|
mainA [x] = P.parseFile x L.lexe >>= putStrLn . (\y -> case y of
|
||||||
Right y -> fmtList y
|
Right z -> fmtList z
|
||||||
Left e -> show e)
|
Left e -> show e)
|
||||||
mainA _ = putStrLn "Hello World!"
|
mainA _ = putStrLn "Hello World!"
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
|
||||||
|
|
||||||
module Language.Yanais.Nfker7h.Parser.Lex (
|
module Language.Yanais.Nfker7h.Parser.Lex (
|
||||||
Token,
|
Token,
|
||||||
TokenKind(..),
|
TokenKind(..),
|
||||||
|
@ -47,15 +45,14 @@ lexeIdentPrefixTree = HandleTree (H.fromList l1) Nothing
|
||||||
where
|
where
|
||||||
l1 = [('.', Htleaf IpDot), ('$', Htleaf IpPatOut)]
|
l1 = [('.', Htleaf IpDot), ('$', Htleaf IpPatOut)]
|
||||||
|
|
||||||
eitherToParser :: Either Error x -> Parser' Error x
|
|
||||||
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
|
|
||||||
|
|
||||||
handleIdentsI :: Parser' Error (Maybe TokenKind)
|
handleIdentsI :: Parser' Error (Maybe TokenKind)
|
||||||
handleIdentsI = do
|
handleIdentsI = do
|
||||||
pfx <- eats lexeIdentPrefixTree
|
pfx <- eats lexeIdentPrefixTree
|
||||||
marker <- gets peOffset
|
marker <- gets peOffset
|
||||||
ident <- takeIdent
|
ident' <- takeIdent >>= (\ident -> eitherToParser $ case ident of
|
||||||
case tmp pfx ident marker of
|
Left exc -> Left $ DteUnicodeExc exc
|
||||||
|
Right x -> Right x)
|
||||||
|
case tmp pfx ident' marker of
|
||||||
Just x -> pure . Just $ x
|
Just x -> pure . Just $ x
|
||||||
Nothing -> do
|
Nothing -> do
|
||||||
gtc <- tryOne Just
|
gtc <- tryOne Just
|
||||||
|
@ -67,7 +64,7 @@ handleIdentsI = do
|
||||||
where
|
where
|
||||||
tmp (Just IpDot) (Just x) _ = Just $ TDotIdent x
|
tmp (Just IpDot) (Just x) _ = Just $ TDotIdent x
|
||||||
tmp (Just IpDot) Nothing _ = Just $ TDot
|
tmp (Just IpDot) Nothing _ = Just $ TDot
|
||||||
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (Ident (Span {start = m, end = m}) "") (\z -> z) x)
|
tmp (Just IpPatOut) x m = Just $ TPatOut (maybe (emptyIdent m) (\z -> z) x)
|
||||||
tmp Nothing (Just x) _ = Just $ TIdent x
|
tmp Nothing (Just x) _ = Just $ TIdent x
|
||||||
tmp Nothing Nothing _ = Nothing
|
tmp Nothing Nothing _ = Nothing
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,7 @@ module Language.Yanais.Nfker7h.Parser.Types (
|
||||||
MaybeGetSpan(..)
|
MaybeGetSpan(..)
|
||||||
) where
|
) where
|
||||||
|
|
||||||
|
import Data.Text.Encoding.Error (UnicodeException)
|
||||||
import qualified Language.Yanais.Parser as P
|
import qualified Language.Yanais.Parser as P
|
||||||
|
|
||||||
-- a "classic" tristate result
|
-- a "classic" tristate result
|
||||||
|
@ -16,6 +17,7 @@ data Context = CtxModule
|
||||||
|
|
||||||
data Error = UnexpectedEof Context
|
data Error = UnexpectedEof Context
|
||||||
| UnexpectedChar !Int !Char
|
| UnexpectedChar !Int !Char
|
||||||
|
| DteUnicodeExc UnicodeException
|
||||||
| UnknownIdent !P.Ident
|
| UnknownIdent !P.Ident
|
||||||
|
|
||||||
instance Functor Result where
|
instance Functor Result where
|
||||||
|
@ -52,3 +54,4 @@ instance Show Error where
|
||||||
show (UnexpectedEof ctx) = "end of file encountered inside " <> show ctx
|
show (UnexpectedEof ctx) = "end of file encountered inside " <> show ctx
|
||||||
show (UnknownIdent x) = "unknown identifier: " <> show x
|
show (UnknownIdent x) = "unknown identifier: " <> show x
|
||||||
show (UnexpectedChar pos c) = "unexpected / unknown character at " <> show pos <> ": " <> show c <> " (" <> (show $ fromEnum c) <> ")"
|
show (UnexpectedChar pos c) = "unexpected / unknown character at " <> show pos <> ": " <> show c <> " (" <> (show $ fromEnum c) <> ")"
|
||||||
|
show (DteUnicodeExc exc) = "unicode decoding error: " <> show exc
|
||||||
|
|
|
@ -11,8 +11,15 @@ module Language.Yanais.Parser (
|
||||||
ParserEnv(..),
|
ParserEnv(..),
|
||||||
Span(..),
|
Span(..),
|
||||||
HandleTree(..),
|
HandleTree(..),
|
||||||
|
emptyIdent,
|
||||||
|
eitherToParser,
|
||||||
|
|
||||||
|
-- creating and running parsers
|
||||||
makeParseEnv,
|
makeParseEnv,
|
||||||
runParser,
|
runParser,
|
||||||
|
parseFile,
|
||||||
|
|
||||||
|
-- parser combinators
|
||||||
takeUntil,
|
takeUntil,
|
||||||
takeWhile,
|
takeWhile,
|
||||||
takeIdent,
|
takeIdent,
|
||||||
|
@ -33,8 +40,8 @@ import qualified Data.ByteString.Char8 as C
|
||||||
import qualified Data.ByteString.UTF8 as B
|
import qualified Data.ByteString.UTF8 as B
|
||||||
import qualified Data.HashMap.Strict as H
|
import qualified Data.HashMap.Strict as H
|
||||||
import qualified Data.String.UTF8 as U
|
import qualified Data.String.UTF8 as U
|
||||||
import qualified Data.Text as T
|
|
||||||
import qualified Data.Text.Encoding as TE
|
import qualified Data.Text.Encoding as TE
|
||||||
|
import Data.Text.Encoding.Error (UnicodeException)
|
||||||
import qualified Data.Text.ICU.Char as IC
|
import qualified Data.Text.ICU.Char as IC
|
||||||
import qualified Data.Text.ICU.Normalize2 as IN
|
import qualified Data.Text.ICU.Normalize2 as IN
|
||||||
-- import Language.Yanais.Fusion
|
-- import Language.Yanais.Fusion
|
||||||
|
@ -79,6 +86,9 @@ type Text = U.UTF8 B.ByteString
|
||||||
|
|
||||||
data Ident = Ident !Span !Text
|
data Ident = Ident !Span !Text
|
||||||
|
|
||||||
|
emptyIdent :: Int -> Ident
|
||||||
|
emptyIdent m = Ident (Span { start = m, end = m }) (U.fromRep Bb.empty)
|
||||||
|
|
||||||
instance Show Ident where
|
instance Show Ident where
|
||||||
show (Ident _ t) = B.toString $ U.toRep t
|
show (Ident _ t) = B.toString $ U.toRep t
|
||||||
|
|
||||||
|
@ -89,12 +99,21 @@ data ParserEnv = ParserEnv {
|
||||||
|
|
||||||
type Parser' e a = Parser ParserEnv e a
|
type Parser' e a = Parser ParserEnv e a
|
||||||
|
|
||||||
|
-- | injects an `Either` into the parser (used to conditionally throw errors)
|
||||||
|
eitherToParser :: Either e a -> Parser s e a
|
||||||
|
eitherToParser e = (Parser . StateT $ \st -> fmap (\x -> (x, st)) e)
|
||||||
|
|
||||||
makeParseEnv :: B.ByteString -> ParserEnv
|
makeParseEnv :: B.ByteString -> ParserEnv
|
||||||
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
|
makeParseEnv bs = ParserEnv { peOffset = 0, peText = U.fromRep bs }
|
||||||
|
|
||||||
runParser :: s -> Parser s e a -> Either e (a, s)
|
runParser :: s -> Parser s e a -> Either e (a, s)
|
||||||
runParser st (Parser p) = runStateT p st
|
runParser st (Parser p) = runStateT p st
|
||||||
|
|
||||||
|
parseFile :: String -> Parser' e a -> IO (Either e a)
|
||||||
|
parseFile f p = do
|
||||||
|
contents <- C.readFile f
|
||||||
|
return (fmap (\(r, _) -> r) $ runParser (makeParseEnv contents) p)
|
||||||
|
|
||||||
-- some simple parser combinators
|
-- some simple parser combinators
|
||||||
|
|
||||||
shiftEnv :: (Int, Text) -> Parser' e ()
|
shiftEnv :: (Int, Text) -> Parser' e ()
|
||||||
|
@ -119,13 +138,13 @@ takeWhile f = do
|
||||||
takeWithProperty :: IC.Bool_ -> Parser' e Text
|
takeWithProperty :: IC.Bool_ -> Parser' e Text
|
||||||
takeWithProperty p = takeWhile $ IC.property p
|
takeWithProperty p = takeWhile $ IC.property p
|
||||||
|
|
||||||
takeIdent :: Parser' e (Maybe Ident)
|
takeIdent :: Parser' e (Either UnicodeException (Maybe Ident))
|
||||||
takeIdent = do
|
takeIdent = do
|
||||||
env <- get
|
env <- get
|
||||||
let start_ = peOffset env
|
let start_ = peOffset env
|
||||||
let (l, r) = U.splitAt 1 (peText env)
|
let (l, r) = U.splitAt 1 (peText env)
|
||||||
case U.uncons l of
|
case U.uncons l of
|
||||||
Nothing -> return Nothing
|
Nothing -> okNone
|
||||||
Just (fi, r2) -> (
|
Just (fi, r2) -> (
|
||||||
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
|
(if not ((slen r2) == 0) then error "takeIdent : unable to iterator over characters" else ())
|
||||||
`seq`
|
`seq`
|
||||||
|
@ -134,12 +153,24 @@ takeIdent = do
|
||||||
shiftEnv (slen l, r)
|
shiftEnv (slen l, r)
|
||||||
rest <- takeWithProperty IC.XidContinue
|
rest <- takeWithProperty IC.XidContinue
|
||||||
end_ <- gets peOffset
|
end_ <- gets peOffset
|
||||||
return (Just . (\s -> Ident (Span { start = start_, end = end_ }) s) $ theHorrorsLieWithin fi rest)
|
let thlw = theHorrorsLieWithin fi rest
|
||||||
) else return Nothing))
|
let mkident = \s -> Ident (Span { start = start_, end = end_ }) s
|
||||||
|
return (fmap (Just . mkident) thlw)
|
||||||
|
) else okNone))
|
||||||
|
|
||||||
where
|
where
|
||||||
|
okNone :: Parser' e (Either e2 (Maybe dt))
|
||||||
|
okNone = Parser . StateT $ \st -> Right (Right Nothing, st)
|
||||||
|
|
||||||
-- I hate this...
|
-- I hate this...
|
||||||
theHorrorsLieWithin fi rest = U.fromRep . TE.encodeUtf8 . IN.nfc . TE.decodeUtf8Lenient $ (Bb.append (B.fromChar fi) (U.toRep rest))
|
theHorrorsLieWithin :: Char -> Text -> Either UnicodeException Text
|
||||||
|
theHorrorsLieWithin fi rest =
|
||||||
|
-- make a byte string from the parts of the identifyer
|
||||||
|
let iabs = Bb.append (B.fromString [fi]) (U.toRep rest) in
|
||||||
|
-- finish a UTF-8 decoded identifier
|
||||||
|
let fini = U.fromRep . TE.encodeUtf8 . IN.nfc in
|
||||||
|
-- handle decoding
|
||||||
|
fmap fini $ TE.decodeUtf8' iabs
|
||||||
|
|
||||||
skipWhiteSpace :: Parser' e ()
|
skipWhiteSpace :: Parser' e ()
|
||||||
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
|
skipWhiteSpace = takeWithProperty IC.WhiteSpace >> pure ()
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
{-# LANGUAGE ScopedTypeVariables #-}
|
|
||||||
module Language.Yanais.Parser.IO (
|
|
||||||
parseFile
|
|
||||||
) where
|
|
||||||
|
|
||||||
import qualified Data.ByteString.Char8 as C
|
|
||||||
import qualified Language.Yanais.Parser as P
|
|
||||||
|
|
||||||
parseFile :: String -> P.Parser' e a -> IO (Either e a)
|
|
||||||
parseFile f p = do
|
|
||||||
contents <- C.readFile f
|
|
||||||
let pe = P.makeParseEnv contents
|
|
||||||
return (fmap (\(r, _) -> r) $ P.runParser pe p)
|
|
|
@ -62,7 +62,6 @@ library
|
||||||
exposed-modules:
|
exposed-modules:
|
||||||
Language.Yanais.Fusion
|
Language.Yanais.Fusion
|
||||||
, Language.Yanais.Parser
|
, Language.Yanais.Parser
|
||||||
, Language.Yanais.Parser.IO
|
|
||||||
, Language.Yanais.Nfker7h.Parser.Types
|
, Language.Yanais.Nfker7h.Parser.Types
|
||||||
, Language.Yanais.Nfker7h.Parser.Lex
|
, Language.Yanais.Nfker7h.Parser.Lex
|
||||||
|
|
||||||
|
@ -77,7 +76,6 @@ library
|
||||||
MultiParamTypeClasses
|
MultiParamTypeClasses
|
||||||
OverloadedStrings
|
OverloadedStrings
|
||||||
RankNTypes
|
RankNTypes
|
||||||
ScopedTypeVariables
|
|
||||||
|
|
||||||
-- Other library packages from which modules are imported.
|
-- Other library packages from which modules are imported.
|
||||||
build-depends:
|
build-depends:
|
||||||
|
|
Loading…
Reference in a new issue