/* * SPDX-FileCopyrightText: 2023 Alain Zscheile * * SPDX-License-Identifier: Apache-2.0 */ use core::{fmt, marker::PhantomData}; use std::sync::Arc; use super::{Error, ErrorCtx, ErrorKind, EvEqSourceSpan, Result}; use yn_functor::Functor; pub use yz_string_utils::StrLexerBase; pub fn consume_ident(slb: &mut StrLexerBase<'_>) -> Arc { use unicode_normalization::UnicodeNormalization; let s = slb .consume_select(unicode_ident::is_xid_continue) .nfc() .to_string(); assert!(!s.is_empty()); s.into() } pub fn try_consume_ident(slb: &mut StrLexerBase<'_>) -> Option> { if slb.inp.chars().next().map(unicode_ident::is_xid_start) == Some(true) { Some(consume_ident(slb)) } else { None } } pub struct Lexer<'a, Kw> { inner: StrLexerBase<'a>, _kw: PhantomData Kw>, } impl Clone for Lexer<'_, Kw> { #[inline(always)] fn clone(&self) -> Self { Self { inner: self.inner, _kw: PhantomData, } } } #[derive(Clone, Debug, PartialEq, yn_functor::Functor)] pub struct Token { pub kind: TokenKind, pub span: EvEqSourceSpan, } impl fmt::Display for Token { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "@ {}: {:?}", self.span, self.kind) } } #[derive(Clone, Debug, PartialEq, Eq, Functor)] pub enum TokenKind { Ident(Arc), PatOut(Arc), DotIdent(Arc), String(Box), Integer(usize), Caret, LParen, RParen, LBrace, RBrace, LArr, RArr, Dot, RefOf, DubColon, SemiColon, Assign, Kw(KwT), } impl<'a, Kw> Lexer<'a, Kw> { #[inline] pub fn new(inp: &'a str) -> Self { Self { inner: StrLexerBase { inp, offset: 0 }, _kw: PhantomData, } } #[inline(always)] pub fn pmap(self) -> Lexer<'a, Kw2> { Lexer { inner: self.inner, _kw: PhantomData, } } #[inline(always)] pub fn offset(&self) -> usize { self.inner.offset } } impl<'a, Kw: core::str::FromStr> Lexer<'a, Kw> { pub fn peek(&self) -> Option>> { self.clone().next() } pub fn peek_span(&self) -> EvEqSourceSpan { let mut this = self.clone(); match this.next() { Some(Ok(Token { span, .. })) => span, Some(Err(Error { span, .. })) => span, None => (this.inner.offset..this.inner.offset).into(), } } // handle EOF as error pub fn next_in_noeof(&mut self, ctx: ErrorCtx) -> Result> { let offset = self.offset(); self.next().unwrap_or_else(|| { Err(Error { span: (offset..offset).into(), kind: ErrorKind::UnexpectedEof(ctx), }) }) } } impl<'a, Kw> Lexer<'a, Kw> where Kw: core::str::FromStr + core::cmp::PartialEq, { // consume token if it is expected (for optional tokens) pub fn got(&mut self, xkind: TokenKind) -> Option { let mut nxt = self.clone(); match nxt.next() { Some(Ok(Token { span, kind })) if xkind == kind => { *self = nxt; Some(span) } _ => None, } } // like `got`, but produce a proper error message if it is not there pub fn expect(&mut self, xkind: TokenKind, ctx: ErrorCtx) -> Result { let mut nxt = self.clone(); let Token { span, kind } = nxt.next_in_noeof(ctx)?; if xkind == kind { *self = nxt; Ok(span) } else { Err(Error { span, kind: ErrorKind::Expected(ctx), }) } } } impl Iterator for Lexer<'_, Kw> { type Item = Result>; fn next(&mut self) -> Option>> { use TokenKind as Tk; let slb = &mut self.inner; let mut offset; let tmp = 'lvl: loop { // handle whitespace slb.consume_select(|i| i.is_whitespace()); if slb.inp.is_empty() { return None; } offset = slb.offset; break match slb.inp.chars().next()? { '0'..='9' => { let s = slb.consume_select(|i| i.is_ascii_digit()); debug_assert!(!s.is_empty()); s.parse().map(TokenKind::Integer).map_err(|e| e.into()) } '"' => { let mut escape = false; let mut it = slb.inp.chars().peekable(); let mut res = String::new(); loop { let x = match it.next() { None => break 'lvl Err(ErrorKind::UnexpectedEof(ErrorCtx::String)), Some(x) => x, }; slb.consume(x.len_utf8()); if escape { escape = false; // TODO: parse escape codes... res.push(x); } else { match x { '"' => break, '\\' => escape = true, _ => res.push(x), } } } Ok(Tk::String(res.into_boxed_str())) } c if unicode_ident::is_xid_start(c) => { // identifier let s = consume_ident(slb); // handle keywords Ok(match s.parse::() { Ok(x) => Tk::Kw(x), Err(_) => Tk::Ident(s), }) } c => { slb.consume(c.len_utf8()); Ok(match c { '.' => Tk::DotIdent(try_consume_ident(slb).unwrap_or_else(|| Arc::from(""))), '$' => Tk::PatOut(try_consume_ident(slb).unwrap_or_else(|| Arc::from(""))), '&' => Tk::RefOf, ';' => Tk::SemiColon, '^' => Tk::Caret, '←' => Tk::LArr, '→' => Tk::RArr, '{' => Tk::LBrace, '}' => Tk::RBrace, '(' /* ')' */ => { if slb.inp.starts_with('*') { // comment let mut lvl = 1u32; let mut it = slb.inp.chars().peekable(); while lvl > 0 { let c = match it.next() { Some(c) => c, None => break 'lvl Err(ErrorKind::UnexpectedEof(ErrorCtx::Comment)), }; slb.consume(c.len_utf8()); let c2 = it.peek().copied(); match (c, c2) { ('(', Some('*')) => lvl = match lvl.checked_add(1) { Some(x) => x, None => break 'lvl Err(ErrorKind::CommentNestOverflow), }, ('*', Some(')')) => { lvl -= 1; it.next(); slb.consume(1); }, _ => {} } } continue; } else { Tk::LParen } } /* '(' */ ')' => Tk::RParen, _ => break 'lvl Err(ErrorKind::UnhandledChar(c)), }) } }; }; let span = (offset..slb.offset).into(); Some( tmp.map(|kind| Token { span, kind }) .map_err(|kind| Error { span, kind }), ) } }