276 lines
8.6 KiB
Rust
276 lines
8.6 KiB
Rust
/*
|
|
* SPDX-FileCopyrightText: 2023 Alain Zscheile <fogti+devel@ytrizja.de>
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
use core::{fmt, marker::PhantomData};
|
|
use std::sync::Arc;
|
|
|
|
use super::{Error, ErrorCtx, ErrorKind, EvEqSourceSpan, Result};
|
|
use yn_functor::Functor;
|
|
pub use yz_string_utils::StrLexerBase;
|
|
|
|
pub fn consume_ident(slb: &mut StrLexerBase<'_>) -> Arc<str> {
|
|
use unicode_normalization::UnicodeNormalization;
|
|
let s = slb
|
|
.consume_select(unicode_ident::is_xid_continue)
|
|
.nfc()
|
|
.to_string();
|
|
assert!(!s.is_empty());
|
|
s.into()
|
|
}
|
|
|
|
pub fn try_consume_ident(slb: &mut StrLexerBase<'_>) -> Option<Arc<str>> {
|
|
if slb.inp.chars().next().map(unicode_ident::is_xid_start) == Some(true) {
|
|
Some(consume_ident(slb))
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Tokenizer over a borrowed input string.
///
/// `Kw` is the keyword type: identifiers that successfully parse via
/// `Kw::from_str` are emitted as [`TokenKind::Kw`] (see the `Iterator` impl).
pub struct Lexer<'a, Kw> {
    // remaining input slice plus byte offset into the original input
    inner: StrLexerBase<'a>,
    // `fn() -> Kw` ties `Kw` to the type without storing one, so the
    // struct itself needs no `Kw: Clone`/`Send`/`Sync` bounds
    _kw: PhantomData<fn() -> Kw>,
}
|
|
|
|
impl<Kw> Clone for Lexer<'_, Kw> {
|
|
#[inline(always)]
|
|
fn clone(&self) -> Self {
|
|
Self {
|
|
inner: self.inner,
|
|
_kw: PhantomData,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// A lexed token: its kind plus the source span it was read from.
#[derive(Clone, Debug, PartialEq, yn_functor::Functor)]
pub struct Token<KwT> {
    pub kind: TokenKind<KwT>,
    pub span: EvEqSourceSpan,
}
|
|
|
|
impl<Kw: fmt::Debug> fmt::Display for Token<Kw> {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
write!(f, "@ {}: {:?}", self.span, self.kind)
|
|
}
|
|
}
|
|
|
|
/// The kinds of tokens the lexer produces; `KwT` is the keyword type.
#[derive(Clone, Debug, PartialEq, Eq, Functor)]
pub enum TokenKind<KwT> {
    /// bare identifier that did not parse as a keyword
    Ident(Arc<str>),
    /// `$ident` — identifier part may be empty
    PatOut(Arc<str>),
    /// `.ident` — identifier part may be empty
    DotIdent(Arc<str>),
    /// `"..."` string literal (escape codes not yet interpreted; see TODO in lexer)
    String(Box<str>),

    /// decimal integer literal
    Integer(usize),

    Caret,     // `^`
    LParen,    // `(`
    RParen,    // `)`
    LBrace,    // `{`
    RBrace,    // `}`
    LArr,      // `←`
    RArr,      // `→`
    Dot,       // NOTE(review): never emitted by `Lexer::next` in this file (`.` lexes as DotIdent)
    RefOf,     // `&`
    DubColon,  // NOTE(review): never emitted by `Lexer::next` in this file
    SemiColon, // `;`
    Assign,    // NOTE(review): never emitted by `Lexer::next` in this file

    /// identifier that parsed via `KwT::from_str`
    Kw(KwT),
}
|
|
|
|
impl<'a, Kw> Lexer<'a, Kw> {
|
|
#[inline]
|
|
pub fn new(inp: &'a str) -> Self {
|
|
Self {
|
|
inner: StrLexerBase { inp, offset: 0 },
|
|
_kw: PhantomData,
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn pmap<Kw2>(self) -> Lexer<'a, Kw2> {
|
|
Lexer {
|
|
inner: self.inner,
|
|
_kw: PhantomData,
|
|
}
|
|
}
|
|
|
|
#[inline(always)]
|
|
pub fn offset(&self) -> usize {
|
|
self.inner.offset
|
|
}
|
|
}
|
|
|
|
impl<'a, Kw: core::str::FromStr> Lexer<'a, Kw> {
|
|
pub fn peek(&self) -> Option<Result<Token<Kw>>> {
|
|
self.clone().next()
|
|
}
|
|
|
|
pub fn peek_span(&self) -> EvEqSourceSpan {
|
|
let mut this = self.clone();
|
|
match this.next() {
|
|
Some(Ok(Token { span, .. })) => span,
|
|
Some(Err(Error { span, .. })) => span,
|
|
None => (this.inner.offset..this.inner.offset).into(),
|
|
}
|
|
}
|
|
|
|
// handle EOF as error
|
|
pub fn next_in_noeof(&mut self, ctx: ErrorCtx) -> Result<Token<Kw>> {
|
|
let offset = self.offset();
|
|
self.next().unwrap_or_else(|| {
|
|
Err(Error {
|
|
span: (offset..offset).into(),
|
|
kind: ErrorKind::UnexpectedEof(ctx),
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
impl<'a, Kw> Lexer<'a, Kw>
|
|
where
|
|
Kw: core::str::FromStr + core::cmp::PartialEq,
|
|
{
|
|
// consume token if it is expected (for optional tokens)
|
|
pub fn got(&mut self, xkind: TokenKind<Kw>) -> Option<EvEqSourceSpan> {
|
|
let mut nxt = self.clone();
|
|
match nxt.next() {
|
|
Some(Ok(Token { span, kind })) if xkind == kind => {
|
|
*self = nxt;
|
|
Some(span)
|
|
}
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
// like `got`, but produce a proper error message if it is not there
|
|
pub fn expect(&mut self, xkind: TokenKind<Kw>, ctx: ErrorCtx) -> Result<EvEqSourceSpan> {
|
|
let mut nxt = self.clone();
|
|
let Token { span, kind } = nxt.next_in_noeof(ctx)?;
|
|
if xkind == kind {
|
|
*self = nxt;
|
|
Ok(span)
|
|
} else {
|
|
Err(Error {
|
|
span,
|
|
kind: ErrorKind::Expected(ctx),
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<Kw: core::str::FromStr> Iterator for Lexer<'_, Kw> {
|
|
type Item = Result<Token<Kw>>;
|
|
|
|
fn next(&mut self) -> Option<Result<Token<Kw>>> {
|
|
use TokenKind as Tk;
|
|
let slb = &mut self.inner;
|
|
let mut offset;
|
|
let tmp = 'lvl: loop {
|
|
// handle whitespace
|
|
slb.consume_select(|i| i.is_whitespace());
|
|
if slb.inp.is_empty() {
|
|
return None;
|
|
}
|
|
offset = slb.offset;
|
|
break match slb.inp.chars().next()? {
|
|
'0'..='9' => {
|
|
let s = slb.consume_select(|i| i.is_ascii_digit());
|
|
debug_assert!(!s.is_empty());
|
|
s.parse().map(TokenKind::Integer).map_err(|e| e.into())
|
|
}
|
|
|
|
'"' => {
|
|
let mut escape = false;
|
|
let mut it = slb.inp.chars().peekable();
|
|
let mut res = String::new();
|
|
loop {
|
|
let x = match it.next() {
|
|
None => break 'lvl Err(ErrorKind::UnexpectedEof(ErrorCtx::String)),
|
|
Some(x) => x,
|
|
};
|
|
slb.consume(x.len_utf8());
|
|
if escape {
|
|
escape = false;
|
|
// TODO: parse escape codes...
|
|
res.push(x);
|
|
} else {
|
|
match x {
|
|
'"' => break,
|
|
'\\' => escape = true,
|
|
_ => res.push(x),
|
|
}
|
|
}
|
|
}
|
|
Ok(Tk::String(res.into_boxed_str()))
|
|
}
|
|
|
|
c if unicode_ident::is_xid_start(c) => {
|
|
// identifier
|
|
let s = consume_ident(slb);
|
|
// handle keywords
|
|
Ok(match s.parse::<Kw>() {
|
|
Ok(x) => Tk::Kw(x),
|
|
Err(_) => Tk::Ident(s),
|
|
})
|
|
}
|
|
|
|
c => {
|
|
slb.consume(c.len_utf8());
|
|
Ok(match c {
|
|
'.' => Tk::DotIdent(try_consume_ident(slb).unwrap_or_else(|| Arc::from(""))),
|
|
'$' => Tk::PatOut(try_consume_ident(slb).unwrap_or_else(|| Arc::from(""))),
|
|
'&' => Tk::RefOf,
|
|
';' => Tk::SemiColon,
|
|
'^' => Tk::Caret,
|
|
'←' => Tk::LArr,
|
|
'→' => Tk::RArr,
|
|
'{' => Tk::LBrace,
|
|
'}' => Tk::RBrace,
|
|
'(' /* ')' */ => {
|
|
if slb.inp.starts_with('*') {
|
|
// comment
|
|
let mut lvl = 1u32;
|
|
let mut it = slb.inp.chars().peekable();
|
|
while lvl > 0 {
|
|
let c = match it.next() {
|
|
Some(c) => c,
|
|
None => break 'lvl Err(ErrorKind::UnexpectedEof(ErrorCtx::Comment)),
|
|
};
|
|
slb.consume(c.len_utf8());
|
|
let c2 = it.peek().copied();
|
|
match (c, c2) {
|
|
('(', Some('*')) => lvl = match lvl.checked_add(1) {
|
|
Some(x) => x,
|
|
None => break 'lvl Err(ErrorKind::CommentNestOverflow),
|
|
},
|
|
('*', Some(')')) => {
|
|
lvl -= 1;
|
|
it.next();
|
|
slb.consume(1);
|
|
},
|
|
_ => {}
|
|
}
|
|
}
|
|
continue;
|
|
} else {
|
|
Tk::LParen
|
|
}
|
|
}
|
|
/* '(' */ ')' => Tk::RParen,
|
|
_ => break 'lvl Err(ErrorKind::UnhandledChar(c)),
|
|
})
|
|
}
|
|
};
|
|
};
|
|
let span = (offset..slb.offset).into();
|
|
Some(
|
|
tmp.map(|kind| Token { span, kind })
|
|
.map_err(|kind| Error { span, kind }),
|
|
)
|
|
}
|
|
}
|