yanais/rust/crates/yanais-syntax/src/lex.rs
Alain Zscheile 9b0f719fe2 RefOf [WIP]
2024-02-11 18:01:38 +01:00

276 lines
8.6 KiB
Rust

/*
* SPDX-FileCopyrightText: 2023 Alain Zscheile <fogti+devel@ytrizja.de>
*
* SPDX-License-Identifier: Apache-2.0
*/
use core::{fmt, marker::PhantomData};
use std::sync::Arc;
use super::{Error, ErrorCtx, ErrorKind, EvEqSourceSpan, Result};
use yn_functor::Functor;
pub use yz_string_utils::StrLexerBase;
/// Consume a run of XID-continue characters and return it NFC-normalized.
///
/// Panics (via the assert) if the input does not start with at least one
/// identifier character — callers must check the first char themselves,
/// or use [`try_consume_ident`].
pub fn consume_ident(slb: &mut StrLexerBase<'_>) -> Arc<str> {
    use unicode_normalization::UnicodeNormalization;
    let normalized: String = slb
        .consume_select(unicode_ident::is_xid_continue)
        .nfc()
        .collect();
    assert!(!normalized.is_empty());
    Arc::from(normalized)
}
/// Consume an identifier iff the input starts with an XID-start character;
/// otherwise leave the lexer untouched and return `None`.
pub fn try_consume_ident(slb: &mut StrLexerBase<'_>) -> Option<Arc<str>> {
    let starts_ident = slb
        .inp
        .chars()
        .next()
        .map_or(false, unicode_ident::is_xid_start);
    starts_ident.then(|| consume_ident(slb))
}
/// Lexer over an input string, generic over the keyword type `Kw`.
///
/// `Kw` only influences how identifiers are classified (via `FromStr` in the
/// `Iterator` impl); no `Kw` value is ever stored.
pub struct Lexer<'a, Kw> {
    inner: StrLexerBase<'a>,
    // `fn() -> Kw` ties the type parameter to the struct without owning a
    // `Kw`, so no `Kw` bounds leak into auto-traits or the manual `Clone`.
    _kw: PhantomData<fn() -> Kw>,
}
// Hand-written so the lexer is cloneable for *any* `Kw`: a derived `Clone`
// would add a spurious `Kw: Clone` bound even though only phantom data and
// the (copyable) `StrLexerBase` are stored.
impl<Kw> Clone for Lexer<'_, Kw> {
    #[inline(always)]
    fn clone(&self) -> Self {
        Self {
            inner: self.inner,
            _kw: PhantomData,
        }
    }
}
/// A lexed token: its kind plus the source span (byte offsets) it was read from.
#[derive(Clone, Debug, PartialEq, yn_functor::Functor)]
pub struct Token<KwT> {
    // What was lexed.
    pub kind: TokenKind<KwT>,
    // Byte range of the token in the original input.
    pub span: EvEqSourceSpan,
}
impl<Kw: fmt::Debug> fmt::Display for Token<Kw> {
    /// Render as `@ <span>: <kind>` for diagnostics.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { kind, span } = self;
        write!(f, "@ {}: {:?}", span, kind)
    }
}
/// The kinds of tokens produced by the lexer.
#[derive(Clone, Debug, PartialEq, Eq, Functor)]
pub enum TokenKind<KwT> {
    // Identifier (NFC-normalized) that did not parse as a keyword `KwT`.
    Ident(Arc<str>),
    // `$ident` — the identifier part may be empty (`$` alone).
    PatOut(Arc<str>),
    // `.ident` — the identifier part may be empty (`.` alone).
    DotIdent(Arc<str>),
    // `"..."` string literal; escape codes are currently passed through verbatim.
    String(Box<str>),
    // Decimal integer literal.
    Integer(usize),
    // `^`
    Caret,
    // `(`
    LParen,
    // `)`
    RParen,
    // `{`
    LBrace,
    // `}`
    RBrace,
    // `←`
    LArr,
    // `→`
    RArr,
    // NOTE(review): not produced by the lexer seen here (`.` yields `DotIdent`);
    // confirm where `Dot` is emitted.
    Dot,
    // `&`
    RefOf,
    // presumably `::` — not produced in the visible lexer; confirm.
    DubColon,
    // `;`
    SemiColon,
    // presumably an assignment operator — not produced in the visible lexer; confirm.
    Assign,
    // Keyword: an identifier that parsed successfully via `KwT::from_str`.
    Kw(KwT),
}
impl<'a, Kw> Lexer<'a, Kw> {
    /// Create a lexer over `inp`, starting at byte offset 0.
    #[inline]
    pub fn new(inp: &'a str) -> Self {
        Self {
            inner: StrLexerBase { inp, offset: 0 },
            _kw: PhantomData,
        }
    }
    /// Reinterpret this lexer with a different keyword type `Kw2`.
    /// Zero-cost: only the phantom type parameter changes.
    #[inline(always)]
    pub fn pmap<Kw2>(self) -> Lexer<'a, Kw2> {
        Lexer {
            inner: self.inner,
            _kw: PhantomData,
        }
    }
    /// Current byte offset into the original input.
    #[inline(always)]
    pub fn offset(&self) -> usize {
        self.inner.offset
    }
}
impl<'a, Kw: core::str::FromStr> Lexer<'a, Kw> {
pub fn peek(&self) -> Option<Result<Token<Kw>>> {
self.clone().next()
}
pub fn peek_span(&self) -> EvEqSourceSpan {
let mut this = self.clone();
match this.next() {
Some(Ok(Token { span, .. })) => span,
Some(Err(Error { span, .. })) => span,
None => (this.inner.offset..this.inner.offset).into(),
}
}
// handle EOF as error
pub fn next_in_noeof(&mut self, ctx: ErrorCtx) -> Result<Token<Kw>> {
let offset = self.offset();
self.next().unwrap_or_else(|| {
Err(Error {
span: (offset..offset).into(),
kind: ErrorKind::UnexpectedEof(ctx),
})
})
}
}
impl<'a, Kw> Lexer<'a, Kw>
where
Kw: core::str::FromStr + core::cmp::PartialEq,
{
// consume token if it is expected (for optional tokens)
pub fn got(&mut self, xkind: TokenKind<Kw>) -> Option<EvEqSourceSpan> {
let mut nxt = self.clone();
match nxt.next() {
Some(Ok(Token { span, kind })) if xkind == kind => {
*self = nxt;
Some(span)
}
_ => None,
}
}
// like `got`, but produce a proper error message if it is not there
pub fn expect(&mut self, xkind: TokenKind<Kw>, ctx: ErrorCtx) -> Result<EvEqSourceSpan> {
let mut nxt = self.clone();
let Token { span, kind } = nxt.next_in_noeof(ctx)?;
if xkind == kind {
*self = nxt;
Ok(span)
} else {
Err(Error {
span,
kind: ErrorKind::Expected(ctx),
})
}
}
}
impl<Kw: core::str::FromStr> Iterator for Lexer<'_, Kw> {
    type Item = Result<Token<Kw>>;

    /// Lex the next token, or return `None` at end of input.
    ///
    /// Whitespace is skipped; `(* ... *)` comments nest and are skipped
    /// entirely. Errors (unterminated string/comment, comment nesting
    /// overflow, integer parse failure, unhandled character) are reported
    /// with the span lexed so far.
    fn next(&mut self) -> Option<Result<Token<Kw>>> {
        use TokenKind as Tk;
        let slb = &mut self.inner;
        let mut offset;
        // The labeled loop exists so the comment arm can `continue` (comment
        // fully skipped, lex again) and error sites can `break 'lvl Err(..)`.
        let tmp = 'lvl: loop {
            // skip leading whitespace
            slb.consume_select(|i| i.is_whitespace());
            if slb.inp.is_empty() {
                return None;
            }
            // token span starts after the whitespace
            offset = slb.offset;
            break match slb.inp.chars().next()? {
                '0'..='9' => {
                    // decimal integer literal
                    let s = slb.consume_select(|i| i.is_ascii_digit());
                    debug_assert!(!s.is_empty());
                    s.parse().map(TokenKind::Integer).map_err(|e| e.into())
                }
                '"' => {
                    // string literal
                    // BUGFIX: consume the opening quote before scanning the
                    // body. Previously the first scanned char was the opening
                    // `"` itself, which immediately matched the closing-quote
                    // case, so every string lexed as `String("")` and its
                    // contents were re-lexed as ordinary tokens.
                    slb.consume(1);
                    let mut escape = false;
                    let mut it = slb.inp.chars();
                    let mut res = String::new();
                    loop {
                        let x = match it.next() {
                            None => break 'lvl Err(ErrorKind::UnexpectedEof(ErrorCtx::String)),
                            Some(x) => x,
                        };
                        // keep the base lexer in lockstep with `it`
                        slb.consume(x.len_utf8());
                        if escape {
                            escape = false;
                            // TODO: parse escape codes...
                            res.push(x);
                        } else {
                            match x {
                                '"' => break,
                                '\\' => escape = true,
                                _ => res.push(x),
                            }
                        }
                    }
                    Ok(Tk::String(res.into_boxed_str()))
                }
                c if unicode_ident::is_xid_start(c) => {
                    // identifier; keywords are whatever parses as `Kw`
                    let s = consume_ident(slb);
                    // handle keywords
                    Ok(match s.parse::<Kw>() {
                        Ok(x) => Tk::Kw(x),
                        Err(_) => Tk::Ident(s),
                    })
                }
                c => {
                    slb.consume(c.len_utf8());
                    Ok(match c {
                        // `.` / `$` optionally swallow a directly following
                        // identifier; without one, the payload is empty.
                        '.' => Tk::DotIdent(try_consume_ident(slb).unwrap_or_else(|| Arc::from(""))),
                        '$' => Tk::PatOut(try_consume_ident(slb).unwrap_or_else(|| Arc::from(""))),
                        '&' => Tk::RefOf,
                        ';' => Tk::SemiColon,
                        '^' => Tk::Caret,
                        '←' => Tk::LArr,
                        '→' => Tk::RArr,
                        '{' => Tk::LBrace,
                        '}' => Tk::RBrace,
                        '(' /* ')' */ => {
                            if slb.inp.starts_with('*') {
                                // nested `(* ... *)` comment: skip it entirely,
                                // then `continue` to lex the next real token
                                let mut lvl = 1u32;
                                let mut it = slb.inp.chars().peekable();
                                while lvl > 0 {
                                    let c = match it.next() {
                                        Some(c) => c,
                                        None => break 'lvl Err(ErrorKind::UnexpectedEof(ErrorCtx::Comment)),
                                    };
                                    slb.consume(c.len_utf8());
                                    let c2 = it.peek().copied();
                                    // NOTE(review): on `(*` only the `(` is
                                    // consumed here, so the `*` can also pair
                                    // with a following `)` — i.e. `(*)` opens
                                    // *and* closes a nesting level. Confirm
                                    // whether that is intended.
                                    match (c, c2) {
                                        ('(', Some('*')) => lvl = match lvl.checked_add(1) {
                                            Some(x) => x,
                                            None => break 'lvl Err(ErrorKind::CommentNestOverflow),
                                        },
                                        ('*', Some(')')) => {
                                            lvl -= 1;
                                            it.next();
                                            slb.consume(1);
                                        },
                                        _ => {}
                                    }
                                }
                                continue;
                            } else {
                                Tk::LParen
                            }
                        }
                        /* '(' */ ')' => Tk::RParen,
                        _ => break 'lvl Err(ErrorKind::UnhandledChar(c)),
                    })
                }
            };
        };
        let span = (offset..slb.offset).into();
        Some(
            tmp.map(|kind| Token { span, kind })
                .map_err(|kind| Error { span, kind }),
        )
    }
}