WaFlesh-v1/crates/wafl-parser/src/lex.rs
2023-05-20 23:33:24 +02:00

200 lines
6.6 KiB
Rust

use core::num::NonZeroU32;
/// A single lexical unit.
///
/// Variants carrying a `&'a str` borrow their text from the lexed input.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Token<'a> {
    // ---- word keywords ----
    /// The `cfe` keyword.
    CtrlFlowEdit,
    /// The `defer` keyword.
    Defer,
    /// The `final` keyword.
    Final,
    /// The `module` keyword.
    Module,
    /// The `pub` keyword.
    Public,

    // ---- single-character keywords ----
    /// `=`
    Assign,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `,`
    Comma,
    /// `;`
    SemiColon,
    /// `.`
    Dot,

    // ---- tokens with a payload ----
    /// amount of prefixed `$` to force immediate application
    ScopeRef(NonZeroU32),
    /// An identifier that is not one of the keywords.
    Identifier(&'a str),
    /// An integer literal, already converted to its value.
    Integer(i64),
    /// Input text that could not be turned into any other token.
    Unknown(&'a str),
}

impl Token<'_> {
    /// Returns `true` if this token is a definition attribute
    /// (currently only [`Token::Final`]).
    #[inline]
    pub fn is_def_attr(&self) -> bool {
        *self == Token::Final
    }

    /// Returns `true` if this token is a code-object attribute
    /// (currently only [`Token::CtrlFlowEdit`]).
    #[inline]
    pub fn is_cobj_attr(&self) -> bool {
        *self == Token::CtrlFlowEdit
    }
}
/// Lexer state: the not-yet-consumed input together with the location of
/// its first byte.
///
/// The methods that consume input (`eat`, `select_text`) shrink `s` from
/// the front and advance `loc` by the number of bytes consumed.
#[derive(Clone, PartialEq, Eq)]
pub struct Lexer<'a> {
/// Location of the first byte of `s`.
pub loc: Location,
/// Remaining input that has not been consumed yet.
pub s: &'a str,
}
/// A position inside a source file, identified by a file id and an offset.
///
/// The lexer advances `offset` by the number of bytes it consumes.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Location {
    /// Identifier of the file this location points into.
    pub fileid: u32,
    /// Offset inside that file.
    pub offset: u32,
}

impl core::ops::AddAssign<usize> for Location {
    /// Advances `offset` by `rhs`.
    ///
    /// # Panics
    ///
    /// Panics if `rhs` does not fit into a `u32`, or if the resulting offset
    /// would overflow `u32`. The overflow check is explicit so that release
    /// builds do not silently wrap the offset around.
    fn add_assign(&mut self, rhs: usize) {
        let delta = u32::try_from(rhs).expect("location offset increment does not fit in u32");
        self.offset = self
            .offset
            .checked_add(delta)
            .expect("location offset overflowed u32");
    }
}
impl<'a> Lexer<'a> {
    /// Consumes up to `n` bytes from the front of the remaining input and
    /// advances the location accordingly.
    ///
    /// If fewer than `n` bytes remain, everything that is left is consumed.
    ///
    /// Returns `true` if at least `n` bytes were available, `false` if the
    /// request had to be truncated.
    ///
    /// # Panics
    ///
    /// Panics if `n` is smaller than the remaining length but does not fall
    /// on a UTF-8 character boundary.
    pub fn eat(&mut self, n: usize) -> bool {
        let src = self.s;
        let taken = n.min(src.len());
        self.loc += taken;
        self.s = &src[taken..];
        n <= src.len()
    }

    /// use to determinate the length of a token based upon e.g.
    /// `lex.eat(loc.offset); toklen = lex.first_token_length();`
    ///
    /// Lexes one token (advancing the lexer) and returns how many bytes of
    /// input that consumed. Whitespace and comments skipped in front of the
    /// token are included in the count, so position the lexer at the start
    /// of the token first. Returns `None` if no token is left.
    pub fn first_token_length(&mut self) -> Option<usize> {
        let before = self.s.len();
        self.next()?;
        let after = self.s.len();
        // lexing only ever consumes input, so the remainder cannot grow
        assert!(before >= after);
        Some(before - after)
    }

    /// Consumes and returns the longest prefix of the remaining input that
    /// ends right before the first character (at or after byte `skip`) for
    /// which `f` returns `true`. If no such character exists, the whole
    /// remaining input is consumed.
    ///
    /// The first `skip` bytes are not tested against `f` but are part of the
    /// returned text. The location is advanced by the length of the returned
    /// text, so it always stays in sync with `self.s`.
    ///
    /// # Panics
    ///
    /// Panics if `skip` is past the end of the remaining input or not on a
    /// UTF-8 character boundary.
    fn select_text(&mut self, skip: usize, f: impl Fn(char) -> bool) -> &'a str {
        let src = self.s;
        let end = src[skip..].find(f).map_or(src.len(), |idx| skip + idx);
        let (txt, rest) = src.split_at(end);
        self.s = rest;
        self.loc += txt.len();
        txt
    }
}
impl<'a> Iterator for Lexer<'a> {
    type Item = (Location, Token<'a>);

    /// Lexes the next token.
    ///
    /// Whitespace and line comments (starting with `#` or `⍝`, running to
    /// the end of the line) are skipped. Returns the location of the token's
    /// first byte together with the token, or `None` once the input is
    /// exhausted.
    fn next(&mut self) -> Option<(Location, Token<'a>)> {
        loop {
            let x = self.s.chars().next()?;
            let loc = self.loc;

            // single-character keywords
            let punct = match x {
                '{' => Some(Token::OpenBrace),
                '}' => Some(Token::CloseBrace),
                '=' => Some(Token::Assign),
                ',' => Some(Token::Comma),
                ';' => Some(Token::SemiColon),
                '.' => Some(Token::Dot),
                _ => None,
            };
            if let Some(tok) = punct {
                self.eat(x.len_utf8());
                return Some((loc, tok));
            }

            let tok = match x {
                '#' | '⍝' => {
                    // comment until next line
                    match self.s.find('\n') {
                        None => {
                            let rest = self.s.len();
                            self.eat(rest);
                            return None;
                        }
                        Some(newline_at) => {
                            // `newline_at` is counted from the comment
                            // character itself, so only the newline's own
                            // byte has to be added (not the comment char's
                            // width, which is 3 bytes for `⍝`).
                            self.eat(newline_at + 1);
                            continue;
                        }
                    }
                }
                '0'..='9' => self.lex_integer(),
                '$' => {
                    // `$` is a single byte, so the byte length is the count
                    let dollars = self.select_text(0, |c| c != '$');
                    let count =
                        u32::try_from(dollars.len()).expect("too many scope ref uppers");
                    Token::ScopeRef(
                        NonZeroU32::new(count).expect("a scope ref starts with at least one `$`"),
                    )
                }
                _ if unicode_ident::is_xid_start(x) => {
                    let ident = self.select_text(0, |c| !unicode_ident::is_xid_continue(c));
                    match ident {
                        "cfe" => Token::CtrlFlowEdit,
                        "defer" => Token::Defer,
                        "final" => Token::Final,
                        "module" => Token::Module,
                        "pub" => Token::Public,
                        _ => Token::Identifier(ident),
                    }
                }
                _ if x.is_whitespace() => {
                    self.eat(x.len_utf8());
                    continue;
                }
                _ => {
                    // anything else: hand the single character back as unknown
                    let src = self.s;
                    let width = x.len_utf8();
                    self.eat(width);
                    Token::Unknown(&src[..width])
                }
            };
            return Some((loc, tok));
        }
    }
}

impl<'a> Lexer<'a> {
    /// Lexes an integer literal; the remaining input must start with an
    /// ASCII digit.
    ///
    /// Two forms are accepted:
    /// * `<digits>`: a plain base-10 literal;
    /// * `<digits>@<b>`: a literal with explicit base, where `<b>` is one
    ///   base-36 digit encoding the base minus one (`@9` => base 10,
    ///   `@Z` => base 36, `@1` => base 2).
    ///
    /// Anything that starts like an integer but is malformed (stray letters,
    /// a digit not valid in the given base, a missing or invalid base, or a
    /// value that does not fit in `i64`) is returned as [`Token::Unknown`]
    /// covering the consumed text.
    fn lex_integer(&mut self) -> Token<'a> {
        let start = self.s;
        // Take every base-36 digit. Those are all ASCII, so byte offsets and
        // character counts coincide below.
        let digits = self.select_text(0, |c| !c.is_digit(36));

        if !self.s.starts_with('@') {
            // base 10; letters or overflow make the parse fail
            return digits
                .parse::<i64>()
                .map_or(Token::Unknown(digits), Token::Integer);
        }

        self.eat(1); // '@'
        let Some(base_digit) = self.s.chars().next().and_then(|c| c.to_digit(36)) else {
            return Token::Unknown(&start[..digits.len() + 1]);
        };
        self.eat(1); // the base character (a base-36 digit is one ASCII byte)
        let whole = &start[..digits.len() + 2];

        let base = base_digit + 1;
        if base < 2 {
            // `@0` would mean base 1, which is not a valid radix
            return Token::Unknown(whole);
        }
        let base_wide = i64::from(base);
        digits
            .chars()
            .try_fold(0i64, |acc, c| {
                // fails on a digit outside the base or on i64 overflow
                let digit = c.to_digit(base)?;
                acc.checked_mul(base_wide)?
                    .checked_add_unsigned(u64::from(digit))
            })
            .map_or(Token::Unknown(whole), Token::Integer)
    }
}