add simple lexer

This commit is contained in:
Alain Zscheile 2023-05-19 00:46:12 +02:00
parent 6554b0f679
commit 8287a96134
7 changed files with 191 additions and 11 deletions

6
.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
.#*
/target
result
result-*
perf.data*
flamegraph.svg

16
Cargo.lock generated Normal file
View file

@ -0,0 +1,16 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "unicode-ident"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
[[package]]
name = "wafl-parser"
version = "0.1.0"
dependencies = [
"unicode-ident",
]

7
Cargo.toml Normal file
View file

@ -0,0 +1,7 @@
[workspace]
members = ["crates/*"]
[profile.release]
codegen-units = 1
debug = 1
lto = "thin"

View file

@ -9,7 +9,7 @@
## Syntax
```
[cfe] [final] name ["{" args (separated via commas) "}"] = {
[@cfe] [@final] name ["{" args (separated via commas) "}"] = {
content;
content;
etc;
@ -19,8 +19,8 @@
## Attributes of Code Objects
- `final` prevents overwriting and basically marks a root
- `cfe` allows an object to access the continuation and omits the default return
- `@final` prevents overwriting and basically marks a root
- `@cfe` allows an object to access the continuation and omits the default return
## Early vs late binding
@ -34,9 +34,9 @@ as an argument, prefix the block with `'`.
## Control flow builtins
- `if cond ... then ... else ...`
- `loop ...`
- `return ...` (implicit when last expr in a block is not terminated via semicolon)
- `@if cond ... then ... else ...`
- `@loop ...`
- `@return ...` (implicit when last expr in a block is not terminated via semicolon)
## Modules
@ -60,8 +60,8 @@ with package-lock files or such. Packages (packaged module trees) would then be
```
# 1.
final main { args, env } = {
std::io::writeln "Hello World!";
@final main { args, env } = {
std:io:writeln "Hello World!";
0
};
@ -70,9 +70,9 @@ with package-lock files or such. Packages (packaged module trees) would then be
business_in_the_front = party_in_the_back;
what_now = {
self::business_in_the_front;
} self::party_in_the_back {
std::io::writeln "It works";
self:business_in_the_front;
}, self:party_in_the_back {
std:io:writeln "It works";
};
# running `what_now` results in "It works" being printed.

View file

@ -0,0 +1,8 @@
[package]
name = "wafl-parser"
description = "Walls of Flesh parser"
version = "0.1.0"
edition = "2021"
[dependencies]
unicode-ident = "1.0"

View file

@ -0,0 +1,140 @@
/// A single lexical token produced by [`Lexer`].
///
/// Borrows its dynamic payloads from the input text, so a token is only
/// valid for as long as the source string it was lexed from.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Token<'a> {
    // keywords (written `@name` in the source)
    CtrlFlowEdit,
    Final,
    If,
    Loop,
    Return,
    // single-char keywords
    Assign,
    OpenBrace,
    CloseBrace,
    Comma,
    SemiColon,
    // dynamic stuff
    /// identifier, possibly a `:`-separated path (e.g. `std:io:writeln`)
    Identifier(&'a str),
    /// integer literal
    /// NOTE(review): the lexer never produces this variant yet — confirm
    /// whether digit handling is planned.
    Integer(i64),
    /// text the lexer could not classify; carries the offending slice
    Unknown(&'a str),
}
/// Streaming lexer over a single source file.
///
/// Implemented as an iterator elsewhere in this module; `offset` tracks the
/// number of bytes of the original input already consumed so that each token
/// can be paired with a [`Location`].
#[derive(Clone, Copy, Debug)]
pub struct Lexer<'a> {
    /// identifier of the file being lexed (opaque to the lexer; echoed into locations)
    pub fileid: u32,
    /// byte offset of `s` relative to the start of the original input
    pub offset: u32,
    /// remaining, not-yet-consumed input
    pub s: &'a str,
}

/// Position of a token: file id plus byte offset into that file.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Location {
    pub fileid: u32,
    pub offset: u32,
}
impl<'a> Lexer<'a> {
    /// Current position: file id plus byte offset of the next unconsumed byte.
    pub fn loc(&self) -> Location {
        Location {
            fileid: self.fileid,
            offset: self.offset,
        }
    }

    /// Consume up to `n` bytes of input, advancing `offset` accordingly.
    ///
    /// Returns `true` if the full `n` bytes were available (and consumed);
    /// otherwise everything remaining is consumed and `false` is returned.
    pub fn eat(&mut self, n: usize) -> bool {
        let ssl = self.s.len();
        let ret = n <= ssl;
        let (l, r) = if n >= ssl {
            (ssl, "")
        } else {
            (n, &self.s[n..])
        };
        // offsets are tracked as u32; an input >= 4 GiB would overflow here
        let l2: u32 = l.try_into().unwrap();
        self.offset += l2;
        self.s = r;
        ret
    }

    /// use to determine the length of a token based upon e.g.
    /// `lex.eat(loc.offset); toklen = lex.first_token_length();`
    ///
    /// Note: the returned length includes any whitespace/comments skipped
    /// before the token, so position the lexer at the token start first.
    pub fn first_token_length(&mut self) -> Option<usize> {
        let start = self.s.len();
        self.next()?;
        let stop = self.s.len();
        // lexing only ever consumes input, so the remaining length shrinks;
        // BUGFIX: the comparison and subtraction were inverted, which made
        // `stop - start` underflow whenever a token was consumed.
        assert!(start >= stop);
        Some(start - stop)
    }

    /// Consume and return the longest prefix whose characters (after the
    /// first `skip` bytes, which are taken unconditionally) do NOT match `f`.
    ///
    /// The returned slice includes the skipped prefix, and `offset` advances
    /// by exactly the number of bytes consumed.
    fn select_text(&mut self, skip: usize, f: impl Fn(char) -> bool) -> &'a str {
        let ret = match self.s[skip..].find(f) {
            // no delimiter: the rest of the input is one token
            None => core::mem::replace(&mut self.s, ""),
            Some(nxt) => {
                // BUGFIX: split the full `self.s` (not the `skip`-trimmed
                // view) so the returned token keeps its prefix (e.g. the
                // `@` of a keyword, which callers index past via `[1..]`)
                // and `offset` accounts for every consumed byte.
                let (txt, rest) = self.s.split_at(skip + nxt);
                self.s = rest;
                txt
            }
        };
        let rl2: u32 = ret.len().try_into().unwrap();
        self.offset += rl2;
        ret
    }
}
impl<'a> Iterator for Lexer<'a> {
    type Item = (Location, Token<'a>);

    /// Produce the next token, skipping whitespace and `#`/`⍝` line comments.
    /// Returns `None` at end of input.
    fn next(&mut self) -> Option<(Location, Token<'a>)> {
        loop {
            let x = self.s.chars().next()?;
            let loc = self.loc();
            match x {
                // single-character tokens
                '{' | '}' | '=' | ',' | ';' => {
                    self.eat(x.len_utf8());
                    return Some((loc, match x {
                        '{' => Token::OpenBrace,
                        '}' => Token::CloseBrace,
                        '=' => Token::Assign,
                        ',' => Token::Comma,
                        ';' => Token::SemiColon,
                        _ => unreachable!(),
                    }));
                }
                '#' | '⍝' => {
                    // comment until next line
                    match self.s.find('\n') {
                        None => {
                            // comment runs to the end of the input
                            self.eat(self.s.len());
                            return None;
                        }
                        Some(y) => {
                            // BUGFIX: `y` is the byte index of '\n' within
                            // `self.s`, which already starts at the comment
                            // char, so `y + 1` eats comment plus newline.
                            // The previous `x.len_utf8() + y` over-consumed
                            // for the multi-byte '⍝' marker.
                            self.eat(y + 1);
                        }
                    }
                }
                '@' => {
                    // keyword: '@' followed by alphanumerics; `[1..]` strips
                    // the '@' marker before matching the keyword name
                    let ident = self.select_text(1, |i| !i.is_alphanumeric());
                    return Some((loc, match &ident[1..] {
                        "cfe" => Token::CtrlFlowEdit,
                        "final" => Token::Final,
                        "if" => Token::If,
                        "loop" => Token::Loop,
                        "return" => Token::Return,
                        // unrecognized keyword: hand back the raw text
                        _ => Token::Unknown(ident),
                    }));
                }
                _ if (unicode_ident::is_xid_start(x) || x == ':') => {
                    // identifier, possibly a ':'-separated path like std:io:writeln
                    let ident =
                        self.select_text(0, |i| !unicode_ident::is_xid_continue(i) && i != ':');
                    return Some((loc, Token::Identifier(ident)));
                }
                _ if x.is_whitespace() => {
                    self.eat(x.len_utf8());
                }
                _ => {
                    // anything else becomes a single-char Unknown token.
                    // NOTE(review): digits land here too, since Token::Integer
                    // is never produced — confirm whether that is intended.
                    let tok = &self.s[..x.len_utf8()];
                    self.eat(x.len_utf8());
                    return Some((loc, Token::Unknown(tok)));
                }
            }
        }
    }
}

View file

@ -0,0 +1,3 @@
#![no_std]
//! Walls of Flesh (wafl) parser crate; currently exposes only the lexer.

/// Lexer module: turns source text into a stream of `(Location, Token)` pairs.
pub mod lex;