From 8287a96134a735d2dba7d694ece1f080d932cb84 Mon Sep 17 00:00:00 2001 From: Alain Zscheile Date: Fri, 19 May 2023 00:46:12 +0200 Subject: [PATCH] add simple lexer --- .gitignore | 6 ++ Cargo.lock | 16 ++++ Cargo.toml | 7 ++ README.md | 22 +++--- crates/wafl-parser/Cargo.toml | 8 ++ crates/wafl-parser/src/lex.rs | 140 ++++++++++++++++++++++++++++++++++ crates/wafl-parser/src/lib.rs | 3 + 7 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 crates/wafl-parser/Cargo.toml create mode 100644 crates/wafl-parser/src/lex.rs create mode 100644 crates/wafl-parser/src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f1ca14a --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.#* +/target +result +result-* +perf.data* +flamegraph.svg diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..5280d83 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "unicode-ident" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" + +[[package]] +name = "wafl-parser" +version = "0.1.0" +dependencies = [ + "unicode-ident", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..6072de7 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[workspace] +members = ["crates/*"] + +[profile.release] +codegen-units = 1 +debug = 1 +lto = "thin" diff --git a/README.md b/README.md index 4ab9bc7..24e7144 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ ## Syntax ``` -[cfe] [final] name ["{" args (separated via commas) "}"] = { +[@cfe] [@final] name ["{" args (separated via commas) "}"] = { content; content; etc; @@ -19,8 +19,8 @@ ## Attributes of Code Objects -- `final` prevents overwriting and basically marks a root -- `cfe` allows an object to access the continuation and omits the default return +- `@final` prevents overwriting and basically marks a root +- `@cfe` allows an object to access the continuation and omits the default return ## Early vs late binding @@ -34,9 +34,9 @@ as an argument, prefix the block with `'`. ## Control flow builtins -- `if cond ... then ... else ...` -- `loop ...` -- `return ...` (implicit when last expr in a block is not terminated via semicolon) +- `@if cond ... then ... else ...` +- `@loop ...` +- `@return ...` (implicit when last expr in a block is not terminated via semicolon) ## Modules @@ -60,8 +60,8 @@ with package-lock files or such. Packages (packaged module trees) would then be ``` # 1. - final main { args, env } = { - std::io::writeln "Hello World!"; + @final main { args, env } = { + std:io:writeln "Hello World!"; 0 }; @@ -70,9 +70,9 @@ with package-lock files or such. Packages (packaged module trees) would then be business_in_the_front = party_in_the_back; what_now = { - self::business_in_the_front; - } self::party_in_the_back { - std::io::writeln "It works"; + self:business_in_the_front; + }, self:party_in_the_back { + std:io:writeln "It works"; }; # running `what_now` results in "It works" being printed. diff --git a/crates/wafl-parser/Cargo.toml b/crates/wafl-parser/Cargo.toml new file mode 100644 index 0000000..4cfbaca --- /dev/null +++ b/crates/wafl-parser/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "wafl-parser" +description = "Walls of Flesh parser" +version = "0.1.0" +edition = "2021" + +[dependencies] +unicode-ident = "1.0" diff --git a/crates/wafl-parser/src/lex.rs b/crates/wafl-parser/src/lex.rs new file mode 100644 index 0000000..fded4d0 --- /dev/null +++ b/crates/wafl-parser/src/lex.rs @@ -0,0 +1,140 @@ +pub enum Token<'a> { + // keywords + CtrlFlowEdit, + Final, + If, + Loop, + Return, + + // single-char keywords + Assign, + OpenBrace, + CloseBrace, + Comma, + SemiColon, + + // dynamic stuff + Identifier(&'a str), + Integer(i64), + Unknown(&'a str), +} + +pub struct Lexer<'a> { + pub fileid: u32, + pub offset: u32, + pub s: &'a str, +} + +pub struct Location { + pub fileid: u32, + pub offset: u32, +} + +impl<'a> Lexer<'a> { + pub fn loc(&self) -> Location { + Location { + fileid: self.fileid, + offset: self.offset, + } + } + + pub fn eat(&mut self, n: usize) -> bool { + let ssl = self.s.len(); + let ret = n <= ssl; + let (l, r) = if n >= ssl { + (ssl, "") + } else { + (n, &self.s[n..]) + }; + let l2: u32 = l.try_into().unwrap(); + self.offset += l2; + self.s = r; + ret + } + + /// use to determinate the length of a token based upon e.g. + /// `lex.eat(loc.offset); toklen = lex.first_token_length();` + pub fn first_token_length(&mut self) -> Option { + let start = self.s.len(); + self.next()?; + let stop = self.s.len(); + assert!(stop >= start); + Some(stop - start) + } + + fn select_text(&mut self, skip: usize, f: impl Fn(char) -> bool) -> &'a str { + let tmp = &self.s[skip..]; + let ret = match tmp.find(f) { + None => core::mem::replace(&mut self.s, ""), + Some(nxt) => { + let (txt, rest) = tmp.split_at(nxt); + self.s = rest; + txt + } + }; + let rl2: u32 = ret.len().try_into().unwrap(); + self.offset += rl2; + ret + } +} + +impl<'a> Iterator for Lexer<'a> { + // note: we don't need to + type Item = (Location, Token<'a>); + + fn next(&mut self) -> Option<(Location, Token<'a>)> { + loop { + let x = self.s.chars().next()?; + let loc = self.loc(); + match x { + '{' | '}' | '=' | ',' | ';' => { + self.eat(x.len_utf8()); + return Some((loc, match x { + '{' => Token::OpenBrace, + '}' => Token::CloseBrace, + '=' => Token::Assign, + ',' => Token::Comma, + ';' => Token::SemiColon, + _ => unreachable!(), + })); + } + '#' | '⍝' => { + // comment until next line + match self.s.find('\n') { + None => { + self.eat(self.s.len()); + return None; + } + Some(y) => { + self.eat(x.len_utf8() + y); + } + } + } + '@' => { + // keyword + let ident = self.select_text(1, |i| !i.is_alphanumeric()); + return Some((loc, match &ident[1..] { + "cfe" => Token::CtrlFlowEdit, + "final" => Token::Final, + "if" => Token::If, + "loop" => Token::Loop, + "return" => Token::Return, + _ => Token::Unknown(ident), + })); + } + _ if (unicode_ident::is_xid_start(x) || x == ':') => { + let ident = self.select_text(0, |i| !unicode_ident::is_xid_continue(i) && i != ':'); + return Some((loc, Token::Identifier(ident))); + } + _ if x.is_whitespace() => { + self.eat(x.len_utf8()); + } + _ => { + let tok = &self.s[..x.len_utf8()]; + self.eat(x.len_utf8()); + return Some((loc, Token::Unknown(tok))); + } + } + } + } +} diff --git a/crates/wafl-parser/src/lib.rs b/crates/wafl-parser/src/lib.rs new file mode 100644 index 0000000..252be2e --- /dev/null +++ b/crates/wafl-parser/src/lib.rs @@ -0,0 +1,3 @@ +#![no_std] + +pub mod lex;