add simple lexer

2023-05-19 00:46:12 +02:00 · 2023-05-19 00:46:12 +02:00 · 8287a96134
commit 8287a96134
parent 6554b0f679
7 changed files with 191 additions and 11 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
 .#*
 /target
 result
 result-*
 perf.data*
 flamegraph.svg
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,16 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "unicode-ident"
 version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
 [[package]]
 name = "wafl-parser"
 version = "0.1.0"
 dependencies = [
 "unicode-ident",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,7 @@
 [workspace]
 members = ["crates/*"]
 [profile.release]
 codegen-units = 1
 debug = 1
 lto = "thin"
--- a/README.md
+++ b/README.md
@ -9,7 +9,7 @@
 ## Syntax
 ```
-[cfe] [final] name ["{" args (separated via commas) "}"] = {
+[@cfe] [@final] name ["{" args (separated via commas) "}"] = {
  content;
  content;
  etc;
@ -19,8 +19,8 @@
 ## Attributes of Code Objects
- `final` prevents overwriting and basically marks a root
+- `@final` prevents overwriting and basically marks a root
- `cfe` allows an object to access the continuation and omits the default return
+- `@cfe` allows an object to access the continuation and omits the default return
 ## Early vs late binding
@ -34,9 +34,9 @@ as an argument, prefix the block with `'`.
 ## Control flow builtins
- `if cond ... then ... else ...`
+- `@if cond ... then ... else ...`
- `loop ...`
+- `@loop ...`
- `return ...` (implicit when last expr in a block is not terminated via semicolon)
+- `@return ...` (implicit when last expr in a block is not terminated via semicolon)
 ## Modules
@ -60,8 +60,8 @@ with package-lock files or such. Packages (packaged module trees) would then be
 ```
  # 1.
-  final main { args, env } = {
+  @final main { args, env } = {
-    std::io::writeln "Hello World!";
+    std:io:writeln "Hello World!";
    0
  };
@ -70,9 +70,9 @@ with package-lock files or such. Packages (packaged module trees) would then be
  business_in_the_front = party_in_the_back;
  what_now = {
-    self::business_in_the_front;
+    self:business_in_the_front;
-  } self::party_in_the_back {
+  }, self:party_in_the_back {
-    std::io::writeln "It works";
+    std:io:writeln "It works";
  };
  # running `what_now` results in "It works" being printed.
--- a/crates/wafl-parser/Cargo.toml
+++ b/crates/wafl-parser/Cargo.toml
@ -0,0 +1,8 @@
 [package]
 name = "wafl-parser"
 description = "Walls of Flesh parser"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
 unicode-ident = "1.0"
--- a/crates/wafl-parser/src/lex.rs
+++ b/crates/wafl-parser/src/lex.rs
@ -0,0 +1,140 @@
 /// A lexical token. String payloads borrow from the text being lexed.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Token<'a> {
    // keywords (written with a leading `@` in source text)
    /// The `@cfe` keyword.
    CtrlFlowEdit,
    /// The `@final` keyword.
    Final,
    /// The `@if` keyword.
    If,
    /// The `@loop` keyword.
    Loop,
    /// The `@return` keyword.
    Return,
    // single-char keywords
    /// `=`
    Assign,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `,`
    Comma,
    /// `;`
    SemiColon,
    // dynamic stuff
    /// An identifier; may contain `:` separators (e.g. `std:io:writeln`).
    Identifier(&'a str),
    /// An integer literal.
    // NOTE(review): the lexer in this file never produces this variant yet.
    Integer(i64),
    /// Text the lexer could not classify: an unrecognised `@` keyword or a
    /// single character that starts no known token.
    Unknown(&'a str),
 }
 /// Lexer state over a borrowed source string.
 ///
 /// Cloning is cheap (two integers and a string slice), which makes it easy
 /// to snapshot the lexer for look-ahead.
 #[derive(Debug, Clone)]
 pub struct Lexer<'a> {
    /// Identifier of the file being lexed; copied verbatim into every `Location`.
    pub fileid: u32,
    /// Byte offset of `s` relative to the start of the input; advanced as input is consumed.
    pub offset: u32,
    /// The input that has not been consumed yet.
    pub s: &'a str,
 }
 /// A position in lexed input: which file, and the byte offset within it.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Location {
    /// Identifier of the file, as given to the lexer.
    pub fileid: u32,
    /// Byte offset, taken from the lexer's `offset` at the time of capture.
    pub offset: u32,
 }
 impl<'a> Lexer<'a> {
    /// Returns the lexer's current position (file id plus byte offset).
    pub fn loc(&self) -> Location {
        Location {
            fileid: self.fileid,
            offset: self.offset,
        }
    }

    /// Consumes up to `n` bytes from the front of the remaining input and
    /// advances `offset` by the number of bytes actually consumed.
    ///
    /// Returns `true` if at least `n` bytes were available. If fewer were
    /// available, everything that is left is consumed and `false` is returned.
    ///
    /// # Panics
    /// Panics if `n` is less than the remaining length but does not fall on a
    /// UTF-8 character boundary, or if the consumed length does not fit in `u32`.
    pub fn eat(&mut self, n: usize) -> bool {
        let ssl = self.s.len();
        let l = n.min(ssl);
        // Slice first so that a boundary panic leaves the state untouched.
        let rest = &self.s[l..];
        let l2: u32 = l.try_into().unwrap();
        self.offset += l2;
        self.s = rest;
        n <= ssl
    }

    /// Consumes the next token and returns how many bytes of input were
    /// consumed to do so, or `None` if no further token exists.
    ///
    /// Intended for measuring the token at a known location, e.g.
    /// `lex.eat(loc.offset); toklen = lex.first_token_length();`.
    /// Anything skipped in front of the token (whitespace, comments) is
    /// included in the count, so position the lexer at the token start first.
    pub fn first_token_length(&mut self) -> Option<usize> {
        let start = self.s.len();
        self.next()?;
        let stop = self.s.len();
        // Lexing only ever shrinks the remaining input, so `start >= stop`.
        Some(start - stop)
    }

    /// Consumes and returns a run of text from the front of the input.
    ///
    /// The first `skip` bytes are taken unconditionally; after them, text is
    /// taken up to (not including) the first character for which `f` returns
    /// `true`, or to the end of input if there is none. The returned slice
    /// always includes the `skip` prefix, and `offset` advances by the length
    /// of the returned slice.
    ///
    /// # Panics
    /// Panics if `skip` is out of bounds or not on a UTF-8 character boundary,
    /// or if the consumed length does not fit in `u32`.
    fn select_text(&mut self, skip: usize, f: impl Fn(char) -> bool) -> &'a str {
        let src = self.s;
        // `find` reports positions relative to `src[skip..]`; rebase onto `src`
        // so the skipped prefix is part of the result in every case.
        let end = match src[skip..].find(f) {
            None => src.len(),
            Some(nxt) => skip + nxt,
        };
        let (txt, rest) = src.split_at(end);
        let len: u32 = txt.len().try_into().unwrap();
        self.offset += len;
        self.s = rest;
        txt
    }
 }
 impl<'a> Iterator for Lexer<'a> {
    /// Each item pairs the location where a token starts with the token itself.
    type Item = (Location, Token<'a>);

    /// Skips whitespace and line comments, then lexes and returns the next
    /// token together with the location of its first byte.
    ///
    /// Returns `None` once the input is exhausted (including when only
    /// whitespace or a trailing comment remains).
    fn next(&mut self) -> Option<(Location, Token<'a>)> {
        loop {
            let x = self.s.chars().next()?;
            let loc = self.loc();
            match x {
                '{' | '}' | '=' | ',' | ';' => {
                    self.eat(x.len_utf8());
                    let tok = match x {
                        '{' => Token::OpenBrace,
                        '}' => Token::CloseBrace,
                        '=' => Token::Assign,
                        ',' => Token::Comma,
                        ';' => Token::SemiColon,
                        _ => unreachable!(),
                    };
                    return Some((loc, tok));
                }
                '#' | '⍝' => {
                    // Comment until the end of the line. The position from
                    // `find` is measured from the start of `self.s`, so it
                    // already spans the comment marker; only the one-byte
                    // '\n' has to be added. Without a newline the comment
                    // runs to the end of input, and the next loop iteration
                    // returns `None`.
                    let len = match self.s.find('\n') {
                        None => self.s.len(),
                        Some(y) => y + 1,
                    };
                    self.eat(len);
                }
                '@' => {
                    // keyword
                    let ident = self.select_text(1, |i| !i.is_alphanumeric());
                    // Strip the marker by prefix rather than by index, so a
                    // selection without (or consisting only of) the '@'
                    // cannot panic or lose the first keyword letter.
                    let word = ident.strip_prefix('@').unwrap_or(ident);
                    let tok = match word {
                        "cfe" => Token::CtrlFlowEdit,
                        "final" => Token::Final,
                        "if" => Token::If,
                        "loop" => Token::Loop,
                        "return" => Token::Return,
                        _ => Token::Unknown(ident),
                    };
                    return Some((loc, tok));
                }
                _ if (unicode_ident::is_xid_start(x) || x == ':') => {
                    // Identifiers may contain ':' as a path separator.
                    let ident = self
                        .select_text(0, |i| !unicode_ident::is_xid_continue(i) && i != ':');
                    return Some((loc, Token::Identifier(ident)));
                }
                _ if x.is_whitespace() => {
                    self.eat(x.len_utf8());
                }
                _ => {
                    // Anything else is reported as a one-character unknown token.
                    let tok = &self.s[..x.len_utf8()];
                    self.eat(x.len_utf8());
                    return Some((loc, Token::Unknown(tok)));
                }
            }
        }
    }
 }
--- a/crates/wafl-parser/src/lib.rs
+++ b/crates/wafl-parser/src/lib.rs
@ -0,0 +1,3 @@
 #![no_std]
 pub mod lex;