yanais/ocaml/libsyntax/lex.ml
2024-02-11 14:24:29 +01:00

139 lines
4.4 KiB
OCaml

(* A lexer for yanais *)
open Tys
type lexer =
{ buf : Sedlexing.lexbuf
; file : string
}
let l_digit = [%sedlex.regexp? '0' .. '9']
let l_number = [%sedlex.regexp? Plus l_digit]
let l_ident = [%sedlex.regexp? xid_start, Star xid_continue]
let l_encap lx res offs = (res,
{ start = (Sedlexing.lexeme_start lx.buf) - offs
; length = Unsigned.UInt32.of_int (Sedlexing.lexeme_length lx.buf)
})
let l_raise lx xc =
let ((), sp) = l_encap lx () 0 in
Yanais_syntax_err (xc, lx.file, sp) |> raise
let l_unhandled lx = l_raise lx (EUnhandled (Sedlexing.Utf8.lexeme lx.buf))
(* this should be equal to Uunf_string.normalize_utf_8, but that function is unavailable with some builds *)
let normalize_utf_8 nf s =
let rec add buf normalizer v = match Uunf.add normalizer v with
| `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await
| `Await | `End -> ()
in
let rec loop buf s i max normalizer =
if i > max then (add buf normalizer `End; Buffer.contents buf) else
let dec = String.get_utf_8_uchar s i in
add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec));
loop buf s (i + Uchar.utf_decode_length dec) max normalizer
in
let buf = Buffer.create (String.length s * 3) in
let normalizer = Uunf.create nf in
loop buf s 0 (String.length s - 1) normalizer
let l_ow_ident lx =
let buf = lx.buf in
match%sedlex buf with
| ("" | l_ident | eof) -> Sedlexing.Utf8.lexeme buf
| _ -> failwith "Unexpected character (internal error)"
let handle_comments lx () =
let buf = lx.buf in
let rec inner lvl =
if lvl > 0 then
let nlvl = match%sedlex buf with
| "(*" -> lvl + 1
| "*)" -> lvl - 1
| eof -> l_raise lx (EUnexpectedEof XComment)
| _ -> lvl
in inner nlvl
else
match%sedlex buf with
| white_space -> inner 0
| "(*" -> inner 1
| _ -> ()
in inner 0
let rec handle_string lx s =
(* this gets invoked after ""... *)
let buf = lx.buf in
let tmp = match%sedlex buf with
| "\\b" -> Some "\b"
| "\\n" -> Some "\n"
| "\\r" -> Some "\r"
| "\\t" -> Some "\t"
| "\\\"" -> Some "\""
| "\"" -> None
| any -> Some (Sedlexing.Utf8.lexeme buf)
| eof -> failwith "EOF while scanning for end of string"
| _ -> failwith "Unexpected character (internal error while scanning string)"
in match tmp with
| Some x -> handle_string lx (x::s)
| None -> List.rev s |> String.concat "" (* note: no normalization here *)
let tokens lx () =
handle_comments lx ();
let buf = lx.buf in
match%sedlex buf with
| l_number ->
let res = LLiteral (Sedlexing.Utf8.lexeme buf |> fun x -> Literal.LNatural (int_of_string x)) in
Some (l_encap lx res 0)
| l_ident ->
let cur = Sedlexing.Utf8.lexeme buf |> normalize_utf_8 `NFC in
let res = match Literal.of_string cur with
| Some x -> LLiteral x
| None -> match cur with
| "data" -> Data
| "layout" -> Layout
| "let" -> Let
| "match" -> Match
| "\u{03BB}" -> Lambda
| "\u{039B}" -> TyLambda (* a large lambda *)
| "\u{03BC}" -> Mu
| _ -> Ident cur
in
Some (l_encap lx res 0)
| '$' ->
let res = l_ow_ident lx |> (fun x -> PatOut x) in
Some (l_encap lx res 1)
| '.' ->
let res = l_ow_ident lx |> (fun x -> DotIdent x) in
Some (l_encap lx res 1)
| '^' -> Some (l_encap lx Caret 0)
| ':' -> Some (l_encap lx DubColon 0)
| ';' -> Some (l_encap lx SemiColon 0)
| '=' -> Some (l_encap lx Assign 0)
| '(' -> Some (l_encap lx (Paren Open) 0)
| ')' -> Some (l_encap lx (Paren Close) 0)
| '{' -> Some (l_encap lx (Brace Open) 0)
| '}' -> Some (l_encap lx (Brace Close) 0)
| "\u{2190}" -> Some (l_encap lx LArr 0)
| "\u{2192}" -> Some (l_encap lx RArr 0)
| "\u{21D0}" -> Some (l_encap lx LDubArr 0)
| "\u{21D2}" -> Some (l_encap lx RDubArr 0)
| "\"" ->
let start = Sedlexing.lexeme_start lx.buf in
let s = handle_string lx [] in
let endp = Sedlexing.lexeme_end lx.buf in
Some (TString s, { start; length = Unsigned.UInt32.of_int (endp - start) })
| eof -> None
(* note: white space is already handled by handle_comments *)
| any -> l_unhandled lx
| _ -> failwith "Unexpected character"
let run file dat =
let lx =
{ buf = Sedlexing.Utf8.from_gen dat
; file = file
} in
Sedlexing.set_filename lx.buf file;
Gen.to_list (tokens lx)