139 lines
4.4 KiB
OCaml
139 lines
4.4 KiB
OCaml
(* A lexer for yanais *)
|
|
|
|
open Tys
|
|
|
|
type lexer =
|
|
{ buf : Sedlexing.lexbuf
|
|
; file : string
|
|
}
|
|
|
|
let l_digit = [%sedlex.regexp? '0' .. '9']
|
|
let l_number = [%sedlex.regexp? Plus l_digit]
|
|
let l_ident = [%sedlex.regexp? xid_start, Star xid_continue]
|
|
|
|
let l_encap lx res offs = (res,
|
|
{ start = (Sedlexing.lexeme_start lx.buf) - offs
|
|
; length = Unsigned.UInt32.of_int (Sedlexing.lexeme_length lx.buf)
|
|
})
|
|
|
|
let l_raise lx xc =
|
|
let ((), sp) = l_encap lx () 0 in
|
|
Yanais_syntax_err (xc, lx.file, sp) |> raise
|
|
let l_unhandled lx = l_raise lx (EUnhandled (Sedlexing.Utf8.lexeme lx.buf))
|
|
|
|
(* this should be equal to Uunf_string.normalize_utf_8, but that function is unavailable with some builds *)
|
|
let normalize_utf_8 nf s =
|
|
let rec add buf normalizer v = match Uunf.add normalizer v with
|
|
| `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await
|
|
| `Await | `End -> ()
|
|
in
|
|
let rec loop buf s i max normalizer =
|
|
if i > max then (add buf normalizer `End; Buffer.contents buf) else
|
|
let dec = String.get_utf_8_uchar s i in
|
|
add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec));
|
|
loop buf s (i + Uchar.utf_decode_length dec) max normalizer
|
|
in
|
|
let buf = Buffer.create (String.length s * 3) in
|
|
let normalizer = Uunf.create nf in
|
|
loop buf s 0 (String.length s - 1) normalizer
|
|
|
|
let l_ow_ident lx =
|
|
let buf = lx.buf in
|
|
match%sedlex buf with
|
|
| ("" | l_ident | eof) -> Sedlexing.Utf8.lexeme buf
|
|
| _ -> failwith "Unexpected character (internal error)"
|
|
|
|
let handle_comments lx () =
|
|
let buf = lx.buf in
|
|
let rec inner lvl =
|
|
if lvl > 0 then
|
|
let nlvl = match%sedlex buf with
|
|
| "(*" -> lvl + 1
|
|
| "*)" -> lvl - 1
|
|
| eof -> l_raise lx (EUnexpectedEof XComment)
|
|
| _ -> lvl
|
|
in inner nlvl
|
|
else
|
|
match%sedlex buf with
|
|
| white_space -> inner 0
|
|
| "(*" -> inner 1
|
|
| _ -> ()
|
|
in inner 0
|
|
|
|
let rec handle_string lx s =
|
|
(* this gets invoked after ""... *)
|
|
let buf = lx.buf in
|
|
let tmp = match%sedlex buf with
|
|
| "\\b" -> Some "\b"
|
|
| "\\n" -> Some "\n"
|
|
| "\\r" -> Some "\r"
|
|
| "\\t" -> Some "\t"
|
|
| "\\\"" -> Some "\""
|
|
| "\"" -> None
|
|
| any -> Some (Sedlexing.Utf8.lexeme buf)
|
|
| eof -> failwith "EOF while scanning for end of string"
|
|
| _ -> failwith "Unexpected character (internal error while scanning string)"
|
|
in match tmp with
|
|
| Some x -> handle_string lx (x::s)
|
|
| None -> List.rev s |> String.concat "" (* note: no normalization here *)
|
|
|
|
let tokens lx () =
|
|
handle_comments lx ();
|
|
let buf = lx.buf in
|
|
match%sedlex buf with
|
|
| l_number ->
|
|
let res = LLiteral (Sedlexing.Utf8.lexeme buf |> fun x -> Literal.LNatural (int_of_string x)) in
|
|
Some (l_encap lx res 0)
|
|
| l_ident ->
|
|
let cur = Sedlexing.Utf8.lexeme buf |> normalize_utf_8 `NFC in
|
|
let res = match Literal.of_string cur with
|
|
| Some x -> LLiteral x
|
|
| None -> match cur with
|
|
| "data" -> Data
|
|
| "layout" -> Layout
|
|
| "let" -> Let
|
|
| "match" -> Match
|
|
| "\u{03BB}" -> Lambda
|
|
| "\u{039B}" -> TyLambda (* a large lambda *)
|
|
| "\u{03BC}" -> Mu
|
|
| _ -> Ident cur
|
|
in
|
|
Some (l_encap lx res 0)
|
|
| '$' ->
|
|
let res = l_ow_ident lx |> (fun x -> PatOut x) in
|
|
Some (l_encap lx res 1)
|
|
| '.' ->
|
|
let res = l_ow_ident lx |> (fun x -> DotIdent x) in
|
|
Some (l_encap lx res 1)
|
|
|
|
| '^' -> Some (l_encap lx Caret 0)
|
|
| ':' -> Some (l_encap lx DubColon 0)
|
|
| ';' -> Some (l_encap lx SemiColon 0)
|
|
| '=' -> Some (l_encap lx Assign 0)
|
|
| '(' -> Some (l_encap lx (Paren Open) 0)
|
|
| ')' -> Some (l_encap lx (Paren Close) 0)
|
|
| '{' -> Some (l_encap lx (Brace Open) 0)
|
|
| '}' -> Some (l_encap lx (Brace Close) 0)
|
|
| "\u{2190}" -> Some (l_encap lx LArr 0)
|
|
| "\u{2192}" -> Some (l_encap lx RArr 0)
|
|
| "\u{21D0}" -> Some (l_encap lx LDubArr 0)
|
|
| "\u{21D2}" -> Some (l_encap lx RDubArr 0)
|
|
|
|
| "\"" ->
|
|
let start = Sedlexing.lexeme_start lx.buf in
|
|
let s = handle_string lx [] in
|
|
let endp = Sedlexing.lexeme_end lx.buf in
|
|
Some (TString s, { start; length = Unsigned.UInt32.of_int (endp - start) })
|
|
| eof -> None
|
|
(* note: white space is already handled by handle_comments *)
|
|
| any -> l_unhandled lx
|
|
| _ -> failwith "Unexpected character"
|
|
|
|
let run file dat =
|
|
let lx =
|
|
{ buf = Sedlexing.Utf8.from_gen dat
|
|
; file = file
|
|
} in
|
|
Sedlexing.set_filename lx.buf file;
|
|
Gen.to_list (tokens lx)
|