From f6efa5bb7490b40518ae1ec3eab6d4cdad16e7fe Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Thu, 28 Dec 2023 02:35:17 +0000 Subject: [PATCH 01/39] Add dependencies: `lazy_static`, `regex`. --- Cargo.lock | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 ++ 2 files changed, 56 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 73ab2d1..2eb5cd3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,60 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "avjason" version = "0.1.0" +dependencies = [ + "lazy_static", + "regex", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" diff --git 
a/Cargo.toml b/Cargo.toml index 1fd0a5c..2acc9e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +lazy_static = "1.4.0" +regex = "1.10.2" From 1fb849641f3f981dce777868b1366b81c951bed5 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 30 Dec 2023 02:26:51 +0000 Subject: [PATCH 02/39] Write utilities for parsing. --- src/lib.rs | 5 +- src/utils/mod.rs | 110 +++++++++++++++++++++++++++++++ src/utils/span.rs | 165 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 src/utils/mod.rs create mode 100644 src/utils/span.rs diff --git a/src/lib.rs b/src/lib.rs index aee46cb..eb765c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,4 +3,7 @@ //! > A child of the [AvdanOS](https://github.com/Avdan-OS) project. //! //! A parser for [JSON5](https://json5.org/). -//! \ No newline at end of file +//! +#![feature(iter_map_windows)] + +pub mod utils; \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..b3b6b87 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1,110 @@ +//! +//! Utilities. +//! + +pub mod span; +use std::{ + fs, io, + ops::RangeBounds, + path::{Path, PathBuf}, +}; + +pub use span::*; + +#[derive(Debug)] +pub struct SourceFile { + path: PathBuf, + contents: String, + line_starts: Vec, +} + +impl SourceFile { + /// + /// Splits lines by ECMA-abiding line endings. 
+ /// + fn split_lines(src: &str) -> impl Iterator + '_ { + src.chars() + .enumerate() + .map_windows(|[(a_i, a), (b_i, b)]| { + // Implementing https://262.ecma-international.org/5.1/#sec-7.3 + Some(match (*a, *b) { + ('\n', _) => a_i + 1, + ('\r', '\n') => b_i + 1, + ('\r', _) => a_i + 1, + ('\u{2028}', _) => a_i + 1, + ('\u{2029}', _) => a_i + 1, + _ => return None, + }) + }) + .flatten() + .chain(std::iter::once(src.len())) + } + + /// + /// Returns a string representing a [Loc] in ${FILE}:${LINE}:${COLUMN} format. + /// + pub fn file_line_column(&self, loc: &Loc) -> Option { + let Some((ln, col)) = self + .line_starts + .iter() + .enumerate() + .find(|(_, i)| loc.index < **i) + .map(|(ln, len)| (ln, len - loc.index)) + else { + return None; + }; + + Some(format!("{}:{ln}:{col}", &self.path.to_str()?)) + } + + /// + /// Returns the original source code at a particular [Span]. + /// + pub fn source_at(&self, span: impl RangeBounds) -> Option<&str> { + let span = S::try_into_span(span)?; + if span.end.index > self.contents.len() { + return None; + } + + Some(&self.contents[span.start.index..span.end.index]) + } + + #[cfg(test)] + pub(crate) fn dummy_file(path: impl AsRef, contents: impl ToString) -> Self { + let contents = contents.to_string(); + let line_lengths = Self::split_lines(&contents).collect(); + Self { + path: path.as_ref().to_owned(), + contents, + line_starts: line_lengths, + } + } + + /// + /// Attempts to read a [SourceFile] from a file. 
+ /// + pub fn load_file(path: impl AsRef) -> io::Result { + let path = path.as_ref(); + let contents = fs::read_to_string(path)?; + let line_starts = Self::split_lines(&contents).collect(); + + Ok(Self { + path: path.to_owned(), + contents, + line_starts, + }) + } +} + +#[cfg(test)] +mod tests { + use super::SourceFile; + + #[test] + fn source_file() { + let src = SourceFile::dummy_file("example.txt", "I am a\ngood file!\n\nGimme a pet!"); + println!("{src:?}"); + + println!("{:?}", src.source_at(7..11)) + } +} diff --git a/src/utils/span.rs b/src/utils/span.rs new file mode 100644 index 0000000..9630ba4 --- /dev/null +++ b/src/utils/span.rs @@ -0,0 +1,165 @@ +//! +//! Helpers for finding the locations of things. +//! + +use std::ops::{Add, RangeBounds}; + +/// +/// Represents a character's location in source code. +/// +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Loc { + pub(crate) index: usize, +} + +impl Add for Loc +where + Rhs: Copy, + usize: Add, +{ + type Output = Loc; + + fn add(self, rhs: Rhs) -> Self::Output { + Self { + index: self.index + rhs, + } + } +} + +/// +/// Represents a token's position in the code. +/// +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Span { + /// + /// Lower bound. + /// + pub(crate) start: Loc, + + /// + /// Exclusive upper bound. + /// + pub(crate) end: Loc, +} + +impl Span { + /// + /// Returns the length of this span in characters. + /// + pub fn len(&self) -> usize { + self.end.index - self.start.index + } + + /// + /// Returns whether this [Span] contains nothing. + /// + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// + /// Allows you to find a smaller [Span] within this span. + /// + /// ```ignore + /// // Find location of the word "pumpkin" + /// let pumpkin: Span = find_word("pumpkin"); + /// + /// // Gets the Span corresponding to the "pump" substring of "pumpkin". 
+ /// let pump = pumpkin.subspan(..4); + /// ``` + /// + pub fn subspan(&self, bounds: R) -> Option + where + R: RangeBounds, + { + let start = match bounds.start_bound() { + std::ops::Bound::Included(i) => *i, + std::ops::Bound::Excluded(_) => unimplemented!("Excluded lower bounds: impossible."), + std::ops::Bound::Unbounded => 0, + }; + + let end = match bounds.end_bound() { + std::ops::Bound::Included(i) => *i + 1, + std::ops::Bound::Excluded(i) => *i, + std::ops::Bound::Unbounded => self.len(), + }; + + if start > 0 || start > end { + return None; + } + + if end > self.len() { + return None; + } + + Some(Span { + start: Loc { index: start }, + end: Loc { index: end }, + }) + } +} + +/// +/// Convenience converter trait. +/// +/// ### Examples +/// ``` +/// use avjason::utils::TryIntoSpan; +/// +/// fn test(span: S) { +/// let span = span.into_span(); +/// // TODO: Do stuff with `s`... +/// } +/// ``` +/// +pub trait TryIntoSpan { + fn try_into_span(range: impl RangeBounds) -> Option; +} + +impl TryIntoSpan for Loc { + fn try_into_span(range: impl RangeBounds) -> Option { + let start = range.start_bound(); + let end = range.end_bound(); + + let start = match start { + std::ops::Bound::Included(Loc { index: i }) => *i, + std::ops::Bound::Excluded(_) => unimplemented!("Not possible: excluded lower bound."), + std::ops::Bound::Unbounded => 0, + }; + + let end = match end { + std::ops::Bound::Included(Loc { index: i }) => *i + 1, + std::ops::Bound::Excluded(Loc { index: i }) => *i, + std::ops::Bound::Unbounded => return None, + }; + + Some(Span { + start: Loc { index: start }, + end: Loc { index: end }, + }) + } +} + +impl TryIntoSpan for usize { + fn try_into_span(range: impl RangeBounds) -> Option { + let start = range.start_bound(); + let end = range.end_bound(); + + let start = match start { + std::ops::Bound::Included(i) => *i, + std::ops::Bound::Excluded(_) => unimplemented!("Not possible: excluded lower bound."), + std::ops::Bound::Unbounded => 0, + }; + + 
let end = match end { + std::ops::Bound::Included(i) => *i + 1, + std::ops::Bound::Excluded(i) => *i, + std::ops::Bound::Unbounded => return None, + }; + + Some(Span { + start: Loc { index: start }, + end: Loc { index: end }, + }) + } +} From 5dd734478b6f8b19de72106831499afe9df12772 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 30 Dec 2023 02:51:46 +0000 Subject: [PATCH 03/39] Add dependency: `unicode_categories` --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + 2 files changed, 8 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 2eb5cd3..40f99fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,7 @@ version = "0.1.0" dependencies = [ "lazy_static", "regex", + "unicode_categories", ] [[package]] @@ -59,3 +60,9 @@ name = "regex-syntax" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" diff --git a/Cargo.toml b/Cargo.toml index 2acc9e0..5ef3d1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,4 @@ edition = "2021" [dependencies] lazy_static = "1.4.0" regex = "1.10.2" +unicode_categories = "0.1.1" From 292242c1456ce9b2ced5571a93f9b26014f173ce Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 31 Dec 2023 16:10:01 +0000 Subject: [PATCH 04/39] =?UTF-8?q?=F0=9F=A7=B9=20Add=20`single=5Fchar`=20me?= =?UTF-8?q?thod=20+=20Clean=20up.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/utils/mod.rs | 8 ++++---- src/utils/span.rs | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index b3b6b87..7a317bb 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -21,7 +21,7 @@ pub struct SourceFile { impl SourceFile { /// /// Splits lines by 
ECMA-abiding line endings. - /// + /// fn split_lines(src: &str) -> impl Iterator + '_ { src.chars() .enumerate() @@ -42,7 +42,7 @@ impl SourceFile { /// /// Returns a string representing a [Loc] in ${FILE}:${LINE}:${COLUMN} format. - /// + /// pub fn file_line_column(&self, loc: &Loc) -> Option { let Some((ln, col)) = self .line_starts @@ -59,7 +59,7 @@ impl SourceFile { /// /// Returns the original source code at a particular [Span]. - /// + /// pub fn source_at(&self, span: impl RangeBounds) -> Option<&str> { let span = S::try_into_span(span)?; if span.end.index > self.contents.len() { @@ -82,7 +82,7 @@ impl SourceFile { /// /// Attempts to read a [SourceFile] from a file. - /// + /// pub fn load_file(path: impl AsRef) -> io::Result { let path = path.as_ref(); let contents = fs::read_to_string(path)?; diff --git a/src/utils/span.rs b/src/utils/span.rs index 9630ba4..c0bc8ec 100644 --- a/src/utils/span.rs +++ b/src/utils/span.rs @@ -97,6 +97,13 @@ impl Span { end: Loc { index: end }, }) } + + pub fn single_char(loc: Loc) -> Span { + Self { + start: loc, + end: loc + 1, + } + } } /// From c5ffee94bbbb20fecefc4144bc581d6f65c4da05 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Mon, 1 Jan 2024 20:19:28 +0000 Subject: [PATCH 05/39] Add macros utility subcrate --- Cargo.lock | 5 +++++ Cargo.toml | 4 ++++ macros/Cargo.toml | 11 +++++++++++ macros/src/lib.rs | 0 4 files changed, 20 insertions(+) create mode 100644 macros/Cargo.toml create mode 100644 macros/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 40f99fc..07d7288 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,11 +15,16 @@ dependencies = [ name = "avjason" version = "0.1.0" dependencies = [ + "avjason-macros", "lazy_static", "regex", "unicode_categories", ] +[[package]] +name = "avjason-macros" +version = "0.1.0" + [[package]] name = "lazy_static" version = "1.4.0" diff --git a/Cargo.toml b/Cargo.toml index 5ef3d1d..1a2b6ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,4 @@ +workspace = { members = 
["macros"] } [package] name = "avjason" version = "0.1.0" @@ -9,3 +10,6 @@ edition = "2021" lazy_static = "1.4.0" regex = "1.10.2" unicode_categories = "0.1.1" + +[dependencies.avjason-macros] +path = "./macros" \ No newline at end of file diff --git a/macros/Cargo.toml b/macros/Cargo.toml new file mode 100644 index 0000000..08e55e8 --- /dev/null +++ b/macros/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "avjason-macros" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +proc-macro=true + +[dependencies] diff --git a/macros/src/lib.rs b/macros/src/lib.rs new file mode 100644 index 0000000..e69de29 From 7ccb807d81cc0c7fc8353bfe26526b9771c6db3f Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Tue, 2 Jan 2024 00:35:34 +0000 Subject: [PATCH 06/39] Substitute `unicode_categories` for `finl_unicode` --- Cargo.lock | 47 +++++++++++++++++++++++++++++++++++++++++++---- Cargo.toml | 4 ++-- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 07d7288..dc4f50f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -16,14 +16,24 @@ name = "avjason" version = "0.1.0" dependencies = [ "avjason-macros", + "finl_unicode", "lazy_static", "regex", - "unicode_categories", ] [[package]] name = "avjason-macros" version = "0.1.0" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" [[package]] name = "lazy_static" @@ -37,6 +47,24 @@ version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +[[package]] +name = "proc-macro2" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2dd5e8a1f1029c43224ad5898e50140c2aebb1705f19e67c918ebf5b9e797fe1" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22a37c9326af5ed140c86a46655b5278de879853be5573c01df185b6f49a580a" +dependencies = [ + "proc-macro2", +] + [[package]] name = "regex" version = "1.10.2" @@ -67,7 +95,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] -name = "unicode_categories" -version = "0.1.1" +name = "syn" +version = "2.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eae3c679c56dc214320b67a1bc04ef3dfbd6411f6443974b5e4893231298e66" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" diff --git a/Cargo.toml b/Cargo.toml index 1a2b6ff..9e28810 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,9 +7,9 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +finl_unicode = "1.2.0" lazy_static = "1.4.0" regex = "1.10.2" -unicode_categories = "0.1.1" [dependencies.avjason-macros] -path = "./macros" \ No newline at end of file +path = "./macros" From 9ffed5f9875a3bb8ede79a3a18fbd9a3fcc543ee Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Tue, 2 Jan 2024 21:55:20 +0000 Subject: [PATCH 07/39] Add helper macros to macro subcrate. 
--- macros/Cargo.toml | 2 + macros/src/lib.rs | 217 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) diff --git a/macros/Cargo.toml b/macros/Cargo.toml index 08e55e8..679a89f 100644 --- a/macros/Cargo.toml +++ b/macros/Cargo.toml @@ -9,3 +9,5 @@ edition = "2021" proc-macro=true [dependencies] +quote = "1.0.34" +syn = { version = "2.0.45", features = ["full"] } diff --git a/macros/src/lib.rs b/macros/src/lib.rs index e69de29..d892420 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -0,0 +1,217 @@ +#![feature(proc_macro_diagnostic, iter_intersperse)] +use proc_macro::{Diagnostic, Level, Span, TokenStream}; +use quote::quote; +use syn::spanned::Spanned; + +#[proc_macro_derive(Spanned)] +pub fn derive(input: TokenStream) -> TokenStream { + if let Ok(en) = syn::parse::(input.clone()) { + let ident = en.ident.clone(); + let passed = en + .variants + .iter() + .map(|var| { + let ident = var.ident.clone(); + let syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed: _, .. }) = var.fields else { + return Err(var.span()); + }; + + Ok(ident) + }) + .collect::>(); + + if passed.iter().any(Result::is_err) { + let errors = passed.into_iter().filter_map(Result::err); + + errors.for_each(|s| { + Diagnostic::spanned(s.unwrap(), Level::Error, "Need tuple-like struct here.").emit() + }); + + return syn::Error::new( + Span::call_site().into(), + "Expected enum with tuple variants.", + ) + .into_compile_error() + .into(); + } + + let vars = passed.into_iter().filter_map(Result::ok).map(|var| { + quote! { + #ident::#var(ref s) => crate::utils::Spanned::span(s) + } + }); + + return quote! { + impl crate::utils::Spanned for #ident { + fn span(&self) -> crate::utils::Span { + match self { + #(#vars),* + } + } + } + } + .into(); + }; + + if let Ok(st) = syn::parse::(input) { + let ident = st.ident.clone(); + match st.fields { + syn::Fields::Named(syn::FieldsNamed { named: f, .. }) => { + let pass = f.iter().any(|syn::Field { ident, .. 
}| { + ident.as_ref().map(|ident| ident == "span").unwrap_or(false) + }); + + if !pass { + return syn::Error::new( + f.span(), + "Cannot derive Spanned for named struct without `span` field.", + ) + .into_compile_error() + .into(); + } + + return quote! { + impl crate::utils::Spanned for #ident { + fn span(&self) -> Span { + self.span + } + } + } + .into(); + } + syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed: f, .. }) => { + if f.is_empty() { + return syn::Error::new( + f.span(), + "Cannot derive Spanned for empty tuple struct.", + ) + .into_compile_error() + .into(); + } + + return quote! { + impl crate::utils::Spanned for #ident { + fn span(&self) -> Span { + self.0 + } + } + } + .into(); + } + syn::Fields::Unit => { + return syn::Error::new(st.span(), "Cannot derive Spanned for unit struct.") + .into_compile_error() + .into(); + } + } + } + + syn::Error::new(Span::call_site().into(), "Expected either enum or struct.") + .into_compile_error() + .into() +} + +#[proc_macro_attribute] +#[allow(non_snake_case)] +pub fn Lex(args: TokenStream, input: TokenStream) -> TokenStream { + let st = syn::parse::(input.clone()); + let en = syn::parse::(input); + + match (st, en) { + (Ok(st), Err(_)) => { + let ident = &st.ident; + let ch: syn::LitChar = match syn::parse(args) { + Ok(ch) => ch, + Err(err) => { + return err.into_compile_error().into(); + } + }; + quote! { + #st + + impl Lex for #ident { + fn lex(input: &mut crate::utils::SourceIter) -> Option { + if input.peek() == Some(&#ch) { + // Unwrap okay, because otherwise .peek returns None. + let (l, _) = input.next().unwrap(); + return Some(Self(crate::utils::Span::single_char(l))); + } + + None + } + + fn peek(input: &crate::utils::SourceIter) -> bool { + input.peek() == Some(&#ch) + } + } + } + .into() + } + (Err(_), Ok(en)) => { + let ident = &en.ident; + + let vars = en + .variants + .iter() + .map(|syn::Variant { ident, fields, .. 
}| match fields { + syn::Fields::Named(_) => None, + syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed: f, .. }) => { + if f.is_empty() { + return None; + } + let f = f.iter().next().unwrap(); + Some((ident.clone(), f.ty.clone())) + } + syn::Fields::Unit => None, + }) + .collect::>(); + + if vars.iter().any(Option::is_none) { + return syn::Error::new_spanned( + en, + "Cannot auto-impl Lex on enum that is not only single-tuple variants.", + ) + .into_compile_error() + .into(); + } + + let (vars, peeks): (Vec<_>, Vec<_>) = vars + .into_iter() + .flatten() + .map(|(v, ty)| { + ( + quote! { + if let Some(s) = #ty::lex(input).into_lex_result()? { + return Ok(Some(Self::#v(s))); + } + }, + quote! { + #ty::peek(input) + }, + ) + }) + .unzip(); + + let peeks = peeks.into_iter().intersperse(quote! {||}); + + quote! { + #en + + impl Lex for #ident { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + #(#vars)* + + Ok(None) + } + + fn peek(input: &SourceIter) -> bool { + #(#peeks)* + } + } + + } + .into() + } + _ => unimplemented!("Mutually exlusive parsing."), + } +} From dafd68db58eedd446f59ae581daca07bf160d827 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Tue, 2 Jan 2024 21:56:03 +0000 Subject: [PATCH 08/39] Implement lexer. --- src/lex/escape.rs | 168 +++++++ src/lex/mod.rs | 70 +++ src/lex/number.rs | 1098 ++++++++++++++++++++++++++++++++++++++++++++ src/lex/strings.rs | 190 ++++++++ src/lex/tokens.rs | 433 +++++++++++++++++ src/lib.rs | 5 +- src/utils/mod.rs | 241 +++++++++- src/utils/span.rs | 18 + 8 files changed, 2216 insertions(+), 7 deletions(-) create mode 100644 src/lex/escape.rs create mode 100644 src/lex/mod.rs create mode 100644 src/lex/number.rs create mode 100644 src/lex/strings.rs create mode 100644 src/lex/tokens.rs diff --git a/src/lex/escape.rs b/src/lex/escape.rs new file mode 100644 index 0000000..73242cc --- /dev/null +++ b/src/lex/escape.rs @@ -0,0 +1,168 @@ +//! +//! Escape sequences. +//! 
+ +use avjason_macros::{Lex, Spanned}; + +use crate::utils::{SourceIter, Span, TryIntoSpan}; +use crate::lex::IntoLexResult; + +use super::tokens::{Lex, LineTerminator}; + +#[inline] +pub fn is_hex_digit(ch: &char) -> bool { + ch.is_ascii_hexdigit() +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum EscapeSequence { + Unicode(UnicodeEscapeSequence), + Hex(HexEscapeSequence), + Null(NullEscapeSequence), + Character(CharacterEscapeSequence), +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum CharacterEscapeSequence { + Single(SingleEscapeCharacter), + NonEscape(NonEscapeCharacter), +} + +#[derive(Debug, Spanned)] +pub struct SingleEscapeCharacter(Span); + +impl Lex for SingleEscapeCharacter { + fn lex(input: &mut SourceIter) -> Option { + if !Self::peek(input) { + return None; + } + + let loc = input.next()?.0; + Some(Self(Span::single_char(loc))) + } + + fn peek(input: &SourceIter) -> bool { + matches!( + input.peek(), + Some(&'\'' | &'"' | &'\\' | &'b' | &'f' | &'n' | &'r' | &'t' | &'v') + ) + } +} + +#[derive(Debug, Spanned)] +pub struct NonEscapeCharacter(Span); + +struct EscapeCharacter; + +impl Lex for EscapeCharacter { + fn lex(_: &mut SourceIter) -> Option { + unimplemented!() + } + + fn peek(input: &SourceIter) -> bool { + let Some(ch) = input.peek() else { + return false; + }; + + SingleEscapeCharacter::peek(input) + || ch.is_ascii_digit() // DecimalDigit + || ch == &'x' + || ch == &'u' + } +} + +impl Lex for NonEscapeCharacter { + fn lex(input: &mut SourceIter) -> Option { + if !Self::peek(input) { + return None; + } + + let loc = input.next()?.0; + Some(Self(Span::single_char(loc))) + } + + fn peek(input: &SourceIter) -> bool { + !(EscapeCharacter::peek(input) || LineTerminator::peek(input)) + } +} + +#[derive(Debug, Spanned)] +pub struct NullEscapeSequence(Span); + +impl Lex for NullEscapeSequence { + fn lex(input: &mut SourceIter) -> Option { + if !Self::peek(input) { + return None; + } + + let loc = input.next()?.0; + Some(Self(Span::single_char(loc))) + 
} + + fn peek(input: &SourceIter) -> bool { + // with lookahead: not DecimalDigit. + input.peek() == Some(&'0') && !input.peek2().map(char::is_ascii_digit).unwrap_or(false) + } +} + +#[derive(Debug, Spanned)] +pub struct HexEscapeSequence(Span); + +impl Lex for HexEscapeSequence { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let start = input.next().unwrap().0; + + let mut end = start; + + for _ in 0..2 { + if input.peek().map(is_hex_digit).unwrap_or(false) { + end = input.next().unwrap().0; + } else { + return input.error() + .expected(Some(-1..1), ""); + } + } + + Ok(Some(Self(TryIntoSpan::try_into_span(start..=end).unwrap()))) + } + + fn peek(input: &SourceIter) -> bool { + input.peek() == Some(&'x') && input.relative_match(1..=2, is_hex_digit) + } +} + +#[derive(Debug, Spanned)] +pub struct UnicodeEscapeSequence(Span); + +impl Lex for UnicodeEscapeSequence { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let start = input.next().unwrap().0; + + let mut end = start; + + for _ in 0..4 { + if is_hex_digit(input.peek().unwrap()) { + end = input.next().unwrap().0; + } else { + return input.error() + .expected(Some(-1..), "") + } + } + + Ok(Some(Self(TryIntoSpan::try_into_span(start..=end).unwrap()))) + } + + fn peek(input: &SourceIter) -> bool { + input.peek() == Some(&'u') && input.relative_match(1..=4, is_hex_digit) + } +} diff --git a/src/lex/mod.rs b/src/lex/mod.rs new file mode 100644 index 0000000..441a84a --- /dev/null +++ b/src/lex/mod.rs @@ -0,0 +1,70 @@ +//! +//! Lexxing utilities. +//! 
+ +use std::ops::RangeBounds; + +use crate::utils::{Span, TryIntoSpan}; + +pub mod escape; +pub mod strings; +pub mod tokens; +pub mod number; + +#[derive(Debug)] +pub struct LexError { + span: Span, + message: String, + text: Option, +} + +impl LexError { + pub(crate) fn new>( + span: B, + message: impl ToString, + text: impl Into>, + ) -> Self { + let span = TryIntoSpan::try_into_span(span).unwrap(); + let message = message.to_string(); + let text = text.into(); + + Self { + span, + message, + text, + } + } +} + +/// +/// Utility for Lexer erorrs, +/// +pub type LexResult = Result, LexError>; + +pub trait IntoLexResult: Sized { + fn into_lex_result(self) -> LexResult; +} + +default impl IntoLexResult for T { + fn into_lex_result(self) -> LexResult { + Ok(Some(self)) + } +} + +impl IntoLexResult for Option { + fn into_lex_result(self) -> LexResult { + Ok(self) + } +} + +impl IntoLexResult for LexResult { + fn into_lex_result(self) -> LexResult { + self + } +} + +impl IntoLexResult for Result { + fn into_lex_result(self) -> LexResult { + self.map(Option::Some) + } +} diff --git a/src/lex/number.rs b/src/lex/number.rs new file mode 100644 index 0000000..e62ca41 --- /dev/null +++ b/src/lex/number.rs @@ -0,0 +1,1098 @@ +//! +//! Number. +//! + +use std::iter::once; +use std::ops::RangeBounds; + +use avjason_macros::{Lex, Spanned}; + +use super::tokens::{Dot, LIdentifier, Lex, Minus, Plus}; +use super::{IntoLexResult, LexResult}; + +use crate::lex::escape::is_hex_digit; +use crate::utils::{SourceIter, Span, Spanned, TryIntoSpan}; +use crate::Token; + +/// +/// **JSON5Number**. +/// +/// --- +/// See [the JSON5 specification](https://spec.json5.org/#prod-JSON5Number). 
+/// +#[derive(Debug)] +pub struct Number(Option, Numeric); + +impl Spanned for Number { + fn span(&self) -> Span { + if let Some(ref sign) = self.0 { + sign.span().combine([self.1.span()]) + } else { + self.1.span() + } + } +} + +impl Lex for Number { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let sign = if Sign::peek(input) { + Sign::lex(input).into_lex_result().unwrap() + } else { + None + }; + + let Ok(Some(numeric)) = Numeric::lex(input).into_lex_result() else { + return input.error().expected(Some(-1..0), ""); + }; + + Ok(Some(Self(sign, numeric))) + } + + fn peek(input: &SourceIter) -> bool { + Sign::peek(input) || Numeric::peek(input) + } +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum Sign { + Positive(Plus), + Negative(Minus), +} + +trait Keyword: Sized { + const TOKEN: &'static str; + + fn new(sp: impl RangeBounds) -> Self; +} + +impl Lex for K { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let start = input.next().unwrap().0; + let end = start + Self::TOKEN.len(); + input.offset(Self::TOKEN.len() + 1); + + Ok(Some(Self::new(start..end))) + } + + fn peek(input: &SourceIter) -> bool { + input + .ahead(..Self::TOKEN.len()) + .map(|ref s| s == Self::TOKEN) + .unwrap_or(false) + } +} + +#[derive(Debug, Spanned)] +pub struct Infinity(Span); + +impl Keyword for Infinity { + const TOKEN: &'static str = "Infinity"; + + fn new(sp: impl RangeBounds) -> Self { + Self(TryIntoSpan::try_into_span(sp).unwrap()) + } +} + +#[derive(Debug, Spanned)] +pub struct NaN(Span); + +impl Keyword for NaN { + const TOKEN: &'static str = "NaN"; + + fn new(sp: impl RangeBounds) -> Self { + Self(TryIntoSpan::try_into_span(sp).unwrap()) + } +} + +/// +/// **JSON5NumericLiteral** +/// +/// --- +/// +/// See [the JSON5 specification](https://spec.json5.org/#prod-JSON5NumericLiteral). 
+/// +#[derive(Debug, Spanned)] +#[Lex] +pub enum Numeric { + Infinity(Infinity), + NaN(NaN), + Lit(NumericLiteral), +} + +/// +/// ECMAScript **NumericLiteral** +/// +/// --- +/// +/// See the [ECMAScript specification](https://262.ecma-international.org/5.1/#sec-7.8.3). +/// +#[derive(Debug, Spanned)] +pub enum NumericLiteral { + Decimal(DecimalLiteral), + Hex(HexIntegerLiteral), +} + +impl NumericLiteral { + /// + /// From [ECMAScript standard](https://262.ecma-international.org/5.1/#sec-7.8.3) + /// > NOTE: The source character immediately following a [NumericLiteral] must not be an *IdentifierStart* or *DecimalDigit*. + /// + fn after_check(input: &SourceIter) -> bool { + !(LIdentifier::is_identifier_start(input) + || input.peek().map(char::is_ascii_digit).unwrap_or(false)) + } +} + +impl Lex for NumericLiteral { + fn lex(mut input: &mut SourceIter) -> impl IntoLexResult { + let res: LexResult = match input { + ref mut input if HexIntegerLiteral::peek(input) => Ok(Some(Self::Hex( + HexIntegerLiteral::lex(input) + .into_lex_result() + .unwrap() + .unwrap(), + ))), + ref mut input if DecimalLiteral::peek(input) => Ok(Some(Self::Decimal( + DecimalLiteral::lex(input) + .into_lex_result() + .unwrap() + .unwrap(), + ))), + _ => Ok(None), + }; + + if !Self::after_check(input) { + return input + .error() + .unexpected(Some(-1..0), ""); + } + + res + } + + fn peek(input: &SourceIter) -> bool { + DecimalLiteral::peek(input) || HexIntegerLiteral::peek(input) + } +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum DecimalLiteral { + IntegralDecimalMantissa(IntegralDecimalMantissa), + DecimalMantissa(DecimalMantissa), + Integer(Integer), +} + +#[derive(Debug)] +pub struct IntegralDecimalMantissa( + DecimalIntegerLiteral, + Token![.], + Option, + Option, +); + +impl Spanned for IntegralDecimalMantissa { + fn span(&self) -> Span { + self.0.span().combine( + self.2 + .as_ref() + .map(|s| s.span()) + .into_iter() + .chain(self.3.as_ref().map(|s| s.span())), + ) + } +} + +impl 
Lex for IntegralDecimalMantissa { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let i = DecimalIntegerLiteral::lex(input) + .into_lex_result() + .unwrap() + .unwrap(); + + let Ok(Some(d)) = Dot::lex(input).into_lex_result() else { + return input.error().expected(Some(-1..1), "."); + }; + + let m = if DecimalDigits::peek(input) { + DecimalDigits::lex(input).into_lex_result().unwrap() + } else { + None + }; + + let exp = if ExponentPart::peek(input) { + ExponentPart::lex(input).into_lex_result().unwrap() + } else { + None + }; + + Ok(Some(Self(i, d, m, exp))) + } + + fn peek(input: &SourceIter) -> bool { + if DecimalIntegerLiteral::peek(input) { + let mut fork = input.fork(); + let _ = DecimalIntegerLiteral::lex(&mut fork) + .into_lex_result() + .unwrap() + .unwrap(); + + return Dot::peek(&fork); + } + + false + } +} + +#[derive(Debug)] +pub struct DecimalMantissa(Token![.], DecimalDigits, Option); + +impl Spanned for DecimalMantissa { + fn span(&self) -> Span { + let s = self.0.span(); + + if let Some(ref exp) = self.2 { + s.combine([exp.span()]) + } else { + s.combine([self.1.span()]) + } + } +} + +impl Lex for DecimalMantissa { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let d = Dot::lex(input).into_lex_result().unwrap().unwrap(); + + let Ok(Some(ds)) = DecimalDigits::lex(input).into_lex_result() else { + return input + .error() + .expected(Some(-1..0), ""); + }; + + let exp = if ExponentPart::peek(input) { + ExponentPart::lex(input).into_lex_result().unwrap() + } else { + None + }; + + Ok(Some(Self(d, ds, exp))) + } + + fn peek(input: &SourceIter) -> bool { + Dot::peek(input) + } +} + +#[derive(Debug)] +pub struct Integer(DecimalIntegerLiteral, Option); + +impl Spanned for Integer { + fn span(&self) -> Span { + self.0.span().combine(self.1.as_ref().map(Spanned::span)) + } +} + +impl Lex for Integer { + fn lex(input: &mut SourceIter) -> 
impl IntoLexResult { + if !Self::peek(input) { + return None; + } + + let int = DecimalIntegerLiteral::lex(input) + .into_lex_result() + .unwrap()?; + + let exp = if ExponentPart::peek(input) { + ExponentPart::lex(input).into_lex_result().unwrap() + } else { + None + }; + + Some(Self(int, exp)) + } + + fn peek(input: &SourceIter) -> bool { + DecimalIntegerLiteral::peek(input) + } +} + +#[derive(Debug)] +pub enum DecimalIntegerLiteral { + Zero(Zero), + NonZero(NonZero, Option), +} + +impl Spanned for DecimalIntegerLiteral { + fn span(&self) -> Span { + match self { + DecimalIntegerLiteral::Zero(z) => z.span(), + DecimalIntegerLiteral::NonZero(a, b) => a.span().combine(b.as_ref().map(Spanned::span)), + } + } +} + +impl Lex for DecimalIntegerLiteral { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if Zero::peek(input) { + return Some(Self::Zero(Zero::lex(input).into_lex_result().unwrap()?)); + } + if NonZero::peek(input) { + let s = NonZero::lex(input).into_lex_result().unwrap()?; + let after = if DecimalDigits::peek(input) { + DecimalDigits::lex(input).into_lex_result().unwrap() + } else { + None + }; + + return Some(Self::NonZero(s, after)); + } + + None + } + + fn peek(input: &SourceIter) -> bool { + Zero::peek(input) || NonZero::peek(input) + } +} + +#[derive(Debug, Spanned)] +#[Lex('0')] +pub struct Zero(Span); + +#[derive(Debug, Spanned)] +pub struct NonZero(Span); + +impl Lex for NonZero { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return None; + } + + Some(Self(Span::single_char(input.next()?.0))) + } + + fn peek(input: &SourceIter) -> bool { + input + .peek() + .map(|d| matches!(d, '1'..='9')) + .unwrap_or(false) + } +} + +#[derive(Debug, Spanned)] +pub struct DecimalDigits(Span); + +impl Lex for DecimalDigits { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return None; + } + + let start = input.next()?.0; + let mut end = start; + + loop { + if !Self::peek(input) { + 
break; + } + + end = input.next().unwrap().0; + } + + Some(Self(TryIntoSpan::try_into_span(start..=end).unwrap())) + } + + fn peek(input: &SourceIter) -> bool { + input.peek().map(|d| d.is_ascii_digit()).unwrap_or(false) + } +} + +#[derive(Debug)] +pub struct ExponentPart(ExponentIdicator, SignedInteger); + +impl Spanned for ExponentPart { + fn span(&self) -> Span { + self.0.span().combine([self.1.span()]) + } +} + +impl Lex for ExponentPart { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return Ok(None); + } + + let e_token = ExponentIdicator::lex(input) + .into_lex_result() + .unwrap() + .unwrap(); + + let Ok(Some(int)) = SignedInteger::lex(input).into_lex_result() else { + return input + .error() + .expected(Some(-2..0), "Signed integer (e.g. +1, -2, 4)"); + }; + + Ok(Some(Self(e_token, int))) + } + + fn peek(input: &SourceIter) -> bool { + ExponentIdicator::peek(input) + } +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum ExponentIdicator { + Uppercase(E), + Lowercase(e), +} + +#[derive(Debug, Spanned)] +#[Lex('E')] +pub struct E(Span); + +#[derive(Debug, Spanned)] +#[Lex('e')] +#[allow(non_camel_case_types)] +pub struct e(Span); + +#[derive(Debug)] +pub enum SignedInteger { + None(DecimalDigits), + Positive(Token![+], DecimalDigits), + Negative(Token![-], DecimalDigits), +} + +impl Spanned for SignedInteger { + fn span(&self) -> Span { + match self { + SignedInteger::None(d) => d.span(), + SignedInteger::Positive(s, d) => s.span().combine([d.span()]), + SignedInteger::Negative(s, d) => s.span().combine([d.span()]), + } + } +} + +impl Lex for SignedInteger { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if Plus::peek(input) { + return Some(Self::Positive( + Plus::lex(input).into_lex_result().unwrap()?, + DecimalDigits::lex(input).into_lex_result().unwrap()?, + )); + } + + if Minus::peek(input) { + return Some(Self::Negative( + Minus::lex(input).into_lex_result().unwrap()?, + 
DecimalDigits::lex(input).into_lex_result().unwrap()?, + )); + } + + if DecimalDigits::peek(input) { + return Some(Self::None( + DecimalDigits::lex(input).into_lex_result().unwrap()?, + )); + } + + None + } + + fn peek(input: &SourceIter) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + } +} + +#[derive(Debug)] +pub struct HexIntegerLiteral(HexPrefix, HexDigit, Vec); + +#[derive(Debug, Spanned)] +#[Lex] +pub enum HexPrefix { + Lowercase(LowercaseHexPrefix), + Uppercase(UppercaseHexPrefix), +} + +impl Spanned for HexIntegerLiteral { + fn span(&self) -> Span { + self.0 + .span() + .combine(once(self.1.span()).chain(self.2.iter().map(Spanned::span))) + } +} + +impl Lex for HexIntegerLiteral { + fn lex(mut input: &mut SourceIter) -> impl IntoLexResult { + let p = match input { + ref mut i if HexPrefix::peek(i) => { + HexPrefix::lex(i).into_lex_result().unwrap().unwrap() + } + _ => return Ok(None), + }; + + let Ok(Some(d)) = HexDigit::lex(input).into_lex_result() else { + return input.error().expected(Some(-1..0), ""); + }; + + let mut ds = vec![]; + + while let Some(ch) = input.peek() { + if is_hex_digit(ch) { + ds.push(HexDigit::lex(input).into_lex_result().unwrap().unwrap()); + } else { + break; + } + } + + Ok(Some(Self(p, d, ds))) + } + + fn peek(input: &SourceIter) -> bool { + LowercaseHexPrefix::peek(input) || UppercaseHexPrefix::peek(input) + } +} + +#[derive(Debug, Spanned)] +pub struct LowercaseHexPrefix(Span); + +impl Lex for LowercaseHexPrefix { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return None; + } + + let start = input.next().unwrap().0; + input.offset(1); + + Some(Self( + TryIntoSpan::try_into_span(start..=(start + 1)).unwrap(), + )) + } + + fn peek(input: &SourceIter) -> bool { + input.ahead(0..2).map(|s| s == "0x").unwrap_or(false) + } +} + +#[derive(Debug, Spanned)] +pub struct UppercaseHexPrefix(Span); + +impl Lex for UppercaseHexPrefix { + fn lex(input: &mut SourceIter) -> impl 
IntoLexResult { + if !Self::peek(input) { + return None; + } + + let start = input.next().unwrap().0; + input.offset(1); + + Some(Self( + TryIntoSpan::try_into_span(start..=(start + 1)).unwrap(), + )) + } + + fn peek(input: &SourceIter) -> bool { + input.ahead(0..2).map(|s| s == "0X").unwrap_or(false) + } +} + +#[derive(Debug, Spanned)] +pub struct HexDigit(Span); + +impl Lex for HexDigit { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if !Self::peek(input) { + return None; + } + + Some(Self(Span::single_char(input.next().unwrap().0))) + } + + fn peek(input: &SourceIter) -> bool { + matches!(input.peek(), Some(a) if is_hex_digit(a)) + } +} + +#[cfg(test)] +mod tests { + + use crate::{ + lex::{ + number::{ + DecimalLiteral, DecimalMantissa, HexIntegerLiteral, Integer, + IntegralDecimalMantissa, Number, Numeric, NumericLiteral, + }, + tokens::Lex, + IntoLexResult, LexResult, + }, + utils::SourceFile, + }; + + use super::{ExponentIdicator, ExponentPart, HexPrefix, Sign, SignedInteger}; + + fn test_lex(s: impl ToString, src: &str) -> LexResult { + let src = SourceFile::dummy_file(format!("test.{}", s.to_string()), src); + let iter = &mut src.iter(); + T::lex(iter).into_lex_result() + } + + macro_rules! dot_man_exp { + ($m: pat, $e: pat) => { + Ok(Some(Number( + None, + Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::DecimalMantissa( + DecimalMantissa(_, $m, $e), + ))), + ))) + }; + ($s: pat, $m: pat, $e: pat) => { + Ok(Some(Number( + $s, + Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::DecimalMantissa( + DecimalMantissa(_, $m, $e), + ))), + ))) + }; + } + + macro_rules! int_exp { + ($m: pat, $e: pat) => { + Ok(Some(Number( + None, + Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::Integer(Integer( + $m, $e, + )))), + ))) + }; + ($s: pat, $m: pat, $e: pat) => { + Ok(Some(Number( + $s, + Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::Integer(Integer( + $m, $e, + )))), + ))) + }; + } + + macro_rules! 
hex_int { + ($c: pat, $d: pat, $ds: pat) => { + Ok(Some(Number( + None, + Numeric::Lit(NumericLiteral::Hex(HexIntegerLiteral($c, $d, $ds))), + ))) + }; + ($s: pat, $c: pat, $d: pat, $ds: pat) => { + Ok(Some(Number( + $s, + Numeric::Lit(NumericLiteral::Hex(HexIntegerLiteral($c, $d, $ds))), + ))) + }; + } + + macro_rules! int_dot_man_exp { + ($m: pat, $n: pat) => { + Ok(Some(Number( + None, + Numeric::Lit(NumericLiteral::Decimal( + DecimalLiteral::IntegralDecimalMantissa(IntegralDecimalMantissa(_, _, $m, $n)), + )), + ))) + }; + ($s: pat, $m: pat, $n: pat) => { + Ok(Some(Number( + $s, + Numeric::Lit(NumericLiteral::Decimal( + DecimalLiteral::IntegralDecimalMantissa(IntegralDecimalMantissa(_, _, $m, $n)), + )), + ))) + }; + } + + macro_rules! test_lex { + ($s: expr, $p: pat) => {{ + let tmp = test_lex::(0, $s); + if !matches!(tmp, $p) { + panic!("{tmp:?}"); + } + }}; + } + + #[test] + fn no_sign() { + assert!(!matches!(test_lex::(0, "02."), Ok(Some(_)))); + + test_lex!("1.", int_dot_man_exp!(None, None)); + test_lex!("123.", int_dot_man_exp!(None, None)); + test_lex!("1.2", int_dot_man_exp!(Some(_), None)); + test_lex!("13.2", int_dot_man_exp!(Some(_), None)); + test_lex!("1.e-5", int_dot_man_exp!(None, Some(_))); + test_lex!("134.2e-5", int_dot_man_exp!(Some(_), Some(_))); + + test_lex!(".1234", dot_man_exp!(_, None)); + test_lex!(".1234e-5", dot_man_exp!(_, Some(_))); + + test_lex!("1234", int_exp!(_, None)); + + test_lex!( + "467832674328438e2", + int_exp!( + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::None(_) + )) + ) + ); + test_lex!( + "467832674328438E2", + int_exp!( + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::None(_) + )) + ) + ); + test_lex!( + "467832674328438e+2", + int_exp!( + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::Positive(_, _) + )) + ) + ); + test_lex!( + "467832674328438E+2", + int_exp!( + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + 
SignedInteger::Positive(_, _) + )) + ) + ); + test_lex!( + "467832674328438e-2", + int_exp!( + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::Negative(_, _) + )) + ) + ); + test_lex!( + "467832674328438E-2", + int_exp!( + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::Negative(_, _) + )) + ) + ); + + test_lex!("0x6432ABA3", hex_int!(HexPrefix::Lowercase(_), _, _)); + test_lex!("0x6432aba3", hex_int!(HexPrefix::Lowercase(_), _, _)); + test_lex!("0X6432ABA3", hex_int!(HexPrefix::Uppercase(_), _, _)); + test_lex!("0X6432ABA3", hex_int!(HexPrefix::Uppercase(_), _, _)); + } + + #[test] + fn positive() { + test_lex!("+1.", int_dot_man_exp!(Some(Sign::Positive(_)), None, None)); + test_lex!( + "+123.", + int_dot_man_exp!(Some(Sign::Positive(_)), None, None) + ); + test_lex!( + "+1.2", + int_dot_man_exp!(Some(Sign::Positive(_)), Some(_), None) + ); + test_lex!( + "+13.2", + int_dot_man_exp!(Some(Sign::Positive(_)), Some(_), None) + ); + test_lex!( + "+1.e-5", + int_dot_man_exp!(Some(Sign::Positive(_)), None, Some(_)) + ); + test_lex!( + "+134.2e-5", + int_dot_man_exp!(Some(Sign::Positive(_)), Some(_), Some(_)) + ); + + test_lex!("+.1234", dot_man_exp!(Some(Sign::Positive(_)), _, None)); + test_lex!( + "+.1234e-5", + dot_man_exp!(Some(Sign::Positive(_)), _, Some(_)) + ); + + test_lex!("+1234", int_exp!(Some(Sign::Positive(_)), _, None)); + + test_lex!( + "+467832674328438e2", + int_exp!( + Some(Sign::Positive(_)), + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::None(_) + )) + ) + ); + test_lex!( + "+467832674328438E2", + int_exp!( + Some(Sign::Positive(_)), + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::None(_) + )) + ) + ); + test_lex!( + "+467832674328438e+2", + int_exp!( + Some(Sign::Positive(_)), + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::Positive(_, _) + )) + ) + ); + test_lex!( + "+467832674328438E+2", + int_exp!( + 
Some(Sign::Positive(_)), + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::Positive(_, _) + )) + ) + ); + test_lex!( + "+467832674328438e-2", + int_exp!( + Some(Sign::Positive(_)), + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::Negative(_, _) + )) + ) + ); + test_lex!( + "+467832674328438E-2", + int_exp!( + Some(Sign::Positive(_)), + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::Negative(_, _) + )) + ) + ); + + test_lex!( + "+0x6432ABA3", + hex_int!(Some(Sign::Positive(_)), HexPrefix::Lowercase(_), _, _) + ); + test_lex!( + "+0x6432aba3", + hex_int!(Some(Sign::Positive(_)), HexPrefix::Lowercase(_), _, _) + ); + test_lex!( + "+0X6432ABA3", + hex_int!(Some(Sign::Positive(_)), HexPrefix::Uppercase(_), _, _) + ); + test_lex!( + "+0X6432ABA3", + hex_int!(Some(Sign::Positive(_)), HexPrefix::Uppercase(_), _, _) + ); + } + + #[test] + fn negative() { + test_lex!("-1.", int_dot_man_exp!(Some(Sign::Negative(_)), None, None)); + test_lex!( + "-123.", + int_dot_man_exp!(Some(Sign::Negative(_)), None, None) + ); + test_lex!( + "-1.2", + int_dot_man_exp!(Some(Sign::Negative(_)), Some(_), None) + ); + test_lex!( + "-13.2", + int_dot_man_exp!(Some(Sign::Negative(_)), Some(_), None) + ); + test_lex!( + "-1.e-5", + int_dot_man_exp!(Some(Sign::Negative(_)), None, Some(_)) + ); + test_lex!( + "-134.2e-5", + int_dot_man_exp!(Some(Sign::Negative(_)), Some(_), Some(_)) + ); + + test_lex!("-.1234", dot_man_exp!(Some(Sign::Negative(_)), _, None)); + test_lex!( + "-.1234e-5", + dot_man_exp!(Some(Sign::Negative(_)), _, Some(_)) + ); + + test_lex!("-1234", int_exp!(Some(Sign::Negative(_)), _, None)); + + test_lex!( + "-467832674328438e2", + int_exp!( + Some(Sign::Negative(_)), + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::None(_) + )) + ) + ); + test_lex!( + "-467832674328438E2", + int_exp!( + Some(Sign::Negative(_)), + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), 
+ SignedInteger::None(_) + )) + ) + ); + test_lex!( + "-467832674328438e+2", + int_exp!( + Some(Sign::Negative(_)), + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::Positive(_, _) + )) + ) + ); + test_lex!( + "-467832674328438E+2", + int_exp!( + Some(Sign::Negative(_)), + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::Positive(_, _) + )) + ) + ); + test_lex!( + "-467832674328438e-2", + int_exp!( + Some(Sign::Negative(_)), + _, + Some(ExponentPart( + ExponentIdicator::Lowercase(_), + SignedInteger::Negative(_, _) + )) + ) + ); + test_lex!( + "-467832674328438E-2", + int_exp!( + Some(Sign::Negative(_)), + _, + Some(ExponentPart( + ExponentIdicator::Uppercase(_), + SignedInteger::Negative(_, _) + )) + ) + ); + + test_lex!( + "-0x6432ABA3", + hex_int!(Some(Sign::Negative(_)), HexPrefix::Lowercase(_), _, _) + ); + test_lex!( + "-0x6432aba3", + hex_int!(Some(Sign::Negative(_)), HexPrefix::Lowercase(_), _, _) + ); + test_lex!( + "-0X6432ABA3", + hex_int!(Some(Sign::Negative(_)), HexPrefix::Uppercase(_), _, _) + ); + test_lex!( + "-0X6432ABA3", + hex_int!(Some(Sign::Negative(_)), HexPrefix::Uppercase(_), _, _) + ); + } + + #[test] + fn idents() { + assert!(matches!( + test_lex::(0, "Infinity"), + Ok(Some(Number(None, Numeric::Infinity(_)))) + )); + assert!(matches!( + test_lex::(0, "+Infinity"), + Ok(Some(Number(Some(Sign::Positive(_)), Numeric::Infinity(_)))) + )); + assert!(matches!( + test_lex::(0, "-Infinity"), + Ok(Some(Number(Some(Sign::Negative(_)), Numeric::Infinity(_)))) + )); + + assert!(test_lex::(0, "-Ifty").is_err()); + assert!(test_lex::(0, "+Inf").is_err()); + assert!(matches!(test_lex::(0, "Infinty"), Ok(None))); + assert!(matches!( + test_lex::(0, "Idfhfdsbhjfdsvbaysj"), + Ok(None) + )); + + assert!(matches!( + test_lex::(0, "NaN"), + Ok(Some(Number(None, Numeric::NaN(_)))) + )); + assert!(matches!( + test_lex::(0, "+NaN"), + Ok(Some(Number(Some(Sign::Positive(_)), Numeric::NaN(_)))) + )); + 
assert!(matches!( + test_lex::(0, "-NaN"), + Ok(Some(Number(Some(Sign::Negative(_)), Numeric::NaN(_)))) + )); + + assert!(test_lex::(0, "-NAN").is_err()); + assert!(matches!(test_lex::(0, "nAN"), Ok(None))); + assert!(test_lex::(0, "+nAn").is_err()); + assert!(test_lex::(0, "-NAn").is_err()); + } +} diff --git a/src/lex/strings.rs b/src/lex/strings.rs new file mode 100644 index 0000000..7a7866d --- /dev/null +++ b/src/lex/strings.rs @@ -0,0 +1,190 @@ +//! +//! String literals. +//! + +use avjason_macros::{Lex, Spanned}; + +use crate::{ + lex::tokens::{LineTerminator, LineTerminatorSeq}, + utils::{SourceIter, Span, TryIntoSpan}, +}; + +use super::{escape::EscapeSequence, tokens::Lex, IntoLexResult, LexError, LexResult}; + +#[derive(Debug, Spanned)] +#[Lex] +pub enum LString { + Single(SingleString), + Double(DoubleString), +} + +fn eat_inner_chars( + input: &mut SourceIter, + delimit: char, +) -> Result>, LexError> { + let mut contents = vec![]; + + while let Some(ch) = input.peek() { + if ch == &delimit { + break; + } + + if LineTerminator::peek(input) { + return input.error().unexpected(Some(0..1), ""); + } + + if ch == &'\\' { + // Escape sequence. 
+ let mut fork = input.fork(); + fork.offset(1); + + if EscapeSequence::peek(&fork) { + contents.push(StrFrag::EscSeq(EscapeSequence::lex(&mut fork).into_lex_result()?.unwrap())); + + input.advance_to(fork); + continue; + } + + if LineTerminatorSeq::peek(&fork) { + contents.push(StrFrag::LineEsc(LineTerminatorSeq::lex(&mut fork).into_lex_result()?.unwrap())); + + input.advance_to(fork); + continue; + } + + return input + .error() + .expected(Some(0..1), "Escaped Newline, or escape sequence"); + } + + let (_, c) = input.next().unwrap(); + contents.push(StrFrag::Char(c)); + } + + Ok(Some(contents)) +} + +#[derive(Debug)] +pub enum StrFrag { + Char(char), + EscSeq(EscapeSequence), + LineEsc(LineTerminatorSeq), +} + +#[derive(Debug, Spanned)] +pub struct SingleString(Span, Vec); + +impl Lex for SingleString { + fn lex(input: &mut SourceIter) -> LexResult { + if !Self::peek(input) { + return Ok(None); + } + + let start = input.next().unwrap().0; + + let contents = eat_inner_chars(input, '\'')?.unwrap(); + + if input.peek() != Some(&'\'') { + return input.error().expected(Some(0..1), "\'"); + } + + let end = input.next().unwrap().0; + + Ok(Some(Self( + TryIntoSpan::try_into_span(start..=end).unwrap(), + contents, + ))) + } + + fn peek(input: &SourceIter) -> bool { + input.peek() == Some(&'\'') + } +} + +#[derive(Debug, Spanned)] +pub struct DoubleString(Span, Vec); + +impl Lex for DoubleString { + fn lex(input: &mut SourceIter) -> LexResult { + if !Self::peek(input) { + return Ok(None); + } + + let start = input.next().unwrap().0; + + let contents = eat_inner_chars(input, '\"')?.unwrap(); + + if input.peek() != Some(&'\"') { + return input.error().expected(Some(0..1), "\""); + } + + let end = input.next().unwrap().0; + + Ok(Some(Self( + TryIntoSpan::try_into_span(start..=end).unwrap(), + contents, + ))) + } + + fn peek(input: &SourceIter) -> bool { + input.peek() == Some(&'"') + } +} + +#[cfg(test)] +mod tests { + use crate::{ + lex::{strings::DoubleString, tokens::Lex, 
IntoLexResult, LexResult}, + utils::SourceFile, + }; + + fn test_lex(s: impl ToString, src: &str) -> LexResult { + let src = SourceFile::dummy_file(format!("test.{}", s.to_string()), src); + let iter = &mut src.iter(); + T::lex(iter).into_lex_result() + } + + #[test] + fn unicode_escape() { + let twice_valid = test_lex::(0, r#""\u1522\u2431""#); + assert!(matches!(twice_valid, Ok(Some(_)))); + let once_valid_once_invalid = test_lex::(1, r#""\u1522\u241""#); + assert!(once_valid_once_invalid.is_err()); + let once_invalid = test_lex::(3, r#""\u1S2Y""#); + assert!(once_invalid.is_err()); + } + + #[test] + fn hex_escape() { + let twice_valid = test_lex::(0, r#""\x0F\xFF""#); + assert!(matches!(twice_valid, Ok(Some(_)))); + let once_valid_once_invalid = test_lex::(0, r#""\x0F\xSF""#); + assert!(once_valid_once_invalid.is_err()); + let once_invalid = test_lex::(0, r#""\xSF""#); + assert!(once_invalid.is_err()); + } + + #[test] + fn single_char() { + let escaped = test_lex::(0, r#""\t\r\v\n\"\\""#); + assert!(matches!(escaped, Ok(Some(_)))); + let normal = test_lex::(0, r#""\!\?\:\@\~\#\}\{\(\)\&\$""#); + assert!(matches!(normal, Ok(Some(_)))); + } + + #[test] + fn null_escape() { + let valid = test_lex::(0, r#""\0\0\0\0\0\0\0\0""#); + assert!(matches!(valid, Ok(Some(_)))); + let invalid = test_lex::(0, r#""\00\01\04\06"#); + assert!(invalid.is_err()); + } + + #[test] + fn mixed_escapes() { + let test0 = test_lex::(0, r#""\v\!\%\x00""#); + assert!(matches!(test0, Ok(Some(_)))); + let test1 = test_lex::(1, r#""\v\!\% abhbdasjdas^da'''gadudgasi a@@@~ {} dauasdhi\x00""#); + assert!(matches!(test1, Ok(Some(_)))); + } +} diff --git a/src/lex/tokens.rs b/src/lex/tokens.rs new file mode 100644 index 0000000..4996599 --- /dev/null +++ b/src/lex/tokens.rs @@ -0,0 +1,433 @@ +use avjason_macros::{Lex, Spanned}; +use finl_unicode::categories::{CharacterCategories, MinorCategory}; + +use crate::utils::{SourceIter, Span, TryIntoSpan}; + +use super::{escape::UnicodeEscapeSequence, 
number::Number, strings::LString, IntoLexResult}; + +pub(crate) trait Lex: Sized { + fn lex(input: &mut SourceIter) -> impl IntoLexResult; + fn peek(input: &SourceIter) -> bool; +} + +#[derive(Debug, Spanned)] +#[Lex('{')] +pub struct OpenBrace(Span); + +#[derive(Debug, Spanned)] +#[Lex('}')] +pub struct CloseBrace(Span); + +#[derive(Debug, Spanned)] +#[Lex('[')] +pub struct OpenBracket(Span); + +#[derive(Debug, Spanned)] +#[Lex(']')] +pub struct CloseBracket(Span); + +#[derive(Debug, Spanned)] +#[Lex(':')] +pub struct Colon(Span); + +#[derive(Debug, Spanned)] +#[Lex(',')] +pub struct Comma(Span); + +#[derive(Debug, Spanned)] +#[Lex('-')] +pub struct Minus(Span); + +#[derive(Debug, Spanned)] +#[Lex('+')] +pub struct Plus(Span); + +#[derive(Debug, Spanned)] +#[Lex('.')] +pub struct Dot(Span); + +#[macro_export] +macro_rules! Token { + ['{'] => { + $crate::lex::tokens::OpenBrace + }; + ['}'] => { + $crate::lex::tokens::CloseBrace + }; + ['['] => { + $crate::lex::tokens::OpenBracket + }; + [']'] => { + $crate::lex::tokens::CloseBracket + }; + [':'] => { + $crate::lex::tokens::Colon + }; + [','] => { + $crate::lex::tokens::Comma + }; + ['-'] => { + $crate::lex::tokens::Minus + }; + ['+'] => { + $crate::lex::tokens::Plus + }; + ['.'] => { + $crate::lex::tokens::Dot + }; + [:] => { + $crate::lex::tokens::Colon + }; + [,] => { + $crate::lex::tokens::Comma + }; + [-] => { + $crate::lex::tokens::Minus + }; + [+] => { + $crate::lex::tokens::Plus + }; + [.] => { + $crate::lex::tokens::Dot + }; +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum Punct { + OpenBrace(OpenBrace), + CloseBrace(CloseBrace), + OpenBracket(OpenBracket), + CloseBracket(CloseBracket), + Colon(Colon), + Comma(Comma), +} + +#[derive(Debug, Spanned)] +pub struct WhiteSpace(Span); + +impl WhiteSpace { + /// + /// In accordance with + /// [ECMAScript standards](https://262.ecma-international.org/5.1/#sec-7.2). 
+ /// + pub fn is_whitespace(ch: &char) -> bool { + ch == &'\u{0009}' + || ch == &'\u{000b}' + || ch == &'\u{000c}' + || ch == &'\u{0020}' + || ch == &'\u{00a0}' + || (*ch).get_minor_category() == MinorCategory::Zs + } +} + +impl Lex for WhiteSpace { + fn lex(input: &mut SourceIter) -> Option { + let ch = input.peek()?; + let Some(start) = (if Self::is_whitespace(ch) { + Some(input.next()?.0) + } else { + return None; + }) else { + return None; + }; + + let mut end = start; + while let Some(ch) = input.peek() { + if !Self::is_whitespace(ch) { + break; + } + end = input.next()?.0; + } + + Some(Self(TryIntoSpan::try_into_span(start..=end)?)) + } + + fn peek(input: &SourceIter) -> bool { + input.peek().map(Self::is_whitespace).unwrap_or_default() + } +} + +/// +/// In accordance with the [ECMAScript standard](https://262.ecma-international.org/5.1/#sec-7.3). +/// +#[derive(Debug, Spanned)] +pub struct LineTerminator(Span); + +impl Lex for LineTerminator { + fn lex(input: &mut SourceIter) -> Option { + match input.peek()? 
{ + // , , , + &'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}' => { + let loc = input.next()?.0; + Some(Self(Span::single_char(loc))) + } + _ => None, + } + } + + fn peek(input: &SourceIter) -> bool { + matches!( + input.peek(), + Some(&'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}') + ) + } +} + +#[derive(Debug, Spanned)] +pub struct LineTerminatorSeq(Span); + +impl Lex for LineTerminatorSeq { + fn lex(input: &mut SourceIter) -> Option { + match (input.peek()?, input.peek2()) { + // + (&'\u{000d}', Some(&'\u{000a}')) => { + let start = input.next()?.0; + let end = input.next()?.0; + Some(Self(TryIntoSpan::try_into_span(start..=end)?)) + } + // , , , + (&'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}', _) => { + let loc = input.next()?.0; + Some(Self(Span::single_char(loc))) + } + _ => None, + } + } + + fn peek(input: &SourceIter) -> bool { + match (input.peek(), input.peek2()) { + // + (Some(&'\u{000d}'), Some(&'\u{000a}')) => true, + // , , , + (Some(&'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}'), _) => true, + _ => false, + } + } +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum Comment { + MultiLine(MultiLineComment), + SingleLine(SingleLineComment), +} + +#[derive(Debug, Spanned)] +pub struct SingleLineComment(Span); + +impl Lex for SingleLineComment { + fn lex(input: &mut SourceIter) -> Option { + if !Self::peek(input) { + return None; + } + + let start = input.next()?.0; // First slash + let _ = input.next()?; // Second slash + + let mut end = start; + while !LineTerminator::peek(input) { + // Unwrap ok since peek -> Some implies next -> Some/ + end = input.next().unwrap().0; + } + + Some(Self(TryIntoSpan::try_into_span(start..=end)?)) + } + + fn peek(input: &SourceIter) -> bool { + matches!((input.peek(), input.peek2()), (Some(&'/'), Some(&'/'))) + } +} + +#[derive(Debug, Spanned)] +pub struct MultiLineComment(Span); + +impl MultiLineComment { + fn peek_end(input: &SourceIter) -> bool { + matches!((input.peek(), input.peek2()), 
(Some(&'*'), Some(&'/'))) + } +} + +impl Lex for MultiLineComment { + fn lex(input: &mut SourceIter) -> Option { + if !Self::peek(input) { + return None; + } + + let start = input.next()?.0; // First slash + let _ = input.next()?; // Second slash + + while !Self::peek_end(input) { + // Unwrap ok since peek -> Some implies next -> Some + _ = input.next().unwrap().0; + } + + input.next().unwrap(); // `*` - Unwraps ok since peek, peek2 -> Some, Some + let end = input.next().unwrap().0; // `/` + + Some(Self(TryIntoSpan::try_into_span(start..=end)?)) + } + + fn peek(input: &SourceIter) -> bool { + matches!((input.peek(), input.peek2()), (Some(&'/'), Some(&'*'))) + } +} + +#[derive(Debug, Spanned)] +#[Lex] +pub enum InputElement { + LineTerminator(LineTerminator), + WhiteSpace(WhiteSpace), + Comment(Comment), + Token(Token), +} + +/// +/// Compliant with [ECMAScript specification for `IdentifierName`](https://262.ecma-international.org/5.1/#sec-7.6). +/// +#[derive(Debug, Spanned)] +pub struct LIdentifier(Span); + +impl LIdentifier { + fn is_unicode_letter(ch: &char) -> bool { + use MinorCategory::*; + matches!(ch.get_minor_category(), Lu | Ll | Lt | Lm | Lo | Nl) + } + + fn is_unicode_combining_mark(ch: &char) -> bool { + use MinorCategory::*; + matches!(ch.get_minor_category(), Mn | Mc) + } + + fn is_unicode_digit(ch: &char) -> bool { + use MinorCategory::*; + matches!(ch.get_minor_category(), Nd) + } + + fn is_unicode_connector_punctuation(ch: &char) -> bool { + use MinorCategory::*; + matches!(ch.get_minor_category(), Pc) + } + + pub(crate) fn is_identifier_start(input: &SourceIter) -> bool { + // IdentifierStart + let Some(ch) = input.peek() else { + return false; + }; + + match ch { + c if Self::is_unicode_letter(c) => true, + &'$' | &'_' => true, + &'\\' => { + // Check for unicode escape sequence. 
+ let mut fork = input.fork(); + fork.next().unwrap(); + UnicodeEscapeSequence::peek(input) + } + _ => false, + } + } + + fn is_identifier_part(input: &SourceIter) -> bool { + if Self::is_identifier_start(input) { + return true; + } + + let Some(ch) = input.peek() else { + return false; + }; + + Self::is_unicode_combining_mark(ch) + || Self::is_unicode_digit(ch) + || Self::is_unicode_connector_punctuation(ch) + || matches!(ch, &'\u{200c}' | &'\u{200d}') // | + } + + fn peek_middle(input: &SourceIter) -> bool { + Self::is_identifier_part(input) + } +} + +impl Lex for LIdentifier { + fn lex(input: &mut SourceIter) -> Option { + if !Self::peek(input) { + return None; + } + + let start = input.next().unwrap().0; + let mut end = start + 1; + while Self::peek_middle(input) { + end = input.next().unwrap().0; + } + + Some(Self(TryIntoSpan::try_into_span(start..=end)?)) + } + + fn peek(input: &SourceIter) -> bool { + Self::is_identifier_start(input) + } +} + +#[derive(Debug, Spanned)] +pub enum Token { + Identifier(LIdentifier), + Punctuator(Punct), + String(LString), + Number(Number), +} + +impl Lex for Token { + fn lex(input: &mut SourceIter) -> impl IntoLexResult { + if let Some(s) = LIdentifier::lex(input).into_lex_result()? { + return Ok(Some(Self::Identifier(s))); + } + + if let Some(s) = Punct::lex(input).into_lex_result()? { + return Ok(Some(Self::Punctuator(s))); + } + + if let Some(s) = LString::lex(input).into_lex_result()? { + return Ok(Some(Self::String(s))); + } + + if let Some(s) = Number::lex(input).into_lex_result()? 
{ + return Ok(Some(Self::Number(s))); + } + + Ok(None) + } + + fn peek(_: &SourceIter) -> bool { + unimplemented!() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + lex::IntoLexResult, + utils::SourceFile, + }; + + use super::{InputElement, Lex}; + + #[test] + fn lexxing_tests() { + let src = "\ + []\n\ + 21, 5.65 + { }:,\n\ + // Single line comment\n\ + /* Multi line Comment\n\ + Wa-hey!*/\r\n + \"Here's a string!\"\n + 1.234678\t7.2367\t-Infinity"; + + println!("{src:?}"); + let src = SourceFile::dummy_file("test.1", src); + let iter = &mut src.iter(); + while let Ok(Some(l)) = InputElement::lex(iter).into_lex_result() { + println!("--> {l:?}"); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index eb765c1..09cda01 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ //! //! A parser for [JSON5](https://json5.org/). //! -#![feature(iter_map_windows)] +#![feature(iter_map_windows, specialization)] -pub mod utils; \ No newline at end of file +pub mod utils; +pub mod lex; \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 7a317bb..fe668d1 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -4,6 +4,7 @@ pub mod span; use std::{ + fmt::Debug, fs, io, ops::RangeBounds, path::{Path, PathBuf}, @@ -11,10 +12,12 @@ use std::{ pub use span::*; +use crate::lex::{LexError, LexResult}; + #[derive(Debug)] pub struct SourceFile { path: PathBuf, - contents: String, + contents: Vec, line_starts: Vec, } @@ -60,13 +63,36 @@ impl SourceFile { /// /// Returns the original source code at a particular [Span]. 
/// - pub fn source_at(&self, span: impl RangeBounds) -> Option<&str> { + pub fn source_at(&self, span: impl RangeBounds) -> Option { let span = S::try_into_span(span)?; if span.end.index > self.contents.len() { return None; } - Some(&self.contents[span.start.index..span.end.index]) + if span.start.index >= span.end.index { + return None; + } + + Some( + self.contents[span.start.index..span.end.index] + .iter() + .collect(), + ) + } + + /// + /// Returns the original source code at a particular [Span]. + /// + pub fn source_at_span(&self, span: Span) -> Option { + if span.end.index > self.contents.len() { + return None; + } + + Some( + self.contents[span.start.index..span.end.index] + .iter() + .collect(), + ) } #[cfg(test)] @@ -75,7 +101,7 @@ impl SourceFile { let line_lengths = Self::split_lines(&contents).collect(); Self { path: path.as_ref().to_owned(), - contents, + contents: contents.chars().collect(), line_starts: line_lengths, } } @@ -90,10 +116,215 @@ impl SourceFile { Ok(Self { path: path.to_owned(), - contents, + contents: contents.chars().collect(), line_starts, }) } + + pub(crate) fn iter(&self) -> SourceIter { + SourceIter::new(self) + } +} + +#[derive(Clone)] +pub struct SourceIter<'a> { + file: &'a SourceFile, + inner: &'a Vec, + index: usize, +} + +impl<'a> std::fmt::Debug for SourceIter<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SourceIter") + .field("left", &String::from_iter(&self.inner[self.index..])) + .field("index", &self.index) + .finish() + } +} + +impl<'a> SourceIter<'a> { + pub(crate) fn new(file: &'a SourceFile) -> Self { + Self { + file, + inner: &file.contents, + index: 0, + } + } + + pub(crate) fn peek(&self) -> Option<&char> { + self.inner.get(self.index) + } + + pub(crate) fn peek2(&self) -> Option<&char> { + self.inner.get(self.index + 1) + } + + pub(crate) fn fork(&self) -> Self { + self.clone() + } + + pub(crate) fn ahead(&self, range: impl RangeBounds) -> Option { + let 
abs_start = self.index + + match range.start_bound() { + std::ops::Bound::Included(d) => *d, + std::ops::Bound::Excluded(d) => (*d) + 1, + std::ops::Bound::Unbounded => 0, + }; + + let abs_end = self.index + + match range.end_bound() { + std::ops::Bound::Included(d) => *d + 1, + std::ops::Bound::Excluded(d) => *d, + std::ops::Bound::Unbounded => self.inner.len(), + }; + + if !(abs_start < self.inner.len() && abs_end <= self.inner.len()) { + return None; + } + + Some(self.inner[abs_start..abs_end].iter().collect()) + } + + pub(crate) fn relative_match( + &self, + range: impl RangeBounds, + pred: impl Fn(&char) -> bool, + ) -> bool { + let abs_start = self.index + + match range.start_bound() { + std::ops::Bound::Included(d) => *d, + std::ops::Bound::Excluded(d) => (*d) + 1, + std::ops::Bound::Unbounded => 0, + }; + let abs_end = self.index + + match range.end_bound() { + std::ops::Bound::Included(d) => *d + 1, + std::ops::Bound::Excluded(d) => *d, + std::ops::Bound::Unbounded => self.inner.len(), + }; + + if !(abs_start < self.inner.len() && abs_end <= self.inner.len()) { + return false; + } + + let s = &self.inner[abs_start..abs_end]; + s.iter().all(pred) + } + + pub(crate) fn offset(&mut self, offset: usize) { + self.index += offset; + } + + pub(crate) fn advance_to(&mut self, other: Self) { + self.index = other.index; + } + + pub(crate) fn error(&self) -> SourceErrorHelper { + SourceErrorHelper { iter: self } + } +} + +pub(crate) struct SourceErrorHelper<'a> { + iter: &'a SourceIter<'a>, +} + +impl<'a> SourceErrorHelper<'a> { + pub(crate) fn unexpected( + self, + range: Option>, + token: impl ToString, + ) -> LexResult + where + isize: TryFrom, + A: Copy + Debug, + >::Error: Debug, + { + let token = token.to_string(); + + let mut text = None; + let mut span = 0..self.iter.inner.len(); + if let Some(range) = range { + let i = self.iter.index as isize; + let start = i + match range.start_bound() { + std::ops::Bound::Included(r) => isize::try_from(*r).unwrap(), + 
std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap() + 1, + std::ops::Bound::Unbounded => 0isize, + }; + + let end = i + match range.start_bound() { + std::ops::Bound::Included(r) => isize::try_from(*r).unwrap() + 1, + std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap(), + std::ops::Bound::Unbounded => self.iter.inner.len() as isize, + }; + + let start = start as usize; + let end = end as usize; + + text = self.iter.file.source_at(start..end); + span = start..end; + } + + Err(LexError::new( + span, + format!("Unexpected token `{token}`"), + text, + )) + } + + pub(crate) fn expected( + self, + rel_range: Option>, + token: impl ToString, + ) -> LexResult + where + isize: TryFrom, + A: Copy + Debug, + >::Error: Debug, + { + let token = token.to_string(); + + let mut text = None; + let mut span = 0..self.iter.inner.len(); + if let Some(range) = rel_range { + let i = self.iter.index as isize; + let start = i + match range.start_bound() { + std::ops::Bound::Included(r) => isize::try_from(*r).unwrap(), + std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap() + 1, + std::ops::Bound::Unbounded => 0isize, + }; + + let end = i + match range.start_bound() { + std::ops::Bound::Included(r) => isize::try_from(*r).unwrap() + 1, + std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap(), + std::ops::Bound::Unbounded => self.iter.inner.len() as isize, + }; + + let start = start as usize; + let end = end as usize; + + text = self.iter.file.source_at(start..end); + span = start..end; + } + + Err(LexError::new( + span, + format!("Expected token `{token}` here"), + text, + )) + } +} + +impl<'a> Iterator for SourceIter<'a> { + type Item = (Loc, char); + + fn next(&mut self) -> Option { + let ch = self.inner.get(self.index)?; + let l = Loc { index: self.index }; + + self.index += 1; + + Some((l, *ch)) + } } #[cfg(test)] diff --git a/src/utils/span.rs b/src/utils/span.rs index c0bc8ec..6637adb 100644 --- a/src/utils/span.rs +++ b/src/utils/span.rs @@ -104,6 
+104,20 @@ impl Span { end: loc + 1, } } + + pub fn combine(self, iter: impl IntoIterator) -> Self { + let start = self; + let last = iter.into_iter().last(); + + if let Some(end) = last { + Self { + start: start.start, + end: end.end, + } + } else { + start + } + } } /// @@ -170,3 +184,7 @@ impl TryIntoSpan for usize { }) } } + +pub trait Spanned { + fn span(&self) -> Span; +} From d7addb6ea6fbd87dd77e5dbd20ba01bfc7b3a79f Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Wed, 3 Jan 2024 01:27:05 +0000 Subject: [PATCH 09/39] Parser mostly implemented.\nNeeds more tests. --- Cargo.lock | 7 + Cargo.toml | 1 + macros/src/lib.rs | 2 +- src/lex/escape.rs | 14 +- src/lex/mod.rs | 8 ++ src/lex/number.rs | 69 ++++++---- src/lex/strings.rs | 16 ++- src/lex/tokens.rs | 249 +++++++++++++++++++++++++++++----- src/lib.rs | 5 +- src/syntax/mod.rs | 137 +++++++++++++++++++ src/syntax/utils.rs | 51 +++++++ src/syntax/value.rs | 316 ++++++++++++++++++++++++++++++++++++++++++++ src/utils/mod.rs | 55 +++++++- 13 files changed, 856 insertions(+), 74 deletions(-) create mode 100644 src/syntax/mod.rs create mode 100644 src/syntax/utils.rs create mode 100644 src/syntax/value.rs diff --git a/Cargo.lock b/Cargo.lock index dc4f50f..c7a8613 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,10 +11,17 @@ dependencies = [ "memchr", ] +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + [[package]] name = "avjason" version = "0.1.0" dependencies = [ + "anyhow", "avjason-macros", "finl_unicode", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 9e28810..0de9457 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow = "1.0.79" finl_unicode = "1.2.0" lazy_static = "1.4.0" regex = "1.10.2" diff --git 
a/macros/src/lib.rs b/macros/src/lib.rs index d892420..3cbaa0f 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -134,7 +134,7 @@ pub fn Lex(args: TokenStream, input: TokenStream) -> TokenStream { if input.peek() == Some(&#ch) { // Unwrap okay, because otherwise .peek returns None. let (l, _) = input.next().unwrap(); - return Some(Self(crate::utils::Span::single_char(l))); + return Some(Self{ span: crate::utils::Span::single_char(l)}); } None diff --git a/src/lex/escape.rs b/src/lex/escape.rs index 73242cc..e0e8376 100644 --- a/src/lex/escape.rs +++ b/src/lex/escape.rs @@ -14,7 +14,7 @@ pub fn is_hex_digit(ch: &char) -> bool { ch.is_ascii_hexdigit() } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum EscapeSequence { Unicode(UnicodeEscapeSequence), @@ -23,14 +23,14 @@ pub enum EscapeSequence { Character(CharacterEscapeSequence), } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum CharacterEscapeSequence { Single(SingleEscapeCharacter), NonEscape(NonEscapeCharacter), } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct SingleEscapeCharacter(Span); impl Lex for SingleEscapeCharacter { @@ -51,7 +51,7 @@ impl Lex for SingleEscapeCharacter { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct NonEscapeCharacter(Span); struct EscapeCharacter; @@ -88,7 +88,7 @@ impl Lex for NonEscapeCharacter { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct NullEscapeSequence(Span); impl Lex for NullEscapeSequence { @@ -107,7 +107,7 @@ impl Lex for NullEscapeSequence { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct HexEscapeSequence(Span); impl Lex for HexEscapeSequence { @@ -137,7 +137,7 @@ impl Lex for HexEscapeSequence { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct UnicodeEscapeSequence(Span); impl Lex for UnicodeEscapeSequence { diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 
441a84a..143c80c 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -36,6 +36,14 @@ impl LexError { } } +impl std::error::Error for LexError {} + +impl std::fmt::Display for LexError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Error occured during lexxing:\t{}\n\tNear `{}`", self.message, self.text.as_ref().unwrap_or(&String::default())) + } +} + /// /// Utility for Lexer erorrs, /// diff --git a/src/lex/number.rs b/src/lex/number.rs index e62ca41..23775db 100644 --- a/src/lex/number.rs +++ b/src/lex/number.rs @@ -7,10 +7,11 @@ use std::ops::RangeBounds; use avjason_macros::{Lex, Spanned}; -use super::tokens::{Dot, LIdentifier, Lex, Minus, Plus}; +use super::tokens::{Dot, LIdentifier, Lex, Minus, Plus, Token}; use super::{IntoLexResult, LexResult}; use crate::lex::escape::is_hex_digit; +use crate::syntax::Parse; use crate::utils::{SourceIter, Span, Spanned, TryIntoSpan}; use crate::Token; @@ -20,7 +21,7 @@ use crate::Token; /// --- /// See [the JSON5 specification](https://spec.json5.org/#prod-JSON5Number). /// -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Number(Option, Numeric); impl Spanned for Number { @@ -57,7 +58,13 @@ impl Lex for Number { } } -#[derive(Debug, Spanned)] +impl Number { + pub(crate) fn peek_token(token: &Token) -> bool { + matches!(token, Token::Number(_)) + } +} + +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum Sign { Positive(Plus), @@ -91,7 +98,7 @@ impl Lex for K { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct Infinity(Span); impl Keyword for Infinity { @@ -102,7 +109,7 @@ impl Keyword for Infinity { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct NaN(Span); impl Keyword for NaN { @@ -120,7 +127,7 @@ impl Keyword for NaN { /// /// See [the JSON5 specification](https://spec.json5.org/#prod-JSON5NumericLiteral). 
/// -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum Numeric { Infinity(Infinity), @@ -135,7 +142,7 @@ pub enum Numeric { /// /// See the [ECMAScript specification](https://262.ecma-international.org/5.1/#sec-7.8.3). /// -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub enum NumericLiteral { Decimal(DecimalLiteral), Hex(HexIntegerLiteral), @@ -184,7 +191,7 @@ impl Lex for NumericLiteral { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum DecimalLiteral { IntegralDecimalMantissa(IntegralDecimalMantissa), @@ -192,7 +199,7 @@ pub enum DecimalLiteral { Integer(Integer), } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct IntegralDecimalMantissa( DecimalIntegerLiteral, Token![.], @@ -257,7 +264,7 @@ impl Lex for IntegralDecimalMantissa { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DecimalMantissa(Token![.], DecimalDigits, Option); impl Spanned for DecimalMantissa { @@ -300,7 +307,7 @@ impl Lex for DecimalMantissa { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Integer(DecimalIntegerLiteral, Option); impl Spanned for Integer { @@ -333,7 +340,7 @@ impl Lex for Integer { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum DecimalIntegerLiteral { Zero(Zero), NonZero(NonZero, Option), @@ -372,11 +379,13 @@ impl Lex for DecimalIntegerLiteral { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex('0')] -pub struct Zero(Span); +pub struct Zero { + span: Span +} -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct NonZero(Span); impl Lex for NonZero { @@ -396,7 +405,7 @@ impl Lex for NonZero { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct DecimalDigits(Span); impl Lex for DecimalDigits { @@ -424,7 +433,7 @@ impl Lex for DecimalDigits { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ExponentPart(ExponentIdicator, SignedInteger); impl Spanned for ExponentPart { @@ -458,23 +467,27 @@ impl Lex for 
ExponentPart { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum ExponentIdicator { Uppercase(E), Lowercase(e), } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex('E')] -pub struct E(Span); +pub struct E { + span: Span +} -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex('e')] #[allow(non_camel_case_types)] -pub struct e(Span); +pub struct e { + span: Span +} -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum SignedInteger { None(DecimalDigits), Positive(Token![+], DecimalDigits), @@ -523,10 +536,10 @@ impl Lex for SignedInteger { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct HexIntegerLiteral(HexPrefix, HexDigit, Vec); -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum HexPrefix { Lowercase(LowercaseHexPrefix), @@ -572,7 +585,7 @@ impl Lex for HexIntegerLiteral { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct LowercaseHexPrefix(Span); impl Lex for LowercaseHexPrefix { @@ -594,7 +607,7 @@ impl Lex for LowercaseHexPrefix { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct UppercaseHexPrefix(Span); impl Lex for UppercaseHexPrefix { @@ -616,7 +629,7 @@ impl Lex for UppercaseHexPrefix { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct HexDigit(Span); impl Lex for HexDigit { diff --git a/src/lex/strings.rs b/src/lex/strings.rs index 7a7866d..eba1d0f 100644 --- a/src/lex/strings.rs +++ b/src/lex/strings.rs @@ -9,15 +9,21 @@ use crate::{ utils::{SourceIter, Span, TryIntoSpan}, }; -use super::{escape::EscapeSequence, tokens::Lex, IntoLexResult, LexError, LexResult}; +use super::{escape::EscapeSequence, tokens::{Lex, Token}, IntoLexResult, LexError, LexResult}; -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum LString { Single(SingleString), Double(DoubleString), } +impl LString { + pub(crate) fn peek_token(token: &Token) -> bool { + matches!(token, 
Token::String(s)) + } +} + fn eat_inner_chars( input: &mut SourceIter, delimit: char, @@ -64,14 +70,14 @@ fn eat_inner_chars( Ok(Some(contents)) } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum StrFrag { Char(char), EscSeq(EscapeSequence), LineEsc(LineTerminatorSeq), } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct SingleString(Span, Vec); impl Lex for SingleString { @@ -101,7 +107,7 @@ impl Lex for SingleString { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct DoubleString(Span, Vec); impl Lex for DoubleString { diff --git a/src/lex/tokens.rs b/src/lex/tokens.rs index 4996599..44c047c 100644 --- a/src/lex/tokens.rs +++ b/src/lex/tokens.rs @@ -1,7 +1,10 @@ use avjason_macros::{Lex, Spanned}; use finl_unicode::categories::{CharacterCategories, MinorCategory}; -use crate::utils::{SourceIter, Span, TryIntoSpan}; +use crate::{ + syntax::Parse, + utils::{SourceFile, SourceIter, Span, TryIntoSpan, Spanned}, +}; use super::{escape::UnicodeEscapeSequence, number::Number, strings::LString, IntoLexResult}; @@ -10,41 +13,195 @@ pub(crate) trait Lex: Sized { fn peek(input: &SourceIter) -> bool; } -#[derive(Debug, Spanned)] +/// +/// Util macro for Syntax parsing. +/// +macro_rules! peek { + ($t: ident, $l: literal, $e: expr) => { + #[allow(non_snake_case)] + #[doc(hidden)] + pub fn $t() -> crate::syntax::utils::Peeker<$t> { + ($e, $e) + } + + impl crate::syntax::Parse for $t { + fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { + let Some(token) = input.next() else { + return input.error().expected(concat!("`", stringify!($l), "`")); + }; + + #[allow(clippy::redundant_closure_call)] + let Some(t) = $e(token) else { + return input.error().expected(concat!("`", stringify!($l), "`")); + }; + + Ok(t) + } + } + }; +} + +macro_rules! 
peek_only { + ($t: ident, $e: expr) => { + #[allow(non_snake_case)] + #[doc(hidden)] + pub fn $t(token: &Token) -> bool { + #[allow(clippy::redundant_closure_call)] + $e(token) + } + }; +} + +#[derive(Debug, Clone, Spanned)] +pub struct True { + span: Span +} + +impl Parse for True { + fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { + let mut f = input.fork(); + let ident: LIdentifier = f.parse()?; + if f.source_text(ident.span()) != "true" { + return input.error() + .expected("`true` here."); + } + + input.advance_to(f); + + Ok(Self{ span: ident.span() }) + } +} + +peek_only!(True, |token: &Token| matches!(token, Token::Identifier(ref ident) if ident.raw_value == "true")); + + +#[derive(Debug, Clone, Spanned)] +pub struct False { + span: Span +} + +impl Parse for False { + fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { + let mut f = input.fork(); + let ident: LIdentifier = f.parse()?; + if f.source_text(ident.span()) != "false" { + return input.error() + .expected("`false` here."); + } + + input.advance_to(f); + + Ok(Self{ span: ident.span() }) + } +} +peek_only!(False, |token: &Token| matches!(token, Token::Identifier(ref ident) if ident.raw_value == "false")); + +#[derive(Debug, Clone, Spanned)] +pub struct Null { + span: Span +} + +impl Parse for Null { + fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { + let mut f = input.fork(); + let ident: LIdentifier = f.parse()?; + if f.source_text(ident.span()) != "null" { + return input.error() + .expected("`null` here."); + } + + input.advance_to(f); + + Ok(Self{ span: ident.span() }) + } +} + +peek_only!(Null, |token: &Token| matches!(token, Token::Identifier(ref ident) if ident.raw_value == "null")); + + +#[derive(Debug, Clone, Spanned)] #[Lex('{')] -pub struct OpenBrace(Span); +pub struct OpenBrace { + span: Span, +} -#[derive(Debug, Spanned)] +peek!(OpenBrace, '{', |token| match token { + 
Token::Punctuator(crate::lex::tokens::Punct::OpenBrace(s)) => Some(s), + _ => None, +}); + +#[derive(Debug, Clone, Spanned)] #[Lex('}')] -pub struct CloseBrace(Span); +pub struct CloseBrace { + span: Span, +} -#[derive(Debug, Spanned)] +peek!(CloseBrace, '}', |token| match token { + Token::Punctuator(crate::lex::tokens::Punct::CloseBrace(s)) => Some(s), + _ => None, +}); + +#[derive(Debug, Clone, Spanned)] #[Lex('[')] -pub struct OpenBracket(Span); +pub struct OpenBracket { + span: Span, +} -#[derive(Debug, Spanned)] +peek!(OpenBracket, '[', |token| match token { + Token::Punctuator(crate::lex::tokens::Punct::OpenBracket(s)) => Some(s), + _ => None, +}); + +#[derive(Debug, Clone, Spanned)] #[Lex(']')] -pub struct CloseBracket(Span); +pub struct CloseBracket { + span: Span, +} -#[derive(Debug, Spanned)] +peek!(CloseBracket, ']', |token| match token { + Token::Punctuator(crate::lex::tokens::Punct::CloseBracket(s)) => Some(s), + _ => None, +}); + +#[derive(Debug, Clone, Spanned)] #[Lex(':')] -pub struct Colon(Span); +pub struct Colon { + span: Span, +} -#[derive(Debug, Spanned)] +peek!(Colon, ':', |token| match token { + Token::Punctuator(crate::lex::tokens::Punct::Colon(s)) => Some(s), + _ => None, +}); + +#[derive(Debug, Clone, Spanned)] #[Lex(',')] -pub struct Comma(Span); +pub struct Comma { + span: Span, +} -#[derive(Debug, Spanned)] +peek!(Comma, ',', |token| match token { + Token::Punctuator(crate::lex::tokens::Punct::Comma(s)) => Some(s), + _ => None, +}); + +#[derive(Debug, Clone, Spanned)] #[Lex('-')] -pub struct Minus(Span); +pub struct Minus { + span: Span, +} -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex('+')] -pub struct Plus(Span); +pub struct Plus { + span: Span, +} -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex('.')] -pub struct Dot(Span); +pub struct Dot { + span: Span, +} #[macro_export] macro_rules! Token { @@ -90,9 +247,18 @@ macro_rules! Token { [.] 
=> { $crate::lex::tokens::Dot }; + [false] => { + $crate::lex::tokens::False + }; + [true] => { + $crate::lex::tokens::True + }; + [null] => { + $crate::lex::tokens::Null + }; } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] #[Lex] pub enum Punct { OpenBrace(OpenBrace), @@ -103,7 +269,7 @@ pub enum Punct { Comma(Comma), } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct WhiteSpace(Span); impl WhiteSpace { @@ -174,7 +340,7 @@ impl Lex for LineTerminator { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Clone, Spanned)] pub struct LineTerminatorSeq(Span); impl Lex for LineTerminatorSeq { @@ -285,8 +451,30 @@ pub enum InputElement { /// /// Compliant with [ECMAScript specification for `IdentifierName`](https://262.ecma-international.org/5.1/#sec-7.6). /// -#[derive(Debug, Spanned)] -pub struct LIdentifier(Span); +#[derive(Debug, Spanned, Clone)] +pub struct LIdentifier { + span: Span, + raw_value: String, +} + +impl LIdentifier { + pub(crate) fn value(&self, file: &SourceFile) -> String { + todo!() + } + + pub(crate) fn peek_token(token: &Token) -> bool { + matches!(token, Token::Identifier(i)) + } +} + +impl Parse for LIdentifier { + fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { + match input.next() { + Some(Token::Identifier(ident)) => Ok(ident), + _ => input.error().expected("identifier"), + } + } +} impl LIdentifier { fn is_unicode_letter(ch: &char) -> bool { @@ -360,7 +548,11 @@ impl Lex for LIdentifier { end = input.next().unwrap().0; } - Some(Self(TryIntoSpan::try_into_span(start..=end)?)) + let span = TryIntoSpan::try_into_span(start..=end)?; + Some(Self { + span, + raw_value: input.source_at(span) + }) } fn peek(input: &SourceIter) -> bool { @@ -368,7 +560,7 @@ impl Lex for LIdentifier { } } -#[derive(Debug, Spanned)] +#[derive(Debug, Spanned, Clone)] pub enum Token { Identifier(LIdentifier), Punctuator(Punct), @@ -404,10 +596,7 @@ impl Lex for Token { #[cfg(test)] mod tests { - use 
crate::{ - lex::IntoLexResult, - utils::SourceFile, - }; + use crate::{lex::IntoLexResult, utils::SourceFile}; use super::{InputElement, Lex}; diff --git a/src/lib.rs b/src/lib.rs index 09cda01..0fb8ea7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,8 @@ //! //! A parser for [JSON5](https://json5.org/). //! -#![feature(iter_map_windows, specialization)] +#![feature(iter_map_windows, associated_type_defaults, specialization)] pub mod utils; -pub mod lex; \ No newline at end of file +pub mod lex; +pub mod syntax; \ No newline at end of file diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs new file mode 100644 index 0000000..c9118e5 --- /dev/null +++ b/src/syntax/mod.rs @@ -0,0 +1,137 @@ +//! +//! Syntax Grammar. +//! + +pub mod utils; +pub mod value; + +use crate::{ + lex::tokens::Token, + utils::{Loc, SourceFile, Span}, +}; + +use self::utils::Peek; + +#[derive(Debug)] +pub struct ParseError { + near: String, + message: String, +} + +impl std::error::Error for ParseError {} + +impl std::fmt::Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Error occured during parsing:\t{}\n\tAt {}", + self.message, self.near + ) + } +} + +pub type ParserResult = Result; + +#[derive(Debug, Clone)] +pub struct ParseBuffer<'a> { + file: &'a SourceFile, + inner: Vec, + index: usize, +} + +impl<'a> ParseBuffer<'a> { + pub(crate) fn new(file: &'a SourceFile, inner: Vec) -> Self { + Self { + file, + inner, + index: 0, + } + } + + pub(crate) fn fork(&self) -> Self { + self.clone() + } + + pub(crate) fn source_text(&self, span: Span) -> String { + self.file.source_at_span(span).unwrap() + } + + pub(crate) fn upcoming(&self) -> Option<&Token> { + self.inner.get(self.index) + } + + pub(crate) fn peek

(&self, p: P) -> bool + where + P: Peek, + { + self.upcoming().map(|t| p.peek(t)).unwrap_or(false) + } + + pub(crate) fn error(&'a self) -> ParseErrorHelper<'a> { + ParseErrorHelper(self) + } + + pub(crate) fn index_display(&self, loc: impl IntoLoc) -> String { + self.file.file_line_column(&loc.into_loc()).unwrap() + } + + pub(crate) fn cursor(&self) -> usize { + self.index + } + + pub(crate) fn parse(&mut self) -> ParserResult

{ + P::parse(self) + } + + pub(crate) fn advance_to(&mut self, other: Self) { + self.index = other.index; + } +} + +pub(crate) trait IntoLoc { + fn into_loc(self) -> Loc; +} + +impl IntoLoc for Loc { + fn into_loc(self) -> Loc { + self + } +} + +impl> IntoLoc for I { + fn into_loc(self) -> Loc { + Loc { index: self.into() } + } +} + +pub struct ParseErrorHelper<'a>(&'a ParseBuffer<'a>); + +impl<'a> ParseErrorHelper<'a> { + pub(crate) fn unexpected(self, message: impl ToString) -> ParserResult { + Err(ParseError { + near: self.0.index_display(self.0.cursor() - 1), + message: format!("Unexpected {}", message.to_string()), + }) + } + + pub(crate) fn expected(self, message: impl ToString) -> ParserResult { + Err(ParseError { + near: self.0.index_display(self.0.cursor() - 1), + message: format!("Expected {}", message.to_string()), + }) + } +} + +impl<'a> Iterator for ParseBuffer<'a> { + type Item = Token; + + fn next(&mut self) -> Option { + let item = self.inner.get(self.index); + self.index += 1; + item.cloned() + } +} + +pub trait Parse: Sized { + fn parse(input: &mut ParseBuffer) -> ParserResult; +} diff --git a/src/syntax/utils.rs b/src/syntax/utils.rs new file mode 100644 index 0000000..2fac7ca --- /dev/null +++ b/src/syntax/utils.rs @@ -0,0 +1,51 @@ +//! +//! Utilities for parsing tokens. +//! 
+ +use crate::lex::tokens::Token; + +#[allow(private_bounds)] +pub trait Peek: Sealed {} + +#[doc(hidden)] +pub(crate) trait Sealed { + type T; + + fn peek(&self, token: &Token) -> bool; + fn try_from(&self, token: Token) -> Option; +} + +impl Peek for S + where S: Sealed +{} + +pub type Peeker = (fn(&Token) -> Option<&T>, fn(Token) -> Option); + +impl Sealed for F +where + F: Fn() -> Peeker, +{ + type T = T1; + + default fn peek(&self, token: &Token) -> bool { + self().0(token).is_some() + } + + default fn try_from(&self, token: Token) -> Option { + self().1(token) + } +} + +pub enum Unparseable {} + +default impl Sealed for fn(&Token) -> bool { + type T = Unparseable; + + fn peek(&self, token: &Token) -> bool { + self(token) + } + + fn try_from(&self, token: Token) -> Option { + unimplemented!() + } +} diff --git a/src/syntax/value.rs b/src/syntax/value.rs new file mode 100644 index 0000000..c911b12 --- /dev/null +++ b/src/syntax/value.rs @@ -0,0 +1,316 @@ +//! +//! JSON5 Values. +//! 
+ +use avjason_macros::Spanned; + +use crate::{ + lex::{ + number::Number, + strings::LString, + tokens::{False, LIdentifier, Null, Token, True}, + }, + Token, utils::{Spanned, Span, Loc}, +}; + +use super::{Parse, ParseBuffer, ParserResult}; + +#[derive(Debug, Clone, Spanned)] +pub enum Boolean { + True(Token![true]), + False(Token![false]), +} + +impl Boolean { + fn peek(input: &ParseBuffer) -> bool { + input + .upcoming() + .map(|token| True(token) || False(token)) + .unwrap_or_default() + } +} + +impl Parse for Boolean { + fn parse(input: &mut super::ParseBuffer) -> super::ParserResult { + if input.upcoming().map(True).unwrap_or_default() { + return Ok(Self::True(Parse::parse(input)?)); + } + + if input.upcoming().map(False).unwrap_or_default() { + return Ok(Self::False(Parse::parse(input)?)); + } + + input + .error() + .expected("boolean literal `true`, or `false`.") + } +} + +#[derive(Debug, Clone, Spanned)] +pub enum Value { + Null(Token![null]), + Boolean(Boolean), + String(LString), + Number(Number), + Object(Object), + Array(Array), +} + +impl Parse for Value { + fn parse(input: &mut ParseBuffer) -> ParserResult { + let Some(token) = input.upcoming() else { + return input.error().expected("Expected Value here!"); + }; + + if Null(token) { + return Ok(Self::Null(input.parse()?)); + } + + if Boolean::peek(input) { + return Ok(Self::Boolean(input.parse()?)); + } + + if LString::peek_token(token) { + return Ok(Self::String(input.parse()?)); + } + + if Number::peek_token(token) { + return Ok(Self::Number(input.parse()?)); + } + + if Object::peek(input) { + return Ok(Self::Object(input.parse()?)); + } + + if Array::peek(input) { + return Ok(Self::Array(input.parse()?)); + } + + input + .error() + .expected("JSON value (`null`, number, string, boolean, object, or array") + } +} + +#[derive(Debug, Clone)] +pub struct Punctuated { + inner: Vec, + trailing: Option, +} + +impl Spanned for Punctuated + where + El: Spanned, + Punct: Spanned +{ + fn span(&self) -> 
crate::utils::Span { + if self.inner.is_empty() { + return Span::single_char(Loc {index: 0}); + } + + let s = self.inner[0].span(); + let e = if let Some(ref t) = self.trailing { + t.span() + } else if self.inner.len() > 1 { + self.inner.last().unwrap().span() + } else { + s + }; + + s.combine([e]) + } +} + +impl Punctuated +where + El: Parse, + Punct: Parse, +{ + fn parse_until( + input: &mut ParseBuffer, + pred: impl Fn(&ParseBuffer) -> bool, + ) -> ParserResult { + let mut inner: Vec = vec![]; + let mut trailing: Option = None; + + loop { + if pred(input) { + break; + } + + inner.push(El::parse(input)?); + trailing = None; + + if pred(input) { + break; + } + + trailing = Some(Punct::parse(input)?); + } + + Ok(Self { inner, trailing }) + } +} + +#[derive(Debug, Clone)] +pub struct Object { + open: Token!['{'], + members: Punctuated, + close: Token!['}'], +} + +impl Spanned for Object { + fn span(&self) -> Span { + let s = self.open.span(); + let e = self.close.span(); + + s.combine([e]) + } +} + +impl Object { + pub(crate) fn peek(input: &ParseBuffer) -> bool { + input.peek(Token!['{']) + } +} + +impl Parse for Object { + fn parse(input: &mut ParseBuffer) -> ParserResult { + let open = Parse::parse(input)?; + let members = Punctuated::parse_until(input, |input| input.peek(Token!['}']))?; + let close = Parse::parse(input)?; + Ok(Self { + open, + members, + close, + }) + } +} + +#[derive(Debug, Clone)] +pub struct Member { + name: MemberName, + colon: Token![:], + value: Value, +} + +impl Spanned for Member { + fn span(&self) -> Span { + let s = self.name.span(); + let e = self.value.span(); + + s.combine([e]) + } +} + +impl Parse for Member { + fn parse(input: &mut ParseBuffer) -> ParserResult { + Ok(Self { + name: input.parse()?, + colon: input.parse()?, + value: input.parse()?, + }) + } +} + +#[derive(Debug, Clone, Spanned)] +pub enum MemberName { + Identifier(LIdentifier), + String(LString), +} + +impl Parse for LString { + fn parse(input: &mut ParseBuffer) -> 
ParserResult { + match input { + i if i.upcoming().map(LString::peek_token).unwrap_or_default() => { + match i.next().unwrap() { + Token::String(l) => Ok(l), + _ => unreachable!(), + } + } + _ => input.error().expected("string literal"), + } + } +} + +impl Parse for Number { + fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { + let Some(Token::Number(token)) = input.next() else { + return input.error().expected("number literal"); + }; + + Ok(token) + } +} + +impl Parse for MemberName { + fn parse(input: &mut ParseBuffer) -> ParserResult { + if input + .upcoming() + .map(LIdentifier::peek_token) + .unwrap_or_default() + { + return Ok(Self::Identifier(Parse::parse(input)?)); + } + + if input + .upcoming() + .map(LString::peek_token) + .unwrap_or_default() + { + return Ok(Self::String(Parse::parse(input)?)); + } + + input + .error() + .expected("either string literal, or identifier") + } +} + +#[derive(Debug, Clone)] +pub struct Array { + open: Token!['['], + elements: Punctuated, + close: Token![']'], +} + +impl Spanned for Array { + fn span(&self) -> Span { + let s = self.open.span(); + let e = self.close.span(); + + s.combine([e]) + } +} + +impl Array { + pub(crate) fn peek(input: &ParseBuffer) -> bool { + input.peek(Token!['[']) + } +} + +impl Parse for Array { + fn parse(input: &mut ParseBuffer) -> ParserResult { + let open = input.parse()?; + let elements = Punctuated::parse_until(input, |input| input.peek(Token![']']))?; + let close = input.parse()?; + + Ok(Self { + open, + elements, + close, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::utils::SourceFile; + + #[test] + fn parse_value() { + let src = SourceFile::dummy_file("test.0", r#"{"fruits": [{name: "apple", qty: 2}], }"#); + let v = src.parse(); + println!("{v:#?}"); + } +} \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs index fe668d1..dfb4565 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -10,9 +10,16 @@ use std::{ path::{Path, 
PathBuf}, }; +use anyhow::anyhow; pub use span::*; -use crate::lex::{LexError, LexResult}; +use crate::{ + lex::{ + tokens::{InputElement, Lex}, + IntoLexResult, LexError, LexResult, + }, + syntax::{value::Value, ParseBuffer}, +}; #[derive(Debug)] pub struct SourceFile { @@ -124,6 +131,42 @@ impl SourceFile { pub(crate) fn iter(&self) -> SourceIter { SourceIter::new(self) } + + pub(crate) fn lex(&self) -> LexResult> { + let mut v = vec![]; + let iter = &mut self.iter(); + + while !iter.eof() { + match InputElement::lex(iter).into_lex_result() { + Ok(Some(t)) => v.push(t), + Ok(None) => { + return iter.error().expected(Some(0..), "Something..."); + } + Err(err) => { + return Err(err); + } + } + } + + Ok(Some(v)) + } + + pub(crate) fn parse(&self) -> Result { + let Some(lexxed) = self.lex()? else { + return Err(anyhow!("Empty file!")); + }; + + let tokens = lexxed + .into_iter() + .filter_map(|token| match token { + InputElement::Token(t) => Some(t), + _ => None, + }) + .collect(); + + let buf = &mut ParseBuffer::new(self, tokens); + buf.parse().map_err(Into::into) + } } #[derive(Clone)] @@ -151,6 +194,12 @@ impl<'a> SourceIter<'a> { } } + pub(crate) fn source_at(&self, span: Span) -> String { + (self.inner[span.start.index..span.end.index]) + .iter() + .collect() + } + pub(crate) fn peek(&self) -> Option<&char> { self.inner.get(self.index) } @@ -222,6 +271,10 @@ impl<'a> SourceIter<'a> { pub(crate) fn error(&self) -> SourceErrorHelper { SourceErrorHelper { iter: self } } + + pub(crate) fn eof(&self) -> bool { + self.index >= self.inner.len() + } } pub(crate) struct SourceErrorHelper<'a> { From 11b86916705aeb20ee1e35cf4a6c88033a9b9783 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 6 Jan 2024 23:50:40 +0000 Subject: [PATCH 10/39] Blank start. 
--- src/lex/escape.rs | 168 ------- src/lex/mod.rs | 78 +-- src/lex/number.rs | 1111 ------------------------------------------- src/lex/strings.rs | 196 -------- src/lex/tokens.rs | 622 ------------------------ src/syntax/mod.rs | 137 ------ src/syntax/utils.rs | 51 -- src/syntax/value.rs | 316 ------------ src/utils/mod.rs | 151 ------ 9 files changed, 2 insertions(+), 2828 deletions(-) delete mode 100644 src/lex/escape.rs delete mode 100644 src/lex/number.rs delete mode 100644 src/lex/strings.rs delete mode 100644 src/lex/tokens.rs delete mode 100644 src/syntax/utils.rs delete mode 100644 src/syntax/value.rs diff --git a/src/lex/escape.rs b/src/lex/escape.rs deleted file mode 100644 index e0e8376..0000000 --- a/src/lex/escape.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! -//! Escape sequences. -//! - -use avjason_macros::{Lex, Spanned}; - -use crate::utils::{SourceIter, Span, TryIntoSpan}; -use crate::lex::IntoLexResult; - -use super::tokens::{Lex, LineTerminator}; - -#[inline] -pub fn is_hex_digit(ch: &char) -> bool { - ch.is_ascii_hexdigit() -} - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum EscapeSequence { - Unicode(UnicodeEscapeSequence), - Hex(HexEscapeSequence), - Null(NullEscapeSequence), - Character(CharacterEscapeSequence), -} - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum CharacterEscapeSequence { - Single(SingleEscapeCharacter), - NonEscape(NonEscapeCharacter), -} - -#[derive(Debug, Clone, Spanned)] -pub struct SingleEscapeCharacter(Span); - -impl Lex for SingleEscapeCharacter { - fn lex(input: &mut SourceIter) -> Option { - if !Self::peek(input) { - return None; - } - - let loc = input.next()?.0; - Some(Self(Span::single_char(loc))) - } - - fn peek(input: &SourceIter) -> bool { - matches!( - input.peek(), - Some(&'\'' | &'"' | &'\\' | &'b' | &'f' | &'n' | &'r' | &'t' | &'v') - ) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct NonEscapeCharacter(Span); - -struct EscapeCharacter; - -impl Lex for EscapeCharacter { - fn lex(_: &mut 
SourceIter) -> Option { - unimplemented!() - } - - fn peek(input: &SourceIter) -> bool { - let Some(ch) = input.peek() else { - return false; - }; - - SingleEscapeCharacter::peek(input) - || ch.is_ascii_digit() // DecimalDigit - || ch == &'x' - || ch == &'u' - } -} - -impl Lex for NonEscapeCharacter { - fn lex(input: &mut SourceIter) -> Option { - if !Self::peek(input) { - return None; - } - - let loc = input.next()?.0; - Some(Self(Span::single_char(loc))) - } - - fn peek(input: &SourceIter) -> bool { - !(EscapeCharacter::peek(input) || LineTerminator::peek(input)) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct NullEscapeSequence(Span); - -impl Lex for NullEscapeSequence { - fn lex(input: &mut SourceIter) -> Option { - if !Self::peek(input) { - return None; - } - - let loc = input.next()?.0; - Some(Self(Span::single_char(loc))) - } - - fn peek(input: &SourceIter) -> bool { - // with lookahead: not DecimalDigit. - input.peek() == Some(&'0') && !input.peek2().map(char::is_ascii_digit).unwrap_or(false) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct HexEscapeSequence(Span); - -impl Lex for HexEscapeSequence { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return Ok(None); - } - - let start = input.next().unwrap().0; - - let mut end = start; - - for _ in 0..2 { - if input.peek().map(is_hex_digit).unwrap_or(false) { - end = input.next().unwrap().0; - } else { - return input.error() - .expected(Some(-1..1), ""); - } - } - - Ok(Some(Self(TryIntoSpan::try_into_span(start..=end).unwrap()))) - } - - fn peek(input: &SourceIter) -> bool { - input.peek() == Some(&'x') && input.relative_match(1..=2, is_hex_digit) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct UnicodeEscapeSequence(Span); - -impl Lex for UnicodeEscapeSequence { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return Ok(None); - } - - let start = input.next().unwrap().0; - - let mut end = start; - - for _ in 0..4 { - 
if is_hex_digit(input.peek().unwrap()) { - end = input.next().unwrap().0; - } else { - return input.error() - .expected(Some(-1..), "") - } - } - - Ok(Some(Self(TryIntoSpan::try_into_span(start..=end).unwrap()))) - } - - fn peek(input: &SourceIter) -> bool { - input.peek() == Some(&'u') && input.relative_match(1..=4, is_hex_digit) - } -} diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 143c80c..0233404 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -1,78 +1,4 @@ //! -//! Lexxing utilities. -//! - -use std::ops::RangeBounds; - -use crate::utils::{Span, TryIntoSpan}; - -pub mod escape; -pub mod strings; -pub mod tokens; -pub mod number; - -#[derive(Debug)] -pub struct LexError { - span: Span, - message: String, - text: Option, -} - -impl LexError { - pub(crate) fn new>( - span: B, - message: impl ToString, - text: impl Into>, - ) -> Self { - let span = TryIntoSpan::try_into_span(span).unwrap(); - let message = message.to_string(); - let text = text.into(); - - Self { - span, - message, - text, - } - } -} - -impl std::error::Error for LexError {} - -impl std::fmt::Display for LexError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Error occured during lexxing:\t{}\n\tNear `{}`", self.message, self.text.as_ref().unwrap_or(&String::default())) - } -} - -/// -/// Utility for Lexer erorrs, -/// -pub type LexResult = Result, LexError>; - -pub trait IntoLexResult: Sized { - fn into_lex_result(self) -> LexResult; -} - -default impl IntoLexResult for T { - fn into_lex_result(self) -> LexResult { - Ok(Some(self)) - } -} - -impl IntoLexResult for Option { - fn into_lex_result(self) -> LexResult { - Ok(self) - } -} - -impl IntoLexResult for LexResult { - fn into_lex_result(self) -> LexResult { - self - } -} +//! Lexxing +//! 
-impl IntoLexResult for Result { - fn into_lex_result(self) -> LexResult { - self.map(Option::Some) - } -} diff --git a/src/lex/number.rs b/src/lex/number.rs deleted file mode 100644 index 23775db..0000000 --- a/src/lex/number.rs +++ /dev/null @@ -1,1111 +0,0 @@ -//! -//! Number. -//! - -use std::iter::once; -use std::ops::RangeBounds; - -use avjason_macros::{Lex, Spanned}; - -use super::tokens::{Dot, LIdentifier, Lex, Minus, Plus, Token}; -use super::{IntoLexResult, LexResult}; - -use crate::lex::escape::is_hex_digit; -use crate::syntax::Parse; -use crate::utils::{SourceIter, Span, Spanned, TryIntoSpan}; -use crate::Token; - -/// -/// **JSON5Number**. -/// -/// --- -/// See [the JSON5 specification](https://spec.json5.org/#prod-JSON5Number). -/// -#[derive(Debug, Clone)] -pub struct Number(Option, Numeric); - -impl Spanned for Number { - fn span(&self) -> Span { - if let Some(ref sign) = self.0 { - sign.span().combine([self.1.span()]) - } else { - self.1.span() - } - } -} - -impl Lex for Number { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return Ok(None); - } - - let sign = if Sign::peek(input) { - Sign::lex(input).into_lex_result().unwrap() - } else { - None - }; - - let Ok(Some(numeric)) = Numeric::lex(input).into_lex_result() else { - return input.error().expected(Some(-1..0), ""); - }; - - Ok(Some(Self(sign, numeric))) - } - - fn peek(input: &SourceIter) -> bool { - Sign::peek(input) || Numeric::peek(input) - } -} - -impl Number { - pub(crate) fn peek_token(token: &Token) -> bool { - matches!(token, Token::Number(_)) - } -} - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum Sign { - Positive(Plus), - Negative(Minus), -} - -trait Keyword: Sized { - const TOKEN: &'static str; - - fn new(sp: impl RangeBounds) -> Self; -} - -impl Lex for K { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return Ok(None); - } - - let start = input.next().unwrap().0; - let end = start + 
Self::TOKEN.len(); - input.offset(Self::TOKEN.len() + 1); - - Ok(Some(Self::new(start..end))) - } - - fn peek(input: &SourceIter) -> bool { - input - .ahead(..Self::TOKEN.len()) - .map(|ref s| s == Self::TOKEN) - .unwrap_or(false) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct Infinity(Span); - -impl Keyword for Infinity { - const TOKEN: &'static str = "Infinity"; - - fn new(sp: impl RangeBounds) -> Self { - Self(TryIntoSpan::try_into_span(sp).unwrap()) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct NaN(Span); - -impl Keyword for NaN { - const TOKEN: &'static str = "NaN"; - - fn new(sp: impl RangeBounds) -> Self { - Self(TryIntoSpan::try_into_span(sp).unwrap()) - } -} - -/// -/// **JSON5NumericLiteral** -/// -/// --- -/// -/// See [the JSON5 specification](https://spec.json5.org/#prod-JSON5NumericLiteral). -/// -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum Numeric { - Infinity(Infinity), - NaN(NaN), - Lit(NumericLiteral), -} - -/// -/// ECMAScript **NumericLiteral** -/// -/// --- -/// -/// See the [ECMAScript specification](https://262.ecma-international.org/5.1/#sec-7.8.3). -/// -#[derive(Debug, Clone, Spanned)] -pub enum NumericLiteral { - Decimal(DecimalLiteral), - Hex(HexIntegerLiteral), -} - -impl NumericLiteral { - /// - /// From [ECMAScript standard](https://262.ecma-international.org/5.1/#sec-7.8.3) - /// > NOTE: The source character immediately following a [NumericLiteral] must not be an *IdentifierStart* or *DecimalDigit*. 
- /// - fn after_check(input: &SourceIter) -> bool { - !(LIdentifier::is_identifier_start(input) - || input.peek().map(char::is_ascii_digit).unwrap_or(false)) - } -} - -impl Lex for NumericLiteral { - fn lex(mut input: &mut SourceIter) -> impl IntoLexResult { - let res: LexResult = match input { - ref mut input if HexIntegerLiteral::peek(input) => Ok(Some(Self::Hex( - HexIntegerLiteral::lex(input) - .into_lex_result() - .unwrap() - .unwrap(), - ))), - ref mut input if DecimalLiteral::peek(input) => Ok(Some(Self::Decimal( - DecimalLiteral::lex(input) - .into_lex_result() - .unwrap() - .unwrap(), - ))), - _ => Ok(None), - }; - - if !Self::after_check(input) { - return input - .error() - .unexpected(Some(-1..0), ""); - } - - res - } - - fn peek(input: &SourceIter) -> bool { - DecimalLiteral::peek(input) || HexIntegerLiteral::peek(input) - } -} - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum DecimalLiteral { - IntegralDecimalMantissa(IntegralDecimalMantissa), - DecimalMantissa(DecimalMantissa), - Integer(Integer), -} - -#[derive(Debug, Clone)] -pub struct IntegralDecimalMantissa( - DecimalIntegerLiteral, - Token![.], - Option, - Option, -); - -impl Spanned for IntegralDecimalMantissa { - fn span(&self) -> Span { - self.0.span().combine( - self.2 - .as_ref() - .map(|s| s.span()) - .into_iter() - .chain(self.3.as_ref().map(|s| s.span())), - ) - } -} - -impl Lex for IntegralDecimalMantissa { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return Ok(None); - } - - let i = DecimalIntegerLiteral::lex(input) - .into_lex_result() - .unwrap() - .unwrap(); - - let Ok(Some(d)) = Dot::lex(input).into_lex_result() else { - return input.error().expected(Some(-1..1), "."); - }; - - let m = if DecimalDigits::peek(input) { - DecimalDigits::lex(input).into_lex_result().unwrap() - } else { - None - }; - - let exp = if ExponentPart::peek(input) { - ExponentPart::lex(input).into_lex_result().unwrap() - } else { - None - }; - - Ok(Some(Self(i, d, 
m, exp))) - } - - fn peek(input: &SourceIter) -> bool { - if DecimalIntegerLiteral::peek(input) { - let mut fork = input.fork(); - let _ = DecimalIntegerLiteral::lex(&mut fork) - .into_lex_result() - .unwrap() - .unwrap(); - - return Dot::peek(&fork); - } - - false - } -} - -#[derive(Debug, Clone)] -pub struct DecimalMantissa(Token![.], DecimalDigits, Option); - -impl Spanned for DecimalMantissa { - fn span(&self) -> Span { - let s = self.0.span(); - - if let Some(ref exp) = self.2 { - s.combine([exp.span()]) - } else { - s.combine([self.1.span()]) - } - } -} - -impl Lex for DecimalMantissa { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return Ok(None); - } - - let d = Dot::lex(input).into_lex_result().unwrap().unwrap(); - - let Ok(Some(ds)) = DecimalDigits::lex(input).into_lex_result() else { - return input - .error() - .expected(Some(-1..0), ""); - }; - - let exp = if ExponentPart::peek(input) { - ExponentPart::lex(input).into_lex_result().unwrap() - } else { - None - }; - - Ok(Some(Self(d, ds, exp))) - } - - fn peek(input: &SourceIter) -> bool { - Dot::peek(input) - } -} - -#[derive(Debug, Clone)] -pub struct Integer(DecimalIntegerLiteral, Option); - -impl Spanned for Integer { - fn span(&self) -> Span { - self.0.span().combine(self.1.as_ref().map(Spanned::span)) - } -} - -impl Lex for Integer { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return None; - } - - let int = DecimalIntegerLiteral::lex(input) - .into_lex_result() - .unwrap()?; - - let exp = if ExponentPart::peek(input) { - ExponentPart::lex(input).into_lex_result().unwrap() - } else { - None - }; - - Some(Self(int, exp)) - } - - fn peek(input: &SourceIter) -> bool { - DecimalIntegerLiteral::peek(input) - } -} - -#[derive(Debug, Clone)] -pub enum DecimalIntegerLiteral { - Zero(Zero), - NonZero(NonZero, Option), -} - -impl Spanned for DecimalIntegerLiteral { - fn span(&self) -> Span { - match self { - 
DecimalIntegerLiteral::Zero(z) => z.span(), - DecimalIntegerLiteral::NonZero(a, b) => a.span().combine(b.as_ref().map(Spanned::span)), - } - } -} - -impl Lex for DecimalIntegerLiteral { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if Zero::peek(input) { - return Some(Self::Zero(Zero::lex(input).into_lex_result().unwrap()?)); - } - if NonZero::peek(input) { - let s = NonZero::lex(input).into_lex_result().unwrap()?; - let after = if DecimalDigits::peek(input) { - DecimalDigits::lex(input).into_lex_result().unwrap() - } else { - None - }; - - return Some(Self::NonZero(s, after)); - } - - None - } - - fn peek(input: &SourceIter) -> bool { - Zero::peek(input) || NonZero::peek(input) - } -} - -#[derive(Debug, Clone, Spanned)] -#[Lex('0')] -pub struct Zero { - span: Span -} - -#[derive(Debug, Clone, Spanned)] -pub struct NonZero(Span); - -impl Lex for NonZero { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return None; - } - - Some(Self(Span::single_char(input.next()?.0))) - } - - fn peek(input: &SourceIter) -> bool { - input - .peek() - .map(|d| matches!(d, '1'..='9')) - .unwrap_or(false) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct DecimalDigits(Span); - -impl Lex for DecimalDigits { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return None; - } - - let start = input.next()?.0; - let mut end = start; - - loop { - if !Self::peek(input) { - break; - } - - end = input.next().unwrap().0; - } - - Some(Self(TryIntoSpan::try_into_span(start..=end).unwrap())) - } - - fn peek(input: &SourceIter) -> bool { - input.peek().map(|d| d.is_ascii_digit()).unwrap_or(false) - } -} - -#[derive(Debug, Clone)] -pub struct ExponentPart(ExponentIdicator, SignedInteger); - -impl Spanned for ExponentPart { - fn span(&self) -> Span { - self.0.span().combine([self.1.span()]) - } -} - -impl Lex for ExponentPart { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - 
return Ok(None); - } - - let e_token = ExponentIdicator::lex(input) - .into_lex_result() - .unwrap() - .unwrap(); - - let Ok(Some(int)) = SignedInteger::lex(input).into_lex_result() else { - return input - .error() - .expected(Some(-2..0), "Signed integer (e.g. +1, -2, 4)"); - }; - - Ok(Some(Self(e_token, int))) - } - - fn peek(input: &SourceIter) -> bool { - ExponentIdicator::peek(input) - } -} - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum ExponentIdicator { - Uppercase(E), - Lowercase(e), -} - -#[derive(Debug, Clone, Spanned)] -#[Lex('E')] -pub struct E { - span: Span -} - -#[derive(Debug, Clone, Spanned)] -#[Lex('e')] -#[allow(non_camel_case_types)] -pub struct e { - span: Span -} - -#[derive(Debug, Clone)] -pub enum SignedInteger { - None(DecimalDigits), - Positive(Token![+], DecimalDigits), - Negative(Token![-], DecimalDigits), -} - -impl Spanned for SignedInteger { - fn span(&self) -> Span { - match self { - SignedInteger::None(d) => d.span(), - SignedInteger::Positive(s, d) => s.span().combine([d.span()]), - SignedInteger::Negative(s, d) => s.span().combine([d.span()]), - } - } -} - -impl Lex for SignedInteger { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if Plus::peek(input) { - return Some(Self::Positive( - Plus::lex(input).into_lex_result().unwrap()?, - DecimalDigits::lex(input).into_lex_result().unwrap()?, - )); - } - - if Minus::peek(input) { - return Some(Self::Negative( - Minus::lex(input).into_lex_result().unwrap()?, - DecimalDigits::lex(input).into_lex_result().unwrap()?, - )); - } - - if DecimalDigits::peek(input) { - return Some(Self::None( - DecimalDigits::lex(input).into_lex_result().unwrap()?, - )); - } - - None - } - - fn peek(input: &SourceIter) -> bool { - ::peek(input) - || ::peek(input) - || ::peek(input) - } -} - -#[derive(Debug, Clone)] -pub struct HexIntegerLiteral(HexPrefix, HexDigit, Vec); - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum HexPrefix { - Lowercase(LowercaseHexPrefix), - 
Uppercase(UppercaseHexPrefix), -} - -impl Spanned for HexIntegerLiteral { - fn span(&self) -> Span { - self.0 - .span() - .combine(once(self.1.span()).chain(self.2.iter().map(Spanned::span))) - } -} - -impl Lex for HexIntegerLiteral { - fn lex(mut input: &mut SourceIter) -> impl IntoLexResult { - let p = match input { - ref mut i if HexPrefix::peek(i) => { - HexPrefix::lex(i).into_lex_result().unwrap().unwrap() - } - _ => return Ok(None), - }; - - let Ok(Some(d)) = HexDigit::lex(input).into_lex_result() else { - return input.error().expected(Some(-1..0), ""); - }; - - let mut ds = vec![]; - - while let Some(ch) = input.peek() { - if is_hex_digit(ch) { - ds.push(HexDigit::lex(input).into_lex_result().unwrap().unwrap()); - } else { - break; - } - } - - Ok(Some(Self(p, d, ds))) - } - - fn peek(input: &SourceIter) -> bool { - LowercaseHexPrefix::peek(input) || UppercaseHexPrefix::peek(input) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct LowercaseHexPrefix(Span); - -impl Lex for LowercaseHexPrefix { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return None; - } - - let start = input.next().unwrap().0; - input.offset(1); - - Some(Self( - TryIntoSpan::try_into_span(start..=(start + 1)).unwrap(), - )) - } - - fn peek(input: &SourceIter) -> bool { - input.ahead(0..2).map(|s| s == "0x").unwrap_or(false) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct UppercaseHexPrefix(Span); - -impl Lex for UppercaseHexPrefix { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if !Self::peek(input) { - return None; - } - - let start = input.next().unwrap().0; - input.offset(1); - - Some(Self( - TryIntoSpan::try_into_span(start..=(start + 1)).unwrap(), - )) - } - - fn peek(input: &SourceIter) -> bool { - input.ahead(0..2).map(|s| s == "0X").unwrap_or(false) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct HexDigit(Span); - -impl Lex for HexDigit { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if 
!Self::peek(input) { - return None; - } - - Some(Self(Span::single_char(input.next().unwrap().0))) - } - - fn peek(input: &SourceIter) -> bool { - matches!(input.peek(), Some(a) if is_hex_digit(a)) - } -} - -#[cfg(test)] -mod tests { - - use crate::{ - lex::{ - number::{ - DecimalLiteral, DecimalMantissa, HexIntegerLiteral, Integer, - IntegralDecimalMantissa, Number, Numeric, NumericLiteral, - }, - tokens::Lex, - IntoLexResult, LexResult, - }, - utils::SourceFile, - }; - - use super::{ExponentIdicator, ExponentPart, HexPrefix, Sign, SignedInteger}; - - fn test_lex(s: impl ToString, src: &str) -> LexResult { - let src = SourceFile::dummy_file(format!("test.{}", s.to_string()), src); - let iter = &mut src.iter(); - T::lex(iter).into_lex_result() - } - - macro_rules! dot_man_exp { - ($m: pat, $e: pat) => { - Ok(Some(Number( - None, - Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::DecimalMantissa( - DecimalMantissa(_, $m, $e), - ))), - ))) - }; - ($s: pat, $m: pat, $e: pat) => { - Ok(Some(Number( - $s, - Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::DecimalMantissa( - DecimalMantissa(_, $m, $e), - ))), - ))) - }; - } - - macro_rules! int_exp { - ($m: pat, $e: pat) => { - Ok(Some(Number( - None, - Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::Integer(Integer( - $m, $e, - )))), - ))) - }; - ($s: pat, $m: pat, $e: pat) => { - Ok(Some(Number( - $s, - Numeric::Lit(NumericLiteral::Decimal(DecimalLiteral::Integer(Integer( - $m, $e, - )))), - ))) - }; - } - - macro_rules! hex_int { - ($c: pat, $d: pat, $ds: pat) => { - Ok(Some(Number( - None, - Numeric::Lit(NumericLiteral::Hex(HexIntegerLiteral($c, $d, $ds))), - ))) - }; - ($s: pat, $c: pat, $d: pat, $ds: pat) => { - Ok(Some(Number( - $s, - Numeric::Lit(NumericLiteral::Hex(HexIntegerLiteral($c, $d, $ds))), - ))) - }; - } - - macro_rules! 
int_dot_man_exp { - ($m: pat, $n: pat) => { - Ok(Some(Number( - None, - Numeric::Lit(NumericLiteral::Decimal( - DecimalLiteral::IntegralDecimalMantissa(IntegralDecimalMantissa(_, _, $m, $n)), - )), - ))) - }; - ($s: pat, $m: pat, $n: pat) => { - Ok(Some(Number( - $s, - Numeric::Lit(NumericLiteral::Decimal( - DecimalLiteral::IntegralDecimalMantissa(IntegralDecimalMantissa(_, _, $m, $n)), - )), - ))) - }; - } - - macro_rules! test_lex { - ($s: expr, $p: pat) => {{ - let tmp = test_lex::(0, $s); - if !matches!(tmp, $p) { - panic!("{tmp:?}"); - } - }}; - } - - #[test] - fn no_sign() { - assert!(!matches!(test_lex::(0, "02."), Ok(Some(_)))); - - test_lex!("1.", int_dot_man_exp!(None, None)); - test_lex!("123.", int_dot_man_exp!(None, None)); - test_lex!("1.2", int_dot_man_exp!(Some(_), None)); - test_lex!("13.2", int_dot_man_exp!(Some(_), None)); - test_lex!("1.e-5", int_dot_man_exp!(None, Some(_))); - test_lex!("134.2e-5", int_dot_man_exp!(Some(_), Some(_))); - - test_lex!(".1234", dot_man_exp!(_, None)); - test_lex!(".1234e-5", dot_man_exp!(_, Some(_))); - - test_lex!("1234", int_exp!(_, None)); - - test_lex!( - "467832674328438e2", - int_exp!( - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::None(_) - )) - ) - ); - test_lex!( - "467832674328438E2", - int_exp!( - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::None(_) - )) - ) - ); - test_lex!( - "467832674328438e+2", - int_exp!( - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::Positive(_, _) - )) - ) - ); - test_lex!( - "467832674328438E+2", - int_exp!( - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::Positive(_, _) - )) - ) - ); - test_lex!( - "467832674328438e-2", - int_exp!( - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::Negative(_, _) - )) - ) - ); - test_lex!( - "467832674328438E-2", - int_exp!( - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - 
SignedInteger::Negative(_, _) - )) - ) - ); - - test_lex!("0x6432ABA3", hex_int!(HexPrefix::Lowercase(_), _, _)); - test_lex!("0x6432aba3", hex_int!(HexPrefix::Lowercase(_), _, _)); - test_lex!("0X6432ABA3", hex_int!(HexPrefix::Uppercase(_), _, _)); - test_lex!("0X6432ABA3", hex_int!(HexPrefix::Uppercase(_), _, _)); - } - - #[test] - fn positive() { - test_lex!("+1.", int_dot_man_exp!(Some(Sign::Positive(_)), None, None)); - test_lex!( - "+123.", - int_dot_man_exp!(Some(Sign::Positive(_)), None, None) - ); - test_lex!( - "+1.2", - int_dot_man_exp!(Some(Sign::Positive(_)), Some(_), None) - ); - test_lex!( - "+13.2", - int_dot_man_exp!(Some(Sign::Positive(_)), Some(_), None) - ); - test_lex!( - "+1.e-5", - int_dot_man_exp!(Some(Sign::Positive(_)), None, Some(_)) - ); - test_lex!( - "+134.2e-5", - int_dot_man_exp!(Some(Sign::Positive(_)), Some(_), Some(_)) - ); - - test_lex!("+.1234", dot_man_exp!(Some(Sign::Positive(_)), _, None)); - test_lex!( - "+.1234e-5", - dot_man_exp!(Some(Sign::Positive(_)), _, Some(_)) - ); - - test_lex!("+1234", int_exp!(Some(Sign::Positive(_)), _, None)); - - test_lex!( - "+467832674328438e2", - int_exp!( - Some(Sign::Positive(_)), - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::None(_) - )) - ) - ); - test_lex!( - "+467832674328438E2", - int_exp!( - Some(Sign::Positive(_)), - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::None(_) - )) - ) - ); - test_lex!( - "+467832674328438e+2", - int_exp!( - Some(Sign::Positive(_)), - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::Positive(_, _) - )) - ) - ); - test_lex!( - "+467832674328438E+2", - int_exp!( - Some(Sign::Positive(_)), - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::Positive(_, _) - )) - ) - ); - test_lex!( - "+467832674328438e-2", - int_exp!( - Some(Sign::Positive(_)), - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::Negative(_, _) - )) - ) - ); - 
test_lex!( - "+467832674328438E-2", - int_exp!( - Some(Sign::Positive(_)), - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::Negative(_, _) - )) - ) - ); - - test_lex!( - "+0x6432ABA3", - hex_int!(Some(Sign::Positive(_)), HexPrefix::Lowercase(_), _, _) - ); - test_lex!( - "+0x6432aba3", - hex_int!(Some(Sign::Positive(_)), HexPrefix::Lowercase(_), _, _) - ); - test_lex!( - "+0X6432ABA3", - hex_int!(Some(Sign::Positive(_)), HexPrefix::Uppercase(_), _, _) - ); - test_lex!( - "+0X6432ABA3", - hex_int!(Some(Sign::Positive(_)), HexPrefix::Uppercase(_), _, _) - ); - } - - #[test] - fn negative() { - test_lex!("-1.", int_dot_man_exp!(Some(Sign::Negative(_)), None, None)); - test_lex!( - "-123.", - int_dot_man_exp!(Some(Sign::Negative(_)), None, None) - ); - test_lex!( - "-1.2", - int_dot_man_exp!(Some(Sign::Negative(_)), Some(_), None) - ); - test_lex!( - "-13.2", - int_dot_man_exp!(Some(Sign::Negative(_)), Some(_), None) - ); - test_lex!( - "-1.e-5", - int_dot_man_exp!(Some(Sign::Negative(_)), None, Some(_)) - ); - test_lex!( - "-134.2e-5", - int_dot_man_exp!(Some(Sign::Negative(_)), Some(_), Some(_)) - ); - - test_lex!("-.1234", dot_man_exp!(Some(Sign::Negative(_)), _, None)); - test_lex!( - "-.1234e-5", - dot_man_exp!(Some(Sign::Negative(_)), _, Some(_)) - ); - - test_lex!("-1234", int_exp!(Some(Sign::Negative(_)), _, None)); - - test_lex!( - "-467832674328438e2", - int_exp!( - Some(Sign::Negative(_)), - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::None(_) - )) - ) - ); - test_lex!( - "-467832674328438E2", - int_exp!( - Some(Sign::Negative(_)), - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::None(_) - )) - ) - ); - test_lex!( - "-467832674328438e+2", - int_exp!( - Some(Sign::Negative(_)), - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::Positive(_, _) - )) - ) - ); - test_lex!( - "-467832674328438E+2", - int_exp!( - Some(Sign::Negative(_)), - _, - 
Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::Positive(_, _) - )) - ) - ); - test_lex!( - "-467832674328438e-2", - int_exp!( - Some(Sign::Negative(_)), - _, - Some(ExponentPart( - ExponentIdicator::Lowercase(_), - SignedInteger::Negative(_, _) - )) - ) - ); - test_lex!( - "-467832674328438E-2", - int_exp!( - Some(Sign::Negative(_)), - _, - Some(ExponentPart( - ExponentIdicator::Uppercase(_), - SignedInteger::Negative(_, _) - )) - ) - ); - - test_lex!( - "-0x6432ABA3", - hex_int!(Some(Sign::Negative(_)), HexPrefix::Lowercase(_), _, _) - ); - test_lex!( - "-0x6432aba3", - hex_int!(Some(Sign::Negative(_)), HexPrefix::Lowercase(_), _, _) - ); - test_lex!( - "-0X6432ABA3", - hex_int!(Some(Sign::Negative(_)), HexPrefix::Uppercase(_), _, _) - ); - test_lex!( - "-0X6432ABA3", - hex_int!(Some(Sign::Negative(_)), HexPrefix::Uppercase(_), _, _) - ); - } - - #[test] - fn idents() { - assert!(matches!( - test_lex::(0, "Infinity"), - Ok(Some(Number(None, Numeric::Infinity(_)))) - )); - assert!(matches!( - test_lex::(0, "+Infinity"), - Ok(Some(Number(Some(Sign::Positive(_)), Numeric::Infinity(_)))) - )); - assert!(matches!( - test_lex::(0, "-Infinity"), - Ok(Some(Number(Some(Sign::Negative(_)), Numeric::Infinity(_)))) - )); - - assert!(test_lex::(0, "-Ifty").is_err()); - assert!(test_lex::(0, "+Inf").is_err()); - assert!(matches!(test_lex::(0, "Infinty"), Ok(None))); - assert!(matches!( - test_lex::(0, "Idfhfdsbhjfdsvbaysj"), - Ok(None) - )); - - assert!(matches!( - test_lex::(0, "NaN"), - Ok(Some(Number(None, Numeric::NaN(_)))) - )); - assert!(matches!( - test_lex::(0, "+NaN"), - Ok(Some(Number(Some(Sign::Positive(_)), Numeric::NaN(_)))) - )); - assert!(matches!( - test_lex::(0, "-NaN"), - Ok(Some(Number(Some(Sign::Negative(_)), Numeric::NaN(_)))) - )); - - assert!(test_lex::(0, "-NAN").is_err()); - assert!(matches!(test_lex::(0, "nAN"), Ok(None))); - assert!(test_lex::(0, "+nAn").is_err()); - assert!(test_lex::(0, "-NAn").is_err()); - } -} diff --git 
a/src/lex/strings.rs b/src/lex/strings.rs deleted file mode 100644 index eba1d0f..0000000 --- a/src/lex/strings.rs +++ /dev/null @@ -1,196 +0,0 @@ -//! -//! String literals. -//! - -use avjason_macros::{Lex, Spanned}; - -use crate::{ - lex::tokens::{LineTerminator, LineTerminatorSeq}, - utils::{SourceIter, Span, TryIntoSpan}, -}; - -use super::{escape::EscapeSequence, tokens::{Lex, Token}, IntoLexResult, LexError, LexResult}; - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum LString { - Single(SingleString), - Double(DoubleString), -} - -impl LString { - pub(crate) fn peek_token(token: &Token) -> bool { - matches!(token, Token::String(s)) - } -} - -fn eat_inner_chars( - input: &mut SourceIter, - delimit: char, -) -> Result>, LexError> { - let mut contents = vec![]; - - while let Some(ch) = input.peek() { - if ch == &delimit { - break; - } - - if LineTerminator::peek(input) { - return input.error().unexpected(Some(0..1), ""); - } - - if ch == &'\\' { - // Escape sequence. - let mut fork = input.fork(); - fork.offset(1); - - if EscapeSequence::peek(&fork) { - contents.push(StrFrag::EscSeq(EscapeSequence::lex(&mut fork).into_lex_result()?.unwrap())); - - input.advance_to(fork); - continue; - } - - if LineTerminatorSeq::peek(&fork) { - contents.push(StrFrag::LineEsc(LineTerminatorSeq::lex(&mut fork).into_lex_result()?.unwrap())); - - input.advance_to(fork); - continue; - } - - return input - .error() - .expected(Some(0..1), "Escaped Newline, or escape sequence"); - } - - let (_, c) = input.next().unwrap(); - contents.push(StrFrag::Char(c)); - } - - Ok(Some(contents)) -} - -#[derive(Debug, Clone)] -pub enum StrFrag { - Char(char), - EscSeq(EscapeSequence), - LineEsc(LineTerminatorSeq), -} - -#[derive(Debug, Clone, Spanned)] -pub struct SingleString(Span, Vec); - -impl Lex for SingleString { - fn lex(input: &mut SourceIter) -> LexResult { - if !Self::peek(input) { - return Ok(None); - } - - let start = input.next().unwrap().0; - - let contents = 
eat_inner_chars(input, '\'')?.unwrap(); - - if input.peek() != Some(&'\'') { - return input.error().expected(Some(0..1), "\'"); - } - - let end = input.next().unwrap().0; - - Ok(Some(Self( - TryIntoSpan::try_into_span(start..=end).unwrap(), - contents, - ))) - } - - fn peek(input: &SourceIter) -> bool { - input.peek() == Some(&'\'') - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct DoubleString(Span, Vec); - -impl Lex for DoubleString { - fn lex(input: &mut SourceIter) -> LexResult { - if !Self::peek(input) { - return Ok(None); - } - - let start = input.next().unwrap().0; - - let contents = eat_inner_chars(input, '\"')?.unwrap(); - - if input.peek() != Some(&'\"') { - return input.error().expected(Some(0..1), "\""); - } - - let end = input.next().unwrap().0; - - Ok(Some(Self( - TryIntoSpan::try_into_span(start..=end).unwrap(), - contents, - ))) - } - - fn peek(input: &SourceIter) -> bool { - input.peek() == Some(&'"') - } -} - -#[cfg(test)] -mod tests { - use crate::{ - lex::{strings::DoubleString, tokens::Lex, IntoLexResult, LexResult}, - utils::SourceFile, - }; - - fn test_lex(s: impl ToString, src: &str) -> LexResult { - let src = SourceFile::dummy_file(format!("test.{}", s.to_string()), src); - let iter = &mut src.iter(); - T::lex(iter).into_lex_result() - } - - #[test] - fn unicode_escape() { - let twice_valid = test_lex::(0, r#""\u1522\u2431""#); - assert!(matches!(twice_valid, Ok(Some(_)))); - let once_valid_once_invalid = test_lex::(1, r#""\u1522\u241""#); - assert!(once_valid_once_invalid.is_err()); - let once_invalid = test_lex::(3, r#""\u1S2Y""#); - assert!(once_invalid.is_err()); - } - - #[test] - fn hex_escape() { - let twice_valid = test_lex::(0, r#""\x0F\xFF""#); - assert!(matches!(twice_valid, Ok(Some(_)))); - let once_valid_once_invalid = test_lex::(0, r#""\x0F\xSF""#); - assert!(once_valid_once_invalid.is_err()); - let once_invalid = test_lex::(0, r#""\xSF""#); - assert!(once_invalid.is_err()); - } - - #[test] - fn single_char() { - let 
escaped = test_lex::(0, r#""\t\r\v\n\"\\""#); - assert!(matches!(escaped, Ok(Some(_)))); - let normal = test_lex::(0, r#""\!\?\:\@\~\#\}\{\(\)\&\$""#); - assert!(matches!(normal, Ok(Some(_)))); - } - - #[test] - fn null_escape() { - let valid = test_lex::(0, r#""\0\0\0\0\0\0\0\0""#); - assert!(matches!(valid, Ok(Some(_)))); - let invalid = test_lex::(0, r#""\00\01\04\06"#); - assert!(invalid.is_err()); - } - - #[test] - fn mixed_escapes() { - let test0 = test_lex::(0, r#""\v\!\%\x00""#); - assert!(matches!(test0, Ok(Some(_)))); - let test1 = test_lex::(1, r#""\v\!\% abhbdasjdas^da'''gadudgasi a@@@~ {} dauasdhi\x00""#); - assert!(matches!(test1, Ok(Some(_)))); - } -} diff --git a/src/lex/tokens.rs b/src/lex/tokens.rs deleted file mode 100644 index 44c047c..0000000 --- a/src/lex/tokens.rs +++ /dev/null @@ -1,622 +0,0 @@ -use avjason_macros::{Lex, Spanned}; -use finl_unicode::categories::{CharacterCategories, MinorCategory}; - -use crate::{ - syntax::Parse, - utils::{SourceFile, SourceIter, Span, TryIntoSpan, Spanned}, -}; - -use super::{escape::UnicodeEscapeSequence, number::Number, strings::LString, IntoLexResult}; - -pub(crate) trait Lex: Sized { - fn lex(input: &mut SourceIter) -> impl IntoLexResult; - fn peek(input: &SourceIter) -> bool; -} - -/// -/// Util macro for Syntax parsing. -/// -macro_rules! peek { - ($t: ident, $l: literal, $e: expr) => { - #[allow(non_snake_case)] - #[doc(hidden)] - pub fn $t() -> crate::syntax::utils::Peeker<$t> { - ($e, $e) - } - - impl crate::syntax::Parse for $t { - fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { - let Some(token) = input.next() else { - return input.error().expected(concat!("`", stringify!($l), "`")); - }; - - #[allow(clippy::redundant_closure_call)] - let Some(t) = $e(token) else { - return input.error().expected(concat!("`", stringify!($l), "`")); - }; - - Ok(t) - } - } - }; -} - -macro_rules! 
peek_only { - ($t: ident, $e: expr) => { - #[allow(non_snake_case)] - #[doc(hidden)] - pub fn $t(token: &Token) -> bool { - #[allow(clippy::redundant_closure_call)] - $e(token) - } - }; -} - -#[derive(Debug, Clone, Spanned)] -pub struct True { - span: Span -} - -impl Parse for True { - fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { - let mut f = input.fork(); - let ident: LIdentifier = f.parse()?; - if f.source_text(ident.span()) != "true" { - return input.error() - .expected("`true` here."); - } - - input.advance_to(f); - - Ok(Self{ span: ident.span() }) - } -} - -peek_only!(True, |token: &Token| matches!(token, Token::Identifier(ref ident) if ident.raw_value == "true")); - - -#[derive(Debug, Clone, Spanned)] -pub struct False { - span: Span -} - -impl Parse for False { - fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { - let mut f = input.fork(); - let ident: LIdentifier = f.parse()?; - if f.source_text(ident.span()) != "false" { - return input.error() - .expected("`false` here."); - } - - input.advance_to(f); - - Ok(Self{ span: ident.span() }) - } -} -peek_only!(False, |token: &Token| matches!(token, Token::Identifier(ref ident) if ident.raw_value == "false")); - -#[derive(Debug, Clone, Spanned)] -pub struct Null { - span: Span -} - -impl Parse for Null { - fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { - let mut f = input.fork(); - let ident: LIdentifier = f.parse()?; - if f.source_text(ident.span()) != "null" { - return input.error() - .expected("`null` here."); - } - - input.advance_to(f); - - Ok(Self{ span: ident.span() }) - } -} - -peek_only!(Null, |token: &Token| matches!(token, Token::Identifier(ref ident) if ident.raw_value == "null")); - - -#[derive(Debug, Clone, Spanned)] -#[Lex('{')] -pub struct OpenBrace { - span: Span, -} - -peek!(OpenBrace, '{', |token| match token { - Token::Punctuator(crate::lex::tokens::Punct::OpenBrace(s)) => Some(s), - _ => 
None, -}); - -#[derive(Debug, Clone, Spanned)] -#[Lex('}')] -pub struct CloseBrace { - span: Span, -} - -peek!(CloseBrace, '}', |token| match token { - Token::Punctuator(crate::lex::tokens::Punct::CloseBrace(s)) => Some(s), - _ => None, -}); - -#[derive(Debug, Clone, Spanned)] -#[Lex('[')] -pub struct OpenBracket { - span: Span, -} - -peek!(OpenBracket, '[', |token| match token { - Token::Punctuator(crate::lex::tokens::Punct::OpenBracket(s)) => Some(s), - _ => None, -}); - -#[derive(Debug, Clone, Spanned)] -#[Lex(']')] -pub struct CloseBracket { - span: Span, -} - -peek!(CloseBracket, ']', |token| match token { - Token::Punctuator(crate::lex::tokens::Punct::CloseBracket(s)) => Some(s), - _ => None, -}); - -#[derive(Debug, Clone, Spanned)] -#[Lex(':')] -pub struct Colon { - span: Span, -} - -peek!(Colon, ':', |token| match token { - Token::Punctuator(crate::lex::tokens::Punct::Colon(s)) => Some(s), - _ => None, -}); - -#[derive(Debug, Clone, Spanned)] -#[Lex(',')] -pub struct Comma { - span: Span, -} - -peek!(Comma, ',', |token| match token { - Token::Punctuator(crate::lex::tokens::Punct::Comma(s)) => Some(s), - _ => None, -}); - -#[derive(Debug, Clone, Spanned)] -#[Lex('-')] -pub struct Minus { - span: Span, -} - -#[derive(Debug, Clone, Spanned)] -#[Lex('+')] -pub struct Plus { - span: Span, -} - -#[derive(Debug, Clone, Spanned)] -#[Lex('.')] -pub struct Dot { - span: Span, -} - -#[macro_export] -macro_rules! 
Token { - ['{'] => { - $crate::lex::tokens::OpenBrace - }; - ['}'] => { - $crate::lex::tokens::CloseBrace - }; - ['['] => { - $crate::lex::tokens::OpenBracket - }; - [']'] => { - $crate::lex::tokens::CloseBracket - }; - [':'] => { - $crate::lex::tokens::Colon - }; - [','] => { - $crate::lex::tokens::Comma - }; - ['-'] => { - $crate::lex::tokens::Minus - }; - ['+'] => { - $crate::lex::tokens::Plus - }; - ['.'] => { - $crate::lex::tokens::Dot - }; - [:] => { - $crate::lex::tokens::Colon - }; - [,] => { - $crate::lex::tokens::Comma - }; - [-] => { - $crate::lex::tokens::Minus - }; - [+] => { - $crate::lex::tokens::Plus - }; - [.] => { - $crate::lex::tokens::Dot - }; - [false] => { - $crate::lex::tokens::False - }; - [true] => { - $crate::lex::tokens::True - }; - [null] => { - $crate::lex::tokens::Null - }; -} - -#[derive(Debug, Clone, Spanned)] -#[Lex] -pub enum Punct { - OpenBrace(OpenBrace), - CloseBrace(CloseBrace), - OpenBracket(OpenBracket), - CloseBracket(CloseBracket), - Colon(Colon), - Comma(Comma), -} - -#[derive(Debug, Clone, Spanned)] -pub struct WhiteSpace(Span); - -impl WhiteSpace { - /// - /// In accordance with - /// [ECMAScript standards](https://262.ecma-international.org/5.1/#sec-7.2). 
- /// - pub fn is_whitespace(ch: &char) -> bool { - ch == &'\u{0009}' - || ch == &'\u{000b}' - || ch == &'\u{000c}' - || ch == &'\u{0020}' - || ch == &'\u{00a0}' - || (*ch).get_minor_category() == MinorCategory::Zs - } -} - -impl Lex for WhiteSpace { - fn lex(input: &mut SourceIter) -> Option { - let ch = input.peek()?; - let Some(start) = (if Self::is_whitespace(ch) { - Some(input.next()?.0) - } else { - return None; - }) else { - return None; - }; - - let mut end = start; - while let Some(ch) = input.peek() { - if !Self::is_whitespace(ch) { - break; - } - end = input.next()?.0; - } - - Some(Self(TryIntoSpan::try_into_span(start..=end)?)) - } - - fn peek(input: &SourceIter) -> bool { - input.peek().map(Self::is_whitespace).unwrap_or_default() - } -} - -/// -/// In accordance with the [ECMAScript standard](https://262.ecma-international.org/5.1/#sec-7.3). -/// -#[derive(Debug, Spanned)] -pub struct LineTerminator(Span); - -impl Lex for LineTerminator { - fn lex(input: &mut SourceIter) -> Option { - match input.peek()? 
{ - // , , , - &'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}' => { - let loc = input.next()?.0; - Some(Self(Span::single_char(loc))) - } - _ => None, - } - } - - fn peek(input: &SourceIter) -> bool { - matches!( - input.peek(), - Some(&'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}') - ) - } -} - -#[derive(Debug, Clone, Spanned)] -pub struct LineTerminatorSeq(Span); - -impl Lex for LineTerminatorSeq { - fn lex(input: &mut SourceIter) -> Option { - match (input.peek()?, input.peek2()) { - // - (&'\u{000d}', Some(&'\u{000a}')) => { - let start = input.next()?.0; - let end = input.next()?.0; - Some(Self(TryIntoSpan::try_into_span(start..=end)?)) - } - // , , , - (&'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}', _) => { - let loc = input.next()?.0; - Some(Self(Span::single_char(loc))) - } - _ => None, - } - } - - fn peek(input: &SourceIter) -> bool { - match (input.peek(), input.peek2()) { - // - (Some(&'\u{000d}'), Some(&'\u{000a}')) => true, - // , , , - (Some(&'\u{000a}' | &'\u{000d}' | &'\u{2028}' | &'\u{2029}'), _) => true, - _ => false, - } - } -} - -#[derive(Debug, Spanned)] -#[Lex] -pub enum Comment { - MultiLine(MultiLineComment), - SingleLine(SingleLineComment), -} - -#[derive(Debug, Spanned)] -pub struct SingleLineComment(Span); - -impl Lex for SingleLineComment { - fn lex(input: &mut SourceIter) -> Option { - if !Self::peek(input) { - return None; - } - - let start = input.next()?.0; // First slash - let _ = input.next()?; // Second slash - - let mut end = start; - while !LineTerminator::peek(input) { - // Unwrap ok since peek -> Some implies next -> Some/ - end = input.next().unwrap().0; - } - - Some(Self(TryIntoSpan::try_into_span(start..=end)?)) - } - - fn peek(input: &SourceIter) -> bool { - matches!((input.peek(), input.peek2()), (Some(&'/'), Some(&'/'))) - } -} - -#[derive(Debug, Spanned)] -pub struct MultiLineComment(Span); - -impl MultiLineComment { - fn peek_end(input: &SourceIter) -> bool { - matches!((input.peek(), 
input.peek2()), (Some(&'*'), Some(&'/'))) - } -} - -impl Lex for MultiLineComment { - fn lex(input: &mut SourceIter) -> Option { - if !Self::peek(input) { - return None; - } - - let start = input.next()?.0; // First slash - let _ = input.next()?; // Second slash - - while !Self::peek_end(input) { - // Unwrap ok since peek -> Some implies next -> Some - _ = input.next().unwrap().0; - } - - input.next().unwrap(); // `*` - Unwraps ok since peek, peek2 -> Some, Some - let end = input.next().unwrap().0; // `/` - - Some(Self(TryIntoSpan::try_into_span(start..=end)?)) - } - - fn peek(input: &SourceIter) -> bool { - matches!((input.peek(), input.peek2()), (Some(&'/'), Some(&'*'))) - } -} - -#[derive(Debug, Spanned)] -#[Lex] -pub enum InputElement { - LineTerminator(LineTerminator), - WhiteSpace(WhiteSpace), - Comment(Comment), - Token(Token), -} - -/// -/// Compliant with [ECMAScript specification for `IdentifierName`](https://262.ecma-international.org/5.1/#sec-7.6). -/// -#[derive(Debug, Spanned, Clone)] -pub struct LIdentifier { - span: Span, - raw_value: String, -} - -impl LIdentifier { - pub(crate) fn value(&self, file: &SourceFile) -> String { - todo!() - } - - pub(crate) fn peek_token(token: &Token) -> bool { - matches!(token, Token::Identifier(i)) - } -} - -impl Parse for LIdentifier { - fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { - match input.next() { - Some(Token::Identifier(ident)) => Ok(ident), - _ => input.error().expected("identifier"), - } - } -} - -impl LIdentifier { - fn is_unicode_letter(ch: &char) -> bool { - use MinorCategory::*; - matches!(ch.get_minor_category(), Lu | Ll | Lt | Lm | Lo | Nl) - } - - fn is_unicode_combining_mark(ch: &char) -> bool { - use MinorCategory::*; - matches!(ch.get_minor_category(), Mn | Mc) - } - - fn is_unicode_digit(ch: &char) -> bool { - use MinorCategory::*; - matches!(ch.get_minor_category(), Nd) - } - - fn is_unicode_connector_punctuation(ch: &char) -> bool { - use 
MinorCategory::*; - matches!(ch.get_minor_category(), Pc) - } - - pub(crate) fn is_identifier_start(input: &SourceIter) -> bool { - // IdentifierStart - let Some(ch) = input.peek() else { - return false; - }; - - match ch { - c if Self::is_unicode_letter(c) => true, - &'$' | &'_' => true, - &'\\' => { - // Check for unicode escape sequence. - let mut fork = input.fork(); - fork.next().unwrap(); - UnicodeEscapeSequence::peek(input) - } - _ => false, - } - } - - fn is_identifier_part(input: &SourceIter) -> bool { - if Self::is_identifier_start(input) { - return true; - } - - let Some(ch) = input.peek() else { - return false; - }; - - Self::is_unicode_combining_mark(ch) - || Self::is_unicode_digit(ch) - || Self::is_unicode_connector_punctuation(ch) - || matches!(ch, &'\u{200c}' | &'\u{200d}') // | - } - - fn peek_middle(input: &SourceIter) -> bool { - Self::is_identifier_part(input) - } -} - -impl Lex for LIdentifier { - fn lex(input: &mut SourceIter) -> Option { - if !Self::peek(input) { - return None; - } - - let start = input.next().unwrap().0; - let mut end = start + 1; - while Self::peek_middle(input) { - end = input.next().unwrap().0; - } - - let span = TryIntoSpan::try_into_span(start..=end)?; - Some(Self { - span, - raw_value: input.source_at(span) - }) - } - - fn peek(input: &SourceIter) -> bool { - Self::is_identifier_start(input) - } -} - -#[derive(Debug, Spanned, Clone)] -pub enum Token { - Identifier(LIdentifier), - Punctuator(Punct), - String(LString), - Number(Number), -} - -impl Lex for Token { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - if let Some(s) = LIdentifier::lex(input).into_lex_result()? { - return Ok(Some(Self::Identifier(s))); - } - - if let Some(s) = Punct::lex(input).into_lex_result()? { - return Ok(Some(Self::Punctuator(s))); - } - - if let Some(s) = LString::lex(input).into_lex_result()? { - return Ok(Some(Self::String(s))); - } - - if let Some(s) = Number::lex(input).into_lex_result()? 
{ - return Ok(Some(Self::Number(s))); - } - - Ok(None) - } - - fn peek(_: &SourceIter) -> bool { - unimplemented!() - } -} - -#[cfg(test)] -mod tests { - use crate::{lex::IntoLexResult, utils::SourceFile}; - - use super::{InputElement, Lex}; - - #[test] - fn lexxing_tests() { - let src = "\ - []\n\ - 21, 5.65 - { }:,\n\ - // Single line comment\n\ - /* Multi line Comment\n\ - Wa-hey!*/\r\n - \"Here's a string!\"\n - 1.234678\t7.2367\t-Infinity"; - - println!("{src:?}"); - let src = SourceFile::dummy_file("test.1", src); - let iter = &mut src.iter(); - while let Ok(Some(l)) = InputElement::lex(iter).into_lex_result() { - println!("--> {l:?}"); - } - } -} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index c9118e5..e69de29 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1,137 +0,0 @@ -//! -//! Syntax Grammar. -//! - -pub mod utils; -pub mod value; - -use crate::{ - lex::tokens::Token, - utils::{Loc, SourceFile, Span}, -}; - -use self::utils::Peek; - -#[derive(Debug)] -pub struct ParseError { - near: String, - message: String, -} - -impl std::error::Error for ParseError {} - -impl std::fmt::Display for ParseError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Error occured during parsing:\t{}\n\tAt {}", - self.message, self.near - ) - } -} - -pub type ParserResult = Result; - -#[derive(Debug, Clone)] -pub struct ParseBuffer<'a> { - file: &'a SourceFile, - inner: Vec, - index: usize, -} - -impl<'a> ParseBuffer<'a> { - pub(crate) fn new(file: &'a SourceFile, inner: Vec) -> Self { - Self { - file, - inner, - index: 0, - } - } - - pub(crate) fn fork(&self) -> Self { - self.clone() - } - - pub(crate) fn source_text(&self, span: Span) -> String { - self.file.source_at_span(span).unwrap() - } - - pub(crate) fn upcoming(&self) -> Option<&Token> { - self.inner.get(self.index) - } - - pub(crate) fn peek

(&self, p: P) -> bool - where - P: Peek, - { - self.upcoming().map(|t| p.peek(t)).unwrap_or(false) - } - - pub(crate) fn error(&'a self) -> ParseErrorHelper<'a> { - ParseErrorHelper(self) - } - - pub(crate) fn index_display(&self, loc: impl IntoLoc) -> String { - self.file.file_line_column(&loc.into_loc()).unwrap() - } - - pub(crate) fn cursor(&self) -> usize { - self.index - } - - pub(crate) fn parse(&mut self) -> ParserResult

{ - P::parse(self) - } - - pub(crate) fn advance_to(&mut self, other: Self) { - self.index = other.index; - } -} - -pub(crate) trait IntoLoc { - fn into_loc(self) -> Loc; -} - -impl IntoLoc for Loc { - fn into_loc(self) -> Loc { - self - } -} - -impl> IntoLoc for I { - fn into_loc(self) -> Loc { - Loc { index: self.into() } - } -} - -pub struct ParseErrorHelper<'a>(&'a ParseBuffer<'a>); - -impl<'a> ParseErrorHelper<'a> { - pub(crate) fn unexpected(self, message: impl ToString) -> ParserResult { - Err(ParseError { - near: self.0.index_display(self.0.cursor() - 1), - message: format!("Unexpected {}", message.to_string()), - }) - } - - pub(crate) fn expected(self, message: impl ToString) -> ParserResult { - Err(ParseError { - near: self.0.index_display(self.0.cursor() - 1), - message: format!("Expected {}", message.to_string()), - }) - } -} - -impl<'a> Iterator for ParseBuffer<'a> { - type Item = Token; - - fn next(&mut self) -> Option { - let item = self.inner.get(self.index); - self.index += 1; - item.cloned() - } -} - -pub trait Parse: Sized { - fn parse(input: &mut ParseBuffer) -> ParserResult; -} diff --git a/src/syntax/utils.rs b/src/syntax/utils.rs deleted file mode 100644 index 2fac7ca..0000000 --- a/src/syntax/utils.rs +++ /dev/null @@ -1,51 +0,0 @@ -//! -//! Utilities for parsing tokens. -//! 
- -use crate::lex::tokens::Token; - -#[allow(private_bounds)] -pub trait Peek: Sealed {} - -#[doc(hidden)] -pub(crate) trait Sealed { - type T; - - fn peek(&self, token: &Token) -> bool; - fn try_from(&self, token: Token) -> Option; -} - -impl Peek for S - where S: Sealed -{} - -pub type Peeker = (fn(&Token) -> Option<&T>, fn(Token) -> Option); - -impl Sealed for F -where - F: Fn() -> Peeker, -{ - type T = T1; - - default fn peek(&self, token: &Token) -> bool { - self().0(token).is_some() - } - - default fn try_from(&self, token: Token) -> Option { - self().1(token) - } -} - -pub enum Unparseable {} - -default impl Sealed for fn(&Token) -> bool { - type T = Unparseable; - - fn peek(&self, token: &Token) -> bool { - self(token) - } - - fn try_from(&self, token: Token) -> Option { - unimplemented!() - } -} diff --git a/src/syntax/value.rs b/src/syntax/value.rs deleted file mode 100644 index c911b12..0000000 --- a/src/syntax/value.rs +++ /dev/null @@ -1,316 +0,0 @@ -//! -//! JSON5 Values. -//! 
- -use avjason_macros::Spanned; - -use crate::{ - lex::{ - number::Number, - strings::LString, - tokens::{False, LIdentifier, Null, Token, True}, - }, - Token, utils::{Spanned, Span, Loc}, -}; - -use super::{Parse, ParseBuffer, ParserResult}; - -#[derive(Debug, Clone, Spanned)] -pub enum Boolean { - True(Token![true]), - False(Token![false]), -} - -impl Boolean { - fn peek(input: &ParseBuffer) -> bool { - input - .upcoming() - .map(|token| True(token) || False(token)) - .unwrap_or_default() - } -} - -impl Parse for Boolean { - fn parse(input: &mut super::ParseBuffer) -> super::ParserResult { - if input.upcoming().map(True).unwrap_or_default() { - return Ok(Self::True(Parse::parse(input)?)); - } - - if input.upcoming().map(False).unwrap_or_default() { - return Ok(Self::False(Parse::parse(input)?)); - } - - input - .error() - .expected("boolean literal `true`, or `false`.") - } -} - -#[derive(Debug, Clone, Spanned)] -pub enum Value { - Null(Token![null]), - Boolean(Boolean), - String(LString), - Number(Number), - Object(Object), - Array(Array), -} - -impl Parse for Value { - fn parse(input: &mut ParseBuffer) -> ParserResult { - let Some(token) = input.upcoming() else { - return input.error().expected("Expected Value here!"); - }; - - if Null(token) { - return Ok(Self::Null(input.parse()?)); - } - - if Boolean::peek(input) { - return Ok(Self::Boolean(input.parse()?)); - } - - if LString::peek_token(token) { - return Ok(Self::String(input.parse()?)); - } - - if Number::peek_token(token) { - return Ok(Self::Number(input.parse()?)); - } - - if Object::peek(input) { - return Ok(Self::Object(input.parse()?)); - } - - if Array::peek(input) { - return Ok(Self::Array(input.parse()?)); - } - - input - .error() - .expected("JSON value (`null`, number, string, boolean, object, or array") - } -} - -#[derive(Debug, Clone)] -pub struct Punctuated { - inner: Vec, - trailing: Option, -} - -impl Spanned for Punctuated - where - El: Spanned, - Punct: Spanned -{ - fn span(&self) -> 
crate::utils::Span { - if self.inner.is_empty() { - return Span::single_char(Loc {index: 0}); - } - - let s = self.inner[0].span(); - let e = if let Some(ref t) = self.trailing { - t.span() - } else if self.inner.len() > 1 { - self.inner.last().unwrap().span() - } else { - s - }; - - s.combine([e]) - } -} - -impl Punctuated -where - El: Parse, - Punct: Parse, -{ - fn parse_until( - input: &mut ParseBuffer, - pred: impl Fn(&ParseBuffer) -> bool, - ) -> ParserResult { - let mut inner: Vec = vec![]; - let mut trailing: Option = None; - - loop { - if pred(input) { - break; - } - - inner.push(El::parse(input)?); - trailing = None; - - if pred(input) { - break; - } - - trailing = Some(Punct::parse(input)?); - } - - Ok(Self { inner, trailing }) - } -} - -#[derive(Debug, Clone)] -pub struct Object { - open: Token!['{'], - members: Punctuated, - close: Token!['}'], -} - -impl Spanned for Object { - fn span(&self) -> Span { - let s = self.open.span(); - let e = self.close.span(); - - s.combine([e]) - } -} - -impl Object { - pub(crate) fn peek(input: &ParseBuffer) -> bool { - input.peek(Token!['{']) - } -} - -impl Parse for Object { - fn parse(input: &mut ParseBuffer) -> ParserResult { - let open = Parse::parse(input)?; - let members = Punctuated::parse_until(input, |input| input.peek(Token!['}']))?; - let close = Parse::parse(input)?; - Ok(Self { - open, - members, - close, - }) - } -} - -#[derive(Debug, Clone)] -pub struct Member { - name: MemberName, - colon: Token![:], - value: Value, -} - -impl Spanned for Member { - fn span(&self) -> Span { - let s = self.name.span(); - let e = self.value.span(); - - s.combine([e]) - } -} - -impl Parse for Member { - fn parse(input: &mut ParseBuffer) -> ParserResult { - Ok(Self { - name: input.parse()?, - colon: input.parse()?, - value: input.parse()?, - }) - } -} - -#[derive(Debug, Clone, Spanned)] -pub enum MemberName { - Identifier(LIdentifier), - String(LString), -} - -impl Parse for LString { - fn parse(input: &mut ParseBuffer) -> 
ParserResult { - match input { - i if i.upcoming().map(LString::peek_token).unwrap_or_default() => { - match i.next().unwrap() { - Token::String(l) => Ok(l), - _ => unreachable!(), - } - } - _ => input.error().expected("string literal"), - } - } -} - -impl Parse for Number { - fn parse(input: &mut crate::syntax::ParseBuffer) -> crate::syntax::ParserResult { - let Some(Token::Number(token)) = input.next() else { - return input.error().expected("number literal"); - }; - - Ok(token) - } -} - -impl Parse for MemberName { - fn parse(input: &mut ParseBuffer) -> ParserResult { - if input - .upcoming() - .map(LIdentifier::peek_token) - .unwrap_or_default() - { - return Ok(Self::Identifier(Parse::parse(input)?)); - } - - if input - .upcoming() - .map(LString::peek_token) - .unwrap_or_default() - { - return Ok(Self::String(Parse::parse(input)?)); - } - - input - .error() - .expected("either string literal, or identifier") - } -} - -#[derive(Debug, Clone)] -pub struct Array { - open: Token!['['], - elements: Punctuated, - close: Token![']'], -} - -impl Spanned for Array { - fn span(&self) -> Span { - let s = self.open.span(); - let e = self.close.span(); - - s.combine([e]) - } -} - -impl Array { - pub(crate) fn peek(input: &ParseBuffer) -> bool { - input.peek(Token!['[']) - } -} - -impl Parse for Array { - fn parse(input: &mut ParseBuffer) -> ParserResult { - let open = input.parse()?; - let elements = Punctuated::parse_until(input, |input| input.peek(Token![']']))?; - let close = input.parse()?; - - Ok(Self { - open, - elements, - close, - }) - } -} - -#[cfg(test)] -mod tests { - use crate::utils::SourceFile; - - #[test] - fn parse_value() { - let src = SourceFile::dummy_file("test.0", r#"{"fruits": [{name: "apple", qty: 2}], }"#); - let v = src.parse(); - println!("{v:#?}"); - } -} \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs index dfb4565..3a01689 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -13,14 +13,6 @@ use std::{ use 
anyhow::anyhow; pub use span::*; -use crate::{ - lex::{ - tokens::{InputElement, Lex}, - IntoLexResult, LexError, LexResult, - }, - syntax::{value::Value, ParseBuffer}, -}; - #[derive(Debug)] pub struct SourceFile { path: PathBuf, @@ -131,42 +123,6 @@ impl SourceFile { pub(crate) fn iter(&self) -> SourceIter { SourceIter::new(self) } - - pub(crate) fn lex(&self) -> LexResult> { - let mut v = vec![]; - let iter = &mut self.iter(); - - while !iter.eof() { - match InputElement::lex(iter).into_lex_result() { - Ok(Some(t)) => v.push(t), - Ok(None) => { - return iter.error().expected(Some(0..), "Something..."); - } - Err(err) => { - return Err(err); - } - } - } - - Ok(Some(v)) - } - - pub(crate) fn parse(&self) -> Result { - let Some(lexxed) = self.lex()? else { - return Err(anyhow!("Empty file!")); - }; - - let tokens = lexxed - .into_iter() - .filter_map(|token| match token { - InputElement::Token(t) => Some(t), - _ => None, - }) - .collect(); - - let buf = &mut ParseBuffer::new(self, tokens); - buf.parse().map_err(Into::into) - } } #[derive(Clone)] @@ -268,118 +224,11 @@ impl<'a> SourceIter<'a> { self.index = other.index; } - pub(crate) fn error(&self) -> SourceErrorHelper { - SourceErrorHelper { iter: self } - } - pub(crate) fn eof(&self) -> bool { self.index >= self.inner.len() } } -pub(crate) struct SourceErrorHelper<'a> { - iter: &'a SourceIter<'a>, -} - -impl<'a> SourceErrorHelper<'a> { - pub(crate) fn unexpected( - self, - range: Option>, - token: impl ToString, - ) -> LexResult - where - isize: TryFrom, - A: Copy + Debug, - >::Error: Debug, - { - let token = token.to_string(); - - let mut text = None; - let mut span = 0..self.iter.inner.len(); - if let Some(range) = range { - let i = self.iter.index as isize; - let start = i + match range.start_bound() { - std::ops::Bound::Included(r) => isize::try_from(*r).unwrap(), - std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap() + 1, - std::ops::Bound::Unbounded => 0isize, - }; - - let end = i + match 
range.start_bound() { - std::ops::Bound::Included(r) => isize::try_from(*r).unwrap() + 1, - std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap(), - std::ops::Bound::Unbounded => self.iter.inner.len() as isize, - }; - - let start = start as usize; - let end = end as usize; - - text = self.iter.file.source_at(start..end); - span = start..end; - } - - Err(LexError::new( - span, - format!("Unexpected token `{token}`"), - text, - )) - } - - pub(crate) fn expected( - self, - rel_range: Option>, - token: impl ToString, - ) -> LexResult - where - isize: TryFrom, - A: Copy + Debug, - >::Error: Debug, - { - let token = token.to_string(); - - let mut text = None; - let mut span = 0..self.iter.inner.len(); - if let Some(range) = rel_range { - let i = self.iter.index as isize; - let start = i + match range.start_bound() { - std::ops::Bound::Included(r) => isize::try_from(*r).unwrap(), - std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap() + 1, - std::ops::Bound::Unbounded => 0isize, - }; - - let end = i + match range.start_bound() { - std::ops::Bound::Included(r) => isize::try_from(*r).unwrap() + 1, - std::ops::Bound::Excluded(r) => isize::try_from(*r).unwrap(), - std::ops::Bound::Unbounded => self.iter.inner.len() as isize, - }; - - let start = start as usize; - let end = end as usize; - - text = self.iter.file.source_at(start..end); - span = start..end; - } - - Err(LexError::new( - span, - format!("Expected token `{token}` here"), - text, - )) - } -} - -impl<'a> Iterator for SourceIter<'a> { - type Item = (Loc, char); - - fn next(&mut self) -> Option { - let ch = self.inner.get(self.index)?; - let l = Loc { index: self.index }; - - self.index += 1; - - Some((l, *ch)) - } -} - #[cfg(test)] mod tests { use super::SourceFile; From 0d247fdb3eb46d1d6462d59fc758fb409e5b9a31 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 7 Jan 2024 00:00:28 +0000 Subject: [PATCH 11/39] Macros clean start. 
--- macros/src/lib.rs | 212 ---------------------------------------------- 1 file changed, 212 deletions(-) diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 3cbaa0f..3563a1e 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -3,215 +3,3 @@ use proc_macro::{Diagnostic, Level, Span, TokenStream}; use quote::quote; use syn::spanned::Spanned; -#[proc_macro_derive(Spanned)] -pub fn derive(input: TokenStream) -> TokenStream { - if let Ok(en) = syn::parse::(input.clone()) { - let ident = en.ident.clone(); - let passed = en - .variants - .iter() - .map(|var| { - let ident = var.ident.clone(); - let syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed: _, .. }) = var.fields else { - return Err(var.span()); - }; - - Ok(ident) - }) - .collect::>(); - - if passed.iter().any(Result::is_err) { - let errors = passed.into_iter().filter_map(Result::err); - - errors.for_each(|s| { - Diagnostic::spanned(s.unwrap(), Level::Error, "Need tuple-like struct here.").emit() - }); - - return syn::Error::new( - Span::call_site().into(), - "Expected enum with tuple variants.", - ) - .into_compile_error() - .into(); - } - - let vars = passed.into_iter().filter_map(Result::ok).map(|var| { - quote! { - #ident::#var(ref s) => crate::utils::Spanned::span(s) - } - }); - - return quote! { - impl crate::utils::Spanned for #ident { - fn span(&self) -> crate::utils::Span { - match self { - #(#vars),* - } - } - } - } - .into(); - }; - - if let Ok(st) = syn::parse::(input) { - let ident = st.ident.clone(); - match st.fields { - syn::Fields::Named(syn::FieldsNamed { named: f, .. }) => { - let pass = f.iter().any(|syn::Field { ident, .. }| { - ident.as_ref().map(|ident| ident == "span").unwrap_or(false) - }); - - if !pass { - return syn::Error::new( - f.span(), - "Cannot derive Spanned for named struct without `span` field.", - ) - .into_compile_error() - .into(); - } - - return quote! 
{ - impl crate::utils::Spanned for #ident { - fn span(&self) -> Span { - self.span - } - } - } - .into(); - } - syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed: f, .. }) => { - if f.is_empty() { - return syn::Error::new( - f.span(), - "Cannot derive Spanned for empty tuple struct.", - ) - .into_compile_error() - .into(); - } - - return quote! { - impl crate::utils::Spanned for #ident { - fn span(&self) -> Span { - self.0 - } - } - } - .into(); - } - syn::Fields::Unit => { - return syn::Error::new(st.span(), "Cannot derive Spanned for unit struct.") - .into_compile_error() - .into(); - } - } - } - - syn::Error::new(Span::call_site().into(), "Expected either enum or struct.") - .into_compile_error() - .into() -} - -#[proc_macro_attribute] -#[allow(non_snake_case)] -pub fn Lex(args: TokenStream, input: TokenStream) -> TokenStream { - let st = syn::parse::(input.clone()); - let en = syn::parse::(input); - - match (st, en) { - (Ok(st), Err(_)) => { - let ident = &st.ident; - let ch: syn::LitChar = match syn::parse(args) { - Ok(ch) => ch, - Err(err) => { - return err.into_compile_error().into(); - } - }; - quote! { - #st - - impl Lex for #ident { - fn lex(input: &mut crate::utils::SourceIter) -> Option { - if input.peek() == Some(&#ch) { - // Unwrap okay, because otherwise .peek returns None. - let (l, _) = input.next().unwrap(); - return Some(Self{ span: crate::utils::Span::single_char(l)}); - } - - None - } - - fn peek(input: &crate::utils::SourceIter) -> bool { - input.peek() == Some(&#ch) - } - } - } - .into() - } - (Err(_), Ok(en)) => { - let ident = &en.ident; - - let vars = en - .variants - .iter() - .map(|syn::Variant { ident, fields, .. }| match fields { - syn::Fields::Named(_) => None, - syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed: f, .. 
}) => { - if f.is_empty() { - return None; - } - let f = f.iter().next().unwrap(); - Some((ident.clone(), f.ty.clone())) - } - syn::Fields::Unit => None, - }) - .collect::>(); - - if vars.iter().any(Option::is_none) { - return syn::Error::new_spanned( - en, - "Cannot auto-impl Lex on enum that is not only single-tuple variants.", - ) - .into_compile_error() - .into(); - } - - let (vars, peeks): (Vec<_>, Vec<_>) = vars - .into_iter() - .flatten() - .map(|(v, ty)| { - ( - quote! { - if let Some(s) = #ty::lex(input).into_lex_result()? { - return Ok(Some(Self::#v(s))); - } - }, - quote! { - #ty::peek(input) - }, - ) - }) - .unzip(); - - let peeks = peeks.into_iter().intersperse(quote! {||}); - - quote! { - #en - - impl Lex for #ident { - fn lex(input: &mut SourceIter) -> impl IntoLexResult { - #(#vars)* - - Ok(None) - } - - fn peek(input: &SourceIter) -> bool { - #(#peeks)* - } - } - - } - .into() - } - _ => unimplemented!("Mutually exlusive parsing."), - } -} From 71e1ced993b2956f7ee7cd4a21bef0da3b7ec21c Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 7 Jan 2024 00:26:38 +0000 Subject: [PATCH 12/39] Add dependency `thiserror`. 
--- Cargo.lock | 33 +++++++++++++++++++++++++++------ Cargo.toml | 1 + 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7a8613..d0cfb88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,7 @@ dependencies = [ "finl_unicode", "lazy_static", "regex", + "thiserror", ] [[package]] @@ -56,18 +57,18 @@ checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "proc-macro2" -version = "1.0.73" +version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dd5e8a1f1029c43224ad5898e50140c2aebb1705f19e67c918ebf5b9e797fe1" +checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a37c9326af5ed140c86a46655b5278de879853be5573c01df185b6f49a580a" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -103,15 +104,35 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "syn" -version = "2.0.45" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eae3c679c56dc214320b67a1bc04ef3dfbd6411f6443974b5e4893231298e66" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "thiserror" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + [[package]] name = "unicode-ident" version = "1.0.12" diff --git a/Cargo.toml b/Cargo.toml index 0de9457..1b56874 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ anyhow = "1.0.79" finl_unicode = "1.2.0" lazy_static = "1.4.0" regex = "1.10.2" +thiserror = "1.0.56" [dependencies.avjason-macros] path = "./macros" From 91fc6bcb24558f8bd1893bd018fe4518d38d4909 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 7 Jan 2024 03:35:11 +0000 Subject: [PATCH 13/39] Add documentation referencing, derive(Spanned) macros. --- macros/src/lib.rs | 252 +++++++++++++++++++++++++++++++++++++++++++- macros/src/utils.rs | 66 ++++++++++++ 2 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 macros/src/utils.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 3563a1e..5c83c7e 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -1,5 +1,253 @@ +//! +//! Utility macros for the main crate. +//! + #![feature(proc_macro_diagnostic, iter_intersperse)] -use proc_macro::{Diagnostic, Level, Span, TokenStream}; +mod utils; + +use proc_macro::{Diagnostic, Level, TokenStream}; use quote::quote; -use syn::spanned::Spanned; +use syn::{spanned::Spanned, ItemStruct, Token}; +use utils::{get_struct_member_where_type, path_contains, NonEmptyStr}; + +/// +/// Parses the item, and arguments for the Ref* macros. +/// +/// Also provides useful errors. +/// +fn reference_macro(arg: TokenStream, target: TokenStream) -> Option<(syn::Item, Args)> +where + Args: syn::parse::Parse, +{ + // Only apply to items (Rust syntax). + let target: syn::Item = match syn::parse(target) { + Ok(item) => item, + Err(err) => { + Diagnostic::spanned( + err.span().unwrap(), + Level::Error, + "Cannot apply this to a non-item (e.g. 
struct, enum, type).", + ) + .emit(); + + return None; + } + }; + + let arg: Args = match syn::parse(arg) { + Ok(arg) => arg, + Err(err) => { + Diagnostic::spanned( + err.span().unwrap(), + Level::Error, + "Expected literal &str here.", + ) + .emit(); + + return None; + } + }; + + Some((target, arg)) +} + +/// +/// Reference to a part of the JSON5 spec. +/// +/// Adds a link to the part of the original spec. +/// +/// ### Example +/// ```ignore +/// use avjason_macros::SpecRef; +/// +/// /// +/// /// Whitespace characters that do not influence syntax. +/// /// +/// #[SpecRef("WhiteSpace")] +/// pub struct WhiteSpace { +/// /* Blah, blah, blah. */ +/// } +/// ``` +/// +#[proc_macro_attribute] +#[allow(non_snake_case)] +pub fn SpecRef(arg: TokenStream, target: TokenStream) -> TokenStream { + let Some((target, arg)): Option<(_, NonEmptyStr)> = reference_macro(arg, target) else { + return TokenStream::default(); + }; + + let link = format!( + "See the original spec: [**{0}**](https://spec.json5.org/#prod-{0}).", + arg.value() + ); + + quote! { + #[doc = ""] + #[doc = "---"] + #[doc = #link] + #[doc = ""] + #target + } + .into() +} + +/// +/// Format for the ECMAScript spec reference, since their urls +/// make no sense. +/// +struct EcmaRef { + /// + /// Display text for the link. + /// + text: NonEmptyStr, + + /// + /// Comma seperator. + /// + _comma: Token![,], + + /// + /// Href for the link. + /// + href: NonEmptyStr, +} + +impl syn::parse::Parse for EcmaRef { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + Ok(Self { + text: input.parse()?, + _comma: input.parse()?, + href: input.parse()?, + }) + } +} + +/// Adds a link to the part of the original ECMAScript spec. 
+/// +/// ### Example +/// ```ignore +/// use avjason_macros::ECMARef; +/// +/// #[ECMARef("LineTermintor", "https://262.ecma-international.org/5.1/#sec-7.3")] +/// pub struct LineTerminator { +/// /* Blah, blah, blah */ +/// } +/// ``` +/// +/// +#[proc_macro_attribute] +#[allow(non_snake_case)] +pub fn ECMARef(arg: TokenStream, target: TokenStream) -> TokenStream { + let Some((target, arg)): Option<(_, EcmaRef)> = reference_macro(arg, target) else { + return TokenStream::default(); + }; + + let link = format!( + "See the original ECMAScript spec: [**{}**]({}).", + arg.text.value(), + arg.href.value(), + ); + + quote! { + #[doc = ""] + #[doc = "---"] + #[doc = #link] + #[doc = ""] + #target + } + .into() +} + +#[proc_macro_derive(Spanned)] +pub fn spanned(item: TokenStream) -> TokenStream { + let (st, en): (syn::Result, syn::Result) = + (syn::parse(item.clone()), syn::parse(item)); + + if let Ok(ref st) = st { + // Find first field with the `Span` type. + let Some(m) = get_struct_member_where_type(st, |ty| match ty { + syn::Type::Path(syn::TypePath { path, .. }) => path_contains(path, "Span"), + _ => false, + }) else { + Diagnostic::spanned( + st.span().unwrap(), + Level::Error, + "Need a field with type `Span` in it.", + ) + .emit(); + + return Default::default(); + }; + + let ident = &st.ident; + return quote! { + impl crate::utils::Spanned for #ident { + fn span(&self) -> Span { + #m + } + } + } + .into(); + } + + if let Ok(en) = en { + let vars: Vec<_> = en + .variants + .iter() + .map(|var| match &var.fields { + syn::Fields::Named(_) => None, + syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed, .. 
}) => { + (unnamed.len() == 1).then_some(&var.ident) + } + syn::Fields::Unit => None, + }) + .collect(); + + if vars.iter().any(Option::is_none) { + Diagnostic::spanned( + en.span().unwrap(), + Level::Error, + "Enum variants can only be a single-element tuple.", + ) + .emit(); + + return Default::default(); + } + let ident = &en.ident; + // SAFETY: We've already checked above if any are none. + let vars = vars + .into_iter() + .map(|a| unsafe { a.unwrap_unchecked() }) + .map(|ident| { + quote! { + Self::#ident(inner) => inner.span() + } + }); + + return quote! { + impl crate::utils::Spanned for #ident { + fn span(&self) -> crate::utils::Span { + match self { + #(#vars),*, + _ => unreachable!() + } + } + } + } + .into(); + } + + // SAFETY: We check before if st is Ok and early-return, + // so this is safe. + // This is done since syn::ItemStruct doesn't impl `Debug` :( + let err = unsafe { st.unwrap_err_unchecked() }; + + Diagnostic::spanned( + err.span().unwrap(), + Level::Error, + "Expected either struct, or enum definition here.", + ) + .emit(); + Default::default() +} diff --git a/macros/src/utils.rs b/macros/src/utils.rs new file mode 100644 index 0000000..61a3a46 --- /dev/null +++ b/macros/src/utils.rs @@ -0,0 +1,66 @@ +//! +//! Utilities for the utility macros. +//! + +use std::ops::Deref; + +use proc_macro::{Diagnostic, Level, Span}; +use quote::quote; +use syn::punctuated::Punctuated; + +/// +/// A lit str, but a warning is displayed +/// if it is empty. 
+/// +pub struct NonEmptyStr(syn::LitStr); + +impl syn::parse::Parse for NonEmptyStr { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let lit: syn::LitStr = input.parse()?; + + if lit.value().is_empty() { + Diagnostic::spanned( + lit.span().unwrap(), + Level::Warning, + "This should not be empty.", + ) + .emit(); + } + + Ok(Self(lit)) + } +} + +impl Deref for NonEmptyStr { + type Target = syn::LitStr; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +pub fn path_contains(path: &syn::Path, st: &str) -> bool { + path.segments.iter().any(|seg| seg.ident == st) +} + +pub fn get_struct_member_where_type( + st: &syn::ItemStruct, + pred: impl Fn(&syn::Type) -> bool, +) -> Option { + let ident = match &st.fields { + syn::Fields::Named(syn::FieldsNamed { named, .. }) => named + .iter() + .find_map(|f| pred(&f.ty).then(|| f.ident.as_ref().unwrap().clone())), + syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed, .. }) => { + unnamed.iter().enumerate().find_map(|(i, f)| { + pred(&f.ty).then(|| syn::Ident::new(&i.to_string(), Span::call_site().into())) + }) + } + syn::Fields::Unit => None, + }?; + + // Unwrap as we should have valid syntax here. + Some(syn::parse2(quote! { + self.#ident + }).unwrap()) +} From 353e094398be1dde8be83d8c302685c8c43d92da Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 7 Jan 2024 03:36:40 +0000 Subject: [PATCH 14/39] Add new fancy lexing utils. --- src/lex/mod.rs | 36 +++++++- src/lex/utils.rs | 215 ++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 +- src/utils/mod.rs | 12 +++ src/utils/span.rs | 6 ++ 5 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 src/lex/utils.rs diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 0233404..b2b8994 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -1,4 +1,36 @@ //! -//! Lexxing -//! +//! ## Step 1: Lexing +//! +//! This process involves converting source code +//! into what's known as a [lexical grammar](https://en.wikipedia.org/wiki/Lexical_grammar). 
+//! +//! This step will eventually yield tokens, which can then be +//! checked against syntax. +//! +//! This lexical grammar is defined in the [JSON5 specification](https://spec.json5.org/#lexical-grammar). +//! + +pub mod utils; +pub mod whitespace; +pub mod line_terminator; + +use avjason_macros::{SpecRef, Spanned}; + +use self::{whitespace::WhiteSpace, line_terminator::LineTerminator}; + +pub(crate) use utils::{LexError, Lex, LexResult}; +/// +/// ## JSON5InputElement +/// +/// All possible acceptable things our lexer accepts. +/// * A superset of valid tokens: Valid Tokens + { Comments, Whitespace, LineTerminator, }. +/// +#[SpecRef("JSON5InputElement")] +#[derive(Debug, Spanned)] +pub(crate) enum InputElement { + WhiteSpace(WhiteSpace), + LineTerminator(LineTerminator), + // Comment(Comment), + // Token(Token), +} diff --git a/src/lex/utils.rs b/src/lex/utils.rs new file mode 100644 index 0000000..e45ad63 --- /dev/null +++ b/src/lex/utils.rs @@ -0,0 +1,215 @@ +//! +//! Utilities for lexing. +//! + +use std::fmt::Debug; + +use thiserror::Error; + +use crate::utils::{SourceIter, Span, TryIntoSpan, Spanned}; + +/// +/// Errors that can occur during lexing. +/// +#[derive(Debug, Clone, Error, PartialEq, PartialOrd, Eq, Ord, Hash)] +pub(crate) enum LexError { + /// + /// Expected a specific character/sub-token at a location. + /// + #[error("Expected `{token}` {extra}\n\tat {position}")] + Expected { + token: String, + position: String, + extra: String, + }, + + /// + /// Invalid character at acertain position. + /// + #[error("Unexpected `{token}` {extra}\n\tat {position}")] + Unexpected { + token: String, + position: String, + extra: String, + }, +} + +/// +/// Convenience type for lexer result types. +/// +#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Hash)] +pub(crate) enum LexResult { + /// + /// No lexical grammars were violated, but the + /// characters cannot be lexed into this token, + /// so skip trying to lex this token. 
+ /// + /// It's important to note that the character iterator + /// *has not* advanced. + /// + Stop, + + /// + /// Successful lex result according to + /// our lexical grammar. + /// + /// Character iterator has been advanced. + /// + Ok(T), + + /// + /// Some grammatical rules have violated whilst lexing + /// -- stop lexing immeadiately. + /// + Err(LexError), +} + +/// +/// Implementing [std::ops::Try], though experimental, allows the leverage of +/// the `?` operator for our convinence. +/// +/// Now, we can stop halfway through if: +/// * we get an error ([LexResult::Err]), or +/// * we know that we don't need to continue the rest of the computation, +/// but don't have an error ([LexResult::Stop]). +/// +impl std::ops::Try for LexResult { + type Output = T; + + type Residual = LexResult; + + fn from_output(output: Self::Output) -> Self { + Self::Ok(output) + } + + fn branch(self) -> std::ops::ControlFlow { + match self { + LexResult::Stop => std::ops::ControlFlow::Break(LexResult::Stop), + LexResult::Ok(o) => std::ops::ControlFlow::Continue(o), + LexResult::Err(err) => std::ops::ControlFlow::Break(LexResult::Err(err)), + } + } +} + +impl std::ops::FromResidual for LexResult { + fn from_residual(residual: ::Residual) -> Self { + match residual { + LexResult::Stop => Self::Stop, + LexResult::Err(err) => Self::Err(err), + LexResult::Ok(_) => unreachable!(), + } + } +} + +impl From> for LexResult { + fn from(value: Option) -> Self { + match value { + Some(s) => Self::Ok(s), + None => Self::Stop, + } + } +} + +/// +/// ## Lexing +/// +/// Attempts to lex characters into a proto-token. +/// +/// Uses [LexResult] as control flow. +/// +pub(crate) trait Lex: Sized + Debug { + fn lex(input: &mut SourceIter) -> LexResult; + fn peek(input: &SourceIter) -> bool; +} + +/// +/// Utility implementation for +/// optional lexing. 
+/// +impl Lex for Option +where + T: Lex, +{ + fn lex(input: &mut SourceIter) -> LexResult { + if !T::peek(input) { + return LexResult::Ok(None); + } + + match T::lex(input) { + LexResult::Ok(t) => LexResult::Ok(Some(t)), + + // Here, we allow `Stop`-s since they indicate that T is not present. + // So, it's a valid parse -- we have nothing. + LexResult::Stop => LexResult::Ok(None), + + // Error variant indicates that something looking like T + // was present, but it doesn't conform to T's grammar. + LexResult::Err(err) => LexResult::Err(err), + } + } + + fn peek(_: &SourceIter) -> bool { + // Checking to see if an optional thing is present is + // virtual insanity -- it's *always true*. + unimplemented!("Tautologically useless!") + } +} + +/// +/// Keeps collecting characters (into a span) +/// while a condition is true. +/// +pub(crate) fn capture_while( + input: &mut SourceIter, + pred: impl Fn(&char) -> bool, +) -> LexResult { + let pred = &pred; + if !input.peek().map(pred).unwrap_or(false) { + return LexResult::Stop; + } + // Unwrap ok as we know (a) char exists, and (b) is whitespace. + let start = input.next().unwrap().0; + let mut end = start + 1; + while let Some(ch) = input.peek() { + if pred(ch) { + // Unwrap ok for same reason as above. 
+ end = input.next().unwrap().0; + } else { + break; + } + } + + LexResult::Ok(TryIntoSpan::try_into_span(start..=end).unwrap()) +} + +#[cfg(test)] +mod tests { + use crate::lex::utils::LexResult; + + #[test] + fn lex_result() { + fn dummy_lexer(input: usize) -> LexResult { + match input % 3 { + 0 => LexResult::Ok(input), + 1 => LexResult::Stop, + 2 => LexResult::Err(crate::lex::utils::LexError::Expected { + token: "number that is {0, 1} mod 3".to_string(), + position: "nope".to_string(), + extra: "".to_string(), + }), + _ => unreachable!(), + } + } + + fn dummy(input: [usize; 3]) -> LexResult<[usize; 3]> { + let first = dummy_lexer(input[0])?; + let second = dummy_lexer(input[1])?; + let third = dummy_lexer(input[2])?; + + LexResult::Ok([first, second, third]) + } + + assert_eq!(dummy([1, 2, 3]), LexResult::Stop); // Stops on 3n + 1 + assert!(matches!(dummy([0, 2, 3]), LexResult::Err(_))); // Error on 3n + 2 + } +} diff --git a/src/lib.rs b/src/lib.rs index 0fb8ea7..e353320 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ //! //! A parser for [JSON5](https://json5.org/). //! 
-#![feature(iter_map_windows, associated_type_defaults, specialization)] +#![feature(iter_map_windows, associated_type_defaults, specialization, try_trait_v2)] pub mod utils; pub mod lex; diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 3a01689..61e6cbd 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -229,6 +229,18 @@ impl<'a> SourceIter<'a> { } } +impl<'a> Iterator for SourceIter<'a> { + type Item = (Loc, char); + + fn next(&mut self) -> Option { + let ch = self.inner.get(self.index).copied()?; + let loc = Loc { index: self.index }; + self.index += 1; + + Some((loc, ch)) + } +} + #[cfg(test)] mod tests { use super::SourceFile; diff --git a/src/utils/span.rs b/src/utils/span.rs index 6637adb..6160d76 100644 --- a/src/utils/span.rs +++ b/src/utils/span.rs @@ -188,3 +188,9 @@ impl TryIntoSpan for usize { pub trait Spanned { fn span(&self) -> Span; } + +impl Spanned for Span { + fn span(&self) -> Span { + *self + } +} \ No newline at end of file From dd5cfe6cfa4a10f74e42a37636707cec08d7cccf Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 7 Jan 2024 03:37:06 +0000 Subject: [PATCH 15/39] Add support for `LineTerminator`, `WhiteSpace` --- src/lex/line_terminator.rs | 97 ++++++++++++++++++++++++++++++++++++++ src/lex/whitespace.rs | 52 ++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 src/lex/line_terminator.rs create mode 100644 src/lex/whitespace.rs diff --git a/src/lex/line_terminator.rs b/src/lex/line_terminator.rs new file mode 100644 index 0000000..3e61b44 --- /dev/null +++ b/src/lex/line_terminator.rs @@ -0,0 +1,97 @@ +use avjason_macros::{ECMARef, Spanned}; + +use crate::utils::{SourceIter, Span, TryIntoSpan}; + +use super::{utils::capture_while, Lex, LexResult}; + +/// +/// ## LineTerminator +/// +/// Defined in [Section 7.3 Table 3](https://262.ecma-international.org/5.1/#sec-7.3). +/// Characters that end lines (single characters only, no `\r\n` here.) 
+/// +/// See [LineTerminatorSequence] for the version including `\r\n`. +/// +#[ECMARef("LineTerminator", "https://262.ecma-international.org/5.1/#sec-7.3")] +#[derive(Debug, Spanned)] +pub struct LineTerminator { + span: Span, +} + +impl LineTerminator { + fn is_line_terminator(ch: &char) -> bool { + matches!(ch, '\u{000A}' | '\u{000D}' | '\u{2028}' | '\u{2029}') + } +} + +impl Lex for LineTerminator { + fn lex(input: &mut SourceIter) -> LexResult { + if !Self::peek(input) { + return LexResult::Stop; + } + + LexResult::Ok(Self { + span: capture_while(input, Self::is_line_terminator)?, + }) + } + + fn peek(input: &SourceIter) -> bool { + input.peek().map(Self::is_line_terminator).unwrap_or(false) + } +} + +/// +/// ## LineTerminatorSequence +/// +/// All accepted line endings, including `\r\n`. +/// +#[ECMARef( + "LineTerminatorSequence", + "https://262.ecma-international.org/5.1/#sec-7.3" +)] +#[derive(Debug, Spanned)] +pub enum LineTerminatorSequence { + LF(Span), + CR(Span), + LS(Span), + PS(Span), + CRLF(Span), +} + +impl Lex for LineTerminatorSequence { + fn lex(input: &mut SourceIter) -> LexResult { + if !Self::peek(input) { + return LexResult::Stop; + } + + if let Some(ch) = input.peek() { + return match ch { + '\u{000A}' => LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))), + '\u{000D}' => { + if input.peek2().map(|n| n == &'\u{000A}').unwrap_or(false) { + let start = input.next().unwrap().0; + let end = input.next().unwrap().0; + LexResult::Ok(Self::CRLF(TryIntoSpan::try_into_span(start..=end).unwrap())) + } else { + LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))) + } + } + '\u{2028}' => LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))), + '\u{2029}' => LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))), + _ => LexResult::Stop, + } + } + + LexResult::Stop + } + + fn peek(input: &SourceIter) -> bool { + input + .peek() + .map(LineTerminator::is_line_terminator) + .unwrap_or(false) + } 
+} + +#[cfg(test)] +mod tests {} diff --git a/src/lex/whitespace.rs b/src/lex/whitespace.rs new file mode 100644 index 0000000..cd0a2e6 --- /dev/null +++ b/src/lex/whitespace.rs @@ -0,0 +1,52 @@ +use avjason_macros::{ECMARef, Spanned}; +use finl_unicode::categories::{MinorCategory, CharacterCategories}; + +use crate::{utils::Span, lex::utils::capture_while}; + +use super::{Lex, LexResult}; + +/// +/// ## WhiteSpace +/// +/// Whitespace characters (e.g. spaces, tabs, etc.). +/// +#[ECMARef( + "WhiteSpace", + "https://www.ecma-international.org/ecma-262/5.1/#sec-7.2" +)] +#[derive(Debug, Spanned)] +pub struct WhiteSpace { + span: Span, +} + +impl WhiteSpace { + /// + /// Implementation matching Table 2 in [Section 7.2](https://262.ecma-international.org/5.1/#sec-7.2). + /// + fn is_whitespace(ch: &char) -> bool { + use MinorCategory::*; + + match ch { + '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{0020}' | '\u{00A0}' | '\u{FEFF}' => true, + c if c.get_minor_category() == Zs => true, + _ => false, + } + } +} + +impl Lex for WhiteSpace { + fn lex(input: &mut crate::utils::SourceIter) -> LexResult { + LexResult::Ok(Self { + span: capture_while(input, Self::is_whitespace)?, + }) + } + + fn peek(input: &crate::utils::SourceIter) -> bool { + input.peek().map(Self::is_whitespace).unwrap_or(false) + } +} + +#[cfg(test)] +mod tests { + +} \ No newline at end of file From 10c22f4799da9bcb47b0e2144be0be8c1f6fdecf Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Fri, 26 Jan 2024 18:15:09 +0000 Subject: [PATCH 16/39] Add deps, macros crate. --- Cargo.lock | 78 +------------- Cargo.toml | 3 - macros/Cargo.toml | 11 +- macros/src/lib.rs | 254 +--------------------------------------------- 4 files changed, 11 insertions(+), 335 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d0cfb88..af36520 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,15 +2,6 @@ # It is not intended for manual editing. 
version = 3 -[[package]] -name = "aho-corasick" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" -dependencies = [ - "memchr", -] - [[package]] name = "anyhow" version = "1.0.79" @@ -24,15 +15,13 @@ dependencies = [ "anyhow", "avjason-macros", "finl_unicode", - "lazy_static", - "regex", - "thiserror", ] [[package]] name = "avjason-macros" version = "0.1.0" dependencies = [ + "proc-macro2", "quote", "syn", ] @@ -43,23 +32,11 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "memchr" -version = "2.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" - [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -73,35 +50,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "regex" -version = "1.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" -dependencies = [ - 
"aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" - [[package]] name = "syn" version = "2.0.48" @@ -113,26 +61,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.56" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.56" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "unicode-ident" version = "1.0.12" diff --git a/Cargo.toml b/Cargo.toml index 1b56874..1c02b1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,9 +9,6 @@ edition = "2021" [dependencies] anyhow = "1.0.79" finl_unicode = "1.2.0" -lazy_static = "1.4.0" -regex = "1.10.2" -thiserror = "1.0.56" [dependencies.avjason-macros] path = "./macros" diff --git a/macros/Cargo.toml b/macros/Cargo.toml index 679a89f..df1cf4c 100644 --- a/macros/Cargo.toml +++ b/macros/Cargo.toml @@ -3,11 +3,12 @@ name = "avjason-macros" version = "0.1.0" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [lib] -proc-macro=true +proc-macro = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -quote = "1.0.34" -syn = { version = "2.0.45", features = ["full"] } +proc-macro2 = "1.0.78" +quote = "1.0.35" +syn = { version = "2.0.48", features = ["full"] } diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 5c83c7e..e78c9c4 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -1,253 +1,3 @@ //! -//! 
Utility macros for the main crate. -//! - -#![feature(proc_macro_diagnostic, iter_intersperse)] -mod utils; - -use proc_macro::{Diagnostic, Level, TokenStream}; -use quote::quote; -use syn::{spanned::Spanned, ItemStruct, Token}; -use utils::{get_struct_member_where_type, path_contains, NonEmptyStr}; - -/// -/// Parses the item, and arguments for the Ref* macros. -/// -/// Also provides useful errors. -/// -fn reference_macro(arg: TokenStream, target: TokenStream) -> Option<(syn::Item, Args)> -where - Args: syn::parse::Parse, -{ - // Only apply to items (Rust syntax). - let target: syn::Item = match syn::parse(target) { - Ok(item) => item, - Err(err) => { - Diagnostic::spanned( - err.span().unwrap(), - Level::Error, - "Cannot apply this to a non-item (e.g. struct, enum, type).", - ) - .emit(); - - return None; - } - }; - - let arg: Args = match syn::parse(arg) { - Ok(arg) => arg, - Err(err) => { - Diagnostic::spanned( - err.span().unwrap(), - Level::Error, - "Expected literal &str here.", - ) - .emit(); - - return None; - } - }; - - Some((target, arg)) -} - -/// -/// Reference to a part of the JSON5 spec. -/// -/// Adds a link to the part of the original spec. -/// -/// ### Example -/// ```ignore -/// use avjason_macros::SpecRef; -/// -/// /// -/// /// Whitespace characters that do not influence syntax. -/// /// -/// #[SpecRef("WhiteSpace")] -/// pub struct WhiteSpace { -/// /* Blah, blah, blah. */ -/// } -/// ``` -/// -#[proc_macro_attribute] -#[allow(non_snake_case)] -pub fn SpecRef(arg: TokenStream, target: TokenStream) -> TokenStream { - let Some((target, arg)): Option<(_, NonEmptyStr)> = reference_macro(arg, target) else { - return TokenStream::default(); - }; - - let link = format!( - "See the original spec: [**{0}**](https://spec.json5.org/#prod-{0}).", - arg.value() - ); - - quote! 
{ - #[doc = ""] - #[doc = "---"] - #[doc = #link] - #[doc = ""] - #target - } - .into() -} - -/// -/// Format for the ECMAScript spec reference, since their urls -/// make no sense. -/// -struct EcmaRef { - /// - /// Display text for the link. - /// - text: NonEmptyStr, - - /// - /// Comma seperator. - /// - _comma: Token![,], - - /// - /// Href for the link. - /// - href: NonEmptyStr, -} - -impl syn::parse::Parse for EcmaRef { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - Ok(Self { - text: input.parse()?, - _comma: input.parse()?, - href: input.parse()?, - }) - } -} - -/// Adds a link to the part of the original ECMAScript spec. -/// -/// ### Example -/// ```ignore -/// use avjason_macros::ECMARef; -/// -/// #[ECMARef("LineTermintor", "https://262.ecma-international.org/5.1/#sec-7.3")] -/// pub struct LineTerminator { -/// /* Blah, blah, blah */ -/// } -/// ``` -/// -/// -#[proc_macro_attribute] -#[allow(non_snake_case)] -pub fn ECMARef(arg: TokenStream, target: TokenStream) -> TokenStream { - let Some((target, arg)): Option<(_, EcmaRef)> = reference_macro(arg, target) else { - return TokenStream::default(); - }; - - let link = format!( - "See the original ECMAScript spec: [**{}**]({}).", - arg.text.value(), - arg.href.value(), - ); - - quote! { - #[doc = ""] - #[doc = "---"] - #[doc = #link] - #[doc = ""] - #target - } - .into() -} - -#[proc_macro_derive(Spanned)] -pub fn spanned(item: TokenStream) -> TokenStream { - let (st, en): (syn::Result, syn::Result) = - (syn::parse(item.clone()), syn::parse(item)); - - if let Ok(ref st) = st { - // Find first field with the `Span` type. - let Some(m) = get_struct_member_where_type(st, |ty| match ty { - syn::Type::Path(syn::TypePath { path, .. }) => path_contains(path, "Span"), - _ => false, - }) else { - Diagnostic::spanned( - st.span().unwrap(), - Level::Error, - "Need a field with type `Span` in it.", - ) - .emit(); - - return Default::default(); - }; - - let ident = &st.ident; - return quote! 
{ - impl crate::utils::Spanned for #ident { - fn span(&self) -> Span { - #m - } - } - } - .into(); - } - - if let Ok(en) = en { - let vars: Vec<_> = en - .variants - .iter() - .map(|var| match &var.fields { - syn::Fields::Named(_) => None, - syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed, .. }) => { - (unnamed.len() == 1).then_some(&var.ident) - } - syn::Fields::Unit => None, - }) - .collect(); - - if vars.iter().any(Option::is_none) { - Diagnostic::spanned( - en.span().unwrap(), - Level::Error, - "Enum variants can only be a single-element tuple.", - ) - .emit(); - - return Default::default(); - } - let ident = &en.ident; - // SAFETY: We've already checked above if any are none. - let vars = vars - .into_iter() - .map(|a| unsafe { a.unwrap_unchecked() }) - .map(|ident| { - quote! { - Self::#ident(inner) => inner.span() - } - }); - - return quote! { - impl crate::utils::Spanned for #ident { - fn span(&self) -> crate::utils::Span { - match self { - #(#vars),*, - _ => unreachable!() - } - } - } - } - .into(); - } - - // SAFETY: We check before if st is Ok and early-return, - // so this is safe. - // This is done since syn::ItemStruct doesn't impl `Debug` :( - let err = unsafe { st.unwrap_err_unchecked() }; - - Diagnostic::spanned( - err.span().unwrap(), - Level::Error, - "Expected either struct, or enum definition here.", - ) - .emit(); - - Default::default() -} +//! Macros for the main crate. +//! From 62537ea7fa4e728619d6f78b6aa047c3a849b7f6 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Fri, 26 Jan 2024 18:24:21 +0000 Subject: [PATCH 17/39] Blank Start. 
--- macros/src/utils.rs | 66 ---------- src/lex/line_terminator.rs | 97 -------------- src/lex/mod.rs | 36 ------ src/lex/utils.rs | 215 ------------------------------- src/lex/whitespace.rs | 52 -------- src/lib.rs | 5 - src/syntax/mod.rs | 0 src/utils/mod.rs | 255 ------------------------------------- src/utils/span.rs | 196 ---------------------------- 9 files changed, 922 deletions(-) delete mode 100644 macros/src/utils.rs delete mode 100644 src/lex/line_terminator.rs delete mode 100644 src/lex/mod.rs delete mode 100644 src/lex/utils.rs delete mode 100644 src/lex/whitespace.rs delete mode 100644 src/syntax/mod.rs delete mode 100644 src/utils/mod.rs delete mode 100644 src/utils/span.rs diff --git a/macros/src/utils.rs b/macros/src/utils.rs deleted file mode 100644 index 61a3a46..0000000 --- a/macros/src/utils.rs +++ /dev/null @@ -1,66 +0,0 @@ -//! -//! Utilities for the utility macros. -//! - -use std::ops::Deref; - -use proc_macro::{Diagnostic, Level, Span}; -use quote::quote; -use syn::punctuated::Punctuated; - -/// -/// A lit str, but a warning is displayed -/// if it is empty. -/// -pub struct NonEmptyStr(syn::LitStr); - -impl syn::parse::Parse for NonEmptyStr { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let lit: syn::LitStr = input.parse()?; - - if lit.value().is_empty() { - Diagnostic::spanned( - lit.span().unwrap(), - Level::Warning, - "This should not be empty.", - ) - .emit(); - } - - Ok(Self(lit)) - } -} - -impl Deref for NonEmptyStr { - type Target = syn::LitStr; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -pub fn path_contains(path: &syn::Path, st: &str) -> bool { - path.segments.iter().any(|seg| seg.ident == st) -} - -pub fn get_struct_member_where_type( - st: &syn::ItemStruct, - pred: impl Fn(&syn::Type) -> bool, -) -> Option { - let ident = match &st.fields { - syn::Fields::Named(syn::FieldsNamed { named, .. 
}) => named - .iter() - .find_map(|f| pred(&f.ty).then(|| f.ident.as_ref().unwrap().clone())), - syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed, .. }) => { - unnamed.iter().enumerate().find_map(|(i, f)| { - pred(&f.ty).then(|| syn::Ident::new(&i.to_string(), Span::call_site().into())) - }) - } - syn::Fields::Unit => None, - }?; - - // Unwrap as we should have valid syntax here. - Some(syn::parse2(quote! { - self.#ident - }).unwrap()) -} diff --git a/src/lex/line_terminator.rs b/src/lex/line_terminator.rs deleted file mode 100644 index 3e61b44..0000000 --- a/src/lex/line_terminator.rs +++ /dev/null @@ -1,97 +0,0 @@ -use avjason_macros::{ECMARef, Spanned}; - -use crate::utils::{SourceIter, Span, TryIntoSpan}; - -use super::{utils::capture_while, Lex, LexResult}; - -/// -/// ## LineTerminator -/// -/// Defined in [Section 7.3 Table 3](https://262.ecma-international.org/5.1/#sec-7.3). -/// Characters that end lines (single characters only, no `\r\n` here.) -/// -/// See [LineTerminatorSequence] for the version including `\r\n`. -/// -#[ECMARef("LineTerminator", "https://262.ecma-international.org/5.1/#sec-7.3")] -#[derive(Debug, Spanned)] -pub struct LineTerminator { - span: Span, -} - -impl LineTerminator { - fn is_line_terminator(ch: &char) -> bool { - matches!(ch, '\u{000A}' | '\u{000D}' | '\u{2028}' | '\u{2029}') - } -} - -impl Lex for LineTerminator { - fn lex(input: &mut SourceIter) -> LexResult { - if !Self::peek(input) { - return LexResult::Stop; - } - - LexResult::Ok(Self { - span: capture_while(input, Self::is_line_terminator)?, - }) - } - - fn peek(input: &SourceIter) -> bool { - input.peek().map(Self::is_line_terminator).unwrap_or(false) - } -} - -/// -/// ## LineTerminatorSequence -/// -/// All accepted line endings, including `\r\n`. 
-/// -#[ECMARef( - "LineTerminatorSequence", - "https://262.ecma-international.org/5.1/#sec-7.3" -)] -#[derive(Debug, Spanned)] -pub enum LineTerminatorSequence { - LF(Span), - CR(Span), - LS(Span), - PS(Span), - CRLF(Span), -} - -impl Lex for LineTerminatorSequence { - fn lex(input: &mut SourceIter) -> LexResult { - if !Self::peek(input) { - return LexResult::Stop; - } - - if let Some(ch) = input.peek() { - return match ch { - '\u{000A}' => LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))), - '\u{000D}' => { - if input.peek2().map(|n| n == &'\u{000A}').unwrap_or(false) { - let start = input.next().unwrap().0; - let end = input.next().unwrap().0; - LexResult::Ok(Self::CRLF(TryIntoSpan::try_into_span(start..=end).unwrap())) - } else { - LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))) - } - } - '\u{2028}' => LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))), - '\u{2029}' => LexResult::Ok(Self::CR(Span::single_char(input.next().unwrap().0))), - _ => LexResult::Stop, - } - } - - LexResult::Stop - } - - fn peek(input: &SourceIter) -> bool { - input - .peek() - .map(LineTerminator::is_line_terminator) - .unwrap_or(false) - } -} - -#[cfg(test)] -mod tests {} diff --git a/src/lex/mod.rs b/src/lex/mod.rs deleted file mode 100644 index b2b8994..0000000 --- a/src/lex/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -//! -//! ## Step 1: Lexing -//! -//! This process involves converting source code -//! into what's known as a [lexical grammar](https://en.wikipedia.org/wiki/Lexical_grammar). -//! -//! This step will eventually yield tokens, which can then be -//! checked against syntax. -//! -//! This lexical grammar is defined in the [JSON5 specification](https://spec.json5.org/#lexical-grammar). -//! 
- -pub mod utils; -pub mod whitespace; -pub mod line_terminator; - -use avjason_macros::{SpecRef, Spanned}; - -use self::{whitespace::WhiteSpace, line_terminator::LineTerminator}; - -pub(crate) use utils::{LexError, Lex, LexResult}; - -/// -/// ## JSON5InputElement -/// -/// All possible acceptable things our lexer accepts. -/// * A superset of valid tokens: Valid Tokens + { Comments, Whitespace, LineTerminator, }. -/// -#[SpecRef("JSON5InputElement")] -#[derive(Debug, Spanned)] -pub(crate) enum InputElement { - WhiteSpace(WhiteSpace), - LineTerminator(LineTerminator), - // Comment(Comment), - // Token(Token), -} diff --git a/src/lex/utils.rs b/src/lex/utils.rs deleted file mode 100644 index e45ad63..0000000 --- a/src/lex/utils.rs +++ /dev/null @@ -1,215 +0,0 @@ -//! -//! Utilities for lexing. -//! - -use std::fmt::Debug; - -use thiserror::Error; - -use crate::utils::{SourceIter, Span, TryIntoSpan, Spanned}; - -/// -/// Errors that can occur during lexing. -/// -#[derive(Debug, Clone, Error, PartialEq, PartialOrd, Eq, Ord, Hash)] -pub(crate) enum LexError { - /// - /// Expected a specific character/sub-token at a location. - /// - #[error("Expected `{token}` {extra}\n\tat {position}")] - Expected { - token: String, - position: String, - extra: String, - }, - - /// - /// Invalid character at acertain position. - /// - #[error("Unexpected `{token}` {extra}\n\tat {position}")] - Unexpected { - token: String, - position: String, - extra: String, - }, -} - -/// -/// Convenience type for lexer result types. -/// -#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Hash)] -pub(crate) enum LexResult { - /// - /// No lexical grammars were violated, but the - /// characters cannot be lexed into this token, - /// so skip trying to lex this token. - /// - /// It's important to note that the character iterator - /// *has not* advanced. - /// - Stop, - - /// - /// Successful lex result according to - /// our lexical grammar. - /// - /// Character iterator has been advanced. 
- /// - Ok(T), - - /// - /// Some grammatical rules have violated whilst lexing - /// -- stop lexing immeadiately. - /// - Err(LexError), -} - -/// -/// Implementing [std::ops::Try], though experimental, allows the leverage of -/// the `?` operator for our convinence. -/// -/// Now, we can stop halfway through if: -/// * we get an error ([LexResult::Err]), or -/// * we know that we don't need to continue the rest of the computation, -/// but don't have an error ([LexResult::Stop]). -/// -impl std::ops::Try for LexResult { - type Output = T; - - type Residual = LexResult; - - fn from_output(output: Self::Output) -> Self { - Self::Ok(output) - } - - fn branch(self) -> std::ops::ControlFlow { - match self { - LexResult::Stop => std::ops::ControlFlow::Break(LexResult::Stop), - LexResult::Ok(o) => std::ops::ControlFlow::Continue(o), - LexResult::Err(err) => std::ops::ControlFlow::Break(LexResult::Err(err)), - } - } -} - -impl std::ops::FromResidual for LexResult { - fn from_residual(residual: ::Residual) -> Self { - match residual { - LexResult::Stop => Self::Stop, - LexResult::Err(err) => Self::Err(err), - LexResult::Ok(_) => unreachable!(), - } - } -} - -impl From> for LexResult { - fn from(value: Option) -> Self { - match value { - Some(s) => Self::Ok(s), - None => Self::Stop, - } - } -} - -/// -/// ## Lexing -/// -/// Attempts to lex characters into a proto-token. -/// -/// Uses [LexResult] as control flow. -/// -pub(crate) trait Lex: Sized + Debug { - fn lex(input: &mut SourceIter) -> LexResult; - fn peek(input: &SourceIter) -> bool; -} - -/// -/// Utility implementation for -/// optional lexing. -/// -impl Lex for Option -where - T: Lex, -{ - fn lex(input: &mut SourceIter) -> LexResult { - if !T::peek(input) { - return LexResult::Ok(None); - } - - match T::lex(input) { - LexResult::Ok(t) => LexResult::Ok(Some(t)), - - // Here, we allow `Stop`-s since they indicate that T is not present. - // So, it's a valid parse -- we have nothing. 
- LexResult::Stop => LexResult::Ok(None), - - // Error variant indicates that something looking like T - // was present, but it doesn't conform to T's grammar. - LexResult::Err(err) => LexResult::Err(err), - } - } - - fn peek(_: &SourceIter) -> bool { - // Checking to see if an optional thing is present is - // virtual insanity -- it's *always true*. - unimplemented!("Tautologically useless!") - } -} - -/// -/// Keeps collecting characters (into a span) -/// while a condition is true. -/// -pub(crate) fn capture_while( - input: &mut SourceIter, - pred: impl Fn(&char) -> bool, -) -> LexResult { - let pred = &pred; - if !input.peek().map(pred).unwrap_or(false) { - return LexResult::Stop; - } - // Unwrap ok as we know (a) char exists, and (b) is whitespace. - let start = input.next().unwrap().0; - let mut end = start + 1; - while let Some(ch) = input.peek() { - if pred(ch) { - // Unwrap ok for same reason as above. - end = input.next().unwrap().0; - } else { - break; - } - } - - LexResult::Ok(TryIntoSpan::try_into_span(start..=end).unwrap()) -} - -#[cfg(test)] -mod tests { - use crate::lex::utils::LexResult; - - #[test] - fn lex_result() { - fn dummy_lexer(input: usize) -> LexResult { - match input % 3 { - 0 => LexResult::Ok(input), - 1 => LexResult::Stop, - 2 => LexResult::Err(crate::lex::utils::LexError::Expected { - token: "number that is {0, 1} mod 3".to_string(), - position: "nope".to_string(), - extra: "".to_string(), - }), - _ => unreachable!(), - } - } - - fn dummy(input: [usize; 3]) -> LexResult<[usize; 3]> { - let first = dummy_lexer(input[0])?; - let second = dummy_lexer(input[1])?; - let third = dummy_lexer(input[2])?; - - LexResult::Ok([first, second, third]) - } - - assert_eq!(dummy([1, 2, 3]), LexResult::Stop); // Stops on 3n + 1 - assert!(matches!(dummy([0, 2, 3]), LexResult::Err(_))); // Error on 3n + 2 - } -} diff --git a/src/lex/whitespace.rs b/src/lex/whitespace.rs deleted file mode 100644 index cd0a2e6..0000000 --- a/src/lex/whitespace.rs +++ 
/dev/null @@ -1,52 +0,0 @@ -use avjason_macros::{ECMARef, Spanned}; -use finl_unicode::categories::{MinorCategory, CharacterCategories}; - -use crate::{utils::Span, lex::utils::capture_while}; - -use super::{Lex, LexResult}; - -/// -/// ## WhiteSpace -/// -/// Whitespace characters (e.g. spaces, tabs, etc.). -/// -#[ECMARef( - "WhiteSpace", - "https://www.ecma-international.org/ecma-262/5.1/#sec-7.2" -)] -#[derive(Debug, Spanned)] -pub struct WhiteSpace { - span: Span, -} - -impl WhiteSpace { - /// - /// Implementation matching Table 2 in [Section 7.2](https://262.ecma-international.org/5.1/#sec-7.2). - /// - fn is_whitespace(ch: &char) -> bool { - use MinorCategory::*; - - match ch { - '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{0020}' | '\u{00A0}' | '\u{FEFF}' => true, - c if c.get_minor_category() == Zs => true, - _ => false, - } - } -} - -impl Lex for WhiteSpace { - fn lex(input: &mut crate::utils::SourceIter) -> LexResult { - LexResult::Ok(Self { - span: capture_while(input, Self::is_whitespace)?, - }) - } - - fn peek(input: &crate::utils::SourceIter) -> bool { - input.peek().map(Self::is_whitespace).unwrap_or(false) - } -} - -#[cfg(test)] -mod tests { - -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index e353320..d437de8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,8 +4,3 @@ //! //! A parser for [JSON5](https://json5.org/). //! -#![feature(iter_map_windows, associated_type_defaults, specialization, try_trait_v2)] - -pub mod utils; -pub mod lex; -pub mod syntax; \ No newline at end of file diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs deleted file mode 100644 index e69de29..0000000 diff --git a/src/utils/mod.rs b/src/utils/mod.rs deleted file mode 100644 index 61e6cbd..0000000 --- a/src/utils/mod.rs +++ /dev/null @@ -1,255 +0,0 @@ -//! -//! Utilities. -//! 
- -pub mod span; -use std::{ - fmt::Debug, - fs, io, - ops::RangeBounds, - path::{Path, PathBuf}, -}; - -use anyhow::anyhow; -pub use span::*; - -#[derive(Debug)] -pub struct SourceFile { - path: PathBuf, - contents: Vec, - line_starts: Vec, -} - -impl SourceFile { - /// - /// Splits lines by ECMA-abiding line endings. - /// - fn split_lines(src: &str) -> impl Iterator + '_ { - src.chars() - .enumerate() - .map_windows(|[(a_i, a), (b_i, b)]| { - // Implementing https://262.ecma-international.org/5.1/#sec-7.3 - Some(match (*a, *b) { - ('\n', _) => a_i + 1, - ('\r', '\n') => b_i + 1, - ('\r', _) => a_i + 1, - ('\u{2028}', _) => a_i + 1, - ('\u{2029}', _) => a_i + 1, - _ => return None, - }) - }) - .flatten() - .chain(std::iter::once(src.len())) - } - - /// - /// Returns a string representing a [Loc] in ${FILE}:${LINE}:${COLUMN} format. - /// - pub fn file_line_column(&self, loc: &Loc) -> Option { - let Some((ln, col)) = self - .line_starts - .iter() - .enumerate() - .find(|(_, i)| loc.index < **i) - .map(|(ln, len)| (ln, len - loc.index)) - else { - return None; - }; - - Some(format!("{}:{ln}:{col}", &self.path.to_str()?)) - } - - /// - /// Returns the original source code at a particular [Span]. - /// - pub fn source_at(&self, span: impl RangeBounds) -> Option { - let span = S::try_into_span(span)?; - if span.end.index > self.contents.len() { - return None; - } - - if span.start.index >= span.end.index { - return None; - } - - Some( - self.contents[span.start.index..span.end.index] - .iter() - .collect(), - ) - } - - /// - /// Returns the original source code at a particular [Span]. 
- /// - pub fn source_at_span(&self, span: Span) -> Option { - if span.end.index > self.contents.len() { - return None; - } - - Some( - self.contents[span.start.index..span.end.index] - .iter() - .collect(), - ) - } - - #[cfg(test)] - pub(crate) fn dummy_file(path: impl AsRef, contents: impl ToString) -> Self { - let contents = contents.to_string(); - let line_lengths = Self::split_lines(&contents).collect(); - Self { - path: path.as_ref().to_owned(), - contents: contents.chars().collect(), - line_starts: line_lengths, - } - } - - /// - /// Attempts to read a [SourceFile] from a file. - /// - pub fn load_file(path: impl AsRef) -> io::Result { - let path = path.as_ref(); - let contents = fs::read_to_string(path)?; - let line_starts = Self::split_lines(&contents).collect(); - - Ok(Self { - path: path.to_owned(), - contents: contents.chars().collect(), - line_starts, - }) - } - - pub(crate) fn iter(&self) -> SourceIter { - SourceIter::new(self) - } -} - -#[derive(Clone)] -pub struct SourceIter<'a> { - file: &'a SourceFile, - inner: &'a Vec, - index: usize, -} - -impl<'a> std::fmt::Debug for SourceIter<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SourceIter") - .field("left", &String::from_iter(&self.inner[self.index..])) - .field("index", &self.index) - .finish() - } -} - -impl<'a> SourceIter<'a> { - pub(crate) fn new(file: &'a SourceFile) -> Self { - Self { - file, - inner: &file.contents, - index: 0, - } - } - - pub(crate) fn source_at(&self, span: Span) -> String { - (self.inner[span.start.index..span.end.index]) - .iter() - .collect() - } - - pub(crate) fn peek(&self) -> Option<&char> { - self.inner.get(self.index) - } - - pub(crate) fn peek2(&self) -> Option<&char> { - self.inner.get(self.index + 1) - } - - pub(crate) fn fork(&self) -> Self { - self.clone() - } - - pub(crate) fn ahead(&self, range: impl RangeBounds) -> Option { - let abs_start = self.index - + match range.start_bound() { - 
std::ops::Bound::Included(d) => *d, - std::ops::Bound::Excluded(d) => (*d) + 1, - std::ops::Bound::Unbounded => 0, - }; - - let abs_end = self.index - + match range.end_bound() { - std::ops::Bound::Included(d) => *d + 1, - std::ops::Bound::Excluded(d) => *d, - std::ops::Bound::Unbounded => self.inner.len(), - }; - - if !(abs_start < self.inner.len() && abs_end <= self.inner.len()) { - return None; - } - - Some(self.inner[abs_start..abs_end].iter().collect()) - } - - pub(crate) fn relative_match( - &self, - range: impl RangeBounds, - pred: impl Fn(&char) -> bool, - ) -> bool { - let abs_start = self.index - + match range.start_bound() { - std::ops::Bound::Included(d) => *d, - std::ops::Bound::Excluded(d) => (*d) + 1, - std::ops::Bound::Unbounded => 0, - }; - let abs_end = self.index - + match range.end_bound() { - std::ops::Bound::Included(d) => *d + 1, - std::ops::Bound::Excluded(d) => *d, - std::ops::Bound::Unbounded => self.inner.len(), - }; - - if !(abs_start < self.inner.len() && abs_end <= self.inner.len()) { - return false; - } - - let s = &self.inner[abs_start..abs_end]; - s.iter().all(pred) - } - - pub(crate) fn offset(&mut self, offset: usize) { - self.index += offset; - } - - pub(crate) fn advance_to(&mut self, other: Self) { - self.index = other.index; - } - - pub(crate) fn eof(&self) -> bool { - self.index >= self.inner.len() - } -} - -impl<'a> Iterator for SourceIter<'a> { - type Item = (Loc, char); - - fn next(&mut self) -> Option { - let ch = self.inner.get(self.index).copied()?; - let loc = Loc { index: self.index }; - self.index += 1; - - Some((loc, ch)) - } -} - -#[cfg(test)] -mod tests { - use super::SourceFile; - - #[test] - fn source_file() { - let src = SourceFile::dummy_file("example.txt", "I am a\ngood file!\n\nGimme a pet!"); - println!("{src:?}"); - - println!("{:?}", src.source_at(7..11)) - } -} diff --git a/src/utils/span.rs b/src/utils/span.rs deleted file mode 100644 index 6160d76..0000000 --- a/src/utils/span.rs +++ /dev/null @@ 
-1,196 +0,0 @@ -//! -//! Helpers for finding the locations of things. -//! - -use std::ops::{Add, RangeBounds}; - -/// -/// Represents a character's location in source code. -/// -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct Loc { - pub(crate) index: usize, -} - -impl Add for Loc -where - Rhs: Copy, - usize: Add, -{ - type Output = Loc; - - fn add(self, rhs: Rhs) -> Self::Output { - Self { - index: self.index + rhs, - } - } -} - -/// -/// Represents a token's position in the code. -/// -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct Span { - /// - /// Lower bound. - /// - pub(crate) start: Loc, - - /// - /// Exclusive upper bound. - /// - pub(crate) end: Loc, -} - -impl Span { - /// - /// Returns the length of this span in characters. - /// - pub fn len(&self) -> usize { - self.end.index - self.start.index - } - - /// - /// Returns whether this [Span] contains nothing. - /// - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// - /// Allows you to find a smaller [Span] within this span. - /// - /// ```ignore - /// // Find location of the word "pumpkin" - /// let pumpkin: Span = find_word("pumpkin"); - /// - /// // Gets the Span corresponding to the "pump" substring of "pumpkin". 
- /// let pump = pumpkin.subspan(..4); - /// ``` - /// - pub fn subspan(&self, bounds: R) -> Option - where - R: RangeBounds, - { - let start = match bounds.start_bound() { - std::ops::Bound::Included(i) => *i, - std::ops::Bound::Excluded(_) => unimplemented!("Excluded lower bounds: impossible."), - std::ops::Bound::Unbounded => 0, - }; - - let end = match bounds.end_bound() { - std::ops::Bound::Included(i) => *i + 1, - std::ops::Bound::Excluded(i) => *i, - std::ops::Bound::Unbounded => self.len(), - }; - - if start > 0 || start > end { - return None; - } - - if end > self.len() { - return None; - } - - Some(Span { - start: Loc { index: start }, - end: Loc { index: end }, - }) - } - - pub fn single_char(loc: Loc) -> Span { - Self { - start: loc, - end: loc + 1, - } - } - - pub fn combine(self, iter: impl IntoIterator) -> Self { - let start = self; - let last = iter.into_iter().last(); - - if let Some(end) = last { - Self { - start: start.start, - end: end.end, - } - } else { - start - } - } -} - -/// -/// Convenience converter trait. -/// -/// ### Examples -/// ``` -/// use avjason::utils::TryIntoSpan; -/// -/// fn test(span: S) { -/// let span = span.into_span(); -/// // TODO: Do stuff with `s`... 
-/// } -/// ``` -/// -pub trait TryIntoSpan { - fn try_into_span(range: impl RangeBounds) -> Option; -} - -impl TryIntoSpan for Loc { - fn try_into_span(range: impl RangeBounds) -> Option { - let start = range.start_bound(); - let end = range.end_bound(); - - let start = match start { - std::ops::Bound::Included(Loc { index: i }) => *i, - std::ops::Bound::Excluded(_) => unimplemented!("Not possible: excluded lower bound."), - std::ops::Bound::Unbounded => 0, - }; - - let end = match end { - std::ops::Bound::Included(Loc { index: i }) => *i + 1, - std::ops::Bound::Excluded(Loc { index: i }) => *i, - std::ops::Bound::Unbounded => return None, - }; - - Some(Span { - start: Loc { index: start }, - end: Loc { index: end }, - }) - } -} - -impl TryIntoSpan for usize { - fn try_into_span(range: impl RangeBounds) -> Option { - let start = range.start_bound(); - let end = range.end_bound(); - - let start = match start { - std::ops::Bound::Included(i) => *i, - std::ops::Bound::Excluded(_) => unimplemented!("Not possible: excluded lower bound."), - std::ops::Bound::Unbounded => 0, - }; - - let end = match end { - std::ops::Bound::Included(i) => *i + 1, - std::ops::Bound::Excluded(i) => *i, - std::ops::Bound::Unbounded => return None, - }; - - Some(Span { - start: Loc { index: start }, - end: Loc { index: end }, - }) - } -} - -pub trait Spanned { - fn span(&self) -> Span; -} - -impl Spanned for Span { - fn span(&self) -> Span { - *self - } -} \ No newline at end of file From 12dd99bbcb7b0f35971c553472cc97e46df5eb70 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Fri, 26 Jan 2024 23:15:51 +0000 Subject: [PATCH 18/39] Add reference macros. 
--- macros/src/lib.rs | 95 +++++++++++++++++++++- macros/src/utils.rs | 186 ++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 16 ++++ 3 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 macros/src/utils.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index e78c9c4..91c18da 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -1,3 +1,96 @@ //! //! Macros for the main crate. -//! +//! + +mod utils; + +use proc_macro::TokenStream as Tokens; +use quote::ToTokens; +use syn::parse_macro_input; +use utils::{get_item_attrs, ECMARef, JSON5Ref, ToRustdoc}; + +/// +/// ## SpecRef +/// +/// Allows easy reference of the **JSON5** specification. +/// +/// This macro will add an additional section at the top of the Rustdoc +/// for the item attached, linking to the relevant section in the specification. +/// +/// ### Example +/// +/// ```ignore +/// use crate::SpecRef; +/// +/// // With custom title. +/// #[SpecRef("Number", "JSON5Number")] +/// struct Number; +/// +/// // Without custom title. +/// #[SpecRef("JSON5String")] +/// struct LitString; +/// ``` +/// +#[allow(non_snake_case)] +#[proc_macro_attribute] +pub fn SpecRef(params: Tokens, target: Tokens) -> Tokens { + let mut target: syn::Item = parse_macro_input!(target); + let params: JSON5Ref = parse_macro_input!(params); + let attrs = params.to_rustdoc(); + + let Some(original_attrs) = get_item_attrs(&mut target) else { + return syn::Error::new_spanned(target, "Cannot add spec ref to this item.") + .into_compile_error() + .into(); + }; + + // Prepend our new documentation to the start of + // the attribute macros. + *original_attrs = attrs + .into_iter() + .chain(original_attrs.iter().cloned()) + .collect(); + + target.into_token_stream().into() +} + +/// +/// ## ECMARef +/// +/// Allows easy reference of the **ECMAScript** specification. 
+/// +/// This macro will add an additional section at the top of the Rustdoc +/// for the item attached, linking to the relevant section in the specification. +/// +/// ### Example +/// +/// ```ignore +/// use crate::ECMARef; +/// +/// // You must always include an acompanying URL. +/// #[ECMARef("NullLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.1")] +/// struct LitNull; +/// ``` +/// +#[allow(non_snake_case)] +#[proc_macro_attribute] +pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { + let mut target: syn::Item = parse_macro_input!(target); + let params: ECMARef = parse_macro_input!(params); + let attrs = params.to_rustdoc(); + + let Some(original_attrs) = get_item_attrs(&mut target) else { + return syn::Error::new_spanned(target, "Cannot add spec ref to this item.") + .into_compile_error() + .into(); + }; + + // Prepend our new documentation to the start of + // the attribute macros. + *original_attrs = attrs + .into_iter() + .chain(original_attrs.iter().cloned()) + .collect(); + + target.into_token_stream().into() +} diff --git a/macros/src/utils.rs b/macros/src/utils.rs new file mode 100644 index 0000000..a6307e2 --- /dev/null +++ b/macros/src/utils.rs @@ -0,0 +1,186 @@ +use proc_macro2::Span; +use syn::{punctuated::Punctuated, Token}; + +/// +/// Creates lines of Rustdoc from &self. +/// +/// ### Example +/// ```ignore +/// use proc_macro2::Span; +/// +/// let boolean_lit = ECMARef { +/// name: syn::LitStr::new("BooleanLiteral", Span::call_site()), +/// href: syn::LitStr::new("https://262.ecma-international.org/5.1/#sec-7.8.2", Span::call_site()) +/// }; +/// +/// boolean_lit.to_rustdoc() +/// ``` +/// +/// would return: +/// +/// ```ignore +/// #[doc = "## BooleanLiteral"] +/// #[doc = "See the official [ECMAScript specification](https://262.ecma-international.org/5.1/#sec-7.8.2)."] +/// #[doc = "***"] +/// ``` +/// +pub trait ToRustdoc { + fn to_rustdoc(&self) -> impl IntoIterator; +} + +/// +/// Produces a line of rust doc. 
+/// +/// ### Example +/// +/// ```ignore +/// rustdoc_line("Ridicule!") +/// ``` +/// +/// will produce: +/// +/// ```ignore +/// #[doc = "Ridicule!"] +/// ``` +/// +fn rustdoc_line(st: impl ToString) -> syn::Attribute { + syn::Attribute { + pound_token: Default::default(), + style: syn::AttrStyle::Outer, + bracket_token: Default::default(), + meta: syn::Meta::NameValue(syn::MetaNameValue { + path: syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter([syn::PathSegment { + ident: syn::Ident::new("doc", Span::call_site()), + arguments: syn::PathArguments::None, + }]), + }, + eq_token: Default::default(), + value: syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(syn::LitStr::new(&st.to_string(), Span::call_site())), + }), + }), + } +} + +/// +/// Represents a reference to the ECMAScript specification. +/// +pub struct ECMARef { + name: syn::LitStr, + href: syn::LitStr, +} + +impl syn::parse::Parse for ECMARef { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let first: syn::LitStr = input.parse()?; + let _: Token![,] = input.parse()?; + let second: syn::LitStr = input.parse()?; + + Ok(Self { + name: first, + href: second, + }) + } +} + +impl ToRustdoc for ECMARef { + fn to_rustdoc(&self) -> impl IntoIterator { + let Self { name, href } = self; + let (name, href) = (name.value(), href.value()); + [ + format!("## {name}"), + format!("See more on the [ECMAScript specification]({href})."), + "***".to_string(), + ] + .into_iter() + .map(rustdoc_line) + } +} + +/// +/// Represents a reference to the JSON5 specification. 
+/// +/// ### Example +/// +/// ```ignore +/// #[JSON5Ref("Null", "JSON5Null")] // (a) +/// #[JSON5Ref("JSON5Identifier")] // (b) +/// ``` +/// +/// would yield: +/// +/// ```ignore +/// // (a) +/// JSON5Ref { +/// name: Some(syn::LitStr::new("Null", _)), +/// id: syn::LitStr::new("JSON5Null", _), +/// } +/// +/// // (b) +/// JSON5Ref { +/// name: None, +/// id: syn::LitStr::new("JSON5Identifier", _), +/// } +/// ``` +/// +pub struct JSON5Ref { + name: Option, + id: syn::LitStr, +} + +impl syn::parse::Parse for JSON5Ref { + fn parse(input: syn::parse::ParseStream) -> syn::Result { + let first: syn::LitStr = input.parse()?; + + if input.peek(Token![,]) { + let _: Token![,] = input.parse()?; + let second: syn::LitStr = input.parse()?; + + return Ok(Self { + name: Some(first), + id: second, + }); + } + + Ok(Self { name: None, id: first }) + } +} + +impl ToRustdoc for JSON5Ref { + fn to_rustdoc(&self) -> impl IntoIterator { + let Self { name, id } = self; + let (name, id) = (name.as_ref().map(|s| s.value()), id.value()); + [ + format!("## {}", name.as_ref().unwrap_or(&id)), + format!("See more on the [JSON5 specification](https://spec.json5.org/#prod-{id})."), + "***".to_string(), + ] + .into_iter() + .map(rustdoc_line) + } +} + +pub fn get_item_attrs(item: &mut syn::Item) -> Option<&mut Vec> { + match item { + syn::Item::Const(syn::ItemConst { ref mut attrs, .. }) => Some(attrs), + syn::Item::Enum(syn::ItemEnum { ref mut attrs, .. }) => Some(attrs), + syn::Item::ExternCrate(syn::ItemExternCrate { ref mut attrs, .. }) => Some(attrs), + syn::Item::Fn(syn::ItemFn { ref mut attrs, .. }) => Some(attrs), + syn::Item::ForeignMod(syn::ItemForeignMod { ref mut attrs, .. }) => Some(attrs), + syn::Item::Impl(syn::ItemImpl { ref mut attrs, .. }) => Some(attrs), + syn::Item::Macro(syn::ItemMacro { ref mut attrs, .. }) => Some(attrs), + syn::Item::Mod(syn::ItemMod { ref mut attrs, .. }) => Some(attrs), + syn::Item::Static(syn::ItemStatic { ref mut attrs, .. 
}) => Some(attrs), + syn::Item::Struct(syn::ItemStruct { ref mut attrs, .. }) => Some(attrs), + syn::Item::Trait(syn::ItemTrait { ref mut attrs, .. }) => Some(attrs), + syn::Item::TraitAlias(syn::ItemTraitAlias { ref mut attrs, .. }) => Some(attrs), + syn::Item::Type(syn::ItemType { ref mut attrs, .. }) => Some(attrs), + syn::Item::Union(syn::ItemUnion { ref mut attrs, .. }) => Some(attrs), + syn::Item::Use(syn::ItemUse { ref mut attrs, .. }) => Some(attrs), + syn::Item::Verbatim(_) => None, + _ => None, + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index d437de8..356cca7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,3 +4,19 @@ //! //! A parser for [JSON5](https://json5.org/). //! + +mod macro_test { + use avjason_macros::{ECMARef, SpecRef}; + + #[SpecRef("Identifier", "JSON5Identifier")] + #[allow(unused)] + struct Identifier; + + #[SpecRef("JSON5Null")] + #[allow(unused)] + struct Null; + + #[ECMARef("BooleanLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.2")] + #[allow(unused)] + struct LitBool; +} \ No newline at end of file From 81de3c74ab379a764991386f1d6c0b015aa13389 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 27 Jan 2024 01:08:14 +0000 Subject: [PATCH 19/39] Add utilities for tracability, and source code. --- src/common/location.rs | 183 +++++++++++++++++++++++++++++++++++++++++ src/common/mod.rs | 9 ++ src/common/source.rs | 129 +++++++++++++++++++++++++++++ src/lib.rs | 9 ++ 4 files changed, 330 insertions(+) create mode 100644 src/common/location.rs create mode 100644 src/common/mod.rs create mode 100644 src/common/source.rs diff --git a/src/common/location.rs b/src/common/location.rs new file mode 100644 index 0000000..6b972d9 --- /dev/null +++ b/src/common/location.rs @@ -0,0 +1,183 @@ +//! +//! Things that help trace errors and tokens: [Span] and [Loc]. +//! + +use std::ops::{Add, Bound, Range, RangeBounds}; + +/// +/// Represents the index of a character in source code. 
+/// +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Loc(pub(crate) usize); + +impl From for Loc { + fn from(value: usize) -> Self { + Self(value) + } +} + +impl From for usize { + fn from(value: Loc) -> Self { + value.0 + } +} + +impl Add for Loc +where + usize: Add, +{ + type Output = Loc; + + fn add(self, rhs: A) -> Self::Output { + Self(self.0 + rhs) + } +} + +/// +/// Represents the location of a token in source code. +/// +pub struct Span { + /// + /// Start index: inclusive lower bound. + /// + pub(crate) start: Loc, + + /// + /// End index: exclusive upper bound. + /// + pub(crate) end: Loc, +} + +impl Span { + /// + /// Returns Some(subspan), given the relative indexes from the start of this span, + /// returning None if the end index is out of this span's bounds. + /// + pub fn subspan(&self, indexes: impl RangeBounds) -> Option { + let start = match indexes.start_bound() { + Bound::Included(included) => self.start + included, + Bound::Excluded(excluded) => self.start + (excluded + 1), + Bound::Unbounded => self.start, + }; + + let end = match indexes.end_bound() { + Bound::Included(included) => self.start + (included + 1), + Bound::Excluded(excluded) => self.start + excluded, + Bound::Unbounded => self.end, + }; + + if end > self.end { + // Not a subspan, since we overflow the end. + return None; + } + + Some(Self { start, end }) + } + + /// + /// Use this [Span] as a start, taking the range between this span's start, + /// and the end oi the last of the passed in iterator (including itself). + /// + pub fn combine(self, others: impl IntoIterator) -> Span { + let Self { start, end } = self; + + // Take the end bound of the last Span, + // if others is not empty, and use that instead. + let last = others.into_iter().last(); + if let Some(Self { end, .. 
}) = last { + return Self { start, end }; + } + + Self { start, end } + } + + /// + /// Return the start and end bounds as a Rust [Range] + /// + pub fn as_range(&self) -> Range { + self.start.0..self.end.0 + } +} + +/// +/// Utility trait for handling multiple spans. +/// +pub trait SpanIter: Sized + IntoIterator { + /// + /// Combine all of this iterator's spans, + /// resulting in a [Span] encompassing all + /// passed in [Span]s (assuming this iter is in ascending order). + /// + fn combine(self) -> Option; +} + +impl> SpanIter for Iter { + fn combine(self) -> Option { + let mut iter = self.into_iter(); + iter.next().map(|s| s.combine(iter)) + } +} + +impl RangeBounds for Span { + fn start_bound(&self) -> Bound<&usize> { + Bound::Included(&self.start.0) + } + + fn end_bound(&self) -> Bound<&usize> { + Bound::Excluded(&self.end.0) + } +} + +impl RangeBounds for Span { + fn start_bound(&self) -> Bound<&Loc> { + Bound::Included(&self.start) + } + + fn end_bound(&self) -> Bound<&Loc> { + Bound::Excluded(&self.end) + } +} + +#[cfg(test)] +mod tests { + use crate::common::source::{DummySource, Source, ToSpan}; + + #[test] + fn subspan() { + let source = DummySource::new("testthing."); + let span = (0..9).to_span(&source); + + // Valid + assert_eq!(span.subspan(1..).map(|s| s.as_range()), Some(1..9)); + assert_eq!(span.subspan(1..2).map(|s| s.as_range()), Some(1..2)); + assert_eq!(span.subspan(..5).map(|s| s.as_range()), Some(0..5)); + assert_eq!(span.subspan(..).map(|s| s.as_range()), Some(0..9)); + + // Invalid + assert_eq!(span.subspan(..17).map(|s| s.as_range()), None); + assert_eq!(span.subspan(144..1343).map(|s| s.as_range()), None); + } + + #[test] + fn source_at() { + let source = DummySource::new("testthing."); + let span = (0..9).to_span(&source); + + assert_eq!( + span.subspan(..4).and_then(|s| source.source_at(s)), + Some("test".to_string()) + ); + + assert_eq!( + span.subspan(4..).and_then(|s| source.source_at(s)), + Some("thing".to_string()) + ); + + 
assert_eq!( + span.subspan(..4).and_then(|s| source.source_at(s)), + Some("test".to_string()) + ); + + assert_eq!(span.subspan(49..).and_then(|s| source.source_at(s)), None); + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 0000000..8f7a98f --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,9 @@ +//! +//! Common utilities across lexing and syntax-parsing. +//! + +pub mod source; +pub mod location; + +pub use location::*; +pub use source::*; \ No newline at end of file diff --git a/src/common/source.rs b/src/common/source.rs new file mode 100644 index 0000000..3dbab29 --- /dev/null +++ b/src/common/source.rs @@ -0,0 +1,129 @@ +//! +//! Sources of source code. +//! + +use std::ops::{Bound, Range, RangeBounds}; + +/// +/// Generic idea of source code: could be a file, +/// or a simple string. +/// +/// This trait aims to abstract the gathering of the source +/// text and focus on the Source -> Lexing -> Syntax -> AST +/// pipeline. +/// +pub trait Source { + /// + /// A friendly appropriate format to point + /// to a location of a token. + /// + /// This could be line-column information, or simply an index. + /// + type Location; + + /// + /// Find the location of this span, + /// and put it into a friendly appropriate format. + /// + fn locate(&self, span: Span) -> Option; + + /// + /// Returns the start and (exclusive) end index of this source. + /// + fn bounds(&self) -> Range; + + /// + /// Checks if a given [Span] is within bounds. + /// + fn in_bounds(&self, span: &Span) -> bool { + self.bounds().end >= span.end + } + + /// + /// Returns the source code at a given [Span], if within bounds. + /// + fn source_at(&self, span: Span) -> Option; +} + +/// +/// Utility conversion into a [Span], given +/// boundary information from the origin [Source]. 
+/// +pub trait ToSpan { + fn to_span(self, source: &impl Source) -> Span; +} + +impl> ToSpan for R { + fn to_span(self, source: &impl Source) -> Span { + let Range { + start: start_bound, + end: end_bound, + } = source.bounds(); + + let start = match self.start_bound() { + Bound::Included(included) => Loc(*included), + Bound::Excluded(excluded) => Loc(*excluded + 1), + Bound::Unbounded => start_bound, + }; + + let end = match self.end_bound() { + Bound::Included(included) => Loc(included + 1), + Bound::Excluded(excluded) => Loc(*excluded), + Bound::Unbounded => end_bound, + }; + + Span { start, end } + } +} + +#[cfg(test)] +pub use testing_only::DummySource; + +use super::{Loc, Span}; + +#[cfg(test)] +mod testing_only { + use std::ops::Range; + + use crate::common::{Loc, Span}; + + use super::Source; + + /// + /// [Source] implementation for testing purposes only! + /// + pub struct DummySource { + text: String, + } + + impl DummySource { + pub fn new(text: impl ToString) -> Self { + let text = text.to_string(); + Self { text } + } + } + + impl Source for DummySource { + type Location = Range; + + fn locate(&self, span: Span) -> Option { + if self.in_bounds(&span) { + Some(span.start.0..span.end.0) + } else { + None + } + } + + fn bounds(&self) -> Range { + Loc(0)..Loc(self.text.len()) + } + + fn source_at(&self, span: Span) -> Option { + if self.in_bounds(&span) { + self.text.get(span.as_range()).map(ToString::to_string) + } else { + None + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 356cca7..1a71789 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,15 @@ //! //! A parser for [JSON5](https://json5.org/). //! +//! ## Why? +//! This crate provides a very important function: traceability. +//! ### Tracability +//! This allows for line-column data to be preserved so that further +//! processing can benefit from spanned errors, which tell the end +//! user *where* the error happened. +//! 
///
/// Line and column information for
/// a particular location in source code.
///
#[derive(Debug)]
pub struct LineColumn<'a> {
    file: &'a str,
    line: usize,
    column: usize,
}

///
/// Converting to 1-based only for display.
///
impl<'a> std::fmt::Display for LineColumn<'a> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self { file, line, column } = self;
        write!(f, "{file}:{}:{}", line + 1, column + 1)
    }
}

///
/// Finds the starting character index of all lines, using any
/// [ECMAScript LineTerminatorSequence](https://262.ecma-international.org/5.1/#sec-7.3)
/// to delimit lines.
///
fn line_starts(st: &[char]) -> Vec<usize> {
    let mut starts = vec![0];
    let mut i = 0;

    while i < st.len() {
        match st[i] {
            // <LF>, <LS>, and <PS> each terminate a line on their own.
            '\u{000A}' | '\u{2028}' | '\u{2029}' => starts.push(i + 1),
            // <CR>: either half of a <CR><LF> pair, or alone.
            '\u{000D}' => {
                if let Some('\u{000A}') = st.get(i + 1) {
                    // Skip over the <LF> so the pair counts once.
                    i += 1;
                }
                starts.push(i + 1);
            }
            _ => {}
        }

        i += 1;
    }

    // A terminator at the very end of input starts no new line.
    if starts.last().is_some_and(|&s| s >= st.len()) {
        starts.pop();
    }

    starts
}
+/// +#[derive(Debug, Clone)] +pub struct SourceFile { + path: String, + contents: String, + chars: Vec, + line_starts: Vec, +} + +impl SourceFile { + /// + /// TESTING ONLY + /// *** + /// Create a dumy file with a fake path. + /// + #[cfg(test)] + pub fn dummy_file(contents: &'static str) -> Self { + let path = "DUMMY.FILE".to_string(); + let contents = contents.to_string(); + + let chars = contents.chars().collect::>(); + let line_starts = line_starts(&chars); + + Self { + path, + contents, + chars, + line_starts, + } + } + + /// + /// Attempts to read source code from a given file path. + /// + pub fn read_from_file>(path: P) -> std::io::Result { + let path = path.as_ref().to_owned(); + let contents = std::fs::read_to_string(&path)?; + + let path = path.to_str().expect("Valid path as string").to_string(); + let chars = contents.chars().collect::>(); + let line_starts = line_starts(&chars); + + Ok(Self { + path, + contents, + chars, + line_starts, + }) + } + + /// + /// Return the (0-based) line and column information at a [Loc] in this file. + /// + fn line_col(&self, loc: Loc) -> Option<(usize, usize)> { + // Essentially, pair the start of the a line with the end of the next (or EOF), + // check if loc is in its range. If so, get the corresponding line and calculate the + // corresponding column. 
+ self.line_starts + .iter() + .copied() + .zip( + self.line_starts + .iter() + .copied() + .skip(1) + .chain([self.contents.len()]), + ) + .enumerate() + .filter(|&(_, (start_col, end_col))| (start_col <= loc.0 && loc.0 < end_col)) + .map(|(ln, (start_col, _))| (ln, loc.0 - start_col)) + .next() + } +} + +impl Source for SourceFile { + type Location<'a> = LineColumn<'a> + where Self: 'a; + + fn locate(&self, span: Span) -> Option> { + if self.in_bounds(&span) { + let (line, column) = self.line_col(span.start)?; + return Some(LineColumn { + file: &self.path, + line, + column, + }); + } + + None + } + + fn bounds(&self) -> Range { + Loc(0)..Loc(self.chars.len()) + } + + fn source_at(&self, span: Span) -> Option { + if self.in_bounds(&span) { + return Some(self.chars[span.as_range()].iter().collect()); + } + + None + } +} + +#[cfg(test)] +mod tests { + use crate::common::{file::LineColumn, Source}; + + use super::{super::ToSpan, line_starts, SourceFile}; + + #[test] + fn lines() { + assert!(matches!( + &line_starts(&"ba\nb\nc".chars().collect::>())[..], + &[0, 3, 5] + )); + + assert!(matches!( + &line_starts( + &"babs\r\nbaaa\r__\u{2028}asagsgas\u{2029}a\nc\n" + .chars() + .collect::>() + )[..], + &[0, 6, 11, 14, 23, 25,] + )) + } + + #[test] + fn line_col() { + let f = SourceFile::dummy_file("PEN\nPINEAPPLE\nAPPLE\nPEN"); + let ananas = (4..13).to_span(&f); + assert_eq!(f.source_at(ananas), Some("PINEAPPLE".to_string())); + assert!(matches!( + f.locate(ananas), + Some(LineColumn { + line: 1, + column: 0, + .. + }) + )); + } +} diff --git a/src/common/location.rs b/src/common/location.rs index 6b972d9..ce55557 100644 --- a/src/common/location.rs +++ b/src/common/location.rs @@ -36,6 +36,7 @@ where /// /// Represents the location of a token in source code. /// +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct Span { /// /// Start index: inclusive lower bound. 
diff --git a/src/common/mod.rs b/src/common/mod.rs index 8f7a98f..4db3a70 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -4,6 +4,7 @@ pub mod source; pub mod location; +pub mod file; pub use location::*; pub use source::*; \ No newline at end of file diff --git a/src/common/source.rs b/src/common/source.rs index 3dbab29..3464994 100644 --- a/src/common/source.rs +++ b/src/common/source.rs @@ -19,13 +19,15 @@ pub trait Source { /// /// This could be line-column information, or simply an index. /// - type Location; + type Location<'a> + where + Self: 'a; /// /// Find the location of this span, /// and put it into a friendly appropriate format. /// - fn locate(&self, span: Span) -> Option; + fn locate(&self, span: Span) -> Option>; /// /// Returns the start and (exclusive) end index of this source. @@ -104,14 +106,15 @@ mod testing_only { } impl Source for DummySource { - type Location = Range; + type Location<'a> = Range + where Self: 'a; - fn locate(&self, span: Span) -> Option { + fn locate(&self, span: Span) -> Option> { if self.in_bounds(&span) { - Some(span.start.0..span.end.0) - } else { - None + return Some(span.start.0..span.end.0); } + + None } fn bounds(&self) -> Range { From 55dd0fc26bd60cbb80495ab9a758e58e394cb236 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 27 Jan 2024 16:01:45 +0000 Subject: [PATCH 21/39] Add base traits and impls for lexing. 
--- src/common/file.rs | 6 ++ src/common/location.rs | 12 ++++ src/common/source.rs | 6 ++ src/lexing/mod.rs | 8 +++ src/lexing/utils/lex_impls.rs | 118 ++++++++++++++++++++++++++++++++++ src/lexing/utils/mod.rs | 84 ++++++++++++++++++++++++ src/lexing/utils/peek.rs | 31 +++++++++ src/lexing/utils/result.rs | 50 ++++++++++++++ src/lexing/utils/stream.rs | 38 +++++++++++ src/lib.rs | 1 + 10 files changed, 354 insertions(+) create mode 100644 src/lexing/mod.rs create mode 100644 src/lexing/utils/lex_impls.rs create mode 100644 src/lexing/utils/mod.rs create mode 100644 src/lexing/utils/peek.rs create mode 100644 src/lexing/utils/result.rs create mode 100644 src/lexing/utils/stream.rs diff --git a/src/common/file.rs b/src/common/file.rs index 800d486..0ff31ca 100644 --- a/src/common/file.rs +++ b/src/common/file.rs @@ -169,6 +169,12 @@ impl Source for SourceFile { None } + + fn characters(&self) -> &[char] { + &self.chars + } + + } #[cfg(test)] diff --git a/src/common/location.rs b/src/common/location.rs index ce55557..6f55e40 100644 --- a/src/common/location.rs +++ b/src/common/location.rs @@ -139,6 +139,18 @@ impl RangeBounds for Span { } } +/// +/// Returns the span attached to this +/// object. +/// +pub trait Spanned { + /// + /// Returns the span attached to this + /// object. + /// + fn span(&self) -> Span; +} + #[cfg(test)] mod tests { use crate::common::source::{DummySource, Source, ToSpan}; diff --git a/src/common/source.rs b/src/common/source.rs index 3464994..7230439 100644 --- a/src/common/source.rs +++ b/src/common/source.rs @@ -45,6 +45,8 @@ pub trait Source { /// Returns the source code at a given [Span], if within bounds. 
/// fn source_at(&self, span: Span) -> Option; + + fn characters(&self) -> &[char]; } /// @@ -128,5 +130,9 @@ mod testing_only { None } } + + fn characters(&self) -> &[char] { + unimplemented!() + } } } diff --git a/src/lexing/mod.rs b/src/lexing/mod.rs new file mode 100644 index 0000000..a27a7d5 --- /dev/null +++ b/src/lexing/mod.rs @@ -0,0 +1,8 @@ +//! +//! The process of lexing involves converting [char]s +//! from source code into lexical tokens according to +//! some [lexical grammar](https://en.wikipedia.org/wiki/Lexical_grammar). +//! + +pub mod utils; + diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs new file mode 100644 index 0000000..32558f9 --- /dev/null +++ b/src/lexing/utils/lex_impls.rs @@ -0,0 +1,118 @@ +//! +//! Utility implementations for [Lex]. +//! + +use std::ops::{Deref, DerefMut}; + +use crate::common::Source; + +use super::{LexError, LexT, SourceStream}; + +/// +/// Many (possibly one or zero) of a lexical token. +/// +pub type Many = Vec; + +impl LexT for Many { + fn peek(input: &SourceStream) -> bool { + L::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + let mut v = vec![]; + + while L::peek(input) { + v.push(L::lex(input)?); + } + + Ok(v) + } +} + +/// +/// At least N lots of `L`-tokens. +/// +#[derive(Debug)] +pub struct AtLeast(Vec); + +impl LexT for AtLeast { + fn peek(input: &SourceStream) -> bool { + L::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + let many: Many = LexT::lex(input)?; + + if many.len() < N { + return Err(input.error(format!( + "Expected at least {N} {} tokens: got {}.", + std::any::type_name::(), + many.len(), + ))); + } + + Ok(Self(many)) + } +} + +impl Deref for AtLeast { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for AtLeast { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +/// +/// Exactly N lots of `L`-tokens: no more, no less. 
+/// +#[derive(Debug)] +pub struct Exactly([L; N]) +where + [(); N]: Sized; + +impl LexT for Exactly +where + [(); N]: Sized, +{ + fn peek(input: &SourceStream) -> bool { + L::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + let many: Many = LexT::lex(input)?; + + if many.len() != N { + return Err(input.error(format!( + "Expected {N} {} tokens: got {}.", + std::any::type_name::(), + many.len() + ))); + } + + // SAFETY: Just checked the length, so unwrap okay. + let many: [L; N] = unsafe { many.try_into().unwrap_unchecked() }; + + Ok(Self(many)) + } +} + +impl Deref for Exactly { + type Target = [L; N]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Exactly { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/src/lexing/utils/mod.rs b/src/lexing/utils/mod.rs new file mode 100644 index 0000000..438efc9 --- /dev/null +++ b/src/lexing/utils/mod.rs @@ -0,0 +1,84 @@ +//! +//! Utilities for lexing. +//! + +pub mod lex_impls; +pub mod peek; +pub mod result; +pub mod stream; + +use std::marker::PhantomData; + +use crate::common::Source; + +pub use self::{ + lex_impls::{AtLeast, Exactly, Many}, + peek::Peek, + result::{LexError, LexResult}, + stream::SourceStream, +}; + +/// +/// Private trait, only for internal use. +/// +#[doc(hidden)] +pub trait LexT: Sized { + /// + /// Checks to see if this token is possibly upcoming. + /// + fn peek(input: &SourceStream) -> bool; + + /// + /// Given that the token is present, + /// start lexing. + /// + /// This function has guaranteed side-effects on the input [SourceStream] (advancing it). + /// + fn lex(input: &mut SourceStream) -> Result; +} + +/// +/// Oprations on lexical tokens: +/// * Lexing, +/// * Peeking +/// +pub trait Lex: Sized { + /// + /// Checks is this token is potentially present, + /// which can then be further further lexed. 
+ /// + fn peek(input: &SourceStream) -> Peek; + + /// + /// Returns a [LexResult] with either: + /// * a valid token [LexResult::Lexed], + /// * [LexResult::Nothing] (token not present), + /// * or [LexResult::Errant] (spanned error). + /// + fn lex(input: &mut SourceStream) -> LexResult; +} + +/// +/// The public-facing implementation. +/// +impl Lex for L { + #[inline] + fn peek(input: &SourceStream) -> Peek { + // Forward to internal impl, then make proper [Peek] + // enum variant. + match ::peek(input) { + true => Peek::Possible(PhantomData::), + false => Peek::Absent, + } + } + + /// + /// Returns a [LexResult] with either: + /// * a valid token [LexResult::Lexed], + /// * [LexResult::Nothing] (token not present), + /// * or [LexResult::Errant] (spanned error). + /// + fn lex(input: &mut SourceStream) -> LexResult { + ::peek(input).then_lex(input) + } +} diff --git a/src/lexing/utils/peek.rs b/src/lexing/utils/peek.rs new file mode 100644 index 0000000..b2c722d --- /dev/null +++ b/src/lexing/utils/peek.rs @@ -0,0 +1,31 @@ +//! +//! Peeking for lexical tokens. +//! + +use std::marker::PhantomData; + +use crate::common::Source; + +use super::{LexResult, LexT, SourceStream}; + +/// +/// Result of a peek, either: +/// * Possibly present, +/// * or not. 
+/// +pub enum Peek { + Possible(PhantomData), + Absent, +} + +impl Peek { + pub fn then_lex(self, input: &mut SourceStream) -> LexResult { + match self { + Peek::Possible(_) => match LexT::lex(input) { + Ok(lexed) => LexResult::Lexed(lexed), + Err(errant) => LexResult::Errant(errant), + }, + Peek::Absent => LexResult::Nothing, + } + } +} diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs new file mode 100644 index 0000000..5fbe90b --- /dev/null +++ b/src/lexing/utils/result.rs @@ -0,0 +1,50 @@ +use crate::common::{Source, Span, Spanned}; + +use super::SourceStream; + +#[derive(Debug)] +pub struct LexError { + span: Span, + message: String, +} + +impl LexError { + pub fn new(span: &impl Spanned, message: impl ToString) -> Self { + Self { + span: span.span(), + message: message.to_string(), + } + } +} + +impl<'a, S: Source> SourceStream<'a, S> { + /// + /// Make a new error at the stream's current location. + /// + pub fn error(&self, msg: impl ToString) -> LexError { + LexError::new(self, msg) + } +} + +/// +/// The rust of attempting parse token `L` +/// from a [SourceStream]. +/// +pub enum LexResult { + /// + /// Valid token. + /// + Lexed(L), + + /// + /// An attempt was made to parse a token, + /// but it did not fully abide by the lexical grammar. + /// + Errant(LexError), + + /// + /// The token `L` was not found, + /// so the parsing was skipped. + /// + Nothing, +} diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs new file mode 100644 index 0000000..ce93cfc --- /dev/null +++ b/src/lexing/utils/stream.rs @@ -0,0 +1,38 @@ +use crate::common::{Loc, Source, Span, Spanned, ToSpan}; + +use super::{Lex, LexResult}; + +#[derive(Debug, Clone)] +pub struct SourceStream<'a, S: Source> { + index: usize, + source: &'a S, +} + +impl<'a, S: Source> SourceStream<'a, S> { + /// + /// Take the next character in this [SourceStream]. 
+ /// + pub fn take(&mut self) -> Option<(Loc, char)> { + let index = self.index; + + if let Some(ch) = self.source.characters().get(index) { + self.index += 1; + return Some((Loc(index), *ch)); + } + + None + } + + /// + /// Attempt to lex for token `L`. + /// + pub fn lex(&mut self) -> LexResult { + Lex::lex(self) + } +} + +impl<'a, S: Source> Spanned for SourceStream<'a, S> { + fn span(&self) -> Span { + (self.index..=self.index).to_span(self.source) + } +} diff --git a/src/lib.rs b/src/lib.rs index 1a71789..375accc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ //! pub mod common; +pub mod lexing; mod macro_test { use avjason_macros::{ECMARef, SpecRef}; From f0ff50391c244f13bcbdf83106b50f6f543ab47f Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 27 Jan 2024 16:20:56 +0000 Subject: [PATCH 22/39] Add `Verbatim` primitive token. --- macros/src/lib.rs | 2 +- src/lexing/mod.rs | 1 + src/lexing/tokens/mod.rs | 32 ++++++++++++++++++++++++++++++++ src/lexing/utils/mod.rs | 2 +- src/lexing/utils/stream.rs | 27 +++++++++++++++++++++++++++ src/lib.rs | 4 ++++ 6 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 src/lexing/tokens/mod.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 91c18da..f4689bb 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -93,4 +93,4 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { .collect(); target.into_token_stream().into() -} +} \ No newline at end of file diff --git a/src/lexing/mod.rs b/src/lexing/mod.rs index a27a7d5..87915e7 100644 --- a/src/lexing/mod.rs +++ b/src/lexing/mod.rs @@ -5,4 +5,5 @@ //! pub mod utils; +pub mod tokens; diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs new file mode 100644 index 0000000..8e94b0a --- /dev/null +++ b/src/lexing/tokens/mod.rs @@ -0,0 +1,32 @@ +//! +//! Lexical tokens. +//! 
+ +use crate::common::{Loc, Source, Span, SpanIter, ToSpan}; + +use super::utils::{LexError, LexT, SourceStream}; + +pub struct Verbatim { + span: Span, +} + +impl LexT for Verbatim { + fn peek(input: &SourceStream) -> bool { + input.upcoming(A) + } + + fn lex(input: &mut SourceStream) -> Result { + let mut locs = vec![]; + + for _ in 0..A.len() { + let (Loc(loc), _) = input.take().unwrap(); + locs.push((loc..(loc+1)).to_span(input.source())); + } + + Ok(Self { + span: locs.into_iter().combine() + .expect("DO PUT EMPTY STRINGS IN VERBATIM!"), + }) + + } +} \ No newline at end of file diff --git a/src/lexing/utils/mod.rs b/src/lexing/utils/mod.rs index 438efc9..0c18821 100644 --- a/src/lexing/utils/mod.rs +++ b/src/lexing/utils/mod.rs @@ -29,7 +29,7 @@ pub trait LexT: Sized { fn peek(input: &SourceStream) -> bool; /// - /// Given that the token is present, + /// Given that the token is potentially present, /// start lexing. /// /// This function has guaranteed side-effects on the input [SourceStream] (advancing it). diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index ce93cfc..51d2a53 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -8,7 +8,30 @@ pub struct SourceStream<'a, S: Source> { source: &'a S, } +/// +/// Things that [SourceStream] can +/// check are coming up. +/// +pub trait Lookahead { + fn upcoming(&self, input: &SourceStream) -> bool; +} + +impl Lookahead for str { + fn upcoming(&self, input: &SourceStream) -> bool { + let chars = self.chars().collect::>(); + input.source.characters()[input.index..(input.index + chars.len())] == chars + } +} + impl<'a, S: Source> SourceStream<'a, S> { + /// + /// Returns the source where this [SourceStream] + /// came from. + /// + pub fn source(&self) -> &S { + &self.source + } + /// /// Take the next character in this [SourceStream]. 
/// @@ -29,6 +52,10 @@ impl<'a, S: Source> SourceStream<'a, S> { pub fn lex(&mut self) -> LexResult { Lex::lex(self) } + + pub fn upcoming(&self, lookahead: &L) -> bool { + lookahead.upcoming(self) + } } impl<'a, S: Source> Spanned for SourceStream<'a, S> { diff --git a/src/lib.rs b/src/lib.rs index 375accc..0e8585c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,10 @@ //! user *where* the error happened. //! +// This will have to be removed to solve #5 +#![allow(incomplete_features)] +#![feature(adt_const_params)] + pub mod common; pub mod lexing; From 27b45cd243d1c70c2447045a3a72818d98e4cce5 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 27 Jan 2024 18:48:46 +0000 Subject: [PATCH 23/39] Add `derive(Spanned)` macro. --- macros/src/lib.rs | 34 +++- macros/src/spanned.rs | 364 +++++++++++++++++++++++++++++++++++ macros/src/type_traversal.rs | 126 ++++++++++++ macros/src/utils.rs | 3 + src/common/location.rs | 33 +++- src/lib.rs | 42 +++- 6 files changed, 591 insertions(+), 11 deletions(-) create mode 100644 macros/src/spanned.rs create mode 100644 macros/src/type_traversal.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index f4689bb..84fcc77 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -2,10 +2,15 @@ //! Macros for the main crate. //! +#![feature(proc_macro_diagnostic)] + +mod spanned; +mod type_traversal; mod utils; -use proc_macro::TokenStream as Tokens; +use proc_macro::{Diagnostic, Level, Span, TokenStream as Tokens}; use quote::ToTokens; +use spanned::{derive_spanned_for_enum, derive_spanned_for_struct}; use syn::parse_macro_input; use utils::{get_item_attrs, ECMARef, JSON5Ref, ToRustdoc}; @@ -66,7 +71,7 @@ pub fn SpecRef(params: Tokens, target: Tokens) -> Tokens { /// /// ```ignore /// use crate::ECMARef; -/// +/// /// // You must always include an acompanying URL. 
/// #[ECMARef("NullLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.1")] /// struct LitNull; @@ -93,4 +98,27 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { .collect(); target.into_token_stream().into() -} \ No newline at end of file +} + +/// +/// +/// +#[proc_macro_derive(Spanned)] +pub fn spanned(target: Tokens) -> Tokens { + if let Ok(st) = syn::parse::(target.clone()) { + return derive_spanned_for_struct(&st); + } + + if let Ok(en) = syn::parse::(target.clone()) { + return derive_spanned_for_enum(&en); + } + + Diagnostic::spanned( + Span::call_site(), + Level::Error, + "Expected a struct or enum here.", + ) + .emit(); + + Default::default() +} diff --git a/macros/src/spanned.rs b/macros/src/spanned.rs new file mode 100644 index 0000000..9ad7321 --- /dev/null +++ b/macros/src/spanned.rs @@ -0,0 +1,364 @@ +//! +//! Utilities for #[derive(Spanned)] +//! + +use proc_macro::{Diagnostic, Level}; +use proc_macro2::Span; +use quote::quote; +use syn::{punctuated::Punctuated, spanned::Spanned}; + +use crate::type_traversal::{ + field_access, index, is_named_type, lit_str, self_keyword, variant_path, Generic, ToMember, +}; + +mod paths { + use proc_macro2::Span; + use syn::punctuated::Punctuated; + + /// + /// Equivalent to: + /// + /// ```ignore + /// crate::common::Spanned::span + /// ``` + /// + pub fn span() -> syn::Expr { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter( + ["crate", "common", "Spanned", "span"] + .into_iter() + .map(|s| syn::PathSegment { + ident: syn::Ident::new(s, Span::call_site()), + arguments: syn::PathArguments::None, + }), + ), + }, + }) + } + + /// + /// Equivalent to: + /// + /// ```ignore + /// crate::common::SpanIter::combine + /// ``` + /// + pub fn combine() -> syn::Expr { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: syn::Path 
{ + leading_colon: Default::default(), + segments: Punctuated::from_iter( + ["crate", "common", "SpanIter", "combine"] + .into_iter() + .map(|s| syn::PathSegment { + ident: syn::Ident::new(s, Span::call_site()), + arguments: syn::PathArguments::None, + }), + ), + }, + }) + } +} + +/// +/// Equivalent to: +/// +/// ``` +/// crate::utils::Spanned::span(& $expr) +/// ``` +/// +fn span_of(expr: syn::Expr) -> syn::Expr { + let reference = syn::Expr::Reference(syn::ExprReference { + attrs: Default::default(), + and_token: Default::default(), + mutability: None, + expr: Box::new(expr), + }); + + syn::Expr::Call(syn::ExprCall { + attrs: Default::default(), + func: Box::new(paths::span()), + paren_token: Default::default(), + args: Punctuated::from_iter([reference]), + }) +} + +/// +/// Equivalent to: +/// +/// ``` +/// crate::utils::SpanIter::combine([crate::utils::Spanned::span(& $expr), .. ]) +/// ``` +/// +fn spans_of(exprs: impl IntoIterator) -> syn::Expr { + let spans = exprs.into_iter().map(span_of); + + let base = syn::Expr::Call(syn::ExprCall { + attrs: Default::default(), + func: Box::new(paths::combine()), + paren_token: Default::default(), + args: Punctuated::from_iter([syn::Expr::Array(syn::ExprArray { + attrs: Default::default(), + bracket_token: Default::default(), + elems: Punctuated::from_iter(spans), + })]), + }); + + syn::Expr::MethodCall(syn::ExprMethodCall { + attrs: Default::default(), + receiver: Box::new(base), + dot_token: Default::default(), + method: syn::Ident::new("expect", Span::call_site()), + turbofish: Default::default(), + paren_token: Default::default(), + args: Punctuated::from_iter([lit_str( + "AUTOGENERATED: Should always have >= 1 spanned fields", + )]), + }) +} + +pub fn spanned_for_struct(st: &syn::ItemStruct) -> Option { + let syn::ItemStruct { fields, .. } = st; + + let span_field = fields + .iter() + .enumerate() + .find(|(_, syn::Field { ty, .. 
})| is_named_type(ty, "Span").is_some()); + + // Case 1: this struct represents a terminal token. + // Use the included `Span` field. + if let Some((idx, span_field)) = span_field { + if matches!(fields, syn::Fields::Unnamed(_)) { + if fields.len() > 1 { + Diagnostic::spanned( + st.fields.span().unwrap(), + Level::Warning, + "Non single-field tuple with Span field.", + ) + .emit(); + + Diagnostic::spanned( + st.ident.span().unwrap(), + Level::Help, + "Make these fields named with a `span` field instead.", + ) + .emit() + } + + return Some(span_of(field_access(index(idx as u32)))); + } + + if matches!(fields, syn::Fields::Named(_)) { + // Unwrap ok since we're not a tuple-struct. + let ident = span_field.ident.clone().unwrap(); + + if ident != "span" { + Diagnostic::spanned( + ident.span().unwrap(), + Level::Warning, + "Named Span field should be called `span`.", + ) + .emit(); + + Diagnostic::spanned( + ident.span().unwrap(), + Level::Help, + "Rename this field to `span`.", + ) + .emit(); + } + + return Some(span_of(field_access(ident))); + } + } + + // Case 2: Product type => combine all span values of our fields, in order. + + match fields { + syn::Fields::Named(syn::FieldsNamed { named, .. }) => 'a: { + if named.is_empty() { + break 'a; + } + + return Some(spans_of( + named + .into_iter() + .cloned() + .filter_map(|f| f.ident) + .map(field_access), + )); + } + syn::Fields::Unnamed(syn::FieldsUnnamed { unnamed, .. 
}) => 'a: { + if unnamed.is_empty() { + break 'a; + } + + return Some(spans_of( + unnamed + .into_iter() + .cloned() + .enumerate() + .map(|(i, _)| index(i as u32)) + .map(field_access), + )); + } + syn::Fields::Unit => (), + } + + Diagnostic::spanned( + st.span().unwrap(), + Level::Error, + "Cannot derive `Spanned` for unit-like struct.", + ) + .emit(); + + None +} + +fn ident_pat(ident: syn::Ident) -> syn::Pat { + syn::Pat::Ident(syn::PatIdent { + attrs: Default::default(), + by_ref: None, + mutability: None, + ident, + subpat: None, + }) +} + +fn spanned_variant_arm(var: &syn::Variant) -> syn::Arm { + let syn::Variant { ident, fields, .. } = var; + let path = variant_path(ident); + + let (members, f_idents): (Vec<_>, Vec<_>) = fields + .iter() + .enumerate() + .map(|(i, f)| { + f.ident + .clone() + .map(|i| (i.clone().to_member(), i)) + .unwrap_or_else(|| { + ( + index(i as u32).to_member(), + syn::Ident::new(&format!("f{i}"), Span::call_site()), + ) + }) + }) + .unzip(); + + let pat = match fields { + syn::Fields::Named(_) => syn::Pat::Struct(syn::PatStruct { + attrs: Default::default(), + qself: Default::default(), + path, + brace_token: Default::default(), + fields: Punctuated::from_iter(members.into_iter().zip(f_idents.iter().cloned()).map( + |(member, ident)| syn::FieldPat { + attrs: Default::default(), + member, + colon_token: Default::default(), + pat: Box::new(ident_pat(ident)), + }, + )), + rest: Default::default(), + }), + syn::Fields::Unnamed(_) => syn::Pat::TupleStruct(syn::PatTupleStruct { + attrs: Default::default(), + qself: Default::default(), + path, + paren_token: Default::default(), + elems: Punctuated::from_iter(f_idents.iter().cloned().map(ident_pat)), + }), + syn::Fields::Unit => unreachable!(), + }; + + syn::Arm { + attrs: Default::default(), + pat, + guard: None, + fat_arrow_token: Default::default(), + body: Box::new(spans_of(f_idents.into_iter().map(|ident| { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: 
Default::default(), + path: ident.into(), + }) + }))), + comma: Some(Default::default()), + } +} + +pub fn spanned_for_enum(en: &syn::ItemEnum) -> Option { + let vars = &en.variants; + + if vars.is_empty() { + Diagnostic::spanned( + en.span().unwrap(), + Level::Error, + "Cannot derive spanned for enum no variants.", + ) + .emit(); + + return None; + } + + // Check if any variants are unit-like, if so give errors then terminate. + if vars + .iter() + .filter(|syn::Variant { fields, .. }| fields.is_empty()) + .map(|f| { + Diagnostic::spanned( + f.span().unwrap(), + Level::Error, + "Cannot derive spanned for enum with unit-like variants.", + ) + .emit() + }) + .next() + .is_some() + { + return None; + } + + Some(syn::Expr::Match(syn::ExprMatch { + attrs: Default::default(), + match_token: Default::default(), + expr: Box::new(self_keyword()), + brace_token: Default::default(), + arms: vars.iter().map(spanned_variant_arm).collect(), + })) +} + +fn derive_spanned(gen: &impl Generic, span_expr: Option) -> proc_macro::TokenStream { + let ident = gen.ident(); + let generics = gen.generics(); + let generic_letters = gen.generic_letters(); + + if let Some(span) = span_expr { + return quote! { + impl #generics crate::common::Spanned for #ident #generic_letters { + fn span(&self) -> crate::common::Span { + #span + } + } + } + .into(); + } + + Default::default() +} + +pub fn derive_spanned_for_struct(st: &syn::ItemStruct) -> proc_macro::TokenStream { + let span_expr = spanned_for_struct(st); + derive_spanned(st, span_expr) +} + +pub fn derive_spanned_for_enum(en: &syn::ItemEnum) -> proc_macro::TokenStream { + let span_expr = spanned_for_enum(en); + derive_spanned(en, span_expr) +} diff --git a/macros/src/type_traversal.rs b/macros/src/type_traversal.rs new file mode 100644 index 0000000..baa988f --- /dev/null +++ b/macros/src/type_traversal.rs @@ -0,0 +1,126 @@ +//! +//! Utilities that allow use to traverse `struct`s and `enum`s. +//! 
+ +use proc_macro2::Span; +use quote::{quote, ToTokens}; +use syn::punctuated::Punctuated; + +/// +/// Checks to see if an identifier is in a path. +/// +pub fn in_path<'a>(path: &'a syn::Path, ident: &str) -> Option<&'a syn::PathSegment> { + path.segments + .iter() + .find(|syn::PathSegment { ident: id, .. }| id == ident) +} + +/// +/// Checks if a type has the ident inside its name. +/// +pub fn is_named_type<'a>(ty: &'a syn::Type, ident: &str) -> Option<&'a syn::PathSegment> { + match ty { + syn::Type::Path(syn::TypePath { path, .. }) => in_path(path, ident), + _ => None, + } +} + +pub fn self_keyword() -> syn::Expr { + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: Default::default(), + path: syn::Ident::new("self", Span::call_site()).into(), + }) +} + +pub trait ToMember { + fn to_member(self) -> syn::Member; +} + +impl ToMember for syn::Index { + fn to_member(self) -> syn::Member { + syn::Member::Unnamed(self) + } +} + +impl ToMember for syn::Ident { + fn to_member(self) -> syn::Member { + syn::Member::Named(self) + } +} + +pub fn index(index: u32) -> syn::Index { + syn::Index { + index, + span: Span::call_site(), + } +} + +pub fn field_access(m: impl ToMember) -> syn::Expr { + syn::Expr::Field(syn::ExprField { + attrs: Default::default(), + base: Box::new(self_keyword()), + dot_token: Default::default(), + member: m.to_member(), + }) +} + +pub trait Generic { + fn ident(&self) -> &syn::Ident; + + fn generics(&self) -> &syn::Generics; + + fn generic_letters(&self) -> proc_macro2::TokenStream { + let generics = self.generics(); + let letters = generics.params.iter().map(|param| match param { + syn::GenericParam::Lifetime(l) => l.lifetime.to_token_stream(), + syn::GenericParam::Type(ty) => ty.ident.to_token_stream(), + syn::GenericParam::Const(cons) => cons.ident.to_token_stream(), + }); + + quote! 
{ + <#(#letters),*> + } + } +} + +impl Generic for syn::ItemStruct { + fn generics(&self) -> &syn::Generics { + &self.generics + } + + fn ident(&self) -> &syn::Ident { + &self.ident + } +} + +impl Generic for syn::ItemEnum { + fn generics(&self) -> &syn::Generics { + &self.generics + } + + fn ident(&self) -> &syn::Ident { + &self.ident + } +} + +pub fn lit_str(st: &str) -> syn::Expr { + syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(syn::LitStr::new(st, Span::call_site())), + }) +} + +pub fn variant_path(var: &syn::Ident) -> syn::Path { + syn::Path { + leading_colon: Default::default(), + segments: Punctuated::from_iter( + [syn::Ident::new("Self", Span::call_site()), var.clone()] + .into_iter() + .map(|ident| syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }), + ), + } +} diff --git a/macros/src/utils.rs b/macros/src/utils.rs index a6307e2..19e8199 100644 --- a/macros/src/utils.rs +++ b/macros/src/utils.rs @@ -163,6 +163,9 @@ impl ToRustdoc for JSON5Ref { } } +/// +/// Attempt to get the attribute macros for a [syn::Item]. +/// pub fn get_item_attrs(item: &mut syn::Item) -> Option<&mut Vec> { match item { syn::Item::Const(syn::ItemConst { ref mut attrs, .. }) => Some(attrs), diff --git a/src/common/location.rs b/src/common/location.rs index 6f55e40..ccb46c0 100644 --- a/src/common/location.rs +++ b/src/common/location.rs @@ -1,6 +1,6 @@ //! //! Things that help trace errors and tokens: [Span] and [Loc]. -//! +//! use std::ops::{Add, Bound, Range, RangeBounds}; @@ -142,19 +142,33 @@ impl RangeBounds for Span { /// /// Returns the span attached to this /// object. -/// +/// pub trait Spanned { /// /// Returns the span attached to this /// object. 
- /// + /// fn span(&self) -> Span; } +impl Spanned for Span { + fn span(&self) -> Span { + *self + } +} + +impl<'a, S: Spanned> Spanned for &'a S { + fn span(&self) -> Span { + (*self).span() + } +} + #[cfg(test)] mod tests { use crate::common::source::{DummySource, Source, ToSpan}; + use super::SpanIter; + #[test] fn subspan() { let source = DummySource::new("testthing."); @@ -193,4 +207,17 @@ mod tests { assert_eq!(span.subspan(49..).and_then(|s| source.source_at(s)), None); } + + #[test] + fn test_combine_span() { + let source = DummySource::new( + "agdshJAGDJHAVghVJAtesfsdagdsagdsaJGASDHJGAWDHJAGSDASGHJASGHASDGJSADBHJASDGVBJHtthing.", + ); + + let s1 = (0..13).to_span(&source); + let s2 = (13..23).to_span(&source); + let s3 = (23..26).to_span(&source); + + assert_eq!([s1, s2, s3].combine(), Some((0..26).to_span(&source))); + } } diff --git a/src/lib.rs b/src/lib.rs index 0e8585c..473d4ae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,16 +1,16 @@ //! //! ## AvJason //! > A child of the [AvdanOS](https://github.com/Avdan-OS) project. -//! +//! //! A parser for [JSON5](https://json5.org/). -//! +//! //! ## Why? //! This crate provides a very important function: traceability. //! ### Tracability //! This allows for line-column data to be preserved so that further //! processing can benefit from spanned errors, which tell the end //! user *where* the error happened. -//! +//! 
// This will have to be removed to solve #5 #![allow(incomplete_features)] @@ -20,7 +20,11 @@ pub mod common; pub mod lexing; mod macro_test { - use avjason_macros::{ECMARef, SpecRef}; + use std::marker::PhantomData; + + use avjason_macros::{ECMARef, Spanned, SpecRef}; + + use crate::common::Span; #[SpecRef("Identifier", "JSON5Identifier")] #[allow(unused)] @@ -33,4 +37,32 @@ mod macro_test { #[ECMARef("BooleanLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.2")] #[allow(unused)] struct LitBool; -} \ No newline at end of file + + #[derive(Spanned)] + struct True(Span); + + #[derive(Spanned)] + struct False { + span: Span, + ghost: PhantomData, + } + + #[derive(Spanned)] + struct Is { + span: Span, + } + + #[derive(Spanned)] + struct IsTrue(Is, True); + + #[derive(Spanned)] + enum Boolean { + True(True), + False(False), + Both(True, False), + Complex { + truthy: True, + falsey: False + } + } +} From fd7482e322753db228772dc7b489d1b8967c4d59 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 27 Jan 2024 18:49:29 +0000 Subject: [PATCH 24/39] Add `derive(Spanned)` macro. 
--- src/lib.rs | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 473d4ae..f3a3fd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,32 +37,4 @@ mod macro_test { #[ECMARef("BooleanLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.2")] #[allow(unused)] struct LitBool; - - #[derive(Spanned)] - struct True(Span); - - #[derive(Spanned)] - struct False { - span: Span, - ghost: PhantomData, - } - - #[derive(Spanned)] - struct Is { - span: Span, - } - - #[derive(Spanned)] - struct IsTrue(Is, True); - - #[derive(Spanned)] - enum Boolean { - True(True), - False(False), - Both(True, False), - Complex { - truthy: True, - falsey: False - } - } -} +} \ No newline at end of file From 127fcb8feeb8dcd5619764daa2aa06024a4e102c Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sat, 27 Jan 2024 18:51:22 +0000 Subject: [PATCH 25/39] Add `derive(Spanned)` macro. --- src/lib.rs | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index f3a3fd8..4d6e0ae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,4 +37,33 @@ mod macro_test { #[ECMARef("BooleanLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.2")] #[allow(unused)] struct LitBool; -} \ No newline at end of file + + #[derive(Spanned)] + struct True(Span); + + #[derive(Spanned)] + struct False { + span: Span, + ghost: PhantomData, + } + + #[derive(Spanned)] + struct Is { + span: Span, + } + + #[derive(Spanned)] + struct IsTrue(Is, True); + + #[derive(Spanned)] + #[allow(unused)] + enum Boolean { + True(True), + False(False), + Both(True, False), + Complex { + truthy: True, + falsey: False + } + } +} From 0aebafe763dd5fe269df0da9531a06caec283481 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 14:48:04 +0000 Subject: [PATCH 26/39] `Spanned`: Add `impl`s for collections helpers, update docs for `#[derive(Spanned)]`. 
--- macros/src/lib.rs | 87 ++++++++++++++++++++++++++++++++++- macros/src/spanned.rs | 16 +------ macros/src/type_traversal.rs | 7 --- src/common/location.rs | 35 ++++++++++---- src/lexing/tokens/mod.rs | 5 +- src/lexing/utils/lex_impls.rs | 20 +++++++- src/lexing/utils/stream.rs | 2 +- 7 files changed, 138 insertions(+), 34 deletions(-) diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 84fcc77..2ef6808 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -101,7 +101,92 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { } /// -/// +/// ## derive(Spanned) +/// +/// Derives the Spanned trait for both structs and enums. +/// +/// ### Terminal Tokens +/// ```ignore +/// /// +/// /// (1) Named span field. +/// /// +/// /// ASCII digit '0'..='9'. +/// /// +/// #[derive(Spanned)] +/// struct Digit { +/// letter: char, +/// span: Span, +/// } +/// +/// /// +/// /// (2) Tuple struct. +/// /// +/// /// Literally `.` +/// /// +/// #[derive(Spanned)] +/// struct Dot(Span); +/// ``` +/// These are not composed of any smaller tokens. These *must* either: +/// 1. have a name `span: Span` field, or +/// 2. be a tuple struct with *only* a single Span field. +/// +/// *** +/// +/// ### Non-terminal Tokens +/// ```ignore +/// /// +/// /// (1.1) Named Struct +/// /// +/// /// A base-10 decimal number, +/// /// with optional integral part. +/// /// +/// #[derive(Spanned)] +/// struct Decimal { +/// integral: Many, +/// point: Dot, +/// mantissa: AtLeast<1, Digit> +/// } +/// +/// /// +/// /// (1.2) Tuple struct +/// /// +/// /// A base-10 integer. +/// /// +/// #[derive(Spanned)] +/// struct Integer(AtLeast<1, Digit>); +/// +/// /// +/// /// (2.1) Enum (union of tokens). +/// /// +/// /// A number: either an integer, or floating-point. +/// /// +/// #[derive(Spanned)] +/// enum Number { +/// Decimal(Decimal), +/// Integer(Integer), +/// } +/// +/// /// +/// /// (2.2) More complex enum. +/// /// +/// /// Either a base-10 integer, or hex integer. 
+/// /// +/// #[derive(Spanned)] +/// enum NumberOrHex { +/// Base10(AtLeast<1, Digit>), +/// Base16(v!(0x), AtLeast<1, HexDigit>), +/// } +/// ``` +/// +/// These tokens derive their span from all of their child tokens. +/// They can be expressed either as: +/// +/// 1. Structs, either: +/// 1. Named, or +/// 2. Tuple. +/// 2. Enums: +/// 1. Union types, and even +/// 2. More complicated structures. /// #[proc_macro_derive(Spanned)] pub fn spanned(target: Tokens) -> Tokens { diff --git a/macros/src/spanned.rs b/macros/src/spanned.rs index 9ad7321..4a518e4 100644 --- a/macros/src/spanned.rs +++ b/macros/src/spanned.rs @@ -8,7 +8,7 @@ use quote::quote; use syn::{punctuated::Punctuated, spanned::Spanned}; use crate::type_traversal::{ - field_access, index, is_named_type, lit_str, self_keyword, variant_path, Generic, ToMember, + field_access, index, is_named_type, self_keyword, variant_path, Generic, ToMember, }; mod paths { @@ -99,7 +99,7 @@ fn span_of(expr: syn::Expr) -> syn::Expr { fn spans_of(exprs: impl IntoIterator) -> syn::Expr { let spans = exprs.into_iter().map(span_of); - let base = syn::Expr::Call(syn::ExprCall { + syn::Expr::Call(syn::ExprCall { attrs: Default::default(), func: Box::new(paths::combine()), paren_token: Default::default(), @@ -108,18 +108,6 @@ fn spans_of(exprs: impl IntoIterator) -> syn::Expr { bracket_token: Default::default(), elems: Punctuated::from_iter(spans), })]), - }); - - syn::Expr::MethodCall(syn::ExprMethodCall { - attrs: Default::default(), - receiver: Box::new(base), - dot_token: Default::default(), - method: syn::Ident::new("expect", Span::call_site()), - turbofish: Default::default(), - paren_token: Default::default(), - args: Punctuated::from_iter([lit_str( - "AUTOGENERATED: Should always have >= 1 spanned fields", - )]), }) } diff --git a/macros/src/type_traversal.rs b/macros/src/type_traversal.rs index baa988f..4b13116 100644 --- a/macros/src/type_traversal.rs +++ b/macros/src/type_traversal.rs @@ -104,13 +104,6 @@ impl 
Generic for syn::ItemEnum { } } -pub fn lit_str(st: &str) -> syn::Expr { - syn::Expr::Lit(syn::ExprLit { - attrs: Default::default(), - lit: syn::Lit::Str(syn::LitStr::new(st, Span::call_site())), - }) -} - pub fn variant_path(var: &syn::Ident) -> syn::Path { syn::Path { leading_colon: Default::default(), diff --git a/src/common/location.rs b/src/common/location.rs index ccb46c0..6b19d47 100644 --- a/src/common/location.rs +++ b/src/common/location.rs @@ -54,7 +54,7 @@ impl Span { /// Returns Some(subspan), given the relative indexes from the start of this span, /// returning None if the end index is out of this span's bounds. /// - pub fn subspan(&self, indexes: impl RangeBounds) -> Option { + pub fn subspan(&self, indexes: impl RangeBounds) -> Option { let start = match indexes.start_bound() { Bound::Included(included) => self.start + included, Bound::Excluded(excluded) => self.start + (excluded + 1), @@ -79,12 +79,12 @@ impl Span { /// Use this [Span] as a start, taking the range between this span's start, /// and the end oi the last of the passed in iterator (including itself). /// - pub fn combine(self, others: impl IntoIterator) -> Span { + pub fn combine(self, others: impl IntoIterator) -> Self { let Self { start, end } = self; - // Take the end bound of the last Span, + // Take the end bound of the last non-empty Span, // if others is not empty, and use that instead. - let last = others.into_iter().last(); + let last = others.into_iter().filter(|s| !s.is_empty()).last(); if let Some(Self { end, .. }) = last { return Self { start, end }; } @@ -98,6 +98,23 @@ impl Span { pub fn as_range(&self) -> Range { self.start.0..self.end.0 } + + /// + /// Returns an empty span. + /// + pub fn empty() -> Self { + Self { + start: Loc(0), + end: Loc(0), + } + } + + /// + /// Is this [Span] empty (captures nothing). 
+ /// + pub fn is_empty(&self) -> bool { + self.end.0 - self.start.0 == 0 + } } /// @@ -109,13 +126,15 @@ pub trait SpanIter: Sized + IntoIterator { /// resulting in a [Span] encompassing all /// passed in [Span]s (assuming this iter is in ascending order). /// - fn combine(self) -> Option; + fn combine(self) -> Span; } impl> SpanIter for Iter { - fn combine(self) -> Option { + fn combine(self) -> Span { let mut iter = self.into_iter(); - iter.next().map(|s| s.combine(iter)) + iter.next() + .map(|s| s.combine(iter)) + .unwrap_or(Span::empty()) } } @@ -218,6 +237,6 @@ mod tests { let s2 = (13..23).to_span(&source); let s3 = (23..26).to_span(&source); - assert_eq!([s1, s2, s3].combine(), Some((0..26).to_span(&source))); + assert_eq!([s1, s2, s3].combine(), (0..26).to_span(&source)); } } diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index 8e94b0a..c0c96ae 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -23,9 +23,10 @@ impl LexT for Verbatim { locs.push((loc..(loc+1)).to_span(input.source())); } + Ok(Self { - span: locs.into_iter().combine() - .expect("DO PUT EMPTY STRINGS IN VERBATIM!"), + // If A == "", then an empty Span is returned. + span: locs.into_iter().combine(), }) } diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index 32558f9..3819e6a 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -4,7 +4,7 @@ use std::ops::{Deref, DerefMut}; -use crate::common::Source; +use crate::common::{Source, Span, SpanIter, Spanned}; use super::{LexError, LexT, SourceStream}; @@ -29,6 +29,12 @@ impl LexT for Many { } } +impl Spanned for Many { + fn span(&self) -> Span { + SpanIter::combine(self.iter().map(S::span)) + } +} + /// /// At least N lots of `L`-tokens. 
/// @@ -55,6 +61,12 @@ impl LexT for AtLeast { } } +impl Spanned for AtLeast { + fn span(&self) -> Span { + SpanIter::combine(self.iter().map(S::span)) + } +} + impl Deref for AtLeast { type Target = Vec; @@ -103,6 +115,12 @@ where } } +impl Spanned for Exactly { + fn span(&self) -> Span { + SpanIter::combine(self.iter().map(S::span)) + } +} + impl Deref for Exactly { type Target = [L; N]; diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index 51d2a53..47318cc 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -29,7 +29,7 @@ impl<'a, S: Source> SourceStream<'a, S> { /// came from. /// pub fn source(&self) -> &S { - &self.source + self.source } /// From 7b1aaca3fb5fc2cb41c01c47b53b6606c1b5134b Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 19:18:27 +0000 Subject: [PATCH 27/39] Add utility debug functions for `LexResult` --- src/lexing/utils/result.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index 5fbe90b..1766e20 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -48,3 +48,39 @@ pub enum LexResult { /// Nothing, } + +impl LexResult { + /// + /// Allegory of [Result::expect] + /// + pub fn expect(self, msg: impl ToString) -> L { + match self { + LexResult::Lexed(lexed) => lexed, + LexResult::Errant(errant) => panic!("{}: {errant:?}", msg.to_string()), + LexResult::Nothing => panic!("{}: on LexResult::Nothing", msg.to_string()), + } + } + + /// + /// Allegory of [Result::unwrap] + /// + pub fn unwrap(self) -> L { + match self { + LexResult::Lexed(lexed) => lexed, + LexResult::Errant(errant) => panic!("called `LexResult::unwrap()` on an `Errant` value: {errant:?}"), + LexResult::Nothing => panic!("called `LexResult::unwrap()` on a `Nothing` value"), + } + } + + pub fn is_errant(&self) -> bool { + matches!(self, Self::Errant(_)) + } + + pub fn is_lexed(&self) -> bool { + 
matches!(self, Self::Lexed(_)) + } + + pub fn is_nothing(&self) -> bool { + matches!(self, Self::Nothing) + } +} From e8747f59c4cf4d14105539b79b2fdf74773565db Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 19:20:42 +0000 Subject: [PATCH 28/39] Add impl for `Loc -> Span` conversion, and Formatting --- src/common/location.rs | 12 ++++++++++++ src/common/source.rs | 24 +++++++++++++++++++----- src/lib.rs | 5 +---- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/common/location.rs b/src/common/location.rs index 6b19d47..dbf1b22 100644 --- a/src/common/location.rs +++ b/src/common/location.rs @@ -158,6 +158,18 @@ impl RangeBounds for Span { } } +impl From for Span { + /// + /// Span of a single character. + /// + fn from(start: Loc) -> Self { + Self { + start, + end: start +1, + } + } +} + /// /// Returns the span attached to this /// object. diff --git a/src/common/source.rs b/src/common/source.rs index 7230439..16d05f7 100644 --- a/src/common/source.rs +++ b/src/common/source.rs @@ -4,6 +4,12 @@ use std::ops::{Bound, Range, RangeBounds}; +use crate::lexing::utils::SourceStream; +use super::{Loc, Span}; + +#[cfg(test)] +pub use testing_only::DummySource; + /// /// Generic idea of source code: could be a file, /// or a simple string. @@ -46,7 +52,20 @@ pub trait Source { /// fn source_at(&self, span: Span) -> Option; + /// + /// Get the characters in this [Source]. + /// fn characters(&self) -> &[char]; + + /// + /// Crate a stream from this source. 
+ /// + fn stream(&self) -> SourceStream + where + Self: Sized, + { + SourceStream::new(self) + } } /// @@ -80,11 +99,6 @@ impl> ToSpan for R { } } -#[cfg(test)] -pub use testing_only::DummySource; - -use super::{Loc, Span}; - #[cfg(test)] mod testing_only { use std::ops::Range; diff --git a/src/lib.rs b/src/lib.rs index 4d6e0ae..27749aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,9 +61,6 @@ mod macro_test { True(True), False(False), Both(True, False), - Complex { - truthy: True, - falsey: False - } + Complex { truthy: True, falsey: False }, } } From 0eab3712f18c2b9d06f11d782ced1bde14d2a0b3 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 19:21:18 +0000 Subject: [PATCH 29/39] Add `verbatim!`. --- macros/src/lib.rs | 51 +++++++-- macros/src/verbatim.rs | 214 +++++++++++++++++++++++++++++++++++++ src/lexing/mod.rs | 2 + src/lexing/tokens/mod.rs | 106 ++++++++++++++++-- src/lexing/utils/stream.rs | 60 +++++++++-- 5 files changed, 410 insertions(+), 23 deletions(-) create mode 100644 macros/src/verbatim.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 2ef6808..9fbadba 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -2,17 +2,19 @@ //! Macros for the main crate. //! -#![feature(proc_macro_diagnostic)] +#![feature(proc_macro_diagnostic, char_min)] mod spanned; mod type_traversal; mod utils; +mod verbatim; use proc_macro::{Diagnostic, Level, Span, TokenStream as Tokens}; use quote::ToTokens; use spanned::{derive_spanned_for_enum, derive_spanned_for_struct}; use syn::parse_macro_input; use utils::{get_item_attrs, ECMARef, JSON5Ref, ToRustdoc}; +use verbatim::VerbatimPat; /// /// ## SpecRef @@ -102,9 +104,9 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// /// ## derive(Spanned) -/// +/// /// Derives the Spanned trait for both structs and enums. 
-/// +/// /// ### Terminal Tokens /// ```ignore /// /// @@ -129,9 +131,9 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// These are not composed of any smaller tokens. These *must* either: /// 1. have a name `span: Span` field, or /// 2. be a tuple struct with *only* a single Span field. -/// +/// /// *** -/// +/// /// ### Non-terminal Tokens /// ```ignore /// /// @@ -146,7 +148,7 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// point: Dot, /// mantissa: AtLeast<1, Digit> /// } -/// +/// /// /// /// /// (1.2) Tuple struct /// /// @@ -154,7 +156,7 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// /// /// #[derive(Spanned)] /// struct Integer(AtLeast<1, Digit>); -/// +/// /// /// /// /// (2.1) Enum (union of tokens). /// /// @@ -177,10 +179,10 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// Base16(v!(0x), AtLeast<1, HexDigit>), /// } /// ``` -/// +/// /// These tokens derive their span from all of their child tokens. /// They can be expressed either as: -/// +/// /// 1. Structs, either: /// 1. Named, or /// 2. Tuple. @@ -207,3 +209,34 @@ pub fn spanned(target: Tokens) -> Tokens { Default::default() } + +/// +/// ## verbatim! +/// +/// Often shortend to `v!`, use *this* macro instead +/// of its struct helper friends `Verbatim<...>`, `CharPattern<...>`. 
+/// +/// ### Examples +/// ```ignore +/// use avjason_macros::verbatim as v; +/// +/// // (1) Single char match -> Verbatim<{char as &str}> +/// type Comma = v!(','); +/// +/// // (2) String match -> Verbatim<{&str}> +/// type NaN = v!("NaN"); +/// +/// // (3) Char range match -> CharPattern<{CharacterRange { +/// // start: start, +/// // end: end, // (modified to make the end exclusive) +/// // }}> +/// type Digit = v!('0'..='9'); +/// type NonZero = v!('1'..='9'); +/// ``` +/// +#[proc_macro] +pub fn verbatim(params: Tokens) -> Tokens { + let params: VerbatimPat = syn::parse_macro_input!(params); + let ty = params.into_type(); + ty.into_token_stream().into() +} diff --git a/macros/src/verbatim.rs b/macros/src/verbatim.rs new file mode 100644 index 0000000..8c400a3 --- /dev/null +++ b/macros/src/verbatim.rs @@ -0,0 +1,214 @@ +//! +//! Utilities for the `verbatim!` macro. +//! + +use std::ops::Deref; + +use proc_macro2::Span; +use syn::parse::{Parse, ParseStream}; + +use self::paths::generic_path; + +pub enum VerbatimPat { + LitStr(syn::LitStr), + LitChar(syn::LitChar), + CharRange(char, char), +} + +mod paths { + use proc_macro2::Span; + use syn::punctuated::Punctuated; + + use crate::type_traversal::ToMember; + + /// + /// Makes an ident from a string, + /// with the Span resolving to Span::call_site() + /// + fn ident(st: &str) -> syn::Ident { + syn::Ident::new(st, Span::call_site()) + } + + /// + /// ```ignore + /// $path<$arg> + /// ``` + /// + pub fn generic_path(path: [&str; N], arg: syn::GenericArgument) -> syn::Type { + syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + path[..N - 1] + .iter() + .copied() + .map(ident) + .map(syn::PathSegment::from) + .chain([syn::PathSegment { + ident: ident(path[N - 1]), + arguments: syn::PathArguments::AngleBracketed( + syn::AngleBracketedGenericArguments { + colon2_token: Default::default(), + lt_token: Default::default(), + args: 
Punctuated::from_iter([arg]), + gt_token: Default::default(), + }, + ), + }]), + ), + }, + }) + } + + pub fn character_range(start: syn::Expr, end: syn::Expr) -> syn::Expr { + let path = syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + ["crate", "lexing", "CharacterRange"] + .map(ident) + .map(syn::PathSegment::from), + ), + }; + + syn::Expr::Struct(syn::ExprStruct { + attrs: Default::default(), + qself: Default::default(), + path, + brace_token: Default::default(), + fields: Punctuated::from_iter([("start", start), ("end", end)].map(|(f, expr)| { + syn::FieldValue { + attrs: Default::default(), + member: ident(f).to_member(), + colon_token: Some(Default::default()), + expr, + } + })), + dot2_token: None, + rest: None, + }) + } +} + +impl VerbatimPat { + /// + /// Build the AST for this pattern, + /// using helper structs in the main crate. + /// + pub fn into_type(self) -> syn::Type { + match self { + VerbatimPat::LitStr(st) => paths::generic_path( + ["crate", "lexing", "Verbatim"], + syn::GenericArgument::Const(syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(st), + })), + ), + VerbatimPat::LitChar(ch) => paths::generic_path( + ["crate", "lexing", "Verbatim"], + syn::GenericArgument::Const(syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Str(syn::LitStr::new( + &ch.value().to_string(), + Span::call_site(), + )), + })), + ), + VerbatimPat::CharRange(start, end) => { + let bounds = [start, end].map(|c| syn::LitChar::new(c, Span::call_site())); + let [start, end] = bounds.map(|ch| { + syn::Expr::Lit(syn::ExprLit { + attrs: Default::default(), + lit: syn::Lit::Char(ch), + }) + }); + + let constructed = paths::character_range(start, end); + let braced = syn::Expr::Block(syn::ExprBlock { + attrs: Default::default(), + label: Default::default(), + block: syn::Block { + brace_token: Default::default(), + stmts: vec![syn::Stmt::Expr(constructed, None)], + }, + }); + + let const_param = 
syn::GenericArgument::Const(braced); + + generic_path(["crate", "lexing", "CharPattern"], const_param) + } + } + } +} + +/// +/// Is this expression a char literal? +/// +fn is_lit_char(expr: &impl Deref) -> bool { + let expr = expr.deref(); + matches!( + expr, + syn::Expr::Lit(syn::ExprLit { + lit: syn::Lit::Char(_), + .. + }) + ) +} + +/// +/// Gets the character value if this expression is a char literal. +/// +fn get_char(expr: &impl Deref) -> Option { + match expr.deref() { + syn::Expr::Lit(syn::ExprLit { + lit: syn::Lit::Char(litchar), + .. + }) => Some(litchar.value()), + _ => None, + } +} + +impl Parse for VerbatimPat { + fn parse(input: ParseStream) -> syn::Result { + let pat = syn::Pat::parse_single(input)?; + + // Nasty pattern matching, but that's the downside of enums galore. + match pat { + syn::Pat::Lit(syn::ExprLit { + lit: lit @ (syn::Lit::Char(_) | syn::Lit::Str(_)), + .. + }) => match lit { + syn::Lit::Char(ch) => Ok(Self::LitChar(ch)), + syn::Lit::Str(st) => Ok(Self::LitStr(st)), + _ => unreachable!(), + }, + syn::Pat::Range(syn::PatRange { + start, end, limits, .. 
+ }) if start.as_ref().map(is_lit_char).unwrap_or(true) + && end.as_ref().map(is_lit_char).unwrap_or(true) => + { + let c_start = start.as_ref().and_then(get_char).unwrap_or(char::MIN); + let c_end = end.as_ref().and_then(get_char).unwrap_or(char::MAX); + + let (c_start, c_end) = match limits { + syn::RangeLimits::HalfOpen(_) => (c_start, Some(c_end)), + syn::RangeLimits::Closed(_) => (c_start, char::from_u32(c_end as u32 + 1)), + }; + + if c_end.is_none() { + return Err(syn::Error::new_spanned( + end, + "This char literal cannot be used as an inclusive end.", + )); + } + + let (start, end) = (c_start, c_end.unwrap()); + Ok(Self::CharRange(start, end)) + } + _ => Err(syn::Error::new_spanned( + pat, + "Only string and char literals, and char ranges are accepted here", + )), + } + } +} diff --git a/src/lexing/mod.rs b/src/lexing/mod.rs index 87915e7..6edf548 100644 --- a/src/lexing/mod.rs +++ b/src/lexing/mod.rs @@ -7,3 +7,5 @@ pub mod utils; pub mod tokens; +pub use tokens::{CharPattern, Verbatim}; +pub use utils::stream::{SourceStream, CharacterRange}; \ No newline at end of file diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index c0c96ae..be9f897 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -1,11 +1,17 @@ //! //! Lexical tokens. -//! +//! + +use avjason_macros::Spanned; use crate::common::{Loc, Source, Span, SpanIter, ToSpan}; -use super::utils::{LexError, LexT, SourceStream}; +use super::utils::{stream::CharacterRange, LexError, LexT, SourceStream}; +/// +/// Do not use me directly, use [crate::verbatim] instead! 
+/// +#[derive(Debug, Spanned)] pub struct Verbatim { span: Span, } @@ -17,17 +23,105 @@ impl LexT for Verbatim { fn lex(input: &mut SourceStream) -> Result { let mut locs = vec![]; - + for _ in 0..A.len() { let (Loc(loc), _) = input.take().unwrap(); - locs.push((loc..(loc+1)).to_span(input.source())); + locs.push((loc..(loc + 1)).to_span(input.source())); } - Ok(Self { // If A == "", then an empty Span is returned. span: locs.into_iter().combine(), }) + } +} + +/// +/// Matches a character with a given range. +/// +#[derive(Debug, Spanned)] +pub struct CharPattern { + raw: char, + span: Span, +} + +impl CharPattern { + pub fn raw(&self) -> &char { + &self.raw + } +} + +impl LexT for CharPattern { + fn peek(input: &SourceStream) -> bool { + input.upcoming(&R) + } + + fn lex(input: &mut SourceStream) -> Result { + let (loc, raw) = input.take().unwrap(); + Ok(Self { + raw, + span: Span::from(loc), + }) + } +} + +#[cfg(test)] +mod tests { + use avjason_macros::verbatim as v; + + use crate::{ + common::{file::SourceFile, Source}, + lexing::{ + tokens::CharPattern, + utils::{stream::CharacterRange, Many}, + }, + }; + + use super::Verbatim; + + #[test] + fn verbatim() { + let source = SourceFile::dummy_file(",."); + let input = &mut source.stream(); + let comma: Verbatim<","> = input.lex().expect("Valid parse"); + println!("{comma:?}") + } + + #[test] + fn ranged() { + const DIGIT: CharacterRange = CharacterRange { + start: '0', + end: ':', + }; + + let source = SourceFile::dummy_file("126439012363421890"); + let input = &mut source.stream(); + let digit: Many> = input.lex().expect("Valid parse"); + println!("{digit:?}") + } + + #[test] + fn macro_test() { + type Comma = v!(','); + type DoubleColon = v!("::"); + type Digit = v!('0'..='9'); + + { + let source = SourceFile::dummy_file(","); + let input = &mut source.stream(); + let _: Comma = input.lex().expect("Valid parse"); + } + { + let source = SourceFile::dummy_file("::"); + let input = &mut source.stream(); + let _: 
DoubleColon = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("126439012363421890"); + let input = &mut source.stream(); + let _: Many = input.lex().expect("Valid parse"); + } } -} \ No newline at end of file +} diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index 47318cc..3f83787 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -1,13 +1,9 @@ +use std::{marker::ConstParamTy, ops::{Range, RangeInclusive}}; + use crate::common::{Loc, Source, Span, Spanned, ToSpan}; use super::{Lex, LexResult}; -#[derive(Debug, Clone)] -pub struct SourceStream<'a, S: Source> { - index: usize, - source: &'a S, -} - /// /// Things that [SourceStream] can /// check are coming up. @@ -19,15 +15,63 @@ pub trait Lookahead { impl Lookahead for str { fn upcoming(&self, input: &SourceStream) -> bool { let chars = self.chars().collect::>(); - input.source.characters()[input.index..(input.index + chars.len())] == chars + input + .source + .characters() + .get(input.index..(input.index + chars.len())) + .map(|st| st == chars) + .unwrap_or(false) } } +/// +/// A const-friendly implementation of [RangeBounds]. +/// +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub struct CharacterRange { + /// + /// Inclusive start. + /// + pub start: char, + + /// + /// Exclusive end. + /// + pub end: char, +} + + +impl ConstParamTy for CharacterRange {} + +impl Lookahead for CharacterRange { + fn upcoming(&self, input: &SourceStream) -> bool { + input + .source + .characters() + .get(input.index) + .map(|ch| (self.start..self.end).contains(ch)) + .unwrap_or(false) + } +} + +#[derive(Debug, Clone)] +pub struct SourceStream<'a, S: Source> { + index: usize, + source: &'a S, +} + impl<'a, S: Source> SourceStream<'a, S> { + /// + /// Create a new stream from a source. + /// + pub fn new(source: &'a S) -> Self { + Self { index: 0, source } + } + /// /// Returns the source where this [SourceStream] /// came from. 
- /// + /// pub fn source(&self) -> &S { self.source } From 250e4ecd0b73bac6e028c62ffc1e698bd617d07b Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 19:29:57 +0000 Subject: [PATCH 30/39] Minor documentation, impls. --- macros/src/verbatim.rs | 20 +++++++++++++++++++- src/lexing/tokens/mod.rs | 4 +++- src/lexing/utils/result.rs | 4 +++- src/lexing/utils/stream.rs | 11 ++++++++--- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/macros/src/verbatim.rs b/macros/src/verbatim.rs index 8c400a3..a20fd18 100644 --- a/macros/src/verbatim.rs +++ b/macros/src/verbatim.rs @@ -9,6 +9,9 @@ use syn::parse::{Parse, ParseStream}; use self::paths::generic_path; +/// +/// Accepted patterns for `verbatim!`. +/// pub enum VerbatimPat { LitStr(syn::LitStr), LitChar(syn::LitChar), @@ -30,6 +33,11 @@ mod paths { } /// + /// Generates a path with the last segment + /// having generic parameters. + /// + /// Equivalent to: + /// /// ```ignore /// $path<$arg> /// ``` @@ -61,6 +69,16 @@ mod paths { }) } + /// + /// Equivalent to: + /// + /// ```ignore + /// crate::lexing::CharacterRange { + /// start: $start, + /// end: $end, + /// } + /// ``` + /// pub fn character_range(start: syn::Expr, end: syn::Expr) -> syn::Expr { let path = syn::Path { leading_colon: None, @@ -172,7 +190,7 @@ impl Parse for VerbatimPat { fn parse(input: ParseStream) -> syn::Result { let pat = syn::Pat::parse_single(input)?; - // Nasty pattern matching, but that's the downside of enums galore. + // Nasty pattern matching, but that's the downside of nested enums. 
match pat { syn::Pat::Lit(syn::ExprLit { lit: lit @ (syn::Lit::Char(_) | syn::Lit::Str(_)), diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index be9f897..afbad30 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -9,7 +9,9 @@ use crate::common::{Loc, Source, Span, SpanIter, ToSpan}; use super::utils::{stream::CharacterRange, LexError, LexT, SourceStream}; /// -/// Do not use me directly, use [crate::verbatim] instead! +/// Looks for a particular string in input. +/// +/// **Do not use me directly, use [crate::verbatim] instead!** /// #[derive(Debug, Spanned)] pub struct Verbatim { diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index 1766e20..9d23afc 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -1,8 +1,10 @@ +use avjason_macros::Spanned; + use crate::common::{Source, Span, Spanned}; use super::SourceStream; -#[derive(Debug)] +#[derive(Debug, Spanned)] pub struct LexError { span: Span, message: String, diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index 3f83787..b5a9c25 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -1,4 +1,4 @@ -use std::{marker::ConstParamTy, ops::{Range, RangeInclusive}}; +use std::marker::ConstParamTy; use crate::common::{Loc, Source, Span, Spanned, ToSpan}; @@ -25,7 +25,10 @@ impl Lookahead for str { } /// -/// A const-friendly implementation of [RangeBounds]. +/// A const-friendly implementation of [std::ops::Range]. +/// +/// This works with the [crate::verbatim] macro to support +/// the range syntax: `v!('0'..='9')`. /// #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] pub struct CharacterRange { @@ -40,7 +43,6 @@ pub struct CharacterRange { pub end: char, } - impl ConstParamTy for CharacterRange {} impl Lookahead for CharacterRange { @@ -97,6 +99,9 @@ impl<'a, S: Source> SourceStream<'a, S> { Lex::lex(self) } + /// + /// Checks if a lookahead pattern is next in the stream. 
+ /// pub fn upcoming(&self, lookahead: &L) -> bool { lookahead.upcoming(self) } From 932134b585b15765f98e1e67af3179d51dccd264 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 19:38:11 +0000 Subject: [PATCH 31/39] Refactor: moving `Verbatim` et al. to `lexing/utils`. --- src/lexing/mod.rs | 9 ++- src/lexing/tokens/mod.rs | 125 -------------------------------- src/lexing/utils/mod.rs | 1 + src/lexing/utils/verbatim.rs | 135 +++++++++++++++++++++++++++++++++++ src/lib.rs | 6 +- 5 files changed, 146 insertions(+), 130 deletions(-) create mode 100644 src/lexing/utils/verbatim.rs diff --git a/src/lexing/mod.rs b/src/lexing/mod.rs index 6edf548..a3a14d3 100644 --- a/src/lexing/mod.rs +++ b/src/lexing/mod.rs @@ -4,8 +4,11 @@ //! some [lexical grammar](https://en.wikipedia.org/wiki/Lexical_grammar). //! -pub mod utils; pub mod tokens; +pub mod utils; -pub use tokens::{CharPattern, Verbatim}; -pub use utils::stream::{SourceStream, CharacterRange}; \ No newline at end of file +pub use utils::{ + stream::CharacterRange, + verbatim::{CharPattern, Verbatim}, + AtLeast, Exactly, Lex, LexError, LexResult, LexT, Many, Peek, SourceStream, +}; diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index afbad30..492ae42 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -2,128 +2,3 @@ //! Lexical tokens. //! -use avjason_macros::Spanned; - -use crate::common::{Loc, Source, Span, SpanIter, ToSpan}; - -use super::utils::{stream::CharacterRange, LexError, LexT, SourceStream}; - -/// -/// Looks for a particular string in input. 
-/// -/// **Do not use me directly, use [crate::verbatim] instead!** -/// -#[derive(Debug, Spanned)] -pub struct Verbatim { - span: Span, -} - -impl LexT for Verbatim { - fn peek(input: &SourceStream) -> bool { - input.upcoming(A) - } - - fn lex(input: &mut SourceStream) -> Result { - let mut locs = vec![]; - - for _ in 0..A.len() { - let (Loc(loc), _) = input.take().unwrap(); - locs.push((loc..(loc + 1)).to_span(input.source())); - } - - Ok(Self { - // If A == "", then an empty Span is returned. - span: locs.into_iter().combine(), - }) - } -} - -/// -/// Matches a character with a given range. -/// -#[derive(Debug, Spanned)] -pub struct CharPattern { - raw: char, - span: Span, -} - -impl CharPattern { - pub fn raw(&self) -> &char { - &self.raw - } -} - -impl LexT for CharPattern { - fn peek(input: &SourceStream) -> bool { - input.upcoming(&R) - } - - fn lex(input: &mut SourceStream) -> Result { - let (loc, raw) = input.take().unwrap(); - Ok(Self { - raw, - span: Span::from(loc), - }) - } -} - -#[cfg(test)] -mod tests { - use avjason_macros::verbatim as v; - - use crate::{ - common::{file::SourceFile, Source}, - lexing::{ - tokens::CharPattern, - utils::{stream::CharacterRange, Many}, - }, - }; - - use super::Verbatim; - - #[test] - fn verbatim() { - let source = SourceFile::dummy_file(",."); - let input = &mut source.stream(); - let comma: Verbatim<","> = input.lex().expect("Valid parse"); - println!("{comma:?}") - } - - #[test] - fn ranged() { - const DIGIT: CharacterRange = CharacterRange { - start: '0', - end: ':', - }; - - let source = SourceFile::dummy_file("126439012363421890"); - let input = &mut source.stream(); - let digit: Many> = input.lex().expect("Valid parse"); - println!("{digit:?}") - } - - #[test] - fn macro_test() { - type Comma = v!(','); - type DoubleColon = v!("::"); - type Digit = v!('0'..='9'); - - { - let source = SourceFile::dummy_file(","); - let input = &mut source.stream(); - let _: Comma = input.lex().expect("Valid parse"); - } - - { - 
let source = SourceFile::dummy_file("::"); - let input = &mut source.stream(); - let _: DoubleColon = input.lex().expect("Valid parse"); - } - - { - let source = SourceFile::dummy_file("126439012363421890"); - let input = &mut source.stream(); - let _: Many = input.lex().expect("Valid parse"); - } - } -} diff --git a/src/lexing/utils/mod.rs b/src/lexing/utils/mod.rs index 0c18821..cf5a687 100644 --- a/src/lexing/utils/mod.rs +++ b/src/lexing/utils/mod.rs @@ -6,6 +6,7 @@ pub mod lex_impls; pub mod peek; pub mod result; pub mod stream; +pub mod verbatim; use std::marker::PhantomData; diff --git a/src/lexing/utils/verbatim.rs b/src/lexing/utils/verbatim.rs new file mode 100644 index 0000000..480f4f8 --- /dev/null +++ b/src/lexing/utils/verbatim.rs @@ -0,0 +1,135 @@ +//! +//! Pattern matching helpers. +//! + +use avjason_macros::Spanned; + +use crate::common::{Loc, Source, Span, SpanIter, ToSpan}; + +use crate::lexing::{CharacterRange, LexError, LexT, SourceStream}; + +/// +/// Looks for a particular string in input. +/// +/// *** +/// +/// **Do not use me directly, use [crate::verbatim] instead!** +/// +#[derive(Debug, Spanned)] +pub struct Verbatim { + span: Span, +} + +impl LexT for Verbatim { + fn peek(input: &SourceStream) -> bool { + input.upcoming(A) + } + + fn lex(input: &mut SourceStream) -> Result { + let mut locs = vec![]; + + for _ in 0..A.len() { + let (Loc(loc), _) = input.take().unwrap(); + locs.push((loc..(loc + 1)).to_span(input.source())); + } + + Ok(Self { + // If A == "", then an empty Span is returned. + span: locs.into_iter().combine(), + }) + } +} + +/// +/// Matches a character with a given range. 
+/// +/// *** +/// +/// **Do not use me directly, use [crate::verbatim] instead!** +/// +#[derive(Debug, Spanned)] +pub struct CharPattern { + raw: char, + span: Span, +} + +impl CharPattern { + pub fn raw(&self) -> &char { + &self.raw + } +} + +impl LexT for CharPattern { + fn peek(input: &SourceStream) -> bool { + input.upcoming(&R) + } + + fn lex(input: &mut SourceStream) -> Result { + let (loc, raw) = input.take().unwrap(); + Ok(Self { + raw, + span: Span::from(loc), + }) + } +} + +#[cfg(test)] +mod tests { + use avjason_macros::verbatim as v; + + use crate::{ + common::{file::SourceFile, Source}, + lexing::{ + CharPattern, + utils::{stream::CharacterRange, Many}, + }, + }; + + use super::Verbatim; + + #[test] + fn verbatim() { + let source = SourceFile::dummy_file(",."); + let input = &mut source.stream(); + let comma: Verbatim<","> = input.lex().expect("Valid parse"); + println!("{comma:?}") + } + + #[test] + fn ranged() { + const DIGIT: CharacterRange = CharacterRange { + start: '0', + end: ':', + }; + + let source = SourceFile::dummy_file("126439012363421890"); + let input = &mut source.stream(); + let digit: Many> = input.lex().expect("Valid parse"); + println!("{digit:?}") + } + + #[test] + fn macro_test() { + type Comma = v!(','); + type DoubleColon = v!("::"); + type Digit = v!('0'..='9'); + + { + let source = SourceFile::dummy_file(","); + let input = &mut source.stream(); + let _: Comma = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("::"); + let input = &mut source.stream(); + let _: DoubleColon = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("126439012363421890"); + let input = &mut source.stream(); + let _: Many = input.lex().expect("Valid parse"); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 27749aa..d82a3d3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,17 +12,19 @@ //! user *where* the error happened. //! 
-// This will have to be removed to solve #5 +// This will have to be removed to solve #5: #![allow(incomplete_features)] #![feature(adt_const_params)] pub mod common; pub mod lexing; +pub(crate) use avjason_macros::*; + mod macro_test { use std::marker::PhantomData; - use avjason_macros::{ECMARef, Spanned, SpecRef}; + use super::{ECMARef, Spanned, SpecRef}; use crate::common::Span; From cfdcccf5616a5d7cde03ab027533993bf8bd0d8c Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 20:52:57 +0000 Subject: [PATCH 32/39] Add `LineTerminator`, `LineTerminatorSequence` Fix bug with multi-byte chars --- src/common/file.rs | 5 +- src/common/source.rs | 9 ++- src/lexing/tokens/line_terminator.rs | 116 +++++++++++++++++++++++++++ src/lexing/tokens/mod.rs | 2 + src/lexing/tokens/whitespace.rs | 64 +++++++++++++++ src/lexing/utils/lex_impls.rs | 2 +- src/lexing/utils/mod.rs | 2 +- src/lexing/utils/result.rs | 73 ++++++++++++++++- src/lexing/utils/stream.rs | 65 ++++++++++++--- src/lexing/utils/verbatim.rs | 30 ++++--- 10 files changed, 335 insertions(+), 33 deletions(-) create mode 100644 src/lexing/tokens/line_terminator.rs create mode 100644 src/lexing/tokens/whitespace.rs diff --git a/src/common/file.rs b/src/common/file.rs index 0ff31ca..2f87b8e 100644 --- a/src/common/file.rs +++ b/src/common/file.rs @@ -4,7 +4,7 @@ use std::{fmt::Formatter, ops::Range, path::Path}; -use super::{Loc, Source, Span}; +use super::{Loc, Source, Span, Spanned}; /// /// Line and column information for @@ -162,7 +162,8 @@ impl Source for SourceFile { Loc(0)..Loc(self.chars.len()) } - fn source_at(&self, span: Span) -> Option { + fn source_at(&self, span: impl Spanned) -> Option { + let span = span.span(); if self.in_bounds(&span) { return Some(self.chars[span.as_range()].iter().collect()); } diff --git a/src/common/source.rs b/src/common/source.rs index 16d05f7..631e41c 100644 --- a/src/common/source.rs +++ b/src/common/source.rs @@ -5,7 +5,7 @@ use std::ops::{Bound, Range, 
RangeBounds}; use crate::lexing::utils::SourceStream; -use super::{Loc, Span}; +use super::{Loc, Span, Spanned}; #[cfg(test)] pub use testing_only::DummySource; @@ -50,7 +50,7 @@ pub trait Source { /// /// Returns the source code at a given [Span], if within bounds. /// - fn source_at(&self, span: Span) -> Option; + fn source_at(&self, span: impl Spanned) -> Option; /// /// Get the characters in this [Source]. @@ -103,7 +103,7 @@ impl> ToSpan for R { mod testing_only { use std::ops::Range; - use crate::common::{Loc, Span}; + use crate::common::{Loc, Span, Spanned}; use super::Source; @@ -137,7 +137,8 @@ mod testing_only { Loc(0)..Loc(self.text.len()) } - fn source_at(&self, span: Span) -> Option { + fn source_at(&self, span: impl Spanned) -> Option { + let span = span.span(); if self.in_bounds(&span) { self.text.get(span.as_range()).map(ToString::to_string) } else { diff --git a/src/lexing/tokens/line_terminator.rs b/src/lexing/tokens/line_terminator.rs new file mode 100644 index 0000000..5412f5d --- /dev/null +++ b/src/lexing/tokens/line_terminator.rs @@ -0,0 +1,116 @@ +//! +//! ## Line Terminators +//! +//! These signify the end of lines (although techincally [LineTerminatorSequence]s do!) +//! 
+ +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::Source, + lexing::{Lex, LexError, LexT, SourceStream}, +}; + +#[ECMARef("LineTerminator", "https://262.ecma-international.org/5.1/#sec-7.3")] +#[derive(Debug, Spanned)] +pub enum LineTerminator { + LF(v!('\n')), + CR(v!('\r')), + LS(v!('\u{2028}')), + PS(v!('\u{2029}')), +} + +#[ECMARef( + "LineTerminatorSequence", + "https://262.ecma-international.org/5.1/#sec-7.3" +)] +#[derive(Debug, Spanned)] +pub enum LineTerminatorSequence { + CRLF(v!("\r\n")), + LF(v!('\n')), + CR(v!('\r')), + LS(v!('\u{2028}')), + PS(v!('\u{2029}')), +} + +impl LexT for LineTerminator { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .into_result() ok since we know there's at least one upcoming variant. + Lex::lex(input) + .map(Self::LF) + .or(|| Lex::lex(input).map(Self::CR)) + .or(|| Lex::lex(input).map(Self::LS)) + .or(|| Lex::lex(input).map(Self::PS)) + .into_result() + } +} + +impl LexT for LineTerminatorSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .into_result() ok since we know there's at least one upcoming variant. 
+ Lex::lex(input) + .map(Self::CRLF) + .or(|| Lex::lex(input).map(Self::LF)) + .or(|| Lex::lex(input).map(Self::CR)) + .or(|| Lex::lex(input).map(Self::LS)) + .or(|| Lex::lex(input).map(Self::PS)) + .into_result() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::{tokens::line_terminator::LineTerminatorSequence, Exactly}, + }; + + use super::LineTerminator; + + #[test] + fn line_terminators() { + let source = SourceFile::dummy_file("\r\n\u{2028}\u{2029}"); + let input = &mut source.stream(); + let new_lines: Exactly<4, LineTerminator> = input.lex().expect("Valid parse"); + assert!(matches!( + &*new_lines, + &[ + LineTerminator::CR(_), + LineTerminator::LF(_), + LineTerminator::LS(_), + LineTerminator::PS(_) + ] + )); + } + + #[test] + fn line_terminator_sequences() { + let source = SourceFile::dummy_file("\r\r\n\n\u{2028}\u{2029}"); + let input = &mut source.stream(); + let new_lines: Exactly<5, LineTerminatorSequence> = input.lex().expect("Valid parse"); + assert!(matches!( + &*new_lines, + &[ + LineTerminatorSequence::CR(_), + LineTerminatorSequence::CRLF(_), + LineTerminatorSequence::LF(_), + LineTerminatorSequence::LS(_), + LineTerminatorSequence::PS(_) + ] + )); + } +} diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index 492ae42..ed8abea 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -2,3 +2,5 @@ //! Lexical tokens. //! +pub mod whitespace; +pub mod line_terminator; diff --git a/src/lexing/tokens/whitespace.rs b/src/lexing/tokens/whitespace.rs new file mode 100644 index 0000000..6185f3a --- /dev/null +++ b/src/lexing/tokens/whitespace.rs @@ -0,0 +1,64 @@ +//! +//! ## WhiteSpace +//! Empty space that doesn't contribute syntactically. +//! + +use avjason_macros::{ECMARef, Spanned}; +use finl_unicode::categories::{CharacterCategories, MinorCategory}; + +use crate::{ + common::{Source, Span}, + lexing::{LexError, LexT, SourceStream}, +}; + +/// +/// Whitespace characters. 
+/// +#[derive(Debug, Spanned)] +#[ECMARef("WhiteSpace", "https://262.ecma-international.org/5.1/#sec-7.2")] +pub struct WhiteSpace { + span: Span, +} + +/// +/// Is this character whitespace? +/// +/// Compliant with [Table 2, Section 7.2](https://262.ecma-international.org/5.1/#sec-7.2) of the ECMAScript specification. +/// +fn is_whitespace(ch: &char) -> bool { + use MinorCategory::Zs; + + match ch { + '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{0020}' | '\u{00A0}' | '\u{FEFF}' => true, + c if matches!(c.get_minor_category(), Zs) => true, + _ => false, + } +} + +impl LexT for WhiteSpace { + fn peek(input: &SourceStream) -> bool { + input.upcoming(is_whitespace) + } + + fn lex(input: &mut SourceStream) -> Result { + // Since Self::peek() -> there's at least one character. + let (span, _) = input.take_while(is_whitespace).unwrap(); + Ok(Self { span }) + } +} + +#[cfg(test)] +mod tests { + use crate::common::{file::SourceFile, Source}; + + use super::WhiteSpace; + + #[test] + fn lex_whitespace() { + let ws = "\t\t \t\t\u{000B}\u{000C}"; + let source = SourceFile::dummy_file(ws); + let input = &mut source.stream(); + let whitespace: WhiteSpace = input.lex().expect("Valid parse"); + assert_eq!(source.source_at(whitespace), Some(ws.to_string())) + } +} \ No newline at end of file diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index 3819e6a..a196f3a 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -2,7 +2,7 @@ //! Utility implementations for [Lex]. //! -use std::ops::{Deref, DerefMut}; +use std::{any::type_name, ops::{Deref, DerefMut}}; use crate::common::{Source, Span, SpanIter, Spanned}; diff --git a/src/lexing/utils/mod.rs b/src/lexing/utils/mod.rs index cf5a687..045ea16 100644 --- a/src/lexing/utils/mod.rs +++ b/src/lexing/utils/mod.rs @@ -20,7 +20,7 @@ pub use self::{ }; /// -/// Private trait, only for internal use. +/// For internal use. 
/// #[doc(hidden)] pub trait LexT: Sized { diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index 9d23afc..7b23092 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -1,3 +1,5 @@ +use std::any::type_name; + use avjason_macros::Spanned; use crate::common::{Source, Span, Spanned}; @@ -69,20 +71,89 @@ impl LexResult { pub fn unwrap(self) -> L { match self { LexResult::Lexed(lexed) => lexed, - LexResult::Errant(errant) => panic!("called `LexResult::unwrap()` on an `Errant` value: {errant:?}"), + LexResult::Errant(errant) => { + panic!("called `LexResult::unwrap()` on an `Errant` value: {errant:?}") + } LexResult::Nothing => panic!("called `LexResult::unwrap()` on a `Nothing` value"), } } + /// + /// Is this [LexResult::Errant]? + /// pub fn is_errant(&self) -> bool { matches!(self, Self::Errant(_)) } + /// + /// Is this [LexResult::Lexed]? + /// pub fn is_lexed(&self) -> bool { matches!(self, Self::Lexed(_)) } + /// + /// Is this [LexResult::Nothing]? + /// pub fn is_nothing(&self) -> bool { matches!(self, Self::Nothing) } + + /// + /// Allegory of [Result::map]. + /// + /// If this is [LexResult::Lexed], the mapper function will be called, + /// and then its return type will be re-wrapped. + /// + pub fn map T>(self, mapper: F) -> LexResult { + match self { + LexResult::Lexed(lexed) => LexResult::Lexed(mapper(lexed)), + LexResult::Errant(errant) => LexResult::Errant(errant), + LexResult::Nothing => LexResult::Nothing, + } + } + + /// + /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. + /// + /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] + /// with the message "expected a {$TOKEN} token". 
+ /// + pub fn expected(self, input: SourceStream) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => LexResult::Errant(LexError { + span: input.span(), + message: format!("Expected a {} token here.", type_name::()) + }), + } + } + + /// + /// If this is [LexResult::Nothing], execute the `or` function instead, + /// and return its result. + /// + /// This allows for chaining of results, which may be useful + /// in lexing enums with different variants. + /// + pub fn or Self>(self, or: F) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => or(), + } + } + + /// + /// Turn this into a normal Rust [Result], + /// [panic]-ing if this is a [LexResult::Nothing]. + /// + pub fn into_result(self) -> Result { + match self { + LexResult::Lexed(lexed) => Ok(lexed), + LexResult::Errant(errant) => Err(errant), + LexResult::Nothing => panic!("Called `LexResult::into_result()` on a Nothing value."), + } + } } diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index b5a9c25..7f1fea9 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -9,11 +9,11 @@ use super::{Lex, LexResult}; /// check are coming up. /// pub trait Lookahead { - fn upcoming(&self, input: &SourceStream) -> bool; + fn upcoming(self, input: &SourceStream) -> bool; } -impl Lookahead for str { - fn upcoming(&self, input: &SourceStream) -> bool { +impl<'a> Lookahead for &'a str { + fn upcoming(self, input: &SourceStream) -> bool { let chars = self.chars().collect::>(); input .source @@ -24,9 +24,15 @@ impl Lookahead for str { } } +impl bool> Lookahead for F { + fn upcoming(self, input: &SourceStream) -> bool { + input.peek().map(self).unwrap_or(false) + } +} + /// /// A const-friendly implementation of [std::ops::Range]. -/// +/// /// This works with the [crate::verbatim] macro to support /// the range syntax: `v!('0'..='9')`. 
/// @@ -45,8 +51,8 @@ pub struct CharacterRange { impl ConstParamTy for CharacterRange {} -impl Lookahead for CharacterRange { - fn upcoming(&self, input: &SourceStream) -> bool { +impl<'a> Lookahead for &'a CharacterRange { + fn upcoming(self, input: &SourceStream) -> bool { input .source .characters() @@ -82,16 +88,39 @@ impl<'a, S: Source> SourceStream<'a, S> { /// Take the next character in this [SourceStream]. /// pub fn take(&mut self) -> Option<(Loc, char)> { - let index = self.index; + let start = self.index; - if let Some(ch) = self.source.characters().get(index) { + if let Some(ch) = self.source.characters().get(self.index) { self.index += 1; - return Some((Loc(index), *ch)); + return Some((Loc(start), *ch)); } None } + /// + /// Take characters in this [SourceStream] whilst they + /// satisfy some predicate. + /// + pub fn take_while(&mut self, pred: impl Fn(&char) -> bool) -> Option<(Span, Vec)> { + let start = self.index; + let mut chars = vec![]; + while let Some(ch) = self.source.characters().get(self.index) { + if !pred(ch) { + break; + } + + chars.push(*ch); + self.index += 1; + } + + if chars.is_empty() { + return None; + } + + Some(((start..self.index).to_span(self.source), chars)) + } + /// /// Attempt to lex for token `L`. /// @@ -101,10 +130,24 @@ impl<'a, S: Source> SourceStream<'a, S> { /// /// Checks if a lookahead pattern is next in the stream. - /// - pub fn upcoming(&self, lookahead: &L) -> bool { + /// + pub fn upcoming(&self, lookahead: L) -> bool { lookahead.upcoming(self) } + + /// + /// Peeks at the next upcoming character. + /// + pub fn peek(&self) -> Option<&char> { + self.source.characters().get(self.index) + } + + pub fn left(&self) -> Option { + self.source + .characters() + .get(self.index..) 
+ .map(|s| s.into_iter().collect()) + } } impl<'a, S: Source> Spanned for SourceStream<'a, S> { diff --git a/src/lexing/utils/verbatim.rs b/src/lexing/utils/verbatim.rs index 480f4f8..c8a230e 100644 --- a/src/lexing/utils/verbatim.rs +++ b/src/lexing/utils/verbatim.rs @@ -1,18 +1,18 @@ //! //! Pattern matching helpers. -//! +//! use avjason_macros::Spanned; -use crate::common::{Loc, Source, Span, SpanIter, ToSpan}; +use crate::common::{Source, Span, SpanIter}; use crate::lexing::{CharacterRange, LexError, LexT, SourceStream}; /// /// Looks for a particular string in input. -/// +/// /// *** -/// +/// /// **Do not use me directly, use [crate::verbatim] instead!** /// #[derive(Debug, Spanned)] @@ -20,6 +20,12 @@ pub struct Verbatim { span: Span, } +impl Verbatim { + fn char_length() -> usize { + A.chars().count() + } +} + impl LexT for Verbatim { fn peek(input: &SourceStream) -> bool { input.upcoming(A) @@ -28,9 +34,9 @@ impl LexT for Verbatim { fn lex(input: &mut SourceStream) -> Result { let mut locs = vec![]; - for _ in 0..A.len() { - let (Loc(loc), _) = input.take().unwrap(); - locs.push((loc..(loc + 1)).to_span(input.source())); + for _ in 0..Self::char_length() { + let (loc, _) = input.take().unwrap(); + locs.push(Span::from(loc)); } Ok(Self { @@ -42,9 +48,9 @@ impl LexT for Verbatim { /// /// Matches a character with a given range. 
-/// +/// /// *** -/// +/// /// **Do not use me directly, use [crate::verbatim] instead!** /// #[derive(Debug, Spanned)] @@ -80,8 +86,8 @@ mod tests { use crate::{ common::{file::SourceFile, Source}, lexing::{ - CharPattern, utils::{stream::CharacterRange, Many}, + CharPattern, }, }; @@ -92,7 +98,6 @@ mod tests { let source = SourceFile::dummy_file(",."); let input = &mut source.stream(); let comma: Verbatim<","> = input.lex().expect("Valid parse"); - println!("{comma:?}") } #[test] @@ -105,7 +110,6 @@ mod tests { let source = SourceFile::dummy_file("126439012363421890"); let input = &mut source.stream(); let digit: Many> = input.lex().expect("Valid parse"); - println!("{digit:?}") } #[test] @@ -119,7 +123,7 @@ mod tests { let input = &mut source.stream(); let _: Comma = input.lex().expect("Valid parse"); } - + { let source = SourceFile::dummy_file("::"); let input = &mut source.stream(); From 6239744a6b105eb1358158f57840595b5f7e89a5 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 22:32:59 +0000 Subject: [PATCH 33/39] Add `Comment` token support. --- src/lexing/tokens/comment.rs | 172 ++++++++++++++++++++++++++++++++++ src/lexing/tokens/mod.rs | 1 + src/lexing/utils/lex_impls.rs | 3 +- src/lexing/utils/stream.rs | 25 ++++- src/lexing/utils/verbatim.rs | 6 +- 5 files changed, 202 insertions(+), 5 deletions(-) create mode 100644 src/lexing/tokens/comment.rs diff --git a/src/lexing/tokens/comment.rs b/src/lexing/tokens/comment.rs new file mode 100644 index 0000000..95ebac1 --- /dev/null +++ b/src/lexing/tokens/comment.rs @@ -0,0 +1,172 @@ +//! +//! Comments. +//! + +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::{Source, Span, Spanned}, + lexing::{Lex, LexError, LexT, SourceStream}, +}; + +use super::line_terminator::LineTerminator; + +/// +/// ```js +/// // Comments +/// /* of either type. 
*/ +/// ``` +/// +#[ECMARef("Comment", "https://262.ecma-international.org/5.1/#sec-7.4")] +#[derive(Debug, Spanned)] +pub enum Comment { + Single(SingleLineComment), + Multi(MultiLineComment), +} + +/// +/// ```js +/// // Single-line comment. +/// ``` +/// +#[ECMARef("SingleLineComment", "https://262.ecma-international.org/5.1/#sec-7.4")] +#[derive(Debug, Spanned)] +pub struct SingleLineComment { + span: Span, + + /// + /// Span of the contents of this comment + /// + inner: Span, +} + +/// +/// ```js +/// /* Multi-line comment. */ +/// ``` +/// +#[ECMARef("MultiLineComment", "https://262.ecma-international.org/5.1/#sec-7.4")] +#[derive(Debug, Spanned)] +pub struct MultiLineComment { + span: Span, + + /// + /// Span of the contents of this comment + /// + inner: Span, +} + +impl Comment { + pub fn inner(&self) -> Span { + match self { + Comment::Single(single) => single.inner, + Comment::Multi(multi) => multi.inner, + } + } +} + +impl LexT for Comment { + fn peek(input: &SourceStream) -> bool { + ::peek(input) || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .into_result() ok since Self::peek() -> exists either variant. 
+ Lex::lex(input) + .map(Self::Single) + .or(|| Lex::lex(input).map(Self::Multi)) + .into_result() + } +} + +impl LexT for SingleLineComment { + fn peek(input: &SourceStream) -> bool { + input.upcoming("//") + } + + fn lex(input: &mut SourceStream) -> Result { + let double_slash = ::lex(input)?; + let contents = input + .take_until(::peek) + .map(|(span, _)| span) + .unwrap_or(Span::empty()); + + Ok(Self { + span: double_slash.span().combine([contents]), + inner: contents, + }) + } +} + +impl LexT for MultiLineComment { + fn peek(input: &SourceStream) -> bool { + input.upcoming("/*") + } + + fn lex(input: &mut SourceStream) -> Result { + let opening = ::lex(input)?; + let contents = input + .take_until(::peek) + .map(|(span, _)| span) + .unwrap_or(Span::empty()); + + Ok(Self { + span: opening.span().combine([contents]), + inner: contents, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::tokens::comment::Comment, + }; + + use super::{MultiLineComment, SingleLineComment}; + + #[test] + fn single_line_comment() { + { + let source = SourceFile::dummy_file("// An apple a day..."); + let input = &mut source.stream(); + let comment: SingleLineComment = input.lex().expect("Valid parse"); + + assert_eq!( + source.source_at(comment.inner), + Some(" An apple a day...".to_string()) + ); + } + } + + #[test] + fn multi_line_comment() { + { + let source = + SourceFile::dummy_file("/* An apple a day\n\r\u{2029}Keeps the doctor away! */"); + let input = &mut source.stream(); + let comment: MultiLineComment = input.lex().expect("Valid parse"); + + assert_eq!( + source.source_at(comment.inner), + Some(" An apple a day\n\r\u{2029}Keeps the doctor away! ".to_string()) + ); + } + } + + #[test] + fn comments() { + { + let source = + SourceFile::dummy_file("/* An apple a day\n\r\u{2029}Keeps the doctor away! 
*/"); + let input = &mut source.stream(); + let comment: Comment = input.lex().expect("Valid parse"); + + assert_eq!( + source.source_at(comment.inner()), + Some(" An apple a day\n\r\u{2029}Keeps the doctor away! ".to_string()) + ); + } + } +} diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index ed8abea..0cfcf34 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -4,3 +4,4 @@ pub mod whitespace; pub mod line_terminator; +pub mod comment; diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index a196f3a..8706c91 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -2,7 +2,7 @@ //! Utility implementations for [Lex]. //! -use std::{any::type_name, ops::{Deref, DerefMut}}; +use std::ops::{Deref, DerefMut}; use crate::common::{Source, Span, SpanIter, Spanned}; @@ -22,6 +22,7 @@ impl LexT for Many { let mut v = vec![]; while L::peek(input) { + println!("{:?}", input.left()); v.push(L::lex(input)?); } diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index 7f1fea9..acb66f7 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -120,6 +120,29 @@ impl<'a, S: Source> SourceStream<'a, S> { Some(((start..self.index).to_span(self.source), chars)) } + + /// + /// Take characters in this [SourceStream] until + /// the precdicate return true. + /// + pub fn take_until(&mut self, pred: impl Fn(&Self) -> bool) -> Option<(Span, Vec)> { + let start = self.index; + let mut chars = vec![]; + while let Some(ch) = self.source.characters().get(self.index) { + if pred(self) { + break; + } + + chars.push(*ch); + self.index += 1; + } + + if chars.is_empty() { + return None; + } + + Some(((start..self.index).to_span(self.source), chars)) + } /// /// Attempt to lex for token `L`. @@ -146,7 +169,7 @@ impl<'a, S: Source> SourceStream<'a, S> { self.source .characters() .get(self.index..) 
- .map(|s| s.into_iter().collect()) + .map(|s| s.iter().collect()) } } diff --git a/src/lexing/utils/verbatim.rs b/src/lexing/utils/verbatim.rs index c8a230e..fbac33c 100644 --- a/src/lexing/utils/verbatim.rs +++ b/src/lexing/utils/verbatim.rs @@ -97,7 +97,7 @@ mod tests { fn verbatim() { let source = SourceFile::dummy_file(",."); let input = &mut source.stream(); - let comma: Verbatim<","> = input.lex().expect("Valid parse"); + let _: Verbatim<","> = input.lex().expect("Valid parse"); } #[test] @@ -109,11 +109,11 @@ mod tests { let source = SourceFile::dummy_file("126439012363421890"); let input = &mut source.stream(); - let digit: Many> = input.lex().expect("Valid parse"); + let _: Many> = input.lex().expect("Valid parse"); } #[test] - fn macro_test() { + fn verbatim_macro_test() { type Comma = v!(','); type DoubleColon = v!("::"); type Digit = v!('0'..='9'); From 8802e9e5426d3eeda51b0606e5ace8f7dbe5deb5 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 22:46:06 +0000 Subject: [PATCH 34/39] Add `Punctuator` + `rustfmt` --- macros/src/lib.rs | 14 ++--- macros/src/utils.rs | 9 ++- macros/src/verbatim.rs | 8 +-- src/common/file.rs | 2 - src/common/location.rs | 4 +- src/common/mod.rs | 6 +- src/common/source.rs | 2 +- src/lexing/tokens/mod.rs | 5 +- src/lexing/tokens/punctuator.rs | 106 ++++++++++++++++++++++++++++++++ src/lexing/tokens/whitespace.rs | 2 +- src/lexing/utils/result.rs | 16 ++--- src/lexing/utils/stream.rs | 2 +- 12 files changed, 142 insertions(+), 34 deletions(-) create mode 100644 src/lexing/tokens/punctuator.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index 9fbadba..e24831a 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -212,20 +212,20 @@ pub fn spanned(target: Tokens) -> Tokens { /// /// ## verbatim! -/// +/// /// Often shortend to `v!`, use *this* macro instead /// of its struct helper friends `Verbatim<...>`, `CharPattern<...>`. 
-/// +/// /// ### Examples /// ```ignore /// use avjason_macros::verbatim as v; -/// -/// // (1) Single char match -> Verbatim<{char as &str}> +/// +/// // (1) Single char match -> Verbatim<{char as &str}> /// type Comma = v!(','); -/// -/// // (2) String match -> Verbatim<{&str}> +/// +/// // (2) String match -> Verbatim<{&str}> /// type NaN = v!("NaN"); -/// +/// /// // (3) Char range match -> CharPattern<{CharacterRange { /// // start: start, /// // end: end, // (modified to make the end exclusive) diff --git a/macros/src/utils.rs b/macros/src/utils.rs index 19e8199..78ef8d8 100644 --- a/macros/src/utils.rs +++ b/macros/src/utils.rs @@ -145,7 +145,10 @@ impl syn::parse::Parse for JSON5Ref { }); } - Ok(Self { name: None, id: first }) + Ok(Self { + name: None, + id: first, + }) } } @@ -165,7 +168,7 @@ impl ToRustdoc for JSON5Ref { /// /// Attempt to get the attribute macros for a [syn::Item]. -/// +/// pub fn get_item_attrs(item: &mut syn::Item) -> Option<&mut Vec> { match item { syn::Item::Const(syn::ItemConst { ref mut attrs, .. }) => Some(attrs), @@ -186,4 +189,4 @@ pub fn get_item_attrs(item: &mut syn::Item) -> Option<&mut Vec> syn::Item::Verbatim(_) => None, _ => None, } -} \ No newline at end of file +} diff --git a/macros/src/verbatim.rs b/macros/src/verbatim.rs index a20fd18..6b580a1 100644 --- a/macros/src/verbatim.rs +++ b/macros/src/verbatim.rs @@ -34,10 +34,10 @@ mod paths { /// /// Generates a path with the last segment - /// having generic parameters. - /// + /// having generic parameters. 
+ /// /// Equivalent to: - /// + /// /// ```ignore /// $path<$arg> /// ``` @@ -71,7 +71,7 @@ mod paths { /// /// Equivalent to: - /// + /// /// ```ignore /// crate::lexing::CharacterRange { /// start: $start, diff --git a/src/common/file.rs b/src/common/file.rs index 2f87b8e..be401b2 100644 --- a/src/common/file.rs +++ b/src/common/file.rs @@ -174,8 +174,6 @@ impl Source for SourceFile { fn characters(&self) -> &[char] { &self.chars } - - } #[cfg(test)] diff --git a/src/common/location.rs b/src/common/location.rs index dbf1b22..01538b4 100644 --- a/src/common/location.rs +++ b/src/common/location.rs @@ -161,11 +161,11 @@ impl RangeBounds for Span { impl From for Span { /// /// Span of a single character. - /// + /// fn from(start: Loc) -> Self { Self { start, - end: start +1, + end: start + 1, } } } diff --git a/src/common/mod.rs b/src/common/mod.rs index 4db3a70..07adee2 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -2,9 +2,9 @@ //! Common utilities across lexing and syntax-parsing. //! -pub mod source; -pub mod location; pub mod file; +pub mod location; +pub mod source; pub use location::*; -pub use source::*; \ No newline at end of file +pub use source::*; diff --git a/src/common/source.rs b/src/common/source.rs index 631e41c..39a09f5 100644 --- a/src/common/source.rs +++ b/src/common/source.rs @@ -4,8 +4,8 @@ use std::ops::{Bound, Range, RangeBounds}; -use crate::lexing::utils::SourceStream; use super::{Loc, Span, Spanned}; +use crate::lexing::utils::SourceStream; #[cfg(test)] pub use testing_only::DummySource; diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index 0cfcf34..792df18 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -2,6 +2,7 @@ //! Lexical tokens. //! 
-pub mod whitespace; -pub mod line_terminator; pub mod comment; +pub mod line_terminator; +pub mod punctuator; +pub mod whitespace; diff --git a/src/lexing/tokens/punctuator.rs b/src/lexing/tokens/punctuator.rs new file mode 100644 index 0000000..33c0cb9 --- /dev/null +++ b/src/lexing/tokens/punctuator.rs @@ -0,0 +1,106 @@ +//! +//! Punctuators. +//! + +use avjason_macros::{verbatim as v, SpecRef}; + +use crate::{ + common::Source, + lexing::{LexError, LexT, SourceStream}, +}; + +/// +/// `{` +/// +pub type OpenBrace = v!('{'); + +/// +/// `}` +/// +pub type CloseBrace = v!('}'); + +/// +/// `[` +/// +pub type OpenBracket = v!('['); + +/// +/// `]` +/// +pub type CloseBracket = v!(']'); + +/// +/// `:` +/// +pub type Colon = v!(':'); + +/// +/// `,` +/// +pub type Comma = v!(','); + +/// +/// `{ } [ ] : ,` +/// +#[SpecRef("JSON5Punctuator")] +pub enum Punctuator { + OpenBrace(OpenBrace), + CloseBrace(CloseBrace), + OpenBracket(OpenBracket), + CloseBracket(CloseBracket), + Colon(Colon), + Comma(Comma), +} + +impl LexT for Punctuator { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .into_result() ok since Self::peek() -> one variant present. 
+ input + .lex() + .map(Self::OpenBrace) + .or(|| input.lex().map(Self::CloseBrace)) + .or(|| input.lex().map(Self::OpenBracket)) + .or(|| input.lex().map(Self::CloseBracket)) + .or(|| input.lex().map(Self::Colon)) + .or(|| input.lex().map(Self::Comma)) + .into_result() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::Exactly, + }; + + use super::Punctuator; + + #[test] + fn mixed_test() { + let source = SourceFile::dummy_file("{}[]:,"); + let input = &mut source.stream(); + let puncts: Exactly<6, Punctuator> = input.lex().expect("Valid parse"); + + assert!(matches!( + &*puncts, + &[ + Punctuator::OpenBrace(_), + Punctuator::CloseBrace(_), + Punctuator::OpenBracket(_), + Punctuator::CloseBracket(_), + Punctuator::Colon(_), + Punctuator::Comma(_) + ] + )) + } +} diff --git a/src/lexing/tokens/whitespace.rs b/src/lexing/tokens/whitespace.rs index 6185f3a..144e001 100644 --- a/src/lexing/tokens/whitespace.rs +++ b/src/lexing/tokens/whitespace.rs @@ -61,4 +61,4 @@ mod tests { let whitespace: WhiteSpace = input.lex().expect("Valid parse"); assert_eq!(source.source_at(whitespace), Some(ws.to_string())) } -} \ No newline at end of file +} diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index 7b23092..acd6c5f 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -101,10 +101,10 @@ impl LexResult { /// /// Allegory of [Result::map]. - /// + /// /// If this is [LexResult::Lexed], the mapper function will be called, /// and then its return type will be re-wrapped. - /// + /// pub fn map T>(self, mapper: F) -> LexResult { match self { LexResult::Lexed(lexed) => LexResult::Lexed(mapper(lexed)), @@ -115,17 +115,17 @@ impl LexResult { /// /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. - /// + /// /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] /// with the message "expected a {$TOKEN} token". 
- /// + /// pub fn expected(self, input: SourceStream) -> Self { match self { s @ LexResult::Lexed(_) => s, s @ LexResult::Errant(_) => s, LexResult::Nothing => LexResult::Errant(LexError { span: input.span(), - message: format!("Expected a {} token here.", type_name::()) + message: format!("Expected a {} token here.", type_name::()), }), } } @@ -133,10 +133,10 @@ impl LexResult { /// /// If this is [LexResult::Nothing], execute the `or` function instead, /// and return its result. - /// + /// /// This allows for chaining of results, which may be useful /// in lexing enums with different variants. - /// + /// pub fn or Self>(self, or: F) -> Self { match self { s @ LexResult::Lexed(_) => s, @@ -148,7 +148,7 @@ impl LexResult { /// /// Turn this into a normal Rust [Result], /// [panic]-ing if this is a [LexResult::Nothing]. - /// + /// pub fn into_result(self) -> Result { match self { LexResult::Lexed(lexed) => Ok(lexed), diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index acb66f7..f77a420 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -120,7 +120,7 @@ impl<'a, S: Source> SourceStream<'a, S> { Some(((start..self.index).to_span(self.source), chars)) } - + /// /// Take characters in this [SourceStream] until /// the precdicate return true. From 7836bbf7a7467fe2fd6859744e67604e19e732d1 Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Sun, 28 Jan 2024 23:58:45 +0000 Subject: [PATCH 35/39] Add escape code support. 
--- macros/src/spanned.rs | 2 +- src/lexing/tokens/comment.rs | 4 +- src/lexing/tokens/escapes.rs | 365 +++++++++++++++++++++++++++ src/lexing/tokens/line_terminator.rs | 22 +- src/lexing/tokens/mod.rs | 2 + src/lexing/tokens/number.rs | 138 ++++++++++ src/lexing/tokens/punctuator.rs | 8 +- src/lexing/utils/lex_impls.rs | 1 - src/lexing/utils/result.rs | 2 +- src/lexing/utils/stream.rs | 7 + 10 files changed, 531 insertions(+), 20 deletions(-) create mode 100644 src/lexing/tokens/escapes.rs create mode 100644 src/lexing/tokens/number.rs diff --git a/macros/src/spanned.rs b/macros/src/spanned.rs index 4a518e4..965d3a0 100644 --- a/macros/src/spanned.rs +++ b/macros/src/spanned.rs @@ -288,7 +288,7 @@ pub fn spanned_for_enum(en: &syn::ItemEnum) -> Option { Diagnostic::spanned( en.span().unwrap(), Level::Error, - "Cannot derive spanned for enum no variants.", + "Cannot derive spanned for enum with no variants.", ) .emit(); diff --git a/src/lexing/tokens/comment.rs b/src/lexing/tokens/comment.rs index 95ebac1..505c01e 100644 --- a/src/lexing/tokens/comment.rs +++ b/src/lexing/tokens/comment.rs @@ -71,11 +71,11 @@ impl LexT for Comment { } fn lex(input: &mut SourceStream) -> Result { - // .into_result() ok since Self::peek() -> exists either variant. + // .unwrap_as_result() ok since Self::peek() -> exists either variant. Lex::lex(input) .map(Self::Single) .or(|| Lex::lex(input).map(Self::Multi)) - .into_result() + .unwrap_as_result() } } diff --git a/src/lexing/tokens/escapes.rs b/src/lexing/tokens/escapes.rs new file mode 100644 index 0000000..feb3147 --- /dev/null +++ b/src/lexing/tokens/escapes.rs @@ -0,0 +1,365 @@ +//! +//! Escape +//! +//! Technically not tokens. +//! These are used between strings and identifiers. +//! 
+ +use avjason_macros::{verbatim as v, Spanned}; + +use crate::{ + common::{Source, Span}, + lexing::{Exactly, Lex, LexError, LexT, SourceStream}, +}; + +use super::{line_terminator::is_line_terminator, number::HexDigit}; +#[derive(Debug, Spanned)] +pub enum EscapeSequence { + CharacterEscapeSequence(CharacterEscapeSequence), + Null(Null), + HexEscapeSequence(HexEscapeSequence), + UnicodeEscapeSequence(UnicodeEscapeSequence), +} + +#[derive(Debug, Spanned)] +pub enum CharacterEscapeSequence { + Single(SingleEscapeChar), + NonEscape(NonEscapeChar), +} + +#[derive(Debug, Spanned)] +pub struct SingleEscapeChar { + span: Span, + raw: char, +} + +#[derive(Debug, Spanned)] +pub struct NonEscapeChar { + span: Span, + raw: char, +} + +#[derive(Debug, Spanned)] +pub struct Null { + span: Span, +} + +#[derive(Debug, Spanned)] +pub struct HexEscapeSequence(v!('x'), Exactly<2, HexDigit>); + +#[derive(Debug, Spanned)] +pub struct UnicodeEscapeSequence(v!('u'), Exactly<4, HexDigit>); + +// --- + +impl LexT for EscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since one of these variants is upcoming. + input + .lex() + .map(Self::CharacterEscapeSequence) + .or(|| input.lex().map(Self::Null)) + .or(|| input.lex().map(Self::HexEscapeSequence)) + .or(|| input.lex().map(Self::UnicodeEscapeSequence)) + .unwrap_as_result() + } +} + +impl LexT for CharacterEscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> there is one variant ahead. 
+ Lex::lex(input) + .map(Self::Single) + .or(|| Lex::lex(input).map(Self::NonEscape)) + .unwrap_as_result() + } +} + +fn is_single_escape_char(ch: &char) -> bool { + matches!(ch, '\'' | '"' | '\\' | 'b' | 'f' | 'n' | 'r' | 't' | 'v') +} + +impl LexT for SingleEscapeChar { + fn peek(input: &SourceStream) -> bool { + input.upcoming(is_single_escape_char) + } + + fn lex(input: &mut SourceStream) -> Result { + // Unwrap ok since Self::peek() -> a character exists. + let (loc, raw) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +fn is_escape_char(ch: &char) -> bool { + is_single_escape_char(ch) || matches!(ch, '0'..='9' | 'x' | 'u') +} + +impl LexT for NonEscapeChar { + fn peek(input: &SourceStream) -> bool { + input.upcoming(|ch: &char| !(is_line_terminator(ch) || is_escape_char(ch))) + } + + fn lex(input: &mut SourceStream) -> Result { + // Unwrap ok since Self::peek() -> a character exists. + let (loc, raw) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +impl LexT for Null { + fn peek(input: &SourceStream) -> bool { + input.upcoming("0") && !matches!(input.peek_n(1), Some('0'..='9')) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. 
+ let (loc, _) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + }) + } +} + +impl LexT for HexEscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + Ok(Self(LexT::lex(input)?, LexT::lex(input)?)) + } +} + +impl LexT for UnicodeEscapeSequence { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + Ok(Self(LexT::lex(input)?, LexT::lex(input)?)) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::{ + tokens::escapes::{CharacterEscapeSequence, EscapeSequence, NonEscapeChar}, + Exactly, Lex, Verbatim, + }, + }; + + use super::{HexEscapeSequence, Null, SingleEscapeChar, UnicodeEscapeSequence}; + + #[test] + fn single_escape() { + let source = SourceFile::dummy_file("'\"\\bfnrtv"); + let input = &mut source.stream(); + let esc: Exactly<9, SingleEscapeChar> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + SingleEscapeChar { raw: '\'', .. }, + SingleEscapeChar { raw: '"', .. }, + SingleEscapeChar { raw: '\\', .. }, + SingleEscapeChar { raw: 'b', .. }, + SingleEscapeChar { raw: 'f', .. }, + SingleEscapeChar { raw: 'n', .. }, + SingleEscapeChar { raw: 'r', .. }, + SingleEscapeChar { raw: 't', .. }, + SingleEscapeChar { raw: 'v', .. }, + ] + )) + } + + #[test] + fn non_escape_char() { + let source = SourceFile::dummy_file("a!£%*&-=💩"); + let input = &mut source.stream(); + let esc: Exactly<9, NonEscapeChar> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + NonEscapeChar { raw: 'a', .. }, + NonEscapeChar { raw: '!', .. }, + NonEscapeChar { raw: '£', .. }, + NonEscapeChar { raw: '%', .. }, + NonEscapeChar { raw: '*', .. }, + NonEscapeChar { raw: '&', .. }, + NonEscapeChar { raw: '-', .. }, + NonEscapeChar { raw: '=', .. }, + NonEscapeChar { raw: '💩', .. 
}, + ] + )) + } + + #[test] + fn character_escape_sequence() { + let source = SourceFile::dummy_file("'\"\\bfnrtva!£%*&-=💩"); + let input = &mut source.stream(); + let esc: Exactly<18, CharacterEscapeSequence> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + CharacterEscapeSequence::Single(SingleEscapeChar { raw: '\'', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: '"', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: '\\', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'b', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'f', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'n', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'r', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 't', .. }), + CharacterEscapeSequence::Single(SingleEscapeChar { raw: 'v', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: 'a', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '!', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '£', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '%', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '*', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '&', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '-', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '=', .. }), + CharacterEscapeSequence::NonEscape(NonEscapeChar { raw: '💩', .. 
}), + ] + )) + } + + #[test] + fn null_char() { + { + let source = SourceFile::dummy_file("0"); + let input = &mut source.stream(); + let _: Null = input.lex().expect("Valid parse"); + } + + { + let source = SourceFile::dummy_file("01"); + let input = &mut source.stream(); + let esc = Null::lex(input); + assert!(esc.is_nothing()) + } + } + + #[test] + fn hex_escape() { + let source = SourceFile::dummy_file("x20x26x25x3c"); + let input = &mut source.stream(); + let _: Exactly<4, HexEscapeSequence> = input.lex().expect("Valid parse"); + } + + #[test] + fn unicode_escape() { + let source = SourceFile::dummy_file("u0000u2AFCu6798u1623"); + let input = &mut source.stream(); + let _: Exactly<4, UnicodeEscapeSequence> = input.lex().expect("Valid parse"); + } + + #[test] + fn mixed() { + let source = + SourceFile::dummy_file("'\"\\bfnrtva!£%*&-=💩0x20x26x25x3cu0000u2AFCu6798u1623"); + let input = &mut source.stream(); + let esc: Exactly<27, EscapeSequence> = input.lex().expect("Valid parse"); + assert!(matches!( + &*esc, + &[ + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: '\'', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: '"', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: '\\', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'b', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'f', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'n', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'r', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 't', .. 
} + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::Single( + SingleEscapeChar { raw: 'v', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: 'a', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '!', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '£', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '%', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '*', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '&', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '-', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '=', .. } + )), + EscapeSequence::CharacterEscapeSequence(CharacterEscapeSequence::NonEscape( + NonEscapeChar { raw: '💩', .. } + )), + EscapeSequence::Null(Null { .. }), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::HexEscapeSequence(HexEscapeSequence(Verbatim::<"x"> { .. }, _)), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. }, + _ + )), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. }, + _ + )), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. }, + _ + )), + EscapeSequence::UnicodeEscapeSequence(UnicodeEscapeSequence( + Verbatim::<"u"> { .. 
}, + _ + )), + ] + )) + } +} diff --git a/src/lexing/tokens/line_terminator.rs b/src/lexing/tokens/line_terminator.rs index 5412f5d..e2d1ddf 100644 --- a/src/lexing/tokens/line_terminator.rs +++ b/src/lexing/tokens/line_terminator.rs @@ -33,42 +33,40 @@ pub enum LineTerminatorSequence { PS(v!('\u{2029}')), } +pub fn is_line_terminator(ch: &char) -> bool { + matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}') +} + impl LexT for LineTerminator { fn peek(input: &SourceStream) -> bool { - ::peek(input) - || ::peek(input) - || ::peek(input) - || ::peek(input) + input.upcoming(is_line_terminator) } fn lex(input: &mut SourceStream) -> Result { - // .into_result() ok since we know there's at least one upcoming variant. + // .unwrap_as_result() ok since we know there's at least one upcoming variant. Lex::lex(input) .map(Self::LF) .or(|| Lex::lex(input).map(Self::CR)) .or(|| Lex::lex(input).map(Self::LS)) .or(|| Lex::lex(input).map(Self::PS)) - .into_result() + .unwrap_as_result() } } impl LexT for LineTerminatorSequence { fn peek(input: &SourceStream) -> bool { - ::peek(input) - || ::peek(input) - || ::peek(input) - || ::peek(input) + input.upcoming(is_line_terminator) } fn lex(input: &mut SourceStream) -> Result { - // .into_result() ok since we know there's at least one upcoming variant. + // .unwrap_as_result() ok since we know there's at least one upcoming variant. 
Lex::lex(input) .map(Self::CRLF) .or(|| Lex::lex(input).map(Self::LF)) .or(|| Lex::lex(input).map(Self::CR)) .or(|| Lex::lex(input).map(Self::LS)) .or(|| Lex::lex(input).map(Self::PS)) - .into_result() + .unwrap_as_result() } } diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index 792df18..86aaf5b 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -6,3 +6,5 @@ pub mod comment; pub mod line_terminator; pub mod punctuator; pub mod whitespace; +pub mod number; +pub mod escapes; diff --git a/src/lexing/tokens/number.rs b/src/lexing/tokens/number.rs new file mode 100644 index 0000000..2001cfa --- /dev/null +++ b/src/lexing/tokens/number.rs @@ -0,0 +1,138 @@ +//! +//! ## Number +//! +//! Number tokens like integers, hex integers, and decimals, +//! + +use std::ops::Add; + +use avjason_macros::{verbatim as v, ECMARef, Spanned}; + +use crate::{ + common::{Source, Span}, + lexing::{AtLeast, Exactly, LexError, LexT, SourceStream}, +}; + +/// +/// The numerical value of a literal. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.3). +/// +pub trait MathematicalValue { + type Value: Copy + Add; + const BASE: usize; + + fn mv(&self) -> Self::Value; +} + +#[ECMARef("DecimalDigit", "https://262.ecma-international.org/5.1/#sec-7.8.3")] +pub type DecimalDigit = v!('0'..='9'); + +#[ECMARef("HexDigit", "https://262.ecma-international.org/5.1/#sec-7.8.3")] +#[derive(Debug, Spanned)] +pub struct HexDigit { + span: Span, + raw: char, +} + +// --- + +impl LexT for HexDigit { + fn peek(input: &SourceStream) -> bool { + input.upcoming(char::is_ascii_hexdigit) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> character exists. 
+ let (loc, raw) = input.take().unwrap(); + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +// --- + +impl MathematicalValue for DecimalDigit { + type Value = u8; + const BASE: usize = 10; + + fn mv(&self) -> Self::Value { + match self.raw() { + '0' => 0, + '1' => 1, + '2' => 2, + '3' => 3, + '4' => 4, + '5' => 5, + '6' => 6, + '7' => 7, + '8' => 8, + '9' => 9, + _ => unreachable!(), + } + } +} + +impl MathematicalValue for HexDigit { + type Value = u8; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + match self.raw { + '0' => 0x0, + '1' => 0x1, + '2' => 0x2, + '3' => 0x3, + '4' => 0x4, + '5' => 0x5, + '6' => 0x6, + '7' => 0x7, + '8' => 0x8, + '9' => 0x9, + 'A' => 0xA, + 'B' => 0xB, + 'C' => 0xC, + 'D' => 0xD, + 'E' => 0xE, + 'F' => 0xF, + _ => unreachable!(), + } + } +} + +impl MathematicalValue for Exactly<2, HexDigit> { + type Value = u8; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + self[0].mv() * Self::BASE as u8 + self[1].mv() + } +} + +impl MathematicalValue for Exactly<4, HexDigit> { + type Value = u16; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + (self[0].mv() as u16) * (Self::BASE.pow(3) as u16) + + (self[1].mv() as u16) * (Self::BASE.pow(2) as u16) + + (self[2].mv() as u16) * (Self::BASE.pow(1) as u16) + + self[3].mv() as u16 + } +} + +impl MathematicalValue for AtLeast { + type Value = u64; + const BASE: usize = 16; + + fn mv(&self) -> Self::Value { + self.iter() + .map(MathematicalValue::mv) + .map(|mv| mv as u64) + .enumerate() + .map(|(i, v)| v * (Self::BASE.pow(i as u32) as u64)) + .sum() + } +} diff --git a/src/lexing/tokens/punctuator.rs b/src/lexing/tokens/punctuator.rs index 33c0cb9..8cfc5dd 100644 --- a/src/lexing/tokens/punctuator.rs +++ b/src/lexing/tokens/punctuator.rs @@ -1,5 +1,7 @@ //! -//! Punctuators. +//! ## Punctuators +//! +//! Pieces of punctuation: `{}[]:,`. //! 
use avjason_macros::{verbatim as v, SpecRef}; @@ -63,7 +65,7 @@ impl LexT for Punctuator { } fn lex(input: &mut SourceStream) -> Result { - // .into_result() ok since Self::peek() -> one variant present. + // .unwrap_as_result() ok since Self::peek() -> one variant present. input .lex() .map(Self::OpenBrace) @@ -72,7 +74,7 @@ impl LexT for Punctuator { .or(|| input.lex().map(Self::CloseBracket)) .or(|| input.lex().map(Self::Colon)) .or(|| input.lex().map(Self::Comma)) - .into_result() + .unwrap_as_result() } } diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index 8706c91..3819e6a 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -22,7 +22,6 @@ impl LexT for Many { let mut v = vec![]; while L::peek(input) { - println!("{:?}", input.left()); v.push(L::lex(input)?); } diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index acd6c5f..0237b54 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -149,7 +149,7 @@ impl LexResult { /// Turn this into a normal Rust [Result], /// [panic]-ing if this is a [LexResult::Nothing]. /// - pub fn into_result(self) -> Result { + pub fn unwrap_as_result(self) -> Result { match self { LexResult::Lexed(lexed) => Ok(lexed), LexResult::Errant(errant) => Err(errant), diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index f77a420..5b1bf0b 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -164,6 +164,13 @@ impl<'a, S: Source> SourceStream<'a, S> { pub fn peek(&self) -> Option<&char> { self.source.characters().get(self.index) } + + /// + /// Peeks at the (0-based) n-th next upcoming character. 
+    ///
+    pub fn peek_n(&self, n: usize) -> Option<&char> {
+        self.source.characters().get(self.index + n)
+    }

    pub fn left(&self) -> Option {
        self.source

From 781ea61ae4719b1a7699c3203636d3b6998a1b8f Mon Sep 17 00:00:00 2001
From: Sammy99jsp
Date: Mon, 29 Jan 2024 02:52:06 +0000
Subject: [PATCH 36/39] Add String literal support, fix bug with empty Many

---
 src/lexing/tokens/escapes.rs  |  84 ++++++++-
 src/lexing/tokens/mod.rs      |   1 +
 src/lexing/tokens/number.rs   |   6 +
 src/lexing/tokens/string.rs   | 335 ++++++++++++++++++++++++++++++++++
 src/lexing/utils/lex_impls.rs |  61 ++++---
 src/lexing/utils/peek.rs      |   7 +
 src/lexing/utils/result.rs    |  84 ++++++++-
 src/lexing/utils/verbatim.rs  |   2 +-
 src/lib.rs                    |   2 +-
 9 files changed, 552 insertions(+), 30 deletions(-)
 create mode 100644 src/lexing/tokens/string.rs

diff --git a/src/lexing/tokens/escapes.rs b/src/lexing/tokens/escapes.rs
index feb3147..5563bdd 100644
--- a/src/lexing/tokens/escapes.rs
+++ b/src/lexing/tokens/escapes.rs
@@ -12,7 +12,11 @@ use crate::{
     lexing::{Exactly, Lex, LexError, LexT, SourceStream},
 };

-use super::{line_terminator::is_line_terminator, number::HexDigit};
+use super::{
+    line_terminator::is_line_terminator,
+    number::{HexDigit, MathematicalValue},
+    string::CharacterValue,
+};

 #[derive(Debug, Spanned)]
 pub enum EscapeSequence {
     CharacterEscapeSequence(CharacterEscapeSequence),
@@ -147,7 +151,7 @@ impl LexT for HexEscapeSequence {
     }

     fn lex(input: &mut SourceStream) -> Result {
-        Ok(Self(LexT::lex(input)?, LexT::lex(input)?))
+        Ok(Self(LexT::lex(input)?, Lex::lex(input).unwrap_as_result()?))
     }
 }

@@ -157,7 +161,81 @@ impl LexT for UnicodeEscapeSequence {
     }

     fn lex(input: &mut SourceStream) -> Result {
-        Ok(Self(LexT::lex(input)?, LexT::lex(input)?))
+        Ok(Self(LexT::lex(input)?, Lex::lex(input).unwrap_as_result()?))
+    }
+}
+
+// ---
+
+impl CharacterValue for EscapeSequence {
+    fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] {
+        match self {
+            EscapeSequence::CharacterEscapeSequence(esc) =>
esc.cv(buf), + EscapeSequence::Null(null) => null.cv(buf), + EscapeSequence::HexEscapeSequence(hex) => hex.cv(buf), + EscapeSequence::UnicodeEscapeSequence(unicode) => unicode.cv(buf), + } + } +} + +impl CharacterValue for CharacterEscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + CharacterEscapeSequence::Single(single) => single.cv(buf), + CharacterEscapeSequence::NonEscape(non_escape) => non_escape.cv(buf), + } + } +} + +impl CharacterValue for Null { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + '\u{0000}'.encode_utf16(buf) + } +} + +impl CharacterValue for SingleEscapeChar { + /// + /// Compliant with [Table 4, Section 7.4](https://262.ecma-international.org/5.1/#sec-7.8.4) + /// of the ECMAScript spec. + /// + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self.raw { + '\'' => '\u{0027}', // single quote + '"' => '\u{0022}', // double quote + '\\' => '\u{005C}', // backslash + 'b' => '\u{0008}', // backspace + 'f' => '\u{000C}', // form feed + 'n' => '\u{000A}', // line feed (new line) + 'r' => '\u{000D}', // carriage return + 't' => '\u{0009}', // horizontal tab + 'v' => '\u{000B}', // vertical tab + _ => unreachable!(), + } + .encode_utf16(buf) + } +} + +impl CharacterValue for NonEscapeChar { + /// + /// > The CV of NonEscapeCharacter :: SourceCharacter but not one of EscapeCharacter or + /// > LineTerminator is the SourceCharacter character itself. 
+ /// + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +impl CharacterValue for HexEscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + buf[0] = self.1.mv() as u16; + &buf[0..1] + } +} + +impl CharacterValue for UnicodeEscapeSequence { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + buf[0] = self.1.mv(); + &buf[0..1] } } diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index 86aaf5b..67219a5 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -8,3 +8,4 @@ pub mod punctuator; pub mod whitespace; pub mod number; pub mod escapes; +pub mod string; diff --git a/src/lexing/tokens/number.rs b/src/lexing/tokens/number.rs index 2001cfa..741cc0f 100644 --- a/src/lexing/tokens/number.rs +++ b/src/lexing/tokens/number.rs @@ -97,6 +97,12 @@ impl MathematicalValue for HexDigit { 'D' => 0xD, 'E' => 0xE, 'F' => 0xF, + 'a' => 0xA, + 'b' => 0xB, + 'c' => 0xC, + 'd' => 0xD, + 'e' => 0xE, + 'f' => 0xF, _ => unreachable!(), } } diff --git a/src/lexing/tokens/string.rs b/src/lexing/tokens/string.rs new file mode 100644 index 0000000..880df20 --- /dev/null +++ b/src/lexing/tokens/string.rs @@ -0,0 +1,335 @@ +//! +//! String Literals. +//! + +use avjason_macros::{verbatim as v, Spanned, SpecRef}; + +use crate::{ + common::{Source, Span}, + lexing::{LexError, LexResult, LexT, Many, SourceStream}, +}; + +use super::{ + escapes::EscapeSequence, + line_terminator::{is_line_terminator, LineTerminatorSequence}, +}; + +/// +/// String literals. +/// +#[SpecRef("JSON5String")] +#[derive(Debug, Spanned)] +pub enum LString { + Double(v!('"'), Many>, v!('"')), + Single(v!('\''), Many>, v!('\'')), +} + +/// +/// All possible parts of a string literal. 
+/// +#[derive(Debug, Spanned)] +pub enum StringPart { + Char(StringChar), + Escape(v!('\\'), EscapeSequence), + LineContinuation(v!('\\'), LineTerminatorSequence), + LS(v!('\u{2028}')), + PS(v!('\u{2029}')), +} + +/// +/// A non-escaped string character. +/// +/// This represents itself. +/// +#[derive(Debug, Spanned)] +pub struct StringChar { + span: Span, + raw: char, +} + +// --- + +impl LexT for LString { + fn peek(input: &SourceStream) -> bool { + ::peek(input) || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + input + .lex() + .and_then(|opening| { + let contents = input.lex()?; + let closing = input.lex().expected_msg(input, "Expected closing `\"`")?; + LexResult::Lexed(Self::Double(opening, contents, closing)) + }) + .or(|| { + input.lex().and_then(|opening| { + let contents = input.lex()?; + let closing = input.lex().expected_msg(input, "Expected closing `'`")?; + LexResult::Lexed(Self::Single(opening, contents, closing)) + }) + }) + .unwrap_as_result() + } +} + +impl LexT for StringPart { + fn peek(input: &SourceStream) -> bool { + as LexT>::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // Some real nastiness going on here: + // essentially, complex functional-like control flow + // for the \ character to check if either . 
+ + // .unwrap_as_result() ok since Self::peek() + input + .lex() + .map(Self::LS) + .or(|| input.lex().map(Self::PS)) + .or(|| input.lex().map(Self::Char)) + .or(|| { + input.lex().and_then(|backslash: v!('\\')| { + input + .lex() + .map(|esc| Self::Escape(backslash.clone(), esc)) + .or(|| { + LexResult::Lexed(Self::LineContinuation( + backslash, + input.lex().expected_msg( + input, + "Expected either an escape code here, or newline; got neither.", + )?, + )) + }) + }) + }) + .unwrap_as_result() + } +} + +impl LexT for StringChar { + fn peek(input: &SourceStream) -> bool { + !(input.upcoming(D) || input.upcoming(is_line_terminator) || input.upcoming("\\")) + && input.peek().is_some() + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. + let (loc, raw) = input.take().unwrap(); + + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +// --- + +/// +/// The character value of a part of a string literal, which +/// dictates which character that part represents. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.4). +/// +pub trait CharacterValue { + /// + /// Encodes the utf-16 based character value into a + /// buffer, returning a slice of the bytes used. + /// + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16]; +} + +/// +/// The value a string literal represents. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.4). +/// +pub trait StringValue { + /// + /// Because this is ECMAScript, strings are utf-16 encoded + /// — this will be preserved at this stage. + /// + fn sv(&self) -> Vec; + + /// + /// Workaround for testing only. 
+ /// + #[cfg(test)] + fn to_rust_string_lossy(&self) -> String { + let utf16 = self.sv(); + String::from_utf16_lossy(&utf16) + } +} + +// --- + +impl CharacterValue for StringPart { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + StringPart::Char(ch) => ch.cv(buf), + StringPart::Escape(_, esc) => esc.cv(buf), + StringPart::LineContinuation(_, _) => &buf[0..0], // Skip. + StringPart::LS(_) => '\u{2028}'.encode_utf16(buf), + StringPart::PS(_) => '\u{2029}'.encode_utf16(buf), + } + } +} + +impl CharacterValue for StringChar { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +// --- + +impl StringValue for LString { + fn sv(&self) -> Vec { + match self { + LString::Double(_, contents, _) => contents.sv(), + LString::Single(_, contents, _) => contents.sv(), + } + } +} + +impl StringValue for Many> { + fn sv(&self) -> Vec { + // Complete guesswork about the initial capacity: + // I'm assuming that we're not going to get too many multi-u16 chars. + let mut string = Vec::with_capacity(self.len() * 5 / 4); + + let buf = &mut [0; 2]; + for part in self { + string.extend(part.cv(buf)) + } + + string + } +} +// --- + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::{tokens::string::StringValue, LexResult}, + }; + + use super::LString; + + fn test_string(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + #[test] + fn normal_use_case() { + assert_eq!( + test_string(r"'AvdanOS is a community-led open-source project that attempts to implement Avdan\'s \'AvdanOS\' concept as a Wayland compositor.'") + .unwrap().to_rust_string_lossy(), + "AvdanOS is a community-led open-source project that attempts to implement Avdan\'s \'AvdanOS\' concept as a Wayland compositor." 
+ ); + } + + #[test] + fn empty_string() { + assert_eq!(test_string("''").unwrap().to_rust_string_lossy(), ""); + assert_eq!(test_string("\"\"").unwrap().to_rust_string_lossy(), ""); + } + + #[test] + fn escapes() { + let lit = test_string( + r"'\'\\\b\f\n\r\t\v\a\!\£\%\*\&\-\=\💩\0\x20\x26\x25\x3c\u0000\u2AFC\u6798\u1623'", + ) + .expect("Valid parse"); + + assert_eq!( + lit.sv(), + // Answer from JavaScript (Chrome's V8). + vec![ + 39, 92, 8, 12, 10, 13, 9, 11, 97, 33, 163, 37, 42, 38, 45, 61, 55357, 56489, 0, 32, + 38, 37, 60, 0, 11004, 26520, 5667 + ] + ) + } + + #[test] + fn unbalanced_quotes() { + test_string(r"'Think this is unbalanced -- have you seen capitalism?").unwrap_err(); + test_string(r"'They don\'t let dogs in prison, Grommit! They\'ll put you down!\'") + .unwrap_err(); + test_string("\"Nothing is more appealing right now than a cup of coffee").unwrap_err(); + test_string("\"Have you heard about the album 'Nervermind'?\\\"").unwrap_err(); + } + + #[test] + fn invalid_escapes() { + test_string(r"'\x2'").unwrap_err(); + test_string(r"'\xSS'").unwrap_err(); + test_string(r"'\uSFAA'").unwrap_err(); + test_string(r"'\u2AA'").unwrap_err(); + + // It turns out that this form of escape is, in fact, octal. + // This is not mentioned in the official ECMAScript spec, + // But is in the optional extenstions: Section B.1.2(https://262.ecma-international.org/5.1/#sec-B.1.2). + + // For example, Node (V8) supports this, but Bun (JavaScriptCore) does not. + // As it is not mentioned whether to comply with optional extensions, + // this crate will not be implementing octal syntax. + test_string(r"'\1'").unwrap_err(); + } + + /// + /// Random series of u16's interpreted as + /// string literals, with the utf-16 value + /// compared to V8's answer. 
+ /// + #[test] + #[allow(text_direction_codepoint_in_literal)] + fn fuzzing() { + assert_eq!( + test_string(r"'䂞ᤴ쭜ؚ洲綏뤒힓蔫黮뱏꽻ꜵ킩악\x19젏◣愜ꏟ醟㾊䑥뷜筵읩ꡓむ髇阏⍉딴퓼됪璮轫ʢ톽觻䀫ꮳ玐耠綈亄宅坍♳ꯑ\uDBCD㇀甚渭￐㛓魴矮︄跕鹞㉋᪽ꎓ鸩먾汕䱏쏀㘓씩㕟챬ᆀ瓅㫱భd瀒峊ツꮫ뀥靺㉏ꙓⷳᨾ짽ꑙΥפ肜혃ᐜ恴婁⛫╴䰛⾁\x9A䚠댂䜡ૢ¦ꊠ⧽랸儔根햩쫹輤Ȫ䜭ﺆᬒ偠⊽Ṑ敇봅¨팔檵\uDBB9Գ౓ถ啼摚㿓껠͛躏湜㵬褤쵐㽴䒦迼\uD933ᛳ뵁楻뤣璻㰒\uDB11疲ᆐ腻抐즲ଉ灮鷋䝡밶ꛃ\uDF4Br⯝ଆ㷍工좭澏挣\uDC83◘语开劊椢䀐럵갿懼嗵⊫ꑬ縭郁얱仁༅ⷬ垉₍荌ﵙ䭿⦤牐詌撸উ崙\uDE8E荓畨ꯔᇤ垯蠐⏧쨁▏賈⇜欁ꓕ⍎讷∥㫲画鴶醎迚崴쭹짲교뎈噍⽚\uDFB8냅㥤射'") + .expect("Valid parse").sv(), + vec![ + 16542, 6452, 52060, 1562, 27954, 32143, 47378, 55187, 34091, 40686, 48207, 44923, 42805, 53417, 50501, 25, 51215, 9699, 62034, 24860, 41951, 37279, 16266, 57468, 17509, 57866, 48604, 31605, 51049, 43091, 12416, 39623, 38415, 9033, 58372, 46388, 54524, 46122, 29870, 36715, 674, 53693, 35323, 16427, 43955, 29584, 32800, 32136, 20100, 64004, 60979, 22349, 9843, 58280, 43985, 56269, 12736, 29978, 28205, 65488, 14035, 39796, 60797, 30702, 65028, 36309, 40542, 12875, 6845, 41875, 40489, 61197, 47678, 27733, 19535, 50112, 13843, 50473, 13663, 52332, 4480, 29893, 15089, 3117, 100, 28690, 23754, 65410, 43947, 45093, 38778, 12879, 42579, 61708, 11763, 6718, 51709, 42073, 933, 1508, 32924, 54787, 5148, 24692, 23105, 9963, 9588, 19483, 12161, 154, 18080, 45826, 18209, 2786, 65508, 41632, 10749, 47032, 20756, 26681, 54697, 51961, 57412, 36644, 554, 18221, 65158, 6930, 20576, 62494, 8893, 7760, 25927, 48389, 168, 58805, 54036, 27317, 56249, 1331, 3155, 3606, 21884, 25690, 16339, 44768, 859, 36495, 28252, 15724, 35108, 52560, 16244, 61134, 17574, 36860, 55603, 5875, 48449, 27003, 47395, 29883, 15378, 56081, 61909, 30130, 4496, 33147, 61001, 25232, 51634, 2825, 28782, 58861, 40395, 18273, 48182, 42691, 57163, 114, 11229, 2822, 15821, 24037, 60822, 51373, 28559, 25379, 62890, 56451, 9688, 35821, 59961, 24320, 21130, 26914, 16400, 47093, 44095, 25084, 22005, 8875, 42092, 32301, 37057, 50609, 20161, 3845, 11756, 22409, 8333, 33612, 64857, 19327, 10660, 29264, 35404, 25784, 2441, 23833, 58025, 58894, 56974, 33619, 61599, 30056, 43988, 
4580, 22447, 34832, 9191, 51713, 9615, 36040, 8668, 27393, 42197, 9038, 35767, 8741, 15090, 64163, 40246, 37262, 36826, 23860, 52089, 51698, 44368, 45960, 22093, 12122, 57272, 45253, 14692, 23556 + ] + ); + + assert_eq!( + test_string(r"'秚놰ꚋ⾏<給齌걿괔鍺江ﬧ䭑钣ᆲ茊琳株໶杴칽\uDCB1渾⭮ⶕ墢啐渍홦䳹艘紕혺镨쾋冻喢喚䣳㙤봽级邒ថ\uD9B8ោ䋀껄䦐椴⎨譴꽲沺᷆롥ᗐ赙쿰⸲᪘꿲鏸帠梯튋궳 ㌦땭ӂ咶鞝卓硄뷬䫾ୢ蘪ク㉃᯲ຮ೚⃊̽詁ꓔ㴺뮢׳Թ尀塠鶈퟾뷊娈鶍х㍣铽렑轨ߵꧮ㒉콳$ꖃ붟섈⟃ᰫ턖\uDAABLꄅ\uDEE2鰔程륡㩜旎ᢛ᜴휫澜䬁쾘྾퍂畐囃꺴ነ泴얽㤢瀊Ⱃྡྷ뙷輇ዉаᅦ㠮㚾졲揿䠭஍磡༛논렺鵠篩㣴셑拨튮ꈌἛ隸눙埊㙺겓셀꠱♌\uDD7E䂼귘檚홗誚͔ꦣ锴ߓ\uDB03匷䏄膟鿕僥粡塕ꎟ宗彲댙䈹⟚ད軵픣㇅燺盰籞睻䋫얨♶኶車\uDBD1젔䖬⬓࣌㺓ྂ꤯⽊᫖ᚋ焹￲甃ꇢ뛉芀ฑ訾蔾\uD96C捈㮙཯㜄'") + .expect("Valid parse").sv(), + vec![31194, 45488, 62680, 42635, 12175, 65308, 32102, 40780, 44159, 44308, 37754, 27743, 64295, 19281, 38051, 65452, 33546, 29747, 26666, 3830, 26484, 52861, 56497, 28222, 11118, 11669, 22690, 21840, 28173, 54886, 19705, 58159, 33368, 32021, 54842, 59331, 62622, 38248, 53131, 20923, 21922, 21914, 18675, 13924, 48445, 61070, 32423, 37010, 6032, 55736, 6084, 17088, 44740, 18832, 26932, 9128, 35700, 61865, 44914, 27834, 7622, 47205, 5584, 36185, 53232, 11826, 6808, 45042, 37880, 24096, 26799, 53899, 44467, 12288, 13094, 46445, 1218, 21686, 38813, 21331, 30788, 59530, 61378, 58739, 48620, 19198, 2914, 34346, 12463, 12867, 7154, 3758, 3290, 8394, 829, 35393, 42196, 15674, 48034, 1523, 1337, 23552, 22624, 40328, 55294, 48586, 23048, 40333, 63659, 1093, 13155, 38141, 47121, 36712, 2037, 43502, 13449, 53107, 36, 42371, 48543, 49416, 10179, 7211, 53526, 55979, 65324, 41221, 57058, 39956, 31243, 47457, 14940, 26062, 6299, 5940, 55083, 28572, 19201, 53144, 4030, 54082, 30032, 22211, 44724, 4752, 27892, 59533, 50621, 14626, 28682, 11283, 4002, 46711, 36615, 4809, 1072, 65479, 14382, 14014, 51314, 25599, 18477, 2957, 30945, 3867, 45436, 47162, 40288, 31721, 14580, 49489, 25320, 53934, 41484, 7963, 38584, 45593, 22474, 13946, 44179, 57572, 49472, 43057, 63050, 9804, 62475, 56702, 16572, 44504, 27290, 57604, 54871, 35482, 852, 43427, 38196, 2003, 56067, 21303, 17348, 33183, 40917, 20709, 31905, 22613, 41887, 23447, 
24434, 45849, 60174, 16953, 10202, 3921, 36597, 60241, 54563, 12741, 29178, 30448, 31838, 30587, 17131, 58558, 50600, 9846, 4790, 63746, 56273, 51220, 17836, 11027, 2252, 16019, 3970, 43311, 12106, 59874, 6870, 5771, 28985, 57731, 65522, 29955, 41442, 57924, 46793, 33408, 3601, 35390, 34110, 55660, 25416, 15257, 3951, 14084 + ] + ); + + assert_eq!( + test_string(r"'ᐇ➢ᷙ榃훳휆ꅦ欥㒎ᩀஒ䧓㼿\uDBBE䍷ख़ꔬ쳩呍ꑼ᧡譶䮿뽕ꙴ뢪촗㲪袹쟓Ὴ棅捈批쟹砛▟즣㎜펒巵ꚓ꜐Ꞝ톘ᅿ㣓䐩籮晤饳堓䋤੡㇪ᾚ厤秲猪絓ꡨ俛붷継㤕᠍䌖\uDEDA砇₈㴹牙뛞ꃤ˕ඟ蚍醚픦먜ἺṴ茫뚯﹤唟풰섙碁젋졂∽赞摖隆걑쩒柀瞛擧获㺟染ሏ᧻사汋迪셨㸹嵂䤬闄䏇䒘㎓뻑ꭊ圹衁끇ᮓ빪耔怮⺇䳐묃䅻׫磼脉ò᷾姰佄鶕붬ጛ갲祔奔㖔⣪℄蝱靦ꆯꜮ궻씍ﹶ쑿鞾轪⠱胼螓멣栟跦沭⊾夏尃먗㲳瀆ퟎ콆攂喉ㄻ嶳鸹䉭뾐铥䤰漘뉦ᅭ䐨钞薑涐⹾쏾䏔蝶フ⯐ຒ藧㒴緽듹⇒㛎明黩㛳氓梛辽\uD850㣞\uDD33鼚暤梅㧊Ҩᩰ圄찅甦信矆誐ոꖚ冐䞭㹳㹆鰑'") + .expect("Valid parse").sv(), + vec![59730, 5127, 10146, 7641, 59098, 27011, 55027, 55046, 41318, 27429, 13454, 6720, 2962, 63467, 18899, 16191, 56254, 17271, 2393, 42284, 52457, 21581, 42108, 6625, 63025, 35702, 59558, 19391, 48981, 42612, 47274, 52503, 15530, 35001, 51155, 8138, 26821, 25416, 25209, 51193, 59246, 30747, 9631, 51619, 13212, 54162, 24053, 42643, 42768, 42908, 53656, 4479, 14547, 17449, 31854, 26212, 39283, 22547, 17124, 2657, 60174, 12778, 8090, 21412, 31218, 29482, 32083, 63500, 43112, 20443, 48567, 32153, 14613, 6157, 17174, 58187, 57050, 30727, 8328, 15673, 29273, 46814, 41188, 725, 3487, 34445, 37274, 54566, 47644, 7994, 7796, 33579, 46767, 65124, 21791, 54448, 49433, 30849, 51211, 51266, 8765, 63507, 36190, 25686, 38534, 44113, 51794, 26560, 30619, 58556, 25831, 33719, 16031, 61982, 26579, 4623, 6651, 57722, 49324, 27723, 36842, 49512, 15929, 23874, 58228, 18732, 38340, 17351, 17560, 13203, 48849, 43850, 61644, 22329, 34881, 45127, 7059, 48746, 32788, 24622, 11911, 19664, 47875, 16763, 1515, 30972, 33033, 242, 7678, 23024, 20292, 40341, 48556, 4891, 44082, 31060, 22868, 13716, 10474, 8452, 34673, 60035, 38758, 41391, 62827, 42798, 44475, 63743, 58786, 50445, 62643, 65142, 50303, 38846, 36714, 10289, 58929, 33020, 34707, 47715, 26655, 36326, 27821, 62815, 8894, 22799, 23555, 47639, 60265, 15539, 28678, 
58432, 55246, 53062, 25858, 60646, 21897, 60958, 12603, 23987, 40505, 17005, 49040, 38117, 18736, 28440, 45670, 4461, 17448, 38046, 34193, 28048, 11902, 50174, 17364, 34678, 12501, 11216, 3730, 34279, 13492, 32253, 46329, 8658, 60048, 14030, 26126, 40681, 14067, 27667, 61088, 26779, 36797, 55376, 14558, 56627, 40730, 60610, 26276, 26757, 14794, 1192, 6768, 22276, 52229, 29990, 20449, 30662, 35472, 1400, 42394, 20880, 18349, 15987, 15942, 39953 + ] + ); + + assert_eq!( + test_string(r"'祐䇛珈䣏둫䠽㩅⏇ᗊꥷ⛙寎杅똦儣桴糎絪㋢雳쑢㡟ⓘ譏笜穘ᎏ난ᡂᣕ䯹嗔楗鏯⼺㌨떟ሎỬ⵹䪋౿⸬ה\uDCAE釉萳阪櫒洈宀駅뻍슘ᴘ錱⎝ᓛ堼䲃㖭鸜鸍\uDABC掰カ픸⑘佫䔻樟嗌軓\x83喋瀛䙳峦튬酥㫑䔶␱씿芩鵙䗲衜賈\uDF4B㋋颡쩾敯侥㰟ᱍሇ笿뭦ۑ؄ࡾ갆쌨嬓ꑌ⼮犥䏧擌臤ꋪ꡿↊됰㏞讐റᲱ篤ⴓꚹ菙䪦엇⡗袦嵻嶬捡쭇䙑婫⏱韲흛⌠ꊜ볲緙덕㣔鍸暐䄧뭝鳴ᙇ莯覧⑩쿿벹⦠紈ۃ戎쥔븗ꍏ桝\uDCD8໗坊圻賈꧙볰ꁣ칏ᄩ\uDC7F삃爞겕虵䡏έ홰⸱焸왵⒗㚪좵Ϗ훱熞䗳၇ェ죳ਙ\uDE73\uD8E8汃ᚠ鏮恹⺐죾ﮒ툔퐬ઇﵔ촶⊄෈᬴늮\uDE3F䣓攙蘿儠ූആ䞔ꄶ亏㘝迬'") + .expect("Valid parse").sv(), + vec![31056, 57578, 16859, 29640, 18639, 58428, 46187, 18493, 14917, 9159, 5578, 43383, 9945, 23502, 26437, 46630, 20771, 26740, 31950, 32106, 59267, 13026, 38643, 59963, 58248, 50274, 14431, 9432, 35663, 31516, 31320, 5007, 45212, 6210, 6357, 63623, 19449, 60661, 21972, 26967, 37871, 12090, 58152, 13096, 46495, 62016, 4622, 7916, 11641, 19083, 3199, 11820, 1492, 56494, 37321, 33843, 38442, 27346, 27912, 23424, 39365, 48845, 49816, 7448, 37681, 9117, 5339, 22588, 19587, 58498, 13741, 40476, 40461, 55996, 25520, 62609, 65398, 54584, 9304, 20331, 57805, 17723, 27167, 61366, 57712, 57423, 21964, 36563, 131, 21899, 28699, 18035, 23782, 53932, 37221, 61779, 15057, 17718, 9265, 50495, 33449, 40281, 59399, 17906, 59923, 34908, 36040, 61662, 57163, 13003, 39073, 51838, 25967, 20389, 15391, 7245, 60883, 4615, 31551, 47974, 1745, 1540, 2174, 44038, 49960, 58601, 23315, 42060, 12078, 29349, 17383, 25804, 33252, 41706, 43135, 8586, 46128, 13278, 35728, 3377, 57603, 7345, 31716, 11539, 42681, 33753, 19110, 50631, 10327, 34982, 23931, 60456, 23980, 25441, 52039, 63224, 18001, 23147, 9201, 38898, 55131, 8992, 58355, 41628, 48370, 32217, 45909, 14548, 37752, 26256, 16679, 
57404, 47965, 40180, 5703, 33711, 35239, 9321, 53247, 48313, 10656, 32008, 1731, 25102, 51540, 48663, 41807, 26717, 56536, 3799, 22346, 22331, 63747, 43481, 48368, 41059, 52815, 58274, 4393, 56447, 49283, 29214, 44181, 34421, 18511, 8051, 62021, 54896, 11825, 28984, 50805, 9367, 13994, 51381, 975, 55025, 57379, 29086, 17907, 4167, 12455, 51443, 2585, 56947, 55528, 27715, 5792, 59015, 37870, 24697, 11920, 51454, 64402, 53780, 54316, 2695, 64852, 52534, 8836, 3528, 6964, 45742, 56895, 18643, 62500, 25881, 63760, 20768, 3542, 3334, 18324, 41270, 20111, 13853, 61106, 36844 + ] + ); + + // This one broke clippy, because text changes directions halfway through, + // but we don't care about that! + assert_eq!( + test_string(r"'䊄㨍䕇㉆鹹䤑謲虉喙帺⫮૚謤㵳骼뜜ᳪﱞ䀅ߢ兾ỷ煡鼱뚹ꕖ䜻\uDC9F終蚔㏼뫨軗쯰붰줓城鱃膫⌶틧ﲔ醛㹣䳵踠圆귚ᇟ赒ᡘ浚預鿹ᘓฑ圲肋ꕬ჆㓘륳쌫텮厬攞ᕇ䮽ꢗ牴쫚굣篁ж怏娈뭑싒樞ጡ矡鸉퉱㾼러⁩⨥ቭ桅做휚࠿멞㓧\uDA80䷸㠻ご砕紭䞏玆䪗ৰﰸ斺㯈璏ﶔꃙ剧뇗ވ㥋༣咨喘벷긳닅厒ᆻ唣퓽뾖跴퉈ㄳ⟵⼚셅쒱輎ᾟ笴㗸䩽\uDF42吓ꅘ軟᢭褶欲෗᪸蠬騻ꥼ籎䋾âꙷನ䮲蹼㗘㞑\uDD45ꦪ쮝乳頇ᘜ智⊴Ꟙ䍹♀뷿짿ꍵᲜ촾㉂냰騞Ҥ晲駀牵揄䤸䆳၅뿐䧨箧곟὚㫽揖쬨繥쨉딭㶲쁉㝓Հ濘⑙࢟鸀兊\uD881ꪨ줢ꁳ㎥哕ヅ䳓רּ㴤됈֕倬詗ʿ깗憭㼟᜗꭛욙⎅⑴ឮ窗'") + .expect("Valid parse").sv(), + vec![ 17028, 14861, 17735, 12870, 40569, 18705, 35634, 34377, 21913, 24122, 10990, 2778, 35620, 61404, 15731, 63376, 39612, 46876, 7402, 64606, 16389, 2018, 20862, 7927, 29025, 40753, 46777, 42326, 18235, 56479, 32066, 61027, 34452, 13308, 47848, 36567, 52208, 48560, 51475, 22478, 40003, 33195, 57595, 9014, 53991, 64660, 37275, 15971, 61616, 19701, 36384, 22278, 57883, 44506, 4575, 36178, 58627, 6232, 27994, 38928, 40953, 5651, 3601, 22322, 32907, 57580, 42348, 4294, 13528, 47475, 61491, 49963, 53614, 21420, 25886, 5447, 19389, 43159, 29300, 51930, 44387, 31681, 1078, 24591, 23048, 47953, 49874, 60619, 27166, 4897, 30689, 40457, 53873, 16316, 61713, 47084, 8297, 10789, 61059, 4717, 26693, 20570, 55066, 2111, 47710, 13543, 55936, 57772, 19960, 57799, 14395, 12372, 30741, 32045, 18319, 29574, 19095, 58510, 2544, 64568, 57946, 26042, 15304, 58185, 29839, 64916, 41177, 21095, 45527, 1928, 14667, 3875, 21672, 63164, 21912, 48311, 44595, 
45765, 21394, 4539, 21795, 54525, 49046, 36340, 53832, 12595, 10229, 12058, 49477, 50353, 36622, 8095, 31540, 13816, 19069, 57154, 21523, 41304, 36575, 60038, 6317, 35126, 27442, 3543, 6840, 34860, 39483, 58493, 43388, 31822, 17150, 226, 42615, 3240, 19378, 36476, 13784, 14225, 56645, 43434, 52125, 20083, 38919, 5660, 26234, 8884, 42968, 17273, 9792, 48639, 51711, 41845, 7324, 52542, 12866, 45296, 39454, 1188, 26226, 39360, 29301, 64141, 18744, 57736, 16819, 4165, 62390, 59943, 49104, 18920, 31655, 44255, 8026, 15101, 25558, 52008, 32357, 51721, 46381, 15794, 49225, 14163, 1344, 28632, 9305, 2207, 40448, 60855, 61830, 20810, 55425, 43688, 57670, 51490, 41075, 13221, 21717, 12485, 19667, 64328, 15652, 46088, 1429, 20524, 35415, 703, 44631, 25005, 16159, 5911, 43867, 50841, 9093, 9332, 6062, 31383] + ); + } +} diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index 3819e6a..bd2628c 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -2,30 +2,37 @@ //! Utility implementations for [Lex]. //! -use std::ops::{Deref, DerefMut}; +use std::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; use crate::common::{Source, Span, SpanIter, Spanned}; -use super::{LexError, LexT, SourceStream}; +use super::{Lex, LexResult, LexT, Peek, SourceStream}; /// /// Many (possibly one or zero) of a lexical token. 
/// pub type Many = Vec; -impl LexT for Many { - fn peek(input: &SourceStream) -> bool { - L::peek(input) +impl Lex for Many { + fn peek(_: &SourceStream) -> Peek { + Peek::Possible(PhantomData::) } - fn lex(input: &mut SourceStream) -> Result { + fn lex(input: &mut SourceStream) -> LexResult { let mut v = vec![]; - while L::peek(input) { - v.push(L::lex(input)?); + loop { + match ::lex(input) { + LexResult::Lexed(lexed) => v.push(lexed), + LexResult::Errant(errant) => return LexResult::Errant(errant), + LexResult::Nothing => break, + } } - Ok(v) + LexResult::Lexed(v) } } @@ -41,23 +48,27 @@ impl Spanned for Many { #[derive(Debug)] pub struct AtLeast(Vec); -impl LexT for AtLeast { - fn peek(input: &SourceStream) -> bool { - L::peek(input) +impl Lex for AtLeast { + fn peek(input: &SourceStream) -> Peek { + if N == 0 { + return Peek::Possible(PhantomData::); + } + + ::peek(input).map() } - fn lex(input: &mut SourceStream) -> Result { - let many: Many = LexT::lex(input)?; + fn lex(input: &mut SourceStream) -> LexResult { + let many: Many = Lex::lex(input)?; if many.len() < N { - return Err(input.error(format!( + return LexResult::Errant(input.error(format!( "Expected at least {N} {} tokens: got {}.", std::any::type_name::(), many.len(), ))); } - Ok(Self(many)) + LexResult::Lexed(Self(many)) } } @@ -89,19 +100,23 @@ pub struct Exactly([L; N]) where [(); N]: Sized; -impl LexT for Exactly +impl Lex for Exactly where [(); N]: Sized, { - fn peek(input: &SourceStream) -> bool { - L::peek(input) + fn peek(input: &SourceStream) -> Peek { + if N == 0 { + return Peek::Possible(PhantomData::); + } + + ::peek(input).map() } - fn lex(input: &mut SourceStream) -> Result { - let many: Many = LexT::lex(input)?; + fn lex(input: &mut SourceStream) -> LexResult { + let many: Many = Lex::lex(input)?; if many.len() != N { - return Err(input.error(format!( + return LexResult::Errant(input.error(format!( "Expected {N} {} tokens: got {}.", std::any::type_name::(), many.len() @@ -111,7 +126,7 
@@ where // SAFETY: Just checked the length, so unwrap okay. let many: [L; N] = unsafe { many.try_into().unwrap_unchecked() }; - Ok(Self(many)) + LexResult::Lexed(Self(many)) } } diff --git a/src/lexing/utils/peek.rs b/src/lexing/utils/peek.rs index b2c722d..23c81ee 100644 --- a/src/lexing/utils/peek.rs +++ b/src/lexing/utils/peek.rs @@ -28,4 +28,11 @@ impl Peek { Peek::Absent => LexResult::Nothing, } } + + pub fn map(self) -> Peek { + match self { + Peek::Possible(_) => Peek::Possible(PhantomData::), + Peek::Absent => Peek::Absent, + } + } } diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index 0237b54..f8307d0 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -1,4 +1,9 @@ -use std::any::type_name; +use std::{ + any::type_name, + convert::Infallible, + fmt::Debug, + ops::{ControlFlow, FromResidual, Try}, +}; use avjason_macros::Spanned; @@ -78,6 +83,22 @@ impl LexResult { } } + /// + /// Allegory of [Result::unwrap_err] + /// + pub fn unwrap_err(self) -> LexError + where + L: Debug, + { + match self { + LexResult::Lexed(lexed) => { + panic!("called `LexResult::unwrap()` on an `Lexed` value: {lexed:?}") + } + LexResult::Errant(errant) => errant, + LexResult::Nothing => panic!("called `LexResult::unwrap_err()` on a `Nothing` value"), + } + } + /// /// Is this [LexResult::Errant]? /// @@ -113,13 +134,27 @@ impl LexResult { } } + /// + /// Allegory of [Result::and_then]. + /// + /// If this is [LexResult::Lexed], the mapper function will be called, + /// and its return value is returned. + /// + pub fn and_then LexResult>(self, mapper: F) -> LexResult { + match self { + LexResult::Lexed(lexed) => mapper(lexed), + LexResult::Errant(errant) => LexResult::Errant(errant), + LexResult::Nothing => LexResult::Nothing, + } + } + /// /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. 
/// /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] /// with the message "expected a {$TOKEN} token". /// - pub fn expected(self, input: SourceStream) -> Self { + pub fn expected(self, input: &SourceStream) -> Self { match self { s @ LexResult::Lexed(_) => s, s @ LexResult::Errant(_) => s, @@ -130,6 +165,23 @@ impl LexResult { } } + /// + /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. + /// + /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] + /// with the message "expected a {$TOKEN} token". + /// + pub fn expected_msg(self, input: &SourceStream, msg: impl ToString) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => LexResult::Errant(LexError { + span: input.span(), + message: msg.to_string(), + }), + } + } + /// /// If this is [LexResult::Nothing], execute the `or` function instead, /// and return its result. @@ -157,3 +209,31 @@ impl LexResult { } } } + +impl Try for LexResult { + type Output = L; + + type Residual = LexResult; + + fn from_output(output: Self::Output) -> Self { + Self::Lexed(output) + } + + fn branch(self) -> ControlFlow { + match self { + LexResult::Lexed(lexed) => ControlFlow::Continue(lexed), + LexResult::Errant(errant) => ControlFlow::Break(LexResult::Errant(errant)), + LexResult::Nothing => ControlFlow::Break(LexResult::Nothing), + } + } +} + +impl FromResidual for LexResult { + fn from_residual(residual: ::Residual) -> Self { + match residual { + LexResult::Lexed(_) => unreachable!(), + LexResult::Errant(errant) => LexResult::Errant(errant), + LexResult::Nothing => LexResult::Nothing, + } + } +} diff --git a/src/lexing/utils/verbatim.rs b/src/lexing/utils/verbatim.rs index fbac33c..da68431 100644 --- a/src/lexing/utils/verbatim.rs +++ b/src/lexing/utils/verbatim.rs @@ -15,7 +15,7 @@ use crate::lexing::{CharacterRange, LexError, LexT, SourceStream}; /// /// **Do not use me 
directly, use [crate::verbatim] instead!** /// -#[derive(Debug, Spanned)] +#[derive(Debug, Spanned, Clone)] pub struct Verbatim { span: Span, } diff --git a/src/lib.rs b/src/lib.rs index d82a3d3..de10756 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,7 +14,7 @@ // This will have to be removed to solve #5: #![allow(incomplete_features)] -#![feature(adt_const_params)] +#![feature(adt_const_params, try_trait_v2)] pub mod common; pub mod lexing; From 8be2a3fb48a73c331f2c98dd6adbecf74d0533ea Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Mon, 29 Jan 2024 03:34:10 +0000 Subject: [PATCH 37/39] Pass over documentation. --- macros/src/lib.rs | 16 ++++---- src/lexing/tokens/comment.rs | 2 +- src/lexing/tokens/escapes.rs | 58 ++++++++++++++++++++++++++++- src/lexing/tokens/number.rs | 30 ++++++++------- src/lexing/tokens/string.rs | 2 +- src/lexing/tokens/whitespace.rs | 3 +- src/lexing/utils/lex_impls.rs | 6 +-- src/lexing/utils/result.rs | 65 ++++++++++++++++----------------- src/lexing/utils/stream.rs | 11 +++++- src/lib.rs | 2 +- 10 files changed, 131 insertions(+), 64 deletions(-) diff --git a/macros/src/lib.rs b/macros/src/lib.rs index e24831a..d58b01b 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -19,15 +19,15 @@ use verbatim::VerbatimPat; /// /// ## SpecRef /// -/// Allows easy reference of the **JSON5** specification. +/// Allows easy reference of the [**JSON5** specification](https://spec.json5.org/). /// /// This macro will add an additional section at the top of the Rustdoc /// for the item attached, linking to the relevant section in the specification. /// /// ### Example /// -/// ```ignore -/// use crate::SpecRef; +/// ``` +/// use avjason_macros::SpecRef; /// /// // With custom title. /// #[SpecRef("Number", "JSON5Number")] @@ -64,15 +64,15 @@ pub fn SpecRef(params: Tokens, target: Tokens) -> Tokens { /// /// ## ECMARef /// -/// Allows easy reference of the **ECMAScript** specification. 
+/// Allows easy reference of the [**ECMAScript 5.1** specification](https://262.ecma-international.org/5.1/#). /// /// This macro will add an additional section at the top of the Rustdoc /// for the item attached, linking to the relevant section in the specification. /// /// ### Example /// -/// ```ignore -/// use crate::ECMARef; +/// ``` +/// use avjason_macros::ECMARef; /// /// // You must always include an accompanying URL. /// #[ECMARef("NullLiteral", "https://262.ecma-international.org/5.1/#sec-7.8.1")] @@ -109,6 +109,8 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// /// ### Terminal Tokens /// ```ignore +/// use avjason_macros::Spanned; +/// use avjason::common::Span; /// /// /// /// (1) Named span field. /// /// @@ -176,7 +178,7 @@ pub fn ECMARef(params: Tokens, target: Tokens) -> Tokens { /// #[derive(Spanned)] /// enum NumberOrHex { /// Base10(AtLeast<1, Digit>), -/// Base16(v!(0x), AtLeast<1, HexDigit>), +/// Base16(v!("0x"), AtLeast<1, HexDigit>), /// } /// ``` /// diff --git a/src/lexing/tokens/comment.rs b/src/lexing/tokens/comment.rs index 505c01e..97cb627 100644 --- a/src/lexing/tokens/comment.rs +++ b/src/lexing/tokens/comment.rs @@ -1,5 +1,5 @@ //! -//! Comments. +//! ## Comments //! use avjason_macros::{verbatim as v, ECMARef, Spanned}; diff --git a/src/lexing/tokens/escapes.rs b/src/lexing/tokens/escapes.rs index 5563bdd..a4c066c 100644 --- a/src/lexing/tokens/escapes.rs +++ b/src/lexing/tokens/escapes.rs @@ -1,11 +1,11 @@ //! -//! Escape +//! ## Escape Codes //! //! Technically not tokens. //! These are used between strings and identifiers. //!
-use avjason_macros::{verbatim as v, Spanned}; +use avjason_macros::{verbatim as v, ECMARef, Spanned}; use crate::{ common::{Source, Span}, @@ -17,6 +17,26 @@ use super::{ number::{HexDigit, MathematicalValue}, string::CharacterValue, }; + +/// +/// Any valid ECMAScript escape sequence: +/// +/// ```javascript +/// '\n' // Escaped character +/// '\y' // Non-escaped character +/// '\0' // Null character +/// '\x1A' // Hex code escape +/// '\u0A1B'// Unicode escape +/// ``` +/// +/// *** +/// +/// ### Note +/// Since the octal escape syntax is optional and not part of the main spec +/// (see [Section B.1.2](https://262.ecma-international.org/5.1/#sec-B.1.2)), +/// it is *not* supported. +/// +#[ECMARef("EscapeSequence", "https://262.ecma-international.org/5.1/#sec-7.8.4")] #[derive(Debug, Spanned)] pub enum EscapeSequence { CharacterEscapeSequence(CharacterEscapeSequence), @@ -25,32 +45,66 @@ pub enum EscapeSequence { UnicodeEscapeSequence(UnicodeEscapeSequence), } +/// +/// Single characters that have been escaped +/// with a `\`. +/// +#[ECMARef( + "CharacterEscapeSequence", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] #[derive(Debug, Spanned)] pub enum CharacterEscapeSequence { Single(SingleEscapeChar), NonEscape(NonEscapeChar), } +/// +/// An escape character, like `\t` for `HORIZONTAL TAB`. +/// +#[ECMARef( + "SingleEscapeChar", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] #[derive(Debug, Spanned)] pub struct SingleEscapeChar { span: Span, raw: char, } +/// +/// A character that's not an escape character, +/// and should be treated verbatim. 
+/// +#[ECMARef( + "NonEscapeChar", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] #[derive(Debug, Spanned)] pub struct NonEscapeChar { span: Span, raw: char, } +/// +/// Represents a `NULL` character `U+0000` +/// #[derive(Debug, Spanned)] pub struct Null { span: Span, } +#[ECMARef( + "HexEscapeSequence", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] #[derive(Debug, Spanned)] pub struct HexEscapeSequence(v!('x'), Exactly<2, HexDigit>); +#[ECMARef( + "UnicodeEscapeSequence", + "https://262.ecma-international.org/5.1/#sec-7.8.4" +)] #[derive(Debug, Spanned)] pub struct UnicodeEscapeSequence(v!('u'), Exactly<4, HexDigit>); diff --git a/src/lexing/tokens/number.rs b/src/lexing/tokens/number.rs index 741cc0f..cf5a88d 100644 --- a/src/lexing/tokens/number.rs +++ b/src/lexing/tokens/number.rs @@ -1,5 +1,5 @@ //! -//! ## Number +//! ## Number literals //! //! Number tokens like integers, hex integers, and decimals, //! @@ -13,18 +13,6 @@ use crate::{ lexing::{AtLeast, Exactly, LexError, LexT, SourceStream}, }; -/// -/// The numerical value of a literal. -/// -/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.3). -/// -pub trait MathematicalValue { - type Value: Copy + Add; - const BASE: usize; - - fn mv(&self) -> Self::Value; -} - #[ECMARef("DecimalDigit", "https://262.ecma-international.org/5.1/#sec-7.8.3")] pub type DecimalDigit = v!('0'..='9'); @@ -35,6 +23,10 @@ pub struct HexDigit { raw: char, } +// TODO: Implement Lexical grammar for Identifier, rest of Number. +// TODO: Implement syntactical grammar. +// TODO: Implement serde integration (+ fancy Spanned) + // --- impl LexT for HexDigit { @@ -54,6 +46,18 @@ impl LexT for HexDigit { // --- +/// +/// The numerical value of a literal. +/// +/// See the [ECMAScript spec](https://262.ecma-international.org/5.1/#sec-7.8.3). 
+/// +pub trait MathematicalValue { + type Value: Copy + Add; + const BASE: usize; + + fn mv(&self) -> Self::Value; +} + impl MathematicalValue for DecimalDigit { type Value = u8; const BASE: usize = 10; diff --git a/src/lexing/tokens/string.rs b/src/lexing/tokens/string.rs index 880df20..5ecc243 100644 --- a/src/lexing/tokens/string.rs +++ b/src/lexing/tokens/string.rs @@ -1,5 +1,5 @@ //! -//! String Literals. +//! ## String Literals //! use avjason_macros::{verbatim as v, Spanned, SpecRef}; diff --git a/src/lexing/tokens/whitespace.rs b/src/lexing/tokens/whitespace.rs index 144e001..fc94b70 100644 --- a/src/lexing/tokens/whitespace.rs +++ b/src/lexing/tokens/whitespace.rs @@ -1,5 +1,6 @@ //! -//! ## WhiteSpace +//! ## Whitespace +//! //! Empty space that doesn't contribute syntactically. //! diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index bd2628c..bfd3f59 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -12,7 +12,7 @@ use crate::common::{Source, Span, SpanIter, Spanned}; use super::{Lex, LexResult, LexT, Peek, SourceStream}; /// -/// Many (possibly one or zero) of a lexical token. +/// Many (possibly one or zero) of `L`-tokens. /// pub type Many = Vec; @@ -43,7 +43,7 @@ impl Spanned for Many { } /// -/// At least N lots of `L`-tokens. +/// At least `N` lots of `L`-tokens. /// #[derive(Debug)] pub struct AtLeast(Vec); @@ -93,7 +93,7 @@ impl DerefMut for AtLeast { } /// -/// Exactly N lots of `L`-tokens: no more, no less. +/// Exactly `N` lots of `L`-tokens: no more, no less. /// #[derive(Debug)] pub struct Exactly([L; N]) diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index f8307d0..6d50eba 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -41,13 +41,13 @@ impl<'a, S: Source> SourceStream<'a, S> { /// pub enum LexResult { /// - /// Valid token. + /// Valid `L` token. 
/// Lexed(L), /// - /// An attempt was made to parse a token, - /// but it did not fully abide by the lexical grammar. + /// An attempt was made to parse an `L` token, + /// but the input did not fully abide by `L`'s lexical grammar. /// Errant(LexError), @@ -99,6 +99,18 @@ impl LexResult { } } + /// + /// Turn this into a normal Rust [Result], + /// panicking if this is a [LexResult::Nothing]. + /// + pub fn unwrap_as_result(self) -> Result { + match self { + LexResult::Lexed(lexed) => Ok(lexed), + LexResult::Errant(errant) => Err(errant), + LexResult::Nothing => panic!("Called `LexResult::into_result()` on a Nothing value."), + } + } + /// /// Is this [LexResult::Errant]? /// @@ -134,6 +146,21 @@ impl LexResult { } } + /// + /// If this is [LexResult::Nothing], execute the `or` function instead, + /// and return its result. + /// + /// This allows for chaining of results, which may be useful + /// in lexing enums with different variants. + /// + pub fn or Self>(self, or: F) -> Self { + match self { + s @ LexResult::Lexed(_) => s, + s @ LexResult::Errant(_) => s, + LexResult::Nothing => or(), + } + } + /// /// Allegory of [Result::and_then]. /// @@ -152,7 +179,7 @@ impl LexResult { /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. /// /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] - /// with the message "expected a {$TOKEN} token". + /// with the message "expected a {$`L`} token". /// pub fn expected(self, input: &SourceStream) -> Self { match self { @@ -168,8 +195,7 @@ impl LexResult { /// /// Require this potential token to be present, not [LexResult::Nothing] or [LexResult::Errant]. /// - /// If this is [LexResult::Nothing], make this into a [LexResult::Errant] - /// with the message "expected a {$TOKEN} token". + /// If this is [LexResult::Nothing], display the custom message. 
/// pub fn expected_msg(self, input: &SourceStream, msg: impl ToString) -> Self { match self { @@ -181,33 +207,6 @@ impl LexResult { }), } } - - /// - /// If this is [LexResult::Nothing], execute the `or` function instead, - /// and return its result. - /// - /// This allows for chaining of results, which may be useful - /// in lexing enums with different variants. - /// - pub fn or Self>(self, or: F) -> Self { - match self { - s @ LexResult::Lexed(_) => s, - s @ LexResult::Errant(_) => s, - LexResult::Nothing => or(), - } - } - - /// - /// Turn this into a normal Rust [Result], - /// [panic]-ing if this is a [LexResult::Nothing]. - /// - pub fn unwrap_as_result(self) -> Result { - match self { - LexResult::Lexed(lexed) => Ok(lexed), - LexResult::Errant(errant) => Err(errant), - LexResult::Nothing => panic!("Called `LexResult::into_result()` on a Nothing value."), - } - } } impl Try for LexResult { diff --git a/src/lexing/utils/stream.rs b/src/lexing/utils/stream.rs index 5b1bf0b..419168d 100644 --- a/src/lexing/utils/stream.rs +++ b/src/lexing/utils/stream.rs @@ -31,7 +31,7 @@ impl bool> Lookahead for F { } /// -/// A const-friendly implementation of [std::ops::Range]. +/// A const-friendly implementation of [std::ops::Range]\<char>. /// /// This works with the [crate::verbatim] macro to support /// the range syntax: `v!('0'..='9')`. @@ -62,6 +62,10 @@ impl<'a> Lookahead for &'a CharacterRange { } } +/// +/// Represents a stream of input characters, +/// which can attempted to be lexed into tokens. +/// #[derive(Debug, Clone)] pub struct SourceStream<'a, S: Source> { index: usize, @@ -164,7 +168,7 @@ impl<'a, S: Source> SourceStream<'a, S> { pub fn peek(&self) -> Option<&char> { self.source.characters().get(self.index) } - + /// /// Peeks at the (0-based) n-th next upcoming character. /// @@ -172,6 +176,9 @@ impl<'a, S: Source> SourceStream<'a, S> { self.source.characters().get(self.index + n) } + /// + /// Returns the unlexed portion of this stream — what's next. 
+ /// pub fn left(&self) -> Option { self.source .characters() diff --git a/src/lib.rs b/src/lib.rs index de10756..f83ba51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ //! //! ## Why? //! This crate provides a very important function: traceability. -//! ### Tracability +//! ### Traceability //! This allows for line-column data to be preserved so that further //! processing can benefit from spanned errors, which tell the end //! user *where* the error happened. From 3de6e8f3a3df1ddb6671cb3ba48a41dd5399698b Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Mon, 29 Jan 2024 16:26:58 +0000 Subject: [PATCH 38/39] Add `unicode!`: Declarative Unicode major/minor category matching. --- macros/src/lib.rs | 62 ++++++++ macros/src/unicode_category.rs | 204 ++++++++++++++++++++++++ src/lexing/utils/mod.rs | 1 + src/lexing/utils/unicode.rs | 273 +++++++++++++++++++++++++++++++++ 4 files changed, 540 insertions(+) create mode 100644 macros/src/unicode_category.rs create mode 100644 src/lexing/utils/unicode.rs diff --git a/macros/src/lib.rs b/macros/src/lib.rs index d58b01b..ca8f86e 100644 --- a/macros/src/lib.rs +++ b/macros/src/lib.rs @@ -6,6 +6,7 @@ mod spanned; mod type_traversal; +mod unicode_category; mod utils; mod verbatim; @@ -13,6 +14,7 @@ use proc_macro::{Diagnostic, Level, Span, TokenStream as Tokens}; use quote::ToTokens; use spanned::{derive_spanned_for_enum, derive_spanned_for_struct}; use syn::parse_macro_input; +use unicode_category::UnicodePatInput; use utils::{get_item_attrs, ECMARef, JSON5Ref, ToRustdoc}; use verbatim::VerbatimPat; @@ -242,3 +244,63 @@ pub fn verbatim(params: Tokens) -> Tokens { let ty = params.into_type(); ty.into_token_stream().into() } + +/// +/// ## unicode! +/// +/// Allows you to match entire Unicode major, or minor groups +/// (but not both at the same time!) +/// +/// Use *this* macro instead of `MatchMajorCategory<...>` and `MatchMinorCategory<...>`. 
+/// +/// ### Examples +/// +/// ```ignore +/// use avjason_macros::unicode; +/// +/// /// +/// /// (1) Major category -> any character in the "Letters (L)" category. +/// /// +/// pub type ULetter = unicode!(L); +/// +/// /// +/// /// (2) Minor category -> any character in the "Math symbols (Sm)" category. +/// /// +/// pub type UMathSymbol = unicode!(Sm); +/// +/// /// +/// /// (3.1) Union of major categories -> any unicode character. +/// /// +/// pub type UAll = unicode!(C | L | M | N | P | S | Z); +/// +/// /// +/// /// (3.2) Union of minor categories -> +/// /// equivalent to major category "Letters (L)". +/// /// +/// pub type ULetterUnion = unicode!(Lu | Ll | Lt | Lm | Lo); +/// ``` +/// +/// ### Syntax +/// This macro accepts either: +/// 1. one-letter Unicode major categories (`C`, `L`, `M`, `N`, `P`, `S`, `Z`). +/// 2. two-letter Unicode minor categories: +/// * `C` -> `Cc`, `Cf`, `Cs`, `Co`, `Cn`; +/// * `L` -> `Lu`, `Ll`, `Lt`, `Lm`, `Lo`; +/// * `M` -> `Mm`, `Mc`, `Me`; +/// * `P` -> `Pc`, `Pd`, `Ps`, `Pe`, `Pi`, `Pf`, `Po`; +/// * `S` -> `Sm`, `Sc`, `Sk`, `So`; +/// * `Z` -> `Zs`, `Zl`, `Zp`; +/// 3. Unions of: +/// 1. Major categories (only) +/// 2. Minor categories (only) +/// +#[proc_macro] +pub fn unicode(params: Tokens) -> Tokens { + let params: UnicodePatInput = syn::parse_macro_input!(params); + + params + .into_type() + .map(ToTokens::into_token_stream) + .map(Into::into) + .unwrap_or_default() +} diff --git a/macros/src/unicode_category.rs b/macros/src/unicode_category.rs new file mode 100644 index 0000000..7f411ca --- /dev/null +++ b/macros/src/unicode_category.rs @@ -0,0 +1,204 @@ +//! +//! Utilities for the [crate::unicode] macro. +//! + +use std::iter::once; + +use proc_macro::{Diagnostic, Level}; +use proc_macro2::Span; +use syn::{ + parse::{Parse, ParseStream}, + punctuated::Punctuated, + spanned::Spanned, +}; + +/// +/// The input into the [crate::unicode] macro. 
+/// +pub struct UnicodePatInput { + categories: Vec, +} + +fn ident(st: &str) -> syn::Ident { + syn::Ident::new(st, Span::call_site()) +} + +pub enum UnicodeCategory { + Major(syn::Ident), + Minor(syn::Ident), +} + +impl UnicodePatInput { + /// + /// Converts this collection of unicode categories + /// into its appropriate matcher type (determined by the first category). + /// + pub fn into_type(self) -> Option { + let mut iter = self.categories.into_iter(); + let Some(first) = iter.next() else { + Diagnostic::new(Level::Error, "Expected unicode major/minor categories!").emit(); + return None; + }; + Some(first.into_matcher(iter)) + } +} + +impl UnicodeCategory { + /// + /// Attempt to parse a unicode major/minor category + /// from an identifier (only checks length). + /// + fn parse(ident: syn::Ident) -> Result { + let st = ident.to_string(); + + match st.len() { + 0 => unreachable!(), + 1 => Ok(Self::Major(ident)), + 2 => Ok(Self::Minor(ident)), + _ => { + Diagnostic::spanned(ident.span().unwrap(), Level::Error, "Expected either a one-letter unicode major catgeory, or a two-letter unicode minor category.") + .emit(); + + Err(()) + } + } + } + + /// + /// Gets this category as an expression. + /// + fn into_expr(self) -> syn::Expr { + let (ty, cat) = match self { + Self::Major(ident) => ("MajorCategory", ident), + Self::Minor(ident) => ("MinorCategory", ident), + }; + + syn::Expr::Path(syn::ExprPath { + attrs: Default::default(), + qself: None, + path: syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + ["crate", "lexing", "utils", "unicode", ty, &cat.to_string()] + .iter() + .map(|s| syn::Ident::new(s, cat.span())) + .map(syn::PathSegment::from), + ), + }, + }) + } + + /// + /// Returns the type of this category type's matcher, along with supplying + /// itself and the other categories as const params.
+ /// + pub fn into_matcher(self, others: impl IntoIterator) -> syn::Type { + let ty = match self { + Self::Major(_) => "MatchMajorCategory", + Self::Minor(_) => "MatchMinorCategory", + }; + + let array = syn::Expr::Array(syn::ExprArray { + attrs: Default::default(), + bracket_token: Default::default(), + elems: Punctuated::from_iter(once(self).chain(others).map(Self::into_expr)), + }); + + let static_ref = syn::Expr::Reference(syn::ExprReference { + attrs: Default::default(), + and_token: Default::default(), + mutability: None, + expr: Box::new(array), + }); + + let braced = syn::Expr::Block(syn::ExprBlock { + attrs: Default::default(), + label: None, + block: syn::Block { + brace_token: Default::default(), + stmts: vec![syn::Stmt::Expr(static_ref, None)], + }, + }); + + let generic_arg = syn::GenericArgument::Const(braced); + syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments: Punctuated::from_iter( + ["crate", "lexing", "utils", "unicode"] + .into_iter() + .map(ident) + .map(syn::PathSegment::from) + .chain(once(syn::PathSegment { + ident: ident(ty), + arguments: syn::PathArguments::AngleBracketed( + syn::AngleBracketedGenericArguments { + colon2_token: None, + lt_token: Default::default(), + args: Punctuated::from_iter(once(generic_arg)), + gt_token: Default::default(), + }, + ), + })), + ), + }, + }) + } +} + +impl Parse for UnicodePatInput { + fn parse(input: ParseStream) -> syn::Result { + let pat = syn::Pat::parse_multi(input)?; + let cases: Vec<_> = match pat { + syn::Pat::Or(or) => or.cases.into_iter().collect(), + ident @ syn::Pat::Ident(_) => vec![ident], + pat => { + return Err(syn::Error::new_spanned( + pat, + "Expected either one or many (with |) unicode major/minor categories here.", + )) + } + }; + + let idents: Vec<_> = cases + .iter() + .map(|pat| match pat { + syn::Pat::Ident(syn::PatIdent { ident, .. 
}) => Some(ident), + pat => { + Diagnostic::spanned( + pat.span().unwrap(), + Level::Error, + "Expected either a unicode major or minor category here.", + ) + .emit(); + None + } + }) + .collect(); + + if idents.iter().any(Option::is_none) { + return Err(syn::Error::new( + Span::call_site(), + "An error occurred whilst parsing syntax.", + )); + } + + // ::unwrap() okay since !any(is_none) -> all(is_some) + let idents = idents.into_iter().map(Option::unwrap); + + let categories: Vec<_> = idents.cloned().map(UnicodeCategory::parse).collect(); + + if categories.iter().any(Result::is_err) { + return Err(syn::Error::new( + Span::call_site(), + "Invalid unicode major or minor category.", + )); + } + + // ::unwrap() okay since !any(is_err) -> all(is_ok) + Ok(Self { + categories: categories.into_iter().map(Result::unwrap).collect(), + }) + } +} diff --git a/src/lexing/utils/mod.rs b/src/lexing/utils/mod.rs index 045ea16..53c55dc 100644 --- a/src/lexing/utils/mod.rs +++ b/src/lexing/utils/mod.rs @@ -7,6 +7,7 @@ pub mod peek; pub mod result; pub mod stream; pub mod verbatim; +pub mod unicode; use std::marker::PhantomData; diff --git a/src/lexing/utils/unicode.rs b/src/lexing/utils/unicode.rs new file mode 100644 index 0000000..38e32a3 --- /dev/null +++ b/src/lexing/utils/unicode.rs @@ -0,0 +1,273 @@ +//! +//! Allows for capturing different unicode groups. +//! +//! This is a stupid hack because at the moment, +//! ConstParamTy is not auto-implemented, +//! so [finl_unicode::categories::MinorCategory] doesn't implement it; +//! meaning we must do it nastily. + +use std::marker::ConstParamTy; + +use avjason_macros::Spanned; +use finl_unicode::categories::CharacterCategories; + +use crate::{ + common::{Source, Span}, + lexing::tokens::string::CharacterValue, +}; + +use super::{LexError, LexT, SourceStream}; + +/// +/// Looks for a character in any of the +/// unicode major categories supplied as a const parameter. 
+/// +/// *** +/// +/// **Do not use me directly, use [crate::unicode] instead!** +/// +#[derive(Debug, Spanned)] +pub struct MatchMajorCategory { + span: Span, + raw: char, +} + +/// +/// Looks for a character in any of the +/// unicode minor categories supplied as a const parameter. +/// +/// *** +/// +/// **Do not use me directly, use [crate::unicode] instead!** +/// +#[derive(Debug, Spanned)] +pub struct MatchMinorCategory { + span: Span, + raw: char, +} + +// --- + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MajorCategory { + /// Letter + L, + /// Mark + M, + /// Number + N, + /// Punctuation + P, + /// Symbol + S, + /// Separator + Z, + /// Other character + C, +} + +#[doc(hidden)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MinorCategory { + /// Uppercase letter + Lu, + /// Lowercase letter + Ll, + /// Titlecase letter + Lt, + /// Modifier letter + Lm, + /// Other letter + Lo, + /// Non-spacing mark + Mn, + /// Spacing mark + Mc, + /// Enclosing mark + Me, + /// Decimal number + Nd, + /// Letterlike number + Nl, + /// Other number + No, + /// Connector punctuation + Pc, + /// Dash punctuation + Pd, + /// Opening punctuation + Ps, + /// Closing punctuation + Pe, + /// Initial punctuation + Pi, + /// Final punctuation + Pf, + /// Other punctuation + Po, + /// Math symbol + Sm, + /// Modifier symbol + Sk, + /// Currency symbol + Sc, + /// Other symbol + So, + /// Space separator + Zs, + /// Line separator + Zl, + /// Paragraph separator + Zp, + /// Control character + Cc, + /// Format character + Cf, + /// Private use character + Co, + /// Unassigned character + Cn, +} + +// --- + +impl LexT for MatchMajorCategory { + fn peek(input: &SourceStream) -> bool { + input + .peek() + .map(|ch| { + let cat = ch.get_major_category(); + C.iter().any(|major| &cat == major) + }) + .unwrap_or(false) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. 
+ let (loc, raw) = input.take().unwrap(); + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +impl LexT for MatchMinorCategory { + fn peek(input: &SourceStream) -> bool { + input + .peek() + .map(|ch| { + let cat = ch.get_minor_category(); + C.iter().any(|major| &cat == major) + }) + .unwrap_or(false) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap() ok since Self::peek() -> next character exists. + let (loc, raw) = input.take().unwrap(); + Ok(Self { + span: Span::from(loc), + raw, + }) + } +} + +// --- + +impl CharacterValue for MatchMajorCategory { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +impl CharacterValue for MatchMinorCategory { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + self.raw.encode_utf16(buf) + } +} + +// --- + +impl ConstParamTy for MajorCategory {} +impl ConstParamTy for MinorCategory {} + +impl From for finl_unicode::categories::MajorCategory { + fn from(value: MajorCategory) -> Self { + match value { + MajorCategory::L => Self::L, + MajorCategory::M => Self::M, + MajorCategory::N => Self::N, + MajorCategory::P => Self::P, + MajorCategory::S => Self::S, + MajorCategory::Z => Self::Z, + MajorCategory::C => Self::C, + } + } +} + +impl From for finl_unicode::categories::MinorCategory { + fn from(value: MinorCategory) -> Self { + match value { + MinorCategory::Lu => Self::Lu, + MinorCategory::Ll => Self::Ll, + MinorCategory::Lt => Self::Lt, + MinorCategory::Lm => Self::Lm, + MinorCategory::Lo => Self::Lo, + MinorCategory::Mn => Self::Mn, + MinorCategory::Mc => Self::Mc, + MinorCategory::Me => Self::Me, + MinorCategory::Nd => Self::Nd, + MinorCategory::Nl => Self::Nl, + MinorCategory::No => Self::No, + MinorCategory::Pc => Self::Pc, + MinorCategory::Pd => Self::Pd, + MinorCategory::Ps => Self::Ps, + MinorCategory::Pe => Self::Pe, + MinorCategory::Pi => Self::Pi, + MinorCategory::Pf => Self::Pf, + MinorCategory::Po => Self::Po, + 
MinorCategory::Sm => Self::Sm, + MinorCategory::Sk => Self::Sk, + MinorCategory::Sc => Self::Sc, + MinorCategory::So => Self::So, + MinorCategory::Zs => Self::Zs, + MinorCategory::Zl => Self::Zl, + MinorCategory::Zp => Self::Zp, + MinorCategory::Cc => Self::Cc, + MinorCategory::Cf => Self::Cf, + MinorCategory::Co => Self::Co, + MinorCategory::Cn => Self::Cn, + } + } +} + +impl PartialEq for finl_unicode::categories::MajorCategory { + fn eq(&self, other: &MajorCategory) -> bool { + Self::from(*other).eq(self) + } +} + +impl PartialEq for finl_unicode::categories::MinorCategory { + fn eq(&self, other: &MinorCategory) -> bool { + Self::from(*other).eq(self) + } +} + +#[cfg(test)] +mod tests { + use avjason_macros::unicode; + + use crate::{common::{file::SourceFile, Source}, lexing::Many}; + + type Letter = unicode!(Lu | Ll); + + #[test] + fn test_lex() { + let source = + SourceFile::dummy_file("Apples"); + let input = &mut source.stream(); + let comment: Many = input.lex().expect("Valid parse"); + println!("{comment:?}"); + } +} From 46116c5be51a4073a52840c6a60f61159a2d847a Mon Sep 17 00:00:00 2001 From: Sammy99jsp Date: Mon, 29 Jan 2024 21:32:25 +0000 Subject: [PATCH 39/39] Add `Identifier` support. 
--- src/lexing/tokens/escapes.rs | 2 +- src/lexing/tokens/identifier.rs | 750 ++++++++++++++++++++++++++++++++ src/lexing/tokens/mod.rs | 1 + src/lexing/tokens/number.rs | 4 +- src/lexing/tokens/string.rs | 46 +- src/lexing/utils/lex_impls.rs | 7 +- src/lexing/utils/result.rs | 3 +- src/lexing/utils/unicode.rs | 15 +- 8 files changed, 801 insertions(+), 27 deletions(-) create mode 100644 src/lexing/tokens/identifier.rs diff --git a/src/lexing/tokens/escapes.rs b/src/lexing/tokens/escapes.rs index a4c066c..4e25d5a 100644 --- a/src/lexing/tokens/escapes.rs +++ b/src/lexing/tokens/escapes.rs @@ -105,7 +105,7 @@ pub struct HexEscapeSequence(v!('x'), Exactly<2, HexDigit>); "UnicodeEscapeSequence", "https://262.ecma-international.org/5.1/#sec-7.8.4" )] -#[derive(Debug, Spanned)] +#[derive(Debug, Spanned, Clone)] pub struct UnicodeEscapeSequence(v!('u'), Exactly<4, HexDigit>); // --- diff --git a/src/lexing/tokens/identifier.rs b/src/lexing/tokens/identifier.rs new file mode 100644 index 0000000..7429850 --- /dev/null +++ b/src/lexing/tokens/identifier.rs @@ -0,0 +1,750 @@ +//! +//! ## Identifiers +//! + +use std::iter::once; + +use finl_unicode::categories::{CharacterCategories, MinorCategory}; + +use crate::{ + common::{Source, Spanned}, + lexing::{Lex, LexError, LexResult, LexT, Many, SourceStream}, + unicode as u, verbatim as v, ECMARef, Spanned, SpecRef, +}; + +use super::{ + escapes::UnicodeEscapeSequence, + string::{collect_cv_into_utf16, CharacterValue, StringValue}, +}; + +#[SpecRef("JSON5Identifier")] +#[derive(Debug, Spanned)] +pub struct Identifier(IdentifierName); + +/// +/// > Identifier Names are tokens that are interpreted +/// > according to the grammar given in the “Identifiers” section +/// > of chapter 5 of the Unicode standard, with some small modifications. 
+/// +#[ECMARef("IdentifierName", "https://262.ecma-international.org/5.1/#sec-7.6")] +#[derive(Debug, Spanned)] +pub struct IdentifierName(IdentifierStart, Many); + +/// +/// The first character in an identifier. +/// +#[ECMARef("IdentifierStart", "https://262.ecma-international.org/5.1/#sec-7.6")] +#[derive(Debug, Spanned, Clone)] +pub enum IdentifierStart { + Letter(UnicodeLetter), + Dollar(v!('$')), + Underscore(v!('_')), + Escape(v!('\\'), UnicodeEscapeSequence), +} + +/// +/// Any part of an identifier following the starting part. +/// +#[ECMARef("IdentifierPart", "https://262.ecma-international.org/5.1/#sec-7.6")] +#[derive(Debug, Spanned, Clone)] +pub enum IdentifierPart { + /// + /// This is not part of the ECMAScript spec, + /// but is necessary in order to get the context + /// correctly in the escaped character's validity checks. + /// + Escape(v!('\\'), UnicodeEscapeSequence), + Start(IdentifierStart), + CombiningMark(UnicodeCombiningMark), + Digit(UnicodeDigit), + ConnectorPunctuation(UnicodeConnectorPunctuation), + + /// + /// Zero width non-joiner + /// + ZWNJ(v!('\u{200C}')), + + /// + /// Zero width joiner + /// + ZWJ(v!('\u{200D}')), +} + +/// +/// > any character in the Unicode categories “Uppercase letter (Lu)”, +/// > “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”, +/// > “Other letter (Lo)”, or “Letter number (Nl)” +/// +#[ECMARef("UnicodeLetter", "https://262.ecma-international.org/5.1/#sec-7.6")] +pub type UnicodeLetter = u!(Lu | Ll | Lt | Lm | Lo | Nl); + +/// +/// > any character in the Unicode categories “Non-spacing mark (Mn)” +/// > or “Combining spacing mark (Mc)” +/// +#[ECMARef( + "UnicodeCombiningMark", + "https://262.ecma-international.org/5.1/#sec-7.6" +)] +pub type UnicodeCombiningMark = u!(Mn | Mc); + +/// +/// > any character in the Unicode category “Decimal number (Nd)” +/// +#[ECMARef("UnicodeDigit", "https://262.ecma-international.org/5.1/#sec-7.6")] +pub type UnicodeDigit = u!(Nd); + +/// +/// any 
character in the Unicode category “Connector punctuation (Pc)” +/// +#[ECMARef( + "UnicodeConnectorPunctuation", + "https://262.ecma-international.org/5.1/#sec-7.6" +)] +pub type UnicodeConnectorPunctuation = u!(Pc); + +// --- + +/// +/// What characters does this identifier part accept? +/// +pub trait CharacterAcceptor { + fn accepts(ch: &char) -> bool; +} + +impl CharacterAcceptor for IdentifierStart { + fn accepts(ch: &char) -> bool { + use MinorCategory::*; + match ch { + c if matches!(c.get_minor_category(), Lu | Ll | Lt | Lm | Lo | Nl) => true, + '$' => true, + '_' => true, + _ => false, + } + } +} + +impl CharacterAcceptor for IdentifierPart { + fn accepts(ch: &char) -> bool { + use MinorCategory::*; + match ch { + c if IdentifierStart::accepts(c) => true, + c if matches!(c.get_minor_category(), Mn | Mc | Nd | Pc) => true, + '\u{200C}' => true, + '\u{200D}' => true, + _ => false, + } + } +} + +/// +/// Check to see if the unicode escape code's value +/// is still valid in the context of an identifier part. +/// +/// > A UnicodeEscapeSequence cannot be used to put a +/// > character into an IdentifierName that would otherwise be illegal. +/// +/// — [see more](https://262.ecma-international.org/5.1/#sec-7.6). 
+/// +pub fn check_unicode_escape( + backslash: v!('\\'), + escape: UnicodeEscapeSequence, + map: fn(v!('\\'), UnicodeEscapeSequence) -> T, +) -> LexResult { + let ch = escape.try_as_char(); + if !ch.map(|ch: char| T::accepts(&ch)).unwrap_or(false) { + return LexResult::Errant(LexError::new( + &backslash.span().combine([escape.span()]), + format!( + "Invalid escaped character in identifier: `{}` is not valid here.", + ch.unwrap() + ), + )); + } + + LexResult::Lexed(map(backslash, escape)) +} + +// --- + +impl LexT for Identifier { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + Ok(Self(::lex(input)?)) + } +} + +impl LexT for IdentifierName { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + let start = LexT::lex(input)?; + let after = Lex::lex(input).unwrap_as_result()?; + Ok(Self(start, after)) + } +} + +impl LexT for IdentifierStart { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> one variant exists. + Lex::lex(input) + .map(Self::Letter) + .or(|| input.lex().map(Self::Dollar)) + .or(|| input.lex().map(Self::Underscore)) + .or(|| { + input.lex().and(|backslash: v!('\\')| { + input + .lex() + .expected_msg(input, "Expected a unicode escape sequence `\\uXXXX` here.") + .and(|escape: UnicodeEscapeSequence| { + check_unicode_escape(backslash, escape, Self::Escape) + }) + }) + }) + .unwrap_as_result() + } +} + +impl LexT for IdentifierPart { + fn peek(input: &SourceStream) -> bool { + ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + || ::peek(input) + } + + fn lex(input: &mut SourceStream) -> Result { + // .unwrap_as_result() ok since Self::peek() -> exists one of the variants. 
+ Lex::lex(input) + .and(|backslash: v!('\\')| { + input + .lex() + .expected_msg(input, "Expected a unicode escape sequence `\\uXXXX` here.") + .and(|escape: UnicodeEscapeSequence| { + check_unicode_escape(backslash, escape, Self::Escape) + }) + }) + .or(|| input.lex().map(Self::Start)) + .or(|| input.lex().map(Self::CombiningMark)) + .or(|| input.lex().map(Self::Digit)) + .or(|| input.lex().map(Self::ConnectorPunctuation)) + .or(|| input.lex().map(Self::ZWNJ)) + .or(|| input.lex().map(Self::ZWJ)) + .unwrap_as_result() + } +} + +// --- + +impl StringValue for Identifier { + fn sv(&self) -> Vec { + self.0.sv() + } +} + +impl StringValue for IdentifierName { + fn sv(&self) -> Vec { + let binding = IdentifierPart::Start(self.0.clone()); + let tmp: Vec<_> = once(&binding).chain(self.1.iter()).collect(); + collect_cv_into_utf16(tmp) + } +} + +// --- + +impl CharacterValue for IdentifierStart { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + IdentifierStart::Letter(letter) => letter.cv(buf), + IdentifierStart::Dollar(_) => '$'.encode_utf16(buf), + IdentifierStart::Underscore(_) => '_'.encode_utf16(buf), + IdentifierStart::Escape(_, esc) => esc.cv(buf), + } + } +} + +impl CharacterValue for IdentifierPart { + fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16] { + match self { + IdentifierPart::Escape(_, escape) => escape.cv(buf), + IdentifierPart::Start(start) => start.cv(buf), + IdentifierPart::CombiningMark(cm) => cm.cv(buf), + IdentifierPart::Digit(digit) => digit.cv(buf), + IdentifierPart::ConnectorPunctuation(cp) => cp.cv(buf), + IdentifierPart::ZWNJ(_) => '\u{200C}'.encode_utf16(buf), + IdentifierPart::ZWJ(_) => '\u{200D}'.encode_utf16(buf), + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + common::{file::SourceFile, Source}, + lexing::LexResult, + }; + + use super::{Identifier, IdentifierPart, IdentifierStart}; + + fn test_identifier(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); 
+ let input = &mut source.stream(); + input.lex() + } + + fn test_start(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + fn test_middle(st: &'static str) -> LexResult { + let source = SourceFile::dummy_file(st); + let input = &mut source.stream(); + input.lex() + } + + #[test] + fn start() { + // Ll + test_identifier("a").expect("Valid parse!"); + test_identifier("ʘ").expect("Valid parse!"); + test_identifier("ξ").expect("Valid parse!"); + test_identifier("я").expect("Valid parse!"); + test_identifier("ᴓ").expect("Valid parse!"); + test_identifier("ⱅ").expect("Valid parse!"); + test_identifier("ꮇ").expect("Valid parse!"); + test_identifier("v").expect("Valid parse!"); + test_identifier("𐳭").expect("Valid parse!"); + test_identifier("𝐨").expect("Valid parse!"); + test_identifier("𝕘").expect("Valid parse!"); + test_identifier("𝛝").expect("Valid parse!"); + test_identifier("𞥃").expect("Valid parse!"); + + // Lm + test_identifier("ˑ").expect("Valid parse!"); + test_identifier("ˬ").expect("Valid parse!"); + test_identifier("ᶾ").expect("Valid parse!"); + test_identifier("〲").expect("Valid parse!"); + test_identifier("ꫝ").expect("Valid parse!"); + test_identifier("𖿡").expect("Valid parse!"); + + // Lo + test_identifier("ڧ").expect("Valid parse!"); + test_identifier("ݦ").expect("Valid parse!"); + test_identifier("ࠊ").expect("Valid parse!"); + test_identifier("ओ").expect("Valid parse!"); + test_identifier("ੴ").expect("Valid parse!"); + test_identifier("ࣅ").expect("Valid parse!"); + test_identifier("ഐ").expect("Valid parse!"); + test_identifier("ᆿ").expect("Valid parse!"); + test_identifier("ሥ").expect("Valid parse!"); + test_identifier("ᐚ").expect("Valid parse!"); + test_identifier("ᑺ").expect("Valid parse!"); + test_identifier("ᔐ").expect("Valid parse!"); + test_identifier("ᖲ").expect("Valid parse!"); + test_identifier("ᚙ").expect("Valid parse!"); + test_identifier("ᛦ").expect("Valid 
parse!"); + test_identifier("ᠩ").expect("Valid parse!"); + test_identifier("ᩐ").expect("Valid parse!"); + test_identifier("ᮯ").expect("Valid parse!"); + test_identifier("ⶦ").expect("Valid parse!"); + test_identifier("ツ").expect("Valid parse!"); + test_identifier("ㆈ").expect("Valid parse!"); + test_identifier("㐯").expect("Valid parse!"); + test_identifier("㔇").expect("Valid parse!"); + test_identifier("㠓").expect("Valid parse!"); + test_identifier("㨝").expect("Valid parse!"); + + // Lt + test_identifier("ᾫ").expect("Valid parse!"); + test_identifier("ᾝ").expect("Valid parse!"); + test_identifier("Dž").expect("Valid parse!"); + + // Lu + test_identifier("A").expect("Valid parse!"); + test_identifier("Ǡ").expect("Valid parse!"); + test_identifier("Έ").expect("Valid parse!"); + test_identifier("Щ").expect("Valid parse!"); + test_identifier("Ꮿ").expect("Valid parse!"); + test_identifier("Å").expect("Valid parse!"); + test_identifier("ℜ").expect("Valid parse!"); + test_identifier("Ᵽ").expect("Valid parse!"); + test_identifier("T").expect("Valid parse!"); + test_identifier("𐲱").expect("Valid parse!"); + test_identifier("𝓨").expect("Valid parse!"); + test_identifier("𝗨").expect("Valid parse!"); + test_identifier("𝝫").expect("Valid parse!"); + test_identifier("𞤞").expect("Valid parse!"); + + // Nl + test_identifier("Ⅲ").expect("Valid parse!"); + test_identifier("ↈ").expect("Valid parse!"); + test_identifier("𐅰").expect("Valid parse!"); + test_identifier("𒐒").expect("Valid parse!"); + test_identifier("𒐪").expect("Valid parse!"); + test_identifier("𒑚").expect("Valid parse!"); + test_identifier("𒑮").expect("Valid parse!"); + + test_identifier("_").expect("Valid parse!"); + test_identifier("$").expect("Valid parse!"); + test_identifier(r"\u0041").expect("Valid parse"); // `A` + + // Invalid Starting unicode escape code `@` + test_identifier(r"\u0040").unwrap_err(); + + // Middle-only characters + // Mn + assert!(!test_start("◌̣").is_lexed()); + 
assert!(!test_start("◌ַ").is_lexed()); + assert!(!test_start("◌ܶ").is_lexed()); + assert!(!test_start("◌ࣟ").is_lexed()); + assert!(!test_start("◌ై").is_lexed()); + assert!(!test_start("◌ླྀ").is_lexed()); + assert!(!test_start("◌ᬼ").is_lexed()); + assert!(!test_start("◌ⷻ").is_lexed()); + assert!(!test_start("◌ꦸ").is_lexed()); + assert!(!test_start("◌𝨰").is_lexed()); + assert!(!test_start("◌𝪩").is_lexed()); + assert!(!test_start("◌󠇬").is_lexed()); + + // Mc + assert!(!test_start("ா").is_lexed()); + assert!(!test_start("ௌ").is_lexed()); + assert!(!test_start("ෛ").is_lexed()); + assert!(!test_start("ြ").is_lexed()); + assert!(!test_start("ᬽ").is_lexed()); + assert!(!test_start("ꦾ").is_lexed()); + assert!(!test_start("𑍣").is_lexed()); + assert!(!test_start("𑲩").is_lexed()); + assert!(!test_start("𝅲").is_lexed()); + assert!(!test_start("𝅦").is_lexed()); + + // Nd + assert!(!test_start("1").is_lexed()); + assert!(!test_start("9").is_lexed()); + assert!(!test_start("٢").is_lexed()); + assert!(!test_start("٤").is_lexed()); + assert!(!test_start("৩").is_lexed()); + assert!(!test_start("੦").is_lexed()); + assert!(!test_start("௫").is_lexed()); + assert!(!test_start("൫").is_lexed()); + assert!(!test_start("໙").is_lexed()); + assert!(!test_start("႒").is_lexed()); + assert!(!test_start("᭑").is_lexed()); + assert!(!test_start("꧓").is_lexed()); + assert!(!test_start("꩘").is_lexed()); + assert!(!test_start("𝟯").is_lexed()); + assert!(!test_start("🯷").is_lexed()); + + // Pc + assert!(!test_start("‿").is_lexed()); + assert!(!test_start("⁀").is_lexed()); + assert!(!test_start("⁔").is_lexed()); + assert!(!test_start("︳").is_lexed()); + assert!(!test_start("︴").is_lexed()); + assert!(!test_start("﹍").is_lexed()); + assert!(!test_start("﹎").is_lexed()); + assert!(!test_start("﹏").is_lexed()); + assert!(!test_start("_").is_lexed()); + } + + #[test] + fn middle() { + // Ll + test_identifier("_a").expect("Valid parse!"); + test_identifier("_ʘ").expect("Valid parse!"); + 
test_identifier("_ξ").expect("Valid parse!"); + test_identifier("_я").expect("Valid parse!"); + test_identifier("_ᴓ").expect("Valid parse!"); + test_identifier("_ⱅ").expect("Valid parse!"); + test_identifier("_ꮇ").expect("Valid parse!"); + test_identifier("_v").expect("Valid parse!"); + test_identifier("_𐳭").expect("Valid parse!"); + test_identifier("_𝐨").expect("Valid parse!"); + test_identifier("_𝕘").expect("Valid parse!"); + test_identifier("_𝛝").expect("Valid parse!"); + test_identifier("_𞥃").expect("Valid parse!"); + + // Lm + test_identifier("_ˑ").expect("Valid parse!"); + test_identifier("_ˬ").expect("Valid parse!"); + test_identifier("_ᶾ").expect("Valid parse!"); + test_identifier("_〲").expect("Valid parse!"); + test_identifier("_ꫝ").expect("Valid parse!"); + test_identifier("_𖿡").expect("Valid parse!"); + + // Lo + test_identifier("_ڧ").expect("Valid parse!"); + test_identifier("_ݦ").expect("Valid parse!"); + test_identifier("_ࠊ").expect("Valid parse!"); + test_identifier("_ओ").expect("Valid parse!"); + test_identifier("_ੴ").expect("Valid parse!"); + test_identifier("_ࣅ").expect("Valid parse!"); + test_identifier("_ഐ").expect("Valid parse!"); + test_identifier("_ᆿ").expect("Valid parse!"); + test_identifier("_ሥ").expect("Valid parse!"); + test_identifier("_ᐚ").expect("Valid parse!"); + test_identifier("_ᑺ").expect("Valid parse!"); + test_identifier("_ᔐ").expect("Valid parse!"); + test_identifier("_ᖲ").expect("Valid parse!"); + test_identifier("_ᚙ").expect("Valid parse!"); + test_identifier("_ᛦ").expect("Valid parse!"); + test_identifier("_ᠩ").expect("Valid parse!"); + test_identifier("_ᩐ").expect("Valid parse!"); + test_identifier("_ᮯ").expect("Valid parse!"); + test_identifier("_ⶦ").expect("Valid parse!"); + test_identifier("_ツ").expect("Valid parse!"); + test_identifier("_ㆈ").expect("Valid parse!"); + test_identifier("_㐯").expect("Valid parse!"); + test_identifier("_㔇").expect("Valid parse!"); + test_identifier("_㠓").expect("Valid parse!"); + 
test_identifier("_㨝").expect("Valid parse!"); + + // Lt + test_identifier("_ᾫ").expect("Valid parse!"); + test_identifier("_ᾝ").expect("Valid parse!"); + test_identifier("_Dž").expect("Valid parse!"); + + // Lu + test_identifier("_A").expect("Valid parse!"); + test_identifier("_Ǡ").expect("Valid parse!"); + test_identifier("_Έ").expect("Valid parse!"); + test_identifier("_Щ").expect("Valid parse!"); + test_identifier("_Ꮿ").expect("Valid parse!"); + test_identifier("_Å").expect("Valid parse!"); + test_identifier("_ℜ").expect("Valid parse!"); + test_identifier("_Ᵽ").expect("Valid parse!"); + test_identifier("_T").expect("Valid parse!"); + test_identifier("_𐲱").expect("Valid parse!"); + test_identifier("_𝓨").expect("Valid parse!"); + test_identifier("_𝗨").expect("Valid parse!"); + test_identifier("_𝝫").expect("Valid parse!"); + test_identifier("_𞤞").expect("Valid parse!"); + + // Nl + test_identifier("_Ⅲ").expect("Valid parse!"); + test_identifier("_ↈ").expect("Valid parse!"); + test_identifier("_𐅰").expect("Valid parse!"); + test_identifier("_𒐒").expect("Valid parse!"); + test_identifier("_𒐪").expect("Valid parse!"); + test_identifier("_𒑚").expect("Valid parse!"); + test_identifier("_𒑮").expect("Valid parse!"); + + // Mn + test_identifier("_◌̣").expect("Valid parse!"); + test_identifier("_◌ַ").expect("Valid parse!"); + test_identifier("_◌ܶ").expect("Valid parse!"); + test_identifier("_◌ࣟ").expect("Valid parse!"); + test_identifier("_◌ై").expect("Valid parse!"); + test_identifier("_◌ླྀ").expect("Valid parse!"); + test_identifier("_◌ᬼ").expect("Valid parse!"); + test_identifier("_◌ⷻ").expect("Valid parse!"); + test_identifier("_◌ꦸ").expect("Valid parse!"); + test_identifier("_◌𝨰").expect("Valid parse!"); + test_identifier("_◌𝪩").expect("Valid parse!"); + test_identifier("_◌󠇬").expect("Valid parse!"); + + // Mc + test_identifier("_ா").expect("Valid parse!"); + test_identifier("_ௌ").expect("Valid parse!"); + test_identifier("_ෛ").expect("Valid parse!"); + 
test_identifier("_ြ").expect("Valid parse!"); + test_identifier("_ᬽ").expect("Valid parse!"); + test_identifier("_ꦾ").expect("Valid parse!"); + test_identifier("_𑍣").expect("Valid parse!"); + test_identifier("_𑲩").expect("Valid parse!"); + test_identifier("_𝅲").expect("Valid parse!"); + test_identifier("_𝅦").expect("Valid parse!"); + + // Nd + test_identifier("_1").expect("Valid parse!"); + test_identifier("_9").expect("Valid parse!"); + test_identifier("_٢").expect("Valid parse!"); + test_identifier("_٤").expect("Valid parse!"); + test_identifier("_৩").expect("Valid parse!"); + test_identifier("_੦").expect("Valid parse!"); + test_identifier("_௫").expect("Valid parse!"); + test_identifier("_൫").expect("Valid parse!"); + test_identifier("_໙").expect("Valid parse!"); + test_identifier("_႒").expect("Valid parse!"); + test_identifier("_᭑").expect("Valid parse!"); + test_identifier("_꧓").expect("Valid parse!"); + test_identifier("_꩘").expect("Valid parse!"); + test_identifier("_𝟯").expect("Valid parse!"); + test_identifier("_🯷").expect("Valid parse!"); + + // Pc + test_identifier("_‿").expect("Valid parse!"); + test_identifier("_⁀").expect("Valid parse!"); + test_identifier("_⁔").expect("Valid parse!"); + test_identifier("_︳").expect("Valid parse!"); + test_identifier("_︴").expect("Valid parse!"); + test_identifier("_﹍").expect("Valid parse!"); + test_identifier("_﹎").expect("Valid parse!"); + test_identifier("_﹏").expect("Valid parse!"); + test_identifier("__").expect("Valid parse!"); + + test_identifier("__").expect("Valid parse!"); + test_identifier("_$").expect("Valid parse!"); + test_identifier(r"_\u0041").expect("Valid parse"); // `A` + + test_identifier(r"_\u0040").unwrap_err(); + } + + #[test] + fn invalid() { + // Sm + assert!(!test_start(r"÷").is_lexed()); + assert!(!test_start(r"⅀").is_lexed()); + assert!(!test_start(r"∃").is_lexed()); + assert!(!test_start(r"∉").is_lexed()); + assert!(!test_start(r"∏").is_lexed()); + assert!(!test_start(r"∜").is_lexed()); + 
assert!(!test_start(r"⌠").is_lexed()); + assert!(!test_start(r"⌡").is_lexed()); + assert!(!test_start(r"⟜").is_lexed()); + assert!(!test_start(r"⨜").is_lexed()); + assert!(!test_start(r"⨷").is_lexed()); + assert!(!test_start(r"⪔").is_lexed()); + assert!(!test_start(r"𞻱").is_lexed()); + + assert!(!test_middle(r"÷").is_lexed()); + assert!(!test_middle(r"⅀").is_lexed()); + assert!(!test_middle(r"∃").is_lexed()); + assert!(!test_middle(r"∉").is_lexed()); + assert!(!test_middle(r"∏").is_lexed()); + assert!(!test_middle(r"∜").is_lexed()); + assert!(!test_middle(r"⌠").is_lexed()); + assert!(!test_middle(r"⌡").is_lexed()); + assert!(!test_middle(r"⟜").is_lexed()); + assert!(!test_middle(r"⨜").is_lexed()); + assert!(!test_middle(r"⨷").is_lexed()); + assert!(!test_middle(r"⪔").is_lexed()); + assert!(!test_middle(r"𞻱").is_lexed()); + } + + #[test] + fn escape_codes() { + // Valid Start tests + test_start(r"\u0061").expect("Valid parse!"); + test_start(r"\u0298").expect("Valid parse!"); + test_start(r"\u03be").expect("Valid parse!"); + test_start(r"\u044f").expect("Valid parse!"); + test_start(r"\u1d13").expect("Valid parse!"); + test_start(r"\u2c45").expect("Valid parse!"); + test_start(r"\uab87").expect("Valid parse!"); + test_start(r"\uff56").expect("Valid parse!"); + + test_start(r"\u02d1").expect("Valid parse!"); + test_start(r"\u02ec").expect("Valid parse!"); + test_start(r"\u1dbe").expect("Valid parse!"); + test_start(r"\u3032").expect("Valid parse!"); + test_start(r"\uaadd").expect("Valid parse!"); + test_start(r"\u06a7").expect("Valid parse!"); + test_start(r"\u0766").expect("Valid parse!"); + test_start(r"\u080a").expect("Valid parse!"); + test_start(r"\u0913").expect("Valid parse!"); + test_start(r"\u0a74").expect("Valid parse!"); + test_start(r"\u08c5").expect("Valid parse!"); + test_start(r"\u0d10").expect("Valid parse!"); + test_start(r"\u11bf").expect("Valid parse!"); + test_start(r"\u1225").expect("Valid parse!"); + test_start(r"\u141a").expect("Valid parse!"); 
+ test_start(r"\u147a").expect("Valid parse!"); + test_start(r"\u1510").expect("Valid parse!"); + test_start(r"\u15b2").expect("Valid parse!"); + test_start(r"\u1699").expect("Valid parse!"); + test_start(r"\u16e6").expect("Valid parse!"); + test_start(r"\u1829").expect("Valid parse!"); + test_start(r"\u1a50").expect("Valid parse!"); + test_start(r"\u1baf").expect("Valid parse!"); + test_start(r"\u2da6").expect("Valid parse!"); + test_start(r"\u30c4").expect("Valid parse!"); + test_start(r"\u3188").expect("Valid parse!"); + test_start(r"\u342f").expect("Valid parse!"); + test_start(r"\u3507").expect("Valid parse!"); + test_start(r"\u3813").expect("Valid parse!"); + test_start(r"\u3a1d").expect("Valid parse!"); + test_start(r"\u1fab").expect("Valid parse!"); + test_start(r"\u1f9d").expect("Valid parse!"); + test_start(r"\u01c5").expect("Valid parse!"); + test_start(r"\u0041").expect("Valid parse!"); + test_start(r"\u01e0").expect("Valid parse!"); + test_start(r"\u0388").expect("Valid parse!"); + test_start(r"\u0429").expect("Valid parse!"); + test_start(r"\u13ef").expect("Valid parse!"); + test_start(r"\u212b").expect("Valid parse!"); + test_start(r"\u211c").expect("Valid parse!"); + test_start(r"\u2c63").expect("Valid parse!"); + test_start(r"\uff34").expect("Valid parse!"); + test_start(r"\u2162").expect("Valid parse!"); + test_start(r"\u2188").expect("Valid parse!"); + test_start(r"\u005f").expect("Valid parse!"); + test_start(r"\u0024").expect("Valid parse!"); + + // Invalid start character tests + assert!(!test_start(r"\u0031").is_lexed()); + assert!(!test_start(r"\u0039").is_lexed()); + assert!(!test_start(r"\u0662").is_lexed()); + assert!(!test_start(r"\u0664").is_lexed()); + assert!(!test_start(r"\u09e9").is_lexed()); + assert!(!test_start(r"\u0a66").is_lexed()); + assert!(!test_start(r"\u0beb").is_lexed()); + assert!(!test_start(r"\u0d6b").is_lexed()); + assert!(!test_start(r"\u0ed9").is_lexed()); + assert!(!test_start(r"\u1092").is_lexed()); + 
assert!(!test_start(r"\u1b51").is_lexed()); + assert!(!test_start(r"\ua9d3").is_lexed()); + assert!(!test_start(r"\uaa58").is_lexed()); + assert!(!test_start(r"\u203f").is_lexed()); + assert!(!test_start(r"\u2040").is_lexed()); + assert!(!test_start(r"\u2054").is_lexed()); + assert!(!test_start(r"\ufe33").is_lexed()); + assert!(!test_start(r"\ufe34").is_lexed()); + assert!(!test_start(r"\ufe4d").is_lexed()); + assert!(!test_start(r"\ufe4e").is_lexed()); + assert!(!test_start(r"\ufe4f").is_lexed()); + assert!(!test_start(r"\uff3f").is_lexed()); + + // Valid middle character tests + assert!(test_middle(r"\u0031").is_lexed()); + assert!(test_middle(r"\u0039").is_lexed()); + assert!(test_middle(r"\u0662").is_lexed()); + assert!(test_middle(r"\u0664").is_lexed()); + assert!(test_middle(r"\u09e9").is_lexed()); + assert!(test_middle(r"\u0a66").is_lexed()); + assert!(test_middle(r"\u0beb").is_lexed()); + assert!(test_middle(r"\u0d6b").is_lexed()); + assert!(test_middle(r"\u0ed9").is_lexed()); + assert!(test_middle(r"\u1092").is_lexed()); + assert!(test_middle(r"\u1b51").is_lexed()); + assert!(test_middle(r"\ua9d3").is_lexed()); + assert!(test_middle(r"\uaa58").is_lexed()); + assert!(test_middle(r"\u203f").is_lexed()); + assert!(test_middle(r"\u2040").is_lexed()); + assert!(test_middle(r"\u2054").is_lexed()); + assert!(test_middle(r"\ufe33").is_lexed()); + assert!(test_middle(r"\ufe34").is_lexed()); + assert!(test_middle(r"\ufe4d").is_lexed()); + assert!(test_middle(r"\ufe4e").is_lexed()); + assert!(test_middle(r"\ufe4f").is_lexed()); + assert!(test_middle(r"\uff3f").is_lexed()); + assert!(test_middle(r"\u005f").is_lexed()); + assert!(test_middle(r"\u0024").is_lexed()); + } +} diff --git a/src/lexing/tokens/mod.rs b/src/lexing/tokens/mod.rs index 67219a5..405d269 100644 --- a/src/lexing/tokens/mod.rs +++ b/src/lexing/tokens/mod.rs @@ -9,3 +9,4 @@ pub mod whitespace; pub mod number; pub mod escapes; pub mod string; +pub mod identifier; diff --git 
a/src/lexing/tokens/number.rs b/src/lexing/tokens/number.rs index cf5a88d..84eb28a 100644 --- a/src/lexing/tokens/number.rs +++ b/src/lexing/tokens/number.rs @@ -17,13 +17,13 @@ use crate::{ pub type DecimalDigit = v!('0'..='9'); #[ECMARef("HexDigit", "https://262.ecma-international.org/5.1/#sec-7.8.3")] -#[derive(Debug, Spanned)] +#[derive(Debug, Spanned, Clone)] pub struct HexDigit { span: Span, raw: char, } -// TODO: Implement Lexical grammar for Identifier, rest of Number. +// TODO: Implement Lexical grammar for rest of Number. // TODO: Implement syntactical grammar. // TODO: Implement serde integration (+ fancy Spanned) diff --git a/src/lexing/tokens/string.rs b/src/lexing/tokens/string.rs index 5ecc243..d7b292f 100644 --- a/src/lexing/tokens/string.rs +++ b/src/lexing/tokens/string.rs @@ -57,13 +57,13 @@ impl LexT for LString { fn lex(input: &mut SourceStream) -> Result { input .lex() - .and_then(|opening| { + .and(|opening| { let contents = input.lex()?; let closing = input.lex().expected_msg(input, "Expected closing `\"`")?; LexResult::Lexed(Self::Double(opening, contents, closing)) }) .or(|| { - input.lex().and_then(|opening| { + input.lex().and(|opening| { let contents = input.lex()?; let closing = input.lex().expected_msg(input, "Expected closing `'`")?; LexResult::Lexed(Self::Single(opening, contents, closing)) @@ -93,7 +93,7 @@ impl LexT for StringPart { .or(|| input.lex().map(Self::PS)) .or(|| input.lex().map(Self::Char)) .or(|| { - input.lex().and_then(|backslash: v!('\\')| { + input.lex().and(|backslash: v!('\\')| { input .lex() .map(|esc| Self::Escape(backslash.clone(), esc)) @@ -143,6 +143,16 @@ pub trait CharacterValue { /// buffer, returning a slice of the bytes used. /// fn cv<'a, 'b: 'a>(&'a self, buf: &'b mut [u16; 2]) -> &'b [u16]; + + /// + /// Attempts to convert this utf-16 as a Rust char. 
+ /// + fn try_as_char(&self) -> Option { + let buf = &mut [0u16; 2]; + + let mut a = char::decode_utf16(self.cv(buf).iter().copied()); + a.next().and_then(Result::ok) + } } /// @@ -198,18 +208,28 @@ impl StringValue for LString { } } -impl StringValue for Many> { - fn sv(&self) -> Vec { - // Complete guesswork about the initial capacity: - // I'm assuming that we're not going to get too many multi-u16 chars. - let mut string = Vec::with_capacity(self.len() * 5 / 4); +/// +/// Collect character values as a UTF-16 string. +/// +pub fn collect_cv_into_utf16<'a, CV: CharacterValue + 'a>( + iter: impl IntoIterator + 'a, +) -> Vec { + let iter: Vec<_> = iter.into_iter().collect(); + // Complete guesswork about the initial capacity: + // I'm assuming that we're not going to get too many multi-u16 chars. + let mut string = Vec::with_capacity(iter.len() * 5 / 4); + + let buf = &mut [0; 2]; + for part in iter { + string.extend(part.cv(buf)) + } - let buf = &mut [0; 2]; - for part in self { - string.extend(part.cv(buf)) - } + string +} - string +impl StringValue for Many> { + fn sv(&self) -> Vec { + collect_cv_into_utf16(self.iter()) } } // --- diff --git a/src/lexing/utils/lex_impls.rs b/src/lexing/utils/lex_impls.rs index bfd3f59..0cc3ce4 100644 --- a/src/lexing/utils/lex_impls.rs +++ b/src/lexing/utils/lex_impls.rs @@ -25,7 +25,8 @@ impl Lex for Many { let mut v = vec![]; loop { - match ::lex(input) { + let res = ::lex(input); + match res { LexResult::Lexed(lexed) => v.push(lexed), LexResult::Errant(errant) => return LexResult::Errant(errant), LexResult::Nothing => break, @@ -45,7 +46,7 @@ impl Spanned for Many { /// /// At least `N` lots of `L`-tokens. /// -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct AtLeast(Vec); impl Lex for AtLeast { @@ -95,7 +96,7 @@ impl DerefMut for AtLeast { /// /// Exactly `N` lots of `L`-tokens: no more, no less. 
/// -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Exactly([L; N]) where [(); N]: Sized; diff --git a/src/lexing/utils/result.rs b/src/lexing/utils/result.rs index 6d50eba..063808a 100644 --- a/src/lexing/utils/result.rs +++ b/src/lexing/utils/result.rs @@ -39,6 +39,7 @@ impl<'a, S: Source> SourceStream<'a, S> { /// The rust of attempting parse token `L` /// from a [SourceStream]. /// +#[derive(Debug)] pub enum LexResult { /// /// Valid `L` token. @@ -167,7 +168,7 @@ impl LexResult { /// If this is [LexResult::Lexed], the mapper function will be called, /// and its return value is returned. /// - pub fn and_then LexResult>(self, mapper: F) -> LexResult { + pub fn and LexResult>(self, mapper: F) -> LexResult { match self { LexResult::Lexed(lexed) => mapper(lexed), LexResult::Errant(errant) => LexResult::Errant(errant), diff --git a/src/lexing/utils/unicode.rs b/src/lexing/utils/unicode.rs index 38e32a3..b12278e 100644 --- a/src/lexing/utils/unicode.rs +++ b/src/lexing/utils/unicode.rs @@ -26,7 +26,7 @@ use super::{LexError, LexT, SourceStream}; /// /// **Do not use me directly, use [crate::unicode] instead!** /// -#[derive(Debug, Spanned)] +#[derive(Debug, Spanned, Clone)] pub struct MatchMajorCategory { span: Span, raw: char, @@ -40,7 +40,7 @@ pub struct MatchMajorCategory { /// /// **Do not use me directly, use [crate::unicode] instead!** /// -#[derive(Debug, Spanned)] +#[derive(Debug, Spanned, Clone)] pub struct MatchMinorCategory { span: Span, raw: char, @@ -258,16 +258,17 @@ impl PartialEq for finl_unicode::categories::MinorCategory { mod tests { use avjason_macros::unicode; - use crate::{common::{file::SourceFile, Source}, lexing::Many}; + use crate::{ + common::{file::SourceFile, Source}, + lexing::Many, + }; type Letter = unicode!(Lu | Ll); #[test] fn test_lex() { - let source = - SourceFile::dummy_file("Apples"); + let source = SourceFile::dummy_file("Apples"); let input = &mut source.stream(); - let comment: Many = input.lex().expect("Valid parse"); 
- println!("{comment:?}"); + let _: Many = input.lex().expect("Valid parse"); } }