diff --git a/Cargo.lock b/Cargo.lock index 46443aae..f3833fb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1151,6 +1151,16 @@ dependencies = [ "phf_codegen", ] +[[package]] +name = "chumsky" +version = "1.0.0-alpha.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7b80276986f86789dc56ca6542d53bba9cda3c66091ebbe7bd96fc1bdf20f1f" +dependencies = [ + "hashbrown 0.14.5", + "unicode-ident", +] + [[package]] name = "cipher" version = "0.4.4" @@ -4787,6 +4797,29 @@ dependencies = [ "thiserror 2.0.9", ] +[[package]] +name = "sail-sql-macro" +version = "0.2.0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.91", +] + +[[package]] +name = "sail-sql-parser" +version = "0.2.0" +dependencies = [ + "chumsky", + "either", + "paste", + "phf", + "prettyplease", + "quote", + "sail-sql-macro", + "syn 2.0.91", +] + [[package]] name = "sail-telemetry" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 09d252ae..e2a6bd7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,8 @@ comfy-table = "7.1" html-escape = "0.2" syn = "2.0.91" quote = "1.0.37" +paste = "1.0.15" +proc-macro2 = "1.0.92" prettyplease = "0.2.24" phf = { version = "0.11.2", features = ["macros"] } ryu = "1.0.18" @@ -77,6 +79,7 @@ aes-gcm = "0.10.3" cbc = { version = "0.1.2", features = ["std"] } base64 = "0.22.1" md-5 = "0.10.6" +chumsky = { version = "1.0.0-alpha.7", default-features = false } ###### # The versions of the following dependencies are managed manually. diff --git a/crates/sail-spark-connect/build.rs b/crates/sail-spark-connect/build.rs index 135490b8..bca504e4 100644 --- a/crates/sail-spark-connect/build.rs +++ b/crates/sail-spark-connect/build.rs @@ -152,8 +152,6 @@ fn build_spark_config() -> Result<(), Box> { .collect::>(); let tokens = quote! 
{ - use phf::phf_map; - #[derive(Debug, Clone, PartialEq)] pub struct SparkConfigEntry<'a> { pub key: &'a str, @@ -176,16 +174,17 @@ fn build_spark_config() -> Result<(), Box> { // We define the map in a separate macro to avoid slowing down the IDE // when previewing the definition of `SPARK_CONFIG`. macro_rules! spark_config_map { - () => { phf_map! { #(#entries)* } } + () => { phf::phf_map! { #(#entries)* } } } pub static SPARK_CONFIG: phf::Map<&'static str, SparkConfigEntry<'static>> = spark_config_map!(); }; - let tree = syn::parse2(tokens)?; - let formatted = prettyplease::unparse(&tree); let out_dir = PathBuf::from(std::env::var("OUT_DIR")?); - std::fs::write(out_dir.join("spark_config.rs"), formatted)?; + std::fs::write( + out_dir.join("spark_config.rs"), + prettyplease::unparse(&syn::parse2(tokens)?), + )?; Ok(()) } diff --git a/crates/sail-sql-macro/Cargo.toml b/crates/sail-sql-macro/Cargo.toml new file mode 100644 index 00000000..66ec504d --- /dev/null +++ b/crates/sail-sql-macro/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "sail-sql-macro" +version.workspace = true +edition.workspace = true + +[dependencies] +syn = { workspace = true } +quote = { workspace = true } +proc-macro2 = { workspace = true } + +[lib] +proc-macro = true diff --git a/crates/sail-sql-macro/src/attribute.rs b/crates/sail-sql-macro/src/attribute.rs new file mode 100644 index 00000000..80072644 --- /dev/null +++ b/crates/sail-sql-macro/src/attribute.rs @@ -0,0 +1,113 @@ +use std::mem; + +use syn::punctuated::Punctuated; +use syn::spanned::Spanned; +use syn::{Attribute, Expr, Meta, MetaNameValue, Path, Token}; + +/// An extractor for a specific attribute name. +/// The attribute can have named arguments such as `#[attribute(argument = value)]`, +/// or paths such as `#[attribute(path)]`. +pub struct AttributeExtractor<'a> { + name: &'a str, + arguments: Vec, + paths: Vec, +} + +impl<'a> AttributeExtractor<'a> { + /// Creates an extractor for the given attribute name. 
+ /// The arguments and paths are collected from the attribute list and + /// stored in the extractor for further extraction. + pub fn try_new(name: &'a str, attributes: &[Attribute]) -> syn::Result { + let mut arguments = Vec::new(); + let mut paths = Vec::new(); + for attr in attributes { + if !attr.path().is_ident(name) { + continue; + } + let nested = attr.parse_args_with(Punctuated::::parse_terminated)?; + for meta in nested { + match meta { + Meta::Path(x) => { + paths.push(x); + } + Meta::NameValue(x) => { + arguments.push(x); + } + _ => return Err(syn::Error::new(meta.span(), "invalid attribute value")), + } + } + } + Ok(Self { + name, + arguments, + paths, + }) + } + + /// Returns an error if there are any remaining arguments or paths for the attribute. + pub fn expect_empty(&self) -> syn::Result<()> { + if let Some(x) = self.arguments.first() { + Err(syn::Error::new( + x.span(), + format!("unexpected `{}` attribute argument", self.name), + )) + } else if let Some(x) = self.paths.first() { + Err(syn::Error::new( + x.span(), + format!("unexpected `{}` attribute path", self.name), + )) + } else { + Ok(()) + } + } + + /// Extracts a single argument value from the attribute. + /// The argument is removed from the extractor. + /// Returns an error if there are multiple arguments with the same name. + pub fn extract_argument_value(&mut self, argument: &str, transform: F) -> syn::Result + where + F: FnOnce(Option) -> syn::Result, + { + let arguments = mem::take(&mut self.arguments); + let (mut extracted, remaining) = arguments + .into_iter() + .partition::, _>(|x| x.path.is_ident(argument)); + self.arguments = remaining; + let one = extracted.pop(); + if let Some(other) = extracted.last() { + Err(syn::Error::new( + other.span(), + format!( + "duplicated `{}` argument for the `{}` attribute", + argument, self.name + ), + )) + } else { + transform(one.map(|x| x.value)) + } + } + + /// Extracts a single path from the attribute. 
+ /// The path is removed from the extractor. + /// Returns an error if there are multiple paths with the same name. + #[allow(unused)] + pub fn extract_path(&mut self, path: &str) -> syn::Result> { + let paths = mem::take(&mut self.paths); + let (mut extracted, remaining) = paths + .into_iter() + .partition::, _>(|x| x.is_ident(path)); + self.paths = remaining; + let one = extracted.pop(); + if let Some(other) = extracted.last() { + Err(syn::Error::new( + other.span(), + format!( + "duplicated `{}` path for the `{}` attribute", + path, self.name + ), + )) + } else { + Ok(one.map(|_| ())) + } + } +} diff --git a/crates/sail-sql-macro/src/lib.rs b/crates/sail-sql-macro/src/lib.rs new file mode 100644 index 00000000..2b9d8d4b --- /dev/null +++ b/crates/sail-sql-macro/src/lib.rs @@ -0,0 +1,55 @@ +extern crate proc_macro; +extern crate proc_macro2; + +use proc_macro::TokenStream; +use syn::{parse_macro_input, DeriveInput}; + +mod attribute; +mod tree; +pub(crate) mod utils; + +/// Derives the `TreeParser` trait by generating a recursive descent parser for the type. +/// +/// The type can be an enum with struct or tuple variants, or a struct with named or unnamed fields. +/// For enums, the variants are parsed as choices (or nested choices for enums with many variants). +/// For structs, the fields are parsed sequentially. +/// +/// The parser cannot be derived for enums with unit variants, or structs with no fields. +/// The parser cannot be derived for types corresponding to a grammar with left recursion, +/// or a grammar requiring operator precedence handling. +/// In such cases, the `TreeParser` trait should be implemented manually. +/// `TreeParser` should also be implemented manually for terminals such as keywords, literals, and +/// operators. +/// +/// The attribute `parser` can be used to control how the parsers are derived. +/// There are a few supported arguments for the attribute. 
+/// +/// `parser(dependency = "type")` can be specified at the top level for the enum or the struct, +/// where `type` is a single type `T` or a tuple type `(T1, T2, ...)`. Note that the dependency +/// needs to be specified as a string literal. +/// For a single type `T`, the derived `parser()` method will expect a parser for `T` as the data. +/// For a tuple type `(T1, T2, ...)`, the derived `parser()` method will expect a tuple of parsers +/// for each type as the data. +/// +/// This argument is used to support recursive types, where the parser needs to first be declared +/// via `chumsky::recursive::Recursive::declare()`. `parser()` receives the declared parser(s) +/// and the returned parser can then be used for `chumsky::recursive::Recursive::define()`. +/// +/// If this argument is not specified, the `parser()` method will expect unit data (`()`). +/// +/// `parser(function = expr)` can be specified for individual fields (named or unnamed fields in +/// enum variants or structs), where `expr` is a function that takes the data (one or a tuple of +/// declared parsers) and returns the parser for the field. +/// +/// If this argument is not specified, the parser for the field is derived by calling the `parser()` +/// method of the field type with unit data (`()`). Such unit data is accepted for terminal parsers +/// or derived parsers without the `parser(dependency = "...")` attribute. +/// +/// The `parser` attribute is not allowed at the enum variant level. 
+#[proc_macro_derive(TreeParser, attributes(parser))] +pub fn derive_tree_parser(input: TokenStream) -> TokenStream { + let input = parse_macro_input!(input as DeriveInput); + tree::parser::derive_tree_parser(input) + .unwrap_or_else(syn::Error::into_compile_error) + .into() +} diff --git a/crates/sail-sql-macro/src/tree/mod.rs b/crates/sail-sql-macro/src/tree/mod.rs new file mode 100644 index 00000000..67c567fa --- /dev/null +++ b/crates/sail-sql-macro/src/tree/mod.rs @@ -0,0 +1 @@ +pub mod parser; diff --git a/crates/sail-sql-macro/src/tree/parser.rs b/crates/sail-sql-macro/src/tree/parser.rs new file mode 100644 index 00000000..e0ccd7cd --- /dev/null +++ b/crates/sail-sql-macro/src/tree/parser.rs @@ -0,0 +1,242 @@ +use proc_macro2::TokenStream; +use quote::{format_ident, quote}; +use syn::parse::Parse; +use syn::spanned::Spanned; +use syn::{Data, DeriveInput, Expr, Field, Fields, Ident, Type, Variant}; + +use crate::attribute::AttributeExtractor; +use crate::utils::parse_string_value; + +/// The trait to derive for tree parsers. +const TRAIT: &str = "TreeParser"; +/// The attribute name used when deriving the tree parser trait. +const ATTRIBUTE: &str = "parser"; + +/// Argument names used when deriving the tree parser trait. +struct AttributeArgument; + +impl AttributeArgument { + const DEPENDENCY: &'static str = "dependency"; + const FUNCTION: &'static str = "function"; +} + +/// The maximum number of choices in a flat list of parser choices. +/// If there are more choices, they will be grouped into nested choices. +/// `chumsky` allows at most 26 choices for `choice` so this number +/// must be less than that. 
+const MAX_CHOICES: usize = 20; + +enum ParserDependency { + None, + One(Type), + Tuple(Vec), +} + +impl ParserDependency { + fn extract(e: Option) -> syn::Result { + e.map(|value| { + let t = parse_string_value(&value, Type::parse)?; + match t { + Type::Path(ref path) if path.qself.is_none() => Ok(Self::One(t)), + Type::Tuple(tuple) => Ok(Self::Tuple(tuple.elems.into_iter().collect())), + _ => Err(syn::Error::new( + t.span(), + format!( + "`{}` must be a single type or a tuple type", + AttributeArgument::DEPENDENCY + ), + )), + } + }) + .unwrap_or_else(|| Ok(Self::None)) + } +} + +struct ParseFields { + parser: TokenStream, + args: TokenStream, + initializer: TokenStream, +} + +fn derive_fields_inner<'a>( + spanned: impl Spanned, + fields: impl IntoIterator, +) -> syn::Result { + fields + .into_iter() + .enumerate() + .try_fold(None, |acc, (i, field)| -> syn::Result<_> { + let field_function = { + let mut extractor = AttributeExtractor::try_new(ATTRIBUTE, &field.attrs)?; + let f = extractor.extract_argument_value(AttributeArgument::FUNCTION, Ok)?; + extractor.expect_empty()?; + f + }; + let field_arg = field + .ident + .to_owned() + .unwrap_or_else(|| format_ident!("v{}", i)); + let field_type = &field.ty; + let field_parser = if let Some(function) = field_function { + quote! { { let f = #function; f(data.clone()) } } + } else { + quote! { <#field_type>::parser(()) } + }; + match acc { + Some(ParseFields { + parser, + args, + initializer, + }) => Ok(Some(ParseFields { + parser: quote! { #parser.then(#field_parser) }, + args: quote! { (#args, #field_arg) }, + initializer: quote! { #initializer, #field_arg }, + })), + None => Ok(Some(ParseFields { + parser: field_parser, + args: quote! { #field_arg }, + initializer: quote! { #field_arg }, + })), + } + })? 
+ .ok_or_else(|| { + syn::Error::new( + spanned.span(), + format!("cannot derive `{TRAIT}` for no fields"), + ) + }) +} + +fn derive_fields( + name: TokenStream, + spanned: impl Spanned, + fields: &Fields, +) -> syn::Result { + match fields { + Fields::Named(fields) => { + let ParseFields { + parser, + args, + initializer, + } = derive_fields_inner(spanned, &fields.named)?; + Ok(quote! { + #parser.map(|#args| #name { #initializer }) + }) + } + Fields::Unnamed(fields) => { + let ParseFields { + parser, + args, + initializer, + } = derive_fields_inner(spanned, &fields.unnamed)?; + Ok(quote! { + #parser.map(|#args| #name ( #initializer )) + }) + } + Fields::Unit => Err(syn::Error::new( + spanned.span(), + format!("cannot derive `{TRAIT}` for unit fields"), + )), + } +} + +fn derive_enum_variant(enum_name: &Ident, variant: &Variant) -> syn::Result { + AttributeExtractor::try_new(ATTRIBUTE, &variant.attrs)?.expect_empty()?; + let variant_name = &variant.ident; + let name = quote! { #enum_name::#variant_name }; + derive_fields(name, variant, &variant.fields) +} + +fn derive_struct(struct_name: &Ident, fields: &Fields) -> syn::Result { + derive_fields(quote! { #struct_name }, fields, fields) +} + +fn derive_choices(choices: Vec) -> TokenStream { + let choices = if choices.len() <= MAX_CHOICES { + choices + } else { + let chunk_size = choices.len().div_ceil(MAX_CHOICES); + choices + .chunks(chunk_size) + .map(|chunk| derive_choices(chunk.to_vec())) + .collect() + }; + if choices.len() > 1 { + quote! { chumsky::prelude::choice((#(#choices),*)) } + } else { + quote! 
{ #(#choices),* } + } +} + +pub(crate) fn derive_tree_parser(input: DeriveInput) -> syn::Result { + let name = &input.ident; + + let parser = match &input.data { + Data::Enum(data) => { + if data.variants.is_empty() { + return Err(syn::Error::new( + data.variants.span(), + format!("cannot derive `{TRAIT}` for empty enums"), + )); + } + let choices = data + .variants + .iter() + .map(|variant| derive_enum_variant(name, variant)) + .collect::>>()?; + derive_choices(choices) + } + Data::Struct(data) => derive_struct(name, &data.fields)?, + _ => { + return Err(syn::Error::new( + input.span(), + format!("`{TRAIT}` can only be derived for enums or structs"), + )) + } + }; + + let dependency = { + let mut extractor = AttributeExtractor::try_new(ATTRIBUTE, &input.attrs)?; + let dep = extractor + .extract_argument_value(AttributeArgument::DEPENDENCY, ParserDependency::extract)?; + extractor.expect_empty()?; + dep + }; + let (generics, trait_generics, data, where_clause) = match dependency { + ParserDependency::One(t) => ( + quote! { <'a, P> }, + quote! { <'a, P> }, + quote! { P }, + quote! { where P: chumsky::Parser<'a, &'a [crate::token::Token<'a>], #t> + Clone}, + ), + ParserDependency::Tuple(t) => { + let params = (0..t.len()) + .map(|i| format_ident!("P{}", i + 1)) + .collect::>(); + let bounds = t + .iter() + .zip(params.iter()) + .map(|(t, p)| quote! { #p: chumsky::Parser<'a, &'a [crate::token::Token<'a>], #t> + Clone }) + .collect::>(); + ( + quote! { <'a, #(#params),*> }, + quote! { <'a, (#(#params),*,)> }, + quote! { (#(#params),*,) }, + quote! { where #(#bounds),* }, + ) + } + ParserDependency::None => (quote! { <'a> }, quote! { <'a> }, quote! { () }, quote! {}), + }; + + let trait_name = format_ident!("{TRAIT}"); + + Ok(quote! 
{ + impl #generics crate::tree::#trait_name #trait_generics for #name #where_clause { + fn parser(data: #data) -> impl chumsky::Parser<'a, &'a [crate::token::Token<'a>], Self> + Clone { + use chumsky::Parser; + + #parser + } + } + }) +} diff --git a/crates/sail-sql-macro/src/utils.rs b/crates/sail-sql-macro/src/utils.rs new file mode 100644 index 00000000..45b2e143 --- /dev/null +++ b/crates/sail-sql-macro/src/utils.rs @@ -0,0 +1,22 @@ +use syn::parse::Parser; +use syn::spanned::Spanned; +use syn::{Expr, ExprLit, Lit}; + +pub fn parse_string_value(e: &Expr, parser: F) -> syn::Result { + let Expr::Lit(ExprLit { + lit: Lit::Str(lit), .. + }) = e + else { + return Err(syn::Error::new( + e.span(), + "the value must be a string literal", + )); + }; + if !lit.suffix().is_empty() { + return Err(syn::Error::new( + lit.span(), + "the value cannot have a suffix", + )); + } + lit.parse_with(parser) +} diff --git a/crates/sail-sql-parser/Cargo.toml b/crates/sail-sql-parser/Cargo.toml new file mode 100644 index 00000000..761f7e3f --- /dev/null +++ b/crates/sail-sql-parser/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "sail-sql-parser" +version = { workspace = true } +edition = { workspace = true } + +[dependencies] +sail-sql-macro = { path = "../sail-sql-macro" } + +chumsky = { workspace = true } +phf = { workspace = true } +either = { workspace = true } +paste = { workspace = true } + +[build-dependencies] +prettyplease = { workspace = true } +syn = { workspace = true } +quote = { workspace = true } diff --git a/crates/sail-sql-parser/build.rs b/crates/sail-sql-parser/build.rs new file mode 100644 index 00000000..2a22641d --- /dev/null +++ b/crates/sail-sql-parser/build.rs @@ -0,0 +1,80 @@ +use std::path::PathBuf; + +use quote::{format_ident, quote}; + +/// Converts a SQL keyword string in `"SCREAMING_SNAKE_CASE"` to +/// a Rust identifier in `PascalCase`. 
+fn keyword_identifier(value: &str) -> String { + value + .split('_') + .map(|part| { + let part = part.to_lowercase(); + let mut part = part.chars(); + match part.next() { + Some(first) => first.to_uppercase().chain(part).collect::(), + None => String::new(), + } + }) + .collect::() +} + +/// Define macros that can be used to generate code for SQL keywords. +fn build_keywords_macros() -> Result<(), Box> { + println!("cargo:rerun-if-changed=data/keywords.txt"); + + let data = std::fs::read_to_string("data/keywords.txt")?; + + let keywords = data + .lines() + .map(|line| line.trim()) + .filter(|line| !line.is_empty() && !line.starts_with('#')) + .collect::>(); + let identifiers = keywords + .iter() + .map(|x| { + let ident = format_ident!("{}", keyword_identifier(x)); + quote! { #ident } + }) + .collect::>(); + let items = keywords + .iter() + .zip(identifiers.iter()) + .map(|(k, i)| quote! {(#k, #i)}) + .collect::>(); + let entries = keywords + .iter() + .zip(identifiers.iter()) + .map(|(k, i)| quote! { #k => $value!(#i) }) + .collect::>(); + + let tokens = quote! { + /// Invoke a `callback` macro for the keyword list. + /// The keyword list contains tuples where the first element is the keyword string, + /// and the second element is the keyword identifier. + macro_rules! for_all_keywords { + ($callback:ident) => { $callback!([#(#items,)*]); } + } + + /// Define a compile-time map of SQL keywords where the map key is the keyword string. + /// The `value` macro specifies how to define the map value given the keyword identifier. + /// Note that we cannot define the map via the `for_all_keywords` macro because + /// `phf::phf_map` requires the key to be a string literal, and Rust macros do not + /// support eager expansion in general. + macro_rules! keyword_map { + ($value:ident) => { phf::phf_map! 
{ #(#entries,)* } } + } + }; + + let out_dir = PathBuf::from(std::env::var("OUT_DIR")?); + std::fs::write( + out_dir.join("keywords.rs"), + prettyplease::unparse(&syn::parse2(tokens)?), + )?; + Ok(()) +} + +fn main() -> Result<(), Box> { + println!("cargo:rerun-if-changed=build.rs"); + build_keywords_macros()?; + Ok(()) +} diff --git a/crates/sail-sql-parser/data/keywords.txt b/crates/sail-sql-parser/data/keywords.txt new file mode 100644 index 00000000..96637f95 --- /dev/null +++ b/crates/sail-sql-parser/data/keywords.txt @@ -0,0 +1,346 @@ +# This is the list of SQL keywords. +# The keywords must be defined in ASCII order and must be unique. +# A line starting with '#' is a comment. +ADD +AFTER +ALL +ALTER +ALWAYS +ANALYZE +AND +ANTI +ANY +ANY_VALUE +ARCHIVE +ARRAY +AS +ASC +AT +AUTHORIZATION +BETWEEN +BIGINT +BINARY +BOOL +BOOLEAN +BOTH +BUCKET +BUCKETS +BY +BYTE +BYTEA +CACHE +CASCADE +CASE +CAST +CATALOG +CATALOGS +CHANGE +CHAR +CHARACTER +CHECK +CLEAR +CLUSTER +CLUSTERED +CODEGEN +COLLATE +COLLECTION +COLUMN +COLUMNS +COMMENT +COMMIT +COMPACT +COMPACTIONS +COMPUTE +CONCATENATE +CONSTRAINT +COST +CREATE +CROSS +CUBE +CURRENT +CURRENT_DATE +CURRENT_TIME +CURRENT_TIMESTAMP +CURRENT_USER +DATA +DATABASE +DATABASES +DATE +DATE32 +DATE64 +DATEADD +DATEDIFF +DATE_ADD +DATE_DIFF +DAY +DAYOFYEAR +DAYS +DBPROPERTIES +DEC +DECIMAL +DEFAULT +DEFINED +DELETE +DELIMITED +DESC +DESCRIBE +DFS +DIRECTORIES +DIRECTORY +DISTINCT +DISTRIBUTE +DIV +DOUBLE +DROP +ELSE +END +ESCAPE +ESCAPED +EXCEPT +EXCHANGE +EXCLUDE +EXISTS +EXPLAIN +EXPORT +EXTENDED +EXTERNAL +EXTRACT +FALSE +FETCH +FIELDS +FILEFORMAT +FILTER +FIRST +FLOAT +FLOAT32 +FLOAT64 +FOLLOWING +FOR +FOREIGN +FORMAT +FORMATTED +FROM +FULL +FUNCTION +FUNCTIONS +GENERATED +GLOBAL +GRANT +GROUP +GROUPING +HAVING +HOUR +HOURS +IDENTIFIER +IF +IGNORE +ILIKE +IMPORT +IN +INCLUDE +INDEX +INDEXES +INNER +INPATH +INPUTFORMAT +INSERT +INT +INT16 +INT32 +INT64 +INT8 +INTEGER +INTERSECT +INTERVAL +INTO +IS +ITEMS +JOIN +KEYS +LAST 
+LATERAL +LAZY +LEADING +LEFT +LIKE +LIMIT +LINES +LIST +LOAD +LOCAL +LOCATION +LOCK +LOCKS +LOGICAL +LONG +MACRO +MAP +MATCHED +MERGE +MICROSECOND +MICROSECONDS +MILLISECOND +MILLISECONDS +MINUS +MINUTE +MINUTES +MONTH +MONTHS +MSCK +NAME +NAMESPACE +NAMESPACES +NANOSECOND +NANOSECONDS +NATURAL +NO +NOT +NULL +NULLS +NUMERIC +OF +OFFSET +ON +ONLY +OPTION +OPTIONS +OR +ORDER +OUT +OUTER +OUTPUTFORMAT +OVER +OVERLAPS +OVERLAY +OVERWRITE +PARTITION +PARTITIONED +PARTITIONS +PERCENT +PERCENTILE_CONT +PERCENTILE_DISC +PIVOT +PLACING +POSITION +PRECEDING +PRIMARY +PRINCIPALS +PROPERTIES +PURGE +QUARTER +QUERY +RANGE +REAL +RECORDREADER +RECORDWRITER +RECOVER +REDUCE +REFERENCES +REFRESH +REGEXP +RENAME +REPAIR +REPEATABLE +REPLACE +RESET +RESPECT +RESTRICT +REVOKE +RIGHT +RLIKE +ROLE +ROLES +ROLLBACK +ROLLUP +ROW +ROWS +SCHEMA +SCHEMAS +SECOND +SECONDS +SELECT +SEMI +SEPARATED +SERDE +SERDEPROPERTIES +SESSION_USER +SET +SETS +SHORT +SHOW +SKEWED +SMALLINT +SOME +SORT +SORTED +SOURCE +START +STATISTICS +STORED +STRATIFY +STRING +STRUCT +SUBSTR +SUBSTRING +SYNC +SYSTEM_TIME +SYSTEM_VERSION +TABLE +TABLES +TABLESAMPLE +TARGET +TBLPROPERTIES +TEMP +TEMPORARY +TERMINATED +THEN +TIME +TIMESTAMP +TIMESTAMPADD +TIMESTAMPDIFF +TIMESTAMP_LTZ +TIMESTAMP_NTZ +TINYINT +TO +TOUCH +TRAILING +TRANSACTION +TRANSACTIONS +TRANSFORM +TRIM +TRUE +TRUNCATE +TRY_CAST +TYPE +UNARCHIVE +UNBOUNDED +UNCACHE +UNION +UNIQUE +UNKNOWN +UNLOCK +UNPIVOT +UNSET +UNSIGNED +UPDATE +USE +USER +USING +VALUES +VARCHAR +VERSION +VIEW +VIEWS +VOID +WEEK +WEEKS +WHEN +WHERE +WINDOW +WITH +WITHIN +WITHOUT +X +YEAR +YEARS +ZONE diff --git a/crates/sail-sql-parser/src/ast/container/either.rs b/crates/sail-sql-parser/src/ast/container/either.rs new file mode 100644 index 00000000..766b2c9e --- /dev/null +++ b/crates/sail-sql-parser/src/ast/container/either.rs @@ -0,0 +1,17 @@ +use chumsky::Parser; +use either::Either; + +use crate::container::either_or; +use crate::token::Token; +use crate::tree::TreeParser; + 
+impl<'a, L, R, D> TreeParser<'a, D> for Either +where + L: TreeParser<'a, D>, + R: TreeParser<'a, D>, + D: Clone, +{ + fn parser(data: D) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + either_or(L::parser(data.clone()), R::parser(data)) + } +} diff --git a/crates/sail-sql-parser/src/ast/container/mod.rs b/crates/sail-sql-parser/src/ast/container/mod.rs new file mode 100644 index 00000000..3a380eb0 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/container/mod.rs @@ -0,0 +1,4 @@ +mod either; +mod option; +mod sequence; +mod tuple; diff --git a/crates/sail-sql-parser/src/ast/container/option.rs b/crates/sail-sql-parser/src/ast/container/option.rs new file mode 100644 index 00000000..082ec278 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/container/option.rs @@ -0,0 +1,13 @@ +use chumsky::Parser; + +use crate::token::Token; +use crate::tree::TreeParser; + +impl<'a, T, D> TreeParser<'a, D> for Option +where + T: TreeParser<'a, D>, +{ + fn parser(data: D) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + T::parser(data).or_not() + } +} diff --git a/crates/sail-sql-parser/src/ast/container/sequence.rs b/crates/sail-sql-parser/src/ast/container/sequence.rs new file mode 100644 index 00000000..1eed095f --- /dev/null +++ b/crates/sail-sql-parser/src/ast/container/sequence.rs @@ -0,0 +1,16 @@ +use chumsky::Parser; + +use crate::container::{sequence, Sequence}; +use crate::token::Token; +use crate::tree::TreeParser; + +impl<'a, T, S, D> TreeParser<'a, D> for Sequence +where + T: TreeParser<'a, D>, + S: TreeParser<'a, D>, + D: Clone, +{ + fn parser(data: D) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + sequence(T::parser(data.clone()), S::parser(data)) + } +} diff --git a/crates/sail-sql-parser/src/ast/container/tuple.rs b/crates/sail-sql-parser/src/ast/container/tuple.rs new file mode 100644 index 00000000..75ee4a21 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/container/tuple.rs @@ -0,0 +1,40 @@ +use chumsky::Parser; +use paste::paste; + +use 
crate::token::Token; +use crate::tree::TreeParser; + +macro_rules! nested { + (@fold $acc:tt) => { $acc }; + (@fold $acc:tt $head:ident $($tail:ident)* ) => { nested!(@fold ($acc, $head) $($tail)*) }; + ($T1:ident $($Ts:ident)*) => { nested!(@fold $T1 $($Ts)*) }; +} + +macro_rules! impl_tree_parser_for_tuple { + ($T:ident $(,$Ts:ident)*) => { + impl<'a, $T $(,$Ts)*, D> TreeParser<'a, D> for ($T, $($Ts,)*) + where + $T: TreeParser<'a, D> + $(,$Ts: TreeParser<'a, D>)* + , D: Clone + { + fn parser(data: D) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone{ + let parser = $T::parser(data.clone()) + $(.then($Ts::parser(data.clone())))*; + paste! { + parser.map(|nested!([<$T:lower>] $([<$Ts:lower>])*)| ([<$T:lower>], $([<$Ts:lower>],)*)) + } + } + } + }; +} + +impl_tree_parser_for_tuple!(T1); +impl_tree_parser_for_tuple!(T1, T2); +impl_tree_parser_for_tuple!(T1, T2, T3); +impl_tree_parser_for_tuple!(T1, T2, T3, T4); +impl_tree_parser_for_tuple!(T1, T2, T3, T4, T5); +impl_tree_parser_for_tuple!(T1, T2, T3, T4, T5, T6); +impl_tree_parser_for_tuple!(T1, T2, T3, T4, T5, T6, T7); +impl_tree_parser_for_tuple!(T1, T2, T3, T4, T5, T6, T7, T8); +impl_tree_parser_for_tuple!(T1, T2, T3, T4, T5, T6, T7, T8, T9); diff --git a/crates/sail-sql-parser/src/ast/data_type.rs b/crates/sail-sql-parser/src/ast/data_type.rs new file mode 100644 index 00000000..3e1cb01e --- /dev/null +++ b/crates/sail-sql-parser/src/ast/data_type.rs @@ -0,0 +1,145 @@ +use sail_sql_macro::TreeParser; + +use crate::ast::identifier::Ident; +use crate::ast::keywords::{ + Array, Bigint, Binary, Bool, Boolean, Byte, Bytea, Char, Character, Comment, Date, Date32, + Date64, Decimal, Double, Float, Float32, Float64, Hour, Int, Int16, Int32, Int64, Int8, + Integer, Interval, Local, Long, Map, Minute, Month, Not, Null, Second, Short, Smallint, Struct, + Time, Timestamp, Tinyint, To, Unsigned, Varchar, Void, With, Without, Year, Zone, +}; +use crate::ast::literal::StringLiteral; +use crate::ast::operator::{ + Colon, Comma,
GreaterThan, LeftParenthesis, LessThan, RightParenthesis, +}; +use crate::ast::value::IntegerValue; +use crate::container::{boxed, sequence, Sequence}; + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "DataType")] +pub enum DataType { + Null(Null), + Void(Void), + Boolean(Boolean), + Bool(Bool), + TinyInt(Option, Tinyint), + SmallInt(Option, Smallint), + Int(Option, Int), + BigInt(Option, Bigint), + Byte(Option, Byte), + Short(Option, Short), + Integer(Option, Integer), + Long(Option, Long), + Int8(Option, Int8), + Int16(Option, Int16), + Int32(Option, Int32), + Int64(Option, Int64), + Binary(Binary), + Bytea(Bytea), + Float(Float), + Double(Double), + Float32(Float32), + Float64(Float64), + #[allow(clippy::type_complexity)] + Decimal( + Decimal, + Option<( + LeftParenthesis, + IntegerValue, + Option<(Comma, IntegerValue)>, + RightParenthesis, + )>, + ), + Char( + Char, + Option<(LeftParenthesis, IntegerValue, RightParenthesis)>, + ), + Character( + Character, + Option<(LeftParenthesis, IntegerValue, RightParenthesis)>, + ), + Varchar(Varchar, LeftParenthesis, IntegerValue, RightParenthesis), + String(crate::ast::keywords::String), + Timestamp( + Timestamp, + Option<(LeftParenthesis, IntegerValue, RightParenthesis)>, + Option, + ), + Date(Date), + Date32(Date32), + Date64(Date64), + Interval(IntervalType), + Array( + Array, + LessThan, + #[parser(function = |x| boxed(x))] Box, + GreaterThan, + ), + Struct( + Struct, + LessThan, + #[parser(function = |x| boxed(sequence(StructField::parser(x), Comma::parser(()))))] + Box>, + GreaterThan, + ), + Map( + Map, + LessThan, + #[parser(function = |x| boxed(x))] Box, + Comma, + #[parser(function = |x| boxed(x))] Box, + GreaterThan, + ), +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +pub enum IntervalType { + YearMonth( + Interval, + IntervalYearMonthUnit, + Option<(To, IntervalYearMonthUnit)>, + ), + DayTime( + Interval, + IntervalDayTimeUnit, + Option<(To, IntervalDayTimeUnit)>, 
+ ), +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +pub enum IntervalYearMonthUnit { + Year(Year), + Month(Month), +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +pub enum IntervalDayTimeUnit { + Day(crate::ast::keywords::Day), + Hour(Hour), + Minute(Minute), + Second(Second), +} + +#[allow(unused)] +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Clone, TreeParser)] +pub enum TimezoneType { + WithTimeZone(With, Time, Zone), + WithoutTimeZone(Without, Time, Zone), + WithLocalTimeZone(With, Local, Time, Zone), +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "DataType")] +pub struct StructField { + pub identifier: Ident, + pub colon: Option, + #[parser(function = |x| x)] + pub data_type: DataType, + pub not_null: Option<(Not, Null)>, + pub comment: Option<(Comment, StringLiteral)>, +} diff --git a/crates/sail-sql-parser/src/ast/expression.rs b/crates/sail-sql-parser/src/ast/expression.rs new file mode 100644 index 00000000..b1f909c7 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/expression.rs @@ -0,0 +1,20 @@ +use sail_sql_macro::TreeParser; + +use crate::ast::identifier::ObjectName; +use crate::ast::literal::{NumberLiteral, StringLiteral}; +use crate::ast::operator::{LeftParenthesis, RightParenthesis}; +use crate::ast::query::Query; +use crate::container::boxed; + +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "(Expr, Query)")] +pub enum Expr { + StringLiteral(StringLiteral), + NumberLiteral(NumberLiteral), + ObjectName(ObjectName), + Parenthesized( + LeftParenthesis, + #[parser(function = |(e, _)| boxed(e))] Box, + RightParenthesis, + ), +} diff --git a/crates/sail-sql-parser/src/ast/identifier.rs b/crates/sail-sql-parser/src/ast/identifier.rs new file mode 100644 index 00000000..53eb49d1 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/identifier.rs @@ -0,0 +1,40 @@ +use chumsky::error::EmptyErr; +use chumsky::prelude::any; +use chumsky::Parser; +use sail_sql_macro::TreeParser; + +use
crate::ast::operator::Comma; +use crate::ast::whitespace::whitespace; +use crate::container::Sequence; +use crate::token::{Token, TokenSpan, TokenValue}; +use crate::tree::TreeParser; + +#[allow(unused)] +#[derive(Debug, Clone)] +pub struct Ident { + pub span: TokenSpan, + pub value: String, +} + +impl<'a> TreeParser<'a> for Ident { + fn parser(_: ()) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + any() + .try_map(|t: Token<'a>, _| match t { + Token { + value: TokenValue::Word { keyword: _, raw }, + span, + } => Ok(Ident { + span, + // FIXME: handle delimited identifiers + // FIXME: handle escape strings + value: raw.to_string(), + }), + _ => Err(EmptyErr::default()), + }) + .then_ignore(whitespace().repeated()) + } +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +pub struct ObjectName(pub Sequence); diff --git a/crates/sail-sql-parser/src/ast/keywords.rs b/crates/sail-sql-parser/src/ast/keywords.rs new file mode 100644 index 00000000..63052124 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/keywords.rs @@ -0,0 +1,53 @@ +use chumsky::primitive::any; +use chumsky::Parser; + +use crate::ast::whitespace::whitespace; +use crate::token::{Keyword, Token, TokenSpan, TokenValue}; +use crate::tree::TreeParser; + +fn keyword_parser<'a, K, F>( + keyword: Keyword, + builder: F, +) -> impl Parser<'a, &'a [Token<'a>], K> + Clone +where + F: Fn(TokenSpan) -> K + Clone + 'static, +{ + any() + .filter(move |t| match t { + Token { + value: TokenValue::Word { + keyword: Some(k), .. + }, + .. + } => *k == keyword, + _ => false, + }) + .then_ignore(whitespace().repeated()) + .map(move |t| builder(t.span)) +} + +macro_rules! 
keyword_types { + ([$(($_:expr, $identifier:ident),)* $(,)?]) => { + $( + #[allow(unused)] + #[derive(Debug, Clone)] + pub struct $identifier { + pub span: TokenSpan, + } + + impl $identifier { + pub const fn keyword() -> Keyword { + Keyword::$identifier + } + } + + impl<'a> TreeParser<'a> for $identifier { + fn parser(_: ()) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + keyword_parser(Self::keyword(), |span| Self { span }) + } + } + )* + } +} + +for_all_keywords!(keyword_types); diff --git a/crates/sail-sql-parser/src/ast/literal.rs b/crates/sail-sql-parser/src/ast/literal.rs new file mode 100644 index 00000000..36143e33 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/literal.rs @@ -0,0 +1,60 @@ +use chumsky::error::EmptyErr; +use chumsky::prelude::any; +use chumsky::Parser; + +use crate::ast::whitespace::whitespace; +use crate::token::{StringStyle, Token, TokenSpan, TokenValue}; +use crate::tree::TreeParser; + +#[allow(unused)] +#[derive(Debug, Clone)] +pub struct NumberLiteral { + pub span: TokenSpan, + pub value: String, + pub suffix: String, +} + +impl<'a> TreeParser<'a> for NumberLiteral { + fn parser(_: ()) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + any() + .try_map(|t: Token<'a>, _| match t { + Token { + value: TokenValue::Number { value, suffix }, + span, + } => Ok(NumberLiteral { + span, + value: value.to_string(), + suffix: suffix.to_string(), + }), + _ => Err(EmptyErr::default()), + }) + .then_ignore(whitespace().repeated()) + } +} + +#[allow(unused)] +#[derive(Debug, Clone)] +pub struct StringLiteral { + pub span: TokenSpan, + pub value: String, + pub style: StringStyle, +} + +impl<'a> TreeParser<'a> for StringLiteral { + fn parser(_: ()) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + any() + .try_map(|t: Token<'a>, _| match t { + Token { + value: TokenValue::String { raw, style }, + span, + } => Ok(StringLiteral { + span, + // FIXME: handle escape strings + value: raw.to_string(), + style, + }), + _ => 
Err(EmptyErr::default()), + }) + .then_ignore(whitespace().repeated()) + } +} diff --git a/crates/sail-sql-parser/src/ast/mod.rs b/crates/sail-sql-parser/src/ast/mod.rs new file mode 100644 index 00000000..474fc8fc --- /dev/null +++ b/crates/sail-sql-parser/src/ast/mod.rs @@ -0,0 +1,11 @@ +mod container; +pub mod data_type; +pub mod expression; +pub mod identifier; +pub mod keywords; +pub mod literal; +pub mod operator; +pub mod query; +pub mod statement; +pub mod value; +pub mod whitespace; diff --git a/crates/sail-sql-parser/src/ast/operator.rs b/crates/sail-sql-parser/src/ast/operator.rs new file mode 100644 index 00000000..85c87a7c --- /dev/null +++ b/crates/sail-sql-parser/src/ast/operator.rs @@ -0,0 +1,64 @@ +use chumsky::error::EmptyErr; +use chumsky::prelude::custom; +use chumsky::Parser; + +use crate::ast::whitespace::whitespace; +use crate::token::{Punctuation, Token, TokenSpan, TokenValue}; +use crate::tree::TreeParser; + +fn operator_parser<'a, O, F>( + punctuations: &'static [Punctuation], + builder: F, +) -> impl Parser<'a, &'a [Token<'a>], O> + Clone +where + F: Fn(TokenSpan) -> O + Clone + 'static, +{ + custom(move |input| { + let mut span = TokenSpan::default(); + for punctuation in punctuations { + match input.next() { + Some(Token { + value: TokenValue::Punctuation(p), + span: s, + }) if p == *punctuation => { + span = span.union(&s); + } + _ => return Err(EmptyErr::default()), + } + } + Ok(span) + }) + .then_ignore(whitespace().repeated()) + .map(builder) +} + +macro_rules! 
define_operator { + ($identifier:ident, [$($punctuation:ident),*]) => { + #[allow(unused)] + #[derive(Debug, Clone)] + pub struct $identifier { + pub span: TokenSpan, + } + + impl $identifier { + pub const fn punctuations() -> &'static [Punctuation] { + &[$(Punctuation::$punctuation),*] + } + } + + impl<'a> TreeParser<'a> for $identifier { + fn parser(_: ()) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + operator_parser(Self::punctuations(), |span| Self { span }) + } + } + }; +} + +define_operator!(LeftParenthesis, [LeftParenthesis]); +define_operator!(RightParenthesis, [RightParenthesis]); +define_operator!(LessThan, [LessThan]); +define_operator!(GreaterThan, [GreaterThan]); +define_operator!(Comma, [Comma]); +define_operator!(Period, [Period]); +define_operator!(Colon, [Colon]); +define_operator!(Semicolon, [Semicolon]); diff --git a/crates/sail-sql-parser/src/ast/query.rs b/crates/sail-sql-parser/src/ast/query.rs new file mode 100644 index 00000000..d7772716 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/query.rs @@ -0,0 +1,31 @@ +use sail_sql_macro::TreeParser; + +use crate::ast::expression::Expr; +use crate::ast::keywords::{Select, With}; +use crate::ast::operator::Comma; +use crate::container::sequence; +use crate::Sequence; + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "(Query, Expr)")] +pub struct Query { + pub with_clause: Option, + #[parser(function = |(q, e)| SelectClause::parser((q, e)))] + pub select_clause: SelectClause, +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +pub struct WithClause { + pub with: With, +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "(Query, Expr)")] +pub struct SelectClause { + pub select: Select, + #[parser(function = |(q, e)| sequence(Expr::parser((e, q)), Comma::parser(())))] + pub expressions: Sequence, +} diff --git a/crates/sail-sql-parser/src/ast/statement/explain.rs b/crates/sail-sql-parser/src/ast/statement/explain.rs new 
file mode 100644 index 00000000..6b69ba6a --- /dev/null +++ b/crates/sail-sql-parser/src/ast/statement/explain.rs @@ -0,0 +1,25 @@ +use sail_sql_macro::TreeParser; + +use crate::ast::keywords::{Codegen, Cost, Explain, Extended, Formatted, Logical}; +use crate::ast::statement::Statement; +use crate::container::boxed; + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "Statement")] +pub struct ExplainStatement { + pub explain: Explain, + pub format: Option, + #[parser(function = boxed)] + pub statement: Box, +} + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +pub enum ExplainFormat { + Logical(Logical), + Formatted(Formatted), + Extended(Extended), + Codegen(Codegen), + Cost(Cost), +} diff --git a/crates/sail-sql-parser/src/ast/statement/mod.rs b/crates/sail-sql-parser/src/ast/statement/mod.rs new file mode 100644 index 00000000..5186d701 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/statement/mod.rs @@ -0,0 +1,16 @@ +use sail_sql_macro::TreeParser; + +use crate::ast::data_type::DataType; +use crate::ast::expression::Expr; +use crate::ast::query::Query; +use crate::ast::statement::explain::ExplainStatement; + +pub mod explain; + +#[allow(unused)] +#[derive(Debug, Clone, TreeParser)] +#[parser(dependency = "(Statement, Query, Expr, DataType)")] +pub enum Statement { + Query(#[parser(function = |(_, q, e, _)| Query::parser((q, e)))] Query), + Explain(#[parser(function = |(s, _, _, _)| ExplainStatement::parser(s))] ExplainStatement), +} diff --git a/crates/sail-sql-parser/src/ast/value.rs b/crates/sail-sql-parser/src/ast/value.rs new file mode 100644 index 00000000..dac37850 --- /dev/null +++ b/crates/sail-sql-parser/src/ast/value.rs @@ -0,0 +1,31 @@ +use chumsky::error::EmptyErr; +use chumsky::prelude::any; +use chumsky::Parser; + +use crate::ast::whitespace::whitespace; +use crate::token::{Token, TokenSpan, TokenValue}; +use crate::tree::TreeParser; + +#[allow(unused)] +#[derive(Debug, Clone)] +pub struct IntegerValue { + pub 
span: TokenSpan, + pub value: u64, +} + +impl<'a> TreeParser<'a> for IntegerValue { + fn parser(_: ()) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone { + any() + .try_map(|t: Token<'a>, _| match t { + Token { + value: TokenValue::Number { value, suffix: "" }, + span, + } => { + let value = value.parse().map_err(|_| EmptyErr::default())?; + Ok(IntegerValue { span, value }) + } + _ => Err(EmptyErr::default()), + }) + .then_ignore(whitespace().repeated()) + } +} diff --git a/crates/sail-sql-parser/src/ast/whitespace.rs b/crates/sail-sql-parser/src/ast/whitespace.rs new file mode 100644 index 00000000..f451888b --- /dev/null +++ b/crates/sail-sql-parser/src/ast/whitespace.rs @@ -0,0 +1,20 @@ +use chumsky::prelude::any; +use chumsky::Parser; + +use crate::token::{Token, TokenValue}; + +pub fn whitespace<'a>() -> impl Parser<'a, &'a [Token<'a>], ()> + Clone { + any() + .filter(|t: &Token<'a>| { + matches!( + t.value, + TokenValue::Space { .. } + | TokenValue::Tab { .. } + | TokenValue::LineFeed { .. } + | TokenValue::CarriageReturn { .. } + | TokenValue::SingleLineComment { .. } + | TokenValue::MultiLineComment { .. } + ) + }) + .ignored() +} diff --git a/crates/sail-sql-parser/src/container.rs b/crates/sail-sql-parser/src/container.rs new file mode 100644 index 00000000..737a610b --- /dev/null +++ b/crates/sail-sql-parser/src/container.rs @@ -0,0 +1,45 @@ +use chumsky::input::Input; +use chumsky::{IterParser, Parser}; +use either::Either; + +/// A sequence of item type `T` and separator type `S`. 
+#[allow(unused)] +#[derive(Debug, Clone)] +pub struct Sequence { + pub head: T, + pub tail: Vec<(S, T)>, +} + +pub fn sequence<'a, I, T, S, PT, PS>( + item: PT, + seperator: PS, +) -> impl Parser<'a, I, Sequence> + Clone +where + I: Input<'a>, + PT: Parser<'a, I, T> + Clone, + PS: Parser<'a, I, S> + Clone, +{ + item.clone() + .then(seperator.then(item).repeated().collect()) + .map(|(head, tail)| Sequence { head, tail }) +} + +pub fn boxed<'a, I, O, P>(parser: P) -> impl Parser<'a, I, Box> + Clone +where + P: Parser<'a, I, O> + Clone, + I: Input<'a>, +{ + parser.map(Box::new) +} + +pub fn either_or<'a, I, L, R, PL, PR>( + left: PL, + right: PR, +) -> impl Parser<'a, I, Either> + Clone +where + I: Input<'a>, + PL: Parser<'a, I, L> + Clone, + PR: Parser<'a, I, R> + Clone, +{ + left.map(Either::Left).or(right.map(Either::Right)) +} diff --git a/crates/sail-sql-parser/src/lexer.rs b/crates/sail-sql-parser/src/lexer.rs new file mode 100644 index 00000000..04ecde71 --- /dev/null +++ b/crates/sail-sql-parser/src/lexer.rs @@ -0,0 +1,280 @@ +use chumsky::error::EmptyErr; +use chumsky::prelude::{any, choice, custom, end, just, none_of, one_of, SimpleSpan}; +use chumsky::{ConfigParser, IterParser, Parser}; + +use crate::options::{QuoteEscape, SqlParserOptions}; +use crate::token::{Keyword, Punctuation, StringStyle, Token, TokenSpan, TokenValue}; + +macro_rules! 
token { + ($value:expr, $extra:expr) => { + Token::new($value, $extra.span()) + }; +} + +impl From for TokenSpan { + fn from(span: SimpleSpan) -> Self { + TokenSpan { + start: span.start, + end: span.end, + } + } +} + +fn word<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + any() + .filter(|c: &char| c.is_ascii_alphabetic() || *c == '_') + .ignore_then( + any() + .filter(|c: &char| c.is_ascii_alphanumeric() || *c == '_') + .repeated(), + ) + .map_with(|(), e| { + let keyword = Keyword::get(e.slice()); + token!( + TokenValue::Word { + raw: e.slice(), + keyword + }, + e + ) + }) +} + +fn number<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + let digit = any().filter(|c: &char| c.is_ascii_digit()); + let suffix = any().filter(|c: &char| c.is_ascii_alphabetic()).repeated(); + + let value = digit + .repeated() + .at_least(1) + .then(just('.').then(digit.repeated()).or_not()) + .ignored(); + let decimal_only_value = just('.').then(digit.repeated().at_least(1)).ignored(); + let exponent = one_of("eE") + .then(one_of("+-").or_not()) + .then(digit.repeated().at_least(1)) + .or_not() + .ignored(); + + value + .or(decimal_only_value) + .then(exponent) + .to_slice() + .then(suffix.to_slice()) + .map_with(|(value, suffix), e| token!(TokenValue::Number { value, suffix }, e)) +} + +fn single_line_comment<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + just("--") + .ignore_then(none_of("\n\r").repeated()) + .map_with(|(), e| token!(TokenValue::SingleLineComment { raw: e.slice() }, e)) +} + +fn multi_line_comment<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + // The delimiter of a multi-line comment can be nested. + // We implement a custom parser to handle this. + // This avoids the overhead of creating a `recursive` parser in the lexer. 
+ custom(|input| { + let mut last = None; + let mut level = 0; + loop { + let c = input.next(); + match (last, c) { + (None, Some('/')) => {} + (None, _) => break, + (Some('/'), Some('*')) => { + level += 1; + } + (Some('/'), Some(_)) => { + if level == 0 { + break; + } + } + (Some('*'), Some('/')) => { + level -= 1; + if level == 0 { + return Ok(()); + } + } + (Some(_), Some(_)) => {} + (_, None) => break, + } + last = c; + } + Err(EmptyErr::default()) + }) + .map_with(|(), e| token!(TokenValue::MultiLineComment { raw: e.slice() }, e)) +} + +fn none_quote_escaped_text<'a>(delimiter: char) -> impl Parser<'a, &'a str, ()> { + none_of(delimiter).repeated().padded_by(just(delimiter)) +} + +fn dual_quote_escaped_text<'a>(delimiter: char) -> impl Parser<'a, &'a str, ()> { + none_of(delimiter) + .ignored() + .or(just(delimiter).repeated().exactly(2)) + .repeated() + .padded_by(just(delimiter)) +} + +fn backslash_quote_escaped_text<'a>(delimiter: char) -> impl Parser<'a, &'a str, ()> { + any() + .filter(move |c: &char| *c != '\\' && *c != delimiter) + .ignored() + .or(just('\\').then(any()).ignored()) + .repeated() + .padded_by(just(delimiter)) +} + +fn multi_quoted_text<'a>(delimiter: &'static str) -> impl Parser<'a, &'a str, ()> { + any() + .and_is(just(delimiter).not()) + .repeated() + .padded_by(just(delimiter)) +} + +fn quoted_string<'a, P, S>(text: P, style: S) -> impl Parser<'a, &'a str, Token<'a>> +where + P: Parser<'a, &'a str, ()>, + S: Fn(Option) -> StringStyle + 'static, +{ + any() + .filter(|c: &char| c.is_ascii_alphabetic()) + .or_not() + .then_ignore(text) + .map_with(move |prefix, e| { + token!( + TokenValue::String { + raw: e.slice(), + style: style(prefix) + }, + e + ) + }) +} + +fn backtick_quoted_string<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + // TODO: Should we support escaping backticks? 
+ none_of('`') + .repeated() + .padded_by(just('`')) + .map_with(|(), e| { + token!( + TokenValue::String { + raw: e.slice(), + style: StringStyle::BacktickQuoted + }, + e + ) + }) +} + +fn unicode_escape_string<'a, P>(text: P, style: StringStyle) -> impl Parser<'a, &'a str, Token<'a>> +where + P: Parser<'a, &'a str, ()>, +{ + just("U&").ignore_then(text).map_with(move |(), e| { + token!( + TokenValue::String { + raw: e.slice(), + style: style.clone() + }, + e + ) + }) +} + +fn dollar_quoted_string<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + // TODO: Should we restrict the characters allowed in the tag? + let start = none_of('$').repeated().padded_by(just('$')).to_slice(); + let tag = just("").configure(|cfg, ctx| cfg.seq(*ctx)); + + start + .then_with_ctx(any().and_is(tag.not()).repeated().then_ignore(tag)) + .map_with(move |(tag, ()), e| { + token!( + TokenValue::String { + raw: e.slice(), + style: StringStyle::DollarQuoted { + tag: tag.to_string() + } + }, + e + ) + }) +} + +fn whitespace<'a, T>(c: char, token: T) -> impl Parser<'a, &'a str, Token<'a>> +where + T: Fn(usize) -> TokenValue<'a> + 'static, +{ + just(c) + .repeated() + .at_least(1) + .count() + .map_with(move |count, e| token!(token(count), e)) +} + +fn punctuation<'a>() -> impl Parser<'a, &'a str, Token<'a>> { + any().try_map_with(|c: char, e| match Punctuation::from_char(c) { + Some(p) => Ok(token!(TokenValue::Punctuation(p), e)), + None => Err(EmptyErr::default()), + }) +} + +fn string<'a>(options: &SqlParserOptions) -> impl Parser<'a, &'a str, Token<'a>> { + let text = match options.quote_escape { + QuoteEscape::None => |d| none_quote_escaped_text(d).boxed(), + QuoteEscape::Dual => |d| dual_quote_escaped_text(d).boxed(), + QuoteEscape::Backslash => |d| backslash_quote_escaped_text(d).boxed(), + }; + + let string = choice(( + quoted_string(text('\''), |prefix| StringStyle::SingleQuoted { prefix }), + quoted_string(text('"'), |prefix| StringStyle::DoubleQuoted { prefix }), + 
unicode_escape_string(text('\''), StringStyle::UnicodeSingleQuoted), + unicode_escape_string(text('"'), StringStyle::UnicodeDoubleQuoted), + backtick_quoted_string(), + dollar_quoted_string(), + )); + + let string = if options.allow_triple_quote_string { + choice(( + // Multi-quote delimiter must come before one-quote delimiter. + quoted_string(multi_quoted_text("'''"), |prefix| { + StringStyle::TripleSingleQuoted { prefix } + }), + quoted_string(multi_quoted_text("\"\"\""), |prefix| { + StringStyle::TripleDoubleQuoted { prefix } + }), + string, + )) + .boxed() + } else { + string.boxed() + }; + + string +} + +#[allow(unused)] +pub fn lexer<'a>(options: &SqlParserOptions) -> impl Parser<'a, &'a str, Vec>> { + choice(( + // When the parsers can parse the same prefix, more specific parsers must come before + // more general parsers to avoid ambiguity. + single_line_comment(), + multi_line_comment(), + string(options), + word(), + number(), + whitespace(' ', |count| TokenValue::Space { count }), + whitespace('\n', |count| TokenValue::LineFeed { count }), + whitespace('\r', |count| TokenValue::CarriageReturn { count }), + whitespace('\t', |count| TokenValue::Tab { count }), + punctuation(), + )) + .repeated() + .collect() + .then_ignore(end()) +} diff --git a/crates/sail-sql-parser/src/lib.rs b/crates/sail-sql-parser/src/lib.rs new file mode 100644 index 00000000..d186eaec --- /dev/null +++ b/crates/sail-sql-parser/src/lib.rs @@ -0,0 +1,19 @@ +// Define the keywords macros before other modules so that they can use the macros. +// `[macro_use]` extends the macro scope beyond the end of the `keywords` module. 
+#[macro_use] +mod keywords { + include!(concat!(env!("OUT_DIR"), "/keywords.rs")); +} + +pub mod ast; +mod container; +mod lexer; +pub mod location; +mod options; +mod parser; +pub mod token; +pub mod tree; + +pub use container::Sequence; +pub use lexer::lexer; +pub use options::SqlParserOptions; diff --git a/crates/sail-sql-parser/src/location.rs b/crates/sail-sql-parser/src/location.rs new file mode 100644 index 00000000..0f49e7dd --- /dev/null +++ b/crates/sail-sql-parser/src/location.rs @@ -0,0 +1,13 @@ +/// A location in the source code. +/// Note that in the SQL lexer and parser, [`crate::token::TokenSpan`] uses the offset value +/// to represent the position in the source code, and the location is not calculated. +/// The location is only useful for human-readable error messages. +/// Therefore, the location is calculated on demand given [`crate::token::TokenSpan`] and +/// the source code string. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)] +pub struct Location { + /// The line number, starting from 0. + pub line: usize, + /// The column number, starting from 0. + pub column: usize, +} diff --git a/crates/sail-sql-parser/src/options.rs b/crates/sail-sql-parser/src/options.rs new file mode 100644 index 00000000..c3b07e43 --- /dev/null +++ b/crates/sail-sql-parser/src/options.rs @@ -0,0 +1,21 @@ +/// The strategy for quote escape in a string, where +/// the single-character quote is used as the delimiter for the string. +#[derive(Debug, Clone)] +#[allow(unused)] +pub enum QuoteEscape { + /// No escape is supported. + None, + /// The quote character is escaped by repeating it twice. + Dual, + /// The quote character is escaped by a backslash character. + Backslash, +} + +/// Options for the SQL parser. +#[derive(Debug, Clone)] +pub struct SqlParserOptions { + /// The quote (delimiter) escape strategy for string. + pub quote_escape: QuoteEscape, + /// Whether a string can be delimited by triple quote characters. 
+ pub allow_triple_quote_string: bool, +} diff --git a/crates/sail-sql-parser/src/parser.rs b/crates/sail-sql-parser/src/parser.rs new file mode 100644 index 00000000..f9f7ff35 --- /dev/null +++ b/crates/sail-sql-parser/src/parser.rs @@ -0,0 +1,69 @@ +use chumsky::prelude::Recursive; +use chumsky::{IterParser, Parser}; + +use crate::ast::data_type::DataType; +use crate::ast::expression::Expr; +use crate::ast::operator::Semicolon; +use crate::ast::query::Query; +use crate::ast::statement::Statement; +use crate::ast::whitespace::whitespace; +use crate::token::Token; +use crate::tree::TreeParser; +use crate::SqlParserOptions; + +#[allow(unused)] +pub fn parser<'a>(_options: &SqlParserOptions) -> impl Parser<'a, &'a [Token<'a>], Vec> { + let mut statement = Recursive::declare(); + let mut query = Recursive::declare(); + let mut expression = Recursive::declare(); + let mut data_type = Recursive::declare(); + + statement.define(Statement::parser(( + statement.clone(), + query.clone(), + expression.clone(), + data_type.clone(), + ))); + query.define(Query::parser((query.clone(), expression.clone()))); + expression.define(Expr::parser((expression.clone(), query.clone()))); + data_type.define(DataType::parser(data_type.clone())); + + statement + .padded_by( + whitespace() + .ignored() + .or(Semicolon::parser(()).ignored()) + .repeated(), + ) + .repeated() + .collect() +} + +#[cfg(test)] +mod tests { + use chumsky::Parser; + + use crate::ast::query::Query; + use crate::ast::statement::Statement; + use crate::options::{QuoteEscape, SqlParserOptions}; + + #[test] + fn test_parse() { + let sql = "/* */ ; SELECT 1;;; SELECT 2"; + let options = SqlParserOptions { + quote_escape: QuoteEscape::None, + allow_triple_quote_string: false, + }; + let lexer = crate::lexer::lexer(&options); + let tokens = lexer.parse(sql).unwrap(); + let parser = crate::parser::parser(&options); + let tree = parser.parse(&tokens).unwrap(); + assert!(matches!( + tree.as_slice(), + [ + 
Statement::Query(Query { .. }), + Statement::Query(Query { .. }), + ] + )); + } +} diff --git a/crates/sail-sql-parser/src/token.rs b/crates/sail-sql-parser/src/token.rs new file mode 100644 index 00000000..8d602efb --- /dev/null +++ b/crates/sail-sql-parser/src/token.rs @@ -0,0 +1,275 @@ +/// A token in the SQL lexer output. +#[derive(Debug, Clone, PartialEq)] +pub struct Token<'a> { + pub value: TokenValue<'a>, + pub span: TokenSpan, +} + +impl<'a> Token<'a> { + pub fn new(value: TokenValue<'a>, span: impl Into) -> Self { + Self { + value, + span: span.into(), + } + } +} + +/// A SQL token value. +#[derive(Debug, Clone, PartialEq)] +pub enum TokenValue<'a> { + /// A word that is not quoted nor escaped. + /// The word may match a SQL keyword. + Word { + raw: &'a str, + keyword: Option, + }, + /// A numeric literal with a suffix. The suffix can be empty. + Number { value: &'a str, suffix: &'a str }, + /// A string of a specific style. + /// The raw text includes the delimiters and the prefix (if any). + /// No escape sequences are processed in the raw text. + /// Note that some styles may be used for delimited (quoted) identifiers + /// rather than string literals. + String { raw: &'a str, style: StringStyle }, + /// One or more horizontal tab characters (ASCII 0x09). + Tab { count: usize }, + /// One or more line feed characters (ASCII 0x0A). + LineFeed { count: usize }, + /// One or more carriage return characters (ASCII 0x0D). + CarriageReturn { count: usize }, + /// One or more space characters (ASCII 0x20). + Space { count: usize }, + /// A single-line comment starting with `--`. + /// The raw text includes the `--` prefix. + /// Any newline characters following the comment are not part of this token. + SingleLineComment { raw: &'a str }, + /// A multi-line comment starting with `/*` and ending with `*/`. + /// The start and end delimiters can be nested. + /// The raw text includes the outermost delimiters. 
+ MultiLineComment { raw: &'a str }, + /// A punctuation character. + Punctuation(Punctuation), +} + +/// A style of SQL string literal. +#[derive(Debug, Clone, PartialEq)] +#[allow(clippy::enum_variant_names)] +pub enum StringStyle { + /// A string literal surrounded by one single quote on each side + /// with an optional prefix (e.g., `'hello'` or `N'hello'`). + SingleQuoted { prefix: Option }, + /// A string literal surrounded by one double quote on each side + /// with an optional prefix (e.g., `"hello"` or `r"hello"`). + DoubleQuoted { prefix: Option }, + /// A string literal surrounded by three single quotes on each side + /// with an optional prefix (e.g., `'''hello'''` or `R'''hello'''`). + TripleSingleQuoted { prefix: Option }, + /// A string literal surrounded by three double quotes on each side + /// with an optional prefix (e.g., `"""hello"""` or `B"""hello"""`). + TripleDoubleQuoted { prefix: Option }, + /// A Unicode string literal surrounded by one single quote on each side. + /// (e.g., `U&'hello'`). + UnicodeSingleQuoted, + /// A Unicode string literal surrounded by one double quote on each side. + /// (e.g., `U&"hello"`). + UnicodeDoubleQuoted, + /// A string literal surrounded by one backtick on each side. + BacktickQuoted, + /// A string literal surrounded by the same tag on each side where the tag + /// is some text surrounded by one dollar sign on each side (e.g., `$tag$hello$tag$` + /// with tag `$tag$`). The text of the tag can be an empty string (e.g., `$$hello$$` + /// with an empty tag `$$`). + DollarQuoted { tag: String }, +} + +macro_rules! 
for_all_punctuations { + ($callback:ident) => { + $callback!([ + (0x21, '!', ExclamationMark), + (0x23, '#', NumberSign), + (0x24, '$', Dollar), + (0x25, '%', Percent), + (0x26, '&', Ampersand), + (0x28, '(', LeftParenthesis), + (0x29, ')', RightParenthesis), + (0x2A, '*', Asterisk), + (0x2B, '+', Plus), + (0x2C, ',', Comma), + (0x2D, '-', Minus), + (0x2E, '.', Period), + (0x2F, '/', Slash), + (0x3A, ':', Colon), + (0x3B, ';', Semicolon), + (0x3C, '<', LessThan), + (0x3D, '=', Equals), + (0x3E, '>', GreaterThan), + (0x3F, '?', QuestionMark), + (0x40, '@', At), + (0x5B, '[', LeftBracket), + (0x5C, '\\', Backslash), + (0x5D, ']', RightBracket), + (0x5E, '^', Caret), + (0x7B, '{', LeftBrace), + (0x7C, '|', VerticalBar), + (0x7D, '}', RightBrace), + (0x7E, '~', Tilde), + ]); + }; +} + +macro_rules! punctuation_enum { + ([$(($ascii:literal, $ch:literal, $p:ident)),* $(,)?]) => { + #[derive(Debug, Clone, PartialEq, Eq)] + pub enum Punctuation { + $( + #[doc = concat!("The `", $ch, "` character (ASCII ", stringify!($ascii), ").")] + $p, + )* + } + + impl Punctuation { + pub fn from_char(c: char) -> Option { + match c { + $($ch => Some(Self::$p),)* + _ => None, + } + } + + #[allow(unused)] + pub fn to_char(&self) -> char { + match self { + $(Self::$p => $ch,)* + } + } + } + }; +} + +for_all_punctuations!(punctuation_enum); + +/// A span in the source code. +/// The offsets are measured in the number of characters from the beginning of the input, +/// starting from 0. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct TokenSpan { + /// The start offset of the span. + pub start: usize, + /// The end (exclusive) offset of the span. 
+ pub end: usize, +} + +#[allow(unused)] +impl TokenSpan { + pub fn is_empty(&self) -> bool { + self.start >= self.end + } + + pub fn union(&self, other: &Self) -> Self { + match (self.is_empty(), other.is_empty()) { + (true, true) => TokenSpan::default(), + (true, false) => *other, + (false, true) => *self, + (false, false) => TokenSpan { + start: self.start.min(other.start), + end: self.end.max(other.end), + }, + } + } + + pub fn union_all(iter: I) -> Self + where + I: IntoIterator, + { + iter.into_iter() + .reduce(|acc, span| acc.union(&span)) + .unwrap_or_default() + } +} + +macro_rules! keyword_enum { + ([$(($_:expr, $identifier:ident),)* $(,)?]) => { + /// A SQL keyword. + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[allow(unused)] + pub enum Keyword { + $($identifier,)* + } + }; +} + +for_all_keywords!(keyword_enum); + +macro_rules! keyword_map_value { + ($kw:ident) => { + Keyword::$kw + }; +} + +static KEYWORD_MAP: phf::Map<&'static str, Keyword> = keyword_map!(keyword_map_value); + +impl Keyword { + pub fn get(value: &str) -> Option { + KEYWORD_MAP.get(value.to_uppercase().as_str()).cloned() + } +} + +#[cfg(test)] +mod tests { + macro_rules! keyword_values { + ([$(($string:expr, $_:ident),)* $(,)?]) => { + static KEYWORD_VALUES: &[&str] = &[ $($string,)* ]; + }; + } + + for_all_keywords!(keyword_values); + + macro_rules! punctuation_values { + ([$(($ascii:literal, $ch:literal, $_:ident)),* $(,)?]) => { + static PUNCTUATION_VALUES: &[(u8, char)] = &[ $(($ascii, $ch),)* ]; + }; + } + + for_all_punctuations!(punctuation_values); + + /// All keywords must be upper case and contain only alphanumeric characters or underscores, + /// where the first character must be an alphabet or an underscore. 
+ #[test] + fn test_keywords_format() { + for k in KEYWORD_VALUES { + assert!(k.chars().all(|c| matches!(c, 'A'..='Z' | '0'..='9' | '_'))); + assert!(matches!(k.chars().next(), Some('A'..='Z' | '_'))); + } + } + + #[test] + /// The keywords must be listed in ASCII order. + /// The keywords must be unique. + fn test_keywords_order_and_uniqueness() { + let mut keywords = KEYWORD_VALUES.to_vec(); + keywords.sort_unstable(); + keywords.dedup(); + assert_eq!(keywords.as_slice(), KEYWORD_VALUES); + } + + #[test] + /// The punctuation characters must match the ASCII values. + fn test_punctuation_values() { + for &(ascii, ch) in PUNCTUATION_VALUES { + assert_eq!(ascii, ch as u8); + } + } + + #[test] + /// The punctuation characters must be listed in ASCII order. + /// The punctuation characters must be unique. + fn test_punctuation_order_and_uniqueness() { + let punctuations = PUNCTUATION_VALUES + .iter() + .map(|(_, ch)| *ch) + .collect::>(); + let mut copy = punctuations.clone(); + copy.sort_unstable(); + copy.dedup(); + assert_eq!(copy, punctuations); + } +} diff --git a/crates/sail-sql-parser/src/tree.rs b/crates/sail-sql-parser/src/tree.rs new file mode 100644 index 00000000..51a86351 --- /dev/null +++ b/crates/sail-sql-parser/src/tree.rs @@ -0,0 +1,17 @@ +use chumsky::Parser; + +use crate::token::Token; + +/// A trait for defining a parser that can be used to parse the type. +pub trait TreeParser<'a, D = ()>: Sized { + /// Returns a parser for the type. + /// This method receives opaque `data` of generic type `D`. + /// This is useful for defining recursive parsers, where the method receives + /// the declared parser(s) as the `data` and returns the defined parser. + /// + /// For easier whitespace handling, the parser can assume that the first token + /// of the input is part of the type's AST, but the parser should consume all + /// whitespace tokens **after** the AST. This contract must be respected by + /// all implementations of this trait. 
+ fn parser(data: D) -> impl Parser<'a, &'a [Token<'a>], Self> + Clone; +}