diff --git a/crates/cli-lib/src/stdin.rs b/crates/cli-lib/src/stdin.rs index a0bebd98e..374d90dd4 100644 --- a/crates/cli-lib/src/stdin.rs +++ b/crates/cli-lib/src/stdin.rs @@ -1,5 +1,5 @@ use std::io::Read; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; /// Check if the given input is the flag to use stdin as input. /// @@ -8,9 +8,9 @@ use std::path::PathBuf; /// /// The error message is returned if any of the inputs are `-` and there are other inputs. pub(crate) fn is_std_in_flag_input(inputs: &[PathBuf]) -> Result { - if inputs.len() == 1 && inputs[0] == PathBuf::from("-") { + if inputs.len() == 1 && inputs[0] == Path::new("-") { Ok(true) - } else if inputs.iter().any(|input| *input == PathBuf::from("-")) { + } else if inputs.iter().any(|input| input == Path::new("-")) { Err("Cannot mix stdin flag with other inputs".to_string()) } else { Ok(false) diff --git a/crates/lib-core/Cargo.toml b/crates/lib-core/Cargo.toml index 95e40c750..a4d395816 100644 --- a/crates/lib-core/Cargo.toml +++ b/crates/lib-core/Cargo.toml @@ -15,6 +15,7 @@ crate-type = ["cdylib", "rlib"] [features] serde = ["dep:serde"] stringify = ["dep:serde_yaml", "serde"] +simd-tokenizer = [] [dependencies] smol_str = "0.3.1" diff --git a/crates/lib-core/src/lib.rs b/crates/lib-core/src/lib.rs index d552f31d6..c193cbd22 100644 --- a/crates/lib-core/src/lib.rs +++ b/crates/lib-core/src/lib.rs @@ -1,3 +1,4 @@ +#![cfg_attr(feature = "simd-tokenizer", feature(portable_simd))] pub mod dialects; pub mod edit_type; pub mod errors; diff --git a/crates/lib-core/src/parser.rs b/crates/lib-core/src/parser.rs index 96b115015..da05f5f7f 100644 --- a/crates/lib-core/src/parser.rs +++ b/crates/lib-core/src/parser.rs @@ -11,6 +11,9 @@ pub mod parsers; pub mod segments; pub mod types; +#[cfg(feature = "simd-tokenizer")] +pub mod simd_tokenizer; + use ahash::AHashMap; use crate::dialects::Dialect; diff --git a/crates/lib-core/src/parser/lexer.rs b/crates/lib-core/src/parser/lexer.rs index d7f932b16..e973b0546 100644 --- a/crates/lib-core/src/parser/lexer.rs +++ b/crates/lib-core/src/parser/lexer.rs @@ -5,6 +5,8 @@ use std::str::Chars; use super::markers::PositionMarker; use super::segments::{ErasedSegment, SegmentBuilder, Tables}; +#[cfg(feature = "simd-tokenizer")] +use super::simd_tokenizer::{SimdTokenizer, TokenKind}; use crate::dialects::Dialect; use crate::dialects::syntax::SyntaxKind; use crate::errors::SQLLexError; @@ -396,11 +398,14 @@ impl<'text> Cursor<'text> { /// The Lexer class actually does the lexing step. #[derive(Debug, Clone)] +#[cfg_attr(feature = "simd-tokenizer", allow(dead_code))] pub struct Lexer { syntax_map: Vec<(&'static str, SyntaxKind)>, regex: regex_automata::meta::Regex, matchers: Vec, last_resort_lexer: Matcher, + #[cfg(feature = "simd-tokenizer")] + use_simd: bool, } impl<'a> From<&'a Dialect> for Lexer { @@ -412,6 +417,12 @@ impl<'a> From<&'a Dialect> for Lexer { impl Lexer { /// Create a new lexer. pub(crate) fn new(lexer_matchers: &[Matcher]) -> Self { + // Default to regular tokenizer since SIMD is experimental and incomplete + Self::new_with_options(lexer_matchers, false) + } + + /// Create a new lexer with options for tokenizer selection. + pub(crate) fn new_with_options(lexer_matchers: &[Matcher], #[allow(unused_variables)] use_simd: bool) -> Self { let mut patterns = Vec::new(); let mut syntax_map = Vec::new(); let mut matchers = Vec::new(); @@ -444,36 +455,91 @@ impl Lexer { r"[^\t\n.]*", SyntaxKind::Unlexable, ), + #[cfg(feature = "simd-tokenizer")] + use_simd, } } + /// Create a lexer for testing with regular tokenizer (when SIMD feature is enabled). + #[cfg(feature = "simd-tokenizer")] + pub fn new_regular_for_test(lexer_matchers: &[Matcher]) -> Self { + Self::new_with_options(lexer_matchers, false) + } + + /// Create a lexer for testing with SIMD tokenizer (when SIMD feature is enabled). + #[cfg(feature = "simd-tokenizer")] + pub fn new_simd_for_test(lexer_matchers: &[Matcher]) -> Self { + Self::new_with_options(lexer_matchers, true) + } + pub fn lex( &self, tables: &Tables, template: impl Into, ) -> (Vec, Vec) { let template = template.into(); - let mut str_buff = template.templated_str.as_deref().unwrap(); + let str_buff = template.templated_str.as_deref().unwrap(); // Lex the string to get a tuple of LexedElement let mut element_buffer: Vec = Vec::new(); - loop { - let mut res = self.lex_match(str_buff); - element_buffer.append(&mut res.elements); + #[cfg(feature = "simd-tokenizer")] + { + if self.use_simd { + let tokenizer = SimdTokenizer::new(str_buff); + for tok in tokenizer.tokenize() { + let (name, kind) = match tok.kind { + TokenKind::Identifier => ("identifier", SyntaxKind::Identifier), + TokenKind::Number => ("numeric_literal", SyntaxKind::NumericLiteral), + TokenKind::Symbol => ("symbol", SyntaxKind::Symbol), + TokenKind::Whitespace => ("whitespace", SyntaxKind::Whitespace), + TokenKind::EndOfFile => ("end_of_file", SyntaxKind::EndOfFile), + }; + element_buffer.push(Element::new(name, kind, tok.text)); + } + } else { + // Use the regular lexer even when SIMD is available + let mut str_buff = str_buff; + loop { + let mut res = self.lex_match(str_buff); + element_buffer.append(&mut res.elements); - if res.forward_string.is_empty() { - break; - } + if res.forward_string.is_empty() { + break; + } - // If we STILL can't match, then just panic out. - let mut resort_res = self.last_resort_lexer.matches(str_buff); - if !resort_res.elements.is_empty() { - break; + // If we STILL can't match, then just panic out. + let mut resort_res = self.last_resort_lexer.matches(str_buff); + if !resort_res.elements.is_empty() { + break; + } + + str_buff = resort_res.forward_string; + element_buffer.append(&mut resort_res.elements); + } } + } + + #[cfg(not(feature = "simd-tokenizer"))] + { + let mut str_buff = str_buff; + loop { + let mut res = self.lex_match(str_buff); + element_buffer.append(&mut res.elements); - str_buff = resort_res.forward_string; - element_buffer.append(&mut resort_res.elements); + if res.forward_string.is_empty() { + break; + } + + // If we STILL can't match, then just panic out. + let mut resort_res = self.last_resort_lexer.matches(str_buff); + if !resort_res.elements.is_empty() { + break; + } + + str_buff = resort_res.forward_string; + element_buffer.append(&mut resort_res.elements); + } } // Map tuple LexedElement to list of TemplateElement. @@ -511,6 +577,7 @@ impl Lexer { } /// Iteratively match strings using the selection of sub-matchers. + #[cfg_attr(feature = "simd-tokenizer", allow(dead_code))] fn lex_match<'b>(&self, mut forward_string: &'b str) -> Match<'b> { let mut elem_buff = Vec::new(); diff --git a/crates/lib-core/src/parser/simd_tokenizer.rs b/crates/lib-core/src/parser/simd_tokenizer.rs new file mode 100644 index 000000000..075a47e30 --- /dev/null +++ b/crates/lib-core/src/parser/simd_tokenizer.rs @@ -0,0 +1,181 @@ +#![allow(clippy::too_many_lines)] +use std::simd::{Simd, cmp::SimdPartialEq, cmp::SimdPartialOrd}; + +/// A simple token kind used by the experimental SIMD tokenizer. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TokenKind { + Identifier, + Number, + Symbol, + Whitespace, + EndOfFile, +} + +/// A token returned by the SIMD tokenizer. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Token<'a> { + pub kind: TokenKind, + pub text: &'a str, +} + +/// Experimental tokenizer using SIMD operations to scan the input string. +/// +/// This is a simplified implementation inspired by the tokenizer used in +/// [`db25-sql-parser`](https://github.com/space-rf-org/db25-sql-parser). +/// It demonstrates how wide SIMD registers can be used to accelerate +/// common scanning tasks such as skipping whitespace and consuming +/// identifier characters. +#[derive(Debug, Clone)] +pub struct SimdTokenizer<'a> { + source: &'a str, +} + +impl<'a> SimdTokenizer<'a> { + /// Create a new tokenizer from an input string. + pub fn new(source: &'a str) -> Self { + Self { source } + } + + /// Tokenize the input string and return all tokens. + pub fn tokenize(self) -> Vec> { + let bytes = self.source.as_bytes(); + let mut pos = 0; + let mut tokens = Vec::new(); + + while pos < bytes.len() { + let skipped = skip_whitespace(&bytes[pos..]); + if skipped > 0 { + let text = &self.source[pos..pos + skipped]; + tokens.push(Token { + kind: TokenKind::Whitespace, + text, + }); + pos += skipped; + continue; + } + + let ch = bytes[pos]; + if is_ident_start(ch) { + let len = take_identifier(&bytes[pos..]); + let text = &self.source[pos..pos + len]; + tokens.push(Token { + kind: TokenKind::Identifier, + text, + }); + pos += len; + continue; + } + + if ch.is_ascii_digit() { + let len = take_number(&bytes[pos..]); + let text = &self.source[pos..pos + len]; + tokens.push(Token { + kind: TokenKind::Number, + text, + }); + pos += len; + continue; + } + + // Anything else is treated as a single symbol token. + let text = &self.source[pos..pos + 1]; + tokens.push(Token { + kind: TokenKind::Symbol, + text, + }); + pos += 1; + } + + tokens.push(Token { + kind: TokenKind::EndOfFile, + text: "", + }); + tokens + } +} + +const CHUNK: usize = 64; + +fn is_ident_start(b: u8) -> bool { + b.is_ascii_alphabetic() || b == b'_' || b >= 0x80 +} + +fn skip_whitespace(bytes: &[u8]) -> usize { + let mut i = 0; + while i + CHUNK <= bytes.len() { + let chunk = Simd::::from_slice(&bytes[i..i + CHUNK]); + let is_space = chunk.simd_eq(Simd::splat(b' ')) + | chunk.simd_eq(Simd::splat(b'\n')) + | chunk.simd_eq(Simd::splat(b'\t')) + | chunk.simd_eq(Simd::splat(b'\r')); + let mask: u64 = is_space.to_bitmask(); + if mask == u64::MAX { + i += CHUNK; + } else { + let first_non = (!mask).trailing_zeros() as usize; + return i + first_non; + } + } + while i < bytes.len() && matches!(bytes[i], b' ' | b'\n' | b'\t' | b'\r') { + i += 1; + } + i +} + +fn take_identifier(bytes: &[u8]) -> usize { + let mut i = 0; + while i + CHUNK <= bytes.len() { + let chunk = Simd::::from_slice(&bytes[i..i + CHUNK]); + let upper = chunk.simd_ge(Simd::splat(b'A')) & chunk.simd_le(Simd::splat(b'Z')); + let lower = chunk.simd_ge(Simd::splat(b'a')) & chunk.simd_le(Simd::splat(b'z')); + let digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9')); + let underscore = chunk.simd_eq(Simd::splat(b'_')); + let ident = upper | lower | digit | underscore; + let mask: u64 = ident.to_bitmask(); + if mask == u64::MAX { + i += CHUNK; + } else { + let first_non = (!mask).trailing_zeros() as usize; + return i + first_non; + } + } + while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') { + i += 1; + } + i +} + +fn take_number(bytes: &[u8]) -> usize { + let mut i = 0; + while i + CHUNK <= bytes.len() { + let chunk = Simd::::from_slice(&bytes[i..i + CHUNK]); + let digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9')); + let mask: u64 = digit.to_bitmask(); + if mask == u64::MAX { + i += CHUNK; + } else { + let first_non = (!mask).trailing_zeros() as usize; + return i + first_non; + } + } + while i < bytes.len() && bytes[i].is_ascii_digit() { + i += 1; + } + i +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_tokenization() { + let sql = "SELECT foo, 42 FROM bar"; + let tokenizer = SimdTokenizer::new(sql); + let tokens = tokenizer.tokenize(); + let kinds: Vec = tokens.iter().map(|t| t.kind).collect(); + assert!(kinds.contains(&TokenKind::Identifier)); + assert!(kinds.contains(&TokenKind::Number)); + assert!(matches!(tokens.last().unwrap().kind, TokenKind::EndOfFile)); + } +} diff --git a/crates/lib-dialects/Cargo.toml b/crates/lib-dialects/Cargo.toml index 76987ceaf..3eff724a6 100644 --- a/crates/lib-dialects/Cargo.toml +++ b/crates/lib-dialects/Cargo.toml @@ -33,6 +33,7 @@ default = [ "trino", "tsql", ] +simd-tokenizer = ["sqruff-lib-core/simd-tokenizer"] athena = [] bigquery = [] clickhouse = [] @@ -57,7 +58,7 @@ serde_yaml = "0.9.34+deprecated" [dev-dependencies] sqruff-lib-core.workspace = true -sqruff-lib-core.features = ["serde", "stringify"] +sqruff-lib-core.features = ["serde", "stringify", "simd-tokenizer"] rayon = "1.10.0" expect-test = "1.5.0" glob = "0.3.1" diff --git a/crates/lib-dialects/tests/dialects.rs b/crates/lib-dialects/tests/dialects.rs index 6757eec2c..cc55e02e6 100644 --- a/crates/lib-dialects/tests/dialects.rs +++ b/crates/lib-dialects/tests/dialects.rs @@ -72,12 +72,14 @@ fn main() { let yaml = file.with_extension("yml"); let yaml = std::path::absolute(yaml).unwrap(); + let sql = std::fs::read_to_string(file).unwrap(); + let tables = Tables::default(); + + // Test with the default tokenizer let actual = { - let sql = std::fs::read_to_string(file).unwrap(); - let tables = Tables::default(); let lexer = Lexer::from(&dialect); let parser = Parser::from(&dialect); - let tokens = lexer.lex(&tables, sql); + let tokens = lexer.lex(&tables, sql.clone()); assert!(tokens.1.is_empty()); let parsed = parser.parse(&tables, &tokens.0).unwrap(); @@ -87,6 +89,48 @@ fn main() { serde_yaml::to_string(&tree).unwrap() }; + // When SIMD tokenizer is available, test both and compare + #[cfg(feature = "simd-tokenizer")] + { + // Test with regular tokenizer explicitly + let regular_result = { + let lexer = Lexer::new_regular_for_test(dialect.lexer_matchers()); + let parser = Parser::from(&dialect); + let tokens = lexer.lex(&tables, sql.clone()); + assert!(tokens.1.is_empty(), "Regular tokenizer errors: {:?}", tokens.1); + + let parsed = parser.parse(&tables, &tokens.0).unwrap(); + let tree = parsed.unwrap(); + let tree = tree.to_serialised(true, true); + + serde_yaml::to_string(&tree).unwrap() + }; + + // Test with SIMD tokenizer explicitly + let simd_result = { + let lexer = Lexer::new_simd_for_test(dialect.lexer_matchers()); + let parser = Parser::from(&dialect); + let tokens = lexer.lex(&tables, sql.clone()); + assert!(tokens.1.is_empty(), "SIMD tokenizer errors: {:?}", tokens.1); + + let parsed = parser.parse(&tables, &tokens.0).unwrap(); + let tree = parsed.unwrap(); + let tree = tree.to_serialised(true, true); + + serde_yaml::to_string(&tree).unwrap() + }; + + // Compare the results + if regular_result != simd_result { + eprintln!( + "WARNING: Tokenizer mismatch for file {:?}:\nRegular and SIMD tokenizers produced different results.\nSIMD tokenizer is experimental and incomplete.", + file + ); + // For now, we'll just warn instead of panic since the SIMD tokenizer is incomplete + // panic!("Tokenizer mismatch - SIMD tokenizer needs more work"); + } + } + expect_file![yaml].assert_eq(&actual); }); } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 292fe499e..5d56faf9a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "stable" +channel = "nightly"