1 change: 1 addition & 0 deletions crates/lib-core/Cargo.toml
@@ -15,6 +15,7 @@ crate-type = ["cdylib", "rlib"]
 [features]
 serde = ["dep:serde"]
 stringify = ["dep:serde_yaml", "serde"]
+simd-tokenizer = []
 
 [dependencies]
 smol_str = "0.3.1"
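Since the new feature is not in the default set, the SIMD path stays opt-in; something like cargo +nightly build --features simd-tokenizer (exact flags depending on the workspace layout) enables it, while default builds keep the existing lexer.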
1 change: 1 addition & 0 deletions crates/lib-core/src/lib.rs
@@ -1,3 +1,4 @@
+#![cfg_attr(feature = "simd-tokenizer", feature(portable_simd))]
 pub mod dialects;
 pub mod edit_type;
 pub mod errors;
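feature(portable_simd) is the unstable gate for std::simd and is only available on nightly compilers; this attribute is what forces the rust-toolchain.toml change at the bottom of this diff.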
3 changes: 3 additions & 0 deletions crates/lib-core/src/parser.rs
@@ -11,6 +11,9 @@ pub mod parsers;
 pub mod segments;
 pub mod types;
 
+#[cfg(feature = "simd-tokenizer")]
+pub mod simd_tokenizer;
+
 use ahash::AHashMap;
 
 use crate::dialects::Dialect;
49 changes: 35 additions & 14 deletions crates/lib-core/src/parser/lexer.rs
@@ -5,6 +5,8 @@ use std::str::Chars;
 
 use super::markers::PositionMarker;
 use super::segments::{ErasedSegment, SegmentBuilder, Tables};
+#[cfg(feature = "simd-tokenizer")]
+use super::simd_tokenizer::{SimdTokenizer, TokenKind};
 use crate::dialects::Dialect;
 use crate::dialects::syntax::SyntaxKind;
 use crate::errors::SQLLexError;
@@ -453,27 +455,46 @@ impl Lexer
         template: impl Into<TemplatedFile>,
     ) -> (Vec<ErasedSegment>, Vec<SQLLexError>) {
         let template = template.into();
-        let mut str_buff = template.templated_str.as_deref().unwrap();
+        let str_buff = template.templated_str.as_deref().unwrap();
 
         // Lex the string to get a tuple of LexedElement
         let mut element_buffer: Vec<Element> = Vec::new();
 
-        loop {
-            let mut res = self.lex_match(str_buff);
-            element_buffer.append(&mut res.elements);
-
-            if res.forward_string.is_empty() {
-                break;
-            }
-
-            // If we STILL can't match, then just panic out.
-            let mut resort_res = self.last_resort_lexer.matches(str_buff);
-            if !resort_res.elements.is_empty() {
-                break;
-            }
-
-            str_buff = resort_res.forward_string;
-            element_buffer.append(&mut resort_res.elements);
-        }
+        #[cfg(feature = "simd-tokenizer")]
+        {
+            let tokenizer = SimdTokenizer::new(str_buff);
+            for tok in tokenizer.tokenize() {
+                let (name, kind) = match tok.kind {
+                    TokenKind::Identifier => ("identifier", SyntaxKind::Identifier),
+                    TokenKind::Number => ("numeric_literal", SyntaxKind::NumericLiteral),
+                    TokenKind::Symbol => ("symbol", SyntaxKind::Symbol),
+                    TokenKind::Whitespace => ("whitespace", SyntaxKind::Whitespace),
+                    TokenKind::EndOfFile => ("end_of_file", SyntaxKind::EndOfFile),
+                };
+                element_buffer.push(Element::new(name, kind, tok.text));
+            }
+        }
+
+        #[cfg(not(feature = "simd-tokenizer"))]
+        {
+            let mut str_buff = str_buff;
+            loop {
+                let mut res = self.lex_match(str_buff);
+                element_buffer.append(&mut res.elements);
+
+                if res.forward_string.is_empty() {
+                    break;
+                }
+
+                // If we STILL can't match, then just panic out.
+                let mut resort_res = self.last_resort_lexer.matches(str_buff);
+                if !resort_res.elements.is_empty() {
+                    break;
+                }
+
+                str_buff = resort_res.forward_string;
+                element_buffer.append(&mut resort_res.elements);
+            }
Bug: Duplicate EndOfFile Segments in SIMD Path

The SIMD tokenizer emits its own EndOfFile token, while the elements_to_segments method also unconditionally appends an EndOfFile segment. With the SIMD tokenizer enabled the output therefore contains two EndOfFile segments, whereas the non-SIMD path produces exactly one.
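A minimal sketch of one possible fix (not code from this PR): skip the tokenizer's EndOfFile token in the SIMD path and leave the trailing segment to elements_to_segments, matching the non-SIMD path.

    #[cfg(feature = "simd-tokenizer")]
    {
        let tokenizer = SimdTokenizer::new(str_buff);
        for tok in tokenizer.tokenize() {
            // elements_to_segments already appends an EndOfFile segment
            // unconditionally, so dropping the tokenizer's EOF token here
            // avoids emitting it twice.
            if tok.kind == TokenKind::EndOfFile {
                continue;
            }
            let (name, kind) = match tok.kind {
                TokenKind::Identifier => ("identifier", SyntaxKind::Identifier),
                TokenKind::Number => ("numeric_literal", SyntaxKind::NumericLiteral),
                TokenKind::Symbol => ("symbol", SyntaxKind::Symbol),
                TokenKind::Whitespace => ("whitespace", SyntaxKind::Whitespace),
                TokenKind::EndOfFile => unreachable!("filtered above"),
            };
            element_buffer.push(Element::new(name, kind, tok.text));
        }
    }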

+        }
 
         // Map tuple LexedElement to list of TemplateElement.
181 changes: 181 additions & 0 deletions crates/lib-core/src/parser/simd_tokenizer.rs
@@ -0,0 +1,181 @@
#![allow(clippy::too_many_lines)]
use std::simd::{Simd, cmp::SimdPartialEq, cmp::SimdPartialOrd};

/// A simple token kind used by the experimental SIMD tokenizer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
Identifier,
Number,
Symbol,
Whitespace,
EndOfFile,
}

/// A token returned by the SIMD tokenizer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token<'a> {
pub kind: TokenKind,
pub text: &'a str,
}

/// Experimental tokenizer using SIMD operations to scan the input string.
///
/// This is a simplified implementation inspired by the tokenizer used in
/// [`db25-sql-parser`](https://github.com/space-rf-org/db25-sql-parser).
/// It demonstrates how wide SIMD registers can be used to accelerate
/// common scanning tasks such as skipping whitespace and consuming
/// identifier characters.
#[derive(Debug, Clone)]
pub struct SimdTokenizer<'a> {
source: &'a str,
}

impl<'a> SimdTokenizer<'a> {
/// Create a new tokenizer from an input string.
pub fn new(source: &'a str) -> Self {
Self { source }
}

/// Tokenize the input string and return all tokens.
pub fn tokenize(self) -> Vec<Token<'a>> {
let bytes = self.source.as_bytes();
let mut pos = 0;
let mut tokens = Vec::new();

while pos < bytes.len() {
let skipped = skip_whitespace(&bytes[pos..]);
if skipped > 0 {
let text = &self.source[pos..pos + skipped];
tokens.push(Token {
kind: TokenKind::Whitespace,
text,
});
pos += skipped;
continue;
}

let ch = bytes[pos];
if is_ident_start(ch) {
let len = take_identifier(&bytes[pos..]);
let text = &self.source[pos..pos + len];
tokens.push(Token {
kind: TokenKind::Identifier,
text,
});
pos += len;
continue;
}

if ch.is_ascii_digit() {
let len = take_number(&bytes[pos..]);
let text = &self.source[pos..pos + len];
tokens.push(Token {
kind: TokenKind::Number,
text,
});
pos += len;
continue;
}

// Anything else is treated as a single symbol token.
let text = &self.source[pos..pos + 1];
tokens.push(Token {
kind: TokenKind::Symbol,
text,
});
pos += 1;
}

tokens.push(Token {
kind: TokenKind::EndOfFile,
text: "",
});
tokens
}
}

const CHUNK: usize = 64;

fn is_ident_start(b: u8) -> bool {
b.is_ascii_alphabetic() || b == b'_' || b >= 0x80
}

fn skip_whitespace(bytes: &[u8]) -> usize {
let mut i = 0;
while i + CHUNK <= bytes.len() {
Comment on lines +95 to +105

[P1] Avoid infinite loop on non-ASCII identifiers

When the SIMD tokenizer encounters a byte ≥ 0x80 it treats it as a valid identifier start, but take_identifier only advances over ASCII letters/digits/underscores. For non-ASCII input len becomes 0 and pos += len never advances, so while pos < bytes.len() spins forever as soon as the source contains any UTF-8 characters beyond ASCII. Either reject non-ASCII in is_ident_start or teach take_identifier to advance over those bytes.

let chunk = Simd::<u8, CHUNK>::from_slice(&bytes[i..i + CHUNK]);
let is_space = chunk.simd_eq(Simd::splat(b' '))
| chunk.simd_eq(Simd::splat(b'\n'))
| chunk.simd_eq(Simd::splat(b'\t'))
| chunk.simd_eq(Simd::splat(b'\r'));
let mask: u64 = is_space.to_bitmask();
if mask == u64::MAX {
i += CHUNK;
} else {
let first_non = (!mask).trailing_zeros() as usize;
return i + first_non;
}
}
while i < bytes.len() && matches!(bytes[i], b' ' | b'\n' | b'\t' | b'\r') {
i += 1;
}
i
}

fn take_identifier(bytes: &[u8]) -> usize {
let mut i = 0;
while i + CHUNK <= bytes.len() {
let chunk = Simd::<u8, CHUNK>::from_slice(&bytes[i..i + CHUNK]);
let upper = chunk.simd_ge(Simd::splat(b'A')) & chunk.simd_le(Simd::splat(b'Z'));
let lower = chunk.simd_ge(Simd::splat(b'a')) & chunk.simd_le(Simd::splat(b'z'));
let digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9'));
let underscore = chunk.simd_eq(Simd::splat(b'_'));
let ident = upper | lower | digit | underscore;
let mask: u64 = ident.to_bitmask();
if mask == u64::MAX {
i += CHUNK;
} else {
let first_non = (!mask).trailing_zeros() as usize;
return i + first_non;
}
}
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
i += 1;
}
i
}
Bug: Identifier Parsing Mismatch Causes Infinite Loop

The is_ident_start function incorrectly accepts non-ASCII bytes, including UTF-8 continuation bytes, as identifier starts. This clashes with take_identifier, which only consumes ASCII characters. The inconsistency causes incorrect tokenization of non-ASCII identifiers and, more critically, an infinite loop when take_identifier fails to advance the position.
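A minimal sketch of the first option (assuming ASCII-only identifiers are acceptable for this experimental tokenizer): reject bytes >= 0x80 in is_ident_start, so non-ASCII input falls through to the single-byte Symbol branch and pos always advances.

    fn is_ident_start(b: u8) -> bool {
        // take_identifier only ever consumes ASCII letters, digits, and
        // underscores, so accepting bytes >= 0x80 here can produce a
        // zero-length identifier and a stuck position.
        b.is_ascii_alphabetic() || b == b'_'
    }

Note that the Symbol fallback slices a single byte, so genuinely non-ASCII input would still need char-boundary-aware handling there to avoid panicking on a multi-byte character; either way the loop no longer stalls.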


fn take_number(bytes: &[u8]) -> usize {
let mut i = 0;
while i + CHUNK <= bytes.len() {
let chunk = Simd::<u8, CHUNK>::from_slice(&bytes[i..i + CHUNK]);
let digit = chunk.simd_ge(Simd::splat(b'0')) & chunk.simd_le(Simd::splat(b'9'));
let mask: u64 = digit.to_bitmask();
if mask == u64::MAX {
i += CHUNK;
} else {
let first_non = (!mask).trailing_zeros() as usize;
return i + first_non;
}
}
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
i
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn basic_tokenization() {
let sql = "SELECT foo, 42 FROM bar";
let tokenizer = SimdTokenizer::new(sql);
let tokens = tokenizer.tokenize();
let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind).collect();
assert!(kinds.contains(&TokenKind::Identifier));
assert!(kinds.contains(&TokenKind::Number));
assert!(matches!(tokens.last().unwrap().kind, TokenKind::EndOfFile));
}
}
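A note on the scanning pattern shared by skip_whitespace, take_identifier, and take_number: each 64-byte chunk is compared lane-wise and the mask is packed into a u64 via to_bitmask, with lane i landing in bit i. A mask of u64::MAX means the whole chunk matched and the scan advances a full chunk at once; otherwise (!mask).trailing_zeros() is the index of the first non-matching byte. The trailing scalar loop handles inputs, or tails, shorter than one chunk.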
2 changes: 1 addition & 1 deletion rust-toolchain.toml
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "stable"
+channel = "nightly"