/*!
 * High-performance JSON string escaping using V8-style SIMD optimizations for aarch64.
 *
 * This implementation incorporates several optimizations inspired by V8's JSON.stringify:
 *
 * 1. **Bit-based Character Classification**: Uses SIMD compare/bit operations for faster
 *    character escape detection instead of table lookups.
 *
 * 2. **Vectorized Processing**: Processes 64 bytes at a time using four 16-byte NEON vectors.
 *
 * 3. **ASCII Fast Path**: A specialized path for clean text that needs no escaping.
 *
 * 4. **Advanced Prefetching**: Dual prefetch instructions to hide memory latency.
 *
 * 5. **Optimized String Building**: Capacity estimation that reduces reallocations.
 *
 * 6. **Reduced Branching**: Minimized conditional branches in hot paths for better
 *    branch prediction.
 */
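//!
//! A minimal usage sketch (the doctest is `ignore`d because this module only
//! builds on aarch64):
//!
//! ```ignore
//! let escaped = encode_str("say \"hi\"\n");
//! assert_eq!(escaped, "\"say \\\"hi\\\"\\n\"");
//! ```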

use std::arch::aarch64::{
    uint8x16_t, vceqq_u8, vcltq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vst1q_u8,
};

use crate::{encode_str_inner, REVERSE_SOLIDUS};

/// Four contiguous 16-byte NEON registers (64 B) per loop.
const CHUNK: usize = 64;
/// Distance (in bytes) to prefetch ahead. Must be a multiple of 8 for PRFM.
/// V8-style optimization: keeping six iterations (6 × CHUNK = 384 B) ahead
/// hides more memory latency without evicting too many useful cache lines.
const PREFETCH_DISTANCE: usize = CHUNK * 6;

/// V8-style optimization: constants for efficient character classification.
/// Characters that need escaping: 0x00-0x1F (control), 0x22 (quote), 0x5C (backslash).
const ESCAPE_MASK_LOW: u8 = 0x20; // Threshold: characters below 0x20 need escaping
const QUOTE_CHAR: u8 = 0x22; // Double quote
const BACKSLASH_CHAR: u8 = 0x5C; // Backslash

/// V8-style optimization: fast character classification using SIMD compares.
/// Returns a lane mask where 0xFF means the byte needs escaping and 0x00 means
/// it can be copied through unchanged.
#[inline(always)]
unsafe fn classify_chars_v8_style(chars: uint8x16_t) -> uint8x16_t {
    // Check for control characters (< 0x20)
    let control_mask = vcltq_u8(chars, vdupq_n_u8(ESCAPE_MASK_LOW));

    // Check for the quote character (0x22)
    let quote_mask = vceqq_u8(chars, vdupq_n_u8(QUOTE_CHAR));

    // Check for the backslash character (0x5C)
    let backslash_mask = vceqq_u8(chars, vdupq_n_u8(BACKSLASH_CHAR));

    // Combine all masks: any byte matching any condition needs escaping.
    vorrq_u8(vorrq_u8(control_mask, quote_mask), backslash_mask)
}
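
// A scalar reference for the SIMD classification above; an illustrative sketch
// kept off the hot path (hence `dead_code`), useful for spot-checking lanes.
#[allow(dead_code)]
#[inline]
fn needs_escape_scalar(c: u8) -> bool {
    c < ESCAPE_MASK_LOW || c == QUOTE_CHAR || c == BACKSLASH_CHAR
}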

/// V8-style optimization: process escape sequences for one 16-byte vector,
/// driven by the precomputed escape mask so each byte costs a single branch.
#[inline(always)]
unsafe fn process_escape_vector(chars: uint8x16_t, mask: uint8x16_t, dst: &mut Vec<u8>) {
    // Spill both SIMD registers to stack arrays for byte-wise processing.
    let mut char_array = [0u8; 16];
    let mut mask_array = [0u8; 16];

    vst1q_u8(char_array.as_mut_ptr(), chars);
    vst1q_u8(mask_array.as_mut_ptr(), mask);

    // V8-style optimization: process all 16 bytes with reduced branching.
    for i in 0..16 {
        let c = char_array[i];
        if mask_array[i] == 0 {
            // Fast path: no escaping needed.
            dst.push(c);
        } else {
            // Escape needed: use the optimized escape generation below.
            write_escape_optimized(dst, c);
        }
    }
}
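// For example, a lane holding b'"' with mask 0xFF is rewritten as the two
// bytes `\"`, while a lane holding b'a' with mask 0x00 is pushed unchanged.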

/// V8-style optimization: optimized escape sequence generation. Short escapes
/// are emitted as two-byte slices; other control characters as `\u00XX`.
#[inline(always)]
fn write_escape_optimized(dst: &mut Vec<u8>, c: u8) {
    match c {
        b'"' => dst.extend_from_slice(b"\\\""),
        b'\\' => dst.extend_from_slice(REVERSE_SOLIDUS),
        b'\x08' => dst.extend_from_slice(b"\\b"),
        b'\x09' => dst.extend_from_slice(b"\\t"),
        b'\x0A' => dst.extend_from_slice(b"\\n"),
        b'\x0C' => dst.extend_from_slice(b"\\f"),
        b'\x0D' => dst.extend_from_slice(b"\\r"),
        _ => {
            // Other control character: emit `\u00XX` with lowercase hex digits.
            dst.extend_from_slice(b"\\u00");
            dst.push(b'0' + (c >> 4));
            let lo = c & 0xF;
            dst.push(if lo < 10 { b'0' + lo } else { b'a' + lo - 10 });
        }
    }
}
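// For example, `write_escape_optimized(&mut out, 0x1F)` appends the six bytes
// `\u001f`, and `0x09` appends the two bytes `\t`.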

/// V8-style optimization: fast-path detection for whole chunks.
/// Returns true if none of the 64 bytes needs escaping. Non-ASCII UTF-8 bytes
/// (>= 0x80) also pass, since JSON only requires escaping control characters,
/// the quote, and the backslash.
#[inline(always)]
unsafe fn is_ascii_clean_chunk(ptr: *const u8) -> bool {
    let quad = vld1q_u8_x4(ptr);

    // Classify all 64 bytes.
    let escape_mask_1 = classify_chars_v8_style(quad.0);
    let escape_mask_2 = classify_chars_v8_style(quad.1);
    let escape_mask_3 = classify_chars_v8_style(quad.2);
    let escape_mask_4 = classify_chars_v8_style(quad.3);

    // Horizontal max over the OR of all masks: zero iff nothing needs escaping.
    let combined_escape = vmaxvq_u8(vorrq_u8(
        vorrq_u8(escape_mask_1, escape_mask_2),
        vorrq_u8(escape_mask_3, escape_mask_4),
    ));

    combined_escape == 0
}
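// Design note: when this probe returns false, the caller re-loads and
// re-classifies the same chunk, so the duplicated work is paid only on
// chunks that actually contain escapes.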

pub fn encode_str<S: AsRef<str>>(input: S) -> String {
    let s = input.as_ref();
    let bytes = s.as_bytes();
    let n = bytes.len();

    // V8-style optimization: better capacity estimation based on input length.
    let initial_capacity = if n < 1024 {
        // For small strings, stay conservative to avoid over-allocation.
        n + 32
    } else {
        // For larger strings, assume some escaping will be needed.
        n + n / 8 + 64
    };
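    // E.g. a 4 KiB input reserves 4096 + 512 + 64 = 4672 bytes up front.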

    let mut out = Vec::with_capacity(initial_capacity);
    out.push(b'"');

    unsafe {
        let mut i = 0;

        // V8-style optimization: try to move over large clean chunks quickly.
        while i + CHUNK <= n {
            let ptr = bytes.as_ptr().add(i);

            // V8-style optimization: first check whether the entire chunk is
            // clean; if so, copy all 64 bytes in one shot.
            if is_ascii_clean_chunk(ptr) {
                out.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK));
                i += CHUNK;
                continue;
            }

            /* ---- V8-style prefetch: two cache lines ahead ---- */
            core::arch::asm!(
                "prfm pldl1keep, [{0}, #{1}]",
                "prfm pldl1keep, [{0}, #{2}]",
                in(reg) ptr,
                const PREFETCH_DISTANCE,
                const PREFETCH_DISTANCE + 64,
            );
            /* -------------------------------------------------- */
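            // Both immediates (384 and 448) are multiples of 8, as the PRFM
            // addressing mode requires.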

            // Load 64 B (four q-regs).
            let quad = vld1q_u8_x4(ptr);

            let a = quad.0;
            let b = quad.1;
            let c = quad.2;
            let d = quad.3;

            // V8-style optimization: bit-based character classification.
            let mask_1 = classify_chars_v8_style(a);
            let mask_2 = classify_chars_v8_style(b);
            let mask_3 = classify_chars_v8_style(c);
            let mask_4 = classify_chars_v8_style(d);

            let mask_r_1 = vmaxvq_u8(mask_1);
            let mask_r_2 = vmaxvq_u8(mask_2);
            let mask_r_3 = vmaxvq_u8(mask_3);
            let mask_r_4 = vmaxvq_u8(mask_4);

            // V8-style optimization: handle each 16-byte vector with a single
            // clean/dirty branch.
            if mask_r_1 == 0 {
                out.extend_from_slice(std::slice::from_raw_parts(ptr, 16));
            } else {
                process_escape_vector(a, mask_1, &mut out);
            }

            if mask_r_2 == 0 {
                out.extend_from_slice(std::slice::from_raw_parts(ptr.add(16), 16));
            } else {
                process_escape_vector(b, mask_2, &mut out);
            }

            if mask_r_3 == 0 {
                out.extend_from_slice(std::slice::from_raw_parts(ptr.add(32), 16));
            } else {
                process_escape_vector(c, mask_3, &mut out);
            }

            if mask_r_4 == 0 {
                out.extend_from_slice(std::slice::from_raw_parts(ptr.add(48), 16));
            } else {
                process_escape_vector(d, mask_4, &mut out);
            }

            i += CHUNK;
        }

        // Handle any remaining tail bytes with the scalar fallback.
        if i < n {
            encode_str_inner(&bytes[i..], &mut out);
        }
    }

    out.push(b'"');

    // SAFETY: we only emit valid UTF-8.
    unsafe { String::from_utf8_unchecked(out) }
}
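
// Illustrative checks (a sketch, not an exhaustive suite). They assume the
// crate's scalar fallback `encode_str_inner` escapes the same character set
// as the SIMD path; inputs of 64+ bytes exercise the NEON loop.
#[cfg(test)]
mod tests {
    use super::encode_str;

    #[test]
    fn clean_ascii_is_copied_verbatim() {
        let s = "x".repeat(100); // first 64 B take the SIMD fast path
        assert_eq!(encode_str(&s), format!("\"{}\"", s));
    }

    #[test]
    fn quote_inside_simd_chunk_is_escaped() {
        let s = format!("{}\"{}", "a".repeat(40), "b".repeat(40));
        let expected = format!("\"{}\\\"{}\"", "a".repeat(40), "b".repeat(40));
        assert_eq!(encode_str(&s), expected);
    }

    #[test]
    fn control_byte_becomes_unicode_escape() {
        let s = format!("{}\u{1}{}", "a".repeat(32), "b".repeat(32));
        let expected = format!("\"{}\\u0001{}\"", "a".repeat(32), "b".repeat(32));
        assert_eq!(encode_str(&s), expected);
    }
}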