@@ -12,13 +12,12 @@ Fast JSONL decoder using Cython for performance-critical operations.
1212This decoder uses native C string operations instead of regex for better performance.
1313"""
1414
15- from libc.string cimport memchr, strlen, strstr
15+ from libc.string cimport memchr, strlen, strstr, memcmp
1616from libc.stdlib cimport strtod, strtol, atoi
1717from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
1818from libc.stdint cimport int64_t
1919
2020import pyarrow
21- from opteryx.third_party.tktech import csimdjson as simdjson
2221
2322
2423cdef inline const char * find_key_value(const char * line, Py_ssize_t line_len, const char * key, Py_ssize_t key_len, Py_ssize_t* value_start, Py_ssize_t* value_len):
@@ -37,19 +36,23 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
3736 cdef char first_char
3837 cdef int brace_count
3938 cdef int bracket_count
39+ cdef Py_ssize_t remaining
4040
4141 # Search for the key pattern: "key":
4242 while pos < end:
4343 # Find opening quote of a key
44- key_pos = < const char * > memchr(pos, b' "' , end - pos)
44+ remaining = end - pos
45+ if remaining <= 0 :
46+ return NULL
47+ key_pos = < const char * > memchr(pos, b' "' , < size_t> remaining)
4548 if key_pos == NULL :
4649 return NULL
4750
4851 key_pos += 1 # Move past the opening quote
4952
5053 # Check if this matches our key
5154 if (end - key_pos >= key_len and
52- memcmp(key_pos, key, key_len) == 0 and
55+ memcmp(key_pos, key, < size_t > key_len) == 0 and
5356 key_pos[key_len] == b' "' ):
5457
5558 # Found the key, now find the colon
@@ -71,7 +74,13 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
7174 quote_start = value_pos + 1
7275 quote_end = quote_start
7376 while quote_end < end:
74- if quote_end[0 ] == b' "' and (quote_end == quote_start or quote_end[- 1 ] != b' \\ ' ):
77+ if quote_end[0 ] == b' "' :
78+ # Check if it's escaped (previous char is backslash)
79+ if quote_end > quote_start and quote_end[- 1 ] == b' \\ ' :
80+ # It's escaped, keep going
81+ quote_end += 1
82+ continue
83+ # Found unescaped quote
7584 value_len[0 ] = (quote_end + 1 ) - value_pos
7685 return value_pos
7786 quote_end += 1
@@ -128,7 +137,8 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
128137 # Number - find end (space, comma, brace, bracket)
129138 quote_end = value_pos + 1
130139 while quote_end < end:
131- if quote_end[0 ] in (b' ' , b' ,' , b' }' , b' ]' , b' \t ' , b' \n ' ):
140+ # Check for delimiter characters
141+ if quote_end[0 ] == b' ' or quote_end[0 ] == b' ,' or quote_end[0 ] == b' }' or quote_end[0 ] == b' ]' or quote_end[0 ] == b' \t ' or quote_end[0 ] == b' \n ' :
132142 break
133143 quote_end += 1
134144 value_len[0 ] = quote_end - value_pos
@@ -139,10 +149,6 @@ cdef inline const char* find_key_value(const char* line, Py_ssize_t line_len, co
139149 return NULL
140150
141151
142- cdef extern from " string.h" :
143- int memcmp(const void * s1, const void * s2, size_t n)
144-
145-
146152cpdef fast_jsonl_decode_columnar(bytes buffer , list column_names, dict column_types, Py_ssize_t sample_size = 100 ):
147153 """
148154 Fast JSONL decoder that extracts values using C string operations.
@@ -174,9 +180,9 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
174180 cdef dict result = {}
175181 cdef Py_ssize_t num_lines = 0
176182 cdef Py_ssize_t i
177- cdef char * end_ptr
178183 cdef bytes value_bytes
179184 cdef str value_str
185+ cdef Py_ssize_t remaining
180186
181187 # Initialize column data lists
182188 for col in column_names:
@@ -186,7 +192,10 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
186192 # Count lines first
187193 cdef const char * newline_pos = pos
188194 while newline_pos < end:
189- newline_pos = < const char * > memchr(newline_pos, b' \n ' , end - newline_pos)
195+ remaining = end - newline_pos
196+ if remaining <= 0 :
197+ break
198+ newline_pos = < const char * > memchr(newline_pos, b' \n ' , < size_t> remaining)
190199 if newline_pos == NULL :
191200 break
192201 num_lines += 1
@@ -201,7 +210,10 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
201210 for i in range (num_lines):
202211 # Find line end
203212 line_start = pos
204- line_end = < const char * > memchr(line_start, b' \n ' , end - line_start)
213+ remaining = end - line_start
214+ if remaining <= 0 :
215+ break
216+ line_end = < const char * > memchr(line_start, b' \n ' , < size_t> remaining)
205217 if line_end == NULL :
206218 line_end = end
207219
@@ -226,6 +238,10 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
226238 result[col].append(None )
227239 continue
228240
241+ # Create a safe bytes object from the C pointer
242+ # This is crucial to avoid segfaults when slicing
243+ value_bytes = PyBytes_FromStringAndSize(value_ptr, value_len)
244+
229245 # Parse value based on type
230246 if col_type == ' bool' :
231247 if value_len == 4 and memcmp(value_ptr, b" true" , 4 ) == 0 :
@@ -239,8 +255,6 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
239255 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
240256 result[col].append(None )
241257 else :
242- # Use strtol for integer parsing
243- value_bytes = value_ptr[:value_len]
244258 try :
245259 result[col].append(int (value_bytes))
246260 except ValueError :
@@ -250,8 +264,6 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
250264 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
251265 result[col].append(None )
252266 else :
253- # Use strtod for float parsing
254- value_bytes = value_ptr[:value_len]
255267 try :
256268 result[col].append(float (value_bytes))
257269 except ValueError :
@@ -260,11 +272,12 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
260272 elif col_type == ' str' :
261273 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
262274 result[col].append(None )
263- elif value_ptr[0 ] == b' "' :
275+ elif value_ptr[0 ] == b' "' and value_len >= 2 :
264276 # String value - extract without quotes
265- value_bytes = value_ptr[1 :value_len- 1 ]
277+ # Safely extract the string content
278+ string_content = PyBytes_FromStringAndSize(value_ptr + 1 , value_len - 2 )
266279 try :
267- value_str = value_bytes .decode(' utf-8' )
280+ value_str = string_content .decode(' utf-8' )
268281 # Simple unescape
269282 value_str = value_str.replace(' \\ n' , ' \n ' ).replace(' \\ t' , ' \t ' ).replace(' \\ "' , ' "' ).replace(' \\\\ ' , ' \\ ' )
270283 result[col].append(value_str)
@@ -275,7 +288,6 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
275288
276289 else :
277290 # For other types (list, dict, null), fall back to Python
278- value_bytes = value_ptr[:value_len]
279291 if value_len == 4 and memcmp(value_ptr, b" null" , 4 ) == 0 :
280292 result[col].append(None )
281293 else :
@@ -293,3 +305,9 @@ cpdef fast_jsonl_decode_columnar(bytes buffer, list column_names, dict column_ty
293305 pos = line_end + 1 if line_end < end else end
294306
295307 return (num_lines, len (column_names), result)
308+
309+
310+ # Declare the C function we need
311+ cdef extern from " Python.h" :
312+ bytes PyBytes_FromStringAndSize(const char * v, Py_ssize_t len )
313+
0 commit comments