@@ -146,10 +146,6 @@ def ord_char(c: int) -> int:
146146 return c
147147
148148
149- def join_bytes (b : bytes ) -> bytes :
150- return bytes (list (b ))
151-
152-
153149def parse_options_header (value : str | bytes ) -> tuple [bytes , dict [bytes , bytes ]]:
154150 """Parses a Content-Type header into a value in the following format: (content_type, {parameters})."""
155151 # Uses email.message.Message to parse the header as described in PEP 594.
@@ -976,29 +972,11 @@ def __init__(
976972 # Setup marks. These are used to track the state of data received.
977973 self .marks : dict [str , int ] = {}
978974
979- # TODO: Actually use this rather than the dumb version we currently use
980- # # Precompute the skip table for the Boyer-Moore-Horspool algorithm.
981- # skip = [len(boundary) for x in range(256)]
982- # for i in range(len(boundary) - 1):
983- # skip[ord_char(boundary[i])] = len(boundary) - i - 1
984- #
985- # # We use a tuple since it's a constant, and marginally faster.
986- # self.skip = tuple(skip)
987-
988975 # Save our boundary.
989976 if isinstance (boundary , str ): # pragma: no cover
990977 boundary = boundary .encode ("latin-1" )
991978 self .boundary = b"\r \n --" + boundary
992979
993- # Get a set of characters that belong to our boundary.
994- self .boundary_chars = frozenset (self .boundary )
995-
996- # We also create a lookbehind list.
997- # Note: the +8 is since we can have, at maximum, "\r\n--" + boundary +
998- # "--\r\n" at the final boundary, and the length of '\r\n--' and
999- # '--\r\n' is 8 bytes.
1000- self .lookbehind = [NULL for _ in range (len (boundary ) + 8 )]
1001-
1002980 def write (self , data : bytes ) -> int :
1003981 """Write some data to the parser, which will perform size verification,
1004982 and then parse the data into the appropriate location (e.g. header,
@@ -1061,21 +1039,43 @@ def delete_mark(name: str, reset: bool = False) -> None:
10611039 # end of the buffer, and reset the mark, instead of deleting it. This
10621040 # is used at the end of the function to call our callbacks with any
10631041 # remaining data in this chunk.
1064- def data_callback (name : str , remaining : bool = False ) -> None :
1042+ def data_callback (name : str , end_i : int , remaining : bool = False ) -> None :
10651043 marked_index = self .marks .get (name )
10661044 if marked_index is None :
10671045 return
10681046
1069- # If we're getting remaining data, we ignore the current i value
1070- # and just call with the remaining data.
1071- if remaining :
1072- self .callback (name , data , marked_index , length )
1073- self .marks [name ] = 0
1074-
10751047 # Call the callback with the data from the mark up to end_i, if
10761048 # there is any.
1049+ if end_i <= marked_index :
1050+ # There is no additional data to send.
1051+ pass
1052+ elif marked_index >= 0 :
1053+ # We are emitting data from the local buffer.
1054+ self .callback (name , data , marked_index , end_i )
1055+ else :
1056+ # Some of the data comes from a partial boundary match
1057+ # and requires look-behind.
1058+ # We need to use self.flags (and not flags) because we care about
1059+ # the state when we entered the loop.
1060+ lookbehind_len = - marked_index
1061+ if lookbehind_len <= len (boundary ):
1062+ self .callback (name , boundary , 0 , lookbehind_len )
1063+ elif self .flags & FLAG_PART_BOUNDARY :
1064+ lookback = boundary + b"\r \n "
1065+ self .callback (name , lookback , 0 , lookbehind_len )
1066+ elif self .flags & FLAG_LAST_BOUNDARY :
1067+ lookback = boundary + b"--\r \n "
1068+ self .callback (name , lookback , 0 , lookbehind_len )
1069+ else : # pragma: no cover (error case)
1070+ self .logger .warning ("Look-back buffer error" )
1071+
1072+ if end_i > 0 :
1073+ self .callback (name , data , 0 , end_i )
1074+ # If we're getting remaining data, we have got all the data we
1075+ # can be certain is not a boundary, leaving only a partial boundary match.
1076+ if remaining :
1077+ self .marks [name ] = end_i - length
10771078 else :
1078- self .callback (name , data , marked_index , i )
10791079 self .marks .pop (name , None )
10801080
10811081 # For each byte...
@@ -1183,7 +1183,7 @@ def data_callback(name: str, remaining: bool = False) -> None:
11831183 raise e
11841184
11851185 # Call our callback with the header field.
1186- data_callback ("header_field" )
1186+ data_callback ("header_field" , i )
11871187
11881188 # Move to parsing the header value.
11891189 state = MultipartState .HEADER_VALUE_START
@@ -1212,7 +1212,7 @@ def data_callback(name: str, remaining: bool = False) -> None:
12121212 # If we've got a CR, we're nearly done our headers. Otherwise,
12131213 # we do nothing and just move past this character.
12141214 if c == CR :
1215- data_callback ("header_value" )
1215+ data_callback ("header_value" , i )
12161216 self .callback ("header_end" )
12171217 state = MultipartState .HEADER_VALUE_ALMOST_DONE
12181218
@@ -1256,46 +1256,46 @@ def data_callback(name: str, remaining: bool = False) -> None:
12561256 # We're processing our part data right now. During this, we
12571257 # need to efficiently search for our boundary, since any data
12581258 # on any number of lines can be a part of the current data.
1259- # We use the Boyer-Moore-Horspool algorithm to efficiently
1260- # search through the remainder of the buffer looking for our
1261- # boundary.
12621259
12631260 # Save the current value of our index. We use this in case we
12641261 # find part of a boundary, but it doesn't match fully.
12651262 prev_index = index
12661263
12671264 # Set up variables.
12681265 boundary_length = len (boundary )
1269- boundary_end = boundary_length - 1
12701266 data_length = length
1271- boundary_chars = self .boundary_chars
12721267
12731268 # If our index is 0, we're starting a new part, so start our
12741269 # search.
12751270 if index == 0 :
1276- # Search forward until we either hit the end of our buffer,
1277- # or reach a character that's in our boundary.
1278- i += boundary_end
1279- while i < data_length - 1 and data [i ] not in boundary_chars :
1280- i += boundary_length
1281-
1282- # Reset i back the length of our boundary, which is the
1283- # earliest possible location that could be our match (i.e.
1284- # if we've just broken out of our loop since we saw the
1285- # last character in our boundary)
1286- i -= boundary_end
1271+ # The most common case is likely to be that the whole
1272+ # boundary is present in the buffer.
1273+ # Calling `find` is much faster than iterating here.
1274+ i0 = data .find (boundary , i , data_length )
1275+ if i0 >= 0 :
1276+ # We matched the whole boundary string.
1277+ index = boundary_length - 1
1278+ i = i0 + boundary_length - 1
1279+ else :
1280+ # No match found for whole string.
1281+ # There may be a partial boundary at the end of the
1282+ # data, which the find will not match.
1283+ # Since the length that needs to be searched is limited to
1284+ # the boundary length, just perform a naive search.
1285+ i = max (i , data_length - boundary_length )
1286+
1287+ # Search forward until we either hit the end of our buffer,
1288+ # or reach a potential start of the boundary.
1289+ while i < data_length - 1 and data [i ] != boundary [0 ]:
1290+ i += 1
1291+
12871292 c = data [i ]
12881293
12891294 # Now, we have a couple of cases here. If our index is before
12901295 # the end of the boundary...
12911296 if index < boundary_length :
12921297 # If the character matches...
12931298 if boundary [index ] == c :
1294- # If we found a match for our boundary, we send the
1295- # existing data.
1296- if index == 0 :
1297- data_callback ("part_data" )
1298-
12991299 # The current character matches, so continue!
13001300 index += 1
13011301 else :
@@ -1332,6 +1332,8 @@ def data_callback(name: str, remaining: bool = False) -> None:
13321332 # Unset the part boundary flag.
13331333 flags &= ~ FLAG_PART_BOUNDARY
13341334
1335+ # We have identified a boundary, callback for any data before it.
1336+ data_callback ("part_data" , i - index )
13351337 # Callback indicating that we've reached the end of
13361338 # a part, and are starting a new one.
13371339 self .callback ("part_end" )
@@ -1353,6 +1355,8 @@ def data_callback(name: str, remaining: bool = False) -> None:
13531355 elif flags & FLAG_LAST_BOUNDARY :
13541356 # We need a second hyphen here.
13551357 if c == HYPHEN :
1358+ # We have identified a boundary, callback for any data before it.
1359+ data_callback ("part_data" , i - index )
13561360 # Callback to end the current part, and then the
13571361 # message.
13581362 self .callback ("part_end" )
@@ -1362,26 +1366,14 @@ def data_callback(name: str, remaining: bool = False) -> None:
13621366 # No match, so reset index.
13631367 index = 0
13641368
1365- # If we have an index, we need to keep this byte for later, in
1366- # case we can't match the full boundary.
1367- if index > 0 :
1368- self .lookbehind [index - 1 ] = c
1369-
13701369 # Otherwise, our index is 0. If the previous index is not, it
13711370 # means we reset something, and we need to take the data we
13721371 # thought was part of our boundary and send it along as actual
13731372 # data.
1374- elif prev_index > 0 :
1375- # Callback to write the saved data.
1376- lb_data = join_bytes (self .lookbehind )
1377- self .callback ("part_data" , lb_data , 0 , prev_index )
1378-
1373+ if index == 0 and prev_index > 0 :
13791374 # Overwrite our previous index.
13801375 prev_index = 0
13811376
1382- # Re-set our mark for part data.
1383- set_mark ("part_data" )
1384-
13851377 # Re-consider the current character, since this could be
13861378 # the start of the boundary itself.
13871379 i -= 1
@@ -1410,9 +1402,9 @@ def data_callback(name: str, remaining: bool = False) -> None:
14101402 # that we haven't yet reached the end of this 'thing'. So, by resetting the
14111403 # mark to 0 (or to a negative offset while a partial boundary match is held
14121404 # back), data callbacks in future calls start from the beginning of that buffer.
1413- data_callback ("header_field" , True )
1414- data_callback ("header_value" , True )
1415- data_callback ("part_data" , True )
1405+ data_callback ("header_field" , length , True )
1406+ data_callback ("header_value" , length , True )
1407+ data_callback ("part_data" , length - index , True )
14161408
14171409 # Save values to locals.
14181410 self .state = state
0 commit comments