@@ -320,6 +320,7 @@ async def _wait_to_token_package(
         group_request_id = sampling_params.group_request_id
         unfinished_count = sampling_params.best_of
         is_first_token = True
+        sub_req_id_to_mtp_accepted_token_num: Dict[int, int] = {}

         client_mode: NodeRole = NodeRole(d_node.mode)
@@ -333,6 +334,7 @@ async def _wait_to_token_package(

             prompt_tokens = metadata["prompt_tokens"]
             out_token_counter += 1
+            sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0)
            if is_first_token:
                first_token_cost_ms = (time.time() - start_time) * 1000
                is_first_token = False
@@ -351,6 +353,9 @@ async def _wait_to_token_package(
                x_session_id = request.headers.get("X-Session-Id", "")
                prompt_cache_len = metadata.pop("prompt_cache_len", 0)
                prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                mtp_avg_token_per_step = out_token_counter / max(
+                    (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1
+                )
                format_start_time = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
                logger.info(
@@ -361,6 +366,7 @@ async def _wait_to_token_package(
                    f"prompt_token_num:{prompt_tokens} "
                    f"prompt_cache_len:{prompt_cache_len} "
                    f"prompt_cache_ratio:{prompt_cache_ratio} "
+                    f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
                )
                self.metric_client.histogram_observe("lightllm_request_inference_duration", total_cost_time_ms / 1000.0)
                self.metric_client.histogram_observe(
@@ -369,6 +375,7 @@ async def _wait_to_token_package(
                self.metric_client.histogram_observe("lightllm_request_first_token_duration", first_token_cost_ms / 1000.0)
                self.metric_client.histogram_observe("lightllm_request_generated_tokens", out_token_counter)
                self.metric_client.counter_inc("lightllm_request_success")
+                self.metric_client.histogram_observe("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step)
                return

    async def abort(
0 commit comments