I sent requests iteratively using the run_speculative_inference() function in the e2e_grpc_speculative_decoding_client.py file (the function actually executed is run_speculative_inference_with_defaults()).
Since my config has no "*_log_probs" or "*_logits" outputs, I removed them from the client code.
Streaming mode is not used (decoupled mode is false).
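For context, this is roughly how the requests were driven; a minimal sketch, assuming a hypothetical prompt list and that the script is importable as a module (neither is shown in the original):

from e2e_grpc_speculative_decoding_client import run_speculative_inference_with_defaults

# Hypothetical prompt list; the real evaluation data is not part of this issue.
prompts = ["What is the capital of France?", "Summarize the plot of Hamlet."]

for prompt in prompts:
    # Each call creates its own gRPC client(s) and runs one full speculative-decoding request.
    output = run_speculative_inference_with_defaults(
        prompt=prompt,
        url_target="localhost:8001",
        output_len=1000,
        num_draft_tokens=10)
    print(output)

The full client script is below.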
#!/usr/bin/python
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

import argparse
import queue
import sys

import grpc
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.grpc._utils import _get_inference_request
from tritonclient.utils import InferenceServerException, np_to_triton_dtype


def prepare_tensor(name, input):
    t = grpcclient.InferInput(name, input.shape,
                              np_to_triton_dtype(input.dtype))
    t.set_data_from_numpy(input)
    return t


class UserData:

    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)
        output = result.as_numpy('text_output')
        print(output, flush=True)


def get_preprocessor_inputs(prompt, output_len, bad_words, stop_words, end_id,
                            pad_id):
    input0 = [[prompt]]
    input0_data = np.array(input0).astype(object)
    output0_len = np.ones_like(input0).astype(np.int32) * output_len
    preprocessor_inputs = [
        prepare_tensor("QUERY", input0_data),
        prepare_tensor("REQUEST_OUTPUT_LEN", output0_len),
    ]
    if bad_words:
        bad_words_list = np.array([bad_words], dtype=object)
        preprocessor_inputs += [
            prepare_tensor("BAD_WORDS_DICT", bad_words_list)
        ]
    if stop_words:
        stop_words_list = np.array([stop_words], dtype=object)
        preprocessor_inputs += [
            prepare_tensor("STOP_WORDS_DICT", stop_words_list)
        ]
    if end_id:
        end_id_data = np.array([[end_id]], dtype=np.int32)
        preprocessor_inputs += [prepare_tensor("END_ID", end_id_data)]
    if pad_id:
        pad_id_data = np.array([[pad_id]], dtype=np.int32)
        preprocessor_inputs += [prepare_tensor("PAD_ID", pad_id_data)]
    return preprocessor_inputs


def extract_preprocessor_outputs(result):
    input_ids = np.squeeze(result.as_numpy("INPUT_ID").astype(np.int32),
                           axis=0)
    bad_words_ids = result.as_numpy("BAD_WORDS_IDS").astype(np.int32)
    stop_words_ids = result.as_numpy("STOP_WORDS_IDS").astype(np.int32)
    end_id = result.as_numpy("OUT_END_ID").astype(np.int32)[0][0]
    pad_id = result.as_numpy("OUT_PAD_ID").astype(np.int32)[0][0]
    return input_ids, bad_words_ids, stop_words_ids, end_id, pad_id


def get_trtllm_inputs(input_ids,
                      input_length,
                      request_output_len,
                      draft_tokens,
                      beam_width,
                      temperature,
                      repetition_penalty,
                      presence_penalty,
                      frequency_penalty,
                      bad_words_ids,
                      stop_words_ids,
                      end_id,
                      pad_id,
                      return_draft_model_draft_logits=False,
                      return_target_model_accepted_token_logits=False):
    # These two flags correspond to the settings of draft model and target model respectively,
    # and only one of them can be true at a time.
    assert not (return_draft_model_draft_logits
                and return_target_model_accepted_token_logits)

    # input_ids is expected to have shape [input_length]
    # Add batch dimension of 1
    input_ids_data = np.expand_dims(input_ids, axis=0)
    inputs = [
        prepare_tensor("input_ids", input_ids_data),
        prepare_tensor("input_lengths",
                       np.array([[input_length]], dtype=np.int32)),
        prepare_tensor("request_output_len",
                       np.array([[request_output_len]], dtype=np.int32)),
        prepare_tensor("bad_words_list", bad_words_ids),
        prepare_tensor("stop_words_list", stop_words_ids),
        prepare_tensor("beam_width", np.array([[beam_width]], dtype=np.int32)),
        prepare_tensor("temperature",
                       np.array([[temperature]], dtype=np.float32)),
    ]
    if draft_tokens is not None:
        draft_tokens_data = np.array([draft_tokens], dtype=np.int32)
        inputs.append(prepare_tensor("draft_input_ids", draft_tokens_data))
    if repetition_penalty is not None:
        repetition_penalty_data = np.array([[repetition_penalty]],
                                           dtype=np.float32)
        inputs.append(
            prepare_tensor("repetition_penalty", repetition_penalty_data))
    if presence_penalty is not None:
        presence_penalty_data = np.array([[presence_penalty]],
                                         dtype=np.float32)
        inputs.append(prepare_tensor("presence_penalty",
                                     presence_penalty_data))
    if frequency_penalty is not None:
        frequency_penalty_data = np.array([[frequency_penalty]],
                                          dtype=np.float32)
        inputs.append(
            prepare_tensor("frequency_penalty", frequency_penalty_data))
    if end_id is not None:
        end_id_data = np.array([[end_id]], dtype=np.int32)
        inputs.append(prepare_tensor("end_id", end_id_data))
    if pad_id is not None:
        pad_id_data = np.array([[pad_id]], dtype=np.int32)
        inputs.append(prepare_tensor("pad_id", pad_id_data))
    if return_draft_model_draft_logits:
        return_draft_model_draft_logits_data = np.array(
            [[return_draft_model_draft_logits]], dtype=bool)
        inputs.append(
            prepare_tensor("return_generation_logits",
                           return_draft_model_draft_logits_data))
    if return_target_model_accepted_token_logits:
        return_target_model_accepted_token_logits_data = np.array(
            [[return_target_model_accepted_token_logits]], dtype=bool)
        inputs.append(
            prepare_tensor("return_generation_logits",
                           return_target_model_accepted_token_logits_data))
    return inputs


def check_result(result, model_name):
    if type(result) == InferenceServerException:
        print(
            f"Received an error from server while calling {model_name}: {result}"
        )


def extract_trtllm_outputs(result):
    # Get batch 0, beam 0 output_ids
    output_ids = np.squeeze(result.as_numpy("output_ids").astype(np.int32),
                            axis=(0, 1))
    sequence_length_data = result.as_numpy("sequence_length").astype(np.int32)
    assert sequence_length_data.shape[0] == 1
    assert sequence_length_data.shape[1] == 1
    sequence_length = sequence_length_data[0, 0]
    # cum_log_probs = result.as_numpy("cum_log_probs").astype(np.float32)
    # output_log_probs = result.as_numpy("output_log_probs").astype(np.float32)
    # context_logits = result.as_numpy("context_logits").astype(np.float32)
    # generation_logits = result.as_numpy("generation_logits").astype(np.float32)
    return output_ids, sequence_length  # , cum_log_probs, output_log_probs, context_logits, generation_logits


def get_postprocessor_inputs(output_ids):  # , cum_log_probs, output_log_probs, context_logits, generation_logits
    output_ids_data = np.expand_dims(output_ids, axis=(0, 1))
    inputs = [
        prepare_tensor("TOKENS_BATCH", output_ids_data),
        prepare_tensor("SEQUENCE_LENGTH",
                       np.array([[len(output_ids)]], dtype=np.int32)),
        # prepare_tensor("CUM_LOG_PROBS", cum_log_probs),
        # prepare_tensor("OUTPUT_LOG_PROBS", output_log_probs),
        # prepare_tensor("CONTEXT_LOGITS", context_logits),
        # prepare_tensor("GENERATION_LOGITS", generation_logits)
    ]
    return inputs


def encountered_stop_words(input_ids, stop_words_ids):
    for stop_word_ids in stop_words_ids:
        if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids):
            return True
    return False


def run_speculative_inference(
        client_draft, client_target, prompt, output_len, in_num_draft_tokens,
        request_id, repetition_penalty, presence_penalty, frequency_penalty,
        temperature, stop_words, bad_words, end_id, pad_id, beam_width,
        preprocessor_model_name, draft_tensorrt_llm_model_name,
        target_tensorrt_llm_model_name, postprocessor_model_name,
        return_draft_model_draft_logits,
        return_target_model_accepted_token_logits, verbose):
    from datetime import datetime  ##
    start_time = datetime.now()  ##

    # Call the preprocessor
    preprocessor_inputs = get_preprocessor_inputs(prompt, output_len,
                                                  bad_words, stop_words,
                                                  end_id, pad_id)
    preprocessor_result = client_draft.infer(preprocessor_model_name,
                                             preprocessor_inputs,
                                             request_id=request_id)
    check_result(preprocessor_result, preprocessor_model_name)
    prompt_input_ids, bad_words_ids, stop_words_ids, end_id, pad_id = extract_preprocessor_outputs(
        preprocessor_result)

    input_ids = prompt_input_ids
    last_input_ids = None
    draft_output_ids = None

    while True:
        num_draft_tokens = min(
            in_num_draft_tokens,
            len(prompt_input_ids) + output_len - len(input_ids) - 1)

        if num_draft_tokens > 0:
            if verbose:
                print("Draft model input ids:")
                print(input_ids.tolist())

            # Generate up to num_draft_tokens with draft model
            draft_inputs = get_trtllm_inputs(
                input_ids,
                len(input_ids),
                num_draft_tokens,
                None,
                beam_width,
                temperature,
                repetition_penalty,
                presence_penalty,
                frequency_penalty,
                bad_words_ids,
                stop_words_ids,
                end_id,
                pad_id,
                return_draft_model_draft_logits=return_draft_model_draft_logits
            )

            draft_result = client_draft.infer(draft_tensorrt_llm_model_name,
                                              draft_inputs,
                                              request_id=request_id)
            check_result(draft_result, draft_tensorrt_llm_model_name)
            draft_output_ids, draft_seq_len = extract_trtllm_outputs(  # , cum_log_probs, output_log_probs, context_logits, generation_logits
                draft_result)

            if verbose:
                print("Draft model output ids:")
                print(draft_output_ids.tolist())
                print("draft_sequence_length")
                print(draft_seq_len)

            # Set the draft tokens and call the target model to generate up to num_draft_tokens + 1
            draft_tokens = draft_output_ids[len(input_ids):draft_seq_len]

            if verbose:
                print("draft_tokens")
                print(draft_tokens.tolist())

            if return_draft_model_draft_logits:
                draft_model_draft_token_logits = generation_logits.squeeze(
                    0)  # [beam_width, num_draft_tokens, vocab_size]
                print(
                    f"draft model draft tokens' logits: shape: {draft_model_draft_token_logits.shape}, value: {draft_model_draft_token_logits}"
                )

        if verbose:
            print("Target model input ids")
            print(input_ids.tolist())

        # Generate up to len(draft_tokens) + 1 with target model
        target_inputs = get_trtllm_inputs(
            input_ids,
            len(input_ids),
            len(draft_tokens) + 1 if num_draft_tokens > 0 else 1,
            draft_tokens if num_draft_tokens > 0 else None,
            beam_width,
            temperature,
            repetition_penalty,
            presence_penalty,
            frequency_penalty,
            bad_words_ids,
            stop_words_ids,
            end_id,
            pad_id,
            return_target_model_accepted_token_logits=return_target_model_accepted_token_logits)

        target_result = client_target.infer(target_tensorrt_llm_model_name,
                                            target_inputs,
                                            request_id=request_id)
        check_result(target_result, target_tensorrt_llm_model_name)
        target_output_ids, seq_length = extract_trtllm_outputs(  # , cum_log_probs, output_log_probs, context_logits, generation_logits
            target_result)

        if verbose:
            print("Target model output_ids")
            print(target_output_ids.tolist())
            print("target seq_length")
            print(seq_length)

        if return_target_model_accepted_token_logits:
            target_model_accept_token_logits = generation_logits.squeeze(
                0).squeeze(0)  # [num_accepted_tokens, vocab_size]
            print(
                f"target model accepted tokens' logits: shape: {target_model_accept_token_logits.shape}, value: {target_model_accept_token_logits}"
            )

        # Store the last iteration input_ids to check if EOS was encountered
        last_input_ids = input_ids

        # Update the input ids with new output_ids
        input_ids = target_output_ids

        # Evaluate criteria to stop generation loop.
        # If we've hit or exceeded the max output length, should stop
        length_stop = (len(input_ids) >= len(prompt_input_ids) + output_len)

        # If draft and target have same outputs, should stop. Normally target should return 1 more token.
        # If they are the same length, they should differ at the last token
        target_draft_equal = draft_output_ids is not None and np.array_equal(
            draft_output_ids, target_output_ids)

        # If tokens no longer change, should stop, means we have hit early stopping
        last_current_equal = np.array_equal(last_input_ids, input_ids)

        # Need to check if stop words were encountered
        hit_stop_words = encountered_stop_words(input_ids, stop_words_ids[0])

        if verbose:
            print("length_stop:", length_stop)
            print("target_draft_equal:", target_draft_equal)
            print("last_current_equal:", last_current_equal)
            print("hit_stop_words:", hit_stop_words)

        if (length_stop or target_draft_equal or last_current_equal
                or hit_stop_words):
            break

    # Call the postprocessor
    postprocessor_inputs = get_postprocessor_inputs(
        input_ids)  # , cum_log_probs, output_log_probs, context_logits, generation_logits
    postprocessor_result = client_target.infer(postprocessor_model_name,
                                               postprocessor_inputs,
                                               request_id=request_id)
    check_result(postprocessor_result, postprocessor_model_name)
    output = postprocessor_result.as_numpy("OUTPUT")
    # print(f"Output: {output[0].decode('utf-8')}")
    response_time = (datetime.now() - start_time).total_seconds()  ##
    print(f"Response Time: {response_time}")  ##
    return output


def run_speculative_inference_with_defaults(
        prompt: str,
        url_target: str = "localhost:8001",
        url_draft: str = None,
        output_len: int = 1000,
        num_draft_tokens: int = 10,
        beam_width: int = 1,
        temperature: float = 1.0,
        repetition_penalty: float = None,
        presence_penalty: float = None,
        frequency_penalty: float = None,
        stop_words: list = None,
        bad_words: list = None,
        end_id: int = 2,
        pad_id: int = 0,
        preprocessor_model_name: str = "preprocessing",
        draft_tensorrt_llm_model_name: str = "tensorrt_llm_draft",
        target_tensorrt_llm_model_name: str = "tensorrt_llm",
        postprocessor_model_name: str = "postprocessing",
        return_draft_model_draft_logits: bool = False,
        return_target_model_accepted_token_logits: bool = False,
        verbose: bool = False):
    # Ensure draft URL defaults to target URL if not provided
    if url_draft is None:
        url_draft = url_target

    # Create Triton clients for target and draft
    try:
        client_target = grpcclient.InferenceServerClient(url=url_target)
        client_draft = grpcclient.InferenceServerClient(
            url=url_draft) if url_target != url_draft else client_target
    except Exception as e:
        print(f"Failed to create Triton client: {e}")
        return None

    if beam_width > 1:
        raise Exception(
            'Beam width > 1 is not yet supported with speculative decoding')

    # Call the speculative inference function
    return run_speculative_inference(
        # Pass the client objects created above
        # (run_speculative_inference takes client_draft/client_target, not URLs)
        client_draft=client_draft,
        client_target=client_target,
        prompt=prompt,
        output_len=output_len,
        in_num_draft_tokens=num_draft_tokens,
        request_id="1",  # Default request ID
        repetition_penalty=repetition_penalty,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        temperature=temperature,
        stop_words=stop_words,
        bad_words=bad_words,
        end_id=end_id,
        pad_id=pad_id,
        beam_width=beam_width,
        preprocessor_model_name=preprocessor_model_name,
        draft_tensorrt_llm_model_name=draft_tensorrt_llm_model_name,
        target_tensorrt_llm_model_name=target_tensorrt_llm_model_name,
        postprocessor_model_name=postprocessor_model_name,
        return_draft_model_draft_logits=return_draft_model_draft_logits,
        return_target_model_accepted_token_logits=return_target_model_accepted_token_logits,
        verbose=verbose)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('--url-target',
                        type=str,
                        required=True,
                        help='Inference server URL for the target model')
    parser.add_argument('--url-draft',
                        type=str,
                        required=False,
                        help='Inference server URL for the draft model')
    parser.add_argument(
        '--preprocessor-model-name',
        type=str,
        required=False,
        default="preprocessing",
        help='Name of the preprocessor model (should be hosted at url-draft)')
    parser.add_argument(
        '--postprocessor-model-name',
        type=str,
        required=False,
        default="postprocessing",
        help='Name of the postprocessor model (should be hosted at url-target)'
    )
    parser.add_argument(
        '--draft-tensorrt-llm-model-name',
        type=str,
        required=False,
        default="tensorrt_llm",
        help='Name of the tensorrt_llm draft model (hosted at url-draft)')
    parser.add_argument(
        '--target-tensorrt-llm-model-name',
        type=str,
        required=False,
        default="tensorrt_llm",
        help='Name of the tensorrt_llm target model (hosted at url-target)')
    parser.add_argument('-p',
                        '--prompt',
                        type=str,
                        required=True,
                        help='Input prompt.')
    parser.add_argument(
        "-b",
        "--beam-width",
        required=False,
        type=int,
        default=1,
        help="Beam width value",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        required=False,
        default=1.0,
        help="temperature value",
    )
    parser.add_argument(
        "--repetition-penalty",
        type=float,
        required=False,
        default=None,
        help="The repetition penalty value",
    )
    parser.add_argument(
        "--presence-penalty",
        type=float,
        required=False,
        default=None,
        help="The presence penalty value",
    )
    parser.add_argument(
        "--frequency-penalty",
        type=float,
        required=False,
        default=None,
        help="The frequency penalty value",
    )
    parser.add_argument('-o',
                        '--output-len',
                        type=int,
                        default=1000,
                        required=False,
                        help='Specify output length')
    parser.add_argument(
        '--num-draft-tokens',
        type=int,
        default=5,
        required=False,
        help='Specify the number of speculative tokens for the draft model to generate per lookahead.'
    )
    parser.add_argument('--end-id',
                        type=int,
                        default=None,
                        required=False,
                        help='The end id token')
    parser.add_argument('--pad-id',
                        type=int,
                        default=None,
                        required=False,
                        help='The pad id token')
    parser.add_argument('--request-id',
                        type=str,
                        default='1',
                        required=False,
                        help='The request_id for the stop request')
    parser.add_argument('--stop-words',
                        nargs='+',
                        default=[],
                        help='The stop words')
    parser.add_argument('--bad-words',
                        nargs='+',
                        default=[],
                        help='The bad words')
    parser.add_argument(
        "--return-draft-model-draft-logits",
        action="store_true",
        required=False,
        default=False,
        help="Return the draft model's draft tokens' logits; requires enabling `gather_generation_logits` when building the engine"
    )
    parser.add_argument(
        "--return-target-model-accepted-token-logits",
        action="store_true",
        required=False,
        default=False,
        help="Return the target model's accepted tokens' logits; requires enabling `gather_generation_logits` when building the engine",
    )
    FLAGS = parser.parse_args()
    if not FLAGS.url_target:
        FLAGS.url_target = "localhost:8001"

    if not FLAGS.url_draft:
        FLAGS.url_draft = FLAGS.url_target

    try:
        client_target = grpcclient.InferenceServerClient(url=FLAGS.url_target)
        client_draft = grpcclient.InferenceServerClient(
            url=FLAGS.url_draft) if (
                FLAGS.url_target != FLAGS.url_draft) else client_target
    except Exception as e:
        print("client creation failed: " + str(e))
        sys.exit(1)

    if (FLAGS.beam_width > 1):
        raise Exception(
            'Beam width > 1 is not yet supported with speculative decoding')

    output_text = run_speculative_inference(
        client_draft, client_target, FLAGS.prompt, FLAGS.output_len,
        FLAGS.num_draft_tokens, FLAGS.request_id, FLAGS.repetition_penalty,
        FLAGS.presence_penalty, FLAGS.frequency_penalty, FLAGS.temperature,
        FLAGS.stop_words, FLAGS.bad_words, FLAGS.end_id, FLAGS.pad_id,
        FLAGS.beam_width, FLAGS.preprocessor_model_name,
        FLAGS.draft_tensorrt_llm_model_name,
        FLAGS.target_tensorrt_llm_model_name, FLAGS.postprocessor_model_name,
        FLAGS.return_draft_model_draft_logits,
        FLAGS.return_target_model_accepted_token_logits, FLAGS.verbose)

    # Print the final text
    print("Final text:\n", output_text)
System Info
Who can help?
@juney-nvidia @kaiyux
Information
Tasks
An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
Reproduction
Send requests iteratively using the run_speculative_inference() function in the e2e_grpc_speculative_decoding_client.py file (the function actually executed is run_speculative_inference_with_defaults()); an example invocation is shown below.
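A sample command line, built from the script's argparse options; the server URL, model names, and prompt are illustrative placeholders, not the exact values used:

python e2e_grpc_speculative_decoding_client.py \
    --url-target localhost:8001 \
    --url-draft localhost:8001 \
    --draft-tensorrt-llm-model-name tensorrt_llm_draft \
    --target-tensorrt-llm-model-name tensorrt_llm \
    --num-draft-tokens 5 \
    -p "What is the capital of France?" \
    --verbose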
"prompt" in "output" out
actual behavior
I get an assertion error with the message !mTokens.empty():
File "/AutoEval/e2e_grpc_speculative_decoding_client.py", line 367, in run_speculative_inference
raise RuntimeError(grpcclient.InferenceServerException(msg=response.error_message))
RuntimeError: [TensorRT-LLM][ERROR] Assertion failed: !mTokens.empty() (/workspace/tensorrt_llm/cpp/tensorrt_llm/executor/decodingConfig.cpp:31)
1 0x7e52d0b7bc64 tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 100
2 0x7e52d3006738 tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig(std::vector<int, std::allocator >, std::optional<tensorrt_llm::executor::Tensor>, std::optional const&, std::optional const&) + 712
3 0x7e544d685d72 triton::backend::inflight_batcher_llm::utils::getExternalDraftTokensConfigFromTensors(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, tensorrt_llm::batch_manager::NamedTensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, tensorrt_llm::batch_manager::NamedTensor> > > const&, bool) + 818
4 0x7e544d68764d triton::backend::inflight_batcher_llm::utils::createRequestsFromInputTensors(std::vector<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, tensorrt_llm::batch_manager::NamedTensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, tensorrt_llm::batch_manager::NamedTensor> > >, std::allocator<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, tensorrt_llm::batch_manager::NamedTensor, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, tensorrt_llm::batch_manager::NamedTensor> > > > > const&, bool, bool, bool, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::RequestType, bool, bool) + 2813
5 0x7e544d660490 triton::backend::inflight_batcher_llm::ModelInstanceState::createExecutorRequests(TRITONBACKEND_Request*, bool, bool, tensorrt_llm::executor::ModelType, bool, bool) + 144
6 0x7e544d66c5a2 triton::backend::inflight_batcher_llm::ModelInstanceState::enqueue(TRITONBACKEND_Request**, unsigned int) + 434
7 0x7e544d659bb5 TRITONBACKEND_ModelInstanceExecute + 101
8 0x7e545984b384 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1af384) [0x7e545984b384]
9 0x7e545984b6fb /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1af6fb) [0x7e545984b6fb]
10 0x7e545996d76d /opt/tritonserver/bin/../lib/libtritonserver.so(+0x2d176d) [0x7e545996d76d]
11 0x7e545984f384 /opt/tritonserver/bin/../lib/libtritonserver.so(+0x1b3384) [0x7e545984f384]
12 0x7e545afb7253 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xdc253) [0x7e545afb7253]
13 0x7e5458c6bac3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7e5458c6bac3]
14 0x7e5458cfca04 clone + 68
additional notes
I tried all of the scripts mentioned in the documentation.
It works fine for a single request, but the error usually occurs when multiple requests are sent.
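The stack trace points at the server-side ExternalDraftTokensConfig constructor. As a hypothetical client-side experiment (not a confirmed fix), one could avoid sending draft_input_ids at all whenever the draft model produced no new tokens, so the server never builds a draft-tokens config from an empty list. A minimal sketch of such a guard inside run_speculative_inference, reusing the variables and helpers from the script above:

        # Hypothetical guard: only attach draft tokens when the draft model
        # actually produced new tokens beyond the current input_ids.
        have_draft_tokens = (num_draft_tokens > 0 and draft_output_ids is not None
                             and draft_seq_len > len(input_ids))
        draft_tokens = (draft_output_ids[len(input_ids):draft_seq_len]
                        if have_draft_tokens else None)

        target_inputs = get_trtllm_inputs(
            input_ids,
            len(input_ids),
            len(draft_tokens) + 1 if have_draft_tokens else 1,
            draft_tokens if have_draft_tokens else None,  # None -> draft_input_ids tensor is not sent
            beam_width,
            temperature,
            repetition_penalty,
            presence_penalty,
            frequency_penalty,
            bad_words_ids,
            stop_words_ids,
            end_id,
            pad_id,
            return_target_model_accepted_token_logits=return_target_model_accepted_token_logits)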