@@ -84,7 +84,9 @@ def initialize(self, args):
8484 trust_remote_code = True )
8585 if isinstance (self .tokenizer , T5Tokenizer ):
8686 self .tokenizer_bos_id = self .tokenizer .sp_model .bos_id ()
87- self .tokenizer .pad_token = self .tokenizer .eos_token
87+
88+ if not self .tokenizer .pad_token :
89+ self .tokenizer .pad_token = self .tokenizer .eos_token
8890
8991 self .tokenizer_end_id = self .tokenizer .encode (
9092 self .tokenizer .eos_token , add_special_tokens = False )[0 ]
@@ -93,7 +95,8 @@ def initialize(self, args):
9395
9496 # Parse model output configs and convert Triton types to numpy types
9597 output_names = [
96- "INPUT_ID" , "REQUEST_INPUT_LEN" , "BAD_WORDS_IDS" , "STOP_WORDS_IDS" ,
98+ "INPUT_ID" , "DECODER_INPUT_ID" , "REQUEST_INPUT_LEN" ,
99+ "REQUEST_DECODER_INPUT_LEN" , "BAD_WORDS_IDS" , "STOP_WORDS_IDS" ,
97100 "OUT_END_ID" , "OUT_PAD_ID"
98101 ]
99102 input_names = ["EMBEDDING_BIAS_WORDS" , "EMBEDDING_BIAS_WEIGHTS" ]
@@ -142,6 +145,11 @@ def execute(self, requests):
142145 # Get input tensors
143146 query = pb_utils .get_input_tensor_by_name (request ,
144147 'QUERY' ).as_numpy ()
148+ decoder_query = pb_utils .get_input_tensor_by_name (
149+ request , 'DECODER_QUERY' )
150+ if decoder_query is not None :
151+ decoder_query = decoder_query .as_numpy ()
152+
145153 batch_dim = query .shape [0 ]
146154 if batch_dim != 1 :
147155
@@ -194,6 +202,15 @@ def execute(self, requests):
194202
195203 # Preprocessing input data.
196204 input_id , request_input_len = self ._create_request (query )
205+ print (input_id )
206+ print (request_input_len )
207+ if decoder_query is not None :
208+ decoder_input_id , request_decoder_input_len = self ._create_request (
209+ decoder_query )
210+ else :
211+ decoder_input_id = pad_id * np .ones ((1 , 1 ), np .int32 )
212+ request_decoder_input_len = 1 * np .ones ((1 , 1 ), np .int32 )
213+
197214 bad_words = self ._to_word_list_format (bad_words_dict )
198215 stop_words = self ._to_word_list_format (stop_words_dict )
199216
@@ -208,6 +225,13 @@ def execute(self, requests):
208225 request_input_len_tensor = pb_utils .Tensor (
209226 'REQUEST_INPUT_LEN' ,
210227 request_input_len .astype (self .request_input_len_dtype ))
228+ decoder_input_id_tensor = pb_utils .Tensor (
229+ 'DECODER_INPUT_ID' ,
230+ decoder_input_id .astype (self .decoder_input_id_dtype ))
231+ request_decoder_input_len_tensor = pb_utils .Tensor (
232+ 'REQUEST_DECODER_INPUT_LEN' ,
233+ request_decoder_input_len .astype (
234+ self .request_decoder_input_len_dtype ))
211235 request_output_len_tensor = pb_utils .Tensor (
212236 'REQUEST_OUTPUT_LEN' , request_output_len )
213237 bad_words_ids_tensor = pb_utils .Tensor ('BAD_WORDS_IDS' , bad_words )
@@ -221,8 +245,9 @@ def execute(self, requests):
221245 np .array (pad_id , dtype = np .int32 ))
222246
223247 inference_response = pb_utils .InferenceResponse (output_tensors = [
224- input_id_tensor , bad_words_ids_tensor , stop_words_ids_tensor ,
225- request_input_len_tensor , request_output_len_tensor ,
248+ input_id_tensor , decoder_input_id_tensor , bad_words_ids_tensor ,
249+ stop_words_ids_tensor , request_input_len_tensor ,
250+ request_decoder_input_len_tensor , request_output_len_tensor ,
226251 embedding_bias_tensor , end_id_tensor , pad_id_tensor
227252 ])
228253 responses .append (inference_response )