@@ -84,7 +84,9 @@ def initialize(self, args):
84
84
trust_remote_code = True )
85
85
if isinstance (self .tokenizer , T5Tokenizer ):
86
86
self .tokenizer_bos_id = self .tokenizer .sp_model .bos_id ()
87
- self .tokenizer .pad_token = self .tokenizer .eos_token
87
+
88
+ if not self .tokenizer .pad_token :
89
+ self .tokenizer .pad_token = self .tokenizer .eos_token
88
90
89
91
self .tokenizer_end_id = self .tokenizer .encode (
90
92
self .tokenizer .eos_token , add_special_tokens = False )[0 ]
@@ -93,7 +95,8 @@ def initialize(self, args):
93
95
94
96
# Parse model output configs and convert Triton types to numpy types
95
97
output_names = [
96
- "INPUT_ID" , "REQUEST_INPUT_LEN" , "BAD_WORDS_IDS" , "STOP_WORDS_IDS" ,
98
+ "INPUT_ID" , "DECODER_INPUT_ID" , "REQUEST_INPUT_LEN" ,
99
+ "REQUEST_DECODER_INPUT_LEN" , "BAD_WORDS_IDS" , "STOP_WORDS_IDS" ,
97
100
"OUT_END_ID" , "OUT_PAD_ID"
98
101
]
99
102
input_names = ["EMBEDDING_BIAS_WORDS" , "EMBEDDING_BIAS_WEIGHTS" ]
@@ -142,6 +145,11 @@ def execute(self, requests):
142
145
# Get input tensors
143
146
query = pb_utils .get_input_tensor_by_name (request ,
144
147
'QUERY' ).as_numpy ()
148
+ decoder_query = pb_utils .get_input_tensor_by_name (
149
+ request , 'DECODER_QUERY' )
150
+ if decoder_query is not None :
151
+ decoder_query = decoder_query .as_numpy ()
152
+
145
153
batch_dim = query .shape [0 ]
146
154
if batch_dim != 1 :
147
155
@@ -194,6 +202,15 @@ def execute(self, requests):
194
202
195
203
# Preprocessing input data.
196
204
input_id , request_input_len = self ._create_request (query )
205
+ print (input_id )
206
+ print (request_input_len )
207
+ if decoder_query is not None :
208
+ decoder_input_id , request_decoder_input_len = self ._create_request (
209
+ decoder_query )
210
+ else :
211
+ decoder_input_id = pad_id * np .ones ((1 , 1 ), np .int32 )
212
+ request_decoder_input_len = 1 * np .ones ((1 , 1 ), np .int32 )
213
+
197
214
bad_words = self ._to_word_list_format (bad_words_dict )
198
215
stop_words = self ._to_word_list_format (stop_words_dict )
199
216
@@ -208,6 +225,13 @@ def execute(self, requests):
208
225
request_input_len_tensor = pb_utils .Tensor (
209
226
'REQUEST_INPUT_LEN' ,
210
227
request_input_len .astype (self .request_input_len_dtype ))
228
+ decoder_input_id_tensor = pb_utils .Tensor (
229
+ 'DECODER_INPUT_ID' ,
230
+ decoder_input_id .astype (self .decoder_input_id_dtype ))
231
+ request_decoder_input_len_tensor = pb_utils .Tensor (
232
+ 'REQUEST_DECODER_INPUT_LEN' ,
233
+ request_decoder_input_len .astype (
234
+ self .request_decoder_input_len_dtype ))
211
235
request_output_len_tensor = pb_utils .Tensor (
212
236
'REQUEST_OUTPUT_LEN' , request_output_len )
213
237
bad_words_ids_tensor = pb_utils .Tensor ('BAD_WORDS_IDS' , bad_words )
@@ -221,8 +245,9 @@ def execute(self, requests):
221
245
np .array (pad_id , dtype = np .int32 ))
222
246
223
247
inference_response = pb_utils .InferenceResponse (output_tensors = [
224
- input_id_tensor , bad_words_ids_tensor , stop_words_ids_tensor ,
225
- request_input_len_tensor , request_output_len_tensor ,
248
+ input_id_tensor , decoder_input_id_tensor , bad_words_ids_tensor ,
249
+ stop_words_ids_tensor , request_input_len_tensor ,
250
+ request_decoder_input_len_tensor , request_output_len_tensor ,
226
251
embedding_bias_tensor , end_id_tensor , pad_id_tensor
227
252
])
228
253
responses .append (inference_response )
0 commit comments