@@ -199,8 +199,6 @@ def forward_token2wav(self, prompt_speech_tokens: torch.Tensor, prompt_speech_fe
         Returns:
             Generated waveform tensor
         """
-        print(prompt_speech_tokens.shape, prompt_speech_feat.shape, prompt_spk_embedding.shape, target_speech_tokens.shape)
-        # Convert tensors to Triton format
         prompt_speech_tokens_tensor = pb_utils.Tensor.from_dlpack("prompt_speech_tokens", to_dlpack(prompt_speech_tokens))
         prompt_speech_feat_tensor = pb_utils.Tensor.from_dlpack("prompt_speech_feat", to_dlpack(prompt_speech_feat))
         prompt_spk_embedding_tensor = pb_utils.Tensor.from_dlpack("prompt_spk_embedding", to_dlpack(prompt_spk_embedding))
@@ -228,9 +226,7 @@ def parse_input(self, text, prompt_text, prompt_speech_tokens):
         prompt = self.prompt_template.format(input_text=total_text)
         input_ids = self.tokenizer.encode(prompt)
         input_ids = torch.tensor([input_ids], dtype=torch.int32)
-        print(input_ids.shape, "before cat")
         input_ids = torch.cat([input_ids, prompt_speech_tokens], dim=1)
-        print(input_ids.shape, "after cat", prompt_speech_tokens.shape)
         return input_ids
 
236232 def _extract_spk_embedding (self , speech ):
@@ -271,23 +267,15 @@ def execute(self, requests):
             prompt_speech_tokens = self.forward_audio_tokenizer(wav, wav_len)
             prompt_speech_tokens = prompt_speech_tokens.unsqueeze(0)
 
-            # TODO: FIX ME
+
             wav_tensor = wav.as_numpy()
-            print(wav_tensor.shape, "wav_tensor")
             wav_tensor = torch.from_numpy(wav_tensor)[:, :wav_len.as_numpy()[0][0]]
-            print(wav_tensor.shape, "wav_tensor after")
             prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=24000)(wav_tensor)
             speech_feat = self._extract_speech_feat(prompt_speech_resample)
-            print(speech_feat.shape, "speech_feat")
-            print(prompt_speech_tokens.shape, "prompt_speech_tokens here")
             token_len = min(int(speech_feat.shape[1] / 2), prompt_speech_tokens.shape[-1])
             prompt_speech_feat = speech_feat[:, :2 * token_len].contiguous().half()
             prompt_speech_tokens = prompt_speech_tokens[:, :token_len].contiguous()
-            print(prompt_speech_tokens.shape, "prompt_speech_tokens after")
-            print(speech_feat.shape, "speech_feat after")
-            print(token_len, "token_len")
 
-            # Extract text inputs
             reference_text = pb_utils.get_input_tensor_by_name(request, "reference_text").as_numpy()
             reference_text = reference_text[0][0].decode('utf-8')
 
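For reference, below is a minimal standalone sketch (not part of the repository) of the prompt alignment step performed in execute() above: the prompt mel features and the prompt speech tokens are truncated to a consistent length, assuming roughly two mel frames per speech token. The align_prompt helper name and the example shapes are illustrative.

import torch

def align_prompt(speech_feat: torch.Tensor, speech_tokens: torch.Tensor):
    # speech_feat: (1, T_feat, n_mels) mel features of the 24 kHz resampled prompt
    # speech_tokens: (1, T_tok) discrete speech tokens from the audio tokenizer
    token_len = min(int(speech_feat.shape[1] / 2), speech_tokens.shape[-1])
    prompt_speech_feat = speech_feat[:, :2 * token_len].contiguous().half()
    prompt_speech_tokens = speech_tokens[:, :token_len].contiguous()
    return prompt_speech_feat, prompt_speech_tokens

if __name__ == "__main__":
    feat = torch.randn(1, 101, 80)            # 101 mel frames, 80 mel bins (illustrative)
    tokens = torch.randint(0, 4096, (1, 60))  # 60 speech tokens (illustrative vocab size)
    f, t = align_prompt(feat, tokens)
    print(f.shape, t.shape)                   # torch.Size([1, 100, 80]) torch.Size([1, 50])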