@@ -105,6 +105,7 @@ def extract_speech_ids(speech_tokens_str):
             print(f"Unexpected token: {token_str}")
     return speech_ids
 
+
 def convert_cosy2_tokens_to_speech_id_str(cosy2_tokens):
     """Convert CosyVoice2 tokens to speech IDs string like <|s_23456|>"""
     speech_id_str = ""
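For orientation, here is a minimal sketch of the round trip these two helpers implement, inferred from the `<|s_23456|>` convention in the docstring and the `Unexpected token` fallback above, not copied from the file:

```python
# Sketch only: assumes the <|s_N|> convention shown in the docstring above.
def convert_cosy2_tokens_to_speech_id_str(cosy2_tokens):
    # [23456, 7] -> "<|s_23456|><|s_7|>"
    return "".join(f"<|s_{int(t)}|>" for t in cosy2_tokens)

def extract_speech_ids(speech_tokens_str):
    # ["<|s_23456|>", "foo"] -> [23456], with a warning on non-speech tokens
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith("<|s_") and token_str.endswith("|>"):
            speech_ids.append(int(token_str[4:-2]))
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

assert convert_cosy2_tokens_to_speech_id_str([23456, 7]) == "<|s_23456|><|s_7|>"
assert extract_speech_ids(["<|s_23456|>", "<|s_7|>"]) == [23456, 7]
```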
@@ -182,14 +183,13 @@ def get_args():
     return args
 
 
-
 def data_collator(batch, tokenizer, s3_tokenizer):
     """Simplified data collator for batch_size=1 processing"""
     target_sample_rate = 16000  # CosyVoice2 uses 16kHz for prompt audio
     device = s3_tokenizer.device if s3_tokenizer is not None else torch.device("cpu")
     input_ids_list, prompt_audio_list, prompt_text_list = [], [], []
     mels, prompt_audio_cosy2tokens_list = [], []
-    for i, item in enumerate(batch):
+    for item in batch:
         prompt_text, target_text = (
             item["prompt_text"],
             item["target_text"],
@@ -227,7 +227,7 @@ def data_collator(batch, tokenizer, s3_tokenizer):
         codes, codes_lens = s3_tokenizer.quantize(mels.to(device), mels_lens.to(device))
         for i in range(len(codes)):
             prompt_audio_cosy2tokens_list.append(codes[i, :codes_lens[i].item()])
-    for i, prompt_audio_cosy2tokens in enumerate(prompt_audio_cosy2tokens_list):
+    for prompt_audio_cosy2tokens in prompt_audio_cosy2tokens_list:
         prompt_audio_cosy2_id_str = convert_cosy2_tokens_to_speech_id_str(prompt_audio_cosy2tokens)
         # Create chat template for LLM generation
         chat = [
@@ -244,7 +244,6 @@ def data_collator(batch, tokenizer, s3_tokenizer):
         )
         input_ids_list.append(input_ids.squeeze(0))
 
-
     # For batch_size=1, no need to pad
     if len(input_ids_list) == 1:
         input_ids = input_ids_list[0].unsqueeze(0)
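The batch_size>1 branch (continued in the next hunk) pads every prompt to a common length before `torch.stack`. For context, a minimal sketch of left-padding for causal-LM generation; the helper name, the pad token, and the padding side are assumptions here, not taken from this file:

```python
import torch
import torch.nn.functional as F

def left_pad_and_stack(input_ids_list, pad_token_id):
    # Hypothetical helper: left-pad so generation starts flush at the
    # right edge of every row, which is what decoder-only models expect.
    max_len = max(ids.shape[0] for ids in input_ids_list)
    padded = [
        F.pad(ids, (max_len - ids.shape[0], 0), value=pad_token_id)  # pad on the left
        for ids in input_ids_list
    ]
    return torch.stack(padded)  # shape: (batch, max_len)
```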
@@ -256,7 +255,7 @@ def data_collator(batch, tokenizer, s3_tokenizer):
             for input_ids in input_ids_list
         ]
         input_ids = torch.stack(input_ids_list)
-
+
     ids = [item["id"] for item in batch]
 
     return {
@@ -287,7 +286,7 @@ def main():
     assert torch.cuda.is_available()
     world_size, local_rank, rank = init_distributed()
     device = torch.device(f"cuda:{local_rank}")
-
+
     # Load LLM model and tokenizer directly
     tokenizer = AutoTokenizer.from_pretrained(args.llm_model_name_or_path)
     model = AutoModelForCausalLM.from_pretrained(args.llm_model_name_or_path)
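`init_distributed` is defined elsewhere in this file; a typical implementation reads the environment variables that `torchrun` sets. A sketch under that assumption (the actual helper may differ):

```python
import os
import torch

def init_distributed():
    # Assumed behavior: one process per GPU, driven by torchrun's environment.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    rank = int(os.environ.get("RANK", "0"))
    if world_size > 1:
        torch.distributed.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)
    return world_size, local_rank, rank
```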
@@ -329,7 +328,7 @@ def main():
     for batch in dataloader:
         with torch.no_grad():
             input_ids = batch["input_ids"].to(device)
-
+
             # Generate speech tokens using LLM
             outputs = model.generate(
                 input_ids,
@@ -339,31 +338,31 @@ def main():
                 temperature=args.temperature,
                 top_k=args.top_k,
             )
-
+
             # Process each sample in the batch
             for i in range(len(batch["ids"])):
                 # Extract generated tokens (excluding input)
                 input_length = input_ids[i].shape[0]
                 generated_ids = outputs[i][input_length:-1]  # drop the trailing end-of-generation token
                 speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-
+
                 # Extract speech IDs from token strings like <|s_23456|>
                 speech_ids = extract_speech_ids(speech_tokens_str)
-
+
                 if len(speech_ids) == 0:
                     print(f"Warning: No speech tokens generated for sample {batch['ids'][i]}, skipping")
                     continue
-
+
                 # Convert to tensor for CosyVoice2
                 audio_tokens = torch.tensor(speech_ids, dtype=torch.long, device=device).unsqueeze(0)
-
+
                 if args.prompt_text is not None:
                     current_prompt_text = args.prompt_text
                     current_prompt_audio = prompt_speech_16k
                 else:
                     current_prompt_text = batch["prompt_text"][i]
                     current_prompt_audio = batch["prompt_audio_list"][i]
-
+
                 if current_prompt_audio is not None:
                     # Generate audio using CosyVoice2
                     audio_hat = audio_decode_cosyvoice2(
@@ -372,18 +371,17 @@ def main():
                         current_prompt_audio,
                         cosyvoice_codec,
                     )
-
+
                     # Convert to numpy and save
                     generated_wave = audio_hat.squeeze(0).cpu().numpy()
                     target_sample_rate = 24000
-
+
                     utt = batch["ids"][i]
                     sf.write(f"{args.output_dir}/{utt}.wav", generated_wave, target_sample_rate)
 
                     print(f"Generated audio for sample {utt} with {len(speech_ids)} tokens")
                 else:
                     print(f"Warning: No prompt audio available for sample {batch['ids'][i]}, skipping")
-
 
         if rank == 0:
             progress_bar.update(world_size * len(batch["ids"]))
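One subtlety worth noting: for decoder-only models, `model.generate` returns the prompt followed by the new tokens, so the loop above slices off the prompt (`input_length`) and drops the final end-of-generation token with `:-1`. A toy illustration of that slicing (stand-in tensors, not real model output):

```python
import torch

prompt = torch.tensor([101, 102, 103])              # stand-in chat-template prompt
output = torch.tensor([101, 102, 103, 7, 8, 9, 2])  # generate() echoes the prompt; 2 plays the EOS role
new_tokens = output[prompt.shape[0]:-1]             # skip the prompt, drop the trailing EOS
assert new_tokens.tolist() == [7, 8, 9]
```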