Skip to content

Commit 39ed399

Browse files
Feat/train tool agent (#17)
* train tool agent * Feat/add search scrape tool (#14) * migrate search, scrape tool into verl and add code to spin up retrieval server * add debug code to train tool agent * Update async_sglang_server.py --------- Co-authored-by: nguyenhoangthuan99 <35255081+nguyenhoangthuan99@users.noreply.github.com> * add data code (#15) * add data code * push reward code * add system prompt and user prompt * fix prompt * fix prompt * update to train 30b model * fix training bug * add reward remove repeate tool call and bulk tool calls --------- Co-authored-by: bachvudinh <bachvudinh02@gmail.com> Co-authored-by: bachvudinh <89349141+bachvudinh@users.noreply.github.com>
1 parent 63f2ffb commit 39ed399

File tree

5 files changed

+78
-9
lines changed

5 files changed

+78
-9
lines changed

examples/vllm_multiturn/config/search_multiturn_grpo.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ actor_rollout_ref:
1919
name: sglang
2020
multi_turn:
2121
enable: True
22-
max_assistant_turns: 100
22+
max_assistant_turns: 50
2323
format: hermes
24+
max_parallel_calls: 10
25+
max_tool_response_length: 16384
26+
2427

verl/trainer/config/rollout/rollout.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,10 @@ multi_turn:
160160
max_user_turns: null
161161

162162
# max parallel call for tools in single turn
163-
max_parallel_calls: 1
163+
max_parallel_calls: 10
164164

165165
# max length of tool response
166-
max_tool_response_length: 256
166+
max_tool_response_length: 16384
167167

168168
# truncate side of tool response: left, middle, right
169169
tool_response_truncate_side: middle

verl/utils/dataset/rl_dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,19 @@
4646
- Consider what information is needed to provide a complete answer
4747
4848
2. Mandatory Logical Analysis (Say It Out Loud):
49-
- Before engaging tools, you must articulate your complete thought process in natural language. You must act as a "professional tool caller," demonstrating extreme logic.
49+
- Before engaging tools, you must articulate your complete thought process in natural language. You must act as a "professional tool caller," demonstrating extreme logic and NEVER repeat your thought process across tool calls.
5050
- Analyze the Information Gap: Explicitly state what data is missing.
5151
- Derive the Strategy: Explain why a specific tool is the logical next step.
5252
- Justify Parameters: Explain why you chose those specific search keywords or that specific URL.
5353
54-
3. When you need to search for information, call the "web_search" tool using this exact XML format:
54+
3. Never give an unclear answer immediately right after searching; try scraping the URL to get more information.
55+
56+
4. When you need to search for information, call the "web_search" tool using this exact XML format:
5557
<tool_call>
5658
{{"name": "web_search", "arguments": {{"query": "your search query here"}}}}
5759
</tool_call>
5860
59-
4. If search results show promising URLs/documents but you need more detailed information, use the "scrape" tool:
61+
5. If search results show promising URLs/documents but you need more detailed information, use the "scrape" tool:
6062
<tool_call>
6163
{{"name": "scrape", "arguments": {{"url": "doc_1 or specific URL from search results"}}}}
6264
</tool_call>

verl/utils/reward_score/jan_v2_reward/format_reward.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,18 @@
1-
import re
1+
import re, json
22
from scipy.stats import skewnorm
3+
import Levenshtein as levenshtein
34

5+
def count_tool_call_turns(text):
    """Count assistant turns whose first action is a tool call.

    Looks for the literal role marker ``assistant`` followed by whitespace
    and an opening ``<tool_call>`` tag in the rendered chat transcript.

    Args:
        text: Full rendered conversation string.

    Returns:
        Number of non-overlapping ``assistant ... <tool_call>`` occurrences.
    """
    # \s+ tolerates any run of spaces/tabs/newlines between the role
    # marker and the opening tag.
    turn_pattern = r'assistant\s+<tool_call>'
    return len(re.findall(turn_pattern, text))
416

517
def calculate_skewed_penalty(x, center=70, skewness=5, scale=200):
618
"""
@@ -39,6 +51,18 @@ def parse_assistant_thoughts(text):
3951
return [match.strip() for match in matches if match.strip()]
4052

4153

54+
def parse_all_tool_calls(solution_str):
    """Extract and JSON-decode every ``<tool_call>...</tool_call>`` payload.

    Args:
        solution_str: Full model output containing zero or more tool-call
            blocks.

    Returns:
        A list of decoded tool-call objects, in order of appearance. If ANY
        block fails to parse as JSON, the whole rollout is treated as
        malformed and an empty list is returned (callers award zero reward
        in that case). Empty/whitespace-only blocks are skipped.
    """
    pattern = r'<tool_call>(.*?)</tool_call>'
    # DOTALL lets a payload span multiple lines.
    matches = re.findall(pattern, solution_str, flags=re.DOTALL)
    tool_calls = []
    for match in matches:
        payload = match.strip()
        if not payload:
            continue
        try:
            tool_calls.append(json.loads(payload))
        # Narrowed from a bare ``except:`` — a bare except also swallows
        # KeyboardInterrupt/SystemExit. Only a bad JSON payload should
        # invalidate the set.
        except json.JSONDecodeError:
            return []
    return tool_calls
65+
4266
def compute_format_reward(solution_str):
4367
if "<answer>" not in solution_str and "</answer>" not in solution_str:
4468
return 0
@@ -48,7 +72,32 @@ def compute_format_reward(solution_str):
4872
if solution_str.count("<tool_call>") == 0:
4973
return 0.
5074

75+
tool_calls = parse_all_tool_calls(solution_str)
76+
if len(tool_calls) == 0:
77+
return 0.
78+
79+
processed_tool_calls = []
80+
for tool_call in tool_calls:
81+
if "name" not in tool_call or "arguments" not in tool_call:
82+
return 0.
83+
if type(tool_call["arguments"]) != dict:
84+
return 0.
85+
tool = str(tool_call).lower().replace(" ", "")
86+
if tool in processed_tool_calls:
87+
return 0.
88+
processed_tool_calls.append(tool)
89+
90+
91+
5192
assistant_thoughts = parse_assistant_thoughts(solution_str)
93+
for i, str1 in enumerate(assistant_thoughts):
94+
for j, str2 in enumerate(assistant_thoughts):
95+
if i != j:
96+
ratio = levenshtein.ratio(str2, str1)
97+
if ratio > 0.8:
98+
return 0.0
99+
100+
52101
length = [len(x.split(" ")) for x in assistant_thoughts]
53102
if len(length):
54103
avg_length = float(sum(length))/len(length)
@@ -57,3 +106,15 @@ def compute_format_reward(solution_str):
57106
else:
58107
return 0.
59108

109+
def compute_bulk_tool_call_reward(solution_str):
    """Score how densely tool calls are packed into assistant turns.

    Computes the average number of ``<tool_call>`` blocks per assistant
    turn that opens with a tool call. Returns that average when it exceeds
    one, and 0.0 otherwise — including transcripts that contain no
    tool-calling assistant turns at all.

    Args:
        solution_str: Full rendered conversation string.

    Returns:
        float: calls-per-turn ratio if > 1, else 0.0.
    """
    turns = count_tool_call_turns(solution_str)
    if not turns:
        return 0.0
    calls_per_turn = float(solution_str.count("<tool_call>")) / turns
    return calls_per_turn if calls_per_turn > 1 else 0.0

verl/utils/reward_score/jan_v2_reward/qa_reward.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import re
33
import string
44
from .llm_judge_utils import evaluate_answer
5-
from .format_reward import compute_format_reward
5+
from .format_reward import compute_format_reward, compute_bulk_tool_call_reward
6+
67

78
def normalize_answer(s):
89
def remove_articles(text):
@@ -114,10 +115,12 @@ def compute_score(solution_str, ground_truth, question, format_score=0.0, score=
114115
if evaluate_result['grade_description'] == "CORRECT":
115116

116117
format_score = compute_format_reward(solution_str)
118+
bulk_tool_score = compute_bulk_tool_call_reward(solution_str)
117119
print(f"Solution string with format score {format_score}: {solution_str}")
118120
if format_score == 0.0:
119121
return 0.0
120-
return 1.0 + format_score*0.2
122+
return 1.0 + format_score*0.2 + bulk_tool_score* 0.2
123+
121124
else:
122125

123126
print(f"Solution string: {solution_str}")

0 commit comments

Comments
 (0)