Skip to content

Commit d8c4749

Browse files
authored
migrate search, scrape tool into verl and add code to spin up retrieval server (#12)
1 parent 94e51fb commit d8c4749

28 files changed

Lines changed: 2947 additions & 0 deletions
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
hydra:
2+
searchpath:
3+
- file://verl/trainer/config
4+
5+
defaults:
6+
- ppo_trainer
7+
- _self_
8+
9+
data:
10+
max_prompt_length: 1024
11+
max_response_length: 1024
12+
train_batch_size: 256
13+
return_raw_chat: True
14+
shuffle: False
15+
16+
actor_rollout_ref:
17+
hybrid_engine: True
18+
rollout:
19+
name: sglang
20+
multi_turn:
21+
enable: True
22+
max_assistant_turns: 2
23+
format: qwen
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
tools:
2+
- class_name: "verl.tools.geo3k_tool.Geo3kTool"
3+
config:
4+
type: native
5+
tool_schema:
6+
type: "function"
7+
function:
8+
name: "calc_geo3k_reward"
9+
description: "A tool for calculating the reward of geo3k. (1.0 if parsed answer is correct, 0.0 if parsed answer is incorrect or not correctly parsed)"
10+
parameters:
11+
type: "object"
12+
properties:
13+
answer:
14+
type: "string"
15+
description: "The model's answer to the geo3k problem, must be digits"
16+
required: ["answer"]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
tools:
2+
- class_name: "verl.tools.gsm8k_tool.Gsm8kTool"
3+
config:
4+
type: native
5+
tool_schema:
6+
type: "function"
7+
function:
8+
name: "calc_gsm8k_reward"
9+
description: "A tool for calculating the reward of gsm8k. (1.0 if parsed answer is correct, 0.0 if parsed answer is incorrect or not correctly parsed)"
10+
parameters:
11+
type: "object"
12+
properties:
13+
answer:
14+
type: "string"
15+
description: "The model's answer to the GSM8K math problem, must be digits"
16+
required: ["answer"]
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"mcpServers": {
3+
"Tavily Expert": {
4+
"url": "your_tavily_expert_url",
5+
"auth_token": "your_tavily_api_token"
6+
}
7+
}
8+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
tools:
2+
- class_name: verl.tools.mcp_search_tool.MCPSearchTool
3+
config:
4+
rate_limit: 120
5+
timeout: 120
6+
type: mcp
7+
mcp:
8+
mcp_servers_config_path: ./mcp_server.json
9+
# optional
10+
tool_selected_list:
11+
- tavily_search_tool
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
tools:
2+
- class_name: "verl.tools.sandbox_fusion_tools.SandboxFusionTool"
3+
config:
4+
sandbox_fusion_url: "https://xxx.apigateway-cn-beijing.volceapi.com/run_code"
5+
num_workers: 10
6+
enable_global_rate_limit: true
7+
rate_limit: 10
8+
default_timeout: 30
9+
default_language: "python"
10+
memory_limit_mb: 1024
11+
type: native
12+
13+
tool_schema:
14+
type: "function"
15+
function:
16+
name: "code_interpreter"
17+
description: "A tool for executing code."
18+
parameters:
19+
type: "object"
20+
properties:
21+
code:
22+
type: "string"
23+
description: "The code to execute."
24+
required: ["code"]
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
tools:
2+
- class_name: verl.tools.janv2_tool.web_search_tool.WebSearchTool
3+
config:
4+
type: native
5+
rag_server_url: "http://localhost:3030"
6+
num_results: 10
7+
topk_retrieval: 30
8+
num_workers: 64
9+
rate_limit: 100
10+
timeout: 600
11+
enable_global_rate_limit: true
12+
tool_schema:
13+
type: function
14+
function:
15+
name: web_search
16+
description: |
17+
Search for information using a query string. Returns a list of results
18+
with titles, URLs (document IDs), and preview snippets. Use the scrape
19+
tool to retrieve full content of specific documents.
20+
parameters:
21+
type: object
22+
properties:
23+
query:
24+
type: string
25+
description: "The search query to find relevant documents"
26+
required:
27+
- query
28+
29+
- class_name: verl.tools.janv2_tool.scrape_tool.ScrapeTool
30+
config:
31+
type: native
32+
rag_server_url: "http://localhost:3030"
33+
num_workers: 50
34+
rate_limit: 50
35+
timeout: 600
36+
enable_global_rate_limit: true
37+
tool_schema:
38+
type: function
39+
function:
40+
name: scrape
41+
description: |
42+
Visit a URL to retrieve the full content of a document. Use this after
43+
web_search to get complete document content. The URL should be in the
44+
format 'doc_<id>' as returned by web_search.
45+
parameters:
46+
type: object
47+
properties:
48+
url:
49+
type: string
50+
description: "The URL/document ID to visit (e.g., 'doc_123')"
51+
required:
52+
- url
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright 2024 Bytedance Ltd. and/or its affiliates
2+
# Copyright 2023-2024 SGLang Team
3+
# Copyright 2025 Search-R1 Contributors
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
# Adapted from https://github.com/PeterGriffinJin/Search-R1/blob/main/scripts/download.py
17+
18+
19+
import argparse
20+
21+
from huggingface_hub import hf_hub_download
22+
23+
# Command-line interface: the destination directory is mandatory; the index
# repository can be overridden with --repo_id.
parser = argparse.ArgumentParser(description="Download files from a Hugging Face dataset repository.")
parser.add_argument("--repo_id", type=str, default="PeterJinGo/wiki-18-e5-index", help="Hugging Face repository ID")
parser.add_argument("--save_path", type=str, required=True, help="Local directory to save files")

args = parser.parse_args()

# Honor --repo_id for the index download. The flag used to be parsed and then
# shadowed by a hard-coded string, so passing it had no effect. Its default is
# the same repository as before, so existing invocations behave identically.
repo_id = args.repo_id
for file in ["part_aa", "part_ab"]:
    hf_hub_download(
        repo_id=repo_id,
        # Split parts of the index; presumably concatenated into a single
        # index file (e.g. "e5_Flat.index") afterwards -- TODO confirm.
        filename=file,
        repo_type="dataset",
        local_dir=args.save_path,
    )

# The corpus is fetched from its own fixed dataset repository and is not
# affected by --repo_id.
repo_id = "PeterJinGo/wiki-18-corpus"
hf_hub_download(
    repo_id=repo_id,
    filename="wiki-18.jsonl.gz",
    repo_type="dataset",
    local_dir=args.save_path,
)

0 commit comments

Comments
 (0)