def main(argv=None):
    """Data-Juicer MCP Server CLI entry point.

    Parses the command line, exports the selected transport through the
    ``SERVER_TRANSPORT`` environment variable, imports the server factory
    that matches the requested mode, and runs the resulting server.

    Every human-readable status or error message is written to stderr.
    With the ``stdio`` transport, stdout is the channel that carries MCP
    protocol messages, so printing anything else there would corrupt the
    stream seen by the client.

    :param argv: argument list to parse. ``None`` (the default) lets
        argparse read ``sys.argv[1:]``, which is what the ``dj-mcp``
        console script relies on.
    :raises SystemExit: with status 1 when the server module cannot be
        imported or the server fails to start. argparse itself also exits
        on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Data-Juicer MCP Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Available modes:
  granular-ops    Launch MCP server with individual operator tools
  recipe-flow     Launch MCP server with recipe-based workflow tools

Examples:
  dj-mcp granular-ops --transport stdio
  dj-mcp recipe-flow --transport sse --port 8000
        """,
    )

    parser.add_argument("mode", choices=["granular-ops", "recipe-flow"], help="MCP server mode to launch")

    parser.add_argument(
        "--transport",
        choices=["stdio", "sse", "streamable-http"],
        default="stdio",
        help="Transport protocol for MCP server (default: stdio)",
    )

    parser.add_argument("--port", type=int, default=8080, help="Port number for HTTP-based transports (default: 8080)")

    args = parser.parse_args(argv)

    # Set environment variable for transport
    os.environ["SERVER_TRANSPORT"] = args.transport

    try:
        # The server modules are imported lazily so that only the selected
        # mode's dependencies have to be importable.
        if args.mode == "granular-ops":
            from data_juicer.tools.DJ_mcp_granular_ops import create_mcp_server

        elif args.mode == "recipe-flow":
            from data_juicer.tools.DJ_mcp_recipe_flow import create_mcp_server

        # Status output goes to stderr: stdout must stay clean for the
        # stdio transport.
        print(f"Starting Data-Juicer MCP Server ({args.mode} mode)", file=sys.stderr)
        print(f"Transport: {args.transport}, Port: {args.port}", file=sys.stderr)

        mcp = create_mcp_server(port=str(args.port))
        mcp.run(transport=args.transport)

    except ImportError as e:
        print(f"Error: Missing dependencies for MCP server. {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"Error starting MCP server: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
-Add the following to your MCP client's configuration file (e.g., `claude_desktop_config.json` or a similar configuration file): +To control the operator tools returned by the MCP server, specify the environment variable `DJ_OPS_LIST_PATH`: +1. Create a `.txt` file. +2. Add operator names to the file, e.g., [ops_list_example.txt](../configs/mcp/ops_list_example.txt). +3. Set the path to the operator list as the environment variable `DJ_OPS_LIST_PATH`. -```json -"mcpServers": { - "DJ_recipe_flow": { - "transport": "stdio", - "command": "/path/to/python", - "args": [ - "/path/to/data_juicer/tools/DJ_mcp_recipe_flow.py" - ], - "env": { - "SERVER_TRANSPORT": "stdio" - } - } -} -``` +### Configuration -##### SSE Transport +The following configuration examples demonstrate how to set up the two MCP server types using the stdio and SSE methods. These examples are for illustrative purposes only and should be adapted to the specific MCP client's configuration format. -To use the SSE transport, you first need to start the MCP server separately. +#### stdio -1. Run the Server: Execute the server script, specifying the port number: +Suitable for quick local testing and simple scenarios. Add the following to the MCP client's configuration file (e.g., `claude_desktop_config.json` or similar): - ```bash - python /path/to/data_juicer/tools/DJ_mcp_recipe_flow.py --port=8080 - ``` +##### Using uvx -2. Configure your MCP Client: Add the following to your MCP client's configuration file: +Run the latest version of Data-Juicer MCP directly from the repository without manual local installation. - ```json +- Recipe-Flow mode: + ```json + { "mcpServers": { "DJ_recipe_flow": { - "url": "http://127.0.0.1:8080/sse" + "command": "uvx", + "args": [ + "--from", + "git+https://github.com/modelscope/data-juicer", + "dj-mcp", + "recipe-flow" + ] } } - ``` - -Note: - -* URL: The `url` should point to the SSE endpoint of your running server (typically `http://127.0.0.1:/sse`). 
Adjust the port number if you used a different value when starting the server. -* Separate Server Process: The SSE server must be running before your MCP client attempts to connect. -* Firewall: Ensure that your firewall allows connections to the specified port. - -### Granular-Operators - -By default, this MCP server will return all Data-Juicer operator tools, each running independently. - -You can control the operator tools returned by the MCP server by specifying the environment variable `DJ_OPS_LIST_PATH`: - -1. Create a `.txt` file. -2. Add operator names to the file, such as: [ops_list_example.txt](../configs/mcp/ops_list_example.txt). -3. Set the path to the operators list as the environment variable `DJ_OPS_LIST_PATH`. - -#### Configuration - -The following configuration examples demonstrate how to set up the Granular-Operators server using both stdio and SSE transport methods. These examples are for illustrative purposes and should be adapted to your specific MCP client's configuration format. - -##### stdio Transport - -Add the following to your MCP client's configuration file: + } + ``` -```json -"mcpServers": { - "DJ_granular_ops_stdio": { - "transport": "stdio", - "command": "/path/to/python", - "args": [ - "/path/to/data_juicer/tools/DJ_mcp_granular_ops.py" - ], - "env": { - "DJ_OPS_LIST_PATH": "/path/to/ops_list.txt", - "SERVER_TRANSPORT": "stdio" +- Granular-Operators mode: + ```json + { + "mcpServers": { + "DJ_granular_ops": { + "command": "uvx", + "args": [ + "--from", + "git+https://github.com/modelscope/data-juicer", + "dj-mcp", + "granular-ops", + "--transport", + "stdio" + ], + "env": { + "DJ_OPS_LIST_PATH": "/path/to/ops_list.txt" + } + } } } -} -``` - -##### SSE Transport - -To use the SSE transport, you first need to start the MCP server separately. - -1. Set Environment Variables: Ensure any required environment variables for the server are set, including `DJ_OPS_LIST_PATH` if you're using it. -2. 
Run the Server: Execute the server script, specifying the port number: - - ```bash - python /path/to/data_juicer/tools/DJ_mcp_granular_ops.py --port=8081 - ``` - -3. Configure your MCP Client: Add the following to your MCP client's configuration file: - - ```json + ``` + Note: If `DJ_OPS_LIST_PATH` is not set, all operators are returned by default. + +##### Local Installation + +1. Clone the Data-Juicer repository locally: + ```bash + git clone https://github.com/modelscope/data-juicer.git + ``` +2. Run Data-Juicer MCP using uv: +- Recipe-Flow mode: + ```json + { "mcpServers": { - "DJ_granular_ops_sse": { - "url": "http://127.0.0.1:8081/sse" + "DJ_recipe_flow": { + "transport": "stdio", + "command": "uv", + "args": [ + "run", + "--directory", + "/abs/path/to/data-juicer", + "dj-mcp", + "recipe-flow" + ] } } - ``` - -Note: - -* URL: The `url` should point to the SSE endpoint of your running server (typically `http://127.0.0.1:/sse`). Adjust the port number if you used a different value when starting the server. -* Separate Server Process: The SSE server must be running before your MCP client attempts to connect. -* Firewall: Ensure that your firewall allows connections to the specified port. - -### Finding Your Python Path - -To find the path to the Python executable, use the following commands: - -Windows (Command Prompt/Terminal): - -```sh -where python -``` - -Linux/macOS (Terminal): - -```sh -which python -``` + } + ``` +- Granular-Operators mode: + ```json + { + "mcpServers": { + "DJ_granular_ops": { + "transport": "stdio", + "command": "uv", + "args": [ + "run", + "--directory", + "/abs/path/to/data-juicer", + "dj-mcp", + "granular-ops" + ], + "env": { + "DJ_OPS_LIST_PATH": "/path/to/ops_list.txt" + } + } + } + } + ``` + +#### SSE + +To use SSE deployment, first start the MCP server separately. + +1. 
Run the MCP server: Execute the MCP server script, specifying the mode (`recipe-flow` or `granular-ops`) and the port number: + - Using uvx: + ```bash + uvx --from git+https://github.com/modelscope/data-juicer dj-mcp recipe-flow --transport sse --port 8080 + ``` + - Local execution: + ```bash + uv run dj-mcp recipe-flow --transport sse --port 8080 + ``` + +2. Configure your MCP client: Add the following to the MCP client's configuration file: + ```json + { + "mcpServers": { + "DJ_MCP": { + "url": "http://127.0.0.1:8080/sse" + } + } + } + ``` + +Notes: +- URL: The `url` should point to the SSE endpoint of the running server (typically `http://127.0.0.1:<port>/sse`). Adjust the port number if a different value was used when starting the server. +- Separate server process: The SSE server must be running before the MCP client attempts to connect. +- Firewall: Ensure the firewall allows connections to the specified port. \ No newline at end of file diff --git a/docs/DJ_service_ZH.md b/docs/DJ_service_ZH.md index a2236f5929..8397bbf83a 100644 --- a/docs/DJ_service_ZH.md +++ b/docs/DJ_service_ZH.md @@ -85,152 +85,170 @@ curl -X POST \ ### 概览 -Data-Juicer MCP 服务器提供数据处理算子,以协助完成数据清洗、过滤、去重等任务。 -为了适应不同的使用场景,我们提供两种服务器供选用: +Data-Juicer MCP 服务器提供数据处理算子,以协助完成数据清洗、过滤、去重等任务。为了适应不同的使用场景,我们提供两种服务器供选用: -- Recipe-Flow(数据菜谱): 允许根据算子的类型和标签进行筛选,并支持将多个算子组合成一个数据菜谱来运行。 -- Granular-Operators(细粒度算子): 将每个算子作为一个独立的工具提供,可以灵活地通过环境变量指定需要使用的算子列表,从而构建定制化的数据处理管道。 +- **Recipe-Flow(数据菜谱)**:允许根据算子的类型和标签进行筛选,并支持将多个算子组合成一个数据菜谱来运行。 +- **Granular-Operators(细粒度算子)**:将每个算子作为一个独立的工具提供,可以灵活地通过环境变量指定需要使用的算子列表,从而构建定制化的数据处理管道。 请注意,Data-Juicer MCP 服务器目前处于早期开发阶段。其功能和可用工具可能会随着我们继续开发和改进服务器而发生变化和扩展。 -支持两种部署方式:stdio 和 SSE 。 stdio 方法不支持多进程。如果需要多进程或多线程功能,则必须使用 SSE 部署方法。 以下提供了每种方法的配置详细信息。 + +支持两种部署方式:stdio 和 SSE。stdio 方法不支持多进程。如果需要多进程或多线程功能,则必须使用 SSE 部署方法。以下提供了每种方法的配置详细信息。 ### Recipe-Flow -1. 
`get_data_processing_ops` - - 根据指定的类型和标签检索可用的数据处理算子列表(若不指定,则返回全部算子) - - 输入: - - `op_type` (str, optional): 要检索的数据处理算子类型 - - `tags` (List[str], optional): 用于过滤算子的标签列表 - - `match_all` (bool): 是否所有指定的标签都必须匹配。默认为 True - - 返回:包含可用算子详细信息的字典 - -2. `run_data_recipe` - - 执行数据菜谱 - - 输入: - - `dataset_path` (str): 要处理的数据集路径 - - `process` (List[Dict]): 要执行的处理步骤列表,字典包含算子名称和参数字典 - - `export_path` (str, optional): 导出数据集的路径,默认为 None,这意味着数据集将导出到 './outputs' - - 返回:执行结果的字符串 +#### 1. get_data_processing_ops +- 根据指定的类型和标签检索可用的数据处理算子列表(若不指定,则返回全部算子) +- 输入: + - `op_type` (str, optional): 要检索的数据处理算子类型 + - `tags` (List[str], optional): 用于过滤算子的标签列表 + - `match_all` (bool): 是否所有指定的标签都必须匹配。默认为 True +- 返回:包含可用算子详细信息的字典 + +#### 2. run_data_recipe +- 执行数据菜谱 +- 输入: + - `dataset_path` (str): 要处理的数据集路径 + - `process` (List[Dict]): 要执行的处理步骤列表,字典包含算子名称和参数字典 + - `export_path` (str, optional): 导出数据集的路径,默认为 None,这意味着数据集将导出到 './outputs' +- 返回:执行结果的字符串 针对特定数据处理请求,MCP client 应先调用`get_data_processing_ops`获取相关的算子信息,从中选择匹配需求的算子,然后调用`run_data_recipe`运行选择的算子组合。 -#### 配置 +### Granular-Operators -以下配置示例演示了如何使用 stdio 和 SSE 方法设置 Recipe-Flow 服务器。 这些示例仅用于说明目的,应根据特定 MCP 客户端的配置格式进行调整。 +默认情况下,该 MCP 服务器将返回所有Data-Juicer算子工具,每个工具都独立运行。 -##### stdio +可通过指定环境变量 `DJ_OPS_LIST_PATH` 控制 MCP 服务器返回的算子工具: +1. 创建一个 `.txt` 文件 +2. 将算子名称添加到文件中,例如:[ops_list_example.txt](../configs/mcp/ops_list_example.txt) +3. 将算子列表的路径设置为环境变量 `DJ_OPS_LIST_PATH` -将以下内容添加到 MCP 客户端的配置文件中(例如,claude_desktop_config.json 或类似的配置文件): +### 配置 -```json -"mcpServers": { - "DJ_recipe_flow": { - "transport": "stdio", - "command": "/path/to/python", - "args": [ - "/path/to/data_juicer/tools/DJ_mcp_recipe_flow.py" - ], - "env": { - "SERVER_TRANSPORT": "stdio" - } - } -} -``` - -##### SSE +以下配置示例演示了如何使用 stdio 和 SSE 方法设置两种不同的 MCP 服务器。这些示例仅用于说明目的,应根据特定 MCP 客户端的配置格式进行调整。 -要使用 SSE 部署,首先需要单独启动 MCP 服务器。 +#### stdio -1. 
运行 MCP 服务器:执行 MCP 服务器脚本,指定端口号: +适用于快速本地测试和简单场景。将以下内容添加到 MCP 客户端的配置文件中(例如,claude_desktop_config.json 或类似的配置文件): - ```bash - python /path/to/data_juicer/tools/DJ_mcp_recipe_flow.py --port=8080 - ``` +##### 使用 uvx -2. 配置您的 MCP 客户端:将以下内容添加到 MCP 客户端的配置文件中: +直接从存储库运行最新版本的 Data-Juicer MCP,无需手动进行本地安装。 - ```json +- **Recipe-Flow模式**: + ```json + { "mcpServers": { "DJ_recipe_flow": { - "url": "http://127.0.0.1:8080/sse" + "command": "uvx", + "args": [ + "--from", + "git+https://github.com/modelscope/data-juicer", + "dj-mcp", + "recipe-flow" + ] } } - ``` - -注意: - -* URL:`url` 应指向正在运行的服务器的 SSE 端点(通常为 `http://127.0.0.1:/sse`)。 如果在启动服务器时使用了不同的值,请调整端口号。 -* 单独的服务器进程:SSE 服务器必须在 MCP 客户端尝试连接之前运行。 -* 防火墙:确保防火墙允许连接到指定的端口。 - -### Granular-Operators - -默认情况下,该 MCP 服务器将返回所有Data-Juicer算子工具,每个工具都独立运行。 - -可通过指定环境变量 `DJ_OPS_LIST_PATH` 控制 MCP 服务器返回的算子工具: -1. 创建一个 `.txt` 文件。 -2. 将算子名称添加到文件中,例如:[ops_list_example.txt](../configs/mcp/ops_list_example.txt)。 -3. 将算子列表的路径设置为环境变量 `DJ_OPS_LIST_PATH`。 - -#### 配置 - -以下配置示例演示了如何使用 stdio 和 SSE 方法设置 Granular-Operators 服务器。 这些示例仅用于说明目的,应根据特定 MCP 客户端的配置格式进行调整。 - -##### stdio - -将以下内容添加到 MCP 客户端的配置文件中: + } + ``` -```json -"mcpServers": { - "DJ_granular_ops": { - "transport": "stdio", - "command": "/path/to/python", - "args": [ - "/path/to/data_juicer/tools/DJ_mcp_granular_ops.py" - ], - "env": { - "DJ_OPS_LIST_PATH": "/path/to/ops_list.txt", - "SERVER_TRANSPORT": "stdio" +- **Granular-Operators模式**: + ```json + { + "mcpServers": { + "DJ_granular_ops": { + "command": "uvx", + "args": [ + "--from", + "git+https://github.com/modelscope/data-juicer", + "dj-mcp", + "granular-ops", + "--transport", + "stdio" + ], + "env": { + "DJ_OPS_LIST_PATH": "/path/to/ops_list.txt" + } + } } } -} -``` - -##### SSE - -要使用 SSE 部署,首先需要单独启动 MCP 服务器。 - -1. 设置环境变量:确保已设置服务器所需的任何环境变量,例如 `DJ_OPS_LIST_PATH`。 -2. 运行 MCP 服务器:执行 MCP 服务器脚本,指定端口号: - - ```bash - python /path/to/data_juicer/tools/DJ_mcp_granular_ops.py --port=8081 - ``` - -3. 
配置 MCP 客户端:将以下内容添加到 MCP 客户端的配置文件中: - - ```json + ``` + 注意:若不设置`DJ_OPS_LIST_PATH`,则默认返回所有算子。 + +##### 本地安装 + +1. 将 Data-Juicer 仓库克隆到本地: + ```bash + git clone https://github.com/modelscope/data-juicer.git + ``` +2. 使用 uv 运行 Data-Juicer MCP: +- Recipe-Flow 模式: + ```json + { + "mcpServers": { + "DJ_recipe_flow": { + "transport": "stdio", + "command": "uv", + "args": [ + "run", + "--directory", + "/abs/path/to/data-juicer", + "dj-mcp", + "recipe-flow" + ] + } + } + } + ``` +- Granular-Operators 模式: + ```json + { "mcpServers": { "DJ_granular_ops": { - "url": "http://127.0.0.1:8081/sse" + "transport": "stdio", + "command": "uv", + "args": [ + "run", + "--directory", + "/abs/path/to/data-juicer", + "dj-mcp", + "granular-ops" + ], + "env": { + "DJ_OPS_LIST_PATH": "/path/to/ops_list.txt" + } } } - ``` + } + ``` -注意: -* URL:`url` 应指向正在运行的服务器的 SSE 端点(通常为 `http://127.0.0.1:/sse`)。 如果在启动服务器时使用了不同的值,请调整端口号。 -* 单独的服务器进程:SSE 服务器必须在 MCP 客户端尝试连接之前运行。 -* 防火墙:确保防火墙允许连接到指定的端口。 +#### SSE -### 查找你的 Python 路径 -要查找 Python 可执行文件路径,请使用以下命令: +要使用 SSE 部署,首先需要单独启动 MCP 服务器。 -Windows (Command Prompt/Terminal): -``` -where python -``` -Linux/macOS (Terminal): -``` -which python -``` +1. 运行 MCP 服务器:执行 MCP 服务器脚本,指定模式(`recipe-flow` 或 `granular-ops`)和端口号: + - uvx 启动: + ```bash + uvx --from git+https://github.com/modelscope/data-juicer dj-mcp recipe-flow --transport sse --port 8080 + ``` + - 本地启动: + ```bash + uv run dj-mcp recipe-flow --transport sse --port 8080 + ``` + +2. 
配置您的 MCP 客户端:将以下内容添加到 MCP 客户端的配置文件中: + ```json + { + "mcpServers": { + "DJ_MCP": { + "url": "http://127.0.0.1:8080/sse" + } + } + } + ``` + +**注意事项**: +- URL:`url` 应指向正在运行的服务器的 SSE 端点(通常为 `http://127.0.0.1:<port>/sse`)。如果在启动服务器时使用了不同的值,请调整端口号。 +- 单独的服务器进程:SSE 服务器必须在 MCP 客户端尝试连接之前运行。 +- 防火墙:确保防火墙允许连接到指定的端口。 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e2b5439ad1..ffa699ecd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -174,6 +174,7 @@ all = [ dj-process = "data_juicer.tools.process_data:main" dj-analyze = "data_juicer.tools.analyze_data:main" dj-install = "data_juicer.tools.dj_install:main" +dj-mcp = "data_juicer.tools.mcp_server:main" [build-system] requires = ["hatchling", "uv>=0.1.0"]