# --- file: examples/firecrawl_example.py ---
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Example usage of the Firecrawl tool with MetaGPT."""

import asyncio
import os
import sys
import time
from pathlib import Path

# Add the project root to the Python path so the example runs from a checkout.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))

from metagpt.tools.libs.firecrawl import Firecrawl


async def main():
    """Run example usage of the Firecrawl tool.

    Demonstrates search, map+crawl with status polling, scrape, and extract
    with status polling against the Firecrawl API.
    """
    # Fall back to a placeholder key so the example is self-documenting;
    # replace with a real key or export FIRECRAWL_API_KEY before running.
    if "FIRECRAWL_API_KEY" not in os.environ:
        os.environ["FIRECRAWL_API_KEY"] = "YOUR-FIRECRAWL-API-KEY"

    firecrawl = Firecrawl()

    # Example 1: Search for information
    print("\nExample 1: Search for Therapist in Portugal by name")
    search_results = firecrawl.search("Psicologa Clínica Mairí Stumpf")
    print("Search Results:", search_results)

    # Example 2: Map and crawl a website
    print("\nExample 2: Map and crawl a website")
    map_results = firecrawl.map_url("https://docs.firecrawl.dev")
    print("Map Results:", map_results)

    if map_results.get("links"):
        crawl_job = firecrawl.crawl_url(map_results["links"][0])
        print("Crawl Job:", crawl_job)

        if crawl_job.get("id"):
            status = firecrawl.get_crawl_status(crawl_job["id"])
            print("Crawl Status:", status)
            # BUG FIX: get_crawl_status returns a dict, so comparing the whole
            # payload to the string "completed" is always True and the loop
            # never terminates. Compare the "status" field instead.
            while status.get("status") != "completed":
                time.sleep(5)
                status = firecrawl.get_crawl_status(crawl_job["id"])
                print("Crawl Status:", status)

    # Example 3: Scrape a specific URL
    print("\nExample 3: Scrape a URL")
    scrape_results = firecrawl.scrape_url("https://example.com")
    print("Scrape Results:", scrape_results)

    # Example 4: Extract information from URLs
    print("\nExample 4: Extract information")
    extract_job = firecrawl.extract(
        ["https://www.imdb.com/chart/starmeter/"],
        params={"prompt": "Extract the top five most popular celebs names and their popularity score if available"},
    )
    print("Extract Job:", extract_job)

    if extract_job.get("id"):
        extract_status = firecrawl.get_extract_status(extract_job["id"])
        print("\nExtract Status:", extract_status)

        # Poll until the extract job reports completion.
        while extract_status.get("status") != "completed":
            time.sleep(10)
            extract_status = firecrawl.get_extract_status(extract_job["id"])
            print("\nUpdated Status:", extract_status)


if __name__ == "__main__":
    asyncio.run(main())


# --- file: metagpt/tools/libs/firecrawl.py (module header) ---
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Firecrawl Tool for MetaGPT.

This module provides a tool for interacting with the Firecrawl API, enabling web
scraping, crawling, searching, and information extraction capabilities.

Author: Ademílson Tonato
"""

import os
from typing import Any, Dict, List, Optional

import requests

from metagpt.tools.tool_registry import register_tool


@register_tool(
    tags=["web", "scraping", "search"],
    include_functions=["map_url", "scrape_url", "search", "crawl_url", "extract"],
)
class Firecrawl:
    """A tool for web scraping, crawling, searching and extraction via the Firecrawl API.

    Provides methods to interact with the Firecrawl API for various web data
    collection and processing tasks: URL mapping, scraping, searching, crawling,
    and structured information extraction.

    Attributes:
        api_key (str): The API key for authenticating with the Firecrawl API.
        api_url (str): The base URL for the Firecrawl API.
    """
+ """ + + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None): + """Initialize the Firecrawl tool. + + Args: + api_key (Optional[str]): API key for Firecrawl. Defaults to environment variable. + api_url (Optional[str]): Base URL for Firecrawl API. Defaults to production URL. + """ + self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') + if not self.api_key: + raise ValueError('No API key provided') + self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') + self.request_timeout = 60 + + def _prepare_headers(self) -> Dict[str, str]: + """Prepare headers for API requests. + + Returns: + Dict[str, str]: Headers including content type and authorization. + """ + return { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.api_key}', + 'X-Origin': 'metagpt', + 'X-Origin-Type': 'integration', + } + + def _handle_error(self, response: requests.Response, action: str) -> None: + """Handle API errors. + + Args: + response (requests.Response): The response from the API. + action (str): Description of the action being performed. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + try: + error_message = response.json().get('error', 'No error message provided.') + error_details = response.json().get('details', 'No additional error details provided.') + except: + raise requests.exceptions.HTTPError( + f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', + response=response + ) + + message = f"Error during {action}: Status code {response.status_code}. {error_message} - {error_details}" + raise requests.exceptions.HTTPError(message, response=response) + + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Map a URL to discover all available links. + + Args: + url (str): The URL to map. + params (Optional[Dict[str, Any]]): Additional parameters for the mapping operation. 
+ + Returns: + Dict[str, Any]: A dictionary containing the mapped URLs and related information. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/map', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'map URL') + + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Scrape content from a specific URL. + + Args: + url (str): The URL to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraping operation. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/scrape', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'scrape URL') + + def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Perform a web search using Firecrawl. + + Args: + query (str): The search query. + params (Optional[Dict[str, Any]]): Additional parameters for the search operation. + + Returns: + Dict[str, Any]: A dictionary containing search results and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. 
+ """ + headers = self._prepare_headers() + json_data = {'query': query} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/search', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'search') + + def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Start a crawl job for a given URL. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl operation. + + Returns: + Dict[str, Any]: A dictionary containing the crawl results and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/crawl', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'start crawl job') + + def get_crawl_status(self, job_id: str) -> Dict[str, Any]: + """Get the status of a crawl job. + + Args: + job_id (str): The ID of the crawl job. + + Returns: + Dict[str, Any]: A dictionary containing the crawl job status and results. + + Raises: + requests.exceptions.HTTPError: If the API request fails. 
+ """ + headers = self._prepare_headers() + response = requests.get( + f'{self.api_url}/v1/crawl/{job_id}', + headers=headers, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'check crawl status') + + def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Extract structured information from URLs. + + Args: + urls (List[str]): List of URLs to extract information from. + params (Optional[Dict[str, Any]]): Additional parameters for the extraction operation. + + Returns: + Dict[str, Any]: A dictionary containing the extracted information and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'urls': urls} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/extract', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'extract') + + def get_extract_status(self, job_id: str) -> Dict[str, Any]: + """Get the status of an extract job. + + Args: + job_id (str): The ID of the extract job. + + Returns: + Dict[str, Any]: A dictionary containing the extract job status and results. + + Raises: + requests.exceptions.HTTPError: If the API request fails. 
+ """ + headers = self._prepare_headers() + response = requests.get( + f'{self.api_url}/v1/extract/{job_id}', + headers=headers, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'check extract status') \ No newline at end of file diff --git a/tests/metagpt/tools/libs/test_firecrawl.py b/tests/metagpt/tools/libs/test_firecrawl.py new file mode 100644 index 0000000000..ff97dc89f9 --- /dev/null +++ b/tests/metagpt/tools/libs/test_firecrawl.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Test module for the Firecrawl tool.""" + +import os +import pytest +from unittest.mock import MagicMock, patch +import requests + +from metagpt.tools.libs.firecrawl import Firecrawl + +API_KEY = "YOUR-FIRECRAWL-API-KEY" +API_URL = "https://api.firecrawl.dev" + +EXPECTED_HEADERS = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {API_KEY}', + 'X-Origin': 'metagpt', + 'X-Origin-Type': 'integration', +} + +@pytest.fixture +def mock_response(): + """Create a mock response object.""" + response = MagicMock() + response.status_code = 200 + response.json.return_value = {"success": True} + return response + +@pytest.fixture +def firecrawl(): + """Create a Firecrawl instance for testing.""" + return Firecrawl(api_key=API_KEY, api_url=API_URL) + +def test_initialization(): + """Test initialization with direct parameters.""" + tool = Firecrawl(api_key=API_KEY, api_url=API_URL) + assert tool.api_key == API_KEY + assert tool.api_url == API_URL + +def test_initialization_with_env_vars(): + """Test initialization with environment variables.""" + os.environ["FIRECRAWL_API_KEY"] = API_KEY + os.environ["FIRECRAWL_API_URL"] = API_URL + + tool = Firecrawl() + assert tool.api_key == API_KEY + assert tool.api_url == API_URL + + # Clean up environment variables + del os.environ["FIRECRAWL_API_KEY"] + 
def test_initialization_without_api_key():
    """Test initialization without API key raises error."""
    # FIX: clear the environment first — if a developer has a real
    # FIRECRAWL_API_KEY exported, the original test failed nondeterministically.
    with patch.dict(os.environ, {}, clear=True):
        with pytest.raises(ValueError, match="No API key provided"):
            Firecrawl()


def test_map_url(firecrawl, mock_response):
    """Test the map_url method."""
    mock_response.json.return_value = {"success": True, "links": ["http://example.com/page1"]}

    with patch("requests.post", return_value=mock_response) as mock_post:
        result = firecrawl.map_url("http://example.com")

    assert result == {"success": True, "links": ["http://example.com/page1"]}
    mock_post.assert_called_once_with(
        f"{API_URL}/v1/map",
        headers=EXPECTED_HEADERS,
        json={"url": "http://example.com"},
        timeout=60,
    )


def test_scrape_url(firecrawl, mock_response):
    """Test the scrape_url method."""
    mock_response.json.return_value = {"success": True, "data": {"title": "Example"}}

    with patch("requests.post", return_value=mock_response) as mock_post:
        result = firecrawl.scrape_url("http://example.com")

    assert result == {"success": True, "data": {"title": "Example"}}
    mock_post.assert_called_once_with(
        f"{API_URL}/v1/scrape",
        headers=EXPECTED_HEADERS,
        json={"url": "http://example.com"},
        timeout=60,
    )


def test_search(firecrawl, mock_response):
    """Test the search method."""
    mock_response.json.return_value = {"success": True, "results": [{"title": "Test Result"}]}

    with patch("requests.post", return_value=mock_response) as mock_post:
        result = firecrawl.search("test query")

    assert result == {"success": True, "results": [{"title": "Test Result"}]}
    mock_post.assert_called_once_with(
        f"{API_URL}/v1/search",
        headers=EXPECTED_HEADERS,
        json={"query": "test query"},
        timeout=60,
    )


def test_crawl_url(firecrawl, mock_response):
    """Test the crawl_url method."""
    mock_response.json.return_value = {"success": True, "id": "test_job_id"}

    with patch("requests.post", return_value=mock_response) as mock_post:
        result = firecrawl.crawl_url("http://example.com")

    assert result == {"success": True, "id": "test_job_id"}
    mock_post.assert_called_once_with(
        f"{API_URL}/v1/crawl",
        headers=EXPECTED_HEADERS,
        json={"url": "http://example.com"},
        timeout=60,
    )


def test_get_crawl_status(firecrawl, mock_response):
    """Test the get_crawl_status method."""
    mock_response.json.return_value = {"success": True, "status": "completed"}

    with patch("requests.get", return_value=mock_response) as mock_get:
        result = firecrawl.get_crawl_status("test_job_id")

    assert result == {"success": True, "status": "completed"}
    mock_get.assert_called_once_with(
        f"{API_URL}/v1/crawl/test_job_id",
        headers=EXPECTED_HEADERS,
        timeout=60,
    )


def test_extract(firecrawl, mock_response):
    """Test the extract method."""
    mock_response.json.return_value = {"success": True, "data": {"extracted": "content"}}

    with patch("requests.post", return_value=mock_response) as mock_post:
        result = firecrawl.extract(["http://example.com"])

    assert result == {"success": True, "data": {"extracted": "content"}}
    mock_post.assert_called_once_with(
        f"{API_URL}/v1/extract",
        headers=EXPECTED_HEADERS,
        json={"urls": ["http://example.com"]},
        timeout=60,
    )


def test_get_extract_status(firecrawl, mock_response):
    """Test the get_extract_status method."""
    mock_response.json.return_value = {"success": True, "status": "completed"}

    with patch("requests.get", return_value=mock_response) as mock_get:
        result = firecrawl.get_extract_status("test_job_id")

    assert result == {"success": True, "status": "completed"}
    mock_get.assert_called_once_with(
        f"{API_URL}/v1/extract/test_job_id",
        headers=EXPECTED_HEADERS,
        timeout=60,
    )


def test_error_handling(firecrawl):
    """Test error handling for non-200 responses."""
    mock_error_response = MagicMock()
    mock_error_response.status_code = 400
    mock_error_response.json.return_value = {
        "error": "Test error",
        "details": "Test error details",
    }

    with patch("requests.post", return_value=mock_error_response):
        with pytest.raises(requests.exceptions.HTTPError) as exc_info:
            firecrawl.map_url("http://example.com")

    assert "Test error" in str(exc_info.value)
    assert "Test error details" in str(exc_info.value)


# --- file: tests/metagpt/tools/test_firecrawl_tool.py (header + first tests) ---
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Desc : Tests for Firecrawl Tool

import os
from unittest.mock import MagicMock, patch

import pytest

# NOTE(review): metagpt/tools/firecrawl_tool.py and firecrawl_env.py are NOT
# added by this change set — confirm they exist elsewhere, otherwise this
# entire test module fails at import time.
from metagpt.tools.firecrawl_env import FirecrawlEnv
from metagpt.tools.firecrawl_tool import FirecrawlTool


@pytest.fixture
def firecrawl_tool():
    """Create a FirecrawlTool instance for testing with a patched API key."""
    with patch.dict(os.environ, {"FIRECRAWL_API_KEY": "test_api_key"}):
        return FirecrawlTool()


@pytest.mark.asyncio
async def test_map_url(firecrawl_tool):
    """Test the map_url method."""
    mock_response = {
        "success": True,
        "links": ["http://example.com/1", "http://example.com/2"],
    }

    with patch("requests.post") as mock_post:
        mock_post.return_value.status_code = 200
        mock_post.return_value.json.return_value = mock_response

        result = await firecrawl_tool.map_url("http://example.com")
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "map_url"
        assert firecrawl_tool.env.operation_status == "completed"


@pytest.mark.asyncio
async def test_scrape_url(firecrawl_tool):
    """Test the scrape_url method."""
    mock_response = {
        "success": True,
        "data": {"title": "Test Page", "content": "Test Content"},
    }

    with patch("requests.post") as mock_post:
        mock_post.return_value.status_code = 200
        mock_post.return_value.json.return_value = mock_response

        result = await firecrawl_tool.scrape_url("http://example.com")
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "scrape_url"
        assert firecrawl_tool.env.operation_status == "completed"
@pytest.mark.asyncio
async def test_search(firecrawl_tool):
    """Test the search method."""
    mock_response = {
        "success": True,
        "data": [
            {"title": "Result 1", "url": "http://example.com/1"},
            {"title": "Result 2", "url": "http://example.com/2"},
        ],
    }

    with patch("requests.post") as mock_post:
        mock_post.return_value.status_code = 200
        mock_post.return_value.json.return_value = mock_response

        result = await firecrawl_tool.search("test query")
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "search"
        assert firecrawl_tool.env.operation_status == "completed"


@pytest.mark.asyncio
async def test_crawl_url(firecrawl_tool):
    """Test the crawl_url method."""
    mock_response = {"success": True, "id": "test_job_id"}

    with patch("requests.post") as mock_post:
        mock_post.return_value.status_code = 200
        mock_post.return_value.json.return_value = mock_response

        result = await firecrawl_tool.crawl_url("http://example.com")
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "crawl_url"
        assert firecrawl_tool.env.operation_status == "completed"
        # A successful crawl submission is tracked as an active job.
        assert "test_job_id" in firecrawl_tool.env.active_jobs
        assert firecrawl_tool.env.active_jobs["test_job_id"] == "crawl"


@pytest.mark.asyncio
async def test_get_crawl_status(firecrawl_tool):
    """Test the get_crawl_status method."""
    mock_response = {"success": True, "status": "completed", "data": {"pages": 10}}

    firecrawl_tool.env.track_job("test_job_id", "crawl")

    with patch("requests.get") as mock_get:
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = mock_response

        result = await firecrawl_tool.get_crawl_status("test_job_id")
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "get_crawl_status"
        assert firecrawl_tool.env.operation_status == "completed"
        # A completed job is removed from the active-job registry.
        assert "test_job_id" not in firecrawl_tool.env.active_jobs


@pytest.mark.asyncio
async def test_extract(firecrawl_tool):
    """Test the extract method."""
    mock_response = {"success": True, "id": "test_job_id"}

    with patch("requests.post") as mock_post:
        mock_post.return_value.status_code = 200
        mock_post.return_value.json.return_value = mock_response

        result = await firecrawl_tool.extract(["http://example.com"])
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "extract"
        assert firecrawl_tool.env.operation_status == "completed"
        assert "test_job_id" in firecrawl_tool.env.active_jobs
        assert firecrawl_tool.env.active_jobs["test_job_id"] == "extract"


@pytest.mark.asyncio
async def test_get_extract_status(firecrawl_tool):
    """Test the get_extract_status method."""
    mock_response = {"success": True, "status": "completed", "data": {"extracted": 5}}

    firecrawl_tool.env.track_job("test_job_id", "extract")

    with patch("requests.get") as mock_get:
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = mock_response

        result = await firecrawl_tool.get_extract_status("test_job_id")
        assert result == mock_response
        assert firecrawl_tool.env.current_operation == "get_extract_status"
        assert firecrawl_tool.env.operation_status == "completed"
        assert "test_job_id" not in firecrawl_tool.env.active_jobs


@pytest.mark.asyncio
async def test_error_handling(firecrawl_tool):
    """Test error handling in the tool."""
    with patch("requests.post") as mock_post:
        mock_post.return_value.status_code = 500
        mock_post.return_value.json.return_value = {"error": "Internal Server Error"}

        with pytest.raises(Exception):
            await firecrawl_tool.map_url("http://example.com")

        assert firecrawl_tool.env.operation_status == "failed"


def test_missing_api_key():
    """Test initialization without API key."""
    # FIX: clear the environment so an exported FIRECRAWL_API_KEY on the
    # developer's machine cannot prevent the expected ValueError.
    with patch.dict(os.environ, {}, clear=True):
        with pytest.raises(ValueError):
            FirecrawlTool()