diff --git a/ag2-web-research/.env.example b/ag2-web-research/.env.example
new file mode 100644
index 00000000..8c39d8a8
--- /dev/null
+++ b/ag2-web-research/.env.example
@@ -0,0 +1,4 @@
+FIRECRAWL_API_KEY=fc-your_firecrawl_api_key
+OPENAI_API_KEY=your_openai_api_key
+OPENAI_BASE_URL=https://api.openai.com/v1 # override for Azure, SambaNova, etc.
+LLM_MODEL=gpt-4o-mini
diff --git a/ag2-web-research/README.md b/ag2-web-research/README.md
new file mode 100644
index 00000000..d6bac475
--- /dev/null
+++ b/ag2-web-research/README.md
@@ -0,0 +1,45 @@
+# AG2 Web Research Pipeline with Firecrawl
+
+A multi-agent web research pipeline using [AG2](https://github.com/ag2ai/ag2) (formerly AutoGen)
+and [Firecrawl](https://www.firecrawl.dev/).
+
+Three specialist agents run in sequence via `GroupChat`:
+
+```
+searcher ──> scraper ──> reporter
+    │           │            │
+search_web() scrape_pages() deep_research()
+(Firecrawl)  (Firecrawl)    (Firecrawl)
+```
+
+## Features
+
+- **AG2 GroupChat** — three specialist agents with `round_robin` ordering; turns cycle searcher → scraper → reporter until a message contains `TERMINATE` (capped at 12 rounds)
+- **Firecrawl tools** registered via `@register_for_llm` / `@register_for_execution` — tool description and execution are separate, independently replaceable
+- **Firecrawl deep research** — premium analysis tool available to the reporter for additional depth
+
+## Prerequisites
+
+- [Firecrawl API key](https://www.firecrawl.dev/) (free tier works for search + scrape)
+- OpenAI API key (or compatible endpoint)
+
+## Quick Start
+
+```bash
+cd ag2-web-research
+pip install -r requirements.txt
+cp .env.example .env # add FIRECRAWL_API_KEY and OPENAI_API_KEY
+python main.py "Latest developments in AI agent frameworks"
+```
+
+## How It Works
+
+1. **searcher** calls `firecrawl.search()` to find relevant pages across multiple angles of the topic
+2. **scraper** reads the searcher's findings, selects the top 3 URLs, and calls `firecrawl.scrape_url()` on each
+3. 
**reporter** synthesises all content into a structured Markdown report; may call `firecrawl.deep_research()` for additional depth
+
+## AG2 Concepts Demonstrated
+
+- `GroupChat` with `speaker_selection_method="round_robin"` — deterministic agent ordering
+- `@register_for_llm` / `@register_for_execution` decorator pattern — LLM tool description separated from execution
+- `is_termination_msg` — reporter signals pipeline completion
diff --git a/ag2-web-research/main.py b/ag2-web-research/main.py
new file mode 100644
index 00000000..adcf8dc5
--- /dev/null
+++ b/ag2-web-research/main.py
@@ -0,0 +1,186 @@
+"""
+Multi-agent web research pipeline using AG2 (formerly AutoGen).
+
+Three specialist agents collaborate in a fixed sequence via GroupChat:
+    searcher — finds relevant pages via Firecrawl search
+    scraper  — extracts content from top results via Firecrawl scrape
+    reporter — synthesises findings into a structured Markdown report
+
+Tools are registered using AG2's @register_for_llm / @register_for_execution
+decorator pattern, separating tool description (for the LLM) from execution.
+"""
+import os
+import sys
+
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+from autogen import ConversableAgent, GroupChat, GroupChatManager, UserProxyAgent
+
+load_dotenv()
+
+# ── Constants ──────────────────────────────────────────────────────────────────
+
+MAX_SCRAPE_URLS = 3  # keep in step with the "(max 3)" in scrape_pages' description
+MAX_CONTENT_CHARS = 2000  # per-page Markdown is cut to this many characters
+DEEP_RESEARCH_MAX_DEPTH = 3
+DEEP_RESEARCH_TIME_LIMIT = 60
+TERMINATION_KEYWORD = "TERMINATE"  # the reporter's prompt asks it to end with this word
+DEFAULT_TOPIC = "Latest developments in AI agent frameworks 2025"
+
+# ── Firecrawl client ───────────────────────────────────────────────────────────
+
+firecrawl = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
+
+# ── LLM config ────────────────────────────────────────────────────────────────
+
+llm_config = {
+    "config_list": [{
+        "model": os.getenv("LLM_MODEL", "gpt-4o-mini"),
+        "api_key": os.environ["OPENAI_API_KEY"],
+        "base_url": os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
+    }],
+    "temperature": 0.3,
+    "cache_seed": None,
+}
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+
+def _field(obj, key, default=None):
+    """Read ``key`` from a dict, or from an attribute of any other object.
+
+    A missing or ``None`` value yields ``default``, so callers can slice or
+    format the result directly. NOTE(review): newer firecrawl-py releases
+    appear to return model objects rather than dicts; confirm against the
+    installed version.
+    """
+    if isinstance(obj, dict):
+        value = obj.get(key, default)
+    else:
+        value = getattr(obj, key, default)
+    return default if value is None else value
+
+
+def _is_terminate(message: dict) -> bool:
+    """Return True when the message's text content contains TERMINATION_KEYWORD."""
+    content = message.get("content")
+    return isinstance(content, str) and TERMINATION_KEYWORD in content
+
+
+# ── Tool functions ─────────────────────────────────────────────────────────────
+# Each docstring is passed to register_for_llm() as the tool description, so
+# keep it to a single line and put implementation notes in comments instead.
+
+def search_web(query: str, num_results: int = 5) -> str:
+    """Search the web using Firecrawl and return top results with URLs and snippets."""
+    response = firecrawl.search(query, limit=num_results)
+    hits = _field(response, "data", [])
+    formatted = "\n".join(
+        f"{rank}. {_field(hit, 'title', 'No title')}\n"
+        f" URL: {_field(hit, 'url', '')}\n"
+        f" {_field(hit, 'description', '')}"
+        for rank, hit in enumerate(hits, start=1)
+    )
+    return f"Found {len(hits)} results:\n\n{formatted}"
+
+
+def scrape_pages(urls: list[str]) -> str:
+    """Scrape and extract Markdown content from a list of URLs using Firecrawl (max 3)."""
+    sections = []
+    for url in urls[:MAX_SCRAPE_URLS]:
+        try:
+            page = firecrawl.scrape_url(url, formats=["markdown"])
+            body = _field(page, "markdown", "")[:MAX_CONTENT_CHARS] or "No content"
+        except Exception as exc:
+            # Best effort: report the failure for this URL and keep going.
+            body = f"Error: {exc}"
+        sections.append(f"**{url}**\n{body}")
+    return "\n\n".join(sections)
+
+
+def deep_research(topic: str) -> str:
+    """Run Firecrawl deep research on a topic for comprehensive analysis."""
+    result = firecrawl.deep_research(
+        topic,
+        max_depth=DEEP_RESEARCH_MAX_DEPTH,
+        time_limit=DEEP_RESEARCH_TIME_LIMIT,
+    )
+    data = _field(result, "data", {})
+    return _field(data, "finalAnalysis", "Deep research returned no analysis.")
+
+
+# ── Agents ─────────────────────────────────────────────────────────────────────
+
+searcher = ConversableAgent(
+    name="searcher",
+    system_message=(
+        "You are a web research specialist. Use the search_web tool to find the most "
+        "relevant and recent pages on the research topic. Perform at least 3 searches "
+        "covering different angles. Report the URLs and key findings."
+    ),
+    llm_config=llm_config,
+)
+
+scraper = ConversableAgent(
+    name="scraper",
+    system_message=(
+        "You are a content extraction specialist. Review the searcher's findings, "
+        "identify the top 3 most relevant URLs, and use the scrape_pages tool to extract "
+        "their full content. Summarise the key information from each page."
+    ),
+    llm_config=llm_config,
+)
+
+reporter = ConversableAgent(
+    name="reporter",
+    system_message=(
+        "You are a research analyst and writer. Synthesise all findings from the searcher "
+        "and scraper into a comprehensive Markdown report with:\n"
+        "## Executive Summary\n"
+        "## Key Findings\n"
+        "## Detailed Analysis\n"
+        "## Sources\n"
+        "## Conclusion\n"
+        "You may use the deep_research tool for additional depth on key points. "
+        "End your final message with TERMINATE."
+    ),
+    llm_config=llm_config,
+    is_termination_msg=_is_terminate,
+)
+
+# ── Tool registration ──────────────────────────────────────────────────────────
+# @register_for_llm — provides tool description to the agent's LLM
+# @register_for_execution — provides the callable so the agent can run it
+
+searcher.register_for_llm(name="search_web", description=search_web.__doc__)(search_web)
+searcher.register_for_execution(name="search_web")(search_web)
+
+scraper.register_for_llm(name="scrape_pages", description=scrape_pages.__doc__)(scrape_pages)
+scraper.register_for_execution(name="scrape_pages")(scrape_pages)
+
+reporter.register_for_llm(name="deep_research", description=deep_research.__doc__)(deep_research)
+reporter.register_for_execution(name="deep_research")(deep_research)
+
+# ── GroupChat ──────────────────────────────────────────────────────────────────
+
+groupchat = GroupChat(
+    agents=[searcher, scraper, reporter],
+    messages=[],
+    max_round=12,
+    speaker_selection_method="round_robin",
+)
+manager = GroupChatManager(
+    groupchat=groupchat,
+    llm_config=llm_config,
+    is_termination_msg=_is_terminate,
+)
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+def run_research(topic: str) -> None:
+    """Open the group chat about ``topic`` from a proxy with no human input or code execution."""
+    user = UserProxyAgent(name="user", human_input_mode="NEVER", code_execution_config=False)
+    user.initiate_chat(manager, message=f"Research this topic thoroughly: {topic}")
+
+
+if __name__ == "__main__":
+    topic = " ".join(sys.argv[1:]).strip() or DEFAULT_TOPIC
+    print(f"\n=== AG2 Web Research Pipeline ===\nTopic: {topic}\n")
+    run_research(topic)
diff --git a/ag2-web-research/requirements.txt b/ag2-web-research/requirements.txt
new file mode 100644
index 00000000..6d575a07
--- /dev/null
+++ b/ag2-web-research/requirements.txt
@@ -0,0 +1,3 @@
+ag2[openai]>=0.11.0
+firecrawl-py>=1.5.0
+python-dotenv>=1.0.0
diff --git 
a/ag2-web-research/tests/conftest.py b/ag2-web-research/tests/conftest.py
new file mode 100644
index 00000000..4fb3b4a9
--- /dev/null
+++ b/ag2-web-research/tests/conftest.py
@@ -0,0 +1,27 @@
+"""
+Stub firecrawl and add project root to sys.path before test collection.
+test_tools.py patches firecrawl.FirecrawlApp at module level, so the stub
+must be in sys.modules before that file is imported.
+"""
+import os
+import sys
+from types import ModuleType
+from unittest.mock import MagicMock
+
+# Add project root so 'import main' works
+parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if parent not in sys.path:
+    sys.path.insert(0, parent)
+
+
+def _stub_module(name: str, **attrs) -> ModuleType:
+    """Register an empty module called ``name`` in sys.modules with ``attrs`` set on it."""
+    stub = ModuleType(name)
+    for attr_name, value in attrs.items():
+        setattr(stub, attr_name, value)
+    sys.modules[name] = stub
+    return stub
+
+
+# Stub firecrawl so patch("firecrawl.FirecrawlApp") can resolve the module
+_stub_module("firecrawl", FirecrawlApp=MagicMock())
diff --git a/ag2-web-research/tests/test_agent_setup.py b/ag2-web-research/tests/test_agent_setup.py
new file mode 100644
index 00000000..04dab7c4
--- /dev/null
+++ b/ag2-web-research/tests/test_agent_setup.py
@@ -0,0 +1,47 @@
+"""Verify GroupChat agent setup without initiating chat or LLM calls."""
+import os
+from unittest.mock import patch
+
+import pytest
+
+os.environ.setdefault("FIRECRAWL_API_KEY", "test")
+os.environ.setdefault("OPENAI_API_KEY", "test")
+
+# main.py builds the Firecrawl client at import time, so patch before importing.
+with patch("firecrawl.FirecrawlApp"):
+    import main as m
+
+
+def test_agents_created():
+    assert m.searcher.name == "searcher"
+    assert m.scraper.name == "scraper"
+    assert m.reporter.name == "reporter"
+
+
+@pytest.mark.parametrize(
+    ("content", "expected"),
+    [
+        ("Report. TERMINATE", True),
+        ("Still working", False),
+        (None, False),
+    ],
+)
+def test_termination_condition(content, expected):
+    assert m.reporter._is_termination_msg({"content": content}) is expected
+
+
+@pytest.mark.parametrize(
+    ("agent_name", "tool_name"),
+    [
+        ("searcher", "search_web"),
+        ("scraper", "scrape_pages"),
+        ("reporter", "deep_research"),
+    ],
+)
+def test_tools_registered_for_execution(agent_name, tool_name):
+    assert tool_name in getattr(m, agent_name).function_map
+
+
+def test_groupchat_round_robin():
+    assert m.groupchat.speaker_selection_method == "round_robin"
+    assert [a.name for a in m.groupchat.agents] == ["searcher", "scraper", "reporter"]
diff --git a/ag2-web-research/tests/test_tools.py b/ag2-web-research/tests/test_tools.py
new file mode 100644
index 00000000..350d5294
--- /dev/null
+++ b/ag2-web-research/tests/test_tools.py
@@ -0,0 +1,71 @@
+"""Unit tests for Firecrawl tool functions — no API calls."""
+import os
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+os.environ.setdefault("FIRECRAWL_API_KEY", "test-key")
+os.environ.setdefault("OPENAI_API_KEY", "test-key")
+
+# Patch FirecrawlApp before importing main
+with patch("firecrawl.FirecrawlApp") as mock_fc_cls:
+    mock_fc = MagicMock()
+    mock_fc_cls.return_value = mock_fc
+    import main as ag2_main
+
+
+@pytest.fixture(autouse=True)
+def _fresh_firecrawl_mock():
+    """Clear recorded calls, return values and side effects around every test."""
+    ag2_main.firecrawl.reset_mock(return_value=True, side_effect=True)
+    yield
+    ag2_main.firecrawl.reset_mock(return_value=True, side_effect=True)
+
+
+def test_search_web_returns_formatted_results():
+    ag2_main.firecrawl.search.return_value = {
+        "data": [
+            {"title": "Page 1", "url": "https://example.com/1", "description": "Desc 1"},
+            {"title": "Page 2", "url": "https://example.com/2", "description": "Desc 2"},
+        ]
+    }
+    result = ag2_main.search_web("test query", num_results=2)
+    assert "Page 1" in result
+    assert "https://example.com/1" in result
+    assert "Found 2 results" in result
+    ag2_main.firecrawl.search.assert_called_once_with("test query", limit=2)
+
+
+def test_search_web_handles_empty_results():
+    ag2_main.firecrawl.search.return_value = {"data": []}
+    result = ag2_main.search_web("obscure topic")
+    assert "Found 0 results" in result
+
+
+def test_scrape_pages_limits_to_three():
+    ag2_main.firecrawl.scrape_url.return_value = {"markdown": "Content here"}
+    urls = [f"https://example.com/{i}" for i in range(5)]
+    result = ag2_main.scrape_pages(urls)
+    assert ag2_main.firecrawl.scrape_url.call_count == 3  # max 3
+    assert urls[2] in result
+    assert urls[3] not in result
+
+
+def test_scrape_pages_handles_errors_gracefully():
+    ag2_main.firecrawl.scrape_url.side_effect = Exception("Rate limited")
+    result = ag2_main.scrape_pages(["https://example.com/fail"])
+    assert "https://example.com/fail" in result
+    assert "Rate limited" in result
+
+
+def test_deep_research_returns_analysis():
+    ag2_main.firecrawl.deep_research.return_value = {
+        "data": {"finalAnalysis": "This is the deep analysis."}
+    }
+    result = ag2_main.deep_research("AI agents")
+    assert "deep analysis" in result
+
+
+def test_deep_research_reports_missing_analysis():
+    ag2_main.firecrawl.deep_research.return_value = {"data": {}}
+    assert ag2_main.deep_research("AI agents") == "Deep research returned no analysis."