diff --git a/human-fallback-agent/.env.example b/human-fallback-agent/.env.example
new file mode 100644
index 00000000..dcc710c7
--- /dev/null
+++ b/human-fallback-agent/.env.example
@@ -0,0 +1,2 @@
+FIRECRAWL_API_KEY=your_firecrawl_key
+HUMANPAGES_API_KEY=hp_your_key
diff --git a/human-fallback-agent/README.md b/human-fallback-agent/README.md
new file mode 100644
index 00000000..211cd388
--- /dev/null
+++ b/human-fallback-agent/README.md
@@ -0,0 +1,36 @@
+# Human Fallback Agent
+
+Scrape with Firecrawl, delegate failures to real humans via Human Pages.
+
+When Firecrawl can't extract data from a page (login walls, CAPTCHAs, dynamic gated content), this agent finds a human who can do it manually and returns structured data.
+
+## Setup
+
+```bash
+pip install -r requirements.txt
+```
+
+```bash
+cp .env.example .env
+# Add your keys
+```
+
+## How it works
+
+1. Try scraping with Firecrawl
+2. If extraction fails or returns empty, search Human Pages for someone who can help
+3. Create a job offer describing what data to extract
+4. Poll until the human submits results
+5. Continue with the extracted data
+
+## Run
+
+```bash
+python agent.py
+```
+
+## Links
+
+- [Firecrawl docs](https://docs.firecrawl.dev)
+- [Human Pages docs](https://humanpages.ai/dev)
+- [Human Pages MCP server](https://www.npmjs.com/package/humanpages)
diff --git a/human-fallback-agent/agent.py b/human-fallback-agent/agent.py
new file mode 100644
index 00000000..bd2f1750
--- /dev/null
+++ b/human-fallback-agent/agent.py
@@ -0,0 +1,166 @@
+"""
+Firecrawl + Human Pages: scrape with fallback to real humans.
+
+Try Firecrawl first. If extraction fails, hire a human via Human Pages
+to extract the data manually.
+"""
+
+import os
+import time
+
+import requests
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+firecrawl = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
+HP_KEY = os.environ["HUMANPAGES_API_KEY"]
+HP_BASE = "https://humanpages.ai"
+HP_HEADERS = {"Content-Type": "application/json", "X-Agent-Key": HP_KEY}
+# Seconds allowed for any single Human Pages HTTP call; requests has no
+# default timeout and would otherwise block forever on a stalled server.
+HP_TIMEOUT = 30
+
+TARGET_URL = "https://example.com/pricing"
+EXTRACT_PROMPT = "Extract all pricing tiers with name, price, and features list"
+# An extraction whose string form is no longer than this counts as empty.
+MIN_EXTRACT_CHARS = 20
+JOB_PRICE_USDC = "5.00"
+JOB_DEADLINE_HOURS = 2
+
+
+def scrape_with_firecrawl(url: str, prompt: str) -> dict | None:
+    """Return Firecrawl's structured extraction for *url*, or None on failure.
+
+    None tells the caller to fall back to a human: either the request
+    failed or the extraction came back empty or trivially small.
+    """
+    try:
+        result = firecrawl.scrape_url(
+            url, params={"formats": ["extract"], "extract": {"prompt": prompt}}
+        )
+    except requests.exceptions.RequestException as err:
+        # A blocked or failing page is exactly what the human fallback is
+        # for, so report it instead of crashing before the fallback runs.
+        print(f"Firecrawl request failed: {err}")
+        return None
+    extracted = result.get("extract") if result else None
+    if extracted and len(str(extracted)) > MIN_EXTRACT_CHARS:
+        return extracted
+    return None
+
+
+def search_humans(skill: str) -> list:
+    """Return the available humans that Human Pages lists for *skill*.
+
+    Raises requests.RequestException on a network failure or HTTP error status.
+    """
+    resp = requests.get(
+        f"{HP_BASE}/api/humans/search",
+        params={"skill": skill, "available": "true"},
+        headers=HP_HEADERS,
+        timeout=HP_TIMEOUT,
+    )
+    resp.raise_for_status()
+    return resp.json()["results"]
+
+
+def create_job(
+    human_id: str,
+    url: str,
+    prompt: str,
+    price_usdc: str = JOB_PRICE_USDC,
+    deadline_hours: int = JOB_DEADLINE_HOURS,
+) -> str:
+    """Offer *human_id* a job to extract *prompt* from *url*; return the job id.
+
+    Raises requests.RequestException on a network failure or HTTP error status.
+    """
+    resp = requests.post(
+        f"{HP_BASE}/api/jobs",
+        headers=HP_HEADERS,
+        json={
+            "humanId": human_id,
+            "title": f"Extract data from {url}",
+            "description": f"Please visit {url} and extract the following:\n\n{prompt}\n\nReturn the data as JSON.",
+            "priceUsdc": price_usdc,
+            "deadlineHours": deadline_hours,
+        },
+        timeout=HP_TIMEOUT,
+    )
+    resp.raise_for_status()
+    return resp.json()["job"]["id"]
+
+
+def wait_for_job(job_id: str, timeout: int = 3600, poll_interval: int = 30) -> dict:
+    """Poll the job until a human submits it, then return the job payload.
+
+    Raises RuntimeError if the job is rejected or cancelled, and
+    TimeoutError if it is still pending after *timeout* seconds.
+    """
+    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
+    deadline = time.monotonic() + timeout
+    while True:
+        resp = requests.get(
+            f"{HP_BASE}/api/jobs/{job_id}", headers=HP_HEADERS, timeout=HP_TIMEOUT
+        )
+        resp.raise_for_status()
+        job = resp.json()["job"]
+        if job["status"] in ("SUBMITTED", "COMPLETED"):
+            return job
+        if job["status"] in ("REJECTED", "CANCELLED"):
+            raise RuntimeError(f"Job {job_id} was {job['status']}")
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        print(f" Job status: {job['status']}, waiting...")
+        # Never sleep past the deadline.
+        time.sleep(min(poll_interval, remaining))
+    raise TimeoutError(f"Job {job_id} timed out")
+
+
+def main():
+    """Scrape TARGET_URL, hiring a human via Human Pages if Firecrawl fails.
+
+    Returns the extracted data when Firecrawl succeeds, the finished job
+    payload when a human did the work, or None when no human is available.
+    """
+    print(f"Scraping {TARGET_URL} with Firecrawl...")
+    data = scrape_with_firecrawl(TARGET_URL, EXTRACT_PROMPT)
+
+    if data:
+        print("Firecrawl succeeded:")
+        print(data)
+        return data
+
+    print("Firecrawl couldn't extract the data. Searching for a human...")
+    humans = search_humans("data entry")
+    if not humans:
+        print("No humans available.")
+        return None
+
+    human = humans[0]
+    print(f"Found {human.get('name', human['id'])} — creating job...")
+    job_id = create_job(human["id"], TARGET_URL, EXTRACT_PROMPT)
+    print(f"Job created: {job_id}. Waiting for human...")
+
+    # Keep polling for as long as the human is allowed to work on the job.
+    job = wait_for_job(job_id, timeout=JOB_DEADLINE_HOURS * 3600)
+    print(f"Human finished! Status: {job['status']}")
+
+    msgs = requests.get(
+        f"{HP_BASE}/api/jobs/{job_id}/messages",
+        headers=HP_HEADERS,
+        timeout=HP_TIMEOUT,
+    )
+    msgs.raise_for_status()
+    for msg in msgs.json()["messages"]:
+        if msg["role"] == "human":
+            print(f"Human's response: {msg['content']}")
+
+    return job
+
+
+if __name__ == "__main__":
+    main()
diff --git a/human-fallback-agent/requirements.txt b/human-fallback-agent/requirements.txt
new file mode 100644
index 00000000..294aa1c6
--- /dev/null
+++ b/human-fallback-agent/requirements.txt
@@ -0,0 +1,3 @@
+firecrawl-py>=1.0.0
+requests>=2.31.0
+python-dotenv>=1.0.0