Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions human-fallback-agent/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
FIRECRAWL_API_KEY=your_firecrawl_key
HUMANPAGES_API_KEY=hp_your_key
36 changes: 36 additions & 0 deletions human-fallback-agent/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Human Fallback Agent

Scrape with Firecrawl, delegate failures to real humans via Human Pages.

When Firecrawl can't extract data from a page (login walls, CAPTCHAs, dynamic gated content), this agent finds a human who can do it manually and returns structured data.

## Setup

```bash
pip install firecrawl-py requests
```

```bash
cp .env.example .env
# Add your keys
```

## How it works

1. Try scraping with Firecrawl
2. If extraction fails or returns empty, search Human Pages for someone who can help
3. Create a job offer describing what data to extract
4. Poll until the human submits results
5. Continue with the extracted data

## Run

```bash
python agent.py
```

## Links

- [Firecrawl docs](https://docs.firecrawl.dev)
- [Human Pages docs](https://humanpages.ai/dev)
- [Human Pages MCP server](https://www.npmjs.com/package/humanpages)
108 changes: 108 additions & 0 deletions human-fallback-agent/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
Firecrawl + Human Pages: scrape with fallback to real humans.

Try Firecrawl first. If extraction fails, hire a human via Human Pages
to extract the data manually.
"""

import os
import time

import requests
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()  # pull FIRECRAWL_API_KEY / HUMANPAGES_API_KEY from .env into os.environ

# Fail fast (KeyError) at import time if either API key is missing.
firecrawl = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
HP_KEY = os.environ["HUMANPAGES_API_KEY"]
HP_BASE = "https://humanpages.ai"  # Human Pages API root
HP_HEADERS = {"Content-Type": "application/json", "X-Agent-Key": HP_KEY}

# Demo target and extraction instruction — edit these for your own use case.
TARGET_URL = "https://example.com/pricing"
EXTRACT_PROMPT = "Extract all pricing tiers with name, price, and features list"


def scrape_with_firecrawl(url: str, prompt: str) -> dict | None:
    """Attempt structured extraction from *url* with Firecrawl.

    Returns the extracted data, or None when extraction fails, comes back
    empty, or Firecrawl raises. Returning None (rather than propagating the
    error) is what lets the caller fall back to a human worker, per the
    documented flow ("If extraction fails or returns empty, search Human
    Pages").
    """
    try:
        result = firecrawl.scrape_url(
            url, params={"formats": ["extract"], "extract": {"prompt": prompt}}
        )
    except Exception as exc:
        # Any scrape failure (network, auth, blocked page) means "use the
        # human fallback" — crashing here would defeat the agent's purpose.
        print(f"Firecrawl error: {exc}")
        return None
    # Guard against a None/partial response lacking the "extract" key.
    extracted = (result or {}).get("extract")
    # Heuristic: very short extractions (empty dicts, "N/A", ...) count as
    # failures so the human fallback kicks in.
    if extracted and len(str(extracted)) > 20:
        return extracted
    return None


def search_humans(skill: str) -> list:
    """Return humans on Human Pages who match *skill* and are available.

    Raises requests.HTTPError on a non-2xx response.
    """
    query = {"skill": skill, "available": "true"}
    response = requests.get(
        f"{HP_BASE}/api/humans/search",
        params=query,
        headers=HP_HEADERS,
    )
    response.raise_for_status()
    payload = response.json()
    return payload["results"]


def create_job(human_id: str, url: str, prompt: str) -> str:
    """Offer a paid extraction job to *human_id*; return the new job's id.

    Raises requests.HTTPError on a non-2xx response.
    """
    payload = {
        "humanId": human_id,
        "title": f"Extract data from {url}",
        "description": f"Please visit {url} and extract the following:\n\n{prompt}\n\nReturn the data as JSON.",
        "priceUsdc": "5.00",
        "deadlineHours": 2,
    }
    response = requests.post(f"{HP_BASE}/api/jobs", headers=HP_HEADERS, json=payload)
    response.raise_for_status()
    body = response.json()
    return body["job"]["id"]


def wait_for_job(job_id: str, timeout: int = 3600) -> dict:
    """Poll the job every 30s until it reaches a terminal state.

    Returns the job dict once its status is SUBMITTED or COMPLETED.
    Raises RuntimeError when the job is REJECTED or CANCELLED, and
    TimeoutError after *timeout* seconds without a terminal state.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        response = requests.get(f"{HP_BASE}/api/jobs/{job_id}", headers=HP_HEADERS)
        response.raise_for_status()
        job = response.json()["job"]
        status = job["status"]
        if status in ("SUBMITTED", "COMPLETED"):
            return job
        if status in ("REJECTED", "CANCELLED"):
            raise RuntimeError(f"Job {job_id} was {job['status']}")
        print(f"  Job status: {job['status']}, waiting...")
        time.sleep(30)
    raise TimeoutError(f"Job {job_id} timed out")


def main():
    """Scrape TARGET_URL; when Firecrawl fails, hire a human to extract it.

    Returns the extracted data on a Firecrawl success, the finished job dict
    when a human completed it, or None when no human was available.
    """
    print(f"Scraping {TARGET_URL} with Firecrawl...")
    data = scrape_with_firecrawl(TARGET_URL, EXTRACT_PROMPT)

    if data:
        print("Firecrawl succeeded:")
        print(data)
        return data

    # Firecrawl came up empty — delegate the extraction to a person.
    print("Firecrawl couldn't extract the data. Searching for a human...")
    candidates = search_humans("data entry")
    if not candidates:
        print("No humans available.")
        return None

    worker = candidates[0]
    print(f"Found {worker.get('name', worker['id'])} — creating job...")
    job_id = create_job(worker["id"], TARGET_URL, EXTRACT_PROMPT)
    print(f"Job created: {job_id}. Waiting for human...")

    job = wait_for_job(job_id)
    print(f"Human finished! Status: {job['status']}")

    # Surface what the human sent back on the job's message thread.
    thread = requests.get(f"{HP_BASE}/api/jobs/{job_id}/messages", headers=HP_HEADERS)
    thread.raise_for_status()
    for message in thread.json()["messages"]:
        if message["role"] == "human":
            print(f"Human's response: {message['content']}")

    return job


if __name__ == "__main__":
    main()  # allows `python agent.py` while keeping the module importable
3 changes: 3 additions & 0 deletions human-fallback-agent/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
firecrawl-py>=1.0.0
requests>=2.31.0
python-dotenv>=1.0.0