Commit 043d3b3

feat: add cookies integration

1 parent 10f1227 commit 043d3b3

File tree

8 files changed
+725 -3 lines changed

scrapegraph-py/README.md

Lines changed: 69 additions & 0 deletions
@@ -95,6 +95,75 @@ response = client.smartscraper(
</details>

<details>
<summary>🍪 Cookies Support</summary>

Use cookies for authentication and session management:

```python
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

# Define cookies for authentication
cookies = {
    "session_id": "abc123def456",
    "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
    "user_preferences": "dark_mode,usd"
}

response = client.smartscraper(
    website_url="https://example.com/dashboard",
    user_prompt="Extract user profile information",
    cookies=cookies
)
```

**Common Use Cases:**
- **E-commerce sites**: User authentication, shopping cart persistence
- **Social media**: Session management, user preferences
- **Banking/Financial**: Secure authentication, transaction history
- **News sites**: User preferences, subscription content
- **API endpoints**: Authentication tokens, API keys
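
Across these cases the pattern is the same: capture cookies from an authenticated session once, then pass them on each request. A minimal sketch, assuming the session was exported to a local `cookies.json` file (the file name and target URL here are illustrative, not part of the library):

```python
import json

from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

# Hypothetical file exported from an authenticated browser session,
# e.g. {"session_id": "abc123def456", "auth_token": "..."}
with open("cookies.json") as f:
    cookies = json.load(f)

# Reuse the saved session instead of logging in again
response = client.smartscraper(
    website_url="https://example.com/account/orders",
    user_prompt="Extract the most recent orders",
    cookies=cookies
)
print(response)
```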

</details>

<details>
<summary>🔄 Advanced Features</summary>

**Infinite Scrolling:**
```python
response = client.smartscraper(
    website_url="https://example.com/feed",
    user_prompt="Extract all posts from the feed",
    cookies=cookies,
    number_of_scrolls=10  # Scroll 10 times to load more content
)
```

**Pagination:**
```python
response = client.smartscraper(
    website_url="https://example.com/products",
    user_prompt="Extract all product information",
    cookies=cookies,
    total_pages=5  # Scrape 5 pages
)
```

**Combined with Cookies:**
```python
response = client.smartscraper(
    website_url="https://example.com/dashboard",
    user_prompt="Extract user data from all pages",
    cookies=cookies,
    number_of_scrolls=5,
    total_pages=3
)
```
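
Scrolling and pagination also compose with structured output. A minimal sketch, assuming the synchronous client accepts the same `output_schema` parameter used by the bundled async example; the `Post` and `Feed` models are illustrative:

```python
from typing import List

from pydantic import BaseModel, Field

from scrapegraph_py import Client


class Post(BaseModel):
    """Illustrative model for a single feed post."""

    title: str = Field(description="Post title")
    url: str = Field(description="Link to the post")


class Feed(BaseModel):
    """Illustrative model for the whole feed."""

    posts: List[Post] = Field(description="All posts extracted from the feed")


client = Client(api_key="your-api-key-here")
cookies = {"session_id": "abc123def456"}

response = client.smartscraper(
    website_url="https://example.com/feed",
    user_prompt="Extract all posts from the feed",
    cookies=cookies,
    number_of_scrolls=5,
    output_schema=Feed
)
```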

</details>

### 🔍 SearchScraper

Perform AI-powered web searches with structured results and reference URLs.
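
For context beyond this diff, a minimal SearchScraper sketch, assuming the client exposes a `searchscraper` method that takes a natural-language `user_prompt` (inferred from the library's docs, not from this commit):

```python
from scrapegraph_py import Client

client = Client(api_key="your-api-key-here")

response = client.searchscraper(
    user_prompt="What are the latest features of the ScrapeGraphAI API?"
)

# The response is expected to carry the structured answer together
# with the reference URLs it was drawn from.
print(response)
```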
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
"""
Example demonstrating how to use the SmartScraper API with cookies (Async).

This example shows how to:
1. Set up the API request with cookies for authentication
2. Use cookies with infinite scrolling
3. Define a Pydantic model for structured output
4. Make the API call and handle the response
5. Process the extracted data

Requirements:
- Python 3.7+
- scrapegraph-py
- A .env file with your SGAI_API_KEY

Example .env file:
SGAI_API_KEY=your_api_key_here
"""

import asyncio
import json
import os
from typing import Dict

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from scrapegraph_py import AsyncClient

# Load environment variables from .env file
load_dotenv()


# Define the data models for structured output
class CookieInfo(BaseModel):
    """Model representing cookie information."""

    cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs")


async def main():
    """Example usage of the cookies scraper."""
    # Check if the API key is available
    if not os.getenv("SGAI_API_KEY"):
        print("Error: SGAI_API_KEY not found in .env file")
        print("Please create a .env file with your API key:")
        print("SGAI_API_KEY=your_api_key_here")
        return

    # Initialize the async client
    async with AsyncClient.from_env() as client:
        # Example 1: Basic cookies example (httpbin.org/cookies)
        print("=" * 60)
        print("EXAMPLE 1: Basic Cookies Example")
        print("=" * 60)

        website_url = "https://httpbin.org/cookies"
        user_prompt = "Extract all cookies info"
        cookies = {"cookies_key": "cookies_value"}

        try:
            # Perform the scraping with cookies
            response = await client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
                cookies=cookies,
                output_schema=CookieInfo,
            )

            # Print the results
            print("\nExtracted Cookie Information:")
            print(json.dumps(response, indent=2))

        except Exception as e:
            print(f"Error occurred: {str(e)}")

        # Example 2: Cookies with infinite scrolling
        print("\n" + "=" * 60)
        print("EXAMPLE 2: Cookies with Infinite Scrolling")
        print("=" * 60)

        website_url = "https://httpbin.org/cookies"
        user_prompt = "Extract all cookies and scroll information"
        cookies = {"session_id": "abc123", "user_token": "xyz789"}

        try:
            # Perform the scraping with cookies and infinite scrolling
            response = await client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
                cookies=cookies,
                number_of_scrolls=3,
                output_schema=CookieInfo,
            )

            # Print the results
            print("\nExtracted Cookie Information with Scrolling:")
            print(json.dumps(response, indent=2))

        except Exception as e:
            print(f"Error occurred: {str(e)}")

        # Example 3: Cookies with pagination
        print("\n" + "=" * 60)
        print("EXAMPLE 3: Cookies with Pagination")
        print("=" * 60)

        website_url = "https://httpbin.org/cookies"
        user_prompt = "Extract all cookies from multiple pages"
        cookies = {"auth_token": "secret123", "preferences": "dark_mode"}

        try:
            # Perform the scraping with cookies and pagination
            response = await client.smartscraper(
                website_url=website_url,
                user_prompt=user_prompt,
                cookies=cookies,
                total_pages=3,
                output_schema=CookieInfo,
            )

            # Print the results
            print("\nExtracted Cookie Information with Pagination:")
            print(json.dumps(response, indent=2))

        except Exception as e:
            print(f"Error occurred: {str(e)}")


if __name__ == "__main__":
    asyncio.run(main())
