Skip to content

Commit d563b0b

Browse files
committed
doc: add examples
1 parent 10f1227 commit d563b0b

10 files changed

+2714
-0
lines changed

scrapegraph-py/examples/async/async_smartscraper_cookies_example.py

Lines changed: 414 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SGAI_API_KEY="your_sgai_api_key"
Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Async Step-by-Step Cookies Example
4+
5+
This example demonstrates how to use cookies with SmartScraper API using async/await patterns.
6+
It shows how to set up and execute requests with custom cookies for authentication and session management.
7+
"""
8+
9+
import asyncio
10+
import json
11+
import logging
12+
import os
13+
import time
14+
15+
import httpx
16+
from dotenv import load_dotenv
17+
18+
# Send INFO-and-above records through a stream handler, stamped with time and level.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load environment variables from a .env file before any step reads them.
load_dotenv()
28+
29+
30+
async def step_1_environment_setup():
    """Step 1: Locate the API key in the environment.

    ``TEST_API_KEY`` is checked first (the original behavior); when it is
    missing or empty the function falls back to ``SGAI_API_KEY``, the
    variable name the accompanying env sample defines, so either name works.

    Returns:
        The API key string, or ``None`` when neither variable holds a value.
    """
    print("STEP 1: Environment Setup")
    print("=" * 40)

    # Check if API key is available under either supported name
    api_key = os.getenv("TEST_API_KEY") or os.getenv("SGAI_API_KEY")
    if not api_key:
        print("❌ Error: TEST_API_KEY (or SGAI_API_KEY) environment variable not set")
        print("Please either:")
        print(" 1. Set environment variable: export TEST_API_KEY='your-api-key-here'")
        print(" 2. Create a .env file with: TEST_API_KEY=your-api-key-here")
        return None

    # Reveal the head/tail only when at least half of the key stays hidden;
    # slicing a short key as [:8] + [-4:] would echo it in full.
    if len(api_key) >= 24:
        masked_key = f"{api_key[:8]}...{api_key[-4:]}"
    else:
        masked_key = "********"

    print("✅ API key found in environment")
    print(f"🔑 API Key: {masked_key}")
    return api_key
47+
48+
49+
async def step_2_server_connectivity_check(api_key):
    """Step 2: Probe the local server's health endpoint.

    Args:
        api_key: Accepted by the signature but not used by the probe.

    Returns:
        ``True`` when a GET on the health URL answers with status 200,
        ``False`` for any other status or when any exception is raised.
    """
    print("\nSTEP 2: Server Connectivity Check")
    print("=" * 40)

    # The health URL is derived from the scraper endpoint so both share
    # the same host and port.
    endpoint = "http://localhost:8001/v1/smartscraper"
    health_url = endpoint.replace("/v1/smartscraper", "/healthz")

    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            response = await client.get(health_url)

            if response.status_code != 200:
                print(f"❌ Server health check failed with status {response.status_code}")
                return False

            print("✅ Server is accessible")
            print(f"🔗 Health endpoint: {health_url}")
            return True
    except Exception as e:
        # Any failure at all is reported as "not reachable", with a hint on
        # how to start the server.
        print(f"❌ Server connectivity check failed: {e}")
        print("Please ensure the server is running:")
        print(" poetry run uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload")
        return False
74+
75+
76+
def step_3_define_cookies():
    """Step 3: Build the sample cookie jar and print a summary of it.

    Values whose cookie name contains "token" are truncated to their first
    20 characters when printed; the returned dict always holds full values.

    Returns:
        A dict mapping cookie names to their string values.
    """
    print("\nSTEP 3: Define Cookies")
    print("=" * 40)

    # Placeholder cookies for a site that requires an authenticated session
    cookies = dict(
        session_id="abc123def456ghi789",
        user_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
        remember_me="true",
        language="en",
        theme="dark",
    )

    print("🍪 Cookies configured:")
    for name, raw in cookies.items():
        shown = raw
        # Shorten token-like values so they are not echoed in full
        if "token" in name.lower() and len(raw) > 20:
            shown = raw[:20] + "..."
        print(f" {name}: {shown}")

    print(f"\n📊 Total cookies: {len(cookies)}")
    return cookies
101+
102+
103+
def step_4_define_request_parameters():
    """Step 4: Choose the target page and extraction prompt, and print them.

    Returns:
        A dict with the keys ``website_url`` and ``user_prompt``.
    """
    print("\nSTEP 4: Define Request Parameters")
    print("=" * 40)

    # Both settings live in the dict that is handed back to the caller
    params = {
        "website_url": "https://example.com/dashboard",
        "user_prompt": "Extract user profile information and account details",
    }

    print("🌐 Website URL:")
    print(f" {params['website_url']}")
    print("\n📝 User Prompt:")
    print(f" {params['user_prompt']}")
    print("\n🎯 Goal: Access authenticated content using cookies")

    return params
122+
123+
124+
def step_5_prepare_headers(api_key):
    """Step 5: Build the HTTP request headers and print them.

    The API key is stored in full in the returned dict. In the printed
    summary it is masked: the first 8 and last 4 characters are shown only
    when the key is at least 24 characters long, otherwise a fixed
    placeholder is printed. (The previous ``[:10]``/``[-10:]`` slicing echoed
    any key of 20 characters or fewer in its entirety.)

    Args:
        api_key: Value sent in the ``SGAI-APIKEY`` header.

    Returns:
        A dict of header names to header values.
    """
    print("\nSTEP 5: Prepare Request Headers")
    print("=" * 40)

    headers = {
        "SGAI-APIKEY": api_key,
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        # NOTE(review): advertising "br" presumably requires Brotli decoding
        # support in whichever client sends these headers - confirm.
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    print("📋 Headers configured:")
    for key, value in headers.items():
        if key == "SGAI-APIKEY":
            # Never print the secret itself; show head/tail only when at
            # least half of the key remains hidden.
            if isinstance(value, str) and len(value) >= 24:
                shown = f"{value[:8]}...{value[-4:]}"
            else:
                shown = "********"
            print(f" {key}: {shown}")
        else:
            print(f" {key}: {value}")

    return headers
147+
148+
149+
async def step_6_execute_cookies_request(headers, cookies, config):
    """Step 6: POST the scrape request, with cookies in the JSON payload.

    Args:
        headers: HTTP headers passed to the POST call.
        cookies: Dict placed under the ``cookies`` key of the payload.
        config: Dict providing ``website_url`` and ``user_prompt``.

    Returns:
        A ``(result, duration)`` tuple. ``result`` is the decoded JSON body on
        a 200 response and ``None`` on any failure (non-200 status, body that
        is not valid JSON, timeout, request error, or unexpected exception).
        ``duration`` is the elapsed time in seconds.
    """
    print("\nSTEP 6: Execute Request with Cookies")
    print("=" * 40)

    url = "http://localhost:8001/v1/smartscraper"

    # Request payload with cookies
    payload = {
        "website_url": config["website_url"],
        "user_prompt": config["user_prompt"],
        "output_schema": {},
        "cookies": cookies,
    }

    print("🚀 Starting request with cookies...")
    print("🍪 Using authentication cookies for access...")

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    try:
        # Generous timeout: the request may take a while to complete
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(url, headers=headers, json=payload)

            duration = time.perf_counter() - start_time

            print(f"✅ Request completed in {duration:.2f} seconds")
            print(f"📊 Response Status: {response.status_code}")

            if response.status_code != 200:
                print(f"❌ Request failed with status {response.status_code}")
                print(f"Response: {response.text}")
                return None, duration

            try:
                return response.json(), duration
            except ValueError as e:
                # A 200 with a non-JSON body is a server-side problem; say so
                # instead of reporting a generic unexpected error.
                print(f"❌ Response body is not valid JSON: {e}")
                return None, duration

    except httpx.TimeoutException:
        duration = time.perf_counter() - start_time
        print(f"❌ Request timed out after {duration:.2f} seconds (>120s timeout)")
        print("This may indicate authentication issues or slow response.")
        return None, duration

    except httpx.RequestError as e:
        duration = time.perf_counter() - start_time
        print(f"❌ Request error after {duration:.2f} seconds: {e}")
        print("Common causes:")
        print(" - Server is not running")
        print(" - Invalid cookies")
        print(" - Network connectivity issues")
        return None, duration

    except Exception as e:
        # Last-resort boundary so the walkthrough can still print its
        # remaining steps instead of aborting with a traceback.
        duration = time.perf_counter() - start_time
        print(f"❌ Unexpected error after {duration:.2f} seconds: {e}")
        return None, duration
208+
209+
210+
def step_7_process_results(result, duration):
    """Step 7: Print the response in a form that depends on its type.

    Args:
        result: Decoded response; ``None`` means there is nothing to show.
            Dicts are pretty-printed as JSON, lists show their first three
            items, and anything else is printed as-is.
        duration: Elapsed seconds, printed with two decimals.
    """
    print("\nSTEP 7: Process Results")
    print("=" * 40)

    if result is None:
        print("❌ No results to process")
        return

    print("📋 Processing authenticated results...")

    if isinstance(result, dict):
        print("\n🔍 Response Structure:")
        pretty = json.dumps(result, indent=2, ensure_ascii=False)
        print(pretty)

        # A "result" key is treated as the sign that extraction worked
        if "result" in result:
            print("\n✨ Authentication successful! Data extracted with cookies")

    elif isinstance(result, list):
        total = len(result)
        print(f"\n✅ Authentication successful! Extracted {total} items")

        # Only the first three entries are listed
        print("\n📦 Sample Results:")
        for position, item in enumerate(result[:3], start=1):
            print(f" {position}. {item}")

        hidden = total - 3
        if hidden > 0:
            print(f" ... and {hidden} more items")

    else:
        print(f"\n📋 Result: {result}")

    print(f"\n⏱️ Total processing time: {duration:.2f} seconds")
245+
246+
247+
def step_8_show_curl_equivalent(api_key, cookies, config):
    """Step 8: Print a curl command for the same endpoint, headers and payload.

    The JSON body is produced with ``json.dumps`` and every shell argument is
    passed through ``shlex.quote``, so quotes or other special characters in
    the prompt, URL or cookie values cannot yield invalid JSON or break the
    shell quoting (the previous version interpolated them into the text raw).

    Args:
        api_key: Value for the ``SGAI-APIKEY`` header. It is printed in full
            so the command can be pasted and run as-is.
        cookies: Dict of cookie names to values; emitted both as a
            ``--cookie`` string and inside the JSON payload.
        config: Dict providing ``website_url`` and ``user_prompt``.
    """
    import shlex  # local import: only this step needs shell quoting

    print("\nSTEP 8: Equivalent curl Command")
    print("=" * 40)

    # Convert cookies dict to curl's "name=value; name=value" format
    cookies_str = "; ".join(f"{k}={v}" for k, v in cookies.items())

    payload = {
        "website_url": config["website_url"],
        "user_prompt": config["user_prompt"],
        "output_schema": {},
        "cookies": cookies,
    }

    header_lines = [
        f"SGAI-APIKEY: {api_key}",
        "Content-Type: application/json",
        "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "Accept: application/json",
        "Accept-Language: en-US,en;q=0.9",
        "Accept-Encoding: gzip, deflate, br",
        "Connection: keep-alive",
    ]

    parts = ["curl --location " + shlex.quote("http://localhost:8001/v1/smartscraper")]
    parts.extend("--header " + shlex.quote(line) for line in header_lines)
    parts.append("--cookie " + shlex.quote(cookies_str))
    parts.append("--data " + shlex.quote(json.dumps(payload, indent=2)))

    # One argument per line, joined with shell line continuations
    curl_command = "\n" + " \\\n".join(parts) + "\n"

    print("Equivalent curl command:")
    print(curl_command)
275+
276+
277+
def step_9_cookie_management_tips():
    """Step 9: Print a numbered list of cookie-handling recommendations."""
    print("\nSTEP 9: Cookie Management Tips")
    print("=" * 40)

    # Heading followed by the tips, one per output line
    lines = (
        "🍪 Best Practices for Cookie Management:",
        "1. 🔐 Store sensitive cookies securely (environment variables)",
        "2. ⏰ Set appropriate expiration times",
        "3. 🧹 Clean up expired cookies regularly",
        "4. 🔄 Refresh tokens before they expire",
        "5. 🛡️ Use HTTPS for cookie transmission",
        "6. 📝 Log cookie usage for debugging",
        "7. 🚫 Don't hardcode cookies in source code",
        "8. 🔍 Validate cookie format before sending",
    )
    for line in lines:
        print(line)
291+
292+
293+
async def main():
    """Run the nine steps of the cookies walkthrough in order.

    Returns early, before any request is built, when step 1 yields no API
    key or step 2 reports the server as unreachable. Total run time is logged
    and printed at the end.
    """
    total_start_time = time.time()
    logger.info("Starting Async Step-by-Step Cookies Example")

    wide_rule = "=" * 60
    for line in (
        "ScrapeGraph SDK - Async Step-by-Step Cookies Example",
        wide_rule,
        "This example shows the complete async process of setting up and",
        "executing requests with cookies for authentication",
        wide_rule,
    ):
        print(line)

    # Steps 1-2: preconditions; bail out if either one fails
    api_key = await step_1_environment_setup()
    if not api_key:
        return

    if not await step_2_server_connectivity_check(api_key):
        return

    # Steps 3-5: assemble the request inputs
    cookies = step_3_define_cookies()
    config = step_4_define_request_parameters()
    headers = step_5_prepare_headers(api_key)

    # Steps 6-7: send the request and report on the response
    result, duration = await step_6_execute_cookies_request(headers, cookies, config)
    step_7_process_results(result, duration)

    # Steps 8-9: supplementary output
    step_8_show_curl_equivalent(api_key, cookies, config)
    step_9_cookie_management_tips()

    total_duration = time.time() - total_start_time
    logger.info(f"Example completed! Total execution time: {total_duration:.2f} seconds")

    for line in (
        "\n" + wide_rule,
        "Async step-by-step cookies example completed!",
        f"⏱️ Total execution time: {total_duration:.2f} seconds",
        "\nKey takeaways:",
        "1. Async/await provides better performance for I/O operations",
        "2. Cookies enable access to authenticated content",
        "3. Always validate API key and server connectivity first",
        "4. Secure cookie storage is crucial for production use",
        "5. Handle authentication errors gracefully",
        "6. Use equivalent curl commands for testing",
        "\nNext steps:",
        "- Implement secure cookie storage",
        "- Add cookie refresh logic",
        "- Handle authentication failures",
        "- Monitor cookie expiration",
        "- Implement retry logic for failed requests",
    ):
        print(line)


if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)