Commit 5f5ec1b

feat: add python integration

1 parent: 31c40fd

File tree: 9 files changed (+1071 −2 lines)
Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
#!/usr/bin/env python3
"""
SmartScraper Pagination Example (Async)

This example demonstrates how to use the SmartScraper API's pagination
functionality with the asynchronous client.
"""

import asyncio
import json
import logging
import time
from typing import List, Optional

from pydantic import BaseModel

from scrapegraph_py import AsyncClient
from scrapegraph_py.exceptions import APIError

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)


class ProductInfo(BaseModel):
    """Schema for product information"""

    name: str
    price: Optional[str] = None
    rating: Optional[str] = None
    image_url: Optional[str] = None
    description: Optional[str] = None


class ProductList(BaseModel):
    """Schema for a list of products"""

    products: List[ProductInfo]


async def smartscraper_pagination_example():
    """Example of using pagination with SmartScraper (async)"""

    print("SmartScraper Pagination Example (Async)")
    print("=" * 50)

    # Initialize the client from the SGAI_API_KEY environment variable
    try:
        client = AsyncClient.from_env()
    except ValueError as e:
        print(f"❌ Error initializing client: {e}")
        print("Please set the SGAI_API_KEY environment variable")
        return

    # Configuration
    website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2"
    user_prompt = "Extract all product info including name, price, rating, image_url, and description"
    total_pages = 3  # Number of pages to scrape (1-10)

    print(f"🌐 Website URL: {website_url}")
    print(f"📝 User Prompt: {user_prompt}")
    print(f"📄 Total Pages: {total_pages}")
    print("-" * 50)

    try:
        # Start timing
        start_time = time.time()

        # Make the request with pagination
        result = await client.smartscraper(
            user_prompt=user_prompt,
            website_url=website_url,
            output_schema=ProductList,
            total_pages=total_pages,
        )

        # Calculate duration
        duration = time.time() - start_time

        print(f"✅ Request completed in {duration:.2f} seconds")
        print(f"📊 Response type: {type(result)}")

        # Display results
        if isinstance(result, dict):
            print("\n🔍 Response:")
            print(json.dumps(result, indent=2, ensure_ascii=False))

            # Check for pagination success indicators
            if "data" in result:
                print(f"\n✨ Pagination successful! Data extracted from {total_pages} pages")

        elif isinstance(result, list):
            print(f"\n✅ Pagination successful! Extracted {len(result)} items")
            for i, item in enumerate(result[:5]):  # Show the first 5 items
                print(f"  {i + 1}. {item}")
            if len(result) > 5:
                print(f"  ... and {len(result) - 5} more items")
        else:
            print(f"\n📋 Result: {result}")

    except APIError as e:
        print(f"❌ API Error: {e}")
        print("This could be due to:")
        print("  - Invalid API key")
        print("  - Rate limiting")
        print("  - Server issues")

    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        print("This could be due to:")
        print("  - Network connectivity issues")
        print("  - Invalid website URL")
        print("  - Pagination limitations")


async def test_concurrent_pagination():
    """Test multiple pagination requests concurrently"""

    print("\n" + "=" * 50)
    print("Testing concurrent pagination requests")
    print("=" * 50)

    try:
        client = AsyncClient.from_env()
    except ValueError as e:
        print(f"❌ Error initializing client: {e}")
        return

    # Test concurrent requests
    urls = [
        "https://example.com/products?page=1",
        "https://example.com/products?page=2",
        "https://example.com/products?page=3",
    ]

    tasks = []
    for i, url in enumerate(urls):
        print(f"🚀 Creating task {i + 1} for URL: {url}")
        # Note: in a real scenario you would use actual URLs;
        # this simply demonstrates the async task structure.
        tasks.append(asyncio.create_task(
            simulate_pagination_request(client, url, i + 1)
        ))

    print(f"⏱️ Starting {len(tasks)} concurrent tasks...")
    start_time = time.time()

    try:
        results = await asyncio.gather(*tasks, return_exceptions=True)
        duration = time.time() - start_time

        print(f"✅ All tasks completed in {duration:.2f} seconds")

        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"❌ Task {i + 1} failed: {result}")
            else:
                print(f"✅ Task {i + 1} succeeded: {result}")

    except Exception as e:
        print(f"❌ Concurrent execution failed: {e}")


async def simulate_pagination_request(client: AsyncClient, url: str, task_id: int):
    """Simulate a pagination request (for demonstration)"""

    print(f"📋 Task {task_id}: Processing {url}")

    # Simulate some work
    await asyncio.sleep(0.5)

    # Return a simulated result
    return f"Task {task_id} completed successfully"


async def test_pagination_with_different_parameters():
    """Test pagination with different parameters"""

    print("\n" + "=" * 50)
    print("Testing pagination with different parameters")
    print("=" * 50)

    try:
        client = AsyncClient.from_env()
    except ValueError as e:
        print(f"❌ Error initializing client: {e}")
        return

    # Test cases
    test_cases = [
        {
            "name": "Single page (default)",
            "url": "https://example.com",
            "total_pages": None,
            "user_prompt": "Extract basic info",
        },
        {
            "name": "Two pages with schema",
            "url": "https://example.com/products",
            "total_pages": 2,
            "user_prompt": "Extract product information",
            "output_schema": ProductList,
        },
        {
            "name": "Maximum pages with scrolling",
            "url": "https://example.com/search",
            "total_pages": 5,
            "user_prompt": "Extract all available data",
            "number_of_scrolls": 3,
        },
    ]

    for test_case in test_cases:
        print(f"\n🧪 Test: {test_case['name']}")
        print(f"   Pages: {test_case['total_pages']}")
        print(f"   Prompt: {test_case['user_prompt']}")

        try:
            # This only demonstrates the API call structure;
            # in a real scenario you would make actual API calls.
            print("   ✅ Configuration valid")

        except Exception as e:
            print(f"   ❌ Configuration error: {e}")


async def main():
    """Main function to run the pagination examples"""

    print("ScrapeGraph SDK - SmartScraper Pagination Examples (Async)")
    print("=" * 60)

    # Run the main example
    await smartscraper_pagination_example()

    # Test concurrent pagination
    await test_concurrent_pagination()

    # Test different parameters
    await test_pagination_with_different_parameters()

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set the SGAI_API_KEY environment variable")
    print("2. Replace example URLs with real websites")
    print("3. Adjust the total_pages parameter (1-10)")
    print("4. Customize user_prompt for your use case")
    print("5. Define an output_schema for structured data")
    print("\nAsync-specific tips:")
    print("- Use asyncio.gather() for concurrent requests")
    print("- Consider rate limiting with asyncio.Semaphore")
    print("- Handle exceptions properly in async context")
    print("- Use proper context managers for cleanup")


if __name__ == "__main__":
    asyncio.run(main())
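
The closing tips suggest bounding concurrency with asyncio.Semaphore. Below is a minimal sketch of that pattern, reusing AsyncClient.from_env() and client.smartscraper() from the example above; MAX_CONCURRENCY, bounded_scrape, run_bounded, and the URLs are hypothetical names introduced here for illustration, not part of the SDK or the committed file.

import asyncio

from scrapegraph_py import AsyncClient

MAX_CONCURRENCY = 2  # hypothetical cap; tune to your plan's rate limits


async def bounded_scrape(client: AsyncClient, semaphore: asyncio.Semaphore, url: str):
    # The semaphore admits at most MAX_CONCURRENCY requests at a time;
    # the remaining tasks wait here instead of hitting the API all at once.
    async with semaphore:
        return await client.smartscraper(
            user_prompt="Extract all product info",
            website_url=url,
            total_pages=2,
        )


async def run_bounded():
    client = AsyncClient.from_env()
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
    urls = [  # placeholder URLs, as in the concurrent example above
        "https://example.com/products?page=1",
        "https://example.com/products?page=2",
        "https://example.com/products?page=3",
    ]
    results = await asyncio.gather(
        *(bounded_scrape(client, semaphore, url) for url in urls),
        return_exceptions=True,  # one failed task should not cancel the rest
    )
    for url, result in zip(urls, results):
        status = f"failed: {result}" if isinstance(result, Exception) else "ok"
        print(f"{url} -> {status}")


# asyncio.run(run_bounded())

Raising MAX_CONCURRENCY trades throughput against the risk of rate-limit errors; pairing the semaphore with gather(..., return_exceptions=True) keeps partial failures from aborting the whole batch.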
