#!/usr/bin/env python3
"""
SmartScraper Pagination Example (Async)

This example demonstrates how to use the SmartScraper API's pagination functionality with the asynchronous client.
"""

import asyncio
import json
import logging
import os
import time
from typing import List, Optional

from pydantic import BaseModel

from scrapegraph_py import AsyncClient
from scrapegraph_py.exceptions import APIError

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)


class ProductInfo(BaseModel):
    """Schema for product information"""
    name: str
    price: Optional[str] = None
    rating: Optional[str] = None
    image_url: Optional[str] = None
    description: Optional[str] = None


class ProductList(BaseModel):
    """Schema for list of products"""
    products: List[ProductInfo]


async def smartscraper_pagination_example():
    """Example of using pagination with SmartScraper (async)"""

    print("SmartScraper Pagination Example (Async)")
    print("=" * 50)

    # Initialize client from environment variable
    try:
        client = AsyncClient.from_env()
    except ValueError as e:
        print(f"❌ Error initializing client: {e}")
        print("Please set SGAI_API_KEY environment variable")
        return

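    # Cleanup note (assumption, not part of the original example): depending on your
    # scrapegraph_py version, AsyncClient can typically also be used as an async
    # context manager (e.g. `async with AsyncClient.from_env() as client:`), so the
    # underlying HTTP session is closed automatically, in line with the "proper
    # context managers" tip in main(). The plain form above is kept for clarity.
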
    # Configuration
    website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2"
    user_prompt = "Extract all product info including name, price, rating, image_url, and description"
    total_pages = 3  # Number of pages to scrape

    print(f"🌐 Website URL: {website_url}")
    print(f"📝 User Prompt: {user_prompt}")
    print(f"📄 Total Pages: {total_pages}")
    print("-" * 50)

    try:
        # Start timing
        start_time = time.time()

        # Make the request with pagination
        result = await client.smartscraper(
            user_prompt=user_prompt,
            website_url=website_url,
            output_schema=ProductList,
            total_pages=total_pages,
        )

        # Calculate duration
        duration = time.time() - start_time

        print(f"✅ Request completed in {duration:.2f} seconds")
        print(f"📊 Response type: {type(result)}")

        # Display results
        if isinstance(result, dict):
            print("\n🔍 Response:")
            print(json.dumps(result, indent=2, ensure_ascii=False))

            # Check for pagination success indicators
            if "data" in result:
                print(f"\n✨ Pagination successful! Data extracted from {total_pages} pages")

        elif isinstance(result, list):
            print(f"\n✅ Pagination successful! Extracted {len(result)} items")
            for i, item in enumerate(result[:5]):  # Show first 5 items
                print(f"  {i + 1}. {item}")
            if len(result) > 5:
                print(f"  ... and {len(result) - 5} more items")
        else:
            print(f"\n📋 Result: {result}")

    except APIError as e:
        print(f"❌ API Error: {e}")
        print("This could be due to:")
        print("  - Invalid API key")
        print("  - Rate limiting")
        print("  - Server issues")

    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        print("This could be due to:")
        print("  - Network connectivity issues")
        print("  - Invalid website URL")
        print("  - Pagination limitations")


async def test_concurrent_pagination():
    """Test multiple pagination requests concurrently"""

    print("\n" + "=" * 50)
    print("Testing concurrent pagination requests")
    print("=" * 50)

    try:
        client = AsyncClient.from_env()
    except ValueError as e:
        print(f"❌ Error initializing client: {e}")
        return

    # Test concurrent requests
    urls = [
        "https://example.com/products?page=1",
        "https://example.com/products?page=2",
        "https://example.com/products?page=3",
    ]

    tasks = []
    for i, url in enumerate(urls):
        print(f"🚀 Creating task {i + 1} for URL: {url}")
        # Note: In a real scenario, you would use actual URLs
        # This is just to demonstrate the async functionality
        tasks.append(asyncio.create_task(
            simulate_pagination_request(client, url, i + 1)
        ))

    print(f"⏱️ Starting {len(tasks)} concurrent tasks...")
    start_time = time.time()

    try:
        results = await asyncio.gather(*tasks, return_exceptions=True)
        duration = time.time() - start_time

        print(f"✅ All tasks completed in {duration:.2f} seconds")

        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"❌ Task {i + 1} failed: {result}")
            else:
                print(f"✅ Task {i + 1} succeeded: {result}")

    except Exception as e:
        print(f"❌ Concurrent execution failed: {e}")


async def simulate_pagination_request(client: AsyncClient, url: str, task_id: int):
    """Simulate a pagination request (for demonstration)"""

    print(f"📋 Task {task_id}: Processing {url}")

    # Simulate some work
    await asyncio.sleep(0.5)

    # Return a simulated result
    return f"Task {task_id} completed successfully"


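# Illustrative sketch (not part of the original example): main() suggests rate
# limiting concurrent requests with asyncio.Semaphore. One way to do that is to
# wrap each real client.smartscraper() call in a semaphore so only a few requests
# are in flight at once. The helper name `rate_limited_smartscraper` and the
# default `max_concurrency` value are assumptions for demonstration purposes.
async def rate_limited_smartscraper(
    client: AsyncClient,
    urls: List[str],
    user_prompt: str,
    max_concurrency: int = 2,
):
    """Run several SmartScraper requests concurrently, bounded by a semaphore."""

    semaphore = asyncio.Semaphore(max_concurrency)

    async def scrape_one(url: str):
        # Only `max_concurrency` coroutines can hold the semaphore at a time,
        # which keeps the request rate bounded.
        async with semaphore:
            return await client.smartscraper(
                user_prompt=user_prompt,
                website_url=url,
            )

    return await asyncio.gather(
        *(scrape_one(url) for url in urls),
        return_exceptions=True,
    )

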
async def test_pagination_with_different_parameters():
    """Test pagination with different parameters"""

    print("\n" + "=" * 50)
    print("Testing pagination with different parameters")
    print("=" * 50)

    try:
        client = AsyncClient.from_env()
    except ValueError as e:
        print(f"❌ Error initializing client: {e}")
        return

    # Test cases
    test_cases = [
        {
            "name": "Single page (default)",
            "url": "https://example.com",
            "total_pages": None,
            "user_prompt": "Extract basic info",
        },
        {
            "name": "Two pages with schema",
            "url": "https://example.com/products",
            "total_pages": 2,
            "user_prompt": "Extract product information",
            "output_schema": ProductList,
        },
        {
            "name": "Maximum pages with scrolling",
            "url": "https://example.com/search",
            "total_pages": 5,
            "user_prompt": "Extract all available data",
            "number_of_scrolls": 3,
        },
    ]

    for test_case in test_cases:
        print(f"\n🧪 Test: {test_case['name']}")
        print(f"   Pages: {test_case['total_pages']}")
        print(f"   Prompt: {test_case['user_prompt']}")

        try:
            # This only demonstrates the API call structure; in a real scenario
            # you would make actual API calls (see the run_test_case sketch below)
            print("   ✅ Configuration valid")

        except Exception as e:
            print(f"   ❌ Configuration error: {e}")


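# Illustrative sketch (not part of the original example): if you wanted to run one
# of the test cases above for real, its fields would map onto the same keyword
# arguments used in smartscraper_pagination_example(). The helper name
# `run_test_case` is hypothetical and exists only for this demonstration.
async def run_test_case(client: AsyncClient, test_case: dict):
    """Turn a test-case dict into an actual SmartScraper request."""

    kwargs = {
        "user_prompt": test_case["user_prompt"],
        "website_url": test_case["url"],
    }
    # Optional fields are passed only when the test case defines them.
    for key in ("total_pages", "output_schema", "number_of_scrolls"):
        if test_case.get(key) is not None:
            kwargs[key] = test_case[key]

    return await client.smartscraper(**kwargs)

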
async def main():
    """Main function to run the pagination examples"""

    print("ScrapeGraph SDK - SmartScraper Pagination Examples (Async)")
    print("=" * 60)

    # Run the main example
    await smartscraper_pagination_example()

    # Test concurrent pagination
    await test_concurrent_pagination()

    # Test different parameters
    await test_pagination_with_different_parameters()

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set SGAI_API_KEY environment variable")
    print("2. Replace example URLs with real websites")
    print("3. Adjust total_pages parameter (1-10)")
    print("4. Customize user_prompt for your use case")
    print("5. Define output_schema for structured data")
    print("\nAsync-specific tips:")
    print("- Use asyncio.gather() for concurrent requests")
    print("- Consider rate limiting with asyncio.Semaphore")
    print("- Handle exceptions properly in async context")
    print("- Use proper context managers for cleanup")


if __name__ == "__main__":
    asyncio.run(main())