/**
 * Comprehensive example demonstrating cookies integration for web scraping.
 *
 * This example shows various real-world scenarios where cookies are essential:
 * 1. E-commerce site scraping with authentication
 * 2. Social media scraping with session cookies
 * 3. News site scraping with user preferences
 * 4. Banking/financial site scraping with secure cookies
 * 5. API endpoint scraping with authentication tokens
 *
 * Requirements:
 * - Node.js 16+
 * - scrapegraph-js
 * - A .env file with your SGAI_APIKEY
 *
 * Example .env file:
 * SGAI_APIKEY=your_api_key_here
 */
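// To run: install the dependencies (npm install scrapegraph-js zod dotenv),
// add your SGAI_APIKEY to a .env file, then execute this file with node.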

import { smartScraper } from 'scrapegraph-js';
import { z } from 'zod';
import 'dotenv/config';
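
// smartScraper is called positionally throughout this file:
//   smartScraper(apiKey, url, prompt, schema, numberOfScrolls, totalPages, cookies)
// Pass null for any option a scenario does not use, as the examples below do.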

// Define data schemas for different scenarios
const ProductInfoSchema = z.object({
  name: z.string().describe('Product name'),
  price: z.string().describe('Product price'),
  availability: z.string().describe('Product availability status'),
  rating: z.string().optional().describe('Product rating')
});

const SocialMediaPostSchema = z.object({
  author: z.string().describe('Post author'),
  content: z.string().describe('Post content'),
  likes: z.string().optional().describe('Number of likes'),
  comments: z.string().optional().describe('Number of comments'),
  timestamp: z.string().optional().describe('Post timestamp')
});

const NewsArticleSchema = z.object({
  title: z.string().describe('Article title'),
  summary: z.string().describe('Article summary'),
  author: z.string().optional().describe('Article author'),
  publish_date: z.string().optional().describe('Publish date')
});

const BankTransactionSchema = z.object({
  date: z.string().describe('Transaction date'),
  description: z.string().describe('Transaction description'),
  amount: z.string().describe('Transaction amount'),
  type: z.string().describe('Transaction type (credit/debit)')
});
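
// Helper (not part of scrapegraph-js): converts a raw "Cookie" request header,
// as copied from the browser's network tab, into the plain-object format that
// the examples below pass to smartScraper.
function parseCookieHeader(header) {
  return Object.fromEntries(
    header
      .split(';')
      .map((pair) => pair.trim())
      .filter(Boolean)
      .map((pair) => {
        const eq = pair.indexOf('=');
        return [pair.slice(0, eq), pair.slice(eq + 1)];
      })
  );
}
// Example: parseCookieHeader('session_id=abc123; user_id=user789')
// -> { session_id: 'abc123', user_id: 'user789' }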

async function scrapeEcommerceWithAuth() {
  console.log('='.repeat(60));
  console.log('E-COMMERCE SITE SCRAPING WITH AUTHENTICATION');
  console.log('='.repeat(60));

  // Example cookies for an e-commerce site
  const cookies = {
    session_id: 'abc123def456',
    user_id: 'user789',
    cart_id: 'cart101112',
    preferences: 'dark_mode,usd',
    auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...'
  };
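  // These values are placeholders. In practice, copy real cookie values from
  // your browser's DevTools (Application > Cookies) after logging in.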

  const websiteUrl = 'https://example-ecommerce.com/products';
  const userPrompt = 'Extract product information including name, price, availability, and rating';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      ProductInfoSchema,
      5, // numberOfScrolls - Scroll to load more products
      null, // totalPages
      cookies
    );

    console.log('✅ E-commerce scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in e-commerce scraping: ${error.message}`);
  }
}

async function scrapeSocialMediaWithSession() {
  console.log('\n' + '='.repeat(60));
  console.log('SOCIAL MEDIA SCRAPING WITH SESSION COOKIES');
  console.log('='.repeat(60));

  // Example cookies for a social media site
  const cookies = {
    session_token: 'xyz789abc123',
    user_session: 'def456ghi789',
    csrf_token: 'jkl012mno345',
    remember_me: 'true',
    language: 'en_US'
  };

  const websiteUrl = 'https://example-social.com/feed';
  const userPrompt = 'Extract posts from the feed including author, content, likes, and comments';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      SocialMediaPostSchema,
      10, // numberOfScrolls - Scroll to load more posts
      null, // totalPages
      cookies
    );

    console.log('✅ Social media scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in social media scraping: ${error.message}`);
  }
}

async function scrapeNewsWithPreferences() {
  console.log('\n' + '='.repeat(60));
  console.log('NEWS SITE SCRAPING WITH USER PREFERENCES');
  console.log('='.repeat(60));

  // Example cookies for a news site
  const cookies = {
    user_preferences: 'technology,science,ai',
    reading_level: 'advanced',
    region: 'US',
    subscription_tier: 'premium',
    theme: 'dark'
  };

  const websiteUrl = 'https://example-news.com/technology';
  const userPrompt = 'Extract news articles including title, summary, author, and publish date';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      NewsArticleSchema,
      null, // numberOfScrolls
      3, // totalPages - Scrape multiple pages
      cookies
    );

    console.log('✅ News scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in news scraping: ${error.message}`);
  }
}

async function scrapeBankingWithSecureCookies() {
  console.log('\n' + '='.repeat(60));
  console.log('BANKING SITE SCRAPING WITH SECURE COOKIES');
  console.log('='.repeat(60));

  // Example secure cookies for a banking site
  const cookies = {
    secure_session: 'pqr678stu901',
    auth_token: 'vwx234yz567',
    mfa_verified: 'true',
    device_id: 'device_abc123',
    last_activity: '2024-01-15T10:30:00Z'
  };

  const websiteUrl = 'https://example-bank.com/transactions';
  const userPrompt = 'Extract recent transactions including date, description, amount, and type';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      BankTransactionSchema,
      null, // numberOfScrolls
      5, // totalPages - Scrape multiple pages of transactions
      cookies
    );

    console.log('✅ Banking scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in banking scraping: ${error.message}`);
  }
}

async function scrapeApiWithAuthTokens() {
  console.log('\n' + '='.repeat(60));
  console.log('API ENDPOINT SCRAPING WITH AUTH TOKENS');
  console.log('='.repeat(60));

  // Example API authentication cookies
  const cookies = {
    api_token: 'api_abc123def456',
    client_id: 'client_789',
    access_token: 'access_xyz789',
    refresh_token: 'refresh_abc123',
    scope: 'read:all'
  };

  const websiteUrl = 'https://api.example.com/data';
  const userPrompt = 'Extract data from the API response';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      null, // No schema for generic API response
      null, // numberOfScrolls
      null, // totalPages
      cookies
    );

    console.log('✅ API scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in API scraping: ${error.message}`);
  }
}

async function main() {
  const apiKey = process.env.SGAI_APIKEY;

  // Check if API key is available
  if (!apiKey) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    console.log('Please create a .env file with your API key:');
    console.log('SGAI_APIKEY=your_api_key_here');
    return;
  }

  console.log('🍪 COOKIES INTEGRATION EXAMPLES');
  console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.');

  // Run all examples
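  // The examples run sequentially, and each one catches its own errors,
  // so a failure in one scenario does not stop the rest.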
  await scrapeEcommerceWithAuth();
  await scrapeSocialMediaWithSession();
  await scrapeNewsWithPreferences();
  await scrapeBankingWithSecureCookies();
  await scrapeApiWithAuthTokens();

  console.log('\n' + '='.repeat(60));
  console.log('✅ All examples completed!');
  console.log('='.repeat(60));
}

// Run the example
main().catch(console.error);