Skip to content

Commit a8de72f

Browse files
authored
Merge pull request #45 from ScrapeGraphAI/42-i-cant-manage-to-extract-the-contents-of-a-page-behind-cookies
feat: add cookies integration
2 parents d563b0b + 06b6137 commit a8de72f

15 files changed

+1280
-381
lines changed

scrapegraph-js/README.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,65 @@ const numberOfScrolls = 10; // Will scroll 10 times to load more content
131131

132132
The `numberOfScrolls` parameter accepts values between 0 and 100, allowing you to control how many times the page should be scrolled before extraction.
133133

#### Scraping with Cookies

Use cookies for authentication and session management when scraping websites that require login or have user-specific content:

```javascript
import { smartScraper } from 'scrapegraph-js';

const apiKey = 'your-api-key';
const url = 'https://example.com/dashboard';
const prompt = 'Extract user profile information';

// Define cookies for authentication
const cookies = {
  session_id: 'abc123def456',
  auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
  user_preferences: 'dark_mode,usd'
};

(async () => {
  try {
    const response = await smartScraper(apiKey, url, prompt, null, null, null, cookies);
    console.log(response.result);
  } catch (error) {
    console.error('Error:', error);
  }
})();
```

**Common Use Cases:**
- **E-commerce sites**: User authentication, shopping cart persistence
- **Social media**: Session management, user preferences
- **Banking/Financial**: Secure authentication, transaction history
- **News sites**: User preferences, subscription content
- **API endpoints**: Authentication tokens, API keys

#### Advanced Scraping with Cookies, Scrolling, and Pagination

Combine cookies with infinite scrolling and pagination for comprehensive data extraction:

```javascript
import { smartScraper } from 'scrapegraph-js';

const apiKey = 'your-api-key';
const url = 'https://example.com/feed';
const prompt = 'Extract all posts from the feed';
const cookies = { session_token: 'xyz789abc123' };
const numberOfScrolls = 10; // Scroll 10 times
const totalPages = 5; // Scrape 5 pages

(async () => {
  try {
    const response = await smartScraper(apiKey, url, prompt, null, numberOfScrolls, totalPages, cookies);
    console.log('Extracted data:', response);
  } catch (error) {
    console.error('Error:', error);
  }
})();
```
134193
### Search Scraping
135194

136195
Search and extract information from multiple web sources using AI.
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
/**
 * Comprehensive example demonstrating cookies integration for web scraping.
 *
 * This example shows various real-world scenarios where cookies are essential:
 * 1. E-commerce site scraping with authentication
 * 2. Social media scraping with session cookies
 * 3. Banking/financial site scraping with secure cookies
 * 4. News site scraping with user preferences
 * 5. API endpoint scraping with authentication tokens
 *
 * Requirements:
 * - Node.js 16+
 * - scrapegraph-js
 * - A .env file with your SGAI_APIKEY
 *
 * Example .env file:
 * SGAI_APIKEY=your_api_key_here
 */
20+
import { smartScraper } from 'scrapegraph-js';
21+
import { z } from 'zod';
22+
import 'dotenv/config';
23+
24+
// Zod schemas describing the structured output expected from each scenario.
// All fields are strings because the API returns extracted text verbatim.

// Product listing fields for the e-commerce example.
const ProductInfoSchema = z.object({
  name: z.string().describe('Product name'),
  price: z.string().describe('Product price'),
  availability: z.string().describe('Product availability status'),
  rating: z.string().optional().describe('Product rating'),
});

// Feed post fields for the social-media example.
const SocialMediaPostSchema = z.object({
  author: z.string().describe('Post author'),
  content: z.string().describe('Post content'),
  likes: z.string().optional().describe('Number of likes'),
  comments: z.string().optional().describe('Number of comments'),
  timestamp: z.string().optional().describe('Post timestamp'),
});

// Article fields for the news example.
const NewsArticleSchema = z.object({
  title: z.string().describe('Article title'),
  summary: z.string().describe('Article summary'),
  author: z.string().optional().describe('Article author'),
  publish_date: z.string().optional().describe('Publish date'),
});

// Transaction fields for the banking example.
const BankTransactionSchema = z.object({
  date: z.string().describe('Transaction date'),
  description: z.string().describe('Transaction description'),
  amount: z.string().describe('Transaction amount'),
  type: z.string().describe('Transaction type (credit/debit)'),
});
53+
54+
/**
 * Scrapes an e-commerce product listing while authenticated.
 * Sends session/auth cookies so the API can reach user-specific content,
 * and scrolls to trigger lazy-loading of additional products.
 */
async function scrapeEcommerceWithAuth() {
  const banner = '='.repeat(60);
  console.log(banner);
  console.log('E-COMMERCE SITE SCRAPING WITH AUTHENTICATION');
  console.log(banner);

  // Example session/authentication cookies for a fictional store.
  const authCookies = {
    session_id: 'abc123def456',
    user_id: 'user789',
    cart_id: 'cart101112',
    preferences: 'dark_mode,usd',
    auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
  };

  try {
    // 5 scrolls to load more products; no pagination (totalPages = null).
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      'https://example-ecommerce.com/products',
      'Extract product information including name, price, availability, and rating',
      ProductInfoSchema,
      5,
      null,
      authCookies
    );
    console.log('✅ E-commerce scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in e-commerce scraping: ${error.message}`);
  }
}
89+
90+
/**
 * Scrapes a social-media feed using session cookies.
 * Session/CSRF cookies let the request act as a logged-in user; scrolling
 * loads additional posts into the feed before extraction.
 */
async function scrapeSocialMediaWithSession() {
  const banner = '='.repeat(60);
  console.log('\n' + banner);
  console.log('SOCIAL MEDIA SCRAPING WITH SESSION COOKIES');
  console.log(banner);

  // Example session cookies for a fictional social network.
  const sessionCookies = {
    session_token: 'xyz789abc123',
    user_session: 'def456ghi789',
    csrf_token: 'jkl012mno345',
    remember_me: 'true',
    language: 'en_US',
  };

  try {
    // 10 scrolls to pull more posts into the feed; no pagination.
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      'https://example-social.com/feed',
      'Extract posts from the feed including author, content, likes, and comments',
      SocialMediaPostSchema,
      10,
      null,
      sessionCookies
    );
    console.log('✅ Social media scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in social media scraping: ${error.message}`);
  }
}
125+
126+
/**
 * Scrapes a news site with user-preference cookies.
 * Preference cookies shape which articles the site serves; pagination is
 * used instead of scrolling to cover multiple result pages.
 */
async function scrapeNewsWithPreferences() {
  const banner = '='.repeat(60);
  console.log('\n' + banner);
  console.log('NEWS SITE SCRAPING WITH USER PREFERENCES');
  console.log(banner);

  // Example preference cookies for a fictional news site.
  const preferenceCookies = {
    user_preferences: 'technology,science,ai',
    reading_level: 'advanced',
    region: 'US',
    subscription_tier: 'premium',
    theme: 'dark',
  };

  try {
    // No scrolling; walk 3 pages of articles instead.
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      'https://example-news.com/technology',
      'Extract news articles including title, summary, author, and publish date',
      NewsArticleSchema,
      null,
      3,
      preferenceCookies
    );
    console.log('✅ News scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in news scraping: ${error.message}`);
  }
}
161+
162+
/**
 * Scrapes a banking transaction history using secure session cookies.
 * The cookies model an already-authenticated, MFA-verified session;
 * pagination covers multiple pages of transactions.
 */
async function scrapeBankingWithSecureCookies() {
  const banner = '='.repeat(60);
  console.log('\n' + banner);
  console.log('BANKING SITE SCRAPING WITH SECURE COOKIES');
  console.log(banner);

  // Example secure-session cookies for a fictional bank.
  const secureCookies = {
    secure_session: 'pqr678stu901',
    auth_token: 'vwx234yz567',
    mfa_verified: 'true',
    device_id: 'device_abc123',
    last_activity: '2024-01-15T10:30:00Z',
  };

  try {
    // No scrolling; walk 5 pages of transaction history.
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      'https://example-bank.com/transactions',
      'Extract recent transactions including date, description, amount, and type',
      BankTransactionSchema,
      null,
      5,
      secureCookies
    );
    console.log('✅ Banking scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in banking scraping: ${error.message}`);
  }
}
197+
198+
/**
 * Scrapes an API endpoint using token cookies for authentication.
 * No schema is supplied, so the response shape is left to the API;
 * no scrolling or pagination applies to a plain endpoint.
 */
async function scrapeApiWithAuthTokens() {
  const banner = '='.repeat(60);
  console.log('\n' + banner);
  console.log('API ENDPOINT SCRAPING WITH AUTH TOKENS');
  console.log(banner);

  // Example OAuth-style token cookies for a fictional API.
  const tokenCookies = {
    api_token: 'api_abc123def456',
    client_id: 'client_789',
    access_token: 'access_xyz789',
    refresh_token: 'refresh_abc123',
    scope: 'read:all',
  };

  try {
    // schema, numberOfScrolls, and totalPages are all intentionally null.
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      'https://api.example.com/data',
      'Extract data from the API response',
      null,
      null,
      null,
      tokenCookies
    );
    console.log('✅ API scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in API scraping: ${error.message}`);
  }
}
233+
234+
/**
 * Entry point: validates that an API key is configured, then runs every
 * cookie-integration scenario in sequence.
 */
async function main() {
  // Guard clause: without an API key none of the examples can run.
  if (!process.env.SGAI_APIKEY) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    console.log('Please create a .env file with your API key:');
    console.log('SGAI_APIKEY=your_api_key_here');
    return;
  }

  console.log('🍪 COOKIES INTEGRATION EXAMPLES');
  console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.');

  // Run the scenarios one at a time so their console output stays grouped.
  const scenarios = [
    scrapeEcommerceWithAuth,
    scrapeSocialMediaWithSession,
    scrapeNewsWithPreferences,
    scrapeBankingWithSecureCookies,
    scrapeApiWithAuthTokens,
  ];
  for (const scenario of scenarios) {
    await scenario();
  }

  const banner = '='.repeat(60);
  console.log('\n' + banner);
  console.log('✅ All examples completed!');
  console.log(banner);
}

// Run the example; surface any unexpected rejection on stderr.
main().catch(console.error);

0 commit comments

Comments
 (0)