Skip to content

feat: add cookies integration #45

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions scrapegraph-js/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,65 @@ const numberOfScrolls = 10; // Will scroll 10 times to load more content

The `numberOfScrolls` parameter accepts values between 0 and 100, allowing you to control how many times the page should be scrolled before extraction.

#### Scraping with Cookies

Use cookies for authentication and session management when scraping websites that require login or have user-specific content:

```javascript
import { smartScraper } from 'scrapegraph-js';

const apiKey = 'your-api-key';
const url = 'https://example.com/dashboard';
const prompt = 'Extract user profile information';

// Define cookies for authentication.
// Cookies are given as a plain object mapping each cookie name to its string value
// (the values below are placeholders).
const cookies = {
session_id: 'abc123def456',
auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
user_preferences: 'dark_mode,usd'
};

(async () => {
try {
// Arguments are positional. The three nulls fill the schema, numberOfScrolls and
// totalPages slots so that the cookies object lands in the seventh position.
const response = await smartScraper(apiKey, url, prompt, null, null, null, cookies);
console.log(response.result);
} catch (error) {
// Any failure from the call above is reported here instead of crashing the script.
console.error('Error:', error);
}
})();
```

**Common Use Cases:**
- **E-commerce sites**: User authentication, shopping cart persistence
- **Social media**: Session management, user preferences
- **Banking/Financial**: Secure authentication, transaction history
- **News sites**: User preferences, subscription content
- **API endpoints**: Authentication tokens, API keys

#### Advanced Scraping with Cookies, Scrolling, and Pagination

Combine cookies with infinite scrolling and pagination for comprehensive data extraction:

```javascript
import { smartScraper } from 'scrapegraph-js';

const apiKey = 'your-api-key';
const url = 'https://example.com/feed';
const prompt = 'Extract all posts from the feed';
// Cookies are a plain object of cookie name -> string value (placeholder value shown).
const cookies = { session_token: 'xyz789abc123' };
const numberOfScrolls = 10; // Scroll 10 times
const totalPages = 5; // Scrape 5 pages

(async () => {
try {
// Arguments are positional. The fourth one (null here) is the schema slot, left empty
// in this example; numberOfScrolls, totalPages and cookies follow it in that order.
const response = await smartScraper(apiKey, url, prompt, null, numberOfScrolls, totalPages, cookies);
// Logs the whole response object.
console.log('Extracted data:', response);
} catch (error) {
// Any failure from the call above is reported here instead of crashing the script.
console.error('Error:', error);
}
})();
```

### Search Scraping

Search and extract information from multiple web sources using AI.
Expand Down
261 changes: 261 additions & 0 deletions scrapegraph-js/examples/cookies_integration_example.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
/**
* Comprehensive example demonstrating cookies integration for web scraping.
*
* This example shows various real-world scenarios where cookies are essential:
* 1. E-commerce site scraping with authentication
* 2. Social media scraping with session cookies
* 3. Banking/financial site scraping with secure cookies
* 4. News site scraping with user preferences
* 5. API endpoint scraping with authentication tokens
*
* Requirements:
* - Node.js 16+
* - scrapegraph-js, zod and dotenv (all three are imported by this example)
* - A .env file with your SGAI_APIKEY
*
* Example .env file:
* SGAI_APIKEY=your_api_key_here
*/

import { smartScraper } from 'scrapegraph-js';
import { z } from 'zod';
import 'dotenv/config';

// Zod schemas describing the structured output requested in each scenario.
// Every field is a string, so two small builders keep the required/optional
// distinction in one place.

/** Builds a required string field carrying the given description. */
const describedString = (description) => z.string().describe(description);

/** Builds an optional string field carrying the given description. */
const describedOptionalString = (description) => z.string().optional().describe(description);

// Used by the e-commerce example.
const ProductInfoSchema = z.object({
  name: describedString('Product name'),
  price: describedString('Product price'),
  availability: describedString('Product availability status'),
  rating: describedOptionalString('Product rating'),
});

// Used by the social media example.
const SocialMediaPostSchema = z.object({
  author: describedString('Post author'),
  content: describedString('Post content'),
  likes: describedOptionalString('Number of likes'),
  comments: describedOptionalString('Number of comments'),
  timestamp: describedOptionalString('Post timestamp'),
});

// Used by the news example.
const NewsArticleSchema = z.object({
  title: describedString('Article title'),
  summary: describedString('Article summary'),
  author: describedOptionalString('Article author'),
  publish_date: describedOptionalString('Publish date'),
});

// Used by the banking example.
const BankTransactionSchema = z.object({
  date: describedString('Transaction date'),
  description: describedString('Transaction description'),
  amount: describedString('Transaction amount'),
  type: describedString('Transaction type (credit/debit)'),
});

/**
 * Scenario: e-commerce product listing scraped with authentication cookies.
 *
 * Prints a banner, calls `smartScraper` with sample session/auth cookies,
 * `ProductInfoSchema` and 5 scrolls, then pretty-prints the response as JSON.
 * An error thrown by the call is caught and reported through `console.error`
 * rather than rethrown.
 *
 * @returns {Promise<void>}
 */
async function scrapeEcommerceWithAuth() {
  const divider = '='.repeat(60);
  console.log(divider);
  console.log('E-COMMERCE SITE SCRAPING WITH AUTHENTICATION');
  console.log(divider);

  // Sample cookie values for an e-commerce site (placeholders only).
  const shopCookies = {
    session_id: 'abc123def456',
    user_id: 'user789',
    cart_id: 'cart101112',
    preferences: 'dark_mode,usd',
    auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
  };

  const targetUrl = 'https://example-ecommerce.com/products';
  const instruction = 'Extract product information including name, price, availability, and rating';

  try {
    // smartScraper takes positional arguments, in this order:
    // API key, URL, prompt, schema, numberOfScrolls, totalPages, cookies.
    const scrapeArgs = [
      process.env.SGAI_APIKEY,
      targetUrl,
      instruction,
      ProductInfoSchema,
      5, // numberOfScrolls - scroll to load more products
      null, // totalPages
      shopCookies,
    ];
    const scraped = await smartScraper(...scrapeArgs);

    console.log('✅ E-commerce scraping completed successfully');
    console.log(JSON.stringify(scraped, null, 2));
  } catch (err) {
    console.error(`❌ Error in e-commerce scraping: ${err.message}`);
  }
}

/**
 * Scenario: social media feed scraped with session cookies.
 *
 * Prints a banner (preceded by a blank line), calls `smartScraper` with sample
 * session cookies, `SocialMediaPostSchema` and 10 scrolls, then pretty-prints
 * the response as JSON. An error thrown by the call is caught and reported
 * through `console.error` rather than rethrown.
 *
 * @returns {Promise<void>}
 */
async function scrapeSocialMediaWithSession() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('SOCIAL MEDIA SCRAPING WITH SESSION COOKIES');
  console.log(divider);

  // Sample cookie values for a social media site (placeholders only).
  const sessionCookies = {
    session_token: 'xyz789abc123',
    user_session: 'def456ghi789',
    csrf_token: 'jkl012mno345',
    remember_me: 'true',
    language: 'en_US',
  };

  const targetUrl = 'https://example-social.com/feed';
  const instruction = 'Extract posts from the feed including author, content, likes, and comments';

  try {
    // smartScraper takes positional arguments, in this order:
    // API key, URL, prompt, schema, numberOfScrolls, totalPages, cookies.
    const scrapeArgs = [
      process.env.SGAI_APIKEY,
      targetUrl,
      instruction,
      SocialMediaPostSchema,
      10, // numberOfScrolls - scroll to load more posts
      null, // totalPages
      sessionCookies,
    ];
    const scraped = await smartScraper(...scrapeArgs);

    console.log('✅ Social media scraping completed successfully');
    console.log(JSON.stringify(scraped, null, 2));
  } catch (err) {
    console.error(`❌ Error in social media scraping: ${err.message}`);
  }
}

/**
 * Scenario: news site scraped with user-preference cookies.
 *
 * Prints a banner (preceded by a blank line), calls `smartScraper` with sample
 * preference cookies, `NewsArticleSchema` and 3 total pages, then pretty-prints
 * the response as JSON. An error thrown by the call is caught and reported
 * through `console.error` rather than rethrown.
 *
 * @returns {Promise<void>}
 */
async function scrapeNewsWithPreferences() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('NEWS SITE SCRAPING WITH USER PREFERENCES');
  console.log(divider);

  // Sample cookie values for a news site (placeholders only).
  const preferenceCookies = {
    user_preferences: 'technology,science,ai',
    reading_level: 'advanced',
    region: 'US',
    subscription_tier: 'premium',
    theme: 'dark',
  };

  const targetUrl = 'https://example-news.com/technology';
  const instruction = 'Extract news articles including title, summary, author, and publish date';

  try {
    // smartScraper takes positional arguments, in this order:
    // API key, URL, prompt, schema, numberOfScrolls, totalPages, cookies.
    const scrapeArgs = [
      process.env.SGAI_APIKEY,
      targetUrl,
      instruction,
      NewsArticleSchema,
      null, // numberOfScrolls
      3, // totalPages - scrape multiple pages
      preferenceCookies,
    ];
    const scraped = await smartScraper(...scrapeArgs);

    console.log('✅ News scraping completed successfully');
    console.log(JSON.stringify(scraped, null, 2));
  } catch (err) {
    console.error(`❌ Error in news scraping: ${err.message}`);
  }
}

/**
 * Scenario: banking transaction list scraped with secure-session cookies.
 *
 * Prints a banner (preceded by a blank line), calls `smartScraper` with sample
 * cookies, `BankTransactionSchema` and 5 total pages, then pretty-prints the
 * response as JSON. An error thrown by the call is caught and reported through
 * `console.error` rather than rethrown.
 *
 * @returns {Promise<void>}
 */
async function scrapeBankingWithSecureCookies() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('BANKING SITE SCRAPING WITH SECURE COOKIES');
  console.log(divider);

  // Sample cookie values for a banking site (placeholders only).
  const bankCookies = {
    secure_session: 'pqr678stu901',
    auth_token: 'vwx234yz567',
    mfa_verified: 'true',
    device_id: 'device_abc123',
    last_activity: '2024-01-15T10:30:00Z',
  };

  const targetUrl = 'https://example-bank.com/transactions';
  const instruction = 'Extract recent transactions including date, description, amount, and type';

  try {
    // smartScraper takes positional arguments, in this order:
    // API key, URL, prompt, schema, numberOfScrolls, totalPages, cookies.
    const scrapeArgs = [
      process.env.SGAI_APIKEY,
      targetUrl,
      instruction,
      BankTransactionSchema,
      null, // numberOfScrolls
      5, // totalPages - scrape multiple pages of transactions
      bankCookies,
    ];
    const scraped = await smartScraper(...scrapeArgs);

    console.log('✅ Banking scraping completed successfully');
    console.log(JSON.stringify(scraped, null, 2));
  } catch (err) {
    console.error(`❌ Error in banking scraping: ${err.message}`);
  }
}

/**
 * Scenario: API endpoint scraped with authentication-token cookies.
 *
 * Prints a banner (preceded by a blank line), calls `smartScraper` with sample
 * token cookies and no schema, scroll count or page count, then pretty-prints
 * the response as JSON. An error thrown by the call is caught and reported
 * through `console.error` rather than rethrown.
 *
 * @returns {Promise<void>}
 */
async function scrapeApiWithAuthTokens() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('API ENDPOINT SCRAPING WITH AUTH TOKENS');
  console.log(divider);

  // Sample API authentication cookies (placeholders only).
  const tokenCookies = {
    api_token: 'api_abc123def456',
    client_id: 'client_789',
    access_token: 'access_xyz789',
    refresh_token: 'refresh_abc123',
    scope: 'read:all',
  };

  const targetUrl = 'https://api.example.com/data';
  const instruction = 'Extract data from the API response';

  try {
    // smartScraper takes positional arguments, in this order:
    // API key, URL, prompt, schema, numberOfScrolls, totalPages, cookies.
    const scrapeArgs = [
      process.env.SGAI_APIKEY,
      targetUrl,
      instruction,
      null, // no schema for a generic API response
      null, // numberOfScrolls
      null, // totalPages
      tokenCookies,
    ];
    const scraped = await smartScraper(...scrapeArgs);

    console.log('✅ API scraping completed successfully');
    console.log(JSON.stringify(scraped, null, 2));
  } catch (err) {
    console.error(`❌ Error in API scraping: ${err.message}`);
  }
}

/**
 * Runs every cookies example in sequence.
 *
 * When `SGAI_APIKEY` is missing from the environment, prints setup
 * instructions and returns without running anything. Otherwise prints an
 * intro, awaits each example one after another, and prints a closing banner.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Without an API key there is nothing to run: explain the setup and stop.
  if (!process.env.SGAI_APIKEY) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    const setupHints = [
      'Please create a .env file with your API key:',
      'SGAI_APIKEY=your_api_key_here',
    ];
    for (const hint of setupHints) {
      console.log(hint);
    }
    return;
  }

  console.log('🍪 COOKIES INTEGRATION EXAMPLES');
  console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.');

  // Examples run strictly one after another, in this order.
  const examples = [
    scrapeEcommerceWithAuth,
    scrapeSocialMediaWithSession,
    scrapeNewsWithPreferences,
    scrapeBankingWithSecureCookies,
    scrapeApiWithAuthTokens,
  ];
  for (const runExample of examples) {
    await runExample();
  }

  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('✅ All examples completed!');
  console.log(divider);
}

// Run the example. main() is async; if the promise it returns rejects, the
// error is handed to console.error rather than left as an unhandled rejection.
main().catch(console.error);
Loading