From 043d3b3284e8d89c434cc86f3e4f2f26b2b2a1b7 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 18 Jul 2025 19:29:43 +0200 Subject: [PATCH 1/3] feat: add cookies integration --- scrapegraph-py/README.md | 69 +++++ .../async_smartscraper_cookies_example.py | 131 +++++++++ .../cookies_integration_example.py | 278 ++++++++++++++++++ .../sync/smartscraper_cookies_example.py | 134 +++++++++ scrapegraph-py/scrapegraph_py/async_client.py | 6 +- scrapegraph-py/scrapegraph_py/client.py | 6 +- .../scrapegraph_py/models/smartscraper.py | 7 +- scrapegraph-py/test_cookies_integration.py | 97 ++++++ 8 files changed, 725 insertions(+), 3 deletions(-) create mode 100644 scrapegraph-py/examples/async/async_smartscraper_cookies_example.py create mode 100644 scrapegraph-py/examples/miscellaneous/cookies_integration_example.py create mode 100644 scrapegraph-py/examples/sync/smartscraper_cookies_example.py create mode 100644 scrapegraph-py/test_cookies_integration.py diff --git a/scrapegraph-py/README.md b/scrapegraph-py/README.md index bb4c3ce..43ce70e 100644 --- a/scrapegraph-py/README.md +++ b/scrapegraph-py/README.md @@ -95,6 +95,75 @@ response = client.smartscraper( +
+๐Ÿช Cookies Support + +Use cookies for authentication and session management: + +```python +from scrapegraph_py import Client + +client = Client(api_key="your-api-key-here") + +# Define cookies for authentication +cookies = { + "session_id": "abc123def456", + "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", + "user_preferences": "dark_mode,usd" +} + +response = client.smartscraper( + website_url="https://example.com/dashboard", + user_prompt="Extract user profile information", + cookies=cookies +) +``` + +**Common Use Cases:** +- **E-commerce sites**: User authentication, shopping cart persistence +- **Social media**: Session management, user preferences +- **Banking/Financial**: Secure authentication, transaction history +- **News sites**: User preferences, subscription content +- **API endpoints**: Authentication tokens, API keys + +
+ +
+๐Ÿ”„ Advanced Features + +**Infinite Scrolling:** +```python +response = client.smartscraper( + website_url="https://example.com/feed", + user_prompt="Extract all posts from the feed", + cookies=cookies, + number_of_scrolls=10 # Scroll 10 times to load more content +) +``` + +**Pagination:** +```python +response = client.smartscraper( + website_url="https://example.com/products", + user_prompt="Extract all product information", + cookies=cookies, + total_pages=5 # Scrape 5 pages +) +``` + +**Combined with Cookies:** +```python +response = client.smartscraper( + website_url="https://example.com/dashboard", + user_prompt="Extract user data from all pages", + cookies=cookies, + number_of_scrolls=5, + total_pages=3 +) +``` + +
+ ### ๐Ÿ” SearchScraper Perform AI-powered web searches with structured results and reference URLs. diff --git a/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py b/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py new file mode 100644 index 0000000..cca05bb --- /dev/null +++ b/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py @@ -0,0 +1,131 @@ +""" +Example demonstrating how to use the SmartScraper API with cookies (Async). + +This example shows how to: +1. Set up the API request with cookies for authentication +2. Use cookies with infinite scrolling +3. Define a Pydantic model for structured output +4. Make the API call and handle the response +5. Process the extracted data + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import asyncio +import json +import os +from typing import Dict, Optional + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +from scrapegraph_py import AsyncClient + +# Load environment variables from .env file +load_dotenv() + + +# Define the data models for structured output +class CookieInfo(BaseModel): + """Model representing cookie information.""" + + cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") + + +async def main(): + """Example usage of the cookies scraper.""" + # Check if API key is available + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + # Initialize the async client + async with AsyncClient.from_env() as client: + # Example 1: Basic cookies example (httpbin.org/cookies) + print("=" * 60) + print("EXAMPLE 1: Basic Cookies Example") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies info" + cookies = {"cookies_key": 
"cookies_value"} + + try: + # Perform the scraping with cookies + response = await client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Example 2: Cookies with infinite scrolling + print("\n" + "=" * 60) + print("EXAMPLE 2: Cookies with Infinite Scrolling") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies and scroll information" + cookies = {"session_id": "abc123", "user_token": "xyz789"} + + try: + # Perform the scraping with cookies and infinite scrolling + response = await client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + number_of_scrolls=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Scrolling:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Example 3: Cookies with pagination + print("\n" + "=" * 60) + print("EXAMPLE 3: Cookies with Pagination") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies from multiple pages" + cookies = {"auth_token": "secret123", "preferences": "dark_mode"} + + try: + # Perform the scraping with cookies and pagination + response = await client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + total_pages=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Pagination:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/scrapegraph-py/examples/miscellaneous/cookies_integration_example.py 
b/scrapegraph-py/examples/miscellaneous/cookies_integration_example.py new file mode 100644 index 0000000..a2d2977 --- /dev/null +++ b/scrapegraph-py/examples/miscellaneous/cookies_integration_example.py @@ -0,0 +1,278 @@ +""" +Comprehensive example demonstrating cookies integration for web scraping. + +This example shows various real-world scenarios where cookies are essential: +1. E-commerce site scraping with authentication +2. Social media scraping with session cookies +3. Banking/financial site scraping with secure cookies +4. News site scraping with user preferences +5. API endpoint scraping with authentication tokens + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +from typing import Dict, List, Optional + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +# Define data models for different scenarios +class ProductInfo(BaseModel): + """Model for e-commerce product information.""" + + name: str = Field(description="Product name") + price: str = Field(description="Product price") + availability: str = Field(description="Product availability status") + rating: Optional[str] = Field(description="Product rating", default=None) + + +class SocialMediaPost(BaseModel): + """Model for social media post information.""" + + author: str = Field(description="Post author") + content: str = Field(description="Post content") + likes: Optional[str] = Field(description="Number of likes", default=None) + comments: Optional[str] = Field(description="Number of comments", default=None) + timestamp: Optional[str] = Field(description="Post timestamp", default=None) + + +class NewsArticle(BaseModel): + """Model for news article information.""" + + title: str = Field(description="Article title") + summary: str = Field(description="Article 
summary") + author: Optional[str] = Field(description="Article author", default=None) + publish_date: Optional[str] = Field(description="Publish date", default=None) + + +class BankTransaction(BaseModel): + """Model for banking transaction information.""" + + date: str = Field(description="Transaction date") + description: str = Field(description="Transaction description") + amount: str = Field(description="Transaction amount") + type: str = Field(description="Transaction type (credit/debit)") + + +def scrape_ecommerce_with_auth(): + """Example: Scrape e-commerce site with authentication cookies.""" + print("=" * 60) + print("E-COMMERCE SITE SCRAPING WITH AUTHENTICATION") + print("=" * 60) + + # Example cookies for an e-commerce site + cookies = { + "session_id": "abc123def456", + "user_id": "user789", + "cart_id": "cart101112", + "preferences": "dark_mode,usd", + "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." + } + + website_url = "https://example-ecommerce.com/products" + user_prompt = "Extract product information including name, price, availability, and rating" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=ProductInfo, + number_of_scrolls=5 # Scroll to load more products + ) + + print("โœ… E-commerce scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"โŒ Error in e-commerce scraping: {str(e)}") + + +def scrape_social_media_with_session(): + """Example: Scrape social media with session cookies.""" + print("\n" + "=" * 60) + print("SOCIAL MEDIA SCRAPING WITH SESSION COOKIES") + print("=" * 60) + + # Example cookies for a social media site + cookies = { + "session_token": "xyz789abc123", + "user_session": "def456ghi789", + "csrf_token": "jkl012mno345", + "remember_me": "true", + "language": "en_US" + } + + website_url = "https://example-social.com/feed" + user_prompt 
= "Extract posts from the feed including author, content, likes, and comments" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=SocialMediaPost, + number_of_scrolls=10 # Scroll to load more posts + ) + + print("โœ… Social media scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"โŒ Error in social media scraping: {str(e)}") + + +def scrape_news_with_preferences(): + """Example: Scrape news site with user preference cookies.""" + print("\n" + "=" * 60) + print("NEWS SITE SCRAPING WITH USER PREFERENCES") + print("=" * 60) + + # Example cookies for a news site + cookies = { + "user_preferences": "technology,science,ai", + "reading_level": "advanced", + "region": "US", + "subscription_tier": "premium", + "theme": "dark" + } + + website_url = "https://example-news.com/technology" + user_prompt = "Extract news articles including title, summary, author, and publish date" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=NewsArticle, + total_pages=3 # Scrape multiple pages + ) + + print("โœ… News scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"โŒ Error in news scraping: {str(e)}") + + +def scrape_banking_with_secure_cookies(): + """Example: Scrape banking site with secure authentication cookies.""" + print("\n" + "=" * 60) + print("BANKING SITE SCRAPING WITH SECURE COOKIES") + print("=" * 60) + + # Example secure cookies for a banking site + cookies = { + "secure_session": "pqr678stu901", + "auth_token": "vwx234yz567", + "mfa_verified": "true", + "device_id": "device_abc123", + "last_activity": "2024-01-15T10:30:00Z" + } + + website_url = "https://example-bank.com/transactions" + user_prompt = 
"Extract recent transactions including date, description, amount, and type" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=BankTransaction, + total_pages=5 # Scrape multiple pages of transactions + ) + + print("โœ… Banking scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"โŒ Error in banking scraping: {str(e)}") + + +def scrape_api_with_auth_tokens(): + """Example: Scrape API endpoint with authentication tokens.""" + print("\n" + "=" * 60) + print("API ENDPOINT SCRAPING WITH AUTH TOKENS") + print("=" * 60) + + # Example API authentication cookies + cookies = { + "api_token": "api_abc123def456", + "client_id": "client_789", + "access_token": "access_xyz789", + "refresh_token": "refresh_abc123", + "scope": "read:all" + } + + website_url = "https://api.example.com/data" + user_prompt = "Extract data from the API response" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + headers={ + "Accept": "application/json", + "Content-Type": "application/json" + } + ) + + print("โœ… API scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"โŒ Error in API scraping: {str(e)}") + + +def main(): + """Run all cookies integration examples.""" + # Check if API key is available + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + print("๐Ÿช COOKIES INTEGRATION EXAMPLES") + print("This demonstrates various real-world scenarios where cookies are essential for web scraping.") + + # Run all examples + scrape_ecommerce_with_auth() + scrape_social_media_with_session() + 
scrape_news_with_preferences() + scrape_banking_with_secure_cookies() + scrape_api_with_auth_tokens() + + print("\n" + "=" * 60) + print("โœ… All examples completed!") + print("=" * 60) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scrapegraph-py/examples/sync/smartscraper_cookies_example.py b/scrapegraph-py/examples/sync/smartscraper_cookies_example.py new file mode 100644 index 0000000..01cba1d --- /dev/null +++ b/scrapegraph-py/examples/sync/smartscraper_cookies_example.py @@ -0,0 +1,134 @@ +""" +Example demonstrating how to use the SmartScraper API with cookies. + +This example shows how to: +1. Set up the API request with cookies for authentication +2. Use cookies with infinite scrolling +3. Define a Pydantic model for structured output +4. Make the API call and handle the response +5. Process the extracted data + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +from typing import Dict, Optional + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +# Define the data models for structured output +class CookieInfo(BaseModel): + """Model representing cookie information.""" + + cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") + + +def main(): + """Example usage of the cookies scraper.""" + # Check if API key is available + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + # Initialize the client + client = Client.from_env() + + # Example 1: Basic cookies example (httpbin.org/cookies) + print("=" * 60) + print("EXAMPLE 1: Basic Cookies Example") + print("=" * 60) + + website_url = 
"https://httpbin.org/cookies" + user_prompt = "Extract all cookies info" + cookies = {"cookies_key": "cookies_value"} + + try: + # Perform the scraping with cookies + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Example 2: Cookies with infinite scrolling + print("\n" + "=" * 60) + print("EXAMPLE 2: Cookies with Infinite Scrolling") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies and scroll information" + cookies = {"session_id": "abc123", "user_token": "xyz789"} + + try: + # Perform the scraping with cookies and infinite scrolling + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + number_of_scrolls=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Scrolling:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Example 3: Cookies with pagination + print("\n" + "=" * 60) + print("EXAMPLE 3: Cookies with Pagination") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies from multiple pages" + cookies = {"auth_token": "secret123", "preferences": "dark_mode"} + + try: + # Perform the scraping with cookies and pagination + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + total_pages=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Pagination:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Close the client + client.close() + + +if __name__ == "__main__": + main() 
\ No newline at end of file diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 555a7d3..155a3f0 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -174,11 +174,12 @@ async def smartscraper( website_url: Optional[str] = None, website_html: Optional[str] = None, headers: Optional[dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, output_schema: Optional[BaseModel] = None, number_of_scrolls: Optional[int] = None, total_pages: Optional[int] = None, ): - """Send a smartscraper request with optional pagination support""" + """Send a smartscraper request with optional pagination support and cookies""" logger.info("๐Ÿ” Starting smartscraper request") if website_url: logger.debug(f"๐ŸŒ URL: {website_url}") @@ -186,6 +187,8 @@ async def smartscraper( logger.debug("๐Ÿ“„ Using provided HTML content") if headers: logger.debug("๐Ÿ”ง Using custom headers") + if cookies: + logger.debug("๐Ÿช Using cookies for authentication/session management") if number_of_scrolls is not None: logger.debug(f"๐Ÿ”„ Number of scrolls: {number_of_scrolls}") if total_pages is not None: @@ -196,6 +199,7 @@ async def smartscraper( website_url=website_url, website_html=website_html, headers=headers, + cookies=cookies, user_prompt=user_prompt, output_schema=output_schema, number_of_scrolls=number_of_scrolls, diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 1a845d4..9cde1f6 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -182,11 +182,12 @@ def smartscraper( website_url: Optional[str] = None, website_html: Optional[str] = None, headers: Optional[dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, output_schema: Optional[BaseModel] = None, number_of_scrolls: Optional[int] = None, total_pages: Optional[int] = None, ): - """Send a smartscraper 
request with optional pagination support""" + """Send a smartscraper request with optional pagination support and cookies""" logger.info("๐Ÿ” Starting smartscraper request") if website_url: logger.debug(f"๐ŸŒ URL: {website_url}") @@ -194,6 +195,8 @@ def smartscraper( logger.debug("๐Ÿ“„ Using provided HTML content") if headers: logger.debug("๐Ÿ”ง Using custom headers") + if cookies: + logger.debug("๐Ÿช Using cookies for authentication/session management") if number_of_scrolls is not None: logger.debug(f"๐Ÿ”„ Number of scrolls: {number_of_scrolls}") if total_pages is not None: @@ -204,6 +207,7 @@ def smartscraper( website_url=website_url, website_html=website_html, headers=headers, + cookies=cookies, user_prompt=user_prompt, output_schema=output_schema, number_of_scrolls=number_of_scrolls, diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py index 33d233d..b57fb93 100644 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py @@ -1,6 +1,6 @@ # Models for smartscraper endpoint -from typing import Optional, Type +from typing import Optional, Type, Dict from uuid import UUID from bs4 import BeautifulSoup @@ -28,6 +28,11 @@ class SmartScraperRequest(BaseModel): }, description="Optional headers to send with the request, including cookies and user agent", ) + cookies: Optional[Dict[str, str]] = Field( + None, + example={"session_id": "abc123", "user_token": "xyz789"}, + description="Dictionary of cookies to send with the request for authentication or session management", + ) output_schema: Optional[Type[BaseModel]] = None number_of_scrolls: Optional[conint(ge=0, le=100)] = Field( default=None, diff --git a/scrapegraph-py/test_cookies_integration.py b/scrapegraph-py/test_cookies_integration.py new file mode 100644 index 0000000..9cac46f --- /dev/null +++ b/scrapegraph-py/test_cookies_integration.py @@ -0,0 +1,97 @@ +""" +Test file to verify 
cookies integration functionality. +""" + +import json +from pydantic import BaseModel, Field + +from scrapegraph_py.models.smartscraper import SmartScraperRequest + + +class TestCookieInfo(BaseModel): + """Test model for cookie information.""" + + cookies: dict = Field(description="Dictionary of cookie key-value pairs") + + +def test_cookies_integration(): + """Test that cookies are properly integrated into SmartScraperRequest.""" + + print("๐Ÿงช Testing Cookies Integration") + print("=" * 50) + + # Test 1: Basic cookies + print("\n1. Testing basic cookies...") + cookies = {"session_id": "abc123", "auth_token": "xyz789"} + + request = SmartScraperRequest( + user_prompt="Extract cookie information", + website_url="https://httpbin.org/cookies", + cookies=cookies + ) + + data = request.model_dump() + print(f"โœ… Cookies included in request: {data.get('cookies')}") + + # Test 2: Cookies with output schema + print("\n2. Testing cookies with output schema...") + + request_with_schema = SmartScraperRequest( + user_prompt="Extract cookie information", + website_url="https://httpbin.org/cookies", + cookies=cookies, + output_schema=TestCookieInfo + ) + + data_with_schema = request_with_schema.model_dump() + print(f"โœ… Cookies with schema: {data_with_schema.get('cookies')}") + print(f"โœ… Output schema included: {data_with_schema.get('output_schema') is not None}") + + # Test 3: Cookies with scrolling and pagination + print("\n3. 
Testing cookies with advanced features...") + + request_advanced = SmartScraperRequest( + user_prompt="Extract cookie information from multiple pages", + website_url="https://httpbin.org/cookies", + cookies=cookies, + number_of_scrolls=5, + total_pages=3, + output_schema=TestCookieInfo + ) + + data_advanced = request_advanced.model_dump() + print(f"โœ… Advanced request cookies: {data_advanced.get('cookies')}") + print(f"โœ… Number of scrolls: {data_advanced.get('number_of_scrolls')}") + print(f"โœ… Total pages: {data_advanced.get('total_pages')}") + + # Test 4: Complex cookies scenario + print("\n4. Testing complex cookies scenario...") + + complex_cookies = { + "session_id": "abc123def456", + "user_id": "user789", + "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", + "preferences": "dark_mode,usd", + "cart_id": "cart101112", + "csrf_token": "csrf_xyz789" + } + + request_complex = SmartScraperRequest( + user_prompt="Extract user profile and preferences", + website_url="https://example.com/dashboard", + cookies=complex_cookies, + headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}, + output_schema=TestCookieInfo + ) + + data_complex = request_complex.model_dump() + print(f"โœ… Complex cookies count: {len(data_complex.get('cookies', {}))}") + print(f"โœ… Headers included: {data_complex.get('headers') is not None}") + + print("\n" + "=" * 50) + print("โœ… All cookies integration tests passed!") + print("=" * 50) + + +if __name__ == "__main__": + test_cookies_integration() \ No newline at end of file From 767f81018d7060043bcd4420446a57acabff7bf7 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 18 Jul 2025 19:32:17 +0200 Subject: [PATCH 2/3] feat: add js --- scrapegraph-js/README.md | 59 ++++ .../examples/cookies_integration_example.js | 261 ++++++++++++++++++ .../examples/smartScraper_cookies_example.js | 125 +++++++++ scrapegraph-js/src/smartScraper.js | 11 +- 4 files changed, 455 insertions(+), 1 deletion(-) create 
mode 100644 scrapegraph-js/examples/cookies_integration_example.js create mode 100644 scrapegraph-js/examples/smartScraper_cookies_example.js diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md index 0b6fb8e..05fe6dd 100644 --- a/scrapegraph-js/README.md +++ b/scrapegraph-js/README.md @@ -131,6 +131,65 @@ const numberOfScrolls = 10; // Will scroll 10 times to load more content The `numberOfScrolls` parameter accepts values between 0 and 100, allowing you to control how many times the page should be scrolled before extraction. +#### Scraping with Cookies + +Use cookies for authentication and session management when scraping websites that require login or have user-specific content: + +```javascript +import { smartScraper } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com/dashboard'; +const prompt = 'Extract user profile information'; + +// Define cookies for authentication +const cookies = { + session_id: 'abc123def456', + auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...', + user_preferences: 'dark_mode,usd' +}; + +(async () => { + try { + const response = await smartScraper(apiKey, url, prompt, null, null, null, cookies); + console.log(response.result); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + +**Common Use Cases:** +- **E-commerce sites**: User authentication, shopping cart persistence +- **Social media**: Session management, user preferences +- **Banking/Financial**: Secure authentication, transaction history +- **News sites**: User preferences, subscription content +- **API endpoints**: Authentication tokens, API keys + +#### Advanced Scraping with Cookies, Scrolling, and Pagination + +Combine cookies with infinite scrolling and pagination for comprehensive data extraction: + +```javascript +import { smartScraper } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com/feed'; +const prompt = 'Extract all posts from the feed'; +const 
cookies = { session_token: 'xyz789abc123' }; +const numberOfScrolls = 10; // Scroll 10 times +const totalPages = 5; // Scrape 5 pages + +(async () => { + try { + const response = await smartScraper(apiKey, url, prompt, null, numberOfScrolls, totalPages, cookies); + console.log('Extracted data:', response); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + ### Search Scraping Search and extract information from multiple web sources using AI. diff --git a/scrapegraph-js/examples/cookies_integration_example.js b/scrapegraph-js/examples/cookies_integration_example.js new file mode 100644 index 0000000..5c71e37 --- /dev/null +++ b/scrapegraph-js/examples/cookies_integration_example.js @@ -0,0 +1,261 @@ +/** + * Comprehensive example demonstrating cookies integration for web scraping. + * + * This example shows various real-world scenarios where cookies are essential: + * 1. E-commerce site scraping with authentication + * 2. Social media scraping with session cookies + * 3. Banking/financial site scraping with secure cookies + * 4. News site scraping with user preferences + * 5. 
/**
 * Cookies integration examples for smartScraper.
 *
 * Demonstrates real-world scenarios where cookies are essential:
 * - E-commerce sites: user authentication, shopping cart persistence
 * - Social media: session management, user preferences
 * - News sites: user preferences, subscription content
 * - Banking/Financial: secure authentication, transaction history
 * - API endpoints: authentication tokens
 *
 * Requirements:
 * - Node.js 16+
 * - scrapegraph-js
 * - A .env file with your SGAI_APIKEY
 *
 * Example .env file:
 * SGAI_APIKEY=your_api_key_here
 */

import { smartScraper } from 'scrapegraph-js';
import { z } from 'zod';
import 'dotenv/config';

// --- Output schemas --------------------------------------------------------

// Product listing fields extracted from an e-commerce page.
const ProductInfoSchema = z.object({
  name: z.string().describe('Product name'),
  price: z.string().describe('Product price'),
  availability: z.string().describe('Product availability status'),
  rating: z.string().optional().describe('Product rating')
});

// A single post from a social-media feed.
const SocialMediaPostSchema = z.object({
  author: z.string().describe('Post author'),
  content: z.string().describe('Post content'),
  likes: z.string().optional().describe('Number of likes'),
  comments: z.string().optional().describe('Number of comments'),
  timestamp: z.string().optional().describe('Post timestamp')
});

// A news article summary.
const NewsArticleSchema = z.object({
  title: z.string().describe('Article title'),
  summary: z.string().describe('Article summary'),
  author: z.string().optional().describe('Article author'),
  publish_date: z.string().optional().describe('Publish date')
});

// A single bank-statement row.
const BankTransactionSchema = z.object({
  date: z.string().describe('Transaction date'),
  description: z.string().describe('Transaction description'),
  amount: z.string().describe('Transaction amount'),
  type: z.string().describe('Transaction type (credit/debit)')
});

// --- Shared runner ---------------------------------------------------------

/**
 * Run a single smartScraper call with cookies and log the outcome.
 *
 * All five scenarios below share the same banner / call / try-catch shape;
 * this helper removes that duplication.
 *
 * @param {Object} opts
 * @param {string} opts.banner - Section banner printed between '=' rules
 * @param {string} opts.label - Human label used in success/error messages
 * @param {string} opts.websiteUrl - Target URL to scrape
 * @param {string} opts.userPrompt - Extraction prompt
 * @param {Object} opts.cookies - Cookie key/value pairs sent with the request
 * @param {Object} [opts.schema] - Optional Zod schema for structured output
 * @param {number} [opts.numberOfScrolls] - Optional scroll count (load more content)
 * @param {number} [opts.totalPages] - Optional number of pages to scrape
 */
async function runCookieExample({
  banner,
  label,
  websiteUrl,
  userPrompt,
  cookies,
  schema = null,
  numberOfScrolls = null,
  totalPages = null
}) {
  console.log('\n' + '='.repeat(60));
  console.log(banner);
  console.log('='.repeat(60));

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      schema,
      numberOfScrolls,
      totalPages,
      cookies
    );

    console.log(`✅ ${label} completed successfully`);
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in ${label}: ${error.message}`);
  }
}

// --- Scenarios -------------------------------------------------------------

// E-commerce: authenticated session with persistent cart state.
async function scrapeEcommerceWithAuth() {
  await runCookieExample({
    banner: 'E-COMMERCE SITE SCRAPING WITH AUTHENTICATION',
    label: 'E-commerce scraping',
    websiteUrl: 'https://example-ecommerce.com/products',
    userPrompt: 'Extract product information including name, price, availability, and rating',
    cookies: {
      session_id: 'abc123def456',
      user_id: 'user789',
      cart_id: 'cart101112',
      preferences: 'dark_mode,usd',
      auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...'
    },
    schema: ProductInfoSchema,
    numberOfScrolls: 5 // scroll to load more products
  });
}

// Social media: session cookies plus CSRF token.
async function scrapeSocialMediaWithSession() {
  await runCookieExample({
    banner: 'SOCIAL MEDIA SCRAPING WITH SESSION COOKIES',
    label: 'Social media scraping',
    websiteUrl: 'https://example-social.com/feed',
    userPrompt: 'Extract posts from the feed including author, content, likes, and comments',
    cookies: {
      session_token: 'xyz789abc123',
      user_session: 'def456ghi789',
      csrf_token: 'jkl012mno345',
      remember_me: 'true',
      language: 'en_US'
    },
    schema: SocialMediaPostSchema,
    numberOfScrolls: 10 // scroll to load more posts
  });
}

// News site: preference cookies gate personalized/premium content.
async function scrapeNewsWithPreferences() {
  await runCookieExample({
    banner: 'NEWS SITE SCRAPING WITH USER PREFERENCES',
    label: 'News scraping',
    websiteUrl: 'https://example-news.com/technology',
    userPrompt: 'Extract news articles including title, summary, author, and publish date',
    cookies: {
      user_preferences: 'technology,science,ai',
      reading_level: 'advanced',
      region: 'US',
      subscription_tier: 'premium',
      theme: 'dark'
    },
    schema: NewsArticleSchema,
    totalPages: 3 // scrape multiple pages
  });
}

// Banking: secure session cookies with MFA/device markers.
async function scrapeBankingWithSecureCookies() {
  await runCookieExample({
    banner: 'BANKING SITE SCRAPING WITH SECURE COOKIES',
    label: 'Banking scraping',
    websiteUrl: 'https://example-bank.com/transactions',
    userPrompt: 'Extract recent transactions including date, description, amount, and type',
    cookies: {
      secure_session: 'pqr678stu901',
      auth_token: 'vwx234yz567',
      mfa_verified: 'true',
      device_id: 'device_abc123',
      last_activity: '2024-01-15T10:30:00Z'
    },
    schema: BankTransactionSchema,
    totalPages: 5 // scrape multiple pages of transactions
  });
}

// API endpoint: token cookies, no schema for the generic response.
async function scrapeApiWithAuthTokens() {
  await runCookieExample({
    banner: 'API ENDPOINT SCRAPING WITH AUTH TOKENS',
    label: 'API scraping',
    websiteUrl: 'https://api.example.com/data',
    userPrompt: 'Extract data from the API response',
    cookies: {
      api_token: 'api_abc123def456',
      client_id: 'client_789',
      access_token: 'access_xyz789',
      refresh_token: 'refresh_abc123',
      scope: 'read:all'
    }
  });
}

/** Entry point: validate the API key, then run every scenario in order. */
async function main() {
  const apiKey = process.env.SGAI_APIKEY;

  // Fail fast with setup instructions when the key is missing.
  if (!apiKey) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    console.log('Please create a .env file with your API key:');
    console.log('SGAI_APIKEY=your_api_key_here');
    return;
  }

  console.log('🍪 COOKIES INTEGRATION EXAMPLES');
  console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.');

  await scrapeEcommerceWithAuth();
  await scrapeSocialMediaWithSession();
  await scrapeNewsWithPreferences();
  await scrapeBankingWithSecureCookies();
  await scrapeApiWithAuthTokens();

  console.log('\n' + '='.repeat(60));
  console.log('✅ All examples completed!');
  console.log('='.repeat(60));
}

// Run the example
main().catch(console.error);
/**
 * Example demonstrating how to use the SmartScraper API with cookies.
 *
 * This example shows how to:
 * 1. Set up the API request with cookies for authentication
 * 2. Use cookies with infinite scrolling
 * 3. Use cookies with pagination
 * 4. Define a Zod schema for structured output
 * 5. Make the API call and handle the response
 *
 * Requirements:
 * - Node.js 16+
 * - scrapegraph-js
 * - A .env file with your SGAI_APIKEY
 *
 * Example .env file:
 * SGAI_APIKEY=your_api_key_here
 */

import { smartScraper } from 'scrapegraph-js';
import { z } from 'zod';
import 'dotenv/config';

// Structured output: a flat dictionary of cookie key/value pairs.
const CookieInfoSchema = z.object({
  cookies: z.record(z.string()).describe('Dictionary of cookie key-value pairs')
});

async function main() {
  const apiKey = process.env.SGAI_APIKEY;

  // Fail fast with setup instructions when the key is missing.
  if (!apiKey) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    console.log('Please create a .env file with your API key:');
    console.log('SGAI_APIKEY=your_api_key_here');
    return;
  }

  // httpbin echoes the cookies it receives, so each example can verify
  // that the cookies actually reached the target.
  const websiteUrl = 'https://httpbin.org/cookies';

  /**
   * Run one example section: print a banner, call smartScraper with the
   * given options, and print either the result or the error.
   * All three examples share this shape; the helper removes the duplication.
   *
   * @param {Object} opts
   * @param {string} opts.title - Banner title
   * @param {string} opts.userPrompt - Extraction prompt
   * @param {Object} opts.cookies - Cookie key/value pairs
   * @param {number} [opts.numberOfScrolls] - Optional scroll count
   * @param {number} [opts.totalPages] - Optional page count
   * @param {string} opts.resultLabel - Label printed above the JSON result
   * @param {boolean} [opts.first] - Suppress the leading blank line on the first banner
   */
  async function runExample({
    title,
    userPrompt,
    cookies,
    numberOfScrolls = null,
    totalPages = null,
    resultLabel,
    first = false
  }) {
    console.log((first ? '' : '\n') + '='.repeat(60));
    console.log(title);
    console.log('='.repeat(60));

    try {
      const response = await smartScraper(
        apiKey,
        websiteUrl,
        userPrompt,
        CookieInfoSchema,
        numberOfScrolls,
        totalPages,
        cookies
      );

      console.log(`\n${resultLabel}:`);
      console.log(JSON.stringify(response, null, 2));
    } catch (error) {
      console.error(`Error occurred: ${error.message}`);
    }
  }

  // Example 1: basic cookies
  await runExample({
    title: 'EXAMPLE 1: Basic Cookies Example',
    userPrompt: 'Extract all cookies info',
    cookies: { cookies_key: 'cookies_value' },
    resultLabel: 'Extracted Cookie Information',
    first: true
  });

  // Example 2: cookies combined with infinite scrolling
  await runExample({
    title: 'EXAMPLE 2: Cookies with Infinite Scrolling',
    userPrompt: 'Extract all cookies and scroll information',
    cookies: { session_id: 'abc123', user_token: 'xyz789' },
    numberOfScrolls: 3,
    resultLabel: 'Extracted Cookie Information with Scrolling'
  });

  // Example 3: cookies combined with pagination
  await runExample({
    title: 'EXAMPLE 3: Cookies with Pagination',
    userPrompt: 'Extract all cookies from multiple pages',
    cookies: { auth_token: 'secret123', preferences: 'dark_mode' },
    totalPages: 3,
    resultLabel: 'Extracted Cookie Information with Pagination'
  });
}

// Run the example
main().catch(console.error);
*/ -export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null) { +export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null, cookies = null) { const endpoint = 'https://api.scrapegraphai.com/v1/smartscraper'; const headers = { 'accept': 'application/json', @@ -28,6 +29,14 @@ export async function smartScraper(apiKey, url, prompt, schema = null, numberOfS user_prompt: prompt, }; + if (cookies) { + if (typeof cookies === 'object' && cookies !== null) { + payload.cookies = cookies; + } else { + throw new Error('Cookies must be an object with key-value pairs'); + } + } + if (schema) { if (schema instanceof ZodType) { payload.output_schema = zodToJsonSchema(schema); From 6ee9fada9f47766c7d6a855112cbef62f75673ee Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 18 Jul 2025 19:56:40 +0200 Subject: [PATCH 3/3] feat add js info --- .../smartScraper_cookies_simple_example.js | 40 ++++++++ scrapegraph-js/package.json | 9 +- scrapegraph-js/test_cookies_integration.js | 92 +++++++++++++++++++ 3 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 scrapegraph-js/examples/smartScraper_cookies_simple_example.js create mode 100644 scrapegraph-js/test_cookies_integration.js diff --git a/scrapegraph-js/examples/smartScraper_cookies_simple_example.js b/scrapegraph-js/examples/smartScraper_cookies_simple_example.js new file mode 100644 index 0000000..dbe975c --- /dev/null +++ b/scrapegraph-js/examples/smartScraper_cookies_simple_example.js @@ -0,0 +1,40 @@ +/** + * Simple example demonstrating cookies usage with SmartScraper. + * + * This example shows the basic pattern for using cookies with the API. 
/**
 * Simple example demonstrating cookies usage with SmartScraper.
 *
 * This example shows the basic pattern for using cookies with the API.
 */

import { smartScraper } from 'scrapegraph-js';
import 'dotenv/config';

const apiKey = process.env.SGAI_APIKEY;

// Cookies sent with the request — typical authentication/session values.
const cookies = {
  session_id: 'abc123def456',
  auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
  user_preferences: 'dark_mode,usd'
};

// Target page and extraction prompt for the demo call.
const targetUrl = 'https://example.com/dashboard';
const prompt = 'Extract user profile information';

/** Call smartScraper once with cookies and print the outcome. */
async function scrapeWithCookies() {
  try {
    // schema, numberOfScrolls and totalPages are unused here (null);
    // cookies is the last positional argument.
    const result = await smartScraper(
      apiKey,
      targetUrl,
      prompt,
      null,
      null,
      null,
      cookies
    );

    console.log('✅ Scraping with cookies completed successfully');
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    console.error('❌ Error:', error.message);
  }
}

// Run the example
scrapeWithCookies();
/**
 * Test file to verify cookies integration functionality.
 *
 * Exercises the cookies-handling logic used by smartScraper without making
 * any network calls. Tracks failures and sets a non-zero exit code so the
 * script is usable in CI (the original always exited 0).
 */

import { smartScraper } from './src/smartScraper.js';

/**
 * Mirror of the cookies handling in src/smartScraper.js: attach cookies to
 * the payload when they are a non-null object, otherwise throw.
 *
 * Extracted into one helper so the tests exercise a single implementation
 * instead of duplicating the validation logic inline in each test.
 *
 * NOTE(review): like the SDK check, `typeof cookies === 'object'` also
 * accepts arrays — confirm whether arrays should be rejected upstream.
 *
 * @param {Object} payload - Request payload to mutate
 * @param {Object|null} cookies - Cookie key/value pairs (falsy values are ignored)
 * @returns {Object} The same payload, with `cookies` attached when valid
 * @throws {Error} When cookies is truthy but not an object
 */
function attachCookies(payload, cookies) {
  if (cookies) {
    if (typeof cookies === 'object' && cookies !== null) {
      payload.cookies = cookies;
    } else {
      throw new Error('Cookies must be an object with key-value pairs');
    }
  }
  return payload;
}

function testCookiesIntegration() {
  console.log('🧪 Testing Cookies Integration');
  console.log('='.repeat(50));

  // Any failed check flips this flag; reported at the end via exit code.
  let failed = false;
  const fail = (msg) => {
    failed = true;
    console.log(msg);
  };

  // Test 1: basic cookies are validated and attached to the payload.
  console.log('\n1. Testing basic cookies validation...');
  const cookies = { session_id: 'abc123', auth_token: 'xyz789' };
  const mockPayload = attachCookies(
    {
      website_url: 'https://httpbin.org/cookies',
      user_prompt: 'Extract cookie information'
    },
    cookies
  );
  if (mockPayload.cookies === cookies) {
    console.log('✅ Cookies validation passed');
    console.log(`✅ Cookies included: ${JSON.stringify(mockPayload.cookies)}`);
  } else {
    fail('❌ Cookies validation failed - not an object');
  }

  // Test 2: a larger, realistic cookie set is attached intact.
  console.log('\n2. Testing complex cookies scenario...');
  const complexCookies = {
    session_id: 'abc123def456',
    user_id: 'user789',
    auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
    preferences: 'dark_mode,usd',
    cart_id: 'cart101112',
    csrf_token: 'csrf_xyz789'
  };
  const complexPayload = attachCookies(
    {
      website_url: 'https://example.com/dashboard',
      user_prompt: 'Extract user profile and preferences'
    },
    complexCookies
  );
  if (complexPayload.cookies && Object.keys(complexPayload.cookies).length === 6) {
    console.log('✅ Complex cookies validation passed');
    console.log(`✅ Complex cookies count: ${Object.keys(complexPayload.cookies).length}`);
  } else {
    fail('❌ Complex cookies validation failed');
  }

  // Test 3: a non-object value must actually throw.
  // (The original test printed success from an `else` branch and never
  // exercised the throwing path.)
  console.log('\n3. Testing invalid cookies...');
  try {
    attachCookies({ website_url: 'https://example.com' }, 'not_an_object');
    fail('❌ Should have failed validation');
  } catch (error) {
    console.log('✅ Invalid cookies correctly rejected');
  }

  // Test 4: the exported function accepts a cookies parameter.
  // A regex tolerates whitespace/formatting differences in the source,
  // unlike the original exact-substring check ('cookies = null').
  console.log('\n4. Testing function signature...');
  if (/cookies\s*=\s*null/.test(smartScraper.toString())) {
    console.log('✅ Function signature includes cookies parameter');
  } else {
    fail('❌ Function signature missing cookies parameter');
  }

  console.log('\n' + '='.repeat(50));
  if (failed) {
    console.log('❌ Some cookies integration tests failed!');
    process.exitCode = 1; // let CI detect the failure
  } else {
    console.log('✅ All cookies integration tests completed!');
  }
  console.log('='.repeat(50));
}

// Run the test
testCookiesIntegration();