
Commit b2890d5

fix: broken sdk
1 parent b0e3276 commit b2890d5


15 files changed: +213 -37 lines changed


scrapegraph-py/examples/async/async_crawl_example.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ async def main():
     crawl_response = await client.crawl(
         url=url,
         prompt=prompt,
-        schema=schema,
+        data_schema=schema,
         cache_website=True,
         depth=2,
         max_pages=2,

scrapegraph-py/examples/miscellaneous/crawl_example.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def main():
     crawl_response = client.crawl(
         url=url,
         prompt=prompt,
-        schema=schema,
+        data_schema=schema,
         cache_website=True,
         depth=2,
         max_pages=2,

scrapegraph-py/examples/sync/crawl_example.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def main():
     crawl_response = client.crawl(
         url=url,
         prompt=prompt,
-        schema=schema,
+        data_schema=schema,
         cache_website=True,
         depth=2,
         max_pages=2,
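
Taken together, the example updates amount to renaming the crawl keyword from schema to data_schema. A minimal sketch of how a caller looks after this commit; the URL, prompt, and schema contents below are illustrative placeholders, not values from the diff:

from scrapegraph_py import Client

# Assumes SGAI_API_KEY is set in the environment (see the from_env() change below)
client = Client.from_env()

# Placeholder schema; the shipped examples build theirs from Pydantic models
schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "description": {"type": "string"},
    },
}

crawl_response = client.crawl(
    url="https://example.com",
    prompt="What does the company do?",
    data_schema=schema,  # renamed from schema= in this commit
    cache_website=True,
    depth=2,
    max_pages=2,
)
print(crawl_response)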

scrapegraph-py/examples/sync/smartscraper_infinite_scroll_example.py

Lines changed: 4 additions & 2 deletions
@@ -1,3 +1,4 @@
+import os
 from scrapegraph_py import Client
 from scrapegraph_py.logger import sgai_logger
 from pydantic import BaseModel
@@ -14,8 +15,9 @@ class Company(BaseModel):
 class CompaniesResponse(BaseModel):
     companies: List[Company]
 
-# Initialize the client with explicit API key
-sgai_client = Client(api_key="sgai-api-key")
+# Initialize the client with API key from environment variable
+# Make sure to set SGAI_API_KEY in your environment or .env file
+sgai_client = Client.from_env()
 
 try:
     # SmartScraper request with infinite scroll
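
For reference, a short sketch of the environment-based initialization this example switches to; the key value is a placeholder:

import os
from scrapegraph_py import Client

# The client reads the API key from the SGAI_API_KEY environment variable (or a .env file)
os.environ.setdefault("SGAI_API_KEY", "sgai-your-api-key")  # placeholder value for illustration

sgai_client = Client.from_env()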

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 3 additions & 3 deletions
@@ -306,7 +306,7 @@ async def crawl(
         self,
         url: str,
         prompt: str,
-        schema: Dict[str, Any],
+        data_schema: Dict[str, Any],
         cache_website: bool = True,
         depth: int = 2,
         max_pages: int = 2,
@@ -317,7 +317,7 @@ async def crawl(
         logger.info("🔍 Starting crawl request")
         logger.debug(f"🌐 URL: {url}")
         logger.debug(f"📝 Prompt: {prompt}")
-        logger.debug(f"📊 Schema provided: {bool(schema)}")
+        logger.debug(f"📊 Schema provided: {bool(data_schema)}")
         logger.debug(f"💾 Cache website: {cache_website}")
         logger.debug(f"🔍 Depth: {depth}")
         logger.debug(f"📄 Max pages: {max_pages}")
@@ -327,7 +327,7 @@ async def crawl(
         request = CrawlRequest(
             url=url,
             prompt=prompt,
-            schema=schema,
+            data_schema=data_schema,
            cache_website=cache_website,
             depth=depth,
             max_pages=max_pages,

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 3 additions & 3 deletions
@@ -309,7 +309,7 @@ def crawl(
         self,
         url: str,
         prompt: str,
-        schema: Dict[str, Any],
+        data_schema: Dict[str, Any],
         cache_website: bool = True,
         depth: int = 2,
         max_pages: int = 2,
@@ -320,7 +320,7 @@ def crawl(
         logger.info("🔍 Starting crawl request")
         logger.debug(f"🌐 URL: {url}")
         logger.debug(f"📝 Prompt: {prompt}")
-        logger.debug(f"📊 Schema provided: {bool(schema)}")
+        logger.debug(f"📊 Schema provided: {bool(data_schema)}")
         logger.debug(f"💾 Cache website: {cache_website}")
         logger.debug(f"🔍 Depth: {depth}")
         logger.debug(f"📄 Max pages: {max_pages}")
@@ -330,7 +330,7 @@ def crawl(
         request = CrawlRequest(
             url=url,
             prompt=prompt,
-            schema=schema,
+            data_schema=data_schema,
             cache_website=cache_website,
             depth=depth,
             max_pages=max_pages,
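
The same rename applies on the async path. A hedged sketch of an async call; the class name AsyncClient and its from_env() constructor are assumed to mirror the sync Client, and the URL, prompt, and schema values are placeholders:

import asyncio

from scrapegraph_py import AsyncClient  # assumed export name, mirroring Client

async def main():
    client = AsyncClient.from_env()  # assumed to exist, as on the sync client
    response = await client.crawl(
        url="https://example.com",  # placeholder
        prompt="Extract the company description",  # placeholder
        data_schema={"type": "object", "properties": {"description": {"type": "string"}}},
    )
    print(response)

asyncio.run(main())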

scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 6 additions & 6 deletions
@@ -17,7 +17,7 @@ class CrawlRequest(BaseModel):
         example="What does the company do? and I need text content from there privacy and terms",
         description="The prompt to guide the crawl and extraction"
     )
-    schema: Dict[str, Any] = Field(
+    data_schema: Dict[str, Any] = Field(
         ...,
         description="JSON schema defining the structure of the extracted data"
     )
@@ -62,11 +62,11 @@ def validate_prompt(self) -> "CrawlRequest":
         return self
 
     @model_validator(mode="after")
-    def validate_schema(self) -> "CrawlRequest":
-        if not isinstance(self.schema, dict):
-            raise ValueError("Schema must be a dictionary")
-        if not self.schema:
-            raise ValueError("Schema cannot be empty")
+    def validate_data_schema(self) -> "CrawlRequest":
+        if not isinstance(self.data_schema, dict):
+            raise ValueError("Data schema must be a dictionary")
+        if not self.data_schema:
+            raise ValueError("Data schema cannot be empty")
         return self
 
 
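At the model level, the rename also renames the validator. A small sketch of constructing CrawlRequest directly, assuming only what the diff shows; the URL, prompt, and schema values are placeholders:

from pydantic import ValidationError

from scrapegraph_py.models.crawl import CrawlRequest

# A valid request now uses data_schema instead of schema
request = CrawlRequest(
    url="https://example.com",
    prompt="What does the company do?",
    data_schema={"type": "object", "properties": {"name": {"type": "string"}}},
)

# An empty dict is rejected by the renamed validator
try:
    CrawlRequest(
        url="https://example.com",
        prompt="What does the company do?",
        data_schema={},
    )
except ValidationError as exc:
    print(exc)  # the error message includes "Data schema cannot be empty"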
scrapegraph-py/scrapegraph_py/models/feedback.py

Lines changed: 5 additions & 0 deletions
@@ -20,3 +20,8 @@ def validate_request_id(self) -> "FeedbackRequest":
         except ValueError:
             raise ValueError("request_id must be a valid UUID")
         return self
+
+    def model_dump(self, *args, **kwargs) -> dict:
+        # Set exclude_none=True to exclude None values from serialization
+        kwargs.setdefault('exclude_none', True)
+        return super().model_dump(*args, **kwargs)

scrapegraph-py/scrapegraph_py/models/markdownify.py

Lines changed: 5 additions & 0 deletions
@@ -28,6 +28,11 @@ def validate_url(self) -> "MarkdownifyRequest":
             raise ValueError("Invalid URL")
         return self
 
+    def model_dump(self, *args, **kwargs) -> dict:
+        # Set exclude_none=True to exclude None values from serialization
+        kwargs.setdefault('exclude_none', True)
+        return super().model_dump(*args, **kwargs)
+
 
 class GetMarkdownifyRequest(BaseModel):
     """Request model for get_markdownify endpoint"""

scrapegraph-py/scrapegraph_py/models/searchscraper.py

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ def validate_user_prompt(self) -> "SearchScraperRequest":
         return self
 
     def model_dump(self, *args, **kwargs) -> dict:
+        # Set exclude_none=True to exclude None values from serialization
+        kwargs.setdefault('exclude_none', True)
         data = super().model_dump(*args, **kwargs)
         # Convert the Pydantic model schema to dict if present
         if self.output_schema is not None:
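
The three model_dump overrides share one pattern: default exclude_none=True so unset optional fields are dropped from the serialized payload, while an explicit keyword from the caller still wins. A hypothetical model illustrating the effect (the field names here are illustrative, not the SDK's):

from typing import Optional

from pydantic import BaseModel


class ExampleRequest(BaseModel):
    request_id: str
    feedback_text: Optional[str] = None

    def model_dump(self, *args, **kwargs) -> dict:
        # Same pattern as the commit: drop None values unless the caller overrides it
        kwargs.setdefault("exclude_none", True)
        return super().model_dump(*args, **kwargs)


req = ExampleRequest(request_id="123e4567-e89b-12d3-a456-426614174000")
print(req.model_dump())
# {'request_id': '123e4567-e89b-12d3-a456-426614174000'}  (feedback_text omitted)
print(req.model_dump(exclude_none=False))
# passing exclude_none explicitly overrides the default set by setdefault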
