
Commit

Merge remote-tracking branch 'origin/master' into integrate-browserforge-fingerprints
Pijukatel committed Jan 10, 2025
2 parents ddfabea + f03a4e1 commit 3d37bca
Showing 18 changed files with 610 additions and 157 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,13 @@

All notable changes to this project will be documented in this file.

## [0.5.1](https://github.com/apify/crawlee-python/releases/tag/v0.5.1) (2025-01-07)

### 🐛 Bug Fixes

- Make result of RequestList.is_empty independent of fetch_next_request calls ([#876](https://github.com/apify/crawlee-python/pull/876)) ([d50249e](https://github.com/apify/crawlee-python/commit/d50249ecbfe2a04f508fcdc3261e050349bd0da2)) by [@janbuchar](https://github.com/janbuchar)


## [0.5.0](https://github.com/apify/crawlee-python/releases/tag/v0.5.0) (2025-01-02)

### 🚀 Features
4 changes: 2 additions & 2 deletions README.md
@@ -89,7 +89,7 @@ The [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupC
```python
import asyncio

- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+ from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
@@ -129,7 +129,7 @@ The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler
```python
import asyncio

- from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+ from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
48 changes: 48 additions & 0 deletions docs/guides/code/session_management/session_management_basic.py
@@ -0,0 +1,48 @@
import asyncio
import re

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = BasicCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    @crawler.router.default_handler
    async def default_handler(context: BasicCrawlingContext) -> None:
        # Send request, BasicCrawler automatically selects a session from the pool
        # and sets a proxy for it. You can check it with `context.session`
        # and `context.proxy_info`.
        response = await context.send_request(context.request.url)

        page_content = response.read().decode()
        title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

        if context.session and (title := title_match.group(1) if title_match else None):
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
42 changes: 42 additions & 0 deletions docs/guides/code/session_management/session_management_beautifulsoup.py
@@ -0,0 +1,42 @@
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = BeautifulSoupCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
        title = context.soup.title.get_text() if context.soup.title else None

        if context.session:
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
44 changes: 44 additions & 0 deletions docs/guides/code/session_management/session_management_http.py
@@ -0,0 +1,44 @@
import asyncio
import re

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = HttpCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        page_content = context.http_response.read().decode()
        title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

        if context.session and (title := title_match.group(1) if title_match else None):
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
42 changes: 42 additions & 0 deletions docs/guides/code/session_management/session_management_parsel.py
@@ -0,0 +1,42 @@
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = ParselCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        title = context.selector.css('title::text').get()

        if context.session:
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
42 changes: 42 additions & 0 deletions docs/guides/code/session_management/session_management_playwright.py
@@ -0,0 +1,42 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool


async def main() -> None:
    # To use the proxy IP session rotation logic, you must turn the proxy usage on.
    proxy_configuration = ProxyConfiguration(
        # options
    )

    # Initialize crawler with a custom SessionPool configuration
    # to manage concurrent sessions and proxy rotation
    crawler = PlaywrightCrawler(
        proxy_configuration=proxy_configuration,
        # Activates the Session pool (default is true).
        use_session_pool=True,
        # Overrides default Session pool configuration.
        session_pool=SessionPool(max_pool_size=100),
    )

    # Define the default request handler that manages session states
    # based on the response content and potential blocking
    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        title = await context.page.title()

        if context.session:
            if title == 'Blocked':
                context.session.retire()
            elif title == 'Not sure if blocked, might also be a connection error':
                context.session.mark_bad()
            else:
                context.session.mark_good()  # BasicCrawler handles this automatically.

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
25 changes: 25 additions & 0 deletions docs/guides/code/session_management/session_management_standalone.py
@@ -0,0 +1,25 @@
import asyncio

from crawlee.sessions import SessionPool


async def main() -> None:
    # Override the default Session pool configuration.
    async with SessionPool(
        max_pool_size=100,
        create_session_settings={'max_usage_count': 10, 'blocked_status_codes': [403]},
    ) as session_pool:
        session = await session_pool.get_session()

        # Increase the error_score.
        session.mark_bad()

        # Throw away the session.
        session.retire()

        # Lower the error_score and mark the session good.
        session.mark_good()


if __name__ == '__main__':
    asyncio.run(main())
2 changes: 1 addition & 1 deletion docs/guides/proxy_management.mdx
@@ -62,7 +62,7 @@ Our crawlers will now use the selected proxies for all connections.

The <ApiLink to="class/ProxyConfiguration#new_url">`proxy_configuration.new_url()`</ApiLink> method allows us to pass a `session_id` parameter. This creates a `session_id`-`proxy_url` pair, ensuring that subsequent `new_url()` calls with the same `session_id` return the same `proxy_url`. This is extremely useful in scraping, because we want to create the impression of a real user. See the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> class for more information on how maintaining a real session helps avoid blocking.

- {/* TODO: link session management guide */}
+ For more details on session management, check out the [Session management](./session-management) guide.

When no `session_id` is provided, our proxy URLs are rotated round-robin.
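To illustrate the pairing, here is a minimal sketch; the proxy URLs are hypothetical placeholders, and `new_url()` is awaited because it is a coroutine:

```python
import asyncio

from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    # Placeholder proxies, for illustration only.
    proxy_configuration = ProxyConfiguration(
        proxy_urls=[
            'http://proxy-1.example.com:8000',
            'http://proxy-2.example.com:8000',
        ]
    )

    # The same session_id keeps resolving to the same proxy URL.
    url_a = await proxy_configuration.new_url(session_id='user_1')
    url_b = await proxy_configuration.new_url(session_id='user_1')
    assert url_a == url_b

    # Calls without a session_id rotate through the list round-robin.
    first = await proxy_configuration.new_url()
    second = await proxy_configuration.new_url()
    print(first, second)


if __name__ == '__main__':
    asyncio.run(main())
```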

70 changes: 70 additions & 0 deletions docs/guides/session_management.mdx
@@ -0,0 +1,70 @@
---
id: session-management
title: Session management
description: How to manage your cookies, proxy IP rotations and more.
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';

import BasicSource from '!!raw-loader!./code/session_management/session_management_basic.py';
import HttpSource from '!!raw-loader!./code/session_management/session_management_http.py';
import BeautifulSoupSource from '!!raw-loader!./code/session_management/session_management_beautifulsoup.py';
import ParselSource from '!!raw-loader!./code/session_management/session_management_parsel.py';
import PlaywrightSource from '!!raw-loader!./code/session_management/session_management_playwright.py';
import StandaloneSource from '!!raw-loader!./code/session_management/session_management_standalone.py';

The <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> class provides a robust way to manage the rotation of proxy IP addresses, cookies, and other custom settings in Crawlee. Its primary advantage is the ability to filter out blocked or non-functional proxies, ensuring that your scraper avoids retrying requests through known problematic proxies.

Additionally, it enables storing information tied to specific IP addresses, such as cookies, authentication tokens, and custom headers. This association reduces the probability of detection and blocking by ensuring cookies and other identifiers are used consistently with the same IP address.
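As a rough sketch of that association, assuming the `Session` object exposes a `user_data` dict for arbitrary per-session values (the token below is a made-up example):

```python
import asyncio

from crawlee.sessions import SessionPool


async def main() -> None:
    async with SessionPool() as session_pool:
        session = await session_pool.get_session()

        # Stash a value tied to this session (and therefore to its proxy),
        # e.g. an auth token obtained while browsing under this identity.
        session.user_data['auth_token'] = 'example-token'

        # Later requests reusing the same session can read it back.
        token = session.user_data.get('auth_token')
        print(session.id, token)


if __name__ == '__main__':
    asyncio.run(main())
```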

Finally, it ensures even IP address rotation by randomly selecting sessions. This helps prevent overuse of a limited pool of available IPs, reducing the risk of IP bans and enhancing the efficiency of your scraper.

For more details on configuring proxies, refer to the [Proxy management](./proxy-management) guide.

Now, let's explore examples of how to use the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> in different scenarios:
- with <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink>;
- with <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink>;
- with <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>;
- with <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>;
- with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>;
- without a crawler (standalone usage to manage sessions manually).

<Tabs groupId="session_pool">
    <TabItem value="basic" label="BasicCrawler">
        <CodeBlock language="py">
            {BasicSource}
        </CodeBlock>
    </TabItem>
    <TabItem value="http" label="HttpCrawler">
        <CodeBlock language="py">
            {HttpSource}
        </CodeBlock>
    </TabItem>
    <TabItem value="beautifulsoup" label="BeautifulSoupCrawler">
        <CodeBlock language="py">
            {BeautifulSoupSource}
        </CodeBlock>
    </TabItem>
    <TabItem value="parsel" label="ParselCrawler">
        <CodeBlock language="py">
            {ParselSource}
        </CodeBlock>
    </TabItem>
    <TabItem value="playwright" label="PlaywrightCrawler">
        <CodeBlock language="py">
            {PlaywrightSource}
        </CodeBlock>
    </TabItem>
    <TabItem value="standalone" label="Standalone">
        <CodeBlock language="py">
            {StandaloneSource}
        </CodeBlock>
    </TabItem>
</Tabs>

These examples demonstrate the basics of configuring and using the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink>.

Please bear in mind that <ApiLink to="class/SessionPool">`SessionPool`</ApiLink> requires some time to establish a stable pool of working IPs. During the initial setup, you may encounter errors as the pool identifies and filters out blocked or non-functional IPs. This stabilization period is expected, and performance will improve over time.
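If the initial error rate is a concern, the pool's tolerance can be tuned through the session settings. Below is a sketch under the assumption that `max_error_score` is one of the accepted `create_session_settings` keys:

```python
import asyncio

from crawlee.sessions import SessionPool


async def main() -> None:
    # A more forgiving pool: each session tolerates a higher error score
    # before being considered blocked (assumed setting, for illustration).
    async with SessionPool(
        max_pool_size=50,
        create_session_settings={'max_error_score': 5},
    ) as session_pool:
        session = await session_pool.get_session()
        print(session.id)


if __name__ == '__main__':
    asyncio.run(main())
```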
