
feat: Add AdaptivePlaywrightCrawler #872

Open · wants to merge 68 commits into base: master

Changes from 57 commits
d8b2438
WIP
Pijukatel Dec 24, 2024
c12acda
More feasible version of composition.
Pijukatel Dec 31, 2024
623d341
Pass properly all kwargs to subcrawlers
Pijukatel Dec 31, 2024
548349e
Add run decision logic from JS version
Pijukatel Jan 1, 2025
29d510a
Statistics class change led to too many ripple changes.
Pijukatel Jan 1, 2025
04eefd9
Handle sub crawlers loggers
Pijukatel Jan 1, 2025
33efed3
Statistics change to be usable without ignores.
Pijukatel Jan 2, 2025
202aceb
Align use_state with JS implementation.
Pijukatel Jan 2, 2025
d474a94
Remove "fake generics" from Statistics.
Pijukatel Jan 2, 2025
e95ff19
Align result comparator with JS implementation.
Pijukatel Jan 2, 2025
2408d85
Add doc strings.
Pijukatel Jan 2, 2025
d345259
use_state through RequestHandlerRunResult
Pijukatel Jan 3, 2025
bfa9290
Revert "use_state through RequestHandlerRunResult"
Pijukatel Jan 6, 2025
63f278a
Add basic delegation test.
Pijukatel Jan 6, 2025
e190788
Add context test.
Pijukatel Jan 6, 2025
0ecb137
Add tests for use_state and predictor.
Pijukatel Jan 6, 2025
b73c702
Remove unintended edit.
Pijukatel Jan 6, 2025
5c79d0d
Add tests for statistics.
Pijukatel Jan 7, 2025
957915a
Add test for error handling and committing correct results.
Pijukatel Jan 7, 2025
f12f605
Add crawl_one_required_contexts property. (Alternative to accessing i…
Pijukatel Jan 7, 2025
5256af2
Lint
Pijukatel Jan 7, 2025
2fd7aae
Remove BasicCrawler modifications.
Pijukatel Jan 8, 2025
714b5bd
Make _commit_result consistent with how other result components are h…
Pijukatel Jan 8, 2025
b38dda1
Remove subcrawlers and add _OrphanPipeline
Pijukatel Jan 8, 2025
ffb2a78
Use dummy statistics in subcrawlers.
Pijukatel Jan 8, 2025
3b05228
Keep predictor related functions on predictor_state
Pijukatel Jan 8, 2025
dc06490
Unify pre-nav hooks.
Pijukatel Jan 8, 2025
0766b7a
Simplify pre-nav hook common context.
Pijukatel Jan 9, 2025
4a63c2c
Make static crawling part of AdaptiveCrawler generic.
Pijukatel Jan 9, 2025
bd72a84
Update tests to remove bs references.
Pijukatel Jan 9, 2025
c964d44
Revert accidental Lint edits to website/*.py
Pijukatel Jan 10, 2025
a471395
Review comments.
Pijukatel Jan 14, 2025
dbf6310
Sub crawler timeout handling + test
Pijukatel Jan 15, 2025
8ee8f99
Simplify prenav hooks
janbuchar Jan 15, 2025
c291d3e
Simplify context manager handling
janbuchar Jan 15, 2025
2c23238
Review comments - _run_request_handler + timeouts
Pijukatel Jan 15, 2025
b2a29c1
Statistics.
Pijukatel Jan 15, 2025
0786f87
Make statistics generic again!
Pijukatel Jan 15, 2025
4f7d7f4
Merge remote-tracking branch 'origin/master' into adaptive-PwCrawler
Pijukatel Jan 15, 2025
1234ea7
Mock requests in tests.
Pijukatel Jan 16, 2025
e85ec9f
Improve error readability.
Pijukatel Jan 17, 2025
6e8635a
Mock both static and browser requests in tests.
Pijukatel Jan 17, 2025
0e9146a
Create proper example code
Pijukatel Jan 17, 2025
a4eac8f
Merge remote-tracking branch 'origin/master' into adaptive-PwCrawler
Pijukatel Jan 17, 2025
44ad898
Relax timeout in test to avoid flakiness in CI
Pijukatel Jan 20, 2025
f219453
Remove AdaptivePlaywrightCrawlerStatistics
Pijukatel Jan 21, 2025
64d9e54
WIP
Pijukatel Jan 21, 2025
08fc81f
Update options typed dicts
Pijukatel Jan 21, 2025
9bde9dc
Add docstrings to adaptive context public stuff
Pijukatel Jan 21, 2025
9a14569
Make crawl_one_with private.
Pijukatel Jan 21, 2025
fd8dd82
Update tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwri…
Pijukatel Jan 22, 2025
4221219
Review comments
Pijukatel Jan 22, 2025
565d36b
Remove _run_subcrawler_pipeline
Pijukatel Jan 22, 2025
4bd8251
Remove Orphans
Pijukatel Jan 22, 2025
a3ab2e8
Merge remote-tracking branch 'origin/master' into adaptive-PwCrawler
Pijukatel Jan 22, 2025
fc95132
Move SubCrawlerRun to where it is used
Pijukatel Jan 22, 2025
4f316b5
Use custom _TestInput dataclass
Pijukatel Jan 22, 2025
949c4ff
Review comments
Pijukatel Jan 23, 2025
56ad33a
Add optional argument to pre navigation hook decorator
Pijukatel Jan 27, 2025
781d5ff
Remove _push_result_to_context and add result argument/return to _run…
Pijukatel Jan 27, 2025
a93d6a1
Merge remote-tracking branch 'origin/master' into adaptive-PwCrawler
Pijukatel Jan 27, 2025
b4ba31b
Add `block_request` to adaptive pre nav context
Pijukatel Jan 28, 2025
8bce425
Use context result map for handling request handler results
Pijukatel Jan 29, 2025
5f8c26c
Review comments based comments
Pijukatel Jan 30, 2025
e1d0c7e
Merge remote-tracking branch 'origin/master' into adaptive-PwCrawler
Pijukatel Jan 30, 2025
aacb90a
Integrate RenderingTypePredictor
Pijukatel Jan 30, 2025
ab1c40e
Update src/crawlee/crawlers/_basic/_basic_crawler.py
Pijukatel Jan 30, 2025
2bf43f6
Finalize exports and re exports
Pijukatel Jan 30, 2025
55 changes: 55 additions & 0 deletions docs/examples/code/adaptive_playwright_crawler.py
@@ -0,0 +1,55 @@
import asyncio

from playwright.async_api import Route

from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
    AdaptiveContextError,
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)


async def main() -> None:
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
    )

    @crawler.router.default_handler
    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Code that will be executed in both crawl types.
        context.log.info(f'User handler processing: {context.request.url} ...')

        try:
            # Code that will be executed only in a Playwright crawl.
            # Trying to access `context.page` in a static crawl will throw `AdaptiveContextError`.
            some_locator = context.page.locator('div').first

            await some_locator.wait_for()
            # Do stuff with the locator...
            context.log.info(f'Playwright processing of: {context.request.url} ...')
        except AdaptiveContextError:
            # Code that will be executed only in a static crawl.
            context.log.info(f'Static processing of: {context.request.url} ...')
Collaborator:
This is technically correct, but not the way it is supposed to be used. Here's what I'd do:

  • in the default handler, just use the parsed content and enqueue_links
    • when we have the new context helpers, we'll use those here
  • add a new handler to the router that uses the page and add a comment that the crawler will take care of this
  • do not catch the AdaptiveContextError - that's an implementation detail and we shouldn't make it look like this is something you should do
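The handler layout suggested in the bullets above can be modeled with a small, self-contained sketch. The `Router` class and the dict-based contexts below are illustrative stand-ins, not crawlee's real API; they only show the shape of "default handler for parsed content, separate labeled handler for page work".

```python
# Simplified model of a router with a default handler plus a labeled handler.
# All names here are illustrative, not crawlee's actual API.
from __future__ import annotations

import asyncio
from typing import Awaitable, Callable, Dict, Optional

Handler = Callable[[dict], Awaitable[None]]


class Router:
    """Maps request labels to handlers; the `None` key is the default handler."""

    def __init__(self) -> None:
        self._handlers: Dict[Optional[str], Handler] = {}

    def default_handler(self, fn: Handler) -> Handler:
        self._handlers[None] = fn
        return fn

    def handler(self, label: str) -> Callable[[Handler], Handler]:
        def register(fn: Handler) -> Handler:
            self._handlers[label] = fn
            return fn

        return register

    async def route(self, context: dict) -> None:
        # Fall back to the default handler when the label is unknown/missing.
        handler = self._handlers.get(context.get('label'), self._handlers[None])
        await handler(context)


router = Router()
seen = []


@router.default_handler
async def default(context: dict) -> None:
    # In a real crawler this would use parsed content and enqueue_links.
    seen.append(('default', context['url']))


@router.handler('DETAIL')
async def detail(context: dict) -> None:
    # In a real crawler this handler would use the page object.
    seen.append(('detail', context['url']))


asyncio.run(router.route({'url': 'https://example.com'}))
asyncio.run(router.route({'url': 'https://example.com/item', 'label': 'DETAIL'}))
```

The point of the split is that neither handler needs to branch on the crawl type.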

Contributor Author:
As of now this example is more of a starting point for the reviewers and not really for end users given the fact that it is not mentioned in docs. I agree with the changes you suggest, but I would prefer to do them once the context helpers are in place, so that the example is matching the current state of the code.

Collaborator:
I get that, but I believe that even though that example is not actually accessible from anywhere, it shouldn't contain code that goes against how the adaptive crawler should be used. I'd prefer to update that file now and then once more after we have those context helpers in place.

Contributor Author:

Ok, but unless we split pre-navigation hooks to two different functions I do not think we can avoid try-catch there.

Collaborator:

Or unless we use the new block_requests context helper there and just have it do nothing in a static crawling context. Would that work?
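The idea in this comment — the same hook body running under both crawl types because `block_requests` silently does nothing in a static context — can be sketched as follows. These context classes are hypothetical stand-ins, not crawlee's real implementation.

```python
# Sketch: a `block_requests` helper that is a real operation in the browser
# context and a no-op in the static context, so user hooks need no branching.
# All class and method names are illustrative assumptions.
from __future__ import annotations

import asyncio


class StaticPreNavContext:
    """Static (HTTP-only) pre-navigation context: blocking is a silent no-op."""

    async def block_requests(self, patterns: list[str]) -> None:
        # Nothing to block - the static crawler never loads subresources.
        pass


class BrowserPreNavContext:
    """Browser pre-navigation context: records patterns to abort."""

    def __init__(self) -> None:
        self.blocked: list[str] = []

    async def block_requests(self, patterns: list[str]) -> None:
        # A real version would register page.route(...) handlers instead.
        self.blocked.extend(patterns)


async def hook(context) -> None:
    # The very same hook body runs unchanged under both crawl types.
    await context.block_requests(['**/*.png', '**/*.css'])


static_ctx = StaticPreNavContext()
browser_ctx = BrowserPreNavContext()
asyncio.run(hook(static_ctx))
asyncio.run(hook(browser_ctx))
```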

Contributor Author:

But that covers only some specific use cases. There can be any code in those hooks. So to cover the general case and have 1-to-1 matching behavior with both PlaywrightCrawler and whatever variant of ParsedHttpCrawler, I still see just two options: try-catch or separate hooks.

Collaborator:

Again, the point of the adaptive crawler is that you shouldn't need to make separate branches for browser/static. Is there any operation that you can do in a prenav hook in the static context but not in the browser? If not (I can't think of any, but that's not saying anything), then we can just provide block_requests (a possibly common use case) and page for all the other usage scenarios (but it will force browser-based crawling when used).

Contributor Author (@Pijukatel, Jan 27, 2025):

Ok, I tried a compromise solution.

@crawler.pre_navigation_hook
This decorator registers hooks for both the static and the playwright sub crawler -> if anyone tries to access context.page, it will raise an exception. (The exception text points directly to the example below.)

@crawler.pre_navigation_hook(playwright_only=True)
This decorator registers hooks only for the playwright sub crawler -> safe to access context.page.

Collaborator:

Cool, this is fine by me
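The agreed-upon compromise — one decorator usable both bare and with `playwright_only=True` — is a standard optional-argument-decorator pattern. A minimal self-contained sketch of the registration mechanics (class and attribute names are illustrative, not crawlee's internals):

```python
# Sketch of a decorator that works both as @hook and @hook(playwright_only=True),
# registering the function with one or both sub-crawlers. Names are illustrative.
from __future__ import annotations

from typing import Awaitable, Callable, Optional

Hook = Callable[[object], Awaitable[None]]


class AdaptiveCrawlerSketch:
    def __init__(self) -> None:
        self.static_hooks: list[Hook] = []
        self.playwright_hooks: list[Hook] = []

    def pre_navigation_hook(self, hook: Optional[Hook] = None, *, playwright_only: bool = False):
        def register(fn: Hook) -> Hook:
            self.playwright_hooks.append(fn)
            if not playwright_only:
                self.static_hooks.append(fn)
            return fn

        # Bare usage: @crawler.pre_navigation_hook
        if hook is not None:
            return register(hook)
        # Parameterized usage: @crawler.pre_navigation_hook(playwright_only=True)
        return register


crawler = AdaptiveCrawlerSketch()


@crawler.pre_navigation_hook
async def runs_everywhere(context) -> None:
    ...  # must not touch context.page


@crawler.pre_navigation_hook(playwright_only=True)
async def runs_in_browser_only(context) -> None:
    ...  # safe to touch context.page


print(len(crawler.static_hooks), len(crawler.playwright_hooks))  # prints: 1 2
```

The bare form receives the function directly; the parameterized form returns the inner `register` closure, which captures the `playwright_only` flag.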


        # Find more links and enqueue them.
        await context.enqueue_links()
        await context.push_data({'Top crawler Url': context.request.url})

    @crawler.pre_navigation_hook
    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        async def some_routing_function(route: Route) -> None:
            await route.continue_()

        try:
            await context.page.route('*/**', some_routing_function)
            context.log.info(f'Playwright pre navigation hook for: {context.request.url} ...')
        except AdaptiveContextError:
            context.log.info(f'Static pre navigation hook for: {context.request.url} ...')

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])


if __name__ == '__main__':
    asyncio.run(main())
169 changes: 88 additions & 81 deletions poetry.lock

Large diffs are not rendered by default.

55 changes: 45 additions & 10 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
janbuchar marked this conversation as resolved.
@@ -5,7 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable, Generic

from pydantic import ValidationError
from typing_extensions import NotRequired, TypeVar
from typing_extensions import NotRequired, TypedDict, TypeVar

from crawlee import EnqueueStrategy
from crawlee._request import BaseRequestData
@@ -14,6 +14,7 @@
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.http_clients import HttpxHttpClient
from crawlee.statistics import StatisticsState

from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult

@@ -27,24 +28,33 @@
from ._abstract_http_parser import AbstractHttpParser

TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


@docs_group('Data structures')
class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
    """Arguments for the `AbstractHttpCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


class _HttpCrawlerAdditionalOptions(TypedDict):
    additional_http_error_status_codes: NotRequired[Iterable[int]]
    """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

    ignore_http_error_status_codes: NotRequired[Iterable[int]]
    """HTTP status codes that are typically considered errors but should be treated as successful responses."""


@docs_group('Data structures')
class HttpCrawlerOptions(
    Generic[TCrawlingContext, TStatisticsState],
    _HttpCrawlerAdditionalOptions,
    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
):
    """Arguments for the `AbstractHttpCrawler` constructor.

    It is intended for typing forwarded `__init__` arguments in the subclasses.
    """


@docs_group('Abstract classes')
class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
class AbstractHttpCrawler(
    Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
):
    """A web crawler for performing HTTP requests.

    The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
Expand All @@ -65,7 +75,7 @@ def __init__(
parser: AbstractHttpParser[TParseResult],
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
) -> None:
self._parser = parser
self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
Expand All @@ -87,6 +97,31 @@ def __init__(
kwargs.setdefault('_logger', logging.getLogger(__name__))
super().__init__(**kwargs)

    @staticmethod
    def create_parsed_http_crawler_class(
        static_parser: AbstractHttpParser[TParseResult],
    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]:
        """Convenience class factory that creates a specific version of the `AbstractHttpCrawler` class.

        In a general typing sense, the two generic parameters of `AbstractHttpCrawler` do not have to depend
        on each other. This is a convenience constructor for the specific case where `TParseResult` is used
        to specify both generic parameters of `AbstractHttpCrawler`.
        """

        class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]):
            def __init__(
                self,
                parser: AbstractHttpParser[TParseResult] = static_parser,
                **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
            ) -> None:
                kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
                super().__init__(
                    parser=parser,
                    **kwargs,
                )

        return _ParsedHttpCrawler

    def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
        """Create static content crawler context pipeline with expected pipeline steps."""
        return (
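The mechanism behind `create_parsed_http_crawler_class` — a static method that closes over a default parser and returns a subclass with that parser pre-bound — can be reduced to a small self-contained sketch. The classes below are stand-ins for illustration, not crawlee's real ones.

```python
# Sketch of the class-factory pattern: bind a default collaborator object into a
# subclass at creation time. `Parser` and `BaseCrawler` are illustrative stand-ins.
from __future__ import annotations


class Parser:
    def parse(self, body: str) -> str:
        return body.upper()


class BaseCrawler:
    def __init__(self, parser: Parser) -> None:
        self.parser = parser

    @staticmethod
    def with_default_parser(default_parser: Parser) -> type['BaseCrawler']:
        # The inner class closes over `default_parser`, so callers of the
        # returned class no longer need to supply a parser themselves.
        class _BoundCrawler(BaseCrawler):
            def __init__(self, parser: Parser = default_parser) -> None:
                super().__init__(parser=parser)

        return _BoundCrawler


BoundCrawler = BaseCrawler.with_default_parser(Parser())
crawler = BoundCrawler()  # no parser argument needed anymore
result = crawler.parser.parse('hello')
```

In the PR, the same trick additionally pins down both generic parameters of `AbstractHttpCrawler` to the parser's `TParseResult`.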
7 changes: 7 additions & 0 deletions src/crawlee/crawlers/_adaptive_playwright/__init__.py
@@ -0,0 +1,7 @@
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
AdaptivePlaywrightCrawlingContext,
AdaptivePlaywrightPreNavCrawlingContext,
)

__all__ = ['AdaptivePlaywrightCrawler', 'AdaptivePlaywrightCrawlingContext', 'AdaptivePlaywrightPreNavCrawlingContext']