
Commit 5ba70b6

Pijukatel and janbuchar authored
feat: Add AdaptivePlaywrightCrawler (#872)
Add AdaptivePlaywrightCrawler. The adaptive crawler can choose to crawl a page with either a static crawler (like BeautifulSoupCrawler or ParselCrawler) or the browser-based PlaywrightCrawler. --------- Co-authored-by: Jan Buchar <[email protected]>
1 parent fd0193f commit 5ba70b6

20 files changed: +1596, -64 lines changed
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+import asyncio
+
+from playwright.async_api import Route
+
+from crawlee.crawlers import (
+    AdaptivePlaywrightCrawler,
+    AdaptivePlaywrightCrawlingContext,
+    AdaptivePlaywrightPreNavCrawlingContext,
+)
+
+
+async def main() -> None:
+    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
+        max_requests_per_crawl=5, playwright_crawler_specific_kwargs={'headless': False}
+    )
+
+    @crawler.router.handler(label='label')
+    async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
+        # Do some processing using `page`
+        some_locator = context.page.locator('div').first
+        await some_locator.wait_for()
+        # Do stuff with locator...
+        context.log.info(f'Playwright processing of: {context.request.url} ...')
+
+    @crawler.router.default_handler
+    async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
+        context.log.info(f'User handler processing: {context.request.url} ...')
+        # Do some processing using `parsed_content`
+        context.log.info(context.parsed_content.title)
+
+        # Find more links and enqueue them.
+        await context.enqueue_links()
+        await context.push_data({'Top crawler Url': context.request.url})
+
+    @crawler.pre_navigation_hook
+    async def hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
+        """Hook executed both in static sub crawler and playwright sub crawler."""
+        # Trying to access context.page in this hook would raise `AdaptiveContextError` for pages crawled
+        # without playwright.
+        context.log.info(f'pre navigation hook for: {context.request.url} ...')
+
+    @crawler.pre_navigation_hook(playwright_only=True)
+    async def hook_playwright(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
+        """Hook executed only in playwright sub crawler."""
+
+        async def some_routing_function(route: Route) -> None:
+            await route.continue_()
+
+        await context.page.route('*/**', some_routing_function)
+        context.log.info(f'Playwright only pre navigation hook for: {context.request.url} ...')
+
+    # Run the crawler with the initial list of URLs.
+    await crawler.run(['https://warehouse-theme-metal.myshopify.com/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

src/crawlee/_types.py

Lines changed: 4 additions & 0 deletions
@@ -565,3 +565,7 @@ class BasicCrawlingContext:
 
     log: logging.Logger
     """Logger instance."""
+
+    def __hash__(self) -> int:
+        """Return hash of the context. Each context is considered unique."""
+        return id(self)
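
The identity-based `__hash__` makes every context instance hashable and unique, so contexts can serve as `dict` keys or `set` members even when their field values coincide, which the adaptive crawler presumably needs when correlating results from its two sub crawlers. A minimal sketch of the semantics, using a stand-in class rather than the real dataclass:

class _ContextStandIn:
    """Illustrative stand-in for the identity-based hashing added to BasicCrawlingContext."""

    def __init__(self, url: str) -> None:
        self.url = url

    def __hash__(self) -> int:
        # Each instance is considered unique, regardless of equal field values.
        return id(self)


a = _ContextStandIn('https://example.com')
b = _ContextStandIn('https://example.com')

assert hash(a) != hash(b)  # distinct live objects have distinct ids
assert len({a, b}) == 2  # both instances coexist in a set despite equal fields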

src/crawlee/crawlers/__init__.py

Lines changed: 24 additions & 0 deletions
@@ -18,10 +18,31 @@
 with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'):
     from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
 
+with _try_import(
+    __name__,
+    'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlingContext',
+    'AdaptivePlaywrightPreNavCrawlingContext',
+    'RenderingType',
+    'RenderingTypePrediction',
+    'RenderingTypePredictor',
+):
+    from ._adaptive_playwright import (
+        AdaptivePlaywrightCrawler,
+        AdaptivePlaywrightCrawlingContext,
+        AdaptivePlaywrightPreNavCrawlingContext,
+        RenderingType,
+        RenderingTypePrediction,
+        RenderingTypePredictor,
+    )
+
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlingContext',
+    'AdaptivePlaywrightPreNavCrawlingContext',
     'BasicCrawler',
     'BasicCrawlerOptions',
     'BasicCrawlingContext',
@@ -39,4 +60,7 @@
     'PlaywrightCrawler',
     'PlaywrightCrawlingContext',
     'PlaywrightPreNavCrawlingContext',
+    'RenderingType',
+    'RenderingTypePrediction',
+    'RenderingTypePredictor',
 ]
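
The new `RenderingType*` exports hint at the adaptive crawler's extension point: a predictor that guesses whether a page needs browser rendering. A hedged sketch of a custom predictor follows; the method names `predict` and `store_result`, the `RenderingTypePrediction` fields, and the `rendering_type_predictor` keyword argument are assumptions not shown in this diff:

from crawlee import Request
from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    RenderingType,
    RenderingTypePrediction,
    RenderingTypePredictor,
)


class StaticFirstPredictor(RenderingTypePredictor):
    """Illustrative predictor that always recommends the static sub crawler first."""

    def predict(self, request: Request) -> RenderingTypePrediction:  # assumed signature
        # A small detection probability asks the crawler to occasionally verify the
        # guess by running both sub crawlers and comparing their results.
        return RenderingTypePrediction(rendering_type='static', detection_probability_recommendation=0.1)

    def store_result(self, request: Request, rendering_type: RenderingType) -> None:  # assumed signature
        """A real predictor would learn from this feedback; the sketch ignores it."""


# `rendering_type_predictor` is an assumed constructor argument.
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
    rendering_type_predictor=StaticFirstPredictor(),
)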

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 46 additions & 10 deletions
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Generic
 
 from pydantic import ValidationError
-from typing_extensions import NotRequired, TypeVar
+from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import EnqueueStrategy, RequestTransformAction
 from crawlee._request import Request, RequestOptions
@@ -14,6 +14,7 @@
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
 from crawlee.http_clients import HttpxHttpClient
+from crawlee.statistics import StatisticsState
 
 from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult
 
@@ -27,24 +28,33 @@
 from ._abstract_http_parser import AbstractHttpParser
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
+TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
-@docs_group('Data structures')
-class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlingContext]):
-    """Arguments for the `AbstractHttpCrawler` constructor.
-
-    It is intended for typing forwarded `__init__` arguments in the subclasses.
-    """
-
+class _HttpCrawlerAdditionalOptions(TypedDict):
     additional_http_error_status_codes: NotRequired[Iterable[int]]
     """Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""
 
     ignore_http_error_status_codes: NotRequired[Iterable[int]]
    """HTTP status codes that are typically considered errors but should be treated as successful responses."""
 
 
+@docs_group('Data structures')
+class HttpCrawlerOptions(
+    Generic[TCrawlingContext, TStatisticsState],
+    _HttpCrawlerAdditionalOptions,
+    BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+
 @docs_group('Abstract classes')
-class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
+class AbstractHttpCrawler(
+    Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
+):
     """A web crawler for performing HTTP requests.
 
     The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
@@ -65,7 +75,7 @@ def __init__(
         parser: AbstractHttpParser[TParseResult],
         additional_http_error_status_codes: Iterable[int] = (),
         ignore_http_error_status_codes: Iterable[int] = (),
-        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext]],
+        **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
@@ -87,6 +97,32 @@ def __init__(
         kwargs.setdefault('_logger', logging.getLogger(__name__))
         super().__init__(**kwargs)
 
+    @classmethod
+    def create_parsed_http_crawler_class(
+        cls,
+        static_parser: AbstractHttpParser[TParseResult],
+    ) -> type[AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]]:
+        """Convenience class factory that creates specific version of `AbstractHttpCrawler` class.
+
+        In general typing sense two generic types of `AbstractHttpCrawler` do not have to be dependent on each
+        other. This is convenience constructor for specific cases when `TParseResult` is used to specify both
+        generic parameters in `AbstractHttpCrawler`.
+        """
+
+        class _ParsedHttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult]):
+            def __init__(
+                self,
+                parser: AbstractHttpParser[TParseResult] = static_parser,
+                **kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
+            ) -> None:
+                kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
+                super().__init__(
+                    parser=parser,
+                    **kwargs,
+                )
+
+        return _ParsedHttpCrawler
+
     def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpCrawlingContext[TParseResult]]:
         """Create static content crawler context pipeline with expected pipeline steps."""
         return (
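
The `create_parsed_http_crawler_class` factory is what lets a single parser type drive both generic parameters, and it is presumably how the adaptive crawler builds its static sub crawler from a chosen parser. A sketch under stated assumptions: `MyParser` is a hypothetical stand-in for a concrete `AbstractHttpParser` implementation (such as a BeautifulSoup- or Parsel-based parser), and its abstract methods are omitted, so this class alone cannot actually be instantiated:

from crawlee.crawlers import AbstractHttpCrawler, AbstractHttpParser


class MyParser(AbstractHttpParser[dict]):
    """Hypothetical parser; a real one implements the abstract parsing methods."""

    ...  # abstract methods omitted for brevity


# The factory returns a crawler class whose contexts expose the parser's result
# type as `parsed_content`, with the static content pipeline wired up by the
# generated `__init__` shown in the diff above.
MyStaticCrawler = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=MyParser())
crawler = MyStaticCrawler(max_requests_per_crawl=10)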
src/crawlee/crawlers/_adaptive_playwright/__init__.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+try:
+    from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
+except ImportError as exc:
+    raise ImportError(
+        "To import this, you need to install the 'adaptive-playwright' extra. "
+        "For example, if you use pip, run `pip install 'crawlee[adaptive-playwright]'`.",
+    ) from exc
+
+from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+from ._adaptive_playwright_crawling_context import (
+    AdaptivePlaywrightCrawlingContext,
+    AdaptivePlaywrightPreNavCrawlingContext,
+)
+
+__all__ = [
+    'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlingContext',
+    'AdaptivePlaywrightPreNavCrawlingContext',
+    'RenderingType',
+    'RenderingTypePrediction',
+    'RenderingTypePredictor',
+]
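
This guard makes the optional dependency fail fast with an actionable message. A small check grounded in the code above; importing the subpackage triggers the try/except at its top:

# If the 'adaptive-playwright' extra is not installed, importing the subpackage
# raises the actionable ImportError defined above; otherwise the import succeeds.
try:
    from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawler
except ImportError as exc:
    print(exc)  # suggests: pip install 'crawlee[adaptive-playwright]'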
