Add crawl_one_required_contexts property. (Alternative to accessing internals of sub crawlers)

Clean up commit results.
Pijukatel committed Jan 7, 2025
1 parent 957915a commit 340c53d
Showing 4 changed files with 44 additions and 14 deletions.
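
In short, this commit replaces direct access to sub-crawler internals (such as the private `_additional_context_managers` attribute visible in the diff below) with a public `crawl_one_required_contexts` property that subclasses extend. A minimal sketch of the contract, simplified from the diff (`BasicCrawlerSketch` is an illustrative stand-in, not the real class):

    # Sketch of the contract introduced here; per the diff below, `statistics`
    # is the only required context at the BasicCrawler level.
    from contextlib import AbstractAsyncContextManager

    class BasicCrawlerSketch:
        statistics: AbstractAsyncContextManager

        @property
        def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
            """Contexts that have to be active before `crawl_one` can be called."""
            return [self.statistics]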
@@ -2,6 +2,7 @@

import asyncio
import logging
from contextlib import AsyncExitStack
from copy import deepcopy
from logging import getLogger
from random import random
@@ -189,14 +190,27 @@ async def run(
            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
                request queue will be purged.
        """
        # TODO: Create something more robust that does not leak implementation so much
        async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics,
                    self.playwright_crawler._additional_context_managers[0]):
        contexts_to_enter = [
            cm
            for cm in (self.beautifulsoup_crawler.crawl_one_required_contexts
                       + self.playwright_crawler.crawl_one_required_contexts)
            if cm and getattr(cm, 'active', False) is False
        ]

        # Enter contexts required by the sub crawlers so that they can run `crawl_one`
        async with AsyncExitStack() as exit_stack:
            for context in contexts_to_enter:
                await exit_stack.enter_async_context(context)
            return await super().run(requests=requests, purge_request_queue=purge_request_queue)

        # AsyncExitStack can in theory swallow exceptions and so the return might not execute.
        # https://github.com/python/mypy/issues/7726
        raise RuntimeError('FinalStatistics not created.')
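
The terminal `raise` above is unreachable in practice, but mypy cannot prove that: `AsyncExitStack.__aexit__` returns `bool`, so the checker assumes the block might suppress an exception and skip the `return` (python/mypy#7726, linked in the comment). A standalone sketch of the same pattern:

    # Minimal illustration of the mypy limitation referenced above: without
    # the trailing raise, mypy reports a missing return for this function,
    # since the context manager could in theory swallow an exception.
    from contextlib import AsyncExitStack

    async def example() -> int:
        async with AsyncExitStack():
            return 1
        raise RuntimeError('Unreachable in practice; satisfies the type checker.')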


    # Can't use override as mypy does not like it for double underscore private method.
    async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None:  # noqa: N802
        """Overrided BasicCrawler method that delegates request processing to sub crawlers.
        """Override BasicCrawler method that delegates request processing to sub crawlers.

        To decide which sub crawler should process the request, it runs `rendering_type_predictor`.
        To check whether results are valid, it uses `result_checker`.
@@ -271,17 +285,16 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler,
        self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result)

    async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None:
        result_tasks = []
        result_tasks.extend([
            asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls])
        result_tasks.extend([
            asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls])
        result_tasks = [
            asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls
        ] + [
            asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls
        ] + [
            asyncio.create_task(self._commit_key_value_store_changes(result))
        ]

        # What to do with KV changes????
        await asyncio.gather(*result_tasks)

        # Optimize if needed
        await self._commit_key_value_store_changes(result)


    def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None:
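The `commit_result` rewrite above folds the key-value store commit into the same `asyncio.gather` as the `push_data` and `add_requests` replays, resolving the removed "What to do with KV changes????" note. A generic sketch of the record-and-replay pattern it relies on (the names here are illustrative, not the real crawlee API):

    # Illustrative only: a result object records keyword arguments instead of
    # performing calls; the committer later replays them all concurrently.
    import asyncio
    from collections.abc import Callable, Coroutine
    from typing import Any

    async def replay_calls(
        calls: list[dict[str, Any]],
        target: Callable[..., Coroutine[Any, Any, None]],
    ) -> None:
        # Each recorded kwargs dict becomes one concurrent task on the real target.
        tasks = [asyncio.create_task(target(**kwargs)) for kwargs in calls]
        await asyncio.gather(*tasks)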
8 changes: 8 additions & 0 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -523,6 +523,7 @@ def sigint_handler() -> None:

        return final_statistics


    async def _run_crawler(self) -> None:
        event_manager = service_locator.get_event_manager()

@@ -1122,6 +1123,13 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None:
        await self._context_pipeline(context, self.router)


    @property
    def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
        """Contexts that have to be active before `crawl_one` can be called."""
        contexts: list[AbstractAsyncContextManager] = []
        contexts.append(self.statistics)
        return contexts

    async def crawl_one(self, *, context: BasicCrawlingContext,
                        request_handler_timeout: timedelta,
                        result: RequestHandlerRunResult,
9 changes: 9 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -20,6 +20,7 @@

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Mapping
    from contextlib import AbstractAsyncContextManager

    from typing_extensions import Unpack

@@ -285,3 +286,11 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
            hook: A coroutine function to be called before each navigation.
        """
        self._pre_navigation_hooks.append(hook)


    @property
    def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
        """Contexts that have to be active before `crawl_one` can be called."""
        contexts = super().crawl_one_required_contexts
        contexts.append(self._browser_pool)
        return contexts
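
Taken together, the two property implementations let a composing crawler enter everything a sub crawler needs without touching private attributes. A minimal consumer sketch, assuming only an object that exposes the new property and mirroring the `getattr` guard from the adaptive crawler's `run()`:

    # Minimal consumer sketch: enter every not-yet-active required context
    # on a shared AsyncExitStack before calling `crawl_one`.
    from contextlib import AsyncExitStack
    from typing import Any

    async def enter_required_contexts(crawler: Any, exit_stack: AsyncExitStack) -> None:
        for cm in crawler.crawl_one_required_contexts:
            # Same guard as in the diff: skip managers that are already active.
            if cm and getattr(cm, 'active', False) is False:
                await exit_stack.enter_async_context(cm)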
@@ -3,7 +3,7 @@
import logging
from datetime import timedelta
from itertools import cycle
from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, Any, cast
from unittest.mock import Mock, patch

import pytest
@@ -250,7 +250,7 @@ def test_adaptive_default_hooks_raise_exception() -> None:

    with pytest.raises(RuntimeError):
        @crawler.pre_navigation_hook
        def some_hook() -> None:
        async def some_hook(whatever: Any) -> None:
            pass


