Add crawl_one_required_contexts property. (Alternative to accessing internals of sub crawlers)

Clean up commit results.
Pijukatel committed Jan 7, 2025
1 parent 957915a commit 340c53d
Showing 4 changed files with 44 additions and 14 deletions.
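
In short, this commit replaces direct access to sub-crawler internals (such as the private `_additional_context_managers` attribute visible in the diff below) with a public `crawl_one_required_contexts` property that subclasses extend. A minimal sketch of the contract, simplified from the diff (`BasicCrawlerSketch` is an illustrative stand-in, not the real class):

    # Sketch of the contract introduced here; per the diff below, `statistics`
    # is the only required context at the BasicCrawler level.
    from contextlib import AbstractAsyncContextManager

    class BasicCrawlerSketch:
        statistics: AbstractAsyncContextManager

        @property
        def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
            """Contexts that have to be active before `crawl_one` can be called."""
            return [self.statistics]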
@@ -2,6 +2,7 @@

import asyncio
import logging
from contextlib import AsyncExitStack
from copy import deepcopy
from logging import getLogger
from random import random
@@ -189,14 +190,27 @@ async def run(
            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
                request queue will be purged.
        """
        # TODO: Create something more robust that does not leak implementation so much
        async with (self.beautifulsoup_crawler.statistics, self.playwright_crawler.statistics,
                    self.playwright_crawler._additional_context_managers[0]):
        contexts_to_enter = [
            cm
            for cm in (self.beautifulsoup_crawler.crawl_one_required_contexts
                       + self.playwright_crawler.crawl_one_required_contexts)
            if cm and getattr(cm, 'active', False) is False
        ]

        # Enter contexts required by the sub crawlers so that they can run `crawl_one`
        async with AsyncExitStack() as exit_stack:
            for context in contexts_to_enter:
                await exit_stack.enter_async_context(context)
            return await super().run(requests=requests, purge_request_queue=purge_request_queue)

        # AsyncExitStack can in theory swallow exceptions and so the return might not execute.
        # https://github.com/python/mypy/issues/7726
        raise RuntimeError('FinalStatistics not created.')
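
The terminal `raise` above is unreachable in practice, but mypy cannot prove that: `AsyncExitStack.__aexit__` returns `bool`, so the checker assumes the block might suppress an exception and skip the `return` (python/mypy#7726, linked in the comment). A standalone sketch of the same pattern:

    # Minimal illustration of the mypy limitation referenced above: without
    # the trailing raise, mypy reports a missing return for this function,
    # since the context manager could in theory swallow an exception.
    from contextlib import AsyncExitStack

    async def example() -> int:
        async with AsyncExitStack():
            return 1
        raise RuntimeError('Unreachable in practice; satisfies the type checker.')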


    # Can't use override as mypy does not like it for double underscore private method.
    async def _BasicCrawler__run_request_handler(self, context: BasicCrawlingContext) -> None:  # noqa: N802
        """Overrided BasicCrawler method that delegates request processing to sub crawlers.
        """Override BasicCrawler method that delegates request processing to sub crawlers.

        To decide which sub crawler should process the request, it runs `rendering_type_predictor`.
        To check whether results are valid, it uses `result_checker`.
@@ -271,17 +285,16 @@ async def _run_subcrawler(crawler: BeautifulSoupCrawler | PlaywrightCrawler,
        self.rendering_type_predictor.store_result(context.request.url, context.request.label, detection_result)

    async def commit_result(self, result: RequestHandlerRunResult, context: BasicCrawlingContext) -> None:
        result_tasks = []
        result_tasks.extend([
            asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls])
        result_tasks.extend([
            asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls])
        result_tasks = [
            asyncio.create_task(context.push_data(**kwargs)) for kwargs in result.push_data_calls
        ] + [
            asyncio.create_task(context.add_requests(**kwargs)) for kwargs in result.add_requests_calls
        ] + [
            asyncio.create_task(self._commit_key_value_store_changes(result))
        ]

        # What to do with KV changes????
        await asyncio.gather(*result_tasks)

        # Optimize if needed
        await self._commit_key_value_store_changes(result)


    def pre_navigation_hook(self, hook: Callable[[Any], Awaitable[None]]) -> None:
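The `commit_result` rewrite above folds the key-value store commit into the same `asyncio.gather` as the `push_data` and `add_requests` replays, resolving the removed "What to do with KV changes????" note. A generic sketch of the record-and-replay pattern it relies on (the names here are illustrative, not the real crawlee API):

    # Illustrative only: a result object records keyword arguments instead of
    # performing calls; the committer later replays them all concurrently.
    import asyncio
    from collections.abc import Callable, Coroutine
    from typing import Any

    async def replay_calls(
        calls: list[dict[str, Any]],
        target: Callable[..., Coroutine[Any, Any, None]],
    ) -> None:
        # Each recorded kwargs dict becomes one concurrent task on the real target.
        tasks = [asyncio.create_task(target(**kwargs)) for kwargs in calls]
        await asyncio.gather(*tasks)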
8 changes: 8 additions & 0 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -523,6 +523,7 @@ def sigint_handler() -> None:

        return final_statistics


    async def _run_crawler(self) -> None:
        event_manager = service_locator.get_event_manager()

@@ -1122,6 +1123,13 @@ async def __run_request_handler(self, context: BasicCrawlingContext) -> None:
        await self._context_pipeline(context, self.router)


    @property
    def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
        """Contexts that have to be active before `crawl_one` can be called."""
        contexts: list[AbstractAsyncContextManager] = []
        contexts.append(self.statistics)
        return contexts

    async def crawl_one(self, *, context: BasicCrawlingContext,
                        request_handler_timeout: timedelta,
                        result: RequestHandlerRunResult,
9 changes: 9 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -20,6 +20,7 @@

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator, Awaitable, Mapping
    from contextlib import AbstractAsyncContextManager

    from typing_extensions import Unpack

@@ -285,3 +286,11 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
            hook: A coroutine function to be called before each navigation.
        """
        self._pre_navigation_hooks.append(hook)


    @property
    def crawl_one_required_contexts(self) -> list[AbstractAsyncContextManager]:
        """Contexts that have to be active before `crawl_one` can be called."""
        contexts = super().crawl_one_required_contexts
        contexts.append(self._browser_pool)
        return contexts
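
Taken together, the two property implementations let a composing crawler enter everything a sub crawler needs without touching private attributes. A minimal consumer sketch, assuming only an object that exposes the new property and mirroring the `getattr` guard from the adaptive crawler's `run()`:

    # Minimal consumer sketch: enter every not-yet-active required context
    # on a shared AsyncExitStack before calling `crawl_one`.
    from contextlib import AsyncExitStack
    from typing import Any

    async def enter_required_contexts(crawler: Any, exit_stack: AsyncExitStack) -> None:
        for cm in crawler.crawl_one_required_contexts:
            # Same guard as in the diff: skip managers that are already active.
            if cm and getattr(cm, 'active', False) is False:
                await exit_stack.enter_async_context(cm)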
@@ -3,7 +3,7 @@
import logging
from datetime import timedelta
from itertools import cycle
from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, Any, cast
from unittest.mock import Mock, patch

import pytest
@@ -250,7 +250,7 @@ def test_adaptive_default_hooks_raise_exception() -> None:

    with pytest.raises(RuntimeError):
        @crawler.pre_navigation_hook
        def some_hook() -> None:
        async def some_hook(whatever: Any) -> None:
            pass


