diff --git a/Makefile b/Makefile index 707ebec7..73f69455 100644 --- a/Makefile +++ b/Makefile @@ -26,13 +26,13 @@ type-check: uv run mypy unit-tests: - uv run pytest --numprocesses=auto --verbose --cov=src/apify tests/unit + uv run pytest --numprocesses=auto -vv --cov=src/apify tests/unit unit-tests-cov: - uv run pytest --numprocesses=auto --verbose --cov=src/apify --cov-report=html tests/unit + uv run pytest --numprocesses=auto -vv --cov=src/apify --cov-report=html tests/unit integration-tests: - uv run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) --verbose tests/integration + uv run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) -vv tests/integration format: uv run ruff check --fix diff --git a/docs/03_concepts/code/03_dataset_exports.py b/docs/03_concepts/code/03_dataset_exports.py index 78f0f5b9..4f0c01c4 100644 --- a/docs/03_concepts/code/03_dataset_exports.py +++ b/docs/03_concepts/code/03_dataset_exports.py @@ -11,14 +11,14 @@ async def main() -> None: await dataset.export_to( content_type='csv', key='data.csv', - to_key_value_store_name='my-cool-key-value-store', + to_kvs_name='my-cool-key-value-store', ) # Export the data as JSON await dataset.export_to( content_type='json', key='data.json', - to_key_value_store_name='my-cool-key-value-store', + to_kvs_name='my-cool-key-value-store', ) # Print the exported records diff --git a/docs/03_concepts/code/conditional_actor_charge.py b/docs/03_concepts/code/conditional_actor_charge.py index 926c591d..f4695cc4 100644 --- a/docs/03_concepts/code/conditional_actor_charge.py +++ b/docs/03_concepts/code/conditional_actor_charge.py @@ -6,8 +6,8 @@ async def main() -> None: # Check the dataset because there might already be items # if the run migrated or was restarted default_dataset = await Actor.open_dataset() - dataset_info = await default_dataset.get_info() - charged_items = dataset_info.item_count if dataset_info else 0 + metadata = await default_dataset.get_metadata() + charged_items = metadata.item_count # highlight-start if Actor.get_charging_manager().get_pricing_info().is_pay_per_event: diff --git a/pyproject.toml b/pyproject.toml index 4f9d2930..10ec8cea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,9 +34,10 @@ keywords = [ "scraping", ] dependencies = [ - "apify-client>=1.11.0", + "apify-client>=1.12.0", "apify-shared>=1.3.0", - "crawlee~=0.6.0", + "cachetools>=5.5.0", + "crawlee@git+https://github.com/apify/crawlee-python.git@9dfac4b8afb8027979d85947f0db303f384b7158", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy @@ -76,11 +77,15 @@ dev = [ "respx~=0.22.0", "ruff~=0.12.0", "setuptools", # setuptools are used by pytest but not explicitly required + "types-cachetools>=6.0.0.20250525", ] [tool.hatch.build.targets.wheel] packages = ["src/apify"] +[tool.hatch.metadata] +allow-direct-references = true + [tool.ruff] line-length = 120 include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"] diff --git a/src/apify/_actor.py b/src/apify/_actor.py index e69e213b..78e17bc5 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -33,8 +33,8 @@ from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager from apify._proxy_configuration import ProxyConfiguration from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython -from apify.apify_storage_client import ApifyStorageClient from apify.log import _configure_logging, logger +from apify.storage_clients import 
ApifyStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: @@ -89,7 +89,7 @@ def __init__( # Create an instance of the cloud storage client, the local storage client is obtained # from the service locator. - self._cloud_storage_client = ApifyStorageClient.from_config(config=self._configuration) + self._cloud_storage_client = ApifyStorageClient() # Set the event manager based on whether the Actor is running on the platform or locally. self._event_manager = ( diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 4e12304c..aa584055 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -140,6 +140,39 @@ class Configuration(CrawleeConfiguration): ), ] = None + default_dataset_id: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_default_dataset_id', + 'apify_default_dataset_id', + ), + description='Default dataset ID used by the Apify storage client when no ID or name is provided.', + ), + ] = 'default' + + default_key_value_store_id: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_default_key_value_store_id', + 'apify_default_key_value_store_id', + ), + description='Default key-value store ID for the Apify storage client when no ID or name is provided.', + ), + ] = 'default' + + default_request_queue_id: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_default_request_queue_id', + 'apify_default_request_queue_id', + ), + description='Default request queue ID for the Apify storage client when no ID or name is provided.', + ), + ] = 'default' + disable_outdated_warning: Annotated[ bool, Field( diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index 1d5b9f72..f56cb2a1 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -21,7 +21,8 @@ if TYPE_CHECKING: from apify_client import ApifyClientAsync - from crawlee import Request + + from apify import Request APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/apify_storage_client/__init__.py deleted file mode 100644 index 8b6d517c..00000000 --- a/src/apify/apify_storage_client/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from apify.apify_storage_client._apify_storage_client import ApifyStorageClient - -__all__ = ['ApifyStorageClient'] diff --git a/src/apify/apify_storage_client/_apify_storage_client.py b/src/apify/apify_storage_client/_apify_storage_client.py deleted file mode 100644 index 51e3fc24..00000000 --- a/src/apify/apify_storage_client/_apify_storage_client.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from apify_client import ApifyClientAsync -from crawlee._utils.crypto import crypto_random_object_id -from crawlee.storage_clients import StorageClient - -from apify._utils import docs_group -from apify.apify_storage_client._dataset_client import DatasetClient -from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient -from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient -from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient -from apify.apify_storage_client._request_queue_client import RequestQueueClient -from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient - 
-if TYPE_CHECKING: - from apify._configuration import Configuration - - -@docs_group('Classes') -class ApifyStorageClient(StorageClient): - """A storage client implementation based on the Apify platform storage.""" - - def __init__(self, *, configuration: Configuration) -> None: - self._client_key = crypto_random_object_id() - self._apify_client = ApifyClientAsync( - token=configuration.token, - api_url=configuration.api_base_url, - max_retries=8, - min_delay_between_retries_millis=500, - timeout_secs=360, - ) - self._configuration = configuration - - @classmethod - def from_config(cls, config: Configuration) -> ApifyStorageClient: - return cls(configuration=config) - - @override - def dataset(self, id: str) -> DatasetClient: - return DatasetClient(self._apify_client.dataset(id)) - - @override - def datasets(self) -> DatasetCollectionClient: - return DatasetCollectionClient(self._apify_client.datasets()) - - @override - def key_value_store(self, id: str) -> KeyValueStoreClient: - return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url) - - @override - def key_value_stores(self) -> KeyValueStoreCollectionClient: - return KeyValueStoreCollectionClient(self._apify_client.key_value_stores()) - - @override - def request_queue(self, id: str) -> RequestQueueClient: - return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key)) - - @override - def request_queues(self) -> RequestQueueCollectionClient: - return RequestQueueCollectionClient(self._apify_client.request_queues()) - - @override - async def purge_on_start(self) -> None: - pass - - @override - def get_rate_limit_errors(self) -> dict[int, int]: - return self._apify_client.stats.rate_limit_errors diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py deleted file mode 100644 index 93c8d575..00000000 --- a/src/apify/apify_storage_client/_dataset_client.py +++ /dev/null @@ -1,190 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import DatasetClient as BaseDatasetClient -from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata - -if TYPE_CHECKING: - from collections.abc import AsyncIterator - from contextlib import AbstractAsyncContextManager - - from httpx import Response - - from apify_client.clients import DatasetClientAsync - from crawlee._types import JsonSerializable - - -class DatasetClient(BaseDatasetClient): - """Dataset resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_dataset_client: DatasetClientAsync) -> None: - self._client = apify_dataset_client - - @override - async def get(self) -> DatasetMetadata | None: - result = await self._client.get() - return DatasetMetadata.model_validate(result) if result else None - - @override - async def update( - self, - *, - name: str | None = None, - ) -> DatasetMetadata: - return DatasetMetadata.model_validate( - await self._client.update( - name=name, - ) - ) - - @override - async def delete(self) -> None: - await self._client.delete() - - @override - async def list_items( - self, - *, - offset: int | None = 0, - limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT, # noqa: SLF001 - clean: bool = False, - desc: bool = False, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_hidden: bool = 
False, - flatten: list[str] | None = None, - view: str | None = None, - ) -> DatasetItemsListPage: - return DatasetItemsListPage.model_validate( - vars( - await self._client.list_items( - offset=offset, - limit=limit, - clean=clean, - desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - flatten=flatten, - view=view, - ) - ) - ) - - @override - async def iterate_items( - self, - *, - offset: int = 0, - limit: int | None = None, - clean: bool = False, - desc: bool = False, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_hidden: bool = False, - ) -> AsyncIterator[dict]: - async for item in self._client.iterate_items( - offset=offset, - limit=limit, - clean=clean, - desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - ): - yield item - - @override - async def get_items_as_bytes( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - flatten: list[str] | None = None, - ) -> bytes: - return await self._client.get_items_as_bytes( - item_format=item_format, - offset=offset, - limit=limit, - desc=desc, - clean=clean, - bom=bom, - delimiter=delimiter, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_header_row=skip_header_row, - skip_hidden=skip_hidden, - xml_root=xml_root, - xml_row=xml_row, - flatten=flatten, - ) - - @override - async def stream_items( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - ) -> AbstractAsyncContextManager[Response | None]: - return self._client.stream_items( - item_format=item_format, - offset=offset, - limit=limit, - desc=desc, - clean=clean, - bom=bom, - delimiter=delimiter, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_header_row=skip_header_row, - skip_hidden=skip_hidden, - xml_root=xml_root, - xml_row=xml_row, - ) - - @override - async def push_items(self, items: JsonSerializable) -> None: - await self._client.push_items( - items=items, - ) diff --git a/src/apify/apify_storage_client/_dataset_collection_client.py b/src/apify/apify_storage_client/_dataset_collection_client.py deleted file mode 100644 index f8ffc3e8..00000000 --- a/src/apify/apify_storage_client/_dataset_collection_client.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import DatasetCollectionClient as BaseDatasetCollectionClient -from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata - -if TYPE_CHECKING: - from apify_client.clients import DatasetCollectionClientAsync - - -class 
DatasetCollectionClient(BaseDatasetCollectionClient): - """Dataset collection resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync) -> None: - self._client = apify_dataset_collection_client - - @override - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> DatasetMetadata: - return DatasetMetadata.model_validate( - await self._client.get_or_create( - name=id if id is not None else name, - schema=schema, - ) - ) - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> DatasetListPage: - return DatasetListPage.model_validate( - await self._client.list( - unnamed=unnamed, - limit=limit, - offset=offset, - desc=desc, - ) - ) diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py deleted file mode 100644 index 49883b3f..00000000 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import annotations - -from contextlib import asynccontextmanager -from typing import TYPE_CHECKING, Any - -from typing_extensions import override -from yarl import URL - -from crawlee.storage_clients._base import KeyValueStoreClient as BaseKeyValueStoreClient -from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord - -from apify._crypto import create_hmac_signature - -if TYPE_CHECKING: - from collections.abc import AsyncIterator - from contextlib import AbstractAsyncContextManager - - from httpx import Response - - from apify_client.clients import KeyValueStoreClientAsync - - -class KeyValueStoreClient(BaseKeyValueStoreClient): - """Key-value store resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync, api_public_base_url: str) -> None: - self._client = apify_key_value_store_client - self._api_public_base_url = api_public_base_url - - @override - async def get(self) -> KeyValueStoreMetadata | None: - result = await self._client.get() - return KeyValueStoreMetadata.model_validate(result) if result else None - - @override - async def update( - self, - *, - name: str | None = None, - ) -> KeyValueStoreMetadata: - return KeyValueStoreMetadata.model_validate(await self._client.update()) - - @override - async def delete(self) -> None: - await self._client.delete() - - @override - async def list_keys( - self, - *, - limit: int = 1000, - exclusive_start_key: str | None = None, - ) -> KeyValueStoreListKeysPage: - return KeyValueStoreListKeysPage.model_validate(await self._client.list_keys()) - - @override - async def get_record(self, key: str) -> KeyValueStoreRecord | None: - result = await self._client.get_record(key) - return KeyValueStoreRecord.model_validate(result) if result else None - - @override - async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord | None: - result = await self._client.get_record_as_bytes(key) - return KeyValueStoreRecord.model_validate(result) if result else None - - @override - async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]: - return self._stream_record_internal(key) - - @asynccontextmanager - async def _stream_record_internal(self, key: str) -> 
AsyncIterator[KeyValueStoreRecord[Response] | None]: - async with self._client.stream_record(key) as response: - yield KeyValueStoreRecord.model_validate(response) - - @override - async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: - await self._client.set_record( - key=key, - value=value, - content_type=content_type, - ) - - @override - async def delete_record(self, key: str) -> None: - await self._client.delete_record( - key=key, - ) - - async def get_public_url(self, key: str) -> str: - """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. - - Args: - key: The key for which the URL should be generated. - """ - if self._client.resource_id is None: - raise ValueError('resource_id cannot be None when generating a public URL') - - public_url = ( - URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._client.resource_id / 'records' / key - ) - - key_value_store = await self.get() - - if key_value_store is not None and isinstance(key_value_store.model_extra, dict): - url_signing_secret_key = key_value_store.model_extra.get('urlSigningSecretKey') - if url_signing_secret_key: - public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) - - return str(public_url) diff --git a/src/apify/apify_storage_client/_key_value_store_collection_client.py b/src/apify/apify_storage_client/_key_value_store_collection_client.py deleted file mode 100644 index 0d4caca7..00000000 --- a/src/apify/apify_storage_client/_key_value_store_collection_client.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import KeyValueStoreCollectionClient as BaseKeyValueStoreCollectionClient -from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata - -if TYPE_CHECKING: - from apify_client.clients import KeyValueStoreCollectionClientAsync - - -class KeyValueStoreCollectionClient(BaseKeyValueStoreCollectionClient): - """Key-value store collection resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_dataset_collection_client: KeyValueStoreCollectionClientAsync) -> None: - self._client = apify_dataset_collection_client - - @override - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> KeyValueStoreMetadata: - return KeyValueStoreMetadata.model_validate( - await self._client.get_or_create( - name=id if id is not None else name, - schema=schema, - ) - ) - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> KeyValueStoreListPage: - return KeyValueStoreListPage.model_validate( - await self._client.list( - unnamed=unnamed, - limit=limit, - offset=offset, - desc=desc, - ) - ) diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py deleted file mode 100644 index 036eb2ab..00000000 --- a/src/apify/apify_storage_client/_request_queue_client.py +++ /dev/null @@ -1,176 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee import Request -from crawlee.storage_clients._base import RequestQueueClient as BaseRequestQueueClient -from crawlee.storage_clients.models import ( - 
BatchRequestsOperationResponse, - ProcessedRequest, - ProlongRequestLockResponse, - RequestQueueHead, - RequestQueueHeadWithLocks, - RequestQueueMetadata, -) - -if TYPE_CHECKING: - from collections.abc import Sequence - - from apify_client.clients import RequestQueueClientAsync - - -class RequestQueueClient(BaseRequestQueueClient): - """Request queue resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_request_queue_client: RequestQueueClientAsync) -> None: - self._client = apify_request_queue_client - - @override - async def get(self) -> RequestQueueMetadata | None: - result = await self._client.get() - return RequestQueueMetadata.model_validate({'resourceDirectory': ''} | result) if result else None - - @override - async def update( - self, - *, - name: str | None = None, - ) -> RequestQueueMetadata: - return RequestQueueMetadata.model_validate( - {'resourceDirectory': ''} - | await self._client.update( - name=name, - ) - ) - - @override - async def delete(self) -> None: - await self._client.delete() - - @override - async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: - return RequestQueueHead.model_validate( - await self._client.list_head( - limit=limit, - ), - ) - - @override - async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: - return RequestQueueHeadWithLocks.model_validate( - await self._client.list_and_lock_head( - lock_secs=lock_secs, - limit=limit, - ) - ) - - @override - async def add_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - return ProcessedRequest.model_validate( - {'id': request.id, 'uniqueKey': request.unique_key} - | await self._client.add_request( - request=request.model_dump( - by_alias=True, - exclude={ - 'id', - }, - ), - forefront=forefront, - ) - ) - - @override - async def get_request(self, request_id: str) -> Request | None: - result = await self._client.get_request(request_id) - return Request.model_validate(result) if result else None - - @override - async def update_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - return ProcessedRequest.model_validate( - {'id': request.id, 'uniqueKey': request.unique_key} - | await self._client.update_request( - request=request.model_dump( - by_alias=True, - ), - forefront=forefront, - ) - ) - - @override - async def delete_request(self, request_id: str) -> None: - await self._client.delete_request(request_id) - - @override - async def prolong_request_lock( - self, - request_id: str, - *, - forefront: bool = False, - lock_secs: int, - ) -> ProlongRequestLockResponse: - return ProlongRequestLockResponse.model_validate( - await self._client.prolong_request_lock( - request_id=request_id, - forefront=forefront, - lock_secs=lock_secs, - ) - ) - - @override - async def delete_request_lock( - self, - request_id: str, - *, - forefront: bool = False, - ) -> None: - await self._client.delete_request_lock( - request_id=request_id, - forefront=forefront, - ) - - @override - async def batch_add_requests( - self, - requests: Sequence[Request], - *, - forefront: bool = False, - ) -> BatchRequestsOperationResponse: - return BatchRequestsOperationResponse.model_validate( - await self._client.batch_add_requests( - requests=[ - r.model_dump( - by_alias=True, - exclude={ - 'id', - }, - ) - for r in requests - ], - forefront=forefront, - ) - ) - - @override - async def batch_delete_requests(self, requests: list[Request]) 
-> BatchRequestsOperationResponse: - return BatchRequestsOperationResponse.model_validate( - await self._client.batch_delete_requests( - requests=[ - r.model_dump( - by_alias=True, - ) - for r in requests - ], - ) - ) diff --git a/src/apify/apify_storage_client/_request_queue_collection_client.py b/src/apify/apify_storage_client/_request_queue_collection_client.py deleted file mode 100644 index 5bf28836..00000000 --- a/src/apify/apify_storage_client/_request_queue_collection_client.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import RequestQueueCollectionClient as BaseRequestQueueCollectionClient -from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata - -if TYPE_CHECKING: - from apify_client.clients import RequestQueueCollectionClientAsync - - -class RequestQueueCollectionClient(BaseRequestQueueCollectionClient): - """Request queue collection resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_request_queue_collection_client: RequestQueueCollectionClientAsync) -> None: - self._client = apify_request_queue_collection_client - - @override - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> RequestQueueMetadata: - return RequestQueueMetadata.model_validate( - {'resourceDirectory': ''} - | await self._client.get_or_create( - name=id if id is not None else name, - ) - ) - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> RequestQueueListPage: - return RequestQueueListPage.model_validate( - await self._client.list( - unnamed=unnamed, - limit=limit, - offset=offset, - desc=desc, - ) - ) diff --git a/src/apify/scrapy/extensions/_httpcache.py b/src/apify/scrapy/extensions/_httpcache.py index 509c4d8a..14d8753d 100644 --- a/src/apify/scrapy/extensions/_httpcache.py +++ b/src/apify/scrapy/extensions/_httpcache.py @@ -13,8 +13,8 @@ from scrapy.responsetypes import responsetypes from apify import Configuration -from apify.apify_storage_client import ApifyStorageClient from apify.scrapy._async_thread import AsyncThread +from apify.storage_clients import ApifyStorageClient from apify.storages import KeyValueStore if TYPE_CHECKING: @@ -51,10 +51,14 @@ def open_spider(self, spider: Spider) -> None: kvs_name = get_kvs_name(spider.name) async def open_kvs() -> KeyValueStore: - config = Configuration.get_global_configuration() - if config.is_at_home: - storage_client = ApifyStorageClient.from_config(config) - return await KeyValueStore.open(name=kvs_name, storage_client=storage_client) + configuration = Configuration.get_global_configuration() + if configuration.is_at_home: + storage_client = ApifyStorageClient() + return await KeyValueStore.open( + name=kvs_name, + configuration=configuration, + storage_client=storage_client, + ) return await KeyValueStore.open(name=kvs_name) logger.debug("Starting background thread for cache storage's event loop") diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index a262b920..63bba3c7 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -10,9 +10,10 @@ from scrapy.http.headers import Headers from scrapy.utils.request import request_from_dict -from crawlee import Request as ApifyRequest from crawlee._types import HttpHeaders 
+from apify import Request as ApifyRequest + logger = getLogger(__name__) diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index a243a368..2dcacd9a 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -11,7 +11,7 @@ from ._async_thread import AsyncThread from .requests import to_apify_request, to_scrapy_request from apify import Configuration -from apify.apify_storage_client import ApifyStorageClient +from apify.storage_clients import ApifyStorageClient from apify.storages import RequestQueue if TYPE_CHECKING: @@ -49,10 +49,13 @@ def open(self, spider: Spider) -> Deferred[None] | None: self.spider = spider async def open_rq() -> RequestQueue: - config = Configuration.get_global_configuration() - if config.is_at_home: - storage_client = ApifyStorageClient.from_config(config) - return await RequestQueue.open(storage_client=storage_client) + configuration = Configuration.get_global_configuration() + if configuration.is_at_home: + storage_client = ApifyStorageClient() + return await RequestQueue.open( + configuration=configuration, + storage_client=storage_client, + ) return await RequestQueue.open() try: diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py new file mode 100644 index 00000000..ca93ae43 --- /dev/null +++ b/src/apify/storage_clients/__init__.py @@ -0,0 +1,9 @@ +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient + +from ._apify import ApifyStorageClient + +__all__ = [ + 'ApifyStorageClient', + 'FileSystemStorageClient', + 'MemoryStorageClient', +] diff --git a/src/apify/storage_clients/_apify/__init__.py b/src/apify/storage_clients/_apify/__init__.py new file mode 100644 index 00000000..4af7c8ee --- /dev/null +++ b/src/apify/storage_clients/_apify/__init__.py @@ -0,0 +1,11 @@ +from ._dataset_client import ApifyDatasetClient +from ._key_value_store_client import ApifyKeyValueStoreClient +from ._request_queue_client import ApifyRequestQueueClient +from ._storage_client import ApifyStorageClient + +__all__ = [ + 'ApifyDatasetClient', + 'ApifyKeyValueStoreClient', + 'ApifyRequestQueueClient', + 'ApifyStorageClient', +] diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py new file mode 100644 index 00000000..f9bf3d6a --- /dev/null +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -0,0 +1,288 @@ +from __future__ import annotations + +import asyncio +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from typing_extensions import override + +from apify_client import ApifyClientAsync +from crawlee._utils.byte_size import ByteSize +from crawlee._utils.file import json_dumps +from crawlee.storage_clients._base import DatasetClient +from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from apify_client.clients import DatasetClientAsync + from crawlee._types import JsonSerializable + + from apify import Configuration + +logger = getLogger(__name__) + + +class ApifyDatasetClient(DatasetClient): + """An Apify platform implementation of the dataset client.""" + + _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9) + """Maximum size for a single payload.""" + + _SAFETY_BUFFER_COEFFICIENT = 0.01 / 100 # 0.01% + """Percentage buffer to reduce payload limit slightly for safety.""" + + _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_COEFFICIENT) + """Calculated 
payload limit considering safety buffer.""" + + def __init__( + self, + *, + api_client: DatasetClientAsync, + api_public_base_url: str, + lock: asyncio.Lock, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyDatasetClient.open` class method to create a new instance. + """ + self._api_client = api_client + """The Apify dataset client for API operations.""" + + self._api_public_base_url = api_public_base_url + """The public base URL of the Apify API.""" + + self._lock = lock + """A lock to ensure that only one operation is performed at a time.""" + + @override + async def get_metadata(self) -> DatasetMetadata: + metadata = await self._api_client.get() + return DatasetMetadata.model_validate(metadata) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyDatasetClient: + """Open an Apify dataset client. + + This method creates and initializes a new instance of the Apify dataset client. + It handles authentication, storage lookup/creation, and metadata retrieval. + + Args: + id: The ID of an existing dataset to open. If provided, the client will connect to this specific storage. + Cannot be used together with `name`. + name: The name of a dataset to get or create. If a storage with this name exists, it will be opened; + otherwise, a new one will be created. Cannot be used together with `id`. + configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither + `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available in + the configuration. + """ + token = configuration.token + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = configuration.api_base_url + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + + api_public_base_url = configuration.api_public_base_url + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).' + ) + + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Create Apify client with the provided token and API URL. + apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + apify_datasets_client = apify_client_async.datasets() + + # If name is provided, get or create the storage by name. + if name is not None and id is None: + id = DatasetMetadata.model_validate( + await apify_datasets_client.get_or_create(name=name), + ).id + + # If both id and name are None, try to get the default storage ID from environment variables. + if id is None and name is None: + id = getattr(configuration, 'default_dataset_id', None) + + if id is None: + raise ValueError( + 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' + ) + + # Get the client for the specific storage by ID.
+ apify_dataset_client = apify_client_async.dataset(dataset_id=id) + + return cls( + api_client=apify_dataset_client, + api_public_base_url=api_public_base_url, + lock=asyncio.Lock(), + ) + + @override + async def purge(self) -> None: + raise NotImplementedError( + 'Purging datasets is not supported in the Apify platform. ' + 'Use the `drop` method to delete the dataset instead.' + ) + + @override + async def drop(self) -> None: + async with self._lock: + await self._api_client.delete() + + @override + async def push_data(self, data: list[Any] | dict[str, Any]) -> None: + async def payloads_generator() -> AsyncIterator[str]: + for index, item in enumerate(data): + yield await self._check_and_serialize(item, index) + + async with self._lock: + # Handle lists + if isinstance(data, list): + # Invoke client in series to preserve the order of data + async for items in self._chunk_by_size(payloads_generator()): + await self._api_client.push_items(items=items) + + # Handle singular items + else: + items = await self._check_and_serialize(data) + await self._api_client.push_items(items=items) + + @override + async def get_data( + self, + *, + offset: int = 0, + limit: int | None = 999_999_999_999, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + response = await self._api_client.list_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + flatten=flatten, + view=view, + ) + return DatasetItemsListPage.model_validate(vars(response)) + + @override + async def iterate_items( + self, + *, + offset: int = 0, + limit: int | None = None, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> AsyncIterator[dict]: + async for item in self._api_client.iterate_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + ): + yield item + + @classmethod + async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str: + """Serialize a given item to JSON and check its serializability and size against a limit. + + Args: + item: The item to serialize. + index: Index of the item, used for error context. + + Returns: + Serialized JSON string. + + Raises: + ValueError: If item is not JSON serializable or exceeds size limit. + """ + s = ' ' if index is None else f' at index {index} ' + + try: + payload = await json_dumps(item) + except Exception as exc: + raise ValueError(f'Data item{s}is not serializable to JSON.') from exc + + payload_size = ByteSize(len(payload.encode('utf-8'))) + if payload_size > cls._EFFECTIVE_LIMIT_SIZE: + raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})') + + return payload + + async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]: + """Yield chunks of JSON arrays composed of input strings, respecting a size limit. + + Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size + of each array does not exceed `_EFFECTIVE_LIMIT_SIZE`.
Each output is a JSON array string that + contains as many payloads as possible without breaching the size threshold, maintaining the + order of the original payloads. Assumes individual items are below the size limit. + + Args: + items: Iterable of JSON string payloads. + + Yields: + Strings representing JSON arrays of payloads, each staying within the size limit. + """ + last_chunk_size = ByteSize(2) # Add 2 bytes for [] wrapper. + current_chunk = [] + + async for payload in items: + payload_size = ByteSize(len(payload.encode('utf-8'))) + + if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE: + current_chunk.append(payload) + last_chunk_size += payload_size + ByteSize(1) # Add 1 byte for ',' separator. + else: + yield f'[{",".join(current_chunk)}]' + current_chunk = [payload] + last_chunk_size = payload_size + ByteSize(2) # Add 2 bytes for [] wrapper. + + yield f'[{",".join(current_chunk)}]' diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py new file mode 100644 index 00000000..8a1c5433 --- /dev/null +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -0,0 +1,238 @@ +from __future__ import annotations + +import asyncio +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from typing_extensions import override +from yarl import URL + +from apify_client import ApifyClientAsync +from crawlee.storage_clients._base import KeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata + +from ._models import KeyValueStoreListKeysPage +from apify._crypto import create_hmac_signature + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from apify_client.clients import KeyValueStoreClientAsync + + from apify import Configuration + +logger = getLogger(__name__) + + +class ApifyKeyValueStoreClient(KeyValueStoreClient): + """An Apify platform implementation of the key-value store client.""" + + def __init__( + self, + *, + api_client: KeyValueStoreClientAsync, + api_public_base_url: str, + lock: asyncio.Lock, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyKeyValueStoreClient.open` class method to create a new instance. + """ + self._api_client = api_client + """The Apify KVS client for API operations.""" + + self._api_public_base_url = api_public_base_url + """The public base URL for accessing the key-value store records.""" + + self._lock = lock + """A lock to ensure that only one operation is performed at a time.""" + + @override + async def get_metadata(self) -> KeyValueStoreMetadata: + metadata = await self._api_client.get() + return KeyValueStoreMetadata.model_validate(metadata) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyKeyValueStoreClient: + """Open an Apify key-value store client. + + This method creates and initializes a new instance of the Apify key-value store client. + It handles authentication, storage lookup/creation, and metadata retrieval. + + Args: + id: The ID of an existing key-value store to open. If provided, the client will connect to this specific + storage. Cannot be used together with `name`. + name: The name of a key-value store to get or create. If a storage with this name exists, it will be + opened; otherwise, a new one will be created. Cannot be used together with `id`. 
+ configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. May also contain a `default_key_value_store_id` for fallback when + neither `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available + in the configuration. + """ + token = configuration.token + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = configuration.api_base_url + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + + api_public_base_url = configuration.api_public_base_url + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).' + ) + + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Create Apify client with the provided token and API URL. + apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + apify_kvss_client = apify_client_async.key_value_stores() + + # If name is provided, get or create the storage by name. + if name is not None and id is None: + id = KeyValueStoreMetadata.model_validate( + await apify_kvss_client.get_or_create(name=name), + ).id + + # If both id and name are None, try to get the default storage ID from environment variables. + if id is None and name is None: + id = getattr(configuration, 'default_key_value_store_id', None) + + if id is None: + raise ValueError( + 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' + ) + + # Get the client for the specific storage by ID. + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) + + return cls( + api_client=apify_kvs_client, + api_public_base_url=api_public_base_url, + lock=asyncio.Lock(), + ) + + @override + async def purge(self) -> None: + raise NotImplementedError( + 'Purging key-value stores is not supported in the Apify platform. ' + 'Use the `drop` method to delete the key-value store instead.' 
+ ) + + @override + async def drop(self) -> None: + async with self._lock: + await self._api_client.delete() + + @override + async def get_value(self, key: str) -> KeyValueStoreRecord | None: + response = await self._api_client.get_record(key) + record = KeyValueStoreRecord.model_validate(response) if response else None + await self._update_metadata() + return record + + @override + async def set_value(self, key: str, value: Any, content_type: str | None = None) -> None: + async with self._lock: + await self._api_client.set_record( + key=key, + value=value, + content_type=content_type, + ) + await self._update_metadata() + + @override + async def delete_value(self, key: str) -> None: + async with self._lock: + await self._api_client.delete_record(key=key) + await self._update_metadata() + + @override + async def iterate_keys( + self, + *, + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: + count = 0 + + while True: + response = await self._api_client.list_keys(exclusive_start_key=exclusive_start_key) + list_key_page = KeyValueStoreListKeysPage.model_validate(response) + + for item in list_key_page.items: + # Convert KeyValueStoreKeyInfo to KeyValueStoreRecordMetadata + record_metadata = KeyValueStoreRecordMetadata( + key=item.key, + size=item.size, + content_type='application/octet-stream', # Content type not available from list_keys + ) + yield record_metadata + count += 1 + + # If we've reached the limit, stop yielding + if limit and count >= limit: + break + + # If we've reached the limit or there are no more pages, exit the loop + if (limit and count >= limit) or not list_key_page.is_truncated: + break + + exclusive_start_key = list_key_page.next_exclusive_start_key + + await self._update_metadata() + + @override + async def record_exists(self, key: str) -> bool: + return await self._api_client.record_exists(key=key) + + async def get_public_url(self, key: str) -> str: + """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. + + Args: + key: The key for which the URL should be generated. + + Returns: + A public URL that can be used to access the value of the given key in the KVS. 
+ """ + if self._api_client.resource_id is None: + raise ValueError('resource_id cannot be None when generating a public URL') + + public_url = ( + URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key + ) + metadata = await self.get_metadata() + + if metadata.model_extra is not None: + url_signing_secret_key = metadata.model_extra.get('urlSigningSecretKey') + if url_signing_secret_key is not None: + public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) + + return str(public_url) + + async def _update_metadata(self) -> None: + """Update the key-value store metadata with current information.""" + metadata = await self._api_client.get() + self._metadata = KeyValueStoreMetadata.model_validate(metadata) diff --git a/src/apify/storage_clients/_apify/_models.py b/src/apify/storage_clients/_apify/_models.py new file mode 100644 index 00000000..abb7aca1 --- /dev/null +++ b/src/apify/storage_clients/_apify/_models.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from datetime import datetime, timedelta +from typing import Annotated + +from pydantic import BaseModel, ConfigDict, Field + +from crawlee._utils.docs import docs_group + +from apify import Request + + +@docs_group('Data structures') +class ProlongRequestLockResponse(BaseModel): + """Response to prolong request lock calls.""" + + model_config = ConfigDict(populate_by_name=True) + + lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] + + +@docs_group('Data structures') +class RequestQueueHead(BaseModel): + """Model for request queue head. + + Represents a collection of requests retrieved from the beginning of a queue, + including metadata about the queue's state and lock information for the requests. 
+ """ + + model_config = ConfigDict(populate_by_name=True) + + limit: Annotated[int | None, Field(alias='limit', default=None)] + """The maximum number of requests that were requested from the queue.""" + + had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients', default=False)] + """Indicates whether the queue has been accessed by multiple clients (consumers).""" + + queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] + """The timestamp when the queue was last modified.""" + + lock_time: Annotated[timedelta | None, Field(alias='lockSecs', default=None)] + """The duration for which the returned requests are locked and cannot be processed by other clients.""" + + queue_has_locked_requests: Annotated[bool | None, Field(alias='queueHasLockedRequests', default=False)] + """Indicates whether the queue contains any locked requests.""" + + items: Annotated[list[Request], Field(alias='items', default_factory=list[Request])] + """The list of request objects retrieved from the beginning of the queue.""" + + +class KeyValueStoreKeyInfo(BaseModel): + """Model for a key-value store key info.""" + + model_config = ConfigDict(populate_by_name=True) + + key: Annotated[str, Field(alias='key')] + size: Annotated[int, Field(alias='size')] + + +class KeyValueStoreListKeysPage(BaseModel): + """Model for listing keys in the key-value store.""" + + model_config = ConfigDict(populate_by_name=True) + + count: Annotated[int, Field(alias='count')] + limit: Annotated[int, Field(alias='limit')] + is_truncated: Annotated[bool, Field(alias='isTruncated')] + items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)] + exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)] + next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)] + + +class CachedRequest(BaseModel): + """Pydantic model for cached request information.""" + + id: str + """The ID of the request.""" + + was_already_handled: bool + """Whether the request was already handled.""" + + hydrated: Request | None = None + """The hydrated request object (the original one).""" + + lock_expires_at: datetime | None = None + """The expiration time of the lock on the request.""" + + forefront: bool = False + """Whether the request was added to the forefront of the queue.""" diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py new file mode 100644 index 00000000..f24696c3 --- /dev/null +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -0,0 +1,655 @@ +from __future__ import annotations + +import asyncio +from collections import deque +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Final + +from cachetools import LRUCache +from typing_extensions import override + +from apify_client import ApifyClientAsync +from crawlee._utils.requests import unique_key_to_request_id +from crawlee.storage_clients._base import RequestQueueClient +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata + +from ._models import CachedRequest, ProlongRequestLockResponse, RequestQueueHead +from apify import Request + +if TYPE_CHECKING: + from collections.abc import Sequence + + from apify_client.clients import RequestQueueClientAsync + + from apify import Configuration + +logger = getLogger(__name__) + + +class 
ApifyRequestQueueClient(RequestQueueClient): + """An Apify platform implementation of the request queue client.""" + + _DEFAULT_LOCK_TIME: Final[timedelta] = timedelta(minutes=3) + """The default lock time for requests in the queue.""" + + _MAX_CACHED_REQUESTS: Final[int] = 1_000_000 + """Maximum number of requests that can be cached.""" + + def __init__( + self, + *, + metadata: RequestQueueMetadata, + api_client: RequestQueueClientAsync, + api_public_base_url: str, + lock: asyncio.Lock, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. + """ + self._metadata = metadata + + self._api_client = api_client + """The Apify request queue client for API operations.""" + + self._api_public_base_url = api_public_base_url + """The public base URL of the Apify API.""" + + self._lock = lock + """A lock to ensure that only one operation is performed at a time.""" + + self._queue_head = deque[str]() + """A deque to store request IDs in the queue head.""" + + self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + """A cache to store request objects.""" + + self._queue_has_locked_requests: bool | None = None + """Whether the queue has requests locked by another client.""" + + self._should_check_for_forefront_requests = False + """Whether to check for forefront requests in the next list_head call.""" + + @override + async def get_metadata(self) -> RequestQueueMetadata: + metadata = await self._api_client.get() + return RequestQueueMetadata.model_validate(metadata) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyRequestQueueClient: + """Open an Apify request queue client. + + This method creates and initializes a new instance of the Apify request queue client. It handles + authentication, storage lookup/creation, and metadata retrieval, and sets up internal caching and queue + management structures. + + Args: + id: The ID of an existing request queue to open. If provided, the client will connect to this specific + storage. Cannot be used together with `name`. + name: The name of a request queue to get or create. If a storage with this name exists, it will be opened; + otherwise, a new one will be created. Cannot be used together with `id`. + configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. May also contain a `default_request_queue_id` for fallback when neither + `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available + in the configuration. + """ + token = configuration.token + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = configuration.api_base_url + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + + api_public_base_url = configuration.api_public_base_url + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).'
+ ) + + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Create Apify client with the provided token and API URL. + apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + apify_rqs_client = apify_client_async.request_queues() + + # If name is provided, get or create the storage by name. + if name is not None and id is None: + id = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(name=name), + ).id + + # If both id and name are None, try to get the default storage ID from environment variables. + if id is None and name is None: + id = getattr(configuration, 'default_request_queue_id', None) + + if id is None: + raise ValueError( + 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' + ) + + # Get the client for the specific storage by ID. + apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + + # Fetch its metadata. + metadata = RequestQueueMetadata.model_validate(await apify_rq_client.get()) + + return cls( + metadata=metadata, + api_client=apify_rq_client, + api_public_base_url=api_public_base_url, + lock=asyncio.Lock(), + ) + + @override + async def purge(self) -> None: + raise NotImplementedError( + 'Purging the request queue is not supported in the Apify platform. ' + 'Use the `drop` method to delete the request queue instead.' + ) + + @override + async def drop(self) -> None: + async with self._lock: + await self._api_client.delete() + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. + + Args: + requests: The requests to add. + forefront: Whether to add the requests to the beginning of the queue. + + Returns: + Response containing information about the added requests. + """ + # Prepare requests for API by converting to dictionaries + requests_dict = [request.model_dump(by_alias=True) for request in requests] + + # Remove 'id' fields from requests as the API doesn't accept them + for request_dict in requests_dict: + if 'id' in request_dict: + del request_dict['id'] + + # Send requests to API + response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) + + # Update metadata after adding requests + await self._update_metadata() + + return AddRequestsResponse.model_validate(response) + + @override + async def get_request(self, request_id: str) -> Request | None: + """Get a request by ID. + + Args: + request_id: The ID of the request to get. + + Returns: + The request or None if not found. + """ + response = await self._api_client.get_request(request_id) + await self._update_metadata() + + if response is None: + return None + + return Request.model_validate(response) + + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. + + Once you successfully finish processing of the request, you need to call `mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Returns: + The request or `None` if there are no more pending requests.
+ """ + # Ensure the queue head has requests if available + await self._ensure_head_is_non_empty() + + # If queue head is empty after ensuring, there are no requests + if not self._queue_head: + return None + + # Get the next request ID from the queue head + next_request_id = self._queue_head.popleft() + request = await self._get_or_hydrate_request(next_request_id) + + # Handle potential inconsistency where request might not be in the main table yet + if request is None: + logger.debug( + 'Cannot find a request from the beginning of queue, will be retried later', + extra={'nextRequestId': next_request_id}, + ) + return None + + # If the request was already handled, skip it + if request.handled_at is not None: + logger.debug( + 'Request fetched from the beginning of queue was already handled', + extra={'nextRequestId': next_request_id}, + ) + return None + + return request + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. + + Handled requests will never again be returned by the `fetch_next_request` method. + + Args: + request: The request to mark as handled. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + # Set the handled_at timestamp if not already set + if request.handled_at is None: + request.handled_at = datetime.now(tz=timezone.utc) + + try: + # Update the request in the API + processed_request = await self._update_request(request) + processed_request.unique_key = request.unique_key + + # Update the cache with the handled request + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + processed_request, + forefront=False, + hydrated_request=request, + ) + + # Update metadata after marking request as handled + await self._update_metadata() + except Exception as exc: + logger.debug(f'Error marking request {request.id} as handled: {exc!s}') + return None + else: + return processed_request + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `fetch_next_request`. + + Args: + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. 
+ """ + try: + # Update the request in the API + processed_request = await self._update_request(request, forefront=forefront) + processed_request.unique_key = request.unique_key + + # Update the cache + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + processed_request, + forefront=forefront, + hydrated_request=request, + ) + + # If we're adding to the forefront, we need to check for forefront requests + # in the next list_head call + if forefront: + self._should_check_for_forefront_requests = True + + # Try to release the lock on the request + try: + await self._delete_request_lock(request.id, forefront=forefront) + except Exception as err: + logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) + + # Update metadata after reclaiming request + await self._update_metadata() + except Exception as exc: + logger.debug(f'Error reclaiming request {request.id}: {exc!s}') + return None + else: + return processed_request + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + head = await self._list_head(limit=1, lock_time=None) + return len(head.items) == 0 + + async def _ensure_head_is_non_empty(self) -> None: + """Ensure that the queue head has requests if they are available in the queue.""" + # If queue head has adequate requests, skip fetching more + if len(self._queue_head) > 1 and not self._should_check_for_forefront_requests: + return + + # Fetch requests from the API and populate the queue head + await self._list_head(lock_time=self._DEFAULT_LOCK_TIME) + + async def _get_or_hydrate_request(self, request_id: str) -> Request | None: + """Get a request by ID, either from cache or by fetching from API. + + Args: + request_id: The ID of the request to get. + + Returns: + The request if found and valid, otherwise None. 
+ """ + # First check if the request is in our cache + cached_entry = self._requests_cache.get(request_id) + + if cached_entry and cached_entry.hydrated: + # If we have the request hydrated in cache, check if lock is expired + if cached_entry.lock_expires_at and cached_entry.lock_expires_at < datetime.now(tz=timezone.utc): + # Try to prolong the lock if it's expired + try: + lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) + response = await self._prolong_request_lock( + request_id, forefront=cached_entry.forefront, lock_secs=lock_secs + ) + cached_entry.lock_expires_at = response.lock_expires_at + except Exception: + # If prolonging the lock fails, we lost the request + logger.debug(f'Failed to prolong lock for request {request_id}, returning None') + return None + + return cached_entry.hydrated + + # If not in cache or not hydrated, fetch the request + try: + # Try to acquire or prolong the lock + lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) + await self._prolong_request_lock(request_id, forefront=False, lock_secs=lock_secs) + + # Fetch the request data + request = await self.get_request(request_id) + + # If request is not found, release lock and return None + if not request: + await self._delete_request_lock(request_id) + return None + + # Update cache with hydrated request + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + ProcessedRequest( + id=request_id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=request.handled_at is not None, + ), + forefront=False, + hydrated_request=request, + ) + except Exception as exc: + logger.debug(f'Error fetching or locking request {request_id}: {exc!s}') + return None + else: + return request + + async def _update_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + """Update a request in the queue. + + Args: + request: The updated request. + forefront: Whether to put the updated request in the beginning or the end of the queue. + + Returns: + The updated request + """ + response = await self._api_client.update_request( + request=request.model_dump(by_alias=True), + forefront=forefront, + ) + + return ProcessedRequest.model_validate( + {'id': request.id, 'uniqueKey': request.unique_key} | response, + ) + + async def _list_head( + self, + *, + lock_time: timedelta | None = None, + limit: int = 25, + ) -> RequestQueueHead: + """Retrieve requests from the beginning of the queue. + + Args: + lock_time: Duration for which to lock the retrieved requests. + If None, requests will not be locked. + limit: Maximum number of requests to retrieve. + + Returns: + A collection of requests from the beginning of the queue. 
+ """ + # Return from cache if available and we're not checking for new forefront requests + if self._queue_head and not self._should_check_for_forefront_requests: + logger.debug(f'Using cached queue head with {len(self._queue_head)} requests') + + # Create a list of requests from the cached queue head + items = [] + for request_id in list(self._queue_head)[:limit]: + cached_request = self._requests_cache.get(request_id) + if cached_request and cached_request.hydrated: + items.append(cached_request.hydrated) + + return RequestQueueHead( + limit=limit, + had_multiple_clients=self._metadata.had_multiple_clients, + queue_modified_at=self._metadata.modified_at, + items=items, + queue_has_locked_requests=self._queue_has_locked_requests, + lock_time=lock_time, + ) + + # Otherwise fetch from API + lock_time = lock_time or self._DEFAULT_LOCK_TIME + lock_secs = int(lock_time.total_seconds()) + + response = await self._api_client.list_and_lock_head( + lock_secs=lock_secs, + limit=limit, + ) + + # Update the queue head cache + self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) + + # Clear current queue head if we're checking for forefront requests + if self._should_check_for_forefront_requests: + self._queue_head.clear() + self._should_check_for_forefront_requests = False + + # Process and cache the requests + head_id_buffer = list[str]() + forefront_head_id_buffer = list[str]() + + for request_data in response.get('items', []): + request = Request.model_validate(request_data) + + # Skip requests without ID or unique key + if not request.id or not request.unique_key: + logger.debug( + 'Skipping request from queue head, missing ID or unique key', + extra={ + 'id': request.id, + 'unique_key': request.unique_key, + }, + ) + continue + + # Check if this request was already cached and if it was added to forefront + cache_key = unique_key_to_request_id(request.unique_key) + cached_request = self._requests_cache.get(cache_key) + forefront = cached_request.forefront if cached_request else False + + # Add to appropriate buffer based on forefront flag + if forefront: + forefront_head_id_buffer.insert(0, request.id) + else: + head_id_buffer.append(request.id) + + # Cache the request + self._cache_request( + cache_key, + ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ), + forefront=forefront, + hydrated_request=request, + ) + + # Update the queue head deque + for request_id in head_id_buffer: + self._queue_head.append(request_id) + + for request_id in forefront_head_id_buffer: + self._queue_head.appendleft(request_id) + + return RequestQueueHead.model_validate(response) + + async def _prolong_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + lock_secs: int, + ) -> ProlongRequestLockResponse: + """Prolong the lock on a specific request in the queue. + + Args: + request_id: The identifier of the request whose lock is to be prolonged. + forefront: Whether to put the request in the beginning or the end of the queue after lock expires. + lock_secs: The additional amount of time, in seconds, that the request will remain locked. + + Returns: + A response containing the time at which the lock will expire. 
+ """ + response = await self._api_client.prolong_request_lock( + request_id=request_id, + forefront=forefront, + lock_secs=lock_secs, + ) + + result = ProlongRequestLockResponse( + lock_expires_at=datetime.fromisoformat(response['lockExpiresAt'].replace('Z', '+00:00')) + ) + + # Update the cache with the new lock expiration + for cached_request in self._requests_cache.values(): + if cached_request.id == request_id: + cached_request.lock_expires_at = result.lock_expires_at + break + + return result + + async def _delete_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + ) -> None: + """Delete the lock on a specific request in the queue. + + Args: + request_id: ID of the request to delete the lock. + forefront: Whether to put the request in the beginning or the end of the queue after the lock is deleted. + """ + try: + await self._api_client.delete_request_lock( + request_id=request_id, + forefront=forefront, + ) + + # Update the cache to remove the lock + for cached_request in self._requests_cache.values(): + if cached_request.id == request_id: + cached_request.lock_expires_at = None + break + except Exception as err: + logger.debug(f'Failed to delete request lock for request {request_id}', exc_info=err) + + def _cache_request( + self, + cache_key: str, + processed_request: ProcessedRequest, + *, + forefront: bool, + hydrated_request: Request | None = None, + ) -> None: + """Cache a request for future use. + + Args: + cache_key: The key to use for caching the request. + processed_request: The processed request information. + forefront: Whether the request was added to the forefront of the queue. + hydrated_request: The hydrated request object, if available. + """ + self._requests_cache[cache_key] = CachedRequest( + id=processed_request.id, + was_already_handled=processed_request.was_already_handled, + hydrated=hydrated_request, + lock_expires_at=None, + forefront=forefront, + ) + + async def _update_metadata(self) -> None: + """Update the request queue metadata with current information.""" + metadata = await self._api_client.get() + self._metadata = RequestQueueMetadata.model_validate(metadata) diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py new file mode 100644 index 00000000..9d43b983 --- /dev/null +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.storage_clients._base import StorageClient + +from ._dataset_client import ApifyDatasetClient +from ._key_value_store_client import ApifyKeyValueStoreClient +from ._request_queue_client import ApifyRequestQueueClient + +if TYPE_CHECKING: + from crawlee.configuration import Configuration + + +class ApifyStorageClient(StorageClient): + """Apify storage client.""" + + @override + async def create_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> ApifyDatasetClient: + # Import here to avoid circular imports. 
+ from apify import Configuration as ApifyConfiguration # noqa: PLC0415 + + configuration = configuration or ApifyConfiguration.get_global_configuration() + if isinstance(configuration, ApifyConfiguration): + return await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) + + raise TypeError( + f'Expected "configuration" to be an instance of "apify.Configuration", ' + f'but got {type(configuration).__name__} instead.' + ) + + @override + async def create_kvs_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> ApifyKeyValueStoreClient: + # Import here to avoid circular imports. + from apify import Configuration as ApifyConfiguration # noqa: PLC0415 + + configuration = configuration or ApifyConfiguration.get_global_configuration() + if isinstance(configuration, ApifyConfiguration): + return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + + raise TypeError( + f'Expected "configuration" to be an instance of "apify.Configuration", ' + f'but got {type(configuration).__name__} instead.' + ) + + @override + async def create_rq_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> ApifyRequestQueueClient: + # Import here to avoid circular imports. + from apify import Configuration as ApifyConfiguration # noqa: PLC0415 + + configuration = configuration or ApifyConfiguration.get_global_configuration() + if isinstance(configuration, ApifyConfiguration): + return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) + + raise TypeError( + f'Expected "configuration" to be an instance of "apify.Configuration", ' + f'but got {type(configuration).__name__} instead.' + ) diff --git a/src/apify/apify_storage_client/py.typed b/src/apify/storage_clients/_apify/py.typed similarity index 100% rename from src/apify/apify_storage_client/py.typed rename to src/apify/storage_clients/_apify/py.typed diff --git a/src/apify/storage_clients/py.typed b/src/apify/storage_clients/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index cbc56dfb..3e784064 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -8,11 +8,11 @@ from pydantic import BaseModel, Field, TypeAdapter -from crawlee import Request from crawlee._types import HttpMethod from crawlee.http_clients import HttpClient, HttpxHttpClient from crawlee.request_loaders import RequestList as CrawleeRequestList +from apify import Request from apify._utils import docs_group URL_NO_COMMAS_REGEX = re.compile( diff --git a/tests/integration/actor_source_base/Dockerfile b/tests/integration/actor_source_base/Dockerfile index b65eab68..9edfb387 100644 --- a/tests/integration/actor_source_base/Dockerfile +++ b/tests/integration/actor_source_base/Dockerfile @@ -3,6 +3,10 @@ FROM apify/actor-python:BASE_IMAGE_VERSION_PLACEHOLDER COPY . 
./ +RUN apt-get update && apt-get install -y \ + git \ + && rm -rf /var/lib/apt/lists/* + RUN echo "Python version:" \ && python --version \ && echo "Pip version:" \ diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4cfb76ec..07b6c758 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -15,7 +15,6 @@ from apify_client import ApifyClient, ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars from crawlee import service_locator -from crawlee.storages import _creation_management import apify._actor from ._utils import generate_unique_resource_name @@ -53,24 +52,16 @@ def _prepare_test_env() -> None: # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) - # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures - # a clean state, as services might have been set during a previous test and not reset properly. - service_locator._configuration_was_retrieved = False - service_locator._storage_client_was_retrieved = False - service_locator._event_manager_was_retrieved = False - # Reset the services in the service locator. service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None + service_locator._storage_instance_manager = None - # Clear creation-related caches to ensure no state is carried over between tests. - monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) + # Reset the retrieval flags. + service_locator._configuration_was_retrieved = False + service_locator._event_manager_was_retrieved = False + service_locator._storage_client_was_retrieved = False # Verify that the test environment was set up correctly. 
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index 5327af9c..47ecfb66 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -46,9 +46,6 @@ async def main() -> None: assert len(env_dict.get('actor_id', '')) == 17 assert len(env_dict.get('actor_run_id', '')) == 17 assert len(env_dict.get('user_id', '')) == 17 - assert len(env_dict.get('default_dataset_id', '')) == 17 - assert len(env_dict.get('default_key_value_store_id', '')) == 17 - assert len(env_dict.get('default_request_queue_id', '')) == 17 actor = await make_actor(label='get-env', main_func=main) run_result = await run_actor(actor) diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 20a71750..1cce4fd9 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -104,8 +104,9 @@ async def main() -> None: dataset_by_name_2 = await Actor.open_dataset(name=dataset_name) assert dataset_by_name_1 is dataset_by_name_2 - dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1._id) - dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1._id) + dataset_1_metadata = await dataset_by_name_1.get_metadata() + dataset_by_id_1 = await Actor.open_dataset(id=dataset_1_metadata.id) + dataset_by_id_2 = await Actor.open_dataset(id=dataset_1_metadata.id) assert dataset_by_id_1 is dataset_by_name_1 assert dataset_by_id_2 is dataset_by_id_1 @@ -129,7 +130,7 @@ async def test_force_cloud( async with Actor: dataset = await Actor.open_dataset(name=dataset_name, force_cloud=True) - dataset_id = dataset._id + dataset_id = (await dataset.get_metadata()).id await dataset.push_data(dataset_item) diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 6b6dd767..3d0fc22b 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -45,8 +45,9 @@ async def main() -> None: kvs_by_name_2 = await Actor.open_key_value_store(name=kvs_name) assert kvs_by_name_1 is kvs_by_name_2 - kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1._id) - kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1._id) + kvs_1_metadata = await kvs_by_name_1.get_metadata() + kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_1_metadata.id) + kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_1_metadata.id) assert kvs_by_id_1 is kvs_by_name_1 assert kvs_by_id_2 is kvs_by_id_1 @@ -69,7 +70,7 @@ async def test_force_cloud( async with Actor: key_value_store = await Actor.open_key_value_store(name=key_value_store_name, force_cloud=True) - key_value_store_id = key_value_store._id + key_value_store_id = (await key_value_store.get_metadata()).id await key_value_store.set_value('foo', 'bar') @@ -205,25 +206,26 @@ async def main() -> None: async with Actor: public_api_url = Actor.config.api_public_base_url - default_store_id = Actor.config.default_key_value_store_id + default_kvs_id = Actor.config.default_key_value_store_id record_key = 'public-record-key' - store = await Actor.open_key_value_store() + kvs = await Actor.open_key_value_store() + metadata = await kvs.get_metadata() + assert metadata.model_extra is not None - assert isinstance(store.storage_object.model_extra, dict) - url_signing_secret_key = store.storage_object.model_extra.get('urlSigningSecretKey') + 
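For context on the public-URL assertions that follow, a sketch of the same flow from the Actor side; the record key and payload are placeholders, and the exact shape of the signed URL is only checked by the test, not constructed here:

import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        kvs = await Actor.open_key_value_store()
        await kvs.set_value('public-record-key', {'exposedData': 'test'}, 'application/json')

        # On the platform this yields a signed URL of the form
        # {api_public_base_url}/v2/key-value-stores/{store_id}/records/{key}?signature=...
        record_url = await kvs.get_public_url('public-record-key')
        Actor.log.info(record_url)


if __name__ == '__main__':
    asyncio.run(main())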
url_signing_secret_key = metadata.model_extra.get('urlSigningSecretKey') assert url_signing_secret_key is not None - await store.set_value(record_key, {'exposedData': 'test'}, 'application/json') - - record_url = await store.get_public_url(record_key) + await kvs.set_value(record_key, {'exposedData': 'test'}, 'application/json') + record_url = await kvs.get_public_url(record_key) signature = create_hmac_signature(url_signing_secret_key, record_key) - assert ( - record_url - == f'{public_api_url}/v2/key-value-stores/{default_store_id}/records/{record_key}?signature={signature}' + expected_record_url = ( + f'{public_api_url}/v2/key-value-stores/{default_kvs_id}/records/{record_key}?signature={signature}' ) + assert record_url == expected_record_url + actor = await make_actor(label='kvs-get-public-url', main_func=main) run_result = await run_actor(actor) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 06e8529e..9689367a 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -3,10 +3,9 @@ from typing import TYPE_CHECKING from apify_shared.consts import ApifyEnvVars -from crawlee import Request from ._utils import generate_unique_resource_name -from apify import Actor +from apify import Actor, Request if TYPE_CHECKING: import pytest @@ -46,8 +45,9 @@ async def main() -> None: rq_by_name_2 = await Actor.open_request_queue(name=rq_name) assert rq_by_name_1 is rq_by_name_2 - rq_by_id_1 = await Actor.open_request_queue(id=rq_by_name_1._id) - rq_by_id_2 = await Actor.open_request_queue(id=rq_by_name_1._id) + rq_1_metadata = await rq_by_name_1.get_metadata() + rq_by_id_1 = await Actor.open_request_queue(id=rq_1_metadata.id) + rq_by_id_2 = await Actor.open_request_queue(id=rq_1_metadata.id) assert rq_by_id_1 is rq_by_name_1 assert rq_by_id_2 is rq_by_id_1 @@ -70,7 +70,7 @@ async def test_force_cloud( async with Actor: request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - request_queue_id = request_queue._id + request_queue_id = (await request_queue.get_metadata()).id request_info = await request_queue.add_request(Request.from_url('http://example.com')) diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index 4bce884a..8c8cecec 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -53,7 +53,7 @@ async def main() -> None: # I have seen it get stuck on this call rq = await Actor.open_request_queue() # Add some requests - await rq.add_requests_batched([f'https://example.com/{i}' for i in range(desired_request_count)]) + await rq.add_requests([f'https://example.com/{i}' for i in range(desired_request_count)]) handled_request_count = 0 while next_request := await rq.fetch_next_request(): @@ -79,7 +79,7 @@ async def test_add_non_unique_requests_in_batch( run_actor: RunActorFunction, ) -> None: async def main() -> None: - from crawlee import Request + from apify import Request async with Actor: desired_request_count = 100 @@ -87,7 +87,7 @@ async def main() -> None: # I have seen it get stuck on this call rq = await Actor.open_request_queue() # Add some requests - await rq.add_requests_batched( + await rq.add_requests( [ Request.from_url(f'https://example.com/{i}', unique_key=str(i - 1 if i % 4 == 1 else i)) for i in range(desired_request_count) diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index ef6282bb..4e1b99d9 
100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -1,19 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import pytest -from apify_shared.consts import ActorEnvVars - from apify import Actor -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - -# NOTE: We only test the dataset methods available on Actor class/instance. -# Actual tests for the implementations are in storages/. - async def test_throws_error_without_actor_init() -> None: with pytest.raises(RuntimeError): @@ -31,34 +21,19 @@ async def test_open_dataset_returns_same_references() -> None: dataset_by_name_2 = await Actor.open_dataset(name=dataset_name) assert dataset_by_name_1 is dataset_by_name_2 - dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1._id) - dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1._id) + dataset_1_metadata = await dataset_by_name_1.get_metadata() + dataset_by_id_1 = await Actor.open_dataset(id=dataset_1_metadata.id) + dataset_by_id_2 = await Actor.open_dataset(id=dataset_1_metadata.id) assert dataset_by_id_1 is dataset_by_name_1 assert dataset_by_id_2 is dataset_by_id_1 -async def test_open_dataset_uses_env_var( - monkeypatch: pytest.MonkeyPatch, - memory_storage_client: MemoryStorageClient, -) -> None: - default_dataset_id = 'my-new-default-id' - monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, default_dataset_id) - - async with Actor: - ddt = await Actor.open_dataset() - assert ddt._id == default_dataset_id - await memory_storage_client.dataset(ddt._id).delete() - - async def test_push_data_to_dataset() -> None: - async with Actor as my_actor: - dataset = await my_actor.open_dataset() + async with Actor as actor: + dataset = await actor.open_dataset() desired_item_count = 100 await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - list_page = await dataset.get_data(limit=desired_item_count) assert {item['id'] for item in list_page.items} == set(range(desired_item_count)) diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index e9eacdb2..27fc1c39 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -44,6 +44,7 @@ async def test_get_env_with_randomized_env_vars(monkeypatch: pytest.MonkeyPatch) ApifyEnvVars.LOG_FORMAT, ApifyEnvVars.LOG_LEVEL, ActorEnvVars.STANDBY_PORT, + ApifyEnvVars.PERSIST_STORAGE, } legacy_env_vars = { @@ -59,7 +60,7 @@ async def test_get_env_with_randomized_env_vars(monkeypatch: pytest.MonkeyPatch) } # Set up random env vars - expected_get_env: dict[str, Any] = {} + expected_get_env = dict[str, Any]() expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO' for int_env_var in INTEGER_ENV_VARS: diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 821065e1..405aa977 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -1,23 +1,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import pytest from apify_shared.consts import ApifyEnvVars -from apify_shared.utils import json_dumps from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY from apify import Actor from apify._consts import ENCRYPTED_INPUT_VALUE_PREFIX from apify._crypto import public_encrypt -if TYPE_CHECKING: - from 
crawlee.storage_clients import MemoryStorageClient - -# NOTE: We only test the key-value store methods available on Actor class/instance. -# Actual tests for the implementations are in storages/. async def test_open_returns_same_references() -> None: async with Actor: kvs1 = await Actor.open_key_value_store() @@ -29,8 +21,9 @@ async def test_open_returns_same_references() -> None: kvs_by_name_2 = await Actor.open_key_value_store(name=kvs_name) assert kvs_by_name_1 is kvs_by_name_2 - kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1._id) - kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1._id) + kvs_1_metadata = await kvs_by_name_1.get_metadata() + kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_1_metadata.id) + kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_1_metadata.id) assert kvs_by_id_1 is kvs_by_name_1 assert kvs_by_id_2 is kvs_by_id_1 @@ -44,32 +37,24 @@ async def test_set_and_get_value() -> None: test_key = 'test_key' test_value = 'test_value' test_content_type = 'text/plain' - async with Actor as my_actor: - await my_actor.set_value(key=test_key, value=test_value, content_type=test_content_type) - value = await my_actor.get_value(key=test_key) + + async with Actor as actor: + await actor.set_value(key=test_key, value=test_value, content_type=test_content_type) + value = await actor.get_value(key=test_key) assert value == test_value -async def test_get_input(memory_storage_client: MemoryStorageClient) -> None: +async def test_get_input() -> None: input_key = 'INPUT' test_input = {'foo': 'bar'} - await memory_storage_client.key_value_stores().get_or_create(id='default') - await memory_storage_client.key_value_store('default').set_record( - key=input_key, - value=json_dumps(test_input), - content_type='application/json', - ) - - async with Actor as my_actor: - input = await my_actor.get_input() # noqa: A001 - assert input['foo'] == test_input['foo'] + async with Actor as actor: + await actor.set_value(key=input_key, value=test_input) + actor_input = await actor.get_input() + assert actor_input['foo'] == test_input['foo'] -async def test_get_input_with_encrypted_secrets( - monkeypatch: pytest.MonkeyPatch, - memory_storage_client: MemoryStorageClient, -) -> None: +async def test_get_input_with_encrypted_secrets(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_FILE, PRIVATE_KEY_PEM_BASE64) monkeypatch.setenv(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE, PRIVATE_KEY_PASSWORD) @@ -81,14 +66,8 @@ async def test_get_input_with_encrypted_secrets( 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', # noqa: E501 } - await memory_storage_client.key_value_stores().get_or_create(id='default') - await memory_storage_client.key_value_store('default').set_record( - key=input_key, - value=json_dumps(input_with_secret), - content_type='application/json', - ) - - async with Actor as my_actor: - input = await my_actor.get_input() # noqa: A001 - assert input['foo'] == input_with_secret['foo'] - assert input['secret'] == secret_string + async with Actor as actor: + await actor.set_value(key=input_key, value=input_with_secret) + actor_input = await actor.get_input() + assert actor_input['foo'] == input_with_secret['foo'] + assert actor_input['secret'] == secret_string diff --git a/tests/unit/actor/test_actor_request_queue.py b/tests/unit/actor/test_actor_request_queue.py index 5504715f..d7c52771 100644 --- 
a/tests/unit/actor/test_actor_request_queue.py +++ b/tests/unit/actor/test_actor_request_queue.py @@ -4,8 +4,6 @@ from apify import Actor -# NOTE: We only test the references here. Actual tests for the implementations are in storages/ - async def test_open_throws_without_init() -> None: with pytest.raises(RuntimeError): @@ -23,7 +21,8 @@ async def test_open_returns_same_references() -> None: rq_by_name_2 = await Actor.open_key_value_store(name=rq_name) assert rq_by_name_1 is rq_by_name_2 - rq_by_id_1 = await Actor.open_key_value_store(id=rq_by_name_1._id) - rq_by_id_2 = await Actor.open_key_value_store(id=rq_by_name_1._id) + rq_1_metadata = await rq_by_name_1.get_metadata() + rq_by_id_1 = await Actor.open_key_value_store(id=rq_1_metadata.id) + rq_by_id_2 = await Actor.open_key_value_store(id=rq_1_metadata.id) assert rq_by_id_1 is rq_by_name_1 assert rq_by_id_2 is rq_by_id_1 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 929173ea..7bdd1318 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -11,9 +11,6 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee import service_locator -from crawlee.configuration import Configuration as CrawleeConfiguration -from crawlee.storage_clients import MemoryStorageClient -from crawlee.storages import _creation_management import apify._actor @@ -45,24 +42,16 @@ def _prepare_test_env() -> None: # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) - # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures - # a clean state, as services might have been set during a previous test and not reset properly. - service_locator._configuration_was_retrieved = False - service_locator._storage_client_was_retrieved = False - service_locator._event_manager_was_retrieved = False - # Reset the services in the service locator. service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None + service_locator._storage_instance_manager = None - # Clear creation-related caches to ensure no state is carried over between tests. - monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) + # Reset the retrieval flags. + service_locator._configuration_was_retrieved = False + service_locator._event_manager_was_retrieved = False + service_locator._storage_client_was_retrieved = False # Verify that the test environment was set up correctly. 
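The conftest changes above (and the analogous ones in the integration conftest) reset private `service_locator` state between tests. A condensed sketch of the same pattern as an autouse fixture, using the attribute names that appear in this diff (they are crawlee internals and may change):

import pytest

from crawlee import service_locator


@pytest.fixture(autouse=True)
def _reset_service_locator() -> None:
    # Drop any services registered by a previous test.
    service_locator._configuration = None
    service_locator._event_manager = None
    service_locator._storage_client = None
    service_locator._storage_instance_manager = None

    # Reset the retrieval flags, otherwise re-setting a service later may be refused.
    service_locator._configuration_was_retrieved = False
    service_locator._event_manager_was_retrieved = False
    service_locator._storage_client_was_retrieved = False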
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) @@ -178,12 +167,3 @@ def getattr_override(apify_client_instance: Any, attr_name: str) -> Any: @pytest.fixture def apify_client_async_patcher(monkeypatch: pytest.MonkeyPatch) -> ApifyClientAsyncPatcher: return ApifyClientAsyncPatcher(monkeypatch) - - -@pytest.fixture -def memory_storage_client() -> MemoryStorageClient: - configuration = CrawleeConfiguration() - configuration.persist_storage = True - configuration.write_metadata = True - - return MemoryStorageClient.from_config(configuration) diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index d1481a98..2b8f0ab7 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -5,9 +5,9 @@ import pytest from scrapy import Request, Spider -from crawlee import Request as CrawleeRequest from crawlee._types import HttpHeaders +from apify import Request as ApifyRequest from apify.scrapy.requests import to_scrapy_request @@ -23,7 +23,7 @@ def spider() -> DummySpider: def test_without_reconstruction(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://example.com', method='GET', unique_key='https://example.com', @@ -42,7 +42,7 @@ def test_without_reconstruction(spider: Spider) -> None: def test_without_reconstruction_with_optional_fields(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://crawlee.dev', method='GET', unique_key='https://crawlee.dev', @@ -67,7 +67,7 @@ def test_without_reconstruction_with_optional_fields(spider: Spider) -> None: def test_with_reconstruction(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://apify.com', method='GET', id='fvwscO2UJLdr10B', @@ -89,7 +89,7 @@ def test_with_reconstruction(spider: Spider) -> None: def test_with_reconstruction_with_optional_fields(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://apify.com', method='GET', id='fvwscO2UJLdr10B', @@ -116,7 +116,7 @@ def test_with_reconstruction_with_optional_fields(spider: Spider) -> None: def test_invalid_request_for_reconstruction(spider: Spider) -> None: - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://example.com', method='GET', id='invalid123', diff --git a/uv.lock b/uv.lock index 1017881e..d2de9016 100644 --- a/uv.lock +++ b/uv.lock @@ -33,6 +33,7 @@ source = { editable = "." 
} dependencies = [ { name = "apify-client" }, { name = "apify-shared" }, + { name = "cachetools" }, { name = "crawlee" }, { name = "cryptography" }, { name = "httpx" }, @@ -63,13 +64,15 @@ dev = [ { name = "respx" }, { name = "ruff" }, { name = "setuptools" }, + { name = "types-cachetools" }, ] [package.metadata] requires-dist = [ - { name = "apify-client", specifier = ">=1.11.0" }, + { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, - { name = "crawlee", specifier = "~=0.6.0" }, + { name = "cachetools", specifier = ">=5.5.0" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -96,6 +99,7 @@ dev = [ { name = "respx", specifier = "~=0.22.0" }, { name = "ruff", specifier = "~=0.12.0" }, { name = "setuptools" }, + { name = "types-cachetools", specifier = ">=6.0.0.20250525" }, ] [[package]] @@ -310,11 +314,11 @@ wheels = [ [[package]] name = "certifi" -version = "2025.6.15" +version = "2025.1.31" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/ab/c9f1e32b7b1bf505bf26f0ef697775960db7932abeb7b516de930ba2705f/certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651", size = 167577, upload-time = "2025-01-31T02:16:47.166Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" }, + { url = "https://files.pythonhosted.org/packages/38/fc/bce832fd4fd99766c04d1ee0eead6b0ec6486fb100ae5e74c1d91292b982/certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe", size = 166393, upload-time = "2025-01-31T02:16:45.015Z" }, ] [[package]] @@ -546,11 +550,12 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { registry = "https://pypi.org/simple" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158#9dfac4b8afb8027979d85947f0db303f384b7158" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, { name = "cachetools" }, + { name = "certifi" }, { name = "colorama" }, { name = "eval-type-backport" }, { name = "httpx", extra = ["brotli", "http2", "zstd"] }, @@ -566,10 +571,6 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a4/61/76d4c43a244bcea123500989a03729ab999054a1d57ebfa85cb66fb86cb7/crawlee-0.6.11.tar.gz", hash = "sha256:746c59b726cce728d7d703e9d2e737ed5f9b2ea8409d3c5b4de0d728af7c0249", size = 24144865, upload-time = "2025-06-23T08:49:53.162Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/02/8c/9f6cdcc80acca132721331cd07ebe19b6a6509e792eb8f04f9a519c525f3/crawlee-0.6.11-py3-none-any.whl", hash = 
"sha256:899ae74f891ad87c7c0fc9ae6f448be7f1163f54cda5ec4b9b2e080a0758f6c2", size = 263313, upload-time = "2025-06-23T08:49:51.057Z" }, -] [[package]] name = "cryptography" @@ -2119,6 +2120,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/33/38da585b06978d262cc2b2b45bc57ee75f0ce5e0b4ef1cab1b86461e9298/typeapi-2.2.4-py3-none-any.whl", hash = "sha256:bd6d5e5907fa47e0303bf254e7cc8712d4be4eb26d7ffaedb67c9e7844c53bb8", size = 26387, upload-time = "2025-01-29T11:40:12.328Z" }, ] +[[package]] +name = "types-cachetools" +version = "6.0.0.20250525" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/d0/55ff0eeda141436c1bd2142cd026906870c661b3f7755070d6da7ea7210f/types_cachetools-6.0.0.20250525.tar.gz", hash = "sha256:baf06f234cac3aeb44c07893447ba03ecdb6c0742ba2607e28a35d38e6821b02", size = 8925, upload-time = "2025-05-25T03:13:53.498Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8c/4ab0a17ece30fe608270b89cf066387051862899fff9f54ab12511fc7fdd/types_cachetools-6.0.0.20250525-py3-none-any.whl", hash = "sha256:1de8f0fe4bdcb187a48d2026c1e3672830f67943ad2bf3486abe031b632f1252", size = 8938, upload-time = "2025-05-25T03:13:52.406Z" }, +] + [[package]] name = "typing-extensions" version = "4.14.0"