Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Integrate browserforge fingerprints #829

Merged
merged 33 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b0d52f2
Draft of integration of browserforge fingerprint generation.
Pijukatel Dec 16, 2024
be15847
Works with page.evaluate.
Pijukatel Dec 17, 2024
a9415ec
Use add_init_script
Pijukatel Dec 17, 2024
36727a1
WIP
Pijukatel Dec 18, 2024
42eff80
Fix format, type check and tests.
Pijukatel Dec 18, 2024
998cbb6
Fix root cause for flakiness in fingerprint generation
Pijukatel Dec 18, 2024
e1025c8
Use browserforge.injector code for fingerprints
Pijukatel Dec 19, 2024
33fdd6e
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Dec 19, 2024
85ba877
Regenerate poetry lock after merge
Pijukatel Dec 19, 2024
6e35c1d
Remove unintentional change to headless test
Pijukatel Dec 19, 2024
3f96456
Merge branch 'master' into integrate-browserforge-fingerprints
Pijukatel Jan 3, 2025
ddfabea
chore: revert React version bump
barjin Jan 3, 2025
3d37bca
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 10, 2025
1b8e6a3
Add ScreenFingerprint and NavigatorFingerprint
Pijukatel Jan 10, 2025
9828a36
Add Fingerprint and their options types
Pijukatel Jan 13, 2025
f733c07
Add adapter tests
Pijukatel Jan 13, 2025
97011d9
Integrate into pw_crawler
Pijukatel Jan 13, 2025
debe900
Further integration into our code.
Pijukatel Jan 14, 2025
3d8340c
Finalize draft.
Pijukatel Jan 14, 2025
3d9b170
Set fingerprint generator as top level argument to pw crawler
Pijukatel Jan 14, 2025
25aa4e2
Revert unnecessary change to function doc string.
Pijukatel Jan 14, 2025
5e46b78
Make test adapter-generic.
Pijukatel Jan 14, 2025
69b6974
Add types to __init__ of fingerprint_suite
Pijukatel Jan 14, 2025
27479be
Remove FingerprintGeneratorOptions
Pijukatel Jan 20, 2025
751f67c
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 23, 2025
1cbadb0
Review comments
Pijukatel Jan 23, 2025
8e44acd
Handle inconsistent result from browserforge fingerprint generator
Pijukatel Jan 24, 2025
d8001e7
Apply suggestions from code review
Pijukatel Jan 27, 2025
07acbfa
Docs
Pijukatel Jan 27, 2025
866fe98
Make sure browserforge files are downloaded before tests.
Pijukatel Jan 27, 2025
b3eee4f
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 31, 2025
5c15db1
Review comments
Pijukatel Feb 4, 2025
fa9b0f9
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Feb 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 144 additions & 119 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ keywords = [
python = "^3.9"
apify = { version = ">=2.0.0", optional = true }
beautifulsoup4 = { version = ">=4.12.0", optional = true }
browserforge = { version = "*", optional = true }
colorama = ">=0.4.0"
cookiecutter = ">=2.6.0"
curl-cffi = { version = ">=0.7.2", optional = true }
Expand Down Expand Up @@ -92,10 +93,10 @@ types-psutil = "~5.9.5.20240205"
types-python-dateutil = "~2.9.0.20240316"

[tool.poetry.extras]
all = ["beautifulsoup4", "curl-cffi", "lxml", "html5lib", "parsel", "playwright"]
all = ["beautifulsoup4", "curl-cffi", "lxml", "html5lib", "parsel", "playwright", "browserforge"]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
curl-impersonate = ["curl-cffi"]
playwright = ["playwright"]
playwright = ["playwright", "browserforge"]
parsel = ["parsel"]

[tool.poetry.scripts]
Expand Down
9 changes: 8 additions & 1 deletion src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from types import TracebackType

from crawlee.browsers._base_browser_plugin import BaseBrowserPlugin
from crawlee.fingerprint_suite._fingerprint_generator import FingerprintGenerator
Pijukatel marked this conversation as resolved.
Show resolved Hide resolved
from crawlee.proxy_configuration import ProxyInfo

logger = getLogger(__name__)
Expand Down Expand Up @@ -103,6 +104,7 @@ def with_default_plugin(
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
fingerprint_generator: FingerprintGenerator | None = None,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
Expand All @@ -116,6 +118,8 @@ def with_default_plugin(
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
headless: Whether to run the browser in headless mode.
fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
to generate browser fingerprints together with consistent headers.
kwargs: Additional arguments for default constructor.
"""
plugin_options: dict = defaultdict(dict)
Expand All @@ -128,7 +132,10 @@ def with_default_plugin(
if browser_type:
plugin_options['browser_type'] = browser_type

plugin = PlaywrightBrowserPlugin(**plugin_options)
plugin = PlaywrightBrowserPlugin(
**plugin_options,
fingerprint_generator=fingerprint_generator,
)
return cls(plugins=[plugin], **kwargs)

@property
Expand Down
55 changes: 42 additions & 13 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import BrowserContext, Page, ProxySettings
from typing_extensions import override

Expand All @@ -18,6 +19,7 @@

from playwright.async_api import Browser

from crawlee.fingerprint_suite._fingerprint_generator import FingerprintGenerator
Pijukatel marked this conversation as resolved.
Show resolved Hide resolved
from crawlee.proxy_configuration import ProxyInfo

from logging import getLogger
Expand All @@ -42,6 +44,7 @@ def __init__(
*,
max_open_pages_per_browser: int = 20,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
fingerprint_generator: FingerprintGenerator | None = None,
) -> None:
"""A default constructor.

Expand All @@ -51,10 +54,18 @@ def __init__(
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
disable automatic header modifications.
fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
to generate browser fingerprints together with consistent headers.
"""
if fingerprint_generator and header_generator is not self._DEFAULT_HEADER_GENERATOR:
raise ValueError(
'Do not use `header_generator` and `fingerprint_generator` arguments at the same time. '
'Choose only one. `fingerprint_generator` generates headers as well.'
)
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._header_generator = header_generator
self._fingerprint_generator = fingerprint_generator

self._browser_context: BrowserContext | None = None
self._pages = list[Page]()
Expand Down Expand Up @@ -116,7 +127,10 @@ async def new_page(
ValueError: If the browser has reached the maximum number of open pages.
"""
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
self._browser_context = await self._create_browser_context(
browser_new_context_options=browser_new_context_options,
proxy_info=proxy_info,
)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')
Expand Down Expand Up @@ -154,21 +168,18 @@ def _on_page_close(self, page: Page) -> None:
self._pages.remove(page)

async def _create_browser_context(
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
self,
browser_new_context_options: Mapping[str, Any] | None = None,
proxy_info: ProxyInfo | None = None,
) -> BrowserContext:
"""Create a new browser context with the specified proxy settings."""
if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
user_agent_header = self._header_generator.get_user_agent_header(browser_type=self.browser_type)
extra_http_headers = dict(common_headers | sec_ch_ua_headers | user_agent_header)
else:
extra_http_headers = None
"""Create a new browser context with the specified proxy settings.

Create context with fingerprints and headers using `self._fingerprint_generator` if available.
Create context without fingerprints, but with headers based on `self._header_generator` if available.
Create context without headers and without fingerprints if neither `self._header_generator` nor
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)

if proxy_info:
if browser_new_context_options.get('proxy'):
Expand All @@ -180,4 +191,22 @@ async def _create_browser_context(
password=proxy_info.password,
)

if self._fingerprint_generator:
return await AsyncNewContext(
browser=self._browser, fingerprint=self._fingerprint_generator.generate(), **browser_new_context_options
)

if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
user_agent_header = self._header_generator.get_user_agent_header(browser_type=self.browser_type)
headers = dict(common_headers | sec_ch_ua_headers | user_agent_header)
extra_http_headers = headers
else:
extra_http_headers = None

browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)

return await self._browser.new_context(**browser_new_context_options)
7 changes: 7 additions & 0 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from types import TracebackType

from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite._fingerprint_generator import FingerprintGenerator
Pijukatel marked this conversation as resolved.
Show resolved Hide resolved

logger = getLogger(__name__)

Expand All @@ -43,6 +44,7 @@ def __init__(
browser_launch_options: dict[str, Any] | None = None,
browser_new_context_options: dict[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
fingerprint_generator: FingerprintGenerator | None = None,
) -> None:
"""A default constructor.

Expand All @@ -56,6 +58,8 @@ def __init__(
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
Once reached, a new browser instance will be launched to handle the excess.
fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
to generate browser fingerprints together with consistent headers.
"""
config = service_locator.get_configuration()

Expand All @@ -77,6 +81,8 @@ def __init__(
# Flag to indicate the context state.
self._active = False

self._fingerprint_generator = fingerprint_generator

@property
@override
def active(self) -> bool:
Expand Down Expand Up @@ -154,4 +160,5 @@ async def new_browser(self) -> PlaywrightBrowserController:
return PlaywrightBrowserController(
browser,
max_open_pages_per_browser=self._max_open_pages_per_browser,
fingerprint_generator=self._fingerprint_generator,
)
19 changes: 16 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite import FingerprintGenerator


@docs_group('Classes')
Expand Down Expand Up @@ -70,10 +71,12 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

def __init__(
self,
*,
browser_pool: BrowserPool | None = None,
browser_type: BrowserType | None = None,
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
fingerprint_generator: FingerprintGenerator | None = None,
headless: bool | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
Expand All @@ -91,6 +94,8 @@ def __init__(
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
[Playwright documentation](https://playwright.dev/python/docs/api/class-browser#browser-new-context).
This option should not be used if `browser_pool` is provided.
fingerprint_generator: An optional instance of implementation of `FingerprintGenerator` that is used
to generate browser fingerprints together with consistent headers.
headless: Whether to run the browser in headless mode.
This option should not be used if `browser_pool` is provided.
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
Expand All @@ -99,11 +104,18 @@ def __init__(
# Raise an exception if browser_pool is provided together with other browser-related arguments.
if any(
param is not None
for param in (headless, browser_type, browser_launch_options, browser_new_context_options)
for param in (
headless,
browser_type,
browser_launch_options,
browser_new_context_options,
fingerprint_generator,
)
):
raise ValueError(
'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
'`browser_new_context_options` arguments when `browser_pool` is provided.'
'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
'`browser_new_context_options` or `fingerprint_generator` arguments when `browser_pool` '
'is provided.'
)

# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
Expand All @@ -113,6 +125,7 @@ def __init__(
browser_type=browser_type,
browser_launch_options=browser_launch_options,
browser_new_context_options=browser_new_context_options,
fingerprint_generator=fingerprint_generator,
)

self._browser_pool = browser_pool
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/fingerprint_suite/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
from ._browserforge_adapter import BrowserforgeFingerprintGenerator as DefaultFingerprintGenerator
vdusek marked this conversation as resolved.
Show resolved Hide resolved
from ._fingerprint_generator import FingerprintGenerator
from ._header_generator import HeaderGenerator
from ._types import HeaderGeneratorOptions, ScreenOptions
60 changes: 60 additions & 0 deletions src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

from copy import deepcopy
from typing import TYPE_CHECKING, Any

from browserforge.fingerprints import Fingerprint as bf_Fingerprint
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
from browserforge.fingerprints import Screen
from typing_extensions import override

from ._fingerprint_generator import FingerprintGenerator

if TYPE_CHECKING:
from ._types import HeaderGeneratorOptions, ScreenOptions


class BrowserforgeFingerprintGenerator(FingerprintGenerator):
    """`FingerprintGenerator` implementation that delegates fingerprint generation to `browserforge`.

    The options passed to the constructor are translated once into `browserforge` keyword arguments and are
    reused for every call of `generate`.
    """

    def __init__(
        self,
        *,
        header_options: HeaderGeneratorOptions | None = None,
        screen_options: ScreenOptions | None = None,
        mock_web_rtc: bool | None = None,
        slim: bool | None = None,
    ) -> None:
        """A default constructor.

        All generator options are optional. If any value is not specified, then `None` is set in the options.
        Default values for options set to `None` are implementation detail of used fingerprint generator.
        Specific default values should not be relied upon. Use explicit values if it matters for your use case.

        Args:
            header_options: Collection of header related attributes that can be used by the fingerprint generator.
            screen_options: Defines the screen constraints for the fingerprint generator.
            mock_web_rtc: Whether to mock WebRTC when injecting the fingerprint.
            slim: Disables performance-heavy evasions when injecting the fingerprint.
        """
        bf_options: dict[str, Any] = {'mock_webrtc': mock_web_rtc, 'slim': slim}

        if header_options is None:
            bf_header_options = {}
        else:
            bf_header_options = deepcopy(header_options.model_dump())
            # Rename the plural option keys to the singular names passed on to `browserforge`.
            # A key that is missing from the dump is forwarded as `None`.
            bf_header_options['browser'] = bf_header_options.pop('browsers', None)
            bf_header_options['os'] = bf_header_options.pop('operating_systems', None)
            bf_header_options['device'] = bf_header_options.pop('devices', None)
            bf_header_options['locale'] = bf_header_options.pop('locales', None)

        if screen_options is None:
            bf_options['screen'] = Screen()
        else:
            bf_options['screen'] = Screen(**screen_options.model_dump())

        # Header options are unpacked last, so they take precedence if a key is present in both dictionaries.
        self._options = {**bf_options, **bf_header_options}

    @override
    def generate(self) -> bf_Fingerprint:
        """Generate a new fingerprint based on the options passed to the constructor.

        A new `browserforge` generator instance is created on each call.
        """
        return bf_FingerprintGenerator().generate(**self._options)
19 changes: 19 additions & 0 deletions src/crawlee/fingerprint_suite/_fingerprint_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from browserforge.fingerprints import Fingerprint


class FingerprintGenerator(ABC):
    """Abstract interface for objects that are capable of generating browser fingerprints."""

    @abstractmethod
    def generate(self) -> Fingerprint:
        """Generate a browser fingerprint.

        This is an experimental feature.
        The return type is temporarily set to `Fingerprint` from `browserforge`. This is subject to change and
        most likely it will change to a custom `Fingerprint` class defined in this repo later.
        """
Pijukatel marked this conversation as resolved.
Show resolved Hide resolved
6 changes: 3 additions & 3 deletions src/crawlee/fingerprint_suite/_header_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
)

if TYPE_CHECKING:
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite._types import SupportedBrowserType


@docs_group('Classes')
Expand All @@ -45,7 +45,7 @@ def get_random_user_agent_header(self) -> HttpHeaders:
def get_user_agent_header(
self,
*,
browser_type: BrowserType = 'chromium',
browser_type: SupportedBrowserType = 'chromium',
) -> HttpHeaders:
"""Get the User-Agent header based on the browser type."""
headers = dict[str, str]()
Expand All @@ -67,7 +67,7 @@ def get_user_agent_header(
def get_sec_ch_ua_headers(
self,
*,
browser_type: BrowserType = 'chromium',
browser_type: SupportedBrowserType = 'chromium',
) -> HttpHeaders:
"""Get the Sec-Ch-Ua headers based on the browser type."""
headers = dict[str, str]()
Expand Down
Loading
Loading