Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Integrate browserforge fingerprints #829

Merged
merged 33 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b0d52f2
Draft of integration of browserforge fingerprint generation.
Pijukatel Dec 16, 2024
be15847
Works with page.evaluate.
Pijukatel Dec 17, 2024
a9415ec
Use add_init_script
Pijukatel Dec 17, 2024
36727a1
WIP
Pijukatel Dec 18, 2024
42eff80
Fix format, type check and tests.
Pijukatel Dec 18, 2024
998cbb6
Fix root cause for flakiness in fingerprint generation
Pijukatel Dec 18, 2024
e1025c8
Use browserforge.injector code for fingerprints
Pijukatel Dec 19, 2024
33fdd6e
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Dec 19, 2024
85ba877
Regenerate poetry lock after merge
Pijukatel Dec 19, 2024
6e35c1d
Remove unintentional change to headless test
Pijukatel Dec 19, 2024
3f96456
Merge branch 'master' into integrate-browserforge-fingerprints
Pijukatel Jan 3, 2025
ddfabea
chore: revert React version bump
barjin Jan 3, 2025
3d37bca
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 10, 2025
1b8e6a3
Add ScreenFingerprint and NavigatorFingerprint
Pijukatel Jan 10, 2025
9828a36
Add Fingerprint and their options types
Pijukatel Jan 13, 2025
f733c07
Add adapter tests
Pijukatel Jan 13, 2025
97011d9
Integrate into pw_crawler
Pijukatel Jan 13, 2025
debe900
Further integration into our code.
Pijukatel Jan 14, 2025
3d8340c
Finalize draft.
Pijukatel Jan 14, 2025
3d9b170
Set fingerprint generator as top level argument to pw crawler
Pijukatel Jan 14, 2025
25aa4e2
Revert unnecessary change to function doc string.
Pijukatel Jan 14, 2025
5e46b78
Make test adapter-generic.
Pijukatel Jan 14, 2025
69b6974
Add types to __init__ if fingerprint_suite
Pijukatel Jan 14, 2025
27479be
Remove FingerprintGeneratorOptions
Pijukatel Jan 20, 2025
751f67c
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 23, 2025
1cbadb0
Review comments
Pijukatel Jan 23, 2025
8e44acd
Handle inconsistent result from browserforge fingerprint generator
Pijukatel Jan 24, 2025
d8001e7
Apply suggestions from code review
Pijukatel Jan 27, 2025
07acbfa
Docs
Pijukatel Jan 27, 2025
866fe98
Make sure browserforge files are downloaded before tests.
Pijukatel Jan 27, 2025
b3eee4f
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Jan 31, 2025
5c15db1
Review comments
Pijukatel Feb 4, 2025
fa9b0f9
Merge remote-tracking branch 'origin/master' into integrate-browserfo…
Pijukatel Feb 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ keywords = [
python = "^3.9"
apify = { version = ">=2.0.0", optional = true }
beautifulsoup4 = { version = ">=4.12.0", optional = true }
browserforge = { version = "*", optional = true }
colorama = ">=0.4.0"
cookiecutter = ">=2.6.0"
curl-cffi = { version = ">=0.7.2", optional = true }
Expand Down Expand Up @@ -94,10 +95,10 @@ types-psutil = "~5.9.5.20240205"
types-python-dateutil = "~2.9.0.20240316"

[tool.poetry.extras]
all = ["beautifulsoup4", "curl-cffi", "lxml", "html5lib", "parsel", "playwright"]
all = ["beautifulsoup4", "curl-cffi", "lxml", "html5lib", "parsel", "playwright", "browserforge"]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
curl-impersonate = ["curl-cffi"]
playwright = ["playwright"]
playwright = ["playwright", "browserforge"]
parsel = ["parsel"]

[tool.poetry.scripts]
Expand Down
17 changes: 16 additions & 1 deletion src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def __init__(
browser_inactive_threshold: timedelta = timedelta(seconds=10),
identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
use_fingerprints: bool = True,
fingerprint_generator_options: dict[str, Any] | None = None,
) -> None:
"""A default constructor.

Expand All @@ -66,6 +68,8 @@ def __init__(
close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
and closes them. The browser is considered as inactive if it has no active pages and has been idle
for the specified period.
use_fingerprints: Inject generated fingerprints to page.
fingerprint_generator_options: Override generated fingerprints with these specific values, if possible.
"""
self._plugins = plugins or [PlaywrightBrowserPlugin()]
self._operation_timeout = operation_timeout
Expand Down Expand Up @@ -95,6 +99,9 @@ def __init__(
# Flag to indicate the context state.
self._active = False

self._use_fingerprints = use_fingerprints
self._fingerprint_generator_options = fingerprint_generator_options

@classmethod
def with_default_plugin(
cls,
Expand All @@ -103,6 +110,8 @@ def with_default_plugin(
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
headless: bool | None = None,
use_fingerprints: bool = False,
fingerprint_generator_options: dict[str, Any] | None = None,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
Expand All @@ -116,6 +125,8 @@ def with_default_plugin(
Playwright's `browser_context.new_page` method. For more details, refer to the Playwright documentation:
https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page.
headless: Whether to run the browser in headless mode.
use_fingerprints: Inject generated fingerprints to page.
fingerprint_generator_options: Override generated fingerprints with these specific values, if possible.
kwargs: Additional arguments for default constructor.
"""
plugin_options: dict = defaultdict(dict)
Expand All @@ -128,7 +139,11 @@ def with_default_plugin(
if browser_type:
plugin_options['browser_type'] = browser_type

plugin = PlaywrightBrowserPlugin(**plugin_options)
plugin = PlaywrightBrowserPlugin(
**plugin_options,
use_fingerprints=use_fingerprints,
fingerprint_generator_options=fingerprint_generator_options,
)
return cls(plugins=[plugin], **kwargs)

@property
Expand Down
83 changes: 74 additions & 9 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

from browserforge.fingerprints import Fingerprint, FingerprintGenerator
from playwright.async_api import BrowserContext, Page, ProxySettings
from typing_extensions import override

from crawlee._utils.docs import docs_group
from crawlee.browsers._base_browser_controller import BaseBrowserController
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.fingerprint_suite._injected_page_function import create_init_script_with_fingerprint

if TYPE_CHECKING:
from collections.abc import Mapping
Expand All @@ -38,6 +40,8 @@ def __init__(
*,
max_open_pages_per_browser: int = 20,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
use_fingerprints: bool = True,
fingerprint_generator_options: dict[str, Any] | None = None,
) -> None:
"""A default constructor.

Expand All @@ -47,15 +51,22 @@ def __init__(
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
disable automatic header modifications.
use_fingerprints: Inject generated fingerprints to page.
fingerprint_generator_options: Override generated fingerprints with these specific values, if possible.
"""
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._header_generator = header_generator

self._fingerprint_generator = FingerprintGenerator(**(fingerprint_generator_options or {}))

self._browser_context: BrowserContext | None = None
self._pages = list[Page]()
self._last_page_opened_at = datetime.now(timezone.utc)

self._use_fingerprints = use_fingerprints
self._fingerprint: Fingerprint | None = None

@property
@override
def pages(self) -> list[Page]:
Expand Down Expand Up @@ -98,13 +109,14 @@ async def new_page(
proxy_info: ProxyInfo | None = None,
) -> Page:
if not self._browser_context:
self._browser_context = await self._create_browser_context(proxy_info)
await self._set_fingerprint()
await self._set_browser_context(fingerprint=self._fingerprint)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')

page_options = dict(page_options) if page_options else {}
page = await self._browser_context.new_page(**page_options)
page = await self._get_browser_context().new_page(**page_options)

# Handle page close event
page.on(event='close', f=self._on_page_close)
Expand All @@ -113,8 +125,62 @@ async def new_page(
self._pages.append(page)
self._last_page_opened_at = datetime.now(timezone.utc)

# Inject fingerprint
if self._use_fingerprints:
await self._inject_fingerprint_to_page(page)

return page

async def _set_browser_context(
    self, proxy_info: ProxyInfo | None = None, fingerprint: Fingerprint | None = None
) -> None:
    """Create the browser context and store it on the controller.

    The headers handed to the new context are chosen in this order:

    1. The headers of `fingerprint`, when a fingerprint is given, so that the HTTP headers stay consistent
       with the fingerprint.
    2. Headers assembled from the header generator, when one is configured.
    3. No explicit headers at all.

    Args:
        proxy_info: Proxy settings forwarded to `_create_browser_context`.
        fingerprint: Fingerprint whose headers should be used for the context, if any.
    """
    if fingerprint:
        context_headers = fingerprint.headers
    elif not self._header_generator:
        context_headers = None
    else:
        generator = self._header_generator
        current_browser = self.browser_type
        # Keep the merge order: later header groups override earlier ones on key collisions.
        common = generator.get_common_headers()
        sec_ch_ua = generator.get_sec_ch_ua_headers(browser_type=current_browser)
        user_agent = generator.get_user_agent_header(browser_type=current_browser)
        context_headers = dict(common | sec_ch_ua | user_agent)

    new_context = await self._create_browser_context(proxy_info, context_headers)
    self._browser_context = new_context

def _get_browser_context(self) -> BrowserContext:
    """Return the stored browser context.

    Raises:
        RuntimeError: If no browser context has been stored on the controller yet.
    """
    context = self._browser_context
    if context:
        return context
    raise RuntimeError('Browser context was not set yet.')

async def _set_fingerprint(self) -> None:
    """Generate a fingerprint for this controller and cache it in `self._fingerprint`.

    Does nothing when fingerprints are disabled or when a fingerprint is already cached. Otherwise candidates
    are requested from the fingerprint generator until one passes `_is_good_fingerprint`. A falsy result from
    the generator stops the search and is stored as-is.

    Raises:
        RuntimeError: If no acceptable fingerprint is produced within the attempt limit.
    """
    if not self._use_fingerprints or self._fingerprint:
        return

    # Bound the retries. If every candidate is rejected (for example because of restrictive generator
    # options), an unbounded loop would never finish, and since it contains no await it would also block
    # the event loop.
    max_attempts = 100
    for _ in range(max_attempts):
        candidate = self._fingerprint_generator.generate()
        if not candidate or self._is_good_fingerprint(candidate):
            self._fingerprint = candidate
            return

    raise RuntimeError(f'Failed to generate a usable fingerprint in {max_attempts} attempts.')

@staticmethod
def _is_good_fingerprint(fingerprint: Fingerprint) -> bool:
    """Tell whether a generated fingerprint is usable.

    Some generated fingerprints were found, by trial and error, not to work well. Every one of those had the
    header 'Te' set to 'trailers', so such fingerprints are rejected.
    """
    te_header = fingerprint.headers.get('Te', '')
    return te_header != 'trailers'

def _get_fingerprint(self) -> Fingerprint:
    """Return the cached fingerprint.

    Raises:
        RuntimeError: If fingerprints are disabled (`use_fingerprints` is False), or if no fingerprint has
            been stored on the controller yet.
    """
    if not self._use_fingerprints:
        raise RuntimeError('Fingerprint is not allowed. use_fingerprints = False.')
    if not self._fingerprint:
        raise RuntimeError('Fingerprint was not set yet.')
    return self._fingerprint

async def _inject_fingerprint_to_page(self, page: Page) -> None:
    """Add an init script to `page` that is built from the controller's serialized fingerprint.

    Raises:
        RuntimeError: Propagated from `_get_fingerprint` when no fingerprint is available.
    """
    serialized_fingerprint = self._get_fingerprint().dumps()
    init_script = create_init_script_with_fingerprint(serialized_fingerprint)
    await page.add_init_script(init_script)

@override
Pijukatel marked this conversation as resolved.
Show resolved Hide resolved
async def close(self, *, force: bool = False) -> None:
if force:
Expand All @@ -130,14 +196,13 @@ def _on_page_close(self, page: Page) -> None:
"""Handle actions after a page is closed."""
self._pages.remove(page)

async def _create_browser_context(self, proxy_info: ProxyInfo | None = None) -> BrowserContext:
async def _create_browser_context(
self, proxy_info: ProxyInfo | None = None, headers: dict[str, str] | None = None
) -> BrowserContext:
"""Create a new browser context with the specified proxy settings."""
if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
user_agent_header = self._header_generator.get_user_agent_header(browser_type=self.browser_type)
extra_http_headers = dict(common_headers | sec_ch_ua_headers | user_agent_header)
user_agent = user_agent_header.get('User-Agent')
if headers:
extra_http_headers = headers
user_agent = headers.get('User-Agent')
else:
extra_http_headers = None
user_agent = None
Expand Down
10 changes: 10 additions & 0 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
use_fingerprints: bool = True,
fingerprint_generator_options: dict[str, Any] | None = None,
) -> None:
"""A default constructor.

Expand All @@ -51,6 +53,9 @@ def __init__(
https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page.
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
Once reached, a new browser instance will be launched to handle the excess.
use_fingerprints: Inject generated fingerprints to page.
fingerprint_generator_options: Override generated fingerprints with these specific values, if possible.

"""
self._browser_type = browser_type
self._browser_options = browser_options or {}
Expand All @@ -63,6 +68,9 @@ def __init__(
# Flag to indicate the context state.
self._active = False

self._use_fingerprints = use_fingerprints
self._fingerprint_generator_options = fingerprint_generator_options

@property
@override
def active(self) -> bool:
Expand Down Expand Up @@ -128,4 +136,6 @@ async def new_browser(self) -> PlaywrightBrowserController:
return PlaywrightBrowserController(
browser,
max_open_pages_per_browser=self._max_open_pages_per_browser,
use_fingerprints=self._use_fingerprints,
fingerprint_generator_options=self._fingerprint_generator_options,
)
Loading
Loading