Skip to content

[Bug]: Unable to pass Cloudflare bot detection #1757

@palash-involvio

Description

@palash-involvio

crawl4ai version

0.8.0

Expected Behavior

Library should extract content from the specified URL.

Current Behavior

Library is unable to extract content from the specified URL.

Is this reproducible?

Yes

Inputs Causing the Bug

Steps to Reproduce

Run the script below to reproduce the issue.

Code snippets

import asyncio
import os
import shutil
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode, UndetectedAdapter
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import (
  FilterChain,
  URLPatternFilter,
  DomainFilter,
)
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def main():
  """Crawl the Thai government home page and save each result as Markdown.

  Removes any existing ``output_data`` directory, builds a run config around
  a BFS deep-crawl strategy and a raw-HTML markdown generator, creates a
  Playwright crawler strategy that uses ``UndetectedAdapter``, and then
  streams the crawl results. For every result the URL and crawl depth are
  printed and the markdown is written to ``output_data/<sanitized-url>.md``.
  """
  # Start from a clean slate: delete the output directory if it exists.
  output_dir = "output_data"
  if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

  raw_md_generator = DefaultMarkdownGenerator(
    content_source="raw_html"
  )

  config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(
      max_depth=0,
      include_external=False,
      max_pages=50,
      filter_chain=FilterChain([
        URLPatternFilter(patterns=[]),
        DomainFilter(allowed_domains=[])
      ])
    ),

    stream=True,
    cache_mode=CacheMode.BYPASS,
    excluded_tags=[],
    exclude_external_links=True,
    exclude_social_media_links=True,
    exclude_external_images=True,
    exclude_all_images=True,
    process_iframes=True,
    remove_overlay_elements=True,
    delay_before_return_html=5,
    check_robots_txt=True,
    markdown_generator=raw_md_generator,
    verbose=True,
  )

  # Create the undetected adapter
  undetected_adapter = UndetectedAdapter()

  # Create browser config
  browser_config = BrowserConfig(
    headless=False,  # Headless mode can be detected more easily
    verbose=True,
    enable_stealth=True,
  )

  # Create the crawler strategy with undetected adapter
  crawler_strategy = AsyncPlaywrightCrawlerStrategy(
    browser_config=browser_config,
    browser_adapter=undetected_adapter
  )

  async with AsyncWebCrawler(
    crawler_strategy=crawler_strategy,
    config=browser_config
  ) as crawler:
    # With stream=True the awaited arun() call is iterated asynchronously,
    # one result at a time.
    async for result in await crawler.arun(
      "https://www.thaigov.go.th/th/home",
      config=config
    ):
      # breakpoint()  # Debugger - inspect result here
      print(f"URL: {result.url}")
      # NOTE(review): metadata is presumably None on some failed results;
      # fall back to an empty dict so reporting the depth cannot raise.
      metadata = result.metadata or {}
      print(f"Depth: {metadata.get('depth', 0)}")

      # Created lazily so the directory only exists once there is a result.
      os.makedirs(output_dir, exist_ok=True)
      output_filename = f"{output_dir}/{_url_to_filename(result.url)}.md"

      # Explicit UTF-8: the page is Thai-language, and the platform default
      # encoding is not guaranteed to be able to represent it.
      with open(output_filename, "w", encoding="utf-8") as file:
        file.write(result.markdown or "")


def _url_to_filename(url):
  """Flatten a URL into a single path component usable as a file stem.

  Every occurrence of ``https://`` and ``http://`` is removed, then each
  ``/``, ``?``, ``&`` and ``=`` is replaced with an underscore.
  """
  stem = url.replace("https://", "").replace("http://", "")
  return stem.translate(str.maketrans("/?&=", "____"))

# Script entry point: run the crawl only when this file is executed directly.
if __name__ == "__main__":
  crawl_coroutine = main()
  asyncio.run(crawl_coroutine)

OS

macOS

Python version

3.13.3

Browser

No response

Browser version

No response

Error logs & Screenshots (if applicable)

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions