-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathlychee.toml
More file actions
59 lines (54 loc) · 3.54 KB
/
lychee.toml
File metadata and controls
59 lines (54 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Base URL or website root directory to check relative URLs.
base_url = "https://www.alchemy.com"
# Output format of the final status report (presumably one of lychee's
# compact/detailed/json/markdown modes — confirm against the lychee docs).
format = "markdown"
# Timeout in seconds for each request
timeout = 10
# Exclude files/paths from being scanned.
# Entries are regexes; literal strings ('...') avoid double-escaping backslashes.
exclude_path = [
    '(?i).*/README\.md$',
]
# Exclude links with these patterns.
# Entries are regexes; literal strings ('...') avoid double-escaping backslashes,
# so the patterns below are byte-identical to their former "\\"-escaped forms.
exclude = [
    '(?i).*\.png$',                           # All PNG files (case insensitive) - relative asset paths won't work
    '(?i).*\.jpe?g$',                         # All JPEG/JPG files (case insensitive) - relative asset paths won't work
    '.*\{.*\}.*',                             # Any path containing curly braces {} - code examples aren't crawlable links
    '.*g\.alchemy\.com.*',                    # Any path containing g.alchemy.com - RPC endpoints require API keys
    '.*\.mdx$',                               # Links to MDX files - these should only be for snippets, not HTML pages
    '^https?://localhost',                    # Localhost URLs are code examples, not real links
    '.*trends\.google\.com.*',                # Google Trends always rate-limits bots
    '.*arb1\.arbitrum\.io/rpc.*',             # RPC endpoint - only responds to POST
    '.*mainnet\.base\.org.*',                 # RPC endpoint - only responds to POST
    '.*mainnet\.optimism\.io.*',              # RPC endpoint - only responds to POST
    '.*sepolia\.arbiscan\.io.*',              # Bot protection blocks crawlers
    '.*holesky\.fraxscan\.com.*',             # Intermittent - blocks crawlers
    '.*rpc\.devnet\.alchemy\.com.*',          # Placeholder URL with API_KEY
    '.*docs\.unichain\.org.*',                # Intermittent timeouts for crawlers
    '.*docs\.google\.com/forms.*',            # Requires Google authentication
    '.*evm\.astar\.network.*',                # RPC endpoint - only responds to POST
    '.*rpc\.zora\.energy.*',                  # RPC endpoint - only responds to POST
    '.*sepolia\.optimism\.io.*',              # RPC endpoint - only responds to POST
    '.*sepolia\.rpc\.zora\.energy.*',         # RPC endpoint - only responds to POST
    '.*medium\..*',                           # Medium frequently blocks bot access with 403 (including subdomains)
    '.*npmjs\.com.*',                         # npm blocks bot access with 403
    '.*computerhope\.com.*',                  # Bot protection blocks crawlers
    '.*dashboard\.alchemy\.com/api.*',        # API endpoints require authentication
    '.*alchemysupercharged\.substack\.com.*', # Substack blocks bot traffic with 403
    '.*bridge\.arbitrum\.io.*',               # Bot protection blocks crawlers with 403
    '.*ethereumbuilders\.gitbooks\.io.*',     # Legacy GitBook URL returns 401 to crawlers
    '.*ethernodes\.org.*',                    # Bot protection blocks crawlers with 403
    '.*theblock\.co.*',                       # Bot protection blocks crawlers with 403
    '.*theblockcrypto\.com.*',                # Bot protection blocks crawlers with 403
    '.*codeburst\.io.*',                      # TLS/certificate validation fails for crawler clients
    '.*temp-mail\.org.*',                     # Bot protection blocks crawlers with 403
    '.*metamask\.zendesk\.com.*',             # Zendesk knowledge base pages can require auth / block crawlers
    '.*openzeppelin\.com.*',                  # Bot protection blocks crawlers
    '.*explorer\.zora\.energy.*',             # SSL cert issue (ERR_SSL_VERSION_OR_CIPHER_MISMATCH)
    '.*support\.brave\.com.*',                # Bot protection blocks crawlers with 403
    '.*developer\.offchainlabs\.com.*',       # Rate-limits crawlers with 429
    '.*openai\.com/index/.*',                 # Returns 404 to bot traffic
]
# Include these patterns even if they match exclude patterns.
# Entries are regexes; literal strings ('...') avoid double-escaping backslashes.
include = [
    '(?i).*cloudinary\.com.*\.png$',   # PNG files from Cloudinary (case insensitive)
    '(?i).*cloudinary\.com.*\.jpe?g$', # JPEG/JPG files from Cloudinary (case insensitive)
]