-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathlychee.toml
More file actions
59 lines (54 loc) · 3.54 KB
/
lychee.toml
File metadata and controls
59 lines (54 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Base URL or website root directory to check relative URLs.
base_url = "https://www.alchemy.com"
# Output format of the final status report (presumably one of lychee's
# compact/detailed/json/markdown modes — confirm against the lychee docs).
format = "markdown"
# Timeout in seconds for each request
timeout = 10
# Exclude files/paths from being scanned.
# Entries are regexes; literal strings ('...') avoid double-escaping backslashes.
exclude_path = [
    '(?i).*/README\.md$',
]
# Exclude links with these patterns.
# Entries are regexes; literal strings ('...') avoid double-escaping backslashes,
# so the patterns below are byte-identical to their former "\\"-escaped forms.
exclude = [
    '(?i).*\.png$',                           # All PNG files (case insensitive) - relative asset paths won't work
    '(?i).*\.jpe?g$',                         # All JPEG/JPG files (case insensitive) - relative asset paths won't work
    '.*\{.*\}.*',                             # Any path containing curly braces {} - code examples aren't crawlable links
    '.*g\.alchemy\.com.*',                    # Any path containing g.alchemy.com - RPC endpoints require API keys
    '.*\.mdx$',                               # Links to MDX files - these should only be for snippets, not HTML pages
    '^https?://localhost',                    # Localhost URLs are code examples, not real links
    '.*trends\.google\.com.*',                # Google Trends always rate-limits bots
    '.*arb1\.arbitrum\.io/rpc.*',             # RPC endpoint - only responds to POST
    '.*mainnet\.base\.org.*',                 # RPC endpoint - only responds to POST
    '.*mainnet\.optimism\.io.*',              # RPC endpoint - only responds to POST
    '.*sepolia\.arbiscan\.io.*',              # Bot protection blocks crawlers
    '.*holesky\.fraxscan\.com.*',             # Intermittent - blocks crawlers
    '.*rpc\.devnet\.alchemy\.com.*',          # Placeholder URL with API_KEY
    '.*docs\.unichain\.org.*',                # Intermittent timeouts for crawlers
    '.*docs\.google\.com/forms.*',            # Requires Google authentication
    '.*evm\.astar\.network.*',                # RPC endpoint - only responds to POST
    '.*rpc\.zora\.energy.*',                  # RPC endpoint - only responds to POST
    '.*sepolia\.optimism\.io.*',              # RPC endpoint - only responds to POST
    '.*sepolia\.rpc\.zora\.energy.*',         # RPC endpoint - only responds to POST
    '.*medium\..*',                           # Medium frequently blocks bot access with 403 (including subdomains)
    '.*npmjs\.com.*',                         # npm blocks bot access with 403
    '.*computerhope\.com.*',                  # Bot protection blocks crawlers
    '.*dashboard\.alchemy\.com/api.*',        # API endpoints require authentication
    '.*alchemysupercharged\.substack\.com.*', # Substack blocks bot traffic with 403
    '.*bridge\.arbitrum\.io.*',               # Bot protection blocks crawlers with 403
    '.*ethereumbuilders\.gitbooks\.io.*',     # Legacy GitBook URL returns 401 to crawlers
    '.*ethernodes\.org.*',                    # Bot protection blocks crawlers with 403
    '.*theblock\.co.*',                       # Bot protection blocks crawlers with 403
    '.*theblockcrypto\.com.*',                # Bot protection blocks crawlers with 403
    '.*codeburst\.io.*',                      # TLS/certificate validation fails for crawler clients
    '.*temp-mail\.org.*',                     # Bot protection blocks crawlers with 403
    '.*metamask\.zendesk\.com.*',             # Zendesk knowledge base pages can require auth / block crawlers
    '.*openzeppelin\.com.*',                  # Bot protection blocks crawlers
    '.*explorer\.zora\.energy.*',             # SSL cert issue (ERR_SSL_VERSION_OR_CIPHER_MISMATCH)
    '.*support\.brave\.com.*',                # Bot protection blocks crawlers with 403
    '.*developer\.offchainlabs\.com.*',       # Rate-limits crawlers with 429
    '.*openai\.com/index/.*',                 # Returns 404 to bot traffic
]
# Include these patterns even if they match exclude patterns.
# Entries are regexes; literal strings ('...') avoid double-escaping backslashes.
include = [
    '(?i).*cloudinary\.com.*\.png$',   # PNG files from Cloudinary (case insensitive)
    '(?i).*cloudinary\.com.*\.jpe?g$', # JPEG/JPG files from Cloudinary (case insensitive)
]