# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.yaml and to provide additional ones
# for your custom components.
# Use this file with the parameter -conf when launching your extension of ConfigurableTopology.
# This file does not contain all the keys but only the most frequently used ones. See crawler-default.yaml for an extensive list.
config:
  topology.workers: 2
  topology.message.timeout.secs: 300
  topology.max.spout.pending: 250
  topology.debug: false
  # mandatory when using Flux
  topology.kryo.register:
    - com.digitalpebble.stormcrawler.Metadata
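  # For reference only (not part of the original file): with Flux, the topology
  # is typically launched along these lines; the jar and .flux file names below
  # are illustrative
  #   storm jar target/mycrawler-1.0.jar org.apache.storm.flux.Flux --local crawler.flux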
  fetcher.threads.number: 250
  fetcher.server.delay: 0.2
  # A FetchQueue can also be used by more than one FetchingThread at a time (in which case fetcher.server.min.delay is used),
  # based on the value of fetcher.threads.per.queue.
  fetcher.server.min.delay: 0.2
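  # illustrative only (key not set in the original file): allow two threads
  # to fetch from the same queue in parallel instead of the default of one
  # fetcher.threads.per.queue: 2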
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is larger than fetcher.max.crawl.delay:
  # (if false)
  # skip URLs from this queue so that an overlong
  # crawl-delay does not throttle the crawler
  # (if true)
  # set the delay to fetcher.max.crawl.delay,
  # making the fetcher more aggressive than requested
  fetcher.max.crawl.delay.force: true
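  # illustrative only (key not set in the original file): the cap itself,
  # in seconds; 30 is assumed to be the default
  # fetcher.max.crawl.delay: 30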
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is smaller (possibly less than one second) than the default delay:
  # (if true)
  # use the larger default delay (fetcher.server.delay)
  # and ignore the shorter crawl-delay in the robots.txt
  # (if false)
  # use the delay specified in the robots.txt
  fetcher.server.delay.force: false
  # give 2 GB of heap to each worker
  worker.heap.memory.mb: 2048
  # metadata to transfer to the outlinks
  # used by Fetcher for redirections, sitemapparser, etc...
  # these are also persisted for the parent document (see below)
  # metadata.transfer:
  #  - customMetadataName
  metadata.track.path: false
  metadata.track.depth: true
  # lists the metadata to persist to storage
  # these are not transferred to the outlinks
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed
  http.agent.name: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
  # http.agent.name: "Anonymous Coward"
  # http.agent.version: "1.0"
  # http.agent.description: "built with StormCrawler Archetype 1.10"
  # http.agent.url: "http://someorganization.com/"
  # http.agent.email: "[email protected]"
  # The maximum number of bytes for returned HTTP response bodies.
  # Set to -1 (as here) to disable the limit; a value such as 65536
  # would trim fetched pages to 64 KB.
  http.content.limit: -1
  # ignore all robots.txt rules
  http.skip.robots: true
  # FetcherBolt queue dump => uncomment to activate
  # if a file exists on the worker machine with the corresponding port number,
  # the FetcherBolt will log the content of its internal queues to the logs
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  sitemap.discovery: false
  sitemap.sniffContent: false
  detect.charset.maxlength: 2048
  # never refetch a successfully fetched page (value in minutes; e.g. 1440 to revisit daily)
  fetchInterval.default: -1
  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120
  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1
  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true
  # fetchInterval.isFeed=true: 10
  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
    - host=host
    - domain=domain
  # Metrics consumers:
  topology.metrics.consumer.register:
    - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.IndexPerDayMetricsConsumer"
      parallelism.hint: 1
    #- class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
    #  parallelism.hint: 1