# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.yaml and to provide additional ones
# for your custom components.
# Use this file with the parameter -conf when launching your extension of ConfigurableTopology.
# This file does not contain all the keys but only the most frequently used ones. See crawler-default.yaml for an extensive list.
config:
  topology.workers: 2
  topology.message.timeout.secs: 300
  topology.max.spout.pending: 250
  topology.debug: false
  # mandatory when using Flux
  topology.kryo.register:
    - com.digitalpebble.stormcrawler.Metadata
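  # For reference only (not part of the original file): with Flux, the topology
  # is typically launched along these lines; the jar and .flux file names below
  # are illustrative
  #   storm jar target/mycrawler-1.0.jar org.apache.storm.flux.Flux --local crawler.flux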
  fetcher.threads.number: 250
  fetcher.server.delay: 0.2
  # A FetchQueue can also be used by more than one FetchingThread at a time (in which case fetcher.server.min.delay is used),
  # based on the value of fetcher.threads.per.queue.
  fetcher.server.min.delay: 0.2
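  # illustrative only (key not set in the original file): allow two threads
  # to fetch from the same queue in parallel instead of the default of one
  # fetcher.threads.per.queue: 2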
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is larger than fetcher.max.crawl.delay:
  # (if false)
  # skip URLs from this queue so that an overlong
  # crawl-delay does not throttle the crawler
  # (if true)
  # set the delay to fetcher.max.crawl.delay,
  # making the fetcher more aggressive than requested
  fetcher.max.crawl.delay.force: true
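  # illustrative only (key not set in the original file): the cap itself,
  # in seconds; 30 is assumed to be the default
  # fetcher.max.crawl.delay: 30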
  # behavior of fetcher when the crawl-delay in the robots.txt
  # is smaller (possibly less than one second) than the default delay:
  # (if true)
  # use the larger default delay (fetcher.server.delay)
  # and ignore the shorter crawl-delay in the robots.txt
  # (if false)
  # use the delay specified in the robots.txt
  fetcher.server.delay.force: false
  # give 2 GB of heap to each worker
  worker.heap.memory.mb: 2048
  # metadata to transfer to the outlinks
  # used by Fetcher for redirections, sitemapparser, etc...
  # these are also persisted for the parent document (see below)
  # metadata.transfer:
  #  - customMetadataName
  metadata.track.path: false
  metadata.track.depth: true
  # lists the metadata to persist to storage
  # these are not transferred to the outlinks
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed
  http.agent.name: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
  # http.agent.name: "Anonymous Coward"
  # http.agent.version: "1.0"
  # http.agent.description: "built with StormCrawler Archetype 1.10"
  # http.agent.url: "http://someorganization.com/"
  # http.agent.email: "[email protected]"
  # The maximum number of bytes for returned HTTP response bodies.
  # Set to -1 (as here) to disable the limit; a value such as 65536
  # would trim fetched pages to 64 KB.
  http.content.limit: -1
  # ignore all robots.txt rules
  http.skip.robots: true
  # FetcherBolt queue dump => uncomment to activate
  # if a file exists on the worker machine with the corresponding port number,
  # the FetcherBolt will log the content of its internal queues to the logs
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  sitemap.discovery: false
  sitemap.sniffContent: false
  detect.charset.maxlength: 2048
  # never refetch a successfully fetched page (value in minutes; e.g. 1440 to revisit daily)
  fetchInterval.default: -1
  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120
  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1
  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true
  # fetchInterval.isFeed=true: 10
  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
    - host=host
    - domain=domain
  # Metrics consumers:
  topology.metrics.consumer.register:
    - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.IndexPerDayMetricsConsumer"
      parallelism.hint: 1
    #- class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
    #  parallelism.hint: 1