Skip to content

Allow spider attr #235

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
14 changes: 14 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,20 @@ Alternatively, you can use regular scrapy.Request and
}
})

It is also possible to configure Splash for all requests in a Spider by default
using a ``splash`` spider attribute::

class MySpider(Spider):
name = 'myspider'
splash = {
# …
}

If you use a ``splash`` spider attribute, you can still override those Splash
settings for specific requests using the ``splash`` request meta key, or
disable Splash completely by setting the ``dont_splash`` request meta key to
``True``.

Use ``request.meta['splash']`` API in middlewares or when scrapy.Request
subclasses are used (there is also ``SplashFormRequest`` described below).
For example, ``meta['splash']`` makes it possible to create a middleware which enables
Expand Down
13 changes: 10 additions & 3 deletions scrapy_splash/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,16 @@ def _argument_values(self):
def _remote_keys(self):
return self.crawler.spider.state[self.remote_keys_key]

def _get_splash_options(self, request, spider):
if request.meta.get("dont_splash") is True:
return
spider_options = getattr(spider, "splash", {})
request_options = request.meta.get("splash")
return request_options or spider_options

def process_request(self, request, spider):
if 'splash' not in request.meta:
splash_options = self._get_splash_options(request, spider)
if not splash_options:
return

if request.method not in {'GET', 'POST'}:
Expand All @@ -274,7 +282,6 @@ def process_request(self, request, spider):
# don't process the same request more than once
return

splash_options = request.meta['splash']
request.meta['_splash_processed'] = True

slot_policy = splash_options.get('slot_policy', self.slot_policy)
Expand Down Expand Up @@ -368,7 +375,7 @@ def process_response(self, request, response, spider):
if not request.meta.get("_splash_processed"):
return response

splash_options = request.meta['splash']
splash_options = self._get_splash_options(request, spider)
if not splash_options:
return response

Expand Down
56 changes: 56 additions & 0 deletions tests/test_middleware.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
import copy
import json
Expand Down Expand Up @@ -765,3 +766,58 @@ def test_adjust_timeout():
})
req2 = mw.process_request(req2, None)
assert req2.meta['download_timeout'] == 30


def test_spider_attribute():
    """A spider-level ``splash`` attribute should route plain requests through Splash.

    Fix: the original created the middleware twice (the first ``_get_mw()``
    result was discarded); a single instance is created now.
    """
    req_url = "http://scrapy.org"
    spider = scrapy.Spider("example")
    spider.splash = {"args": {"images": 0}}
    mw = _get_mw()

    req = scrapy.Request(req_url)
    req2 = mw.process_request(req, spider)

    # The request must be rewritten to hit the Splash render endpoint.
    assert "_splash_processed" in req2.meta
    assert "render.json" in req2.url
    assert req2.method == 'POST'

    # Spider-level args ("images") must be forwarded in the POST body.
    request_data = json.loads(req2.body.decode('utf8'))
    assert request_data.get("url") == req_url
    assert "images" in request_data

    # The response must be wrapped (not returned unchanged).
    response = Response(req_url, request=req2)
    response2 = mw.process_response(req2, response, spider)
    assert response2 is not response


def test_spider_attribute_dont_splash():
    """``meta['dont_splash']`` must override the spider-wide ``splash`` attribute."""
    url = "http://scrapy.org"
    spider = scrapy.Spider("example")
    spider.splash = {"args": {"images": 0}}
    mw = _get_mw()

    # Opt-out request: middleware must not touch it at all.
    request = scrapy.Request(url, meta={'dont_splash': True})
    assert mw.process_request(request, spider) is None

    # Response passes through unwrapped as well.
    response = Response(url, request=request)
    assert mw.process_response(request, response, spider) is response


def test_spider_attribute_blank():
    """An empty spider ``splash`` dict must leave requests and responses untouched."""
    url = "http://scrapy.org"
    spider = scrapy.Spider("example")
    spider.splash = {}
    mw = _get_mw()

    # With no effective Splash options the request is not processed.
    request = scrapy.Request(url)
    assert mw.process_request(request, spider) is None

    # And the response is returned as-is.
    response = Response(url, request=request)
    assert mw.process_response(request, response, spider) is response