diff --git a/README.rst b/README.rst index 91b8855..f5c57f1 100644 --- a/README.rst +++ b/README.rst @@ -162,6 +162,20 @@ Alternatively, you can use regular scrapy.Request and } }) +It is also possible to configure Splash for all requests in a Spider by default +using a ``splash`` spider attribute:: + + class MySpider(Spider): + name = 'myspider' + splash = { + # … + } + +If you use a ``splash`` spider attribute, you can still override those Splash +settings for specific requests using the ``splash`` request meta key, or +disable Splash completely by setting the ``dont_splash`` request meta key to +``True``. + Use ``request.meta['splash']`` API in middlewares or when scrapy.Request subclasses are used (there is also ``SplashFormRequest`` described below). For example, ``meta['splash']`` allows to create a middleware which enables diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 24ab23a..70a03b2 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -257,8 +257,16 @@ def _argument_values(self): def _remote_keys(self): return self.crawler.spider.state[self.remote_keys_key] + def _get_splash_options(self, request, spider): + if request.meta.get("dont_splash") is True: + return + spider_options = getattr(spider, "splash", {}) + request_options = request.meta.get("splash") + return request_options or spider_options + def process_request(self, request, spider): - if 'splash' not in request.meta: + splash_options = self._get_splash_options(request, spider) + if not splash_options: return if request.method not in {'GET', 'POST'}: @@ -274,7 +282,6 @@ def process_request(self, request, spider): # don't process the same request more than once return - splash_options = request.meta['splash'] request.meta['_splash_processed'] = True slot_policy = splash_options.get('slot_policy', self.slot_policy) @@ -368,7 +375,7 @@ def process_response(self, request, response, spider): if not request.meta.get("_splash_processed"): return
response - splash_options = request.meta['splash'] + splash_options = self._get_splash_options(request, spider) if not splash_options: return response diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 66b79ce..12e53e8 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + from __future__ import absolute_import import copy import json @@ -765,3 +766,58 @@ def test_adjust_timeout(): }) req2 = mw.process_request(req2, None) assert req2.meta['download_timeout'] == 30 + + +def test_spider_attribute(): + req_url = "http://scrapy.org" + spider = scrapy.Spider("example") + mw = _get_mw() + + req1 = scrapy.Request(req_url) + spider.splash = {"args": {"images": 0}} + + mw = _get_mw() + req2 = mw.process_request(req1, spider) + assert "_splash_processed" in req2.meta + assert "render.json" in req2.url + request_data = json.loads(req2.body.decode('utf8')) + assert "url" in request_data + assert request_data.get("url") == req_url + assert "images" in request_data + assert req2.method == 'POST' + + response = Response(req_url, request=req2) + response2 = mw.process_response(req2, response, spider) + assert response2 is not response + + +def test_spider_attribute_dont_splash(): + req_url = "http://scrapy.org" + spider = scrapy.Spider("example") + mw = _get_mw() + + req1 = scrapy.Request(req_url, meta={'dont_splash': True}) + spider.splash = {"args": {"images": 0}} + + req2 = mw.process_request(req1, spider) + assert req2 is None + + response = Response(req_url, request=req1) + response2 = mw.process_response(req1, response, spider) + assert response2 is response + + +def test_spider_attribute_blank(): + req_url = "http://scrapy.org" + spider = scrapy.Spider("example") + mw = _get_mw() + + req1 = scrapy.Request(req_url) + spider.splash = {} + + req2 = mw.process_request(req1, spider) + assert req2 is None + + response = Response(req_url, request=req1) + response2 = mw.process_response(req1, response, 
spider) + assert response2 is response