From 7a82e3d875e7c0137a2270189c98124104fcd243 Mon Sep 17 00:00:00 2001
From: binux
Date: Fri, 31 Oct 2014 20:25:37 +0800
Subject: [PATCH] add phantomjs to run.py

add logs for phantomjs_fetcher
update README
fix timeout error message for tornado fetcher
proxy to phantomjs fetcher
runtest exits with the right exit code when a test fails
---
 Dockerfile                   |  2 +-
 README.md                    | 10 +++++++++-
 fetcher/phantomjs_fetcher.js | 38 +++++++++++++++++++++---------------
 fetcher/tornado_fetcher.py   | 28 +++++++++++++++++---------
 libs/base_handler.py         |  5 +++++
 run.py                       | 20 ++++++++++++++++++++
 runtest.py                   |  5 ++++-
 7 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 31e4fcaa2..3854b65ca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,7 @@ MAINTAINER binux
 # install python
 RUN apt-get update && \
     apt-get install -y python python-dev python-distribute python-pip && \
-    apt-get install -y libcurl4-openssl-dev libxml2-dev libxslt1-dev
+    apt-get install -y libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml
 
 # install requirements
 ADD requirements.txt /opt/pyspider/requirements.txt
diff --git a/README.md b/README.md
index 19a057e48..0c7f052fa 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,15 @@
 pyspider [![Build Status](https://travis-ci.org/binux/pyspider.png?branch=master)](https://travis-ci.org/binux/pyspider)
 ========
 
-a spider in python! [Try It Now!](http://demo.pyspider.org/)
+A spider system in python. [Try It Now!](http://demo.pyspider.org/)
+
+- Write scripts in python
+- Web script editor, debugger, task monitor, project manager and result viewer
+- Distributed architecture
+- MySQL, MongoDB and SQLite as database backend
+- Full control of crawl process with powerful API
+- Javascript pages supported! (with phantomjs fetcher)
+
 
 ![debug demo](http://f.binux.me/debug_demo.png)
 demo code: [gist:9424801](https://gist.github.com/binux/9424801)
diff --git a/fetcher/phantomjs_fetcher.js b/fetcher/phantomjs_fetcher.js
index cc6266e06..c7e12b5b8 100644
--- a/fetcher/phantomjs_fetcher.js
+++ b/fetcher/phantomjs_fetcher.js
@@ -4,6 +4,7 @@
 // Created on 2014-10-29 22:12:14
 
 var port, server, service,
+    wait_before_end = 300,
     system = require('system'),
     webpage = require('webpage');
 
@@ -26,7 +27,7 @@ if (system.args.length !== 2) {
     }
 
     var fetch = JSON.parse(request.postRaw);
-    console.log(JSON.stringify(fetch, null, 2));
+    console.debug(JSON.stringify(fetch, null, 2));
 
     // create and set page
     var page = webpage.create();
@@ -55,31 +56,37 @@ if (system.args.length !== 2) {
     };
     page.onLoadFinished = function(status) {
       page_loaded = true;
-      if (status !== "success") {
-        return;
-      }
       if (fetch.js_script && fetch.js_run_at !== "document-start") {
         page.evaluateJavaScript(fetch.js_script);
       }
-      end_time = Date.now() + 300;
-      setTimeout(make_result, 310, page);
+      console.debug("waiting "+wait_before_end+"ms before finished.");
+      end_time = Date.now() + wait_before_end;
+      setTimeout(make_result, wait_before_end+10, page);
     };
-    page.onResourceRequested = function() {
+    page.onResourceRequested = function(request) {
+      console.debug("Starting request: #"+request.id+" ["+request.method+"]"+request.url);
       end_time = null;
     };
     page.onResourceReceived = function(response) {
+      console.debug("Request finished: #"+response.id+" ["+response.statusText+"]"+response.url+" "+response.time+"ms");
       if (first_response === null) {
         first_response = response;
       }
       if (page_loaded) {
-        end_time = Date.now() + 300;
-        setTimeout(make_result, 310, page);
+        console.debug("waiting "+wait_before_end+"ms before finished.");
finished."); + end_time = Date.now() + wait_before_end; + setTimeout(make_result, wait_before_end+10, page); } } - page.onResourceError=page.onResourceTimeout=function() { + page.onResourceError=page.onResourceTimeout=function(response) { + console.info("Request error: #"+response.id+" ["+response.errorCode+"="+response.errorString+"]"+response.url); + if (first_response === null) { + first_response = response; + } if (page_loaded) { - end_time = Date.now() + 300; - setTimeout(make_result, 310, page); + console.debug("waiting "+wait_before_end+"ms before finished."); + end_time = Date.now() + wait_before_end; + setTimeout(make_result, wait_before_end+10, page); } } @@ -106,14 +113,15 @@ if (system.args.length !== 2) { var result = { orig_url: fetch.url, - content: page.content, - headers: first_response.headers, - status_code: first_response.status, + content: first_response.errorString || page.content, + headers: first_response.headers || {}, + status_code: first_response.status || 599, url: page.url, cookies: cookies, time: (end_time - start_time) / 1000, save: fetch.save } + console.log("["+result.status_code+"] "+result.orig_url+" "+result.time) var body = JSON.stringify(result, null, 2); response.statusCode = 200; diff --git a/fetcher/tornado_fetcher.py b/fetcher/tornado_fetcher.py index 5acad8802..f865d45a2 100644 --- a/fetcher/tornado_fetcher.py +++ b/fetcher/tornado_fetcher.py @@ -85,7 +85,7 @@ def fetch(self, task, callback=None): callback = self.send_result if url.startswith('data:'): return self.data_fetch(url, task, callback) - elif task.get('fetch', {}).get('fetch_type') == 'phantomjs': + elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'): return self.phantomjs_fetch(url, task, callback) else: return self.http_fetch(url, task, callback) @@ -283,20 +283,30 @@ def phantomjs_fetch(self, url, task, callback): start_time = time.time() def handle_response(response): - try: - return task, json.loads(response.body) - except Exception as e: + if not response: result = { 'status_code': 599, - 'content': "%r" % e, + 'content': "timeout error", 'time': time.time() - start_time, 'orig_url': url, 'url': url, } - logger.exception("[599] %s, %r %.2fs", url, e, result['time']) - callback('phantomjs', task, result) - self.on_result('phantomjs', task, result) - return task, result + else: + try: + return task, json.loads(response.body) + except Exception as e: + result = { + 'status_code': 599, + 'content': "%r" % e, + 'time': time.time() - start_time, + 'orig_url': url, + 'url': url, + } + logger.exception("[599] %s, %r %.2fs", + url, result['content'], result['time']) + callback('phantomjs', task, result) + self.on_result('phantomjs', task, result) + return task, result try: request = tornado.httpclient.HTTPRequest( diff --git a/libs/base_handler.py b/libs/base_handler.py index eaec79284..b3108270d 100644 --- a/libs/base_handler.py +++ b/libs/base_handler.py @@ -263,6 +263,11 @@ def crawl(self, url, **kwargs): etag last_modifed + fetch_type + js_run_at + js_script + load_images + priority retries exetime diff --git a/run.py b/run.py index 9f4c3d402..119f32df0 100755 --- a/run.py +++ b/run.py @@ -26,6 +26,7 @@ def __get__(self, instance, owner): class g(object): scheduler_xmlrpc_port = int(os.environ.get('SCHEDULER_XMLRPC_PORT', 23333)) fetcher_xmlrpc_port = int(os.environ.get('FETCHER_XMLRPC_PORT', 24444)) + phantomjs_proxy_port = int(os.environ.get('PHANTOMJS_PROXY_PORT', 25555)) webui_host = os.environ.get('WEBUI_HOST', '0.0.0.0') webui_port = 
     debug = bool(os.environ.get('DEBUG'))
@@ -99,6 +100,15 @@ class g(object):
     else:
         scheduler_rpc = None
 
+    # phantomjs_proxy
+    if os.environ.get('PHANTOMJS_NAME'):
+        phantomjs_proxy = "%s:%s" % (
+            os.environ['PHANTOMJS_PORT_%d_TCP_ADDR' % phantomjs_proxy_port],
+            os.environ['PHANTOMJS_PORT_%d_TCP_PORT' % phantomjs_proxy_port]
+        )
+    else:
+        phantomjs_proxy = None
+
 # run commands ------------------------------------------
 def run_scheduler(g=g):
     from scheduler import Scheduler
@@ -114,6 +124,7 @@ def run_scheduler(g=g):
 def run_fetcher(g=g):
     from fetcher.tornado_fetcher import Fetcher
     fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
+    fetcher.phantomjs_proxy = g.phantomjs_proxy
 
     run_in_thread(fetcher.xmlrpc_run, port=g.fetcher_xmlrpc_port, bind=g.webui_host)
     fetcher.run()
@@ -135,10 +146,15 @@ def run_result_worker(g=g):
 
 def run_webui(g=g):
     import cPickle as pickle
+    from fetcher.tornado_fetcher import Fetcher
+    fetcher = Fetcher(inqueue=None, outqueue=None, async=False)
+    fetcher.phantomjs_proxy = g.phantomjs_proxy
+
     from webui.app import app
     app.config['taskdb'] = g.taskdb
     app.config['projectdb'] = g.projectdb
     app.config['resultdb'] = g.resultdb
+    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
     app.config['scheduler_rpc'] = g.scheduler_rpc
     #app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'
     if g.demo_mode:
@@ -174,6 +190,10 @@ def all_in_one():
         each.join()
 
 if __name__ == '__main__':
+    print "running with config:"
+    for key in dir(g):
+        print "%s=%r" % (key, getattr(g, key))
+
     if len(sys.argv) < 2:
         all_in_one()
     else:
diff --git a/runtest.py b/runtest.py
index 656f8a3b5..2f7dc86da 100755
--- a/runtest.py
+++ b/runtest.py
@@ -14,4 +14,7 @@
         glob = sys.argv[1]
 
     suite = unittest.TestLoader().discover('test', glob)
-    unittest.TextTestRunner(verbosity=1).run(suite)
+    result = unittest.TextTestRunner(verbosity=1).run(suite)
+    if result.errors or result.failures:
+        sys.exit(1)
+    sys.exit(0)
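
Usage sketch of the crawl() options this patch documents in libs/base_handler.py: fetch_type routes a task to the phantomjs fetcher (tornado_fetcher.py now accepts 'js' as well as 'phantomjs'), while js_script, js_run_at and load_images travel with the task to phantomjs_fetcher.js. The handler below is hypothetical and not part of this patch; the URL and the scrolling js_script are placeholders, and it assumes a running phantomjs_fetcher.js with phantomjs_proxy configured as in run.py above.

    # hypothetical project script, not part of this patch
    from libs.base_handler import BaseHandler

    class Handler(BaseHandler):
        def on_start(self):
            # fetch_type='js' (or 'phantomjs') makes Fetcher.fetch() dispatch the
            # task to phantomjs_fetch() instead of the plain tornado HTTP fetch
            self.crawl('http://example.com/', callback=self.index_page,
                       fetch_type='js',
                       # any value other than "document-start" runs the script
                       # after onLoadFinished in phantomjs_fetcher.js
                       js_run_at='document-end',
                       js_script='window.scrollTo(0, document.body.scrollHeight);',
                       load_images=False)

        def index_page(self, response):
            # response.content is the phantomjs-rendered page.content returned
            # in the result object built by phantomjs_fetcher.js
            return {'title': response.doc('title').text()}

On the fetcher side these values arrive under the task's fetch dict, which is why fetch() checks task.get('fetch', {}).get('fetch_type') before choosing phantomjs_fetch().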