Skip to content
This repository was archived by the owner on Jun 10, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
49bdadf
Changes pycurl version
Apr 22, 2022
6068992
update
Apr 22, 2022
858b92a
修复 express 等没有安装的问题
Apr 24, 2022
3da32f7
isort pyspider
Apr 24, 2022
7a8f586
isort pyspider
Apr 24, 2022
6fa962c
review
Apr 24, 2022
0838460
review
Apr 24, 2022
2f39bc3
update python version
Apr 27, 2022
1fcdb9a
update elasticsearch version
Apr 27, 2022
32a2d01
add requirements for company
Apr 27, 2022
b0c48a5
update seadaka requirements
Apr 27, 2022
fec39cf
Merge branch 'release/v0.3.11'
Apr 27, 2022
6f7e50f
Merge tag 'v0.3.11' into develop
Apr 27, 2022
41029d6
update click version
Apr 29, 2022
fe0cd4f
update lxml version
May 6, 2022
ddfe171
update psycopg2 version
May 6, 2022
436c5e6
修改 gcd 过时方法
May 6, 2022
34a2896
Merge remote-tracking branch 'origin/develop' into develop
May 6, 2022
05bca89
update getargspec to getfullargspec
May 6, 2022
55a0a72
pprint 使用 buildin
May 6, 2022
3ff7f85
review
May 6, 2022
d2dc41b
review
May 6, 2022
397dff4
update
May 7, 2022
c0126f8
update
May 7, 2022
a84149e
raname list_io
May 7, 2022
90a98eb
remove "encoding: utf-8" line
May 7, 2022
4e421a0
review libs
May 7, 2022
3e77df5
review
May 7, 2022
4178546
review
May 7, 2022
eb4c3b7
sort import
May 7, 2022
1212cad
review
May 7, 2022
297aa32
update
May 11, 2022
9512717
update requirements
May 11, 2022
2a9b2d8
Merge branch 'develop' into bugfix/fix_3.9
May 12, 2022
0bceaf9
update
May 12, 2022
7b783b2
update
May 12, 2022
906cc6f
update
May 15, 2022
e246a9a
update
May 26, 2022
08c2f0a
update
May 27, 2022
a241037
Merge branch 'develop' into bugfix/fix_3.9
May 27, 2022
7d52a7d
update flask flask-login Werkzeug version
May 27, 2022
9b03abc
update
May 27, 2022
08e2228
update dependents
May 27, 2022
7b2cf03
update dependents
May 27, 2022
e82c1f3
update dependents
May 27, 2022
59ed77e
fix bugs
May 27, 2022
cc7db5c
update
May 27, 2022
0a7b95d
update
May 27, 2022
9270501
remove unused import
May 27, 2022
20b65ad
remove xmlrpclib
May 27, 2022
d7d24f9
review
May 27, 2022
b80f497
Merge branch 'bugfix/fix_3.9' into develop
May 27, 2022
522632c
update
May 27, 2022
b7deb5e
Merge branch '0.4.0release'
May 27, 2022
913a0a2
Merge tag '0.4.0release' into develop
May 27, 2022
97ec8bb
rename LICENSE file name
May 27, 2022
cf1e1eb
update
May 27, 2022
348c003
update
May 27, 2022
4940ece
Merge branch 'fix_readme' into develop
May 27, 2022
22db274
update
May 29, 2022
500b2cf
update
May 29, 2022
ec1f8e9
update pika,kombu, amqp, vine versions
lusi1990 Jun 7, 2022
4be7a96
upgrade elasticsearch version
lusi1990 Jun 14, 2022
18808be
upgrade elasticsearch version
lusi1990 Jun 14, 2022
3c0cad6
update Dockerfile
lusi1990 Jul 15, 2022
7181538
review
Sep 23, 2022
d42be29
downgrade es version
Nov 17, 2022
c74b1a1
downgrade es version
Nov 17, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ nosetests.xml
.mr.developer.cfg
.project
.pydevproject
.idea
config.json
LICENSE
57 changes: 29 additions & 28 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,44 +1,45 @@
language: python
cache: pip
python:
- 3.5
- 3.6
- 3.7
#- 3.8
- 3.8
- 3.9
services:
- docker
- mongodb
- rabbitmq
- redis
- mysql
# - elasticsearch
- postgresql
- docker
- mongodb
- rabbitmq
- redis
- mysql
# - elasticsearch
- postgresql
addons:
postgresql: "9.4"
apt:
packages:
- rabbitmq-server
- rabbitmq-server
env:
- IGNORE_COUCHDB=1
- IGNORE_COUCHDB=1

before_install:
- sudo apt-get update -qq
- curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
- npm install express puppeteer
- sudo docker pull scrapinghub/splash
- sudo docker run -d --net=host scrapinghub/splash
before_script:
- psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- sleep 10
- sudo apt-get update -qq
- curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
- npm install express puppeteer
- sudo docker pull scrapinghub/splash
- sudo docker run -d --net=host scrapinghub/splash
install:
- pip install https://github.com/marcus67/easywebdav/archive/master.zip
- sudo apt-get install libgnutls28-dev
- pip install -e .[all,test]
- pip install coveralls
- pip install https://github.com/marcus67/easywebdav/archive/master.zip
- sudo apt-get install libgnutls28-dev
- pip install -e .[all,test]
- pip install coveralls
before_script:
- psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
- sleep 10

script:
- coverage run setup.py test
- coverage run setup.py test
after_success:
- coverage combine
- coveralls
- coverage combine
- coveralls
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.6
FROM python:3.7
MAINTAINER binux <[email protected]>

# install phantomjs
Expand All @@ -22,7 +22,7 @@ RUN npm install puppeteer express

# install requirements
COPY requirements.txt /opt/pyspider/requirements.txt
RUN pip install -r /opt/pyspider/requirements.txt
RUN pip install --no-cache-dir -r /opt/pyspider/requirements.txt

# add all repo
ADD ./ /opt/pyspider
Expand Down
File renamed without changes.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ A Powerful Spider(Web Crawler) System in Python.
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...
- Distributed architecture, Crawl Javascript pages, Python 3.{6, 7, 8, 9} support, etc...

Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/)
Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/)
Expand All @@ -18,7 +18,7 @@ Sample Code
-----------

```python
from pyspider.libs.base_handler import *
from pyspider.libs.base_handler import BaseHandler, config, every


class Handler(BaseHandler):
Expand Down Expand Up @@ -66,6 +66,12 @@ TODO

### v0.4.0

- [ ] 适配 python 3.9, 放弃 3.6 以下版本, try my best to fix bug
- [ ] fix travis and Coverage
- [ ] review docker

### v0.5.0

- [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia)


Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: "3.7"
version: "3.3"

# replace /path/to/dir/ to point to config.json

Expand Down
7 changes: 4 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2015-11-10 01:31:54

import sys
from unittest.mock import MagicMock

from recommonmark.parser import CommonMarkParser


class Mock(MagicMock):
@classmethod
def __getattr__(cls, name):
return Mock()
return Mock()

MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

source_parsers = {
'.md': CommonMarkParser,
'.md': CommonMarkParser,
}

source_suffix = ['.rst', '.md']
3 changes: 1 addition & 2 deletions pyspider/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2014-11-17 19:17:12

__version__ = '0.4.0'
__version__ = '0.4.1'
47 changes: 24 additions & 23 deletions pyspider/database/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2014-10-08 15:04:08

import os, requests, json
from six.moves.urllib.parse import urlparse, parse_qs
import json
import os

import requests
from six.moves.urllib.parse import parse_qs, urlparse


def connect_database(url):
Expand Down Expand Up @@ -60,16 +62,15 @@ def _connect_database(url): # NOQA
other_scheme = "+".join(scheme[1:-1])

if dbtype not in ('taskdb', 'projectdb', 'resultdb'):
raise LookupError('unknown database type: %s, '
'type should be one of ["taskdb", "projectdb", "resultdb"]', dbtype)
raise LookupError(f'unknown database type: {dbtype}, type should be one of ["taskdb", "projectdb", "resultdb"]')

if engine == 'mysql':
return _connect_mysql(parsed,dbtype)
return _connect_mysql(parsed, dbtype)

elif engine == 'sqlite':
return _connect_sqlite(parsed,dbtype)
return _connect_sqlite(parsed, dbtype)
elif engine == 'mongodb':
return _connect_mongodb(parsed,dbtype,url)
return _connect_mongodb(parsed, dbtype, url)

elif engine == 'sqlalchemy':
return _connect_sqlalchemy(parsed, dbtype, url, other_scheme)
Expand All @@ -88,44 +89,44 @@ def _connect_database(url): # NOQA
from .local.projectdb import ProjectDB
return ProjectDB(scripts)
else:
raise LookupError('not supported dbtype: %s', dbtype)
raise LookupError(f'not supported dbtype: {dbtype}')
elif engine == 'elasticsearch' or engine == 'es':
return _connect_elasticsearch(parsed, dbtype)

elif engine == 'couchdb':
return _connect_couchdb(parsed, dbtype, url)

else:
raise Exception('unknown engine: %s' % engine)
raise Exception(f'unknown engine: {engine}')


def _connect_mysql(parsed,dbtype):
parames = {}
def _connect_mysql(parsed, dbtype):
params = dict()
if parsed.username:
parames['user'] = parsed.username
params['user'] = parsed.username
if parsed.password:
parames['passwd'] = parsed.password
params['passwd'] = parsed.password
if parsed.hostname:
parames['host'] = parsed.hostname
params['host'] = parsed.hostname
if parsed.port:
parames['port'] = parsed.port
params['port'] = parsed.port
if parsed.path.strip('/'):
parames['database'] = parsed.path.strip('/')
params['database'] = parsed.path.strip('/')

if dbtype == 'taskdb':
from .mysql.taskdb import TaskDB
return TaskDB(**parames)
return TaskDB(**params)
elif dbtype == 'projectdb':
from .mysql.projectdb import ProjectDB
return ProjectDB(**parames)
return ProjectDB(**params)
elif dbtype == 'resultdb':
from .mysql.resultdb import ResultDB
return ResultDB(**parames)
return ResultDB(**params)
else:
raise LookupError


def _connect_sqlite(parsed,dbtype):
def _connect_sqlite(parsed, dbtype):
if parsed.path.startswith('//'):
path = '/' + parsed.path.strip('/')
elif parsed.path.startswith('/'):
Expand All @@ -148,7 +149,7 @@ def _connect_sqlite(parsed,dbtype):
raise LookupError


def _connect_mongodb(parsed,dbtype,url):
def _connect_mongodb(parsed, dbtype, url):
url = url.replace(parsed.scheme, 'mongodb')
parames = {}
if parsed.path.strip('/'):
Expand All @@ -167,7 +168,7 @@ def _connect_mongodb(parsed,dbtype,url):
raise LookupError


def _connect_sqlalchemy(parsed, dbtype,url, other_scheme):
def _connect_sqlalchemy(parsed, dbtype, url, other_scheme):
if not other_scheme:
raise Exception('wrong scheme format: %s' % parsed.scheme)
url = url.replace(parsed.scheme, other_scheme)
Expand Down
11 changes: 6 additions & 5 deletions pyspider/database/base/projectdb.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
# Created on 2014-02-09 11:28:52

import re
from typing import Dict

# NOTE: When get/get_all/check_update from database with default fields,
# all following fields should be included in output dict.

{
'project': {
'name': str,
Expand All @@ -34,10 +35,10 @@ class ProjectDB(object):
'RUNNING',
]

def insert(self, name, obj={}):
def insert(self, name, obj: Dict = None):
raise NotImplementedError

def update(self, name, obj={}, **kwargs):
def update(self, name, obj: Dict = None, **kwargs):
raise NotImplementedError

def get_all(self, fields=None):
Expand All @@ -54,9 +55,9 @@ def check_update(self, timestamp, fields=None):

def split_group(self, group, lower=True):
if lower:
return re.split("\W+", (group or '').lower())
return re.split(r"\W+", (group or '').lower())
else:
return re.split("\W+", group or '')
return re.split(r"\W+", group or '')

def verify_project_name(self, name):
if len(name) > 64:
Expand Down
1 change: 0 additions & 1 deletion pyspider/database/base/resultdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
Expand Down
9 changes: 6 additions & 3 deletions pyspider/database/base/taskdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Author: Binux<[email protected]>
# http://binux.me
Expand Down Expand Up @@ -76,10 +75,14 @@ def status_count(self, project):
'''
raise NotImplementedError

def insert(self, project, taskid, obj={}):
def insert(self, project, taskid, obj: dict = None):
if obj is None:
obj = dict()
raise NotImplementedError

def update(self, project, taskid, obj={}, **kwargs):
def update(self, project, taskid, obj: dict = None, **kwargs):
if obj is None:
obj = dict()
raise NotImplementedError

def drop(self, project):
Expand Down
Loading