Skip to content
This repository was archived by the owner on Jun 10, 2024. It is now read-only.

Commit 897891c

Browse files
authored
Merge pull request #955 from binux/fix-test
Fix test
2 parents 15157ea + 360d131 commit 897891c

File tree

14 files changed

71 additions & 189 deletions (lines changed)

14 files changed

71 additions & 189 deletions (lines changed)

.travis.yml

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
sudo: required
21
language: python
32
cache: pip
43
python:
5-
- 3.4
64
- 3.5
75
- 3.6
86
- 3.7
@@ -11,29 +9,25 @@ services:
119
- docker
1210
- mongodb
1311
- rabbitmq
14-
- redis-server
12+
- redis
1513
- mysql
16-
#- elasticsearch
14+
# - elasticsearch
1715
- postgresql
1816
addons:
1917
postgresql: "9.4"
2018
apt:
2119
packages:
2220
- rabbitmq-server
21+
env:
22+
- IGNORE_COUCHDB=1
2323

2424
before_install:
25-
- echo "deb https://apache.bintray.com/couchdb-deb xenial main" | sudo tee -a /etc/apt/sources.list
26-
- curl -L https://couchdb.apache.org/repo/bintray-pubkey.asc | sudo apt-key add -
2725
- sudo apt-get update -qq
28-
- sudo apt-get install -y couchdb
29-
- sudo systemctl start couchdb
3026
- curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart
3127
- npm install express puppeteer
3228
- sudo docker pull scrapinghub/splash
3329
- sudo docker run -d --net=host scrapinghub/splash
3430
before_script:
35-
- curl -X PUT http://127.0.0.1:5984/_users
36-
- curl -X PUT http://127.0.0.1:5984/_replicator
3731
- psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
3832
- psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres
3933
- psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python.
55

66
- Write script in Python
77
- Powerful WebUI with script editor, task monitor, project manager and result viewer
8-
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
8+
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
99
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
1010
- Task priority, retry, periodical, recrawl by age, etc...
1111
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...

docker-compose.yaml

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,15 @@ services:
1313
networks:
1414
- pyspider
1515
command: rabbitmq-server
16-
couchdb:
17-
image: couchdb:latest
18-
container_name: couchdb
16+
mysql:
17+
image: mysql:latest
18+
container_name: mysql
19+
volumes:
20+
- /tmp:/var/lib/mysql
1921
environment:
20-
- COUCHDB_USER=user
21-
- COUCHDB_PASSWORD=password
22+
- MYSQL_ALLOW_EMPTY_PASSWORD=yes
2223
networks:
2324
- pyspider
24-
ports:
25-
- "5984:5984"
26-
# OR we can replace couchdb with mysql
27-
#mysql:
28-
# image: mysql:latest
29-
# container_name: mysql
30-
# volumes:
31-
# - /tmp:/var/lib/mysql
32-
# environment:
33-
# - MYSQL_ALLOW_EMPTY_PASSWORD=yes
34-
# networks:
35-
# - pyspider
3625
phantomjs:
3726
image: pyspider:latest
3827
container_name: phantomjs

pyspider/database/__init__.py

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -214,26 +214,8 @@ def _connect_couchdb(parsed, dbtype, url):
214214
params = {}
215215

216216
# default to env, then url, then hard coded
217-
params['username'] = os.environ.get('COUCHDB_USER') or parsed.username or 'user'
218-
params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password or 'password'
219-
220-
# create necessary DBs + the admin user
221-
res = requests.put(url + "_users")
222-
if 'error' in res and res['error'] == 'unauthorized':
223-
# user is already created. This will happen if CouchDB is running in docker
224-
# and COUCHDB_USER and COUCHDB_PASSWORD are set
225-
from requests.auth import HTTPBasicAuth
226-
requests.put(url + "_users",
227-
auth=HTTPBasicAuth(params['username'], params['password']))
228-
requests.put(url + "_replicator",
229-
auth=HTTPBasicAuth(params['username'], params['password']))
230-
requests.put(url + '_node/_local/_config/admins/' + params['username'],
231-
data=params['password'],
232-
auth=HTTPBasicAuth(params['username'], params['password']))
233-
else:
234-
requests.put(url + "_replicator")
235-
requests.put(url + '_node/_local/_config/admins/' + params['username'],
236-
data=params['password'])
217+
params['username'] = os.environ.get('COUCHDB_USER') or parsed.username
218+
params['password'] = os.environ.get('COUCHDB_PASSWORD') or parsed.password
237219

238220
if dbtype == 'taskdb':
239221
from .couchdb.taskdb import TaskDB

pyspider/database/couchdb/couchdbbase.py

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
class SplitTableMixin(object):
55
UPDATE_PROJECTS_TIME = 10 * 60
66

7+
def __init__(self):
8+
self.session = requests.session()
9+
if self.username:
10+
self.session.auth = HTTPBasicAuth(self.username, self.password)
11+
self.session.headers.update({'Content-Type': 'application/json'})
12+
713
def _collection_name(self, project):
814
if self.collection_prefix:
915
return "%s_%s" % (self.collection_prefix, project)
@@ -32,10 +38,7 @@ def _list_project(self):
3238
prefix = ''
3339

3440
url = self.base_url + "_all_dbs"
35-
res = requests.get(url,
36-
data=json.dumps({}),
37-
headers={"Content-Type": "application/json"},
38-
auth=HTTPBasicAuth(self.username, self.password)).json()
41+
res = self.session.get(url, json={}).json()
3942
for each in res:
4043
if each.startswith('_'):
4144
continue
@@ -45,19 +48,15 @@ def _list_project(self):
4548

4649
def create_database(self, name):
4750
url = self.base_url + name
48-
res = requests.put(url,
49-
headers={"Content-Type": "application/json"},
50-
auth=HTTPBasicAuth(self.username, self.password)).json()
51+
res = self.session.put(url).json()
5152
if 'error' in res and res['error'] == 'unauthorized':
5253
raise Exception("Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password))
5354
return res
5455

5556

5657
def get_doc(self, db_name, doc_id):
5758
url = self.base_url + db_name + "/" + doc_id
58-
res = requests.get(url,
59-
headers={"Content-Type": "application/json"},
60-
auth=HTTPBasicAuth(self.username, self.password)).json()
59+
res = self.session.get(url).json()
6160
if "error" in res and res["error"] == "not_found":
6261
return None
6362
return res
@@ -66,10 +65,7 @@ def get_doc(self, db_name, doc_id):
6665
def get_docs(self, db_name, selector):
6766
url = self.base_url + db_name + "/_find"
6867
selector['use_index'] = self.index
69-
res = requests.post(url,
70-
data=json.dumps(selector),
71-
headers={"Content-Type": "application/json"},
72-
auth=HTTPBasicAuth(self.username, self.password)).json()
68+
res = self.session.post(url, json=selector).json()
7369
if 'error' in res and res['error'] == 'not_found':
7470
return []
7571
return res['docs']
@@ -81,10 +77,7 @@ def get_all_docs(self, db_name):
8177

8278
def insert_doc(self, db_name, doc_id, doc):
8379
url = self.base_url + db_name + "/" + doc_id
84-
return requests.put(url,
85-
data=json.dumps(doc),
86-
headers={"Content-Type": "application/json"},
87-
auth=HTTPBasicAuth(self.username, self.password)).json()
80+
return self.session.put(url, json=doc).json()
8881

8982

9083
def update_doc(self, db_name, doc_id, new_doc):
@@ -94,14 +87,9 @@ def update_doc(self, db_name, doc_id, new_doc):
9487
for key in new_doc:
9588
doc[key] = new_doc[key]
9689
url = self.base_url + db_name + "/" + doc_id
97-
return requests.put(url,
98-
data=json.dumps(doc),
99-
headers={"Content-Type": "application/json"},
100-
auth=HTTPBasicAuth(self.username, self.password)).json()
90+
return self.session.put(url, json=doc).json()
10191

10292

10393
def delete(self, url):
104-
return requests.delete(url,
105-
headers={"Content-Type": "application/json"},
106-
auth=HTTPBasicAuth(self.username, self.password)).json()
94+
return self.session.delete(url).json()
10795

pyspider/database/couchdb/projectdb.py

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,19 @@
66
class ProjectDB(BaseProjectDB):
77
__collection_name__ = 'projectdb'
88

9-
def __init__(self, url, database='projectdb', username='username', password='password'):
9+
def __init__(self, url, database='projectdb', username=None, password=None):
1010
self.username = username
1111
self.password = password
1212
self.url = url + self.__collection_name__ + "_" + database + "/"
1313
self.database = database
14-
self.insert('', {})
14+
15+
self.session = requests.session()
16+
if username:
17+
self.session.auth = HTTPBasicAuth(self.username, self.password)
18+
self.session.headers.update({'Content-Type': 'application/json'})
1519

1620
# Create the db
17-
res = requests.put(self.url,
18-
headers={"Content-Type": "application/json"},
19-
auth=HTTPBasicAuth(self.username, self.password)).json()
21+
res = self.session.put(self.url).json()
2022
if 'error' in res and res['error'] == 'unauthorized':
2123
raise Exception(
2224
"Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'],
@@ -29,9 +31,7 @@ def __init__(self, url, database='projectdb', username='username', password='pas
2931
},
3032
'name': self.__collection_name__ + "_" + database
3133
}
32-
res = requests.post(self.url+"_index", data=json.dumps(payload),
33-
headers={"Content-Type": "application/json"},
34-
auth=HTTPBasicAuth(self.username, self.password)).json()
34+
res = self.session.post(self.url + "_index", json=payload).json()
3535
self.index = res['id']
3636

3737
def _default_fields(self, each):
@@ -51,10 +51,7 @@ def insert(self, name, obj={}):
5151
obj = dict(obj)
5252
obj['name'] = name
5353
obj['updatetime'] = time.time()
54-
res = requests.put(url,
55-
data = json.dumps(obj),
56-
headers = {"Content-Type": "application/json"},
57-
auth=HTTPBasicAuth(self.username, self.password)).json()
54+
res = self.session.put(url, json=obj).json()
5855
return res
5956

6057
def update(self, name, obj={}, **kwargs):
@@ -78,10 +75,7 @@ def get_all(self, fields=None):
7875
"use_index": self.index
7976
}
8077
url = self.url + "_find"
81-
res = requests.post(url,
82-
data=json.dumps(payload),
83-
headers={"Content-Type": "application/json"},
84-
auth=HTTPBasicAuth(self.username, self.password)).json()
78+
res = self.session.post(url, json=payload).json()
8579
for doc in res['docs']:
8680
yield self._default_fields(doc)
8781

@@ -95,10 +89,7 @@ def get(self, name, fields=None):
9589
"use_index": self.index
9690
}
9791
url = self.url + "_find"
98-
res = requests.post(url,
99-
data=json.dumps(payload),
100-
headers={"Content-Type": "application/json"},
101-
auth=HTTPBasicAuth(self.username, self.password)).json()
92+
res = self.session.post(url, json=payload).json()
10293
if len(res['docs']) == 0:
10394
return None
10495
return self._default_fields(res['docs'][0])
@@ -115,13 +106,7 @@ def drop(self, name):
115106
doc = self.get(name)
116107
payload = {"rev": doc["_rev"]}
117108
url = self.url + name
118-
return requests.delete(url,
119-
params=payload,
120-
headers={"Content-Type": "application/json"},
121-
auth=HTTPBasicAuth(self.username, self.password)).json()
109+
return self.session.delete(url, params=payload).json()
122110

123111
def drop_database(self):
124-
return requests.delete(self.url,
125-
headers={"Content-Type": "application/json"},
126-
auth=HTTPBasicAuth(self.username, self.password)).json()
127-
112+
return self.session.delete(self.url).json()

pyspider/database/couchdb/resultdb.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
import time, json, requests
2-
from requests.auth import HTTPBasicAuth
1+
import time, json
32
from pyspider.database.base.resultdb import ResultDB as BaseResultDB
43
from .couchdbbase import SplitTableMixin
54

65

76
class ResultDB(SplitTableMixin, BaseResultDB):
87
collection_prefix = ''
98

10-
def __init__(self, url, database='resultdb', username='username', password='password'):
9+
def __init__(self, url, database='resultdb', username=None, password=None):
1110
self.username = username
1211
self.password = password
13-
1412
self.base_url = url
1513
self.url = url + database + "/"
1614
self.database = database
15+
16+
super().__init__()
1717
self.create_database(database)
1818
self.index = None
1919

@@ -31,10 +31,7 @@ def _create_project(self, project):
3131
'name': collection_name
3232
}
3333

34-
res = requests.post(self.base_url + collection_name + "/_index",
35-
data=json.dumps(payload),
36-
headers={"Content-Type": "application/json"},
37-
auth=HTTPBasicAuth(self.username, self.password)).json()
34+
res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
3835
self.index = res['id']
3936
self._list_project()
4037

pyspider/database/couchdb/taskdb.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
1-
import json, time, requests
2-
from requests.auth import HTTPBasicAuth
1+
import json, time
32
from pyspider.database.base.taskdb import TaskDB as BaseTaskDB
43
from .couchdbbase import SplitTableMixin
54

65

76
class TaskDB(SplitTableMixin, BaseTaskDB):
87
collection_prefix = ''
98

10-
def __init__(self, url, database='taskdb', username='username', password='password'):
9+
def __init__(self, url, database='taskdb', username=None, password=None):
1110
self.username = username
1211
self.password = password
1312
self.base_url = url
1413
self.url = url + database + "/"
1514
self.database = database
16-
self.create_database(database)
1715
self.index = None
1816

17+
super().__init__()
18+
19+
self.create_database(database)
1920
self.projects = set()
2021
self._list_project()
2122

@@ -32,10 +33,7 @@ def _create_project(self, project):
3233
},
3334
'name': collection_name
3435
}
35-
res = requests.post(self.base_url + collection_name + "/_index",
36-
data=json.dumps(payload),
37-
headers={"Content-Type": "application/json"},
38-
auth=HTTPBasicAuth(self.username, self.password)).json()
36+
res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json()
3937
self.index = res['id']
4038
self._list_project()
4139

pyspider/libs/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -432,9 +432,9 @@ def python_console(namespace=None):
432432

433433

434434
def check_port_open(port, addr='127.0.0.1'):
435-
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
436-
result = sock.connect_ex((addr, port))
437-
if result == 0:
438-
return True
439-
else:
440-
return False
435+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
436+
result = sock.connect_ex((addr, port))
437+
if result == 0:
438+
return True
439+
else:
440+
return False

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
Flask==0.10
22
Jinja2==2.7
3-
chardet==2.2.1
3+
chardet==3.0.4
44
cssselect==0.9
55
lxml==4.3.3
66
pycurl==7.43.0.3
77
pyquery==1.4.0
8-
requests==2.2
8+
requests==2.24.0
99
tornado==4.5.3
1010
mysql-connector-python==8.0.16
1111
pika==1.1.0

0 commit comments

Comments
 (0)