Commit dee413a

Add stats extension based on Redis (#186)
1 parent 95b4751 commit dee413a

3 files changed: 84 additions, 1 deletion

README.rst

Lines changed: 3 additions & 0 deletions
@@ -73,6 +73,9 @@ Use the following settings in your project:
     # Ensure all spiders share same duplicates filter through redis.
     DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
 
+    # Enable stats sharing based on Redis
+    STATS_CLASS = "scrapy_redis.stats.RedisStatsCollector"
+
     # Default requests serializer is pickle, but it can be changed to any module
     # with loads and dumps functions. Note that pickle is not compatible between
     # python versions.
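
For context, this is roughly how a project's settings.py looks once the new collector is enabled alongside the usual scrapy-redis settings — a minimal sketch, where the REDIS_URL value is a placeholder for your own server:

# settings.py -- minimal sketch combining the new stats collector
# with the standard scrapy-redis scheduler and dupefilter.

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# New in this commit: collect crawl stats in a Redis hash shared
# by every spider instance.
STATS_CLASS = "scrapy_redis.stats.RedisStatsCollector"

# Placeholder connection URL; point it at your own Redis server.
REDIS_URL = "redis://localhost:6379"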

src/scrapy_redis/defaults.py

Lines changed: 3 additions & 1 deletion
@@ -6,6 +6,8 @@
 
 PIPELINE_KEY = '%(spider)s:items'
 
+STATS_KEY = '%(spider)s:stats'
+
 REDIS_CLS = redis.StrictRedis
 REDIS_ENCODING = 'utf-8'
 # Sane connection defaults.
@@ -20,7 +22,7 @@
 SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
 SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
 SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
-
+SCHEDULER_PERSIST = False
 START_URLS_KEY = '%(name)s:start_urls'
 START_URLS_AS_SET = False
 START_URLS_AS_ZSET = False

src/scrapy_redis/stats.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+from scrapy.statscollectors import StatsCollector
+from .connection import from_settings as redis_from_settings
+from .defaults import STATS_KEY, SCHEDULER_PERSIST
+
+
+class RedisStatsCollector(StatsCollector):
+    """
+    Stats collector that keeps all values in a shared Redis hash.
+    """
+
+    def __init__(self, crawler, spider=None):
+        super().__init__(crawler)
+        self.server = redis_from_settings(crawler.settings)
+        self.spider = spider
+        self.spider_name = spider.name if spider else crawler.spidercls.name
+        self.stats_key = crawler.settings.get('STATS_KEY', STATS_KEY)
+        self.persist = crawler.settings.get(
+            'SCHEDULER_PERSIST', SCHEDULER_PERSIST)
+
+    def _get_key(self, spider=None):
+        """Return the name of the Redis hash holding the stats."""
+        if spider:
+            return self.stats_key % {'spider': spider.name}
+        if self.spider:
+            return self.stats_key % {'spider': self.spider.name}
+        return self.stats_key % {'spider': self.spider_name or 'scrapy'}
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def get_value(self, key, default=None, spider=None):
+        """Return the value stored under key in the stats hash."""
+        if self.server.hexists(self._get_key(spider), key):
+            return int(self.server.hget(self._get_key(spider), key))
+        else:
+            return default
+
+    def get_stats(self, spider=None):
+        """Return all values of the stats hash."""
+        return self.server.hgetall(self._get_key(spider))
+
+    def set_value(self, key, value, spider=None):
+        """Set the stats hash field key to value."""
+        self.server.hset(self._get_key(spider), key, value)
+
+    def set_stats(self, stats, spider=None):
+        """Set all fields of the stats hash from a mapping."""
+        self.server.hmset(self._get_key(spider), stats)
+
+    def inc_value(self, key, count=1, start=0, spider=None):
+        """Increment the value of key by count, starting from start."""
+        if not self.server.hexists(self._get_key(spider), key):
+            self.set_value(key, start)
+        self.server.hincrby(self._get_key(spider), key, count)
+
+    def max_value(self, key, value, spider=None):
+        """Keep the maximum of the current and the new value."""
+        self.set_value(key, max(self.get_value(key, value), value))
+
+    def min_value(self, key, value, spider=None):
+        """Keep the minimum of the current and the new value."""
+        self.set_value(key, min(self.get_value(key, value), value))
+
+    def clear_stats(self, spider=None):
+        """Delete the stats hash."""
+        self.server.delete(self._get_key(spider))
+
+    def open_spider(self, spider):
+        """Remember the running spider so its name keys the stats."""
+        if spider:
+            self.spider = spider
+
+    def close_spider(self, spider, reason):
+        """Drop the spider reference and, unless persisting, the stats."""
+        self.spider = None
+        if not self.persist:
+            self.clear_stats(spider)
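
Because the collector writes to a plain Redis hash, the shared stats can be inspected from outside the crawl with any Redis client. A small sketch — the spider name 'myspider' and the localhost connection are placeholders:

# Sketch: reading the shared stats while (or after) a crawl runs.
# 'myspider' and the localhost connection are placeholder assumptions.
import redis

server = redis.StrictRedis(host='localhost', port=6379)

# RedisStatsCollector stores everything in the hash '<spider>:stats';
# redis-py returns raw bytes for field names and values.
for field, value in server.hgetall('myspider:stats').items():
    print(field.decode('utf-8'), value.decode('utf-8'))

Note that close_spider deletes the hash when the crawl ends unless SCHEDULER_PERSIST is True, so set that option if the stats should survive the run.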
