目前公司做的爬虫,不管是测试还是正式爬取,都是直接从网络请求资源,调试的时候个人感觉十分耗时间,效率太低。最近受到同事的启发,做了一个本地缓存来提高速度。
添加中间件 cache_middleware:
class cache_middleware(object):
conn_pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
def process_request(self, request, spider):
    """Serve a request from the local Redis cache when a copy exists.

    The cache key is the hex MD5 digest of the UTF-8 encoded request URL.

    Args:
        request: the outgoing request; ``request.url`` is used for the key
            and as the URL of the cached response.
        spider: the spider issuing the request (not used here).

    Returns:
        A ``scrapy.http.TextResponse`` (status 200, UTF-8) built from the
        cached body on a cache hit. ``None`` on a miss, on an empty cached
        body, or when the cache lookup fails; per Scrapy's
        downloader-middleware contract, ``None`` lets the request continue
        to the next middleware / the downloader.
    """
    try:
        conn = self.get_conn()
        key = hashlib.md5(request.url.encode('utf-8')).hexdigest()
        cached_body = conn.get(key)
        if cached_body:
            # Tie the cached response to its request so that attributes
            # derived from it (e.g. response.meta) are usable downstream.
            response = scrapy.http.TextResponse(
                url=request.url,
                status=200,
                headers=None,
                body=cached_body,
                request=request,
                encoding='utf-8'
            )
            # Single-argument print() behaves the same on Python 2 and 3.
            print('从本地缓存中取数据')
            return response
    except Exception as e:
        # Best effort: a broken cache must not break the crawl, so report
        # the problem and fall through to a normal download.
        print('middleware:' + str(e))
    return None
@cl