微博爬取并发高了就不让访问,这里对接一下cookies池和代理池
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import requests
import logging
import json
# Cookies-pool integration
class CookiesMiddleware():
    """Downloader middleware that attaches a random cookie set, fetched
    from a cookies-pool web service, to every outgoing request.

    Weibo throttles high-concurrency crawling per account; rotating
    cookies from a pool spreads requests across accounts.
    """

    def __init__(self, cookies_url='http://127.0.0.1:5000/weibo/random'):
        # cookies_url: endpoint of the cookies-pool API that returns one
        # random cookie set as a JSON list of {name, value} objects.
        self.logger = logging.getLogger(__name__)
        self.cookies_url = cookies_url

    def get_random_cookies(self):
        """Fetch one random cookie set from the pool.

        Returns:
            dict mapping cookie name -> value on success, or False when
            the pool is unreachable, returns a non-200 status, or sends
            an unparsable payload.
        """
        try:
            # Timeout so a dead pool service cannot stall the crawl.
            response = requests.get(self.cookies_url, timeout=5)
            if response.status_code == 200:
                return {
                    cookie['name']: cookie['value']
                    for cookie in json.loads(response.text)
                }
        except Exception:
            # Log the full traceback instead of silently swallowing it.
            self.logger.exception('Failed to fetch cookies from %s',
                                  self.cookies_url)
        return False

    def process_request(self, request, spider):
        """Attach a random cookie set to the request; on pool failure the
        request proceeds without cookies."""
        self.logger.debug('正在获取cookies')
        cookies = self.get_random_cookies()
        if cookies:
            request.cookies = cookies
            # Lazy %-style args so json.dumps only runs at DEBUG level.
            self.logger.debug('使用cookies%s', json.dumps(cookies))
# Proxy-pool integration
class ProxyMiddleware():
    """Downloader middleware that routes each request through a random
    proxy obtained from a proxy-pool web service, to avoid per-IP
    rate limiting."""

    def __init__(self, proxy_url='http://127.0.0.1:5002/daili'):
        # proxy_url: endpoint of the proxy-pool API that returns one
        # random "host:port" proxy as plain text.
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        """Fetch one random proxy string from the pool.

        Returns:
            The "host:port" proxy string on success, or False when the
            pool is unreachable or returns a non-200 status.
        """
        try:
            # Timeout so a dead pool service cannot stall the crawl.
            response = requests.get(self.proxy_url, timeout=5)
            if response.status_code == 200:
                # Strip stray whitespace/newlines the pool may append;
                # they would corrupt the proxy URI built below.
                return response.text.strip()
        except Exception:
            # Log the full traceback instead of silently swallowing it.
            self.logger.exception('Failed to fetch proxy from %s',
                                  self.proxy_url)
        return False

    def process_request(self, request, spider):
        """Set request.meta['proxy'] to a random pool proxy; on pool
        failure the request proceeds without a proxy."""
        proxy = self.get_random_proxy()
        if proxy:
            uri = 'http://{proxy}'.format(proxy=proxy)
            # Lazy %-style args so formatting only runs at DEBUG level.
            self.logger.debug('使用代理%s', proxy)
            request.meta['proxy'] = uri
在 settings.py 中添加 DOWNLOADER_MIDDLEWARES:
DOWNLOADER_MIDDLEWARES = {
'weibo.middlewares.WeiboDownloaderMiddleware': 543,
'weibo.middlewares.CookiesMiddleware': 545,
'weibo.middlewares.ProxyMiddleware': 546
}
然后设置一下settings.py其他设置,运行爬虫即可