在settings.py里从数据库取出ip:
import pymongo
def get_proxy():
    """Load every proxy record from the MongoDB ``host`` collection.

    Returns:
        list[dict]: one dict per proxy document (expected keys: ``ip``,
        ``port``, and optionally ``auth`` — assumed from usage in the
        middleware below; confirm against the documents actually stored).
    """
    # ``with`` closes the client connection when done — the original
    # leaked it by never calling close().
    with pymongo.MongoClient(MONHOST, MONPORT) as mongoclient:
        db = mongoclient[MONDB]
        # Materialize the cursor before the connection closes.
        return list(db.host.find())

# Loaded once at settings-import time so the middleware can read
# settings['PROXIES'] without hitting MongoDB per request.
PROXIES = get_proxy()
创建中间件文件mymiddleware.py,settings里设置DOWNLOADER_MIDDLEWARES:
from scrapy.conf import settings
import random
class RandomProxyMiddleware(object):
    """Downloader middleware that routes each request through a random proxy.

    Picks a proxy dict at random from ``settings['PROXIES']`` (loaded from
    MongoDB in settings.py) and installs it on the request.
    """

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to ``request.meta['proxy']``.

        Bug fix: the original only set the proxy when ``auth`` was absent,
        so proxies that require credentials were silently never used.
        Now the proxy is always set, and a Proxy-Authorization header is
        added when credentials exist.
        """
        proxy = random.choice(settings['PROXIES'])
        # str() guards against the port being stored as an int in MongoDB,
        # which would make ':'.join() raise TypeError.
        request.meta['proxy'] = 'http://' + ':'.join(
            [proxy['ip'], str(proxy['port'])])
        auth = proxy.get('auth')
        if auth is not None:
            # Presumably ``auth`` is a "user:password" string — TODO confirm
            # against the documents stored in the host collection.
            import base64
            encoded = base64.b64encode(auth.encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded
settings超时设置:
# Downloader timeout in seconds: abort a request that takes longer than this.
DOWNLOAD_TIMEOUT = 5
# Enable Scrapy's RetryMiddleware so failed/timed-out requests are re-issued.
RETRY_ENABLED = True
# Maximum number of retries per request (in addition to the first attempt).
RETRY_TIMES = 3