Downloader Middleware(下载器中间件)
包括Scrapy内置的下载器中间件和用户自定义的下载器中间件。
内置下载器中间件的用法参考官方文档
自定义下载器中间件,需要以下2步:
- 定义下载器中间件,继承自object
- 在settings.py文件的DOWNLOADER_MIDDLEWARES字段中配置并激活该中间件
User-Agent下载器中间件
middlewares.py
import random
import datetime
class UserAgentDownloaderMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    A User-Agent is chosen from ``user_agents`` and applied via
    ``headers.setdefault``, so a User-Agent explicitly set by the caller
    is never overwritten.
    """

    # Pool of User-Agent strings rotated across requests.
    user_agents = [
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    ]

    def process_request(self, request, spider):
        """Called for every request that passes through this middleware.

        :param request: the outgoing scrapy Request.
        :param spider: the spider issuing the request (used for logging).
        """
        # NOTE: the original code called random.seed(datetime.datetime.now())
        # here. That is unnecessary (the random module seeds itself) and
        # raises TypeError on Python 3.11+, where seeding with arbitrary
        # objects was removed — so the call is dropped.
        user_agent = random.choice(self.user_agents)
        spider.logger.info('User-Agent : %s' % user_agent)
        # setdefault: only set the header if the request has none yet
        # (other headers can be set the same way).
        request.headers.setdefault('User-Agent', user_agent)
settings.py
DOWNLOADER_MIDDLEWARES = {
# 99 is the middleware priority (lower runs closer to the engine);
# setting the value to None instead would disable the middleware
'myscrapy.middlewares.UserAgentDownloaderMiddleware': 99,
}
HTTP代理中间件
middlewares.py
import random
import datetime
class MyHttpProxyMiddleware(object):
    """Downloader middleware that routes each request through a random proxy.

    Scrapy's built-in proxy support lives in
    ``scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware``; this
    middleware only picks a proxy and fills in ``request.meta['proxy']``
    (plus the ``Proxy-Authorization`` header for authenticated proxies).

    Proxy handshake format for reference:
        CONNECT 59.64.128.198:21 HTTP/1.1
        Host: 59.64.128.198:21
        Proxy-Authorization: Basic bGV2I1TU5OTIz
        User-Agent: OpenFetion
    """

    proxies = [
        # Private proxy: requires username/password authentication.
        {"ip_port": "125.117.134.117:9000", "username_passwd": b"mr_mao_hacker:sffqry9r"},
        # Open proxies: no credentials needed.
        {"ip_port": "125.117.134.117:9000"},
        {"ip_port": "117.90.7.156:9000"},
        {"ip_port": "125.117.134.117:9000"},
        {"ip_port": "121.31.159.91:8123"},
        {"ip_port": "49.70.209.144:9000"},
        {"ip_port": "121.232.145.65:9000"},
    ]

    def process_request(self, request, spider):
        """Pick a random proxy and attach it to the outgoing request.

        :param request: the outgoing scrapy Request.
        :param spider: the spider issuing the request (used for logging).
        """
        # NOTE: the original code called random.seed(datetime.datetime.now())
        # here. That is unnecessary (the random module seeds itself) and
        # raises TypeError on Python 3.11+, where seeding with arbitrary
        # objects was removed — so the call is dropped.
        proxy = random.choice(self.proxies)
        spider.logger.info('Proxy : %s' % proxy)
        if proxy.get('username_passwd'):
            # Private proxy: credentials go base64-encoded into the
            # Proxy-Authorization header (note the space after "Basic").
            import base64
            base64_userpasswd = base64.b64encode(proxy['username_passwd'])
            request.headers['Proxy-Authorization'] = b'Basic ' + base64_userpasswd
        # Setting the proxy field in Request.meta is all Scrapy needs.
        request.meta['proxy'] = "http://" + proxy['ip_port']
settings.py
DOWNLOADER_MIDDLEWARES = {
# 100 is the middleware priority; set the value to None to disable it
'myscrapy.middlewares.MyHttpProxyMiddleware': 100,
}