# Build a User-Agent pool: set the User-Agent header on each request.
import random
class RandomUserAgent(object):
    """Downloader middleware that assigns a random User-Agent to each request.

    The pool of User-Agent strings is loaded from the ``USER_AGENT`` setting,
    read as a list via ``settings.getlist`` so several agents can be rotated.
    """

    def __init__(self, user_agent):
        # user_agent: list of User-Agent strings to choose from.
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy hook: build the middleware from the crawler's settings."""
        return cls(
            user_agent=crawler.settings.getlist('USER_AGENT')
        )

    def process_request(self, request, spider):
        """Set a randomly chosen User-Agent on the outgoing request.

        ``setdefault`` preserves any User-Agent already present on the
        request's headers.
        """
        # Guard: random.choice raises IndexError on an empty pool; leave the
        # request untouched in that case instead of crashing the crawl.
        if self.user_agent:
            request.headers.setdefault('User-Agent', random.choice(self.user_agent))
# Build an IP proxy pool: rewrite the proxy via the request.meta['proxy'] field.
class RandomProxy(object):
    """Downloader middleware that supplies requests with a proxy from a pool.

    The proxy list is loaded from the ``IPLIST`` setting (read as a list via
    ``settings.getlist``). The ``process_request`` implementation continues
    past this block in the original file.
    """

    def __init__(self, iplist):
        # iplist: list of proxy addresses to choose from.
        self.iplist = iplist

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy hook: build the middleware from the crawler's settings."""
        return cls(
            iplist=crawler.settings.getlist('IPLIST')
        )
def process_request(self, request,