1. 通过命令创建项目
scrapy startproject JobSpider
2. 用pycharm打开项目
3. 通过命令创建爬虫
scrapy genspider job baidu.com
scrapy startproject JobSpider
2. 用pycharm打开项目
3. 通过命令创建爬虫
scrapy genspider job baidu.com
4. 配置settings
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
'JobSpider.middlewares.JobUserAgentMiddleware': 543,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None
}
"""调用Pipeline中自己写的类"""
ITEM_PIPELINES = {
'JobSpider.pipelines.ToCsvPipeline': 300,
}
5. 自定义UserAgentMiddleWare
可以直接粘现成的
# 自己添加的获取useragent类
class JobUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    A ``UserAgent`` pool (presumably from ``fake_useragent`` — import not
    visible in this chunk) is created at construction time, and
    ``process_request`` pulls a fresh random agent string for every
    outgoing request that does not already carry one.
    """

    def __init__(self, user_agent='Scrapy', name=''):
        # NOTE(review): both parameters are accepted but never used — the
        # random UserAgent pool always takes over.
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this factory; crawler settings are deliberately
        # ignored here since the UA pool needs no configuration.
        return cls()

    def spider_opened(self, spider):
        # Intentional no-op hook; connect it via crawler.signals if a
        # per-spider `user_agent` override is ever needed.
        pass

    def process_request(self, request, spider):
        if not self.user_agent:
            return
        # Bytes header key per Scrapy convention; setdefault keeps any
        # User-Agent the spider set explicitly.
        request.headers.setdefault(b'User-Agent', self.user_agent.random)