I. Creating a Scrapy Project
1. cmd commands
1. cd Desktop
2. scrapy startproject <project name>
3. cd <project name>
4. scrapy genspider <spider name> <website domain>
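For concreteness, a sample run of the four commands above; the project name job, spider name job_spider, and domain example.com are illustrative placeholders, not from the original:

cd Desktop
scrapy startproject job
cd job
scrapy genspider job_spider example.com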
II. settings.py Configuration
1. The robots.txt protocol (crawler protocol): ROBOTSTXT_OBEY defaults to True, i.e. the spider obeys it
2. Delay between outgoing requests
3. Disable cookie tracking
4. Set the user-agent (the original before/after screenshots are omitted; see the settings.py sketch after this list)
5. Configure the pipelines (before/after screenshots likewise omitted; also covered in the sketch below)
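A minimal settings.py sketch covering the five changes above, assuming a hypothetical project named job; the delay value, the user-agent string, and the pipeline path job.pipelines.JobPipeline with priority 300 are illustrative, not from the original:

# settings.py
# 1. Do not obey robots.txt (the default is True, i.e. obey)
ROBOTSTXT_OBEY = False

# 2. Delay between requests, in seconds
DOWNLOAD_DELAY = 3

# 3. Disable cookie tracking
COOKIES_ENABLED = False

# 4. Set a fixed user-agent (section III replaces this with a random one)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# 5. Enable the project pipeline (lower number = higher priority)
ITEM_PIPELINES = {
    'job.pipelines.JobPipeline': 300,
}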
III. middlewares Configuration
Setting the User-Agent
Required import:
from fake_useragent import UserAgent
Find Scrapy's built-in middleware and copy it from:
D:/Anaconda/Lib/site-packages/scrapy/downloadermiddlewares/useragent.py
from scrapy import signals


class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
Copy it into middlewares.py.
After modification it becomes:
class JobUserMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy', name=''):
        # UserAgent() comes from fake_useragent (imported at the top of middlewares.py)
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # o = cls(crawler.settings['USER_AGENT'], '张三')
        # Arguments passed after cls are assigned to the matching
        # parameters of the constructor
        o = cls()
        # crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        # The right-hand side below reads the user_agent attribute from the
        # spider; if the spider has none, it falls back to self.user_agent
        # self.user_agent = getattr(spider, 'user_agent', self.user_agent)
        pass

    def process_request(self, request, spider):
        if self.user_agent:
            # .random picks a random real-world user-agent string per request
            request.headers.setdefault(b'User-Agent', self.user_agent.random)
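The custom middleware takes effect only after it is registered in settings.py. A sketch, assuming the module path job.middlewares (the project name is an assumption inferred from JobUserMiddleware); disabling the built-in UserAgentMiddleware matters because process_request uses setdefault, which will not overwrite a User-Agent header the built-in middleware has already set:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # enable the custom random user-agent middleware
    'job.middlewares.JobUserMiddleware': 543,
    # disable Scrapy's built-in UserAgentMiddleware so the User-Agent
    # header is still unset when our setdefault() call runs
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}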