Part 1: Modify settings first
1. ROBOTSTXT_OBEY controls whether the spider obeys the site's robots.txt rules; obeying them may prevent us from crawling the data we want, so set it to False:
ROBOTSTXT_OBEY = False
2. Crawling too fast may get the spider noticed by the target site, so add a pause between downloads:
DOWNLOAD_DELAY = 0.5
3. Disable cookie tracking:
COOKIES_ENABLED = False
4. Change the default downloader middlewares so our own User-Agent middleware is used:
DOWNLOADER_MIDDLEWARES = {
    'JobSpider.middlewares.JobUserAgentMiddleware': 543,
    # Under External Libraries, site-packages/scrapy/downloadermiddlewares/useragent.py contains the UserAgentMiddleware class that Scrapy enables by default, so it is disabled here; the replacement logic lives in the JobUserAgentMiddleware class in the project's middlewares.py (see the middlewares section below)
    # The number is the priority: the smaller it is, the earlier the middleware runs; None disables the middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
5. Uncomment the item pipelines setting and point it at the class we will write in pipelines.py:
# A pipeline receives the item data returned by the spider
ITEM_PIPELINES = {
    'JobSpider.pipelines.ToCsvPipeline': 300,
}
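Taken together, the edited portion of settings.py ends up looking roughly like this (a sketch assuming the project is named JobSpider, as in the rest of this post):

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'JobSpider.middlewares.JobUserAgentMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

ITEM_PIPELINES = {
    'JobSpider.pipelines.ToCsvPipeline': 300,
}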
Part 2: Modify middlewares (to use a random User-Agent)
1. Copy the UserAgentMiddleware class from site-packages/scrapy/downloadermiddlewares/useragent.py (under External Libraries) into the project's middlewares.py. By default it looks like this:
from scrapy import signals


class UserAgentMiddleware(object):
    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
2. First rename the class to JobUserAgentMiddleware.
3. __init__: import the fake-useragent package at the top of middlewares.py:
from fake_useragent import UserAgent
and change self.user_agent = user_agent to self.user_agent = UserAgent()
4. from_crawler: the arguments passed to cls(...) are assigned to the matching parameters of the constructor, so comment out the two lines marked in red and add the line below instead.
# Several values can be passed to cls(...) and they will be forwarded to the constructor's parameters, e.g.:
# def __init__(self, user_agent='Scrapy', name=''):
# o = cls(crawler.settings['USER_AGENT'], 'zhangsan')
o = cls()
5. spider_opened: the right-hand side of the assignment reads the user_agent attribute from the spider object, falling back to the current self.user_agent if the spider has none. Since we always want the random value, comment this line out and add pass.
6. process_request: use the package's random property to get a random value: self.user_agent.random. The combined result is sketched below.
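Putting steps 1 to 6 together, the modified class in JobSpider/middlewares.py would look roughly like this (a sketch; it assumes fake-useragent is installed, e.g. via pip install fake-useragent):

from scrapy import signals
from fake_useragent import UserAgent


class JobUserAgentMiddleware(object):
    def __init__(self, user_agent='Scrapy'):
        # Use a fake_useragent generator instead of a fixed string
        self.user_agent = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # No constructor arguments are needed any more
        o = cls()
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        # Do not let individual spiders override the random User-Agent
        pass

    def process_request(self, request, spider):
        if self.user_agent:
            # .random returns a different real-browser User-Agent string on each call
            request.headers.setdefault(b'User-Agent', self.user_agent.random)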
Part 3: Write the spider code
As in the earlier post on Scrapy basics, run scrapy genspider <name> <domain> to create a spider file. We'll use job as the example (crawling 51job, Zhaopin, etc.).
# Leave two blank lines between classes
import scrapy


class JobSpider(scrapy.Spider):
    name = 'job'
    # The allowed domains can be changed
    allowed_domains = ['51job.com']
    # Location: Shanghai, keyword: python. To also search for Java, HTML, etc., just add more URLs to the list: start_urls = [url1, url2, url3, ...]
    start_urls = ["http://search.51job.com/list/020000,020000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    ]
    # For now, think of it this way: the response for each url is passed to the function named in callback
    def parse(self, response):
        # Parse the first page, then move on to the next page
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_job_info,
            meta={},
            dont_filter=True,
        )
        yield scrapy.Request(
            url=response.url,
            callback=self.parse_next_page,
            dont_filter=True,
        )
    def parse_next_page(self, response):
        # Parse the next page; XPath is used here
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        # If a next page exists
        if next_page:
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_job_info,
                meta={},
                dont_filter=True,
            )
            # Recursion: a function that calls itself; here parse_next_page effectively schedules itself again via the callback
            yield scrapy.Request(
                url=next_page,
                callback=self.parse_next_page,
                meta={},
                dont_filter=True,
            )
    def parse_job_info(self, response):
        # Parse the job listings; an id is unique on the page, a class is not necessarily
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        for job_div in job_div_list:
            # Data cleaning: remove whitespace, blank lines, special characters, etc. from both ends of each value
            # strip() is the most common operation; cleaning also covers dropping invalid data (e.g. incomplete records) and duplicates
            # Commas are replaced so they don't break the CSV columns later
            job_name = job_div.xpath("p/span/a/@title").extract_first('no job title').strip().replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first('no company info').strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first('no location info').strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first('salary negotiable').strip()
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first('no posting date').strip()
            job_type = '51job' if '51job.com' in response.url else 'other'
            print(job_type, job_name, job_company_name, job_place, job_salary, job_time)
Part 4: Save the crawled data to a file (three steps)
1. Modify items.py:
import scrapy


class JobspiderItem(scrapy.Item):
    job_name = scrapy.Field()
    job_company_name = scrapy.Field()
    job_place = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()
    job_type = scrapy.Field()
2. Modify the spider code
# Import the JobspiderItem class from items.py
from ..items import JobspiderItem
# Add the following inside the for loop of parse_job_info (see the sketch after this snippet):
item = JobspiderItem()
item['job_name'] = job_name
item['job_company_name'] = job_company_name
item['job_place'] = job_place
item['job_salary'] = job_salary
item['job_time'] = job_time
item['job_type'] = job_type
yield item
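In context, the end of the for loop in parse_job_info then looks roughly like this (a sketch; the print call from Part 3 can be kept for debugging or dropped):

        for job_div in job_div_list:
            # ... field extraction as shown in Part 3 ...
            item = JobspiderItem()
            item['job_name'] = job_name
            # ... the remaining fields are assigned the same way ...
            yield item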
3. Modify pipelines.py (essentially writing a new class):
class ToCsvPipeline(object):
    def process_item(self, item, spider):
        with open("job.csv", "a", encoding="gb18030") as f:
            job_name = item['job_name']
            job_company_name = item['job_company_name']
            job_place = item['job_place']
            job_salary = item['job_salary']
            job_time = item['job_time']
            job_type = item['job_type']
            # Append a newline, otherwise all the records end up on a single line
            job_info = [job_name, job_company_name, job_place, job_salary, job_time, job_type, '\n']
            f.write(",".join(job_info))
        # Return the item so it is passed on to the next pipeline
        return item
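With all three steps in place, run the spider from the project directory and each item is appended as one line of job.csv (gb18030 is used so the file opens cleanly in Excel on a Chinese-locale system):

scrapy crawl job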