Python爬取前程无忧宁波职位薪酬并进行初步分析
对自己的学习经历做个记录。
一、用Scrapy爬取数据并存入MongoDB
spider.py
import scrapy
from www51job.items import Www51JobItem
class nbcaiwu(scrapy.Spider):#要使用 scrapy 爬虫,继承 scrapy.Spider 这个类,这样才能使用它定义的一些方法
name = "nbcaiwu" #定义一个爬虫的名称
#定义请求
def start_requests(self):
    """Yield the initial request(s) for the 51job search-result listing.

    Each URL in the seed list is requested once and its response is
    handed to ``self.parse``.

    The keyword segment ``%25E8%25B4%25A2%25E5%258A%25A1`` is the
    double-URL-encoded form of "财务" (finance).
    NOTE(review): ``080300`` is presumably the 51job area code for Ningbo
    and the trailing ``1`` before ``.html`` the page number -- confirm
    against the site.
    """
    # The query string originally read "cotype=99°reefrom=99": the
    # "&deg" prefix of "&degreefrom" had been decoded as the HTML entity
    # for the degree sign, dropping the "degreefrom" filter and
    # corrupting "cotype". The intended "&degreefrom=99" is restored here.
    urls = [
        'https://search.51job.com/list/080300,000000,0000,00,3,99,%25E8%25B4%25A2%25E5%258A%25A1,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=',
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
content_list_div = response.css('div#resultList.dw_table div.el')
for content_div in content_list_div:
item = Www51JobItem()
item['职位名'] = content_div.css('p.t1 span a::text').get()
item['公司名'] = content_div.css('span.t2 a::attr(title)').get()
item['工作地点'] = content_div.css('span.t3::text').get()
item['薪资'] = content_div.css('span.t