Recommended plugin: XPath Helper.
XPath Helper is a Chrome extension that generates an XPath expression for any element you click on a page. The scrape as a whole combines XPath, regular expressions, message middleware, and a multithreaded scheduling framework.
The most important work is getting the XPath expressions right; they are the soul of the whole scrape.
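Before wiring XPath into a spider, it helps to test expressions interactively. Here is a minimal sketch using parsel (the selector library Scrapy itself builds on), run against a made-up HTML fragment shaped like a 51job detail page; the fragment is my own, but the expressions are the ones the spider below uses:

from parsel import Selector

# Hypothetical fragment mimicking a 51job detail page
html = '''
<div class="cn">
  <h1>数据采集工程师</h1>
  <strong>1-1.5万/月</strong>
</div>
'''
sel = Selector(text=html)
print(sel.xpath("//div[@class='cn']/h1/text()").get())      # 数据采集工程师
print(sel.xpath("//div[@class='cn']/strong/text()").get())  # 1-1.5万/月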
1. Create the crawler project
scrapy startproject [project_name]
Create a spider with the command:
scrapy genspider yingcaiwang "domain"
Run the spider with:
scrapy crawl [spider_name]   (the spider's name attribute; here it is qiangcheng)
Create a start.py launcher; a sketch follows.
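A minimal start.py, assuming it sits in the project root next to scrapy.cfg, so the crawl can be launched from an IDE instead of the command line:

# start.py: run "scrapy crawl qiangcheng" programmatically
from scrapy import cmdline

cmdline.execute("scrapy crawl qiangcheng".split())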
2. Project layout
2.1 items.py: models for the data the spider scrapes (see the sketch after this list)
2.2 middlewares.py: holds the project's middlewares
2.3 pipelines.py: persists the item models to local disk or a database
2.4 settings.py: the crawler's configuration (request headers, request interval, IP proxy pool, and so on)
2.5 scrapy.cfg: the project-level configuration file
2.6 spiders package: every spider you write lives in here.
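A sketch of items.py covering exactly the fields the spider below fills in; the class name QianchengwuyouItem matches the spider's import:

# items.py
import scrapy


class QianchengwuyouItem(scrapy.Item):
    job_name = scrapy.Field()        # job title
    job_money = scrapy.Field()       # salary, e.g. "1-1.5万/月"
    job_company = scrapy.Field()     # company name
    job_place = scrapy.Field()       # work location
    job_experience = scrapy.Field()  # required experience
    job_education = scrapy.Field()   # required education
    job_nz = scrapy.Field()          # duties and requirements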
3. Writing the spider
First, a look at my project structure.
Straight to the code:
# -*- coding: utf-8 -*-
import scrapy
from qianchengwuyou.items import QianchengwuyouItem


class QiangchengSpider(scrapy.Spider):
    name = 'qiangcheng'
    allowed_domains = ['51job.com']
    start_urls = ["https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="]

    def parse(self, response):
        # Collect the detail-page link of every posting on the list page
        job_list = response.xpath("//div[@class='dw_table']//div[@class='el']/p[1]//span/a/@href").getall()
        for i_items in job_list:
            yield scrapy.Request(url=i_items, callback=self.jiexi_content, dont_filter=True)
        # Follow the next-page link, if there is one
        next_pages = response.xpath("//div[@class='p_in']//li[last()]/a/@href").get()
        if next_pages:
            yield scrapy.Request(url=next_pages, callback=self.parse, dont_filter=True)

    def jiexi_content(self, response):
        item = QianchengwuyouItem()
        item['job_name'] = response.xpath("//div[@class='cn']/h1/text()").extract_first()
        # extract_first() returns None instead of raising, so fall back with "or"
        item['job_money'] = response.xpath("//div[@class='cn']/strong/text()").extract_first() or "面议"
        item['job_company'] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # One line on the page holds the location, experience, and education requirements
        job_all = response.xpath("//div[@class='cn']//p[2]/text()").getall()
        # Split that list apart; any missing piece gets a sensible default
        try:
            item['job_place'] = job_all[0].strip()
        except IndexError:
            item['job_place'] = ""
        # (disabled experiment) skill tags:
        # item['jineng'] = response.xpath("//div[@class='mt10']/p[2]/a/text()").extract()
        try:
            item['job_experience'] = job_all[1].strip()
        except IndexError:
            item['job_experience'] = '无要求'
        try:
            item['job_education'] = job_all[2].strip()
        except IndexError:
            item['job_education'] = '不要求'
        # (disabled experiment) keyword paragraph:
        # item['import_zi'] = response.xpath('//div[@class="bmsg job_msg inbox"]/div[@class="mt10"]/p[2]/text()').get()
        # Extract every paragraph of the job description (duties and requirements)
        job_content = response.xpath("//div[@class='bmsg job_msg inbox']/p/text()").getall()
        item['job_nz'] = job_content or ["上岗安排"]
        yield item
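A note on dont_filter=True: it disables Scrapy's built-in duplicate-request filter for these requests, so nothing gets silently skipped; the trade-off is that a page linking back to an already-visited URL can make the crawl loop, so the flag is worth dropping once the XPaths are trusted.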
Configure the database connection and the request headers in settings.py.
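A minimal settings.py sketch; the MONGO_* key names are my own choice but match what the pipeline below reads, and the values come from the visualization scripts further down:

# settings.py (excerpt)
MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "admin"
MONGO_COLL = "qiangcheng"

# Pretend to be a normal browser
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

ROBOTSTXT_OBEY = False  # assumption: ignore robots.txt so the search pages are reachable
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}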
Then write the items into the database in pipelines.py:
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class QianchengwuyouPipeline:
    def __init__(self):
        # Connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = self.client[settings['MONGO_DB']]    # handle to the database
        self.coll = self.db[settings['MONGO_COLL']]    # handle to the collection
        # If the database requires credentials:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        post_item = dict(item)           # convert the item to a plain dict
        self.coll.insert_one(post_item)  # insert one record into the collection
        return item                      # returning the item lets it flow onward and print to the console

    def close_spider(self, spider):
        self.client.close()
The crawl in progress.
The scraped data stored in MongoDB.
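A quick way to confirm the records actually landed, assuming the same database and collection names used throughout:

import pymongo

client = pymongo.MongoClient("127.0.0.1", port=27017)
coll = client['admin']['qiangcheng']
print(coll.count_documents({}))  # total postings scraped so far
print(coll.find_one())           # peek at one stored document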
4. Visualizing the data
from pyecharts.charts import Pie
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("127.0.0.1", port=27017)
# Select the database
db = client['admin']
# Select the collection
mytable = db['qiangcheng']

# One query per city: job_place contains the city AND job_name contains "数据采集"
x_data = ["成都", "北京", "上海", "广州", "深圳"]
y_data = []
for city in x_data:
    query = {"$and": [{"job_place": {"$regex": city}},
                      {"job_name": {"$regex": "数据采集"}}]}
    # The regex already restricts job_place, so counting the matches is enough
    y_data.append(mytable.count_documents(query))

data_pair = list(zip(x_data, y_data))
pie = Pie()
pie.add(series_name="数据采集地区岗位", data_pair=data_pair)
pie.render("数据采集饼图.html")
5. The resulting pie chart
6. Charting the maximum, minimum, and average salary for big data, data collection, and data analysis positions
from pyecharts import options as opts
from pyecharts.charts import Bar
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("127.0.0.1", port=27017)
# Select the database
db = client['admin']
# Select the collection
mytable = db['qiangcheng']


def salary_stats(query):
    """Return [highest, lowest, average] of the leading salary figure per posting."""
    salaries = []
    for doc in mytable.find(query):
        # job_money looks like "1-1.5万/月"; take the number before the dash
        head = doc["job_money"].split("-")[0]
        try:
            salaries.append(float(head))
        except ValueError:
            continue  # skip values that are not plain numbers
    avg = float("{:.1f}".format(sum(salaries) / len(salaries)))
    return [max(salaries), min(salaries), avg]


def tu(y1, y2, y3):
    gongzi = ['数据分析', '大数据开发', '数据采集']
    bar = Bar()
    bar.add_xaxis(xaxis_data=gongzi)
    # First argument is the legend name, the second is the y-axis data
    bar.add_yaxis(series_name="最高工资", y_axis=y1)
    bar.add_yaxis(series_name="最低工资", y_axis=y2)
    bar.add_yaxis(series_name="平均工资", y_axis=y3)
    # Chart title and toolbox
    bar.set_global_opts(title_opts=opts.TitleOpts(title='工资分析图', subtitle='工资单位:/月'),
                        toolbox_opts=opts.ToolboxOpts())
    bar.render("三个地区的数据分析.html")


if __name__ == '__main__':
    # One query per job keyword; the "/月" regex keeps only monthly salaries
    queries = [
        {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "数据分析"}}]},
        {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "大数据工程师"}}]},
        {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "数据采集"}}]},
    ]
    stats = [salary_stats(q) for q in queries]  # compute each query once
    y1 = [s[0] for s in stats]  # maximums
    y2 = [s[1] for s in stats]  # minimums
    y3 = [s[2] for s in stats]  # averages
    tu(y1, y2, y3)
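One caveat: 51job salary strings mix units (for example "1-1.5万/月" versus "8-10千/月"), and the statistics above only read the raw leading number. A hypothetical normalize_salary helper (the name and regex are my own, and it assumes only those two range formats occur) could put everything in 元/月 first:

import re

def normalize_salary(job_money):
    """Turn '1-1.5万/月' or '8-10千/月' into a (low, high) pair in 元/月.
    Hypothetical helper; assumes the salary is a range in one of those formats."""
    m = re.match(r"([\d.]+)-([\d.]+)(万|千)/月", job_money)
    if m is None:
        return None
    factor = 10000 if m.group(3) == "万" else 1000
    return float(m.group(1)) * factor, float(m.group(2)) * factor

print(normalize_salary("1-1.5万/月"))  # (10000.0, 15000.0)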
There is also a chart breaking salaries down by year; the approach is much the same.
Finally, I present the data on a web page; mine is a static page.
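Since every pyecharts chart renders to standalone HTML, one way to assemble that static page is pyecharts' own Page container; a minimal sketch, reusing the pie and bar objects built above:

from pyecharts.charts import Page

# Combine the charts built earlier into one static HTML page
page = Page()
page.add(pie, bar)         # pie and bar are the chart objects from the sections above
page.render("index.html")  # open index.html directly, or serve it as a static file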