Recommended plugin: XPath Helper.
XPath Helper is a Chrome extension that generates an XPath expression for any element you click on a page. The scrape as a whole combines XPath, regular expressions, message middleware, and a multithreaded scheduling framework.
The most important work is getting the XPath expressions right; they are the soul of the whole scrape.
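Before wiring XPath into a spider, it helps to test expressions interactively. Here is a minimal sketch using parsel (the selector library Scrapy itself builds on), run against a made-up HTML fragment shaped like a 51job detail page; the fragment is my own, but the expressions are the ones the spider below uses:

from parsel import Selector

# Hypothetical fragment mimicking a 51job detail page
html = '''
<div class="cn">
  <h1>数据采集工程师</h1>
  <strong>1-1.5万/月</strong>
</div>
'''
sel = Selector(text=html)
print(sel.xpath("//div[@class='cn']/h1/text()").get())      # 数据采集工程师
print(sel.xpath("//div[@class='cn']/strong/text()").get())  # 1-1.5万/月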
1. Create the crawler project
scrapy startproject [project_name]
Create a spider with the command:
scrapy genspider yingcaiwang "domain"
Run the spider with:
scrapy crawl [spider_name]   (the spider's name attribute; here it is qiangcheng)
Create a start.py launcher; a sketch follows.
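A minimal start.py, assuming it sits in the project root next to scrapy.cfg, so the crawl can be launched from an IDE instead of the command line:

# start.py: run "scrapy crawl qiangcheng" programmatically
from scrapy import cmdline

cmdline.execute("scrapy crawl qiangcheng".split())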
2. Project layout
2.1 items.py: models for the data the spider scrapes (see the sketch after this list)
2.2 middlewares.py: holds the project's middlewares
2.3 pipelines.py: persists the item models to local disk or a database
2.4 settings.py: the crawler's configuration (request headers, request interval, IP proxy pool, and so on)
2.5 scrapy.cfg: the project-level configuration file
2.6 spiders package: every spider you write lives in here.
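A sketch of items.py covering exactly the fields the spider below fills in; the class name QianchengwuyouItem matches the spider's import:

# items.py
import scrapy


class QianchengwuyouItem(scrapy.Item):
    job_name = scrapy.Field()        # job title
    job_money = scrapy.Field()       # salary, e.g. "1-1.5万/月"
    job_company = scrapy.Field()     # company name
    job_place = scrapy.Field()       # work location
    job_experience = scrapy.Field()  # required experience
    job_education = scrapy.Field()   # required education
    job_nz = scrapy.Field()          # duties and requirements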
3. Writing the spider
First, a look at my project structure.
Straight to the code:
# -*- coding: utf-8 -*-
import scrapy
from qianchengwuyou.items import QianchengwuyouItem


class QiangchengSpider(scrapy.Spider):
    name = 'qiangcheng'
    allowed_domains = ['51job.com']
    start_urls = ["https://search.51job.com/list/000000,000000,0000,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="]

    def parse(self, response):
        # Collect the detail-page link of every posting on the list page
        job_list = response.xpath("//div[@class='dw_table']//div[@class='el']/p[1]//span/a/@href").getall()
        for i_items in job_list:
            yield scrapy.Request(url=i_items, callback=self.jiexi_content, dont_filter=True)
        # Follow the next-page link, if there is one
        next_pages = response.xpath("//div[@class='p_in']//li[last()]/a/@href").get()
        if next_pages:
            yield scrapy.Request(url=next_pages, callback=self.parse, dont_filter=True)

    def jiexi_content(self, response):
        item = QianchengwuyouItem()
        item['job_name'] = response.xpath("//div[@class='cn']/h1/text()").extract_first()
        # extract_first() returns None instead of raising, so fall back with "or"
        item['job_money'] = response.xpath("//div[@class='cn']/strong/text()").extract_first() or "面议"
        item['job_company'] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # One line on the page holds the location, experience, and education requirements
        job_all = response.xpath("//div[@class='cn']//p[2]/text()").getall()
        # Split that list apart; any missing piece gets a sensible default
        try:
            item['job_place'] = job_all[0].strip()
        except IndexError:
            item['job_place'] = ""
        # (disabled experiment) skill tags:
        # item['jineng'] = response.xpath("//div[@class='mt10']/p[2]/a/text()").extract()
        try:
            item['job_experience'] = job_all[1].strip()
        except IndexError:
            item['job_experience'] = '无要求'
        try:
            item['job_education'] = job_all[2].strip()
        except IndexError:
            item['job_education'] = '不要求'
        # (disabled experiment) keyword paragraph:
        # item['import_zi'] = response.xpath('//div[@class="bmsg job_msg inbox"]/div[@class="mt10"]/p[2]/text()').get()
        # Extract every paragraph of the job description (duties and requirements)
        job_content = response.xpath("//div[@class='bmsg job_msg inbox']/p/text()").getall()
        item['job_nz'] = job_content or ["上岗安排"]
        yield item
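A note on dont_filter=True: it disables Scrapy's built-in duplicate-request filter for these requests, so nothing gets silently skipped; the trade-off is that a page linking back to an already-visited URL can make the crawl loop, so the flag is worth dropping once the XPaths are trusted.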
Configure the database connection and the request headers in settings.py.
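A minimal settings.py sketch; the MONGO_* key names are my own choice but match what the pipeline below reads, and the values come from the visualization scripts further down:

# settings.py (excerpt)
MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "admin"
MONGO_COLL = "qiangcheng"

# Pretend to be a normal browser
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

ROBOTSTXT_OBEY = False  # assumption: ignore robots.txt so the search pages are reachable
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}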
Then write the items into the database in pipelines.py:
import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class QianchengwuyouPipeline:
    def __init__(self):
        # Connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = self.client[settings['MONGO_DB']]    # handle to the database
        self.coll = self.db[settings['MONGO_COLL']]    # handle to the collection
        # If the database requires credentials:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        post_item = dict(item)           # convert the item to a plain dict
        self.coll.insert_one(post_item)  # insert one record into the collection
        return item                      # returning the item lets it flow onward and print to the console

    def close_spider(self, spider):
        self.client.close()
The crawl in progress.
The scraped data stored in MongoDB.
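A quick way to confirm the records actually landed, assuming the same database and collection names used throughout:

import pymongo

client = pymongo.MongoClient("127.0.0.1", port=27017)
coll = client['admin']['qiangcheng']
print(coll.count_documents({}))  # total postings scraped so far
print(coll.find_one())           # peek at one stored document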
4. Visualizing the data
from pyecharts.charts import Pie
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("127.0.0.1", port=27017)
# Select the database
db = client['admin']
# Select the collection
mytable = db['qiangcheng']

# One query per city: job_place contains the city AND job_name contains "数据采集"
x_data = ["成都", "北京", "上海", "广州", "深圳"]
y_data = []
for city in x_data:
    query = {"$and": [{"job_place": {"$regex": city}},
                      {"job_name": {"$regex": "数据采集"}}]}
    # The regex already restricts job_place, so counting the matches is enough
    y_data.append(mytable.count_documents(query))

data_pair = list(zip(x_data, y_data))
pie = Pie()
pie.add(series_name="数据采集地区岗位", data_pair=data_pair)
pie.render("数据采集饼图.html")
5. The resulting pie chart
6. Charting the maximum, minimum, and average salary for big data, data collection, and data analysis positions
from pyecharts import options as opts
from pyecharts.charts import Bar
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("127.0.0.1", port=27017)
# Select the database
db = client['admin']
# Select the collection
mytable = db['qiangcheng']


def salary_stats(query):
    """Return [highest, lowest, average] of the leading salary figure per posting."""
    salaries = []
    for doc in mytable.find(query):
        # job_money looks like "1-1.5万/月"; take the number before the dash
        head = doc["job_money"].split("-")[0]
        try:
            salaries.append(float(head))
        except ValueError:
            continue  # skip values that are not plain numbers
    avg = float("{:.1f}".format(sum(salaries) / len(salaries)))
    return [max(salaries), min(salaries), avg]


def tu(y1, y2, y3):
    gongzi = ['数据分析', '大数据开发', '数据采集']
    bar = Bar()
    bar.add_xaxis(xaxis_data=gongzi)
    # First argument is the legend name, the second is the y-axis data
    bar.add_yaxis(series_name="最高工资", y_axis=y1)
    bar.add_yaxis(series_name="最低工资", y_axis=y2)
    bar.add_yaxis(series_name="平均工资", y_axis=y3)
    # Chart title and toolbox
    bar.set_global_opts(title_opts=opts.TitleOpts(title='工资分析图', subtitle='工资单位:/月'),
                        toolbox_opts=opts.ToolboxOpts())
    bar.render("三个地区的数据分析.html")


if __name__ == '__main__':
    # One query per job keyword; the "/月" regex keeps only monthly salaries
    queries = [
        {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "数据分析"}}]},
        {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "大数据工程师"}}]},
        {"$and": [{"job_money": {"$regex": "/月"}}, {"job_name": {"$regex": "数据采集"}}]},
    ]
    stats = [salary_stats(q) for q in queries]  # compute each query once
    y1 = [s[0] for s in stats]  # maximums
    y2 = [s[1] for s in stats]  # minimums
    y3 = [s[2] for s in stats]  # averages
    tu(y1, y2, y3)
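One caveat: 51job salary strings mix units (for example "1-1.5万/月" versus "8-10千/月"), and the statistics above only read the raw leading number. A hypothetical normalize_salary helper (the name and regex are my own, and it assumes only those two range formats occur) could put everything in 元/月 first:

import re

def normalize_salary(job_money):
    """Turn '1-1.5万/月' or '8-10千/月' into a (low, high) pair in 元/月.
    Hypothetical helper; assumes the salary is a range in one of those formats."""
    m = re.match(r"([\d.]+)-([\d.]+)(万|千)/月", job_money)
    if m is None:
        return None
    factor = 10000 if m.group(3) == "万" else 1000
    return float(m.group(1)) * factor, float(m.group(2)) * factor

print(normalize_salary("1-1.5万/月"))  # (10000.0, 15000.0)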
There is also a chart breaking salaries down by year; the approach is much the same.
Finally, I present the data on a web page; mine is a static page.
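Since every pyecharts chart renders to standalone HTML, one way to assemble that static page is pyecharts' own Page container; a minimal sketch, reusing the pie and bar objects built above:

from pyecharts.charts import Page

# Combine the charts built earlier into one static HTML page
page = Page()
page.add(pie, bar)         # pie and bar are the chart objects from the sections above
page.render("index.html")  # open index.html directly, or serve it as a static file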