python爬取广西人才网招聘信息并可视化

最新推荐文章于 2022-06-20 20:03:22 发布

C01acat

最新推荐文章于 2022-06-20 20:03:22 发布

阅读量2.6k

点赞数 10

分类专栏： Python 网络爬虫开发实战文章标签： python 数据分析可视化

本文链接：https://blog.csdn.net/qq_43342294/article/details/115642173

版权

Python 同时被 2 个专栏收录

10 篇文章 1 订阅

订阅专栏

网络爬虫开发实战

3 篇文章 0 订阅

订阅专栏

本文介绍了一种针对指定关键字'信息管理'的网页数据抓取策略，包括职位、公司名、薪资等9维信息的提取，并利用BeautifulSoup和正则表达式处理复杂结构。通过多进程爬取，分析了490页数据，最终可视化了薪资分布、工作经验等关键指标。

摘要由CSDN通过智能技术生成

对要爬取的数据进行分析

要爬取的网页：https://s.gxrc.com/sJob?district=1&pageSize=20&orderType=0&listValue=1.html
我们需要爬取的信息有8个维度，分别是：职位名称，公司名称，薪资，工作地，更新时间，学历，经验，岗位要求。

职位信息的爬取格式如上图所示，但是如果我们规定了某一个关键字(key)后，信息格式会变成
会出现<span class='highlight'>的标志，这样对数据的爬取有不小的限制，所以采取另一种方案。
进一步观察，可以得到图一中，href后的链接为我们需要的岗位描述以及经验、学历、岗位名称。如下图：
在这里插入图片描述
岗位名称就在标签<h1 title="">内，而学历，经验等信息都在class为vl的标签附近：
可以注意到岗位描述在标签id="examineSensitiveWordsContent"内，这里可以用BeautifulSoup直接获取text即可，其他的数据全部用正则匹配即可出来（我是能用正则就用正则）
至于公司名称，薪资，工作地，更新时间信息，在第一个页面都能获取到。

`翻页分析`

点开第2页，可以得到地址栏的变化为：

https://s.gxrc.com/sJob?district=1&pageSize=20&orderType=0&listValue=1.hml&keyword=信息管理&page=2

pageSize、listValue信息不用处理，只需要处理keyword与page两个信息即可，page明显是翻页处理的，keyword则是查询的关键字。

python代码

# -*- coding:utf-8 -*-
import requests
import re
import time
from openpyxl import Workbook
from bs4 import BeautifulSoup

key_word = '信息管理'  # search keyword sent to the site; also used as the output file name

# Fetch the raw HTML of one search-result page
def getContent(page):
    """Download one page of the search results for ``key_word``.

    Args:
        page: Page number to request (passed as the ``page`` query parameter).

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # The original concatenation appended ".hml" twice ("listValue=1.hml.hml");
    # the URL below matches the one shown by the site when paging.
    url = ('https://s.gxrc.com/sJob?district=1&pageSize=20&orderType=0'
           '&listValue=1.hml&keyword={}&page={}').format(key_word, page)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    # Fail loudly on an error page instead of parsing it as an empty result.
    response.raise_for_status()
    return response.content.decode("utf-8")

def getInfo(html, ws):
    """Parse one result page and append one row per posting to ``ws``.

    Company name, salary, city and refresh time are read from the list page
    itself; job title, education, experience and the requirement text are
    read from each posting's detail page, which is downloaded here.

    Args:
        html: HTML text of one search-result page.
        ws: Any object with an ``append(row)`` method (the worksheet).

    Raises:
        IndexError: If a detail page lacks the title, education or
            experience markup.
        requests.RequestException: If a detail page cannot be downloaded.

    A page normally holds 20 postings, but the last page (or an empty result)
    holds fewer, so rows are built from however many postings were actually
    found instead of a fixed count of 20.
    """
    ent_name = re.findall(r'class="entName">(.*?)</a>', html)           # company name
    # The first match of each of these three columns is dropped ([1:]).
    price = re.findall(r'<li class="w3">(.*?)</li>', html)[1:]          # salary
    city = re.findall(r'<li class="w4">(.*?)</li>', html)[1:]           # work place
    refresh_time = re.findall(r'<li class="w5">(.*?)</li>', html)[1:]   # refresh time
    # Protocol-relative links to the detail pages.
    deatil_info_url = re.findall(r'<a href="//(.*?)" target="_blank" class="posName">', html)

    rows = []
    listing = zip(deatil_info_url, ent_name, price, city, refresh_time)
    for url, company, salary, place, updated in listing:
        # A distinct name keeps the ``html`` parameter from being shadowed.
        detail_html = requests.get("https://" + url, timeout=10).content.decode('utf-8')
        name = re.findall(r'<h1 title="(.*?)">', detail_html)[0]
        edu = re.findall(r'</span>学历(.*?)<span class="vl">', detail_html)[0]
        expe = re.findall(r'</span>经验(.*?)<span class="vl">', detail_html)[0]
        # The requirement text spans nested tags, so it is taken with BeautifulSoup.
        soup = BeautifulSoup(detail_html, "html.parser")
        requirement = soup.find(id='examineSensitiveWordsContent').text
        rows.append([name, company, salary, place, updated, edu, expe, requirement])

    # Rows are written only after the whole page parsed, so a failure part-way
    # through never leaves a partially written page in the sheet.
    for row in rows:
        ws.append(row)

def main():
    """Crawl result pages 1..480 and store every posting in ``<key_word>.xlsx``.

    The header row is written first. Each page is fetched with getContent and
    parsed with getInfo; any exception raised for a page is printed, the
    workbook is saved, and the crawl continues with the next page. The
    workbook is saved once more after the last page.
    """
    header = ['职位名称','公司名称', '薪资','工作地','更新时间','学历','经验 ','岗位要求']
    workbook = Workbook()
    first_sheet_name = workbook.sheetnames[0]
    sheet = workbook[first_sheet_name]
    sheet.append(header)
    # The workbook is named after the search keyword.
    output_path = '{}.xlsx'.format(key_word)
    last_page = 480
    for page in range(1, last_page + 1):
        try:
            print("正在爬取第{}页".format(page))
            page_html = getContent(page)
            # Pause for two seconds on every tenth page.
            if not page % 10:
                time.sleep(2)
            getInfo(page_html, sheet)
        except Exception as err:
            print(err)
            # Save what has been collected so far.
            workbook.save(output_path)
    workbook.save(output_path)
# Start the crawl only when this file is run as a script.
if __name__ == '__main__':
    main()

在测试的时候，注意到爬取速度比较慢，于是尝试用多进程爬取，爬取太快导致ip被封了，那就只能单线程爬取了，并且我还设定了2s暂停一下，全部爬取完大概在30分钟左右，总共有9728条信息(在关键字信息管理的条件下)

数据可视化

在这里插入图片描述
数据可视化的代码大多差不多，就不分析如何写的了，贴在下面：

from openpyxl import load_workbook
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import wordcloud
import re
import jieba  # 分词jieba库

matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so Chinese labels can be drawn
matplotlib.rcParams['axes.unicode_minus'] = False  # turn off the Unicode minus sign on axes

class DrawingPicture:
    """Draw summary charts and word clouds from the scraped job workbook.

    Call ``getFile`` first to load the workbook. Each ``get*Pic`` method then
    reads one column of the first sheet (row 1 is treated as the header) and
    saves a PNG under a fixed, relative file name.
    """

    # Default font used for the word clouds; simfang.ttf must be installed
    # at this location, or another font passed to getFile.
    DEFAULT_FONT = r'C:\Windows\Fonts\simfang.ttf'

    def __init__(self):
        """Create an empty instance; ``getFile`` fills in the real values."""
        self.excel_file = None      # path of the Excel workbook
        self.city_txt_file = ''     # path of the optional city text file
        self.wb = None              # loaded workbook
        self.ws = None              # first worksheet of the workbook
        self.font = self.DEFAULT_FONT   # font file able to render Chinese

    def getFile(self, excel, txt='', font=None):
        """Load the workbook and remember the auxiliary file paths.

        Args:
            excel: Path of the Excel workbook to analyse.
            txt: Path of the city text file; only ``getPlacePic`` reads it.
            font: Font file for the word clouds. Defaults to ``DEFAULT_FONT``.
        """
        self.excel_file = excel
        self.city_txt_file = txt
        self.wb = load_workbook(self.excel_file)
        self.ws = self.wb[self.wb.sheetnames[0]]  # first sheet
        self.font = font if font else self.DEFAULT_FONT

    def _column_values(self, column):
        """Return the non-empty cell values of ``column`` from row 2 downward.

        Empty cells (``None``) and blank strings are skipped so callers never
        have to deal with them.
        """
        values = []
        for row in range(2, self.ws.max_row + 1):
            value = self.ws[column + str(row)].value
            if value is None:
                continue
            if isinstance(value, str) and not value.strip():
                continue
            values.append(value)
        return values

    def _count_values(self, column):
        """Return ``{value: occurrences}`` for ``column`` in first-seen order."""
        counts = {}
        for value in self._column_values(column):
            counts[value] = counts.get(value, 0) + 1
        return counts

    def getWorldCouldPic(self):
        """Save a word cloud of column A (job titles) as ``岗位词云图.png``."""
        text = " ".join(str(value) for value in self._column_values('A'))
        # Extend the stop words with any terms that should be left out.
        w = wordcloud.WordCloud(background_color="WHITE", font_path=self.font, stopwords=['南宁','桂林','柳州'],)
        w.generate(" ".join(jieba.lcut(text)))
        w.to_file("岗位词云图.png")

    def getSalaryPic(self):
        """Save a bar chart of the share of each salary value (column C)."""
        salary_num = self._count_values('C')
        # The original called sorted() and threw the result away; the bars are
        # now really ordered (plain text order of the salary labels).
        salaries = sorted(salary_num, key=str)
        total = sum(salary_num.values())
        sizes_pre = [round(salary_num[s] / total * 100, 2) for s in salaries]
        plt.barh(range(len(salaries)), sizes_pre, height=0.6, color='steelblue', alpha=0.8)
        plt.yticks(range(len(salaries)), salaries)
        plt.xlabel("百分比")
        plt.title("薪资占比条形图")
        for x, y in enumerate(sizes_pre):
            plt.text(y + 0.2, x - 0.1, '%s' % y)
        plt.savefig("薪水分布条形图.png", dpi = 200, bbox_inches ='tight')
        plt.close()

    def getFreshTimePic(self):
        """Save a line chart of postings per ``YYYY-MM`` taken from column E.

        Cells without a ``YYYY-MM`` pattern are skipped, as are entries whose
        year is 2016, 2017, 2018 or 2019.
        """
        time_dict = {}  # "YYYY-MM" -> number of postings
        for value in self._column_values('E'):
            found = re.search(r'(\d{4})-\d{2}', str(value))
            if found is None:
                continue
            # NOTE(review): the original comment spoke of dropping data before
            # 2019, but the code has always excluded exactly 2016-2019; that
            # behaviour is kept unchanged - confirm the intended range.
            if int(found.group(1)) in range(2016, 2020):
                continue
            month = found.group(0)
            time_dict[month] = time_dict.get(month, 0) + 1
        months = sorted(time_dict)  # chronological, since keys are YYYY-MM
        counts = [time_dict[m] for m in months]

        plt.plot(months, counts, label='信息管理与信息系统月份职位数', )
        plt.xticks(rotation=30)   # rotate the x labels by 30 degrees
        plt.xlabel("年-月份")
        interval = 400  # spacing between y ticks
        # Ticks run from 0 up to the largest count plus one interval;
        # default=0 keeps an empty sheet from raising.
        y_ticks = np.arange(0, max(counts, default=0) + interval, interval)
        plt.yticks(y_ticks)
        for a, b in zip(months, counts):
            plt.text(a, b, b, ha='center', va ='bottom', fontsize=10)
        plt.legend()
        plt.savefig("岗位数量变化折线图.png", dpi = 200, bbox_inches='tight')
        plt.close()

    def getEduPic(self):
        """Save a bar chart of how often each education level occurs (column F)."""
        edu_dict = self._count_values('F')
        levels = list(edu_dict)
        counts = list(edu_dict.values())
        # Plain lists are passed to matplotlib rather than dict views.
        plt.barh(range(len(levels)), counts, height=0.7, color='steelblue', alpha=0.8)
        plt.yticks(range(len(levels)), levels)
        plt.xlabel('数量')
        plt.title('企业对学历要求')
        for x, y in enumerate(counts):
            plt.text(y/2, x-0.1, '%s' % y)
        plt.savefig("学历要求条形图.png", dpi = 200, bbox_inches='tight')
        plt.close()

    def getExpPic(self):
        """Save a bar chart of experience requirements (column G) with percentages."""
        exp_dict = self._count_values('G')
        levels = list(exp_dict)
        counts = list(exp_dict.values())
        total = sum(counts)
        # Percentage label of every bar, in the same order as ``counts``.
        name_pre = [str(round(num / total * 100, 2)) + '%' for num in counts]
        plt.barh(range(len(levels)), counts, height=0.7, color='steelblue', alpha=0.8)
        plt.yticks(range(len(levels)), levels)
        plt.xlabel('次数')
        plt.title('企业对经验要求')
        for x, y in enumerate(counts):
            plt.text(y / 3, x - 0.1, '%s' % y)
            plt.text(y + 10, x - 0.1, '-->%s' % name_pre[x])
        plt.savefig("企业对经验要求条形图.png", dpi=200, bbox_inches='tight')
        plt.close()

    def getWordCouldRequirePic(self):
        """Save a word cloud of column H (job requirements) as ``岗位要求词云图.png``."""
        text = " ".join(str(value) for value in self._column_values('H'))
        w = wordcloud.WordCloud(background_color="WHITE", font_path = self.font, stopwords=['及','良好','的','和','与','等']).generate(" ".join(jieba.lcut(text)))
        w.to_file("岗位要求词云图.png")

    def getPlacePic(self):
        """Save a bar chart of postings per city (column D).

        Requires the city text file given to ``getFile``. The file is split
        into tokens on ``=`` and whitespace and the tokens are read in pairs.
        """
        area_place = {}
        place_num = {}  # city -> number of postings
        with open(self.city_txt_file, "r", encoding='utf-8') as f:
            all_info = re.findall(r'[^=\s]+', f.read())
        # NOTE(review): presumably the second token of each pair contains the
        # city name in double quotes and that name equals the first token -
        # confirm against a real city file.
        for p in range(0, len(all_info)-1, 2):
            area_place[all_info[p+1]] = all_info[p]
            place_num[all_info[p]] = 0
        # Empty cells are already filtered out, so ``in`` always gets a string.
        for place in self._column_values('D'):
            place = str(place)
            for v in area_place:
                if place in v:
                    chinese = re.findall(r'"(.*?)"', v)[0]
                    place_num[chinese] += 1
                    break
        plt.bar(range(len(place_num)), list(place_num.values()), align='center', alpha=0.7, width=0.7)
        plt.xticks(range(len(place_num)), list(place_num.keys()), rotation='vertical')
        plt.ylabel('个数')
        plt.xlabel("城市")
        plt.title('信息管理与信息系统岗位工作分布条形图')
        for x, y in enumerate(place_num.values()):
            plt.text(x-0.3, y, '%s' % y)
        plt.savefig("信息管理与信息系统岗位工作分布条形图.png", dpi=200, bbox_inches='tight')
        plt.close()
        
def main():
    """Load ``./信息管理.xlsx`` and draw every chart that needs no city data.

    To draw the city chart as well, pass the city text file as the second
    argument of getFile (for example "./城市分布.txt") and additionally call
    getPlacePic.
    """
    painter = DrawingPicture()
    painter.getFile("./信息管理.xlsx")
    # Charts are drawn in this order.
    charts = (
        painter.getWorldCouldPic,         # word cloud of job titles
        painter.getSalaryPic,             # salary
        painter.getFreshTimePic,          # refresh time (number of postings)
        painter.getEduPic,                # education
        painter.getExpPic,                # experience
        painter.getWordCouldRequirePic,   # word cloud of job requirements
    )
    for draw in charts:
        draw()

# Draw the charts only when this file is run as a script.
if __name__ == '__main__':
    main()