Python可视化分析职位信息
又是一年毕业季,小伙伴们是否找到自己心仪的工作呢。工作难,找工作更难。╮( ̄▽ ̄")╭哎!人生苦短,工作真难。
话不多说,就进入正题吧!
第一关 爬取数据
在此,我选择爬取某招聘网站上的数据(希望网站爸爸不要封我的IP,我只爬一丢丢数据)。以下附上Python源码,大家也可自行发挥。
# -*- coding: utf-8 -*-
"""Scrape "big data" job postings from 51job.com and append them to jobinfo.csv.

For each listing page (1..12) the script collects the summary fields from the
search-result table, then follows every posting link to scrape the job
description and company details from the detail page. Results are appended to
jobinfo.csv (GBK-encoded, no header) one page at a time.
"""
import requests
from lxml import etree
import time
import pandas as pd
import random

# Browser-like request headers so the site serves the regular desktop markup.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36'}
# Per-request timeout in seconds.
tm = 10
# Search URL split around the page number. NOTE: the original copy had the
# "&degreefrom" parameter mojibake'd into "°reefrom" (the "&deg" prefix was
# rendered as the degree sign), which corrupted the query string.
url_head = "https://search.51job.com/list/000000,000000,0000,00,0,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,"
url_end = ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

for pageNum in range(1, 13):
    try:
        print('正在爬取第', pageNum, '页数据...')
        url = url_head + str(pageNum) + url_end
        web = requests.get(url, headers=head, timeout=tm)
        # 51job serves GBK-encoded pages.
        web.encoding = 'GBK'
        dom = etree.HTML(web.text)
        # Summary fields from the result table: job title, company, location,
        # salary, release date — one entry per listing row (50 rows per page).
        JobName = dom.xpath("//div[@class='dw_table']/div[@class='el']//p/span/a[@target='_blank']/@title")
        CompanyName = dom.xpath("//div[@class='dw_table']/div[@class='el']/span[@class='t2']/a[@target='_blank']/@title")
        CompanyArea = dom.xpath("//div[@class='dw_table']/div[@class='el']/span[@class='t3']/text()")
        JobSalary = [span.text for span in dom.xpath("//div[@class='dw_table']/div[@class='el']/span[@class='t4']")]
        JobReleaseTime = dom.xpath("//div[@class='dw_table']/div[@class='el']/span[@class='t5']/text()")
        href = dom.xpath("//div[@class='dw_table']/div[@class='el']//p/span/a[@target='_blank']/@href")
        JobDes = []
        CompanyType = []
        CompanyScale = []
        CompanyProfession = []
        JobOtherInfo = []
        # Follow every detail-page link from this listing page.
        for url_sub in href:
            web_sub = requests.get(url_sub, headers=head, timeout=tm)
            web_sub.encoding = 'GBK'
            dom_sub = etree.HTML(web_sub.text)
            # Full text of the job-description block, with spaces and
            # non-breaking spaces stripped out.
            job_des = dom_sub.xpath("//div[@class='bmsg job_msg inbox']//text()")
            job_des_text = ''.join(job_des).replace(" ", "").replace("\xa0", "")
            # Company type / scale / industry from the sidebar card
            # (absolute paths — fragile; verify against the live markup).
            company_type = dom_sub.xpath("/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[1]/@title")
            company_scale = dom_sub.xpath("/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[2]/@title")
            company_profession = dom_sub.xpath("/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[3]/@title")
            other_info = dom_sub.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/@title")
            # Always append exactly one value per detail page (empty string
            # when the element is missing) so every column stays aligned with
            # the summary lists; the original skipped missing values, which
            # shifted rows against each other.
            JobDes.append(job_des_text)
            CompanyType.append(company_type[0] if company_type else '')
            CompanyScale.append(company_scale[0] if company_scale else '')
            CompanyProfession.append(company_profession[0] if company_profession else '')
            JobOtherInfo.append(other_info[0].replace("\xa0", "") if other_info else '')
            # Polite crawl delay: 2.05–3.0 s between detail-page requests.
            time.sleep(2 + random.randint(1, 20) / 20)
        df = pd.DataFrame()
        df['职位名称'] = JobName
        df['公司名称'] = CompanyName
        df['工作地点'] = CompanyArea
        df['公司类型'] = CompanyType
        df['公司规模'] = CompanyScale
        df['行业领域'] = CompanyProfession
        df['薪资水平'] = JobSalary
        df['发布时间'] = JobReleaseTime
        df['职位描述'] = JobDes
        df['其他信息'] = JobOtherInfo
    except (requests.RequestException, ValueError, IndexError) as exc:
        # Network failure, or column-length mismatch when the page markup
        # changed; report the real error instead of a hard-coded message.
        print("数据爬取异常:", exc)
        time.sleep(2 + random.randint(1, 20) / 20)
        continue
    try:
        # Append this page's rows; no header so repeated appends stay uniform.
        df.to_csv("jobinfo.csv", mode='a+', encoding='GBK', header=None)
    except OSError as exc:
        print("出现异常:", exc)
        continue
    time.sleep(2 + random.randint(1, 20) / 20)
爬取完毕,开始数据处理。我好难,第一次用FineBI……说实话FineBI的数据处理还可行,但奈何我笨到用不来,只好祭出Office大法,来完成对数据的处理。
第二关 数据处理
我就不展示过程了,一把心酸泪(┳_┳)…直接附上结果
第三关 数据可视化
使用可视化软件FineBI,点击FineBI的数据准备。。将第二关的csv文件上传到数据集中。。上传后可以对csv文件进行编辑,操作很(hua)多(哨)。
最后的效果。。。
审美不行的我,能做出这样已经知足了。。这方面的大佬很多,我只是一只刚入门的小白。啊啊啊,要期末了,还没复习,赶紧复习!!最后,祝愿小伙伴们都取得好成绩。。