前言:相信来看可视化的兄弟多多少少都有一点python的基础,那么就开始进入我们今天的正题了。
主要内容:
1.用python里面的xlrd库获取我们爬下来的数据,不知道这个库也没事,我直接反手一个链接
2.相信都去看了一手这个库了,然后就是根据获取到的数据来分析了
3.再用一下python中的pyecharts这个库,说实话,这是真的好用
4.下面来康康我们的数据分析,哦不对,先说说我队友爬的是什么,我们爬的是前程无忧招聘网上的信息,先放一张照片给你们康康大概有哪些,等一下丢一个队友的链接在后面,想了解的可以去看看
代码展示:
import xlrd # 读Excel数据用
# Excel workbook with the scraped 51job listings (C++ postings, per the file name).
file_location = "51job工作列表-北京,上海,广州,深圳,武汉-c++.xls"
data = xlrd.open_workbook(file_location)
# First worksheet of the workbook; the salary analysis below reads its cells.
sheet = data.sheet_by_index(0)
# Worksheet looked up by name; `table` is not read again in the visible code.
# NOTE(review): presumably the same sheet as index 0 - confirm.
table = data.sheet_by_name('51job')
def numberstr2num(num_str):
    """Convert a signed integer or decimal string to a number.

    Accepted text is an optional ``+``/``-`` sign, one or more digits and an
    optional fractional part, e.g. ``"12"``, ``"-3.5"`` or ``"+7"``.

    :param num_str: the text to convert.
    :return: an ``int`` when the text has no fractional part, a ``float``
        when it has one, or ``None`` when the text is not such a number.
    :raises TypeError: if ``num_str`` is not a ``str``.
    """
    import re

    # Raise explicitly instead of using ``assert``, which is stripped
    # when Python runs with -O.
    if not isinstance(num_str, str):
        raise TypeError(
            "num_str must be a str, got %s" % type(num_str).__name__
        )

    # fullmatch (not match) so trailing junk such as "12abc" is rejected
    # instead of being silently converted character by character.
    if not re.fullmatch(r'[-+]?\d+(\.\d+)?', num_str):
        return None

    # The built-in converters handle the sign and give correctly rounded
    # floats, so no manual digit arithmetic is needed.
    if '.' in num_str:
        return float(num_str)
    return int(num_str)
def str2num(num_str):
    """Turn a string of decimal digits into the integer it spells.

    No validation happens here; the calling function has already checked
    that the text is numeric.

    :param num_str: digit characters, most significant digit first.
    :return: the integer value (``0`` for an empty string).
    """
    value = 0
    for digit_char in num_str:
        # Horner's scheme: shift the running total one decimal place,
        # then add the next digit (48 is ord('0')).
        value = value * 10 + (ord(digit_char) - 48)
    return value
def pointstr2num(point_str):
    """Turn the digits after a decimal point into their fractional value.

    No validation happens here; the calling function has already checked
    that the text is numeric.

    :param point_str: the digit characters that follow the decimal point.
    :return: the fraction they represent (``0`` for an empty string).
    """
    fraction = 0
    # The n-th digit after the point contributes digit / 10**n
    # (48 is ord('0')).
    for place, digit_char in enumerate(point_str, start=1):
        fraction += (ord(digit_char) - 48) / 10 ** place
    return fraction
def change(s):
    """Convert a 51job salary string to an average monthly salary in yuan.

    The text must be a single number or a ``low-high`` range, followed by a
    unit and a pay period separated by ``/``, e.g. ``"0.8-1.6万/月"``,
    ``"15-20万/年"`` or ``"200元/天"``.

    * units: ``万`` = 10,000 yuan, ``千`` = 1,000 yuan, ``元`` = 1 yuan
    * periods: ``月`` is already monthly, ``年`` is divided by 12,
      ``天`` is multiplied by 30 (a month is counted as 30 days)

    :param s: salary text taken from the spreadsheet.
    :return: a one-element list holding the mean of the given bounds in
        yuan per month, or ``[0]`` when the value is empty, not a string,
        or not in the format above.
    """
    import re

    unit_to_yuan = {'万': 10000, '千': 1000, '元': 1}

    # Empty cells and non-text cells cannot be parsed; report "unknown"
    # instead of crashing on the indexing the format check would need.
    if not isinstance(s, str):
        return [0]

    parsed = re.fullmatch(
        r'(\d+(?:\.\d+)?)(?:-(\d+(?:\.\d+)?))?([万千元])/([月年天])', s
    )
    if parsed is None:
        return [0]

    low_text, high_text, unit, period = parsed.groups()
    # A single figure such as "1.5万/月" has no upper bound group.
    bounds = [float(text) for text in (low_text, high_text) if text is not None]

    monthly_amounts = []
    for bound in bounds:
        amount = bound * unit_to_yuan[unit]
        if period == '年':
            # Yearly pay spread over 12 months, so every result is monthly.
            amount /= 12
        elif period == '天':
            amount *= 30
        monthly_amounts.append(amount)

    return [sum(monthly_amounts) / len(monthly_amounts)]
# s = '0.8-1.6万/月'
# s = change(s)
# print(s)
# Count how many postings in the currently open sheet fall into each of the
# seven monthly-salary bands. Each bound below is the inclusive upper limit
# of a band; anything above the last bound goes into the seventh band.
band_upper_bounds = (5000, 10000, 20000, 30000, 40000, 50000)
band_counts = [0] * 7

# Column 2 of every row after the header row is passed to change().
days = [sheet.cell_value(r, 2) for r in range(1, sheet.nrows)]
print(days)
for salary_text in days:
    monthly = int(change(salary_text)[0])
    print(monthly)
    if monthly == 0:
        # change() yields 0 for text it could not interpret; skip those rows.
        continue
    for band, upper in enumerate(band_upper_bounds):
        if monthly <= upper:
            band_counts[band] += 1
            break
    else:
        band_counts[6] += 1

sum1, sum2, sum3, sum4, sum5, sum6, sum7 = band_counts
from pyecharts import options as opts
from pyecharts.charts import Bar
l1 = [sum1, sum2, sum3, sum4, sum5, sum6, sum7]
# Open the Java workbook and count its postings per monthly-salary band.
file_location = "51job工作列表-成都-java.xls"
data = xlrd.open_workbook(file_location)
# data holds the workbook read from the Excel file
sheet = data.sheet_by_index(0)
table = data.sheet_by_name('51job')

# Inclusive upper limit of each of the first six bands; larger values fall
# into the seventh band.
java_upper_bounds = (5000, 10000, 20000, 30000, 40000, 50000)
java_counts = [0] * 7

# Column 2 of every row after the header row is passed to change().
days1 = [sheet.cell_value(r, 2) for r in range(1, sheet.nrows)]
for salary_text in days1:
    monthly = int(change(salary_text)[0])
    if monthly == 0:
        # change() yields 0 for text it could not interpret; skip those rows.
        continue
    for band, upper in enumerate(java_upper_bounds):
        if monthly <= upper:
            java_counts[band] += 1
            break
    else:
        java_counts[6] += 1

c1, c2, c3, c4, c5, c6, c7 = java_counts
ll1 = [c1, c2, c3, c4, c5, c6, c7]
# Open the Python workbook and count its postings per monthly-salary band.
file_location = "51job工作列表-成都python.xls"
data = xlrd.open_workbook(file_location)
# data holds the workbook read from the Excel file
sheet = data.sheet_by_index(0)
table = data.sheet_by_name('51job')

# Inclusive upper limit of each of the first six bands; larger values fall
# into the seventh band.
python_upper_bounds = (5000, 10000, 20000, 30000, 40000, 50000)
python_counts = [0] * 7

# Column 2 of every row after the header row is passed to change().
days2 = [sheet.cell_value(r, 2) for r in range(1, sheet.nrows)]
for salary_text in days2:
    monthly = int(change(salary_text)[0])
    if monthly == 0:
        # change() yields 0 for text it could not interpret; skip those rows.
        continue
    for band, upper in enumerate(python_upper_bounds):
        if monthly <= upper:
            python_counts[band] += 1
            break
    else:
        python_counts[6] += 1

cc1, cc2, cc3, cc4, cc5, cc6, cc7 = python_counts
ll11 = [cc1, cc2, cc3, cc4, cc5, cc6, cc7]
# Draw one grouped bar chart comparing the salary-band counts of the three
# languages.
# Band labels for the x axis, in the same order as the counts in each list.
l2 = ['0-5000', '5000-10000', '10000-20000', '20000-30000', '30000-40000', '40000-50000', '50000以上']
from pyecharts.charts import Bar
from pyecharts import options as opts
bar = Bar()
# x axis: the salary bands
bar.add_xaxis(l2)
# y axis: one series per language.
# ll1 was counted from the java workbook and ll11 from the python workbook,
# so each list is attached to the series carrying its own language name.
bar.add_yaxis("c++", l1)
bar.add_yaxis("java", ll1)
bar.add_yaxis("python", ll11)
# chart title
bar.set_global_opts(title_opts=opts.TitleOpts(title="热门语言"))
# The argument is the name of the generated HTML file.
# NOTE(review): there is no path separator after "E:", so on Windows this is
# presumably relative to the current directory of drive E - confirm.
bar.render('E:热门语言工资大概状况.html')
看内里 :
不是说只有这几种语言比较热门,只是我们学校大多数人学的就是这些,所以就分析了这三个。
学编程语言的同学可以拿这份代码去分析自己想了解的语言的情况,这里就不多说了。
用饼图分析数据:
# Tally the degree named in column 5 of each data row and chart every
# degree's share of the postings as a pie.
dayss = [sheet.cell_value(r, 5) for r in range(1, sheet.nrows)]

# Start every tally from zero so values left in these names by earlier code
# cannot leak into the percentages.
# c1..c4 count 本科 / 大专 / 硕士 / 博士, c5 counts everything else and c6 is
# the total number of rows.
c1 = c2 = c3 = c4 = c5 = c6 = 0
for i in dayss:
    c6 += 1
    if i == "本科":
        c1 += 1
    elif i == "大专":
        c2 += 1
    elif i == "硕士":
        c3 += 1
    elif i == "博士":
        c4 += 1
    else:
        c5 += 1

# Percentage of all rows per category; an empty sheet gives all zeros
# instead of a division by zero.
num = [round(count / c6 * 100, 3) if c6 else 0 for count in (c1, c2, c3, c4, c5)]

c = Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
# NOTE(review): L1 is defined outside this excerpt; presumably the five
# category labels in the same order as num - confirm.
c.add("", [list(z) for z in zip(L1, num)])
# NOTE(review): s1 is defined outside this excerpt; its second item is used
# as the title prefix - confirm what it holds.
w = s1[1]
c.set_global_opts(title_opts=opts.TitleOpts(title="%s-学位概况" % w),
                  toolbox_opts=opts.ToolboxOpts(is_show=True))
# Label every slice as "name:value%".
c.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c}%'))
c.render("学位占比.html")
再来看看几个城市的程序岗位分布:
# Count the postings per city and plot the counts on a map of China.
l = ["北京", "上海", "深圳", "武汉", "广州"]
days = [sheet.cell_value(r, 7) for r in range(1, sheet.nrows)]

# Only the first two characters of the column-7 text are compared with the
# city names; rows that match none of the five cities are ignored.
city_counts = dict.fromkeys(l, 0)
for location in days:
    prefix = location[0:2]
    if prefix in city_counts:
        city_counts[prefix] += 1

c1, c2, c3, c4, c5 = (city_counts[city] for city in l)
v = [c1, c2, c3, c4, c5]

geo_chart = Geo()
geo_chart = geo_chart.add_schema(maptype="china")
# One [city, count] pair per city.
geo_chart = geo_chart.add("geo", [[city, count] for city, count in zip(l, v)])
geo_chart = geo_chart.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo_chart = geo_chart.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(),
    title_opts=opts.TitleOpts(title="全国各省程序员工作地方数据分布"),
)
c = geo_chart
c.render("E:分布地图.html")
点地图上的小红点可以看数据哦
再来个上面地区的词云耍耍:
# Build a word cloud from (name, value) pairs and title it after the job and
# place taken from `s`.
words = list(zip(name, value))
word = WordCloud()
word.add("", words, word_size_range=[20, 200])

# NOTE(review): `s` is defined outside this excerpt; presumably a file name
# shaped like "<prefix>-<place>-<job>.xls" - confirm.
name_parts = s.split("-")
work = name_parts[2].replace('.xls', '')
place = name_parts[1].replace('.xls', '')
# A place of "1" is shown as "全国" (nationwide).
if place == "1":
    place = "全国"

chart_title = "%s在%s岗位分布词云图:" % (work, place)
word.set_global_opts(title_opts=opts.TitleOpts(title=chart_title))
word.render("%s在%s工作岗位分布.html" % (work, place))
看看就行
丢个队友链接
总结:我是废物QAQ