上一篇想要的数据已经基本都爬到了,下一步就是数据的清洗了。
仔细观察数据的特征后,发现其中存在太多脏数据,例如很多房地产、销售等与目标无关的职位,并且还包含很多重复项。于是我把数据一遍一遍地清洗:第一遍先在职位标题中挑出带有特定关键词的职位。例如我搜索的是大数据的工作,那么关键词就包含['数据', '分析', 'ETL', 'java', 'JAVA', '人工智能', '智能', '互联网', 'ELK', '工程师']这些,凡是标题带有这些关键词的职位都留下来,不带的都舍弃。这是第一遍。
这里的第一个函数是读取文件
第二个函数调用第一个读取的结果
def ReadFile(self, FileName):
    """Load one CSV file from the fixed ``E:/51JobData/`` directory.

    Args:
        FileName: Name of the file inside ``E:/51JobData/``.

    Returns:
        The file content as a DataFrame, decoded as GBK with the first
        line used as the header.
    """
    csv_path = 'E:/51JobData/{}'.format(FileName)
    return pd.read_csv(
        csv_path,
        engine='python',
        encoding='gbk',
        iterator=False,
        header=0,
    )
def KeyWord(self):
    """First cleaning pass: keep postings whose title contains a keyword.

    Reads ``all_jobmessage.csv`` through ``self.ReadFile``, keeps every row
    whose ``position_name`` contains at least one entry of ``key_word`` and
    writes the result to ``E:/51JobData/After_JobCountry.csv`` (GBK, no
    index column). Rows whose title is not a string are discarded.

    Raises:
        KeyError: If the input has no ``position_name`` column.
    """
    data = self.ReadFile('all_jobmessage.csv')
    key_word = ['数据', '分析', 'ETL', 'java', 'JAVA', '人工智能', '智能', '互联网', 'ELK', '工程师']

    # One boolean mask for the whole column. The isinstance guard replaces a
    # blanket try/except: non-string titles are simply not matched, while
    # real problems such as a missing column surface as an exception.
    keep = pd.Series(
        [
            isinstance(title, str) and any(word in title for word in key_word)
            for title in data['position_name']
        ],
        index=data.index,
        dtype=bool,
    )

    # Selecting from the original frame keeps the header even when nothing
    # matches, so the next pass can still read the file.
    data[keep].to_csv('E:/51JobData/After_JobCountry.csv', encoding='gbk', index=False)
    print('筛选完毕')
第二遍清洗就是去掉重复行:
def RemoveRepeat(self):
    """Second cleaning pass: drop duplicated postings.

    Two rows are duplicates when ``company``, ``position_name`` and
    ``workplace`` all match; the first occurrence is kept. Input is
    ``After_JobCountry.csv`` (via ``self.ReadFile``) and output is
    ``E:/51JobData/After_JobCountry1.csv``.
    """
    source_name = 'After_JobCountry.csv'
    frame = self.ReadFile(source_name)

    identity_columns = ['company', 'position_name', 'workplace']
    unique_rows = frame.drop_duplicates(identity_columns, keep='first')

    unique_rows.to_csv('E:/51JobData/After_JobCountry1.csv', encoding='gbk', index=False)
    print('去重完毕')
第三遍清洗去掉职位名称里带有“房产”的职位:
def RemoveKeyWord(self):
    """Third cleaning pass: remove postings whose title contains "房产".

    Reads ``After_JobCountry1.csv`` through ``self.ReadFile`` and writes the
    remaining rows to ``E:/51JobData/FinalAfter_clear.csv`` (GBK, no index
    column). Rows whose ``position_name`` is not a string are kept.

    Raises:
        KeyError: If the input has no ``position_name`` column.
    """
    data = self.ReadFile('After_JobCountry1.csv')

    # Flag the unwanted rows in one pass instead of dropping them one by one
    # from the frame that is being iterated.
    unwanted = pd.Series(
        [isinstance(title, str) and '房产' in title for title in data['position_name']],
        index=data.index,
        dtype=bool,
    )

    data[~unwanted].to_csv('E:/51JobData/FinalAfter_clear.csv', encoding='gbk', index=False)
    print('去掉关键字完毕')
第四步清洗是对薪资的整理,由于爬出来的数据里薪资单位不统一,在这里就把薪资单位统一起来,分成最高薪资和最低薪资。
def salary(self):
    """Fourth cleaning pass: convert salary text to monthly amounts in yuan.

    Reads ``FinalAfter_clear.csv`` through ``self.ReadFile``. For each row the
    ``salary`` text is parsed: "万" means x10000, "千" means x1000, and a
    yearly figure ("年") is divided by 12. The lower bound is stored in
    column position 11 and the upper bound in column position 12; an
    "X以下" value only has an upper bound. Rows without a recognised unit
    and period are left untouched, and so are rows whose numbers cannot be
    read (those are reported on stdout). The result is written to
    ``E:/51JobData/text1.csv`` (GBK, no index column).
    """

    def to_monthly_range(text):
        """Return ``(low, high)`` in yuan per month for one salary string.

        ``low`` is ``None`` for an "X以下" value. The result is ``None``
        when the text has no "万"/"千" unit or no "月"/"年" period.
        Raises ``ValueError`` or ``IndexError`` when the numbers are missing
        or malformed.
        """
        if '万' in text:
            unit = 10000
        elif '千' in text:
            unit = 1000
        else:
            return None

        if '月' in text:
            months = 1
        elif '年' in text:
            months = 12
        else:
            return None

        parts = re.split('-|万|千|元|以下', text)
        if '以下' in text:
            # "X以下": the single number is the upper bound.
            return None, float(parts[0]) * unit / months
        return float(parts[0]) * unit / months, float(parts[1]) * unit / months

    # NOTE(review): positions 11 and 12 are presumably the last_salary and
    # high_salary columns - confirm against the CSV layout.
    low_col, high_col = 11, 12

    data = self.ReadFile('FinalAfter_clear.csv')

    # enumerate() yields true row positions, which is what iloc expects.
    for pos, raw in enumerate(data['salary']):
        text = str(raw)
        if text == 'nan':  # empty cell
            continue
        try:
            bounds = to_monthly_range(text)
        except (ValueError, IndexError):
            # Parse both bounds before writing, so a bad value never leaves
            # a half-updated row behind.
            print('无法解析的薪资: {}'.format(text))
            continue
        if bounds is None:
            continue
        low, high = bounds
        if low is not None:
            data.iloc[pos, low_col] = low
        data.iloc[pos, high_col] = high

    data.to_csv('E:/51JobData/text1.csv', encoding='gbk', index=False)
    print('工资清洗完毕')
第五步是对于薪资空白的,用薪资的均值进行填充,例如最低薪资里的空白,就用最低薪资的均值填充,最高薪资就用最高薪资均值填充
def pull(self):
    """Fifth cleaning pass: fill blank salaries with the column mean.

    Reads ``full.csv`` through ``self.ReadFile``. Missing ``last_salary``
    values are replaced by the mean of ``last_salary`` and missing
    ``high_salary`` values by the mean of ``high_salary``. The two filled
    columns are placed after all other columns, and the result is written
    to ``E:/51JobData/TheFinal.csv`` (GBK, no index column).
    """
    frame = self.ReadFile('full.csv')

    salary_columns = ['last_salary', 'high_salary']
    # Fill each salary column with its own mean.
    filled = [frame[name].fillna(frame[name].mean()) for name in salary_columns]

    # Remove the unfilled originals, then append the filled columns
    # side by side (axis=1) at the end.
    remainder = frame.drop(salary_columns, axis=1)
    result = pd.concat([remainder] + filled, axis=1)

    result.to_csv('E:/51JobData/TheFinal.csv', encoding='gbk', index=False)
    print('填充完毕')
最后就清洗完成了,数据看起来也整齐很多
里边所有的文件名都是固定写好的,只需要在自己的目录下放一个已经爬好的数据文件,就可以挨个函数运行了,最终生成的文件就是清洗完毕的。最后要说的一点是数据格式不要错了,我的all_jobmessage.csv文件是这种格式的,可以根据自己的数据改一改代码。
下一步是该对数据进行分析了吧
首先是对职位分布的地点进行分析,提取出地点,取出城市。这里用到了Python的一个可视化工具pyecharts,它是基于百度开源的ECharts的Python图表库,用它来作图方便又灵活。
画地图的话首先是要导入地图包
pip install echarts-countries-pypkg
pip install echarts-china-provinces-pypkg
pip install echarts-china-cities-pypkg
pip install echarts-china-counties-pypkg
pip install echarts-china-misc-pypkg
pip install echarts-cities-pypkg
将城市放在地图上显示,画出城市的热力图和分布散点图,可以更直观地了解职位分布的聚集地。需要说的是,画散点图的时候要注意:如果是省份,就要把“省”这个字去掉;如果是市,就一定要加上“市”这个字,要不然地图上是没有办法显示出来的。
加上之后有些市可能还是无法显示;或者有些地方是自治州,在后面加上“市”之后,地图也找不到这个地方。Geo类里有一个方法get_coordinate(),它的参数是城市名:如果这个城市在自带的json文件里,就返回该城市的坐标;如果名字不在,默认返回None(只有要求它抛出异常时才会报错)。所以可以这样判断:如果返回值是None,就认为这个地点在地图上显示不了,过滤掉;把返回值不是None的地点单独存放到新的列表里。
# 散点图
def city1():
    """Render the posting counts per city as a scatter map.

    Cities come from ``city_count()``; a city is plotted only when
    ``Geo.get_coordinate`` returns something other than ``None`` for it.
    The chart is written to ``招聘分布1.html``.
    """
    geo = Geo(
        "招聘分布地",
        "全球分布",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color='#404a59',
    )

    areas, values = [], []
    for city, count in city_count().items():
        if geo.get_coordinate(city) is None:
            # No coordinate for this name, so it cannot be drawn.
            continue
        areas.append(city)
        values.append(count)

    geo.add(
        "",
        areas,
        values,
        visual_range=[0, 2000],
        maptype='china',
        visual_text_color="#fff",
        symbol_size=10,
        is_visualmap=True,
    )
    geo.show_config()
    geo.render("招聘分布1.html")  # write the HTML file
#热力图
def city2():
    """Render the posting counts per city as a heat map.

    Cities come from ``city_count()``; a city is plotted only when
    ``Geo.get_coordinate`` returns something other than ``None`` for it.
    The chart is written to ``招聘分布2.html``.
    """
    # title_pos is a position ("center", as in city1), not a chart type name.
    geo = Geo("招聘分布", "全国分布", title_color="#fff", title_pos="center",
              width=1200, height=600, background_color='#404a59')

    areas = []
    values = []
    for key, value in city_count().items():
        if geo.get_coordinate(key) is not None:
            areas.append(key)
            values.append(value)

    geo.add("招聘", areas, values, visual_range=[0, 700], type='heatmap',
            visual_text_color="#fff", symbol_size=15,
            is_visualmap=True, is_roam=False)
    geo.show_config()
    geo.render(path="招聘分布2.html")
接下来分析这些工作对经验的要求,画出柱状图:横坐标是经验年限,纵坐标是人才需求量。还是3-4年经验的最吃香啊:
# 柱状图
def PicForExperience():
    """Draw a bar chart of posting counts per experience bucket.

    The counts are hard-coded; the chart is written to ``经验柱状图.html``.
    """
    # (experience bucket, number of postings)
    demand = [
        ('无工作经验', 2311),
        ('1', 1181),
        ('2', 2229),
        ('3-4', 4067),
        ('5-7', 1883),
        ('8-9', 175),
        ('10', 63),
    ]
    labels = [bucket for bucket, _ in demand]
    counts = [count for _, count in demand]

    chart = Bar("经验要求", "经验")
    chart.add("经验", labels, counts, is_more_utils=True)
    chart.show_config()
    chart.render('经验柱状图.html')
# 漏斗图
def loc3():
    """Draw a funnel chart of posting counts per experience bucket.

    The counts are hard-coded; the chart is written to ``漏斗图.html``.
    """
    # (experience bucket, number of postings)
    demand = [
        ('无工作经验', 2311),
        ('1', 1181),
        ('2', 2229),
        ('3-4', 4067),
        ('5-7', 1883),
        ('8-9', 175),
        ('10', 63),
    ]
    labels = [bucket for bucket, _ in demand]
    counts = [count for _, count in demand]

    chart = Funnel('漏斗图')
    chart.add(
        '商品',
        labels,
        counts,
        is_label_show=True,
        label_pos='inside',
        label_text_color="#fff",
    )
    chart.show_config()
    chart.render(path="漏斗图.html")
学历要求,本科需求量是最大的:
def Education():
    """Count education requirements: the six most common plus "其他".

    Reads the ``education`` column of ``E:/51JobData/text666.csv`` (GBK).
    The six most frequent values keep their own entry; every remaining value
    is summed into a single "其他" entry, which is present even when it
    is 0.

    Returns:
        ``(attr, value)``: the keys view and the values view of one dict, in
        descending order of frequency with "其他" last.
    """
    # Local import so the block does not depend on the file's import list.
    from collections import Counter

    df = pd.read_csv('E:/51JobData/text666.csv', encoding='gbk')

    # most_common() returns (value, count) pairs sorted by descending count.
    ranked = Counter(df['education']).most_common()
    top, rest = ranked[:6], ranked[6:]

    counts = dict(top)
    # Everything after the top six, starting with the 7th entry.
    counts['其他'] = sum(count for _, count in rest)
    return counts.keys(), counts.values()
# 普通饼图
def pie():
    """Draw a plain pie chart of the ``Education()`` counts.

    The chart is written to ``picture/pie.html``.
    """
    labels, counts = Education()

    chart = Pie("学历要求")
    chart.add("", labels, counts, is_label_show=True)
    chart.show_config()
    chart.render('picture/pie.html')
# 玫瑰饼图
def pan8():
    """Draw a rose-type pie chart of the ``Education()`` counts.

    The chart is written to ``玫瑰饼图.html``.
    """
    labels, counts = Education()

    rose = Pie("学历需求情况", title_pos='center', width=900)
    rose.add(
        "学历",
        labels,
        counts,
        center=[50, 50],
        is_random=True,
        radius=[30, 75],
        rosetype='area',
        is_legend_show=False,
        is_label_show=True,
    )
    rose.show_config()
    rose.render(path='玫瑰饼图.html')
不同经验薪资对比:
# 数据提取
def PicExperience1():
    """Chart the average minimum and maximum salary per experience bucket.

    Reads ``E:/51JobData/text666.csv`` (GBK). Each row is assigned to a
    bucket by the text in front of the first "经验", "年" or "以下" in its
    ``experience`` value, and ``last_salary`` / ``high_salary`` are averaged
    per bucket, truncated to ``int``. The "无工作经验" figures come from
    ``PicExperience()``. A bucket without rows is reported as 0, and rows
    whose ``experience`` is not a string are skipped. The bar chart is
    written to ``picture/salary.html``.
    """
    df = pd.read_csv('E:/51JobData/text666.csv', encoding='gbk')
    flag, NoExperience_last, NoExperience_high, _ = PicExperience()

    buckets = ['1', '2', '3-4', '5-7', '8-9', '10']
    # bucket -> [row count, sum of last_salary, sum of high_salary]
    totals = {bucket: [0, 0, 0] for bucket in buckets}

    rows = zip(df['experience'], df['last_salary'], df['high_salary'])
    for experience, low, high in rows:
        if not isinstance(experience, str):
            continue  # empty cell
        bucket = re.split('经验|年|以下', experience)[0]
        if bucket in totals:
            entry = totals[bucket]
            entry[0] += 1
            entry[1] += low
            entry[2] += high

    def average(total, count):
        """Truncated mean, or 0 when there is nothing to average."""
        return int(total / count) if count else 0

    salary_low = [average(totals[b][1], totals[b][0]) for b in buckets]
    salary_low.append(average(sum(NoExperience_last), flag))
    salary_high = [average(totals[b][2], totals[b][0]) for b in buckets]
    salary_high.append(average(sum(NoExperience_high), flag))

    # Label order matches the two salary lists: the buckets first, then
    # the no-experience group.
    lls = buckets + ['无工作经验']

    bar = Bar("大数据方向工资")
    bar.add("最低薪资", lls, salary_low, is_label_show=True)
    bar.add("最高薪资", lls, salary_high, is_label_show=True)
    bar.render('picture/salary.html')
# 普通折线图
def pan4():
    """Draw a line chart of minimum and maximum salary per experience bucket.

    The figures are hard-coded; the chart is written to ``01-04折线图.html``.
    """
    # (experience bucket, minimum salary, maximum salary)
    rows = [
        ('无工作经验', 9558, 15521),
        ('1', 7609, 12761),
        ('2', 9801, 16451),
        ('3-4', 12294, 20039),
        ('5-7', 16773, 26566),
        ('8-9', 21111, 33127),
        ('10', 26732, 42285),
    ]
    labels = [row[0] for row in rows]
    lowest = [row[1] for row in rows]
    highest = [row[2] for row in rows]

    chart = Line('工资折线图')
    chart.add('最高工资', labels, highest, mark_point=['max'])
    chart.add('最低工资', labels, lowest, mark_point=['min'], is_smooth=True)
    chart.show_config()
    chart.render(path='01-04折线图.html')
# 阶梯折线图
def pan5():
    """Draw a step line chart of minimum and maximum salary per bucket.

    The figures are hard-coded; the chart is written to
    ``01-05阶梯折线图.html``.
    """
    labels = ['无工作经验', '1', '2', '3-4', '5-7', '8-9', '10']
    lowest = [9558, 7609, 9801, 12294, 16773, 21111, 26732]
    highest = [15521, 12761, 16451, 20039, 26566, 33127, 42285]

    chart = Line('阶梯折线图')
    # Minimum first, then maximum; both drawn as labelled step lines.
    series = [('最低工资', lowest), ('最高工资', highest)]
    for series_name, figures in series:
        chart.add(series_name, labels, figures, is_step=True, is_label_show=True)
    chart.show_config()
    chart.render(path='01-05阶梯折线图.html')
# 面积折线图
def pan6():
    """Draw a filled (area) line chart of minimum and maximum salary.

    The figures are hard-coded; the chart is written to ``面积折线图.html``.
    """
    # (experience bucket, minimum salary, maximum salary)
    rows = [
        ('无工作经验', 9558, 15521),
        ('1', 7609, 12761),
        ('2', 9801, 16451),
        ('3-4', 12294, 20039),
        ('5-7', 16773, 26566),
        ('8-9', 21111, 33127),
        ('10', 26732, 42285),
    ]
    labels = [bucket for bucket, _, _ in rows]
    lowest = [low for _, low, _ in rows]
    highest = [high for _, _, high in rows]

    chart = Line("面积折线图")
    chart.add(
        "最低薪资",
        labels,
        lowest,
        is_fill=True,
        line_opacity=0.2,
        area_opacity=0.4,
        symbol=None,
        mark_point=['max'],
    )
    chart.add(
        "最高薪资",
        labels,
        highest,
        is_fill=True,
        area_color='#a3aed5',
        area_opacity=0.3,
        is_smooth=True,
    )
    chart.show_config()
    chart.render(path='面积折线图.html')
# 柱形图-折线图
def pan7():
    """Overlay a bar chart of minimum salary with a line of maximum salary.

    The figures are hard-coded; the chart is written to
    ``柱形图-折线图.html``.
    """
    from pyecharts import Bar, Line, Overlap

    # (experience bucket, minimum salary, maximum salary)
    rows = [
        ('无工作经验', 9558, 15521),
        ('1', 7609, 12761),
        ('2', 9801, 16451),
        ('3-4', 12294, 20039),
        ('5-7', 16773, 26566),
        ('8-9', 21111, 33127),
        ('10', 26732, 42285),
    ]
    labels = [row[0] for row in rows]
    lowest = [row[1] for row in rows]
    highest = [row[2] for row in rows]

    bars = Bar("柱形图-折线图")
    bars.add('最低薪资', labels, lowest)

    curve = Line()
    curve.add('最高薪资', labels, highest)

    # The bar layer goes in first, the line layer on top of it.
    combined = Overlap()
    for layer in (bars, curve):
        combined.add(layer)
    combined.show_config()
    combined.render(path='柱形图-折线图.html')