前言:由于很多人问我清洗后 job.csv 里无数据的问题,经排查,应该是所用 python 包的版本差异导致的(主要是 pandas 库)。
在此本应提供我当时使用的这几个包的版本号,供读者对照。
(注:具体的版本号清单未包含在本节中,请读者注意核对自己环境中 pandas 等库的版本。)
一.数据采集(python爬虫)
爬虫这块不多说啦(主打数据分析),采集到的信息先转入excel表格,然后进行清洗、可视化。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
import random,time
from lxml import html
key = '大数据'
headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US,en;q=0.5",
"Connection": "keep-alive",
"Host": "jobs.51job.com",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
def get_urls(page):
url0 = 'http://search.51job.com/list/000000,000000,0000,00,9,99,' + key + ',2,' + str(page) + '.html'
req0 = requests.get(url0, headers, timeout=10)
req0.decoding = 'gbk'
result = re.compile(r'class="t1 ">.*? <a target="_blank" title=".*?" href="(.*?)".*? <span class="t2">',re.S)#无re.S只在每一行内匹配
url = re.findall(result, req0.text)
return url
def str1(x):
y = ''
for x0 in x:
y = y + x0 + ';'
return y
def get_content(url):
req1 = requests.get(url, headers, timeout=10)
req1.encoding = 'gbk'
t1 = html.fromstring(req1.text)
try:
job_name = t1.xpath('//div[@class="tHeader tHjob"]//h1/text()')[0].strip()
company_name = t1.xpath('//p[@class="cname"]//a/text()')[0].strip()
job_place = t1.xpath('//p[@class="msg ltype"]/text()')[0].strip().split('-')[0]#由于有的职位信息只有城市,此处只保留城市
workExperience = t1.xpath('//p[@class="msg ltype"]/text()')[1].strip()
educationBackground = t1.xpath('//p[@class="msg ltype"]/text()')[2].strip()
require_num = t1.xpath('//p[@class="msg ltype"]/text()')[3].strip()
date = t1.xpath('//p[@class="msg ltype"]/text()')[4].strip()
Salary = t1.xpath('//div[@class="tHeader tHjob"]//strong/text()')[0].strip()
company_type = t1.xpath('//div[@class="com_tag"]/p/text()')[0].strip()
company_num = t1.xpath('//div[@class="com_tag"]/p/text()')[1].strip()
company_business = str1(t1.xpath('//div[@class="com_tag"]/p//a/text()'))#自定义函数str()
job_treatment = str1(re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), req1.text))
job_describe = str1(t1.xpath('//div[@class="bmsg job_msg inbox"]/p/text()'))
content = str(company_name) + ',' + str(job_name) + ',' + str(job_place) + ',' + str(workExperience) + ',' +\
str(educationBackground) + ',' + str(require_num) + ',' + str(date) + ',' + str(Salary) + ',' + str(
company_type) + ',' + str(company_num) + ',' + str(company_business) + ',' + str(job_treatment)+ ',' + str(job_describe) + '\n'
file.write(content)
return content
except Exception as e:
print(e)
if __name__ == '__main__':
file = open('51job.csv', 'w', encoding='gbk')
content0 = 'company_name,job_name,job_place,workExperience,educationBackground,require_num,date,Salary,company_type,' \
'company_num,company_business,job_treatment,job_describe'+'\n'
file.write(content0)
for page in range(1,200):
print('正在爬取第{}页信息'.format(page))
#time.sleep(random.random() + random.randint(1, 5))
urls = get_urls(page)
try:
for url in urls:
try:
data = get_content(url)
print(data)
#time.sleep(random.random() + random.randint(0,1))
except :
print('None')
except Exception as e:
print(e)
file.close()
二.数据清洗及处理(用到pandas,numpy等库,需自行安装)
(生成文件job.csv)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import re
data0 = pd.read_excel('data0.xlsx', encoding='gbk')
def get_salary(salary): #自定义针对薪资信息处理的函数
if '-' in salary: # 针对1-2万/月或者10-20万/年的情况,包含-
low_salary = re.findall(re.compile('(\d*\.?\d+)'), salary)[0]
high_salary = re.findall(re.compile('(\d?\.?\d+)'), salary)[1]
if u'万' in salary and u'年' in salary: # 单位统一成千/月的形式
low_salary = round(float(low_salary) / 12 * 10,2)
high_salary = round(float(high_salary) / 12 * 10,2)
elif u'万' in salary and u'月' in salary:
low_salary = round(float(low_salary) * 10,2)
high_salary = round(float(high_salary) * 10,2)
else: # 针对20万以上/年和100元/天这种情况,不包含-,取最低工资,没有最高工资
low_salary = re.findall(re.compile('(\d*\.?\d+)'), salary)[0]
high_salary = np.NAN
if u'万' in salary and u'年' in salary: # 单位统一成千/月的形式
low_salary = round(float(low_salary) / 12 * 10,2)
elif u'万' in salary and u'月' in salary:
low_salary = round(float(low_salary) * 10,2)
elif u'元' in salary and u'天' in salary:
low_salary = round(float(low_salary) / 1000 * 21,2) # 每月工作日21天
return low_salary,high_salary
def get_experience(exp): #自定义处理工作经验的函数
if u'无' in exp:
low_exp = [0]
else:
low_exp = re.findall(re.compile('(\d*)'), exp)
return low_exp[0]
#数据清洗
def data_clear(df):
df.drop(columns='job_describe',inplace=True)
#data.drop_duplicates(subset='company_name', inplace=True)
df.dropna(axis=0, how='any', inplace=True) # 删除有缺失值的数据
print('清洗有缺失值的数据后的数据shape:', df.shape)
df = df[df.job_name.str.contains(r'.*?数据.*?')] # 挑选含有'...数据...'的行
print('挑选职位信息中含有"数据"后的数据shape:', df.shape)
df.to_csv('job0.csv')
df = pd.read_csv('job0.csv')
df = df.iloc[:,1:]
cols1 = [x for i, x in enumerate(df.index) if u'招' in df.loc[i, 'educationBackground']]
df.drop(cols1, axis=0,inplace=True)
df.to_csv('job0.csv')
print('清洗学历信息含有招人数后的数据shape:', df.shape)
df = pd.read_csv('job0.csv')
df = df.iloc[:,1:]
cols2 = [x for i, x in enumerate(df.index) if u'经验' in df.loc[i, 'educationBackground']]
df.drop(cols2, axis=0,inplace=True)
df.to_csv('job0.csv')
print('清洗学历信息含有工作经验后的数据shape:', df.shape)
df = pd.read_csv('job0.csv')
df = df.iloc[:,1:]
cols3 = [x for i,x in enumerate(df.index) if u'省' in df.loc[i, 'job_place']]
df.drop(cols3, axis=0,inplace=True)
df.to_csv('job0.csv')
print('清洗job_place含有省份后的数据shape:',df.shape)
# 数据预处理
def data_deal():
data = pd.read_csv('job0.csv')
data= data.iloc[:, 1:]
low_list, high_list, exp_list = [], [], []
for i in range(0,data.shape[0]):
salary_lh = get_salary(data['Salary'][i])
low_list.append(salary_lh[0])
high_list.append(salary_lh[1])
exp = get_experience(data['workExperience'][i])
exp_list.append(exp)
col_name = data.columns.tolist()
col_name.insert(col_name.index('Salary'),'low_salary')
data = data.reindex(columns=col_name)
data.rename(columns={'Salary': 'high_salary'},inplace=True)
data['low_salary'], data['high_salary'] = low_list, high_list # 解决高低工资,工资含有年薪等,统一月薪k/约、月
data['workExperience'] = exp_list # 解决工作经验,统一取最低工作经验
#print(data['high_salary'].isnull().sum()) #high_salary存在45个缺失值,故删除
data.dropna(axis=0,subset=['high_salary'],inplace=True)#删除45high_salary
data.index = np.arange(data.shape[0]) # data.index = np.arange(len(data.index))
data.to_csv('job.csv', encoding='utf-8')
print('预处理后数据shape:',data.shape)
if __name__ == '__main__':
data_clear(data0)
data_deal()
三.数据可视化(matplotlib,pyecharts库)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pyecharts import Geo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyecharts import WordCloud
from mpl_toolkits.mplot3d import Axes3D
plt.rcParams['font.sans-serif'] = 'simhei'
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('job.csv',encoding='gbk')
def edu_com(df):
edu = df['educationBackground'].value_counts()
y1 = [edu.index[i] for i in range(0, edu.shape[0])]
x1 = [edu[i] for i in range(0, edu.shape[0])]
ax = plt.figure(figsize=(12, 6))
ax1 = ax.add_subplot(1, 2, 1)
plt.barh(y1, x1, height=0.2, color='green')
plt.xlabel('公司数量')
plt.ylabel('学历要求')
plt.title('大数据职位对学历要求条形图')
ax2 = ax.add_subplot(1, 2, 2)
com = df['company_type'].value_counts()
y = [com.index[i] for i in range(0, com.shape[0])]
x = [com[i] for i in range(0, com.shape[0])]
plt.barh(y, x, height=0.2, color='purple')
plt.xlabel('公司数量')
plt.ylabel('公司类型')
plt.title('大数据职位公司类型条形图')
plt.savefig('大数据职位学历及公司类型条形图.png')
plt.show()
def exp_edu_sal(df):
d1 = df[['workExperience','low_salary','high_salary']].groupby('workExperience').mean()
s1 = d1.index
s11 = d1['low_salary']
s12 = d1['high_salary']
d2 = df[['educationBackground','low_salary','high_salary']].groupby('educationBackground').mean()
s2 = d2.index
s21 = d2['low_salary']
s22 = d2['high_salary']
p = plt.figure(figsize=(12,6))
p1 = p.add_subplot(1,2,1)
plt.plot(s1,s11,c='r')
plt.plot(s1,s12,c='b')
plt.xlabel('工作经验(年)')
plt.ylabel('薪资(k/月)')
plt.legend(['最低薪资','最高薪资'])
plt.title('大数据职位工作经验对薪资影响折线图')
p2 = p.add_subplot(1, 2, 2)
plt.plot(s2, s21, c='r')
plt.plot(s2, s22, c='b')
plt.xlabel('学历')
plt.ylabel('薪资(k/月)')
plt.legend(['最低薪资', '最高薪资'])
plt.title('大数据职位学历对薪资影响折线图')
plt.savefig('大数据职位工作经验及学历对薪资影响折线图.png')
plt.show()
def job_city_geo(df):
job_place = df['job_place'].value_counts()[0:40]
job_indexs = [x for i, x in enumerate(job_place.index)]
counts = [job_place[i] for i in job_indexs]
# list_city = []
# for i, x in enumerate(ser.index):
# tuple_city = (x, ser[i])
# list_city.append(tuple_city)
# list_city = list_city[0:40]
geo = Geo("大数据职位在全国分布(前40城市排名)地理坐标系图", title_color="#fff",
title_pos="center", width=1200,
height=600, background_color='#404a59')
# attr, value = geo.cast(list_city)
geo.add("", attr=job_indexs, value=counts, visual_range=[0, 1400], visual_text_color="#fff",
symbol_size=12, is_visualmap=True, is_roam=False)
geo.render('大数据职位在全国分布(前40城市排名)地理坐标系图.html')
def job_name_wc(df):
job_name = df['job_name'].value_counts()
job_indexs = [x for i, x in enumerate(job_name.index)]
counts = [job_name[i] for i in job_indexs]
wordcloud = WordCloud('大数据工作名称词云图', width=1300, height=620)
wordcloud.add("", attr=job_indexs, value=counts, word_size_range=np.arange(100))
wordcloud.render('大数据工作名称词云图.html')
if __name__ == '__main__':
edu_com(data)
exp_edu_sal(data)
job_city_geo(data)
job_name_wc(data)