# Data Cleaning
import pandas as pd
import numpy as np
import re
from pyecharts.charts import Bar,Map,WordCloud,HeatMap
from pyecharts import options as opts
import matplotlib.pyplot as plt
import nltk
# Download the required NLTK resource ('punkt'); this runs every time the script is executed.
nltk.download('punkt')
import jieba
from collections import Counter
from matplotlib import rcParams
# Matplotlib font defaults: serif family resolved to SimSun, STIX glyphs for
# math text, and a 13 pt base font size.
config = {
    "font.family": 'serif',
    "font.size": 13,
    "mathtext.fontset": 'stix',
    "font.serif": ['SimSun'],
}
for rc_key, rc_value in config.items():
    rcParams[rc_key] = rc_value

# SimHei is registered as the sans-serif font so Chinese text can be rendered,
# and the Unicode minus sign is disabled so negative tick labels use the
# ASCII hyphen instead.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the raw data set.
data = pd.read_csv('liepin.csv')
data.head()
# De-duplicate: report the row count before and after dropping duplicate rows.
print(len(data))
data1 = data.drop_duplicates()
print(len(data1))
# Continue from the de-duplicated rows. The de-duplicated frame used to be
# discarded, so duplicates stayed in `data`. An explicit copy is taken so
# that `data1` itself is left unchanged by the in-place edits below.
data = data1.copy()
# Drop every row that contains a missing value.
data.dropna(inplace=True)
# Count the remaining missing values per column.
data.isnull().sum()
# Keep only the part of the work location before the first '-'.
# regex=True is required: from pandas 2.0 the default is a literal match,
# which would leave the '-...' suffix in place.
data['工作地区'] = data['工作地区'].str.replace(r'-.*', '', regex=True).str.strip()
# Clean the '工资' (salary) column: replace each salary string with the
# midpoint of its range.
def get_middle_salary(salary_str) -> int:
    """Convert a salary string into an estimated monthly salary.

    Recognised forms (matched at the start of the string):

    * ``"8-15k"``      -> midpoint of the range, times 1000.
    * ``"8-15k·13薪"`` -> midpoint scaled by the number of paid months per
      year (13 months spread over 12 gives midpoint * 13 / 12).
    * ``"15k"``        -> a single value is treated as a range whose two
      ends are equal.

    Args:
        salary_str: Raw cell value from the salary column.

    Returns:
        The estimated monthly salary truncated to an int, or 4000 when the
        value is not a string or matches none of the forms above.
    """
    fallback = 4000
    # Non-string cells (e.g. NaN) cannot be parsed; re.match would raise.
    if not isinstance(salary_str, str):
        return fallback

    # The "-upper" part is optional as a whole. Making only the dash optional
    # would split a single value such as "15k" into the bogus range 1-5.
    match = re.match(r'(\d{1,3})(?:-(\d{1,3}))?k(?:·(\d+)薪)?', salary_str)
    if match is None:
        return fallback

    lower_text, upper_text, months_text = match.groups()
    lower = float(lower_text)
    upper = lower if upper_text is None else float(upper_text)
    monthly_mid = (lower + upper) / 2 * 1000

    if months_text is None:
        return int(monthly_mid)

    # Spread the months beyond 12 evenly over the year.
    extra_months = float(months_text) - 12
    return int((extra_months * monthly_mid) / 12 + monthly_mid)
# Replace every raw salary string in the '工资' column with the value
# returned by get_middle_salary.
salary_column = data['工资']
data['工资'] = salary_column.apply(get_middle_salary)
# Data Visualization
# Bar chart of how many rows fall under each value of the '学历' column.
xueli_counts = data['学历'].value_counts()
xueli = xueli_counts.index.to_list()
xueli_num = xueli_counts.values.tolist()

fig, ax = plt.subplots(figsize=(9, 6))
plt.bar(xueli, xueli_num, label='academic qualifications')
# Write each bar's count just above the bar, centred horizontally.
for level, count in zip(xueli, xueli_num):
    plt.text(level, count + 0.01, "%.f" % count, ha='center', fontsize=13)
plt.title('学历占比情况', size=18)
plt.legend(loc=1)
plt.xlabel('学历', size