import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Load the raw job postings. The file is read as tab-separated text even
# though it carries a .csv extension.
data = pd.read_csv(r'D:\data\jobs_large.csv', sep='\t', encoding='utf8')

# Quick look at the raw frame. The previews are printed explicitly because a
# bare expression statement displays nothing when this file runs as a script.
print(data.head())
data.info()

# Columns kept for the analysis; every other column is discarded.
columns = [
    'positionId', 'positionName', 'companyId',
    'companyShortName', 'companySize', 'industryField', 'financeStage',
    'skillLables', 'positionLables', 'createTime', 'city', 'salary',
    'companyLabelList', 'workYear', 'jobNature', 'education',
]

# Copy the selection so the clean-up below operates on an independent frame
# instead of on a selection derived from the raw one.
data = data[columns].copy()

# Sorting keeps the original index labels, so the label-based drop below
# still addresses the same rows.
data = data.sort_values(by='positionId')
print(data.tail(15))

# Remove specific rows by index label.
# NOTE(review): these labels appear to be hand-picked for this particular
# export -- confirm they are still the right rows before reusing the script
# with a different file (a missing label raises KeyError).
data = data.drop(labels=[1421, 1420, 251, 101, 1508, 1507])

# Show postings whose positionId repeats, then keep the first row of each.
print(data[data.duplicated(subset='positionId')])
data = data.drop_duplicates(subset='positionId', keep='first')
def _salary_midpoint(salary_text):
    """Return the midpoint of a 'low-high' salary string as a float.

    The final character of each bound is dropped before the integer
    conversion (presumably a unit suffix such as 'k' -- confirm against the
    data). Only the first two '-'-separated pieces are used.
    """
    pieces = salary_text.split('-')
    lower = int(pieces[0][:-1])
    upper = int(pieces[1][:-1])
    return (lower + upper) / 2


# Replace the textual salary range with its numeric midpoint.
data['salary'] = data['salary'].apply(_salary_midpoint)
# Use the SimHei font so the Chinese city names on the axis can be rendered.
# With axes.unicode_minus disabled, Matplotlib draws minus signs as ASCII
# hyphens (commonly needed because CJK fonts may lack the Unicode minus glyph).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Salary distribution for the four selected cities, one box per city.
major_cities = ['北京', '上海', '广州', '深圳']
in_major_city = data['city'].isin(major_cities)
data.loc[in_major_city, ['city', 'salary']].boxplot(by='city')

# No plt.legend() here: the grouped box plot has no labelled artists, so the
# call only produced a warning and an empty legend.
plt.show()
# Mean salary per city, ordered from highest to lowest.
city_avg_salary = (
    data[['city', 'salary']]
    .groupby(by='city', as_index=False)
    .mean()
    .rename(columns={'salary': 'avg_salary'})
    .sort_values(by='avg_salary', ascending=False)
)

city_avg_salary.plot.bar(x='city', y='avg_salary')
# The earlier plt.show() ran before this figure existed, so the bar chart
# needs its own call to be displayed when the file runs as a script.
plt.show()