import pandas as pd
import numpy as np
# Load the raw job-posting data; the CSV is encoded as gb2312.
df = pd.read_csv(r"C:\Users\Administrator\Desktop\python 0712\DataAnalyst.csv", encoding='gb2312')
df.head()
# Number of distinct position ids (compare with len(df) to see how many rows repeat).
len(df.positionId.unique())
# Keep the first row for every positionId. The explicit copy makes
# df_duplicates an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_duplicates = df.drop_duplicates(subset='positionId', keep='first').copy()
df_duplicates.head()
def cut_word(word, method):
    """Extract one bound of a salary string such as ``"7k-9k"``.

    Parameters
    ----------
    word : str
        Salary text. Either a range written ``"<low>k-<high>k"`` or a single
        figure followed by a ``k`` unit (upper or lower case) and optional
        trailing text.
    method : str
        ``'bottom'`` selects the lower bound; any other value selects the
        upper bound.

    Returns
    -------
    str
        The text of the requested bound with the ``k`` unit and anything
        after it removed. It is still a string; callers convert it to a
        number. For a single figure both bounds are the same.
    """

    def strip_unit(part):
        # Keep only what precedes the first 'k'/'K'. When there is no unit
        # at all, keep the whole part instead of slicing with find()'s -1,
        # which would silently drop the last character.
        unit_at = part.upper().find('K')
        digits = part if unit_at == -1 else part[:unit_at]
        return digits.strip()

    # partition() splits on the first '-' without relying on fixed offsets,
    # so surrounding whitespace or a missing unit no longer corrupts a bound.
    low_part, dash, high_part = word.partition('-')
    bottomsalary = strip_unit(low_part)
    topsalary = strip_unit(high_part) if dash else bottomsalary

    if method == 'bottom':
        return bottomsalary
    return topsalary
# Apply cut_word to every value of the salary column; the extra keyword
# argument is forwarded to cut_word and selects which bound is extracted.
df_duplicates['topsalary'] = df_duplicates.salary.apply(cut_word, method='top')
df_duplicates['bottomsalary'] = df_duplicates.salary.apply(cut_word, method='bottom')
# cut_word returns strings; convert both bounds to integers before doing arithmetic.
df_duplicates['topsalary'] = df_duplicates['topsalary'].astype('int')
df_duplicates['bottomsalary'] = df_duplicates['bottomsalary'].astype('int')
# Average salary is the midpoint of the two bounds.
df_duplicates['avgsalary'] = (df_duplicates.topsalary + df_duplicates.bottomsalary) / 2
df_duplicates.avgsalary
# Keep only the columns used in the analysis below.
df_clean = df_duplicates[['city', 'companyShortName', 'companySize', 'education',
                          'positionName', 'positionLables', 'workYear', 'avgsalary']]
df_clean.head()
# Number of rows per city. The original line referenced value_counts without
# calling it, so nothing was computed.
df_clean.city.value_counts()
# Summary statistics for the numeric columns.
df_clean.describe()
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
# Histogram of the average salary; bins is the number of bars.
df_clean.avgsalary.hist(bins=15)
from matplotlib.font_manager import FontProperties
# One box plot of average salary per grouping column: by city, by education
# level, and by years of work experience. figsize sets the figure size.
for group_column in ('city', 'education', 'workYear'):
    ax = df_clean.boxplot(column='avgsalary', by=group_column, figsize=(9, 7))
    # Switch every x-axis tick label to the SimHei font so the Chinese
    # category names are drawn with a font that contains those glyphs.
    for label in ax.get_xticklabels():
        label.set_fontproperties('SimHei')
# Mean of the numeric columns for every (city, education) pair.
# numeric_only=True keeps the text columns out of the aggregation, which
# recent pandas versions refuse to average.
df_clean.groupby(['city', 'education']).mean(numeric_only=True)
# Same table with the education level pivoted from the row index into columns.
df_clean.groupby(['city', 'education']).mean(numeric_only=True).unstack()
# Posting count and mean salary per company, largest posters first.
df_clean.groupby('companyShortName').avgsalary.agg(['count', 'mean']).sort_values(by='count', ascending=False)
# Bar chart of the mean salary per city. The original line read
# `df_clean.groupby=('city').mean()`, which calls .mean() on a str and fails.
ax = df_clean.groupby('city').mean(numeric_only=True).plot.bar()
# Use SimHei for the x-axis tick labels so the Chinese city names render.
for label in ax.get_xticklabels():
    label.set_fontproperties('SimHei')
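# pandas数据分析0723 (pandas data analysis, 0723)
# 最新推荐文章于 2021-08-21 10:00:41 发布 (source page notice: latest recommended article published 2021-08-21 10:00:41)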