工资数据集处理


import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# 1--读取数据文件
file_obj=open('Baltimore_City_Employee_Salaries_FY2016.csv')
salary_df=pd.read_csv(file_obj)
file_obj.close()
salary_df.head()
NameJobTitleAgencyIDAgencyHireDateAnnualSalaryGrossPay
0Aaron,Patricia GFacilities/Office Services IIA03031OED-Employment Dev (031)10/24/1979 12:00:00 AM$56705.00$54135.44
1Aaron,Petra LASSISTANT STATE'S ATTORNEYA29045States Attorneys Office (045)09/25/2006 12:00:00 AM$75500.00$72445.87
2Abbey,EmmanuelCONTRACT SERV SPEC IIA40001M-R Info Technology (001)05/01/2013 12:00:00 AM$60060.00$59602.58
3Abbott-Cole,MichelleOperations Officer IIIA90005TRANS-Traffic (005)11/28/2014 12:00:00 AM$70000.00$59517.21
4Abdal-Rahim,Naim AEMT Firefighter SuppressionA64120Fire Department (120)03/30/2011 12:00:00 AM$64365.00$74770.82
# 2--查看数据情况
salary_df.shape  # (13818, 7)
salary_df.describe()
salary_df.info()  # 可以看到 Grosspay 的个数为 13546,不是13818,有缺失值
salary_df.isnull().sum() #  可以看到 Grosspay缺失值的个数为 272
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13818 entries, 0 to 13817
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          13818 non-null  object
 1   JobTitle      13818 non-null  object
 2   AgencyID      13818 non-null  object
 3   Agency        13818 non-null  object
 4   HireDate      13818 non-null  object
 5   AnnualSalary  13818 non-null  object
 6   GrossPay      13546 non-null  object
dtypes: object(7)
memory usage: 755.8+ KB


Name              0
JobTitle          0
AgencyID          0
Agency            0
HireDate          0
AnnualSalary      0
GrossPay        272
dtype: int64
# 3--数据清理
# 3-1)删除缺失值
salary_df=salary_df.dropna(how='any',axis=0)  # 删除存在缺失值的整行数据
salary_df.isnull().sum()
Name            0
JobTitle        0
AgencyID        0
Agency          0
HireDate        0
AnnualSalary    0
GrossPay        0
dtype: int64
# 3-2)去除 AnnualSalary,GrossPay 两列中的 $号,并将这两列数据类型转为 float
salary_df['AnnualSalary']=salary_df['AnnualSalary'].str.strip('$')
salary_df['GrossPay']=salary_df['GrossPay'].str.strip('$')
salary_df.head()
NameJobTitleAgencyIDAgencyHireDateAnnualSalaryGrossPay
0Aaron,Patricia GFacilities/Office Services IIA03031OED-Employment Dev (031)10/24/1979 12:00:00 AM56705.0054135.44
1Aaron,Petra LASSISTANT STATE'S ATTORNEYA29045States Attorneys Office (045)09/25/2006 12:00:00 AM75500.0072445.87
2Abbey,EmmanuelCONTRACT SERV SPEC IIA40001M-R Info Technology (001)05/01/2013 12:00:00 AM60060.0059602.58
3Abbott-Cole,MichelleOperations Officer IIIA90005TRANS-Traffic (005)11/28/2014 12:00:00 AM70000.0059517.21
4Abdal-Rahim,Naim AEMT Firefighter SuppressionA64120Fire Department (120)03/30/2011 12:00:00 AM64365.0074770.82
salary_df['GrossPay'].dtype  # dtype('O') 不是 dtype('float')
# salary_df['AnnualSalary']=salary_df['AnnualSalary'].astype(float)
# salary_df['GrossPay']=salary_df['GrossPay'].astype(float)  # 直接用下面这句:
salary_df[['AnnualSalary','GrossPay']]=salary_df[['AnnualSalary','GrossPay']].astype(float)
salary_df['GrossPay'].dtype   # dtype('float64')
dtype('O')
# 3-3) 新建一列,用于存放入职月份:
salary_df['month']=salary_df['HireDate'].str.split('/').str[0]  # 或者:
# salary_df['month']=salary_df['HireDate'].str.split('/').str.get(0)  
salary_df.head()
salary_df[['HireDate','month']].head()  # 只查看这两列
HireDatemonth
010/24/1979 12:00:00 AM10
109/25/2006 12:00:00 AM09
205/01/2013 12:00:00 AM05
311/28/2014 12:00:00 AM11
403/30/2011 12:00:00 AM03
# 4--数据探索
# 4-1)工资分布:
salary_df['AnnualSalary'].hist(bins=20)  # 基本呈正态分布,高薪的人员较少,30000美元左右的人最多
<matplotlib.axes._subplots.AxesSubplot at 0x21999d528c8>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-4Qb3Vh1N-1585835263436)(output_6_1.png)]

# 4-2)入职月份统计:
month=salary_df['month'].value_counts()
month.plot(kind='barh')  # 6月份入职人数最多,HR最忙
<matplotlib.axes._subplots.AxesSubplot at 0x219a114ef08>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bycizMRr-1585835263437)(output_7_1.png)]

# 5-聚合运算
# 5-1)计算年薪平均值和职位个数:
salary_df['AnnualSalary'].mean()  # 53507.98394359959  所有员工的平均年薪
# 5-2) 计算各职位的年薪平均值 看哪个职位的平均工资高,职位数,及该职位的最高工资
salary_df.groupby('JobTitle')['AnnualSalary'].mean()
salary_df.groupby('JobTitle')['AnnualSalary'].agg(['mean','count','max'])
meancountmax
JobTitle
911 LEAD OPERATOR49816.750000450162.0
911 OPERATOR44829.4615386550829.0
911 OPERATOR SUPERVISOR57203.500000457579.0
ACCOUNT EXECUTIVE57200.000000457200.0
ACCOUNTANT I49065.8666671557579.0
............
ZONING APPEALS ADVISOR BMZA53636.000000153636.0
ZONING APPEALS OFFICER67800.000000167800.0
ZONING ENFORCEMENT OFFICER65800.000000165800.0
ZONING EXAMINER I45628.500000248811.0
ZONING EXAMINER II56150.000000156150.0

1034 rows × 3 columns

# 5-3) 对上述各职位的年薪的平均值进行降序排列
jobtitle_salary_df=salary_df.groupby('JobTitle')['AnnualSalary'].agg(['mean','count','max'])
jobtitle_sort_salary_df=jobtitle_salary_df.sort_values(by='mean',ascending=False)[:30]  # 知识点:df.sort_values(by=列名,ascending=False)
jobtitle_sort_salary_df['mean'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x219a52aa788>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tlawTPZu-1585835263439)(output_9_1.png)]

# 5-4) 对上述各职位的人数进行降序排列
jobtitle_salary_df=salary_df.groupby('JobTitle')['AnnualSalary'].agg(['mean','count','max'])
jobtitle_sort_salary_df=jobtitle_salary_df.sort_values(by='count',ascending=False)[:30]  # 知识点:df.sort_values(by=列名,ascending=False)
jobtitle_sort_salary_df['count'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x219a6327e48>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u6352dec-1585835263439)(output_10_1.png)]

# 我们再看看其他的需求:
# 从头开始:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# 1--读取数据文件
file_obj=open('Baltimore_City_Employee_Salaries_FY2016.csv')
salary_df=pd.read_csv(file_obj)
file_obj.close()

# 2--删除含缺失值的行,
salary_df=salary_df.dropna(how='any',axis=0)  # 删除存在缺失值的整行数据
salary_df.isnull().sum()

# 3--去除 AnnualSalary,GrossPay 两列中的 $号,并将这两列数据类型转为 float
salary_df['AnnualSalary']=salary_df['AnnualSalary'].str.strip('$')
salary_df['GrossPay']=salary_df['GrossPay'].str.strip('$')
salary_df[['AnnualSalary','GrossPay']]=salary_df[['AnnualSalary','GrossPay']].astype(float)
salary_df.head()

# 4--查看 AnnualSalary的最大值及所在的行号:
salary_df['AnnualSalary'].max()  # 238772.0
salary_df['AnnualSalary'].argmax()  # 8701  # se.argmax()之前没有学到,这里补充一下
8701
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值