数据清洗
-
发现重复与冗余
设数据集如下:
{'state':['a','b','c','a','b','c'],'year':[2018,2016,2017,2018,2016,2017],'average':[87,85,88,87,85,88]}
1、统计所有数据全部重复的有多少?
2、显示重复的数据是哪几个?
# Task 3: delete the duplicated rows without deleting the original data.

# Input data.
data = pd.DataFrame({
    'state': ['a', 'b', 'c', 'a', 'b', 'c'],
    'year': [2018, 2016, 2017, 2018, 2016, 2017],
    'average': [87, 85, 88, 87, 85, 88],
})

# Columns that must all match for two rows to count as duplicates.
key_columns = ['state', 'year', 'average']

# b: the rows that repeat an earlier row. The previous
# `groupby(key_columns).count() > 1` grouped on every column, which leaves
# no value columns to count, so it always displayed an empty frame.
b = data[data.duplicated(subset=key_columns, keep='first')]

# a: de-duplicated result. drop_duplicates returns a new frame, so the
# original `data` is left untouched.
a = data.drop_duplicates(subset=key_columns, keep='first')

print("所有数据全部重复的有", len(data) - len(a), "个")
print("显示重复的数据", b)
print("删除重复后的数据:\n", a)
-
处理缺失值
设数据集如下: {'state':['a','b','c',np.nan,'b','c'],'year':[2018,2016,np.nan,2018,2016,2017],'average':[87,85,88,np.nan,85,88]}
1、统计行数及每列非空取值个数
2、统计每列缺失值的个数
3、把缺失值都填充为0
4、对于不同的列,动态指定不同列的填充值,第一列指定众数填充,第二列使用众数,第三列使用均值
# Task 5: fill missing values with a user-defined function; here the
# function simply returns the first mode of a column.

data = pd.DataFrame({
    'state': ['a', 'b', 'c', np.nan, 'b', 'c'],
    'year': [2018, 2016, np.nan, 2018, 2016, 2017],
    'average': [87, 85, 88, np.nan, 85, 88],
})

print("行数为", len(data))
print("每列非空取值个数:\n", data.notnull().sum(axis=0))
print("每列缺失值个数:\n", data.isnull().sum(axis=0))
print("把缺失值都填充为0后:\n", data.fillna(0))
print("****************")

# Task 4: a different fill value per column -- mode for 'state', mode for
# 'year', mean for 'average'. A dict passed to DataFrame.fillna does this in
# one call and returns a new frame. The previous per-column
# `data[col].fillna(..., inplace=True)` is chained in-place assignment, which
# pandas deprecates and which has no effect under copy-on-write; the 'state'
# fill also appeared to be commented out.
filled_by_column = data.fillna({
    'state': data['state'].mode()[0],
    'year': data['year'].mode()[0],
    'average': data['average'].mean(),
})
print("第一列指定众数填充,第二列使用众数,第三列使用均值填充空值后:\n", filled_by_column)


def FirstMode(data):
    """Return the first mode of a Series.

    ``Series.mode`` ignores NaN by default and returns the modes sorted, so
    with ties this is the smallest of the most frequent values.

    Raises:
        IndexError: if the Series has no non-missing values.
    """
    return data.mode().iloc[0]


# Task 5 starts from the frame that still contains the missing values;
# previously it ran on data that was already filled and so did nothing.
# 'average' keeps the mean fill, as in the original script.
data = data.fillna({
    'state': FirstMode(data['state']),
    'year': FirstMode(data['year']),
    'average': data['average'].mean(),
})
print("&&&&")
print("动态填充后:\n", data)
数据预处理
-
数据集成
设数据集如下:
dat1 = pd.DataFrame({'key':['a','b','c'],
'value1':[1,2,3],
'year':[1998,1999,2001]})
dat2 = pd.DataFrame({'address':['Hangzhou','Guangzhou','Nanjing'],
'type':['l1','l2','l3']})
dat3 = pd.DataFrame({'key':['a','b','c'],
'value1':[1,2,3],
'year':[1998,1999,2001]})
dat4 = pd.DataFrame({'key':['d','e'],
'value1':[43,32],
'year':[1989,1990]})
1、将dat1与dat2列方向合并(列数增加方向),并输出
# Task 2: concatenate dat3 and dat4 along the row direction (more rows)
# and print the result.

# Data integration. The literals follow the dataset given in the task
# statement (column 'value1', Hangzhou/Guangzhou/Nanjing, types l1..l3).
dat1 = pd.DataFrame({
    'key': ['a', 'b', 'c'],
    'value1': [1, 2, 3],
    'year': [1998, 1999, 2001],
})
dat2 = pd.DataFrame({
    'address': ['Hangzhou', 'Guangzhou', 'Nanjing'],
    'type': ['l1', 'l2', 'l3'],
})
dat3 = pd.DataFrame({
    'key': ['a', 'b', 'c'],
    'value1': [1, 2, 3],
    'year': [1998, 1999, 2001],
})
dat4 = pd.DataFrame({
    'key': ['d', 'e'],
    'value1': [43, 32],
    'year': [1989, 1990],
})

# Column-wise merge needs axis=1. pd.concat defaults to axis=0, so the
# previous call stacked dat2 underneath dat1 and padded with NaN.
col_merged = pd.concat([dat1, dat2], axis=1)
# Row-wise merge: axis=0 appends dat4's rows after dat3's.
row_merged = pd.concat([dat3, dat4], axis=0)

print("列合并:\n", col_merged)
print("行合并:\n", row_merged)
-
数据变换
设数据集如下:
{'student':['张山','李尔','王五','赵明','王迪','肖晓'],
'optional':[3,4,2,5,3,4],
'required':[90,83,67,87,81,91],
'ideology':['优','良','良','优','及格','优']}
1、将optional和required列进行归一化
2、将optional和required列进行标准化
# Task 3: compute every student's overall score.

# Data transformation -- input data, following the dataset given in the
# task statement (column 'optional', first student '张山').
data = pd.DataFrame({
    'student': ['张山', '李尔', '王五', '赵明', '王迪', '肖晓'],
    'optional': [3, 4, 2, 5, 3, 4],
    'required': [90, 83, 67, 87, 81, 91],
    'ideology': ['优', '良', '良', '优', '及格', '优'],
})

# Numeric value assigned to each ideology grade: excellent 95, good 80,
# pass 60.
GRADE_SCORES = {'优': 95, '良': 80, '及格': 60}


def _min_max(column):
    """Rescale a numeric Series linearly to the range [0, 1]."""
    lowest = column.min()
    return (column - lowest) / (column.max() - lowest)


def _z_score(column):
    """Centre a numeric Series on 0 and divide by its standard deviation.

    Uses ``Series.std``, i.e. the sample standard deviation (ddof=1).
    """
    return (column - column.mean()) / column.std()


# Task 1: normalisation. Work on a copy -- the previous `data2 = data` only
# created a second name for the same frame, so the raw data was overwritten.
data2 = data.copy()
data2['optional'] = _min_max(data2['optional'])
data2['required'] = _min_max(data2['required'])

# Task 2: standardisation, again on an independent copy of the raw data.
data3 = data.copy()
data3['optional'] = _z_score(data3['optional'])
data3['required'] = _z_score(data3['required'])

# Task 3, method 1: overall score from the normalised columns.
data4 = data2.copy()
# Series.map replaces each grade in one step; the previous
# `df['ideology'].loc[mask] = value` was chained assignment, which is
# unreliable and has no effect under copy-on-write.
data4['ideology'] = _min_max(data4['ideology'].map(GRADE_SCORES))
# Sum every column except the student name.
score_columns = [name for name in data4.columns if name != 'student']
data4['总评'] = data4[score_columns].sum(axis=1)
print("根据归一化求总评:\n", data4)

# Task 3, method 2: overall score from the standardised columns.
data5 = data3.copy()
data5['ideology'] = _z_score(data5['ideology'].map(GRADE_SCORES))
score_columns = [name for name in data5.columns if name != 'student']
data5['总评'] = data5[score_columns].sum(axis=1)
print("根据标准化求总评:\n", data5)
实验总结
学习了数据分析包pandas,利用pandas包中的数据结构DataFrame以及该结构中的方法对数据进行分析。