3种缺失数据的处理方法
(1)删除缺失值
代码:
import numpy as np
import pandas as pd

# Build a small score table in which some subject scores are missing (NaN).
data = pd.DataFrame(
    {
        "name": ["Jane", "Lily", "Mary", "Alice", "Bob"],
        "English": [92, 78, np.nan, 65, 90],
        "Math": [69, 87, 91, np.nan, 90],
        "Chinese": [np.nan, 78, 96, np.nan, 75],
    }
)
print(data)

# 1. Drop missing values: dropna() returns a new frame without any row that
# contains at least one NaN; `data` itself is left unchanged.
data2 = data.dropna()
print(data2)
(2)均值替换缺失值(利用sklearn)
代码:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Build a small score table in which some subject scores are missing (NaN).
data = pd.DataFrame(
    {
        "name": ["Jane", "Lily", "Mary", "Alice", "Bob"],
        "English": [92, 78, np.nan, 65, 90],
        "Math": [69, 87, 91, np.nan, 90],
        "Chinese": [np.nan, 78, 96, np.nan, 75],
    }
)
print(data)

# 2. Replace missing values with the mean (using scikit-learn).
# Move the non-numeric "name" column into the index so that only the numeric
# score columns are handed to the imputer.
data = data.set_index("name")

# SimpleImputer works column by column: each NaN is replaced with the mean of
# the non-missing values in the same column (not the mean of its row).
nan_model = SimpleImputer(missing_values=np.nan, strategy="mean")

# fit_transform learns the column means and returns the filled values as a
# NumPy array (row and column labels are not carried over).
nan_result = nan_model.fit_transform(data)
print(nan_result)
(3)利用pandas替换缺失值
代码:
# Replace missing values using pandas only.
import numpy as np
import pandas as pd

# Build a small score table in which some subject scores are missing (NaN).
data = pd.DataFrame(
    {
        "name": ["Jane", "Lily", "Mary", "Alice", "Bob"],
        "English": [92, 78, np.nan, 65, 90],
        "Math": [69, 87, 91, np.nan, 90],
        "Chinese": [np.nan, 78, 96, np.nan, 75],
    }
)
print(data)

# Option 1: replace every missing value with the constant 0.
zero_filled = data.fillna(0)
print(zero_filled)

# Option 2: backward fill. Each missing value takes the next valid value
# further down the same column. This suits the data here because the first
# row has a missing value, so there is nothing above it to copy from.
# A NaN with no valid value below it would remain NaN.
data2 = data.bfill()
print(data2)