导入模块
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
读取数据
# Toy product table used to demonstrate imputation; np.nan marks a missing entry.
data = {
    # Categorical columns.
    "size": ["XL", "L", "M", np.nan, "M", "M"],
    "color": ["red", "green", "blue", "green", "red", "green"],
    "gender": ["female", "male", np.nan, "female", "female", "male"],
    # Numeric columns.
    "price": [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    "weight": [500, 450, 300, np.nan, 410, np.nan],
    # Purchase flag for each row.
    "bought": ["yes", "no", "yes", "no", "yes", "no"],
}
构建 pandas.DataFrame,并打印出来
# Build the frame column-wise from the dict (one column per key), then show it.
df = pd.DataFrame.from_dict(data)
df
查看每一列空值的个数
# Number of missing entries in each column.
df.isna().sum()
输出 df 的行数
# Number of rows in the frame.
df.shape[0]
每一列空值的比例
# Fraction of missing entries per column: missing count divided by row count.
df.isna().sum() / df.shape[0]
使用均值填充缺失值
创建一个填充策略对象,传入两个参数:第一个参数指定哪些值被视为缺失值(np.nan),第二个参数指定填充策略(此处使用均值填充)
# Fill the missing weights with the mean of the observed weights.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
imputer.fit(df[["weight"]])
df[["weight"]] = imputer.transform(df[["weight"]])
df
使用常量填充缺失值
# Fill the missing prices with the fixed placeholder value 99.0.
imputer = SimpleImputer(
    strategy="constant",
    fill_value=99.0,
    missing_values=np.nan,
)
imputer.fit(df[["price"]])
df[["price"]] = imputer.transform(df[["price"]])
df
使用出现最频繁的值(众数)填充缺失值(一般用于非数值列)
# Fill the missing sizes with the value that occurs most often in the column.
imputer = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
imputer.fit(df[["size"]])
df[["size"]] = imputer.transform(df[["size"]])
df