# 一:按缺失百分比去除缺失值过多的特征 (1. Drop features whose percentage of missing values is too high)
# Features with more than 77% of their values missing are removed.
# NOTE(review): the test-side list is derived from `test`, while the frame
# that is actually trimmed is `test_x`, and `cols_to_drop_test` is not used
# in the statements below -- only the train-derived columns are dropped from
# both frames. Confirm that this is the intended behaviour.
many_null_cols = [
    name for name in train_x.columns
    if train_x[name].isna().sum() / len(train_x) > 0.77
]
many_null_cols_test = [
    name for name in test.columns
    if test[name].isna().sum() / len(test) > 0.77
]
# De-duplicate the column names (a set round-trip, so order is arbitrary).
cols_to_drop = list(set(many_null_cols))
cols_to_drop_test = list(set(many_null_cols_test))
# The same train-derived column list is removed from both frames.
train_x = train_x.drop(columns=cols_to_drop)
test_x = test_x.drop(columns=cols_to_drop)
# 二:绘制数据集缺失百分比图 (2. Plot the percentage of missing values in the dataset)
def describe_missing_values(df):
    """Show a bar chart of the percentage of missing values per column.

    Columns that contain no missing values are left out of the chart. The
    chart is drawn through the module-level ``plt`` and displayed with
    ``plt.show()``; the function returns nothing.

    Args:
        df: Table-like object (presumably a pandas DataFrame) that exposes
            ``shape``, iterates over its column labels and supports
            ``df[column].isnull()``.
    """
    n_rows = df.shape[0]

    # Percentage of null entries per column; columns without any missing
    # value are skipped so that only affected columns appear on the x axis.
    na_percent = {}
    for column in df:
        percent = df[column].isnull().sum() * 100 / n_rows
        if percent != 0:
            na_percent[column] = percent

    positions = range(len(na_percent))
    # Dict views are not sequences under Python 3, so hand matplotlib plain
    # lists for the bar heights and the tick labels.
    plt.bar(positions, list(na_percent.values()))
    plt.ylabel('Percent')
    plt.xticks(positions, list(na_percent.keys()), rotation='vertical')
    plt.show()
print