kaggle房价预测 第二次练习总结(处理缺失值)

kaggle房价预测参考danB

链接:https://www.kaggle.com/learn/machine-learning

接下来开始处理缺失值(NA),这里只选用数值型特征作为预测变量

用了三个方式处理缺失值:

1.直接把在train或test中含有NA的列全部删除

2.不删除列,直接用Imputer()对train和test的缺失值做均值填补

3.在Imputer()填补之前,先在X中增加bool型的列,用来标记原来有缺失值的位置

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
# Load the Kaggle training and test sets from local CSV files.
original_data = pd.read_csv('D:/NOTEBOOK/train.csv')
test_data = pd.read_csv('D:/NOTEBOOK/test.csv')

# Split off the target column; what remains are the candidate features.
original_data_y = original_data['SalePrice']
original_data = original_data.drop('SalePrice', axis=1)

# Keep only the non-object columns, i.e. use numeric predictors only.
excluded_dtypes = ['object']
X_train = original_data.select_dtypes(exclude=excluded_dtypes)
X_test = test_data.select_dtypes(exclude=excluded_dtypes)
# ---------------------------------------------------------------------------
# Approach 1: drop every column that has a missing value in train or in test.

# Step 1: columns with NaN in the training features are removed from both sets.
train_null_flags = X_train.isnull().any()
train_na_cols = [name for name in X_train.columns if train_null_flags[name]]
reduced_X_train = X_train.drop(train_na_cols, axis=1)
reduced_X_test = X_test.drop(train_na_cols, axis=1)

# Step 2: the test set can still contain NaN in other columns, so those
# columns are collected from the reduced test frame and removed from both sets.
test_null_flags = reduced_X_test.isnull().any()
test_na_cols = [name for name in reduced_X_test.columns if test_null_flags[name]]
reduced_X_train1 = reduced_X_train.drop(test_na_cols, axis=1)
reduced_X_test1 = reduced_X_test.drop(test_na_cols, axis=1)

# Fit a random forest with default settings on the fully reduced features.
my_model = RandomForestRegressor()
my_model.fit(reduced_X_train1, original_data_y)

pre_test_y = my_model.predict(reduced_X_test1)
submission_columns = {'Id': X_test.Id, 'SalePrice': pre_test_y}
my_submission = pd.DataFrame(submission_columns)
# Score recorded by the author for this approach: 0.16393
my_submission.to_csv('submission4.csv', index=False)
# Approach 2: keep every numeric column and fill the gaps with an Imputer built
# with its default settings (the author describes this as mean imputation).
my_imputer = Imputer()
my_imputer.fit(X_train)
# The statistics learned from train are applied to both train and test.
imputed_X_train = my_imputer.transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

my_model = RandomForestRegressor().fit(imputed_X_train, original_data_y)
pre_test_y = my_model.predict(imputed_X_test)

submission_columns = {'Id': X_test.Id, 'SalePrice': pre_test_y}
my_submission = pd.DataFrame(submission_columns)
# Score recorded by the author for this approach: 0.16347
my_submission.to_csv('submission5.csv', index=False)
# Approach 3: imputation extended with "was missing" indicator columns.
# Work on copies so the original feature frames stay untouched.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# Built as a list rather than a one-shot generator, so the names can be
# printed or inspected without exhausting them before the loop below.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
# print(cols_with_missing)  # the author observed three such columns
for col in cols_with_missing:
    # Boolean flag recording which rows had no value for this feature.
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
# print(imputed_X_test_plus)  # the trailing columns are the '_was_missing' flags

# Impute after the flags are added, so the flags still reflect the raw data.
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

# Fit and predict on the matrices that contain the indicator columns.
# Using the plain imputed matrices here would silently ignore the flags.
my_model = RandomForestRegressor()
my_model.fit(imputed_X_train_plus, original_data_y)
pre_test_y = my_model.predict(imputed_X_test_plus)
my_submission = pd.DataFrame({'Id': X_test.Id, 'SalePrice': pre_test_y})
# The author's recorded result "0.15685 2640" came from a run that did not use
# the indicator columns; re-score after this change. (2640 is presumably the
# leaderboard rank -- TODO confirm.)
my_submission.to_csv('submission6.csv', index=False)

又尝试了一下,发现先删除train中有缺失值的列、再只填补test的缺失值,效果会更好(得分更低,为0.15503)

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
# Read the training and test sets.
original_data = pd.read_csv('D:/NOTEBOOK/train.csv')
test_data = pd.read_csv('D:/NOTEBOOK/test.csv')

# Exploration snippets, left disabled:
# print(original_data.isnull().sum())   # number of NaN values per column
# print(original_data.describe())       # summary statistics
# print(original_data.columns)          # column names
# print(original_data.isnull())         # boolean mask of NaN cells
# data_without_missing_values = original_data.dropna(axis=1)  # drop columns with NaN

# Take the target first, then remove it from the feature frame.
original_data_y = original_data.SalePrice
original_data = original_data.drop(labels=['SalePrice'], axis=1)

# Numeric predictors only: exclude object-typed columns from both frames.
X_train, X_test = (
    frame.select_dtypes(exclude=['object'])
    for frame in (original_data, test_data)
)

# Columns with at least one NaN in the training features are removed from
# both the training and the test frame.
train_null_flags = X_train.isnull().any()
cols_with_missing = train_null_flags[train_null_flags].index.tolist()
reduced_X_train, reduced_X_test = (
    frame.drop(cols_with_missing, axis=1) for frame in (X_train, X_test)
)

# The reduced training frame has no NaN left, so the imputer only changes the
# values that are still missing in the reduced test frame.
my_imputer = Imputer().fit(reduced_X_train)
imputed_X_train = my_imputer.transform(reduced_X_train)
imputed_X_test = my_imputer.transform(reduced_X_test)

my_model = RandomForestRegressor()
my_model.fit(imputed_X_train, original_data_y)
pre_test_y = my_model.predict(imputed_X_test)

submission_columns = {'Id': X_test.Id, 'SalePrice': pre_test_y}
my_submission = pd.DataFrame(submission_columns)
# Author's note: train is not imputed; only the test values remaining after
# the train-NaN columns are dropped get filled. Recorded result: 0.15503 2614
my_submission.to_csv('submission8.csv', index=False)

接下来用独热编码,one-hot

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
# Read the training and test sets.
train_csv, test_csv = 'D:/NOTEBOOK/train.csv', 'D:/NOTEBOOK/test.csv'
original_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

# Exploration snippets, left disabled:
# print(original_data.isnull().sum())   # number of NaN values per column
# print(original_data.describe())       # summary statistics
# print(original_data.columns)          # column names
# print(original_data.isnull())         # boolean mask of NaN cells
# data_without_missing_values = original_data.dropna(axis=1)  # drop columns with NaN

# Separate the target from the features.
original_data_y = original_data['SalePrice']
original_data = original_data.drop(['SalePrice'], axis=1)

# ---------------------------------------------------------------------------
# One-hot encoding: numeric features plus low-cardinality categorical features.
X_train = original_data
X_test = test_data

# Keep int64/float64 columns and object columns with fewer than 10 distinct
# values. The choice is made on the training frame and applied to both frames.
choose_column = [
    col for col in X_train.columns
    if (X_train[col].nunique() < 10 and X_train[col].dtype == "object")
    or X_train[col].dtype in ['int64', 'float64']
]
X_train = X_train[choose_column]
X_test = X_test[choose_column]

# One-hot encode the remaining object columns of each frame independently.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Give test exactly train's columns. A dummy column that exists only in train
# (a category never seen in test) is created in test by the alignment; fill it
# with 0 ("category absent") so it does not become NaN and then get replaced
# by the training mean in the imputation step below.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Fill the remaining missing values using statistics learned from train.
my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

my_model = RandomForestRegressor()
my_model.fit(imputed_X_train, original_data_y)

pre_test_y = my_model.predict(imputed_X_test)
my_submission = pd.DataFrame({'Id': X_test.Id, 'SalePrice': pre_test_y})
# Score recorded by the author before the fill_value change: 0.15822
my_submission.to_csv('submission10.csv', index=False)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值