预测房价
1)导入数据,并划分训练集与验证集
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the Melbourne housing snapshot.
data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')

# The sale price is the prediction target.
y = data['Price']

# Every remaining column is a candidate predictor; keep only the columns
# whose dtype is not `object`.
melb_predictors = data.drop(columns=['Price'])
X = melb_predictors.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
2)定义函数 score_dataset:用训练集拟合随机森林回归模型,返回其在验证集上的平均绝对误差(MAE),用来比较两种缺失值处理方式对模型精度的影响。
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=10, random_state=0):
    """Return the validation mean absolute error of a random forest.

    A ``RandomForestRegressor`` is fitted on the training split and scored
    on the validation split, so that differently pre-processed versions of
    the same data can be compared by a single number (lower is better).

    Args:
        X_train: Training features.
        X_valid: Validation features, with the same columns as ``X_train``.
        y_train: Training targets.
        y_valid: Validation targets.
        n_estimators: Number of trees in the forest. Defaults to 10.
        random_state: Seed passed to the forest so repeated calls on the
            same data give the same score. Defaults to 0.

    Returns:
        The mean absolute error between ``y_valid`` and the model's
        predictions for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)
3)用第一种方式处理缺失值:删除含有缺失值的列
# Approach 1: drop every column that has at least one missing value.

# Flag, per training column, whether it contains any missing value, then
# collect the names of the flagged columns.
missing_flags = X_train.isnull().any()
cols_with_missing = [col for col in X_train.columns if missing_flags[col]]

# Remove those columns from both splits; the list comes from the training
# data only, so both splits end up with the same remaining columns.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
from sklearn.impute import SimpleImputer
# Approach 2: keep every column and fill in the missing values instead.
# SimpleImputer() is used with its default settings (scikit-learn documents
# the default strategy as the per-column mean).
my_imputer = SimpleImputer()

# The imputer is fitted on the training split only and then re-used on the
# validation split, so no validation statistics leak into the fill values.
# The imputer returns a bare array, which loses both the column names and
# the row index. Both are restored in the constructor so the imputed frames
# stay aligned with y_train / y_valid and the original X_* frames.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)  # imputed training set
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)  # imputed validation set

# Report the random forest's mean absolute error on the imputed data.
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))