kaggle住房预测项目——第3部分(stacking)
import numpy as np
import pandas as pd
import random
import datetime
import copy
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
数据准备
读取特征工程后的数据
# Load the feature-engineered train/test sets produced in the previous part.
train = pd.read_csv("train_after_fe.csv")
test = pd.read_csv("test_after_fe.csv")

# Separate the features from the SalePrice target.
X_train = train.drop(columns=['SalePrice'])
Y_train = train['SalePrice'].values
X_test = test.drop(columns=['SalePrice'])

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
(1460, 294)
(1460,)
(1459, 294)
模型准备
adbt
# Tuned AdaBoost hyper-parameters (best CV score: 0.19482008970155343).
adbt_best_params = dict(
    learning_rate=0.1,
    n_estimators=285,
    loss='exponential',
    random_state=666,
)
best_adbt = AdaBoostRegressor(**adbt_best_params)
et
# Tuned ExtraTrees hyper-parameters (best CV score: 0.14230205326532733).
# NOTE(review): 'auto' and 'mse' are removed in sklearn >= 1.2 — confirm the
# target environment before upgrading ('sqrt'/'squared_error' equivalents).
et_best_params = dict(
    n_estimators=1164,
    max_depth=16,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='auto',
    criterion='mse',
    n_jobs=-1,
    random_state=666,
)
best_et = ExtraTreesRegressor(**et_best_params)
gdbt
# Tuned GradientBoosting hyper-parameters (best CV score: 0.12760946445307936).
gdbt_best_params = dict(
    n_estimators=374,
    learning_rate=0.1,
    max_depth=2,
    min_samples_leaf=1,
    min_samples_split=12,
    max_features=0.2,
    subsample=0.9,
    criterion='friedman_mse',
    random_state=666,
)
best_gdbt = GradientBoostingRegressor(**gdbt_best_params)
rf
# Tuned RandomForest hyper-parameters (best CV score: 0.14357134506907396).
# NOTE(review): 'mse' is removed in sklearn >= 1.2 — confirm the target
# environment before upgrading ('squared_error' is the equivalent).
rf_best_params = dict(
    n_estimators=234,
    max_depth=18,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features=0.3,
    criterion='mse',
    n_jobs=-1,
    random_state=666,
)
best_rf = RandomForestRegressor(**rf_best_params)
xgb
# Tuned XGBoost hyper-parameters (best CV score: 0.1220416129257937).
xgb_best_params = dict(
    learning_rate=0.1,
    n_estimators=452,
    max_depth=4,
    min_child_weight=4,
    subsample=0.9,
    colsample_bytree=0.6,
    gamma=0,
    reg_alpha=1,
    reg_lambda=1,
    n_jobs=-1,
    random_state=666,
)
best_xgb = xgb.XGBRegressor(**xgb_best_params)
Stacking
Helpers via Python Classes
在下面的代码部分中,我们基本上编写了一个类SklearnHelper,它允许扩展所有Sklearn回归器中通用的内建方法(如train、predict和fit)。因此,如果我们想调用五个不同的回归器,这样就不需要编写相同的方法五次,从而减少了冗余。
ntrain = train.shape[0]  # number of training samples
ntest = test.shape[0]    # number of test samples
SEED = 666               # for reproducibility of the base learners
NFOLDS = 5               # folds for out-of-fold prediction

# Do NOT pass random_state here: with shuffle=False it has no effect and
# raises an error in sklearn >= 0.24 (see the FutureWarning this cell used
# to emit). An unshuffled KFold is already deterministic, so the fold
# assignment is identical with the argument removed.
kf = KFold(n_splits=NFOLDS)
# Class to extend the Sklearn classifier
class SklearnHelper(object):
def __init__(self, clf, seed=0, params=None):
params['random_state'] = seed
self.clf = clf(**params)
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
def fit(self,x,y):
return self.clf.fit(x,y)
def feature_importances(self,x,y):
print(self.clf.fit(x,y).feature_importances_)
d:\downloadinstall\python3.7\python3.7.5\lib\site-packages\sklearn\model_selection\_split.py:297: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
Out-of-Fold Predictions
def get_oof(clf, x_train, y_train, x_test):
    """Generate out-of-fold (OOF) predictions for one base learner.

    Each training sample is predicted by a model that never saw it: the
    estimator is refit on NFOLDS-1 folds and predicts the held-out fold.
    The test-set prediction is the mean over the NFOLDS fold models.

    Relies on the module-level `kf`, `ntrain`, `ntest`, `NFOLDS`.
    Assumes x_train/x_test support positional indexing like numpy arrays
    (i.e. .values was taken, not raw DataFrames) — confirm at the call site.

    Returns (oof_train, oof_test) reshaped to column vectors for the
    second-level learner.
    """
    oof_train = np.zeros((ntrain,))           # (ntrain,) one OOF prediction per train sample
    oof_test = np.zeros((ntest,))             # (ntest,) fold-averaged test predictions
    oof_test_skf = np.empty((NFOLDS, ntest))  # (NFOLDS, ntest) per-fold test predictions

    # Split on the x_train argument (the original split the global `train`
    # DataFrame; the indices are identical since both have the same number
    # of rows, but this removes the hidden global dependency).
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
第一层
# First-level base learners, each wrapped to expose the uniform
# train/predict interface that get_oof expects.
rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_best_params)
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_best_params)
adbt= SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=adbt_best_params)
gdbt = SklearnHelper