kaggle住房预测项目——第3部分(stacking)

import numpy as np
import pandas as pd
import random
import datetime
import copy
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

数据准备

读取特征工程后的数据

# Load the feature-engineered train / test sets produced in part 2.
train = pd.read_csv("train_after_fe.csv")
test = pd.read_csv("test_after_fe.csv")

# Separate the features from the SalePrice target.
X_train = train.drop(columns=['SalePrice'])
Y_train = train['SalePrice'].values
X_test = test.drop(columns=['SalePrice'])

# Sanity-check the shapes: expect (1460, 294), (1460,), (1459, 294).
for block in (X_train, Y_train, X_test):
    print(block.shape)
(1460, 294)
(1460,)
(1459, 294)

模型准备

adbt

# AdaBoost regressor — hyper-parameters tuned by the earlier grid search.
adbt_best_params = dict(
    learning_rate=0.1,    # shrinkage applied at each boosting round
    n_estimators=285,     # number of boosting rounds
    loss='exponential',   # loss used when re-weighting samples
    random_state=666,     # reproducibility
)
best_adbt = AdaBoostRegressor(**adbt_best_params)
# Best CV score: 0.19482008970155343

et

# Extra-trees regressor — hyper-parameters tuned by the earlier grid search.
et_best_params = dict(
    n_estimators=1164,
    max_depth=16,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='auto',  # NOTE(review): deprecated in sklearn >= 1.1
    criterion='mse',      # NOTE(review): renamed 'squared_error' in sklearn >= 1.0
    n_jobs=-1,            # use all CPU cores
    random_state=666,
)

best_et = ExtraTreesRegressor(**et_best_params)
# Best CV score: 0.14230205326532733

gdbt

# Gradient boosting regressor — hyper-parameters tuned by the earlier grid search.
gdbt_best_params = dict(
    n_estimators=374,
    learning_rate=0.1,
    max_depth=2,
    min_samples_leaf=1,
    min_samples_split=12,
    max_features=0.2,           # fraction of features considered per split
    subsample=0.9,              # <1.0 gives stochastic gradient boosting
    criterion='friedman_mse',
    random_state=666,
)
best_gdbt = GradientBoostingRegressor(**gdbt_best_params)
# Best CV score: 0.12760946445307936

rf

# Random forest regressor — hyper-parameters tuned by the earlier grid search.
rf_best_params = dict(
    n_estimators=234,
    max_depth=18,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features=0.3,     # fraction of features tried at each split
    criterion='mse',      # NOTE(review): renamed 'squared_error' in sklearn >= 1.0
    n_jobs=-1,            # use all CPU cores
    random_state=666,
)

best_rf = RandomForestRegressor(**rf_best_params)
# Best CV score: 0.14357134506907396

xgb

# XGBoost regressor — hyper-parameters tuned by the earlier grid search.
xgb_best_params = dict(
    learning_rate=0.1,
    n_estimators=452,
    max_depth=4,
    min_child_weight=4,
    subsample=0.9,
    colsample_bytree=0.6,  # feature subsampling per tree
    gamma=0,               # min loss reduction required to split
    reg_alpha=1,           # L1 regularisation
    reg_lambda=1,          # L2 regularisation
    n_jobs=-1,             # use all CPU cores
    random_state=666,
)
best_xgb = xgb.XGBRegressor(**xgb_best_params)
# Best CV score: 0.1220416129257937

Stacking

Helpers via Python Classes

在下面的代码部分中,我们基本上编写了一个类SklearnHelper,它允许扩展所有Sklearn回归器中通用的内建方法(如train、predict和fit)。因此,如果我们想调用五个不同的回归器,这样就不需要编写相同的方法五次,从而减少了冗余。

ntrain = train.shape[0]  # number of training samples (1460)
ntest = test.shape[0]    # number of test samples (1459)
SEED = 666               # for reproducibility
NFOLDS = 5               # folds for out-of-fold prediction
# shuffle=True is required for random_state to take effect; without it
# sklearn emits a FutureWarning (visible in the captured output above)
# and raises an error from 0.24 onwards.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

# Thin wrapper that gives every sklearn-style regressor a uniform
# train / predict / fit interface, so the stacking code below does not
# need to special-case individual estimators.
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        """Instantiate `clf` with `params`, forcing random_state=seed.

        A defensive copy of `params` is taken so the caller's dict is not
        mutated (the original wrote 'random_state' into it in place), and
        the `params=None` default no longer raises a TypeError.
        """
        params = {} if params is None else dict(params)
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator (name used by the stacking loop)."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Predict with the wrapped estimator."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Sklearn-compatible fit; returns the fitted inner estimator."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on (x, y) and print the estimator's feature importances."""
        print(self.clf.fit(x, y).feature_importances_)
d:\downloadinstall\python3.7\python3.7.5\lib\site-packages\sklearn\model_selection\_split.py:297: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
  FutureWarning

Out-of-Fold Predictions

def get_oof(clf, x_train, y_train, x_test):
    """Return out-of-fold train predictions and fold-averaged test predictions.

    Args:
        clf: a SklearnHelper-wrapped estimator exposing train()/predict().
        x_train: training feature array, shape (n_train, n_features).
        y_train: training target array, shape (n_train,).
        x_test: test feature array, shape (n_test, n_features).

    Returns:
        (oof_train, oof_test) as column vectors of shape (n_train, 1) and
        (n_test, 1), ready to be stacked as second-level features.

    Fixes vs. the original: sizes are derived from the arguments instead of
    the global ntrain/ntest (the old comments even cited Titanic's 891/418),
    and the folds are built from x_train rather than the global `train`
    DataFrame, so the function works on any data of matching length.
    Still relies on the module-level `kf` splitter.
    """
    n_tr = x_train.shape[0]
    n_te = x_test.shape[0]
    n_folds = kf.get_n_splits()

    oof_train = np.zeros((n_tr,))
    oof_test = np.zeros((n_te,))
    # One row of test predictions per fold; averaged at the end.
    oof_test_skf = np.empty((n_folds, n_te))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.train(x_train[train_index], y_train[train_index])
        # Predictions on the held-out fold become the OOF train features.
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

第一层

# First-layer base models, wrapped for a uniform train/predict interface.
# NOTE(review): SklearnHelper writes 'random_state' = seed into each params
# dict in place; harmless here only because every dict already holds 666.
rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_best_params)
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_best_params)
adbt= SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=adbt_best_params)
gdbt = SklearnHelper
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值