kaggle住房预测项目——第3部分(stacking)
import numpy as np
import pandas as pd
import random
import datetime
import copy
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
数据准备
读取特征工程后的数据
# Load the feature-engineered train/test sets produced in the previous part.
train = pd.read_csv("train_after_fe.csv")
test = pd.read_csv("test_after_fe.csv")

# Separate the features from the SalePrice target.
X_train = train.drop(columns=['SalePrice'])
Y_train = train['SalePrice'].values
X_test = test.drop(columns=['SalePrice'])

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
(1460, 294)
(1460,)
(1459, 294)
模型准备
adbt
# Tuned AdaBoost hyper-parameters (best CV score: 0.19482008970155343).
adbt_best_params = dict(
    learning_rate=0.1,
    n_estimators=285,
    loss='exponential',
    random_state=666,
)
best_adbt = AdaBoostRegressor(**adbt_best_params)
et
# Tuned ExtraTrees hyper-parameters (best CV score: 0.14230205326532733).
# NOTE(review): 'auto' and 'mse' are removed in sklearn >= 1.2 — confirm the
# target environment before upgrading ('sqrt'/'squared_error' equivalents).
et_best_params = dict(
    n_estimators=1164,
    max_depth=16,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features='auto',
    criterion='mse',
    n_jobs=-1,
    random_state=666,
)
best_et = ExtraTreesRegressor(**et_best_params)
gdbt
# Tuned GradientBoosting hyper-parameters (best CV score: 0.12760946445307936).
gdbt_best_params = dict(
    n_estimators=374,
    learning_rate=0.1,
    max_depth=2,
    min_samples_leaf=1,
    min_samples_split=12,
    max_features=0.2,
    subsample=0.9,
    criterion='friedman_mse',
    random_state=666,
)
best_gdbt = GradientBoostingRegressor(**gdbt_best_params)
rf
# Tuned RandomForest hyper-parameters (best CV score: 0.14357134506907396).
# NOTE(review): 'mse' is removed in sklearn >= 1.2 — confirm the target
# environment before upgrading ('squared_error' is the equivalent).
rf_best_params = dict(
    n_estimators=234,
    max_depth=18,
    min_samples_leaf=1,
    min_samples_split=2,
    max_features=0.3,
    criterion='mse',
    n_jobs=-1,
    random_state=666,
)
best_rf = RandomForestRegressor(**rf_best_params)
xgb
# Tuned XGBoost hyper-parameters (best CV score: 0.1220416129257937).
xgb_best_params = dict(
    learning_rate=0.1,
    n_estimators=452,
    max_depth=4,
    min_child_weight=4,
    subsample=0.9,
    colsample_bytree=0.6,
    gamma=0,
    reg_alpha=1,
    reg_lambda=1,
    n_jobs=-1,
    random_state=666,
)
best_xgb = xgb.XGBRegressor(**xgb_best_params)
Stacking
Helpers via Python Classes
在下面的代码部分中,我们基本上编写了一个类SklearnHelper,它允许扩展所有Sklearn回归器中通用的内建方法(如train、predict和fit)。因此,如果我们想调用五个不同的回归器,这样就不需要编写相同的方法五次,从而减少了冗余。
ntrain = train.shape[0]  # number of training samples
ntest = test.shape[0]    # number of test samples
SEED = 666               # for reproducibility of the base learners
NFOLDS = 5               # folds for out-of-fold prediction

# Do NOT pass random_state here: with shuffle=False it has no effect and
# raises an error in sklearn >= 0.24 (see the FutureWarning this cell used
# to emit). An unshuffled KFold is already deterministic, so the fold
# assignment is identical with the argument removed.
kf = KFold(n_splits=NFOLDS)
# Class to extend the Sklearn classifier
class SklearnHelper(object):
def __init__(self, clf, seed=0, params=None):
params['random_state'] = seed
self.clf = clf(**params)
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
def fit(self,x,y):
return self.clf.fit(x,y)
def feature_importances(self,x,y):
print(self.clf.fit(x,y).feature_importances_)
d:\downloadinstall\python3.7\python3.7.5\lib\site-packages\sklearn\model_selection\_split.py:297: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
Out-of-Fold Predictions
def get_oof(clf, x_train, y_train, x_test):
    """Generate out-of-fold (OOF) predictions for one base learner.

    Each training sample is predicted by a model that never saw it: the
    estimator is refit on NFOLDS-1 folds and predicts the held-out fold.
    The test-set prediction is the mean over the NFOLDS fold models.

    Relies on the module-level `kf`, `ntrain`, `ntest`, `NFOLDS`.
    Assumes x_train/x_test support positional indexing like numpy arrays
    (i.e. .values was taken, not raw DataFrames) — confirm at the call site.

    Returns (oof_train, oof_test) reshaped to column vectors for the
    second-level learner.
    """
    oof_train = np.zeros((ntrain,))           # (ntrain,) one OOF prediction per train sample
    oof_test = np.zeros((ntest,))             # (ntest,) fold-averaged test predictions
    oof_test_skf = np.empty((NFOLDS, ntest))  # (NFOLDS, ntest) per-fold test predictions

    # Split on the x_train argument (the original split the global `train`
    # DataFrame; the indices are identical since both have the same number
    # of rows, but this removes the hidden global dependency).
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
第一层
# First-level base learners, each wrapped to expose the uniform
# train/predict interface that get_oof expects.
rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_best_params)
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_best_params)
adbt= SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=adbt_best_params)
gdbt = SklearnHelper