import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
Dataset preparation
from sklearn.model_selection import train_test_split

train = pd.read_csv('datas/house_data.csv')
y = train['SalePrice']
train1 = train.drop(['Id', 'SalePrice'], axis=1)
X = pd.get_dummies(train1).reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
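A quick shape check (a sketch; the exact column count depends on how many dummy columns get_dummies creates) confirms the 80/20 split:

print(X_train.shape, X_test.shape)   # rows split roughly 80/20, identical column counts
print(y_train.shape, y_test.shape)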
Model evaluation
from sklearn.metrics import mean_squared_error

def benchmark(model, testset, label):
    pred = model.predict(testset)
    # Flag negative predictions: they make the log-scale metric below undefined.
    if pred[pred < 0].shape[0] > 0:
        print('Neg Value')
    # RMSE on the original price scale
    rmse = np.sqrt(mean_squared_error(label, pred))
    # RMSE on the log scale, which weights relative errors more evenly
    lrmse = np.sqrt(mean_squared_error(np.log(label), np.log(pred)))
    print('RMSE:', rmse)
    print('LRMSE:', lrmse)
    return lrmse
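np.log returns NaN for non-positive values, which is why the function flags negative predictions. A minimal variant (a sketch, with an assumed floor of 1.0) clips predictions before taking logs so the log-scale error stays finite:

def benchmark_clipped(model, testset, label, floor=1.0):
    # Same metrics as benchmark(), but predictions are clipped to a small
    # positive floor (an assumed value) before the log-scale error is computed.
    pred = np.clip(model.predict(testset), floor, None)
    rmse = np.sqrt(mean_squared_error(label, pred))
    lrmse = np.sqrt(mean_squared_error(np.log(label), np.log(pred)))
    return rmse, lrmse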
Base model training
ElasticNet
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold

kfolds = KFold(n_splits=10, shuffle=True, random_state=123)
e_l1ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
e_alphas = np.logspace(-10, 2.8, 150)
def elastic_train_test(alpha, l1ratio):
    # Fit an ElasticNet pipeline for a single (alpha, l1_ratio) pair and report its test error.
    e_model = make_pipeline(RobustScaler(), ElasticNetCV(alphas=[alpha], l1_ratio=[l1ratio]))
    e_model.fit(X_train, y_train)
    lrmse = benchmark(e_model, X_test, y_test)
    return lrmse

elastic_train_test(50, 0.5)
RMSE: 64803.88956616406
LRMSE: 0.3056812482960621
0.3056812482960621
elastic_model = make_pipeline(RobustScaler(), ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio)).fit(X_train, y_train)
benchmark(elastic_model, X_test, y_test)
RMSE: 25991.07955736571
LRMSE: 0.12567210233778722
0.12567210233778722
elastic_model.steps[1][1].alpha_
0.3432183268134919
elastic_model.steps[1][1].l1_ratio_
0.9
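The fitted ElasticNetCV step can also be pulled out by name rather than position; make_pipeline names each step after its lowercased class, so this sketch assumes the default step name 'elasticnetcv':

enet_cv = elastic_model.named_steps['elasticnetcv']
print(enet_cv.alpha_, enet_cv.l1_ratio_)   # same selected values as above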
XGBoost training
import xgboost as xgb

# 'reg:linear' is the legacy objective name; newer xgboost releases call it 'reg:squarederror'.
xg_reg = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.7, learning_rate=0.01,
                          max_depth=3, n_estimators=3400, subsample=0.7, nthread=6, seed=123)
xg_reg.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=0.7, gamma=0, importance_type='gain',
learning_rate=0.01, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=3400, n_jobs=1,
nthread=6, objective='reg:linear', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
subsample=0.7)
benchmark(xg_reg, X_test, y_test)
RMSE: 22926.489730019464
LRMSE: 0.10024704840338212
0.10024704840338212
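As an optional sanity check (a sketch, not part of the original run), the gain-based importances exposed by the sklearn wrapper can be ranked against the one-hot column names:

importances = pd.Series(xg_reg.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))   # ten most influential columns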
Stacking ensemble
Base-level models
from mlxtend.regressor import StackingCVRegressor

alphas_alt = np.logspace(-10, 2.8, 150)
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=alphas_alt, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))
xgboost = make_pipeline(RobustScaler(),
                        xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.7, learning_rate=0.01,
                                         max_depth=3, n_estimators=3460, subsample=0.7, reg_alpha=0.00006,
                                         gamma=0, nthread=6, scale_pos_weight=1, seed=27))
Meta-level model
stack_alg = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, xgboost),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)
stackX = np.array(X_train)
stacky = np.array(y_train)
stack_alg.fit(stackX, stacky)
benchmark(stack_alg, X_test, y_test)
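No output is recorded for this last cell. As a small usage sketch, the fitted stacker can also be queried directly; since it was trained on plain NumPy arrays, it is simplest to hand it arrays at prediction time as well:

stack_pred = stack_alg.predict(np.array(X_test))   # predictions on the held-out 20%
print(stack_pred[:5])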