import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
Dataset preparation
from sklearn.model_selection import train_test_split

train = pd.read_csv('datas/house_data.csv')
y = train['SalePrice']
train1 = train.drop(['Id', 'SalePrice'], axis=1)
X = pd.get_dummies(train1).reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
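A quick shape check (a sketch; the exact column count depends on how many dummy columns get_dummies creates) confirms the 80/20 split:

print(X_train.shape, X_test.shape)   # rows split roughly 80/20, identical column counts
print(y_train.shape, y_test.shape)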
Model evaluation
from sklearn.metrics import mean_squared_error

def benchmark(model, testset, label):
    pred = model.predict(testset)
    # Flag negative predictions: they make the log-scale metric below undefined.
    if pred[pred < 0].shape[0] > 0:
        print('Neg Value')
    # RMSE on the original price scale
    rmse = np.sqrt(mean_squared_error(label, pred))
    # RMSE on the log scale, which weights relative errors more evenly
    lrmse = np.sqrt(mean_squared_error(np.log(label), np.log(pred)))
    print('RMSE:', rmse)
    print('LRMSE:', lrmse)
    return lrmse
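np.log returns NaN for non-positive values, which is why the function flags negative predictions. A minimal variant (a sketch, with an assumed floor of 1.0) clips predictions before taking logs so the log-scale error stays finite:

def benchmark_clipped(model, testset, label, floor=1.0):
    # Same metrics as benchmark(), but predictions are clipped to a small
    # positive floor (an assumed value) before the log-scale error is computed.
    pred = np.clip(model.predict(testset), floor, None)
    rmse = np.sqrt(mean_squared_error(label, pred))
    lrmse = np.sqrt(mean_squared_error(np.log(label), np.log(pred)))
    return rmse, lrmse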
Base model training
ElasticNet
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold

kfolds = KFold(n_splits=10, shuffle=True, random_state=123)
e_l1ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
e_alphas = np.logspace(-10, 2.8, 150)
def elastic_train_test(alpha, l1ratio):
    # Fit an ElasticNet pipeline for a single (alpha, l1_ratio) pair and report its test error.
    e_model = make_pipeline(RobustScaler(), ElasticNetCV(alphas=[alpha], l1_ratio=[l1ratio]))
    e_model.fit(X_train, y_train)
    lrmse = benchmark(e_model, X_test, y_test)
    return lrmse

elastic_train_test(50, 0.5)
RMSE: 64803.88956616406
LRMSE: 0.3056812482960621
0.3056812482960621
elastic_model = make_pipeline(RobustScaler(), ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio)).fit(X_train, y_train)
benchmark(elastic_model, X_test, y_test)
RMSE: 25991.07955736571
LRMSE: 0.12567210233778722
0.12567210233778722
elastic_model.steps[1][1].alpha_
0.3432183268134919
elastic_model.steps[1][1].l1_ratio_
0.9
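The fitted ElasticNetCV step can also be pulled out by name rather than position; make_pipeline names each step after its lowercased class, so this sketch assumes the default step name 'elasticnetcv':

enet_cv = elastic_model.named_steps['elasticnetcv']
print(enet_cv.alpha_, enet_cv.l1_ratio_)   # same selected values as above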
XGBoost training
import xgboost as xgb

# 'reg:linear' is the legacy objective name; newer xgboost releases call it 'reg:squarederror'.
xg_reg = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.7, learning_rate=0.01,
                          max_depth=3, n_estimators=3400, subsample=0.7, nthread=6, seed=123)
xg_reg.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=0.7, gamma=0, importance_type='gain',
learning_rate=0.01, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=3400, n_jobs=1,
nthread=6, objective='reg:linear', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
subsample=0.7)
benchmark(xg_reg, X_test, y_test)
RMSE: 22926.489730019464
LRMSE: 0.10024704840338212
0.10024704840338212
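As an optional sanity check (a sketch, not part of the original run), the gain-based importances exposed by the sklearn wrapper can be ranked against the one-hot column names:

importances = pd.Series(xg_reg.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))   # ten most influential columns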
Stacking ensemble
Base-level models
from mlxtend.regressor import StackingCVRegressor

alphas_alt = np.logspace(-10, 2.8, 150)
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=alphas_alt, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))
xgboost = make_pipeline(RobustScaler(),
                        xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.7, learning_rate=0.01,
                                         max_depth=3, n_estimators=3460, subsample=0.7, reg_alpha=0.00006,
                                         gamma=0, nthread=6, scale_pos_weight=1, seed=27))
Meta-level model
stack_alg = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, xgboost),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)
stackX = np.array(X_train)
stacky = np.array(y_train)
stack_alg.fit(stackX, stacky)
benchmark(stack_alg, X_test, y_test)
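No output is recorded for this last cell. As a small usage sketch, the fitted stacker can also be queried directly; since it was trained on plain NumPy arrays, it is simplest to hand it arrays at prediction time as well:

stack_pred = stack_alg.predict(np.array(X_test))   # predictions on the held-out 20%
print(stack_pred[:5])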