接上篇，首先导入数据
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
# Preprocessed feature matrices produced in the previous post.
traindata=pd.read_csv('train_new.csv')
testdata=pd.read_csv('test_new.csv')
# Raw Kaggle training file — only its SalePrice column is used as the target.
# NOTE(review): hard-coded absolute Windows path; confirm it exists on the target machine.
train=pd.read_csv(r'C:\Users\Administrator\Downloads\train.csv')
# log1p-transform the target so the RMSE computed below is on log-prices.
train['SalePrice']=np.log1p(train['SalePrice'])
使用GBDT（GradientBoostingRegressor）默认参数进行计算
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn import linear_model


def crossvalscore(model, X=None, y=None):
    """Return the per-fold RMSE of ``model`` under 3-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Any scikit-learn regressor implementing ``fit``/``predict``.
    X : array-like, optional
        Feature matrix; defaults to the module-level ``traindata`` so existing
        one-argument calls keep working unchanged.
    y : array-like, optional
        Target vector; defaults to the module-level ``train['SalePrice']``
        (already log1p-transformed above).

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (3 values); call ``.mean()`` for the average.
    """
    if X is None:
        X = traindata
    if y is None:
        y = train['SalePrice']
    # cross_val_score returns the *negated* MSE under this scoring; negate
    # back and take the square root to obtain RMSE.
    return np.sqrt(-cross_val_score(model, X, y,
                                    scoring="neg_mean_squared_error", cv=3))
# Import GradientBoostingRegressor.
from sklearn.ensemble import GradientBoostingRegressor
# Baseline model: every hyper-parameter left at its default, fixed seed for
# reproducibility.
gbm0 = GradientBoostingRegressor(random_state=10)
# Mean 3-fold CV RMSE of the baseline (reported below as ~0.1303).
crossvalscore(gbm0).mean()
输出:
0.13031795384603356
调整n_estimators–迭代次数
# Step 1: tune only n_estimators (number of boosting iterations),
# leaving every other hyper-parameter at its default.
param_test1 = [{'n_estimators': range(20, 251, 10)}]
gsearch1 = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=10),
    param_grid=param_test1,
    cv=3,
    refit=False,  # only the search results are needed, skip the final refit
)
gsearch1.fit(traindata, train['SalePrice'])
# Inspect per-candidate mean scores plus the winning setting.
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_
得到:
{'n_estimators': 110}
调整max_depth–最大深度和min_samples_split–内部节点再划分所需最小样本数
# Step 2: with n_estimators fixed at 110, jointly tune tree depth and the
# minimum sample count required to split an internal node.
param_test2 = {
    'max_depth': range(3, 19, 2),
    'min_samples_split': range(100, 801, 200),
}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingRegressor(n_estimators=110, random_state=10),
    param_grid=param_test2,
    cv=3,
    refit=False,
)
gsearch2.fit(traindata, train['SalePrice'])
# Inspect per-candidate mean scores plus the winning setting.
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_
得到:
{'max_depth': 13, 'min_samples_split': 500}
再次调整n_estimators
# Step 3: re-tune n_estimators now that max_depth/min_samples_split changed.
param_test3 = [{'n_estimators': range(20, 251, 10)}]
gsearch3 = GridSearchCV(
    estimator=GradientBoostingRegressor(max_depth=13, min_samples_split=500,
                                        random_state=10),
    param_grid=param_test3,
    cv=3,
    refit=False,
)
gsearch3.fit(traindata, train['SalePrice'])
# Inspect per-candidate mean scores plus the winning setting.
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_
得到:
{'n_estimators': 120}
min_samples_split–内部节点再划分所需最小样本数和min_samples_leaf–叶子节点最少样本数
# Step 4: jointly tune min_samples_split (min samples to split a node) and
# min_samples_leaf (min samples required at a leaf).
param_test4 = {
    'min_samples_split': range(100, 1300, 200),
    'min_samples_leaf': range(1, 31, 5),
}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingRegressor(n_estimators=120, max_depth=13,
                                        random_state=10),
    param_grid=param_test4,
    cv=3,
    refit=False,
)
gsearch4.fit(traindata, train['SalePrice'])
# Inspect per-candidate mean scores plus the winning setting.
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_
得到:
{'min_samples_leaf': 1, 'min_samples_split': 500}
# Step 5: tune max_features (number of features considered per split) with
# all previously selected hyper-parameters held fixed.
param_test5 = {'max_features': range(7, 20, 2)}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingRegressor(n_estimators=120, max_depth=13,
                                        min_samples_split=500,
                                        random_state=10),
    param_grid=param_test5,
    cv=3,
    refit=False,
)
gsearch5.fit(traindata, train['SalePrice'])
# Inspect per-candidate mean scores plus the winning setting.
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_
得到:
{'max_features': 15}
将参数代入公式
# Final model with all tuned hyper-parameters plugged in.
gbm2 = GradientBoostingRegressor(
    n_estimators=120,
    max_depth=13,
    min_samples_split=500,
    max_features=15,
    random_state=10,
)
# Mean 3-fold CV RMSE of the tuned model (reported below as ~0.1272).
crossvalscore(gbm2).mean()
输出:
0.1271688683650891
缩短步长,提高迭代次数
# Shrink the learning rate 10x and scale n_estimators 10x to compensate —
# smaller steps, more iterations.
gbm3 = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=1200,
    max_depth=13,
    min_samples_split=500,
    max_features=15,
    random_state=10,
)
# Mean 3-fold CV RMSE of the slow-learning-rate model (reported below as ~0.1247).
crossvalscore(gbm3).mean()
输出:
0.1246806912234059
所以可以看到,通过参数调整,计算精度有所提高