集成学习
使用单个模型进行预测还不能达到最好的效果,因此考虑使用集成学习的方法来进一步减小误差。
集成学习通过组合多个不同模型的预测结果来降低误差,并为每个模型选择最优参数
以下集成学习中会用到 13 个模型,首先导入需要用到的包:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
1. 基本建模与评估
按照比赛要求首先定义基于 RMSE 的交叉验证评估指标:
#定义交叉验证的策略,以及评估函数
def rmse_cv(model, X, y):
    """Return the 5-fold cross-validated RMSE scores of `model` on (X, y).

    sklearn maximizes scores, so MSE is reported as its negative; we negate
    it back and take the square root to obtain RMSE per fold.
    """
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-neg_mse)
保存所有要用到的模型
# The 13 candidate regressors evaluated below, one per line for readability.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(alpha=0.01, max_iter=10000),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    SVR(),
    LinearSVR(),
    ElasticNet(alpha=0.001, max_iter=10000),
    SGDRegressor(max_iter=1000, tol=1e-3),
    BayesianRidge(),
    KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
    ExtraTreesRegressor(),
    XGBRegressor(),
]
首先分别使用了 13 个模型和5折交叉验证来评估每个模型的预测效果
# Short display names, aligned index-for-index with `models`.
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela", "SGD", "Bay", "Ker", "Extra", "Xgb"]
# Report mean and standard deviation of the 5-fold CV RMSE for each model.
for name, model in zip(names, models):
    cv_rmse = rmse_cv(model, X_scaled, y_log)
    print("{}: {:.6f}, {:.4f}".format(name, cv_rmse.mean(), cv_rmse.std()))
LR: 621771864740.772827, 411989810656.7503
Ridge: 0.118922, 0.0076
Lasso: 0.118914, 0.0065
RF: 0.148033, 0.0049
GBR: 0.123131, 0.0076
SVR: 0.179019, 0.0129
LinSVR: 1.239646, 0.4882
Ela: 0.116366, 0.0070
SGD: 2.814280, 0.5916
Bay: 0.117589, 0.0066
Ker: 0.114100, 0.0081
Extra: 0.141861, 0.0112
Xgb: 0.124880, 0.0058
2. 对每个模型进行调参
建立一个调参的方法,这里的评估指标是RMSE,所以打印出的分数也要是RMSE。定义交叉方式,先指定模型后指定参数,方便测试多个模型,使用网格交叉验证
class grid():
    """Thin wrapper around GridSearchCV that reports scores as RMSE."""

    def __init__(self, model):
        self.model = model  # estimator to be tuned

    def grid_get(self, X, y, param_grid):
        """Run a 5-fold grid search and print the best params plus a score table."""
        searcher = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error")
        searcher.fit(X, y)
        # best_score_ is negative MSE; negate and sqrt to show RMSE.
        print(searcher.best_params_, np.sqrt(-searcher.best_score_))
        # Convert all stored mean scores to RMSE before tabulating.
        searcher.cv_results_['mean_test_score'] = np.sqrt(-searcher.cv_results_['mean_test_score'])
        report = pd.DataFrame(searcher.cv_results_)
        print(report[['params', 'mean_test_score', 'std_test_score']])
Lasso()的调参结果
grid(Lasso()).grid_get(X_scaled,y_log,{'alpha': [0.0004,0.0005,0.0007,0.0006,0.0009,0.0008],'max_iter':[10000]})
{'alpha': 0.0009, 'max_iter': 10000} 0.11557402177546283
params mean_test_score std_test_score
0 {'alpha': 0.0004, 'max_iter': 10000} 0.116897 0.001659
1 {'alpha': 0.0005, 'max_iter': 10000} 0.116580 0.001644
2 {'alpha': 0.0007, 'max_iter': 10000} 0.116041 0.001612
3 {'alpha': 0.0006, 'max_iter': 10000} 0.116301 0.001630
4 {'alpha': 0.0009, 'max_iter': 10000} 0.115574 0.001574
5 {'alpha': 0.0008, 'max_iter': 10000} 0.115794 0.001591
Ridge()的调参结果
grid(Ridge()).grid_get(X_scaled,y_log,{'alpha':[35,40,45,50,55,60,65,70,80,90]})
{'alpha': 90} 0.11753822142197719
params mean_test_score std_test_score
0 {'alpha': 35} 0.118097 0.001621
1 {'alpha': 40} 0.118003 0.001607
2 {'alpha': 45} 0.117921 0.001595
3 {'alpha': 50} 0.117849 0.001583
4 {'alpha': 55} 0.117787 0.001573
5 {'alpha': 60} 0.117733 0.001564
6 {'alpha': 65} 0.117686 0.001555
7 {'alpha': 70} 0.117646 0.001547
8 {'alpha': 80} 0.117582 0.001533
9 {'alpha': 90} 0.117538 0.001522
经过多轮测试,最终选择以下六个模型及对应的最优参数
# The six models kept for the ensemble, with parameters chosen after
# multiple rounds of grid search (see the tuning results above).
lasso = Lasso(alpha=0.0005,max_iter=10000)
ridge = Ridge(alpha=60)
svr = SVR(gamma= 0.0004,kernel='rbf',C=13,epsilon=0.009)
ker = KernelRidge(alpha=0.2 ,kernel='polynomial',degree=3 , coef0=0.8)
ela = ElasticNet(alpha=0.005,l1_ratio=0.08,max_iter=10000)
bay = BayesianRidge()
3. 集成方法,使用加权平均
根据权重对各个模型加权平均
##定义加权平均值,就相当于自己写fit_transform()
class AverageWeight(BaseEstimator, RegressorMixin):
    """Ensemble regressor that predicts a fixed weighted average of base models.

    Parameters
    ----------
    mod : list of estimators
        Base models to fit and average.
    weight : list of float
        One weight per model; for a true weighted mean they should sum to 1.
    """

    def __init__(self, mod, weight):
        self.mod = mod        # base models
        self.weight = weight  # per-model weights

    def fit(self, X, y):
        """Fit a clone of every base model on (X, y); return self."""
        # Clone so the estimators passed in stay unfitted (sklearn convention).
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted average of the base models' predictions."""
        # Shape (n_models, n_samples): one row of predictions per base model.
        pred = np.array([model.predict(X) for model in self.models_])
        # Vectorized replacement for the original per-sample Python loop:
        # sum_k weight[k] * pred[k, i] for every sample i.
        return np.dot(self.weight, pred)
定义6个初始权重
# Hand-chosen weights for the six tuned models (they sum to 1.0).
w1 = 0.02
w2 = 0.2
w3 = 0.25
w4 = 0.3
w5 = 0.03
w6 = 0.2
weight_avg = AverageWeight(mod = [lasso,ridge,svr,ker,ela,bay],weight=[w1,w2,w3,w4,w5,w6])
# Per-fold CV RMSE of the weighted ensemble, then its mean.
print(rmse_cv(weight_avg,X_scaled,y_log))
print(rmse_cv(weight_avg,X_scaled,y_log).mean())  # mean cross-validated RMSE
[0.11579461, 0.12567714, 0.12194691, 0.10298115, 0.11307847]
0.11589565350500453
4. stacking 模型的堆叠
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor.

    Each base model is trained with 5-fold CV; its out-of-fold (OOF)
    predictions become one feature column, and `meta_model` is fitted on
    that leakage-free feature matrix.
    """

    def __init__(self, mod, meta_model):
        self.mod = mod                # list of level-1 (base) models
        self.meta_model = meta_model  # level-2 model trained on OOF predictions
        # Fixed, shuffled 5-fold split shared by fit() and get_oof().
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self, X, y):
        """Fit base models fold-by-fold, then fit the meta model on OOF features."""
        # saved_model[i] collects the 5 fold-wise clones of base model i,
        # reused at predict time.
        self.saved_model = [list() for i in self.mod]
        # OOF prediction matrix: (n_samples, n_models).
        oof_train = np.zeros((X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X, y):
                renew_model = clone(model)  # fresh copy for this fold
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                # Predict only on the held-out fold -> no target leakage.
                oof_train[val_index, i] = renew_model.predict(X[val_index])
        # The meta model learns how to combine the base models' predictions.
        self.meta_model.fit(oof_train, y)
        return self

    def predict(self, X):
        """Predict by averaging each base model's fold clones, then applying the meta model."""
        # FIX: the original passed a generator to np.column_stack, which is
        # deprecated (and a TypeError in recent NumPy); use list comprehensions.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model
        ])
        return self.meta_model.predict(whole_test)

    def get_oof(self, X, y, test_X):
        """Return (oof, test_mean): OOF train features and fold-averaged test features."""
        oof = np.zeros((X.shape[0], len(self.mod)))
        test_single = np.zeros((test_X.shape[0], 5))  # per-fold test predictions
        test_mean = np.zeros((test_X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for j, (train_index, val_index) in enumerate(self.kf.split(X, y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index], y[train_index])
                oof[val_index, i] = clone_model.predict(X[val_index])
                test_single[:, j] = clone_model.predict(test_X)
            # Average the 5 fold models' test predictions for base model i.
            test_mean[:, i] = test_single.mean(axis=1)
        return oof, test_mean
##经过预处理之后才能放到堆叠的模型里面去计算
# Impute remaining NaNs before stacking (the base models cannot handle NaN).
# NOTE(review): sklearn.preprocessing.Imputer was removed in sklearn 0.22;
# modern code should use sklearn.impute.SimpleImputer -- verify the sklearn
# version before running.
a = Imputer().fit_transform(X_scaled)  # feature matrix (X)
b = Imputer().fit_transform(y_log.values.reshape(-1,1)).ravel()  # target (y)
# Six tuned base models; the kernel-ridge model doubles as the meta model.
stack_model = stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)
stack_model.fit(a,b)  # train the stacked ensemble
# Invert the log transform of the target to get prices on the original scale.
pred = np.exp(stack_model.predict(test_X_scaled))
# NOTE(review): `result` is not defined in this chunk -- presumably a DataFrame
# built from `pred` in omitted code; confirm before running.
result.to_csv("submission2.csv",index=False)
上传后得分为0.12310