集成学习
通过建立几个模型来解决单一预测问题
工作原理:生成多个分类器/模型,各自独立地学习和作出预测。这些预测最后结合成组合预测,因此优于任何一个单分类器单独做出的预测
Bagging和随机森林
随机森林是一个包含多个决策树的分类器,并且其输出的类别是由个别树输出的类别的众数而定
随机森林=Bagging+决策树
sklearn.ensemble.RandomForestClassifier(n_estimators=10,criterion='gini',max_depth=None,bootstrap=True,random_state=None,min_samples_split=2)
# Instantiate a random forest and tune it with a cross-validated grid search.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()

# Candidate hyper-parameter values: number of trees and maximum tree depth.
hyper_grid = {'n_estimators': [100, 120, 300], 'max_depth': [3, 7, 11]}
gc = GridSearchCV(rf, param_grid=hyper_grid, cv=3)

# Fit on the training split and report accuracy on the held-out split.
gc.fit(x_train, y_train)
print("随机森林预测结果是:\n", gc.score(x_test, y_test))
随机森林预测结果是:
0.779467680608365
包外估计
在随机森林构造过程中,如果进行有放回的抽样,我们会发现,总有一部分样本是我们选不到的
没有选择到的数据,称为out-of-bag(OOB)数据,约有36.8%的数据抽不到
经验证,包外估计是对集成分类器泛化误差的无偏估计
无偏估计:估计量的期望等于被估计参数的真实值
用途:当基学习器是决策树时,可以使用包外样本来辅助剪枝
随机森林案例
对电子商务的品种进行分类
# Load the Otto training data and inspect the class distribution.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('./data1/otto/train.csv')

# Bar plot: number of samples per target class.
sns.countplot(x=data.target)
plt.show()
由上图可以看出,该数据类别不均衡,所以需要后期处理
# Naive approach: keep only the first 10 000 rows.
new1_data = data[:10000]

# Visualise the class distribution of the truncated data.
# Fix: pass the series via the `x=` keyword — seaborn >= 0.13 no longer
# accepts a data vector as the first positional argument, and this matches
# the `countplot(x=...)` call style used elsewhere in this file.
import seaborn as sns
sns.countplot(x=new1_data.target)
plt.show()
使用上面的方式获取数据不可行,因此使用随机欠采样获取相应的数据
# Balance the classes with random under-sampling.
from imblearn.under_sampling import RandomUnderSampler

# Separate the frame into features and the label column.
y = data["target"]
x = data.drop(["id", "target"], axis=1)

# Down-sample every class to the size of the smallest one.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(x, y)

# Verify the class distribution is now balanced.
import seaborn as sns
sns.countplot(x=y_resampled)
plt.show()
# Encode the string class labels ("Class_1" ... "Class_9") as integers 0-8.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_resampled = label_encoder.fit_transform(y_resampled)
array([0, 0, 0, …, 8, 8, 8])
# Hold out 20% of the balanced data for evaluation.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2)
# Baseline random forest with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier

# oob_score=True: score the model on its out-of-bag samples as a free
# estimate of generalisation performance.
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train, y_train)

# Out-of-bag accuracy of the baseline model.
rf.oob_score_
0.7596486175115207
# log_loss on hard predictions requires one-hot encoded inputs, so convert
# both the true labels and the predicted labels.
from sklearn.preprocessing import OneHotEncoder

# Bug fix: `y_pre` was used below without ever being assigned — compute the
# hard class predictions of the baseline forest first.
y_pre = rf.predict(x_test)

# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in
# scikit-learn 1.2 and removed in 1.4 — confirm the installed version.
one_hot = OneHotEncoder(sparse=False)
y_test1 = one_hot.fit_transform(y_test.reshape(-1, 1))
y_pre1 = one_hot.fit_transform(y_pre.reshape(-1, 1))

# Log-loss of the hard one-hot predictions; this is large because every
# wrong prediction is treated as maximally confident.
from sklearn.metrics import log_loss
log_loss(y_test1, y_pre1, eps=1e-15, normalize=True)
7.737163269576891
# Switch the prediction output to per-class probabilities: spreading the
# confidence across classes lowers the log-loss value substantially.
proba_pred = rf.predict_proba(x_test)
proba_pred

# Log-loss of the probabilistic predictions.
log_loss(y_test1, proba_pred, eps=1e-15, normalize=True)
0.7316244385161541
模型调优
# --- Tune n_estimators (number of trees) ---
# Candidate values to sweep.
tuned_parameters = range(10, 200, 10)
# Pre-allocate arrays for the OOB accuracy and the test log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

for idx, n_trees in enumerate(tuned_parameters):
    model = RandomForestClassifier(n_estimators=n_trees,
                                   max_depth=10,
                                   max_features=10,
                                   min_samples_leaf=10,
                                   oob_score=True,
                                   random_state=0,
                                   n_jobs=-1)  # n_jobs=-1: use all CPU cores
    model.fit(x_train, y_train)
    # Record the out-of-bag accuracy ...
    accuracy_t[idx] = model.oob_score_
    # ... and the log loss of the test-set probability predictions.
    y_pre = model.predict_proba(x_test)
    error_t[idx] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)

print(error_t)

# Visualise the sweep: log loss (left) and OOB accuracy (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, label in ((axes[0], error_t, "error_t"),
                          (axes[1], accuracy_t, "accuracy_t")):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel(label)
    ax.grid(True)
plt.show()
# --- Tune max_features (features considered per split) ---
# Candidate values to sweep; n_estimators is fixed at the value chosen above.
tuned_parameters = range(5, 40, 5)
# Pre-allocate arrays for the OOB accuracy and the test log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

for idx, n_feats in enumerate(tuned_parameters):
    model = RandomForestClassifier(n_estimators=175,
                                   max_depth=10,
                                   max_features=n_feats,
                                   min_samples_leaf=10,
                                   oob_score=True,
                                   random_state=0,
                                   n_jobs=-1)  # use all CPU cores
    model.fit(x_train, y_train)
    # Record the out-of-bag accuracy and test log loss for this setting.
    accuracy_t[idx] = model.oob_score_
    y_pre = model.predict_proba(x_test)
    error_t[idx] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)

print(error_t)

# Visualise the sweep: log loss (left) and OOB accuracy (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, label in ((axes[0], error_t, "error_t"),
                          (axes[1], accuracy_t, "accuracy_t")):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel("max_features")
    ax.set_ylabel(label)
    ax.grid(True)
plt.show()
# --- Tune max_depth (maximum tree depth) ---
# Candidate values to sweep; earlier-tuned parameters are held fixed.
tuned_parameters = range(10, 100, 10)
# Pre-allocate arrays for the OOB accuracy and the test log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

for idx, depth in enumerate(tuned_parameters):
    model = RandomForestClassifier(n_estimators=175,
                                   max_depth=depth,
                                   max_features=15,
                                   min_samples_leaf=10,
                                   oob_score=True,
                                   random_state=0,
                                   n_jobs=-1)  # use all CPU cores
    model.fit(x_train, y_train)
    # Record the out-of-bag accuracy and test log loss for this setting.
    accuracy_t[idx] = model.oob_score_
    y_pre = model.predict_proba(x_test)
    error_t[idx] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)

print(error_t)

# Visualise the sweep: log loss (left) and OOB accuracy (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, label in ((axes[0], error_t, "error_t"),
                          (axes[1], accuracy_t, "accuracy_t")):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel("max_depth")
    ax.set_ylabel(label)
    ax.grid(True)
plt.show()
# --- Tune min_samples_leaf (minimum samples per leaf) ---
# Candidate values to sweep; earlier-tuned parameters are held fixed.
tuned_parameters = range(1, 10, 2)
# Pre-allocate arrays for the OOB accuracy and the test log loss.
accuracy_t = np.zeros(len(tuned_parameters))
error_t = np.zeros(len(tuned_parameters))

for idx, leaf_size in enumerate(tuned_parameters):
    model = RandomForestClassifier(n_estimators=175,
                                   max_depth=30,
                                   max_features=15,
                                   min_samples_leaf=leaf_size,
                                   oob_score=True,
                                   random_state=0,
                                   n_jobs=-1)  # use all CPU cores
    model.fit(x_train, y_train)
    # Record the out-of-bag accuracy and test log loss for this setting.
    accuracy_t[idx] = model.oob_score_
    y_pre = model.predict_proba(x_test)
    error_t[idx] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)

print(error_t)

# Visualise the sweep: log loss (left) and OOB accuracy (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
for ax, values, label in ((axes[0], error_t, "error_t"),
                          (axes[1], accuracy_t, "accuracy_t")):
    ax.plot(tuned_parameters, values)
    ax.set_xlabel("min_sample_leaf")
    ax.set_ylabel(label)
    ax.grid(True)
plt.show()
# Final model with the hyper-parameters chosen from the sweeps above.
rf3 = RandomForestClassifier(n_estimators=175,
                             max_depth=30,
                             max_features=15,
                             min_samples_leaf=1,
                             oob_score=True,
                             random_state=40,
                             n_jobs=-1)
rf3.fit(x_train, y_train)

# Accuracy on the held-out test split.
rf3.score(x_test, y_test)
0.7788655341203571
# Out-of-bag accuracy estimate of the tuned model.
rf3.oob_score_
0.7693692396313364
# Log loss of the tuned model's test-set probability predictions.
final_proba = rf3.predict_proba(x_test)
log_loss(y_test, final_proba)
0.7018021583384667
生成提交数据
# Build the submission file from the tuned model's class probabilities.
# NOTE(review): the training data was read from './data1/otto/' but the test
# data is read from './data/otto/' — confirm which path is correct.
test_data = pd.read_csv("./data/otto/test.csv")

# Drop the id column before predicting; keep it for the output file.
features = test_data.drop(["id"], axis=1)
class_probs = rf3.predict_proba(features)

# One probability column per class: Class_1 ... Class_9.
result_data = pd.DataFrame(class_probs,
                           columns=["Class_" + str(i) for i in range(1, 10)])
result_data.insert(loc=0, column="id", value=test_data.id)
result_data.to_csv("./data/otto/submission.csv", index=False)
Boosting
每新加入一个弱学习器,整体能力就会得到提升
代表算法:Adaboost,GBDT,XGBoost,LightGBM
bagging和boosting的区别
sklearn.ensemble.AdaBoostClassifier
AdaBoost
基本思想是通过调整样本的权重来关注先前分类错误的样本,从而逐步提升模型的性能
GBDT
梯度提升树,使用的是CART回归树