引言
本节主要介绍了XGBoost库及其应用,同样也展现了一个项目的思考过程,从可视化到训练模型,到最后评估模型,以及如何调整参数。
本文链接
XGBoost
XGBoost是一个可供用户轻松解决分类、回归或排序问题的软件包,其内部实现了梯度提升树(GBDT)模型,而GBDT是一种基于决策树的集成算法。
可视化方式
本节使用的可视化方式与Task01中使用的大同小异,因此不过多介绍,可查看Task01文章:天池机器学习训练营(一) —— 基于逻辑回归的分类预测
主要用的库是seaborn库,该库的学习教程可查看:Seaborn教程
调整参数方法
调节模型参数的方法有贪心算法、网格调参、贝叶斯调参等。这里我们采用网格调参,它的基本思想是穷举搜索:在所有候选的参数选择中,通过循环遍历,尝试每一种可能性,表现最好的参数就是最终的结果。使用的工具为sklearn中的GridSearchCV类
## Import sklearn's grid-search utility
from sklearn.model_selection import GridSearchCV

## Candidate values for each hyperparameter
learning_rate = [0.1, 0.3, 0.6]
subsample = [0.8, 0.9]
colsample_bytree = [0.6, 0.8]
max_depth = [3, 5, 8]
parameters = dict(learning_rate=learning_rate,
                  subsample=subsample,
                  colsample_bytree=colsample_bytree,
                  max_depth=max_depth)
model = XGBClassifier(n_estimators=50)
## Exhaustively try every parameter combination with 3-fold CV
clf = GridSearchCV(model, parameters, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
clf = clf.fit(x_train, y_train)
GBDT算法
GBDT是一种基于决策树的集成算法。其基模型是决策树,它的思想是串联多个决策树模型共同进行决策。
那么如何串联呢?XGBoost采用迭代预测误差的方法串联。举个通俗的例子,我们现在需要预测一辆车价值3000元。我们构建决策树1训练后预测为2600元,我们发现有400元的误差,那么决策树2的训练目标为400元,但决策树2的预测结果为350元,还存在50元的误差就交给第三棵树……以此类推,每一棵树用来估计之前所有树的误差,最后所有树预测结果的求和就是最终预测结果!
整体代码
'''
机器学习训练营
Task01: XGBoost
Date: 2022-09-19
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#显示所有列
pd.set_option('display.max_columns',None)
def demo():
    """Full XGBoost walkthrough on the 'rain tomorrow' weather dataset.

    Steps: load ``train.csv``, inspect and visualize the data, encode
    categorical columns as integers (XGBoost cannot consume raw strings),
    train a default classifier, evaluate it, plot feature importances,
    then retrain with grid-searched hyperparameters.

    Side effects: reads ``train.csv`` from the working directory, prints
    diagnostics to stdout, and opens several matplotlib windows — meant
    for interactive use.
    """
    data = pd.read_csv('train.csv')
    print(data.info())
    # Replace NaNs with a -1 sentinel (the tutorial's simple strategy;
    # XGBoost could also handle NaN natively).
    data = data.fillna(-1)
    print(data.tail())
    print(pd.Series(data['RainTomorrow']).value_counts())
    print(data.describe())
    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float compares equal to the float64 dtype.
    numerical_features = [x for x in data.columns if data[x].dtype == float]
    category_features = [x for x in data.columns
                         if data[x].dtype != float and x != 'RainTomorrow']
    # Pairwise visualization of three selected features (disabled by default).
    # sns.pairplot(data=data[['Rainfall', 'Evaporation', 'Sunshine'] + ['RainTomorrow']],
    #              diag_kind='hist', hue='RainTomorrow')
    # plt.show()
    # Box plots of every numerical feature against the label (disabled).
    # for col in data[numerical_features].columns:
    #     if col != 'RainTomorrow':
    #         sns.boxplot(x='RainTomorrow', y=col, saturation=0.5, palette='pastel', data=data)
    #         plt.title(col)
    #         plt.show()
    # Per-category value counts of each categorical feature, split by label.
    tlog = {}
    for i in category_features:
        tlog[i] = data[data['RainTomorrow'] == 'Yes'][i].value_counts()
    flog = {}
    for i in category_features:
        flog[i] = data[data['RainTomorrow'] == 'No'][i].value_counts()
    # Rainfall by location.
    plt.figure(figsize=(10, 10))
    plt.subplot(1, 2, 1)
    plt.title('RainTomorrow')
    # BUGFIX: use the Series' values/index directly rather than
    # pd.DataFrame(...)['Location'] — in pandas >= 2.0 value_counts()
    # names its column 'count', so indexing by 'Location' raises KeyError.
    loc_yes = tlog['Location'].sort_index()
    print(loc_yes)
    sns.barplot(x=loc_yes.values, y=loc_yes.index, color='red')
    plt.subplot(1, 2, 2)
    plt.title('No RainTomorrow')
    loc_no = flog['Location'].sort_index()
    sns.barplot(x=loc_no.values, y=loc_no.index, color='blue')
    plt.show()
    # Rain today vs. rain tomorrow: no rain today does not imply no rain
    # tomorrow, but rain today makes rain tomorrow much more likely.
    plt.figure(figsize=(10, 2))
    plt.subplot(1, 2, 1)
    plt.title('RainTomorrow')
    print(tlog['RainToday'])
    today_yes = tlog['RainToday'][:2].sort_index()
    sns.barplot(x=today_yes.values, y=today_yes.index, color='red')
    plt.subplot(1, 2, 2)
    plt.title('No RainTomorrow')
    today_no = flog['RainToday'][:2].sort_index()
    sns.barplot(x=today_no.values, y=today_no.index, color='blue')
    plt.show()

    # Integer-encode categorical columns — XGBoost cannot handle strings.
    def get_mapfunction(x):
        """Return a mapper from x's unique values to 0..n-1 (-1 if unseen)."""
        mapp = dict(zip(x.unique().tolist(),
                        range(len(x.unique().tolist()))))

        def mapfunction(y):
            if y in mapp:
                return mapp[y]
            else:
                return -1

        return mapfunction

    for i in category_features:
        data[i] = data[i].apply(get_mapfunction(data[i]))
    data['RainTomorrow'] = data['RainTomorrow'].apply(get_mapfunction(data['RainTomorrow']))
    print(data['Location'].unique())
    print(data['RainTomorrow'].unique())
    print(data.tail())
    # Train/test split, then fit a default XGBoost classifier.
    from sklearn.model_selection import train_test_split
    data_features = data[[x for x in data.columns if x != 'RainTomorrow']]
    data_labels = data['RainTomorrow']
    print(data_labels[:10])
    x_train, x_test, y_train, y_test = train_test_split(
        data_features, data_labels, test_size=0.2, random_state=2022)
    from xgboost.sklearn import XGBClassifier
    # Define
    cls = XGBClassifier()
    # Train
    cls.fit(x_train, y_train)
    # Predict
    train_predict = cls.predict(x_train)
    test_predict = cls.predict(x_test)
    # Evaluate
    from sklearn import metrics
    print(y_train[:10])
    print(train_predict[:10])
    print(pd.Series(y_train).value_counts())
    print(pd.Series(train_predict).value_counts())
    # BUGFIX: the original messages said "Logistic Regression" (copy-paste
    # from an earlier tutorial); the model here is XGBoost.
    print('The train accuracy of XGBoost is:', metrics.accuracy_score(y_train, train_predict))
    print('The test accuracy of XGBoost is:', metrics.accuracy_score(y_test, test_predict))
    # Confusion matrix on the test split.
    cm = metrics.confusion_matrix(y_test, test_predict)
    print(cm)
    plt.figure(figsize=(9, 6))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Prediction Labels')
    plt.ylabel('True Labels')
    plt.show()
    sns.barplot(y=data_features.columns, x=cls.feature_importances_)
    plt.show()
    # Feature-importance criteria understood by plot_importance:
    #   weight      - number of times a feature is used in splits
    #   gain        - average gain of the splits that use the feature
    #   cover       - average coverage (second-order-gradient-based) of splits
    #   total_gain  - total gain over all splits
    #   total_cover - total coverage over all splits
    from sklearn.metrics import accuracy_score
    from xgboost import plot_importance

    def estimate(model, data):
        """Plot the model's feature importance by gain, weight and cover."""
        ax1 = plot_importance(model, importance_type='gain')
        ax1.set_title('gain')
        ax2 = plot_importance(model, importance_type='weight')
        ax2.set_title('weight')
        ax3 = plot_importance(model, importance_type='cover')
        ax3.set_title('cover')
        plt.show()

    def classes(data, label, test):
        """Fit a default XGBClassifier, show importances, return predictions."""
        model = XGBClassifier()
        model.fit(data, label)
        ans = model.predict(test)
        estimate(model, data)
        return ans

    ans = classes(x_train, y_train, x_test)
    acc = accuracy_score(y_test, ans)
    print('acc=', acc)
    # Hyperparameter tuning via grid search (left commented out because it
    # is slow; the best parameters it found are hard-coded below).
    from sklearn.model_selection import GridSearchCV
    learning_rate = [0.1, 0.3, 0.6]
    subsample = [0.8, 0.9]
    colsample_bytree = [0.6, 0.8]
    max_depth = [3, 5, 8]
    # parameters = {'learning_rate': learning_rate, 'subsample': subsample,
    #               'colsample_bytree': colsample_bytree, 'max_depth': max_depth}
    # model = XGBClassifier(n_estimators=50)
    # cls = GridSearchCV(model, parameters, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
    # cls = cls.fit(x_train, y_train)
    # print(cls.best_params_)
    # Retrain with the grid-searched parameters.
    cls = XGBClassifier(colsample_bytree=0.8, learning_rate=0.1, max_depth=8, subsample=0.9)
    cls.fit(x_train, y_train)
    train_predict = cls.predict(x_train)
    test_predict = cls.predict(x_test)
    print('The train accuracy of XGBoost is:', metrics.accuracy_score(y_train, train_predict))
    print('The test accuracy of XGBoost is:', metrics.accuracy_score(y_test, test_predict))
    # Confusion matrix for the tuned model.
    cm = metrics.confusion_matrix(y_test, test_predict)
    print(cm)
    plt.figure(figsize=(9, 6))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Prediction Labels')
    plt.ylabel('True Labels')
    plt.show()
# Run the full tutorial pipeline only when executed as a script.
if __name__ == "__main__":
    demo()