【代码实战-个人笔记】随机森林/过采样/特征重要性排序/混淆矩阵/PDP部分依赖图/ICE个体条件期望图

最新推荐文章于 2024-06-24 09:58:32 发布

一大桶土豆

最新推荐文章于 2024-06-24 09:58:32 发布

阅读量612

点赞数 12

文章标签：笔记、随机森林、矩阵、数据可视化

本文链接：https://blog.csdn.net/aftern/article/details/138816949

版权

导入相关第三方库

有的库用不到，也懒得删了。

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from pdpbox import pdp, info_plots
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn import tree
import pydotplus
from sklearn.metrics import confusion_matrix
from sklearn.inspection import partial_dependence
from sklearn.inspection import PartialDependenceDisplay
import shap

解决画图中文字体显示的问题

# Global matplotlib text settings, applied in one call so that Chinese
# labels and minus signs render correctly in every figure below.
plt.rcParams.update({
    # Font fallback list: SimSun for Chinese glyphs, then Times New Roman
    'font.sans-serif': ['SimSun', 'Times New Roman'],
    # Default font size for all text
    'font.size': 10,
    # Draw minus signs as an ASCII hyphen instead of the Unicode minus glyph
    'axes.unicode_minus': False,
})

文件读取&变量设置

# Load the dataset from a CSV file (the path is elided in the original post).
df = pd.read_csv('C:/Users/.......csv')
# print(df.head())  # uncomment to inspect the first five rows

# Feature names passed to the partial-dependence plots further down.
# NOTE(review): x below keeps every column except the target; presumably the
# CSV holds exactly these columns -- confirm against the data file.
feature_list = [
    "WorkingDay",
    "Weather",
    "At_Night",
    "Plane_Alignment",
    "Curve_Length",
    "Radius",
    "Angle",
    "Slope",
    "Slope_Length",
    "Traffic_Volume",
    "Truck_Ratio",
]

# The dependent variable is the Accident_Level column; the independent
# variables are all remaining columns.
y = df['Accident_Level']
x = df.drop(columns='Accident_Level')

过采样

个别类数据量过少，需要通过过采样技术进行扩充

# Balance the classes with SMOTE, using a fixed seed so the resampled
# data set is the same on every run.
smote_seed = 0
oversampler = SMOTE(random_state=smote_seed)
resampled = oversampler.fit_resample(x, y)
# os_x / os_y are the oversampled features and labels used from here on
os_x, os_y = resampled

随机森林/训练/十折交叉验证

# One seed shared by the forest, the train/test split and the CV shuffling,
# so that repeated runs print the same numbers.
seed = 5
# Random forest classifier
rfc = RandomForestClassifier(random_state=seed)

# Split the oversampled data into training (70%) and test (30%) sets.
# NOTE(review): SMOTE was applied before this split, so synthetic rows
# interpolated from test-set samples can end up in the training set and the
# scores below are likely optimistic. Consider splitting first and
# oversampling only the training part.
xtrain, xtest, ytrain, ytest = train_test_split(os_x, os_y, test_size=0.3, random_state=seed)
# Fit the classifier on the training set
rfc = rfc.fit(xtrain, ytrain)
print("测试集准确率:", rfc.score(xtest, ytest))
print("训练集准确率:", rfc.score(xtrain, ytrain))

ypredict = rfc.predict(xtest)
mse = metrics.mean_squared_error(ytest, ypredict)
# Apply the format with %, otherwise the literal "%.4f" is printed
print('MSE: %.4f' % mse)

# Ten-fold cross-validation. Shuffle before folding: the oversampled data is
# not shuffled, so contiguous folds would have very skewed class mixes.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# cross_val_score fits a fresh clone of rfc on each fold
scores = cross_val_score(rfc, os_x, os_y, cv=kfold)
# Per-fold accuracy
for i, score in enumerate(scores, start=1):
    print("Fold {}: {:.4f}".format(i, score))
# Mean accuracy over all folds
print("Average Accuracy: {:.4f}".format(scores.mean()))

特征重要性排序

# Pair every training column with the importance score of the fitted forest
# and order the table from most to least important.
feature_names = list(os_x.columns)
feature_importances = rfc.feature_importances_
feature_importances_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values('importance', ascending=False)

# One colour per bar, sampled evenly from the viridis colour map
colors = plt.cm.viridis(np.linspace(0, 1, len(feature_names)))

# Horizontal bar chart of the sorted importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importances_df['feature'],
    feature_importances_df['importance'],
    color=colors,
)
ax.invert_yaxis()  # flip the y axis so the most important feature is on top
ax.set_xlabel('特征重要性', fontsize=12)
ax.set_title('随机森林特征重要性可视化', fontsize=16)

# Print the rounded score just to the right of each bar
for row, score in enumerate(feature_importances_df['importance']):
    ax.text(
        score + 0.01,
        row,
        str(round(score, 3)),
        va='center',
        fontname='Times New Roman',
        fontsize=10,
    )

# Hide the top and right borders of the plot area
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
# Optional styling tweaks, disabled by default:
# plt.style.use('default')
# ax.spines['left'].set_linewidth(0.5)    # left border width
# ax.spines['bottom'].set_linewidth(0.5)  # bottom border width
# ax.tick_params(width=0.5)
# ax.set_facecolor('white')               # white background
# ax.grid(False)                          # no inner grid lines

# Save the figure to disk, then display it
plt.savefig('./特征重要性.jpg', dpi=400, bbox_inches='tight')
plt.show()

效果如下：
在这里插入图片描述

混淆矩阵

# Draw the confusion matrix of the test-set predictions.
# The class labels are declared once so the matrix order and both tick axes
# cannot drift out of sync.
class_labels = [1, 2, 3]  # must match the classes present in the target
C = confusion_matrix(ytest.tolist(), ypredict.tolist(), labels=class_labels)
print(len(C))
plt.matshow(C, cmap=plt.cm.Greens)  # swap the colour map to taste

# Write each count into its cell: matrix row -> y position (true label),
# matrix column -> x position (predicted label).
for row in range(len(C)):
    for col in range(len(C)):
        plt.annotate(C[row, col], xy=(col, row), horizontalalignment='center', verticalalignment='center')

# plt.tick_params(labelsize=15)  # optional: font size of the tick labels

# Axis titles (set once; the earlier unstyled calls were overwritten anyway)
label_font = {'family': 'Times New Roman', 'size': 20}
plt.ylabel('True label', fontdict=label_font)
plt.xlabel('Predicted label', fontdict=label_font)

# Replace the numeric tick positions with the class names
tick_positions = range(len(class_labels))
tick_text = [str(label) for label in class_labels]
plt.xticks(tick_positions, labels=tick_text)
plt.yticks(tick_positions, labels=tick_text)
plt.savefig("混淆矩阵", dpi=300, bbox_inches='tight')
plt.show()

在这里插入图片描述 https://blog.csdn.net/weixin_43818631/article/details/121309660
混淆矩阵，效果如下：

PDP部分依赖图（单因素）

通过循环，绘制并保存各因素的部分依赖图

# Draw and save one partial-dependence plot per feature.
for c in feature_list:
    pdp_goals = pdp.pdp_isolate(model=rfc, dataset=os_x, model_features=feature_list, feature=c)
    pdp.pdp_plot(pdp_goals, c)
    plt.savefig(c + '.jpg', dpi=300, bbox_inches='tight')
    # plt.show()
    # Close the figure once saved; otherwise every iteration leaves another
    # open figure behind and matplotlib's memory use keeps growing.
    plt.close('all')

分了三类，所以一个因素有三个图，效果如下：
在这里插入图片描述

PDP部分依赖图（双因素）

在最重要的5个因素里两两组合，循环绘制。

# Two-feature PDP: one contour plot for every unordered pair drawn from the
# five most important features.
feature_list_most = ["Traffic_Volume", "Truck_Ratio", "Curve_Length", "Slope", "Slope_Length"]

for i, first in enumerate(feature_list_most):
    # Pair each feature only with the ones after it, so no pair repeats
    for second in feature_list_most[i + 1:]:
        features = [first, second]
        # NOTE(review): this uses the original data x, while the
        # single-feature PDP uses the oversampled os_x -- confirm which
        # data set is intended.
        inter1 = pdp.pdp_interact(model=rfc, dataset=x, model_features=feature_list, features=features)

        pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features, plot_type='contour')
        plt.savefig(features[0] + "+" + features[1] + '.jpg', dpi=300, bbox_inches='tight')
        # plt.show()
        # Close the figure once saved so open figures do not pile up
        plt.close('all')

效果如下：
在这里插入图片描述

个体条件期望图ICE

# Individual conditional expectation (ICE) plots, one per top feature.
feature_list_most = ["Traffic_Volume", "Truck_Ratio", "Curve_Length", "Slope", "Slope_Length"]
for feature_name in feature_list_most:
    # ice=True overlays the per-sample curves on the partial-dependence curve;
    # the two *_expected_value flags are passed through to shap unchanged.
    shap.plots.partial_dependence(
        feature_name,
        rfc.predict,
        x,
        ice=True,
        model_expected_value=True,
        feature_expected_value=True,
    )
    # Saving is disabled by default:
    # plt.savefig(feature_name + "+" + "ICE" + '.jpg', dpi=300, bbox_inches='tight')
    # plt.show()

效果如下：
在这里插入图片描述

一大桶土豆

关注

12
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫