导入相关第三方库
有的库用不到,也懒得删了。
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from pdpbox import pdp, info_plots
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn import tree
import pydotplus
from sklearn.metrics import confusion_matrix
from sklearn.inspection import partial_dependence
from sklearn.inspection import PartialDependenceDisplay
import shap
解决画图中文字体显示的问题
# Matplotlib defaults for the figures drawn later in this script.
plt.rcParams.update({
    'font.sans-serif': ['SimSun', 'Times New Roman'],  # font family list (SimSun covers the Chinese labels)
    'font.size': 10,  # base font size
    'axes.unicode_minus': False,  # draw the minus sign as an ASCII hyphen
})
文件读取&变量设置
# Load the dataset from CSV.
df = pd.read_csv('C:/Users/.......csv')

# Column names handed to the PDP calls later in the file as `model_features`.
feature_list = [
    "WorkingDay",
    "Weather",
    "At_Night",
    "Plane_Alignment",
    "Curve_Length",
    "Radius",
    "Angle",
    "Slope",
    "Slope_Length",
    "Traffic_Volume",
    "Truck_Ratio",
]

# Target is the 'Accident_Level' column; the predictors are every other column of df.
y = df['Accident_Level']
x = df.drop(columns='Accident_Level')
过采样
个别类数据量过少,需要通过过采样技术进行扩充
# Oversample with SMOTE; the seed is fixed so the resampled set is reproducible.
# NOTE(review): resampling happens before the train/test split further down, so
# synthetic rows derived from test-set neighbours may end up in the training
# split - confirm this is acceptable for the reported accuracy.
oversampler = SMOTE(random_state=0)
os_x, os_y = oversampler.fit_resample(X=x, y=y)
随机森林/训练/十折交叉验证
# Random forest classifier with scikit-learn's default hyper-parameters.
rfc = RandomForestClassifier()
seed = 5

# Hold out 30% of the oversampled data as the test set.
xtrain, xtest, ytrain, ytest = train_test_split(os_x, os_y, test_size=0.3, random_state=seed)

# Fit on the training split and report accuracy on both splits.
rfc = rfc.fit(xtrain, ytrain)
print("测试集准确率:", rfc.score(xtest, ytest))
print("训练集准确率:", rfc.score(xtrain, ytrain))

# Mean squared error between the true and predicted class labels of the test split.
ypredict = rfc.predict(xtest)
mse = metrics.mean_squared_error(ytest, ypredict)
# Apply the format spec with `%`; passing `mse` as a second print() argument
# would print the literal template followed by the unformatted value.
print('MSE: %.4f' % mse)

# Ten-fold cross-validation on the full oversampled set.
# Shuffle before splitting: without it each fold is a contiguous slice of the
# data, and the SMOTE output is presumably ordered (original rows first,
# synthetic rows after - confirm against imbalanced-learn), which would give
# strongly class-skewed folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
scores = cross_val_score(rfc, os_x, os_y, cv=kfold)

# Per-fold accuracy, then the mean over all folds.
for i, score in enumerate(scores):
    print("Fold {}: {:.4f}".format(i + 1, score))
print("Average Accuracy: {:.4f}".format(scores.mean()))
特征重要性排序
# Pair every column name with the importance score of the fitted forest and
# rank the pairs from most to least important.
feature_names = list(os_x.columns)
feature_importances = rfc.feature_importances_
feature_importances_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values('importance', ascending=False)

# One viridis colour per bar.
colors = plt.cm.viridis(np.linspace(0, 1, len(feature_names)))

# Horizontal bar chart of the ranked importances.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances_df['feature'], feature_importances_df['importance'], color=colors)
ax.invert_yaxis()  # put the most important feature at the top
ax.set_xlabel('特征重要性', fontsize=12)
ax.set_title('随机森林特征重要性可视化', fontsize=16)

# Write the rounded score just to the right of each bar.
for row, value in enumerate(feature_importances_df['importance']):
    ax.text(value + 0.01, row, str(round(value, 3)), va='center', fontname='Times New Roman', fontsize=10)

# Hide the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Save the chart, then display it.
plt.savefig('./特征重要性.jpg', dpi=400, bbox_inches='tight')
plt.show()
效果如下:
混淆矩阵
# Confusion matrix of the test-split predictions.
# The class labels are defined once so that the matrix order and the tick
# labels below cannot drift apart.
class_labels = [1, 2, 3]
C = confusion_matrix(ytest.tolist(), ypredict.tolist(), labels=class_labels)
print(len(C))

# Draw the matrix as a colour map (change the colormap to taste).
plt.matshow(C, cmap=plt.cm.Greens)

# Write each count at the centre of its cell; x is the column (predicted
# label) and y is the row (true label).
for (row, col), count in np.ndenumerate(C):
    plt.annotate(count, xy=(col, row), horizontalalignment='center', verticalalignment='center')

# Axis titles (set once, with the final font settings).
label_font = {'family': 'Times New Roman', 'size': 20}
plt.ylabel('True label', fontdict=label_font)
plt.xlabel('Predicted label', fontdict=label_font)

# Replace the numeric tick positions with the class labels.
tick_positions = range(len(class_labels))
tick_text = [str(label) for label in class_labels]
plt.xticks(tick_positions, labels=tick_text)
plt.yticks(tick_positions, labels=tick_text)

plt.savefig("混淆矩阵", dpi=300, bbox_inches='tight')
plt.show()
https://blog.csdn.net/weixin_43818631/article/details/121309660
混淆矩阵,效果如下:
PDP部分依赖图(单因素)
通过循环,绘制并保存各因素的部分依赖图
# Draw and save one partial dependence plot per feature in feature_list.
for feature in feature_list:
    pdp_goals = pdp.pdp_isolate(model=rfc, dataset=os_x, model_features=feature_list, feature=feature)
    pdp.pdp_plot(pdp_goals, feature)
    plt.savefig(feature + '.jpg', dpi=300, bbox_inches='tight')
    # Close the saved figure so open figures do not pile up across iterations
    # (matplotlib warns once more than 20 are open at the same time).
    plt.close()
分了三类,所以一个因素有三个图,效果如下:
PDP部分依赖图(双因素)
在最重要的5个因素里两两组合,循环绘制。
# Two-feature partial dependence plots for every unordered pair of the listed features.
feature_list_most = ["Traffic_Volume", "Truck_Ratio", "Curve_Length", "Slope", "Slope_Length"]
for i, first in enumerate(feature_list_most):
    # Pair each feature only with the ones after it, so every pair is drawn once.
    for second in feature_list_most[i + 1:]:
        features = [first, second]
        # NOTE(review): this uses the pre-oversampling data `x`, while the
        # single-feature loop uses `os_x` - confirm the difference is intended.
        inter1 = pdp.pdp_interact(model=rfc, dataset=x, model_features=feature_list, features=features)
        pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features, plot_type='contour')
        plt.savefig(features[0] + "+" + features[1] + '.jpg', dpi=300, bbox_inches='tight')
        # Close the saved figure so open figures do not pile up across iterations.
        plt.close()
效果如下:
个体条件期望图ICE
# Individual conditional expectation (ICE) plots, one per listed feature,
# computed from rfc.predict on the pre-oversampling data `x`.
feature_list_most = ["Traffic_Volume", "Truck_Ratio", "Curve_Length", "Slope", "Slope_Length"]
for feature_name in feature_list_most:
    shap.plots.partial_dependence(
        feature_name,
        rfc.predict,
        x,
        ice=True,
        model_expected_value=True,
        feature_expected_value=True,
    )
效果如下: