特征工程实战2：随机森林模型用于疾病的判断工程

最新推荐文章于 2024-07-29 21:22:31 发布

每天都要被自己菜醒

最新推荐文章于 2024-07-29 21:22:31 发布

阅读量1.2k

点赞数

分类专栏：大数据 文章标签：python 深度学习 机器学习

本文链接：https://blog.csdn.net/qq_45531594/article/details/108372723

版权

大数据专栏收录该内容

36 篇文章 2 订阅

订阅专栏

x_train 和 y_train：
特征 x 和标签 y 存放在同一张表里，划分 x、y 时会用到 drop 函数：
从整张表中删去 y 所在的那一列，剩下的列就都是 x。
在这里插入图片描述

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import export_graphviz 
from sklearn.model_selection import train_test_split #训练集 ，测试集分类
#特征工程重要三个插件
import eli5 
from eli5.sklearn import PermutationImportance
import shap #对比多个/所有特征对模型起到抑制和促进
from pdpbox import pdp, info_plots
# Seed NumPy's global random generator so that repeated runs of the script
# produce the same random draws (the same idea as passing random_state).
RANDOM_SEED = 123
np.random.seed(RANDOM_SEED)

# Load the heart-disease table from disk and take a first look at it.
# The path is machine-specific; change HEART_CSV_PATH to wherever heart.csv
# lives on your system.
HEART_CSV_PATH = 'C:/Users/lb/Desktop/test/heart.csv'
dt = pd.read_csv(HEART_CSV_PATH)
dt.head()
dt.info()  # the author found no missing values in this summary

在这里插入图片描述

# Rename the columns to more readable names (the names are a free choice;
# only their order has to match the columns of the loaded table).
dt.columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]

在这里插入图片描述

# Replace the integer codes of the categorical columns with readable labels,
# so that the later one-hot encoding yields self-explanatory column names.
#
# Each column is assigned as a whole (dt[col] = ...). Chained indexing such as
# dt[col][mask] = value may write to a temporary copy: it triggers
# SettingWithCopyWarning and leaves dt unchanged under pandas copy-on-write.
_LABELS_BY_COLUMN = {
    'sex': {0: 'female', 1: 'male'},
    # Type of chest pain experienced.
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    # Fasting blood sugar relative to the 120 threshold.
    # NOTE(review): the unit in these labels is presumably meant to be mg/dl;
    # the text is kept as-is because it becomes part of the one-hot column names.
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    # Resting electrocardiogram result.
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',  # probable or definite, by Estes' criteria
    },
    # Exercise-induced angina.
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    # Slope of the peak-exercise ST segment.
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    # Thalassemia (a blood disorder).
    'thalassemia': {
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}


def _relabel(series, labels):
    """Return *series* with each code found in *labels* replaced by its label.

    Codes that have no entry in *labels* are passed through unchanged.
    """
    return series.map(lambda code: labels.get(code, code))


for _column, _labels in _LABELS_BY_COLUMN.items():
    dt[_column] = _relabel(dt[_column], _labels)

# The relabelled columns now hold strings. Make every one of them explicitly
# of dtype 'object' so pandas treats them as categorical text (not numbers)
# when they are one-hot encoded afterwards.
for column_name in (
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'st_slope',
    'thalassemia',
):
    dt[column_name] = dt[column_name].astype('object')

在这里插入图片描述

# One-hot encode the table with pandas.
# (Passing drop_first=True would drop the first indicator column of each
# encoded feature; it is not used here, so every category keeps its column.)
dt = pd.get_dummies(data=dt)
dt.head()

在这里插入图片描述

# Split into training and test sets: every column except 'target' is a
# feature, 'target' is the label, and 20% of the rows are held out.
# random_state fixes the shuffle, so the same rows end up in the test set on
# every run (e.g. with 100 rows: always the same 80 for training and the same
# 20 for testing). The author notes that in practice a larger number than
# 0 or 1 is typically chosen, e.g. 10.
features = dt.drop('target', axis=1)
labels = dt['target']
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=10
)

# Fit a random forest whose trees are limited to a depth of 5.
model = RandomForestClassifier(max_depth=5)
model.fit(x_train, y_train)

# Author's notes on RandomForestClassifier parameters (defaults quoted here
# may differ between scikit-learn versions -- check the documentation):
#   bootstrap=True          draw bootstrap samples (sampling with replacement)
#                           when building each tree.
#   class_weight='balanced' weight each class by
#                           n_samples / (n_classes * np.bincount(y)).
#   criterion               split quality measure: 'gini' (default) or 'entropy'.
#   max_depth=5             maximum depth of each tree.
#   max_features            number of features considered for the best split:
#                             int   -> that many features
#                             float -> that fraction of the features
#                             'auto' / 'sqrt' -> sqrt(n_features)
#                             'log2' -> log2(n_features)
#                             None  -> n_features
#   max_leaf_nodes=None     maximum number of leaf nodes.
#   min_impurity_decrease=0 minimum impurity decrease required for a split.
#   n_estimators            number of trees in the forest.
#   n_jobs                  number of parallel jobs; -1 uses all processors.
#   oob_score=False         whether to estimate accuracy on out-of-bag samples.
#   random_state=None       random seed; fix it to get identical reruns.
#   verbose                 controls how much is logged while fitting/predicting.
#   warm_start=False        if True, reuse the previous fit and add more trees;
#                           otherwise fit a whole new forest.

print(model)  # show the forest's parameters

在这里插入图片描述

# Export one tree of the forest as a Graphviz DOT file (Graphviz itself has
# to be installed separately to render it).

# export_graphviz draws a single decision tree, so take one fitted tree out
# of the forest; which tree is shown is an arbitrary choice.
estimator = model.estimators_[0]

# Feature names, in the column order the model was trained on.
feature_names = list(x_train.columns)

# One display name per class, in the order of model.classes_
# (target 0 = no heart disease, target 1 = heart disease).
class_names = ['disease' if label == 1 else 'no disease' for label in model.classes_]

export_graphviz(estimator, out_file='tree.dot',
                feature_names=feature_names,  # label splits with feature names
                class_names=class_names,      # label nodes with the class name
                rounded=True,                 # draw nodes as rounded boxes
                proportion=True,              # show proportions instead of counts
                label='root',                 # print informative labels on the root node only
                precision=2,                  # decimals for impurity/threshold/value
                filled=True)                  # colour nodes by majority class

# Convert the DOT file to a PNG with the Graphviz command-line tool:
#   dot        the Graphviz layout/rendering command
#   -Tpng      output format is PNG
#   tree.dot   input file written by export_graphviz
#   -o tree.png  output file name
#   -Gdpi=600  render at 600 dots per inch
from subprocess import call
exit_status = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
if exit_status != 0:
    # Fail loudly instead of going on to display a missing or stale tree.png.
    raise RuntimeError("Graphviz 'dot' exited with status %d; tree.png was not rendered" % exit_status)

# Display the rendered image in the Jupyter notebook.
from IPython.display import Image
Image(filename = 'tree.png')

在这里插入图片描述
重点是：从图中可以查看每个节点的类别概率和分类结果。

训练完模型之后可以使用三个插件

# Permutation importance of the fitted forest, measured on the held-out test
# set; random_state=1 keeps the shuffling (and hence the result) the same
# from run to run.
perm = PermutationImportance(model, random_state=1).fit(x_test, y_test)
# Show the importance weight of every feature, labelled with its column name.
eli5.show_weights(perm, feature_names=x_test.columns.tolist())
# Author's reading of the table: the chest-pain feature in the first row has
# a high weight, i.e. it matters a lot for predicting heart disease, while
# non-anginal pain and normal thalassemia in the middle rows appear to be
# unrelated to it.

在这里插入图片描述

看一下别的因素

# Partial dependence of the model's prediction on a single feature.
feat_name = 'age'
# All feature columns the model was trained on (everything except 'target').
base_features = x_test.columns.tolist()
# NOTE(review): pdp_isolate / pdp_plot look like the older pdpbox API --
# confirm the installed pdpbox version still provides them.
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features=base_features, feature=feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
# Author's reading of the plot: the predicted risk of heart disease falls as
# age increases, except for ages 37 to 42, which show a higher risk.

在这里插入图片描述

# SHAP summary plot over the whole test set.
# Author's reading: for chest_pain_type the colour runs from blue to red as
# the value grows, i.e. the condition becomes more severe.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)
# Index 1 selects the SHAP values of the second class.
# NOTE(review): presumably the "disease" class (target == 1); this assumes
# shap_values is indexed per class -- confirm for the installed shap version.
shap.summary_plot(shap_values[1], x_test)

在这里插入图片描述

def heart_disease_risk_factors(model, patient):
    """Return a SHAP force plot explaining the model's output for one patient.

    Args:
        model: fitted tree-based model handed to shap.TreeExplainer.
        patient: feature values of the patient to explain.

    Returns:
        Whatever shap.force_plot returns for index 1 of the explainer's
        expected values and SHAP values (presumably the "disease" class --
        confirm against the model's class order).
    """
    tree_explainer = shap.TreeExplainer(model)
    patient_shap_values = tree_explainer.shap_values(patient)
    # Load SHAP's JavaScript so the interactive plot can render in the notebook.
    shap.initjs()
    base_value = tree_explainer.expected_value[1]
    contributions = patient_shap_values[1]
    return shap.force_plot(base_value, contributions, patient)

# Take one test sample -- the row at position 1, i.e. the second row -- and
# cast all of its feature values to float before explaining it.
data_for_prediction = x_test.iloc[1, :].astype(float)
heart_disease_risk_factors(model, data_for_prediction)
# Author's reading of the force plot: the red chest_pain_type = 2 (atypical
# angina) contributes strongly in one direction, while the blue
# num_major_vessels = 1 pushes the other way; the fewer major vessels, the
# stronger the push towards heart disease.