Recommended reading:
Interpretable Machine Learning
Kaggle tutorial:
https://www.kaggle.com/learn/machine-learning-explainability
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline
data = pd.read_csv('./train.csv')
print(data.shape)
data.columns
train.csv can be downloaded from the Kaggle machine-learning-explainability tutorial. We start with an interpretation exercise on a classification model.
y = data.readmitted
base_features = [c for c in data.columns if c != "readmitted"]
X = data[base_features]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(n_estimators=30, random_state=1).fit(train_X, train_y)
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, top=100, feature_names=val_X.columns.tolist())
# Build a table of permutation importances, sorted in descending order
pd.concat([pd.Series(val_X.columns), pd.Series(perm.feature_importances_)], axis=1).sort_values(by=1, ascending=False)
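As a cross-check, scikit-learn ships its own implementation of the same idea. A minimal sketch (assumes scikit-learn >= 0.22; n_repeats=5 is our choice):
from sklearn.inspection import permutation_importance
# shuffle each feature in val_X several times and measure the drop in score;
# a large drop means the model leaned heavily on that feature
result = permutation_importance(my_model, val_X, val_y, n_repeats=5, random_state=1)
pd.Series(result.importances_mean, index=val_X.columns).sort_values(ascending=False)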
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
feature_name = 'number_inpatient'
# Create the data that we will plot
my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X, model_features=val_X.columns, feature=feature_name)
# plot it
pdp.pdp_plot(my_pdp, feature_name)
feature_name = 'time_in_hospital'
# Create the data that we will plot
my_pdp = pdp.pdp_isolate(model=my_model, dataset=val_X, model_features=val_X.columns, feature=feature_name)
# plot it
pdp.pdp_plot(my_pdp, feature_name)
features_to_plot = ['number_inpatient', 'time_in_hospital']
inter1 = pdp.pdp_interact(model=my_model, dataset=val_X, model_features=val_X.columns, features=features_to_plot)
pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')
plt.show()
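What pdp_isolate computes can be reproduced by hand: fix the feature at each grid value for every row, predict, and average. A minimal sketch (the helper name pdp_by_hand and the grid range are ours, not part of pdpbox):
def pdp_by_hand(model, data, feature, grid):
    # for each grid value, overwrite the feature for all rows and average
    # the predicted probability of the positive class
    averages = []
    for value in grid:
        data_copy = data.copy()
        data_copy[feature] = value
        averages.append(model.predict_proba(data_copy)[:, 1].mean())
    return pd.Series(averages, index=list(grid))
pdp_by_hand(my_model, val_X, 'number_inpatient', grid=range(0, 11)).plot()
plt.show()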
import shap # package used to calculate Shap values
sample_data_for_prediction = val_X.iloc[0].astype(float) # to test function
def patient_risk_factors(model, patient_data):
    # Create object that can calculate shap values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient_data)
    shap.initjs()
    # Index [1] selects the positive class: for a classification model the explainer
    # returns one expected value and one set of SHAP values per output probability.
    return shap.force_plot(explainer.expected_value[1], shap_values[1], patient_data)
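The helper can then be applied to the sample row prepared above:
patient_risk_factors(my_model, sample_data_for_prediction)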
explainer = shap.TreeExplainer(my_model)
# Calculate SHAP values. This is what we will plot.
# Calculate shap_values for all of val_X rather than a single row, so the plot has more data.
shap_values = explainer.shap_values(val_X)
# Make plot. Index [1] again selects the SHAP values for the positive class.
shap.summary_plot(shap_values[1], val_X)
A regression example
import xgboost
import shap
shap.initjs() # in a notebook environment, load the JS code used for visualization
X,y = shap.datasets.boston()
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 100)
X.shape
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X) # pass the feature matrix X to compute SHAP values
shap_values.shape
shap_values
# Visualize the explanation for the first prediction; pass matplotlib=True if you don't want the JS version
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])
# The base value is the mean of the model's predictions over the dataset we passed in, which we can verify ourselves:
y_base = explainer.expected_value
print(y_base)
pred = model.predict(xgboost.DMatrix(X))
print(pred.mean())
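The same additivity holds row by row (SHAP's local accuracy property): the base value plus the sum of a row's SHAP values reproduces that row's prediction. A quick check for the first row:
# base value + sum of SHAP values for row 0 should match the model's prediction for row 0
print(y_base + shap_values[0].sum())
print(pred[0])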
# hover the mouse over the plot to see the exact values
shap.force_plot(explainer.expected_value, shap_values, X)
# summarize the effects of all the features
shap.summary_plot(shap_values, X)
# Feature importance
# SHAP offers another way to compute feature importance:
# take the mean of the absolute SHAP values of each feature as that feature's importance,
# which yields a standard bar chart (a stacked bar chart for multi-class models).
shap.summary_plot(shap_values, X, plot_type="bar")
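Those bar heights are easy to reproduce, since they are just the mean absolute SHAP value per feature. A quick sketch:
# mean(|SHAP value|) per feature, sorted to match the bar chart ordering
mean_abs_shap = pd.Series(np.abs(shap_values).mean(axis=0), index=X.columns)
print(mean_abs_shap.sort_values(ascending=False))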
# Interaction values generalize SHAP values to higher-order interactions.
# The fast, exact pairwise computation returns one matrix per prediction,
# with the main effects on the diagonal and the interaction effects off the diagonal.
# These values often reveal interesting hidden relationships (interactions).
shap_interaction_values = explainer.shap_interaction_values(X)
shap.summary_plot(shap_interaction_values, X)
# dependence_plot
# To understand how a single feature affects the model's output, we can compare that
# feature's SHAP values against the feature's values across all samples in the dataset.
# Since a SHAP value represents a feature's contribution to the change in the model output,
# the plot below shows how the predicted house price (the output) changes as the feature RM varies.
# Vertical dispersion at a single RM value indicates interactions with other features; to help
# reveal them, dependence_plot automatically picks another feature for coloring. Here, coloring
# by RAD highlights that RM (the average number of rooms per house) has less impact on house
# price in areas with a high RAD value.
# create a SHAP dependence plot to show the effect of a single feature across the whole dataset
# interaction_index: "auto", None, int, or string
shap.dependence_plot(ind="RM", shap_values=shap_values, features=X,interaction_index='RAD')
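The same picture can be drawn by hand with matplotlib: scatter RM against its SHAP values and color by RAD. A minimal sketch (the column lookup and styling are our choices):
# reproduce the dependence plot manually: RM values vs. their SHAP values, colored by RAD
rm_idx = list(X.columns).index("RM")
plt.scatter(X["RM"], shap_values[:, rm_idx], c=X["RAD"], cmap="coolwarm", s=10)
plt.xlabel("RM")
plt.ylabel("SHAP value for RM")
plt.colorbar(label="RAD")
plt.show()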
# Other explainer types
# Explainers available in the SHAP library:
# deep: for deep learning models, based on the DeepLIFT algorithm
# gradient: for deep learning models; combines ideas from SHAP, Integrated Gradients, and SmoothGrad into a single expected-value equation
# kernel: model-agnostic, works with any model
# linear: for linear models whose features are independent and uncorrelated
# tree: for tree models and tree-based ensembles
# sampling: assumes feature independence; a good alternative to kernel when the background dataset you want to use is large
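For example, the model-agnostic kernel explainer needs only a prediction function and a background dataset. A minimal sketch (the kmeans summary and the 50-row subset are our choices to keep it fast, not SHAP requirements):
background = shap.kmeans(X, 10)  # summarize the background data with 10 centroids
def predict_fn(data):
    # KernelExplainer passes a numpy array; rebuild the DMatrix with feature names
    return model.predict(xgboost.DMatrix(pd.DataFrame(data, columns=X.columns)))
kernel_explainer = shap.KernelExplainer(predict_fn, background)
kernel_shap_values = kernel_explainer.shap_values(X.iloc[:50, :])  # explain a small subset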