Partial Dependence Plots: Notes and Applications
Definition
A partial dependence plot (PDP, or PD plot) shows the marginal effect that one or two features have on the predicted outcome of a machine learning model ([[Friedman 2001.pdf]]). Its purpose: revealing whether the relationship between a feature and the prediction is linear, monotonic, or more complex.
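For a quick look before the theory, scikit-learn can draw a one-feature PDP directly. A minimal sketch, assuming the toy diabetes dataset and a gradient-boosting model (both illustrative choices, not from this note):

# Minimal sketch: draw a one-feature PDP with scikit-learn.
# The dataset and model here are illustrative, not from this note.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

X, y = load_diabetes(return_X_y=True, as_frame=True)
model = GradientBoostingRegressor().fit(X, y)

# 'bmi' is one feature of the diabetes dataset; the curve shows how the
# average prediction changes as bmi varies while the other features are
# held at their observed values.
PartialDependenceDisplay.from_estimator(model, X, features=["bmi"])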
Principle
$$\hat{f}_S(x_S) = E_{X_C}\left[\hat{f}(x_S, X_C)\right] = \int \hat{f}(x_S, X_C)\, dP(X_C)$$
where $x_S \cup X_C = X$: $X$ is the set of all features, $x_S$ is the feature (or features) whose relationship with the prediction we want to examine, and $X_C$ is the complement set of remaining features. $\hat{f}_S$ is computed as an average over the training set:
$$\hat{f}_S(x_S) = \frac{1}{n}\sum_{i=1}^{n} \hat{f}\left(x_S, x_C^{(i)}\right)$$

(a Monte Carlo estimate: the integral over $P(X_C)$ is approximated by averaging the predictions over the $n$ observed samples $x_C^{(i)}$)
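This estimator is easy to implement by hand: fix the feature of interest at each grid value in turn, keep every other column at its observed values, predict, and average. A minimal sketch (manual_pdp is a hypothetical helper; model and X stand for any fitted model and its feature DataFrame):

import numpy as np
import pandas as pd

def manual_pdp(model, X, feature, grid):
    '''Monte Carlo PDP estimate: for each grid value v, set X[feature] = v
    for every row, predict, and average the predictions.'''
    averages = []
    for v in grid:
        X_mod = X.copy()
        X_mod[feature] = v  # x_S fixed to v; each x_C^(i) kept as observed
        averages.append(model.predict(X_mod).mean())
    return np.asarray(averages)

# Example grid over the observed range of the feature:
# grid = np.linspace(X['bmi'].min(), X['bmi'].max(), 50)
# pd_curve = manual_pdp(model, X, 'bmi', grid)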
Limitation
The Monte Carlo average above implicitly assumes that the features in $S$ are independent of those in $C$. When they are correlated, the estimator evaluates the model at combinations $(x_S, x_C^{(i)})$ that may never occur in real data, so the resulting curve can be misleading. A PDP also shows only the average effect, which can mask heterogeneous effects across individual samples; individual conditional expectation (ICE) curves make this heterogeneity visible.
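One quick check for masked heterogeneity is to overlay ICE curves on the average PDP. A minimal sketch, reusing the illustrative model and X from the definition example above (not objects defined in this note):

from sklearn.inspection import PartialDependenceDisplay

# kind='both' overlays one ICE curve per sample on top of the average PDP
# line; widely diverging ICE curves indicate effects that the average hides.
PartialDependenceDisplay.from_estimator(model, X, features=["bmi"], kind="both")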
Application
Code: using XGBoost + PDP to plot and analyze feature-prediction relationships
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.inspection import partial_dependence
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split as TTS
from scipy.interpolate import splev, splrep
def create_subplot_figure(num_plots, fixColsNum=3):
    '''
    Create a figure with a fixed number of columns.
    Args:
        num_plots (int): total number of axes needed
        fixColsNum (int): fixed number of columns
    Returns:
        (plt.Figure)
        (np.ndarray of plt.Axes): flattened to a 1-D array
    '''
    # Ceiling division so that all num_plots axes fit.
    num_rows = (num_plots + fixColsNum - 1) // fixColsNum
    num_cols = min(num_plots, fixColsNum)
    # squeeze=False guarantees a 2-D array even for a single subplot.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(6 * num_cols, 6 * num_rows), squeeze=False)
    axes = axes.flatten()
    # Delete the unused trailing axes.
    for i in range(num_plots, num_rows * num_cols):
        fig.delaxes(axes[i])
    return fig, axes
def plot_pdp_feature_importance(model, test, x):
    '''
    Args:
        model: machine learning model
        test (pandas.DataFrame): test dataset
        x (list): feature list used by the model
    Returns:
        (pandas.DataFrame): feature-importance table, sorted in descending order
        (plt.Figure): the resulting figure
    '''
    featureImportance = pd.DataFrame({'feature': x, 'feature_importance': model.feature_importances_}) \
        .sort_values('feature_importance', ascending=False).reset_index(drop=True)
    features = featureImportance['feature'].values.tolist()
    print(featureImportance)
    featuresNum = len(features)
    fig, axs = create_subplot_figure(featuresNum + 1)
    for i in range(featuresNum):
        # kind='both' returns the averaged PDP and the per-sample ICE curves.
        pdp = partial_dependence(model, test[x], [features[i]], kind='both', grid_resolution=50)
        sns.set_theme(style='ticks', palette='deep', font_scale=1.1)
        # On scikit-learn >= 1.3 use pdp['grid_values'] instead of pdp['values'].
        plot_x = pd.Series(pdp['values'][0]).rename('x')
        plot_i = pdp['individual']
        plot_y = pdp['average'][0]
        # Smooth the average curve with a B-spline before plotting.
        tck = splrep(plot_x, plot_y, s=30)
        xnew = np.linspace(plot_x.min(), plot_x.max(), 300)
        ynew = splev(xnew, tck, der=0)
        # Stack all ICE curves into one long DataFrame; seaborn's lineplot
        # aggregates them into a mean line with a confidence band.
        frames = []
        for a in plot_i[0]:
            frames.append(pd.concat([plot_x, pd.Series(a).rename('y')], axis=1))
        plot_df = pd.concat(frames, axis=0)
        sns.lineplot(data=plot_df, x='x', y='y', color='k', linewidth=1.5, linestyle='--', alpha=0.6, ax=axs[i])
        axs[i].plot(xnew, ynew, linewidth=2)
        # Rug plot marks the observed feature values (at most 200 sampled rows).
        sns.rugplot(data=test.sample(min(200, test.shape[0])), x=features[i], height=.05, color='k', alpha=0.3, ax=axs[i])
        x_min = plot_x.min() - (plot_x.max() - plot_x.min()) * 0.1
        x_max = plot_x.max() + (plot_x.max() - plot_x.min()) * 0.1
        axs[i].set_xlim(x_min, x_max)
        axs[i].set_title(features[i], fontsize=12)
        axs[i].set_ylabel('Partial Dependence', fontsize=10)
    # Last axis: horizontal bar chart of feature importances, labeled with values.
    sns.barplot(y=featureImportance['feature'], x=featureImportance['feature_importance'], ax=axs[featuresNum])
    for patch in axs[featuresNum].patches:
        _x = patch.get_x() + patch.get_width()
        _y = patch.get_y() + 0.5 * patch.get_height()
        value = '{:.3f}'.format(patch.get_width())
        axs[featuresNum].text(_x, _y, value, ha='left', fontsize=8)
    axs[featuresNum].set_title('Feature importance', fontsize=10)
    return featureImportance, fig
def xgboostPDP(data, x, y, type='r', test_size=0.3, **kwargs):
    '''
    Runs three steps: 1) split train/test sets; 2) fit an xgboost model; 3) draw the PDP figure.
    Args:
        data (pandas.DataFrame): the dataset
        x (list): feature list used by the model
        y (list): label column
        type (string): 'r' for a regression model, 'c' for a classification model
        test_size (float): proportion of the test set
        **kwargs: args for XGBRegressor or XGBClassifier, using "??XGBRegressor" for more info
    Returns:
        (xgboost.XGBRegressor or xgboost.XGBClassifier): xgboost model
        (pandas.DataFrame): the test dataset
        (pandas.DataFrame): the train dataset
        (pandas.DataFrame): feature-importance table, sorted in descending order
        (matplotlib.figure.Figure): the PDP figure
    '''
    data = data[x + y].dropna(how='any')
    print('The shape of data source: {}'.format(data.shape))
    # train_test_split returns (train, test) in that order.
    train, test = TTS(data, test_size=test_size)
    if type == 'r':
        clf = XGBRegressor(**kwargs).fit(train[x], train[y])
    elif type == 'c':
        clf = XGBClassifier(**kwargs).fit(train[x], train[y])
    test['y_pred'] = clf.predict(test[x])
    print('r2: {:.6f}'.format(r2_score(test[y], test['y_pred'])))
    print('mae: {:.6f}'.format(mean_absolute_error(test[y], test['y_pred'])))
    print('mse: {:.6f}'.format(mean_squared_error(test[y], test['y_pred'])))
    if type == 'c':
        # AUC needs class probabilities, not hard labels.
        print('auc: {:.6f}'.format(roc_auc_score(test[y], clf.predict_proba(test[x])[:, 1])))
    featureImportance, fig = plot_pdp_feature_importance(clf, test, x)
    return clf, test, train, featureImportance, fig
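A hedged usage example; the synthetic DataFrame and the column names f1, f2, f3, target are invented for illustration, not data from this note:

# Illustrative call with synthetic data; replace with your own DataFrame.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'f1': rng.normal(size=500),
    'f2': rng.uniform(size=500),
    'f3': rng.normal(size=500),
})
df['target'] = 2 * df['f1'] + np.sin(3 * df['f2']) + rng.normal(scale=0.1, size=500)

clf, test, train, fi, fig = xgboostPDP(
    df, x=['f1', 'f2', 'f3'], y=['target'],
    type='r', test_size=0.3, n_estimators=200, max_depth=3,
)
fig.savefig('pdp.png', dpi=150)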
Appendix