5 Model Building:模型构建
划分数据集及相关函数构建
1. 加载相应库
from sklearn. model_selection import train_test_split
from sklearn. linear_model import LogisticRegression
from sklearn. metrics import confusion_matrix, accuracy_score, classification_report
from sklearn. metrics import roc_auc_score, roc_curve, scorer, f1_score
import statsmodels. api as sm
from sklearn. metrics import precision_score, recall_score
from yellowbrick. classifier import DiscriminationThreshold
2. 划分数据
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(telcom, test_size=0.25, random_state=111)

# Feature columns are every column of telcom not listed in Id_col or target_col.
_excluded_cols = Id_col + target_col
cols = [name for name in telcom.columns if name not in _excluded_cols]

# Separate the predictors from the target for each partition.
train_X, train_Y = train[cols], train[target_col]
test_X, test_Y = test[cols], test[target_col]
3. 建模函数及可视化函数构建
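3.1 建模函数变量说明
- algorithm:待训练的分类算法对象,函数中会调用其 fit、predict、predict_proba 方法
- training_x / testing_x:训练集、测试集的特征数据
- training_y / testing_y:训练集、测试集的目标变量
- cols:特征列名列表,用于与模型系数(或特征重要性)一一对应
- cf:取 'coefficients' 时读取模型的 coef_,取 'features' 时读取模型的 feature_importances_
- thredshold_plot:(推测)是否绘制判别阈值图的开关,请以函数完整代码为准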
3.2 构建函数,实现传入算法及数据集,输出模型表现指标
def telecom_churn_prediction ( algorithm, training_x, testing_x, training_y, testing_y, cols, cf, thredshold_plot) :
algorithm. fit( training_x, training_y)
preds = algorithm. predict( testing_x)
prob = algorithm. predict_proba( testing_x)
if cf == 'coefficients' :
coefficients = pd. DataFrame( algorithm. coef_. ravel( ) )
elif cf == 'features' :
coefficients = pd. DataFrame( algorithm. feature_importances_)
column_df = pd. DataFrame( cols)
coef_sumry = ( pd. merge( coefficients, column_df, left_index= True ,
right_index= True , how= 'left' ) )
coef_sumry. columns = [ 'coefficients' , 'features' ]
coef_sumry = coef_sumry. sort_values( by= 'coefficients' , ascending= False )
print ( algorithm)
print ( '\n Classification report: \n' , classification_report( testing_y, preds) )
print ( 'Accuracy Score: ' , accuracy_score( testing_y, preds) )
conf_matrix = confusion_matrix( testing_y, preds)
model_roc_auc = roc_auc_score( testing_y, preds)
print ( 'Area under curve: ' , model_roc_auc, '\n' )
fpr, tpr, thredsholds = roc_curve( testing_y, prob[ : , 1 ] )
trace1 = go. Heatmap( z= conf_matrix, x= [ 'Not Churn' , 'Churn' ] , y= [ 'Not Churn' , 'Churn' ] ,
showscale= False , colorscale= 'Picnic' , name= 'Matrix' )
trace2 = go. Scatter( x= fpr, y= tpr, name= 'Roc: ' + str ( model_roc_auc) ,
line= dict ( color= 'rgb(22,96,167)' , width= 2 ) )
trace3 = go. Scatter( x= [ 0 , 1 ] , y= [ 0 , 1 ] , line= dict ( color= ( 'rgb(205, 12, 24)' ) ,
width= 2 , dash= 'dot' ) )
trace4 = go. Bar( x= coef_sumry[ 'features' ] , y= coef_sumry[ 'coefficients' ] ,
name= 'Coefficients' ,
marker= dict ( color= coef_sumry[ 'coefficients' ] ,
colorscale= 'Picnic' ,
line= dict ( width= 0.6 , color= 'black' ) ) )
fig = tls. make_subplots( rows= 2 , cols= 2 , specs= [ [ {
} , {
} ] , [ {
'colspan' : 2 } , None ] ] ,
subplot_titles= ( 'Confusion Matrix' , 'Receiver operating characteristic' ,
'Feature Importances' ) )
fig. append_trace( trace1, 1 , 1 )
fig. append_trace( trace2, 1 , 2 )
fig. append_trace( trace3, 1 , 2 )
fig. append_trace( trace4, 2 , 1 )
fig[ 'layout' ] . update( showlegend= False , title= 'Model perfomance' , autosize= False ,
height= 900 , width= 800 ,
plot_bgcolor