运行环境:Windows 10,Anaconda,Python 3.7.3,scikit-learn 0.20.3。
《跟着迪哥学Python数据分析与机器学习实战》一书中的信用卡欺诈检测案例,书中的源代码需要做如下改动才能正确执行(标红的代码是需要额外添加和修改的):
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
def printing_Kfold_scores(x_train_data,y_train_data):
    """Pick the regularisation strength C with the best cross-validated recall.

    For every candidate C, an L1-penalised logistic regression is fitted on
    each training split of a 5-fold, unshuffled ``KFold`` and scored with
    ``recall_score`` on the matching held-out split. Per-fold recall and the
    mean recall for each C are printed as the search runs.

    Args:
        x_train_data: Feature table. It is indexed with ``.iloc[rows, :]``,
            so it must support 2-D positional indexing (e.g. a DataFrame).
        y_train_data: Label table with rows aligned to ``x_train_data``. It
            is also indexed with ``.iloc[rows, :]``, so a DataFrame (not a
            Series) is expected.

    Returns:
        The candidate C whose mean recall over the folds is highest; on a
        tie the earliest candidate in the list is returned.
    """
    fold = KFold(5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C. (range(n, 2) would be empty for n >= 2, so the
    # index is built with a plain range(n).)
    results_table = pd.DataFrame(
        index=range(len(c_param_range)),
        columns=['C_parameter', 'Mean recall score'],
    )
    results_table['C_parameter'] = c_param_range

    for row, c_param in enumerate(c_param_range):
        print('----------------------------------------')
        print('正则化惩罚力度:',c_param)
        print('----------------------------------------')
        print('')

        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(fold.split(x_train_data)):
            # The solver is named explicitly: L1 is not supported by every
            # solver, and the library default has changed between releases.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Plain ndarrays are used for both fit and predict so the model
            # sees the same kind of input in both calls.
            lr.fit(
                x_train_data.iloc[train_idx, :].values,
                y_train_data.iloc[train_idx, :].values.ravel(),
            )
            y_pred_undersample = lr.predict(x_train_data.iloc[test_idx, :].values)

            recall_acc = recall_score(
                y_train_data.iloc[test_idx, :].values.ravel(),
                y_pred_undersample,
            )
            recall_accs.append(recall_acc)
            print('Iteration',iteration,':召回率=',recall_acc)

        mean_recall = np.mean(recall_accs)
        results_table.loc[row, 'Mean recall score'] = mean_recall
        print('')
        print('平均召回率',mean_recall)
        print('')

    # The score column has object dtype, so cast before idxmax. Full float
    # precision is kept so close scores are not collapsed into a tie.
    best_row = results_table['Mean recall score'].astype(float).idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    print('****************************************')
    print('效果最好的模型所选参数= ',best_c)
    print('****************************************')
    return best_c
# Run the K-fold search for C on X_train_undersample / y_train_undersample
# (both defined elsewhere) and keep the selected value.
best_c = printing_Kfold_scores(
    X_train_undersample,
    y_train_undersample,
)