观察数据
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
导入数据并查看前5行
# Load the credit-card transactions CSV and preview its first five rows.
data=pd.read_csv('creditcard.csv')
data.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
数据有31列:Time、V1-V28、Amount和Class,注意到最后一列Class,这是我们的label值,0代表正常数据,1代表欺诈数据。首先习惯性地画个图观察一下欺诈数据的分布。
# Count how many rows carry each 'Class' value and order the result by class
# label so the bars are drawn as 0, 1.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0,0.5,'Frequency')
可以看到Class=0的数据大概有28W,欺诈数据Class=1极少,极度不均匀的分布状态。
通常有两种处理方法:
- 过采样(让1变得和0一样多);
- 下采样(在0中取出部分数据,数量与1一致)
标准化
在特征数据中,Amount与其他特征数据的取值范围相比,太大了,应该是还没有标准化。
所以,需要先对这一列进行标准化:
# Bring in scikit-learn's standardization transformer.
from sklearn.preprocessing import StandardScaler

# Standardize the raw 'Amount' values and store them as a new 'normAmount'
# column. The scaler expects 2-D input, hence the reshape to one column.
amount_column = data['Amount'].values.reshape(-1, 1)
amount_scaler = StandardScaler()
data['normAmount'] = amount_scaler.fit_transform(amount_column)

# 'Time' and the unscaled 'Amount' are not used as features from here on.
data = data.drop(columns=['Time', 'Amount'])
data.head()
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Class | normAmount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 0 | 0.244964 |
1 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 0 | -0.342475 |
2 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 0 | 1.160686 |
3 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 0 | 0.140534 |
4 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 0 | -0.073403 |
5 rows × 30 columns
这个时候所有特征数据都已经完成了标准化的操作。
随机下采样
我们先进行下采样。现在,分别取出特征和标签:
# Feature frame: every column except the 'Class' label.
X = data.drop(columns='Class')
# Label frame: only the 'Class' column, kept as a one-column DataFrame.
y = data[['Class']]
为了保证拿到的是数据的原始分布,我们采用的是随机的下采样:
# Random undersampling: keep every fraud row (Class == 1) and pair it with an
# equally sized random sample of normal rows (Class == 0).
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)

# Index labels of all normal rows.
normal_indices = data[data.Class == 0].index

# Draw as many normal index labels as there are fraud rows;
# replace=False guarantees no label is drawn twice.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Combined index labels of the balanced sample.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The values collected above are index *labels* (taken from `.index`), so the
# rows must be selected with the label-based `.loc`. The positional `.iloc`
# only happens to agree when the frame still has its default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

# Split the undersampled frame into features and label.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Report the class balance of the undersampled data.
print("Percentage of normal transactions(正常交易百分比):",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactioms(欺诈交易百分比):",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data:(下采样数据的交易次数总数)",
      len(under_sample_data))
Percentage of normal transactions(正常交易百分比): 0.5
Percentage of fraud transactioms(欺诈交易百分比): 0.5
Total number of transactions in resampled data:(下采样数据的交易次数总数) 984
数据切分
将数据集切分为训练集和测试集:
# Splitting helper from scikit-learn's model-selection module.
from sklearn.model_selection import train_test_split

# 70/30 split of the full dataset; random_state=0 keeps the split
# reproducible so later parameter tuning is comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# The same 70/30 split, with the same seed, applied to the undersampled data.
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)

# Print the sizes of both splits, separated by a dashed line.
split_pairs = ((X_train, X_test), (X_train_undersample, X_test_undersample))
for pair_position, (train_part, test_part) in enumerate(split_pairs):
    if pair_position > 0:
        print("--------------------------------------------------------------------")
    print("Number transactions train dataset(训练数据集交易数量):",
          len(train_part))
    print("Number transactions test dataset(测试数据集交易数量):",
          len(test_part))
    print("Total number of transactions(总数据交易数量):",
          len(train_part) + len(test_part))
Number transactions train dataset(训练数据集交易数量): 199364
Number transactions test dataset(测试数据集交易数量): 85443
Total number of transactions(总数据交易数量): 284807
--------------------------------------------------------------------
Number transactions train dataset(训练数据集交易数量): 688
Number transactions test dataset(测试数据集交易数量): 296
Total number of transactions(总数据交易数量): 984
模型评估
在建模之前,我们还要先考虑一下:选定哪些参数,用什么作为评估标准?
TP(True Positive):被判定为正样本,事实上也是正样本。
TN(True Negative):被判定为负样本,事实上也是负样本。
FP(False Positive):被判定为正样本,但事实上是负样本。
FN(False Negative):被判定为负样本,但事实上是正样本。
由于我们是要尽可能将所有信用卡欺诈的数据找出来,所以有个很重要的衡量标准:
召回率:Recall = TP/(TP+FN)
假设1000条信用卡数据中,有10条是欺诈数据,召回率有别于准确率,它关注的目标就是这10条数据,找出3条,那么召回率为0.3。
建模
接下来就是建模了,很多时候我们也不知道参数设置为多少比较合适,所以最好的办法是写一个脚本让机器分别去跑,我们根据各个模型的结果再做选择,比较省心。
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report
def printing_Kfold_scores(x_train_data, y_train_data,
                          c_param_range=(0.01, 0.1, 1, 10, 100), n_splits=5):
    """Choose the logistic-regression C with the best mean cross-validated recall.

    For every candidate C an L1-penalized LogisticRegression is trained and
    scored with (unshuffled) K-fold cross-validation. The recall of each fold
    and the mean recall per C are printed as the search proceeds.

    Parameters:
        x_train_data: feature DataFrame; rows are selected with ``.iloc``.
        y_train_data: label DataFrame; it is indexed on two axes with
            ``.iloc`` and flattened with ``ravel()`` before fitting.
        c_param_range: candidate values for C (inverse regularization
            strength). Defaults to the original hard-coded grid.
        n_splits: number of cross-validation folds (default 5).

    Returns:
        The C value with the highest mean recall, as a float. On ties the
        first such value in ``c_param_range`` wins.

    Raises:
        ValueError: if ``c_param_range`` is empty.
    """
    c_values = list(c_param_range)
    if not c_values:
        raise ValueError('c_param_range must contain at least one value')

    fold = KFold(n_splits, shuffle=False)
    mean_recalls = []

    # Outer loop: one pass per regularization strength.
    for c_param in c_values:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        # Inner loop: one pass per (train, validation) fold.
        for iteration, (train_idx, test_idx) in enumerate(fold.split(y_train_data)):
            # The solver is named explicitly: the L1 penalty is not supported
            # by 'lbfgs', the default solver since scikit-learn 0.22.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(x_train_data.iloc[train_idx, :],
                   y_train_data.iloc[train_idx, :].values.ravel())

            # Predict on a DataFrame slice, matching the input type used in fit.
            y_pred_fold = lr.predict(x_train_data.iloc[test_idx, :])

            recall_acc = recall_score(y_train_data.iloc[test_idx, :].values, y_pred_fold)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall score = ', recall_acc)

        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # Both columns are forced to float so the returned C is always numeric.
    results_table = pd.DataFrame(
        {'C_parameter': c_values, 'Mean recall score': mean_recalls},
        dtype=float,
    )
    best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter']

    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c
# Search for the best C using the undersampled training split.
best_c = printing_Kfold_scores(X_train_undersample,y_train_undersample)
-------------------------------------------
C parameter: 0.01
-------------------------------------------
Iteration 0 : recall score = 0.9726027397260274
Iteration 1 : recall score = 0.9315068493150684
Iteration 2 : recall score = 1.0
Iteration 3 : recall score = 0.972972972972973
Iteration 4 : recall score = 0.9848484848484849
Mean recall score 0.9723862093725106
-------------------------------------------
C parameter: 0.1
-------------------------------------------
Iteration 0 : recall score = 0.8356164383561644
Iteration 1 : recall score = 0.863013698630137
Iteration 2 : recall score = 0.9322033898305084
Iteration 3 : recall score = 0.9324324324324325
Iteration 4 : recall score = 0.8939393939393939
Mean recall score 0.8914410706377272
-------------------------------------------
C parameter: 1
-------------------------------------------
Iteration 0 : recall score = 0.8356164383561644
Iteration 1 : recall score = 0.863013698630137
Iteration 2 : recall score = 0.9830508474576272
Iteration 3 : recall score = 0.9459459459459459
Iteration 4 : recall score = 0.9090909090909091
Mean recall score 0.9073435678961568
-------------------------------------------
C parameter: 10
-------------------------------------------
Iteration 0 : recall score = 0.8493150684931506
Iteration 1 : recall score = 0.863013698630137
Iteration 2 : recall score = 0.9830508474576272
Iteration 3 : recall score = 0.9459459459459459
Iteration 4 : recall score = 0.9090909090909091
Mean recall score 0.9100832939235539
-------------------------------------------
C parameter: 100
-------------------------------------------
Iteration 0 : recall score = 0.8493150684931506
Iteration 1 : recall score = 0.863013698630137
Iteration 2 : recall score = 0.9830508474576272
Iteration 3 : recall score = 0.9459459459459459
Iteration 4 : recall score = 0.9090909090909091
Mean recall score 0.9100832939235539
*********************************************************************************
Best model to choose from cross validation is with C parameter = 0.01
*********************************************************************************
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters:
        cm: 2-D array of counts; it is indexed as ``cm[i, j]`` and must
            provide ``.shape`` and ``.max()``.
        classes: tick labels, used for both the x and the y axis.
        title: title of the plot.
        cmap: matplotlib colormap used for the heat map.
    """
    # Imported locally so the helper is self-contained and does not depend on
    # when a module-level `import itertools` is executed.
    import itertools

    # interpolation='nearest' renders each cell as one solid block of color.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the largest count get white text, the rest black,
    # so the numbers stay readable on both dark and light cells.
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')       # y axis: rows of the matrix
    plt.xlabel('Predicted label')  # x axis: columns of the matrix
import itertools
# Refit on the undersampled training split with the C chosen by cross-validation.
# The solver is named explicitly: the L1 penalty is not supported by 'lbfgs',
# the default solver of current scikit-learn releases.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())

# Predict on the DataFrame itself (not `.values`) so the input matches the
# named-column input the model was fitted on.
y_pred_undersample = lr.predict(X_test_undersample)

# Confusion matrix of true vs. predicted labels on the undersampled test split.
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall = TP / (TP + FN); row 1 of the matrix holds the true class-1 samples.
print('Recall metric in the testing dataset: ',cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the raw (non-normalized) confusion matrix.
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.9387755102040817
如图所示,模型找出了137个真实的欺诈样本,误杀了17个正常样本,漏掉了10个欺诈样本,召回率达到0.93。但这并非我们最终要的结果,因为这只是在下采样数据上计算得来的混淆矩阵。
# Train on the undersampled training split, then evaluate on the test split
# of the full (imbalanced) dataset.
# The solver is named explicitly: the L1 penalty is not supported by 'lbfgs',
# the default solver of current scikit-learn releases.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())

# Predict on the DataFrame itself (not `.values`) so the input matches the
# named-column input the model was fitted on.
y_pred = lr.predict(X_test)

np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, y_pred)

# Recall = TP / (TP + FN); row 1 of the matrix holds the true class-1 samples.
print('Recall metric in testing dataset: ',cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[1,0]))

classes_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=classes_names, title='Confusion matrix')
Recall metric in testing dataset: 0.9319727891156463
这是把下采样数据训练出的模型应用到原始(未做采样处理的)测试集上得到的混淆矩阵:找出正确案例139,漏掉12,但是误杀竟然高达9269。由于下采样数据中正常的少,异常的也少,样本有局限性,这种情况可以理解。
# Repeat the C search on the training split of the full, imbalanced dataset.
best_c=printing_Kfold_scores(X_train,y_train)
-------------------------------------------
C parameter: 0.01
-------------------------------------------
Iteration 0 : recall score = 0.4925373134328358
Iteration 1 : recall score = 0.6027397260273972
Iteration 2 : recall score = 0.6833333333333333
Iteration 3 : recall score = 0.5692307692307692
Iteration 4 : recall score = 0.45
Mean recall score 0.5595682284048672
-------------------------------------------
C parameter: 0.1
-------------------------------------------
Iteration 0 : recall score = 0.5671641791044776
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.6833333333333333
Iteration 3 : recall score = 0.5846153846153846
Iteration 4 : recall score = 0.525
Mean recall score 0.5953102506435158
-------------------------------------------
C parameter: 1
-------------------------------------------
Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7166666666666667
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.5625
Mean recall score 0.612645688837163
-------------------------------------------
C parameter: 10
-------------------------------------------
Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7333333333333333
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.575
Mean recall score 0.6184790221704963
-------------------------------------------
C parameter: 100
-------------------------------------------
Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7333333333333333
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.575
Mean recall score 0.6184790221704963
*********************************************************************************
Best model to choose from cross validation is with C parameter = 10.0
*********************************************************************************
上述是我们对数据不做任何预处理得到的召回率,显然在用这种极度不均衡的数据建模效果很差。所以对数据的预处理非常有必要。数据决定上限,参数决定下限。
# Train on the full (imbalanced) training split with the C chosen above and
# evaluate on the full test split.
# The solver is named explicitly: the L1 penalty is not supported by 'lbfgs',
# the default solver of current scikit-learn releases.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train.values.ravel())

# Predict on the DataFrame itself (not `.values`) so the input matches the
# named-column input the model was fitted on.
y_pred = lr.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall = TP / (TP + FN); row 1 of the matrix holds the true class-1 samples.
print('Recall metric in testing dataset: ',cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[1,0]))

classes_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=classes_names, title='Confusion matrix')
Recall metric in testing dataset: 0.6190476190476191
从结果看误差减少了,但是仍有很多没有找出来
# Fit once on the undersampled training split, then look at how the decision
# threshold on the predicted class-1 probability changes the confusion matrix.
# (The original cell fitted the same model twice and discarded the first
# result; a single fit is sufficient.)
# The solver is named explicitly: the L1 penalty is not supported by 'lbfgs',
# the default solver of current scikit-learn releases.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())

# One row per sample; column 0 is the probability of class 0, column 1 the
# probability of class 1.
y_pred_undersample_proba = lr.predict_proba(X_test_undersample)

# Thresholds 0.1, 0.2, ..., 0.9.
thresholds = np.arange(0.1, 1, 0.1)
print(thresholds)

class_names = [0, 1]
np.set_printoptions(precision=2)
plt.figure(figsize=(10, 10))

# One subplot per threshold, laid out on a 3x3 grid.
for j, i in enumerate(thresholds, start=1):
    # Samples whose class-1 probability exceeds the threshold are predicted True.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i

    plt.subplot(3, 3, j)

    # confusion_matrix treats True/False like 1/0 here.
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)

    # Recall = TP / (TP + FN); row 1 of the matrix holds the true class-1 samples.
    print("Recall metric in the testing dataset: ",cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold>=%s'%i)
[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 0.9931972789115646
Recall metric in the testing dataset: 0.9387755102040817
Recall metric in the testing dataset: 0.891156462585034
Recall metric in the testing dataset: 0.8435374149659864
Recall metric in the testing dataset: 0.7482993197278912
Recall metric in the testing dataset: 0.5850340136054422
由上述图示分析可得,阈值在0.1—0.3的时候将所有的样本都判断为了欺诈样本,随着阈值不断增大,误判越来越少,但是检测出的欺诈样本也越来越少。分析发现阈值0.5—0.6是相对较好的
过采样-SMOTE样本生成策略
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
credit_cards = pd.read_csv('creditcard.csv')

# Feature columns are all columns except the label. The label is removed by
# name, consistent with how `labels` is selected below, instead of assuming
# it is the last column.
columns = credit_cards.columns
features_columns = columns.drop('Class')
features = credit_cards[features_columns]
labels = credit_cards['Class']

# 80% training data, 20% test data.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

# Oversample the training split only, using SMOTE.
# `fit_resample` replaces `fit_sample`, which has been removed from
# imbalanced-learn.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

# Sanity check: number of class-1 samples after oversampling.
len(os_labels[os_labels == 1])
227454
# Wrap the resampled features and labels in DataFrames, the input type
# printing_Kfold_scores indexes with `.iloc`.
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_Kfold_scores(os_features,os_labels) # recall for each C on the oversampled training data
-------------------------------------------
C parameter: 0.01
-------------------------------------------
Iteration 0 : recall score = 0.8903225806451613
Iteration 1 : recall score = 0.8947368421052632
Iteration 2 : recall score = 0.968861347792409
Iteration 3 : recall score = 0.9578593332673855
Iteration 4 : recall score = 0.9584198898671151
Mean recall score 0.9340399987354668
-------------------------------------------
C parameter: 0.1
-------------------------------------------
Iteration 0 : recall score = 0.8903225806451613
Iteration 1 : recall score = 0.8947368421052632
Iteration 2 : recall score = 0.9702113533252186
Iteration 3 : recall score = 0.9598707422428859
Iteration 4 : recall score = 0.9605961684307712
Mean recall score 0.93514753734986
-------------------------------------------
C parameter: 1
-------------------------------------------
Iteration 0 : recall score = 0.8903225806451613
Iteration 1 : recall score = 0.8947368421052632
Iteration 2 : recall score = 0.9704769281841319
Iteration 3 : recall score = 0.9603543597014761
Iteration 4 : recall score = 0.9604752640661237
Mean recall score 0.9352731949404312
-------------------------------------------
C parameter: 10
-------------------------------------------
Iteration 0 : recall score = 0.8903225806451613
Iteration 1 : recall score = 0.8947368421052632
Iteration 2 : recall score = 0.9701449596104902
Iteration 3 : recall score = 0.9600246205251646
Iteration 4 : recall score = 0.9600575944427957
Mean recall score 0.9350573194657749
-------------------------------------------
C parameter: 100
-------------------------------------------
Iteration 0 : recall score = 0.8903225806451613
Iteration 1 : recall score = 0.8947368421052632
Iteration 2 : recall score = 0.9705433218988603
Iteration 3 : recall score = 0.9601455248898122
Iteration 4 : recall score = 0.9606621162660336
Mean recall score 0.9352820771610262
*********************************************************************************
Best model to choose from cross validation is with C parameter = 100.0
*********************************************************************************
# Train on the SMOTE-oversampled training data with the C chosen above and
# evaluate on the untouched test split.
# The solver is named explicitly: the L1 penalty is not supported by 'lbfgs',
# the default solver of current scikit-learn releases.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)

# Recall = TP / (TP + FN); row 1 of the matrix holds the true class-1 samples.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.9108910891089109
上述是过采样数据构造的混淆矩阵,分析发现它优于前几个模型