import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Kaggle credit-card fraud dataset (284,807 transactions, 31 columns).
df=pd.read_csv('creditcard.csv')
# First rows: Time, anonymized PCA features V1-V28, Amount, and the Class label.
df.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# (rows, columns) of the full dataset.
df.shape
(284807, 31)
time列无用,删除
V1-V28是特征
Amount交易金额,大小差异大,预处理
# Drop Time — an elapsed-seconds counter, not used as a predictive feature here.
df=df.drop(labels='Time',axis=1)
df.head()
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 30 columns
预处理
# Raw transaction amounts span several orders of magnitude — standardized below.
df['Amount']
0 149.62
1 2.69
2 378.66
3 123.50
4 69.99
...
284802 0.77
284803 24.79
284804 67.88
284805 10.00
284806 217.00
Name: Amount, Length: 284807, dtype: float64
from sklearn.preprocessing import StandardScaler
# Scale Amount to zero mean / unit variance so it is comparable with V1-V28.
# NOTE(review): the scaler is fit on the whole dataset before any train/test
# split, which leaks test-set statistics into training — confirm acceptability.
sc = StandardScaler()
df['Amount'] = sc.fit_transform(df['Amount'].to_numpy().reshape(-1, 1))
df.head()
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | 0.090794 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 0.244964 | 0 |
1 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | -0.166974 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | -0.342475 | 0 |
2 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | 0.207643 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 1.160686 | 0 |
3 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | -0.054952 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 0.140534 | 0 |
4 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | 0.753074 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | -0.073403 | 0 |
5 rows × 30 columns
可视化
# Bar chart of label frequencies: class 0 = normal, class 1 = fraud.
class_numbers = df['Class'].value_counts(sort=True).sort_index()
class_numbers.plot.bar()
plt.xlabel('class')
plt.ylabel('numbers')
plt.show()
可以看出异常样本少,样本不均衡
解决方法:
- 过采样
- 下采样
下采样
最多的数据和最少的数据一样少
# Feature matrix: every column except the Class label.
X=df.loc[:,df.columns != 'Class']
# Label vector (kept as a one-column DataFrame).
y=df.loc[:,df.columns=='Class']
X.shape
(284807, 29)
y.shape
(284807, 1)
# Size of the minority (fraud, Class == 1) group — the target size for undersampling.
number_min=len(df[df['Class']==1])
number_min
492
# Row indices of the minority (fraud) samples.
index_min=np.array(df[df['Class']==1].index)
# Row indices of the majority (normal) samples.
index_max=np.array(df[df['Class']==0].index)
# Draw exactly number_min majority indices.
# BUG FIX: sample WITHOUT replacement — replace=True could draw the same normal
# transaction several times, leaving duplicates in the "balanced" sample and
# fewer than number_min distinct majority rows.
index_max_random=np.random.choice(index_max,size=number_min,replace=False)
# Combined index set of the balanced (1:1) undersampled data.
under_sample_indices=np.concatenate([index_min,index_max_random])
under_sample_indices.shape
(984,)
# Select the undersampled rows. NOTE(review): positional .iloc works only
# because df still carries its default 0..n-1 RangeIndex; .loc would be the
# label-safe choice — confirm the index has not been reset or filtered upstream.
under_sample_data=df.iloc[under_sample_indices,:]
under_sample_data.shape
(984, 30)
# Split the balanced subset into features and label.
X_under_sample=under_sample_data.loc[:,under_sample_data.columns != 'Class']
y_under_sample=under_sample_data.loc[:,under_sample_data.columns == 'Class']
X_under_sample.shape,y_under_sample.shape
((984, 29), (984, 1))
# Confirm the undersampled set is exactly 50/50 normal vs fraud.
print('正常交易比例:',len(under_sample_data[under_sample_data.Class==0])/len(under_sample_data.Class))
print('欺诈交易比例:',len(under_sample_data[under_sample_data.Class==1])/len(under_sample_data.Class))
print('总的交易数目:',len(under_sample_data))
正常交易比例: 0.5
欺诈交易比例: 0.5
总的交易数目: 984
划分数据集
from sklearn.model_selection import train_test_split
# 70/30 split of the FULL, imbalanced dataset.
# NOTE(review): no random_state or stratify is set, so the split (and every
# recorded number below) changes from run to run — confirm this is intended.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
print("number transaction train dataset:",len(X_train))
print("number transaction test dataset:",len(X_test))
print("total number of transactions:",len(X_train)+len(X_test))
number transaction train dataset: 199364
number transaction test dataset: 85443
total number of transactions: 284807
# 70/30 split of the undersampled (balanced) dataset.
X_train_und_sam,X_test_und_sam,y_train_und_sam,y_test_und_sam=train_test_split(X_under_sample,
y_under_sample,
test_size=0.3)
print("number transaction train dataset:",len(X_train_und_sam))
print("number transaction test dataset:",len(X_test_und_sam))
print("total number of transactions:",len(X_train_und_sam)+len(X_test_und_sam))
number transaction train dataset: 688
number transaction test dataset: 296
total number of transactions: 984
交叉验证
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,recall_score,classification_report
# Grid-search the L1 penalty strength C by 10-fold stratified CV recall.
def print_Kfold_scores(X_data, y_data):
    """10-fold stratified CV of L1 logistic regression over a grid of C values.

    X_data, y_data: training-set DataFrames (y_data is a one-column 'Class' frame).
    Prints the per-fold and mean recall for each C and returns the best C value.
    """
    c_params = [0.01, 0.1, 1, 10, 100]
    # BUG FIX: the original re-created this list inside the C loop (so it only
    # ever held the current mean) and then returned max() of it — i.e. the best
    # *mean recall* (0.923 in the recorded run), not the best *C*. Collect
    # (mean_recall, C) pairs across all C values instead.
    results = []
    for c in c_params:
        print('-------------------------------------------')
        print('C parameter: ', c)
        print('-------------------------------------------')
        print('')
        recall_scores = []
        # train/test are index arrays into X_data/y_data for this fold.
        for k, (train, test) in enumerate(StratifiedKFold(n_splits=10).split(X_data, y_data)):
            lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')
            # Fit on this fold's training part ...
            lr.fit(X_data.iloc[train, :].values, y_data.iloc[train, :].values.ravel())
            # ... and predict on the held-out part.
            y_under_sample_pred = lr.predict(X_data.iloc[test, :].values)
            score = recall_score(y_data.iloc[test, :].values, y_under_sample_pred)
            recall_scores.append(score)
            print('第%d折' % k, ":recall_score=%.3f" % score)
        print('')
        print("mean recall score:%.3f" % np.mean(recall_scores))
        print('')
        results.append((np.mean(recall_scores), c))
    # The C whose mean recall is highest.
    best_c = max(results)[1]
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter =%.3f ' % best_c)
    print('*********************************************************************************')
    return best_c
# Pick C by CV recall on the undersampled (balanced) training set.
best_c=print_Kfold_scores(X_train_und_sam,y_train_und_sam)
-------------------------------------------
C parameter: 0.01
-------------------------------------------
第0折 :recall_score=1.000
第1折 :recall_score=0.941
第2折 :recall_score=0.941
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.971
第6折 :recall_score=0.882
第7折 :recall_score=1.000
第8折 :recall_score=0.941
第9折 :recall_score=0.909
mean recall score:0.953
-------------------------------------------
C parameter: 0.1
-------------------------------------------
第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.882
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.882
第8折 :recall_score=0.882
第9折 :recall_score=0.879
mean recall score:0.911
-------------------------------------------
C parameter: 1
-------------------------------------------
第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.882
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.912
第8折 :recall_score=0.882
第9折 :recall_score=0.879
mean recall score:0.914
-------------------------------------------
C parameter: 10
-------------------------------------------
第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.912
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.912
第8折 :recall_score=0.912
第9折 :recall_score=0.879
mean recall score:0.920
-------------------------------------------
C parameter: 100
-------------------------------------------
第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.912
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.912
第8折 :recall_score=0.941
第9折 :recall_score=0.879
mean recall score:0.923
*********************************************************************************
Best model to choose from cross validation is with C parameter =0.923
*********************************************************************************
错误记录:
百思不得其解的错误:内层只执行了一次,c=0.1以后不执行。
解决方法网址:https://www.jianshu.com/p/6d6f7ffa1977
stackoverflow上的解决办法:https://stackoverflow.com/questions/11569535/python-nested-loop-with-generators-does-not-work-in-some-cases
# Same grid search on the full, imbalanced training set — recall is far lower.
best_c = print_Kfold_scores(X_train,y_train)  # grid search on the imbalanced training set
-------------------------------------------
C parameter: 0.01
-------------------------------------------
第0折 :recall_score=0.429
第1折 :recall_score=0.543
第2折 :recall_score=0.400
第3折 :recall_score=0.571
第4折 :recall_score=0.559
第5折 :recall_score=0.676
第6折 :recall_score=0.559
第7折 :recall_score=0.647
第8折 :recall_score=0.500
第9折 :recall_score=0.657
mean recall score:0.554
-------------------------------------------
C parameter: 0.1
-------------------------------------------
第0折 :recall_score=0.400
第1折 :recall_score=0.543
第2折 :recall_score=0.486
第3折 :recall_score=0.600
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.618
第7折 :recall_score=0.647
第8折 :recall_score=0.588
第9折 :recall_score=0.743
mean recall score:0.592
-------------------------------------------
C parameter: 1
-------------------------------------------
第0折 :recall_score=0.486
第1折 :recall_score=0.571
第2折 :recall_score=0.486
第3折 :recall_score=0.657
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.676
第7折 :recall_score=0.706
第8折 :recall_score=0.559
第9折 :recall_score=0.743
mean recall score:0.618
-------------------------------------------
C parameter: 10
-------------------------------------------
第0折 :recall_score=0.486
第1折 :recall_score=0.571
第2折 :recall_score=0.486
第3折 :recall_score=0.657
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.647
第7折 :recall_score=0.706
第8折 :recall_score=0.559
第9折 :recall_score=0.743
mean recall score:0.615
-------------------------------------------
C parameter: 100
-------------------------------------------
第0折 :recall_score=0.486
第1折 :recall_score=0.571
第2折 :recall_score=0.486
第3折 :recall_score=0.657
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.647
第7折 :recall_score=0.706
第8折 :recall_score=0.559
第9折 :recall_score=0.743
mean recall score:0.615
*********************************************************************************
Best model to choose from cross validation is with C parameter =0.615
*********************************************************************************
混淆矩阵
def plot_confusion_matrix(cm,classes,
                          title='Confusion matrix',
                          cmap='Blues'):
    """Draw a confusion matrix as an annotated heatmap.

    cm: 2-D confusion matrix (e.g. from sklearn.metrics.confusion_matrix).
    classes: axis tick labels for the classes, e.g. [0, 1].
    title: plot title.
    cmap: matplotlib colormap name.

    NOTE: relies on the module-level `itertools` import having run before
    this function is first called.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=0)
    plt.yticks(ticks, classes)
    # White text on dark cells, black text on light ones.
    cutoff = cm.max() / 2.0
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(col, row, cm[row, col],
                 horizontalalignment="center",
                 color='white' if cm[row, col] > cutoff else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
热力图是一种数据的图形化表示,具体而言,就是将二维数组中的元素用颜色表示。热力图之所以非常有用,是因为它能够从整体视角上展示数据,更确切的说是数值型数据。
使用imshow()函数可以非常容易地制作热力图。
import itertools
# Fit L1 logistic regression on the undersampled training split.
# NOTE(review): verify that best_c really holds a C value — the CV helper's
# return value is worth double-checking (it may return the best recall instead).
lr=LogisticRegression(C=best_c,penalty='l1',solver='liblinear')
lr.fit(X_train_und_sam.values,y_train_und_sam.values.ravel())
y_pred_under_sample=lr.predict(X_test_und_sam.values)
# Confusion matrix on the undersampled test split.
conf_matrix=confusion_matrix(y_test_und_sam,y_pred_under_sample)
np.set_printoptions(precision=2)
conf_matrix
array([[138, 5],
[ 18, 135]], dtype=int64)
# Recall = TP / (TP + FN) for the fraud class (row 1 of the matrix).
print("Recall metric in the testing dataset:%.3f"%(conf_matrix[1,1]/(conf_matrix[1,0]+conf_matrix[1,1])))
Recall metric in the testing dataset:0.882
# Plot the confusion matrix of the undersampled-test evaluation.
class_names=[0,1]
plot_confusion_matrix(conf_matrix,
                      classes=class_names,
                      title='Confusion Matrix')  # fix: title was misspelled 'Consusion Matrix'
plt.show()
# Train on the balanced (undersampled) data, evaluate on the full imbalanced test set.
lr2=LogisticRegression(C=best_c,penalty='l1',solver='liblinear')
lr2.fit(X_train_und_sam.values,y_train_und_sam.values.ravel())
y_pred=lr2.predict(X_test.values)
conf_matrix2=confusion_matrix(y_test,y_pred)
# Recall = TP / (TP + FN) on the fraud class.
print("Recall metric in the testing dataset:%.3f"%(conf_matrix2[1,1]/(conf_matrix2[1,0]+conf_matrix2[1,1])))
class_names=[0,1]
plot_confusion_matrix(conf_matrix2,
                      classes=class_names,
                      title='Confusion Matrix',  # fix: title was misspelled 'Consusion Matrix'
                      cmap='OrRd')
plt.show()
Recall metric in the testing dataset:0.939
# Train AND evaluate on the imbalanced data: fraud recall drops sharply.
lr3=LogisticRegression(C=best_c,penalty='l1',solver='liblinear')
lr3.fit(X_train.values,y_train.values.ravel())
y_pred_3=lr3.predict(X_test.values)
conf_matrix3=confusion_matrix(y_test,y_pred_3)
# Recall = TP / (TP + FN) on the fraud class.
print("Recall metric in the testing dataset:%.3f"%(conf_matrix3[1,1]/(conf_matrix3[1,0]+conf_matrix3[1,1])))
class_names=[0,1]
plot_confusion_matrix(conf_matrix3,
                      classes=class_names,
                      title='Confusion Matrix',  # fix: title was misspelled 'Consusion Matrix'
                      cmap='YlGn')
plt.show()
Recall metric in the testing dataset:0.626
逻辑回归阈值对结果影响
# Effect of the decision threshold: classify as fraud when P(class=1) > threshold.
lr4=LogisticRegression(C=0.01,penalty='l1',solver='liblinear')
lr4.fit(X_train_und_sam.values,y_train_und_sam.values.ravel())
y_pred4_proba=lr4.predict_proba(X_test_und_sam.values)# per-sample probability of each class
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
plt.figure(figsize=(10,10))
j=1
for i in thresholds:
# NOTE(review): the comparison is strict '>' while the subplot title reads '>=' — one of them should change.
y_test_predictions_high_recall =y_pred4_proba[:,1]>i
plt.subplot(3,3,j)
j += 1
# Confusion matrix at this threshold.
conf_matrix4=confusion_matrix(y_test_und_sam,y_test_predictions_high_recall)
print("Recall metric in the testing dataset:%.3f"%(conf_matrix4[1,1]/(conf_matrix4[1,0]+conf_matrix4[1,1])))
# Draw this threshold's confusion matrix in the 3x3 grid.
class_names=[0,1]
plot_confusion_matrix(conf_matrix4,
classes=class_names,
title='Threshold >=%.2f'%i)
Recall metric in the testing dataset:1.000
Recall metric in the testing dataset:0.993
Recall metric in the testing dataset:0.993
Recall metric in the testing dataset:0.961
Recall metric in the testing dataset:0.895
Recall metric in the testing dataset:0.863
Recall metric in the testing dataset:0.850
Recall metric in the testing dataset:0.804
Recall metric in the testing dataset:0.614
SMOTE采样
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# Reload the raw dataset for the SMOTE oversampling experiment.
credit_cards=pd.read_csv('creditcard.csv')
credit_cards.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
columns=credit_cards.columns
columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
# Feature columns = everything except the last column ('Class').
# NOTE(review): unlike the earlier pipeline, Time is kept and Amount is not
# standardized here — confirm this asymmetry is intended.
features_columns=columns.delete((len(columns)-1))
features_columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
dtype='object')
features=credit_cards[features_columns]
features.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | 0.251412 | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.069083 | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.524980 | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.208038 | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | 0.408542 | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 |
5 rows × 30 columns
labels=credit_cards['Class']
# 80/20 split FIRST, so SMOTE only sees training data (no synthetic leakage into the test set).
X_train_1,X_test_1,y_train_1,y_test_1=train_test_split(features,
labels,
test_size=0.2)
# Synthesize minority samples until both classes are the same size.
oversampler=SMOTE()
X_train_1_os,y_train_1_os=oversampler.fit_resample(X_train_1,y_train_1)
# Number of fraud samples after oversampling (equals the normal count).
len(y_train_1_os[y_train_1_os==1])
227449
print(type(X_train_1_os))
<class 'pandas.core.frame.DataFrame'>
print(type(y_train_1_os))
<class 'pandas.core.series.Series'>
# Grid-search the L1 penalty strength C by 10-fold stratified CV recall.
def printing_Kfold_scores(X_data, y_data):
    """10-fold stratified CV of L1 logistic regression over a grid of C values.

    X_data, y_data: training-set DataFrames (y_data is a one-column frame).
    Prints the per-fold and mean recall for each C and returns the best C value.
    """
    c_params = [0.01, 0.1, 1, 10, 100]
    # BUG FIX: the original rebuilt this list inside the C loop and returned
    # max() of it — i.e. the best *mean recall* (0.968 in the recorded run),
    # not the best *C*. Keep (mean_recall, C) pairs across all C values instead.
    results = []
    for c in c_params:
        print('-------------------------------------------')
        print('C parameter: ', c)
        print('-------------------------------------------')
        print('')
        recall_scores = []
        # train/test are index arrays into X_data/y_data for this fold.
        for k, (train, test) in enumerate(StratifiedKFold(n_splits=10).split(X_data, y_data)):
            lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')
            # Fit on this fold's training part ...
            lr.fit(X_data.iloc[train, :].values, y_data.iloc[train, :].values.ravel())
            # ... and predict on the held-out part.
            y_under_sample_pred = lr.predict(X_data.iloc[test, :].values)
            score = recall_score(y_data.iloc[test, :].values, y_under_sample_pred)
            recall_scores.append(score)
            print('第%d折' % k, ":recall_score=%.3f" % score)
        print('')
        print("mean recall score:%.3f" % np.mean(recall_scores))
        print('')
        results.append((np.mean(recall_scores), c))
    # The C whose mean recall is highest.
    best_c = max(results)[1]
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter =%.3f ' % best_c)
    print('*********************************************************************************')
    return best_c
# Wrap the resampled label Series in a DataFrame so printing_Kfold_scores
# can index it with .iloc[..., :].
y_train_1_os = pd.DataFrame(y_train_1_os)  # Series -> DataFrame
best_c = printing_Kfold_scores(X_train_1_os,y_train_1_os)
-------------------------------------------
C parameter: 0.01
-------------------------------------------
第0折 :recall_score=0.964
第1折 :recall_score=0.966
第2折 :recall_score=0.968
第3折 :recall_score=0.969
第4折 :recall_score=0.965
第5折 :recall_score=0.965
第6折 :recall_score=0.968
第7折 :recall_score=0.966
第8折 :recall_score=0.966
第9折 :recall_score=0.967
mean recall score:0.966
-------------------------------------------
C parameter: 0.1
-------------------------------------------
第0折 :recall_score=0.965
第1折 :recall_score=0.967
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.967
第5折 :recall_score=0.966
第6折 :recall_score=0.970
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969
mean recall score:0.968
-------------------------------------------
C parameter: 1
-------------------------------------------
第0折 :recall_score=0.965
第1折 :recall_score=0.966
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.966
第5折 :recall_score=0.967
第6折 :recall_score=0.969
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969
mean recall score:0.968
-------------------------------------------
C parameter: 10
-------------------------------------------
第0折 :recall_score=0.965
第1折 :recall_score=0.967
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.967
第5折 :recall_score=0.966
第6折 :recall_score=0.970
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969
mean recall score:0.968
-------------------------------------------
C parameter: 100
-------------------------------------------
第0折 :recall_score=0.965
第1折 :recall_score=0.967
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.967
第5折 :recall_score=0.966
第6折 :recall_score=0.970
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969
mean recall score:0.968
*********************************************************************************
Best model to choose from cross validation is with C parameter =0.968
*********************************************************************************
# Train L1 logistic regression on the SMOTE-balanced training set and
# evaluate on the untouched (imbalanced) test split.
lr5=LogisticRegression(C=best_c,penalty='l1',solver='liblinear')
lr5.fit(X_train_1_os.values,y_train_1_os.values.ravel())
y_pred_5=lr5.predict(X_test_1.values)
conf_matrix5=confusion_matrix(y_test_1,y_pred_5)
# Recall = TP / (TP + FN) on the fraud class.
print("Recall metric in the testing dataset:%.3f"%(conf_matrix5[1,1]/(conf_matrix5[1,0]+conf_matrix5[1,1])))
class_names=[0,1]
plot_confusion_matrix(conf_matrix5,
                      classes=class_names,
                      title='Confusion Matrix')  # fix: title was misspelled 'Consusion Matrix'
plt.show()
Recall metric in the testing dataset:0.895