信用卡欺诈

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the credit-card transactions and preview the first rows.
df = pd.read_csv('creditcard.csv')
df.head()
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990

5 rows × 31 columns

df.shape
(284807, 31)

Time 列在本例中不作为特征使用,删除

V1-V28是特征

Amount 是交易金额,取值范围差异大,需要先做标准化预处理

# Remove the Time column; it is not used as a feature here.
df = df.drop(columns='Time')
df.head()
V1V2V3V4V5V6V7V8V9V10...V21V22V23V24V25V26V27V28AmountClass
0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
11.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.166974...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
2-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.207643...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
3-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
4-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990

5 rows × 30 columns

预处理
df['Amount']
0         149.62
1           2.69
2         378.66
3         123.50
4          69.99
           ...  
284802      0.77
284803     24.79
284804     67.88
284805     10.00
284806    217.00
Name: Amount, Length: 284807, dtype: float64
from sklearn.preprocessing import StandardScaler

# Standardise Amount so that its scale no longer dwarfs the other columns.
sc = StandardScaler()
amount_2d = df['Amount'].values.reshape(-1, 1)  # the scaler expects a 2-D array
df['Amount'] = sc.fit_transform(amount_2d)
df.head()
V1V2V3V4V5V6V7V8V9V10...V21V22V23V24V25V26V27V28AmountClass
0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.3637870.090794...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.0210530.2449640
11.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425-0.166974...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.014724-0.3424750
2-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.5146540.207643...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.0597521.1606860
3-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024-0.054952...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.0614580.1405340
4-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.8177390.753074...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.215153-0.0734030

5 rows × 30 columns

可视化
# Count the rows of each class and plot them to show the class imbalance.
# Series.value_counts() is used because the top-level pd.value_counts() is
# deprecated since pandas 2.1. sort_index() orders the bars by class label.
class_numbers = df['Class'].value_counts().sort_index()
class_numbers.plot.bar()
plt.xlabel('class')
plt.ylabel('numbers')
plt.show()

在这里插入图片描述

可以看出异常样本少,样本不均衡

解决方法:

  • 过采样
  • 下采样
下采样

下采样:从数量多的类别(正常交易)中随机抽取与数量少的类别(欺诈交易)同样多的样本,使两类样本数量相同

# Features are every column except the label; y is kept as a one-column
# DataFrame rather than a Series.
X = df.drop(columns='Class')
y = df[['Class']]
X.shape
(284807, 29)
y.shape
(284807, 1)
# Number of rows in the smaller class (Class == 1).
is_fraud = df['Class'] == 1
number_min = int(is_fraud.sum())
number_min
492
# Index labels of the minority class (Class == 1).
index_min = np.array(df[df['Class'] == 1].index)
# Index labels of the majority class (Class == 0).
index_max = np.array(df[df['Class'] == 0].index)
# Randomly pick as many majority rows as there are minority rows.
# replace=False draws without replacement, so no majority row is selected
# twice and the under-sampled set contains no duplicated rows.
index_max_random = np.random.choice(index_max, size=number_min, replace=False)
# Join both 1-D index arrays into one.
under_sample_indices = np.concatenate([index_min, index_max_random])
under_sample_indices.shape
(984,)
# under_sample_indices holds index *labels* (taken from df.index), so select
# with label-based .loc. Positional .iloc only gives the same rows while df
# keeps its default 0..n-1 index.
under_sample_data = df.loc[under_sample_indices, :]
under_sample_data.shape
(984, 30)
# Split the under-sampled frame into features and a one-column label frame.
X_under_sample = under_sample_data.drop(columns='Class')
y_under_sample = under_sample_data[['Class']]
X_under_sample.shape, y_under_sample.shape
((984, 29), (984, 1))
# Show the share of each class in the under-sampled data.
total_rows = len(under_sample_data.Class)
normal_rows = len(under_sample_data[under_sample_data.Class == 0])
fraud_rows = len(under_sample_data[under_sample_data.Class == 1])
print('正常交易比例:', normal_rows / total_rows)
print('欺诈交易比例:', fraud_rows / total_rows)
print('总的交易数目:', len(under_sample_data))
正常交易比例: 0.5
欺诈交易比例: 0.5
总的交易数目: 984
划分数据集
from sklearn.model_selection import train_test_split
# Split the full, imbalanced data into training and test sets.
# stratify=y keeps the fraud ratio the same in both parts; with so few
# positive rows a plain random split can leave the two parts with
# noticeably different fraud rates.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y)

print("number transaction train dataset:", len(X_train))
print("number transaction test dataset:", len(X_test))
print("total number of transactions:", len(X_train) + len(X_test))
number transaction train dataset: 199364
number transaction test dataset: 85443
total number of transactions: 284807
# Split the under-sampled data into training and test sets.
under_sample_split = train_test_split(X_under_sample, y_under_sample, test_size=0.3)
X_train_und_sam, X_test_und_sam, y_train_und_sam, y_test_und_sam = under_sample_split

n_train_und_sam = len(X_train_und_sam)
n_test_und_sam = len(X_test_und_sam)
print("number transaction train dataset:", n_train_und_sam)
print("number transaction test dataset:", n_test_und_sam)
print("total number of transactions:", n_train_und_sam + n_test_und_sam)
number transaction train dataset: 688
number transaction test dataset: 296
total number of transactions: 984
交叉验证
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,recall_score,classification_report
# Compute the recall obtained with different regularisation parameters C.
def print_Kfold_scores(X_data,y_data):
    """Print per-fold recall for several values of C and return the best C.

    Each candidate C is evaluated with an L1-penalised logistic regression
    under 10-fold stratified cross-validation on the given training data.

    X_data: feature DataFrame (rows are selected with .iloc).
    y_data: label DataFrame (rows are selected with .iloc[rows, :]).

    Returns the candidate C with the highest mean recall. On a tie the
    first, i.e. smallest, such C is returned.
    """
    # Candidate values of C, in increasing order.
    c_params = [0.01, 0.1, 1, 10, 100]

    # Mean recall of every candidate. The list is created once, outside the
    # loop over C, so that all candidates can be compared at the end.
    recall_scores_mean = []

    for c in c_params:
        print('-------------------------------------------')
        print('C parameter: ', c)
        print('-------------------------------------------')
        print('')

        recall_scores = []

        # split() is called again for every C so each candidate gets a fresh
        # iterator over the folds; train/test are positions inside X_data.
        folds = StratifiedKFold(n_splits=10).split(X_data, y_data)
        for k, (train, test) in enumerate(folds):
            lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')

            # Fit on the training part of this fold.
            lr.fit(X_data.iloc[train, :].values, y_data.iloc[train, :].values.ravel())

            # Predict the held-out part of this fold.
            y_fold_pred = lr.predict(X_data.iloc[test, :].values)

            # Recall on the held-out part.
            score = recall_score(y_data.iloc[test, :].values, y_fold_pred)
            recall_scores.append(score)
            print('第%d折'%k,":recall_score=%.3f"%score)

        # Report and store the mean recall for this C.
        mean_recall = np.mean(recall_scores)
        print('')
        print("mean recall score:%.3f"%mean_recall)
        print('')
        recall_scores_mean.append(mean_recall)

    # Pick the C whose mean recall is highest (not the recall value itself).
    best_c = c_params[int(np.argmax(recall_scores_mean))]
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter =%.3f '%best_c)
    print('*********************************************************************************')

    return best_c

        
    

# Run the cross-validation on the under-sampled training set.
best_c=print_Kfold_scores(X_train_und_sam,y_train_und_sam)
-------------------------------------------
C parameter:  0.01
-------------------------------------------

第0折 :recall_score=1.000
第1折 :recall_score=0.941
第2折 :recall_score=0.941
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.971
第6折 :recall_score=0.882
第7折 :recall_score=1.000
第8折 :recall_score=0.941
第9折 :recall_score=0.909

mean recall score:0.953

-------------------------------------------
C parameter:  0.1
-------------------------------------------

第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.882
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.882
第8折 :recall_score=0.882
第9折 :recall_score=0.879

mean recall score:0.911

-------------------------------------------
C parameter:  1
-------------------------------------------

第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.882
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.912
第8折 :recall_score=0.882
第9折 :recall_score=0.879

mean recall score:0.914

-------------------------------------------
C parameter:  10
-------------------------------------------

第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.912
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.912
第8折 :recall_score=0.912
第9折 :recall_score=0.879

mean recall score:0.920

-------------------------------------------
C parameter:  100
-------------------------------------------

第0折 :recall_score=0.971
第1折 :recall_score=0.912
第2折 :recall_score=0.912
第3折 :recall_score=1.000
第4折 :recall_score=0.941
第5折 :recall_score=0.882
第6折 :recall_score=0.882
第7折 :recall_score=0.912
第8折 :recall_score=0.941
第9折 :recall_score=0.879

mean recall score:0.923

*********************************************************************************
Best model to choose from cross validation is with C parameter =0.923 
*********************************************************************************

错误记录:

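曾经遇到的错误:内层循环只执行了一次,c=0.1 以后不再执行。原因(参见下面两个链接)是 split() 返回的是生成器,遍历一次后即被耗尽;如果在外层循环之前创建并重复使用它,第二个 C 开始就没有可遍历的折了。解决办法是在每次外层循环内重新调用 split()。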

解决方法网址:https://www.jianshu.com/p/6d6f7ffa1977

stackoverflow上的解决办法:https://stackoverflow.com/questions/11569535/python-nested-loop-with-generators-does-not-work-in-some-cases

best_c = print_Kfold_scores(X_train,y_train)  # use the training set of the imbalanced data
-------------------------------------------
C parameter:  0.01
-------------------------------------------

第0折 :recall_score=0.429
第1折 :recall_score=0.543
第2折 :recall_score=0.400
第3折 :recall_score=0.571
第4折 :recall_score=0.559
第5折 :recall_score=0.676
第6折 :recall_score=0.559
第7折 :recall_score=0.647
第8折 :recall_score=0.500
第9折 :recall_score=0.657

mean recall score:0.554

-------------------------------------------
C parameter:  0.1
-------------------------------------------

第0折 :recall_score=0.400
第1折 :recall_score=0.543
第2折 :recall_score=0.486
第3折 :recall_score=0.600
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.618
第7折 :recall_score=0.647
第8折 :recall_score=0.588
第9折 :recall_score=0.743

mean recall score:0.592

-------------------------------------------
C parameter:  1
-------------------------------------------

第0折 :recall_score=0.486
第1折 :recall_score=0.571
第2折 :recall_score=0.486
第3折 :recall_score=0.657
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.676
第7折 :recall_score=0.706
第8折 :recall_score=0.559
第9折 :recall_score=0.743

mean recall score:0.618

-------------------------------------------
C parameter:  10
-------------------------------------------

第0折 :recall_score=0.486
第1折 :recall_score=0.571
第2折 :recall_score=0.486
第3折 :recall_score=0.657
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.647
第7折 :recall_score=0.706
第8折 :recall_score=0.559
第9折 :recall_score=0.743

mean recall score:0.615

-------------------------------------------
C parameter:  100
-------------------------------------------

第0折 :recall_score=0.486
第1折 :recall_score=0.571
第2折 :recall_score=0.486
第3折 :recall_score=0.657
第4折 :recall_score=0.588
第5折 :recall_score=0.706
第6折 :recall_score=0.647
第7折 :recall_score=0.706
第8折 :recall_score=0.559
第9折 :recall_score=0.743

mean recall score:0.615

*********************************************************************************
Best model to choose from cross validation is with C parameter =0.615 
*********************************************************************************
混淆矩阵
def plot_confusion_matrix(cm,classes,
                         title='Confusion matrix',
                         cmap='Blues'):
    """Draw a confusion matrix as an annotated heat map.

    cm: confusion matrix, a 2-D array indexed as cm[row, col].
    classes: class names used as tick labels, e.g. [0, 1].
    title: title of the plot.
    cmap: colour map passed to imshow.
    """
    plt.imshow(cm, cmap=cmap, interpolation='nearest')
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells whose value exceeds half the maximum are labelled in white,
    # all other cells in black.
    half_max = cm.max() / 2.0
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        if cm[row, col] > half_max:
            text_color = 'white'
        else:
            text_color = "black"
        # x is the column, y is the row.
        plt.text(col, row,
                 cm[row, col],
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

热力图是一种数据的图形化表示,具体而言,就是将二维数组中的元素用颜色表示。热力图之所以非常有用,是因为它能够从整体视角上展示数据,更确切的说是数值型数据。

使用imshow()函数可以非常容易地制作热力图。

import itertools
# Train on the under-sampled training set and predict its test set.
lr = LogisticRegression(penalty='l1', solver='liblinear', C=best_c)
lr.fit(X_train_und_sam.values, y_train_und_sam.values.ravel())
y_pred_under_sample = lr.predict(X_test_und_sam.values)

# Confusion matrix for the under-sampled test set.
np.set_printoptions(precision=2)
conf_matrix = confusion_matrix(y_test_und_sam, y_pred_under_sample)
conf_matrix
array([[138,   5],
       [ 18, 135]], dtype=int64)

# Recall is computed from the second row of the confusion matrix:
# cm[1, 1] / (cm[1, 0] + cm[1, 1]).
recall_und_sam = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])
print("Recall metric in the testing dataset:%.3f" % recall_und_sam)
Recall metric in the testing dataset:0.882
# Plot the confusion matrix (title typo "Consusion" corrected).
class_names = [0, 1]
plot_confusion_matrix(conf_matrix,
                      classes=class_names,
                      title='Confusion Matrix')
plt.show()

在这里插入图片描述

# Train on the balanced (under-sampled) data, predict the imbalanced test set.
# NOTE(review): the under-sampled rows were drawn from the whole of df, while
# X_test was split from df separately, so some X_test rows may also be part
# of X_train_und_sam -- confirm before trusting this score.
lr2 = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr2.fit(X_train_und_sam.values, y_train_und_sam.values.ravel())
y_pred = lr2.predict(X_test.values)

conf_matrix2 = confusion_matrix(y_test, y_pred)


print("Recall metric in the testing dataset:%.3f"%(conf_matrix2[1,1]/(conf_matrix2[1,0]+conf_matrix2[1,1])))

# Title typo "Consusion" corrected.
class_names = [0, 1]
plot_confusion_matrix(conf_matrix2,
                      classes=class_names,
                      title='Confusion Matrix',
                      cmap='OrRd')
plt.show()
Recall metric in the testing dataset:0.939

在这里插入图片描述

# Train on the imbalanced training set, predict the imbalanced test set.
lr3 = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr3.fit(X_train.values, y_train.values.ravel())
y_pred_3 = lr3.predict(X_test.values)

conf_matrix3 = confusion_matrix(y_test, y_pred_3)


print("Recall metric in the testing dataset:%.3f"%(conf_matrix3[1,1]/(conf_matrix3[1,0]+conf_matrix3[1,1])))

# Title typo "Consusion" corrected.
class_names = [0, 1]
plot_confusion_matrix(conf_matrix3,
                      classes=class_names,
                      title='Confusion Matrix',
                      cmap='YlGn')
plt.show()
Recall metric in the testing dataset:0.626

在这里插入图片描述

逻辑回归阈值对结果影响
# Fit once on the under-sampled training set, then see how the decision
# threshold on the predicted fraud probability changes the confusion matrix.
lr4 = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr4.fit(X_train_und_sam.values, y_train_und_sam.values.ravel())
# Probability of each class for every test row; column 1 is used below.
y_pred4_proba = lr4.predict_proba(X_test_und_sam.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


plt.figure(figsize=(10, 10))
# One panel per threshold in a 3x3 grid; subplot numbers start at 1.
for panel, threshold in enumerate(thresholds, start=1):
    # A row is predicted as class 1 when its probability is strictly above
    # the threshold. Cast to int so the predictions use the same 0/1 labels
    # as y_test_und_sam.
    y_test_predictions_high_recall = (y_pred4_proba[:, 1] > threshold).astype(int)
    plt.subplot(3, 3, panel)

    # Confusion matrix for this threshold.
    conf_matrix4 = confusion_matrix(y_test_und_sam, y_test_predictions_high_recall)

    print("Recall metric in the testing dataset:%.3f"%(conf_matrix4[1,1]/(conf_matrix4[1,0]+conf_matrix4[1,1])))

    # Plot it; the title states the comparison actually used (">", not ">=").
    class_names = [0, 1]
    plot_confusion_matrix(conf_matrix4,
                          classes=class_names,
                          title='Threshold >%.2f' % threshold)
plt.show()
Recall metric in the testing dataset:1.000
Recall metric in the testing dataset:0.993
Recall metric in the testing dataset:0.993
Recall metric in the testing dataset:0.961
Recall metric in the testing dataset:0.895
Recall metric in the testing dataset:0.863
Recall metric in the testing dataset:0.850
Recall metric in the testing dataset:0.804
Recall metric in the testing dataset:0.614

在这里插入图片描述

SMOTE采样
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# Reload the raw data for the SMOTE experiment and preview it.
credit_cards = pd.read_csv('creditcard.csv')
credit_cards.head()
TimeV1V2V3V4V5V6V7V8V9...V21V22V23V24V25V26V27V28AmountClass
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.620
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.690
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.660
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.500
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.990

5 rows × 31 columns

columns=credit_cards.columns
columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
# Every column except the last one is used as a feature.
features_columns = columns[:-1]
features_columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')
# Feature frame for the SMOTE experiment.
features = credit_cards.loc[:, features_columns]
features.head()
TimeV1V2V3V4V5V6V7V8V9...V20V21V22V23V24V25V26V27V28Amount
00.0-1.359807-0.0727812.5363471.378155-0.3383210.4623880.2395990.0986980.363787...0.251412-0.0183070.277838-0.1104740.0669280.128539-0.1891150.133558-0.021053149.62
10.01.1918570.2661510.1664800.4481540.060018-0.082361-0.0788030.085102-0.255425...-0.069083-0.225775-0.6386720.101288-0.3398460.1671700.125895-0.0089830.0147242.69
21.0-1.358354-1.3401631.7732090.379780-0.5031981.8004990.7914610.247676-1.514654...0.5249800.2479980.7716790.909412-0.689281-0.327642-0.139097-0.055353-0.059752378.66
31.0-0.966272-0.1852261.792993-0.863291-0.0103091.2472030.2376090.377436-1.387024...-0.208038-0.1083000.005274-0.190321-1.1755750.647376-0.2219290.0627230.061458123.50
42.0-1.1582330.8777371.5487180.403034-0.4071930.0959210.592941-0.2705330.817739...0.408542-0.0094310.798278-0.1374580.141267-0.2060100.5022920.2194220.21515369.99

5 rows × 30 columns

# Hold out 20% of the rows for testing, then over-sample only the training
# part with SMOTE.
labels = credit_cards['Class']
split_1 = train_test_split(features, labels, test_size=0.2)
X_train_1, X_test_1, y_train_1, y_test_1 = split_1

oversampler = SMOTE()
X_train_1_os, y_train_1_os = oversampler.fit_resample(X_train_1, y_train_1)
# Number of Class == 1 rows after over-sampling.
int((y_train_1_os == 1).sum())
227449
print(type(X_train_1_os))
<class 'pandas.core.frame.DataFrame'>
print(type(y_train_1_os))
<class 'pandas.core.series.Series'>
# Compute the recall obtained with different regularisation parameters C.
def printing_Kfold_scores(X_data,y_data):
    """Print per-fold recall for several values of C and return the best C.

    Each candidate C is evaluated with an L1-penalised logistic regression
    under 10-fold stratified cross-validation on the given training data.

    X_data: feature DataFrame (rows are selected with .iloc).
    y_data: label DataFrame (rows are selected with .iloc[rows, :]).

    Returns the candidate C with the highest mean recall. On a tie the
    first, i.e. smallest, such C is returned.
    """
    # Candidate values of C, in increasing order.
    c_params = [0.01, 0.1, 1, 10, 100]

    # Mean recall of every candidate. The list is created once, outside the
    # loop over C, so that all candidates can be compared at the end.
    recall_scores_mean = []

    for c in c_params:
        print('-------------------------------------------')
        print('C parameter: ', c)
        print('-------------------------------------------')
        print('')

        recall_scores = []

        # split() is called again for every C so each candidate gets a fresh
        # iterator over the folds; train/test are positions inside X_data.
        folds = StratifiedKFold(n_splits=10).split(X_data, y_data)
        for k, (train, test) in enumerate(folds):
            lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')

            # Fit on the training part of this fold.
            lr.fit(X_data.iloc[train, :].values, y_data.iloc[train, :].values.ravel())

            # Predict the held-out part of this fold.
            y_fold_pred = lr.predict(X_data.iloc[test, :].values)

            # Recall on the held-out part.
            score = recall_score(y_data.iloc[test, :].values, y_fold_pred)
            recall_scores.append(score)
            print('第%d折'%k,":recall_score=%.3f"%score)

        # Report and store the mean recall for this C.
        mean_recall = np.mean(recall_scores)
        print('')
        print("mean recall score:%.3f"%mean_recall)
        print('')
        recall_scores_mean.append(mean_recall)

    # Pick the C whose mean recall is highest (not the recall value itself).
    best_c = c_params[int(np.argmax(recall_scores_mean))]
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter =%.3f '%best_c)
    print('*********************************************************************************')

    return best_c

        
    
# Convert the label Series to a DataFrame: the cross-validation helper
# selects label rows with .iloc[rows, :], which needs two dimensions.
y_train_1_os = pd.DataFrame(y_train_1_os)
best_c = printing_Kfold_scores(X_train_1_os,y_train_1_os)
-------------------------------------------
C parameter:  0.01
-------------------------------------------

第0折 :recall_score=0.964
第1折 :recall_score=0.966
第2折 :recall_score=0.968
第3折 :recall_score=0.969
第4折 :recall_score=0.965
第5折 :recall_score=0.965
第6折 :recall_score=0.968
第7折 :recall_score=0.966
第8折 :recall_score=0.966
第9折 :recall_score=0.967

mean recall score:0.966

-------------------------------------------
C parameter:  0.1
-------------------------------------------

第0折 :recall_score=0.965
第1折 :recall_score=0.967
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.967
第5折 :recall_score=0.966
第6折 :recall_score=0.970
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969

mean recall score:0.968

-------------------------------------------
C parameter:  1
-------------------------------------------

第0折 :recall_score=0.965
第1折 :recall_score=0.966
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.966
第5折 :recall_score=0.967
第6折 :recall_score=0.969
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969

mean recall score:0.968

-------------------------------------------
C parameter:  10
-------------------------------------------

第0折 :recall_score=0.965
第1折 :recall_score=0.967
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.967
第5折 :recall_score=0.966
第6折 :recall_score=0.970
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969

mean recall score:0.968

-------------------------------------------
C parameter:  100
-------------------------------------------

第0折 :recall_score=0.965
第1折 :recall_score=0.967
第2折 :recall_score=0.969
第3折 :recall_score=0.970
第4折 :recall_score=0.967
第5折 :recall_score=0.966
第6折 :recall_score=0.970
第7折 :recall_score=0.967
第8折 :recall_score=0.968
第9折 :recall_score=0.969

mean recall score:0.968

*********************************************************************************
Best model to choose from cross validation is with C parameter =0.968 
*********************************************************************************
# Train on the SMOTE-balanced training data, predict the untouched test set.
lr5 = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr5.fit(X_train_1_os.values, y_train_1_os.values.ravel())
y_pred_5 = lr5.predict(X_test_1.values)

conf_matrix5 = confusion_matrix(y_test_1, y_pred_5)


print("Recall metric in the testing dataset:%.3f"%(conf_matrix5[1,1]/(conf_matrix5[1,0]+conf_matrix5[1,1])))

# Title typo "Consusion" corrected.
class_names = [0, 1]
plot_confusion_matrix(conf_matrix5,
                      classes=class_names,
                      title='Confusion Matrix')
plt.show()
Recall metric in the testing dataset:0.895

在这里插入图片描述


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
PyTorch 是一种基于Python的开源机器学习框架,可以用于信用卡欺诈检测。信用卡欺诈是一种严重的问题,给消费者和金融机构带来了巨大的损失。 利用 PyTorch 可以构建深度学习模型来识别和预测信用卡欺诈行为。在使用 PyTorch 之前,我们需要准备一个包含真实数据集的样本集,其中包含标记为欺诈和非欺诈的交易。我们可以使用监督学习算法,例如神经网络,来训练模型,以使其能够自动识别欺诈行为。 首先,我们需要将数据集划分为训练集和测试集。然后,我们可以使用 PyTorch 中的数据加载器将数据加载到模型中。接下来,我们可以定义一个包含多个隐藏层的深度神经网络模型,并选择适当的损失函数和优化算法。 通过训练模型,我们可以使其学习如何从输入特征中提取有关交易的相关信息,并对其进行分类。训练过程中,我们可以使用交叉验证技术来评估模型的性能和准确性。 在完成模型训练后,我们可以使用测试集来评估模型的性能。通过比较模型对测试集中交易的分类结果与实际的标签,我们可以计算出模型的准确率、精确度、召回率等指标,从而评估模型对于信用卡欺诈检测的效果。 总之,利用 PyTorch,我们可以构建一个强大的深度学习模型,通过对信用卡交易数据进行训练和测试,预测和识别潜在的信用卡欺诈行为。这有助于金融机构和消费者及时发现欺诈行为,减少损失,并提高金融系统的安全性。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值