Hands-On Case Study: Credit Card Fraud Detection

# Observing the data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Load the data and look at the first five rows

data = pd.read_csv("creditcard.csv")
data.head()

[Figure: first five rows of creditcard.csv]

The data has 31 columns: Time, V1-V28, Amount, and Class. Note the last column, Class: this is our label, where 0 means a normal transaction and 1 means fraud. As usual, let's start by plotting the class distribution.



count_classes = data['Class'].value_counts(sort = True).sort_index()
count_classes.plot(kind = 'bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

[Figure: fraud class histogram]

Class = 0 accounts for roughly 280,000 records, while fraud (Class = 1) is extremely rare; the distribution is highly imbalanced.
There are usually two ways to handle this:
1. Oversampling (generate more class-1 samples until they match the number of class-0 samples);
2. Undersampling (take a subset of class 0 equal in size to class 1).

# Standardization
Compared with the other features, Amount has a much larger value range; it clearly has not been standardized yet. So we standardize this column first:

from sklearn.preprocessing import StandardScaler
# Standardize the Amount column; reshape(-1, 1) turns the 1-D column into the 2-D array sklearn expects
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time','Amount'],axis=1) # drop the two columns we no longer need, giving a new dataset
data.head()


[Figure: data.head() after adding normAmount and dropping Time and Amount]

At this point all feature columns have been standardized.

Random undersampling
Undersampling is the simpler of the two, so we start with it. First, separate the features from the labels:

X = data.loc[:, data.columns != 'Class'] # features: every column except Class
y = data.loc[:, data.columns == 'Class'] # label
To keep the sampled data representative of the original distribution, we undersample at random:

# Random undersampling
# Count the fraud records (Class == 1) and grab their indices
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Indices of the normal records (Class == 0)
normal_indices = data[data.Class == 0].index

# Randomly pick as many normal indices as there are fraud records (replace=False means sampling without replacement)
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)  # convert to a numpy array

# Concatenate the two groups of indices into one
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])

# Build the undersampled dataset from those indices
under_sample_data = data.iloc[under_sample_indices,:]

# Split the undersampled data into features and labels
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the class proportions
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

[Output: class proportions and total size of the undersampled dataset]

Splitting the data
Split the dataset into a training set and a test set:

from sklearn.model_selection import train_test_split

# Split the full dataset; the fixed random_state keeps the split reproducible for tuning
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)  # hold out 30% as the test set

print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ",  len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))

# Split the undersampled dataset the same way
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample,y_undersample,test_size = 0.3,random_state = 0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))


[Output: sizes of the full and undersampled train/test splits]

Evaluating the model
Before building anything, let's think about which parameters to tune and, more importantly, what to use as the evaluation metric.

TP (true positives): positive examples correctly classified as positive
FN (false negatives): positive examples incorrectly classified as negative
FP (false positives): negative examples incorrectly classified as positive
TN (true negatives): negative examples correctly classified as negative

Since we want to find as many of the fraudulent transactions as possible, the key metric here is recall:
Recall = TP / (TP + FN)

For example, suppose 1,000 credit card records contain 10 fraudulent ones. Unlike accuracy, recall only cares about those 10: if the model finds 3 of them, recall is 3/10 = 0.3.
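To make this concrete, here is a minimal sketch with made-up toy labels that reproduces the 0.3 recall (and shows where TN/FP/FN/TP sit in scikit-learn's confusion matrix):

import numpy as np
from sklearn.metrics import confusion_matrix, recall_score

# Toy labels: 1,000 transactions, the first 10 are fraud (the positive class)
y_true = np.zeros(1000, dtype=int)
y_true[:10] = 1

# A hypothetical model that only catches 3 of the 10 frauds and flags nothing else
y_pred = np.zeros(1000, dtype=int)
y_pred[:3] = 1

print(confusion_matrix(y_true, y_pred))  # [[TN, FP], [FN, TP]] = [[990, 0], [7, 3]]
print(recall_score(y_true, y_pred))      # TP / (TP + FN) = 3 / (3 + 7) = 0.3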

Modeling
Now we can build the model. We often don't know in advance which parameter value works best, so the simplest approach is to write a helper that tries each candidate and prints the results, then compare and pick.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report 
# Train the model: instantiate logistic regression with different regularization strengths,
# use cross-validation to find the best parameter, and print every result

def printing_Kfold_scores(x_train_data,y_train_data):

    fold = KFold(n_splits=5, shuffle=False) # 5-fold cross-validation

    # Regularization strengths to try (C controls the penalty and thus overfitting)
    c_param_range = [0.01,0.1,1,10,100]

    results_table = pd.DataFrame(index = range(len(c_param_range)), columns = ['C_parameter','Mean recall score'])
    results_table['C_parameter'] = c_param_range

    # each k-fold split yields two index arrays: training indices and validation indices
    j = 0
    # outer loop: iterate over the regularization parameter
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        # inner loop: iterate over the cross-validation folds
        for iteration, (train_idx, val_idx) in enumerate(fold.split(x_train_data), start=1):

            # Build a logistic regression model with L1 regularization and strength C
            lr = LogisticRegression(C = c_param, penalty = 'l1', solver = 'liblinear')

            # Fit on the training part of this fold
            lr.fit(x_train_data.iloc[train_idx,:],y_train_data.iloc[train_idx,:].values.ravel())

            # Predict on the validation part of this fold
            y_pred_undersample = lr.predict(x_train_data.iloc[val_idx,:].values)

            # Compute recall from the predicted and true labels, and print it
            recall_acc = recall_score(y_train_data.iloc[val_idx,:].values,y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)

        # Record and print the mean recall across the folds
        results_table.loc[j,'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')

    best_c = results_table.loc[results_table['Mean recall score'].astype(float).idxmax()]['C_parameter']

    # Finally, report the best value of C found by cross-validation
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c
best_c = printing_Kfold_scores(X_train_undersample,y_train_undersample)
– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 0.01 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.958904109589 
Iteration 2 : recall score = 0.917808219178 
Iteration 3 : recall score = 1.0 
Iteration 4 : recall score = 0.972972972973 
Iteration 5 : recall score = 0.954545454545

Mean recall score 0.960846151257

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 0.1 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.835616438356 
Iteration 2 : recall score = 0.86301369863 
Iteration 3 : recall score = 0.915254237288 
Iteration 4 : recall score = 0.932432432432 
Iteration 5 : recall score = 0.878787878788

Mean recall score 0.885020937099

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 1 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.835616438356 
Iteration 2 : recall score = 0.86301369863 
Iteration 3 : recall score = 0.966101694915 
Iteration 4 : recall score = 0.945945945946 
Iteration 5 : recall score = 0.893939393939

Mean recall score 0.900923434357

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 10 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.849315068493 
Iteration 2 : recall score = 0.86301369863 
Iteration 3 : recall score = 0.966101694915 
Iteration 4 : recall score = 0.959459459459 
Iteration 5 : recall score = 0.893939393939

Mean recall score 0.906365863087

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 100 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.86301369863 
Iteration 2 : recall score = 0.86301369863 
Iteration 3 : recall score = 0.966101694915 
Iteration 4 : recall score = 0.959459459459 
Iteration 5 : recall score = 0.893939393939

Mean recall score 0.909105589115

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
Best model to choose from cross validation is with C parameter = 0.01 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
From the results above, the best mean recall so far is about 0.96, obtained with C = 0.01.
Next, let's draw a more intuitive confusion matrix plot.

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a (non-normalized) confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Write the count into each cell, white text on dark cells and black on light cells
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
import itertools
lr = LogisticRegression(C = best_c, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()


[Figure: confusion matrix on the undersampled test set]

The plot makes it clear at a glance: the model caught 138 of the true fraud cases, missed 9, and incorrectly flagged 17 normal transactions. Recall reaches about 0.93, which looks pretty good. But is this the result we actually want? Not quite: this confusion matrix was computed on the undersampled data.
Next, let's draw the confusion matrix on the original (full) test data and see how it looks:

lr = LogisticRegression(C = best_c, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)

# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()


[Figure: confusion matrix on the full test set]

Here the model shows a serious problem: the number of false positives jumps to 10,318, which would have a major impact on the business. Why does this happen? The model was built from the undersampled data, which simply contains too few samples, both normal and fraudulent, so its view of the data is limited; this kind of behavior is not surprising.

So how do we solve this problem?
And what if we had done no resampling at all at the start, could we still get a good result?

best_c = printing_Kfold_scores(X_train,y_train)
– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 0.01 
– – – – – – – – – – – – – – – – – – – – – - 
Iteration 1 : recall score = 0.492537313433 
Iteration 2 : recall score = 0.602739726027 
Iteration 3 : recall score = 0.683333333333 
Iteration 4 : recall score = 0.569230769231 
Iteration 5 : recall score = 0.45

Mean recall score 0.559568228405

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 0.1 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.567164179104 
Iteration 2 : recall score = 0.616438356164 
Iteration 3 : recall score = 0.683333333333 
Iteration 4 : recall score = 0.584615384615 
Iteration 5 : recall score = 0.525

Mean recall score 0.595310250644

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 1 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.55223880597 
Iteration 2 : recall score = 0.616438356164 
Iteration 3 : recall score = 0.716666666667 
Iteration 4 : recall score = 0.615384615385 
Iteration 5 : recall score = 0.5625

Mean recall score 0.612645688837

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 10 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.55223880597 
Iteration 2 : recall score = 0.616438356164 
Iteration 3 : recall score = 0.733333333333 
Iteration 4 : recall score = 0.615384615385 
Iteration 5 : recall score = 0.575

Mean recall score 0.61847902217

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 100 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.55223880597 
Iteration 2 : recall score = 0.616438356164 
Iteration 3 : recall score = 0.733333333333 
Iteration 4 : recall score = 0.615384615385 
Iteration 5 : recall score = 0.575

Mean recall score 0.61847902217

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
Best model to choose from cross validation is with C parameter = 10.0 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
As you can see, building the model directly on the extremely imbalanced data gives poor results across the board, so preprocessing the data really is necessary.
The data determines the upper bound of what a model can achieve; the parameters determine the lower bound.

Let's look at its confusion matrix as well:

lr = LogisticRegression(C = best_c, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train,y_train.values.ravel())
y_pred_undersample = lr.predict(X_test.values)


cnf_matrix = confusion_matrix(y_test,y_pred_undersample)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))


class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()


[Figure: confusion matrix for the model trained on the imbalanced data]

From the result we can see that there are far fewer false positives, but many fraud cases are now missed.

So far we have used 0.5, the default threshold applied to the sigmoid output (the predicted probability). What happens to the results if we choose the threshold ourselves?

lr = LogisticRegression(C = 0.01, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)  # predict probabilities instead of hard labels
# Candidate thresholds to try
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

plt.figure(figsize=(10,10))

# Compare the predicted probability of fraud against each threshold
j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:,1] > i

    # Draw a 3x3 grid of subplots
    plt.subplot(3,3,j)
    j += 1

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample,y_test_predictions_high_recall)
    np.set_printoptions(precision=2)

    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot non-normalized confusion matrix
    class_names = [0,1]
    plot_confusion_matrix(cnf_matrix
                          , classes=class_names
                          , title='Threshold >= %s'%i) 
Recall metric in the testing dataset: 1.0 
Recall metric in the testing dataset: 1.0 
Recall metric in the testing dataset: 1.0 
Recall metric in the testing dataset: 0.986394557823 
Recall metric in the testing dataset: 0.925170068027 
Recall metric in the testing dataset: 0.863945578231 
Recall metric in the testing dataset: 0.829931972789 
Recall metric in the testing dataset: 0.748299319728 
Recall metric in the testing dataset: 0.585034013605 


[Figure: 3x3 grid of confusion matrices for thresholds 0.1 through 0.9]

When the threshold is 0.1 to 0.3, recall is 1: the model flags almost everything as fraud, so no fraud slips through, but at the cost of a huge number of false alarms. As the threshold grows, the criterion for calling something fraud becomes stricter and recall falls. In practice you have to weigh these trade-offs against the actual business requirements and pick the threshold with the lowest overall cost.
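As one way to make that trade-off explicit, here is a minimal sketch that scores each threshold with a made-up cost function. It reuses thresholds, y_pred_undersample_proba and y_test_undersample from the code above; the per-false-negative and per-false-positive costs are purely illustrative assumptions, not values from the original analysis:

from sklearn.metrics import confusion_matrix

# Hypothetical costs: we assume missing a fraud is far more expensive than a false alarm
COST_FN = 100  # assumed cost of an undetected fraud
COST_FP = 1    # assumed cost of blocking a normal transaction

for t in thresholds:
    preds = y_pred_undersample_proba[:, 1] > t
    cm = confusion_matrix(y_test_undersample, preds)
    fp, fn = cm[0, 1], cm[1, 0]
    print("threshold %.1f -> FP=%d FN=%d total cost=%d" % (t, fp, fn, fn * COST_FN + fp * COST_FP))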

Oversampling: the SMOTE sample-generation strategy
Since undersampling has this limitation of so many false positives, what about oversampling?
Oversampling immediately raises a question: how do we generate the extra samples?
The standard answer in machine learning is the SMOTE (Synthetic Minority Over-sampling Technique) strategy:


The idea: decide how much you want to multiply the minority class, say from 100 samples to 500. For each minority sample x, find its k nearest minority-class neighbours, pick one of them, take the difference between that neighbour and x, multiply the difference by a random number between 0 and 1, and add the result back to x: x_new = x + rand(0, 1) × (x_neighbour − x). Each synthetic point is therefore a small perturbation of an existing sample towards one of its neighbours.
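A minimal sketch of that generation step, using a tiny made-up minority set just to illustrate the formula (the real work below is done by imblearn):

import numpy as np
from sklearn.neighbors import NearestNeighbors

# Tiny made-up minority-class matrix (5 samples, 2 features)
minority = np.array([[1.0, 1.0], [1.2, 0.9], [0.8, 1.1], [1.1, 1.3], [0.9, 0.8]])

k = 3
nn = NearestNeighbors(n_neighbors=k + 1).fit(minority)  # +1 because each point is its own nearest neighbour

x = minority[0]
_, idx = nn.kneighbors([x])
neighbor = minority[np.random.choice(idx[0][1:])]  # pick one of the k nearest neighbours (skip x itself)

delta = np.random.rand()                  # random number in [0, 1)
x_new = x + delta * (neighbor - x)        # synthetic sample lies on the segment between x and its neighbour
print(x_new)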

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
credit_cards=pd.read_csv('creditcard.csv')

columns=credit_cards.columns
# The last column is Class; drop it to get the feature columns
features_columns=columns.delete(len(columns)-1)

features=credit_cards[features_columns]
labels=credit_cards['Class']
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=0)
Generate the new samples:

oversampler=SMOTE(random_state=0)
os_features,os_labels=oversampler.fit_resample(features_train,labels_train)
Check how many minority samples we have now:

len(os_labels[os_labels==1])
227454
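As a quick sanity check (a small sketch using the os_labels variable from above), the two classes should now be the same size:

print(len(os_labels[os_labels == 1]), len(os_labels[os_labels == 0]))  # both classes should now have the same count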

os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_Kfold_scores(os_features,os_labels)
– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 0.01 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.890322580645 
Iteration 2 : recall score = 0.894736842105 
Iteration 3 : recall score = 0.968794954078 
Iteration 4 : recall score = 0.957760411514 
Iteration 5 : recall score = 0.958266011585

Mean recall score 0.933976159985

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 0.1 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.890322580645 
Iteration 2 : recall score = 0.894736842105 
Iteration 3 : recall score = 0.970432665708 
Iteration 4 : recall score = 0.960046603137 
Iteration 5 : recall score = 0.957650498456

Mean recall score 0.93463783801

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 1 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.890322580645 
Iteration 2 : recall score = 0.894736842105 
Iteration 3 : recall score = 0.970432665708 
Iteration 4 : recall score = 0.960321385784 
Iteration 5 : recall score = 0.960750046713

Mean recall score 0.935312704191

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 10 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.890322580645 
Iteration 2 : recall score = 0.894736842105 
Iteration 3 : recall score = 0.970499059422 
Iteration 4 : recall score = 0.960211472725 
Iteration 5 : recall score = 0.96009056836

Mean recall score 0.935172104652

– – – – – – – – – – – – – – – – – – – – – - 
C parameter: 100 
– – – – – – – – – – – – – – – – – – – – – -

Iteration 1 : recall score = 0.890322580645 
Iteration 2 : recall score = 0.894736842105 
Iteration 3 : recall score = 0.970543321899 
Iteration 4 : recall score = 0.960398324925 
Iteration 5 : recall score = 0.956903089656

Mean recall score 0.934580831846

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
Best model to choose from cross validation is with C parameter = 1.0 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

lr = LogisticRegression(C = best_c, penalty = 'l1', solver = 'liblinear')
lr.fit(os_features,os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

# Compute confusion matrix
cnf_matrix = confusion_matrix(labels_test,y_pred)
np.set_printoptions(precision=2)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()


[Figure: confusion matrix for the SMOTE-trained model on the test set]

Comparing with undersampling, the false-positive rate is clearly much lower; in other words, the oversampling strategy gives the best model here.



