import pandas as pd
import matplotlib. pyplot as plt
import numpy as np
% matplotlib inline
# Load the transactions; the 'Class' column is the label (1 is treated as
# fraud and 0 as normal further down in this script).
data = pd.read_csv("creditcard.csv")
data.head()

# Count rows per class. Series.value_counts replaces the deprecated
# top-level pd.value_counts helper and returns the same counts.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
1 . 过采样(让1 变得和0 一样多);
2 . 下采样(在0 中取出部分数据,数量与1 一致)
在特征数据中,Amount与其他特征数据的取值范围相比,太大了,应该是还没有标准化。所以,需要先对这一列进行标准化:
from sklearn. preprocessing import StandardScaler
# Standardise 'Amount' into a new 'normAmount' column. A pandas Series has
# no .reshape, so reshape the underlying NumPy array into the
# (n_samples, 1) layout StandardScaler expects, then flatten the result
# back to one value per row.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
).ravel()
# The raw 'Time' and 'Amount' columns are not used as features below.
data = data.drop(['Time', 'Amount'], axis=1)
data.head()
这个时候所有特征数据都已经完成了标准化的操作。
随机下采样
下采样相对简单,所以我们先进行下采样。现在,分别取出特征和标签:
# Split the frame column-wise: every column except 'Class' is a feature;
# the 'Class' column alone (kept as a one-column DataFrame) is the label.
is_label_column = data.columns == 'Class'
X = data.loc[:, ~is_label_column]
y = data.loc[:, is_label_column]
1
2
为了保证拿到的是数据的原始分布,我们采用的是随机的下采样:
# Build a balanced sample: keep every fraud row (Class == 1) and draw an
# equally sized random subset of normal rows (Class == 0) without
# replacement.
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)
normal_indices = data[data.Class == 0].index

random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The values collected above are index *labels*, so select with .loc.
# Positional .iloc only gives the same rows while the frame still has its
# default 0..n-1 index.
under_sample_data = data.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Both ratios are printed as fractions of the resampled frame.
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
数据切分
将数据集切分为训练集和测试集:
from sklearn. model_selection import train_test_split
# 70/30 split of the full data set; random_state pins the shuffle.
full_split = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = full_split

n_train, n_test = len(X_train), len(X_test)
print("Number transactions train dataset: ", n_train)
print("Number transactions test dataset: ", n_test)
print("Total number of transactions: ", n_train + n_test)

# Same 70/30 split, applied to the undersampled data.
undersample_split = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = undersample_split

n_train_under, n_test_under = len(X_train_undersample), len(X_test_undersample)
print("")
print("Number transactions train dataset: ", n_train_under)
print("Number transactions test dataset: ", n_test_under)
print("Total number of transactions: ", n_train_under + n_test_under)
模型效果评估
在建模之前,我们还先考虑一下,选定哪些参数,指定什么作为评估标准?
TP( true positives) :被正确分类的正例个数
FN( false negatives) :被错误分类的负例个数
FP( false positives) :被错误分类的负例个数
TN( true negatives) :被正确分类的负例个数
由于我们是要尽可能将所有信用卡欺诈的数据找出来,所以有个很重要的衡量标准:
召回率:Recall = TP/ ( TP+ FN)
假设1000 条信用卡数据中,有10 条是欺诈数据,召回率有别于准确率,它关注的目标就是这10 条数据,找出3 条,那么召回率为0.3 。
建模
接下来就是建模了,很多时候我们也不知道参数设置为多少比较合适,所以最好的办法写一个脚本让机器分别去跑,我们根据各个模型结果再做选择比较省心。
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report


def printing_Kfold_scores(x_train_data, y_train_data):
    """Pick the LogisticRegression ``C`` with the best mean 5-fold recall.

    For each candidate ``C`` an L1-penalised logistic regression is fitted
    on every training fold and scored with recall on the held-out fold.
    Per-fold and mean scores are printed as the search proceeds.

    Args:
        x_train_data: Feature table supporting positional ``.iloc`` row
            selection (e.g. a pandas DataFrame).
        y_train_data: Label table aligned row-for-row with ``x_train_data``.
            It is indexed as ``.iloc[rows, :]`` and flattened with
            ``ravel()``, so presumably a one-column DataFrame.

    Returns:
        The candidate ``C`` whose mean recall across the folds is highest.
    """
    # Contiguous, unshuffled folds (same split behaviour as before).
    fold = KFold(n_splits=5, shuffle=False)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # One row per candidate C. Keeping the score column as float lets
    # idxmax() work without a dtype conversion.
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': np.nan,
    })

    for j, c_param in enumerate(c_param_range):
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        for iteration, (train_rows, test_rows) in enumerate(fold.split(x_train_data), start=1):
            # The L1 penalty needs a solver that supports it; liblinear is
            # named explicitly so this does not depend on the library default.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Fit and predict on plain arrays so both see the same input type.
            lr.fit(x_train_data.iloc[train_rows, :].values,
                   y_train_data.iloc[train_rows, :].values.ravel())
            y_pred_undersample = lr.predict(x_train_data.iloc[test_rows, :].values)

            recall_acc = recall_score(y_train_data.iloc[test_rows, :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall score = ', recall_acc)

        # Record the fold average once per candidate C.
        mean_recall = np.mean(recall_accs)
        results_table.loc[j, 'Mean recall score'] = mean_recall
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    best_row = results_table['Mean recall score'].idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Search the candidate C values by cross-validation on the undersampled training split.
best_c = printing_Kfold_scores( X_train_undersample, y_train_undersample)
1
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 0.01
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.958904109589
Iteration 2 : recall score = 0.917808219178
Iteration 3 : recall score = 1.0
Iteration 4 : recall score = 0.972972972973
Iteration 5 : recall score = 0.954545454545
Mean recall score 0.960846151257
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 0.1
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.835616438356
Iteration 2 : recall score = 0.86301369863
Iteration 3 : recall score = 0.915254237288
Iteration 4 : recall score = 0.932432432432
Iteration 5 : recall score = 0.878787878788
Mean recall score 0.885020937099
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 1
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.835616438356
Iteration 2 : recall score = 0.86301369863
Iteration 3 : recall score = 0.966101694915
Iteration 4 : recall score = 0.945945945946
Iteration 5 : recall score = 0.893939393939
Mean recall score 0.900923434357
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 10
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.849315068493
Iteration 2 : recall score = 0.86301369863
Iteration 3 : recall score = 0.966101694915
Iteration 4 : recall score = 0.959459459459
Iteration 5 : recall score = 0.893939393939
Mean recall score 0.906365863087
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 100
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.86301369863
Iteration 2 : recall score = 0.86301369863
Iteration 3 : recall score = 0.966101694915
Iteration 4 : recall score = 0.959459459459
Iteration 5 : recall score = 0.893939393939
Mean recall score 0.909105589115
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Best model to choose from cross validation is with C parameter = 0.01
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
由以上结果可以看到,当前最好的值为0.96
接下来,画一个更直观的混淆矩阵图出来
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw ``cm`` as a colour-mapped image with each cell's value on top.

    Args:
        cm: 2-D matrix to display (indexed as ``cm[i, j]``).
        classes: Tick labels used on both axes.
        title: Title placed above the plot.
        cmap: Matplotlib colormap passed to ``imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=0)
    plt.yticks(tick_positions, classes)

    # Cells above half of the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            if cm[row, col] > thresh:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import itertools
# Refit on the undersampled training split with the cross-validated C and
# evaluate on the undersampled test split.
# solver='liblinear' is required for the L1 penalty on current scikit-learn.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall for class 1: cell [1, 1] over the sum of row 1.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()
一目了然的图,可以看到,138 个真实的欺诈被模型找出来了,但是有9 个漏网之鱼,同时有17 个正常数据被误杀。Recall值能达到0.93 ,看起来挺高的,这就是我们要的结果吗?并非如此,这是用的下采样数据计算的混淆矩阵。
接下来,我们用原始数据画出混淆矩阵图,看看结果:
# Train on the undersampled split, then evaluate on the test split of the
# full data set.
# solver='liblinear' is required for the L1 penalty on current scikit-learn.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)

cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Recall for class 1: cell [1, 1] over the sum of row 1.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()
这里我们能看到,模型出现一个很大的问题,误杀数量竟然达到了10318 条,这无疑对业务产生了重大影响。为什么会出现这个问题呢?这是根据下采样模型得到的效果,而在下采样数据中,数据量太少,正常的少,异常的同样也少,样本是有局限的,出现这种情况也很正常。
那么如何解决这个问题呢?
如果我们一开始没有对数据进行任何预处理操作,我们能不能得到好的结果呢?
# Repeat the C search on the training split of the full data set, for comparison.
best_c = printing_Kfold_scores( X_train, y_train)
1
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 0.01
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.492537313433
Iteration 2 : recall score = 0.602739726027
Iteration 3 : recall score = 0.683333333333
Iteration 4 : recall score = 0.569230769231
Iteration 5 : recall score = 0.45
Mean recall score 0.559568228405
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 0.1
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.567164179104
Iteration 2 : recall score = 0.616438356164
Iteration 3 : recall score = 0.683333333333
Iteration 4 : recall score = 0.584615384615
Iteration 5 : recall score = 0.525
Mean recall score 0.595310250644
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 1
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.55223880597
Iteration 2 : recall score = 0.616438356164
Iteration 3 : recall score = 0.716666666667
Iteration 4 : recall score = 0.615384615385
Iteration 5 : recall score = 0.5625
Mean recall score 0.612645688837
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 10
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.55223880597
Iteration 2 : recall score = 0.616438356164
Iteration 3 : recall score = 0.733333333333
Iteration 4 : recall score = 0.615384615385
Iteration 5 : recall score = 0.575
Mean recall score 0.61847902217
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 100
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.55223880597
Iteration 2 : recall score = 0.616438356164
Iteration 3 : recall score = 0.733333333333
Iteration 4 : recall score = 0.615384615385
Iteration 5 : recall score = 0.575
Mean recall score 0.61847902217
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Best model to choose from cross validation is with C parameter = 10.0
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
可以看到,直接用极度不均衡数据建模的话,效果都很差。所以对数据进行预处理是非常有必要的。
数据决定上限,参数决定下限。
我们还是先看看它的混淆矩阵结果:
# Train and evaluate on the full-data split, using the C chosen just above.
# solver='liblinear' is required for the L1 penalty on current scikit-learn.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train.values, y_train.values.ravel())
# NOTE: despite its name, this holds predictions for the full test split.
y_pred_undersample = lr.predict(X_test.values)

cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
np.set_printoptions(precision=2)

# Recall for class 1: cell [1, 1] over the sum of row 1.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()
从结果看到,误杀少了,但是很多欺诈数据没有找出来。
之前我们使用的是Sigmoid函数中默认的阈值:0.5 ,如果我们自己指定阈值,会对结果产生什么影响呢?
# Sweep the decision threshold applied to the class-1 probability and draw
# one confusion matrix per threshold on a 3x3 grid.
# solver='liblinear' is required for the L1 penalty on current scikit-learn.
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(X_train_undersample.values, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

plt.figure(figsize=(10, 10))
np.set_printoptions(precision=2)  # loop-invariant, so set once up front
class_names = [0, 1]

for j, i in enumerate(thresholds, start=1):
    # Column 1 of predict_proba is the probability of class 1.
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i

    plt.subplot(3, 3, j)

    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)

    # Recall for class 1: cell [1, 1] over the sum of row 1.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Threshold >= %s' % i)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 0.986394557823
Recall metric in the testing dataset: 0.925170068027
Recall metric in the testing dataset: 0.863945578231
Recall metric in the testing dataset: 0.829931972789
Recall metric in the testing dataset: 0.748299319728
Recall metric in the testing dataset: 0.585034013605
当阈值为0.1 - 0.3 时,recall值为1 ,说明太过严苛。随着阈值越来越大,模型的要求越来越宽松。这里需要根据实际业务需求,权衡利弊,选定一个代价最低的模型。
过采样- SMOTE样本生成策略
既然下采样有局限性,误杀这么高,那过采样呢?
说到过采样,那么就有个问题,怎么生成数据呢?
在机器学习中,有这么个套路,即SMOTE样本生成策略:
其中k值为要翻的倍数,假设少数类样本为100 ,你想变成500 ,K就取5 。先算x到其他少数类样本的距离,然后找出离它最近的5 个样本,分别得到距离,将这个距离乘上一个0 - 1 之间的随机数,加上样本本身,得到新数据。相当于对样本进行了微调的过程。
import pandas as pd
from imblearn. over_sampling import SMOTE
from sklearn. ensemble import RandomForestClassifier
from sklearn. metrics import confusion_matrix
from sklearn. model_selection import train_test_split
1
2
3
4
5
# Reload the raw data for the oversampling experiment.
credit_cards = pd.read_csv('creditcard.csv')

# Features are every column except the last one; the label is read by name.
# NOTE(review): this assumes 'Class' is the last column of the CSV - confirm.
columns = credit_cards.columns
features_columns = columns[:-1]
features = credit_cards.loc[:, features_columns]
labels = credit_cards.loc[:, 'Class']
1
2
3
4
5
6
7
8
# Hold out 20% of the rows as the test set; random_state=0 pins the shuffle.
features_train, features_test, labels_train, labels_test = train_test_split( features, labels, test_size= 0.2 , random_state= 0 )
1
生成新数据
# Oversample the training split only, so the test split stays untouched.
oversampler = SMOTE(random_state=0)
# fit_resample is the current name of this method; the older fit_sample
# alias is no longer available in imbalanced-learn.
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
1
2
查看下
# Number of label entries equal to 1 after oversampling.
len ( os_labels[ os_labels== 1 ] )
1
227454
# Wrap the resampled data in DataFrames: printing_Kfold_scores selects rows with .iloc.
os_features = pd. DataFrame( os_features)
os_labels = pd. DataFrame( os_labels)
# Search the candidate C values by cross-validation on the oversampled training data.
best_c = printing_Kfold_scores( os_features, os_labels)
1
2
3
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 0.01
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.890322580645
Iteration 2 : recall score = 0.894736842105
Iteration 3 : recall score = 0.968794954078
Iteration 4 : recall score = 0.957760411514
Iteration 5 : recall score = 0.958266011585
Mean recall score 0.933976159985
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 0.1
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.890322580645
Iteration 2 : recall score = 0.894736842105
Iteration 3 : recall score = 0.970432665708
Iteration 4 : recall score = 0.960046603137
Iteration 5 : recall score = 0.957650498456
Mean recall score 0.93463783801
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 1
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.890322580645
Iteration 2 : recall score = 0.894736842105
Iteration 3 : recall score = 0.970432665708
Iteration 4 : recall score = 0.960321385784
Iteration 5 : recall score = 0.960750046713
Mean recall score 0.935312704191
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 10
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.890322580645
Iteration 2 : recall score = 0.894736842105
Iteration 3 : recall score = 0.970499059422
Iteration 4 : recall score = 0.960211472725
Iteration 5 : recall score = 0.96009056836
Mean recall score 0.935172104652
– – – – – – – – – – – – – – – – – – – – – -
C parameter: 100
– – – – – – – – – – – – – – – – – – – – – -
Iteration 1 : recall score = 0.890322580645
Iteration 2 : recall score = 0.894736842105
Iteration 3 : recall score = 0.970543321899
Iteration 4 : recall score = 0.960398324925
Iteration 5 : recall score = 0.956903089656
Mean recall score 0.934580831846
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
Best model to choose from cross validation is with C parameter = 1.0
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# Train on the oversampled training data and evaluate on the untouched
# test split.
# solver='liblinear' is required for the L1 penalty on current scikit-learn.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Fit and predict on plain arrays so both calls see the same input type.
lr.fit(os_features.values, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)

cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)

# Recall for class 1: cell [1, 1] over the sum of row 1.
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Confusion matrix')
plt.show()
看结果,与下采样对比,误杀比例明显小得多,也就是说,当我们用过采样策略,模型效果最好。