【风控实践】信用卡欺诈检测(下)

传送门:【风控实践】信用卡欺诈检测(上)

针对不平衡数据,采用上采样(过采样)的方法——SMOTE算法进行分析。

处理数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
from sklearn.preprocessing import StandardScaler

# Load the raw transactions.
data = pd.read_csv('./creditcard.csv')

# Standardize the 'Amount' column into a new 'normAmount' column.
# Selecting with a list keeps the 2-D (n_rows, 1) shape the scaler expects.
data['normAmount'] = StandardScaler().fit_transform(data[['Amount']].values)

# Drop the raw 'Amount' column (replaced by 'normAmount') and the 'Time' column.
data = data.drop(['Amount', 'Time'], axis=1)

# Notebook-style inspection: shape plus the per-column summary.
data.shape, data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 30 columns):
V1            284807 non-null float64
V2            284807 non-null float64
V3            284807 non-null float64
V4            284807 non-null float64
V5            284807 non-null float64
V6            284807 non-null float64
V7            284807 non-null float64
V8            284807 non-null float64
V9            284807 non-null float64
V10           284807 non-null float64
V11           284807 non-null float64
V12           284807 non-null float64
V13           284807 non-null float64
V14           284807 non-null float64
V15           284807 non-null float64
V16           284807 non-null float64
V17           284807 non-null float64
V18           284807 non-null float64
V19           284807 non-null float64
V20           284807 non-null float64
V21           284807 non-null float64
V22           284807 non-null float64
V23           284807 non-null float64
V24           284807 non-null float64
V25           284807 non-null float64
V26           284807 non-null float64
V27           284807 non-null float64
V28           284807 non-null float64
Class         284807 non-null int64
normAmount    284807 non-null float64
dtypes: float64(29), int64(1)
memory usage: 65.2 MB
((284807, 30), None)

交叉验证与调参

import sklearn
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# is its replacement and is what the rest of this article already imports.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import (confusion_matrix, recall_score,
                             classification_report)


def printing_Kfold_scores(X_train_data, Y_train_data):
    """Pick the logistic-regression C value with the best mean CV recall.

    For each candidate C in [0.01, 0.1, 1, 10, 100] an L1-penalised
    LogisticRegression is fitted on 5 unshuffled folds and scored with
    recall on the held-out fold. Per-fold and mean recall are printed.

    Args:
        X_train_data: pandas DataFrame of features (indexed positionally
            with ``.iloc``).
        Y_train_data: pandas DataFrame of labels with the same number of
            rows; it is flattened with ``.values.ravel()`` before fitting.

    Returns:
        The C value (taken from a float64 column) with the highest mean
        recall. On ties ``idxmax`` returns the first maximum, i.e. the
        earliest candidate in the list.
    """
    fold = KFold(n_splits=5, shuffle=False)
    print(fold)
    c_param_range = [0.01, 0.1, 1, 10, 100]

    mean_recalls = []
    for c_param in c_param_range:
        print('c_param:', c_param)
        recall_accs = []
        # start=1 so the printed fold number is 1-based.
        for iteration, (train_idx, valid_idx) in enumerate(
                fold.split(X_train_data), start=1):
            # liblinear supports the L1 penalty; the modern default solver
            # (lbfgs) does not, so the solver is named explicitly.
            lr = LogisticRegression(C=c_param, penalty='l1',
                                    solver='liblinear')
            # Fit and predict on plain arrays so both calls see the same
            # kind of input.
            lr.fit(X_train_data.iloc[train_idx, :].values,
                   Y_train_data.iloc[train_idx, :].values.ravel())
            Y_pred_undersample = lr.predict(
                X_train_data.iloc[valid_idx, :].values)
            recall_acc = recall_score(
                Y_train_data.iloc[valid_idx, :].values, Y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration:', iteration, 'recall_acc:', recall_acc)

        # Mean recall across the folds for this C value.
        mean_recall = np.mean(recall_accs)
        print('Mean recall score', mean_recall)
        mean_recalls.append(mean_recall)

    # Building the table from complete lists gives float64 columns directly,
    # so no object -> float64 cast is needed before idxmax().
    results_table = pd.DataFrame({'C_Parameter': c_param_range,
                                  'Mean recall score': mean_recalls})
    best_row = results_table['Mean recall score'].idxmax()
    best_c = results_table.loc[best_row, 'C_Parameter']

    print('best_c is :', best_c)
    return best_c

# Run the C-parameter search on the under-sampled training split.
best_c = printing_Kfold_scores(X_train_data=X_train_under_sample,
                               Y_train_data=Y_train_under_sample)
sklearn.cross_validation.KFold(n=688, 
                               n_folds=5, 
                               shuffle=False, 
                               random_state=None)

c_param: 0.01
Iteration: 1 recall_acc: 0.931506849315
Iteration: 2 recall_acc: 0.917808219178
Iteration: 3 recall_acc: 1.0
Iteration: 4 recall_acc: 0.959459459459
Iteration: 5 recall_acc: 0.954545454545
Mean recall score 0.9526639965

c_param: 0.1
Iteration: 1 recall_acc: 0.835616438356
Iteration: 2 recall_acc: 0.86301369863
Iteration: 3 recall_acc: 0.915254237288
Iteration: 4 recall_acc: 0.918918918919
Iteration: 5 recall_acc: 0.893939393939
Mean recall score 0.885348537427

c_param: 1
Iteration: 1 recall_acc: 0.849315068493
Iteration: 2 recall_acc: 0.890410958904
Iteration: 3 recall_acc: 0.966101694915
Iteration: 4 recall_acc: 0.945945945946
Iteration: 5 recall_acc: 0.893939393939
Mean recall score 0.90914261244

c_param: 10
Iteration: 1 recall_acc: 0.86301369863
Iteration: 2 recall_acc: 0.904109589041
Iteration: 3 recall_acc: 0.966101694915
Iteration: 4 recall_acc: 0.932432432432
Iteration: 5 recall_acc: 0.909090909091
Mean recall score 0.914949664822

c_param: 100
Iteration: 1 recall_acc: 0.890410958904
Iteration: 2 recall_acc: 0.904109589041
Iteration: 3 recall_acc: 0.983050847458
Iteration: 4 recall_acc: 0.959459459459
Iteration: 5 recall_acc: 0.909090909091
Mean recall score 0.929224352791

best_c is : 0.01

过采样

# SMOTE technique (over-sampling), applied after the train/test split and
# evaluated with cross-validation.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# NOTE(review): original_Xtrain / original_ytrain / original_Xtest /
# original_ytest and the splitter `sss` are not defined in this section.
# They are indexed as original_Xtrain[train] below, so they are presumably
# NumPy arrays, and `sss` presumably a stratified splitter. Confirm against
# part 1 of the article.

# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names
# for the deprecated `ratio` / `fit_sample`; the pipeline below already uses
# `sampling_strategy`, so this call is made consistent with it.
sm = SMOTE(sampling_strategy='minority', random_state=42)

# Over-sampled copy of the whole training split. It is not used by the
# cross-validation loop below (presumably it is kept for a later final fit).
# Do NOT cross-validate on it: synthetic points would leak into the
# validation folds.
Xsm_train, ysm_train = sm.fit_resample(original_Xtrain, original_ytrain)


print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))

# Per-fold scores; averaged after the loop.
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []  # collected per fold but not printed in this section

# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()

# Search space. It must be defined BEFORE RandomizedSearchCV is constructed
# (the original order raised NameError).
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# liblinear supports both 'l1' and 'l2'; the modern default solver (lbfgs)
# rejects 'l1', which would make half of the sampled candidates fail.
rand_log_reg = RandomizedSearchCV(LogisticRegression(solver='liblinear'), log_reg_params, n_iter=4)


# Implementing the SMOTE technique and cross-validating the right way:
# SMOTE runs inside the pipeline, so it is fitted on the training part of
# each fold only, and the folds themselves are drawn from the original
# (un-resampled) training set.
for train, test in sss.split(original_Xtrain, original_ytrain):
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg)  # SMOTE happens during cross-validation, not before
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])

    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)

注意

必须在交叉验证期间而不是在交叉验证之前创建合成数据点。SMOTE occurs "during" cross validation and not "prior" to the cross validation process. Synthetic data are created only for the training set without affecting the validation set.

正确的方式:

错误的方式:

回顾一下上次“下采样”的处理

# Split the full data set into the label frame (the single 'Class' column)
# and the feature frame (every other column).
Y = data[['Class']]
X = data.drop('Class', axis=1)
X.shape, Y.shape
((284807, 29), (284807, 1))
# Number of fraud records (Class == 1); the same number of normal records
# is drawn below so the under-sampled set is balanced.
number_record_fraud = len(Y[Y.Class == 1])
# Index labels of the fraud (Class == 1) and normal (Class == 0) records.
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = np.array(data[data.Class == 0].index)
# Randomly pick, without replacement, as many normal labels as there are
# fraud records. NOTE: no seed is set in this block, so the selection
# differs between runs unless one is set elsewhere.
random_normal_indices = np.array(np.random.choice(normal_indices, number_record_fraud, replace=False))

# Combine the fraud labels with the sampled normal labels.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# These are index *labels* (taken from .index), so select with .loc.
# .iloc only gave the same rows because the frame has a default RangeIndex.
under_sample_data = data.loc[under_sample_indices, :]

# Separate features and label in the under-sampled set.
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
Y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Inspect the shapes of the under-sampled features and labels.
X_under_sample.shape, Y_under_sample.shape
((984, 29), (984, 1))

# Split the data sets.
# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# function lives in sklearn.model_selection, which this article already
# imports from at the top.
from sklearn.model_selection import train_test_split

# Split the under-sampled features/labels (70% train, 30% test).
X_train_under_sample, X_test_under_sample, Y_train_under_sample, Y_test_under_sample = train_test_split(
    X_under_sample,
    Y_under_sample,
    test_size=0.3,
    random_state=0)
# Split the original, un-resampled features/labels the same way for later use.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.3,
                                                    random_state=0)
/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)

# Check the shapes of the under-sampled train/test splits; doing this
# routinely catches anomalies early.
print('{} {} \n {} {}'.format(X_train_under_sample.shape,
                              Y_train_under_sample.shape,
                              X_test_under_sample.shape,
                              Y_test_under_sample.shape))
(688, 29) (688, 1) 
 (296, 29) (296, 1)

# Check the shapes of the original (un-resampled) train/test splits.
print('{} {} \n {} {}'.format(X_train.shape,
                              Y_train.shape,
                              X_test.shape,
                              Y_test.shape))
(199364, 29) (199364, 1) 
 (85443, 29) (85443, 1)

通过下采样处理数据得到的逻辑回归模型,虽然recall值挺高的,但FP(假阳性,即被误判为欺诈的正常交易)值非常高,达到8404,也就是误杀率非常高。这也是用下采样处理数据的一个弊端。

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值