传送门:【风控实践】信用卡欺诈检测(上)
针对不平衡数据,采用过采样(上采样)方法中的SMOTE算法进行分析。
处理数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
return f(*args, **kwds)
from sklearn.preprocessing import StandardScaler

# Load the raw transactions, add a standardised copy of the Amount column
# as 'normAmount', then drop the raw 'Amount' and 'Time' columns.
data = pd.read_csv('./creditcard.csv')
data = data.assign(
    normAmount=StandardScaler().fit_transform(data[['Amount']].values)
).drop(['Amount', 'Time'], axis=1)
# Notebook-style inspection of the resulting frame.
data.shape, data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 30 columns):
V1 284807 non-null float64
V2 284807 non-null float64
V3 284807 non-null float64
V4 284807 non-null float64
V5 284807 non-null float64
V6 284807 non-null float64
V7 284807 non-null float64
V8 284807 non-null float64
V9 284807 non-null float64
V10 284807 non-null float64
V11 284807 non-null float64
V12 284807 non-null float64
V13 284807 non-null float64
V14 284807 non-null float64
V15 284807 non-null float64
V16 284807 non-null float64
V17 284807 non-null float64
V18 284807 non-null float64
V19 284807 non-null float64
V20 284807 non-null float64
V21 284807 non-null float64
V22 284807 non-null float64
V23 284807 non-null float64
V24 284807 non-null float64
V25 284807 non-null float64
V26 284807 non-null float64
V27 284807 non-null float64
V28 284807 non-null float64
Class 284807 non-null int64
normAmount 284807 non-null float64
dtypes: float64(29), int64(1)
memory usage: 65.2 MB
((284807, 30), None)
交叉验证与调参
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold,cross_val_score
from sklearn.metrics import (confusion_matrix,recall_score,
classification_report)
pass
# Choose the LogisticRegression regularisation strength C by 5-fold cross-validated recall.
def printing_Kfold_scores(X_train_data, Y_train_data):
    """Grid-search ``C`` for an L1-penalised logistic regression by mean recall.

    For every candidate ``C`` the rows are split into 5 consecutive
    (unshuffled) folds. A model is fitted on four folds and its recall is
    measured on the held-out fold. Per-fold and mean recall are printed.

    Args:
        X_train_data: Feature DataFrame; rows are selected positionally
            with ``.iloc``.
        Y_train_data: Label DataFrame with the same number of rows. It is
            sliced with ``.iloc[rows, :]`` and flattened, so it is expected
            to hold a single label column.

    Returns:
        The candidate ``C`` with the highest mean recall (the first such
        candidate if several tie).
    """
    # Imported locally: the old ``sklearn.cross_validation`` module was
    # removed in scikit-learn 0.20; ``model_selection`` is its replacement.
    from sklearn.model_selection import KFold

    fold = KFold(n_splits=5, shuffle=False)
    print(fold)

    c_param_range = [0.01, 0.1, 1, 10, 100]
    mean_recalls = []

    for c_param in c_param_range:
        print('c_param:', c_param)
        recall_accs = []

        # start=1 so the printed fold number is 1-based.
        for iteration, (train_idx, test_idx) in enumerate(fold.split(X_train_data), start=1):
            # liblinear supports the L1 penalty; the modern default solver
            # (lbfgs) does not, so the solver is named explicitly.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(X_train_data.iloc[train_idx, :].values,
                   Y_train_data.iloc[train_idx, :].values.ravel())

            Y_pred_undersample = lr.predict(X_train_data.iloc[test_idx, :].values)
            recall_acc = recall_score(Y_train_data.iloc[test_idx, :].values.ravel(),
                                      Y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration:', iteration, 'recall_acc:', recall_acc)

        # Mean recall over the five folds for this C.
        mean_recall = np.mean(recall_accs)
        print('Mean recall score', mean_recall)
        mean_recalls.append(mean_recall)

    # Building the table from complete lists gives a float64 score column
    # directly, so idxmax() works without a dtype conversion.
    results_table = pd.DataFrame({
        'C_Parameter': c_param_range,
        'Mean recall score': mean_recalls,
    })
    best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_Parameter']
    print('best_c is :', best_c)
    return best_c
# Run the C search on the under-sampled training split and keep the best C.
best_c = printing_Kfold_scores(X_train_under_sample,
Y_train_under_sample)
sklearn.cross_validation.KFold(n=688,
n_folds=5,
shuffle=False,
random_state=None)
c_param: 0.01
Iteration: 1 recall_acc: 0.931506849315
Iteration: 2 recall_acc: 0.917808219178
Iteration: 3 recall_acc: 1.0
Iteration: 4 recall_acc: 0.959459459459
Iteration: 5 recall_acc: 0.954545454545
Mean recall score 0.9526639965
c_param: 0.1
Iteration: 1 recall_acc: 0.835616438356
Iteration: 2 recall_acc: 0.86301369863
Iteration: 3 recall_acc: 0.915254237288
Iteration: 4 recall_acc: 0.918918918919
Iteration: 5 recall_acc: 0.893939393939
Mean recall score 0.885348537427
c_param: 1
Iteration: 1 recall_acc: 0.849315068493
Iteration: 2 recall_acc: 0.890410958904
Iteration: 3 recall_acc: 0.966101694915
Iteration: 4 recall_acc: 0.945945945946
Iteration: 5 recall_acc: 0.893939393939
Mean recall score 0.90914261244
c_param: 10
Iteration: 1 recall_acc: 0.86301369863
Iteration: 2 recall_acc: 0.904109589041
Iteration: 3 recall_acc: 0.966101694915
Iteration: 4 recall_acc: 0.932432432432
Iteration: 5 recall_acc: 0.909090909091
Mean recall score 0.914949664822
c_param: 100
Iteration: 1 recall_acc: 0.890410958904
Iteration: 2 recall_acc: 0.904109589041
Iteration: 3 recall_acc: 0.983050847458
Iteration: 4 recall_acc: 0.959459459459
Iteration: 5 recall_acc: 0.909090909091
Mean recall score 0.929224352791
best_c is : 0.01
过采样
# SMOTE technique (over-sampling), evaluated with cross-validation.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.metrics import f1_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# NOTE(review): original_Xtrain / original_ytrain / original_Xtest /
# original_ytest and the splitter `sss` are not defined in this section and
# must already exist. They are indexed with integer arrays below, so they are
# presumably NumPy arrays -- confirm against the cell that creates them.

# Over-sampled copy of the whole original training set.
# NOTE(review): presumably meant for fitting a final model after the
# cross-validation below -- it is not used inside the loop.
sm = SMOTE(sampling_strategy='minority', random_state=42)
Xsm_train, ysm_train = sm.fit_resample(original_Xtrain, original_ytrain)

print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))

# Per-fold scores; their means are reported after the loop.
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# Search space. It must be defined before the search object that uses it.
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()
# liblinear handles both the 'l1' and 'l2' penalties in the search space.
rand_log_reg = RandomizedSearchCV(LogisticRegression(solver='liblinear'), log_reg_params, n_iter=4)

# Cross-validating the right way: SMOTE is a pipeline step, so synthetic
# samples are generated from each training fold only and the held-out fold
# stays untouched.
for train, test in sss.split(original_Xtrain, original_ytrain):
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg)
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])

    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)
注意
必须在交叉验证期间而不是在交叉验证之前创建合成数据点。SMOTE occurs "during" cross validation and not "prior" to the cross validation process. Synthetic data are created only for the training set without affecting the validation set.
正确的方式:
错的方式:
回顾一下上次“下采样”的处理
# Split the full frame into features X (every column except 'Class') and
# labels Y (only the 'Class' column, kept as a one-column DataFrame).
label_mask = data.columns == 'Class'
X = data.loc[:, ~label_mask]
Y = data.loc[:, label_mask]
X.shape, Y.shape
((284807, 29), (284807, 1))
# Index labels of the fraud rows (Class == 1) and the normal rows (Class == 0).
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = np.array(data[data.Class == 0].index)
# Number of fraud records; the normal class is sampled down to this size.
number_record_fraud = len(fraud_indices)
# Randomly pick, without replacement, as many normal rows as there are fraud rows.
random_normal_indices = np.array(np.random.choice(normal_indices, number_record_fraud, replace=False))
# Combine the fraud labels and the sampled normal labels.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# These are index *labels* (taken from .index), so select with .loc rather
# than the positional .iloc, which only matches on a default RangeIndex.
under_sample_data = data.loc[under_sample_indices, :]
# Separate the features and the label column of the under-sampled frame.
X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
Y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Inspect the shapes of the under-sampled features and labels.
X_under_sample.shape, Y_under_sample.shape
((984, 29), (984, 1))
# Split the data sets.
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides the same train_test_split.
from sklearn.model_selection import train_test_split

# Split the under-sampled features / labels (70 % train, 30 % test).
(X_train_under_sample, X_test_under_sample,
 Y_train_under_sample, Y_test_under_sample) = train_test_split(X_under_sample,
                                                               Y_under_sample,
                                                               test_size=0.3,
                                                               random_state=0)
# Split the original, unsampled features / labels the same way for later use.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.3,
                                                    random_state=0)
/Applications/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
return f(*args, **kwds)
# Print the shapes of the under-sampled train / test splits; checking these
# regularly helps catch anomalies early.
print(X_train_under_sample.shape,
Y_train_under_sample.shape,
'\n',X_test_under_sample.shape,
Y_test_under_sample.shape)
(688, 29) (688, 1)
(296, 29) (296, 1)
# Print the shapes of the train / test splits of the original, unsampled data.
print(X_train.shape,
Y_train.shape,
'\n',X_test.shape,
Y_test.shape)
(199364, 29) (199364, 1)
(85443, 29) (85443, 1)
通过下采样处理数据得到的逻辑回归模型,虽然recall值挺高的,但FP(假阳性)值非常高,达到8404,也就是误杀率非常高。这也是用下采样处理数据的一个弊端。