在前一篇
> kaggle 欺诈信用卡预测(由浅入深(一)之数据探索及过采样)
我们利用SMOTE过采样和LogisticRegression来预测信用卡欺诈。
现在我们利用样本类别本身的不平衡(只用占绝大多数的正常样本来训练AutoEncoder),用AutoEncoder来对特征进行表达,并将表达后的特征送入LogisticRegression,得到了好于SMOTE+LogisticRegression的效果。文章最后会进行比较。
我们依然直接从代码入手
#引入依赖库
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import missingno
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import random
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE,RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import roc_auc_score,roc_curve,precision_score,auc,precision_recall_curve, \
accuracy_score,recall_score,f1_score,confusion_matrix,classification_report
%matplotlib inline
# Load the credit-card transactions from the working directory.
data = pd.read_csv(filepath_or_buffer="creditcard.csv")
data.shape
# Inspect the class balance of this fraud dataset (0 = normal, 1 = fraud);
# the distribution is heavily imbalanced.
data["Class"].value_counts()
0 284315
1 492
# Data preparation
# Convert "Time" (presumably elapsed seconds -- confirm against the dataset
# description) into whole hours. Vectorized floor division yields the same
# values as applying divmod(x, 3600)[0] row by row, without a Python-level loop.
data['Hour'] = data["Time"] // 3600
Xfraud = data[data["Class"] == 1]
XnonFraud = data[data["Class"] == 0]
# Columns excluded from modelling; the raw "Time" column is replaced by "Hour".
droplist = ['V8', 'V13', 'V15', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28','Time']
data_new = data.drop(droplist, axis = 1)
data_new.shape  # inspect the dimensions of the reduced frame
# Standardize Amount and Hour so they are on a comparable scale.
col = ['Amount','Hour']
sc = StandardScaler()  # initialise the scaler
data_new[col] = sc.fit_transform(data_new[col])  # zero mean, unit variance
data_new.head()
抽取样本进行可视化
# Visualization sample: 2000 randomly drawn normal rows plus every fraud row.
non_fraud = data_new[data_new['Class'] == 0].sample(2000)
fraud = data_new[data_new['Class'] == 1]
# pd.concat replaces DataFrame.append, which was deprecated and then removed
# in pandas 2.0. The combined frame is shuffled and re-indexed so the
# positional indices below line up with the rows of X.
df = pd.concat([non_fraud, fraud]).sample(frac=1).reset_index(drop=True)
X = df.drop(['Class'], axis = 1).values
index1 = df[df.Class==1].index  # positions of fraud rows
index0 = df[df.Class==0].index  # positions of normal rows
from mpl_toolkits.mplot3d import Axes3D
# Project the raw features to three dimensions with t-SNE.
data_tsne = TSNE(n_components=3).fit_transform(X)
fig = plt.figure(figsize=(16,16))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data_tsne[index1,0],data_tsne[index1,1],data_tsne[index1,2], c='r')  # fraud in red
ax.scatter(data_tsne[index0,0],data_tsne[index0,1],data_tsne[index0,2], c='g')  # normal in green
ax.set_zlabel('Z')  # axis labels
ax.set_ylabel('Y')
ax.set_xlabel('X')
我们用TSNE将数据降维到3维,并进行可视化,结果显示,正负样例区分较明显。
# Build the AutoEncoder
import tensorflow as tf
from tensorflow.keras import layers,Input,regularizers,Model,Sequential
# Split features by class; only the normal rows are used to fit the network.
x_norm = data_new[data_new['Class'] == 0].drop('Class',axis=1)
x_fraud = data_new[data_new['Class'] == 1].drop('Class',axis=1)
input_layer = Input(shape=(x_norm.shape[1],))
## encoding part
encoding_dim = 128
# Dense expects an integer unit count; "/" would produce the float 64.0 in
# Python 3, so use floor division for the half-width layers.
half_dim = encoding_dim // 2
encoded = layers.Dense(encoding_dim,activation='tanh',activity_regularizer=regularizers.l1(1e-4))(input_layer)
encoded = layers.Dense(half_dim, activation='relu')(encoded)
## decoding part
decoded = layers.Dense(half_dim, activation='tanh')(encoded)
decoded = layers.Dense(encoding_dim, activation='tanh')(decoded)
# NOTE(review): a ReLU output cannot reproduce negative input values; kept as
# in the original so the reported results stay comparable -- confirm intent.
output_layer = layers.Dense(x_norm.shape[1],activation='relu')(decoded)
autoencoder = Model(input_layer,output_layer)
autoencoder.compile(optimizer="adam", loss="mse")
# Train to reconstruct normal transactions, holding out 20% for validation.
autoencoder.fit(x_norm, x_norm,
                batch_size = 64, epochs = 10,
                shuffle = True, validation_split = 0.20)
特征表达
# Reuse the first three layers of the trained autoencoder (input plus the two
# encoding Dense layers) as a stand-alone feature encoder.
hidden_representation = Sequential()
for encoder_layer in autoencoder.layers[:3]:
    hidden_representation.add(encoder_layer)
# Encode the first 2000 normal rows and every fraud row.
norm_hid_rep = hidden_representation.predict(x_norm[:2000])
fraud_hid_rep = hidden_representation.predict(x_fraud)
# Stack normal rows first, then fraud rows; labels follow the same order.
rep_x = np.concatenate((norm_hid_rep, fraud_hid_rep), axis=0)
y_n = np.zeros(len(norm_hid_rep))
y_f = np.ones(len(fraud_hid_rep))
rep_y = np.concatenate((y_n, y_f))
rep_x.shape
(2492, 64)
将AutoEncoder表达后的特征进行可视化
# Project the encoded features to three dimensions with t-SNE.
data_tsne = TSNE(n_components=3).fit_transform(rep_x)
# rep_x holds the normal rows first, followed by the fraud rows.
n_norm = norm_hid_rep.shape[0]
fig = plt.figure(figsize=(16,16))
ax = fig.add_subplot(111, projection='3d')
# Same colour coding as the plot of the raw features (fraud red, normal green)
# so that the two figures can be compared directly.
ax.scatter(data_tsne[n_norm:,0],data_tsne[n_norm:,1],data_tsne[n_norm:,2], c='r')  # fraud
ax.scatter(data_tsne[:n_norm,0],data_tsne[:n_norm,1],data_tsne[:n_norm,2], c='g')  # normal
ax.set_zlabel('Z')  # axis labels
ax.set_ylabel('Y')
ax.set_xlabel('X')
对比看来,正负样本区分度略微提升。
# Encode the full dataset: every normal row, then every fraud row.
norm_hid_rep = hidden_representation.predict(x_norm)
fraud_hid_rep = hidden_representation.predict(x_fraud)
rep_x = np.concatenate((norm_hid_rep, fraud_hid_rep), axis=0)
# Labels in the same order as rep_x: 0 for normal, 1 for fraud.
y_n = np.zeros(len(norm_hid_rep))
y_f = np.ones(len(fraud_hid_rep))
rep_y = np.concatenate((y_n, y_f))
# Train LogisticRegression directly on the encoded features, without SMOTE.
# The original cell used X_train/y_train/X_test/y_test before defining them;
# split the encoded data first, with the same test_size and random_state as
# the SMOTE experiment in this file.
X_train, X_test, y_train, y_test = train_test_split(rep_x, rep_y, test_size = 0.3, random_state = 2019)
# Hyper-parameter grid
param_grid = {'C': [0.01,0.1, 1, 10, 100, 1000,],
              'penalty': [ 'l1', 'l2']}
# liblinear supports both l1 and l2 penalties; the current default solver
# (lbfgs) rejects l1, which would make half of the grid fail.
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=10)  # 10-fold CV
grid_search.fit(X_train, y_train)  # search on the training split only
best_model = grid_search.best_estimator_
# Predict once and reuse the labels for every metric.
y_pred = best_model.predict(X_test)
print('accuracy_score:',accuracy_score(y_test,y_pred))
# NOTE(review): AUC is computed from hard labels, as in the original; scoring
# predict_proba(X_test)[:, 1] would give the threshold-independent AUC.
print('roc_auc_score:',roc_auc_score(y_test,y_pred))
print('recall_score:',recall_score(y_test,y_pred))
print('precision_score:',precision_score(y_test,y_pred))
accuracy_score: 0.9994616293903538
roc_auc_score: 0.8749472419250836
recall_score: 0.75
precision_score: 0.925
# Per-class precision / recall / F1 of the tuned model on the held-out split.
test_predictions = best_model.predict(X_test)
print(classification_report(y_test, test_predictions))
precision recall f1-score support
0.0 1.00 1.00 1.00 85295
1.0 0.93 0.75 0.83 148
micro avg 1.00 1.00 1.00 85443
macro avg 0.96 0.87 0.91 85443
weighted avg 1.00 1.00 1.00 85443
与前一篇
> kaggle 欺诈信用卡预测(由浅入深(一)之数据探索及过采样)
中的对应结果相比,可以看到,经过AutoEncoder编码之后的特征,建模后效果提升明显。
# Over-sample with SMOTE so that both classes are equally represented.
# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# has been removed.
X_sample,y_sample = SMOTE(random_state=2019).fit_resample(rep_x, rep_y)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size = 0.3, random_state = 2019)
# Hyper-parameter grid
param_grid = {'C': [0.01,0.1, 1, 10, 100, 1000,],
              'penalty': [ 'l1', 'l2']}
# liblinear supports both l1 and l2 penalties; the current default solver
# (lbfgs) rejects l1, which would make half of the grid fail.
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=10)  # 10-fold CV
grid_search.fit(X_train, y_train)  # search on the training split only
best_model = grid_search.best_estimator_
# Predict once and reuse the labels for every metric.
y_pred = best_model.predict(X_test)
print('accuracy_score:',accuracy_score(y_test,y_pred))
# NOTE(review): AUC is computed from hard labels, as in the original; scoring
# predict_proba(X_test)[:, 1] would give the threshold-independent AUC.
print('roc_auc_score:',roc_auc_score(y_test,y_pred))
print('recall_score:',recall_score(y_test,y_pred))
print('precision_score:',precision_score(y_test,y_pred))
accuracy_score: 0.9501667751144564
roc_auc_score: 0.9502462163988626
recall_score: 0.9258285420464638
precision_score: 0.9735312549920742
# Confusion matrix and per-class report for the tuned model on the test split.
test_predictions = best_model.predict(X_test)
confusion_matrix(y_test, test_predictions)
print(classification_report(y_test, test_predictions))
precision recall f1-score support
0.0 0.93 0.97 0.95 85017
1.0 0.97 0.93 0.95 85572
micro avg 0.95 0.95 0.95 170589
macro avg 0.95 0.95 0.95 170589
weighted avg 0.95 0.95 0.95 170589
与前一篇
> kaggle 欺诈信用卡预测(由浅入深(一)之数据探索及过采样)
中SMOTE+LogisticRegression的结果相比,总体上看,同样经过SMOTE过采样之后,AutoEncoder+LogisticRegression效果更好。
再来看recall-precision曲线
在SMOTE+LogisticRegression效果如下:
from itertools import cycle
colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal', 'red', 'yellow', 'green', 'blue','black'])
y_pred_proba = best_model.predict_proba(X_test)  # class probabilities, one column per class
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]  # decision thresholds to compare
plt.figure(figsize=(12,7))
for i,color in zip(thresholds,colors):
    # Hard labels obtained by cutting the positive-class probability at i.
    y_test_predictions_prob = y_pred_proba[:,1] > i
    # The thresholds returned by precision_recall_curve are discarded so the
    # list being iterated is not rebound inside the loop.
    precision, recall, _ = precision_recall_curve(y_test, y_test_predictions_prob)
    area = auc(recall, precision)
    # Plot Precision-Recall curve
    plt.plot(recall, precision, color=color,
             label='Threshold: %s, AUC=%0.5f' %(i , area))
# Axis setup is loop-invariant, so it is applied once after all curves are drawn.
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
从recall-precision曲线来看,依然是AutoEncoder+LogisticRegression效果更好。
在下一篇中,来尝试下AutoEncoder直接用于异常检测,而不是建立分类器的方式。