所需环境库及版本
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import sklearn
print('pandas:',pd.__version__)
print('matplotlib:',matplotlib.__version__)
print('numpy:',np.__version__)
print('sklearn:',sklearn.__version__)
pandas: 0.23.4
matplotlib: 2.2.3
numpy: 1.16.4
sklearn: 0.22.2.post1
读取数据并显示数据各列信息
# Load the credit-card transaction dataset and print a per-column summary
# (dtype and non-null count) to check for missing values.
csv_path = 'creditcard.csv'
data = pd.read_csv(csv_path)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time 284807 non-null float64
V1 284807 non-null float64
V2 284807 non-null float64
V3 284807 non-null float64
V4 284807 non-null float64
V5 284807 non-null float64
V6 284807 non-null float64
V7 284807 non-null float64
V8 284807 non-null float64
V9 284807 non-null float64
V10 284807 non-null float64
V11 284807 non-null float64
V12 284807 non-null float64
V13 284807 non-null float64
V14 284807 non-null float64
V15 284807 non-null float64
V16 284807 non-null float64
V17 284807 non-null float64
V18 284807 non-null float64
V19 284807 non-null float64
V20 284807 non-null float64
V21 284807 non-null float64
V22 284807 non-null float64
V23 284807 non-null float64
V24 284807 non-null float64
V25 284807 non-null float64
V26 284807 non-null float64
V27 284807 non-null float64
V28 284807 non-null float64
Amount 284807 non-null float64
Class 284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
由上述信息可以看出,各列均不存在缺失值,而且各列皆为数值类型,不需要进行离散化。
后续再检查一下各特征的数据分布:
data.describe()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.000000 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | ... | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 284807.000000 | 284807.000000 |
mean | 94813.859575 | 3.919560e-15 | 5.688174e-16 | -8.769071e-15 | 2.782312e-15 | -1.552563e-15 | 2.010663e-15 | -1.694249e-15 | -1.927028e-16 | -3.137024e-15 | ... | 1.537294e-16 | 7.959909e-16 | 5.367590e-16 | 4.458112e-15 | 1.453003e-15 | 1.699104e-15 | -3.660161e-16 | -1.206049e-16 | 88.349619 | 0.001727 |
std | 47488.145955 | 1.958696e+00 | 1.651309e+00 | 1.516255e+00 | 1.415869e+00 | 1.380247e+00 | 1.332271e+00 | 1.237094e+00 | 1.194353e+00 | 1.098632e+00 | ... | 7.345240e-01 | 7.257016e-01 | 6.244603e-01 | 6.056471e-01 | 5.212781e-01 | 4.822270e-01 | 4.036325e-01 | 3.300833e-01 | 250.120109 | 0.041527 |
min | 0.000000 | -5.640751e+01 | -7.271573e+01 | -4.832559e+01 | -5.683171e+00 | -1.137433e+02 | -2.616051e+01 | -4.355724e+01 | -7.321672e+01 | -1.343407e+01 | ... | -3.483038e+01 | -1.093314e+01 | -4.480774e+01 | -2.836627e+00 | -1.029540e+01 | -2.604551e+00 | -2.256568e+01 | -1.543008e+01 | 0.000000 | 0.000000 |
25% | 54201.500000 | -9.203734e-01 | -5.985499e-01 | -8.903648e-01 | -8.486401e-01 | -6.915971e-01 | -7.682956e-01 | -5.540759e-01 | -2.086297e-01 | -6.430976e-01 | ... | -2.283949e-01 | -5.423504e-01 | -1.618463e-01 | -3.545861e-01 | -3.171451e-01 | -3.269839e-01 | -7.083953e-02 | -5.295979e-02 | 5.600000 | 0.000000 |
50% | 84692.000000 | 1.810880e-02 | 6.548556e-02 | 1.798463e-01 | -1.984653e-02 | -5.433583e-02 | -2.741871e-01 | 4.010308e-02 | 2.235804e-02 | -5.142873e-02 | ... | -2.945017e-02 | 6.781943e-03 | -1.119293e-02 | 4.097606e-02 | 1.659350e-02 | -5.213911e-02 | 1.342146e-03 | 1.124383e-02 | 22.000000 | 0.000000 |
75% | 139320.500000 | 1.315642e+00 | 8.037239e-01 | 1.027196e+00 | 7.433413e-01 | 6.119264e-01 | 3.985649e-01 | 5.704361e-01 | 3.273459e-01 | 5.971390e-01 | ... | 1.863772e-01 | 5.285536e-01 | 1.476421e-01 | 4.395266e-01 | 3.507156e-01 | 2.409522e-01 | 9.104512e-02 | 7.827995e-02 | 77.165000 | 0.000000 |
max | 172792.000000 | 2.454930e+00 | 2.205773e+01 | 9.382558e+00 | 1.687534e+01 | 3.480167e+01 | 7.330163e+01 | 1.205895e+02 | 2.000721e+01 | 1.559499e+01 | ... | 2.720284e+01 | 1.050309e+01 | 2.252841e+01 | 4.584549e+00 | 7.519589e+00 | 3.517346e+00 | 3.161220e+01 | 3.384781e+01 | 25691.160000 | 1.000000 |
8 rows × 31 columns
从上表可以看出特征V1至V28的量纲比较统一;而Time特征属于连续递增数据,不适合作为训练特征,舍去该特征;而对于Amount特征是否需要进行标准化,通过后续训练以及测试准确率来判断。
# Count how many normal (0) and fraudulent (1) transactions the dataset holds.
n_normal = sum(data.Class == 0)
n_fraud = sum(data.Class == 1)
print(f'0:{n_normal:d}, 1:{n_fraud:d}')
0:284315, 1:492
而对于Class类别,只有0(正常),1(异常),可以明显看出标签是非常不均衡的。
设置训练集和测试集
由于原数据标签十分不均衡,为了测试评估的可靠性,需将测试集设为类别数量1:1,即正样本50个,负样本50个。
# Drop 'Time': it is a monotonically increasing, index-like column, not a
# predictive feature.
data_fixed = data.drop(['Time'], axis=1)
# Shuffle each class separately. random_state makes the split reproducible;
# without it every run produced a different train/test partition.
data_pos = data_fixed[data_fixed['Class'].values == 0].sample(frac=1, random_state=0).reset_index(drop=True)
data_neg = data_fixed[data_fixed['Class'].values == 1].sample(frac=1, random_state=0).reset_index(drop=True)
# First 50 rows of each class form a balanced 50/50 test set;
# all remaining rows become the (still imbalanced) training set.
data_train = pd.concat([data_neg.iloc[50:, :], data_pos.iloc[50:, :]]).sample(frac=1, random_state=0).reset_index(drop=True)
data_test = pd.concat([data_neg.iloc[:50, :], data_pos.iloc[:50, :]]).sample(frac=1, random_state=0).reset_index(drop=True)
# index=False: the default would persist the RangeIndex and re-reading the
# CSVs would gain a spurious 'Unnamed: 0' column.
data_train.to_csv('creditcard_train.csv', index=False)
data_test.to_csv('creditcard_test.csv', index=False)
# Last column is the 'Class' label; everything before it is the feature matrix.
X_train, y_train = data_train.iloc[:, :-1], data_train.iloc[:, -1]
X_test, y_test = data_test.iloc[:, :-1], data_test.iloc[:, -1]
print('0:{:d}, 1:{:d}'.format(sum(y_test == 0), sum(y_test == 1)))
0:50, 1:50
训练阶段
首先定义一个训练并可以计算测试集准确率的函数
from sklearn.metrics import confusion_matrix
def model_train(model, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Fit *model* and evaluate it on a held-out test set.

    The data arguments default to the module-level X_train/y_train/X_test/
    y_test, so existing ``model_train(model)`` calls keep working; passing
    explicit frames lets the helper be reused with other splits.

    Returns:
        (accuracy, matrix): test-set accuracy (float in [0, 1]) and the
        sklearn confusion matrix of true vs. predicted labels.
    """
    X_tr = X_train if X_tr is None else X_tr
    y_tr = y_train if y_tr is None else y_tr
    X_te = X_test if X_te is None else X_te
    y_te = y_test if y_te is None else y_te
    fitted = model.fit(X_tr, y_tr)  # fit returns the estimator itself
    y_predict = fitted.predict(X_te)
    matrix = confusion_matrix(y_te, y_predict)
    accuracy = sum(y_predict == y_te) / len(y_te)
    return accuracy, matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# Candidate classifiers. All of them train quickly on ~285k rows and,
# where supported, run multi-threaded (n_jobs=8).
NBM = [KNeighborsClassifier(n_neighbors=6, n_jobs=8),
       GaussianNB(),
       DecisionTreeClassifier(max_depth=5, min_samples_split=5),
       RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=8),
       RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=8, class_weight='balanced'),
       xgb.XGBClassifier(tree_method="hist", n_estimators=100, n_jobs=8)]
NAME = ["KNN", "GNB", "DCT", "RF", "RF_Balanced", "XGBT"]

# Fit each model and report its test accuracy with the confusion matrix.
for label, clf in zip(NAME, NBM):
    accuracy, cmat = model_train(clf)
    print(f'{label} {accuracy * 100}%\n', cmat)
KNN 83.0%
[[50 0]
[17 33]]
GNB 91.0%
[[49 1]
[ 8 42]]
DCT 91.0%
[[50 0]
[ 9 41]]
RF 91.0%
[[50 0]
[ 9 41]]
RF_Balanced 91.0%
[[50 0]
[ 9 41]]
XGBT 92.0%
[[50 0]
[ 8 42]]
由于数据量较大,所以选取了训练速度很快,以及可以多线程进行的模型。通过结果可以看出整体表现比较平均,所有模型都出现了同一个问题,正样本训练的结果非常好,预测的错误都出现在了负样本。这也是由于样本标签不均衡的结果,由于是交易欺诈的预测,实际情况上负样本确实占少部分,由于该项目的数据集非常标准,根据生活经验也可以发现欺诈交易中与普通交易存在比较大的差距(也就是说在样本空间上,正样本和负样本距离会比较大),所以得到的预测效果比较好。