1、 脱敏后的数据文件
最后一列Class,0为正常,1为欺诈
2、程序解读
2.1 读取文件
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.gridspec as gridspec
# Load the credit-card transactions from the CSV file.
data = pd.read_csv('../dataset/creditcard.csv')
# .iloc selects by integer position (0-based), not by label.
# Positions 0..28 become the feature matrix, position 30 the label vector.
# NOTE(review): per the data.head() output, position 29 is the Amount column,
# so the 0:29 slice leaves Amount out of x_train — confirm this is intended.
feature_frame = data.iloc[:, 0:29]
label_column = data.iloc[:, 30]
x_train = np.array(feature_frame)
y_train = np.array(label_column)
2.2 查看前5行记录
# DataFrame.head(n) returns the first n rows; show the first five records.
print(data.head(n=5))
Time V1 V2 V3 V4 V5 V6 V7 \
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941
V8 V9 V10 V11 V12 V13 V14 \
0 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169
1 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772
2 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946
3 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924
4 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670
V15 V16 V17 V18 V19 V20 V21 \
0 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307
1 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775
2 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998
3 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300
4 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431
V22 V23 V24 V25 V26 V27 V28 \
0 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053
1 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724
2 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752
3 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458
4 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153
Amount Class
0 149.62 0
1 2.69 0
2 378.66 0
3 123.50 0
4 69.99 0
2.3 统计描述
# Descriptive statistics (count, mean, std, min, quartiles, max) per column;
# NaN values are excluded from the computation.
summary_stats = data.describe()
print(summary_stats)
Time V1 V2 V3 V4 \
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 94813.859575 1.759072e-12 -8.251146e-13 -9.655448e-13 8.321385e-13
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01
V5 V6 V7 V8 V9 \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 1.649983e-13 4.248434e-13 -3.054696e-13 8.777981e-14 -1.179757e-12
std 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00
min -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01
25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01
50% -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02
75% 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01
max 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01
V10 V11 V12 V13 V14 \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 7.092627e-13 1.874974e-12 1.053347e-12 7.127607e-13 -1.474787e-13
std 1.088850e+00 1.020713e+00 9.992014e-01 9.952742e-01 9.585956e-01
min -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01
25% -5.354257e-01 -7.624942e-01 -4.055715e-01 -6.485393e-01 -4.255740e-01
50% -9.291738e-02 -3.275735e-02 1.400326e-01 -1.356806e-02 5.060132e-02
75% 4.539234e-01 7.395934e-01 6.182380e-01 6.625050e-01 4.931498e-01
max 2.374514e+01 1.201891e+01 7.848392e+00 7.126883e+00 1.052677e+01
V15 V16 V17 V18 V19 \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean -5.231430e-13 -2.282231e-13 -6.425412e-13 4.950748e-13 7.057401e-13
std 9.153160e-01 8.762529e-01 8.493371e-01 8.381762e-01 8.140405e-01
min -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00
25% -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01
50% 4.807155e-02 6.641332e-02 -6.567575e-02 -3.636312e-03 3.734823e-03
75% 6.488208e-01 5.232963e-01 3.996750e-01 5.008067e-01 4.589494e-01
max 8.877742e+00 1.731511e+01 9.253526e+00 5.041069e+00 5.591971e+00
V20 V21 V22 V23 V24 \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 1.766109e-12 -3.405785e-13 -5.723165e-13 -9.725860e-13 1.464148e-12
std 7.709250e-01 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01
min -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00
25% -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01
50% -6.248109e-02 -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02
75% 1.330408e-01 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01
max 3.942090e+01 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00
V25 V26 V27 V28 Amount \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000
mean -6.987110e-13 -5.617884e-13 3.332082e-12 -3.518875e-12 88.349619
std 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109
min -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000
25% -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000
50% 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000
75% 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000
max 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000
Class
count 284807.000000
mean 0.001727
std 0.041527
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000
2.4 空值统计
# Number of missing values in each column (isna is the pandas alias of isnull).
print(data.isna().sum())
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
2.5 维度统计描述
# Time dimension: summary statistics of Time, reported separately per class.
# data.Class == k selects the rows whose Class column equals k
# (1 = fraud, 0 = normal).
for label, class_value in (("Fraud", 1), ("Normal", 0)):
    print(label)
    print(data.Time[data.Class == class_value].describe())
    print()
Fraud
count 492.000000
mean 80746.806911
std 47835.365138
min 406.000000
25% 41241.500000
50% 75568.500000
75% 128483.000000
max 170348.000000
Name: Time, dtype: float64
Normal
count 284315.000000
mean 94838.202258
std 47484.015786
min 0.000000
25% 54230.000000
50% 84711.000000
75% 139333.000000
max 172792.000000
Name: Time, dtype: float64
2.6 可视化:Time 与 Amount 的分布对比
# Histograms of transaction time, one panel per class, stacked vertically
# with a shared x-axis so the two classes can be compared directly.
bins = 50
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 4))
for panel, title, class_value in ((ax1, 'Fraud', 1), (ax2, 'Normal', 0)):
    panel.hist(data.Time[data.Class == class_value], bins=bins)
    panel.set_title(title)
plt.xlabel('Time (in Seconds)')
plt.ylabel('Number of Transactions')
plt.show()
图
代码
# Amount dimension: summary statistics of Amount, reported per class.
# A blank line separates the two reports; none is printed after the last one.
amount_reports = (("Fraud", 1), ("Normal", 0))
for position, (label, class_value) in enumerate(amount_reports):
    if position:
        print()
    print(label)
    print(data.Amount[data.Class == class_value].describe())
Fraud
count 492.000000
mean 122.211321
std 256.683288
min 0.000000
25% 1.000000
50% 9.250000
75% 105.890000
max 2125.870000
Name: Amount, dtype: float64
Normal
count 284315.000000
mean 88.291022
std 250.105092
min 0.000000
25% 5.650000
50% 22.000000
75% 77.050000
max 25691.160000
Name: Amount, dtype: float64
# Histograms of transaction amount, one panel per class, sharing the x-axis.
bins = 30
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 4))
for panel, title, class_value in ((ax1, 'Fraud', 1), (ax2, 'Normal', 0)):
    panel.hist(data.Amount[data.Class == class_value], bins=bins)
    panel.set_title(title)
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
# NOTE(review): the pyplot-level calls act on the current axes only —
# presumably the last subplot created; confirm the log scale is not meant
# for both panels.
plt.yscale('log')
plt.show()
# Flag transactions whose amount is NOT at or below 2125.87 (the maximum
# fraudulent amount shown in the Amount statistics): 1 = above, 0 otherwise.
at_or_below_fraud_max = data.Amount <= 2125.87
data['Amount_max_fraud'] = (~at_or_below_fraud_max).astype('int64')
# Scatter plots of Amount against Time, one panel per class, shared x-axis.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 6))
for panel, title, class_value in ((ax1, 'Fraud', 1), (ax2, 'Normal', 0)):
    in_class = data.Class == class_value
    panel.scatter(data.Time[in_class], data.Amount[in_class])
    panel.set_title(title)
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()
# Analyse the anonymized features.
# Select only the anonymized features: column positions 1..28, which are
# V1..V28 according to the data.head() output.
v_features = data.iloc[:, 1:29].columns
# One tall figure with 28 stacked panels, one per anonymized feature.
plt.figure(figsize=(12, 28 * 4))
gs = gridspec.GridSpec(28, 1)
for i, cn in enumerate(data[v_features]):
    ax = plt.subplot(gs[i])
    # Overlay the fraud and normal distributions of this feature so the
    # difference in shape between the two classes is visible.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider histplot/kdeplot when upgrading — confirm installed version.
    sns.distplot(data[cn][data.Class == 1], bins=50)
    sns.distplot(data[cn][data.Class == 0], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
# Render the figure once, after all panels have been drawn.
plt.show()
# Remove the features whose distributions look very similar for fraudulent
# and normal transactions.
similar_features = ['V28', 'V27', 'V26', 'V25', 'V24', 'V23', 'V22', 'V20', 'V15', 'V13', 'V8']
data = data.drop(columns=similar_features)
# Based on the plots above, build one 0/1 indicator column per remaining
# feature, marking the value range where fraudulent transactions are more
# common. Each rule is (source column, side of the cutoff, cutoff); the new
# column is named "<source>_" and is 1 when the value lies strictly on that
# side of the cutoff, else 0.
flag_rules = (
    ('V1', 'below', -3),
    ('V2', 'above', 2.5),
    ('V3', 'below', -4),
    ('V4', 'above', 2.5),
    ('V5', 'below', -4.5),
    ('V6', 'below', -2.5),
    ('V7', 'below', -3),
    ('V9', 'below', -2),
    ('V10', 'below', -2.5),
    ('V11', 'above', 2),
    ('V12', 'below', -2),
    ('V14', 'below', -2.5),
    ('V16', 'below', -2),
    ('V17', 'below', -2),
    ('V18', 'below', -2),
    ('V19', 'above', 1.5),
    ('V21', 'above', 0.6),
)
for source, side, cutoff in flag_rules:
    values = data[source]
    hits = values < cutoff if side == 'below' else values > cutoff
    data[source + '_'] = hits.astype('int64')
# Report summary statistics and the per-column sums after adding the
# indicator columns (for a 0/1 column, the sum is the number of flagged rows).
print('每个单一属性的欺诈记录与整车记录的差异统计:')
described = data.describe()
print(described)
column_totals = data.sum()
print(column_totals)
每个单一属性的欺诈记录与整车记录的差异统计:
Time V1 V2 V3 V4 \
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 94813.859575 1.759072e-12 -8.251146e-13 -9.655448e-13 8.321385e-13
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01
V5 V6 V7 V9 V10 \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 1.649983e-13 4.248434e-13 -3.054696e-13 -1.179757e-12 7.092627e-13
std 1.380247e+00 1.332271e+00 1.237094e+00 1.098632e+00 1.088850e+00
min -1.137433e+02 -2.616051e+01 -4.355724e+01 -1.343407e+01 -2.458826e+01
25% -6.915971e-01 -7.682956e-01 -5.540759e-01 -6.430976e-01 -5.354257e-01
50% -5.433583e-02 -2.741871e-01 4.010308e-02 -5.142873e-02 -9.291738e-02
75% 6.119264e-01 3.985649e-01 5.704361e-01 5.971390e-01 4.539234e-01
max 3.480167e+01 7.330163e+01 1.205895e+02 1.559499e+01 2.374514e+01
V11 V12 V14 V16 V17 \
count 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05
mean 1.874974e-12 1.053347e-12 -1.474787e-13 -2.282231e-13 -6.425412e-13
std 1.020713e+00 9.992014e-01 9.585956e-01 8.762529e-01 8.493371e-01
min -4.797473e+00 -1.868371e+01 -1.921433e+01 -1.412985e+01 -2.516280e+01
25% -7.624942e-01 -4.055715e-01 -4.255740e-01 -4.680368e-01 -4.837483e-01
50% -3.275735e-02 1.400326e-01 5.060132e-02 6.641332e-02 -6.567575e-02
75% 7.395934e-01 6.182380e-01 4.931498e-01 5.232963e-01 3.996750e-01
max 1.201891e+01 7.848392e+00 1.052677e+01 1.731511e+01 9.253526e+00
V18 V19 V21 Amount Class \
count 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000
mean 4.950748e-13 7.057401e-13 -3.405785e-13 88.349619 0.001727
std 8.381762e-01 8.140405e-01 7.345240e-01 250.120109 0.041527
min -9.498746e+00 -7.213527e+00 -3.483038e+01 0.000000 0.000000
25% -4.988498e-01 -4.562989e-01 -2.283949e-01 5.600000 0.000000
50% -3.636312e-03 3.734823e-03 -2.945017e-02 22.000000 0.000000
75% 5.008067e-01 4.589494e-01 1.863772e-01 77.165000 0.000000
max 5.041069e+00 5.591971e+00 2.720284e+01 25691.160000 1.000000
Amount_max_fraud V1_ V2_ V3_ \
count 284807.000000 284807.000000 284807.000000 284807.000000
mean 0.002117 0.047042 0.024771 0.009838
std 0.045965 0.211730 0.155427 0.098699
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
V4_ V5_ V6_ V7_ \
count 284807.000000 284807.000000 284807.000000 284807.000000
mean 0.052794 0.004579 0.006274 0.010059
std 0.223622 0.067510 0.078963 0.099791
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
V9_ V10_ V11_ V12_ \
count 284807.000000 284807.000000 284807.000000 284807.000000
mean 0.031530 0.005049 0.018244 0.048408
std 0.174746 0.070877 0.133833 0.214628
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
V14_ V16_ V17_ V18_ \
count 284807.000000 284807.000000 284807.000000 284807.000000
mean 0.013697 0.021165 0.002173 0.013943
std 0.116230 0.143935 0.046569 0.117254
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000
V19_ V21_
count 284807.000000 284807.000000
mean 0.032952 0.041958
std 0.178512 0.200494
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.000000 0.000000
max 1.000000 1.000000
Time 2.700365e+10
V1 5.009022e-07
V2 -2.350312e-07
V3 -2.744665e-07
V4 2.368500e-07
V5 4.533991e-08
V6 1.209676e-07
V7 -8.687127e-08
V9 -3.359903e-07
V10 2.020664e-07
V11 5.340173e-07
V12 3.000407e-07
V14 -4.247506e-08
V16 -6.495627e-08
V17 -1.830887e-07
V18 1.412354e-07
V19 2.010940e-07
V21 -9.702072e-08
Amount 2.516259e+07
Class 4.920000e+02
Amount_max_fraud 6.030000e+02
V1_ 1.339800e+04
V2_ 7.055000e+03
V3_ 2.802000e+03
V4_ 1.503600e+04
V5_ 1.304000e+03
V6_ 1.787000e+03
V7_ 2.865000e+03
V9_ 8.980000e+03
V10_ 1.438000e+03
V11_ 5.196000e+03
V12_ 1.378700e+04
V14_ 3.901000e+03
V16_ 6.028000e+03
V17_ 6.190000e+02
V18_ 3.971000e+03
V19_ 9.385000e+03
V21_ 1.195000e+04
dtype: float64
# Add a 'Normal' column that is the complement of Class:
# 1 for normal (Class == 0) rows, 0 for fraudulent (Class == 1) rows.
is_normal = data.Class == 0
is_fraud = data.Class == 1
data.loc[is_normal, 'Normal'] = 1
data.loc[is_fraud, 'Normal'] = 0
# The original 'Class' column becomes 'Fraud'.
data = data.rename(columns={'Class': 'Fraud'})
# 492 fraudulent transactions, 284,315 normal transactions:
# 0.172% of transactions were fraud.
print('欺诈记录的占比:')
normal_counts = data.Normal.value_counts()
fraud_counts = data.Fraud.value_counts()
print(normal_counts)
print()
print(fraud_counts)
# Widen the display limit so head() prints every column.
pd.set_option("display.max_columns", 101)
print(data.head())
欺诈记录的占比:
1.0 284315
0.0 492
Name: Normal, dtype: int64
0 284315
1 492
Name: Fraud, dtype: int64
Time V1 V2 V3 V4 V5 V6 V7 \
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941
V9 V10 V11 V12 V14 V16 V17 \
0 0.363787 0.090794 -0.551600 -0.617801 -0.311169 -0.470401 0.207971
1 -0.255425 -0.166974 1.612727 1.065235 -0.143772 0.463917 -0.114805
2 -1.514654 0.207643 0.624501 0.066084 -0.165946 -2.890083 1.109969
3 -1.387024 -0.054952 -0.226487 0.178228 -0.287924 -1.059647 -0.684093
4 0.817739 0.753074 -0.822843 0.538196 -1.119670 -0.451449 -0.237033
V18 V19 V21 Amount Fraud Amount_max_fraud V1_ V2_ \
0 0.025791 0.403993 -0.018307 149.62 0 0 0 0
1 -0.183361 -0.145783 -0.225775 2.69 0 0 0 0
2 -0.121359 -2.261857 0.247998 378.66 0 0 0 0
3 1.965775 -1.232622 -0.108300 123.50 0 0 0 0
4 -0.038195 0.803487 -0.009431 69.99 0 0 0 0
V3_ V4_ V5_ V6_ V7_ V9_ V10_ V11_ V12_ V14_ V16_ V17_ V18_ \
0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 1 0 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0
V19_ V21_ Normal
0 0 0 1.0
1 0 0 1.0
2 0 0 1.0
3 0 0 1.0
4 0 0 1.0
# Separate frames holding only the fraudulent / only the normal transactions.
Fraud = data[data.Fraud == 1]
Normal = data[data.Normal == 1]
# Training set: a random 80% of the fraudulent rows plus a random 80% of the
# normal rows. The number of sampled fraud rows is kept for later weighting.
sampled_frauds = Fraud.sample(frac=0.8)
count_Frauds = len(sampled_frauds)
sampled_normals = Normal.sample(frac=0.8)
X_train = pd.concat([sampled_frauds, sampled_normals], axis=0)
# Test set: every transaction whose index is not in the training set.
in_training = data.index.isin(X_train.index)
X_test = data.loc[~in_training]
# Shuffle both frames so rows are presented in random order.
X_train = shuffle(X_train)
X_test = shuffle(X_test)
# Targets: the two columns Fraud and Normal, side by side.
y_train = pd.concat([X_train.Fraud, X_train.Normal], axis=1)
y_test = pd.concat([X_test.Fraud, X_test.Normal], axis=1)
# Remove the target columns from the feature frames.
target_columns = ['Fraud', 'Normal']
X_train = X_train.drop(columns=target_columns)
X_test = X_test.drop(columns=target_columns)
# Sanity check: features and targets of each split must have equal length.
print()
print('切割[学习、校验]处理后的记录数量:')
for split_part in (X_train, y_train, X_test, y_test):
    print(len(split_part))
切割[学习、校验]处理后的记录数量:
227846
227846
56961
56961
# Due to the imbalance in the data, ratio acts as a weighting factor for the
# model: the Fraud target column of both splits is multiplied by it.
# The original note describes the intent as
#     # of fraud * ratio = # of normal
# NOTE(review): ratio is computed from len(X_train), which holds both the
# sampled fraud and the sampled normal rows, so
# (# of fraud) * ratio equals the total number of training rows rather than
# the number of normal rows — confirm which is intended.
ratio = len(X_train)/count_Frauds
print()
print('数据的占比:', ratio)
y_train.Fraud *= ratio
y_test.Fraud *= ratio
print('训练数据的数量:\n', y_train.Fraud)
# Fixed: this line previously printed y_train.Fraud a second time under the
# test-data label; it now prints the weighted test targets.
print('测试数据的数量:\n', y_test.Fraud)
数据的占比: 578.2893401015228
训练数据的数量:
22023 0.0
185560 0.0
112703 0.0
165996 0.0
245243 0.0
238885 0.0
39966 0.0
112043 0.0
171013 0.0
255567 0.0
283619 0.0
203942 0.0
82908 0.0
245906 0.0
225464 0.0
13679 0.0
107609 0.0
140858 0.0
156028 0.0
158914 0.0
72341 0.0
208184 0.0
111027 0.0
217998 0.0
229747 0.0
281186 0.0
259994 0.0
112170 0.0
204651 0.0
184758 0.0
118430 0.0
15155 0.0
28982 0.0
193685 0.0
209645 0.0
201038 0.0
226108 0.0
219122 0.0
266437 0.0
45419 0.0
99879 0.0
167812 0.0
117954 0.0
20935 0.0
238062 0.0
13355 0.0
71356 0.0
54123 0.0
95958 0.0
280240 0.0
271372 0.0
259493 0.0
149400 0.0
231110 0.0
30784 0.0
186483 0.0
74528 0.0
187912 0.0
17719 0.0
42839 0.0
Name: Fraud, Length: 227846, dtype: float64
测试数据的数量:
22023 0.0
185560 0.0
112703 0.0
165996 0.0
245243 0.0
238885 0.0
39966 0.0
112043 0.0
171013 0.0
255567 0.0
283619 0.0
203942 0.0
82908 0.0
245906 0.0
225464 0.0
13679 0.0
107609 0.0
140858 0.0
156028 0.0
158914 0.0
72341 0.0
208184 0.0
111027 0.0
217998 0.0
229747 0.0
281186 0.0
259994 0.0
112170 0.0
204651 0.0
184758 0.0
118430 0.0
15155 0.0
28982 0.0
193685 0.0
209645 0.0
201038 0.0
226108 0.0
219122 0.0
266437 0.0
45419 0.0
99879 0.0
167812 0.0
117954 0.0
20935 0.0
238062 0.0
13355 0.0
71356 0.0
54123 0.0
95958 0.0
280240 0.0
271372 0.0
259493 0.0
149400 0.0
231110 0.0
30784 0.0
186483 0.0
74528 0.0
187912 0.0
17719 0.0
42839 0.0