import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from collections import Counter
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import learning_curve  # for plotting learning curves
from sklearn.model_selection import ShuffleSplit  # for configuring cross-validation splits
from imblearn.under_sampling import RandomUnderSampler
from time import time
import datetime
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import confusion_matrix
import sys
# Load the credit-card fraud dataset (columns: Time, V1..V28, Amount, Class;
# Class == 1 marks a fraudulent transaction).
data = pd.read_csv("./creditcard.csv")
data.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# Plot the class distribution to show the heavy imbalance between normal
# transactions (Class 0) and fraud (Class 1).
# Series.value_counts replaces the deprecated top-level pd.value_counts
# (removed in pandas 2.0); sort_index orders the bars by class label.
count_classes = data['Class'].value_counts(sort=True).sort_index(ascending=True)
count_classes.plot(kind='bar')
plt.title("Fraud and normal")
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
# Features are columns 1..29 (V1..V28 + Amount); Time is dropped and
# Class is the target label.
raw_features = data.iloc[:, 1:30]
y = data['Class']

# Standardise each feature to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(raw_features)
pd.DataFrame(X).head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.694242 | -0.044075 | 1.672773 | 0.973366 | -0.245117 | 0.347068 | 0.193679 | 0.082637 | 0.331128 | 0.083386 | ... | 0.326118 | -0.024923 | 0.382854 | -0.176911 | 0.110507 | 0.246585 | -0.392170 | 0.330892 | -0.063781 | 0.244964 |
1 | 0.608496 | 0.161176 | 0.109797 | 0.316523 | 0.043483 | -0.061820 | -0.063700 | 0.071253 | -0.232494 | -0.153350 | ... | -0.089611 | -0.307377 | -0.880077 | 0.162201 | -0.561131 | 0.320694 | 0.261069 | -0.022256 | 0.044608 | -0.342475 |
2 | -0.693500 | -0.811578 | 1.169468 | 0.268231 | -0.364572 | 1.351454 | 0.639776 | 0.207373 | -1.378675 | 0.190700 | ... | 0.680975 | 0.337632 | 1.063358 | 1.456320 | -1.138092 | -0.628537 | -0.288447 | -0.137137 | -0.181021 | 1.160686 |
3 | -0.493325 | -0.112169 | 1.182516 | -0.609727 | -0.007469 | 0.936150 | 0.192071 | 0.316018 | -1.262503 | -0.050468 | ... | -0.269855 | -0.147443 | 0.007267 | -0.304777 | -1.941027 | 1.241904 | -0.460217 | 0.155396 | 0.186189 | 0.140534 |
4 | -0.591330 | 0.531541 | 1.021412 | 0.284655 | -0.295015 | 0.071999 | 0.479302 | -0.226510 | 0.744326 | 0.691625 | ... | 0.529939 | -0.012839 | 1.100011 | -0.220123 | 0.233250 | -0.395202 | 1.041611 | 0.543620 | 0.651816 | -0.073403 |
5 rows × 29 columns
# Hold out 30% of the scaled data for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
X_train.shape,y_train.shape,X_test.shape,y_test.shape
((199364, 29), (199364,), (85443, 29), (85443,))
## Undersampling
# Balance the training set by randomly undersampling the majority class
# so both classes have as many samples as the minority (fraud) class.
rus = RandomUnderSampler(random_state=0)
# imbalanced-learn renamed fit_sample -> fit_resample (the old name was
# removed in version 0.8), so the modern method is used here.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)
Counter({0: 345, 1: 345})
# Cast the resampled training data and the untouched test data to the
# dtypes TensorFlow expects: float32 features, int64 integer labels.
X_train = np.array(X_resampled, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)
y_train = np.array(y_resampled, dtype=np.int64)
y_test = np.array(y_test, dtype=np.int64)
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Render a confusion matrix as a colour-mapped image.

    cm      : 2-D array of counts (rows = true labels, cols = predictions).
    classes : tick labels for both axes.
    title   : figure title.
    cmap    : matplotlib colormap for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=0)
    plt.yticks(ticks, classes)

    # Annotate every cell with its count; flip the text colour to white on
    # dark (above-half-maximum) cells so the numbers stay readable.
    threshold = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color="white" if cm[row, col] > threshold else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# NOTE(review): a __future__ import must be the first statement of a module;
# this only ran because the original was a separate notebook cell. It is a
# no-op on Python 3 and could be dropped when converting to a .py file.
from __future__ import absolute_import, division, print_function
from IPython.display import clear_output
# Wrap the resampled arrays back into DataFrames so tf.feature_column can
# address the features by column name.
x_train = pd.DataFrame(X_train)
x_test = pd.DataFrame(X_test)
type(x_train)
pandas.core.frame.DataFrame
x_train.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.661722 | 0.522969 | -1.678534 | -2.088352 | 1.215200 | -1.286211 | 1.833157 | -1.482226 | 2.061864 | 5.848073 | ... | 0.920462 | -0.572086 | 2.962909 | 0.788291 | 1.332814 | -0.584676 | -0.496160 | 2.524529 | -0.315584 | -0.313249 |
1 | -1.958276 | -1.184221 | -0.266491 | 0.309968 | 1.295294 | -0.800223 | -0.222711 | 0.185070 | 0.076339 | 0.637193 | ... | -1.531449 | -0.900136 | 0.393577 | 2.141047 | -1.136080 | 1.505126 | 2.656882 | 0.555429 | -2.676608 | -0.295257 |
2 | -0.067535 | 0.559817 | -0.240419 | -0.757746 | 0.783281 | -0.041964 | 0.670519 | 0.046802 | 0.024487 | 0.074268 | ... | 0.153998 | -0.454839 | -1.043606 | -0.170155 | -2.318232 | -0.569980 | 0.438470 | 0.864596 | 0.397261 | -0.349671 |
3 | -0.795785 | -0.257322 | 1.273879 | -1.202887 | -0.677656 | 0.254827 | -0.718847 | 0.788730 | -0.840405 | -0.179861 | ... | 0.412371 | 0.648935 | 1.401767 | -0.329352 | -0.498010 | 0.933679 | -0.168390 | 0.434578 | -0.061287 | -0.049375 |
4 | -0.159912 | 0.446601 | 0.303160 | -0.240456 | 0.433733 | -0.718019 | 0.826952 | -0.251612 | -0.250003 | -0.398699 | ... | -0.141851 | -0.233167 | -0.610940 | 0.022573 | -0.158219 | -1.484146 | 0.371343 | 0.050953 | 0.545120 | -0.245321 |
5 rows × 29 columns
data.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
# Feature names V1..V28 plus Amount — everything except Time and Class.
VVV = list(data.columns[1:30])
VVV
['V1',
'V2',
'V3',
'V4',
'V5',
'V6',
'V7',
'V8',
'V9',
'V10',
'V11',
'V12',
'V13',
'V14',
'V15',
'V16',
'V17',
'V18',
'V19',
'V20',
'V21',
'V22',
'V23',
'V24',
'V25',
'V26',
'V27',
'V28',
'Amount']
# Re-attach the original feature names to the scaled/resampled frames so
# the TF feature columns can find them by name.
feature_names = data.columns[1:30]
x_train.columns = feature_names
x_test.columns = feature_names
x_train.head()
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.661722 | 0.522969 | -1.678534 | -2.088352 | 1.215200 | -1.286211 | 1.833157 | -1.482226 | 2.061864 | 5.848073 | ... | 0.920462 | -0.572086 | 2.962909 | 0.788291 | 1.332814 | -0.584676 | -0.496160 | 2.524529 | -0.315584 | -0.313249 |
1 | -1.958276 | -1.184221 | -0.266491 | 0.309968 | 1.295294 | -0.800223 | -0.222711 | 0.185070 | 0.076339 | 0.637193 | ... | -1.531449 | -0.900136 | 0.393577 | 2.141047 | -1.136080 | 1.505126 | 2.656882 | 0.555429 | -2.676608 | -0.295257 |
2 | -0.067535 | 0.559817 | -0.240419 | -0.757746 | 0.783281 | -0.041964 | 0.670519 | 0.046802 | 0.024487 | 0.074268 | ... | 0.153998 | -0.454839 | -1.043606 | -0.170155 | -2.318232 | -0.569980 | 0.438470 | 0.864596 | 0.397261 | -0.349671 |
3 | -0.795785 | -0.257322 | 1.273879 | -1.202887 | -0.677656 | 0.254827 | -0.718847 | 0.788730 | -0.840405 | -0.179861 | ... | 0.412371 | 0.648935 | 1.401767 | -0.329352 | -0.498010 | 0.933679 | -0.168390 | 0.434578 | -0.061287 | -0.049375 |
4 | -0.159912 | 0.446601 | 0.303160 | -0.240456 | 0.433733 | -0.718019 | 0.826952 | -0.251612 | -0.250003 | -0.398699 | ... | -0.141851 | -0.233167 | -0.610940 | 0.022573 | -0.158219 | -1.484146 | 0.371343 | 0.050953 | 0.545120 | -0.245321 |
5 rows × 29 columns
NUMERIC_COLUMNS = x_train.columns.to_list()
For categorical (string) feature columns:
def one_hot_cat_column(feature_name, vocab):
    """Build an indicator (one-hot) feature column for a categorical feature."""
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(feature_name,
                                                                  vocab))

# This dataset has only numeric features, so there is nothing to one-hot
# encode. The loop below came from the TF estimator tutorial and referenced
# names that were never defined here (CATEGORICAL_COLUMNS, dftrain,
# feature_columns); defining them explicitly empty turns it into a safe no-op.
CATEGORICAL_COLUMNS = []
feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
    vocabulary = dftrain[feature_name].unique()
    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))
For numeric feature columns:
# One tf numeric feature column per input feature (V1..V28, Amount).
feature_columns = [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
]
Create the dataset / input pipeline:
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Return an estimator input_fn feeding (features, labels) as one full batch.

    X        : DataFrame whose column names match the feature columns.
    y        : array-like labels aligned with X.
    n_epochs : passes over the data; None repeats indefinitely (training).
    shuffle  : reshuffle each epoch (disable for evaluation/prediction).
    """
    def input_fn():
        # Size the shuffle buffer and the single batch from the dataset that
        # was actually passed in, not from the training-set global
        # NUM_EXAMPLES, so the same factory also produces a correct
        # full-batch pipeline for the (much larger) evaluation set.
        n_examples = len(y)
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(n_examples)
        # For training, cycle through the dataset as many times as needed
        # (n_epochs=None means repeat forever; max_steps stops training).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use batching: one batch = whole dataset.
        dataset = dataset.batch(n_examples)
        return dataset
    return input_fn
# Training and evaluation input functions.
train_input_fn = make_input_fn(x_train, y_train) # x_train is a DataFrame
eval_input_fn = make_input_fn(x_test, y_test, shuffle=False, n_epochs=1)
Linear classifier (logistic regression model)
# Baseline model: a linear classifier (logistic regression) over the
# numeric feature columns.
linear_est = tf.estimator.LinearClassifier(feature_columns)
# Train model.
linear_est.train(train_input_fn, max_steps=100)
# Evaluation.
result = linear_est.evaluate(eval_input_fn)
clear_output()
print(pd.Series(result))
accuracy 0.964339
accuracy_baseline 0.998280
auc 0.979284
auc_precision_recall 0.420575
average_loss 0.201447
label/mean 0.001720
loss 0.201443
precision 0.042875
prediction/mean 0.151229
recall 0.925170
global_step 100.000000
dtype: float64
Linear classifier: predicted-probability distribution
# Histogram of the linear model's predicted fraud probabilities on the test set.
pred_dicts = list(linear_est.predict(eval_input_fn))
probs = pd.Series([d['probabilities'][1] for d in pred_dicts])
probs.plot(kind='hist', bins=20, title='predicted probabilities')
plt.show()
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmplvozpqp9\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
# Extract hard class predictions (class_ids) from the linear model.
result_1 = linear_est.predict(eval_input_fn,predict_keys='class_ids')
# DataFrame of one 'class_ids' cell per test row; (== 1) gives booleans,
# +0 converts them to 0/1 ints, and squeeze drops the trailing singleton
# axis to leave a flat vector of predicted labels.
AAA = pd.DataFrame(result_1)
QQQ = (AAA == 1)
QQB = np.array(QQQ+0)
QQB = np.squeeze(QQB,-1)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmplvozpqp9\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Counter(QQB)
Counter({0: 82271, 1: 3172})
# Confusion matrix of the linear model on the full (imbalanced) test set.
cnf_matrix = confusion_matrix(y_test, QQB)
np.set_printoptions(precision=2)

# Unpack [[tn, fp], [fn, tp]] once instead of indexing cell by cell.
tn, fp, fn, tp = cnf_matrix.ravel()
print("Recall metric in the testing dataset: ", tp / (fn + tp))
print("Accuracy metric in the testing dataset: ",
      (tp + tn) / (tp + tn + fn + fp))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Linear estimator Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.9251700680272109
Accuracy metric in the testing dataset: 0.9643387989653921
Gradient boosted trees
# Gradient boosted trees model on the same feature columns.
# Since data fits into memory, use entire dataset per layer. It will be faster.
# Above one batch is defined as the entire dataset.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)
# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
est.train(train_input_fn, max_steps=100)
# Eval.
result = est.evaluate(eval_input_fn)
clear_output()
print(pd.Series(result))
accuracy 0.967897
accuracy_baseline 0.998280
auc 0.974405
auc_precision_recall 0.598076
average_loss 0.093176
label/mean 0.001720
loss 0.093208
precision 0.045518
prediction/mean 0.046975
recall 0.884354
global_step 100.000000
dtype: float64
# Hard class predictions (class_ids) from the boosted-trees model.
result_2 = est.predict(eval_input_fn,predict_keys='class_ids')
# Same conversion as for the linear model: boolean frame -> 0/1 ints ->
# flat label vector via squeeze of the trailing singleton axis.
AAA = pd.DataFrame(result_2)
QQQ = (AAA == 1)
QQB = np.array(QQQ+0)
QQB = np.squeeze(QQB,-1)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmpriy4cjlj\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Counter(QQB)
Counter({1: 2856, 0: 82587})
# Confusion matrix of the boosted-trees model on the full test set.
cnf_matrix = confusion_matrix(y_test, QQB)
np.set_printoptions(precision=2)

# Unpack [[tn, fp], [fn, tp]] once instead of indexing cell by cell.
tn, fp, fn, tp = cnf_matrix.ravel()
print("Recall metric in the testing dataset: ", tp / (fn + tp))
print("Accuracy metric in the testing dataset: ",
      (tp + tn) / (tp + tn + fn + fp))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='BoostedTreesClassifier estimator Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.8843537414965986
Accuracy metric in the testing dataset: 0.9678967264726192
[Externally hosted image failed to load (likely hotlink protection): output_39_1.png]
# Histogram of the boosted-trees model's predicted fraud probabilities.
pred_dicts = list(est.predict(eval_input_fn))
probs = pd.Series([d['probabilities'][1] for d in pred_dicts])
probs.plot(kind='hist', bins=20, title='predicted probabilities')
plt.show()
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmpriy4cjlj\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[Externally hosted image failed to load (likely hotlink protection): output_40_1.png]
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

# ROC curve of the boosted-trees probabilities against the true test labels.
false_pos_rate, true_pos_rate, _ = roc_curve(y_test, probs)
plt.plot(false_pos_rate, true_pos_rate)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,)
plt.ylim(0,);
[Externally hosted image failed to load (likely hotlink protection): output_41_0.png]