Linear Classifier and Gradient Boosted Trees (TensorFlow Estimators)

import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from collections import Counter
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import learning_curve  # for plotting learning curves
from sklearn.model_selection import ShuffleSplit  # for configuring the cross-validation scheme
from imblearn.under_sampling import RandomUnderSampler
from time import time
import datetime
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import confusion_matrix
import sys
data = pd.read_csv("./creditcard.csv")
data.head()
(Output: the first five rows of the raw DataFrame — 5 rows × 31 columns: Time, V1–V28, Amount, Class. All five rows shown have Class = 0.)

count_classes = data['Class'].value_counts(sort=True).sort_index(ascending=True)
count_classes.plot(kind='bar')
plt.title("Fraud and normal")
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')

[Figure: bar chart of class frequencies — normal (0) vs. fraud (1)]
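The bar chart makes the imbalance obvious; to quantify it, a quick check of the class proportions (a minimal sketch using the same Class column):

print(data['Class'].value_counts(normalize=True))
# roughly 0.17% of transactions are fraud (class 1), the rest are normal (class 0)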

X=data.iloc[:,1:30]
y=data['Class']
X = StandardScaler().fit_transform(X)
pd.DataFrame(X).head()
(Output: the first five rows of the standardized feature matrix — 5 rows × 29 columns.)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
X_train.shape,y_train.shape,X_test.shape,y_test.shape
((199364, 29), (199364,), (85443, 29), (85443,))
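One caveat: above, StandardScaler is fit on the full dataset before the train/test split, so test-set statistics leak into the scaler. The results below use that setup; a leakage-free variant (a sketch, not what is used in the rest of this post) would fit the scaler on the training split only:

X_raw = data.iloc[:, 1:30].values
X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(X_raw, y, test_size=0.3, random_state=0)
scaler = StandardScaler().fit(X_tr_raw)   # statistics come from the training split only
X_tr = scaler.transform(X_tr_raw)
X_te = scaler.transform(X_te_raw)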
## Undersampling
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)
Counter({0: 345, 1: 345})
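RandomUnderSampler keeps every minority-class (fraud) sample and randomly drops majority-class samples until the two classes are the same size, which is why both counts are 345. A rough manual equivalent with NumPy, for illustration only (the imblearn result above is what the rest of the post uses):

rng = np.random.RandomState(0)
fraud_idx = np.where(y_train == 1)[0]                                  # positions of minority samples
normal_idx = np.where(y_train == 0)[0]                                 # positions of majority samples
keep_normal = rng.choice(normal_idx, size=len(fraud_idx), replace=False)
balanced_idx = np.concatenate([fraud_idx, keep_normal])
X_bal, y_bal = X_train[balanced_idx], y_train.iloc[balanced_idx]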
X_train = np.array(X_resampled,dtype='float32')
X_test = np.array(X_test,dtype='float32')
y_train = np.array(y_resampled,dtype='int64')
y_test = np.array(y_test,dtype='int64')
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
from __future__ import absolute_import, division, print_function
from IPython.display import clear_output
x_train = pd.DataFrame(X_train)
x_test = pd.DataFrame(X_test)
type(x_train)
pandas.core.frame.DataFrame
x_train.head()
(Output: the first five rows of x_train — 5 rows × 29 columns, with integer column labels 0–28.)

data.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
VVV = data.columns[1:30].to_list()
VVV
['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12',
 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23',
 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
x_train.columns = data.columns[1:30]
x_test.columns = data.columns[1:30]
x_train.head()
(Output: the same five rows of x_train, now labeled with the V1–V28 / Amount column names — 5 rows × 29 columns.)

NUMERIC_COLUMNS = x_train.columns.to_list()

For string (categorical) features — CATEGORICAL_COLUMNS and dftrain are not defined in this notebook (all 29 features here are numeric), so this block is shown for reference only:

def one_hot_cat_column(feature_name, vocab):
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(feature_name,
                                                                   vocab))

for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
    vocabulary = dftrain[feature_name].unique()
    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))

For numeric features:

feature_columns = []
for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name,
                                                            dtype=tf.float32))
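A quick sanity check (optional): there should be one NumericColumn per feature.

print(len(feature_columns))    # expected: 29
print(feature_columns[0])      # NumericColumn(key='V1', dtype=tf.float32, ...)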

Creating the tf.data input pipeline

# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In memory training doesn't use batching.
        dataset = dataset.batch(NUM_EXAMPLES)
        return dataset
    return input_fn
# Training and evaluation input functions.
train_input_fn = make_input_fn(x_train, y_train)  # x_train is a DataFrame
eval_input_fn = make_input_fn(x_test, y_test, shuffle=False, n_epochs=1)
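To check that the pipeline produces what the estimator expects, one batch can be pulled from the training input function (a sketch; assumes eager execution, as in TF 2.x):

ds = make_input_fn(x_train, y_train, n_epochs=1)()
for feature_batch, label_batch in ds.take(1):
    print('feature keys:', list(feature_batch.keys())[:5])   # e.g. ['V1', 'V2', ...]
    print('label batch shape:', label_batch.shape)            # one batch = whole training set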

Linear classifier (logistic regression model)

linear_est = tf.estimator.LinearClassifier(feature_columns)
# Train model.
linear_est.train(train_input_fn, max_steps=100)
# Evaluation.
result = linear_est.evaluate(eval_input_fn)
clear_output()
print(pd.Series(result))
accuracy                  0.964339
accuracy_baseline         0.998280
auc                       0.979284
auc_precision_recall      0.420575
average_loss              0.201447
label/mean                0.001720
loss                      0.201443
precision                 0.042875
prediction/mean           0.151229
recall                    0.925170
global_step             100.000000
dtype: float64

Linear classifier: predicted probabilities

pred_dicts = list(linear_est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

probs.plot(kind='hist', bins=20, title='predicted probabilities')
plt.show()
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmplvozpqp9\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

[Figure: histogram of predicted fraud probabilities from the linear classifier]

result_1 = linear_est.predict(eval_input_fn, predict_keys='class_ids')
AAA = pd.DataFrame(result_1)       # one row per test example, column 'class_ids'
QQQ = (AAA == 1)                   # boolean: predicted class == 1 (fraud)
QQB = np.array(QQQ + 0)            # cast booleans to 0/1 integers
QQB = np.squeeze(QQB, -1)          # drop the trailing axis -> 1-D prediction vector
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmplvozpqp9\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Counter(QQB)
Counter({0: 82271, 1: 3172})
cnf_matrix = confusion_matrix(y_test,QQB)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
print("Accuracy metric in the testing dataset: ", 
      (cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[1,1]+cnf_matrix[0,0]+cnf_matrix[1,0]+cnf_matrix[0,1]))

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Linear estimator Confusion matrix')
plt.show()
Recall metric in the testing dataset:  0.9251700680272109
Accuracy metric in the testing dataset:  0.9643387989653921

[Figure: confusion matrix for the linear estimator]
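The same numbers can be cross-checked directly with scikit-learn, using the QQB predictions computed above (a sketch):

from sklearn.metrics import recall_score, accuracy_score, precision_score
print('recall:   ', recall_score(y_test, QQB))
print('accuracy: ', accuracy_score(y_test, QQB))
print('precision:', precision_score(y_test, QQB))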


Gradient boosted trees

# Since data fits into memory, use entire dataset per layer. It will be faster.
# Above one batch is defined as the entire dataset. 
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)

# The model will stop training once the specified number of trees is built, not 
# based on the number of steps.
est.train(train_input_fn, max_steps=100)

# Eval.
result = est.evaluate(eval_input_fn)
clear_output()
print(pd.Series(result))
accuracy                  0.967897
accuracy_baseline         0.998280
auc                       0.974405
auc_precision_recall      0.598076
average_loss              0.093176
label/mean                0.001720
loss                      0.093208
precision                 0.045518
prediction/mean           0.046975
recall                    0.884354
global_step             100.000000
dtype: float64
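Only the defaults were used above; BoostedTreesClassifier also exposes the usual tree hyperparameters if tuning is needed, for example (a sketch with arbitrary values):

est_tuned = tf.estimator.BoostedTreesClassifier(
    feature_columns,
    n_batches_per_layer=n_batches,
    n_trees=50,          # build at most 50 trees
    max_depth=4,         # shallower trees
    learning_rate=0.1)   # shrinkage applied to each tree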
result_2 = est.predict(eval_input_fn, predict_keys='class_ids')
AAA = pd.DataFrame(result_2)       # same post-processing as for the linear estimator
QQQ = (AAA == 1)
QQB = np.array(QQQ + 0)
QQB = np.squeeze(QQB, -1)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmpriy4cjlj\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Counter(QQB)
Counter({1: 2856, 0: 82587})
cnf_matrix = confusion_matrix(y_test,QQB)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
print("Accuracy metric in the testing dataset: ", 
      (cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[1,1]+cnf_matrix[0,0]+cnf_matrix[1,0]+cnf_matrix[0,1]))

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='BoostedTreesClassifier estimator Confusion matrix')
plt.show()
Recall metric in the testing dataset:  0.8843537414965986
Accuracy metric in the testing dataset:  0.9678967264726192

[Figure: confusion matrix for the BoostedTreesClassifier (output_39_1.png)]

pred_dicts = list(est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

probs.plot(kind='hist', bins=20, title='predicted probabilities')
plt.show()
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmpriy4cjlj\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

[Figure: histogram of predicted fraud probabilities from the boosted-trees model (output_40_1.png)]

from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

fpr, tpr, _ = roc_curve(y_test, probs)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,)
plt.ylim(0,);

[Figure: ROC curve for the boosted-trees model (output_41_0.png)]
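The area under this curve can be reduced to a single number with roc_auc_score (a sketch, using the probs series from the boosted-trees predictions above):

from sklearn.metrics import roc_auc_score
print('ROC AUC:', roc_auc_score(y_test, probs))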

