import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from collections import Counter
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import learning_curve  # for plotting learning curves
from sklearn.model_selection import ShuffleSplit  # for configuring cross-validation splits
from imblearn.under_sampling import RandomUnderSampler
from time import time
import datetime
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import confusion_matrix
import sys
# Load the credit-card fraud dataset (columns: Time, V1..V28, Amount, Class;
# Class == 1 marks a fraudulent transaction).
data = pd.read_csv("./creditcard.csv")
data.head()
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
# Plot the class distribution to show the heavy imbalance between normal
# transactions (Class 0) and fraud (Class 1).
# Series.value_counts replaces the deprecated top-level pd.value_counts
# (removed in pandas 2.0); sort_index orders the bars by class label.
count_classes = data['Class'].value_counts(sort=True).sort_index(ascending=True)
count_classes.plot(kind='bar')
plt.title("Fraud and normal")
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
# Features are columns 1..29 (V1..V28 + Amount); Time is dropped and
# Class is the target label.
raw_features = data.iloc[:, 1:30]
y = data['Class']

# Standardise each feature to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(raw_features)
pd.DataFrame(X).head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.694242 | -0.044075 | 1.672773 | 0.973366 | -0.245117 | 0.347068 | 0.193679 | 0.082637 | 0.331128 | 0.083386 | ... | 0.326118 | -0.024923 | 0.382854 | -0.176911 | 0.110507 | 0.246585 | -0.392170 | 0.330892 | -0.063781 | 0.244964 |
1 | 0.608496 | 0.161176 | 0.109797 | 0.316523 | 0.043483 | -0.061820 | -0.063700 | 0.071253 | -0.232494 | -0.153350 | ... | -0.089611 | -0.307377 | -0.880077 | 0.162201 | -0.561131 | 0.320694 | 0.261069 | -0.022256 | 0.044608 | -0.342475 |
2 | -0.693500 | -0.811578 | 1.169468 | 0.268231 | -0.364572 | 1.351454 | 0.639776 | 0.207373 | -1.378675 | 0.190700 | ... | 0.680975 | 0.337632 | 1.063358 | 1.456320 | -1.138092 | -0.628537 | -0.288447 | -0.137137 | -0.181021 | 1.160686 |
3 | -0.493325 | -0.112169 | 1.182516 | -0.609727 | -0.007469 | 0.936150 | 0.192071 | 0.316018 | -1.262503 | -0.050468 | ... | -0.269855 | -0.147443 | 0.007267 | -0.304777 | -1.941027 | 1.241904 | -0.460217 | 0.155396 | 0.186189 | 0.140534 |
4 | -0.591330 | 0.531541 | 1.021412 | 0.284655 | -0.295015 | 0.071999 | 0.479302 | -0.226510 | 0.744326 | 0.691625 | ... | 0.529939 | -0.012839 | 1.100011 | -0.220123 | 0.233250 | -0.395202 | 1.041611 | 0.543620 | 0.651816 | -0.073403 |
5 rows × 29 columns
# Hold out 30% of the scaled data for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
X_train.shape,y_train.shape,X_test.shape,y_test.shape
((199364, 29), (199364,), (85443, 29), (85443,))
## Undersampling
# Balance the training set by randomly undersampling the majority class
# so both classes have as many samples as the minority (fraud) class.
rus = RandomUnderSampler(random_state=0)
# imbalanced-learn renamed fit_sample -> fit_resample (the old name was
# removed in version 0.8), so the modern method is used here.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)
Counter({0: 345, 1: 345})
# Cast the resampled training data and the untouched test data to the
# dtypes TensorFlow expects: float32 features, int64 integer labels.
X_train = np.array(X_resampled, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)
y_train = np.array(y_resampled, dtype=np.int64)
y_test = np.array(y_test, dtype=np.int64)
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Render a confusion matrix as a colour-mapped image.

    cm      : 2-D array of counts (rows = true labels, cols = predictions).
    classes : tick labels for both axes.
    title   : figure title.
    cmap    : matplotlib colormap for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=0)
    plt.yticks(ticks, classes)

    # Annotate every cell with its count; flip the text colour to white on
    # dark (above-half-maximum) cells so the numbers stay readable.
    threshold = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color="white" if cm[row, col] > threshold else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# NOTE(review): a __future__ import must be the first statement of a module;
# this only ran because the original was a separate notebook cell. It is a
# no-op on Python 3 and could be dropped when converting to a .py file.
from __future__ import absolute_import, division, print_function
from IPython.display import clear_output
# Wrap the resampled arrays back into DataFrames so tf.feature_column can
# address the features by column name.
x_train = pd.DataFrame(X_train)
x_test = pd.DataFrame(X_test)
type(x_train)
pandas.core.frame.DataFrame
x_train.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.661722 | 0.522969 | -1.678534 | -2.088352 | 1.215200 | -1.286211 | 1.833157 | -1.482226 | 2.061864 | 5.848073 | ... | 0.920462 | -0.572086 | 2.962909 | 0.788291 | 1.332814 | -0.584676 | -0.496160 | 2.524529 | -0.315584 | -0.313249 |
1 | -1.958276 | -1.184221 | -0.266491 | 0.309968 | 1.295294 | -0.800223 | -0.222711 | 0.185070 | 0.076339 | 0.637193 | ... | -1.531449 | -0.900136 | 0.393577 | 2.141047 | -1.136080 | 1.505126 | 2.656882 | 0.555429 | -2.676608 | -0.295257 |
2 | -0.067535 | 0.559817 | -0.240419 | -0.757746 | 0.783281 | -0.041964 | 0.670519 | 0.046802 | 0.024487 | 0.074268 | ... | 0.153998 | -0.454839 | -1.043606 | -0.170155 | -2.318232 | -0.569980 | 0.438470 | 0.864596 | 0.397261 | -0.349671 |
3 | -0.795785 | -0.257322 | 1.273879 | -1.202887 | -0.677656 | 0.254827 | -0.718847 | 0.788730 | -0.840405 | -0.179861 | ... | 0.412371 | 0.648935 | 1.401767 | -0.329352 | -0.498010 | 0.933679 | -0.168390 | 0.434578 | -0.061287 | -0.049375 |
4 | -0.159912 | 0.446601 | 0.303160 | -0.240456 | 0.433733 | -0.718019 | 0.826952 | -0.251612 | -0.250003 | -0.398699 | ... | -0.141851 | -0.233167 | -0.610940 | 0.022573 | -0.158219 | -1.484146 | 0.371343 | 0.050953 | 0.545120 | -0.245321 |
5 rows × 29 columns
data.columns
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
# Feature names V1..V28 plus Amount — everything except Time and Class.
VVV = list(data.columns[1:30])
VVV
['V1',
'V2',
'V3',
'V4',
'V5',
'V6',
'V7',
'V8',
'V9',
'V10',
'V11',
'V12',
'V13',
'V14',
'V15',
'V16',
'V17',
'V18',
'V19',
'V20',
'V21',
'V22',
'V23',
'V24',
'V25',
'V26',
'V27',
'V28',
'Amount']
# Re-attach the original feature names to the scaled/resampled frames so
# the TF feature columns can find them by name.
feature_names = data.columns[1:30]
x_train.columns = feature_names
x_test.columns = feature_names
x_train.head()
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.661722 | 0.522969 | -1.678534 | -2.088352 | 1.215200 | -1.286211 | 1.833157 | -1.482226 | 2.061864 | 5.848073 | ... | 0.920462 | -0.572086 | 2.962909 | 0.788291 | 1.332814 | -0.584676 | -0.496160 | 2.524529 | -0.315584 | -0.313249 |
1 | -1.958276 | -1.184221 | -0.266491 | 0.309968 | 1.295294 | -0.800223 | -0.222711 | 0.185070 | 0.076339 | 0.637193 | ... | -1.531449 | -0.900136 | 0.393577 | 2.141047 | -1.136080 | 1.505126 | 2.656882 | 0.555429 | -2.676608 | -0.295257 |
2 | -0.067535 | 0.559817 | -0.240419 | -0.757746 | 0.783281 | -0.041964 | 0.670519 | 0.046802 | 0.024487 | 0.074268 | ... | 0.153998 | -0.454839 | -1.043606 | -0.170155 | -2.318232 | -0.569980 | 0.438470 | 0.864596 | 0.397261 | -0.349671 |
3 | -0.795785 | -0.257322 | 1.273879 | -1.202887 | -0.677656 | 0.254827 | -0.718847 | 0.788730 | -0.840405 | -0.179861 | ... | 0.412371 | 0.648935 | 1.401767 | -0.329352 | -0.498010 | 0.933679 | -0.168390 | 0.434578 | -0.061287 | -0.049375 |
4 | -0.159912 | 0.446601 | 0.303160 | -0.240456 | 0.433733 | -0.718019 | 0.826952 | -0.251612 | -0.250003 | -0.398699 | ... | -0.141851 | -0.233167 | -0.610940 | 0.022573 | -0.158219 | -1.484146 | 0.371343 | 0.050953 | 0.545120 | -0.245321 |
5 rows × 29 columns
NUMERIC_COLUMNS = x_train.columns.to_list()
For categorical (string) feature columns:
def one_hot_cat_column(feature_name, vocab):
    """Build an indicator (one-hot) feature column for a categorical feature."""
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(feature_name,
                                                                  vocab))

# This dataset has only numeric features, so there is nothing to one-hot
# encode. The loop below came from the TF estimator tutorial and referenced
# names that were never defined here (CATEGORICAL_COLUMNS, dftrain,
# feature_columns); defining them explicitly empty turns it into a safe no-op.
CATEGORICAL_COLUMNS = []
feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
    vocabulary = dftrain[feature_name].unique()
    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))
For numeric feature columns:
# One tf numeric feature column per input feature (V1..V28, Amount).
feature_columns = [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
]
Create the dataset / input pipeline:
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Return an estimator input_fn feeding (features, labels) as one full batch.

    X        : DataFrame whose column names match the feature columns.
    y        : array-like labels aligned with X.
    n_epochs : passes over the data; None repeats indefinitely (training).
    shuffle  : reshuffle each epoch (disable for evaluation/prediction).
    """
    def input_fn():
        # Size the shuffle buffer and the single batch from the dataset that
        # was actually passed in, not from the training-set global
        # NUM_EXAMPLES, so the same factory also produces a correct
        # full-batch pipeline for the (much larger) evaluation set.
        n_examples = len(y)
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(n_examples)
        # For training, cycle through the dataset as many times as needed
        # (n_epochs=None means repeat forever; max_steps stops training).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use batching: one batch = whole dataset.
        dataset = dataset.batch(n_examples)
        return dataset
    return input_fn
# Training and evaluation input functions.
train_input_fn = make_input_fn(x_train, y_train) # x_train is a DataFrame
eval_input_fn = make_input_fn(x_test, y_test, shuffle=False, n_epochs=1)
Linear classifier (logistic regression model)
# Baseline model: a linear classifier (logistic regression) over the
# numeric feature columns.
linear_est = tf.estimator.LinearClassifier(feature_columns)
# Train model.
linear_est.train(train_input_fn, max_steps=100)
# Evaluation.
result = linear_est.evaluate(eval_input_fn)
clear_output()
print(pd.Series(result))
accuracy 0.964339
accuracy_baseline 0.998280
auc 0.979284
auc_precision_recall 0.420575
average_loss 0.201447
label/mean 0.001720
loss 0.201443
precision 0.042875
prediction/mean 0.151229
recall 0.925170
global_step 100.000000
dtype: float64
Linear classifier: predicted-probability distribution
# Histogram of the linear model's predicted fraud probabilities on the test set.
pred_dicts = list(linear_est.predict(eval_input_fn))
probs = pd.Series([d['probabilities'][1] for d in pred_dicts])
probs.plot(kind='hist', bins=20, title='predicted probabilities')
plt.show()
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmplvozpqp9\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
# Extract hard class predictions (class_ids) from the linear model.
result_1 = linear_est.predict(eval_input_fn,predict_keys='class_ids')
# DataFrame of one 'class_ids' cell per test row; (== 1) gives booleans,
# +0 converts them to 0/1 ints, and squeeze drops the trailing singleton
# axis to leave a flat vector of predicted labels.
AAA = pd.DataFrame(result_1)
QQQ = (AAA == 1)
QQB = np.array(QQQ+0)
QQB = np.squeeze(QQB,-1)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmplvozpqp9\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Counter(QQB)
Counter({0: 82271, 1: 3172})
# Confusion matrix of the linear model on the full (imbalanced) test set.
cnf_matrix = confusion_matrix(y_test, QQB)
np.set_printoptions(precision=2)

# Unpack [[tn, fp], [fn, tp]] once instead of indexing cell by cell.
tn, fp, fn, tp = cnf_matrix.ravel()
print("Recall metric in the testing dataset: ", tp / (fn + tp))
print("Accuracy metric in the testing dataset: ",
      (tp + tn) / (tp + tn + fn + fp))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='Linear estimator Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.9251700680272109
Accuracy metric in the testing dataset: 0.9643387989653921
Gradient boosted trees
# Gradient boosted trees model on the same feature columns.
# Since data fits into memory, use entire dataset per layer. It will be faster.
# Above one batch is defined as the entire dataset.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)
# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
est.train(train_input_fn, max_steps=100)
# Eval.
result = est.evaluate(eval_input_fn)
clear_output()
print(pd.Series(result))
accuracy 0.967897
accuracy_baseline 0.998280
auc 0.974405
auc_precision_recall 0.598076
average_loss 0.093176
label/mean 0.001720
loss 0.093208
precision 0.045518
prediction/mean 0.046975
recall 0.884354
global_step 100.000000
dtype: float64
# Hard class predictions (class_ids) from the boosted-trees model.
result_2 = est.predict(eval_input_fn,predict_keys='class_ids')
# Same conversion as for the linear model: boolean frame -> 0/1 ints ->
# flat label vector via squeeze of the trailing singleton axis.
AAA = pd.DataFrame(result_2)
QQQ = (AAA == 1)
QQB = np.array(QQQ+0)
QQB = np.squeeze(QQB,-1)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmpriy4cjlj\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Counter(QQB)
Counter({1: 2856, 0: 82587})
# Confusion matrix of the boosted-trees model on the full test set.
cnf_matrix = confusion_matrix(y_test, QQB)
np.set_printoptions(precision=2)

# Unpack [[tn, fp], [fn, tp]] once instead of indexing cell by cell.
tn, fp, fn, tp = cnf_matrix.ravel()
print("Recall metric in the testing dataset: ", tp / (fn + tp))
print("Accuracy metric in the testing dataset: ",
      (tp + tn) / (tp + tn + fn + fp))

class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=class_names,
                      title='BoostedTreesClassifier estimator Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.8843537414965986
Accuracy metric in the testing dataset: 0.9678967264726192
[Externally hosted image failed to load (likely hotlink protection): output_39_1.png]
# Histogram of the boosted-trees model's predicted fraud probabilities.
pred_dicts = list(est.predict(eval_input_fn))
probs = pd.Series([d['probabilities'][1] for d in pred_dicts])
probs.plot(kind='hist', bins=20, title='predicted probabilities')
plt.show()
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\ADMINI~1\AppData\Local\Temp\tmpriy4cjlj\model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[Externally hosted image failed to load (likely hotlink protection): output_40_1.png]
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

# ROC curve of the boosted-trees probabilities against the true test labels.
false_pos_rate, true_pos_rate, _ = roc_curve(y_test, probs)
plt.plot(false_pos_rate, true_pos_rate)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,)
plt.ylim(0,);
[Externally hosted image failed to load (likely hotlink protection): output_41_0.png]