TensorFlow Estimator（2）：预定义 estimator（baseline_estimator、linear_estimator、dnn_estimator）的使用

最新推荐文章于 2022-07-23 19:00:14 发布
一定要学好JAVA
最新推荐文章于 2022-07-23 19:00:14 发布
阅读量389
点赞数
分类专栏： Tensorflow2.0
本文链接：https://blog.csdn.net/qq_45391763/article/details/103589373
版权
Tensorflow2.0 专栏收录该内容
19 篇文章 1 订阅
订阅专栏
"""
@author: khoing
@contact: Khoing@126.com
@time: 2019/12/17 16:58
@file: tf_premade_estimators.py
"""


import matplotlib as mpl  # Matplotlib 是 Python 的绘图库。 它可与 NumPy 一起使用

import matplotlib.pyplot as plt  # Python数据可视化matplotlib.pyplot

# %matplotlib inline #在使用jupyter notebook 或者 jupyter qtconsole的时候，经常会用到%matplotlib inline。其作用就是在你调用plot()进行画图或者直接输入Figure的实例对象的时候，会自动的显示并把figure嵌入到console中。

import numpy as np  # 数值计算扩展。这种工具可用来存储和处理大型矩阵

import sklearn  # 机器学习中常用的第三方模块，对常用的机器学习方法进行了封装，包括回归(Regression)、降维(Dimensionality Reduction)、分类(Classfication)、聚类(Clustering)等方法。

import pandas as pd  # 是python的一个数据分析包
import os  # 系统编程的操作模块，可以处理文件和目录
import sys  # sys模块包含了与Python解释器和它的环境有关的函数
import time
import tensorflow as tf

from tensorflow import keras

##################################################################################################
# Select the GPU: make only CUDA device 0 visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

##################################################################################################

# Print the TensorFlow and interpreter versions, then name/version of every
# library this script relies on.
print(tf.__version__)
print(sys.version_info)
_reported_modules = (mpl, np, pd, sklearn, tf, keras)
for module in _reported_modules:
    print(module.__name__, module.__version__)

"""output:
    2.0.0
    sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
    matplotlib 3.1.1
    numpy 1.16.5
    pandas 0.25.3
    sklearn 0.21.3
    tensorflow 2.0.0
    tensorflow_core.keras 2.2.4-tf
"""

##################################################################################################
# Locations of the two CSV splits.
train_file = "./data/train.csv"  # training set
eval_file = "./data/eval.csv"  # evaluation set

# Read both splits with pandas' CSV reader (training split first).
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of each frame.
for _frame in (train_df, eval_df):
    print(_frame.head())

"""output:
       survived     sex   age  ...     deck  embark_town  alone
    0         0    male  22.0  ...  unknown  Southampton      n
    1         1  female  38.0  ...        C    Cherbourg      n
    2         1  female  26.0  ...  unknown  Southampton      y
    3         1  female  35.0  ...        C  Southampton      n
    4         0    male  28.0  ...  unknown   Queenstown      y
    
    [5 rows x 10 columns]
    
    
       survived     sex   age  ...     deck  embark_town  alone
    0         0    male  35.0  ...  unknown  Southampton      y
    1         0    male  54.0  ...        E  Southampton      y
    2         1  female  58.0  ...        C  Southampton      y
    3         1  female  55.0  ...  unknown  Southampton      y
    4         1    male  34.0  ...        D  Southampton      y
    
    [5 rows x 10 columns]

"""


# Take the 'survived' column out of each frame and use it as the label.
# pop() removes the column in place, so the frames keep only the features.
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')

# Preview the feature frames now that the label column is gone.
for _features in (train_df, eval_df):
    print(_features.head())

"""output:
          sex   age  n_siblings_spouses  parch  ...  class     deck  embark_town alone
    0    male  22.0                   1      0  ...  Third  unknown  Southampton     n
    1  female  38.0                   1      0  ...  First        C    Cherbourg     n
    2  female  26.0                   0      0  ...  Third  unknown  Southampton     y
    3  female  35.0                   1      0  ...  First        C  Southampton     n
    4    male  28.0                   0      0  ...  Third  unknown   Queenstown     y
    
    [5 rows x 9 columns]
          sex   age  n_siblings_spouses  parch  ...   class     deck  embark_town alone
    0    male  35.0                   0      0  ...   Third  unknown  Southampton     y
    1    male  54.0                   0      0  ...   First        E  Southampton     y
    2  female  58.0                   0      0  ...   First        C  Southampton     y
    3  female  55.0                   0      0  ...  Second  unknown  Southampton     y
    4    male  34.0                   0      0  ...  Second        D  Southampton     y
    
    [5 rows x 9 columns]
"""
# Preview the first label values of the training and evaluation splits.
for _labels in (y_train, y_eval):
    print(_labels.head())
"""output:
    0    0
    1    1
    2    1
    3    1
    4    0
    Name: survived, dtype: int64
    0    0
    1    0
    2    1
    3    1
    4    1
    Name: survived, dtype: int64    
"""

##################################################################################################
# Categorical vs. numeric features.
# Categorical features are one-hot encoded (each value becomes a vector)
# before they are fed to the model.

# Names of the categorical feature columns.
categoryical_columns = [
    'sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone',
]
# Names of the numeric feature columns.
numerical_columns = ['age', 'fare']


def _one_hot_column(name, vocabulary):
    """Return an indicator (one-hot) column for feature `name` over `vocabulary`."""
    # Argument 1 is the feature key, argument 2 every value it may take, e.g.
    # VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'),
    #     dtype=tf.string, default_value=-1, num_oov_buckets=0)
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
    # Wrapping it yields the one-hot form, e.g.
    # IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', ...))
    return tf.feature_column.indicator_column(categorical)


feature_columns = []
for categoryical_column in categoryical_columns:
    # The vocabulary is the set of distinct values seen in the training frame.
    vocab = train_df[categoryical_column].unique()
    feature_columns.append(_one_hot_column(categoryical_column, vocab))

print(feature_columns)

# Numeric features are passed through as float32 numeric columns.
for numerical_column in numerical_columns:
    numeric = tf.feature_column.numeric_column(numerical_column, dtype=tf.float32)
    feature_columns.append(numeric)
print(feature_columns)

"""output
    [IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
    [IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
    
"""



#########################################################################################################################
# Crossed features: given age in [1, 2, 3] and gender in [male, female],
# the cross age & gender takes values [(1, male), (2, male), (3, male), ...].
#
# The ('age', 'sex') cross is hashed into 100 buckets (a compressed
# representation), then wrapped in an indicator column because, per the
# original note, the DNN needs indicator columns.
_age_sex_cross = tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100)
feature_columns.append(tf.feature_column.indicator_column(_age_sex_cross))



def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` so each
            column becomes one named feature (callers in this file pass a
            pandas DataFrame).
        label_df: Labels aligned row-by-row with ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements before
            repeating.
        batch_size: Number of examples per batch.

    Returns:
        The dataset after optional shuffling, repeating and batching.
    """
    features = dict(data_df)
    ds = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
        ds = ds.shuffle(10000)
    # Repeat first, then batch, exactly in this order.
    repeated = ds.repeat(epochs)
    return repeated.batch(batch_size)


##############################################################################################
# Baseline classifier: built without any feature columns, trained for 100
# epochs and then evaluated once over the evaluation split.
output_dir = 'baseline_model'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


def _baseline_train_input():
    """Training input: shuffled, 100 epochs, default batch size."""
    return make_dataset(train_df, y_train, epochs=100)


def _baseline_eval_input():
    """Evaluation input: one unshuffled pass in batches of 20."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20)


baseline_estimator = tf.estimator.BaselineClassifier(model_dir=output_dir, n_classes=2)
baseline_estimator.train(input_fn=_baseline_train_input)
baseline_estimator.evaluate(input_fn=_baseline_eval_input)
##############################################################################################
# Linear classifier over all feature columns: trained for 100 epochs and then
# evaluated once over the evaluation split.
linear_output_dir = 'linear_model'
if not os.path.exists(linear_output_dir):
    # Create the linear model's own directory. The original created
    # `output_dir` (the baseline directory) here, which already exists at
    # this point and therefore raised FileExistsError on a fresh run.
    os.makedirs(linear_output_dir)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
)
# Training input: shuffled, 100 epochs, default batch size.
linear_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

# Evaluation input: one unshuffled pass in batches of 20.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))

##############################################################################################
# DNN classifier over all feature columns: two hidden layers of 128 units with
# ReLU activations and the 'Adam' optimizer; trained for 100 epochs and then
# evaluated once over the evaluation split.
dnn_output_dir = 'dnn_model'
if not os.path.exists(dnn_output_dir):
    os.makedirs(dnn_output_dir)


def _dnn_train_input():
    """Training input: shuffled, 100 epochs, default batch size."""
    return make_dataset(train_df, y_train, epochs=100)


def _dnn_eval_input():
    """Evaluation input: one unshuffled pass in batches of 20."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20)


dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],
    activation_fn=tf.nn.relu,
    optimizer='Adam',
)
dnn_estimator.train(input_fn=_dnn_train_input)
dnn_estimator.evaluate(input_fn=_dnn_eval_input)