TensorFlow Estimator（一）：feature column 与 Estimator 的使用

最新推荐文章于 2022-01-31 14:31:51 发布
一定要学好JAVA
最新推荐文章于 2022-01-31 14:31:51 发布
阅读量339
点赞数
分类专栏： Tensorflow2.0
本文链接：https://blog.csdn.net/qq_45391763/article/details/103589360
版权
Tensorflow2.0 专栏收录该内容
19 篇文章 1 订阅
订阅专栏
本文深入探讨了TensorFlow Estimator框架及其与FeatureColumn的配合使用。通过实例解析，展示了如何利用Estimator进行模型训练和评估，同时详细介绍了FeatureColumn在特征工程中的重要作用，帮助读者掌握在TensorFlow中构建高效机器学习模型的关键技巧。
摘要由CSDN通过智能技术生成
"""
@author: khoing
@contact: Khoing@126.com
@time: 2019/12/16 16:58
@file: tf_keras_to_estimator.py
"""


import matplotlib as mpl  # Matplotlib 是 Python 的绘图库。 它可与 NumPy 一起使用

import matplotlib.pyplot as plt  # Python数据可视化matplotlib.pyplot

# %matplotlib inline #在使用jupyter notebook 或者 jupyter qtconsole的时候，经常会用到%matplotlib inline。其作用就是在你调用plot()进行画图或者直接输入Figure的实例对象的时候，会自动的显示并把figure嵌入到console中。

import numpy as np  # 数值计算扩展。这种工具可用来存储和处理大型矩阵

import sklearn  # 机器学习中常用的第三方模块，对常用的机器学习方法进行了封装，包括回归(Regression)、降维(Dimensionality Reduction)、分类(Classfication)、聚类(Clustering)等方法。

import pandas as pd  # 是python的一个数据分析包
import os  # 系统编程的操作模块，可以处理文件和目录
import sys  # sys模块包含了与Python解释器和它的环境有关的函数
import time
import tensorflow as tf

from tensorflow import keras

##################################################################################################
# Select the GPU: expose only CUDA device "0" to this process.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

##################################################################################################

# Report the TensorFlow version, the interpreter version, and then the
# name/version pair of every library this script relies on.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

"""output:
    2.0.0
    sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
    matplotlib 3.1.1
    numpy 1.16.5
    pandas 0.25.3
    sklearn 0.21.3
    tensorflow 2.0.0
    tensorflow_core.keras 2.2.4-tf
"""

##################################################################################################
train_file = "./data/train.csv"  # training set
eval_file = "./data/eval.csv"  # evaluation set

# pandas ships a CSV reader; load both splits into DataFrames.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))

# Preview the first rows of each split.
for frame in (train_df, eval_df):
    print(frame.head())

"""output:
       survived     sex   age  ...     deck  embark_town  alone
    0         0    male  22.0  ...  unknown  Southampton      n
    1         1  female  38.0  ...        C    Cherbourg      n
    2         1  female  26.0  ...  unknown  Southampton      y
    3         1  female  35.0  ...        C  Southampton      n
    4         0    male  28.0  ...  unknown   Queenstown      y
    
    [5 rows x 10 columns]
    
    
       survived     sex   age  ...     deck  embark_town  alone
    0         0    male  35.0  ...  unknown  Southampton      y
    1         0    male  54.0  ...        E  Southampton      y
    2         1  female  58.0  ...        C  Southampton      y
    3         1  female  55.0  ...  unknown  Southampton      y
    4         1    male  34.0  ...        D  Southampton      y
    
    [5 rows x 10 columns]

"""


# Detach the 'survived' column from each frame; it is the value to predict.
# DataFrame.pop removes the column in place, so the frames keep only features.
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')

for frame in (train_df, eval_df):
    print(frame.head())

"""output:
          sex   age  n_siblings_spouses  parch  ...  class     deck  embark_town alone
    0    male  22.0                   1      0  ...  Third  unknown  Southampton     n
    1  female  38.0                   1      0  ...  First        C    Cherbourg     n
    2  female  26.0                   0      0  ...  Third  unknown  Southampton     y
    3  female  35.0                   1      0  ...  First        C  Southampton     n
    4    male  28.0                   0      0  ...  Third  unknown   Queenstown     y
    
    [5 rows x 9 columns]
          sex   age  n_siblings_spouses  parch  ...   class     deck  embark_town alone
    0    male  35.0                   0      0  ...   Third  unknown  Southampton     y
    1    male  54.0                   0      0  ...   First        E  Southampton     y
    2  female  58.0                   0      0  ...   First        C  Southampton     y
    3  female  55.0                   0      0  ...  Second  unknown  Southampton     y
    4    male  34.0                   0      0  ...  Second        D  Southampton     y
    
    [5 rows x 9 columns]
"""
# Preview the first label values of each split.
for labels in (y_train, y_eval):
    print(labels.head())
"""output:
    0    0
    1    1
    2    1
    3    1
    4    0
    Name: survived, dtype: int64
    0    0
    1    0
    2    1
    3    1
    4    1
    Name: survived, dtype: int64    
"""

# pandas makes it easy to inspect summary statistics of the data.
summary = train_df.describe()
print(summary)
"""output:
                  age  n_siblings_spouses       parch        fare
    count  627.000000          627.000000  627.000000  627.000000
    mean    29.631308            0.545455    0.379585   34.385399
    std     12.511818            1.151090    0.792999   54.597730
    min      0.750000            0.000000    0.000000    0.000000
    25%     23.000000            0.000000    0.000000    7.895800   #第25百分位数：25%的样本不超过该值
    50%     28.000000            0.000000    0.000000   15.045800   #第50百分位数（中位数）：50%的样本不超过该值
    75%     35.000000            1.000000    0.000000   31.387500   #第75百分位数：75%的样本不超过该值
    max     80.000000            8.000000    5.000000  512.329200
"""
# (rows, columns) of the training and evaluation feature frames.
train_shape, eval_shape = train_df.shape, eval_df.shape
print(train_shape, eval_shape)
"""output:
    (627, 9) (264, 9)
"""

# Histogram of the 'age' column split into 20 bins (i.e. 20 bars).
# In IPython, `train_df.age.hist(bins=20)` alone is enough to display it.
plt.hist(train_df["age"], bins=20)
plt.show()

#plot函数用来绘制二维图像。

##################################################################################################
# Features come in two kinds: categorical (discrete) and numerical (continuous).
# For training, each categorical feature is one-hot encoded into a vector.

# Names of the categorical features.
categoryical_columns = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]
# Names of the numerical features.
numerical_columns = [
    'age',
    'fare',
]

feature_columns = []
for categoryical_column in categoryical_columns:
    # The vocabulary is the set of distinct values this column takes in the
    # training data (not a count of them).
    vocab = train_df[categoryical_column].unique()

    # categorical_column_with_vocabulary_list(key, vocabulary_list): the first
    # argument is the feature name, the second is every value it may take.
    # Printing the result for 'sex' gives:
    #   VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'),
    #       dtype=tf.string, default_value=-1, num_oov_buckets=0)
    # Vocabularies observed for each column on the training set:
    #   sex:                ('male', 'female')                                    tf.string
    #   n_siblings_spouses: (1, 0, 3, 4, 2, 5, 8)                                 tf.int64
    #   parch:              (0, 1, 2, 5, 3, 4)                                    tf.int64
    #   class:              ('Third', 'First', 'Second')                          tf.string
    #   deck:               ('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E')        tf.string
    #   embark_town:        ('Southampton', 'Cherbourg', 'Queenstown', 'unknown') tf.string
    #   alone:              ('n', 'y')                                            tf.string
    vocabulary_column = tf.feature_column.categorical_column_with_vocabulary_list(
        categoryical_column, vocab)

    # indicator_column wraps the categorical column for one-hot encoding.
    # Printing it gives, e.g.:
    #   IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', ...))
    one_hot_column = tf.feature_column.indicator_column(vocabulary_column)
    feature_columns.append(one_hot_column)

# At this point the list holds only the seven one-hot categorical columns.
print(feature_columns)

for numerical_column in numerical_columns:
    # Numerical features are used as-is, read as float32.
    numeric_feature = tf.feature_column.numeric_column(numerical_column, dtype=tf.float32)
    feature_columns.append(numeric_feature)
# Now the numeric 'age' and 'fare' columns follow the categorical ones.
print(feature_columns)

"""output:
    [IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
    [IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
    
"""

# Show the (feature dict, labels) pair in the same form that make_dataset
# passes to tf.data.Dataset.from_tensor_slices.
features_and_labels = (dict(train_df), y_train)
print(features_and_labels)

def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32,
                 shuffle_buffer_size=10000):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        data_df: Feature table. It is converted with ``dict(data_df)``, so each
            dataset element carries its features as a dict keyed by column name.
        label_df: Labels aligned row-for-row with ``data_df``.
        epochs: Number of times the data is repeated (``Dataset.repeat``).
        shuffle: Whether to shuffle the examples before repeating and batching.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Buffer size handed to ``Dataset.shuffle``. The
            default of 10000 keeps the behaviour callers relied on before this
            parameter existed; raise it for tables with more rows than that,
            since a buffer smaller than the data only shuffles locally.

    Returns:
        A ``tf.data.Dataset`` yielding ``(feature_dict, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    # Repeat before batching, so a batch may straddle an epoch boundary.
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

train_dataset = make_dataset(train_df, y_train, batch_size=5)

# Peek at one batch: a dict of per-column feature tensors, then the labels.
first_batch = train_dataset.take(1)
for features, labels in first_batch:
    print(features, labels, sep="\n")
"""output:
        {'sex': <tf.Tensor: id=38, shape=(5,), dtype=string, numpy=array([b'female', b'male', b'male', b'female', b'male'], dtype=object)>,
         'age': <tf.Tensor: id=30, shape=(5,), dtype=float64, numpy=array([28., 28., 28., 28., 33.])>, 
         'n_siblings_spouses': <tf.Tensor: id=36, shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 0])>, 
         'parch': <tf.Tensor: id=37, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0])>, 
         'fare': <tf.Tensor: id=35, shape=(5,), dtype=float64, numpy=array([146.5208,  26.55  ,  31.    ,  33.    ,  12.275 ])>, 
         'class': <tf.Tensor: id=32, shape=(5,), dtype=string, numpy=array([b'First', b'First', b'First', b'Second', b'Second'], dtype=object)>, 
         'deck': <tf.Tensor: id=33, shape=(5,), dtype=string, numpy=array([b'B', b'C', b'unknown', b'unknown', b'unknown'], dtype=object)>, 
         'embark_town': <tf.Tensor: id=34, shape=(5,), dtype=string, numpy=array([b'Cherbourg', b'Southampton', b'Southampton', b'Southampton',b'Southampton'], dtype=object)>,
        'alone': <tf.Tensor: id=31, shape=(5,), dtype=string, numpy=array([b'n', b'y', b'y', b'y', b'y'], dtype=object)>}
               
        tf.Tensor([1 0 0 1 0], shape=(5,), dtype=int32)
"""

# DenseFeatures applies feature-column definitions to a batch from the dataset.

# feature_columns was filled with one column per entry of categoryical_columns
# (in order) followed by one per entry of numerical_columns, so positions are
# derived from those lists instead of hard-coded indices.
age_column = feature_columns[len(categoryical_columns) + numerical_columns.index('age')]
gender_column = feature_columns[categoryical_columns.index('sex')]

for x, y in train_dataset.take(1):
    # Build and apply each layer once, then show both the tensor and its
    # NumPy value.
    age_dense = keras.layers.DenseFeatures(age_column)(x)
    gender_dense = keras.layers.DenseFeatures(gender_column)(x)
    print(age_dense)
    print(gender_dense)
    print(age_dense.numpy())
    print(gender_dense.numpy())
    print(y)


"""output:
    tf.Tensor(
    [[18.]
     [29.]
     [27.]
     [20.]
     [60.]], shape=(5, 1), dtype=float32)
     
     tf.Tensor(
    [[0. 1.]
     [1. 0.]
     [1. 0.]
     [1. 0.]
     [0. 1.]], shape=(5, 2), dtype=float32)
     
    [[18.]
     [29.]
     [27.]
     [20.]
     [60.]]
     
    [[0. 1.]
     [1. 0.]
     [1. 0.]
     [1. 0.]
     [0. 1.]]
     
     tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)

"""
# Apply every feature column at once: each row of the result is the dense
# vector built from all one-hot and numeric features of one example.
all_features_layer = keras.layers.DenseFeatures(feature_columns)
for features, _labels in train_dataset.take(1):
    dense_batch = all_features_layer(features)
    print(dense_batch.numpy())
"""output:

[[ 6.      1.      0.      0.      0.      1.      1.      0.      0.
   0.      0.      0.      0.      0.      1.      0.      0.      0.
  33.      0.      1.      0.      0.      0.      0.      0.      0.
   1.      0.      0.      0.      0.      0.      1.    ]
 [28.      0.      1.      1.      0.      0.      1.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      1.      0.
   7.7875  0.      1.      0.      0.      0.      0.      0.      1.
   0.      0.      0.      0.      0.      0.      1.    ]
 [28.      0.      1.      0.      0.      1.      1.      0.      0.
   0.      0.      0.      0.      0.      1.      0.      0.      0.
   0.      0.      1.      0.      0.      0.      0.      0.      1.
   0.      0.      0.      0.      0.      1.      0.    ]
 [45.      1.      0.      1.      0.      0.      1.      0.      0.
   0.      0.      0.      0.      0.      0.      1.      0.      0.
  14.4542  0.      1.      0.      0.      0.      0.      0.      0.
   1.      0.      0.      0.      0.      0.      1.    ]
 [28.      0.      1.      0.      0.      1.      1.      0.      0.
   0.      0.      0.      0.      0.      1.      0.      0.      0.
   0.      0.      1.      0.      0.      0.      0.      0.      1.
   0.      0.      0.      0.      0.      1.      0.    ]]
"""

########################################################################################
# Classifier over the feature columns: two hidden layers and a 2-way softmax.
model = keras.models.Sequential([
    # Converts the raw feature dict into one dense vector via the feature columns.
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    # One probability per class of the 0/1 'survived' label.
    keras.layers.Dense(2, activation='softmax'),
])

model.compile(
    # Labels are integer class ids rather than one-hot vectors, hence "sparse".
    loss='sparse_categorical_crossentropy',
    # Use `learning_rate`: `lr` is only a deprecated alias that newer Keras
    # releases no longer accept.
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    metrics=['accuracy'],
)

# There are two ways to train the model.
# Way 1: train directly with model.fit.

# Shared settings, named once so the datasets and the fit call cannot drift apart.
TRAIN_BATCH_SIZE = 32
TRAIN_EPOCHS = 100

train_dataset = make_dataset(train_df, y_train, epochs=TRAIN_EPOCHS, batch_size=TRAIN_BATCH_SIZE)
eval_dataset = make_dataset(eval_df, y_eval, epochs=TRAIN_EPOCHS, batch_size=TRAIN_BATCH_SIZE)

# The datasets repeat, so Keras must be told how many batches make up one epoch.
# The counts come from the frame lengths (627 and 264 rows for the data shown
# above), so they stay correct if the CSV files change.
history = model.fit(
    train_dataset,
    validation_data=eval_dataset,
    steps_per_epoch=len(train_df) // TRAIN_BATCH_SIZE,
    validation_steps=len(eval_df) // TRAIN_BATCH_SIZE,
    epochs=TRAIN_EPOCHS,
)

# Way 2: convert the Keras model into an Estimator, then train that.
estimator = keras.estimator.model_to_estimator(keras_model=model)


def _train_input_fn():
    """Return the training dataset for ``estimator.train``.

    Per the original author's note, ``input_fn`` must take no arguments and
    return either (1) a ``(features, labels)`` pair given as a list, tuple or
    NumPy array, or (2) a dataset whose elements are ``(features, labels)``.
    """
    return make_dataset(train_df, y_train, epochs=100)


estimator.train(input_fn=_train_input_fn)