"""
@author: khoing
@contact: Khoing@126.com
@time: 2019/12/16 16:58
@file: tf_keras_to_estimator.py
"""
import matplotlib as mpl # Matplotlib 是 Python 的绘图库。 它可与 NumPy 一起使用
import matplotlib.pyplot as plt # Python数据可视化matplotlib.pyplot
# %matplotlib inline #在使用jupyter notebook 或者 jupyter qtconsole的时候,经常会用到%matplotlib inline。其作用就是在你调用plot()进行画图或者直接输入Figure的实例对象的时候,会自动的显示并把figure嵌入到console中。
import numpy as np # 数值计算扩展。这种工具可用来存储和处理大型矩阵
import sklearn # 机器学习中常用的第三方模块,对常用的机器学习方法进行了封装,包括回归(Regression)、降维(Dimensionality Reduction)、分类(Classfication)、聚类(Clustering)等方法。
import pandas as pd # 是python的一个数据分析包
import os # 系统编程的操作模块,可以处理文件和目录
import sys # sys模块包含了与Python解释器和它的环境有关的函数
import time
import tensorflow as tf
from tensorflow import keras
##################################################################################################
# Select the GPU: make only CUDA device 0 visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
##################################################################################################
# Report the TensorFlow version, the interpreter version, and the version of
# every major library this script relies on.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
"""output:
2.0.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.5
pandas 0.25.3
sklearn 0.21.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
"""
##################################################################################################
# CSV locations of the two data splits.
train_file = "./data/train.csv"  # training set
eval_file = "./data/eval.csv"  # evaluation set
# pandas ships a CSV reader; load each split into its own DataFrame.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))
# Preview the first rows of both splits, one after the other.
print(train_df.head(), eval_df.head(), sep="\n")
"""output:
survived sex age ... deck embark_town alone
0 0 male 22.0 ... unknown Southampton n
1 1 female 38.0 ... C Cherbourg n
2 1 female 26.0 ... unknown Southampton y
3 1 female 35.0 ... C Southampton n
4 0 male 28.0 ... unknown Queenstown y
[5 rows x 10 columns]
survived sex age ... deck embark_town alone
0 0 male 35.0 ... unknown Southampton y
1 0 male 54.0 ... E Southampton y
2 1 female 58.0 ... C Southampton y
3 1 female 55.0 ... unknown Southampton y
4 1 male 34.0 ... D Southampton y
[5 rows x 10 columns]
"""
# Pull the 'survived' column out of each split to use as the prediction target.
# pop() removes the column from the DataFrame, leaving only the features behind.
y_train, y_eval = train_df.pop('survived'), eval_df.pop('survived')
# Preview the remaining feature columns of both splits.
print(train_df.head(), eval_df.head(), sep="\n")
"""output:
sex age n_siblings_spouses parch ... class deck embark_town alone
0 male 22.0 1 0 ... Third unknown Southampton n
1 female 38.0 1 0 ... First C Cherbourg n
2 female 26.0 0 0 ... Third unknown Southampton y
3 female 35.0 1 0 ... First C Southampton n
4 male 28.0 0 0 ... Third unknown Queenstown y
[5 rows x 9 columns]
sex age n_siblings_spouses parch ... class deck embark_town alone
0 male 35.0 0 0 ... Third unknown Southampton y
1 male 54.0 0 0 ... First E Southampton y
2 female 58.0 0 0 ... First C Southampton y
3 female 55.0 0 0 ... Second unknown Southampton y
4 male 34.0 0 0 ... Second D Southampton y
[5 rows x 9 columns]
"""
# Preview the first label values of the training and evaluation splits.
print(y_train.head(), y_eval.head(), sep="\n")
"""output:
0 0
1 1
2 1
3 1
4 0
Name: survived, dtype: int64
0 0
1 0
2 1
3 1
4 1
Name: survived, dtype: int64
"""
# pandas makes it easy to inspect summary statistics of the data.
train_summary = train_df.describe()
print(train_summary)
"""output:
age n_siblings_spouses parch fare
count 627.000000 627.000000 627.000000 627.000000
mean 29.631308 0.545455 0.379585 34.385399
std 12.511818 1.151090 0.792999 54.597730
min 0.750000 0.000000 0.000000 0.000000
25% 23.000000 0.000000 0.000000 7.895800 #低于25%的值
50% 28.000000 0.000000 0.000000 15.045800 #低于50%的值
75% 35.000000 1.000000 0.000000 31.387500 #低于75%的值
max 80.000000 8.000000 5.000000 512.329200
"""
# Report the shape of the training and evaluation feature tables.
print(*(frame.shape for frame in (train_df, eval_df)))
"""output:
(627, 9) (264, 9)
"""
# Histogram of the 'age' column split into 20 bins (20 bars).
# In IPython, `train_df.age.hist(bins=20)` on its own displays the same chart.
plt.hist(train_df["age"], bins=20)
plt.show()
# Note: the plot function is used to draw two-dimensional graphics.
##################################################################################################
# Discrete vs. continuous features.
# When training the model, discrete features must be one-hot encoded into a
# vector, whereas continuous features are consumed as plain numeric values.
# Discrete (categorical) features
categoryical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
# Continuous (numeric) features
numerical_columns = ['age', 'fare']
feature_columns = []
for categoryical_column in categoryical_columns:
    # The vocabulary is the set of distinct values the column takes in the
    # training data (i.e. its categories).
    vocab = train_df[categoryical_column].unique()
    # categorical_column_with_vocabulary_list(key, vocabulary) describes the
    # column by name and by all of its possible values, e.g.:
    #   VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'),
    #                                   dtype=tf.string, default_value=-1, num_oov_buckets=0)
    # indicator_column(...) then wraps it into its one-hot representation, e.g.:
    #   IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', ...))
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categoryical_column, vocab)))
# At this point only the one-hot (indicator) columns are present.
print(feature_columns)
for numerical_column in numerical_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numerical_column, dtype=tf.float32))
# Now the numeric columns have been appended after the indicator columns.
print(feature_columns)
"""output:
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
"""
# Show the (feature-dict, labels) pair in the form that is later sliced into a
# tf.data.Dataset: dict(train_df) maps each column name to its column.
features_and_labels = (dict(train_df), y_train)
print(features_and_labels)
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        data_df: Feature table. It is converted with ``dict(data_df)`` so the
            dataset yields a dict keyed by column name.
        label_df: Labels sliced together with ``data_df``
            (presumably one label per feature row; confirm with callers).
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements before
            repeating and batching.
        batch_size: Number of examples per batch.

    Returns:
        The dataset after optional shuffling, repetition and batching.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
# Build a small-batch dataset (5 examples per batch) and print a single batch
# to inspect what the dataset yields: a feature dict `x` and a label tensor `y`.
train_dataset = make_dataset(train_df, y_train, batch_size=5)
for x, y in train_dataset.take(1):
    print(x)
    print(y)
"""output:
{'sex': <tf.Tensor: id=38, shape=(5,), dtype=string, numpy=array([b'female', b'male', b'male', b'female', b'male'], dtype=object)>,
'age': <tf.Tensor: id=30, shape=(5,), dtype=float64, numpy=array([28., 28., 28., 28., 33.])>,
'n_siblings_spouses': <tf.Tensor: id=36, shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 0])>,
'parch': <tf.Tensor: id=37, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0])>,
'fare': <tf.Tensor: id=35, shape=(5,), dtype=float64, numpy=array([146.5208, 26.55 , 31. , 33. , 12.275 ])>,
'class': <tf.Tensor: id=32, shape=(5,), dtype=string, numpy=array([b'First', b'First', b'First', b'Second', b'Second'], dtype=object)>,
'deck': <tf.Tensor: id=33, shape=(5,), dtype=string, numpy=array([b'B', b'C', b'unknown', b'unknown', b'unknown'], dtype=object)>,
'embark_town': <tf.Tensor: id=34, shape=(5,), dtype=string, numpy=array([b'Cherbourg', b'Southampton', b'Southampton', b'Southampton',b'Southampton'], dtype=object)>,
'alone': <tf.Tensor: id=31, shape=(5,), dtype=string, numpy=array([b'n', b'y', b'y', b'y', b'y'], dtype=object)>}
tf.Tensor([1 0 0 1 0], shape=(5,), dtype=int32)
"""
# DenseFeatures applies the feature-column definitions to a batch drawn from
# the dataset.
for x, y in train_dataset.take(1):
    # The categorical columns were appended to feature_columns first, so the
    # first numeric column ('age') sits right after them; 'sex' is at index 0.
    age_column = feature_columns[len(categoryical_columns)]
    gender_column = feature_columns[0]
    # Apply each layer once and reuse the result for both the tensor and the
    # NumPy printout instead of rebuilding the layer for every print.
    age_features = keras.layers.DenseFeatures(age_column)(x)
    gender_features = keras.layers.DenseFeatures(gender_column)(x)
    print(age_features)
    print(gender_features)
    print(age_features.numpy())
    print(gender_features.numpy())
    print(y)
"""output:
tf.Tensor(
[[18.]
[29.]
[27.]
[20.]
[60.]], shape=(5, 1), dtype=float32)
tf.Tensor(
[[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]], shape=(5, 2), dtype=float32)
[[18.]
[29.]
[27.]
[20.]
[60.]]
[[0. 1.]
[1. 0.]
[1. 0.]
[1. 0.]
[0. 1.]]
tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)
"""
# Apply every feature column at once to one batch and print the resulting
# dense matrix (one row per example).
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())
"""output:
[[ 6. 1. 0. 0. 0. 1. 1. 0. 0.
0. 0. 0. 0. 0. 1. 0. 0. 0.
33. 0. 1. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0. 1. ]
[28. 0. 1. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 0. 0. 0. 1. 0.
7.7875 0. 1. 0. 0. 0. 0. 0. 1.
0. 0. 0. 0. 0. 0. 1. ]
[28. 0. 1. 0. 0. 1. 1. 0. 0.
0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1.
0. 0. 0. 0. 0. 1. 0. ]
[45. 1. 0. 1. 0. 0. 1. 0. 0.
0. 0. 0. 0. 0. 0. 1. 0. 0.
14.4542 0. 1. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0. 1. ]
[28. 0. 1. 0. 0. 1. 1. 0. 0.
0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 1. 0. 0. 0. 0. 0. 1.
0. 0. 0. 0. 0. 1. 0. ]]
"""
########################################################################################
# Feed-forward classifier: DenseFeatures turns the raw feature dict into a
# dense tensor, two 100-unit ReLU layers follow, and a 2-unit softmax layer
# produces the output.
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
model.compile(
    # The labels are integer class ids, hence the sparse cross-entropy variant.
    loss='sparse_categorical_crossentropy',
    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    metrics=['accuracy'],
)
# There are two ways to train the model.
# First way: train directly with model.fit.
BATCH_SIZE = 32
train_dataset = make_dataset(train_df, y_train, epochs=100, batch_size=BATCH_SIZE)
eval_dataset = make_dataset(eval_df, y_eval, epochs=100, batch_size=BATCH_SIZE)
history = model.fit(
    train_dataset,
    validation_data=eval_dataset,
    # Number of whole batches in one pass over each split, derived from the
    # data instead of hard-coding the row counts.
    steps_per_epoch=len(train_df) // BATCH_SIZE,
    validation_steps=len(eval_df) // BATCH_SIZE,
    epochs=100,
)
# Second way: convert the Keras model into an Estimator, then train that.
estimator = keras.estimator.model_to_estimator(model)


def _train_input_fn():
    """Return the training dataset for ``estimator.train``.

    input_fn must take no arguments and return either (1) a
    (features, labels) pair given as a list, tuple or NumPy array, or
    (2) a dataset whose elements are (features, labels) pairs.
    """
    return make_dataset(train_df, y_train, epochs=100)


estimator.train(input_fn=_train_input_fn)
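# TensorFlow Estimator, part 1: using feature columns and estimators
# Latest recommended article published 2022-01-31 14:31:51
# This article takes an in-depth look at the TensorFlow Estimator framework and how it works together with FeatureColumn. Through worked examples it shows how to train and evaluate a model with an Estimator, and it details the important role FeatureColumn plays in feature engineering, helping readers master the key techniques for building efficient machine-learning models in TensorFlow.
# Summary generated by CSDN using AI technology.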