"""
@author: khoing
@contact: Khoing@126.com
@time: 2019/12/17 16:58
@file: tf_premade_estimators.py
"""
import matplotlib as mpl # Matplotlib 是 Python 的绘图库。 它可与 NumPy 一起使用
import matplotlib.pyplot as plt # Python数据可视化matplotlib.pyplot
# %matplotlib inline #在使用jupyter notebook 或者 jupyter qtconsole的时候,经常会用到%matplotlib inline。其作用就是在你调用plot()进行画图或者直接输入Figure的实例对象的时候,会自动的显示并把figure嵌入到console中。
import numpy as np # 数值计算扩展。这种工具可用来存储和处理大型矩阵
import sklearn # 机器学习中常用的第三方模块,对常用的机器学习方法进行了封装,包括回归(Regression)、降维(Dimensionality Reduction)、分类(Classfication)、聚类(Clustering)等方法。
import pandas as pd # 是python的一个数据分析包
import os # 系统编程的操作模块,可以处理文件和目录
import sys # sys模块包含了与Python解释器和它的环境有关的函数
import time
import tensorflow as tf
from tensorflow import keras
##################################################################################################
# Select the GPU: expose only CUDA device 0 to this process.
# NOTE(review): this is assigned after tensorflow has been imported; presumably
# it still takes effect because no GPU work has started yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
##################################################################################################
# Report the interpreter and library versions used for this run.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    # The loop body must be indented; the pasted source had lost its indentation.
    print(module.__name__, module.__version__)
"""output:
2.0.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.5
pandas 0.25.3
sklearn 0.21.3
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
"""
##################################################################################################
# Paths of the two CSV splits.
train_file = "./data/train.csv"  # training set
eval_file = "./data/eval.csv"  # evaluation set
# pandas provides a CSV reader; load the training split first, then the evaluation split.
train_df, eval_df = (pd.read_csv(csv_path) for csv_path in (train_file, eval_file))
# Preview the first rows of each split, one frame after the other.
print(train_df.head(), eval_df.head(), sep="\n")
"""output:
survived sex age ... deck embark_town alone
0 0 male 22.0 ... unknown Southampton n
1 1 female 38.0 ... C Cherbourg n
2 1 female 26.0 ... unknown Southampton y
3 1 female 35.0 ... C Southampton n
4 0 male 28.0 ... unknown Queenstown y
[5 rows x 10 columns]
survived sex age ... deck embark_town alone
0 0 male 35.0 ... unknown Southampton y
1 0 male 54.0 ... E Southampton y
2 1 female 58.0 ... C Southampton y
3 1 female 55.0 ... unknown Southampton y
4 1 male 34.0 ... D Southampton y
[5 rows x 10 columns]
"""
# Remove the 'survived' column from both frames; it is the value to predict.
# pop() also drops the column from the frame, so train_df / eval_df keep only
# the input features afterwards.
y_train, y_eval = (frame.pop('survived') for frame in (train_df, eval_df))
# Show the remaining feature columns of each split.
print(train_df.head(), eval_df.head(), sep="\n")
"""output:
sex age n_siblings_spouses parch ... class deck embark_town alone
0 male 22.0 1 0 ... Third unknown Southampton n
1 female 38.0 1 0 ... First C Cherbourg n
2 female 26.0 0 0 ... Third unknown Southampton y
3 female 35.0 1 0 ... First C Southampton n
4 male 28.0 0 0 ... Third unknown Queenstown y
[5 rows x 9 columns]
sex age n_siblings_spouses parch ... class deck embark_town alone
0 male 35.0 0 0 ... Third unknown Southampton y
1 male 54.0 0 0 ... First E Southampton y
2 female 58.0 0 0 ... First C Southampton y
3 female 55.0 0 0 ... Second unknown Southampton y
4 male 34.0 0 0 ... Second D Southampton y
[5 rows x 9 columns]
"""
# Preview the first labels of the training and evaluation splits.
print(y_train.head(), y_eval.head(), sep="\n")
"""output:
0 0
1 1
2 1
3 1
4 0
Name: survived, dtype: int64
0 0
1 0
2 1
3 1
4 1
Name: survived, dtype: int64
"""
##################################################################################################
# Discrete (categorical) vs. continuous (numeric) features.
# For training, every discrete feature is one-hot encoded into a vector.
# Discrete features
categoryical_columns = ['sex','n_siblings_spouses','parch','class','deck','embark_town','alone']
# Continuous features
numerical_columns = ['age','fare']
feature_columns = []
for categoryical_column in categoryical_columns:
    # The distinct values seen in the training data form the vocabulary
    # (the set of categories itself, not just how many there are).
    vocab = train_df[categoryical_column].unique()
    # categorical_column_with_vocabulary_list(name, vocab) takes the feature
    # name and all of its possible values, e.g. ('sex', ['male', 'female']).
    # Recorded example of the resulting column:
    #   VocabularyListCategoricalColumn(key='sex',
    #       vocabulary_list=('male', 'female'), dtype=tf.string,
    #       default_value=-1, num_oov_buckets=0)
    # indicator_column(...) then wraps it for one-hot encoding:
    #   IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', ...))
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categoryical_column, vocab)))
# Printed once, after the loop: the list holds one indicator column per
# categorical feature at this point.
print(feature_columns)
# Continuous features are added as float32 numeric columns.
for numerical_column in numerical_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numerical_column, dtype=tf.float32))
# Printed once, after the loop: categorical indicator columns followed by the
# numeric columns.
print(feature_columns)
"""output
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Southampton', 'Cherbourg', 'Queenstown', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
"""
#########################################################################################################################
# Crossed features: e.g. age in [1, 2, 3] combined with gender in [male, female]
# yields age&gender pairs: [(1, male), (2, male), (3, male), ...].
#
# crossed_column(['age', 'sex'], hash_bucket_size=100) hashes the pairs into
# 100 buckets, giving a compact representation of the crossed feature.
# A DNN needs the crossed column wrapped in an indicator column.
feature_columns.append(
    tf.feature_column.indicator_column(
        categorical_column=tf.feature_column.crossed_column(
            keys=['age','sex'],
            hash_bucket_size=100,
        )
    )
)
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        data_df: Feature table; callers in this script pass a pandas
            DataFrame. ``dict(data_df)`` turns it into a mapping from column
            name to column values, so each element's features are keyed by
            column name.
        label_df: Labels aligned row-by-row with ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle (buffer of 10000 elements) before
            repeating and batching.
        batch_size: Number of examples per batch.

    Returns:
        The dataset, optionally shuffled, then repeated ``epochs`` times and
        grouped into batches of ``batch_size``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    # Repeat/batch must run regardless of `shuffle`: the evaluation calls in
    # this script pass shuffle=False and still rely on batch_size.
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
##############################################################################################
# Baseline estimator: constructed without feature columns, it serves as the
# reference point for the linear and DNN models.
output_dir = 'baseline_model'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(output_dir, exist_ok=True)
baseline_estimator = tf.estimator.BaselineClassifier(model_dir=output_dir,n_classes=2)
# Train on the training split repeated for 100 epochs (default shuffling and batch size).
baseline_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
# Evaluate with a single, unshuffled pass over the evaluation split in batches of 20.
# NOTE: the value returned by evaluate() is not captured or printed here.
baseline_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))
##############################################################################################
# Linear classifier trained on the feature columns assembled above.
linear_output_dir = 'linear_model'
# Bug fix: the original checked for linear_output_dir but then created
# output_dir (the baseline directory). That directory already exists at this
# point, so the call raised FileExistsError and 'linear_model' was never
# created. Create the linear model's own directory instead.
os.makedirs(linear_output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns)
# Train on the training split repeated for 100 epochs (default shuffling and batch size).
linear_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
# Evaluate with a single, unshuffled pass over the evaluation split in batches of 20.
# NOTE: the value returned by evaluate() is not captured or printed here.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))
##############################################################################################
# DNN classifier: two hidden layers of 128 units each, ReLU activation, Adam optimizer.
dnn_output_dir = 'dnn_model'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128,128],
    activation_fn=tf.nn.relu,
    optimizer='Adam')
# Train on the training split repeated for 100 epochs (default shuffling and batch size).
dnn_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
# Evaluate with a single, unshuffled pass over the evaluation split in batches of 20.
# NOTE: the value returned by evaluate() is not captured or printed here.
dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))
# Tensorflow-estimator_2. Using the premade estimators (baseline_estimator, linear_estimator, dnn_estimator)
# Latest recommended article published on 2022-07-23 19:00:14