# tf.estimator walkthrough on the Titanic dataset.
# train.csv: training set, 628 samples in total.
# eval.csv: test set, 265 samples in total.
# Titanic dataset CSVs — original download locations:
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"
# Load both splits into pandas DataFrames (`pd` is imported elsewhere in the file).
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
# .head() returns the first five rows by default — quick sanity check of the data.
print(train_df.head())
print(eval_df.head())
# 'survived' is the value we want to predict, so it must be split out of the
# feature columns as the label.
# Exploratory-analysis ideas for this data:
#   - summary statistics (describe) and the dataset shape
#   - age distribution
#   - counts per gender (barh = horizontal bars, bar = vertical bars)
#   - passenger counts per cabin class
#   - survivors by gender: concatenate, group by sex, take 'survived', average
# Model construction
# Discrete (categorical) features — one-hot encoded below.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
# Continuous (numeric) features — passed through as float32.
numeric_columns = ['age', 'fare']

# All tf.feature_column definitions are collected here, categorical first.
feature_columns = []
for categorical_column in categorical_columns:
    # Vocabulary = every distinct value this column takes in the training split.
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        # indicator_column turns the vocabulary lookup into a one-hot vector.
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))
# Continuous features. (The loop variable is named for what it actually is;
# the original reused `categorical_column` here, which was misleading.)
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))
# Build the dataset-construction helper.
# Turn a features DataFrame plus labels into a ready-to-train tf.data pipeline.
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32):
    """Return a repeated, batched (and optionally shuffled) tf.data.Dataset.

    Each element is a (feature_dict, label) pair: the DataFrame becomes a
    dict of column-name -> value tensors, zipped with the labels.
    """
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        # 10000 > dataset size, so this is a full shuffle for Titanic.
        ds = ds.shuffle(10000)
    return ds.repeat(epochs).batch(batch_size)
train_dataset = make_dataset(train_df, y_train, batch_size = 5)
# In each dataset element, x is a dict of features and y is the label batch.
# keras.layers.DenseFeatures applies a feature_column to the raw feature dict,
# producing the dense tensor the model will actually see.
for x, y in train_dataset.take(1):
    # NOTE(review): these indices assume the construction order above —
    # 7 = first numeric column ('age'), 0 = first categorical ('sex'); verify.
    age_column = feature_columns[7]
    gender_column = feature_columns[0]
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())
# Continuous features keep their original values through DenseFeatures, while
# the one-hot-encoded categorical features become 0/1 vectors.
# Build the Keras model.
# Sequential model: DenseFeatures maps the raw feature dict to one dense
# vector, followed by two hidden layers and a 2-way softmax head
# (survived / not survived).
model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
# Labels are integer class ids, hence sparse_categorical_crossentropy.
# `lr` is a deprecated alias in tf.keras optimizers — use `learning_rate`.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer = keras.optimizers.SGD(learning_rate=0.01),
              metrics = ['accuracy'])
# Two ways to train: 1. model.fit directly  2. model -> estimator -> train
# Build the training and evaluation datasets.
train_dataset = make_dataset(train_df, y_train, epochs = 100)
eval_dataset = make_dataset(eval_df, y_eval, epochs = 1, shuffle = False)
# steps_per_epoch must not exceed num_samples // batch_size (batch_size is
# 32, make_dataset's default). The original hard-coded 20, one more than
# 627 // 32 = 19, which exhausts the repeated dataset before the final
# epochs and terminates training early — derive both step counts from the
# data instead.
history = model.fit(train_dataset,
                    validation_data = eval_dataset,
                    steps_per_epoch = len(train_df) // 32,
                    validation_steps = len(eval_df) // 32,
                    epochs = 100)
# If steps_per_epoch is set too high, the final epochs run out of data and
# training terminates early.
# Convert the model to an estimator and train it that way.
# Wrap the compiled Keras model in a tf.estimator.Estimator and train it.
estimator = keras.estimator.model_to_estimator(model)
# input_fn contract: a zero-argument callable returning either
#   a. a (features, labels) pair, or
#   b. a tf.data.Dataset yielding (features, labels) elements.
def train_input_fn():
    return make_dataset(train_df, y_train, epochs=100)

estimator.train(input_fn = train_input_fn)
# Reportedly a framework bug (model_to_estimator training issue) — unverified.