训练泰坦尼克号生还者预测模型
训练一个简单的模型预测泰坦尼克号生还者模型
训练一个模型需要:
- 获取数据文件
- 数据预处理
  1. 非数值数据处理(映射字符串为数值,处理空值)
  2. 数值处理(归一化)
- 构建模型
- 训练模型,可视化训练过程
- 预测
数据预处理代码
为了数据处理模块适应多种数据类型,这里定义了基类DataProcessBase
以实现后续处理不同的数据类型。每个类需要实现自己的process方法。
from collections import OrderedDict
from functools import partial
from os import pardir
from os.path import join

import pandas as pd
import tensorflow as tf

from DataProcess.DataProcessBase import DataProcessBase
class CsvDataProcess(DataProcessBase):
    """Preprocess Titanic-style CSV data for a Keras model.

    Builds numeric feature columns (scaled by the training-set mean) and
    categorical indicator columns (vocabulary taken from the training set),
    and exposes batched ``tf.data`` datasets for training and evaluation.
    """

    def __init__(self, train_csv, eval_csv, mode='train'):
        """
        Args:
            train_csv: path to the training CSV file (header row required).
            eval_csv: path to the evaluation CSV file.
            mode: kept for interface compatibility; currently unused.
        """
        self.train_csv = train_csv
        self.eval_csv = eval_csv
        self.mode = mode
        self.train_data = pd.read_csv(self.train_csv)
        self.columns, self.label_name = self.data_field_process(label_index=0)

    def data_field_process(self, label_index):
        """Return ``(column_names, label_name)`` read from the CSV header.

        The label column is taken by position (``label_index``), so the
        label must be the first column with the default of 0.
        """
        column_names = self.train_data.columns.tolist()
        label_name = column_names[label_index]
        return column_names, label_name

    def get_dataset(self, batch_size=12, num_epochs=1):
        """Build batched (features, label) datasets for train and eval CSVs.

        Returns:
            A ``(train_dataset, eval_dataset)`` pair of ``tf.data.Dataset``s.
        """
        # Both datasets share every option except the file path, so build
        # them with one pre-configured factory.
        make = partial(tf.data.experimental.make_csv_dataset,
                       batch_size=batch_size,
                       label_name=self.label_name,
                       na_value='?',
                       num_epochs=num_epochs,
                       ignore_errors=True)
        return make(self.train_csv), make(self.eval_csv)

    def continous_data_mean(self, mean, data):
        """Scale a continuous feature by ``1 / (2 * mean)`` as a column vector.

        Note: this is a crude rescaling (values near the mean map to ~0.5),
        not true standardization.
        """
        # Was 'data * 1 / (2 * mean)'; the '* 1' was a no-op.
        data = tf.cast(data, tf.float32) / (2 * mean)
        return tf.reshape(data, [-1, 1])

    def process_continous_data(self, continous_fields=('age', 'n_siblings_spouses', 'parch', 'fare')):
        """Create numeric feature columns normalized by each field's mean.

        Fixed: the default argument was a mutable list shared across calls;
        a tuple is safe and backward-compatible. The two-pass OrderedDict
        build was collapsed into a single loop (same order, same result).
        """
        numerical_columns = []
        for field in continous_fields:
            mean = self.train_data[field].mean()
            numerical_columns.append(
                tf.feature_column.numeric_column(
                    field,
                    normalizer_fn=partial(self.continous_data_mean, mean)))
        return numerical_columns

    def process_discrete_data(self, discrete_fields=('sex', 'class', 'deck', 'embark_town', 'alone')):
        """Create one-hot indicator columns from each field's vocabulary.

        The vocabulary is the sorted set of values seen in the training
        data, excluding the 'unknown' placeholder. Fixed: mutable-list
        default replaced with a tuple (backward-compatible).
        """
        categorical_columns = []
        for field in discrete_fields:
            vocab = sorted(v for v in set(self.train_data[field])
                           if v != 'unknown')
            cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
                key=field, vocabulary_list=vocab)
            categorical_columns.append(tf.feature_column.indicator_column(cat_col))
        return categorical_columns

    def process(self, continous_fields, descrete_fields):
        """Return the combined feature columns: categorical then numeric."""
        numerical_columns = self.process_continous_data(continous_fields)
        discrete_columns = self.process_discrete_data(descrete_fields)
        return discrete_columns + numerical_columns
定义模型
# --- Model definition and training --------------------------------------
# Build the preprocessing layer from the feature columns, stack a small
# MLP on top, and train it as a binary (survived / not survived) classifier.
# Fixed: 'join' and 'pardir' were used without being imported anywhere in
# the file (now imported at the top).
train_csv = join(pardir, 'datas/train.csv')
eval_csv = join(pardir, 'datas/eval.csv')
cdp = CsvDataProcess(train_csv, eval_csv=eval_csv)
# Fixed typo in the local name ('descret' -> 'discrete'); the keyword
# argument spelling below matches the method's parameter name unchanged.
discrete_fields = ['sex', 'class', 'deck', 'embark_town', 'alone']
continuous_fields = ['age', 'n_siblings_spouses', 'parch', 'fare']
feature_columns = cdp.process(continous_fields=continuous_fields,
                              descrete_fields=discrete_fields)
# DenseFeatures turns the raw CSV feature dict into a single dense tensor.
preprocessing_layer = tf.keras.layers.DenseFeatures(feature_columns)
model = tf.keras.Sequential([
    preprocessing_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    # Single sigmoid unit: probability of survival.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
raw_train_dataset, raw_eval_dataset = cdp.get_dataset()
# Log metrics and weight histograms so training can be inspected in
# TensorBoard (tensorboard --logdir /tmp/train_model).
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='/tmp/train_model', histogram_freq=1)
model.fit(raw_train_dataset, epochs=20, callbacks=[tensorboard_callback])
模型为一个多层感知机。