先把dataset转成tfrecord格式,再用tf.data进行读取,然后用模型进行训练
# TFRecord file format:
#   -> tf.train.Example
#      -> tf.train.Features -> {"key": tf.train.Feature}
#         -> tf.train.Feature -> tf.train.BytesList / FloatList / Int64List

# BytesList stores raw bytes, so the titles are UTF-8 encoded first.
favorite_books = [
    title.encode('utf-8') for title in ("machine learning", "cc150")
]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value=[42])
print(age_int64list)

# Each typed list is wrapped in a Feature through the field matching its type,
# and the Features message maps a feature name to that Feature.
features = tf.train.Features(feature={
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
})
print(features)
example 与 序列化(压缩减小size)
保存tfrecord文件
读取
反序列化
tfrecord压缩
压缩后读取tfrecord
tfrecord 的生成、读取与使用
# Directory that is listed below for files whose names start with
# "train", "valid" or "test".
source_dir = "./generate_csv/"
def get_filenames_by_prefix(source_dir, prefix_name):
    """Return the paths of the entries in ``source_dir`` named ``prefix_name*``.

    Args:
        source_dir: Directory to list. Only its direct entries are examined;
            sub-directories are not searched.
        prefix_name: Prefix an entry name must start with to be selected.

    Returns:
        A list of ``os.path.join(source_dir, name)`` paths, sorted by entry
        name. The list is empty when nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when
            ``source_dir`` does not exist).
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    return [
        os.path.join(source_dir, filename)
        for filename in sorted(os.listdir(source_dir))
        if filename.startswith(prefix_name)
    ]
# Collect the file paths of each split from source_dir, selected by the
# prefix of the file name (evaluated in train, valid, test order).
train_filenames, valid_filenames, test_filenames = (
    get_filenames_by_prefix(source_dir, prefix)
    for prefix in ("train", "valid", "test")
)

import pprint

# Pretty-print the three lists so the discovered files can be checked by eye.
for _split_filenames in (train_filenames, valid_filenames, test_filenames):
    pprint.pprint(_split_filenames)
读取csv装成dataset
读取dataset进行遍历,把取到的数据写到tfrecord文件里去
读取tfrecord
# Parsing schema for one serialized tf.train.Example: a fixed-length vector of
# 8 float32 values under "input_features" and a single float32 under "label".
expected_features = {
    "input_features": tf.io.FixedLenFeature(shape=[8], dtype=tf.float32),
    "label": tf.io.FixedLenFeature(shape=[1], dtype=tf.float32),
}
def parse_example(serialized_example):
    """Decode one serialized example into an ``(input_features, label)`` pair.

    Args:
        serialized_example: A single serialized ``tf.train.Example`` record.

    Returns:
        A 2-tuple with the values parsed under the "input_features" and
        "label" keys of the module-level ``expected_features`` schema.
    """
    parsed = tf.io.parse_single_example(
        serialized_example, expected_features)
    inputs = parsed["input_features"]
    target = parsed["label"]
    return inputs, target
def tfrecords_reader_dataset(filenames, n_readers=5,
                             batch_size=32, n_parse_threads=5,
                             shuffle_buffer_size=10000):
    """Build a repeating, shuffled, batched dataset from GZIP TFRecord files.

    Args:
        filenames: File name(s) or pattern(s) handed to
            ``tf.data.Dataset.list_files``.
        n_readers: ``cycle_length`` of the interleave, i.e. how many files
            are read from concurrently.
        batch_size: Number of parsed examples per emitted batch.
        n_parse_threads: ``num_parallel_calls`` used when mapping
            ``parse_example`` over the records.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of the
        ``(input_features, label)`` pairs produced by ``parse_example``.
        The file list is repeated without a count, so the dataset does not
        end on its own; callers must bound it (``take`` or
        ``steps_per_epoch`` / ``validation_steps``).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    # repeat() without a count: the list of files is cycled indefinitely.
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TFRecordDataset(
            filename, compression_type="GZIP"),
        cycle_length=n_readers
    )
    # Dataset transformations return a new dataset rather than modifying the
    # receiver, so the result has to be assigned back. Calling shuffle() and
    # discarding its return value leaves the records unshuffled.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_example,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
# Quick check of the reader: build a dataset with tiny batches of 3 and print
# the first two (features, label) batches.
tfrecords_train = tfrecords_reader_dataset(
    train_tfrecord_filenames, batch_size=3)
for x_batch, y_batch in tfrecords_train.take(2):
    print(x_batch, y_batch, sep="\n")
生成训练中使用的数据集
# Batch size shared by the three datasets built below.
batch_size = 32

# One reader dataset per split, all with the same batch size.
tfrecords_train_set = tfrecords_reader_dataset(train_tfrecord_filenames,
                                               batch_size=batch_size)
tfrecords_valid_set = tfrecords_reader_dataset(valid_tfrecord_filenames,
                                               batch_size=batch_size)
# NOTE(review): "test_tfrecord_fielnames" looks like a misspelling of
# "test_tfrecord_filenames"; the name is kept because its definition is not
# visible here -- confirm against where it is assigned.
tfrecords_test_set = tfrecords_reader_dataset(test_tfrecord_fielnames,
                                              batch_size=batch_size)
训练
# Small regression network: 8 inputs -> Dense(30, relu) -> Dense(1).
model = keras.models.Sequential()
model.add(keras.layers.Dense(30, activation='relu', input_shape=[8]))
model.add(keras.layers.Dense(1))
model.compile(optimizer="sgd", loss="mean_squared_error")

# Stop when the monitored metric has not improved by at least 1e-2 for
# 5 consecutive epochs.
callbacks = [
    keras.callbacks.EarlyStopping(min_delta=1e-2, patience=5),
]

# Explicit step counts are passed for both training and validation.
# NOTE(review): 11160 and 3870 are presumably the numbers of training and
# validation samples -- confirm against the data split.
history = model.fit(
    tfrecords_train_set,
    epochs=100,
    steps_per_epoch=11160 // batch_size,
    validation_data=tfrecords_valid_set,
    validation_steps=3870 // batch_size,
    callbacks=callbacks,
)