数据输入流水线
通过构建tf.data.Dataset对象进行数据的提取、转换、载入
有多种创建数据集对象的方法,具体选择哪种取决于数据源
1.内存中的张量:tf.data.Dataset.from_tensors或者tf.data.Dataset.from_tensor_slices
2.python生成器:tf.data.Dataset.from_generator
3.一系列模式匹配的文件中:tf.data.Dataset.list_files
还有两种特殊的
1.tf.data.TFRecordDataset
2.tf.data.TextLineDataset
import tensorflow as tf
# Build a dataset from an in-memory dict of tensors. from_tensor_slices cuts
# every tensor along its first axis, so each element is a dict holding one
# entry of "a" and one row of "b".
features = {
    "a": tf.random.uniform([4]),
    "b": tf.random.uniform([4, 100], maxval=100, dtype=tf.int32),
}
dataset = tf.data.Dataset.from_tensor_slices(features)

# A dataset is directly iterable; print the "a" component of each element.
for value in dataset:
    print(value["a"])
def noise():
    """Yield an endless stream of random vectors.

    Each yielded item is the result of ``tf.random.uniform((100,))``. The
    generator never terminates on its own, so consumers must stop iterating
    explicitly.
    """
    shape = (100,)
    while True:
        sample = tf.random.uniform(shape)
        yield sample
# Wrap the Python generator in a dataset; the second argument is the dtype of
# the yielded elements.
dataset = tf.data.Dataset.from_generator(noise, tf.float32)

buffer_size = 10
batch_size = 32

# Transformation chain: add 10 to every element, shuffle through a
# `buffer_size`-element buffer, then group elements into batches.
dataset = dataset.map(lambda x: x + 10).shuffle(buffer_size).batch(batch_size)

# The generator is endless, so stop explicitly after two batches. The loop
# variable is called `batch` so that it does not rebind the module-level
# `noise` generator function.
for idx, batch in enumerate(dataset):
    if idx == 2:
        break
    print(idx)
    print(batch.shape)
常见的变换有map、shuffle、batch、repeat等方法
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
def train_dataset(batch_size=32, num_epochs=1):
    """Build the Fashion-MNIST training input pipeline.

    Args:
        batch_size: number of examples per batch; also used as the shuffle
            buffer size.
        num_epochs: how many times the cached data is repeated.

    Returns:
        A ``tf.data.Dataset`` yielding ``(image, label)`` batches, where both
        components carry an extra trailing axis of size 1.
    """
    # Only the training split is consumed here; the test split is discarded.
    (images, labels), _ = fashion_mnist.load_data()

    def scale_fn(image, label):
        # Convert to float32, then shift and scale by (x - 0.5) * 2.
        # NOTE(review): presumably maps uint8 pixels to [-1, 1]; confirm dtype.
        as_float = tf.image.convert_image_dtype(image, tf.float32)
        return (as_float - 0.5) * 2.0, label

    # Append a trailing axis of size 1 to images and labels before slicing.
    tensors = (tf.expand_dims(images, -1), tf.expand_dims(labels, -1))

    pipeline = tf.data.Dataset.from_tensor_slices(tensors)
    pipeline = pipeline.map(scale_fn)
    # Cache the preprocessed elements so later epochs skip the map step.
    pipeline = pipeline.cache()
    pipeline = pipeline.repeat(num_epochs)
    pipeline = pipeline.shuffle(batch_size)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(1)
def make_model(n_classes):
    """Create the convolutional classifier used by the training loop.

    Args:
        n_classes: number of units in the final dense layer.

    Returns:
        An un-compiled ``tf.keras.Sequential`` model taking 28x28x1 inputs.
        The last layer has no activation, so its outputs are raw scores.
    """
    layers = tf.keras.layers
    relu = tf.nn.relu

    stack = [
        # Two convolution + max-pooling stages.
        layers.Conv2D(32, (5, 5), activation=relu, input_shape=(28, 28, 1)),
        layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        layers.Conv2D(64, (3, 3), activation=relu),
        layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        # Dense head with dropout in between.
        layers.Flatten(),
        layers.Dense(1024, activation=relu),
        layers.Dropout(0.5),
        layers.Dense(n_classes),
    ]
    return tf.keras.Sequential(stack)
def train():
    """Train the CNN on Fashion-MNIST with a custom training loop.

    Builds the model and the input pipeline, then runs gradient-descent steps
    over the whole dataset inside a single ``tf.function``. Loss and running
    accuracy are printed whenever the global step is a multiple of 10.
    """
    # Define the model
    n_classes = 10
    model = make_model(n_classes)

    # Input data
    dataset = train_dataset(num_epochs=10)

    # Training parameters
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    step = tf.Variable(1, name="global_step")
    optimizer = tf.optimizers.Adam(1e-3)
    accuracy = tf.metrics.Accuracy()

    # Train step function
    @tf.function
    def train_step(inputs, labels):
        """Run one optimization step; return (loss, running accuracy)."""
        with tf.GradientTape() as tape:
            # training=True is required for the Dropout layer to be active;
            # without it the model runs in inference mode in a custom loop.
            logits = model(inputs, training=True)
            loss_value = loss(labels, logits)

        gradients = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        step.assign_add(1)

        # The metric object accumulates across calls, so this is a running
        # accuracy rather than the accuracy of the current batch alone.
        accuracy_value = accuracy(labels, tf.argmax(logits, -1))
        return loss_value, accuracy_value

    @tf.function
    def loop():
        """Iterate over the dataset, logging every time step % 10 == 0."""
        for features, labels in dataset:
            loss_value, accuracy_value = train_step(features, labels)
            if tf.equal(tf.math.mod(step, 10), 0):
                tf.print(step, ": ", loss_value, " - accuracy: ", accuracy_value)

    loop()
# Script entry point: start training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    train()
此部分代码可以与使用Keras框架的实现方式进行比较
性能优化
prefetch(预取):在tf.data中,prefetch让输入流水线在模型处理当前批次数据的同时,在后台提前准备后续批次,使数据准备与模型计算相互重叠,从而减少等待时间。这与CPU的预取类似:程序员告诉CPU哪些内容可能马上用到,CPU提前取好,用于优化。
cache(缓存) 性能
使用.cache()方法:当计算缓存空间足够时,将preprocess的数据存储在缓存空间中将大幅提高计算速度。
使用TFRecords,它是一个二进制文件,将数据转化为此类型可以快速读取文件
tensorflow数据集(tensorflow_datasets)是一个立即可用的数据集集合,它不仅负责下载数据集并将其转换为标准的格式,还可以将其转化成TFRecords格式。它是一个独立的包,需要利用pip进行安装
有两个方法:
list_builders:返回可用数据集列表
load(name,split):接受可用数据集的名字以及分割类型
import tensorflow_datasets as tfds
# List the names of every dataset that tensorflow_datasets can provide.
available = tfds.list_builders()
print(available)

# Request the train and test splits of MNIST in one call; the result is
# unpacked into one tf.data.Dataset object per requested split.
splits = tfds.load(name="mnist", split=["train", "test"])
ds_train, ds_test = splits

# The builder exposes the dataset's metadata through its `info` attribute.
builder = tfds.builder("mnist")
print(builder.info)
估计器API
为了简化机器学习编程,减少重复性的工作,提出了估计器API
估计器包括定制和预制两类,共享一个架构,都是为了构建一个tf.estimator.EstimatorSpec对象,该对象可以定义被tf.estimator.Estimator运行的模型。具体代码如下
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
# Parameterized definition of the input dataset, including its preprocessing.
def get_input_fn(mode, batch_size=32, num_epochs=1):
    """Return an ``input_fn`` closure for the given estimator mode.

    The Fashion-MNIST training split is used for TRAIN. The test split is cut
    in half: the first half serves EVAL and the second half serves PREDICT.

    Args:
        mode: one of ``tf.estimator.ModeKeys.TRAIN``, ``EVAL`` or ``PREDICT``.
        batch_size: number of examples per batch.
        num_epochs: repetitions of the data; only applied in TRAIN mode.

    Returns:
        A zero-argument function that builds and returns the
        ``tf.data.Dataset`` of ``(image, label)`` batches.

    Raises:
        ValueError: if ``mode`` is not one of the three supported mode keys.
    """
    (train_x, train_y), (test_x, test_y) = fashion_mnist.load_data()
    half = test_x.shape[0] // 2

    if mode == tf.estimator.ModeKeys.TRAIN:
        input_x, input_y = train_x, train_y
        train = True
    elif mode == tf.estimator.ModeKeys.EVAL:
        input_x, input_y = test_x[:half], test_y[:half]
        train = False
    elif mode == tf.estimator.ModeKeys.PREDICT:
        # Slice to the end of the array: the previous `[half:-1]` silently
        # dropped the last test example.
        input_x, input_y = test_x[half:], test_y[half:]
        train = False
    else:
        raise ValueError("tf.estimator.ModeKeys required!")

    def scale_fn(image, label):
        """Convert the image to float32 and apply (x - 0.5) * 2; cast label."""
        return (
            (tf.image.convert_image_dtype(image, tf.float32) - 0.5) * 2.0,
            tf.cast(label, tf.int32),
        )

    def input_fn():
        """Build the dataset for the mode captured by the enclosing call."""
        # Both images and labels get an extra trailing axis of size 1.
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.expand_dims(input_x, -1), tf.expand_dims(input_y, -1))
        ).map(scale_fn)
        if train:
            # Shuffling and repetition are applied to training data only.
            dataset = dataset.shuffle(10).repeat(num_epochs)
        dataset = dataset.batch(batch_size).prefetch(1)
        return dataset

    return input_fn
# Model
def make_model(n_classes):
    """Assemble the CNN classifier layer by layer.

    Args:
        n_classes: size of the output layer.

    Returns:
        A ``tf.keras.Sequential`` model expecting 28x28x1 inputs. The output
        layer has no activation function.
    """
    keras_layers = tf.keras.layers
    model = tf.keras.Sequential()

    # First convolutional stage; declares the expected input shape.
    model.add(
        keras_layers.Conv2D(
            32, (5, 5), activation=tf.nn.relu, input_shape=(28, 28, 1)
        )
    )
    model.add(keras_layers.MaxPool2D((2, 2), (2, 2)))

    # Second convolutional stage.
    model.add(keras_layers.Conv2D(64, (3, 3), activation=tf.nn.relu))
    model.add(keras_layers.MaxPool2D((2, 2), (2, 2)))

    # Fully connected head with dropout before the output layer.
    model.add(keras_layers.Flatten())
    model.add(keras_layers.Dense(1024, activation=tf.nn.relu))
    model.add(keras_layers.Dropout(0.5))
    model.add(keras_layers.Dense(n_classes))
    return model
# Define the model in a form the estimator can run.
def model_fn(features, labels, mode):
    """Build the ``EstimatorSpec`` for the requested mode.

    Args:
        features: batch of input images fed to the model.
        labels: batch of labels with a trailing axis of size 1 (as produced
            by ``get_input_fn``); not used in PREDICT mode.
        mode: one of ``tf.estimator.ModeKeys.TRAIN``, ``EVAL`` or ``PREDICT``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying predictions (PREDICT),
        loss and metrics (EVAL), or loss and train op (TRAIN).

    Raises:
        NotImplementedError: if ``mode`` is none of the three mode keys.
    """
    v1 = tf.compat.v1
    model = make_model(10)

    # Enable training-only behaviour (the Dropout layer) in TRAIN mode only;
    # without an explicit flag the layer is never switched on.
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    logits = model(features, training=is_training)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Extract the predictions
        predictions = v1.argmax(logits, -1)
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Squeeze only the trailing label axis: an axis-less squeeze would also
    # remove the batch axis when a batch holds a single example.
    loss = v1.reduce_mean(
        v1.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=v1.squeeze(labels, axis=-1)
        )
    )

    global_step = v1.train.get_global_step()

    # Compute evaluation metrics.
    accuracy = v1.metrics.accuracy(
        labels=labels, predictions=v1.argmax(logits, -1), name="accuracy"
    )
    # The metrics dictionary is used by the estimator during the evaluation
    metrics = {"accuracy": accuracy}

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        opt = v1.train.AdamOptimizer(1e-4)
        train_op = opt.minimize(
            loss, var_list=model.trainable_variables, global_step=global_step
        )
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    raise NotImplementedError(f"Unknown mode {mode}")
# Tell the user where the logs go; the TensorBoard flag is `--logdir`.
print("Every log is on Tensorboard, please run tensorboard --logdir log")
estimator = tf.estimator.Estimator(model_fn, model_dir="log")

# get_input_fn loads the dataset every time it is called, so build the two
# input functions once instead of on every iteration of the loop.
train_input_fn = get_input_fn(tf.estimator.ModeKeys.TRAIN, num_epochs=1)
eval_input_fn = get_input_fn(tf.estimator.ModeKeys.EVAL)

# Alternate one training pass with one evaluation pass, 50 times.
for epoch in range(50):
    print(f"Training for the {epoch}-th epoch")
    estimator.train(train_input_fn)
    print("Evaluating...")
    estimator.evaluate(eval_input_fn)