# 如何使用TensorFlow中的Dataset API

https://github.com/FrancescoSaverioZuppichini/Tensorflow-Dataset-Tutorial/blob/master/dataset_tutorial.ipynb

## ▌概述

1. 载入数据：为数据创建一个Dataset实例

2. 创建一个迭代器：使用创建的数据集来构造一个Iterator实例以遍历数据集

3. 使用数据：使用创建的迭代器，我们可以从数据集中获取数据元素，从而输入到模型中去。

## ▌载入数据

### 从numpy载入

# create a random vector of shape (100,2)
x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)

features, labels = (np.random.sample((100,2)), np.random.sample((100,1)))
dataset = tf.data.Dataset.from_tensor_slices((features,labels))

### 从tensors中载入

# using a tensor
dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([100, 2]))

### 从placeholder中载入

x = tf.placeholder(tf.float32, shape=[None,2])
dataset = tf.data.Dataset.from_tensor_slices(x)

### 从generator载入

sequence = np.array([[1],[2,3],[3,4]])

def generator():
for el in sequence:
yield el

dataset = tf.data.Dataset().from_generator(generator,

output_types=tf.float32,
output_shapes=[tf.float32])

## ▌创建一个迭代器

### One shot Iterator

x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)

# create the iterator
iter = dataset.make_one_shot_iterator()

...
# create the iterator
iter = dataset.make_one_shot_iterator()
el = iter.get_next()

with tf.Session() as sess:
print(sess.run(el)) # output: [ 0.42116176  0.40666069]

### 可初始化的迭代器

# using a placeholder
x = tf.placeholder(tf.float32, shape=[None,2])
dataset = tf.data.Dataset.from_tensor_slices(x)
data = np.random.sample((100,2))
iter = dataset.make_initializable_iterator() # create the iterator
el = iter.get_next()
with tf.Session() as sess:
# feed the placeholder with data
sess.run(iter.initializer, feed_dict={ x: data })
print(sess.run(el)) # output [ 0.52374458  0.71968478]

train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.array([[1,2]]), np.array([[0]]))

# initializable iterator to switch between dataset
EPOCHS = 10
x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])
dataset = tf.data.Dataset.from_tensor_slices((x, y))
train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.array([[1,2]]), np.array([[0]]))
iter = dataset.make_initializable_iterator()
features, labels = iter.get_next()
with tf.Session() as sess:
#     initialise iterator with train data
sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1]})
for _ in range(EPOCHS):
sess.run([features, labels])
#     switch to test data
sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1]})
print(sess.run([features, labels]))

### 可重新初始化的迭代器

# making fake data using numpy
train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.random.sample((10,2)), np.random.sample((10,1)))

# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)

# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
train_dataset.output_shapes)

# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)

features, labels = iter.get_next()

# Reinitializable iterator to switch between Datasets
EPOCHS = 10
# making fake data using numpy
train_data = (np.random.sample((100,2)), np.random.sample((100,1)))
test_data = (np.random.sample((10,2)), np.random.sample((10,1)))
# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)
# create a iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
train_dataset.output_shapes)
features, labels = iter.get_next()
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)
with tf.Session() as sess:
sess.run(train_init_op) # switch to train dataset
for _ in range(EPOCHS):
sess.run([features, labels])
sess.run(test_init_op) # switch to val dataset
print(sess.run([features, labels]))

## ▌使用数据

...
next_el = iter.get_next()
...
print(sess.run(next_el)) # will output the current element

# using two numpy arrays
features, labels = (np.array([np.random.sample((100,2))]),
np.array([np.random.sample((100,1))]))
dataset =
tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)

iter = dataset.make_one_shot_iterator()x, y = iter.get_next()

# make a simple model
net = tf.layers.dense(x, 8) # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8)
prediction = tf.layers.dense(net, 1)
loss = tf.losses.mean_squared_error(prediction, y) # pass the second value from iter.get_net() as label

EPOCHS = 10
BATCH_SIZE = 16
# using two numpy arrays
features, labels = (np.array([np.random.sample((100,2))]),
np.array([np.random.sample((100,1))]))
dataset = tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()
# make a simple model
net = tf.layers.dense(x, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
loss = tf.losses.mean_squared_error(prediction, y) # pass the second value from iter.get_net() as label
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(EPOCHS):
_, loss_value = sess.run([train_op, loss])
print("Iter: {}, Loss: {:.4f}".format(i, loss_value))

Iter: 0, Loss: 0.1328
Iter: 1, Loss: 0.1312
Iter: 2, Loss: 0.1296
Iter: 3, Loss: 0.1281
Iter: 4, Loss: 0.1267
Iter: 5, Loss: 0.1254
Iter: 6, Loss: 0.1242
Iter: 7, Loss: 0.1231
Iter: 8, Loss: 0.1220
Iter: 9, Loss: 0.1210

## ▌有用的技巧

### batch

# BATCHING
BATCH_SIZE = 4
x = np.random.sample((100,2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)

iter = dataset.make_one_shot_iterator()
el = iter.get_next()

with tf.Session() as sess:
print(sess.run(el))

[[ 0.65686128  0.99373963]
[ 0.69690451  0.32446826]
[ 0.57148422  0.68688242]
[ 0.20335116  0.82473219]]

Repeat

### Shuffle

# BATCHING
BATCH_SIZE = 4
x = np.array([[1],[2],[3],[4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
print(sess.run(el))

[[4]
[2]
[3]
[1]]

[[3]
[1]
[2]
[4]]

## ▌Map

# MAP
x = np.array([[1],[2],[3],[4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.map(lambda x: x*2)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
#     this will run forever
for _ in range(len(x)):
print(sess.run(el))

[2]
[4]
[6]
[8]

## 其他资源

TensorFlow dataset tutorial: https://www.tensorflow.org/programmers_guide/datasets

Dataset docs:https://www.tensorflow.org/api_docs/python/tf/data/Dataset

## ▌结论

Dataset API提供了一种快速而且鲁棒的方法来创建优化的输入管道来训练、评估和测试我们的模型。在这篇文章中，我们了解了很多常见的利用Dataset API的操作。

• 本文已收录于以下专栏：

举报原因： 您举报文章：如何使用TensorFlow中的Dataset API 色情 政治 抄袭 广告 招聘 骂人 其他 (最多只允许输入30个字)