# 如何使用TensorFlow中的Dataset API

https://github.com/FrancescoSaverioZuppichini/Tensorflow-Dataset-Tutorial/blob/master/dataset_tutorial.ipynb

## ▌概述

1. 载入数据：为数据创建一个Dataset实例

2. 创建一个迭代器：使用创建的数据集来构造一个Iterator实例以遍历数据集

3. 使用数据：使用创建的迭代器，我们可以从数据集中获取数据元素，从而输入到模型中去。

## ▌载入数据

### 从numpy载入

# Load a dataset directly from a numpy array.
# create a random vector of shape (100,2)
x = np.random.sample((100, 2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)

# A dataset built from a (features, labels) pair of numpy arrays;
# each element of the dataset is one (feature_row, label_row) pair.
features, labels = (np.random.sample((100, 2)), np.random.sample((100, 1)))
dataset = tf.data.Dataset.from_tensor_slices((features, labels))

### 从tensors中载入

# Load a dataset from an existing TensorFlow tensor.
# using a tensor
dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([100, 2]))

### 从placeholder中载入

# Load a dataset from a placeholder; the actual data is fed in later
# via the initializable iterator's feed_dict.
x = tf.placeholder(tf.float32, shape=[None, 2])
dataset = tf.data.Dataset.from_tensor_slices(x)

### 从generator载入

# Load a dataset from a Python generator -- useful for variable-length
# elements such as these ragged rows.
sequence = np.array([[1], [2, 3], [3, 4]])

def generator():
    # yield one element of `sequence` at a time
    for el in sequence:
        yield el

# from_generator is a class method: do NOT instantiate Dataset first
# (tf.data.Dataset() was a bug in the original snippet).
# output_shapes expects a shape, not a dtype; [None] = variable length.
dataset = tf.data.Dataset.from_generator(generator,
                                         output_types=tf.float32,
                                         output_shapes=tf.TensorShape([None]))

## ▌创建一个迭代器

### One shot Iterator

# One-shot iterator: iterates once over the dataset, no initialisation needed.
# note: the variable name `iter` shadows the Python builtin -- kept to match the article
x = np.random.sample((100, 2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
# create the iterator
iter = dataset.make_one_shot_iterator()

# ... (dataset created as above)
# create the iterator
iter = dataset.make_one_shot_iterator()
# get_next() returns the op that yields the next element when run
el = iter.get_next()

# Running `el` inside a session pulls one element from the dataset.
with tf.Session() as sess:
    print(sess.run(el))  # output: [ 0.42116176  0.40666069]

### 可初始化的迭代器

# Initializable iterator: must be initialised with real data before use,
# which lets a placeholder-backed dataset be (re)fed at run time.
# using a placeholder
x = tf.placeholder(tf.float32, shape=[None, 2])
dataset = tf.data.Dataset.from_tensor_slices(x)
data = np.random.sample((100, 2))
iter = dataset.make_initializable_iterator()  # create the iterator
el = iter.get_next()
with tf.Session() as sess:
    # feed the placeholder with data
    sess.run(iter.initializer, feed_dict={x: data})
    print(sess.run(el))  # output [ 0.52374458  0.71968478]

# Fake train data (100 samples) and a single hand-written test sample.
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.array([[1, 2]]), np.array([[0]]))

# Switch between train and test data by re-running the iterator's
# initializer with a different feed_dict.
# initializable iterator to switch between dataset
EPOCHS = 10
x, y = tf.placeholder(tf.float32, shape=[None, 2]), tf.placeholder(tf.float32, shape=[None, 1])
dataset = tf.data.Dataset.from_tensor_slices((x, y))
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.array([[1, 2]]), np.array([[0]]))
iter = dataset.make_initializable_iterator()
features, labels = iter.get_next()
with tf.Session() as sess:
    # initialise iterator with train data
    sess.run(iter.initializer, feed_dict={x: train_data[0], y: train_data[1]})
    for _ in range(EPOCHS):
        sess.run([features, labels])
    # switch to test data
    sess.run(iter.initializer, feed_dict={x: test_data[0], y: test_data[1]})
    print(sess.run([features, labels]))

### 可重新初始化的迭代器

# making fake data using numpy: 100 training samples, 10 test samples
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((10, 2)), np.random.sample((10, 1)))

# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)

# create an iterator from the datasets' common structure (shape and type);
# it is not bound to either dataset until an initializer op is run
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                       train_dataset.output_shapes)

# create the initialisation operations -- running one of these binds
# the shared iterator to the corresponding dataset
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)

# Pull the next (features, labels) pair from the shared iterator.
features, labels = iter.get_next()

# Reinitializable iterator to switch between Datasets
EPOCHS = 10
# making fake data using numpy
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((10, 2)), np.random.sample((10, 1)))
# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)
# create an iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                       train_dataset.output_shapes)
features, labels = iter.get_next()
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)
with tf.Session() as sess:
    sess.run(train_init_op)  # switch to train dataset
    for _ in range(EPOCHS):
        sess.run([features, labels])
    sess.run(test_init_op)  # switch to val dataset
    print(sess.run([features, labels]))

## ▌使用数据

# ... (dataset and iterator created as above)
next_el = iter.get_next()
# ... (inside a session)
print(sess.run(next_el))  # will output the current element

# using two numpy arrays; repeat() loops the data forever and
# batch() groups elements into batches (BATCH_SIZE defined elsewhere)
features, labels = (np.array([np.random.sample((100, 2))]),
                    np.array([np.random.sample((100, 1))]))
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(BATCH_SIZE)

# One-shot iterator over the batched dataset; x is the feature batch,
# y the label batch.
iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()

# make a simple model
net = tf.layers.dense(x, 8)  # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8)
prediction = tf.layers.dense(net, 1)
# tf.losses.mean_squared_error takes (labels, predictions) in that order;
# pass the second value from iter.get_next() as the label
loss = tf.losses.mean_squared_error(y, prediction)
train_op = tf.train.AdamOptimizer().minimize(loss)

# Full example: feed a batched, repeating numpy dataset straight into a
# small dense network and train it -- no feed_dict needed during training.
EPOCHS = 10
BATCH_SIZE = 16
# using two numpy arrays
features, labels = (np.array([np.random.sample((100, 2))]),
                    np.array([np.random.sample((100, 1))]))
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()
# make a simple model
net = tf.layers.dense(x, 8, activation=tf.tanh)  # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
# tf.losses.mean_squared_error takes (labels, predictions) in that order;
# pass the second value from iter.get_next() as the label
loss = tf.losses.mean_squared_error(y, prediction)
train_op = tf.train.AdamOptimizer().minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(EPOCHS):
        _, loss_value = sess.run([train_op, loss])
        print("Iter: {}, Loss: {:.4f}".format(i, loss_value))

Iter: 0, Loss: 0.1328
Iter: 1, Loss: 0.1312
Iter: 2, Loss: 0.1296
Iter: 3, Loss: 0.1281
Iter: 4, Loss: 0.1267
Iter: 5, Loss: 0.1254
Iter: 6, Loss: 0.1242
Iter: 7, Loss: 0.1231
Iter: 8, Loss: 0.1220
Iter: 9, Loss: 0.1210

## ▌有用的技巧

### batch

# BATCHING
BATCH_SIZE = 4
x = np.random.sample((100, 2))
# make a dataset from a numpy array, grouped into batches of 4
dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))

[[ 0.65686128  0.99373963]
 [ 0.69690451  0.32446826]
 [ 0.57148422  0.68688242]
 [ 0.20335116  0.82473219]]

### Repeat

### Shuffle

# SHUFFLE (the original snippet was mislabeled "BATCHING")
BATCH_SIZE = 4
x = np.array([[1], [2], [3], [4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
# a buffer_size >= dataset size gives a full uniform shuffle
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))

[[4]
 [2]
 [3]
 [1]]

[[3]
 [1]
 [2]
 [4]]

### Map

# MAP
x = np.array([[1], [2], [3], [4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
# apply a function to every element: double each value
dataset = dataset.map(lambda x: x * 2)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    # one mapped element per source row: 2, 4, 6, 8
    for _ in range(len(x)):
        print(sess.run(el))

[2]
[4]
[6]
[8]

## 其他资源

TensorFlow dataset tutorial: https://www.tensorflow.org/programmers_guide/datasets

Dataset docs: https://www.tensorflow.org/api_docs/python/tf/data/Dataset

## ▌结论

Dataset API提供了一种快速而且鲁棒的方法来创建优化的输入管道来训练、评估和测试我们的模型。在这篇文章中，我们了解了很多常见的利用Dataset API的操作。

#### TensorFlow全新的数据读取方式：Dataset API入门教程(转)

2017-11-06 09:58:31

#### Dataset的用法简析

2018-02-24 22:40:51

#### Tensorflow 学习笔记：Input Pipeline - Dataset

2017-11-23 13:29:47

#### tf.data.Dataset.from_tensor_slices

2018-03-06 17:05:34

#### 第二阶段-tensorflow程序图文详解（六） Importing Data

2017-12-27 14:14:07

#### TensorFlow全新的数据读取方式：Dataset API——tf.data.Dataset

2017-12-20 19:12:02

#### TensorFlow全新的数据读取方式：Dataset API入门教程

2017-11-20 09:11:29

#### 用tensorflow DataSet高效加载变长文本输入

2017-11-06 14:01:24

#### Tensorflow Dataset API 入门

2018-03-27 17:12:21

#### tensorflow 细节用法

2018-04-07 08:20:38