# tf读取数据的几种方式

### 1.最简单的方式

import tensorflow as tf

# Method 1: build constant tensors and evaluate them inside a session.
a = tf.zeros([2, 3])
b = tf.ones([2, 3])
with tf.Session() as sess:
    print(sess.run(b))


### 2.通过feed_dict

import numpy as np
import tensorflow as tf

# Method 2: feed an in-memory numpy array into a placeholder via feed_dict.
x = np.reshape(np.arange(6), [2, 3])
a = tf.zeros([2, 3])
b = tf.placeholder(dtype=tf.float32, shape=[2, 3])
with tf.Session() as sess:
    # The integer array x is converted to float32 by the feed mechanism.
    print(sess.run(b, feed_dict={b: x}))


### 3.直接从文件中读取

• 通过tf.train.slice_input_producer，管理线程队列读取

import tensorflow as tf

# Method 3a: read in-memory data through a thread-managed queue
# using tf.train.slice_input_producer.
x = [[1, 2], [2, 3], [4, 5], [6, 7], [7, 8], [9, 10], [11, 12], [13, 14]]
label = ["a", "b", "c", "d", "a", "b", "c", "d"]
# With shuffle=True here, plain tf.train.batch is enough --
# no need for tf.train.shuffle_batch.
input_queues = tf.train.slice_input_producer([x, label], shuffle=False, num_epochs=2)
x, y = tf.train.batch(input_queues,
                      batch_size=3,
                      capacity=128,
                      allow_smaller_final_batch=False)
with tf.Session() as sess:
    # num_epochs creates a local counter variable, so local variables
    # must be initialized before the queue can run.
    tf.local_variables_initializer().run()
    coord = tf.train.Coordinator()
    # The queue only starts filling after start_queue_runners is called;
    # without it, sess.run([x, y]) below would block forever.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            print("---------")
            # Run x and y in a single sess.run call: two separate runs
            # would dequeue twice, so x and y would no longer correspond.
            print(sess.run([x, y]))
    # Reaching the end of the epoch-bounded queue raises this exception.
    except tf.errors.OutOfRangeError:
        print("done! now lets kill all the threads……")
    finally:
        coord.request_stop()
        # Wait for all queue-runner threads to shut down cleanly.
        coord.join(threads)



out:

---------
[array([[1, 2],
[2, 3],
[4, 5]]), array([b'a', b'b', b'c'], dtype=object)]
---------
[array([[ 6,  7],
[ 7,  8],
[ 9, 10]]), array([b'd', b'a', b'b'], dtype=object)]
---------
[array([[11, 12],
[13, 14],
[ 2,  3]]), array([b'c', b'd', b'b'], dtype=object)]
---------
[array([[ 1,  2],
[13, 14],
[11, 12]]), array([b'a', b'd', b'c'], dtype=object)]
---------
[array([[ 4,  5],
[ 6,  7],
[ 9, 10]]), array([b'c', b'd', b'b'], dtype=object)]
---------
done! now lets kill all the threads……


# Method 3b: read files from disk with tf.train.string_input_producer.
import tensorflow as tf

filename = ['A.jpg', 'B.jpg', 'C.jpg']
# string_input_producer builds a queue of file names.
filename_queue = tf.train.string_input_producer(filename, shuffle=False, num_epochs=3)
# key holds the file name from filename_queue, value holds the file contents.
reader = tf.WholeFileReader()
key, value = reader.read(filename_queue)
# image_resized, labels = _parse_function(value, key)  # raw-image preprocessing could go here
x, y = tf.train.batch([value, key],
                      batch_size=2,
                      capacity=128,
                      allow_smaller_final_batch=False)

sv = tf.train.Supervisor(logdir='./', save_model_secs=0)
# Supervisor starts and manages the queue threads itself and runs the
# variable initializers -- no manual start_queue_runners or
# global_variables_initializer needed.
with sv.managed_session() as sess:
    while 1:
        if sv.should_stop():
            break
        print("----------")
        print(sess.run(y))  # x is raw image bytes, so only y is printed


out:

----------
[b'A.jpg' b'B.jpg']
----------
[b'C.jpg' b'A.jpg']
----------
[b'B.jpg' b'C.jpg']
----------
[b'A.jpg' b'B.jpg']
----------


tf.train.string_input_producer 和 tf.train.slice_input_producer 的区别简单来说：前者是传入文件名列表[data_file_names]，用tf.WholeFileReader()等专门处理输入的类去读；后者是传入已经加载到内存中的数据，传入[data, labels]，其他都一样

• tf.data.Dataset处理

# Method 4: tf.data.Dataset -- slice an in-memory array into elements.
dataset = tf.data.Dataset.from_tensor_slices(np.zeros([4, 10]))  # one element per row
iterator = dataset.make_one_shot_iterator()  # build a one-shot iterator
element = iterator.get_next()  # get_next() yields the next element each run

with tf.Session() as sess:
    for i in range(3):
        print(sess.run(element))


out：

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Dataset类中的各种操作介绍：https://www.imooc.com/article/68648

def parse(x):
    """Example map function: shift every element by one."""
    return x + 1

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5])
dataset = dataset.map(parse)  # run every record through parse
dataset = dataset.batch(3, drop_remainder=False).repeat().shuffle(1000)
# Every Dataset transformation returns a new Dataset, so calls chain freely.
iterator = dataset.make_one_shot_iterator()
element = iterator.get_next()  # get_next() yields the next batch each run

with tf.Session() as sess:
    for i in range(5):
        print(sess.run(element))


out:

[5 6]
[5 6]
[2 3 4]
[5 6]
[2 3 4]


dataset = dataset.padded_batch(batch_size, padded_shapes, ...)


import tensorflow as tf
import numpy as np

x = 'i looove you'
# Pin the dtype to int32 so the raw bytes written here match the
# tf.decode_raw(..., tf.int32) used by the reader on every platform
# (np.array([1, 2, 3]) defaults to int64 on most Linux builds).
x_ids = np.array([1, 2, 3], dtype=np.int32)
label = 1

writer = tf.python_io.TFRecordWriter('./test.tfrecords')  # record writer
for i in range(3):
    # Note the plural forms: tf.train.Example(features=tf.train.Features(...))
    # -- the singular spelling raises an error.
    one_record = tf.train.Example(features=tf.train.Features(feature={
        "x_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(x + ' this is' + str(i), 'utf-8')])),
        "x_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[np.append(x_ids, np.array([i], dtype=np.int32)).tostring()])),
        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label + i]))
    }))
    writer.write(one_record.SerializeToString())
writer.close()


import tensorflow as tf

def parser(example):
    """Parse one serialized tf.train.Example into (x_ids, x_raw, label)."""
    features = tf.parse_single_example(example, features={
        "x_raw": tf.FixedLenFeature([], tf.string),
        "x_ids": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64)
    })
    # x_ids was serialized with tostring(); decode the raw bytes back to ints.
    # NOTE(review): assumes the writer serialized int32 -- confirm dtypes match.
    x_ids = tf.decode_raw(features["x_ids"], tf.int32)
    x_raw = features["x_raw"]
    label = features["label"]
    return x_ids, x_raw, label

dataset = tf.data.TFRecordDataset('./test.tfrecords').map(parser).shuffle(10).repeat().batch(2)
iterator = dataset.make_one_shot_iterator()
x_ids, x_raw, label = iterator.get_next()
with tf.Session() as sess:
    for i in range(3):
        # One sess.run over all three tensors keeps them from the same records.
        a, b, c = sess.run((x_ids, x_raw, label))
        print(a, b, c)


out：

[[1 2 3 1]
[1 2 3 0]] [b'i looove you this is1' b'i looove you this is0'] [2 1]
[[1 2 3 2]
[1 2 3 0]] [b'i looove you this is2' b'i looove you this is0'] [3 1]
[[1 2 3 2]
[1 2 3 1]] [b'i looove you this is2' b'i looove you this is1'] [3 2]


import tensorflow as tf

def parser(example):
    """Parse one serialized tf.train.Example into (x_ids, x_raw, label)."""
    features = tf.parse_single_example(example, features={
        "x_raw": tf.FixedLenFeature([], tf.string),
        "x_ids": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64)
    })
    x_ids = tf.decode_raw(features["x_ids"], tf.int32)
    x_raw = features["x_raw"]
    label = features["label"]
    return x_ids, x_raw, label

dataset1 = tf.data.TFRecordDataset('./test.tfrecords').map(parser).shuffle(10).repeat().batch(2)
dataset2 = tf.data.TFRecordDataset('./test.tfrecords').map(parser).shuffle(10).repeat().batch(3)
handle = tf.placeholder(tf.string, shape=[])
# from_string_handle requires the datasets bound through this handle to share
# the same structure (types/shapes); otherwise they cannot share one iterator.
iterator = tf.data.Iterator.from_string_handle(handle, dataset1.output_types, dataset1.output_shapes)
element = iterator.get_next()

iterator1 = dataset1.make_one_shot_iterator()
iterator2 = dataset2.make_one_shot_iterator()

with tf.Session() as sess:
    # string_handle() adds an op to the graph, so evaluate each handle once
    # outside the loop instead of growing the graph on every iteration.
    dataset1_handle = sess.run(iterator1.string_handle())
    dataset2_handle = sess.run(iterator2.string_handle())
    for i in range(3):
        a1, b1, c1 = sess.run(element, feed_dict={handle: dataset1_handle})
        print(a1, b1, c1)
        a2, b2, c2 = sess.run(element, feed_dict={handle: dataset2_handle})
        print(a2, b2, c2)


out:

[[1 2 3 1]
[1 2 3 0]] [b'i looove you this is1' b'i looove you this is0'] [2 1]
[[1 2 3 1]
[1 2 3 2]
[1 2 3 0]] [b'i looove you this is1' b'i looove you this is2'
b'i looove you this is0'] [2 3 1]
[[1 2 3 2]
[1 2 3 1]] [b'i looove you this is2' b'i looove you this is1'] [3 2]
[[1 2 3 1]
[1 2 3 0]
[1 2 3 2]] [b'i looove you this is1' b'i looove you this is0'
b'i looove you this is2'] [2 1 3]
[[1 2 3 0]
[1 2 3 2]] [b'i looove you this is0' b'i looove you this is2'] [1 3]
[[1 2 3 2]
[1 2 3 0]
[1 2 3 1]] [b'i looove you this is2' b'i looove you this is0'
b'i looove you this is1'] [3 1 2]


# Drain the iterator until it is exhausted; tf.data signals the end of the
# dataset with OutOfRangeError, just like the queue-based API does.
with tf.Session() as sess:
    try:
        while True:
            print(sess.run(element))
    except tf.errors.OutOfRangeError:
        print("DONE")


• 总的来说，两种方式都是通过多线程队列来实现高效的组织输入，tf.data.Dataset不需要手动管理和启动线程队列；tf.train.slice_input_producer结合sv也可以实现这个目的。
• tf.train.slice_input_producer对于不是最后一个epoch的情况，最后不满一个batch的数据仍然留在队列中，供下一次使用，对于最后一个epoch的情况，最后不满一个batch的数据直接丢弃；
• tf.data.Dataset通过.batch(drop_remainder=False/True)来控制是否丢弃，要实现tf.train.slice_input_producer的效果，需要把.repeat()放在.batch()之前；如果顺序倒过来，像博主在本文中举的所有例子，则在每个epoch都会存在一个小尾巴！可以说很有意思了！！！
• 都很方便使用，看个人习惯使用吧

.#

• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120