数据读取方法总结有3种:
(1)预加载数据:预定义常量或变量来保存数据
(2)供给数据:在会话中运行run()函数的时候,通过赋值给feed_dict参数的方式将数据注入placeholder中。
(3)从文件中读取数据:在TensorFlow图的起始,让一个输入管线从文件中读取数据。
12.1 TFRecord格式
// Top-level record of the TFRecord format described here: wraps one Features message.
message Example{
Features features=1;
};
// Named collection of features: maps a string key to a Feature value.
message Features{
map<string,Feature> feature=1;
};
// A single feature value; the oneof allows at most one of the three list kinds to be set.
message Feature{
oneof kind{
BytesList bytes_list=1;
FloatList float_list=2;
Int64List int64_list=3;
}
};
下面通过一段代码将Fashion_MNIST数据集转化为TFRecord文件。大致过程就是:
- 将数据填入到Example协议内存块
- 将协议内存块序列化为一个字符串
- 通过io.TFRecordWriter类写入到TFRecord文件
import tensorflow as tf
import numpy as np
# Load the Fashion-MNIST train/test image and label arrays through Keras.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
def Int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def Bytes_feature(value):
    """Wrap a single bytes string in a ``tf.train.Feature`` backed by a ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
# shape[1] of the image array is stored with every record as "pixels";
# shape[0] is the number of records to write.
pixels = train_images.shape[1]
num_examples = train_images.shape[0]
filename = "/home/xxy/TFRecord/fashion_MNIST.tfrecords"

# The context manager closes the file even if building a record fails midway.
with tf.io.TFRecordWriter(filename) as writer:
    for i in range(num_examples):
        # tobytes() replaces the deprecated ndarray.tostring() alias.
        image_to_string = train_images[i].tobytes()
        feature = {
            "pixels": Int64_feature(pixels),
            "label": Int64_feature(train_labels[i]),
            "image_raw": Bytes_feature(image_to_string),
        }
        features = tf.train.Features(feature=feature)
        example = tf.train.Example(features=features)
        # Fixed typo: the protobuf method is SerializeToString (was "SerializaToString").
        writer.write(example.SerializeToString())
print("writed ok")
使用data模块中的data.experimental模块中的TFRecordWriter类将Fashion_MNIST数据集写入到一个TFRecord文件中:
import tensorflow as tf
# Load the Fashion-MNIST train/test image and label arrays through Keras.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
def Int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def Bytes_feature(value):
    """Wrap a single bytes string in a ``tf.train.Feature`` backed by a ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
# First two entries of the image array shape: record count and the
# per-record "pixels" value.
num_examples, pixels = train_images.shape[:2]
filename = "/home/xxy/TFRecord/fashion_MNIST2.tfrecords"
# Dataset-based writer; the eager alternative would be tf.io.TFRecordWriter(filename).
writer = tf.data.experimental.TFRecordWriter(filename)
def serialize_example(i):
    """Serialize the i-th training image and label as a ``tf.train.Example`` string.

    Args:
        i: Index into the module-level ``train_images`` / ``train_labels`` arrays.

    Returns:
        The serialized ``tf.train.Example`` protocol buffer.
    """
    # Fixed: the original assigned "iamge_to_string" but read "image_to_string".
    # tobytes() replaces the deprecated ndarray.tostring() alias.
    image_to_string = train_images[i].tobytes()
    feature = {
        "pixels": Int64_feature(pixels),
        # Key fixed from "labels=" so it matches the "label" key used when parsing.
        "label": Int64_feature(train_labels[i]),
        "image_raw": Bytes_feature(image_to_string),
    }
    # Fixed: the mapping must be wrapped in Features (plural) and bound to the
    # name that Example() below actually reads.
    features = tf.train.Features(feature=feature)
    example_proto = tf.train.Example(features=features)
    return example_proto.SerializeToString()
def generator():
    """Lazily yield the serialized Example for every record index, in order."""
    yield from map(serialize_example, range(num_examples))
# Each generated element is one scalar string tensor (a serialized Example).
serialized_features_dataset = tf.data.Dataset.from_generator(
    generator,
    output_types=tf.string,
    output_shapes=(),  # fixed typo: was "outpt_shape", which raises TypeError
)
writer.write(serialized_features_dataset)
print("writed ok")
# No writer.close() here: tf.data.experimental.TFRecordWriter only exposes
# write(dataset), so the original close() call raised AttributeError.
从TFRecord文件中解析数据:
import tensorflow as tf
# Fixed: the writer snippet produces "fashion_MNIST.tfrecords" (underscore);
# the original "fashion-MNIST.tfrecords" pointed at a file that is never written.
filename = "/home/xxy/TFRecord/fashion_MNIST.tfrecords"
tfrecord_dataset = tf.data.TFRecordDataset(filename)
# Optional sanity check of the raw serialized records:
# for sample in tfrecord_dataset.take(5):
#     print(repr(sample))
def parse_function(example_proto):
    """Parse one serialized ``tf.train.Example`` into a dict of scalar tensors.

    Args:
        example_proto: A scalar string tensor holding one serialized Example.

    Returns:
        A dict with keys ``"pixels"`` and ``"label"`` (int64) and
        ``"image_raw"`` (string).
    """
    feature_spec = {
        "pixels": tf.io.FixedLenFeature([], tf.int64),
        "label": tf.io.FixedLenFeature([], tf.int64),
        # Fixed typo: was "iamge_raw", which matches neither the key written to
        # the file nor the key the consumer reads.
        "image_raw": tf.io.FixedLenFeature([], tf.string),
    }
    return tf.io.parse_single_example(example_proto, features=feature_spec)
# Apply the parser to every serialized record.
parsed_dataset = tfrecord_dataset.map(parse_function)
# Debug aid: print(parsed_dataset)

# Decode and show the first record only.
for parsed_record in parsed_dataset.take(1):
    # The raw image bytes are reinterpreted as uint8 values.
    images = tf.io.decode_raw(parsed_record["image_raw"], tf.uint8)
    labels, pixels = (
        tf.cast(parsed_record[key], tf.int32) for key in ("label", "pixels")
    )
    for decoded in (images, labels, pixels):
        print(decoded)
12.2 CSV格式
CSV(Comma-Separated Values,逗号分隔值,有时也称字符分隔值)以纯文本形式存储表格数据(数字和文本),这意味着该文件是一个字符序列,读取该文件不需要经过像二进制数据那样反序列化的过程。
import tensorflow as tf
file_name = "/home/xxy/CSV/data.csv"
CSV_CLOUMNS = ['col1', 'col2', 'col3', 'col4']

# Batches of 10 rows, read in file order, with explicit column names;
# ignore_errors=True is passed through to make_csv_dataset.
csv_dataset = tf.data.experimental.make_csv_dataset(
    file_name,
    batch_size=10,
    column_names=CSV_CLOUMNS,
    shuffle=False,
    ignore_errors=True,
)

# Pull and display just the first batch.
csv_iterator = iter(csv_dataset)
examples = next(csv_iterator)
print(examples)
12.3 队列
import tensorflow as tf
# A first-in-first-out queue of capacity 2 holding int32 elements.
Queue = tf.queue.FIFOQueue(capacity=2, dtypes="int32")
# Seed the queue with the two initial values.
queue_init = Queue.enqueue_many(([10, 100],))

# Each pass takes the head element, enqueues head + 10 at the tail,
# and prints the element that was taken.
for i in range(5):
    a = Queue.dequeue()
    b = a + 10
    Queue_en = Queue.enqueue([b])
    print(a)
文件队列
import tensorflow as tf
import numpy as np
# Load the Fashion-MNIST train/test image and label arrays through Keras.
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an ``Int64List``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def _bytes_feature(value):
    """Wrap a single bytes string in a ``tf.train.Feature`` backed by a ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
pixels = test_images.shape[1]
num_examples = test_images.shape[0]
num_files = 2

# Every file receives the full test set; the loop only varies the file name.
for i in range(num_files):
    # Fixed prefix: "data_tfrecords-" (plural) so the files match the
    # "data_tfrecords-*" glob used by the reader snippets.
    filename = ("/home/xxy/TFRecord/data_tfrecords-%.1d-of-%.1d" % (i, num_files))
    # The context manager closes each file even if a record fails to build.
    with tf.io.TFRecordWriter(filename) as writer:
        for index in range(num_examples):
            # tobytes() replaces the deprecated ndarray.tostring() alias.
            image_string = test_images[index].tobytes()
            example = tf.train.Example(features=tf.train.Features(feature={
                "pixels": _int64_feature(pixels),
                # Fixed: "indes" was a NameError, and np.argmax() of a scalar
                # class id is always 0. The Keras labels are already integer
                # class ids, so store them directly.
                "label": _int64_feature(test_labels[index]),
                "image_raw": _bytes_feature(image_string),
            }))
            # Fixed typo: SerializeToString (was "SerializaToString").
            writer.write(example.SerializeToString())
    print("writed ok")
import tensorflow as tf
# NOTE: this snippet uses the TensorFlow 1.x graph/queue API (tf.Session,
# tf.train.string_input_producer, tf.TFRecordReader).
files = tf.train.match_filenames_once("/home/xxy/TFRecord/data_tfrecords-*")
filename_queue = tf.train.string_input_producer(files, shuffle=False)
reader = tf.TFRecordReader()
# Fixed: the original bound "Serialized_example" but parsed "serialized_example".
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        "image_raw": tf.FixedLenFeature([], tf.string),
        "pixels": tf.FixedLenFeature([], tf.int64),
        "label": tf.FixedLenFeature([], tf.int64),
    })
images = tf.decode_raw(features["image_raw"], tf.uint8)
# Fixed: variable was misspelled "lables" and the key was "labels", but the
# feature spec above only defines "label".
labels = tf.cast(features["label"], tf.int32)
pixels = tf.cast(features["pixels"], tf.int32)

with tf.Session() as sess:
    # match_filenames_once keeps the matched names in a *local* variable, so the
    # local initializer must run as well as the global one.
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    print(sess.run(files))
    coordinator = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
    for i in range(6):
        print(sess.run([images, labels]))
    coordinator.request_stop()
    coordinator.join(threads)
12.4 多线程处理输入的数据
1.使用coordinator类管理线程
- should_stop():通过函数的返回值判断线程是否应该停止,如果已有线程请求停止,则函数返回True
- request_stop():请求该线程及其他线程停止
- join():等待被指定的线程终止
import threading
import time

import numpy as np  # fixed typo: was "import nnumpy as np" (ModuleNotFoundError)
import tensorflow as tf
def Thread_op(coordinator, thread_id):
    """Worker loop that runs until the coordinator reports a stop request.

    On each pass the worker asks every thread to stop with probability 0.1;
    otherwise it reports that it is still working. It then sleeps before
    checking the coordinator again.

    Args:
        coordinator: Object exposing ``should_stop()`` and ``request_stop()``.
        thread_id: Integer used only to label the printed messages.
    """
    while not coordinator.should_stop():
        if np.random.rand() < 0.1:
            print("Stoping from thread_id : %d "% thread_id)
            coordinator.request_stop()
        else:
            print("Working on thread_id ; %d" % thread_id)
        time.sleep(10)
# One coordinator is shared by all five workers.
coordinator = tf.train.Coordinator()
threads = [
    threading.Thread(target=Thread_op, args=(coordinator, worker_id))
    for worker_id in range(5)
]
for j in threads:
    j.start()
# Block until every worker thread has finished.
coordinator.join(threads)
2.在Tensorflow1.x中使用QueueRunner创建线程
import threading
import time

import numpy as np  # fixed typo: was "import nnumpy as np" (ModuleNotFoundError)
import tensorflow as tf
# NOTE: TensorFlow 1.x graph/queue API.
queue = tf.FIFOQueue(100, "float")
# Each enqueue op pushes one tensor of 10 random normal values.
enqueue = queue.enqueue([tf.random_normal([10])])
# Ten copies of the enqueue op are handed to the QueueRunner.
qr = tf.train.QueueRunner(queue, [enqueue] * 10)
tf.train.add_queue_runner(qr)
out_tensor = queue.dequeue()

with tf.Session() as sess:
    coordinator = tf.train.Coordinator()
    # Signature reminder, kept as a comment because the original line executed
    # as a call with undefined names (NameError):
    #   tf.train.start_queue_runners(sess, coord, daemon, start, collection)
    threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
    for i in range(10):
        print(sess.run(out_tensor))
    coordinator.request_stop()
    coordinator.join(threads)
12.5 组织数据batch
import tensorflow as tf
# NOTE: TensorFlow 1.x graph/queue API.
# Fixed: the function is match_filenames_once, and the result must be bound to
# "files" because that is the name the next line reads.
files = tf.train.match_filenames_once("/home/xxy/TFRecord/data_tfrecords-*")
filename_queue = tf.train.string_input_producer(files, shuffle=True)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        # Fixed typo: FixedLenFeature (was "FIxedLenFeature").
        "image_raw": tf.FixedLenFeature([], tf.string),
        "pixels": tf.FixedLenFeature([], tf.int64),
        "label": tf.FixedLenFeature([], tf.int64),
    })
# Fixed: the dtype is an argument of decode_raw/cast, not part of the dict
# subscript, and the label key is "label" as declared in the spec above.
images = tf.decode_raw(features["image_raw"], tf.uint8)
labels = tf.cast(features["label"], tf.int32)
pixels = tf.cast(features["pixels"], tf.int32)
# tf.train.batch needs fully defined static shapes, but decode_raw yields an
# unknown length; a 28x28 Fashion-MNIST image is 784 bytes.
images.set_shape([784])

batch_size = 10
capacity = 5000 + 3 * batch_size
# Fixed: bind the batched tensors so the session loop below can fetch them.
image_batch, label_batch = tf.train.batch(
    [images, labels], batch_size=batch_size, capacity=capacity)

with tf.Session() as sess:
    # match_filenames_once keeps the matched names in a *local* variable, so the
    # local initializer must run as well as the global one.
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(3):
        xs, ys = sess.run([image_batch, label_batch])
        print(xs, ys)
    coord.request_stop()
    coord.join(threads)
import tensorflow as tf
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()
# Flatten each image to a 784-vector and scale pixel values into [0, 1].
train_images = train_images.reshape(60000, 784).astype('float32') / 255
# Fixed: the original reshaped *train_images* to (1000, 784), which raises
# ValueError (60000*784 elements cannot fit); flatten the test images instead.
test_images = test_images.reshape(-1, 784).astype('float32') / 255

train_datasets = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_datasets = train_datasets.shuffle(buffer_size=1024).batch(100)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
test_dataset = test_dataset.batch(100)

# Distinct loop names so the source arrays are not overwritten by batch tensors.
for image_batch, label_batch in train_datasets:
    print(image_batch, label_batch)
import tensorflow as tf
import numpy as np
n_observations = int(1e4)
# Four parallel feature columns: booleans, ints in [0, 5), byte strings
# selected by those ints, and standard-normal floats.
feature0 = np.random.choice([False, True], n_observations)
feature1 = np.random.randint(0, 5, n_observations)
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]
feature3 = np.random.randn(n_observations)

features_dataset = tf.data.Dataset.from_tensor_slices(
    (feature0, feature1, feature2, feature3))

# Fixed typo: the first loop variable was "fo", so print(f0) raised NameError.
for f0, f1, f2, f3 in features_dataset.take(1):
    print(f0)
    print(f1)
    print(f2)
    print(f3)
import tensorflow as tf
import numpy as np
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()
# Flatten each image to a 784-vector and scale pixel values into [0, 1].
train_images = train_images.reshape(60000, 784).astype('float32') / 255
# Fixed: the original reshaped *train_images* to (1000, 784), which raises
# ValueError; flatten the test images instead.
test_images = test_images.reshape(-1, 784).astype('float32') / 255

train_datasets = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_datasets = train_datasets.shuffle(buffer_size=1024).batch(100)

# Each element is an (images, labels) pair of batch tensors.
for step, x_batch_train in enumerate(train_datasets):
    print(step, x_batch_train)