一 简介
为解决数据来源的复杂性(各数据集的数据格式各不相同),Tensorflow使用TFRecord格式来统一输入数据的格式。
TFRecord是一种将图像数据和标签放在一起的二进制文件,能更好地利用内存,并且可以在TensorFlow中快速地复制、移动、读取和存储。
二 TFRecord文件中的数据格式是通过Protocol Buffer定义的
Protocol Buffer的简单使用请看我的另一篇博客
syntax = "proto3";
option cc_enable_arenas = true;
option java_outer_classname = "FeatureProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.example";
package tensorflow;

// Containers to hold repeated fundamental values.
message BytesList {
  repeated bytes value = 1;
}
message FloatList {
  repeated float value = 1 [packed = true];
}
message Int64List {
  repeated int64 value = 1 [packed = true];
}

// One record: a wrapper around a single Features map.
// NOTE(review): upstream TensorFlow appears to declare Example in
// example.proto rather than feature.proto -- confirm before compiling this
// listing as a single file.
message Example {
  Features features = 1;
};

// Containers for non-sequential data.
message Feature {
  // Each feature can be exactly one kind.
  oneof kind {
    BytesList bytes_list = 1;
    FloatList float_list = 2;
    Int64List int64_list = 3;
  }
};

message Features {
  // Map from feature name to feature.
  map<string, Feature> feature = 1;
};

// Containers for sequential data.
//
// A FeatureList contains lists of Features. These may hold zero or more
// Feature values.
//
// FeatureLists are organized into categories by name. The FeatureLists message
// contains the mapping from name to FeatureList.
//
message FeatureList {
  repeated Feature feature = 1;
};

message FeatureLists {
  // Map from feature name to feature list.
  map<string, FeatureList> feature_list = 1;
};
三 简单的写入与读取TFRecord
(1)转化成TFRecord文件
① 将要转化成TFRecord的数据转化成feature.proto中message Example{}定义的数据格式,
②使用example.SerializeToString()方法将转化好数据格式的数据序列化为字符串,
③使用tf.python_io.TFRecordWriter(outp_dir).write()将序列化后的数据写入TFRecord文件
#制作TFRecord格式
def createTFRecord(filename, mapfile):
    """Serialize the images of every class folder into one TFRecord file.

    Each image is converted to RGB, resized to 224x224 and stored as raw
    bytes under the ``image_raw`` key; the integer index of its class is
    stored under the ``label`` key.

    Args:
        filename: Path of the TFRecord file to create.
        mapfile: Path of a text file that receives one ``index:name`` line
            per class, so labels can be mapped back to class names. Nothing
            is written when this is empty or None.
    """
    data_dir = '/home/wc/DataSet/traffic/testTFRecord/'
    # An ordered tuple instead of a set: iteration order of a set of strings
    # can differ between interpreter runs, which would silently change which
    # label index each class receives.
    classes = ('xiansu60', 'xiansu100')
    class_map = {}

    # The context manager closes the writer even if an image fails to load.
    with tf.python_io.TFRecordWriter(filename) as writer:
        for index, name in enumerate(classes):
            class_path = data_dir + name + '/'
            class_map[index] = name
            # Sorted so the record order is reproducible across runs.
            for img_name in sorted(os.listdir(class_path)):
                img_path = class_path + img_name
                with Image.open(img_path) as img:
                    # Force three channels so the payload is always
                    # 224*224*3 bytes, matching a [224, 224, 3] reshape on
                    # the reading side (grayscale/RGBA files would not).
                    img_raw = img.convert('RGB').resize((224, 224)).tobytes()
                example = tf.train.Example(features=tf.train.Features(feature={
                    'label': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[index])),
                    'image_raw': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[img_raw])),
                }))
                writer.write(example.SerializeToString())

    # Persist the label -> class-name mapping collected above.
    if mapfile:
        with open(mapfile, 'w') as map_out:
            for index in sorted(class_map):
                map_out.write('{}:{}\n'.format(index, class_map[index]))
(2)读取TFRecord文件
Tensorflow用队列读取数据
①创建一个队列维护输入文件 tf.train.string_input_producer([filename], shuffle=False,num_epochs = 1)
②使用读取器tf.TFRecordReader()读取TFRecord中序列化的数据
③使用解析器tf.parse_single_example解析(反序列化)读取的数据
#读取train.tfrecord中的数据
def read_and_decode(filename):
    """Build the ops that read and decode one example from a TFRecord file.

    Args:
        filename: Path of the TFRecord file to read.

    Returns:
        A tuple ``(img, labels)``: the standardized image tensor reshaped to
        [224, 224, 3], and the label cast to int32.
    """
    record_reader = tf.TFRecordReader()
    # Queue of input file names: a single pass over the file, unshuffled.
    file_queue = tf.train.string_input_producer(
        [filename], shuffle=False, num_epochs=1)
    _, serialized = record_reader.read(file_queue)

    # Parse one serialized example; tf.parse_example can be used to parse
    # several at once.
    feature_spec = {
        'label': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(serialized, features=feature_spec)

    # Decode the raw byte string into uint8 pixels, then restore the
    # 224x224 three-channel layout.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    image = tf.image.per_image_standardization(
        tf.reshape(pixels, [224, 224, 3]))
    label = tf.cast(parsed['label'], tf.int32)
    return image, label