TFRecord制作
TFRecord是一种比较常用的存储二进制序列数据的方法, 便于网络流式读取数据. 本文基于tf2.0, 讲述.tfrecord
格式数据集制作和解析。
TFRecord概述
Tensorflow经常使用tf.Example
来写入、读取TFRecord数据. 可以将数据表示为{"string": value}
形式的 message 类型, 常用有三种转化形式:
tf.train.BytesList
: 可以使用的类型包括 string和bytetf.train.FloatList
: 可以使用的类型包括 float和doubletf.train.Int64List
: 可以使用的类型包括 enum,bool, int32, uint32, int64
具体转化实例如下, 其实就是TensorFlow制定了一系列标准的数据转化模式.
def _bytes_feature(value):
    """Return a tf.train.Feature holding *value* as a bytes_list (string/byte)."""
    # BytesList cannot unpack a string straight out of an EagerTensor,
    # so convert eager tensors to their raw numpy value first.
    raw = value.numpy() if isinstance(value, type(tf.constant(0))) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))
def _float_feature(value):
    """Return a tf.train.Feature holding *value* as a float_list (float/double)."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
def _int64_feature(value):
    """Return a tf.train.Feature holding *value* as an int64_list (bool/enum/int/uint)."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
那我们看看转化实例
# Serialize a string
print(_bytes_feature('test_string'.encode('utf8')))
# Serialize an integer
print(_int64_feature(1))
输出就是这样,类似一个字典
bytes_list {
value: "test_string"
}
int64_list {
value: 1
}
单张图片操作
制作成TFRecord格式
比如一张狗子的照片,标签为0
# Read the raw JPEG bytes of the sample image and define its label.
image_string = tf.io.read_file('./img/dog.jpg')
label = 0
图片信息不只有有单一信息,还有长度,宽度,深度,标签之类的, 我们需要创建一个Example, 把这些信息读转化为一条信息.
# Build an Example for one image: shape metadata + label + raw bytes.
def image_example(image_string, label):
    """Return a tf.train.Example bundling the raw JPEG bytes with its metadata."""
    # Decode only to discover the dimensions; the encoded bytes are what gets stored.
    height, width, depth = tf.image.decode_jpeg(image_string).shape
    # Map every piece of information to the matching Feature type.
    feature = {
        'height': _int64_feature(height),
        'width': _int64_feature(width),
        'depth': _int64_feature(depth),
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image_string),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

# Convert the sample image into an Example message.
image_example_proto = image_example(image_string, label)
看看转化之后的结果(部分信息)
features {
feature {
key: "depth"
value {
int64_list {
value: 3
}
}
}
feature {
key: "height"
value {
int64_list {
value: 576
}
..
保存为.tfrecord
格式
record_file = 'image1.tfrecord'
# The context manager guarantees the file is flushed and closed.
with tf.io.TFRecordWriter(record_file) as writer:
    writer.write(image_example_proto.SerializeToString())
读取解析TFRecord格式文件
运用tf.data.TFRecordDataset
读取.tfrecord
文件
# Point a TFRecordDataset at the file written above.
record_file = 'image1.tfrecord'
tf_record = tf.data.TFRecordDataset(record_file)
Example数据都进行了序列化,还需要解析一下之前写入的序列化消息,tf.io.parse_single_example(example_proto, feature_description)
可以解析单条Example
-
先进行特征描述 feature_description,要保证和之前创建 example 时的 feature 保持一致:
# 解析的格式需要跟之前创建example时一致
image_feature_description = {
    'height': tf.io.FixedLenFeature([], tf.int64),
    'width': tf.io.FixedLenFeature([], tf.int64),
    'depth': tf.io.FixedLenFeature([], tf.int64),
    'label': tf.io.FixedLenFeature([], tf.int64),
    'image_raw': tf.io.FixedLenFeature([], tf.string),
}
-
定义函数解析单独的Example
def parse_tf_example(example_proto): # 解析出来 parsed_example = tf.io.parse_single_example(example_proto, image_feature_description) # 此处可以对图像预处理 return parsed_example
.map
逐条解析
parsed_image_dataset=tf_record.map(parse_tf_example)
把图像和类别显示一下
from IPython.display import clear_output, Image, display

# Iterate the parsed dataset: print each label, then render the raw JPEG bytes.
for image_features in parsed_image_dataset:
    print(image_features['label'].numpy())
    display(Image(data=image_features['image_raw'].numpy()))
这样,一个狗子的图片和标签(标签为0)就出来啦
TFRecord对数据集处理
训练集、测试集、验证集原理相同,以训练集为例
多线程生成 TFRecord
1. 数据准备
data
文件夹下分别分类放训练集,验证集图像,label.txt
每一行存放类别标签(与文件名保持一致)
# Root directory holding one sub-folder of images per class.
train_directory = './data/cats_and_dogs/train/'
# Where the generated .tfrecord files are written.
output_directory = './data/'
# Number of output shards (tfrecord files) for the training set.
train_shards = 2
# Number of worker threads; shards must be divisible by num_threads.
num_threads = 1
# One class name per line, matching the sub-folder names.
labels_file = './label.txt'
2.读取文件名和标签
将训练集中filenames
文件路径、texts
类别名、labels
类别标签(0,1,2形式)分别读入并一一对应,然后打乱顺序
def _find_image_files(data_dir, labels_file):
    """Scan data_dir and return aligned, shuffled lists describing the data set.

    Args:
        data_dir: root directory with one sub-directory per class.
        labels_file: text file listing one class name per line
            (names must match the sub-directory names).

    Returns:
        (filenames, texts, labels): three equal-length lists — file paths,
        class names, and integer class indices — shuffled with a fixed seed
        so repeated runs produce the same order.
    """
    print('目标文件夹位置: %s.' % data_dir)
    unique_labels = [l.strip() for l in tf.io.gfile.GFile(labels_file, 'r').readlines()]
    labels = []
    filenames = []
    texts = []
    label_index = 0
    # Collect JPEG paths class by class; a class whose directory cannot be
    # globbed is skipped and does NOT consume a label index (as before).
    for text in unique_labels:
        jpeg_file_path = data_dir + '/' + text + '/*'
        try:
            matching_files = tf.io.gfile.glob(jpeg_file_path)
        except tf.errors.OpError:
            # Was a bare `except:` — narrowed so real bugs are not swallowed.
            print(jpeg_file_path)
            continue
        labels.extend([label_index] * len(matching_files))
        texts.extend([text] * len(matching_files))
        filenames.extend(matching_files)
        label_index += 1
    # Shuffle all three lists with one shared permutation so they stay aligned.
    shuffled_index = list(range(len(filenames)))
    random.seed(12345)  # fixed seed keeps repeated runs reproducible
    random.shuffle(shuffled_index)
    filenames = [filenames[i] for i in shuffled_index]
    texts = [texts[i] for i in shuffled_index]
    labels = [labels[i] for i in shuffled_index]
    print('Found %d JPEG files across %d labels inside %s.' % (len(filenames),
          len(unique_labels), data_dir))
    return filenames, texts, labels
3.多线程分配
如果总共1000张生成2个tfrecord文件,对应2个线程,那么就是让 [0,500) 张图片在第1个线程生成第1个tfrecord,[500,1000)张图片在第2个线程生成第2个tfrecord文件,两者同步,这样可以大大提高效率
# Break all images into per-thread batches [ranges[i][0], ranges[i][1]).
# NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24 — use builtin int.
spacing = np.linspace(0, len(filenames), num_threads + 1).astype(int)
ranges = []
for i in range(len(spacing) - 1):
    ranges.append([spacing[i], spacing[i + 1]])
# Launch a thread for each batch.
print('Launching %d threads for spacings: %s' % (num_threads, ranges))
sys.stdout.flush()
# Coordinator lets the main thread wait for every worker to finish.
coord = tf.train.Coordinator()
threads = []
for thread_index in range(len(ranges)):
    args = (name, thread_index, ranges, filenames,
            texts, labels, num_shards)
    # Each worker writes its own shard(s) via _process_image_files_batch.
    t = threading.Thread(target=_process_image_files_batch, args=args)
    t.start()
    threads.append(t)
# Wait for all the threads to terminate.
coord.join(threads)
print('%s: Finished writing all %d images in data set.' % (datetime.now(), len(filenames)))
sys.stdout.flush()
4.写入文件并保存
每个线程下定义 writer = tf.io.TFRecordWriter(output_file)
做writer.write
操作
def _process_image_files_batch(name, thread_index, ranges, filenames, texts, labels,
                               num_shards):
    """Process and save one thread's slice of images as TFRecord shards.

    Args:
        name: output file prefix, e.g. 'train'.
        thread_index: index of this worker thread.
        ranges: [start, stop) image-index pairs, one per thread.
        filenames, texts, labels: aligned lists over the whole data set.
        num_shards: total number of output .tfrecord files.
    """
    num_threads = len(ranges)
    # Each thread must produce a whole number of shards.
    assert not num_shards % num_threads
    num_shards_per_batch = int(num_shards / num_threads)
    # Sub-divide this thread's index range into per-shard sub-ranges.
    shard_ranges = np.linspace(ranges[thread_index][0],
                               ranges[thread_index][1],
                               num_shards_per_batch + 1).astype(int)
    num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
    counter = 0
    for s in range(num_shards_per_batch):
        # Sharded file name, e.g. 'train-00002-of-00010.tfrecord'
        shard = thread_index * num_shards_per_batch + s
        output_filename = '%s-%.5d-of-%.5d.tfrecord' % (name, shard, num_shards)
        output_file = os.path.join(output_directory, output_filename)
        shard_counter = 0
        files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
        # Context manager guarantees the writer is flushed and closed even on
        # error (the original leaked the handle if an exception hit mid-shard).
        with tf.io.TFRecordWriter(output_file) as writer:
            for i in files_in_shard:
                filename = filenames[i]
                label = labels[i]
                text = texts[i]
                # _process_image reads the file; PNGs are re-encoded as JPEG.
                image_data, height, width, channel = _process_image(filename)
                # Bundle everything into one Example message.
                example = _convert_to_example(filename, image_data, label,
                                              text, height, width, channel)
                writer.write(example.SerializeToString())
                shard_counter += 1
                counter += 1
                if not counter % 1000:
                    print('%s [thread %d]: Processed %d of %d images in thread batch.' %
                          (datetime.now(), thread_index, counter, num_files_in_thread))
                    sys.stdout.flush()
        print('%s [thread %d]: Wrote %d images to %s' %
              (datetime.now(), thread_index, shard_counter, output_file))
        sys.stdout.flush()
        shard_counter = 0
    print('%s [thread %d]: Wrote %d images to %d shards.' %
          (datetime.now(), thread_index, counter, num_files_in_thread))
    sys.stdout.flush()
方便后续解码,把png转化为jpeg编码格式的image_data
,并读取图像长、宽、深度信息 height
, width
,channel
。和单张图片一样创建example, 把相应信息转化为一条tf格式信息.
def _convert_to_example(filename, image_data, label, text, height, width, channel):
    """Build a tf.train.Example for one image plus its metadata."""
    # Keys follow the conventional 'image/...' namespace used by TF tooling.
    feature = {
        'image/height': _int64_feature(height),
        'image/width': _int64_feature(width),
        'image/channels': _int64_feature(channel),
        'image/class/label': _int64_feature(label),
        'image/class/text': _bytes_feature(tf.compat.as_bytes(text)),
        'image/encoded': _bytes_feature(image_data),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))
训练集生成了2个文件,验证集生成1个文件
读取TFRecord并训练
1.读取文件
注意多个文件的读入
DATA_DIR = "./data/"
raw_val_dataset = tf.data.TFRecordDataset('./data/val-00000-of-00001.tfrecord')
train_dataset_root = [os.path.join(DATA_DIR, "train-0000%d-of-00002.tfrecord" % i) for i in range(0, 2)]
# TFRecordDataset accepts a list of filenames directly; wrapping the list in
# tf.data.Dataset.from_tensor_slices first is unnecessary indirection.
raw_train_dataset = tf.data.TFRecordDataset(train_dataset_root)
2.定义解析函数
解析之后,对每张图像进行预处理,decode
,resize
等。
def parse_tf_example(example_proto):
    """Deserialize one Example, then decode, resize and normalize its image."""
    parsed = tf.io.parse_single_example(example_proto, image_feature_description)
    # Preprocess: JPEG bytes -> float tensor scaled to [0, 1] at the target size.
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=channels)
    image = tf.image.resize(image, (image_height, image_width))
    image /= 255.
    return image, parsed['image/class/label']
3.数据解析并放入batch
def prepare_dataset(raw_train, raw_val):
    """Map the parse function over both raw datasets and batch them."""
    # Records were shuffled at creation time, so no extra shuffle is needed.
    train_ds = raw_train.map(parse_tf_example).batch(batch_size=batch_size)
    val_ds = raw_val.map(parse_tf_example).batch(batch_size=batch_size)
    return train_ds, val_ds
4.得到可训练数据集
# Build the batched training and validation datasets.
train_dataset, val_dataset = prepare_dataset(raw_train_dataset, raw_val_dataset)
# The model definition itself is omitted here.
# Train directly on the tf.data datasets.
model.fit(train_dataset, epochs=EPOCHS, validation_data=val_dataset)
得到一个 BatchDataset
包含图像信息和标签
ch,已经乱序,无需shuffer
train_ds = train_ds.batch(batch_size=batch_size)
val_ds = val_ds.batch(batch_size=batch_size)
return train_ds, val_ds
**4.得到可训练数据集**
```python
# 得到训练集,验证集
train_dataset, val_dataset = prepare_dataset(raw_train_dataset, raw_val_dataset)
# model部分省略
# 训练
model.fit(train_dataset, epochs=EPOCHS, validation_data=val_dataset)
得到一个 BatchDataset
包含图像信息和标签