TFRecords制作
用的时候直接抄下面的实例
(把需要深度学习的图片文件等转化为标准的TFRecords格式文件)
为了高效地读取数据,可以将数据进行序列化存储,这样也便于网络流式读取数据。TFRecord是一种比较常用的存储二进制序列数据的方法
tf.Example类是一种将数据表示为{"string": value}形式的message类型,TensorFlow经常使用tf.Example来写入、
读取TFRecord数据
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tensorflow as tf
通常情况下,tf.Example中可以使用以下几种格式:
tf.train.BytesList: 可以使用的类型包括 string和byte
tf.train.FloatList: 可以使用的类型包括 float和double
tf.train.Int64List: 可以使用的类型包括 enum,bool, int32, uint32, int64
转化实例
def _bytes_feature(value):
    """Wrap a string/bytes value in a tf.train.Feature holding a BytesList."""
    # An EagerTensor must be unwrapped to raw bytes first --
    # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
def _float_feature(value):
    """Wrap a float/double value in a tf.train.Feature holding a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)
def _int64_feature(value):
    """Wrap a bool/enum/int/uint value in a tf.train.Feature holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
# Exercise each converter once and print the resulting Feature protos.
samples = [
    _bytes_feature(b'test_string'),                # tf.train.BytesList
    _bytes_feature('test_string'.encode('utf8')),  # tf.train.BytesList
    _float_feature(np.exp(1)),                     # tf.train.FloatList
    _int64_feature(True),                          # tf.train.Int64List
    _int64_feature(1),                             # tf.train.Int64List
]
for sample in samples:
    print(sample)
bytes_list { value: "test_string" }
bytes_list { value: "test_string" }
float_list { value: 2.7182817459106445 }
int64_list { value: 1 }
int64_list { value: 1 }
tfrecord制作方法
创建tf.Example
def serialize_example(feature0, feature1, feature2, feature3):
    """Build a tf.train.Example from four features and serialize it.

    feature0 and feature1 are stored as int64, feature2 as bytes and
    feature3 as float; returns the Example as a binary string.
    """
    # Convert each raw value into its corresponding Feature type.
    feature_map = {
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    }
    # One Example holds exactly one record.
    proto = tf.train.Example(features=tf.train.Features(feature=feature_map))
    # SerializeToString yields the record's binary wire format.
    return proto.SerializeToString()
# Number of records to generate.
n_observations = int(1e4)
# Boolean feature (example data)
feature0 = np.random.choice([False, True], n_observations)
# Integer feature in [0, 5) (example data)
feature1 = np.random.randint(0, 5, n_observations)
# String feature, keyed off the integer feature
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]
# Float feature
feature3 = np.random.randn(n_observations)

filename = 'tfrecord-1'  # output file name
# Serialize every record and stream it into one TFRecord file.
with tf.io.TFRecordWriter(filename) as writer:
    for i in range(n_observations):
        writer.write(
            serialize_example(feature0[i], feature1[i], feature2[i], feature3[i])
        )
读取tfrecord文件
filenames = [filename]
# Read the serialized records back; the raw images are no longer needed.
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset
<TFRecordDatasetV2 shapes: (), types: tf.string>
实例
import os
import glob
from datetime import datetime
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
image_path = '../img/'
images = glob.glob(image_path + '*.jpg')
# Preview every .jpg under the image directory, one figure per image.
for fname in images:
    image = mpimg.imread(fname)
    f, (ax1) = plt.subplots(1, 1, figsize=(8,8))
    f.subplots_adjust(hspace = .2, wspace = .05)
    ax1.imshow(image)
    ax1.set_title('Image', fontsize=20)
# Map class name -> integer label used when writing the records.
image_labels = {'dog': 0, 'kangaroo': 1}
制作TFRecord
# Load one image as raw binary (undecoded JPEG bytes) plus its label.
# Fix: the original used a bare open(...).read(), leaking the file handle;
# a context manager guarantees the file is closed.
with open('./img/dog.jpg', 'rb') as img_file:
    image_string = img_file.read()
label = image_labels['dog']
def _bytes_feature(value):
    """Return a tf.train.Feature (BytesList) built from a string/bytes value."""
    # BytesList won't unpack a string from an EagerTensor,
    # so pull out the raw bytes first.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _float_feature(value):
    """Return a tf.train.Feature (FloatList) built from a float/double value."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
def _int64_feature(value):
    """Return a tf.train.Feature (Int64List) built from a bool/enum/int/uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def image_example(image_string, label):
    """Build a tf.train.Example describing one JPEG image.

    Stores the decoded height/width/depth, the integer label and the
    still-encoded raw JPEG bytes.
    """
    height, width, depth = tf.image.decode_jpeg(image_string).shape
    feature = {
        'height': _int64_feature(height),
        'width': _int64_feature(width),
        'depth': _int64_feature(depth),
        'label': _int64_feature(label),
        'image_raw': _bytes_feature(image_string),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))
# Print the first few lines of one Example rendered as text.
image_example_proto = image_example(image_string, label)
for line in str(image_example_proto).split('\n')[:15]:
    print(line)
print('...')
features { feature {
key: "depth"
value {
int64_list {
value: 3
}
} } feature {
key: "height"
value {
int64_list {
value: 576
} …
# Write every .jpg under ./img/ into `images.tfrecord`.
image_path = './img/'
images = glob.glob(image_path + '*.jpg')  # input data paths
record_file = 'images.tfrecord'
counter = 0
with tf.io.TFRecordWriter(record_file) as writer:
    for fname in images:
        with open(fname, 'rb') as f:
            image_string = f.read()
            # Label comes from the file name: 'dog.jpg' -> image_labels['dog'].
            label = image_labels[os.path.basename(fname).replace('.jpg', '')]
            # Build the `tf.Example` and append it to the TFRecord file.
            tf_example = image_example(image_string, label)
            writer.write(tf_example.SerializeToString())
            counter += 1
            print('Processed {:d} of {:d} images.'.format(
                counter, len(images)))
print(' Wrote {} images to {}'.format(counter, record_file))
Processed 1 of 2 images.
Processed 2 of 2 images.
Wrote 2 images to images.tfrecord
到这里就制作好了,下面是使用
加载制作好的TFRecord
# Load the TFRecord file produced above; each element is one serialized Example.
raw_train_dataset = tf.data.TFRecordDataset('images.tfrecord')
raw_train_dataset
<TFRecordDatasetV2 shapes: (), types: tf.string>
example数据都进行了序列化,还需要解析一下之前写入的序列化string
解析,就是反向转化,把TFRecord转换为图片
tf.io.parse_single_example(example_proto, feature_description)函数可以解析单条example
# Parsing schema: must mirror the feature dict used when the Examples
# were created (same keys, same types).
image_feature_description = {
    'height': tf.io.FixedLenFeature([], tf.int64),
    'width': tf.io.FixedLenFeature([], tf.int64),
    'depth': tf.io.FixedLenFeature([], tf.int64),
    'label': tf.io.FixedLenFeature([], tf.int64),
    'image_raw': tf.io.FixedLenFeature([], tf.string),
}
现在看起来仅仅完成了一个样本的解析,实际数据不可能一个个来写吧,可以定义一个映射规则map函数
def parse_tf_example(example_proto):
    """Decode one serialized Example into an (image, label) training pair."""
    # example_proto: one serialized record; image_feature_description: schema.
    parsed = tf.io.parse_single_example(example_proto, image_feature_description)
    # Preprocess: decode the JPEG, resize to the network input size,
    # and scale pixel values into [0, 1].
    x_train = tf.image.decode_jpeg(parsed['image_raw'], channels=3)
    x_train = tf.image.resize(x_train, (416, 416))
    x_train /= 255.
    y_train = parsed['label']
    return x_train, y_train
train_dataset = raw_train_dataset.map(parse_tf_example)  # map applies the parser to every record -- like a loop, but runs inside the tf.data pipeline and is faster
train_dataset
<MapDataset shapes: ((416, 416, 3), ()), types: (tf.float32,
tf.int64)>
制作训练集
num_epochs = 10
# Shuffle, batch in pairs, and repeat for num_epochs passes over the data.
train_ds = train_dataset.shuffle(buffer_size=10000).batch(2).repeat(num_epochs)
train_ds
<RepeatDataset shapes: ((None, 416, 416, 3), (None,)), types: (tf.float32, tf.int64)>
for batch, (x, y) in enumerate(train_ds):
    print(batch, x.shape, y)  # prints every batch (10 here: 2 images, batch size 2, 10 repeats)
到这里就导入成功了
下面是做神经网络小试验
# Minimal sanity-check model: flatten the image and classify into 2 classes.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2, activation='softmax')
])
# Integer labels -> sparse categorical cross-entropy.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
model.fit(train_ds, epochs=num_epochs)