这部分主要是讲怎么把自己的数据转换成TFRecord
主要步骤如下:
1.转换数据格式
2.读取数据以及解码
3.生成Batch
4.构建tensorflow图谱
5.训练模型/验证、测试
原作者Kevin的视频课程地址:Youtube
关于TFRecord
TFRecord是TensorFlow官方推荐的数据格式(请看截图)。它不是这部分的重点,这里略过。
关于dataset:notMNIST
下载数据集
数据集地址:点击打开链接
有以上两种可供选择,large的数据集比较大,包含所有数据。small每个类别有1873张,看自己选择吧,我用small。
--------------------------------------代码分解-----------------------------------------------
1.导入库
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import skimage.io as io # anaconda 自带
2.通过数据文件地址得到images and labels
def get_file(file_dir):
    '''Collect image paths and integer labels from a folder-per-class dataset.

    Every file found below a sub-folder of ``file_dir`` is labelled from the
    name of the folder that directly contains it, so paths and labels are
    paired one by one instead of being matched up by position afterwards.

    Args:
        file_dir: root directory that contains one sub-folder per class
    Returns:
        image_list: shuffled list of image file paths, string type
        label_list: list of int labels aligned with ``image_list``;
            folders 'A'..'I' map to 1..9 and any other folder name
            (e.g. 'J') maps to 10
    '''
    # Folder name -> label. Labels are numbered from 1, not from 0.
    letter_to_label = {letter: index
                       for index, letter in enumerate('ABCDEFGHI', start=1)}
    fallback_label = 10

    images = []
    labels = []
    walker = os.walk(file_dir)
    # The first entry yielded by os.walk is file_dir itself; files lying
    # directly in the dataset root belong to no class, so skip that entry.
    next(walker, None)
    for root, _, files in walker:
        # basename() copes with the platform separator, whereas splitting
        # on '/' only works for paths that happen to use forward slashes.
        label = letter_to_label.get(os.path.basename(root), fallback_label)
        for name in files:
            images.append(os.path.join(root, name))
            labels.append(label)

    # Shuffle with one shared permutation so every path keeps its label.
    order = np.random.permutation(len(images))
    image_list = [images[k] for k in order]
    label_list = [labels[k] for k in order]
    return image_list, label_list
3.将标签和图像转为特定的格式:
后面的convert_to_tfrecord函数要用tf.train.Example来编码数据,而tf.train.Example的数据结构中包含一个从属性名到取值的字典,其中属性的取值只能是字符串列表(BytesList)、实数列表(FloatList)或者整数列表(Int64List)。所以这里先定义两个辅助函数,把标签和图像数据分别包装成对应类型的tf.train.Feature,供后面构造Example proto时使用。
# 以下两个函数来自tensorflow官网
def int64_feature(value):
    """Build an int64 ``tf.train.Feature`` from one value or a list of values.

    A value that is not already a ``list`` is wrapped in a one-element list
    before being handed to ``tf.train.Int64List``.
    """
    values = value if isinstance(value, list) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)
def bytes_feature(value):
    """Build a bytes ``tf.train.Feature`` holding the single item ``value``."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)
4.将数据转换成TFRecord
def convert_to_tfrecord(images, labels, save_dir, name):
    '''Write every readable (image, label) pair into one TFRecord file.

    Args:
        images: list of image file paths, string type
        labels: list of labels, int type, aligned with ``images``
        save_dir: the directory to save tfrecord file, e.g.: '/home/folder1/'
        name: the name of tfrecord file without extension, e.g.: 'train'
    Returns:
        None. Records are written to ``save_dir`` joined with
        ``name + '.tfrecords'``.
    Raises:
        ValueError: if ``images`` and ``labels`` differ in length.
    Note:
        Converting needs some time, depending on the size of the data.
        Images that cannot be read are reported and skipped.
    '''
    # Full path of the output file.
    filename = os.path.join(save_dir, name + '.tfrecords')
    n_samples = len(labels)

    # len() works for lists as well as arrays; the message must not rely on
    # a ``.shape`` attribute because ``images`` is documented as a list.
    if len(images) != n_samples:
        raise ValueError('Images size %d does not match label size %d.' % (len(images), n_samples))

    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
    try:
        for image_path, raw_label in zip(images, labels):
            try:
                # Images are read one at a time; the result must be an array.
                image = io.imread(image_path)
            except IOError as e:
                # A few files of the raw dataset are corrupt; skip them.
                print('Could not read:', image_path)
                print('error: %s' % e)
                print('Skip it!\n')
                continue
            # tobytes() replaces the deprecated ndarray.tostring() alias.
            image_raw = image.tobytes()
            label = int(raw_label)
            # Store label and raw pixels as the two fields of one Example.
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': int64_feature(label),
                'image_raw': bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even when an unexpected error aborts the loop.
        writer.close()
    print('Transform done!')
5. 读取 、解码生成batch
# 以上将数据转换成了TFRecord形式,将其解码出来
def read_and_decode(tfrecords_file, batch_size, image_shape=(28, 28)):
    '''Read and decode a tfrecord file, generate (image, label) batches.

    Args:
        tfrecords_file: path of the tfrecord file
        batch_size: number of images in each batch
        image_shape: shape of one decoded image. Defaults to (28, 28), the
            size of the notMNIST images; change it for other datasets.
    Returns:
        image_batch: uint8 tensor of shape [batch_size] + image_shape,
            i.e. [batch_size, 28, 28] by default (no channel axis is added)
        label_batch: 1D int32 tensor - [batch_size]
    '''
    # Queue of input file names feeding the reader.
    filename_queue = tf.train.string_input_producer([tfrecords_file])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    # Parse one serialized Example back into its two fields; the keys must
    # match the ones used when the file was written.
    img_features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        })

    # The image was stored as raw bytes; decode it into a flat uint8 vector.
    image = tf.decode_raw(img_features['image_raw'], tf.uint8)

    ##########################################################
    # you can put data augmentation here, I didn't use it
    ##########################################################

    image = tf.reshape(image, list(image_shape))
    label = tf.cast(img_features['label'], tf.int32)

    # num_threads: number of threads enqueuing examples
    # capacity: maximum number of examples held in the queue
    image_batch, label_batch = tf.train.batch([image, label],
                                              batch_size=batch_size,
                                              num_threads=64,
                                              capacity=2000)
    return image_batch, tf.reshape(label_batch, [batch_size])
6.以上就是数据的转换及读取,接下来测试一下刚才的代码
这部分注意自己的地址要更改过来,我的项目总览如下
# %% Convert data to TFRecord
# Dataset root (one sub-folder per class, as expected by get_file) and the
# output directory. Both paths are machine-specific: change them to match
# your own setup.
test_dir = 'D://python//neural network//notMNIST//notMNIST_small//'
save_dir = 'D://python//neural network//notMNIST//'
# Number of images per batch requested from read_and_decode.
BATCH_SIZE = 25
# Convert test data: you just need to run it ONCE !
# The output file is os.path.join(save_dir, name_test + '.tfrecords').
name_test = 'test'
images, labels = get_file(test_dir)
convert_to_tfrecord(images, labels, save_dir, name_test)
# %% TO test train.tfrecord file
def plot_images(images, labels):
    '''Plot one batch of images in a grid, titled with their class letters.

    The number of panels follows ``len(images)`` rather than a global batch
    size, and the grid is sized to fit (25 images give the usual 5 x 5 grid).

    Args:
        images: sequence of images accepted by ``plt.imshow``
        labels: sequence of int labels aligned with ``images``; label 1 is
            shown as 'A', label 2 as 'B', and so on
    '''
    n_images = len(images)
    if n_images == 0:
        # Nothing to draw.
        return
    # Smallest near-square grid that holds every image.
    n_cols = int(np.ceil(np.sqrt(n_images)))
    n_rows = (n_images + n_cols - 1) // n_cols
    for idx in range(n_images):
        plt.subplot(n_rows, n_cols, idx + 1)
        plt.axis('off')
        plt.title(chr(ord('A') + int(labels[idx]) - 1), fontsize=14)
        plt.imshow(images[idx])
    # Layout is adjusted once, after all panels exist.
    plt.subplots_adjust(top=1.5)
    plt.show()
tfrecords_file = 'D://python//neural network//notMNIST//test.tfrecords'
# This path is the save directory joined with the record name plus
# '.tfrecords'; see convert_to_tfrecord for how it is built.
image_batch, label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    i = 0
    try:
        # Only a single batch is fetched and plotted.
        while i < 1 and not coord.should_stop():
            image, label = sess.run([image_batch, label_batch])
            plot_images(image, label)
            i += 1
    except tf.errors.OutOfRangeError:
        print('done!')
    finally:
        # Ask the queue-runner threads to stop, then wait for them.
        coord.request_stop()
        coord.join(threads)
7.报错可能:
tensorflow.python.framework.errors_impl.NotFoundError: Failed to create a NewWriteableFile: D:/python/neural network/notMNIST/test.tfrecords : 系统找不到指定的路径。
其实这是个傻瓜错误,因为路径给错了。把文件夹MINST写成了MNIST,以至于找不到文件,你们可要仔细哦,最好是复制路径,避免手写错误。
8.结果:转换后的image部分展示
附全部代码:
# 原作者相关信息
# By @Kevin Xu
# kevin28520@gmail.com
# My youtube: https://www.youtube.com/channel/UCVCSn4qQXTDAtGWpWAe4Plw
# The aim of this project is to use TensorFlow to transform our own data into TFRecord format.
# data: notMNIST
# http://yaroslavvb.blogspot.ca/2011/09/notmnist-dataset.html
# http://yaroslavvb.com/upload/notMNIST/
# 注意 :文件下载地址如上,而非在代码里下载
# 导库
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import skimage.io as io # anaconda 自带
# %% 通过数据文件地址得到images and labels
def get_file(file_dir):
    '''Collect image paths and integer labels from a folder-per-class dataset.

    Every file found below a sub-folder of ``file_dir`` is labelled from the
    name of the folder that directly contains it, so paths and labels are
    paired one by one instead of being matched up by position afterwards.

    Args:
        file_dir: root directory that contains one sub-folder per class
    Returns:
        image_list: shuffled list of image file paths, string type
        label_list: list of int labels aligned with ``image_list``;
            folders 'A'..'I' map to 1..9 and any other folder name
            (e.g. 'J') maps to 10
    '''
    # Folder name -> label. Labels are numbered from 1, not from 0.
    letter_to_label = {letter: index
                       for index, letter in enumerate('ABCDEFGHI', start=1)}
    fallback_label = 10

    images = []
    labels = []
    walker = os.walk(file_dir)
    # The first entry yielded by os.walk is file_dir itself; files lying
    # directly in the dataset root belong to no class, so skip that entry.
    next(walker, None)
    for root, _, files in walker:
        # basename() copes with the platform separator, whereas splitting
        # on '/' only works for paths that happen to use forward slashes.
        label = letter_to_label.get(os.path.basename(root), fallback_label)
        for name in files:
            images.append(os.path.join(root, name))
            labels.append(label)

    # Shuffle with one shared permutation so every path keeps its label.
    order = np.random.permutation(len(images))
    image_list = [images[k] for k in order]
    label_list = [labels[k] for k in order]
    return image_list, label_list
#%% 将标签和图像转为特定的格式:
# 因为后面convert_to_tfrecord函数用到的编码函数tf.train.Example的数据结构中包含了一个从属性到取值的字典。而属性的取值可以为
# 字符串列表(BytesList)、实数列表(FloatList)或者整数列表(Int64List)通过以下函数编码为Example proto形式的返回值,所以提前在这里转换
# 以下两个函数来自tensorflow官网
def int64_feature(value):
    """Build an int64 ``tf.train.Feature`` from one value or a list of values.

    A value that is not already a ``list`` is wrapped in a one-element list
    before being handed to ``tf.train.Int64List``.
    """
    values = value if isinstance(value, list) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)
def bytes_feature(value):
    """Build a bytes ``tf.train.Feature`` holding the single item ``value``."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)
# %% 转换成TFRecord
def convert_to_tfrecord(images, labels, save_dir, name):
    '''Write every readable (image, label) pair into one TFRecord file.

    Args:
        images: list of image file paths, string type
        labels: list of labels, int type, aligned with ``images``
        save_dir: the directory to save tfrecord file, e.g.: '/home/folder1/'
        name: the name of tfrecord file without extension, e.g.: 'train'
    Returns:
        None. Records are written to ``save_dir`` joined with
        ``name + '.tfrecords'``.
    Raises:
        ValueError: if ``images`` and ``labels`` differ in length.
    Note:
        Converting needs some time, depending on the size of the data.
        Images that cannot be read are reported and skipped.
    '''
    # Full path of the output file.
    filename = os.path.join(save_dir, name + '.tfrecords')
    n_samples = len(labels)

    # len() works for lists as well as arrays; the message must not rely on
    # a ``.shape`` attribute because ``images`` is documented as a list.
    if len(images) != n_samples:
        raise ValueError('Images size %d does not match label size %d.' % (len(images), n_samples))

    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
    try:
        for image_path, raw_label in zip(images, labels):
            try:
                # Images are read one at a time; the result must be an array.
                image = io.imread(image_path)
            except IOError as e:
                # A few files of the raw dataset are corrupt; skip them.
                print('Could not read:', image_path)
                print('error: %s' % e)
                print('Skip it!\n')
                continue
            # tobytes() replaces the deprecated ndarray.tostring() alias.
            image_raw = image.tobytes()
            label = int(raw_label)
            # Store label and raw pixels as the two fields of one Example.
            example = tf.train.Example(features=tf.train.Features(feature={
                'label': int64_feature(label),
                'image_raw': bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even when an unexpected error aborts the loop.
        writer.close()
    print('Transform done!')
# %% 读取 、解码生成batch
# 以上将数据转换成了TFRecord形式,将其解码出来
def read_and_decode(tfrecords_file, batch_size, image_shape=(28, 28)):
    '''Read and decode a tfrecord file, generate (image, label) batches.

    Args:
        tfrecords_file: path of the tfrecord file
        batch_size: number of images in each batch
        image_shape: shape of one decoded image. Defaults to (28, 28), the
            size of the notMNIST images; change it for other datasets.
    Returns:
        image_batch: uint8 tensor of shape [batch_size] + image_shape,
            i.e. [batch_size, 28, 28] by default (no channel axis is added)
        label_batch: 1D int32 tensor - [batch_size]
    '''
    # Queue of input file names feeding the reader.
    filename_queue = tf.train.string_input_producer([tfrecords_file])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    # Parse one serialized Example back into its two fields; the keys must
    # match the ones used when the file was written.
    img_features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        })

    # The image was stored as raw bytes; decode it into a flat uint8 vector.
    image = tf.decode_raw(img_features['image_raw'], tf.uint8)

    ##########################################################
    # you can put data augmentation here, I didn't use it
    ##########################################################

    image = tf.reshape(image, list(image_shape))
    label = tf.cast(img_features['label'], tf.int32)

    # num_threads: number of threads enqueuing examples
    # capacity: maximum number of examples held in the queue
    image_batch, label_batch = tf.train.batch([image, label],
                                              batch_size=batch_size,
                                              num_threads=64,
                                              capacity=2000)
    return image_batch, tf.reshape(label_batch, [batch_size])
## Test
# %% Convert data to TFRecord
# Dataset root (one sub-folder per class, as expected by get_file) and the
# output directory. Both paths are machine-specific: change them to match
# your own setup.
test_dir = 'D:/python/neural network/notMINST/notMNIST_small/'
save_dir = 'D:/python/neural network/notMINST/'
# Number of images per batch requested from read_and_decode.
BATCH_SIZE = 25
# Convert test data: you just need to run it ONCE !
# The output file is os.path.join(save_dir, name_test + '.tfrecords').
name_test = 'test'
images, labels = get_file(test_dir)
convert_to_tfrecord(images, labels, save_dir, name_test)
# %% TO test train.tfrecord file
def plot_images(images, labels):
    '''Plot one batch of images in a grid, titled with their class letters.

    The number of panels follows ``len(images)`` rather than a global batch
    size, and the grid is sized to fit (25 images give the usual 5 x 5 grid).

    Args:
        images: sequence of images accepted by ``plt.imshow``
        labels: sequence of int labels aligned with ``images``; label 1 is
            shown as 'A', label 2 as 'B', and so on
    '''
    n_images = len(images)
    if n_images == 0:
        # Nothing to draw.
        return
    # Smallest near-square grid that holds every image.
    n_cols = int(np.ceil(np.sqrt(n_images)))
    n_rows = (n_images + n_cols - 1) // n_cols
    for idx in range(n_images):
        plt.subplot(n_rows, n_cols, idx + 1)
        plt.axis('off')
        plt.title(chr(ord('A') + int(labels[idx]) - 1), fontsize=14)
        plt.imshow(images[idx])
    # Layout is adjusted once, after all panels exist.
    plt.subplots_adjust(top=1.5)
    plt.show()
tfrecords_file = 'D:/python/neural network/notMINST/test.tfrecords'
# This path is the save directory joined with the record name plus
# '.tfrecords'; see convert_to_tfrecord for how it is built.
image_batch, label_batch = read_and_decode(tfrecords_file, batch_size=BATCH_SIZE)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    i = 0
    try:
        # Only a single batch is fetched and plotted.
        while i < 1 and not coord.should_stop():
            image, label = sess.run([image_batch, label_batch])
            plot_images(image, label)
            i += 1
    except tf.errors.OutOfRangeError:
        print('done!')
    finally:
        # Ask the queue-runner threads to stop, then wait for them.
        coord.request_stop()
        coord.join(threads)