刚开始学习tensorflow时,用的一直是最经典的mnist手写字符识别数据集。tensorflow已经把mnist封装成了现成的数据集类,可以直接通过input_data.*中的方法读取图像数据和数据标签,并在feed数据时调用next_batch()等操作。但是继续深入之后,如果想用自己的数据集进行训练和测试,就没有现成封装好的数据类方法可用,此时需要自己动手制作数据集。因此,首先可以从官方的 from tensorflow.examples.tutorials.mnist import input_data 中寻找启发。
首先根据其导入包的路径在tensorflow官方GitHub上找到其定义:
接着找到代码定义在:
然后打开mnist.py代码定义数据集类和方法:
发现其DataSet(object)类与方法定义如下:
1.定义DataSet(object)类:
class DataSet(object):
    """Container class for a dataset (deprecated).

    THIS CLASS IS DEPRECATED. See
    [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
    for general migration instructions.
    """

    def __init__(self,
                 images,
                 labels,
                 fake_data=False,
                 one_hot=False,
                 dtype=dtypes.float32,
                 reshape=True,
                 seed=None):
        """Construct a DataSet.

        Args:
            images: Image array. When `fake_data` is false it must expose
                `.shape` and have the same first dimension as `labels`.
            labels: Label array aligned with `images`.
            fake_data: If true, no real data is inspected; the example count
                is fixed at 10000 and `images`/`labels` are stored as given.
            one_hot: Stored on the instance only when `fake_data` is true.
            dtype: `uint8` leaves the images untouched; `float32` casts them
                to float32 and multiplies by 1/255 (assumes the input is in
                `[0, 255]` so the result lands in `[0, 1]`).
            reshape: If true, flatten
                `[num examples, rows, columns, depth]` to
                `[num examples, rows*columns]`; requires depth == 1.
            seed: Passed to `random_seed.get_seed`; the result is used to
                seed `numpy.random` for deterministic testing.

        Raises:
            TypeError: If `dtype` is neither uint8 nor float32.
        """
        seed1, seed2 = random_seed.get_seed(seed)
        # If op level seed is not set, use whatever graph level seed is returned
        numpy.random.seed(seed1 if seed is None else seed2)
        dtype = dtypes.as_dtype(dtype).base_dtype
        if dtype not in (dtypes.uint8, dtypes.float32):
            raise TypeError(
                'Invalid image dtype %r, expected uint8 or float32' % dtype)
        if fake_data:
            self._num_examples = 10000
            self.one_hot = one_hot
        else:
            assert images.shape[0] == labels.shape[0], (
                'images.shape: %s labels.shape: %s' % (images.shape, labels.shape))
            self._num_examples = images.shape[0]

            # Convert shape from [num examples, rows, columns, depth]
            # to [num examples, rows*columns] (assuming depth == 1).
            if reshape:
                assert images.shape[3] == 1
                images = images.reshape(images.shape[0],
                                        images.shape[1] * images.shape[2])
            if dtype == dtypes.float32:
                # Rescale from [0, 255] to [0.0, 1.0].
                images = images.astype(numpy.float32)
                images = numpy.multiply(images, 1.0 / 255.0)
        self._images = images
        self._labels = labels
        # Bookkeeping used when iterating over the data in batches.
        self._epochs_completed = 0
        self._index_in_epoch = 0

    @property
    def images(self):
        """The stored images (after any reshape/rescale done in __init__)."""
        return self._images

    @property
    def labels(self):
        """The stored labels."""
        return self._labels

    @property
    def num_examples(self):
        """Number of examples in the data set."""
        return self._num_examples

    @property
    def epochs_completed(self):
        """Counter of completed epochs; initialised to 0."""
        return self._epochs_completed
2.定义next_batch方法:
关键点在于用_index_in_epoch记录当前读取数据集的起始位置:每次调用返回[start, start+batch_size)区间内的数据,并把_index_in_epoch向后移动batch_size。需要分别处理三种情况:第一个epoch的开头(首次打乱数据顺序);每个epoch的结尾与下一个epoch开头的衔接(把本轮剩余的样本与新一轮开头的样本拼接起来);以及既不是第一个epoch开头、也不是epoch结尾的普通批次。
def next_batch(self, batch_size, fake_data=False, shuffle=True):
    """Return the next `batch_size` examples from this data set.

    Written as a method of the DataSet class: it reads and updates
    `self._images`, `self._labels`, `self._index_in_epoch` and
    `self._epochs_completed`.

    Args:
        batch_size: Number of examples to return.
        fake_data: If true, return constant placeholder data instead of
            reading the stored arrays (uses `self.one_hot` to pick the
            label form).
        shuffle: If true, permute the data at the very first call and each
            time an epoch is completed.

    Returns:
        A `(images, labels)` pair. In fake mode both are Python lists;
        otherwise they are slices (or concatenations of slices) of the
        stored arrays.

    NOTE: there is no check that `batch_size` does not exceed the number
    of examples; sufficiently large values can yield a batch shorter than
    `batch_size`.
    """
    if fake_data:
        fake_image = [1] * 784
        if self.one_hot:
            fake_label = [1] + [0] * 9
        else:
            fake_label = 0
        # `range` works on both Python 2 and 3, unlike the bare `xrange`.
        return [fake_image for _ in range(batch_size)], [
            fake_label for _ in range(batch_size)
        ]
    start = self._index_in_epoch
    # Very first batch of the first epoch: shuffle the data once up front.
    if self._epochs_completed == 0 and start == 0 and shuffle:
        perm0 = numpy.arange(self._num_examples)
        numpy.random.shuffle(perm0)
        self._images = self.images[perm0]
        self._labels = self.labels[perm0]
    # The requested batch runs past the end of the data: wrap into the
    # next epoch.
    if start + batch_size > self._num_examples:
        # Finished epoch
        self._epochs_completed += 1
        # Get the rest examples in this epoch (taken before reshuffling)
        rest_num_examples = self._num_examples - start
        images_rest_part = self._images[start:self._num_examples]
        labels_rest_part = self._labels[start:self._num_examples]
        # Shuffle the data for the new epoch
        if shuffle:
            perm = numpy.arange(self._num_examples)
            numpy.random.shuffle(perm)
            self._images = self.images[perm]
            self._labels = self.labels[perm]
        # Start next epoch: top the batch up from the head of the new epoch.
        start = 0
        self._index_in_epoch = batch_size - rest_num_examples
        end = self._index_in_epoch
        images_new_part = self._images[start:end]
        labels_new_part = self._labels[start:end]
        return numpy.concatenate(
            (images_rest_part, images_new_part), axis=0), numpy.concatenate(
                (labels_rest_part, labels_new_part), axis=0)
    # Ordinary batch that fits entirely inside the current epoch.
    else:
        # `start` holds the previous position; advance the cursor so that
        # the batch covers [start, start + batch_size).
        self._index_in_epoch += batch_size
        end = self._index_in_epoch
        # Return the images together with their matching labels.
        return self._images[start:end], self._labels[start:end]
3.定义read_data_sets函数:
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    """Download (if needed) and load MNIST as train/validation/test DataSets.

    Args:
        train_dir: Directory handed to `base.maybe_download` for the four
            archive files.
        fake_data: If true, skip all I/O and return three fake DataSets.
        one_hot: Forwarded to `extract_labels` (and to the fake DataSets).
        dtype: Forwarded to `DataSet`.
        reshape: Forwarded to `DataSet`.
        validation_size: Number of leading training examples split off as
            the validation set.
        seed: Forwarded to `DataSet`.
        source_url: Prefix prepended to each archive file name; an empty
            value falls back to `DEFAULT_SOURCE_URL`.

    Returns:
        A `base.Datasets` with `train`, `validation` and `test` fields.

    Raises:
        ValueError: If `validation_size` is not within
            `[0, len(train_images)]`.
    """
    if fake_data:

        def fake():
            return DataSet(
                [], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        # Return the bundle of (fake) data set objects.
        return base.Datasets(train=train, validation=validation, test=test)

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    # `base` comes from:
    #   from tensorflow.contrib.learn.python.learn.datasets import base
    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    # `gfile` comes from: from tensorflow.python.platform import gfile
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError('Validation size should be between 0 and {}. Received: {}.'
                         .format(len(train_images), validation_size))

    # The first `validation_size` training examples become the validation
    # set; the remainder stays as the training set.
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    # Wrap each split in a DataSet so callers get its properties and
    # batching behaviour.
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def load_mnist(train_dir='MNIST-data'):
    """Load MNIST via `read_data_sets`, using its default options.

    Args:
        train_dir: Directory passed through to `read_data_sets`.

    Returns:
        Whatever `read_data_sets(train_dir)` returns.
    """
    return read_data_sets(train_dir)
参考博客:https://blog.csdn.net/u013608336/article/details/78747102