简析TensorFlow中examples.tutorials.mnist的input_data数据类

最新推荐文章于 2024-06-08 19:25:36 发布

秋名鱼酱

最新推荐文章于 2024-06-08 19:25:36 发布

阅读量1.8w

点赞数 4

文章标签： TensorFlow mnist

本文链接：https://blog.csdn.net/KID_yuan/article/details/89040245

版权

刚接触TensorFlow时，用得最多的就是经典的MNIST手写数字识别数据集。TensorFlow已经把该数据集封装成了数据类，可以直接通过input_data.*中的方法读取数据和标签，并在feed数据时调用next_batch()等操作。但进一步深入后，如果想训练和测试自己的数据集，就没有现成封装好的数据类可用，需要自己动手制作数据集。因此，首先可以从官方的 `from tensorflow.examples.tutorials.mnist import input_data` 中寻找启发。

首先根据其导入包的路径在tensorflow官方GitHub上找到其定义：

接着找到代码定义在：

然后打开mnist.py代码定义数据集类和方法：

发现其DataSet(object)类与方法定义如下：

1.定义DataSet(object)类：

class DataSet(object):
  """Container class for a dataset (deprecated).

  THIS CLASS IS DEPRECATED. See
  [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
  for general migration instructions.
  """

  def __init__(self,
               images,
               labels,
               fake_data=False,
               one_hot=False,
               dtype=dtypes.float32,
               reshape=True,
               seed=None):
    """Construct a DataSet.

    Args:
      images: Array-like exposing `shape`, `reshape` and `astype`. When
        `reshape` is true it must be rank 4 with a last dimension of 1.
        Stored as-is when `fake_data` is true.
      labels: Array-like exposing `shape`; its first dimension must equal
        that of `images`. Stored as-is when `fake_data` is true.
      fake_data: If true, skip all validation/conversion of `images` and
        `labels` and report a fixed size of 10000 examples.
      one_hot: Stored on the instance; only consulted when fake batches are
        generated.
      dtype: `uint8` leaves the pixel values untouched (`[0, 255]`);
        `float32` rescales them into `[0, 1]`.
      reshape: If true, flatten every image from
        `[rows, columns, depth]` to `[rows * columns]` (depth must be 1).
      seed: Passed to `random_seed.get_seed`; the result seeds NumPy's
        global random generator, which gives deterministic shuffling in
        tests.

    Raises:
      TypeError: If `dtype` is neither `uint8` nor `float32`.
      AssertionError: If the image/label counts differ, or if `reshape` is
        true and the image depth is not 1.
    """
    seed1, seed2 = random_seed.get_seed(seed)
    # If op level seed is not set, use whatever graph level seed is returned
    numpy.random.seed(seed1 if seed is None else seed2)
    dtype = dtypes.as_dtype(dtype).base_dtype
    if dtype not in (dtypes.uint8, dtypes.float32):
      raise TypeError(
          'Invalid image dtype %r, expected uint8 or float32' % dtype)
    # Always record the flag so that requesting a fake batch from any
    # instance finds the attribute instead of raising AttributeError.
    self.one_hot = one_hot
    if fake_data:
      self._num_examples = 10000
    else:
      assert images.shape[0] == labels.shape[0], (
          'images.shape: %s labels.shape: %s' % (images.shape, labels.shape))
      self._num_examples = images.shape[0]

      # Convert shape from [num examples, rows, columns, depth]
      # to [num examples, rows*columns] (assuming depth == 1)
      if reshape:
        assert images.shape[3] == 1
        images = images.reshape(images.shape[0],
                                images.shape[1] * images.shape[2])
      if dtype == dtypes.float32:
        # Convert from [0, 255] -> [0.0, 1.0].
        images = images.astype(numpy.float32)
        images = numpy.multiply(images, 1.0 / 255.0)
    self._images = images
    self._labels = labels
    self._epochs_completed = 0
    self._index_in_epoch = 0

  @property
  def images(self):
    """The (possibly reshaped and rescaled) image data."""
    return self._images

  @property
  def labels(self):
    """The labels, aligned index-by-index with `images`."""
    return self._labels

  @property
  def num_examples(self):
    """Number of examples in the data set."""
    return self._num_examples

  @property
  def epochs_completed(self):
    """Counter of finished epochs, starting at 0."""
    return self._epochs_completed

2.定义next_batch方法：

关键点在于记录当前的读取起始位置 `_index_in_epoch`：每个batch取出的数据区间为 `[_index_in_epoch, _index_in_epoch + batch_size)`。此外还需要分别处理三种情况：第一个epoch的开头（首次打乱数据）；一个epoch的结尾与下一个epoch开头的衔接（把本epoch剩余的样本与新epoch开头的样本拼接成一个batch）；以及不跨越epoch边界的普通batch。


  def next_batch(self, batch_size, fake_data=False, shuffle=True):
    """Return the next `batch_size` examples from this data set.

    Args:
      batch_size: Number of examples requested.
      fake_data: If true, return constant placeholder data (lists, not
        arrays) without touching the stored images/labels or the read
        position.
      shuffle: If true, shuffle the data before the very first batch and
        again every time an epoch is completed.

    Returns:
      A tuple `(images, labels)`. When the request crosses the end of an
      epoch, the remaining examples of the current epoch are concatenated
      with the first examples of the next one.
    """
    if fake_data:
      fake_image = [1] * 784
      # Tolerate instances that lack a `one_hot` attribute by falling back
      # to scalar labels.
      if getattr(self, 'one_hot', False):
        fake_label = [1] + [0] * 9
      else:
        fake_label = 0
      # Built-in `range` works on both Python 2 and 3 (`xrange` does not).
      return ([fake_image for _ in range(batch_size)],
              [fake_label for _ in range(batch_size)])
    start = self._index_in_epoch
    # Shuffle once before the first batch of the first epoch.
    if self._epochs_completed == 0 and start == 0 and shuffle:
      perm0 = numpy.arange(self._num_examples)
      numpy.random.shuffle(perm0)
      self._images = self.images[perm0]
      self._labels = self.labels[perm0]
    # The request runs past the end of the data: wrap into the next epoch.
    if start + batch_size > self._num_examples:
      # Finished epoch
      self._epochs_completed += 1
      # Get the rest examples in this epoch
      rest_num_examples = self._num_examples - start
      images_rest_part = self._images[start:self._num_examples]
      labels_rest_part = self._labels[start:self._num_examples]
      # Reshuffle only after the leftovers were sliced out, so they still
      # come from the ordering of the epoch that just ended.
      if shuffle:
        perm = numpy.arange(self._num_examples)
        numpy.random.shuffle(perm)
        self._images = self.images[perm]
        self._labels = self.labels[perm]
      # Start next epoch: take as many examples as are still missing.
      start = 0
      self._index_in_epoch = batch_size - rest_num_examples
      end = self._index_in_epoch
      images_new_part = self._images[start:end]
      labels_new_part = self._labels[start:end]
      return numpy.concatenate(
          (images_rest_part, images_new_part), axis=0), numpy.concatenate(
              (labels_rest_part, labels_new_part), axis=0)
    else:
      # Ordinary batch that fits entirely inside the current epoch:
      # advance the read position and return the slice [start, end).
      self._index_in_epoch += batch_size
      end = self._index_in_epoch
      return self._images[start:end], self._labels[start:end]

3.定义read_data_sets函数：

def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
  """Build the train / validation / test `DataSet` triple for MNIST.

  Args:
    train_dir: Directory handed to `base.maybe_download` for the four
      archive files.
    fake_data: If true, return three fake `DataSet` objects and touch no
      files.
    one_hot: Forwarded to `extract_labels` (and to the fake data sets).
    dtype: Forwarded to every `DataSet`.
    reshape: Forwarded to every real `DataSet`.
    validation_size: Number of leading training examples that are split
      off as the validation set.
    seed: Forwarded to every `DataSet`.
    source_url: URL prefix the archive file names are appended to; an
      empty value falls back to `DEFAULT_SOURCE_URL`.

  Returns:
    A `base.Datasets` with `train`, `validation` and `test` fields.

  Raises:
    ValueError: If `validation_size` is not within
      `[0, len(train_images)]`.
  """
  if fake_data:
    # Three independent placeholder data sets, created in the order
    # train, validation, test.
    train, validation, test = [
        DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                seed=seed)
        for _ in range(3)
    ]
    return base.Datasets(train=train, validation=validation, test=test)

  # An empty string (or any falsy value) selects the default mirror.
  url_prefix = source_url or DEFAULT_SOURCE_URL

  def _fetch(archive_name, extractor, **extractor_kwargs):
    # `base` comes from tensorflow.contrib.learn.python.learn.datasets and
    # `gfile` from tensorflow.python.platform.
    local_path = base.maybe_download(archive_name, train_dir,
                                     url_prefix + archive_name)
    with gfile.Open(local_path, 'rb') as archive:
      return extractor(archive, **extractor_kwargs)

  train_images = _fetch('train-images-idx3-ubyte.gz', extract_images)
  train_labels = _fetch('train-labels-idx1-ubyte.gz', extract_labels,
                        one_hot=one_hot)
  test_images = _fetch('t10k-images-idx3-ubyte.gz', extract_images)
  test_labels = _fetch('t10k-labels-idx1-ubyte.gz', extract_labels,
                       one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError('Validation size should be between 0 and {}. Received: {}.'
                     .format(len(train_images), validation_size))

  # The first `validation_size` training examples become the validation
  # set; everything after them stays in the training set.
  validation_images, train_images = (train_images[:validation_size],
                                     train_images[validation_size:])
  validation_labels, train_labels = (train_labels[:validation_size],
                                     train_labels[validation_size:])

  shared_kwargs = {'dtype': dtype, 'reshape': reshape, 'seed': seed}

  # Wrap the raw arrays in DataSet objects (train, validation, test order).
  train = DataSet(train_images, train_labels, **shared_kwargs)
  validation = DataSet(validation_images, validation_labels, **shared_kwargs)
  test = DataSet(test_images, test_labels, **shared_kwargs)

  return base.Datasets(train=train, validation=validation, test=test)

def load_mnist(train_dir='MNIST-data'):
  """Load the MNIST data sets from `train_dir`.

  Thin convenience wrapper: forwards `train_dir` to `read_data_sets` and
  leaves every other option at its default.
  """
  datasets = read_data_sets(train_dir)
  return datasets

参考博客：https://blog.csdn.net/u013608336/article/details/78747102