# 1. Import the data package: cifar-10-batches-py
import tensorflow as tf
import os
import pickle as pk
import numpy as np
# Directory containing the extracted CIFAR-10 (python version) batch files.
CIFAR_DIR = "./cifar-10-batches-py/"
print (os.listdir(CIFAR_DIR)) # os.listdir -- returns a list of the names of the files/folders inside the given directory.
# 2. Read and process the data
def load_data(filename):
    """Read one pickled CIFAR-10 batch file.

    Args:
        filename: path to a CIFAR-10 batch file (python version),
            a pickle containing at least the b'data' and b'labels' keys.

    Returns:
        Tuple ``(data, labels)``: the raw image array and its label list.

    NOTE: pickle.load executes arbitrary code on malicious input —
    only use this on trusted, locally downloaded dataset files.
    """
    # The CIFAR-10 files were pickled under Python 2; encoding='bytes'
    # keeps the dict keys as byte strings (hence b'data' / b'labels').
    with open(filename, 'rb') as f:
        data = pk.load(f, encoding='bytes')
    return data[b'data'], data[b'labels']
class CifarData:
    """Holds a CIFAR-10 data split in memory and serves it for training.

    Loads one or more batch files, normalizes the pixels, and optionally
    shuffles the examples (shuffling removes ordering dependence between
    samples — use it for the training set, not for the test set).
    """

    def __init__(self, filenames, need_shuffle):
        """Load and concatenate the given batch files.

        Args:
            filenames: iterable of CIFAR-10 batch-file paths.
            need_shuffle: if True, shuffle the examples now and re-shuffle
                at the start of every epoch.
        """
        all_data = []
        all_labels = []
        for filename in filenames:
            data, labels = load_data(filename)
            all_data.append(data)
            all_labels.append(labels)
        # Stack per-file arrays vertically into one (num_examples, 3072) matrix.
        self._data = np.vstack(all_data)
        # Map raw pixel values from [0, 255] to [-1, 1] (255 / 127.5 - 1 == 1).
        self._data = self._data / 127.5 - 1
        self._labels = np.hstack(all_labels)
        print(self._data.shape)
        print(self._labels.shape)
        self._num_examples = self._data.shape[0]
        self._need_shuffle = need_shuffle
        self._indicator = 0  # current traversal position within the epoch
        if self._need_shuffle:
            self._shuffle_data()

    def _shuffle_data(self):
        """Shuffle data and labels in unison.

        A single random permutation is applied to both arrays so each
        image stays paired with its label,
        e.g. [0,1,2,3,4,5] -> [5,3,2,4,0,1].
        """
        p = np.random.permutation(self._num_examples)
        self._data = self._data[p]
        self._labels = self._labels[p]
def next_batch(self, batch_size):
'''返回batch_size个样本'''
end_indicator = self._indicator + batch_size
if end_indicator > self._num_examples:
if self._need_shuffle:
self._shuffle_data()
self._indicator = 0
end_indicator = batch_size
else:
raise Exception('have no more examples')
if end_indicator > self._num_examples:
raise Exception('batch size in larger than all examples')
batch_data = self._data[self._indicator:end_indicator]
batch_labels = self._labels[s