由于使用 tensorflow.keras 加载 Fashion-MNIST 数据集时每次都需要在线下载,因此改为加载本地已经下载好的 .gz 格式数据集(如图所示)。
加载数据集的程序如下:
其中标注了"有疑问处"的几行不太理解:用 gzip 解压后,使用 numpy.frombuffer 读取数据时,为什么读取图像样本需要设置 offset=16,而读取标签需要设置 offset=8?
def _read_idx(path, expected_ndim):
    """Read one gzip-compressed IDX file of unsigned bytes into an array.

    An IDX file starts with a 4-byte magic number: two zero bytes, one
    data-type code (0x08 means unsigned byte) and one byte giving the
    number of dimensions.  The magic number is followed by one 4-byte
    big-endian integer per dimension, and only then by the raw data.
    The header is therefore ``4 + 4 * ndim`` bytes long:

    * image files (3 dimensions: count, rows, cols) -> 16 bytes
    * label files (1 dimension: count)              ->  8 bytes

    Args:
        path: Path of the ``.gz`` file to read.
        expected_ndim: Number of dimensions the file must declare.

    Returns:
        A read-only ``uint8`` array shaped as declared in the header.

    Raises:
        ValueError: If the header is malformed, declares another data
            type or dimensionality, or the payload size does not match
            the declared shape.
    """
    with gzip.open(path, "rb") as stream:
        raw = stream.read()

    if len(raw) < 4 or raw[0] != 0 or raw[1] != 0:
        raise ValueError(f"{path}: not an IDX file (bad magic number)")
    dtype_code, ndim = raw[2], raw[3]
    if dtype_code != 0x08:
        raise ValueError(
            f"{path}: unsupported IDX data type 0x{dtype_code:02x}, "
            "expected 0x08 (unsigned byte)"
        )
    if ndim != expected_ndim:
        raise ValueError(
            f"{path}: expected {expected_ndim} dimension(s), found {ndim}"
        )

    # This is where the offsets 16 (images) and 8 (labels) come from.
    header_size = 4 + 4 * ndim
    if len(raw) < header_size:
        raise ValueError(f"{path}: truncated IDX header")

    # Dimension sizes are stored as big-endian unsigned 32-bit integers.
    dims = tuple(
        int(d) for d in np.frombuffer(raw, dtype=">u4", count=ndim, offset=4)
    )
    item_count = 1
    for dim in dims:
        item_count *= dim
    if len(raw) - header_size != item_count:
        raise ValueError(
            f"{path}: payload has {len(raw) - header_size} byte(s), "
            f"but the header declares shape {dims}"
        )

    payload = np.frombuffer(
        raw, dtype=np.uint8, count=item_count, offset=header_size
    )
    return payload.reshape(dims)


def get_data(data_dir=r"F:/Data/fashion_minist"):
    """Load the Fashion-MNIST dataset from local gzip-compressed IDX files.

    The four standard files (``train-images-idx3-ubyte.gz``,
    ``train-labels-idx1-ubyte.gz``, ``t10k-images-idx3-ubyte.gz`` and
    ``t10k-labels-idx1-ubyte.gz``) are read from ``data_dir``.  The IDX
    header of each file is parsed and validated rather than skipped with
    a hard-coded offset, so the array shapes come from the files.

    Args:
        data_dir: Directory containing the four ``.gz`` files.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where the ``x`` arrays
        are ``uint8`` images of shape ``(n, rows, cols)`` and the ``y``
        arrays are ``uint8`` labels of shape ``(n,)``.

    Raises:
        OSError: If a file is missing or is not valid gzip data.
        ValueError: If a file is not a well-formed IDX file of the
            expected kind.
    """
    import os  # local import: only needed here to build the file paths

    def load(file_name, ndim):
        return _read_idx(os.path.join(data_dir, file_name), ndim)

    x_train = load("train-images-idx3-ubyte.gz", 3)
    y_train = load("train-labels-idx1-ubyte.gz", 1)
    x_test = load("t10k-images-idx3-ubyte.gz", 3)
    y_test = load("t10k-labels-idx1-ubyte.gz", 1)
    return (x_train, y_train), (x_test, y_test)