最近在搞ML,很多都是在anaconda上使用数据集,找了很久才找用python的方式使用数据,并在pycharm上使用。
转载:https://www.cnblogs.com/jimobuwu/p/9161531.html
先下载数据集(python版本)http://www.cs.toronto.edu/~kriz/cifar.html
解压后将文件夹放在程序所在的目录下。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import print_function
from six.moves import cPickle as pickle
import numpy as np
import os
from scipy.misc import imread
import platform
import random
# 读取文件
def load_pickle(f):
version = platform.python_version_tuple() # 取python版本号
if version[0] == '2':
return pickle.load(f) # pickle.load, 反序列化为python的数据类型
elif version[0] == '3':
return pickle.load(f, encoding='latin1')
#这里去掉encoding='latin1'也会得到相同的结果,对于编码不是很了解
raise ValueError("invalid python version: {}".format(version))
def load_CIFAR_batch(filename):
""" load single batch of cifar """
with open(filename, 'rb') as f:
datadict = load_pickle(f) # dict类型
X = datadict['data'] # X, ndarray, 像素值
Y = datadict['labels'] # Y, list, 标签, 分类
# reshape, 一维数组转为矩阵10000行3列。每个entries是32x32
# transpose,转置
# astype,复制,同时指定类型
X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
Y = np.array(Y)
return X, Y
def load_CIFAR10(ROOT):
""" load all of cifar """
xs = [] # list
ys = []
X = None
Y = None
# 训练集batch 1~5
for b in range(1, 6):
f = os.path.join(ROOT, 'data_batch_%d' % (b,))
X, Y = load_CIFAR_batch(f)
xs.append(X) # 在list尾部添加对象X, x = [..., [X]]
ys.append(Y)
Xtr = np.concatenate(xs) # [ndarray, ndarray] 合并为一个ndarray
Ytr = np.concatenate(ys)
del X, Y
# 测试集
Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
return Xtr, Ytr, Xte, Yte
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py' #这里是下载数据集的地址
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)