1、说明
数据集下载地址:http://yann.lecun.com/exdb/mnist/
参考的官网地址:http://www.tensorfly.cn/tfdoc/tutorials/mnist_beginners.html
2、关于程序的说明
整个程序用的是最简单的神经网络。
其中decode_idx3_ubyte(idx3_ubyte_file)和decode_idx1_ubyte(idx1_ubyte_file)用来解析原始的数据集。这里返回的images的维度和labels的维度分别是:
print(type(train_images))
print(len(train_images))
print(len(train_images[0]))
print(type(train_labels))
print(len(train_labels))
print(len(train_labels[0]))
print("----------------")
print(type(test_images))
print(len(test_images))
print(len(test_images[0]))
print(type(test_labels))
print(len(test_labels))
print(len(test_labels[0]))
训练集一共是60000张图片,784是28*28,10表示标签向量的长度是10,因为有10个类别。
测试集一共是10000张图片
<class 'numpy.ndarray'>
60000
784
<class 'numpy.ndarray'>
60000
10
----------------
<class 'numpy.ndarray'>
10000
784
<class 'numpy.ndarray'>
10000
10
解析图片数据集的程序的参考博客:https://www.jianshu.com/p/84f72791806f ,感谢!
3、整个完整的程序,修改下数据集地址后可以直接复制到编译器中运行。
import tensorflow as tf
import struct
import numpy as np
# 训练集文件
train_images_idx3_ubyte_file = './data/train-images.idx3-ubyte'
# 训练集标签文件
train_labels_idx1_ubyte_file = './data/train-labels.idx1-ubyte'
# 测试集文件
test_images_idx3_ubyte_file = './data/t10k-images.idx3-ubyte'
# 测试集标签文件
test_labels_idx1_ubyte_file = './data/t10k-labels.idx1-ubyte'
def decode_idx3_ubyte(idx3_ubyte_file):
"""
解析idx3文件的通用函数
:param idx3_ubyte_file: idx3文件路径
:return: 数据集
"""
# 读取二进制数据
bin_data = open(idx3_ubyte_file, 'rb').read()
# 解析文件头信息,依次为魔数、图片数量、每张图片高、每张图片宽
offset = 0
fmt_header = '>iiii'
magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, offset)
print('魔数:%d, 图片数量: %d张, 图片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))
# 解析数据集
image_size = num_rows * num_cols
offset += struct.calcsize(fmt_header)
fmt_image = '>' + str(image_size) + 'B'
# images = np.empty((num_images, num_rows, num_cols))
images = np.empty((num_images,784))
for i in range(num_images):
if (i + 1) % 10000 == 0:
print('已解析 %d' % (i + 1) + '张')
# images[i] = np.array(struct.unpack_from(fmt_image, bin_data, offset)).reshape((num_rows, num_cols))
images[i] = np.array(struct.unpack_from(fmt_image, bin_data, offset))
offset += struct.calcsize(fmt_image)
return images
def decode_idx1_ubyte(idx1_ubyte_file):
"""
解析idx1文件的通用函数
:param idx1_ubyte_file: idx1文件路径
:return: 数据集
"""
# 读取二进制数据
bin_data = open(idx1_ubyte_file, 'rb').read()
# 解析文件头信息,依次为魔数和标签数
offset = 0
fmt_header = '>ii'
magic_number, num_images = struct.unpack_from(fmt_header, bin_data, offset)
print('魔数:%d, 图片数量: %d张' % (magic_number, num_images))
# 解析数据集
offset += struct.calcsize(fmt_header)
fmt_image = '>B'
labels = np.empty(num_images)
for i in range(num_images):
if (i + 1) % 10000 == 0:
print('已解析 %d' % (i + 1) + '张')
labels[i] = struct.unpack_from(fmt_image, bin_data, offset)[0]
offset += struct.calcsize(fmt_image)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
# enc.fit(train_labels.reshape(-1,1))
labels = enc.fit_transform(labels.reshape(-1, 1)).toarray()
# print(labels)
return labels
def next_batch(train_data, train_target, batch_size): # 随机选取一个batch_size大小的数据集
index = [ i for i in range(0,len(train_target)) ]
np.random.shuffle(index)
batch_data = []
batch_target = []
for i in range(0,batch_size):
batch_data.append(train_data[index[i]])
batch_target.append(train_target[index[i]])
return batch_data, batch_target
train_images = decode_idx3_ubyte(train_images_idx3_ubyte_file)
train_labels = decode_idx1_ubyte(train_labels_idx1_ubyte_file)
test_images = decode_idx3_ubyte(test_images_idx3_ubyte_file)
test_labels = decode_idx1_ubyte(test_labels_idx1_ubyte_file)
# 设置权重weights和偏置biases作为优化变量,初始值设为0
weights = tf.Variable(tf.zeros([784, 10]))
biases = tf.Variable(tf.zeros([10]))
# 构建模型
x = tf.placeholder("float", [None, 784])
y = tf.nn.softmax(tf.matmul(x, weights) + biases) # 模型的预测值
y_real = tf.placeholder("float", [None, 10]) # 真实值
cross_entropy = -tf.reduce_sum(y_real * tf.log(y)) # 预测值与真实值的交叉熵
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) # 使用梯度下降优化器最小化交叉熵
# 开始训练
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
for i in range(1000):
batch_xs, batch_ys = next_batch(train_images,train_labels,100) # 每次随机选取100个数据进行训练,即所谓的“随机梯度下降(Stochastic Gradient Descent,SGD)”
# from sklearn import preprocessing
# batch_xs = preprocessing.scale(batch_xs) # 这样处理batch_xs就有问题,不知道为什么???
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() # 注意这里一定要对训练集数据进行归一化,否则训练模型的准确率会很低
batch_xs = scaler.fit_transform(batch_xs)
sess.run(train_step, feed_dict={x: batch_xs, y_real:batch_ys}) # 正式执行train_step,用feed_dict的数据取代placeholder
if i % 100 == 0:
# 每训练100次后用测试集评估模型
correct_prediction = tf.equal(tf.argmax(y, 1), tf.arg_max(y_real, 1)) # 比较预测值和真实值是否一致
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) # 统计预测正确的个数,取均值得到准确率
test_images = scaler.fit_transform(test_images) # 测试集也要像训练集一样进行归一化
print("times:",i,"accuracy:",sess.run(accuracy, feed_dict={x:test_images, y_real: test_labels}))
准确率结果如下:
times: 0 accuracy: 0.1927
times: 100 accuracy: 0.8979
times: 200 accuracy: 0.9077
times: 300 accuracy: 0.905
times: 400 accuracy: 0.911
times: 500 accuracy: 0.9098
times: 600 accuracy: 0.9149
times: 700 accuracy: 0.9061
times: 800 accuracy: 0.9158
times: 900 accuracy: 0.9076