This post improves a neural network's prediction accuracy on the MNIST dataset.
1. Standardizing the sample data
For MNIST, every pixel is a single byte, so all the inputs already share the same range and there is no need for feature scaling (rescaling the inputs into a similar range).
To bring the input values close to zero, we standardize them instead. Standardization can be summed up as: "rescale the inputs so that they have a mean of 0 and a standard deviation of 1". The standard deviation measures how spread out a variable's distribution is.
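As a quick illustration (a minimal sketch, not part of the original code; the pixel values are made up), standardizing a small array brings its mean to 0 and its standard deviation to 1:

import numpy as np

# Hypothetical pixel values in the 0-255 range
pixels = np.array([0.0, 64.0, 128.0, 192.0, 255.0])
standardized = (pixels - np.average(pixels)) / np.std(pixels)
print(np.average(standardized))  # ~0.0 (up to floating-point error)
print(np.std(standardized))      # 1.0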
2. Tuning the hyperparameters
In this example, the hyperparameters are the number of epochs (epochs), the number of hidden nodes (n_hidden_nodes), and the learning rate (lr).
A common way to pick the number of epochs: start from a fairly high number, then look for the point where accuracy levels off.
More hidden nodes make training slower, but they also make the network more flexible when it has to fit messy data.
A smaller learning rate also slows training down, but the smaller steps help the network get closer to the minimum of the loss, so it is worth trying a few values. For the hidden nodes and the learning rate, I wrote a Python program that searches within a fixed range; it is not shown here.
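For reference, such a search could look roughly like the sketch below (my own illustration, not the author's program; the candidate values, the short epoch count, and the use of the validation set defined in the next section are all assumptions):

import mnist_standardized as mns
import neural_network as nn

# Try a few combinations and keep the one that does best on the
# validation set (the candidate values are arbitrary examples)
for n_hidden_nodes in [100, 400, 1200]:
    for lr in [0.1, 0.8, 1.0]:
        print("n_hidden_nodes=%d, lr=%.1f" % (n_hidden_nodes, lr))
        nn.train(mns.X_train, mns.Y_train,
                 mns.X_validation, mns.Y_validation,
                 n_hidden_nodes=n_hidden_nodes, epochs=2,
                 batch_size=256, lr=lr)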
3. The code
mnist_standardized.py
import numpy as np
import struct
import gzip

# Load the images
def load_images(filename):
    # Open and decompress the file
    with gzip.open(filename, 'rb') as f:
        # Read the header; struct.unpack() decodes binary data according
        # to a format string ('>IIII' means four big-endian unsigned ints)
        _ignored, n_images, columns, rows = struct.unpack('>IIII', f.read(16))
        # Read all the pixels into a NumPy array of bytes
        all_pixels = np.frombuffer(f.read(), dtype=np.uint8)
        # Reshape the pixels into a matrix where each row is an image
        return all_pixels.reshape(n_images, columns * rows)
# Compute the mean and standard deviation of the training set, then use
# them to standardize the training, validation and test sets
def standardize(training_set, test_set):
    average = np.average(training_set)
    standard_deviation = np.std(training_set)
    # Standardize: subtract the training set's mean from each input
    # value, then divide by the training set's standard deviation
    training_set_standardized = (training_set - average) / standard_deviation
    test_set_standardized = (test_set - average) / standard_deviation
    return (training_set_standardized, test_set_standardized)
X_train_raw = load_images("../data/mnist/train-images-idx3-ubyte.gz")
X_test_raw = load_images("../data/mnist/t10k-images-idx3-ubyte.gz")
X_train, X_test_all = standardize(X_train_raw, X_test_raw)
X_validation, X_test = np.split(X_test_all, 2)
# Load the labels
def load_labels(filename):
    with gzip.open(filename, 'rb') as f:
        # Skip the header bytes
        f.read(8)
        # Read all the labels into a buffer
        all_labels = f.read()
        # Reshape the labels into a one-column matrix
        return np.frombuffer(all_labels, dtype=np.uint8).reshape(-1, 1)
def one_hot_encode(Y):
    n_labels = Y.shape[0]
    n_classes = 10
    encoded_Y = np.zeros((n_labels, n_classes))
    for i in range(n_labels):
        label = Y[i]
        encoded_Y[i][label] = 1
    return encoded_Y
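# Example (added note, not in the original code): one_hot_encode applied
# to the single label 3 yields a 1x10 row with a 1 in column 3:
#     one_hot_encode(np.array([[3]]))  ->  [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]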
Y_train_unencoded = load_labels("../data/mnist/train-labels-idx1-ubyte.gz")
Y_train = one_hot_encode(Y_train_unencoded)
Y_test_all = load_labels("../data/mnist/t10k-labels-idx1-ubyte.gz")
Y_validation, Y_test = np.split(Y_test_all, 2)
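As a sanity check (a small sketch of my own, not in the original code), the loaded arrays should come out with 60,000 training images of 784 pixels each, and the 10,000 test images split evenly into a validation set and a test set:

print(X_train.shape)       # (60000, 784)
print(Y_train.shape)       # (60000, 10)
print(X_validation.shape)  # (5000, 784)
print(X_test.shape)        # (5000, 784)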
neural_network.py
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def softmax(logits):
    exponentials = np.exp(logits)
    return exponentials / np.sum(exponentials, axis=1).reshape(-1, 1)
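# Note (added, not in the original code): np.exp() can overflow for very
# large logits. A numerically stabler softmax subtracts the row-wise
# maximum before exponentiating:
#     shifted = logits - np.max(logits, axis=1).reshape(-1, 1)
#     return np.exp(shifted) / np.sum(np.exp(shifted), axis=1).reshape(-1, 1)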
# Compute the gradient of the sigmoid from the sigmoid's own output;
# used below when computing the gradients of w1 and w2
def sigmoid_gradient(sigmoid):
    return np.multiply(sigmoid, (1 - sigmoid))
# Cross-entropy loss, averaged over the examples
def loss(Y, y_hat):
    return -np.sum(Y * np.log(y_hat)) / Y.shape[0]
# Insert a column of 1s (the bias column) in front of X
def prepend_bias(X):
    return np.insert(X, 0, 1, axis=1)
# Split the training set into batches
def prepare_batches(X_train, Y_train, batch_size):
    x_batches = []
    y_batches = []
    n_examples = X_train.shape[0]
    for batch in range(0, n_examples, batch_size):
        batch_end = batch + batch_size
        x_batches.append(X_train[batch:batch_end])
        y_batches.append(Y_train[batch:batch_end])
    return x_batches, y_batches
def forward(X, w1, w2):
    h = sigmoid(np.matmul(prepend_bias(X), w1))
    y_hat = softmax(np.matmul(prepend_bias(h), w2))
    return (y_hat, h)
# Backpropagation: compute the gradients of the loss with respect
# to w1 and w2
def back(X, Y, y_hat, w2, h):
    w2_gradient = np.matmul(prepend_bias(h).T, (y_hat - Y)) / X.shape[0]
    w1_gradient = np.matmul(prepend_bias(X).T,
                            np.matmul(y_hat - Y, w2[1:].T)
                            * sigmoid_gradient(h)) / X.shape[0]
    return (w1_gradient, w2_gradient)
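# Shape check (added note, not in the original code): with m examples,
# n input variables, k hidden nodes and 10 classes, prepend_bias(h).T is
# (k + 1, m) and (y_hat - Y) is (m, 10), so w2_gradient is (k + 1, 10),
# matching w2; likewise w1_gradient is (n + 1, k), matching w1.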
def classify(X, w1, w2):
    y_hat, _ = forward(X, w1, w2)
    labels = np.argmax(y_hat, axis=1)
    return labels.reshape(-1, 1)
# Initialize the weights: draw them from a standard normal distribution,
# then scale them by sqrt(1 / r), where r is the number of rows of the
# weight matrix, so that their standard deviation becomes sqrt(1 / r)
def initialize_weights(n_input_variables, n_hidden_nodes, n_classes):
    w1_rows = n_input_variables + 1
    w1 = np.random.randn(w1_rows, n_hidden_nodes) * np.sqrt(1 / w1_rows)
    w2_rows = n_hidden_nodes + 1
    w2 = np.random.randn(w2_rows, n_classes) * np.sqrt(1 / w2_rows)
    return (w1, w2)
# Print the training loss and the test accuracy for the current weights
def report(epoch, batch, X_train, Y_train, X_test, Y_test, w1, w2):
    y_hat, _ = forward(X_train, w1, w2)
    training_loss = loss(Y_train, y_hat)
    classifications = classify(X_test, w1, w2)
    accuracy = np.average(classifications == Y_test) * 100.0
    print("%5d-%d, Loss: %.8f, Accuracy: %.2f%%" %
          (epoch, batch, training_loss, accuracy))
def train(X_train, Y_train, X_test, Y_test, n_hidden_nodes, epochs,
          batch_size, lr):
    n_input_variables = X_train.shape[1]
    n_classes = Y_train.shape[1]
    w1, w2 = initialize_weights(n_input_variables, n_hidden_nodes, n_classes)
    x_batches, y_batches = prepare_batches(X_train, Y_train, batch_size)
    # One epoch is a full pass over all the batches of the training set
    for epoch in range(epochs):
        # One step of gradient descent on a single batch
        for batch in range(len(x_batches)):
            y_hat, h = forward(x_batches[batch], w1, w2)
            w1_gradient, w2_gradient = back(x_batches[batch],
                                            y_batches[batch], y_hat, w2, h)
            w1 = w1 - (w1_gradient * lr)
            w2 = w2 - (w2_gradient * lr)
            report(epoch, batch, X_train, Y_train, X_test, Y_test, w1, w2)
    return (w1, w2)
Running the test:
import mnist_standardized as mns
import neural_network as nn
nn.train(mns.X_train, mns.Y_train, mns.X_test, mns.Y_test, n_hidden_nodes=100, epochs=10, batch_size=256, lr=1)
The training results:
Now tweak neural_network.py and the test code to improve the accuracy. Following the tuning guidelines above, the new call uses more hidden nodes, more epochs, a larger batch size, and a slightly smaller learning rate:
import mnist_standardized as mns
import neural_network as nn
nn.train(mns.X_train, mns.Y_train, mns.X_test, mns.Y_test, n_hidden_nodes=1200, epochs=100, batch_size=600, lr=0.8)
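Since train returns the final weights, they can also be checked afterwards against the held-out validation set (a small sketch of my own, not part of the original post):

import numpy as np
import mnist_standardized as mns
import neural_network as nn

w1, w2 = nn.train(mns.X_train, mns.Y_train, mns.X_test, mns.Y_test,
                  n_hidden_nodes=1200, epochs=100, batch_size=600, lr=0.8)
predictions = nn.classify(mns.X_validation, w1, w2)
accuracy = np.average(predictions == mns.Y_validation) * 100.0
print("Validation accuracy: %.2f%%" % accuracy)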
References:
Paolo Perrotta. Programming Machine Learning: From Coding to Deep Learning. June 2021.