初始的代码如下:
import numpy as np
import math
import struct
import copy
from pathlib import Path
# --- Activation functions and their derivatives ---
# NOTE(review): this quoted listing's indentation was flattened by extraction;
# the def bodies below must be indented in the runnable version.
def tanh(x):
    # Elementwise hyperbolic tangent activation.
    return np.tanh(x)
def softmax(x):
    # Numerically stable softmax: shift by the max before exponentiating.
    exp = np.exp(x - x.max())
    return exp / np.sum(exp)
def d_softmax(data):
    # Jacobian of softmax at `data`: diag(s) - outer(s, s).
    sm=softmax(data)
    return np.diag(sm)-np.outer(sm,sm)
def d_tanh(data):
    # Derivative of tanh: sech^2(x) = 1/cosh(x)^2.
    return 1/(np.cosh(data))**2
# Network layout: 784 input pixels -> 10 class scores (single dense layer).
dimension = [28*28, 10]
# Per-layer activations, and a lookup from activation to its derivative.
activation = [tanh, softmax]
differential={softmax:d_softmax,tanh:d_tanh}
# Init ranges: biases start at 0; weights use Xavier/Glorot uniform limits
# +/- sqrt(6 / (fan_in + fan_out)).
distribution = [
    {'b': [0, 0]},
    {'b': [0, 0], 'w': [-math.sqrt(6 / (dimension[0] + dimension[1])), math.sqrt(6 / (dimension[0] + dimension[1]))]}
]
def init_parameters_b(layer):
    # Bias init: uniform over the configured [low, high) range for this layer.
    dist = distribution[layer]['b']
    return np.random.rand(dimension[layer]) * (dist[1] - dist[0]) + dist[0]
def init_parameters_w(layer):
    # Weight init: uniform (fan_in x fan_out) matrix over the configured range.
    dist = distribution[layer]['w']
    return np.random.rand(dimension[layer-1], dimension[layer]) * (dist[1] - dist[0]) + dist[0]
def init_parameters():
    # Assemble one parameter dict per layer from `distribution`.
    parameter = []
    for i in range(len(distribution)):
        layer_parameter = {}
        for j in distribution[i].keys():
            if j == 'b':
                layer_parameter['b'] = init_parameters_b(i)
            if j == 'w':
                layer_parameter['w'] = init_parameters_w(i)
        parameter.append(layer_parameter)
    return parameter
parameters = init_parameters()
def predict(img, parameters):
    # Forward pass: per-pixel bias + tanh, then a dense layer + softmax.
    l0_in = img + parameters[0]['b']
    l0_out = activation[0](l0_in)
    l1_in = np.dot(l0_out, parameters[1]['w']) + parameters[1]['b']
    l1_out = activation[1](l1_in)
    return l1_out
# Dataset locations. NOTE(review): '.Mnist' is a hidden directory relative to
# the CWD, and 'train-label.idx1-ubyte' differs from the standard MNIST file
# name 'train-labels.idx1-ubyte' -- verify against the local files (the prose
# at the end says an absolute path is used when actually run).
dataset_path=Path('.Mnist')
train_img_path=dataset_path/'train-images.idx3-ubyte'
train_label_path=dataset_path/'train-label.idx1-ubyte'
test_img_path=dataset_path/'t10k-images.idx3-ubyte'
test_label_path=dataset_path/'t10k-labels.idx1-ubyte'
# Fraction of the training file kept for training; the rest is validation.
train_radio=0.9
def load_dataset(img_path,label_path):
    # Read one MNIST idx image/label file pair, skipping the big-endian headers.
    with open(img_path, 'rb') as f1:
        struct.unpack('>4i',f1.read(16))  # magic, count, rows, cols (discarded)
        imgs=np.fromfile(f1,dtype=np.uint8).reshape(-1,28*28)
    with open(label_path, 'rb') as f2:
        struct.unpack('>2i',f2.read(8))  # magic, count (discarded)
        label=np.fromfile(f2,dtype=np.uint8)
    return imgs,label
train_img, train_label = load_dataset(train_img_path,train_label_path)
# NOTE(review): 'tset_img' is a typo for 'test_img' (corrected in the revised
# listing further down).
tset_img,test_label=load_dataset(test_img_path,test_label_path)
# NOTE(review): train_img/train_label are truncated BEFORE the validation
# slices are taken, so the validation rows come out of the already-kept
# training rows (data leakage) -- the split index should be computed once on
# the original arrays.
train_img=train_img[:int(train_img.shape[0]*train_radio)]
validation_img=train_img[int(train_img.shape[0]*train_radio):]
train_label=train_label[:int(train_label.shape[0]*train_radio)]
validation_label=train_label[int(train_label.shape[0]*train_radio):]
one_hot=np.identity(10)
def square_loss(image,label,parameter):
    # Squared error between the one-hot target and the prediction.
    y_pred=predict(image,parameter)
    y=one_hot[label]
    return np.dot(y-y_pred,y-y_pred)
# Validation-set loss.
# NOTE(review): in the original source the `return` was indented inside the
# for loop, so the function returned after the first sample (this is the
# indentation bug discussed in the review prose below).
def valid_loss(parameters):
    loss=0
    for i in range (validation_img.shape[0]):
        loss+=square_loss(image=validation_img[i],label=validation_label[i],parameter=parameters)
    return loss/validation_img.shape[0]
# Validation-set accuracy.
def valid_accuracy(parameters):
    correct=[]
    for i in range (validation_img.shape[0]):
        # NOTE(review): `.argmax()` is chained onto list.append(...), which
        # returns None -> AttributeError at runtime. The comparison should be
        # appended instead (corrected in the revised listing).
        correct.append(predict(validation_img[i],parameters)).argmax()==validation_label[i]
    return correct.count(True)/len(correct)
# Gradients via backprop through the two-layer net (squared-error loss).
def grad_parameters(img,label,parameters):
    L0_in=img+parameters[0]['b']
    L0_out=activation[0](L0_in)
    L1_in=np.dot(L0_out, parameters[1]['w']) + parameters[1]['b']
    L1_out=activation[1](L1_in)
    diff=one_hot[label]-L1_out
    act_1=differential[activation[0]](L0_in)  # tanh' at layer-0 pre-activation
    act_2=np.dot(differential[activation[1]](L1_in),diff)  # softmax Jacobian @ diff
    grad_b1=-2*act_2
    grad_w1=-2*np.outer(L0_out,act_2)
    grad_b0=-2*act_1*np.dot(parameters[1]['w'],act_2)
    return {'w1':grad_w1,'b1':grad_b1,'b0':grad_b0}
# Per-batch gradient: training sets are large, so gradients are computed batch
# by batch -- average grad_parameters over `batch_size` consecutive samples.
def train_batch(current_batch,parameters):
    grad_accu=grad_parameters(train_img[current_batch*batch_size],train_label[current_batch*batch_size],parameters)
    for i in range(1,batch_size):
        temp=grad_parameters(train_img[current_batch*batch_size+i],train_label[current_batch*batch_size+i],parameters)
        for key in grad_accu.keys():
            grad_accu[key]+=temp[key]
    for key in grad_accu.keys():
        grad_accu[key]=grad_accu[key]/batch_size
    return grad_accu
# Gradient-descent step: parameters minus lr * gradient, applied to a deep
# copy so the caller's parameter list is left untouched.
def combine(parameters,grad,lr=1):
    parameters_temp=copy.deepcopy(parameters)
    parameters_temp[0]['b']=parameters_temp[0]['b']-grad['b0']*lr
    parameters_temp[1]['b']=parameters_temp[1]['b']-grad['b1']*lr
    parameters_temp[1]['w']=parameters_temp[1]['w']-grad['w1']*lr
    return parameters_temp
# Hyperparameters and the SGD training loop; validation metrics are printed
# once per epoch (a full pass over the validation set each time).
batch_size=100
epoch=5
train_num=train_img.shape[0]
for a in range(epoch):
    for i in range(int(train_num/batch_size)):
        grad_temp=train_batch(i,parameters)
        parameters=combine(parameters,grad_temp)
    print("epoch:{:d},valid_acc:{:.3f},valid_loss:{:.3f}".format(a+1,valid_accuracy(parameters),valid_loss(parameters)))
下面是chatgpt修改后的代码:
- 验证集损失函数:`valid_loss` 函数内的缩进有问题,这会导致函数在第一个循环后提前返回。应该将 `return loss/validation_img.shape[0]` 移出 `for` 循环之外。
- 梯度计算的维度问题:在 `grad_parameters` 函数中,`grad_b0=-2*act_1*np.dot(parameters[1]['w'],act_2)` 这一行可能会产生维度不匹配的问题。建议使用 `np.dot(parameters[1]['w'], act_2.T)` 以确保维度匹配。
学习率和参数更新:
- 在
combine
函数中,学习率被固定为lr=1
,但可以考虑将其设为可调参数,以便更灵活地控制训练速度。
- 在
-
代码结构建议:
- 在
train_batch
函数中,循环内的grad_accu
累加操作效率较低。可以直接在循环外初始化grad_accu
为零矩阵,然后在循环内直接累加,避免逐次更新。
- 在
- 代码执行效率:在每个 epoch 之后打印验证精度和损失,但这两个操作是相对昂贵的,因为它们涉及对整个验证集的完整前向传播。可以考虑减少打印频率,或者在最后一个 epoch 结束时再执行一次完整评估。
import numpy as np
import math
import struct
import copy
from pathlib import Path
def tanh(x):
    """Elementwise hyperbolic tangent activation."""
    result = np.tanh(x)
    return result
def softmax(x):
    """Numerically stable softmax: shift by the max before exponentiating."""
    shifted = np.exp(x - x.max())
    return shifted / shifted.sum()
def d_softmax(data):
    """Jacobian of softmax at *data*: diag(s) - outer(s, s)."""
    s = softmax(data)
    jac = np.diag(s) - np.outer(s, s)
    return jac
def d_tanh(data):
    """Derivative of tanh: sech^2(x), i.e. 1 / cosh(x)^2."""
    c = np.cosh(data)
    return 1 / (c * c)
# Network layout: a single dense layer mapping 784 pixels to 10 class scores.
dimension = [28*28, 10]
# Layer activations, and a map from each activation to its derivative.
activation = [tanh, softmax]
differential = {softmax: d_softmax, tanh: d_tanh}
# Parameter init ranges: biases start at 0; weights use Xavier/Glorot uniform
# limits +/- sqrt(6 / (fan_in + fan_out)).
distribution = [
    {'b': [0, 0]},
    {'b': [0, 0], 'w': [-math.sqrt(6 / (dimension[0] + dimension[1])), math.sqrt(6 / (dimension[0] + dimension[1]))]}
]
def init_parameters_b(layer):
    """Draw the bias vector for *layer* uniformly from its configured range."""
    low, high = distribution[layer]['b']
    return np.random.rand(dimension[layer]) * (high - low) + low
def init_parameters_w(layer):
    """Draw the (fan_in, fan_out) weight matrix for *layer* uniformly from its
    configured range."""
    low, high = distribution[layer]['w']
    return np.random.rand(dimension[layer-1], dimension[layer]) * (high - low) + low
def init_parameters():
    """Build the full parameter list, one dict per layer, as described by the
    module-level `distribution` spec ('b' for biases, 'w' for weights)."""
    parameter = []
    for layer, spec in enumerate(distribution):
        layer_parameter = {}
        if 'b' in spec:
            layer_parameter['b'] = init_parameters_b(layer)
        if 'w' in spec:
            layer_parameter['w'] = init_parameters_w(layer)
        parameter.append(layer_parameter)
    return parameter
parameters = init_parameters()
def predict(img, parameters):
    """Forward pass: per-pixel bias + tanh, then a dense layer + softmax.

    Returns the 10-element vector of class probabilities for *img*.
    """
    hidden = activation[0](img + parameters[0]['b'])
    logits = np.dot(hidden, parameters[1]['w']) + parameters[1]['b']
    return activation[1](logits)
# Dataset locations. NOTE(review): '.Mnist' is a hidden directory relative to
# the CWD -- confirm it matches the local layout (the closing prose says an
# absolute path is used when actually run). Also 'train-label.idx1-ubyte'
# differs from the standard MNIST name 'train-labels.idx1-ubyte' -- verify.
dataset_path = Path('.Mnist')
train_img_path = dataset_path / 'train-images.idx3-ubyte'
train_label_path = dataset_path / 'train-label.idx1-ubyte'
test_img_path = dataset_path / 't10k-images.idx3-ubyte'
test_label_path = dataset_path / 't10k-labels.idx1-ubyte'
# Fraction of the training file kept for training; the rest is validation.
train_radio = 0.9
def load_dataset(img_path, label_path):
    """Read an MNIST idx image/label file pair.

    Returns (imgs, labels): imgs is a (N, 784) uint8 array, labels a (N,)
    uint8 array. The big-endian headers (16 bytes for images, 8 for labels)
    are read and discarded.
    """
    with open(img_path, 'rb') as img_file:
        struct.unpack('>4i', img_file.read(16))  # magic, count, rows, cols
        imgs = np.fromfile(img_file, dtype=np.uint8).reshape(-1, 28*28)
    with open(label_path, 'rb') as label_file:
        struct.unpack('>2i', label_file.read(8))  # magic, count
        labels = np.fromfile(label_file, dtype=np.uint8)
    return imgs, labels
train_img, train_label = load_dataset(train_img_path, train_label_path)
test_img, test_label = load_dataset(test_img_path, test_label_path)
# Fix: compute the split index ONCE on the original arrays and take the
# validation slice BEFORE truncating. The previous code truncated train_img
# first and then sliced validation out of the already-truncated array, so the
# validation set was a subset of the training set (data leakage) and the
# train/validation proportions were wrong.
split = int(train_img.shape[0] * train_radio)
validation_img = train_img[split:]
validation_label = train_label[split:]
train_img = train_img[:split]
train_label = train_label[:split]
# One-hot lookup table: one_hot[label] is the 10-element target vector.
one_hot = np.identity(10)
def square_loss(image, label, parameter):
    """Squared-error loss between the one-hot target for *label* and the
    network's prediction for *image*."""
    residual = one_hot[label] - predict(image, parameter)
    return np.dot(residual, residual)
def valid_loss(parameters):
    """Mean squared-error loss over the whole validation set."""
    total = sum(
        square_loss(image=validation_img[i], label=validation_label[i], parameter=parameters)
        for i in range(validation_img.shape[0])
    )
    return total / validation_img.shape[0]
def valid_accuracy(parameters):
    """Fraction of validation images whose argmax prediction matches the label."""
    hits = 0
    for i in range(validation_img.shape[0]):
        if predict(validation_img[i], parameters).argmax() == validation_label[i]:
            hits += 1
    return hits / validation_img.shape[0]
def grad_parameters(img, label, parameters):
    """Backprop through the two-layer net for one (img, label) sample.

    Returns a dict with gradients of the squared-error loss:
    'w1' for the dense weights, 'b1' for the output bias, 'b0' for the
    per-pixel input bias.
    """
    l0_in = img + parameters[0]['b']
    l0_out = activation[0](l0_in)
    l1_in = np.dot(l0_out, parameters[1]['w']) + parameters[1]['b']
    l1_out = activation[1](l1_in)
    # d(loss)/d(output) of squared error is -2 * (target - output); the -2 is
    # applied below, so `diff` is just (target - output).
    diff = one_hot[label] - l1_out
    # Chain rule through softmax: Jacobian @ diff.
    delta1 = np.dot(differential[activation[1]](l1_in), diff)
    grad_b1 = -2 * delta1
    grad_w1 = -2 * np.outer(l0_out, delta1)
    # Propagate through the weights, then through the tanh derivative.
    grad_b0 = -2 * differential[activation[0]](l0_in) * np.dot(parameters[1]['w'], delta1.T)
    return {'w1': grad_w1, 'b1': grad_b1, 'b0': grad_b0}
def train_batch(current_batch, parameters):
    """Average the per-sample gradients over one mini-batch of training data.

    *current_batch* indexes consecutive, non-overlapping slices of
    `batch_size` samples from the module-level training arrays.
    """
    start = current_batch * batch_size
    grad_accu = {
        'w1': np.zeros_like(parameters[1]['w']),
        'b1': np.zeros_like(parameters[1]['b']),
        'b0': np.zeros_like(parameters[0]['b']),
    }
    for offset in range(batch_size):
        sample_grad = grad_parameters(train_img[start + offset], train_label[start + offset], parameters)
        for key in grad_accu:
            grad_accu[key] += sample_grad[key]
    for key in grad_accu:
        grad_accu[key] /= batch_size
    return grad_accu
def combine(parameters, grad, lr=1):
    """Return a new parameter list stepped against *grad* by learning rate *lr*.

    The input *parameters* is deep-copied first and left untouched.
    """
    updated = copy.deepcopy(parameters)
    updated[0]['b'] -= grad['b0'] * lr
    updated[1]['b'] -= grad['b1'] * lr
    updated[1]['w'] -= grad['w1'] * lr
    return updated
# Training hyperparameters and the SGD loop: one averaged-gradient step per
# mini-batch, with validation metrics printed after every epoch.
batch_size = 100
epoch = 5
train_num = train_img.shape[0]
for a in range(epoch):
    for i in range(int(train_num / batch_size)):
        grad_temp = train_batch(i, parameters)
        parameters = combine(parameters, grad_temp)
    # A full forward pass over the validation set -- relatively expensive.
    print("epoch:{:d}, valid_acc:{:.3f}, valid_loss:{:.3f}".format(a+1, valid_accuracy(parameters), valid_loss(parameters)))
但是这篇代码依旧没有可视化,不够明显,于是再添上:
def plot_image(image, result, confidence, correct):
    """Display one 28x28 test image titled with its predicted class, the
    prediction's confidence, and whether the prediction was correct.

    Fix: the original body referenced `plt`, but matplotlib is never imported
    anywhere in the file -- the function would raise NameError on first call.
    The import is done locally so the rest of the script runs without
    matplotlib installed until plotting is actually requested.
    """
    import matplotlib.pyplot as plt
    image = image.reshape(28, 28)  # flat 784-vector -> 28x28 grid
    plt.imshow(image, cmap="gray")  # grayscale rendering
    title = f"Result: {result} | Confidence: {round(confidence, 2)} | {'Correct' if correct else 'Incorrect'}"
    plt.title(title)
    plt.axis('off')  # hide the axes; only the digit matters
    plt.show()
# Sample 20 random test images without replacement, predict each, and show it
# with plot_image; finally report how many were classified correctly.
num_images = 20
test_indices = np.random.choice(test_img.shape[0], num_images, replace=False)
correct_count = 0
for idx in test_indices:
    test_pre = predict(test_img[idx], parameters)
    test_pre = test_pre.reshape(10,)  # ensure a flat 10-vector of class scores
    test_result = test_pre.argmax()  # predicted digit
    test_confidence = test_pre[test_result]  # softmax score of that digit
    is_correct = (test_result == test_label[idx])
    correct_count += is_correct  # numpy bool adds as 0/1
    plot_image(test_img[idx], test_result, test_confidence, is_correct)
print(f"Correct predictions: {correct_count}/{num_images}")
调用时,数据集路径选择的是本地的绝对路径。