动手学深度学习（第二版）注释后代码【持续更新】

亦梦亦醒乐逍遥

已于 2022-07-17 16:24:20 修改

阅读量1.1k

点赞数 2

分类专栏：人工智能文章标签：深度学习 python 机器学习

于 2022-07-17 16:24:01 首次发布

本文链接：https://blog.csdn.net/weixin_50295745/article/details/125817707

版权

人工智能专栏收录该内容

10 篇文章 5 订阅

订阅专栏

文章目录

前言
预备知识
线性神经网络

前言

《动手学深度学习》的代码用到了一些 Python 特性和 torch 库函数，但书中并没有逐一解释。我查阅了各个库的官方文档，为代码逐行加上注释，力求让读者能看懂每一步。

预备知识

线性神经网络

线性回归从零实现

# 线性回归从零实现
import random
import torch
from d2l import torch as d2l

# Synthesise a dataset: returns (features, labels)
def synthetic_data(w,b,num_examples):
    """Generate noisy samples of the linear model y = Xw + b.

    Args:
        w: weight tensor; its length sets the number of feature columns.
        b: bias added to every sample.
        num_examples: number of rows to generate.

    Returns:
        A tuple (X, y): X has shape (num_examples, len(w)) and y is reshaped
        into a single column.
    """
    n_features = len(w)
    # Standard-normal inputs
    X = torch.normal(0, 1, (num_examples, n_features))
    clean = torch.matmul(X, w) + b
    # Gaussian observation noise with std 0.01 (the noise model under which
    # minimising squared error is maximum-likelihood estimation)
    noise = torch.normal(0, 0.01, clean.shape)
    y = clean + noise
    return X, y.reshape((-1, 1))

# Ground-truth parameters used to synthesise the dataset
true_w=torch.tensor([2,-3.4]).reshape(-1,1) # column vector; the reshape is optional because matmul also accepts a 1-D weight vector
true_b=4.2
features,labels=synthetic_data(true_w,true_b,1000)
d2l.set_figsize()
d2l.plt.scatter(features[:,1].detach().numpy(), # column index 1, i.e. the SECOND feature; a 2-D scatter can only show one feature against the label
                labels.detach().numpy(),1) # detach() gives a tensor outside the autograd graph before the NumPy conversion; 1 is the marker size

# Read the data in minibatches
def data_iter(batch_size,features,labels):
    """Yield (features, labels) minibatches in a freshly shuffled order.

    Being a generator function, calling it returns a generator; every outer
    iteration resumes the loop below and produces the next minibatch.

    Args:
        batch_size: number of rows per minibatch (the last one may be smaller).
        features: tensor indexed by example along dimension 0.
        labels: tensor indexed the same way as ``features``.
    """
    total = len(features)
    order = list(range(total))
    # Shuffle the indices and then walk them sequentially: this visits the
    # examples in random order without moving the data itself.
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # Slicing past the end of a list is clamped automatically.
        picked = torch.tensor(order[start:start + batch_size])
        # Index both tensors with the same index tensor to keep rows aligned.
        yield features[picked], labels[picked]

# Minibatch size used by the training loop
batch_size=10
#for X,y in data_iter(batch_size,features,labels):
#    print(X,'\n',y)
    
# Initialise the model parameters
w=torch.normal(0,0.001,size=(2,1),requires_grad=True) # small random weights, tracked by autograd
b=torch.zeros(1,requires_grad=True) # zero bias, tracked by autograd

# Model: maps inputs to outputs; the parameters are passed in rather than stored
def linreg(X,w,b):
    """Linear regression forward pass: returns X @ w + b."""
    projection = X @ w
    return projection + b

# Loss function: returns one loss value per element, not a scalar
def squared_loss(y_hat,y):
    """Element-wise halved squared error between predictions and targets.

    ``y`` is reshaped to the shape of ``y_hat`` first so that the subtraction
    never broadcasts a column against a row by accident.
    """
    residual = y_hat - y.reshape(y_hat.shape)
    return residual ** 2 / 2

# Optimisation algorithm
def sgd(params,lr,batch_size):
    """Apply one minibatch SGD step in place, then clear the gradients.

    Args:
        params: iterable of tensors whose ``.grad`` has been populated.
        lr: learning rate.
        batch_size: divisor applied to the gradient (the caller back-propagates
            a summed loss, so this turns it into a per-example average).
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for tensor in params:
            step = lr * tensor.grad / batch_size
            tensor.sub_(step)
            # Gradients accumulate across backward() calls, so reset them.
            tensor.grad.zero_()

# Training
lr=0.03
num_epochs=3
net=linreg # model used for the forward pass
loss=squared_loss # loss function

for epoch in range(num_epochs):
    for X,y in data_iter(batch_size,features,labels): # one minibatch per iteration
        l=loss(net(X,w,b),y) # forward pass: per-example losses
        l.sum().backward() # backward pass: fills .grad on w and b (created with requires_grad=True)
        sgd([w,b],lr,batch_size) # update the parameters from the stored gradients;
        # sgd also resets the gradients afterwards
    with torch.no_grad(): # report the loss over the whole dataset for this epoch, without tracking gradients
        train_l=loss(net(features,w,b),labels)
        print(f'epoch {epoch+1}, loss {float(train_l.mean()):f}')

# Estimation error of the learned parameters against the ground truth
print(f'w的估计误差: {true_w - w.reshape(true_w.shape)}')
print(f'b的估计误差: {true_b - b}')

线性回归简洁实现

# 线性回归简洁实现

import numpy as np
import torch
from torch.utils import data
from d2l import torch as d2l
from torch import nn

# Generate the data with the d2l helper
true_w=torch.tensor([2,-3.4])
true_b=4.2
features,labels=d2l.synthetic_data(true_w,true_b,1000)
# Read the data with TensorDataset + DataLoader, which separates the dataset from how it is read
def load_array(data_arrays,batch_size,is_train=True):
    """Wrap tensors in a DataLoader that yields minibatches.

    Args:
        data_arrays: tuple of tensors sharing the same first dimension.
        batch_size: number of examples per minibatch.
        is_train: when True the data is reshuffled on every pass.

    Returns:
        A ``DataLoader`` (a re-iterable object, not a one-shot generator).
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    print(tensor_dataset)
    loader = data.DataLoader(tensor_dataset, batch_size, shuffle=is_train)
    return loader
batch_size=10
data_iter=load_array((features,labels),batch_size)
# Model: nn.Sequential is a container to which layers can be added
net=nn.Sequential(nn.Linear(2,1)) # a network with a single fully connected layer: 2 inputs, 1 output
# Parameter initialisation: indexing a Sequential selects one of its layers
net[0].weight.data.normal_(0,0.01) # weight: in-place normal initialisation (mean 0, std 0.01)
net[0].bias.data.fill_(0) # bias: filled with a constant
# Loss function, provided as a class
loss=nn.MSELoss()# an MSELoss instance; it is called like a function below
# Optimiser: SGD bound to the parameters of net. The minibatch gradient is an
# unbiased estimate of the full-dataset gradient, so it is a valid approximation;
# the added randomness may help to leave poor local optima (not a guarantee).
trainer=torch.optim.SGD(net.parameters(),lr=0.03)
# Training loop
num_epochs=3
for epoch in range(num_epochs):
    for X,y in data_iter:
        l=loss(net(X),y) # forward pass + loss
        trainer.zero_grad() # clear the gradients left from the previous step
        l.backward() # backward pass
        trainer.step() # the optimiser updates the parameters by one step using the gradients of the loss
    # Loss over the full dataset after this epoch
    l=loss(net(features),labels)
    print(f'epoch {epoch+1},loss {l:f}')
print(f'w:{net[0].weight.data},b:{net[0].bias.data}')

softmax从零实现

到这里你会发现，从零实现的难度在逐渐提升：层数变多后，需要处理的参数越来越多，对应的函数也越来越大。一个尤其突出的问题是程序隐式地使用了全局变量——参数太多，不便全部显式传入，例如 W 和 b 都是被函数直接当作全局变量使用的，这让程序存在隐患，也不易阅读。

# softmax从零实现
import torch 
from IPython import display
from d2l import torch as d2l

# Load the data
batch_size=256
train_iter,test_iter=d2l.load_data_fashion_mnist(batch_size)
# Initialise the parameters
num_inputs=28*28 # each image is flattened to a vector of this length by net()
num_outputs=10
W=torch.normal(0,0.01,size=(num_inputs,num_outputs),requires_grad=True)
b=torch.zeros(num_outputs,requires_grad=True)
# Softmax
def softmax(X):
    """Normalise logits into probabilities along dimension 1.

    Args:
        X: tensor with at least two dimensions; dimension 1 indexes the classes.

    Returns:
        Tensor of the same shape as ``X`` whose entries along dimension 1 are
        non-negative and sum to 1.
    """
    # exp() overflows to inf for large logits, which makes the division below
    # inf / inf = NaN. Softmax is unchanged by subtracting a per-row constant,
    # so shift every row by its maximum before exponentiating.
    shifted = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    partition = X_exp.sum(1, keepdim=True)
    return X_exp / partition  # the kept dimension broadcasts the row sums
# Model
def net(X):
    """Softmax regression forward pass using the module-level W and b.

    The input is first flattened so that every example becomes one row of
    length ``W.shape[0]``; the affine scores are then passed through softmax.
    """
    flat = X.reshape(-1, W.shape[0])
    scores = torch.matmul(flat, W) + b
    return softmax(scores)
# Loss function: the labels are class indices rather than one-hot vectors,
# so the probability of the true class is picked out by indexing
def cross_entropy(y_hat,y):
    """Per-example cross-entropy: minus the log-probability of the true class.

    Args:
        y_hat: predicted probabilities, one row per example.
        y: class index of each example.
    """
    rows = range(len(y_hat))
    # Advanced indexing with paired (row, column) indices selects
    # y_hat[i, y[i]] for every row i.
    true_class_prob = y_hat[rows, y]
    return -torch.log(true_class_prob)

# Accuracy helper (note: it returns the NUMBER of correct predictions, not a ratio)
def accuracy(y_hat,y):
    """Count how many predictions match the labels.

    Args:
        y_hat: either a matrix of per-class scores (reduced with argmax along
            axis 1) or a vector of already-predicted class indices.
        y: true class indices.

    Returns:
        The number of correct predictions as a float.
    """
    is_score_matrix = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    predicted = y_hat.argmax(axis=1) if is_score_matrix else y_hat
    # Cast to the label dtype before comparing, giving a boolean tensor.
    hits = predicted.type(y.dtype) == y
    # Each True counts as 1.
    return float(hits.type(y.dtype).sum())

class Accumulator:  #@save
    """Keep running sums for n quantities."""
    def __init__(self, n):
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        # Lists do not add element-wise with '+', so pair each running total
        # with its increment through zip.
        self.data = [total + float(increment)
                     for total, increment in zip(self.data, args)]

    def reset(self):
        slots = len(self.data)
        self.data = [0.0] * slots

    def __getitem__(self, idx):
        return self.data[idx]
    
def evaluate_accuracy(net,data_iter):
    """Return the fraction of correctly classified examples over data_iter.

    Args:
        net: an ``nn.Module`` or a plain callable mapping a batch to predictions.
        data_iter: iterable of (X, y) batches.
    """
    # Only real modules have an evaluation mode; plain functions skip this.
    if isinstance(net,torch.nn.Module):
        net.eval()
    correct = 0.0  # running number of correct predictions
    seen = 0.0     # running number of examples
    with torch.no_grad():
        for X,y in data_iter:
            correct = correct + float(accuracy(net(X),y))
            seen = seen + float(y.numel())
    # Ratio over all batches
    return correct/seen

# Optimiser
lr=0.1
def updater(batch_size):
    """Run one d2l.sgd step on the module-level W and b with learning rate lr."""
    return d2l.sgd([W,b],lr,batch_size)

# Train for one epoch; returns the training loss and the training accuracy
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train the model for one epoch (defined in Chapter 3).

    Args:
        net: an ``nn.Module`` or a plain callable.
        train_iter: iterable of (X, y) minibatches.
        loss: callable returning one loss value per example.
        updater: either a ``torch.optim.Optimizer`` or a callable that takes
            the batch size and performs the update.

    Returns:
        A tuple (average loss per example, fraction of correct predictions).
    """
    # Switch to training mode; plain functions have no mode to switch.
    if isinstance(net, torch.nn.Module):
        net.train()
    uses_builtin_optimizer = isinstance(updater, torch.optim.Optimizer)
    # Slots: summed training loss, number of correct predictions, examples seen
    metric = Accumulator(3)
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)
        if uses_builtin_optimizer:
            # Built-in optimiser: back-propagate the mean loss.
            updater.zero_grad()
            l.mean().backward()
            updater.step()
        else:
            # Custom updater: back-propagate the summed loss and hand over the
            # batch size. Resetting the gradients is left to the updater.
            l.sum().backward()
            updater(X.shape[0])
        metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    loss_total, num_correct, num_examples = metric[0], metric[1], metric[2]
    return loss_total / num_examples, num_correct / num_examples

# Animated plotting
class Animator:  #@save
    """Plot data incrementally, redrawing the figure each time points are added."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        # Draw several lines incrementally
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ] # wrap the single axes in a list so that self.axes[0] works below
        # The lambda captures the axis settings, so config_axes() can later be
        # called without arguments from another method (add) to re-apply them
        # through d2l.set_axes.
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        # These are not arguments of set_axes, so they are kept on self instead of in the lambda
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Add one data point per line to the figure and redraw it
        if not hasattr(y, "__len__"): # wrap a scalar y so it can be iterated
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"): # repeat a scalar x so x and y have the same length
            x = [x] * n
        if not self.X: # on first use, create n empty lists to collect the x values of each line
            self.X = [[] for _ in range(n)]
        if not self.Y: # likewise for the y values
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)): # walk x and y in lockstep and append to line i
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla() # clear the first axes, the only one that is drawn on
        # Redraw every line with its format string
        for x, y, fmt in zip(self.X, self.Y, self.fmts): # walk the x values, y values and formats in lockstep
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)  # clear the previous output; wait=True delays clearing until new output arrives

# Training
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train the model and plot its progress (defined in Chapter 3).

    After every epoch the training loss, training accuracy and test accuracy
    are added to an animated plot. When training ends, the final metrics are
    checked against fixed sanity thresholds.
    """
    curve_names = ['train loss', 'train acc', 'test acc']
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs],
                        ylim=[0.3, 0.9], legend=curve_names)
    for epoch in range(num_epochs):
        # One pass over the training data, then one evaluation on the test data
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        # x is a scalar, y is the 3-tuple (train loss, train acc, test acc)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    # Values outside these ranges mean that training went wrong, for example
    # because of a numerical error. The expression after each comma is only
    # the message shown when the assertion fails.
    train_loss, train_acc = train_metrics
    assert 0.7 < train_acc <= 1, train_acc
    assert 0.7 < test_acc <= 1, test_acc
    assert train_loss < 0.5, train_loss
    
# Train the from-scratch softmax model for 10 epochs
num_epochs=10
train_ch3(net,train_iter,test_iter,cross_entropy,num_epochs,updater)

# Prediction
def predict_ch3(net,test_iter,n=6):
    """Show the first n test images titled with their true and predicted labels.

    Args:
        net: callable mapping a batch of images to per-class scores.
        test_iter: iterable of (X, y) batches; only the first batch is used.
        n: number of images to show. It is clamped to the size of the first
            batch, because reshaping to (n, 28, 28) fails when fewer than n
            images are available.

    Raises:
        StopIteration: if ``test_iter`` yields no batch.
    """
    # Only the first batch is needed.
    X, y = next(iter(test_iter))
    n = min(n, len(X))
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(net(X).argmax(axis=1))
    # Each title is the true label on the first line and the prediction below it.
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    # Draw n samples of the batch in one row of n columns.
    d2l.show_images(X[0:n].reshape(n, 28, 28), 1, n, titles=titles[0:n])
    
predict_ch3(net,test_iter,20) # show 20 samples; per the author, one or two of them are usually misclassified

softmax简洁实现

层数变多+动态绘图，一下子将代码变多。

这个时候，一个顺理成章的思路应该是将这些参数封装到类中，pytorch框架也确实是这么做的，使用Sequential类将网络直接封装为一个类，调整参数之类的操作可以直接通过访问net的某一层实现，也可以将net和优化器绑定，通过优化器简洁地实现。

使用了pytorch库的神经网络，所有模块都变成了类。
请添加图片描述

# softmax简洁实现
import torch
from torch import nn
from d2l import torch as d2l

# Load the data
batch_size=256
train_iter,test_iter=d2l.load_data_fashion_mnist(batch_size)
# Model definition + parameter initialisation
def init_weights(model):
    """Initialise the weight of an ``nn.Linear`` layer from N(0, 0.01**2).

    Modules of any other type are left untouched. The check is on the exact
    type, so subclasses of ``nn.Linear`` are skipped as well. The bias is not
    modified.
    """
    if type(model) is not nn.Linear:
        return
    nn.init.normal_(model.weight, std=0.01)  # mean defaults to 0
net=nn.Sequential(nn.Flatten(),nn.Linear(28*28,10)) # Flatten turns each image into a 1-D vector before the linear layer
net.apply(init_weights) # call init_weights on every submodule; Flatten has no weight and is skipped by the type check
# Softmax + cross-entropy combined in one loss; reduction='none' keeps one loss value per example
loss=nn.CrossEntropyLoss(reduction='none')
# Optimisation algorithm
trainer=torch.optim.SGD(net.parameters(),lr=0.1)
# Training uses the d2l training function; per the from-scratch version it
# branches on the type of net/updater to support both plain functions and nn.Module
num_epochs=10 # in the from-scratch version softmax was inside net; here it is inside the loss, with the same effect
d2l.train_ch3(net,train_iter,test_iter,loss,num_epochs,trainer)