hw2
实现功能
- 实现参数的多种初始化:用代码实现对应的数学公式即可,原理和公式推导见这里
- Xavier均匀分布版
# For a fully connected layer, fan_in is the neuron count of the previous
# layer and fan_out that of the current layer; gain scales the range (default 1).
def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):
    """Xavier/Glorot uniform initialization.

    Returns a (fan_in, fan_out) matrix drawn uniformly from (-a, a) with
    a = gain * sqrt(6 / (fan_in + fan_out)).
    """
    bound = gain * math.sqrt(6 / (fan_in + fan_out))
    # rand(...) is uniform on [0, 1); 2*u - 1 maps it onto [-1, 1),
    # and multiplying by `bound` yields the (-bound, bound) distribution.
    return bound * (2 * rand(fan_in, fan_out, **kwargs) - 1)
- Xavier正态分布版
def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs):
    """Xavier/Glorot normal initialization: N(0, std^2) with
    std = gain * sqrt(2 / (fan_in + fan_out))."""
    sigma = gain * math.sqrt(2 / (fan_in + fan_out))
    return sigma * randn(fan_in, fan_out, **kwargs)
- Kaiming均匀分布版
# Recommended gain for ReLU activations is sqrt(2).
def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
    """Kaiming/He uniform initialization for ReLU networks.

    Returns a (fan_in, fan_out) matrix drawn uniformly from (-bound, bound)
    with bound = sqrt(2) * sqrt(3 / fan_in).
    """
    assert nonlinearity == "relu", "Only relu supported currently"
    gain = math.sqrt(2)
    limit = gain * math.sqrt(3 / fan_in)
    # rand(...) is uniform on [0, 1); 2*u - 1 maps it onto [-1, 1).
    return limit * (2 * rand(fan_in, fan_out, **kwargs) - 1)
- Kaiming 正态分布版
def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs):
    """Kaiming/He normal initialization: N(0, std^2) with
    std = sqrt(2) / sqrt(fan_in)."""
    assert nonlinearity == "relu", "Only relu supported currently"
    gain = math.sqrt(2)
    sigma = gain / math.sqrt(fan_in)
    return sigma * randn(fan_in, fan_out, **kwargs)
- 实现神经网络里的一些模块
- Linear
class Linear(Module):
    """Fully connected layer: y = X @ W + b."""

    def __init__(self, in_features, out_features, bias=True, device=None, dtype="float32"):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Weight is (in_features, out_features), Kaiming-uniform initialized.
        self.weight = Parameter(init.kaiming_uniform(in_features, out_features, requires_grad=True))
        if bias:
            # Drawn as (out_features, 1) so kaiming's fan_in is out_features,
            # then stored as a (1, out_features) row for broadcasting.
            self.bias = Parameter(init.kaiming_uniform(out_features, 1, requires_grad=True).reshape((1, out_features)))
        else:
            self.bias = None
        # NOTE(review): `device`/`dtype` are accepted but not forwarded to the
        # initializers — confirm whether they should be passed through.

    def forward(self, X: Tensor) -> Tensor:
        # y = XW + b; the bias must be broadcast explicitly to the output shape.
        out = X @ self.weight
        # Fix: test `is not None` instead of truthiness — the original
        # `if self.bias:` relied on the Parameter object's default truthiness.
        if self.bias is not None:
            out = out + self.bias.broadcast_to(out.shape)
        return out
- ReLU
class ReLU(Module):
    """Module wrapper around the elementwise ReLU operator."""

    def forward(self, x: Tensor) -> Tensor:
        return ops.relu(x)


# The ReLU operator is defined in ops.py as follows:
class ReLU(TensorOp):
    def compute(self, a):
        # Forward pass: elementwise max(a, 0).
        return array_api.maximum(a, 0)

    def gradient(self, out_grad, node):
        # Backward pass: the gradient flows only where the input was positive.
        inp = node.inputs[0].realize_cached_data()
        positive_mask = Tensor(inp > 0)
        return out_grad * positive_mask
- Sequential
# Sequential is a container module chaining submodules in order to form a
# complete network: each submodule's output feeds the next one's input.
class Sequential(Module):
    def __init__(self, *modules):
        super().__init__()
        self.modules = modules

    def forward(self, x: Tensor) -> Tensor:
        out = x
        for layer in self.modules:
            out = layer(out)
        return out
- 实现一个算子LogSumExp
class LogSumExp(TensorOp):
    """Numerically stable log(sum(exp(Z))) reduction over `axes`.

    `axes` selects the reduction axes:
      - None: reduce over all axes (scalar output);
      - an int: reduce over that single axis;
      - a tuple/list: reduce over those axes.
    E.g. for Z of shape (3, 4), LogSumExp(axes=(0,)) takes the max over
    axis 0, subtracts it for stability, exponentiates, sums, takes the log,
    adds the max back, and returns a (4,)-shaped result.
    """

    def __init__(self, axes: Optional[tuple] = None):
        self.axes = axes

    def _axes_tuple(self):
        # Normalize `axes` to a tuple (or None). Fix: the original tested
        # `if self.axes:` and `i not in self.axes`, which mishandled an int
        # axis (axes=0 is falsy; `in` on an int raises TypeError), despite
        # the documented contract allowing ints.
        if self.axes is None:
            return None
        if isinstance(self.axes, int):
            return (self.axes,)
        return tuple(self.axes)

    def compute(self, Z):
        axes = self._axes_tuple()
        # Subtract the per-reduction max for numerical stability, add it back after.
        z_max = array_api.max(Z, axis=axes, keepdims=1)
        ret = array_api.log(
            array_api.exp(Z - z_max).sum(axis=axes, keepdims=1)) \
            + z_max
        if axes:
            # Output keeps only the non-reduced dimensions.
            out_shape = [size for i, size in enumerate(Z.shape) if i not in axes]
        else:
            out_shape = ()
        ret.resize(tuple(out_shape))
        return ret

    def gradient(self, out_grad, node):
        # d/dZ logsumexp(Z) = exp(Z - logsumexp(Z)); first reshape the
        # reduced output back to a broadcast-compatible shape.
        Z = node.inputs[0]
        axes = self._axes_tuple()
        if axes:
            shape = [1] * len(Z.shape)
            reduced = set(axes)
            j = 0
            for i in range(len(shape)):
                if i not in reduced:
                    shape[i] = node.shape[j]
                    j += 1
            node_new = node.reshape(shape)
            grad_new = out_grad.reshape(shape)
        else:
            node_new = node
            grad_new = out_grad
        return grad_new * exp(Z - node_new)
- SoftmaxLoss
class SoftmaxLoss(Module):
    """Softmax cross-entropy loss built on top of the LogSumExp op."""

    def forward(self, logits: Tensor, y: Tensor):
        num_samples = logits.shape[0]
        num_classes = logits.shape[1]
        # sum_i logsumexp(logits_i) over the batch.
        lse_total = ops.logsumexp(logits, axes=(1,)).sum()
        # sum_i logits_i[y_i], selected with a one-hot mask.
        true_logit_total = (logits * init.one_hot(num_classes, y)).sum()
        # Mean negative log-likelihood over the batch.
        return (lse_total - true_logit_total) / num_samples
- LayerNorm1d
class LayerNorm1d(Module):
    """Layer normalization over the feature dimension of a (batch, dim) input.

    Each row is normalized to zero mean and unit variance, then transformed
    by a learned elementwise affine (weight, bias).
    """

    def __init__(self, dim, eps=1e-5, device=None, dtype="float32"):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = Parameter(init.ones(self.dim, requires_grad=True))
        self.bias = Parameter(init.zeros(self.dim, requires_grad=True))

    def forward(self, x: Tensor) -> Tensor:
        n_rows = x.shape[0]
        n_feats = x.shape[1]
        # Per-row mean: sum along axis 1, reshaped to a column for broadcasting.
        row_mean = (x.sum(axes=(1,)) / n_feats).reshape((n_rows, 1))
        centered = x - row_mean.broadcast_to(x.shape)
        # Per-row standard deviation, with eps inside the sqrt for stability.
        row_std = (((centered ** 2).sum(axes=(1,)) / n_feats).reshape((n_rows, 1))
                   + self.eps) ** 0.5
        normed = centered / row_std.broadcast_to(x.shape)
        return self.weight.broadcast_to(x.shape) * normed + self.bias.broadcast_to(x.shape)
- Flatten
class Flatten(Module):
    """Collapse all trailing dimensions: (N, d1, d2, ...) -> (N, d1*d2*...)."""

    def forward(self, X):
        # e.g. an input of shape (2, 3, 4) becomes (2, 12); -1 infers the size.
        return X.reshape((X.shape[0], -1))
- BatchNorm1d
class BatchNorm1d(Module):
    """Batch normalization over the batch dimension of a (batch, dim) input.

    Training: normalize with the current batch statistics and update the
    running estimates. Evaluation: normalize with the running statistics.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.momentum = momentum
        self.weight = Parameter(init.ones(self.dim, requires_grad=True))
        self.bias = Parameter(init.zeros(self.dim, requires_grad=True))
        # Running statistics used at test time; deliberately not Parameters.
        self.running_mean = init.zeros(self.dim)
        self.running_var = init.ones(self.dim)

    def forward(self, x: Tensor) -> Tensor:
        batch_size = x.shape[0]
        if self.training:
            # Fix: batch statistics are now computed only in training mode —
            # the original computed them unconditionally and discarded them in eval.
            mean = x.sum((0,)) / batch_size
            x_minus_mean = x - mean.broadcast_to(x.shape)
            # Biased (population) variance, matching the normalization below.
            var = (x_minus_mean ** 2).sum((0,)) / batch_size
            # Exponential moving average of the batch statistics; .data keeps
            # the running buffers out of the autograd graph.
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var.data
            x_std = ((var + self.eps) ** 0.5).broadcast_to(x.shape)
            x_normed = x_minus_mean / x_std
        else:
            # Fix: broadcast the running statistics explicitly, consistent with
            # every other elementwise op in this file (no implicit broadcasting).
            x_normed = (x - self.running_mean.broadcast_to(x.shape)) / \
                ((self.running_var + self.eps) ** 0.5).broadcast_to(x.shape)
        return x_normed * self.weight.broadcast_to(x.shape) + self.bias.broadcast_to(x.shape)
- Dropout
class Dropout(Module):
    """Inverted dropout: during training, zero each element with probability p
    and scale survivors by 1/(1-p); at evaluation time, act as the identity."""

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x: Tensor) -> Tensor:
        # Dropout only applies during training.
        if not self.training:
            return x
        # Each mask entry is 1 with probability 1-p, else 0. Fix: the mask is
        # drawn only in training mode — the original drew it unconditionally,
        # consuming RNG state even when evaluation discarded it.
        mask = init.randb(*x.shape, p=1 - self.p)
        return (x * mask) / (1 - self.p)
- Residual
class Residual(Module):
    """Skip connection: forward(x) = x + fn(x)."""

    def __init__(self, fn: Module):
        super().__init__()
        self.fn = fn

    def forward(self, x: Tensor) -> Tensor:
        # A residual block simply adds the input to the wrapped module's output.
        return x + self.fn(x)
- Linear
- 实现各种优化器(常见优化算法看这里)
- SGD
class SGD(Optimizer):
    """Stochastic gradient descent with momentum and L2 weight decay."""

    def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0):
        super().__init__(params)
        self.lr = lr
        self.momentum = momentum
        # u[i] holds the momentum buffer for the i-th parameter.
        self.u = {}
        # L2 weight-decay coefficient (the original comment called this "L1",
        # but `wd * w` added to the gradient is L2 regularization).
        self.weight_decay = weight_decay

    def step(self):
        for idx, param in enumerate(self.params):
            # Lazily initialize the momentum buffer to 0.
            if idx not in self.u:
                self.u[idx] = 0
            # Skip parameters that have no gradient this step.
            if param.grad is None:
                continue
            # Detached float32 copy of the gradient, plus the weight-decay
            # term: grad + wd * w.
            grad = ndl.Tensor(param.grad.numpy(), dtype='float32').data \
                + self.weight_decay * param.data
            # EMA-style momentum update, then the parameter step.
            self.u[idx] = self.momentum * self.u[idx] + (1 - self.momentum) * grad
            param.data = param.data - self.u[idx] * self.lr
- Adam
class Adam(Optimizer):
    """Adam: adaptive moment estimation with bias correction and L2 weight decay."""

    def __init__(
        self,
        params,
        lr=0.01,
        beta1=0.9,
        beta2=0.999,
        eps=1e-8,
        weight_decay=0.0,
    ):
        super().__init__(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.weight_decay = weight_decay
        # Step counter used by the bias-correction terms.
        self.t = 0
        # First (m) and second (v) moment buffers, keyed by parameter index.
        self.m = {}
        self.v = {}

    def step(self):
        self.t += 1
        for idx, param in enumerate(self.params):
            # Lazily create zero-initialized moment buffers.
            if idx not in self.m:
                self.m[idx] = ndl.init.zeros(*param.shape)
                self.v[idx] = ndl.init.zeros(*param.shape)
            if param.grad is None:
                continue
            # Detached float32 gradient with the L2 weight-decay term added,
            # mirroring the SGD implementation above.
            grad = ndl.Tensor(param.grad.numpy(), dtype='float32').data \
                + param.data * self.weight_decay
            # Exponential moving averages of the gradient and its square.
            self.m[idx] = self.beta1 * self.m[idx] \
                + (1 - self.beta1) * grad
            self.v[idx] = self.beta2 * self.v[idx] \
                + (1 - self.beta2) * grad ** 2
            # Bias-corrected moment estimates.
            m_hat = self.m[idx] / (1 - self.beta1 ** self.t)
            v_hat = self.v[idx] / (1 - self.beta2 ** self.t)
            param.data = param.data - self.lr * m_hat / (v_hat ** 0.5 + self.eps)
- SGD
- 实现Dataset和DataLoader(非框架重点,可跳过)
- Dataset:存储样本和对应标签
- DataLoader:把Dataset包装成一个可迭代对象,以便访问样本
- 目前已经完成了神经网络的所有组件,那么就自行建造并训练一个MLP ResNet吧
- 先根据ResNet的结构图把一个个模块堆叠起来
- 然后定义一个epoch中的训练或测试流程(前向计算、输出loss、反向传播、用优化器里的step函数更新模型参数、再记录一下平均loss值什么的)
- 最后编写在mnist数据集上训练模型的代码(先读取数据集、声明一个ResNet模型、声明优化器、用上面定义的训练/测试流程进行一个epoch的训练/测试
知识补充
优化算法
目前有很多优化算法,但是SGD with momentum和Adam是相对来说最重要的两个,是了解深度学习必须知道的两个优化算法
- Newton’s method:
牛顿法的核心思想是对函数的一阶泰勒展开求解,推导过程如下:
假设有函数 $f(x)$,需要求解 $x$ 使得 $f(x)=0$。则在初始点 $x_0$ 处将函数进行一阶泰勒展开有:
$f(x) = f(x_0) + f'(x_0)(x - x_0)$
将 $f(x)=0$ 代入有:
$x = x_0 - \dfrac{f(x_0)}{f'(x_0)}$
由泰勒展开的原理,这里得到的 $x$ 只是对方程根的近似,但肯定比 $x_0$ 更接近方程根。因此可以通过迭代的方式,在这个近似解 $x$ 处再进行一阶展开,进而得到更接近方程根的值,即得到下面迭代求解方程根的公式:
$x^{t+1} = x^t - \dfrac{f(x^t)}{f'(x^t)}$
上面原理搞清楚之后,将损失函数相关值代入。我们的目的是求解 θ \theta θ使最小化 J ( θ ) J(\theta) J(θ),而 J ( θ ) J(\theta) J(θ)最小值对应着 Δ J (