Implementing basic deep learning networks in Python

1. CNN

1.1 Activation Functions

1.1.1 sigmoid

import numpy as np

def f_sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))

def f_sigmoid_derivate(z):
    # derivative of the sigmoid: s(z) * (1 - s(z))
    y = f_sigmoid(z)
    return y*(1-y)

1.1.2 relu

def f_relu(z):
    return np.maximum(z, 0)

def f_relu_derivate(z):
    # derivative of ReLU: 0 for z < 0, 1 for z > 0; heaviside assigns 0.5 at z == 0
    return np.heaviside(z, 0.5)

1.1.3 softmax

def f_softmax(z):
    # Using np.exp(z) directly may produce very large values and overflow to nan,
    # so multiply numerator and denominator by the same constant to bound them;
    # here we use exp(-np.max(z)).
    exps = np.exp(z-np.max(z))
    exp_sum = np.sum(exps)
    return exps/exp_sum

def f_softmax_derivate(z):
    # Jacobian of the softmax: diag(y) - y y^T
    y = f_softmax(z).reshape((-1,))
    return np.diag(y)-y.reshape((-1,1)).dot(y.reshape(1,-1))
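
To sanity-check the derivative above, we can compare the analytic Jacobian against a finite-difference estimate. The helper below is a minimal sketch; its name, tolerance, and test vector are illustrative, not part of the original code:

def check_softmax_jacobian(z, eps=1e-6):
    # column k of the numerical Jacobian: d softmax / d z_k via central differences
    analytic = f_softmax_derivate(z)
    numeric = np.zeros_like(analytic)
    for k in range(z.size):
        zp, zm = z.copy(), z.copy()
        zp[k] += eps
        zm[k] -= eps
        numeric[:, k] = (f_softmax(zp) - f_softmax(zm)) / (2 * eps)
    return np.max(np.abs(analytic - numeric))

# e.g. check_softmax_jacobian(np.array([1.0, 2.0, 0.5])) should be close to 0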

1.2 Fully Connected Layer

# Layer: common base class of all layers (assumed to be defined elsewhere in the article)
class FullConnectedLayer(Layer):
    def __init__(self, input_size, output_size):
        self.i_size = input_size
        self.o_size = output_size
        if self.i_size is not None:
            self.__init(self.i_size)

    def __init(self, input_size):
        self.i_size = input_size
        self.w = np.random.normal(
            loc=0.0, scale=1.0, size=(self.i_size, self.o_size))
        self.b = np.random.normal(loc=0.0, scale=1.0, size=(1, self.o_size))
        self.x: np.ndarray = None  # input

    def __call__(self, x: np.ndarray) -> np.ndarray:
        x = x.reshape(1, -1)
        # if self.i_size is not yet known, initialize it from x.shape
        if self.i_size is None:
            self.__init(x.shape[1])
        self.x = x
        self.z = x.dot(self.w)+self.b
        return self.z

    def backward(self, dc_dz: np.ndarray) -> np.ndarray:
        dc_dx = dc_dz.dot(self.w.T)
        # the update adds dc_dz directly, so the caller is expected to pass a
        # gradient that is already negated and scaled by the learning rate
        self.w += self.x.T.dot(dc_dz)
        self.b += dc_dz
        return dc_dx
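
A minimal usage sketch of the layer (assuming the Layer base class is available; the loss and learning-rate handling below are illustrative choices, not part of the original):

# one forward/backward step on random data (illustrative only)
fc = FullConnectedLayer(input_size=4, output_size=3)
x = np.random.randn(4)
z = fc(x)                       # shape (1, 3)
lr, target = 0.01, np.zeros((1, 3))
dc_dz = -lr * (z - target)      # negated, lr-scaled gradient of 0.5*||z - target||^2
dc_dx = fc.backward(dc_dz)      # updates fc.w and fc.b, returns the gradient w.r.t. x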

1.3 Pooling Layer

class MeanPoolingLayer(Layer):
    def __init__(self, kernel_size: int, stride: int):
        self.ks = kernel_size
        self.kernel_shape = (kernel_size, kernel_size)
        self.channels: int = None
        self.stride = stride
        self.input_shape: tuple = None  # row_cnt, col_cnt, channels
        self.target_shape: tuple = None  # shape of the pooled output

    def __call__(self, mat: np.ndarray) -> np.ndarray:
        self.input_shape = mat.shape
        self.channels = mat.shape[2]
        row, col = mat.shape[0], mat.shape[1]
        (kr, kc), s = self.kernel_shape, self.stride
        self.target_shape = ((row-kr)//s+1, (col-kc)//s+1, self.channels)
        target = np.zeros(self.target_shape)
        for i in range(self.target_shape[0]):
            for j in range(self.target_shape[1]):
                r, c = i*s, j*s
                target[i, j] = np.average(mat[r:r+kr, c:c+kc], axis=(0, 1))
        return target

    def backward(self, d_out: np.ndarray) -> np.ndarray:
        d_input = np.zeros(self.input_shape)
        n = self.kernel_shape[0]*self.kernel_shape[1]
        d_mat = d_out/n  # the derivative of mean-pooling is exactly 1/n per cell
        (kr, kc), s = self.kernel_shape, self.stride
        for i in range(self.target_shape[0]):
            for j in range(self.target_shape[1]):
                r, c = i*s, j*s
                d_input[r:r+kr, c:c+kc] += d_mat[i, j]
        return d_input
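
A quick check of the pooling layer on a tiny input (values are illustrative; the Layer base class is again assumed to be available):

# 4x4 single-channel input, 2x2 mean pooling with stride 2
pool = MeanPoolingLayer(kernel_size=2, stride=2)
m = np.arange(16, dtype=float).reshape(4, 4, 1)
out = pool(m)                             # shape (2, 2, 1); out[0, 0, 0] == 2.5
grad = pool.backward(np.ones_like(out))   # every input cell receives 1/4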

1.4 Convolutional Layer

from typing import Iterable, Optional

class ConvolutionLayer(Layer):
    def __init__(self, in_channels, out_channels, kernel_size, stride):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.ks = kernel_size
        self.kernel_shape = (kernel_size, kernel_size)
        self.stride = stride
        self.x: Optional[np.ndarray] = None  # input
        # kernel, stored in (row, col, channel) order:
        # there are out_channels kernels, each of shape (row, col, in_channels)
        self.kernel = np.random.normal(loc=0.0, scale=1.0, size=(
            kernel_size, kernel_size, in_channels, out_channels,))
        # each kernel shares a single bias, so there are out_channels biases in total
        self.bias = np.random.normal(loc=0.0, scale=1.0, size=(out_channels,))

    def check_x_mat_shape(self, x_mat):
        '''
            Require that the kernel tiles the input exactly during convolution
            (ignoring the gaps introduced by the stride): no leftover columns (rows)
            may remain on the right (bottom). For example, a 28x28 input cannot use
            a (5x5, stride=2) kernel, because such a kernel only covers 27x27.
        '''
        row, col = x_mat.shape[0], x_mat.shape[1]
        k, s = self.ks, self.stride
        assert (row - k) // s * s + k == row
        assert (col - k) // s * s + k == col

    def __call__(self, x_mat: np.ndarray) -> np.ndarray:
        self.check_x_mat_shape(x_mat)
        self.x = x_mat
        return self.__conv(
            stride=self.stride,
            mat=x_mat,
            kernel=self.kernel,
            bias=self.bias,  # add the per-output-channel bias in the forward pass
            einsum_formula="ijk,ijkl->l",
            out_ele_shape=[self.out_channels]
        )

    def backward(self, dc_dz: np.ndarray) -> np.ndarray:
        # The backward convolution runs over dc_dz after zero padding
        # (padding + dilation).
        # Padding rule: kernel_size-1 rings of zeros around the border, and
        # stride-1 zeros inserted between neighbouring elements.
        # Looking at one axis: if dc_dz has m columns, the padded matrix has
        # 2*kernel_size + (m-1)*stride - 1 columns
        # (e.g. kernel_size=3, stride=1, m=3 gives 2*3 + 2*1 - 1 = 7 columns).
        # The stride of the backward convolution is fixed at 1.
        (kr, kc, in_ch, out_ch), s = self.kernel.shape, self.stride
        dc_dz_with_zeros_shape = (
            2 * kr + (dc_dz.shape[0] - 1) * s - 1,
            2 * kc + (dc_dz.shape[1] - 1) * s - 1,
            dc_dz.shape[2]
        )
        D = np.zeros(dc_dz_with_zeros_shape)  # for brevity, D is the zero-padded tensor
        for i in range(dc_dz.shape[0]):
            for j in range(dc_dz.shape[1]):
                D[kr + i * s - 1, kc + j * s - 1] = dc_dz[i, j]
        # Compute dc_da (a is this layer's input self.x; by convention the previous
        # layer's activation is called a[l-1]).
        # Note that the stride here is 1.
        # In the forward pass kernel[i,j,k,l] has i=row, j=col, k=in_ch, l=out_ch;
        # in the backward pass i=row, j=col, l plays the role of in_ch and k of
        # out_ch; the remaining steps match the forward pass.
        dc_da = self.__conv(
            stride=1,
            mat=D,
            kernel=self.kernel[::-1, ::-1],  # don't forget the 180-degree rotation of the kernel, rot180(w)
            bias=None,
            einsum_formula="ijl,ijkl->k",
            out_ele_shape=[in_ch])
        # Compute dc_dw (i.e. dc_d(kernel)):
        # also a convolution, but this time D is convolved with rot180(a_input)
        dc_dw = self.__conv(
            stride=1,
            mat=D,
            kernel=self.x[::-1, ::-1],
            bias=None,
            einsum_formula="ijl,ijk->kl",
            out_ele_shape=[in_ch, out_ch])
        # Compute dc_db
        dc_db = np.einsum("ijk->k", dc_dz)
        # Update w (kernel) and b (bias) and return dc_da; as in the fully connected
        # layer, dc_dz is assumed to arrive already negated and learning-rate scaled
        self.kernel += dc_dw
        self.bias += dc_db
        return dc_da

    def __conv(self,
               stride: int,
               mat: np.ndarray,  # shape=(row, col, in_ch)
               kernel: np.ndarray,  # shape=(k_row, k_col, in_ch, out_ch)
               bias: np.ndarray = None,  # shape=(out_ch,)
               einsum_formula: str = "ijk,ijkl->l",
               out_ele_shape: Iterable[int] = None) -> np.ndarray:
        '''
            einsum_formula:
                The Einstein-summation expression used when the kernel is fully
                convolved with one sub-matrix of mat. The convolution yields a
                result matrix of shape (I, J). Each element of that matrix is not
                necessarily a scalar; depending on einsum_formula it may itself be
                a tensor, so the result is essentially a tensor of shape (I, J, ...).

            out_ele_shape:
                Must correspond to einsum_formula.
                out_ele_shape is the shape of each element of the result matrix
                after the convolution and is used to build the result tensor.
                -------------------------------------------------------------
                Example:
                    With "ijk,ijl->kl", every element of the result matrix is a
                    matrix of shape (K, L), so the result is really a 4-D tensor
                    of shape (I, J, K, L), and out_ele_shape should be [K, L].
                -------------------------------------------------------------
                For single-channel convolution each element is a scalar, so
                out_ele_shape should be []. If out_ele_shape is None, the kernel is
                assumed to be 3-D and each output element has shape
                (kernel.shape[-1],); this is only a convenience default.
        '''
        # convolution step: einsum(sub_tensor, kernel_tensor) + bias
        if bias is None:
            def f(m):
                return np.einsum(
                    einsum_formula, m, kernel)
        else:
            def f(m):
                return np.einsum(einsum_formula, m, kernel) + bias
        row, col = mat.shape[0], mat.shape[1]
        (kr, kc, *omit), s = kernel.shape, stride

        # out_ele_shape defaults to (kernel.shape[-1],);
        # this default path assumes a 3-D kernel
        if out_ele_shape is None:
            assert len(kernel.shape) == 3
            out_ch = kernel.shape[-1]
            out_ele_shape = (out_ch,)

        target_shape = ((row - kr) // s + 1, (col - kc) // s + 1, *out_ele_shape)
        target = np.zeros(target_shape)
        for i in range(target_shape[0]):
            for j in range(target_shape[1]):
                r, c = i * s, j * s
                target[i, j] = f(mat[r:r + kr, c:c + kc])
        return target
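
A small shape check of the convolution layer (the input sizes are arbitrary, and as above the Layer base class is assumed to be available):

# 5x5 single-channel input, two 3x3 kernels, stride 1 (illustrative only)
conv = ConvolutionLayer(in_channels=1, out_channels=2, kernel_size=3, stride=1)
x = np.random.randn(5, 5, 1)
z = conv(x)                                     # shape (3, 3, 2)
dc_da = conv.backward(np.ones_like(z) * 0.01)   # shape (5, 5, 1); kernel and bias are updated in place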

2. Sequence Models

2.1 RNN

class Rnn():

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional  # note: not used in this simplified forward pass

    def feed(self, x):
        '''
        :param x: [seq, batch_size, embedding]
        :return: out, hidden
        '''
        # x.shape [seq, batch, feature]
        # hidden.shape [hidden_size, batch]
        # Whh0.shape [hidden_size, hidden_size]  Wih0.shape [hidden_size, feature]
        # Whh1.shape [hidden_size, hidden_size]  Wih1.shape [hidden_size, hidden_size]

        out = []
        x = np.array(x)
        hidden = [np.zeros((self.hidden_size, x.shape[1])) for _ in range(self.num_layers)]
        # weights are re-initialized on every call; this is a forward-pass demo only
        Wih = [np.random.random((self.hidden_size, self.hidden_size)) for _ in range(1, self.num_layers)]
        Wih.insert(0, np.random.random((self.hidden_size, x.shape[2])))
        Whh = [np.random.random((self.hidden_size, self.hidden_size)) for _ in range(self.num_layers)]

        time = x.shape[0]
        for t in range(time):
            # the first layer reads the input at time step t
            hidden[0] = np.tanh(np.dot(Wih[0], np.transpose(x[t, ...], (1, 0))) +
                                np.dot(Whh[0], hidden[0]))
            # deeper layers read the hidden state of the layer below
            for layer in range(1, self.num_layers):
                hidden[layer] = np.tanh(np.dot(Wih[layer], hidden[layer - 1]) +
                                        np.dot(Whh[layer], hidden[layer]))
            out.append(hidden[self.num_layers - 1])

        return np.array(out), np.array(hidden)
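
A quick forward-pass check of the class (shapes and values are illustrative):

# random input with seq_len=5, batch=2, feature=8
rnn = Rnn(input_size=8, hidden_size=16, num_layers=2)
x = np.random.randn(5, 2, 8)
out, hidden = rnn.feed(x)
print(out.shape)      # (5, 16, 2): the top layer's hidden state at every time step
print(hidden.shape)   # (2, 16, 2): the final hidden state of each layer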

2.2 LSTM

class LSTM():
    def __init__(self):
        self.wordDim = 6000
        self.hiddenDim = 100
        self.Wi, self.Ui = self.initWeights()   # input gate
        self.Wf, self.Uf = self.initWeights()   # forget gate
        self.Wo, self.Uo = self.initWeights()   # output gate
        self.Wa, self.Ua = self.initWeights()   # candidate (cell) gate
        # hidden-to-output weight matrix, shape (wordDim, hiddenDim) = (6000, 100)
        self.Wy = np.random.uniform(-np.sqrt(1. / self.wordDim), np.sqrt(1. / self.wordDim),
                                    (self.wordDim, self.hiddenDim))

    def initWeights(self):
        # input-to-hidden weight matrix, shape (hiddenDim, wordDim) = (100, 6000)
        W = np.random.uniform(-np.sqrt(1. / self.wordDim), np.sqrt(1. / self.wordDim),
                              (self.hiddenDim, self.wordDim))
        # hidden-to-hidden weight matrix, shape (hiddenDim, hiddenDim) = (100, 100)
        U = np.random.uniform(-np.sqrt(1. / self.hiddenDim), np.sqrt(1. / self.hiddenDim),
                              (self.hiddenDim, self.hiddenDim))
        return W, U

    def sigmoid(self, z):
        # standard logistic sigmoid, used by forward()
        return 1.0 / (1.0 + np.exp(-z))

    def softmax(self, z):
        # numerically stable softmax, used by forward()
        exps = np.exp(z - np.max(z))
        return exps / np.sum(exps)

    def forward(self, data):  # forward pass; in principle takes one sample (a sequence of word indices)
        T = len(data)
        output = np.zeros((T, self.wordDim, 1))        # outputs
        # hidden states; at t == 0, hidden[t-1] wraps to hidden[-1], which stays all
        # zeros (this is why T+1 rows are allocated)
        hidden = np.zeros((T+1, self.hiddenDim, 1))
        cPre = np.zeros((self.hiddenDim, 1))
        states = list()
        for t in range(T):  # loop over time steps
            state = dict()
            X = np.zeros((self.wordDim, 1))  # build a (wordDim, 1) vector
            X[data[t]][0] = 1                # set the matching entry to 1, forming a one-hot word vector
            a = np.tanh(np.dot(self.Wa, X) + np.dot(self.Ua, hidden[t-1]))
            i = self.sigmoid(np.dot(self.Wi, X) + np.dot(self.Ui, hidden[t-1]))
            f = self.sigmoid(np.dot(self.Wf, X) + np.dot(self.Uf, hidden[t-1]))
            o = self.sigmoid(np.dot(self.Wo, X) + np.dot(self.Uo, hidden[t-1]))
            c = np.multiply(i, a) + np.multiply(f, cPre)
            state['a'] = a
            state['i'] = i
            state['f'] = f
            state['o'] = o
            state['c'] = c
            states.append(state.copy())
            cPre = c
            hidden[t] = np.multiply(o, np.tanh(c))
            y = self.softmax(np.dot(self.Wy, hidden[t]))
            output[t] = y
        state = dict()
        state['c'] = np.zeros((self.hiddenDim, 1))
        states.append(state.copy())
        return hidden, output, states
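
A minimal forward-pass sketch of the class (the word indices below are arbitrary placeholders into the 6000-word vocabulary):

lstm = LSTM()
hidden, output, states = lstm.forward([12, 345, 6])
print(hidden.shape)   # (4, 100, 1): T+1 hidden states
print(output.shape)   # (3, 6000, 1): a distribution over the vocabulary at each step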