1. CNN
1.1 Activation Functions
1.1.1 sigmoid
import numpy as np

def f_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def f_sigmoid_derivate(z):
    # sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))
    y = f_sigmoid(z)
    return y * (1 - y)
1.1.2 relu
def f_relu(z):
    return np.maximum(z, 0)

def f_relu_derivate(z):
    # Subgradient of ReLU: 0 for z < 0, 1 for z > 0, and 0.5 at z == 0.
    return np.heaviside(z, 0.5)
1.1.3 softmax
def f_softmax(z):
    # Subtract the max before exponentiating, for numerical stability.
    exps = np.exp(z - np.max(z))
    exp_sum = np.sum(exps)
    return exps / exp_sum

def f_softmax_derivate(z):
    # Jacobian of softmax: diag(y) - y y^T
    y = f_softmax(z).reshape((-1,))
    return np.diag(y) - y.reshape((-1, 1)).dot(y.reshape(1, -1))
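A quick way to sanity-check these derivatives is a central-difference Jacobian comparison. The snippet below is a minimal sketch; jacobian_fd, the test point, and the printed tolerances are illustrative additions, not part of the original code:

import numpy as np

def jacobian_fd(f, z, eps=1e-6):
    # Numerical Jacobian of a vector-to-vector function via central differences.
    n = z.size
    J = np.zeros((n, n))
    for k in range(n):
        d = np.zeros(n)
        d[k] = eps
        J[:, k] = (f(z + d) - f(z - d)) / (2 * eps)
    return J

z = np.array([0.3, -1.2, 2.0])
# Element-wise sigmoid has a diagonal Jacobian.
print(np.max(np.abs(jacobian_fd(f_sigmoid, z) - np.diag(f_sigmoid_derivate(z)))))
print(np.max(np.abs(jacobian_fd(f_softmax, z) - f_softmax_derivate(z))))

Both differences should come out around 1e-10, confirming the analytic forms.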
1.2 Fully Connected Layer
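The layer classes below all inherit from a Layer base class that this excerpt never shows. A minimal stand-in, assuming it only fixes the forward (__call__) / backward interface, could look like this (the typing imports are also used by the later layers):

import numpy as np
from typing import Iterable, Optional

class Layer:
    # Assumed base class: a forward pass via __call__ and a backward pass
    # via backward. The real one may carry more, but this suffices to run
    # the code in this section.
    def __call__(self, x: np.ndarray) -> np.ndarray:
        raise NotImplementedError

    def backward(self, grad: np.ndarray) -> np.ndarray:
        raise NotImplementedError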
class FullConnectedLayer(Layer):
    def __init__(self, input_size, output_size):
        self.i_size = input_size
        self.o_size = output_size
        # Lazy initialization: when input_size is None, defer creating the
        # weights until the first forward pass reveals the input width.
        if self.i_size is not None:
            self.__init(self.i_size)

    def __init(self, input_size):
        self.i_size = input_size
        self.w = np.random.normal(
            loc=0.0, scale=1.0, size=(self.i_size, self.o_size))
        self.b = np.random.normal(loc=0.0, scale=1.0, size=(1, self.o_size))
        self.x: Optional[np.ndarray] = None

    def __call__(self, x: np.ndarray) -> np.ndarray:
        x = x.reshape(1, -1)
        if self.i_size is None:
            self.__init(x.shape[1])
        self.x = x
        self.z = x.dot(self.w) + self.b
        return self.z

    def backward(self, dc_dz: np.ndarray) -> np.ndarray:
        # Note the "+=" update: the caller is expected to pass in the update
        # direction (e.g. -learning_rate * dC/dz), not the raw gradient.
        dc_dx = dc_dz.dot(self.w.T)
        self.w += self.x.T.dot(dc_dz)
        self.b += dc_dz
        return dc_dx
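As a quick usage sketch: the quadratic loss, target, and learning rate below are illustrative assumptions, and the negated, scaled gradient follows the "+=" update convention noted above.

import numpy as np

layer = FullConnectedLayer(input_size=4, output_size=3)
x = np.random.randn(4)
z = layer(x)                         # forward pass, z.shape == (1, 3)

target = np.array([[1.0, 0.0, 0.0]])
lr = 0.01
dc_dz = z - target                   # gradient of 0.5 * ||z - target||^2 w.r.t. z
dc_dx = layer.backward(-lr * dc_dz)  # pass -lr * gradient, per the convention
print(dc_dx.shape)                   # (1, 4), ready to propagate further back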
1.3 Pooling Layer
class MeanPoolingLayer(Layer):
    def __init__(self, kernel_size: int, stride: int):
        self.ks = kernel_size
        self.kernel_shape = (kernel_size, kernel_size)
        self.channels: Optional[int] = None
        self.stride = stride
        self.input_shape: Optional[tuple] = None
        self.target_shape: Optional[tuple] = None

    def __call__(self, mat: np.ndarray) -> np.ndarray:
        self.input_shape = mat.shape
        self.channels = mat.shape[2]
        row, col = mat.shape[0], mat.shape[1]
        (kr, kc), s = self.kernel_shape, self.stride
        self.target_shape = ((row - kr) // s + 1, (col - kc) // s + 1, self.channels)
        target = np.zeros(self.target_shape)
        for i in range(self.target_shape[0]):
            for j in range(self.target_shape[1]):
                r, c = i * s, j * s
                # Average over the kernel window, independently per channel.
                target[i, j] = np.average(mat[r:r + kr, c:c + kc], axis=(0, 1))
        return target

    def backward(self, d_out: np.ndarray) -> np.ndarray:
        # Mean pooling spreads each output gradient evenly over the
        # kernel_size x kernel_size window it was averaged from.
        d_input = np.zeros(self.input_shape)
        n = self.kernel_shape[0] * self.kernel_shape[1]
        d_mat = d_out / n
        (kr, kc), s = self.kernel_shape, self.stride
        for i in range(self.target_shape[0]):
            for j in range(self.target_shape[1]):
                r, c = i * s, j * s
                d_input[r:r + kr, c:c + kc] += d_mat[i, j]
        return d_input
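A minimal shape check (the sizes are illustrative): a 4x4 input with 2 channels pooled with kernel_size=2, stride=2 yields a 2x2x2 output, and the backward pass returns a gradient of the original input shape with each entry scaled by 1/4:

import numpy as np

pool = MeanPoolingLayer(kernel_size=2, stride=2)
x = np.arange(4 * 4 * 2, dtype=float).reshape(4, 4, 2)
y = pool(x)
print(y.shape)                                # (2, 2, 2)
print(pool.backward(np.ones_like(y)).shape)   # (4, 4, 2), all entries 0.25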
1.4 Convolution Layer
class ConvolutionLayer(Layer):
    def __init__(self, in_channels, out_channels, kernel_size, stride):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.ks = kernel_size
        self.kernel_shape = (kernel_size, kernel_size)
        self.stride = stride
        self.x: Optional[np.ndarray] = None
        self.kernel = np.random.normal(loc=0.0, scale=1.0, size=(
            kernel_size, kernel_size, in_channels, out_channels,))
        self.bias = np.random.normal(loc=0.0, scale=1.0, size=(out_channels,))

    def check_x_mat_shape(self, x_mat):
        '''
        The kernel must tile the input exactly as it slides (gaps produced
        by the stride do not count): there may be no leftover columns on
        the right or rows at the bottom.
        E.g. a 28x28 input cannot use a (5x5, stride=2) kernel, because
        that kernel can only cover 27x27.
        '''
        row, col = x_mat.shape[0], x_mat.shape[1]
        k, s = self.ks, self.stride
        assert (row - k) // s * s + k == row
        assert (col - k) // s * s + k == col

    def __call__(self, x_mat: np.ndarray) -> np.ndarray:
        self.check_x_mat_shape(x_mat)
        self.x = x_mat
        return self.__conv(
            stride=self.stride,
            mat=x_mat,
            kernel=self.kernel,
            bias=self.bias,  # was None, but the bias must be applied forward
            einsum_formula="ijk,ijkl->l",
            out_ele_shape=[self.out_channels]
        )

    def backward(self, dc_dz: np.ndarray) -> np.ndarray:
        (kr, kc, in_ch, out_ch), s = self.kernel.shape, self.stride
        # Build D: dc_dz dilated by the stride (s - 1 zeros between entries)
        # and zero-padded by kr - 1 / kc - 1 on each side, so that a plain
        # stride-1 convolution below implements the "full" convolution.
        dc_dz_with_zeros_shape = (
            2 * kr + (dc_dz.shape[0] - 1) * s - 1,
            2 * kc + (dc_dz.shape[1] - 1) * s - 1,
            dc_dz.shape[2]
        )
        D = np.zeros(dc_dz_with_zeros_shape)
        for i in range(dc_dz.shape[0]):
            for j in range(dc_dz.shape[1]):
                D[kr + i * s - 1, kc + j * s - 1] = dc_dz[i, j]
        # Gradient w.r.t. the input: full convolution of D with the
        # spatially flipped kernel.
        dc_da = self.__conv(
            stride=1,
            mat=D,
            kernel=self.kernel[::-1, ::-1],
            bias=None,
            einsum_formula="ijl,ijkl->k",
            out_ele_shape=[in_ch])
        # Gradient w.r.t. the kernel: convolution of D with the flipped input.
        dc_dw = self.__conv(
            stride=1,
            mat=D,
            kernel=self.x[::-1, ::-1],
            bias=None,
            einsum_formula="ijl,ijk->kl",
            out_ele_shape=[in_ch, out_ch])
        # Gradient w.r.t. the bias: sum dc_dz over the spatial dimensions.
        dc_db = np.einsum("ijk->k", dc_dz)
        # Same "+=" convention as the other layers: the caller passes the
        # negated, learning-rate-scaled gradient.
        self.kernel += dc_dw
        self.bias += dc_db
        return dc_da
    def __conv(self,
               stride: int,
               mat: np.ndarray,
               kernel: np.ndarray,
               bias: np.ndarray = None,
               einsum_formula: str = "ijk,ijkl->l",
               out_ele_shape: Iterable[int] = None) -> np.ndarray:
        '''
        einsum_formula:
            The full convolution of kernel with one sub-matrix of mat is
            computed with this Einstein-summation formula.
            The convolution yields a result matrix of shape (I, J).
            Each element of that matrix is not necessarily a scalar; it may
            be a tensor, depending on how einsum_formula is set. The result
            matrix is therefore really a tensor of shape (I, J, ...).
        out_ele_shape:
            Note that out_ele_shape must match einsum_formula.
            out_ele_shape is the shape of each element of the result matrix
            after the convolution; it is used to build the result tensor.
        -------------------------------------------------------------
        Example:
            With "ijk,ijl->kl", every element of the result matrix is a
            matrix of shape (K, L), so the result is really a 4-D tensor of
            shape (I, J, K, L); in that case set out_ele_shape=[K, L].
        -------------------------------------------------------------
        For a single-channel convolution each element is a scalar; set
        out_ele_shape=[].
        The defaults target the forward pass, where out_ele_shape may be
        left as None (purely for convenience).
        '''
        if bias is None:
            def f(m):
                return np.einsum(einsum_formula, m, kernel)
        else:
            def f(m):
                return np.einsum(einsum_formula, m, kernel) + bias
        row, col = mat.shape[0], mat.shape[1]
        (kr, kc, *omit), s = kernel.shape, stride
        if out_ele_shape is None:
            assert len(kernel.shape) == 3
            out_ch = kernel.shape[-1]
            out_ele_shape = (out_ch,)
        target_shape = ((row - kr) // s + 1, (col - kc) // s + 1, *out_ele_shape)
        target = np.zeros(target_shape)
        for i in range(target_shape[0]):
            for j in range(target_shape[1]):
                r, c = i * s, j * s
                target[i, j] = f(mat[r:r + kr, c:c + kc])
        return target
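A minimal shape check (the sizes are illustrative, not from the original text): an 8x8 input with 3 channels, a 3x3 kernel, and stride 1 should produce a 6x6 map with out_channels channels, and backward should return a gradient with the input's shape:

import numpy as np

conv = ConvolutionLayer(in_channels=3, out_channels=5, kernel_size=3, stride=1)
x = np.random.randn(8, 8, 3)
z = conv(x)
print(z.shape)                                 # (6, 6, 5)
print(conv.backward(np.zeros_like(z)).shape)   # (8, 8, 3)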
2. Sequence Models
2.1 RNN
class Rnn():
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional  # stored but not used below
        # The weights belong in __init__, not in feed, so that they persist
        # across calls. Layer 0 maps the embedding; deeper layers map the
        # hidden state of the layer below.
        self.Wih = [np.random.random((hidden_size, input_size))]
        self.Wih += [np.random.random((hidden_size, hidden_size))
                     for _ in range(1, num_layers)]
        # Hidden-to-hidden (recurrent) weights, one per layer.
        self.Whh = [np.random.random((hidden_size, hidden_size))
                    for _ in range(num_layers)]

    def feed(self, x):
        '''
        :param x: [seq, batch_size, embedding]
        :return: out, hidden
        '''
        out = []
        x = np.array(x)
        hidden = [np.zeros((self.hidden_size, x.shape[1]))
                  for _ in range(self.num_layers)]
        time = x.shape[0]
        for t in range(time):
            # Layer 0 consumes the input; deeper layers consume the layer below.
            hidden[0] = np.tanh(np.dot(self.Wih[0], np.transpose(x[t, ...], (1, 0))) +
                                np.dot(self.Whh[0], hidden[0]))
            for layer in range(1, self.num_layers):
                hidden[layer] = np.tanh(np.dot(self.Wih[layer], hidden[layer - 1]) +
                                        np.dot(self.Whh[layer], hidden[layer]))
            out.append(hidden[self.num_layers - 1])
        return np.array(out), np.array(hidden)
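A quick smoke test of the shapes (the sizes are illustrative). Note that, unlike PyTorch's convention, the hidden dimension here comes before the batch dimension:

import numpy as np

rnn = Rnn(input_size=10, hidden_size=20, num_layers=2)
x = np.random.randn(5, 3, 10)   # [seq=5, batch=3, embedding=10]
out, hidden = rnn.feed(x)
print(out.shape)                # (5, 20, 3): top-layer state at each step
print(hidden.shape)             # (2, 20, 3): final state of each layer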
2.2 LSTM
class LSTM():
    def __init__(self):
        self.wordDim = 6000
        self.hiddenDim = 100
        self.Wi, self.Ui = self.initWeights()   # input gate
        self.Wf, self.Uf = self.initWeights()   # forget gate
        self.Wo, self.Uo = self.initWeights()   # output gate
        self.Wa, self.Ua = self.initWeights()   # candidate cell state
        self.Wy = np.random.uniform(-np.sqrt(1. / self.wordDim), np.sqrt(1. / self.wordDim), (self.wordDim, self.hiddenDim))

    def initWeights(self):
        # W maps the one-hot input, U maps the previous hidden state; both
        # use the common uniform(-1/sqrt(fan-in), 1/sqrt(fan-in)) scheme.
        W = np.random.uniform(-np.sqrt(1. / self.wordDim), np.sqrt(1. / self.wordDim), (self.hiddenDim, self.wordDim))
        U = np.random.uniform(-np.sqrt(1. / self.hiddenDim), np.sqrt(1. / self.hiddenDim), (self.hiddenDim, self.hiddenDim))
        return W, U

    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def softmax(self, z):
        exps = np.exp(z - np.max(z))
        return exps / np.sum(exps)

    def forward(self, data):
        T = len(data)
        output = np.zeros((T, self.wordDim, 1))
        # One extra row: at t = 0, hidden[t - 1] wraps around to hidden[T],
        # which stays all-zero and serves as the initial hidden state.
        hidden = np.zeros((T + 1, self.hiddenDim, 1))
        cPre = np.zeros((self.hiddenDim, 1))
        states = list()
        for t in range(T):
            state = dict()
            # One-hot encode the current word index.
            X = np.zeros((self.wordDim, 1))
            X[data[t]][0] = 1
            a = np.tanh(np.dot(self.Wa, X) + np.dot(self.Ua, hidden[t - 1]))        # candidate
            i = self.sigmoid(np.dot(self.Wi, X) + np.dot(self.Ui, hidden[t - 1]))   # input gate
            f = self.sigmoid(np.dot(self.Wf, X) + np.dot(self.Uf, hidden[t - 1]))   # forget gate
            o = self.sigmoid(np.dot(self.Wo, X) + np.dot(self.Uo, hidden[t - 1]))   # output gate
            c = np.multiply(i, a) + np.multiply(f, cPre)                            # new cell state
            state['a'] = a
            state['i'] = i
            state['f'] = f
            state['o'] = o
            state['c'] = c
            states.append(state.copy())
            cPre = c
            hidden[t] = np.multiply(o, np.tanh(c))
            y = self.softmax(np.dot(self.Wy, hidden[t]))
            output[t] = y
        # Sentinel state with a zero cell, convenient for backpropagation.
        state = dict()
        state['c'] = np.zeros((self.hiddenDim, 1))
        states.append(state.copy())
        return hidden, output, states
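As a minimal sketch of a forward pass over a toy word-index sequence (the indices are illustrative):

import numpy as np

lstm = LSTM()
data = [17, 42, 3]        # word indices into the 6000-word vocabulary
hidden, output, states = lstm.forward(data)
print(hidden.shape)       # (4, 100, 1): T + 1 hidden states
print(output.shape)       # (3, 6000, 1): one distribution over words per step
print(len(states))        # 4: one gate/cell dict per step, plus the sentinel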