Python Basic
$ python3
>>> 1 + 2
3
>>> 1 - 2
-1
>>> 4 * 5
20
>>> 7 / 5
1.4
>>> 3 ** 2
9
>>> type(10)
<class 'int'>
>>> type(2.718)
<class 'float'>
>>> type("hello")
<class 'str'>
>>> x = 10
>>> print(x)
10
>>> x = 100
>>> print(x)
100
>>> y = 3.14
>>> x * y
314.0
>>> type(x * y)
<class 'float'>
>>> a = [1, 2, 3, 4, 5]
>>> print(a)
[1, 2, 3, 4, 5]
>>> len(a)
5
>>> a[0]
1
>>> a[4]
5
>>> a[4] = 99
>>> print(a)
[1, 2, 3, 4, 99]
>>> a[0:2]
[1, 2]
>>> a[1:]
[2, 3, 4, 99]
>>> a[:3]
[1, 2, 3]
>>> a[:-1]
[1, 2, 3, 4]
>>> a[:-2]
[1, 2, 3]
>>> me = {'height' : 100}
>>> me['height']
100
>>> me['height'] = 70
>>> print(me)
{'height': 70}
>>> me['weight'] = 80
>>> print(me)
{'height': 70, 'weight': 80}
>>> hungry = True
>>> sleepy = False
>>> type(hungry)
<class 'bool'>
>>> not hungry
False
>>> hungry and sleepy
False
>>> hungry or sleepy
True
>>> hungry = True
>>> if hungry:
...     print("I'm hungry")
...
I'm hungry
>>> hungry = False
>>> if hungry:
...     print("I'm hungry")
... else:
...     print("I'm not hungry")
...     print("I'm sleepy")
...
I'm not hungry
I'm sleepy
>>> for i in [1, 2, 3]:
...     print(i)
...
1
2
3
>>> def hello():
...     print("Hello world")
...
>>> hello()
Hello world
>>> def hello(object):
...     print("Hello " + object + "!")
...
>>> hello("cat")
Hello cat!
>>>
Class
class Man:
    """A person with a name who can print a greeting and a farewell."""

    def __init__(self, name):
        """Store ``name`` on the instance and print ``Initialized!``."""
        self.name = name
        print("Initialized!")

    def hello(self):
        """Print ``Hello <name>!``."""
        print("Hello " + self.name + "!")

    def goodbye(self):
        """Print ``Good-bye <name>!``."""
        print("Good-bye " + self.name + "!")


# Demo: create an instance and call both methods.
m = Man("David")
m.hello()
m.goodbye()
Numpy
>>> import numpy as np
>>> x = np.array([1.0, 2.0, 3.0])
>>> print(x)
[1. 2. 3.]
>>> type(x)
<class 'numpy.ndarray'>
>>> x = np.array([1.0, 2.0, 3.0])
>>> y = np.array([2.0, 4.0, 6.0])
>>> x + y
array([3., 6., 9.])
>>> x - y
array([-1., -2., -3.])
>>> x * y
array([ 2., 8., 18.])
>>> x / y
array([0.5, 0.5, 0.5])
>>> x = np.array([1.0, 2.0, 3.0])
>>> x / 2.0
array([0.5, 1. , 1.5])
>>> A = np.array([[1,2], [3,4]])
>>> print(A)
[[1 2]
[3 4]]
>>> A.shape
(2, 2)
>>> A.dtype
dtype('int64')
>>> B = np.array([[3,0], [0,6]])
>>> A + B
array([[ 4, 2],
[ 3, 10]])
>>> A * B
array([[ 3, 0],
[ 0, 24]])
>>> print(A)
[[1 2]
[3 4]]
>>> A * 10
array([[10, 20],
[30, 40]])
>>> A = np.array([[1,2], [3,4]])
>>> B = np.array([10, 20])
>>> A * B
array([[10, 40],
[30, 80]])
>>> X = np.array([[51,55], [14, 19], [0, 4]])
>>> print(X)
[[51 55]
[14 19]
[ 0 4]]
>>> X[0] # get the first row of this array
array([51, 55])
>>> X[0][1] # get the element at position (0, 1)
55
>>> for row in X:
...     print(row)
...
[51 55]
[14 19]
[0 4]
>>> X = X.flatten()
>>> print(X)
[51 55 14 19 0 4]
>>> X[np.array([0, 2, 4])] # get the elements at indices 0, 2 and 4
array([51, 14, 0])
>>> X > 15 # boolean mask: True where the element is greater than 15
array([ True, True, False, True, False, False])
>>> X[X > 15]
array([51, 55, 19])
>>>
Matplotlib
Plot the sin function
import numpy as np
import matplotlib.pyplot as plt
# x: sample points starting at 0, spaced 0.1 apart, stopping before 6.
x = np.arange(start=0, stop=6, step=0.1)
# y: sin evaluated element-wise at every sample point.
y = np.sin(x)

# Draw the curve and display the figure.
plt.plot(x, y)
plt.show()
Plot the image of function sin and cos
import numpy as np
import matplotlib.pyplot as plt
# Evaluate sin and cos on the same sample points.
x = np.arange(0, 6, 0.1)
y1 = np.sin(x)
y2 = np.cos(x)

# One curve per function; cos is dashed so the two can be told apart.
plt.plot(x, y1, label="sin")
plt.plot(x, y2, label="cos", linestyle="--")

# Figure decorations (title, axis labels, legend built from the labels above).
plt.title('sin & cos')
plt.xlabel("x")
plt.ylabel("y")
plt.legend()

plt.show()
Display the image
import matplotlib.pyplot as plt
from matplotlib.image import imread
# Read the image file into an array, then render and display it.
image_path = 'lena.png'  # the path of the image
img = imread(image_path)
plt.imshow(img)
plt.show()
Perceptron
感知机的运行原理
The equation below can represent the behavior of the perceptron
$$y = \begin{cases} 0 & (w_1 x_1 + w_2 x_2 \le \theta) \\ 1 & (w_1 x_1 + w_2 x_2 > \theta) \end{cases}$$
Simple implementation
def AND(x1, x2):
    """Threshold-perceptron AND gate.

    Computes ``w1 * x1 + w2 * x2`` with ``w1 = w2 = 0.5`` and compares it
    with the threshold ``theta = 0.7``.

    Args:
        x1: first input (0 or 1).
        x2: second input (0 or 1).

    Returns:
        0 if the weighted sum is less than or equal to ``theta``, else 1.
    """
    w1, w2, theta = 0.5, 0.5, 0.7
    tmp = w1 * x1 + w2 * x2
    # The two cases are complementary, so a single conditional is enough.
    return 0 if tmp <= theta else 1
# Print the gate output for each input pair, in truth-table order.
for x1, x2 in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    print(AND(x1, x2))
导入权重和偏置
$b$ is called the bias; $w_1$ and $w_2$ are called the weights.
$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 > 0) \end{cases}$$
use numpy to complete a simple neuron
import numpy as np
x = np.array([0, 1])      # input
w = np.array([0.5, 0.5])  # weight
b = -0.7                  # bias

# Element-wise products, their sum, and the sum shifted by the bias.
weighted = w * x
total = np.sum(weighted)
print(weighted)
print(total)
print(total + b)
use bias and weight to complete an AND gate
def AND(x1, x2):
    """AND gate implemented as a perceptron with weights and a bias.

    Args:
        x1: first input (0 or 1).
        x2: second input (0 or 1).

    Returns:
        0 if ``sum(w * x) + b <= 0`` else 1, with ``w = (0.5, 0.5)`` and
        ``b = -0.7``.
    """
    x = np.array([x1, x2])
    w = np.array([0.5, 0.5])
    b = -0.7
    tmp = np.sum(w * x) + b
    return 0 if tmp <= 0 else 1
NAND gate, OR gate
def NAND(x1, x2):
    """NAND gate implemented as a perceptron with weights and a bias.

    Args:
        x1: first input (0 or 1).
        x2: second input (0 or 1).

    Returns:
        0 if ``sum(w * x) + b <= 0`` else 1, with ``w = (-0.5, -0.5)`` and
        ``b = 0.7``.
    """
    x = np.array([x1, x2])
    w = np.array([-0.5, -0.5])  # only the weights and bias differ from AND
    b = 0.7
    tmp = np.sum(w * x) + b
    return 0 if tmp <= 0 else 1
def OR(x1, x2):
    """OR gate implemented as a perceptron with weights and a bias.

    Args:
        x1: first input (0 or 1).
        x2: second input (0 or 1).

    Returns:
        0 if ``sum(w * x) + b <= 0`` else 1, with ``w = (0.5, 0.5)`` and
        ``b = -0.2``.
    """
    x = np.array([x1, x2])
    w = np.array([0.5, 0.5])  # same weights as AND; only the bias differs
    b = -0.2
    tmp = np.sum(w * x) + b
    return 0 if tmp <= 0 else 1
Use the AND, NAND and OR gates to build an XOR gate
def XOR(x1, x2):
    """XOR gate built by combining the NAND, OR and AND gates.

    The inputs are fed to both NAND and OR, and the two intermediate
    outputs are then combined with AND.

    Args:
        x1: first input (0 or 1).
        x2: second input (0 or 1).

    Returns:
        The value of ``AND(NAND(x1, x2), OR(x1, x2))``.
    """
    s1 = NAND(x1, x2)
    s2 = OR(x1, x2)
    return AND(s1, s2)
Neural Network
$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 > 0) \end{cases}$$
引入 $h(x)$
$$y = h(b + w_1 x_1 + w_2 x_2)$$
$$h(x) = \begin{cases} 0 & (x \le 0) \\ 1 & (x > 0) \end{cases}$$
激活函数
activation function
$$a = b + w_1 x_1 + w_2 x_2$$
$$y = h(a)$$
sigmoid 函数
Sigmoid function
$h(x) = \frac{1}{1 + \exp(-x)}$, where $\exp(-x)$ denotes $e^{-x}$
def sigmoid(x):
    """Return the sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    ``np.exp`` is applied element-wise, so ``x`` may be a scalar or a
    NumPy array.
    """
    return 1 / (1 + np.exp(-x))
Function of step
import numpy as np
import matplotlib.pylab as plt
# Scalar-only version:
# def step_function(x):
#     if x > 0:
#         return 1
#     else:
#         return 0
# Array version (np.int was removed in NumPy 1.24; use the builtin int):
# def step_function(x):
#     y = x > 0
#     return y.astype(int)
def step_function(x):
    """Return 1 where ``x > 0`` and 0 elsewhere, as an ``int64`` array.

    The comparison ``x > 0`` yields booleans, which are converted to
    integers by building an array with ``dtype=np.int64``.
    """
    return np.array(x > 0, dtype=np.int64)
# Evaluate the step function on sample points from -5.0 up to (not including) 5.0.
x = np.arange(-5.0, 5.0, 0.1)
y = step_function(x)

plt.plot(x, y)
# Set the range of the y axis.
plt.ylim(-0.1, 1.1)
plt.show()
Sigmoid function and step function:
import numpy as np
import matplotlib.pylab as plt
# Scalar-only version:
# def step_function(x):
#     if x > 0:
#         return 1
#     else:
#         return 0
# Array version (np.int was removed in NumPy 1.24; use the builtin int):
# def step_function(x):
#     y = x > 0
#     return y.astype(int)
def step_function(x):
    """Return 1 where ``x > 0`` and 0 elsewhere, as an ``int64`` array."""
    # ``x > 0`` is boolean; the dtype argument converts it to 0/1 integers.
    return np.array(x > 0, dtype=np.int64)
def sigmoid(x):
    """Return ``1 / (1 + exp(-x))``, computed element-wise for arrays."""
    return 1 / (1 + np.exp(-x))
# Evaluate both activation functions on the same sample points.
x = np.arange(-5.0, 5.0, 0.1)
y1 = step_function(x)
y2 = sigmoid(x)

# Step function dashed, sigmoid solid.
plt.plot(x, y1, linestyle="--")
plt.plot(x, y2)
# Set the range of the y axis.
plt.ylim(-0.1, 1.1)
plt.show()
ReLU函数
ReLU function
$$h(x) = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}$$
def relu(x):
    """Return the ReLU of ``x``: the element-wise maximum of 0 and ``x``."""
    return np.maximum(0, x)
多维数组
>>> import numpy as np
>>> A = np.array([1, 2, 3, 4])
>>> print(A)
[1 2 3 4]
>>> np.ndim(A)
1
>>> A.shape
(4,)
>>> A.shape[0]
4
>>> B = np.array([[1,2], [3,4], [5,6]])
>>> print(B)
[[1 2]
[3 4]
[5 6]]
>>> np.ndim(B)
2
>>> B.shape
(3, 2)
矩阵乘法
>>> A = np.array([[1,2], [3,4]])
>>> A.shape
(2, 2)
>>> B = np.array([[5,6], [7,8]])
>>> B.shape
(2, 2)
>>> np.dot(A, B)
array([[19, 22],
[43, 50]])
>>> A = np.array([[1,2,3], [4,5,6]])
>>> A.shape
(2, 3)
>>> B = np.array([[1,2], [3,4], [5,6]])
>>> B.shape
(3, 2)
>>> np.dot(A, B)
array([[22, 28],
[49, 64]])
>>> C = np.array([[1,2], [3,4]])
>>> C.shape
(2, 2)
>>> A.shape
(2, 3)
>>> np.dot(A, C)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ValueError: shapes (2,3) and (2,2) not aligned: 3 (dim 1) != 2 (dim 0)
>>> A = np.array([[1,2], [3, 4], [5,6]])
>>> A.shape
(3, 2)
>>> B = np.array([7,8])
>>> B.shape
(2,)
>>> np.dot(A, B)
array([23, 53, 83])
>>> A = np.array([[1,2], [3, 4], [5,6]])
>>> A.shape
(3, 2)
>>> B = np.array([7,8])
>>> B.shape
(2,)
>>> np.dot(A, B)
array([23, 53, 83])
神经网络内积
>>> X = np.array([1, 2])
>>> X.shape
(2,)
>>> W = np.array([[1, 3, 5], [2, 4, 6]])
>>> print(W)
[[1 3 5]
[2 4 6]]
>>> W.shape
(2, 3)
>>> Y = np.dot(X, W)
>>> print(Y)
[ 5 11 17]
多层神经网络
$w^{(1)}_{12}$ 中 $(1)$ 表示第1层的权重,1 表示后一层的第一个神经元,2 表示前一层的第2个神经元
权重右下角按照“后一层的索引号、前一层的索引号”的顺序排列
各层间信号传递的实现
例如:
$$a^{(1)}_1 = w^{(1)}_{11} x_1 + w^{(1)}_{12} x_2 + b^{(1)}_1$$
如果使用矩阵的乘法运算,则可以将第1层的加权和表示成下面的式
$$A^{(1)} = XW^{(1)} + B^{(1)}$$
其中,$A^{(1)}$、$X$、$W^{(1)}$、$B^{(1)}$ 如下所示:
$$A^{(1)} = \begin{pmatrix} a^{(1)}_1 & a^{(1)}_2 & a^{(1)}_3 \end{pmatrix},\quad X = \begin{pmatrix} x_1 & x_2 \end{pmatrix},\quad B^{(1)} = \begin{pmatrix} b^{(1)}_1 & b^{(1)}_2 & b^{(1)}_3 \end{pmatrix},\quad W^{(1)} = \begin{pmatrix} w^{(1)}_{11} & w^{(1)}_{21} & w^{(1)}_{31} \\ w^{(1)}_{12} & w^{(1)}_{22} & w^{(1)}_{32} \end{pmatrix}$$
# Input, first-layer weights and first-layer biases.
X = np.array([1.0, 0.5])
W1 = np.array([[0.1, 0.3, 0.5],
               [0.2, 0.4, 0.6]])
B1 = np.array([0.1, 0.2, 0.3])

print(W1.shape)  # (2, 3)
print(X.shape)   # (2,)
print(B1.shape)  # (3,)

# Weighted sum of the first layer: (2,) . (2, 3) + (3,) -> (3,)
A1 = X.dot(W1) + B1
$W1$ 为 $2 \times 3$ 的数组,$X$ 为元素个数为 2 的一维数组,使用 sigmoid 作为激活函数
# Apply the sigmoid activation to the first layer's weighted sums.
Z1 = sigmoid(A1)

# Prints A1 ([0.3, 0.7, 1.1]) and then
# Z1 ([0.57444252, 0.66818777, 0.75026011]).
for layer_values in (A1, Z1):
    print(layer_values)
第一层的输出变成第二层的输入
# Second layer: takes the 3 activations of the first layer, produces 2 values.
W2 = np.array([[0.1, 0.4],
               [0.2, 0.5],
               [0.3, 0.6]])
B2 = np.array([0.1, 0.2])

# Prints (3,), then (3, 2), then (2,).
for shape in (Z1.shape, W2.shape, B2.shape):
    print(shape)

A2 = Z1.dot(W2) + B2  # (3,) . (3, 2) + (2,) -> (2,)
Z2 = sigmoid(A2)      # (2,)
激活函数
def identity_function(x):
    """Return ``x`` unchanged (used as the output-layer activation)."""
    return x
# Output layer: weighted sum of the second layer's activations.
W3 = np.array([[0.1, 0.3],
               [0.2, 0.4]])
B3 = np.array([0.1, 0.2])
A3 = Z2.dot(W3) + B3

# The identity function returns its input, so this is the same as Y = A3.
Y = identity_function(A3)
代码实现
import numpy as np
def sigmoid(x):
    """Sigmoid activation: ``1 / (1 + exp(-x))``, element-wise for arrays."""
    return 1 / (1 + np.exp(-x))
def identity_function(x):
    """Identity activation: return the argument as-is."""
    return x
def init_network():
    """Build the example network's fixed parameters.

    Returns:
        dict: weight arrays under ``'W1'``, ``'W2'``, ``'W3'`` with shapes
        (2, 3), (3, 2), (2, 2) and bias arrays under ``'b1'``, ``'b2'``,
        ``'b3'`` with shapes (3,), (2,), (2,).
    """
    return {
        'W1': np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]]),
        'b1': np.array([0.1, 0.2, 0.3]),
        'W2': np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]]),
        'b2': np.array([0.1, 0.2]),
        'W3': np.array([[0.1, 0.3], [0.2, 0.4]]),
        'b3': np.array([0.1, 0.2]),
    }
def forward(network, x):
    """Run one forward pass through the three-layer network.

    Args:
        network: dict holding the weights ``'W1'``..``'W3'`` and biases
            ``'b1'``..``'b3'``.
        x: input array; it is multiplied with ``W1`` via ``np.dot``.

    Returns:
        The output of the third layer after ``identity_function``.
    """
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']

    # Hidden layers: affine transform followed by sigmoid.
    z1 = sigmoid(np.dot(x, W1) + b1)
    z2 = sigmoid(np.dot(z1, W2) + b2)

    # Output layer: affine transform followed by the identity function.
    a3 = np.dot(z2, W3) + b3
    return identity_function(a3)
# Build the fixed-parameter network and push one input through it.
network = init_network()
x = np.array([1.0, 0.5])
y = forward(network, x)
# Print the two output-layer values.
print(y)
softmax 函数
$$y_k = \frac{\exp(a_k)}{\sum_{i=1}^{n} \exp(a_i)}$$
>>> import numpy as np
>>> a = np.array([0.3, 2.9, 4.0])
>>> exp_a = np.exp(a)
>>> print(exp_a)
[ 1.34985881 18.17414537 54.59815003]
>>> sum_exp_a = np.sum(exp_a)
>>> print(sum_exp_a)
74.1221542101633
>>> y = exp_a / sum_exp_a
>>> print(y)
[0.01821127 0.24519181 0.73659691]
Softmax function
def softmax(a):
    """Naive softmax: ``exp(a)`` divided by the sum of ``exp(a)``.

    The sum runs over all elements of ``a``. No overflow protection is
    applied, so large inputs make ``np.exp`` overflow and the result
    becomes ``nan``.
    """
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    return exp_a / sum_exp_a
$$\begin{aligned} y_k = \frac{\exp(a_k)}{\sum_{i=1}^{n} \exp(a_i)} &= \frac{C \exp(a_k)}{C \sum_{i=1}^{n} \exp(a_i)} \\ &= \frac{\exp(a_k + \log C)}{\sum_{i=1}^{n} \exp(a_i + \log C)} \\ &= \frac{\exp(a_k + C')}{\sum_{i=1}^{n} \exp(a_i + C')} \end{aligned}$$
>>> a = np.array([1010, 1000, 990])
>>> np.exp(a) / np.sum(np.exp(a))
<stdin>:1: RuntimeWarning: invalid value encountered in divide
array([nan, nan, nan])
>>> c = np.max(a)
>>> a - c
array([ 0, -10, -20])
>>> np.exp(a - c) / np.sum(np.exp(a - c))
array([9.99954600e-01, 4.53978686e-05, 2.06106005e-09])
def softmax(a):
    """Softmax with overflow protection.

    Subtracting the maximum of ``a`` before exponentiating leaves the
    result unchanged mathematically but keeps every exponent at or below
    zero, so ``np.exp`` cannot overflow.

    Args:
        a: array of scores; the maximum and the sum are taken over all of
            its elements.

    Returns:
        Array of the same shape as ``a`` whose elements sum to 1.
    """
    c = np.max(a)
    exp_a = np.exp(a - c)  # overflow countermeasure
    sum_exp_a = np.sum(exp_a)
    return exp_a / sum_exp_a
Deep Learning
均方误差
mean squared error
$$E = \frac{1}{2} \sum_k (y_k - t_k)^2$$
import numpy as np
# Example network output (y) and one-hot target (t) for ten classes.
y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


def mean_squared_error(y, t):
    """Return half the sum of squared differences between ``y`` and ``t``.

    Args:
        y: NumPy array of outputs.
        t: NumPy array of targets, broadcastable against ``y``.
    """
    return 0.5 * np.sum((y - t)**2)


# The lists are converted to arrays so that ``y - t`` is element-wise.
print(mean_squared_error(np.array(y), np.array(t)))
交叉熵误差
cross entropy error
$$E = -\sum_k t_k \log y_k$$
def cross_entropy_error(y, t):
    """Return the cross-entropy error ``-sum(t * log(y))``.

    Args:
        y: NumPy array of outputs.
        t: NumPy array of targets, multiplied element-wise with ``log(y)``.
    """
    # A tiny constant keeps np.log away from log(0) = -inf.
    delta = 1e-7
    return -np.sum(t * np.log(y + delta))
mini-batch 学习
$$E = -\frac{1}{N} \sum_n \sum_k t_{nk} \log y_{nk}$$
mini-batch 版交叉误差
def cross_entropy_error(y, t):
    """Return the cross-entropy error averaged over a mini-batch.

    Args:
        y: NumPy array of outputs; a 1-D array is treated as a batch
            containing a single sample.
        t: NumPy array of targets, multiplied element-wise with ``log(y)``.

    Returns:
        ``-sum(t * log(y + 1e-7))`` divided by the number of rows of ``y``.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of size 1.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size
    # Alternative when t holds class indices instead of one-hot rows:
    # return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
导数
numerical differentiation
$$\frac{df(x)}{dx} = \lim_{h \to 0} \frac{f(x + h) - f(x)}{h}$$
def numerical_diff(f, x):
    """Approximate the derivative of ``f`` at ``x`` by a central difference.

    Args:
        f: function of one argument.
        x: point at which to differentiate.

    Returns:
        ``(f(x + h) - f(x - h)) / (2 * h)`` with ``h = 1e-4``.
    """
    h = 1e-4
    return (f(x + h) - f(x - h)) / (2 * h)
偏导数
$$\frac{\partial f}{\partial x_0},\ \frac{\partial f}{\partial x_1}$$
梯度
$$\left(\frac{\partial f}{\partial x_0},\ \frac{\partial f}{\partial x_1}\right)$$
def numerical_gradient(f, x):
    """Approximate the gradient of ``f`` at ``x`` by central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h`` and
    restored afterwards, so ``x`` holds its original values on return.
    Iterating with ``np.ndindex`` makes this work for arrays of any number
    of dimensions, not only 1-D arrays.

    Args:
        f: function called as ``f(x)`` that returns a scalar.
        x: NumPy array. NOTE(review): presumably a float array; an integer
            array would truncate the ``h`` perturbation when written back.

    Returns:
        Array created with ``np.zeros_like(x)`` holding one partial
        derivative per element of ``x``.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # array with the same shape as x

    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]

        # f(x + h)
        x[idx] = tmp_val + h
        fxh1 = f(x)

        # f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)

        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value

    return grad
梯度法
$$x_0 = x_0 - \eta \frac{\partial f}{\partial x_0}$$
$$x_1 = x_1 - \eta \frac{\partial f}{\partial x_1}$$
梯度下降法:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimise ``f`` by repeatedly stepping against its numerical gradient.

    Args:
        f: function called as ``f(x)`` that returns a scalar.
        init_x: starting point (array-like). It is copied, so the caller's
            array is left untouched.
        lr: learning rate, the factor applied to the gradient in each step.
        step_num: number of update steps.

    Returns:
        The point reached after ``step_num`` updates, as a float array.
    """
    # Work on a private float copy; ``x -= ...`` below updates in place.
    x = np.array(init_x, dtype=float)

    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad

    return x
神经网络的梯度
$W$ 为 $2 \times 3$ 权重的神经网络,$L$ 表示损失函数
$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}$$
梯度用 $\frac{\partial L}{\partial W}$ 表示
$$\frac{\partial L}{\partial W} = \begin{pmatrix} \frac{\partial L}{\partial w_{11}} & \frac{\partial L}{\partial w_{12}} & \frac{\partial L}{\partial w_{13}} \\ \frac{\partial L}{\partial w_{21}} & \frac{\partial L}{\partial w_{22}} & \frac{\partial L}{\partial w_{23}} \end{pmatrix}$$
简单层的实现
乘法层
class MulLayer:
    """Multiplication node of a computational graph."""

    def __init__(self):
        # Inputs of the latest forward pass, needed by backward().
        self.x = None
        self.y = None

    def forward(self, x, y):
        """Remember both inputs and return their product ``x * y``."""
        self.x = x
        self.y = y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream derivative ``dout``.

        Each input's derivative is ``dout`` times the other input.
        """
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy
加法层
class AddLayer:
    """Addition node of a computational graph; it keeps no state."""

    def __init__(self):
        pass

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``: the upstream derivative passed to both inputs."""
        dx = dout * 1
        dy = dout * 1
        return dx, dy
ReLU 层
class Relu:
    """ReLU layer operating on NumPy arrays."""

    def __init__(self):
        # Boolean array marking the inputs that were <= 0 in forward().
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` set to 0."""
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return ``dout`` with the positions masked in forward() set to 0.

        A copy is modified so that the caller's ``dout`` array is left
        unchanged.
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx
误差反向传播法
$$\frac{\partial z}{\partial x} = \frac{\partial z}{\partial t} \frac{\partial t}{\partial x}$$
$$y = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}, \qquad \frac{\partial y}{\partial x} = \begin{cases} 1 & (x > 0) \\ 0 & (x \le 0) \end{cases}$$
$y = \frac{1}{1 + \exp(-x)}$ ; for the "/" node, which computes $y = \frac{1}{x}$, the derivative is $\frac{\partial y}{\partial x} = -\frac{1}{x^2} = -y^2$
$$\begin{matrix} x & \overset{\times(-1)}{\rightarrow} & -x & \overset{\exp}{\rightarrow} & \exp(-x) & \overset{+1}{\rightarrow} & 1 + \exp(-x) & \overset{/}{\rightarrow} & y \\ \frac{\partial L}{\partial y} y^2 \exp(-x) & \overset{\times(-1)}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 \exp(-x) & \overset{\exp}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 & \overset{+1}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 & \overset{/}{\leftarrow} & \frac{\partial L}{\partial y} \end{matrix}$$
$$\begin{aligned} \frac{\partial L}{\partial y} y^2 \exp(-x) &= \frac{\partial L}{\partial y} \frac{1}{(1 + \exp(-x))^2} \exp(-x) \\ &= \frac{\partial L}{\partial y} \frac{1}{1 + \exp(-x)} \frac{\exp(-x)}{1 + \exp(-x)} \\ &= \frac{\partial L}{\partial y} y (1 - y) \end{aligned}$$
class Sigmoid:
    """Sigmoid layer that keeps its output for the backward pass."""

    def __init__(self):
        # Output of the latest forward pass.
        self.out = None

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and remember it in ``self.out``."""
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out

    def backward(self, dout):
        """Return ``dout * (1 - y) * y`` where ``y`` is the stored output."""
        return dout * (1.0 - self.out) * self.out
$$\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} \cdot W^T, \qquad \frac{\partial L}{\partial W} = X^T \cdot \frac{\partial L}{\partial Y}$$
$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}, \qquad W^T = \begin{pmatrix} w_{11} & w_{21} \\ w_{12} & w_{22} \\ w_{13} & w_{23} \end{pmatrix}$$
$$X = (x_0, x_1, \ldots, x_n), \qquad \frac{\partial L}{\partial X} = \left(\frac{\partial L}{\partial x_0}, \frac{\partial L}{\partial x_1}, \ldots, \frac{\partial L}{\partial x_n}\right)$$
Affine 层
class Affine:
    """Affine layer computing ``np.dot(x, W) + b``."""

    def __init__(self, W, b):
        """Store the weight ``W`` and bias ``b``; derivatives start as None."""
        self.W = W
        self.b = b
        self.x = None   # input of the latest forward pass
        self.dW = None  # derivative w.r.t. W, set by backward()
        self.db = None  # derivative w.r.t. b, set by backward()

    def forward(self, x):
        """Remember ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` and return the derivative w.r.t. the input.

        ``db`` sums ``dout`` over axis 0, so ``dout`` is assumed to carry
        one row per sample.
        """
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx
Softmax-with-Loss 层
class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, as a single layer.

    ``forward`` relies on ``softmax`` and ``cross_entropy_error`` being
    defined elsewhere in the module.
    """

    def __init__(self):
        self.loss = None  # loss value of the latest forward pass
        self.y = None     # output of softmax
        self.t = None     # supervision data (one-hot vector)

    def forward(self, x, t):
        """Store ``t`` and ``softmax(x)``, then return the cross-entropy loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return ``(y - t) / batch_size``.

        ``batch_size`` is the first dimension of ``t``; ``dout`` is accepted
        but not used in the computation.
        """
        batch_size = self.t.shape[0]
        return (self.y - self.t) / batch_size
Trick
SGD
$$W \leftarrow W - \eta \frac{\partial L}{\partial W}$$
class SGD:
    """Stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr  # learning rate

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: dict mapping a name to a parameter array.
            grads: dict with the same keys holding the gradients.
        """
        for key in params:
            params[key] -= self.lr * grads[key]
Momentum
$$v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W}$$
$$W \leftarrow W + v$$
class Momentum:
    """Momentum SGD: ``v = momentum * v - lr * grad``, then ``param += v``."""

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocity, created on the first update

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: dict mapping a name to a parameter array.
            grads: dict with the same keys holding the gradients.
        """
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        for key in params:
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]
AdaGrad
$$h \leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}$$
$$W \leftarrow W - \eta \frac{1}{\sqrt{h}} \frac{\partial L}{\partial W}$$
class AdaGrad:
    """AdaGrad: scales each step by the root of the accumulated squared gradient."""

    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None  # per-parameter sum of squared gradients

    def update(self, params, grads):
        """Update every entry of ``params`` in place.

        Args:
            params: dict mapping a name to a parameter array.
            grads: dict with the same keys holding the gradients.
        """
        if self.h is None:
            # Start every accumulator at zero, shaped like its parameter.
            self.h = {key: np.zeros_like(val) for key, val in params.items()}

        # Iterate over the keys; iterating over items() would yield
        # (key, value) tuples, which are not valid keys of self.h.
        for key in params:
            self.h[key] += grads[key] * grads[key]
            # 1e-7 avoids a division by zero while h is still 0.
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)
Batch Normalization
$$\mu_B \leftarrow \frac{1}{m} \sum_{i=1}^{m} x_i$$
$$\sigma_B^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m} (x_i - \mu_B)^2$$
$$\hat{x}_i \leftarrow \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \varepsilon}}$$
$$y_i \leftarrow \gamma \hat{x}_i + \beta$$
Dropout
class Dropout:
    """Dropout layer that zeroes a random subset of its input while training."""

    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None  # boolean keep-mask drawn by the latest training pass

    def forward(self, x, train_flg=True):
        """Apply dropout to ``x``.

        When ``train_flg`` is true, a fresh mask is drawn (True where a
        uniform random number exceeds ``dropout_ratio``) and ``x * mask`` is
        returned. Otherwise ``x`` is scaled by ``1 - dropout_ratio``.
        """
        if train_flg:
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        """Return ``dout`` multiplied by the mask of the latest training pass."""
        return dout * self.mask
卷积神经网络
假设输入大小为 $(H, W)$,滤波器大小为 $(FH, FW)$,输出大小为 $(OH, OW)$,填充为 $P$,步幅为 $S$。此时,输出大小可通过下式进行计算
$$OH = \frac{H + 2P - FH}{S} + 1, \qquad OW = \frac{W + 2P - FW}{S} + 1$$
通道数为 $C$、高度为 $H$、长度为 $W$ 的数据的形状可以写成 $(C, H, W)$。滤波器也一样,要按 (channel, height, width) 的顺序书写。比如,通道数为 $C$、滤波器高度为 $FH$ (Filter Height)、长度为 $FW$ (Filter Width) 时,可以写成 $(C, FH, FW)$
$$(C, H, W) \ast (C, FH, FW) \rightarrow (1, OH, OW)$$
基于多个滤波器的卷积运算
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW)$$
卷积运算的处理流
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW) + (FN, 1, 1) \rightarrow (FN, OH, OW)$$
卷积运算的处理流(批处理)
$$(N, C, H, W) \ast (FN, C, FH, FW) \rightarrow (N, FN, OH, OW) + (FN, 1, 1) \rightarrow (N, FN, OH, OW)$$
class Convolution:
    """Convolution layer implemented with ``im2col`` and one matrix product.

    ``forward`` relies on an ``im2col`` helper that is not defined in this
    snippet.
    """

    def __init__(self, W, b, stride=1, pad=0):
        """Store the filters ``W``, the bias ``b``, the stride and the padding."""
        self.W = W
        self.b = b
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        """Convolve ``x`` with the stored filters.

        Args:
            x: 4-D array unpacked as ``(N, C, H, W)``; the filters are
                unpacked as ``(FN, C, FH, FW)``.

        Returns:
            Array reshaped to ``(N, out_h, out_w, -1)`` and transposed to
            ``(N, -1, out_h, out_w)``.
        """
        FN, C, FH, FW = self.W.shape
        N, C, H, W = x.shape
        out_h = int(1 + (H + 2*self.pad - FH) / self.stride)
        out_w = int(1 + (W + 2*self.pad - FW) / self.stride)

        # NOTE(review): im2col presumably returns one row per output
        # position, matching the filter rows below - confirm against its
        # definition.
        col = im2col(x, FH, FW, self.stride, self.pad)
        col_W = self.W.reshape(FN, -1).T  # flatten each filter into a column
        out = np.dot(col, col_W) + self.b

        # Move the filter axis in front of the spatial axes.
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
        return out
池化层
class Pooling:
    """Max-pooling layer implemented with ``im2col``.

    ``forward`` relies on an ``im2col`` helper that is not defined in this
    snippet.
    """

    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        """Store the pooling window size, the stride and the padding."""
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad

    def forward(self, x):
        """Take the maximum over each pooling window of ``x``.

        Args:
            x: 4-D array unpacked as ``(N, C, H, W)``.

        Returns:
            Array reshaped to ``(N, out_h, out_w, C)`` and transposed to
            ``(N, C, out_h, out_w)``.
        """
        N, C, H, W = x.shape
        # The output size is computed without the padding term, although
        # ``pad`` is passed to im2col below.
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)

        # (1) Expand: one row per pooling window.
        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        col = col.reshape(-1, self.pool_h*self.pool_w)

        # (2) Maximum of every row.
        out = np.max(col, axis=1)

        # (3) Reshape and move the channel axis in front of the spatial axes.
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        return out