Derivative
Partial derivative
Gradient
# tf.GradientTape
import tensorflow as tf

w = tf.constant(1.)
x = tf.constant(2.)
y = x * w

with tf.GradientTape() as tape:
    tape.watch([w])
    y2 = x * w
grad1 = tape.gradient(y, [w])   # [None]: y was computed outside the tape, so it was never recorded

with tf.GradientTape() as tape:
    tape.watch([w])
    y2 = x * w
grad2 = tape.gradient(y2, [w])  # [2.0]: y2 = x * w was recorded, dy2/dw = x

with tf.GradientTape(persistent=True) as tape:
    tape.watch([w])
    y2 = x * w
grad3 = tape.gradient(y2, [w])  # persistent=True allows calling gradient() more than once
grad4 = tape.gradient(y2, [w])
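As a side note (a minimal sketch of my own, not from the original notes): if the parameter is created as a `tf.Variable`, the tape tracks it automatically and `tape.watch` is unnecessary.

# Sketch: trainable tf.Variable tensors are watched automatically,
# so tape.watch() is only needed for plain tf.constant / tf.Tensor inputs.
w_var = tf.Variable(1.)
x2 = tf.constant(2.)
with tf.GradientTape() as tape:
    y_var = x2 * w_var
print(tape.gradient(y_var, w_var))  # tf.Tensor(2.0, ...)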
Second-order derivatives
import tensorflow as tf

w = tf.Variable(1.)
b = tf.Variable(2.)
x = tf.Variable(3.)

with tf.GradientTape() as t1:
    with tf.GradientTape() as t2:
        y = x * w + b
    # first-order gradients, taken inside t1 so that t1 records them
    dy_dw, dy_db = t2.gradient(y, [w, b])
d2y_dw2 = t1.gradient(dy_dw, w)

print(dy_dw)    # dy/dw = x = 3.0
print(dy_db)    # dy/db = 1.0
print(d2y_dw2)  # None: y is linear in w, so the second derivative is 0

assert dy_dw.numpy() == 3.0
assert d2y_dw2 is None
Activation functions
Sigmoid / Logistic
$f(x) = \sigma(x) = \frac{1}{1+e^{-x}}$
$\sigma' = \sigma(1-\sigma)$
a = tf.linspace(-10., 10., 10)
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.nn.sigmoid(a)
grads = tape.gradient(y, [a])
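As a quick sanity check (my own sketch, not in the original), the autodiff result should match the closed-form derivative $\sigma(1-\sigma)$:

# Sketch: check that autodiff matches sigma * (1 - sigma)
a = tf.linspace(-10., 10., 10)
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.nn.sigmoid(a)
auto_grad = tape.gradient(y, a)
analytic = y * (1. - y)
print(tf.reduce_max(tf.abs(auto_grad - analytic)))  # ~0 up to floating-point error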
Tanh
$f(x) = \tanh(x) = \frac{e^{x}-e^{-x}}{e^{x}+e^{-x}} = 2\,\mathrm{sigmoid}(2x) - 1$
$\frac{d}{dx}\tanh(x) = 1 - \tanh^{2}(x)$
a = tf.linspace(-5., 5., 10)
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.nn.tanh(a)
grads = tape.gradient(y, [a])
Rectified Linear Unit
$f(x)=\left\{\begin{array}{rcl} 0 & \text{for} & x<0 \\ x & \text{for} & x\geqslant 0 \end{array}\right.$
$f'(x)=\left\{\begin{array}{rcl} 0 & \text{for} & x<0 \\ 1 & \text{for} & x\geqslant 0 \end{array}\right.$
a = tf.linspace(-1., 1., 10)
tf.nn.relu(a)
tf.nn.leaky_relu(a)
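The difference shows up in the backward pass (a small sketch of my own; tf.nn.leaky_relu uses a negative slope alpha of 0.2 by default):

# Sketch: ReLU passes zero gradient for negative inputs,
# while leaky ReLU keeps a small constant slope (alpha) there.
a = tf.linspace(-1., 1., 10)
with tf.GradientTape(persistent=True) as tape:
    tape.watch(a)
    y_relu = tf.nn.relu(a)
    y_leaky = tf.nn.leaky_relu(a)
print(tape.gradient(y_relu, a))   # 0 for a < 0, 1 for a >= 0
print(tape.gradient(y_leaky, a))  # alpha for a < 0, 1 for a >= 0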
Typical Loss
- Mean Squared Error
  - $loss = \sum[y-(xw+b)]^2$
  - $L_{2\text{-}norm} = ||y-(xw+b)||_2$
  - $loss = \text{norm}(y-(xw+b))^2$
- MSE Derivative
  - $loss = \sum[y-f_\theta(x)]^2$
  - $\frac{\nabla loss}{\nabla\theta} = -2\sum[y-f_\theta(x)]\,\frac{\nabla f_\theta(x)}{\nabla\theta}$
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])

with tf.GradientTape() as tape:
    tape.watch([w, b])  # not needed if w and b are tf.Variable
    prob = tf.nn.softmax(x @ w + b, axis=1)
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
grads = tape.gradient(loss, [w, b])
grads[0]
grads[1]
- Cross Entropy Loss
  - binary
  - multi-class
- softmax
  - soft version of max
  - $S(y_i)=\frac{e^{y_i}}{\sum_{j}e^{y_j}}$
  - Details are left to the Logistic Regression part
- SoftMax Derivative
$p_i=\frac{e^{a_i}}{\sum_{k=1}^{N}e^{a_k}}$

$\frac{\partial p_i}{\partial a_j}=\left\{\begin{array}{rcl} p_i(1-p_j) & if & i=j \\ -p_j\,p_i & if & i\neq j \end{array}\right.$

Or, using the Kronecker delta $\delta_{ij}=\left\{\begin{array}{rcl} 1 & if & i=j \\ 0 & if & i\neq j \end{array}\right.$:

$\frac{\partial p_i}{\partial a_j} = p_i(\delta_{ij}-p_j)$
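A small numerical check (my own sketch, not part of the original notes): the softmax Jacobian from tf.GradientTape should match $p_i(\delta_{ij}-p_j)$.

# Sketch: verify the softmax Jacobian formula p_i * (delta_ij - p_j) against autodiff
a = tf.constant([1.0, 2.0, 3.0])
with tf.GradientTape() as tape:
    tape.watch(a)
    p = tf.nn.softmax(a)
jac = tape.jacobian(p, a)                                   # [3, 3], element [i, j] = dp_i/da_j
analytic = tf.linalg.diag(p) - tf.tensordot(p, p, axes=0)   # p_i*delta_ij - p_i*p_j
print(tf.reduce_max(tf.abs(jac - analytic)))                # ~0 up to float error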
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])

with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = x @ w + b
    loss = tf.reduce_mean(
        tf.losses.categorical_crossentropy(tf.one_hot(y, depth=3), logits, from_logits=True))
grads = tape.gradient(loss, [w, b])
grads[0]
grads[1]
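Passing raw logits with from_logits=True lets TensorFlow fuse softmax and cross-entropy, which is numerically more stable than applying softmax first. As a side sketch (my addition, continuing the block above), the integer labels can also be fed directly to the sparse variant, skipping the one_hot step:

# Sketch: sparse_categorical_crossentropy takes integer labels directly
loss_sparse = tf.reduce_mean(
    tf.losses.sparse_categorical_crossentropy(y, logits, from_logits=True))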
Perceptron and its gradient
Single-output perceptron
Perceptron
- $y = XW + b$
- $y = \sum x_i * w_i + b$
Derivative
$E=\frac{1}{2}(O_0^1-t)^2$

$\frac{\partial E}{\partial w_{j0}}=(O_0-t)\frac{\partial O_0}{\partial w_{j0}}$

$\frac{\partial E}{\partial w_{j0}}=(O_0-t)\frac{\partial \sigma(x_0)}{\partial w_{j0}}$

$\frac{\partial E}{\partial w_{j0}}=(O_0-t)\sigma(x_0)(1-\sigma(x_0))\frac{\partial x_0^1}{\partial w_{j0}}$

$\frac{\partial E}{\partial w_{j0}}=(O_0-t)O_0(1-O_0)\frac{\partial x_0^1}{\partial w_{j0}}$

$\frac{\partial E}{\partial w_{j0}}=(O_0-t)O_0(1-O_0)x_j^0$
x = tf.random.normal([1, 3])
w = tf.ones([3, 1])
b = tf.ones([1])
y = tf.constant([1])

with tf.GradientTape() as tape:
    tape.watch([w, b])
    prob = tf.nn.sigmoid(x @ w + b)
    loss = tf.reduce_mean(tf.losses.MSE(y, prob))
grads = tape.gradient(loss, [w, b])
grads[0]
grads[1]
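As a check (my own sketch, reusing x, w, b, y and grads from the block above): the tape gradient for w should match the closed form derived above, up to a factor of 2, because tf.losses.MSE does not include the 1/2 in $E$.

# Sketch: compare autodiff with dE/dw_j = (O - t) * O * (1 - O) * x_j,
# scaled by 2 since tf.losses.MSE omits the 1/2 factor of E.
O = tf.nn.sigmoid(x @ w + b)                                  # forward pass, shape [1, 1]
analytic = 2. * (O - tf.cast(y, tf.float32)) * O * (1. - O) * tf.transpose(x)
print(tf.reduce_max(tf.abs(grads[0] - analytic)))             # ~0 up to float error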
Multi-output perceptron
Derivative
$E=\frac{1}{2}\sum(O_i^1-t_i)^2$

$\frac{\partial E}{\partial w_{jk}}=(O_k-t_k)\frac{\partial O_k}{\partial w_{jk}}$

$\frac{\partial E}{\partial w_{jk}}=(O_k-t_k)\frac{\partial \sigma(x_k)}{\partial w_{jk}}$

$\frac{\partial E}{\partial w_{jk}}=(O_k-t_k)\sigma(x_k)(1-\sigma(x_k))\frac{\partial x_k^1}{\partial w_{jk}}$

$\frac{\partial E}{\partial w_{jk}}=(O_k-t_k)O_k(1-O_k)\frac{\partial x_k^1}{\partial w_{jk}}$

$\frac{\partial E}{\partial w_{jk}}=(O_k-t_k)O_k(1-O_k)x_j^0$
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])

with tf.GradientTape() as tape:
    tape.watch([w, b])
    prob = tf.nn.softmax(x @ w + b, axis=1)
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
grads = tape.gradient(loss, [w, b])
grads[0]
grads[1]
Chain rule
x = tf.Variable(1.)
w1 = tf.Variable(2.)
b1 = tf.Variable(1.)
w2 = tf.Variable(2.)
b2 = tf.Variable(1.)

with tf.GradientTape(persistent=True) as tape:
    y1 = x * w1 + b1
    y2 = y1 * w2 + b2
dy2_dy1 = tape.gradient(y2, [y1])[0]
dy1_dw1 = tape.gradient(y1, [w1])[0]
dy2_dw1 = tape.gradient(y2, [w1])[0]
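The chain rule can be checked directly (my addition, continuing the block above):

# Sketch: dy2/dw1 = dy2/dy1 * dy1/dw1 = w2 * x = 2.0
assert dy2_dw1.numpy() == (dy2_dy1 * dy1_dw1).numpy()
print(dy2_dy1.numpy(), dy1_dw1.numpy(), dy2_dw1.numpy())  # 2.0 1.0 2.0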
Multi-layer perceptron model
For a weight $w_{jk}$ feeding an output-layer node, the single-/multi-output result above applies, and the error term is abbreviated as $\delta_k^K$:

$\frac{\partial E}{\partial w_{jk}}=(O_k-t_k)O_k(1-O_k)O_j^J$

$\frac{\partial E}{\partial w_{jk}}=\delta_k^K\,O_j^J$

For a weight $w_{ij}$ feeding a hidden-layer node, the error contributions from all output nodes $k\in K$ must be accumulated:

$\frac{\partial E}{\partial w_{ij}}=\frac{\partial}{\partial w_{ij}}\frac{1}{2}\sum_{k\in K}(O_k-t_k)^2$

$\frac{\partial E}{\partial w_{ij}}=\sum_{k\in K}(O_k-t_k)\frac{\partial}{\partial W_{ij}}O_k$

$\frac{\partial E}{\partial w_{ij}}=\sum_{k\in K}(O_k-t_k)\frac{\partial}{\partial W_{ij}}\sigma(x_k)$

$\frac{\partial E}{\partial w_{ij}}=\sum_{k\in K}(O_k-t_k)O_k(1-O_k)\frac{\partial x_k}{\partial O_j}\cdot\frac{\partial O_j}{\partial W_{ij}}$

$\frac{\partial E}{\partial w_{ij}}=\sum_{k\in K}(O_k-t_k)O_k(1-O_k)W_{jk}\frac{\partial O_j}{\partial W_{ij}}$

$\frac{\partial E}{\partial w_{ij}}=\frac{\partial O_j}{\partial W_{ij}}\sum_{k\in K}(O_k-t_k)O_k(1-O_k)W_{jk}$

$\frac{\partial E}{\partial w_{ij}}=O_j(1-O_j)\frac{\partial x_j}{\partial W_{ij}}\sum_{k\in K}(O_k-t_k)O_k(1-O_k)W_{jk}$

$\frac{\partial E}{\partial w_{ij}}=O_j(1-O_j)O_i\sum_{k\in K}(O_k-t_k)O_k(1-O_k)W_{jk}$

$\frac{\partial E}{\partial w_{ij}}=O_iO_j(1-O_j)\sum_{k\in K}\delta_kW_{jk}$

For an output-layer node $k\in K$:

$\frac{\partial E}{\partial W_{jk}}=O_j\delta_k$, where $\delta_k=O_k(1-O_k)(O_k-t_k)$.

For a hidden-layer node $j\in J$:

$\frac{\partial E}{\partial W_{ij}}=O_i\delta_j$, where $\delta_j=O_j(1-O_j)\sum_{k\in K}\delta_kW_{jk}$.
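A small sketch of my own (not from the original notes): implementing the two δ rules by hand for a tiny sigmoid MLP and checking them against tf.GradientTape. The layer sizes (2-3-2) and the 1/2 sum-of-squares loss are assumptions chosen for the check.

# Sketch: manual backprop with the delta rules above, verified against autodiff.
# Network: input O_i (size 2) -> hidden O_j (size 3, sigmoid) -> output O_k (size 2, sigmoid)
# Loss: E = 1/2 * sum_k (O_k - t_k)^2
import tensorflow as tf

tf.random.set_seed(0)
o_i = tf.random.normal([2])                     # input activations O_i
t = tf.constant([0., 1.])                       # targets t_k
w_ij = tf.Variable(tf.random.normal([2, 3]))    # input -> hidden weights W_ij
w_jk = tf.Variable(tf.random.normal([3, 2]))    # hidden -> output weights W_jk

with tf.GradientTape() as tape:
    o_j = tf.nn.sigmoid(tf.linalg.matvec(w_ij, o_i, transpose_a=True))  # hidden activations O_j
    o_k = tf.nn.sigmoid(tf.linalg.matvec(w_jk, o_j, transpose_a=True))  # output activations O_k
    E = 0.5 * tf.reduce_sum((o_k - t) ** 2)
auto_d_wij, auto_d_wjk = tape.gradient(E, [w_ij, w_jk])

# delta_k = O_k (1 - O_k) (O_k - t_k);  dE/dW_jk = O_j * delta_k
delta_k = o_k * (1. - o_k) * (o_k - t)
manual_d_wjk = tf.tensordot(o_j, delta_k, axes=0)            # outer product, shape [3, 2]

# delta_j = O_j (1 - O_j) * sum_k delta_k W_jk;  dE/dW_ij = O_i * delta_j
delta_j = o_j * (1. - o_j) * tf.linalg.matvec(w_jk, delta_k)
manual_d_wij = tf.tensordot(o_i, delta_j, axes=0)            # shape [2, 3]

print(tf.reduce_max(tf.abs(auto_d_wjk - manual_d_wjk)))      # ~0
print(tf.reduce_max(tf.abs(auto_d_wij - manual_d_wij)))      # ~0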
Function optimization
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def himmelblau(x):
    return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2

x = np.arange(-6, 6, 0.1)
y = np.arange(-6, 6, 0.1)
print('x,y range:', x.shape, y.shape)
X, Y = np.meshgrid(x, y)
print('X,Y maps:', X.shape, Y.shape)
Z = himmelblau([X, Y])

fig = plt.figure('himmelblau')
ax = fig.add_subplot(projection='3d')  # 3D axes for the surface plot
ax.plot_surface(X, Y, Z)
ax.view_init(60, -30)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
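A contour view makes the four minima easier to spot (a small sketch of my own, reusing X, Y, Z from above):

# Sketch: contour plot of the Himmelblau function
plt.figure('himmelblau contours')
plt.contour(X, Y, Z, levels=50)
plt.xlabel('x')
plt.ylabel('y')
plt.show()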
x = tf.constant([-4., 0.])

for step in range(200):
    with tf.GradientTape() as tape:
        tape.watch(x)
        y = himmelblau(x)
    grads = tape.gradient(y, [x])[0]
    x -= 0.01 * grads
    if step % 20 == 0:
        print('step {}: x = {}, f(x) = {}'.format(step, x.numpy(), y.numpy()))
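The same descent can be written with a tf.Variable and a Keras optimizer (a sketch of my own, using SGD with the same 0.01 step size):

# Sketch: let an optimizer apply the update instead of x -= lr * grads by hand
x_var = tf.Variable([-4., 0.])
opt = tf.keras.optimizers.SGD(learning_rate=0.01)
for step in range(200):
    with tf.GradientTape() as tape:
        y = himmelblau(x_var)          # x_var is a Variable, no tape.watch needed
    grads = tape.gradient(y, [x_var])
    opt.apply_gradients(zip(grads, [x_var]))
print(x_var.numpy(), himmelblau(x_var).numpy())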
import os
import tensorflow as tf
import tensorflow_datasets as tfds

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

def normalize(images, labels):
    # scale pixel values from [0, 255] to [0, 1]
    images = tf.cast(images, tf.float32)
    images /= 255
    return images, labels

print("datasets", train_dataset.map(normalize))
train_dataset = train_dataset.map(normalize)
test_dataset = test_dataset.map(normalize)

num_train_examples = metadata.splits['train'].num_examples
num_test_examples = metadata.splits['test'].num_examples
BATCH_SIZE = 100
# one pass per iteration of the dataset; the epoch loop in main() handles repetition
train_dataset = train_dataset.shuffle(num_train_examples).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(10)
])
model.build(input_shape=[None, 28*28])
model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
def main():
    for epoch in range(30):
        for step, (x, y) in enumerate(train_dataset):
            x = tf.reshape(x, [-1, 28*28])
            with tf.GradientTape() as tape:
                logits = model(x)
                y_ = tf.one_hot(y, depth=10)
                # loss_mse = tf.reduce_mean(tf.losses.MSE(y_, logits))
                loss_ce = tf.reduce_mean(
                    tf.losses.categorical_crossentropy(y_, logits, from_logits=True))
            grads = tape.gradient(loss_ce, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            if step % 500 == 0:
                # test
                total_correct = 0
                total_num = 0
                for x, y in test_dataset:
                    x = tf.reshape(x, [-1, 28 * 28])
                    logits = model(x)
                    prob = tf.nn.softmax(logits, axis=1)
                    pred = tf.argmax(prob, axis=1)
                    correct = tf.equal(pred, y)
                    correct = tf.reduce_sum(tf.cast(correct, dtype=tf.int32)).numpy()
                    total_correct += int(correct)
                    total_num += x.shape[0]
                acc = total_correct / total_num
                print(epoch, step, 'loss:', float(loss_ce), 'test acc:', acc)

if __name__ == '__main__':
    main()
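For comparison (a sketch of my own, not part of the original notes), the same model can be trained through the high-level Keras API instead of the manual tape loop; this assumes the non-repeated, batched datasets defined above.

# Sketch: equivalent training with compile()/fit(), flattening each image batch first
flat_train = train_dataset.map(lambda img, lab: (tf.reshape(img, [-1, 28 * 28]), lab))
flat_test = test_dataset.map(lambda img, lab: (tf.reshape(img, [-1, 28 * 28]), lab))
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(flat_train, epochs=5, validation_data=flat_test)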