import tensorflow as tf

x = tf.random.normal([2, 4])   # batch of 2 samples, 4 features each
w = tf.random.normal([4, 3])   # weight matrix
b = tf.zeros([3])              # bias vector
y = tf.constant([2, 0])        # class labels
with tf.GradientTape() as tape:
    tape.watch([w, b])         # w and b are plain tensors, so the tape must watch them explicitly
    prob = tf.nn.softmax(x @ w + b, axis=1)
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
grads = tape.gradient(loss, [w, b])
grads
[<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[-0.02361509,  0.1223269 , -0.0987118 ],
       [ 0.01090372, -0.06179954,  0.05089583],
       [-0.00767319,  0.03826673, -0.03059354],
       [-0.00784626,  0.0357439 , -0.02789764]], dtype=float32)>, <tf.Tensor: shape=(3,), dtype=float32, numpy=array([-0.01442908,  0.0709531 , -0.05652402], dtype=float32)>]
grads[0]
<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[-0.02361509,  0.1223269 , -0.0987118 ],
       [ 0.01090372, -0.06179954,  0.05089583],
       [-0.00767319,  0.03826673, -0.03059354],
       [-0.00784626,  0.0357439 , -0.02789764]], dtype=float32)>
grads[1]
<tf.Tensor: shape=(3,), dtype=float32, numpy=array([-0.01442908, 0.0709531 , -0.05652402], dtype=float32)>
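As a variant sketch (not part of the original transcript): if w and b are wrapped in tf.Variable, the tape tracks them automatically and tape.watch can be dropped. Everything else stays the same.

# Variant: trainable tf.Variable objects are watched by the tape automatically
x = tf.random.normal([2, 4])
w = tf.Variable(tf.random.normal([4, 3]))
b = tf.Variable(tf.zeros([3]))
y = tf.constant([2, 0])
with tf.GradientTape() as tape:
    prob = tf.nn.softmax(x @ w + b, axis=1)
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
grads = tape.gradient(loss, [w, b])   # no tape.watch needed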
Cross-entropy gradient
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])
with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = x @ w + b          # raw scores; softmax is applied inside the loss
    loss = tf.reduce_mean(tf.losses.categorical_crossentropy(
        tf.one_hot(y, depth=3), logits, from_logits=True))  # from_logits=True is numerically stable
grads = tape.gradient(loss, [w, b])
grads
[<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[ 8.8816382e-02,  6.6718370e-01, -7.5600004e-01],
       [-7.2452836e-02, -3.2902423e-01,  4.0147704e-01],
       [ 2.0104319e-02,  2.1094838e-01, -2.3105267e-01],
       [ 5.3726695e-04,  2.0235626e-01, -2.0289350e-01]], dtype=float32)>, <tf.Tensor: shape=(3,), dtype=float32, numpy=array([ 0.03185844,  0.3927134 , -0.4245718 ], dtype=float32)>]
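A quick sanity check, as a sketch (the closed-form expression below is my addition, not from the transcript): for softmax cross-entropy, the gradient of the mean loss with respect to the logits is (softmax(logits) - onehot) / batch_size. GradientTape can also return gradients with respect to intermediate tensors computed inside the tape, which makes the comparison easy.

# Sketch: verify the analytic gradient w.r.t. logits, assuming (softmax - onehot) / batch_size
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])
with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = x @ w + b
    loss = tf.reduce_mean(tf.losses.categorical_crossentropy(
        tf.one_hot(y, depth=3), logits, from_logits=True))
dlogits = tape.gradient(loss, logits)   # gradient w.r.t. an intermediate tensor
analytic = (tf.nn.softmax(logits, axis=1) - tf.one_hot(y, depth=3)) / 2.0  # batch size is 2
print(tf.reduce_max(tf.abs(dlogits - analytic)))  # should be close to 0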