import tensorflow as tf
1 张量排序实例
output = tf.random.normal([10,6])# 随机生成一个正态分布
output
<tf.Tensor: id=5, shape=(10, 6), dtype=float32, numpy=
array([[ 0.76310456, -1.1337202 , 1.0353428 , 1.0621719 , -1.3435235 ,
-0.8340364 ],
[-0.47014472, -0.2422621 , 0.2518393 , -0.23825932, -1.0721133 ,
0.24922352],
[ 0.44158378, 0.6831124 , -0.54429495, 1.1736444 , -0.26429346,
0.7027973 ],
[-0.7662839 , 1.4855492 , 0.42412958, -0.29403406, 1.043192 ,
1.0433921 ],
[ 0.44711986, 0.7275903 , 0.31700605, 0.2726328 , -0.16104753,
-1.1572416 ],
[-0.90731895, -0.2063934 , 0.952755 , 0.6108949 , 2.9908571 ,
-1.7417939 ],
[-0.17370664, -0.03342665, 0.5075081 , 1.3842217 , 0.4897528 ,
-0.37641558],
[-1.0750929 , -0.03694849, 1.319619 , 1.5328622 , 1.5641279 ,
-0.2661516 ],
[ 0.4573638 , 0.8733606 , 0.0470969 , 1.0739747 , -0.8262296 ,
-0.59460646],
[ 1.420134 , 1.216806 , 0.7350984 , -2.035839 , 0.675249 ,
0.5640526 ]], dtype=float32)>
output = tf.math.softmax(output, axis = 1)# 使这六类的概率和为1
output
<tf.Tensor: id=7, shape=(10, 6), dtype=float32, numpy=
array([[0.24179898, 0.03628056, 0.31745782, 0.32609025, 0.02941423,
0.04895815],
[0.12230478, 0.15360723, 0.25176606, 0.15422331, 0.0669903 ,
0.25110835],
[0.15342492, 0.19533965, 0.05724456, 0.31902578, 0.07574209,
0.199223 ],
[0.03621496, 0.34422845, 0.11909077, 0.05807425, 0.22117367,
0.22121793],
[0.20889905, 0.27653062, 0.18341242, 0.17545176, 0.11371368,
0.04199244],
[0.01568617, 0.03161731, 0.10077128, 0.07159271, 0.7735231 ,
0.00680941],
[0.08595599, 0.09890062, 0.16987287, 0.4082027 , 0.16688333,
0.07018443],
[0.02241746, 0.06330627, 0.24580826, 0.30423334, 0.3138957 ,
0.05033905],
[0.17673211, 0.26790482, 0.117257 , 0.3274207 , 0.04896186,
0.0617235 ],
[0.30757383, 0.2509835 , 0.15503944, 0.00970598, 0.14603263,
0.1306646 ]], dtype=float32)>
target = tf.random.uniform([10],maxval = 6,dtype = tf.int32)# 生成一个随机的lable
target
<tf.Tensor: id=12, shape=(10,), dtype=int32, numpy=array([4, 5, 5, 5, 4, 1, 3, 3, 4, 3])>
print('prob:',output.numpy())
prob: [[0.24179898 0.03628056 0.31745782 0.32609025 0.02941423 0.04895815]
[0.12230478 0.15360723 0.25176606 0.15422331 0.0669903 0.25110835]
[0.15342492 0.19533965 0.05724456 0.31902578 0.07574209 0.199223 ]
[0.03621496 0.34422845 0.11909077 0.05807425 0.22117367 0.22121793]
[0.20889905 0.27653062 0.18341242 0.17545176 0.11371368 0.04199244]
[0.01568617 0.03161731 0.10077128 0.07159271 0.7735231 0.00680941]
[0.08595599 0.09890062 0.16987287 0.4082027 0.16688333 0.07018443]
[0.02241746 0.06330627 0.24580826 0.30423334 0.3138957 0.05033905]
[0.17673211 0.26790482 0.117257 0.3274207 0.04896186 0.0617235 ]
[0.30757383 0.2509835 0.15503944 0.00970598 0.14603263 0.1306646 ]]
pred = tf.argmax(output,axis = 1)
print('pred:',pred.numpy())
pred: [3 2 3 1 1 4 3 4 3 0]
print('lable:',target.numpy())
lable: [4 5 5 5 4 1 3 3 4 3]
计算准确度
topk = (1,2,3,4,5,6)
maxk = max(topk)
batch_size = target.shape[0]
print(batch_size)
10
pred = tf.math.top_k(output,maxk).indices
print(pred.numpy())
[[3 2 0 5 1 4]
[2 5 3 1 0 4]
[3 5 1 0 4 2]
[1 5 4 2 3 0]
[1 0 2 3 4 5]
[4 2 3 1 0 5]
[3 2 4 1 0 5]
[4 3 2 1 5 0]
[3 1 0 2 5 4]
[0 1 2 4 5 3]]
pred = tf.transpose(pred,perm = [1,0])
target_ = tf.broadcast_to(target, pred.shape)
print(target_.numpy())
[[4 5 5 5 4 1 3 3 4 3]
[4 5 5 5 4 1 3 3 4 3]
[4 5 5 5 4 1 3 3 4 3]
[4 5 5 5 4 1 3 3 4 3]
[4 5 5 5 4 1 3 3 4 3]
[4 5 5 5 4 1 3 3 4 3]]
correct = tf.equal(pred,target_)
print(correct.numpy())
[[False False False False False False True False False False]
[False True True True False False False True False False]
[False False False False False False False False False False]
[False False False False False True False False False False]
[False False False False True False False False False False]
[ True False False False False False False False True True]]
# Top-1 accuracy, computed step by step with intermediate prints.
res = []
top1_rows = correct[:1]
print(top1_rows)
# In tf.reshape, -1 means "do not specify this dimension by hand":
# the size is inferred automatically. Only one -1 may appear in the shape
# list (with several, the equation would have more than one solution).
top1_hits = tf.cast(tf.reshape(top1_rows, [-1]), dtype=tf.float32)
print(top1_hits.numpy())
# Number of samples whose best prediction equals the label.
correct_1 = tf.reduce_sum(top1_hits)
print(correct_1.numpy())
# Convert the hit count into a percentage of the batch.
acc = float(correct_1 * (100.0 / batch_size))
print(acc)
res.append(acc)
print(res)
tf.Tensor([[False False False False False False True False False False]], shape=(1, 10), dtype=bool)
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
1.0
10.0
[10.0]
# Top-k accuracy (in percent) for every k in `topk`.
# `correct` is the boolean result of comparing the transposed top-k indices
# with the broadcast labels, so row i says whether the (i+1)-th best
# prediction of each sample matches its label.
res = []
for k in topk:
    # A sample counts as correct for top-k if any of its first k rows is True.
    correct_k = tf.cast(tf.reshape(correct[:k], [-1]), dtype=tf.float32)
    correct_k = tf.reduce_sum(correct_k)
    acc = float(correct_k * (100.0 / batch_size))
    res.append(acc)
# Printed once, after the loop, as a single list with one entry per k.
print(res)
[10.0, 50.0, 50.0, 60.0, 70.0, 100.0]
2 填充与复制
填充:pad
(在图片的填充时是经常使用的)
a = tf.reshape(tf.range(9),[3,3])
b = tf.pad(a,[[0,0],[0,0]])
c = tf.pad(a,[[1,1],[1,1]])
print('a\n',a.numpy())
print('b\n',b.numpy())
print('\nc',c.numpy())
a
[[0 1 2]
[3 4 5]
[6 7 8]]
b
[[0 1 2]
[3 4 5]
[6 7 8]]
c [[0 0 0 0 0]
[0 0 1 2 0]
[0 3 4 5 0]
[0 6 7 8 0]
[0 0 0 0 0]]
a = tf.random.normal([4,28,28,3])
b = tf.pad(a,[[0,0],[2,2],[2,2],[0,0]])
print(b.shape)
(4, 32, 32, 3)
复制:tile
(broadcast_to也是复制数据,不过不是在真实的内存中复制)
a = [[0,1,2],[3,4,5],[5,3,3]]
b = tf.tile(a,[1,2])
c = tf.tile(a,[2,1])
d = tf.tile(a,[2,2])
print(a)
print(b)
print(c)
print(d)
[[0, 1, 2], [3, 4, 5], [5, 3, 3]]
tf.Tensor(
[[0 1 2 0 1 2]
[3 4 5 3 4 5]
[5 3 3 5 3 3]], shape=(3, 6), dtype=int32)
tf.Tensor(
[[0 1 2]
[3 4 5]
[5 3 3]
[0 1 2]
[3 4 5]
[5 3 3]], shape=(6, 3), dtype=int32)
tf.Tensor(
[[0 1 2 0 1 2]
[3 4 5 3 4 5]
[5 3 3 5 3 3]
[0 1 2 0 1 2]
[3 4 5 3 4 5]
[5 3 3 5 3 3]], shape=(6, 6), dtype=int32)
3 张量的限幅
a = tf.range(10)
b = tf.maximum(a,2)
c = tf.minimum(a,8)
tf.clip_by_value(a,2,8)
print(a)
print(b)
print(c)
tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)
tf.Tensor([2 2 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)
tf.Tensor([0 1 2 3 4 5 6 7 8 8], shape=(10,), dtype=int32)
a = a-5
d = tf.nn.relu(a)
e = tf.maximum(a,0)
print(a)
print(d)
print(e)
tf.Tensor([-5 -4 -3 -2 -1 0 1 2 3 4], shape=(10,), dtype=int32)
tf.Tensor([0 0 0 0 0 0 1 2 3 4], shape=(10,), dtype=int32)
tf.Tensor([0 0 0 0 0 0 1 2 3 4], shape=(10,), dtype=int32)
按范数等比例缩放(clip_by_norm),保持向量方向不变
a = tf.random.normal([2,2],mean = 10)
b = tf.norm(a)
c = tf.clip_by_norm(a,15)
d = tf.norm(c)
print(a)
print(b)
print(c)
print(d)
tf.Tensor(
[[ 9.901226 11.381033]
[10.905721 9.152639]], shape=(2, 2), dtype=float32)
tf.Tensor(20.742897, shape=(), dtype=float32)
tf.Tensor(
[[7.1599636 8.23007 ]
[7.8863535 6.618632 ]], shape=(2, 2), dtype=float32)
tf.Tensor(14.999999, shape=(), dtype=float32)
Gradient Clipping
from tensorflow.keras import datasets
(x,y),_ = datasets.mnist.load_data()
x = tf.convert_to_tensor(x,dtype = tf.float32)
4 高阶OP
where(tensor)
import tensorflow as tf
a = tf.random.normal([3,3])
mask = a>0
mask
<tf.Tensor: id=7, shape=(3, 3), dtype=bool, numpy=
array([[False, True, True],
[False, False, True],
[ True, True, True]])>
tf.boolean_mask(a,mask)
<tf.Tensor: id=35, shape=(6,), dtype=float32, numpy=
array([0.52404463, 0.62450945, 0.6324052 , 0.4014356 , 1.3766853 ,
0.8314979 ], dtype=float32)>
indices = tf.where(mask)
indices
<tf.Tensor: id=38, shape=(6, 2), dtype=int64, numpy=
array([[0, 1],
[0, 2],
[1, 2],
[2, 0],
[2, 1],
[2, 2]], dtype=int64)>
tf.gather_nd(a,indices)
<tf.Tensor: id=40, shape=(6,), dtype=float32, numpy=
array([0.52404463, 0.62450945, 0.6324052 , 0.4014356 , 1.3766853 ,
0.8314979 ], dtype=float32)>
A = tf.ones([3,3])
B = tf.zeros([3,3])
tf.where(mask,A,B)
<tf.Tensor: id=59, shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
[0., 0., 1.],
[1., 1., 1.]], dtype=float32)>
scatter_nd
indices = tf.constant([[4],[3],[1],[7]])
indices
<tf.Tensor: id=62, shape=(4, 1), dtype=int32, numpy=
array([[4],
[3],
[1],
[7]])>
updates = tf.constant([9,10,11,12])
updates
<tf.Tensor: id=64, shape=(4,), dtype=int32, numpy=array([ 9, 10, 11, 12])>
shape = tf.constant([8])
print(shape.numpy())
[8]
tf.scatter_nd(indices,updates,shape)
<tf.Tensor: id=68, shape=(8,), dtype=int32, numpy=array([ 0, 11, 0, 10, 9, 0, 0, 12])>
indices = tf.constant([[0],[2]])
indices
<tf.Tensor: id=71, shape=(2, 1), dtype=int32, numpy=
array([[0],
[2]])>
updates = tf.constant([[[5,5,5,5],[6,6,6,6],[7,7,7,7],[8,8,8,8]],[[5,5,5,5],[6,6,6,6],[7,7,7,7],[8,8,8,8]]])
updates
<tf.Tensor: id=73, shape=(2, 4, 4), dtype=int32, numpy=
array([[[5, 5, 5, 5],
[6, 6, 6, 6],
[7, 7, 7, 7],
[8, 8, 8, 8]],
[[5, 5, 5, 5],
[6, 6, 6, 6],
[7, 7, 7, 7],
[8, 8, 8, 8]]])>
updates.shape
TensorShape([2, 4, 4])
shape = tf.constant([4,4,4])
print(shape.numpy())
[4 4 4]
tf.scatter_nd(indices,updates,shape)
<tf.Tensor: id=78, shape=(4, 4, 4), dtype=int32, numpy=
array([[[5, 5, 5, 5],
[6, 6, 6, 6],
[7, 7, 7, 7],
[8, 8, 8, 8]],
[[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[5, 5, 5, 5],
[6, 6, 6, 6],
[7, 7, 7, 7],
[8, 8, 8, 8]],
[[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]]])>
meshgrid:生成网格点坐标,用于画三维坐标图
y = tf.linspace(-2.,2,5)
y
<tf.Tensor: id=83, shape=(5,), dtype=float32, numpy=array([-2., -1., 0., 1., 2.], dtype=float32)>
x = tf.linspace(-2.,2,5)
x
<tf.Tensor: id=93, shape=(5,), dtype=float32, numpy=array([-2., -1., 0., 1., 2.], dtype=float32)>
points_x,points_y = tf.meshgrid(x,y)
points_x.shape
TensorShape([5, 5])
points_x
<tf.Tensor: id=115, shape=(5, 5), dtype=float32, numpy=
array([[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.]], dtype=float32)>
points_y
<tf.Tensor: id=116, shape=(5, 5), dtype=float32, numpy=
array([[-2., -2., -2., -2., -2.],
[-1., -1., -1., -1., -1.],
[ 0., 0., 0., 0., 0.],
[ 1., 1., 1., 1., 1.],
[ 2., 2., 2., 2., 2.]], dtype=float32)>
points = tf.stack([points_x,points_y],axis = 2)
points
<tf.Tensor: id=119, shape=(5, 5, 2), dtype=float32, numpy=
array([[[-2., -2.],
[-1., -2.],
[ 0., -2.],
[ 1., -2.],
[ 2., -2.]],
[[-2., -1.],
[-1., -1.],
[ 0., -1.],
[ 1., -1.],
[ 2., -1.]],
[[-2., 0.],
[-1., 0.],
[ 0., 0.],
[ 1., 0.],
[ 2., 0.]],
[[-2., 1.],
[-1., 1.],
[ 0., 1.],
[ 1., 1.],
[ 2., 1.]],
[[-2., 2.],
[-1., 2.],
[ 0., 2.],
[ 1., 2.],
[ 2., 2.]]], dtype=float32)>
5 数据加载
keras.datasets:
boston housing
mnist/fashion mnist
cifar10/100
imdb
1)载入mnist
import tensorflow as tf
from tensorflow import keras
(x, y), (x_test, y_test) = keras.datasets.mnist.load_data()
x.shape
(60000, 28, 28)
y.shape
(60000,)
x.min(),x.max(),x.mean()
(0, 255, 33.318421449829934)
x_test.shape,y_test.shape
((10000, 28, 28), (10000,))
y[:4]
array([5, 0, 4, 1], dtype=uint8)
y_onehot = tf.one_hot(y,depth = 10)
y_onehot[:2]
<tf.Tensor: id=8, shape=(2, 10), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>
2)载入cifar10/100
# load_data() returns two (images, labels) pairs: train first, then test.
# Unpack them as nested tuples; a chained assignment would bind both pairs of
# names to the same outer tuple instead.
(x, y), (x_test, y_test) = keras.datasets.cifar10.load_data()
# Author's note: I interrupted this run because the download was far too slow.
# There was also a small problem with a wrong download URL, which I fixed by
# editing the URL inside the library's Python file.
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
4890624/170498071 [..............................] - ETA: 9:40:41
import tensorflow as tf
from tensorflow import keras
(x,y),(x_test,y_test) = keras.datasets.cifar10.load_data()
#因为本人直接在网上下载好了数据集,并且直接拷贝在了目标文件夹‘C:\Users\wanfuchun\.keras\datasets’,因此此处加载特别快
x.shape,y.shape,x_test.shape,y_test.shape
((50000, 32, 32, 3), (50000, 1), (10000, 32, 32, 3), (10000, 1))
x.min(),x.max()
(0, 255)
y[:4]
array([[6],
[9],
[9],
[4]], dtype=uint8)
db = tf.data.Dataset.from_tensor_slices(x_test)
next(iter(db)).shape
TensorShape([32, 32, 3])
db = tf.data.Dataset.from_tensor_slices((x_test,y_test))
next(iter(db))[0].shape
TensorShape([32, 32, 3])
next(iter(db))[1].shape
TensorShape([1])
3)Dataset api的shuffle打散功能
db = db.shuffle(10000)
4).map 数据预处理功能
def preprocess(x, y):
    """Convert one (image, label) pair into float pixels and a one-hot label.

    Args:
        x: image tensor; cast to float32 and divided by 255.
        y: class-index tensor; cast to int32 and one-hot encoded with depth 10.

    Returns:
        Tuple ``(x, y)`` holding the rescaled image and the one-hot label.
    """
    x = tf.cast(x, dtype=tf.float32) / 255.
    y = tf.cast(y, dtype=tf.int32)
    # one_hot adds a trailing axis of size `depth` to the shape of y.
    y = tf.one_hot(y, depth=10)
    return x, y
db2 = db.map(preprocess)
res = next(iter(db2))
res[0].shape,res[1].shape
(TensorShape([32, 32, 3]), TensorShape([1, 10]))
res[1][:2]
<tf.Tensor: id=221, shape=(1, 10), dtype=float32, numpy=array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>
db3 = db2.batch(32)
res = next(iter(db3))
res[0].shape,res[1].shape
(TensorShape([32, 32, 32, 3]), TensorShape([32, 1, 10]))
6 误差计算
1)熵的概念
import tensorflow as tf
a = tf.fill([4],0.25)
a * tf.math.log(a)/tf.math.log(2.)
<tf.Tensor: id=7, shape=(4,), dtype=float32, numpy=array([-0.5, -0.5, -0.5, -0.5], dtype=float32)>
-tf.reduce_sum(a*tf.math.log(a)/tf.math.log(2.))
<tf.Tensor: id=16, shape=(), dtype=float32, numpy=2.0>
由此可见,当概率均分时,不确定性最大,熵也最大(此处为2 bit);此时的惊喜最小,信息最少。
a = tf.constant([0.01,0.01,0.01,0.97])
a * tf.math.log(a)/tf.math.log(2.)
<tf.Tensor: id=23, shape=(4,), dtype=float32, numpy=array([-0.06643856, -0.06643856, -0.06643856, -0.04262501], dtype=float32)>
-tf.reduce_sum(a*tf.math.log(a)/tf.math.log(2.))
<tf.Tensor: id=32, shape=(), dtype=float32, numpy=0.24194068>
此时概率集中在某一类上,不确定性小,熵也小(约0.24 bit);惊喜较大,信息更有用。
2)交叉熵的概念
tf.losses.categorical_crossentropy([0,1,0,0],[0.25,0.25,0.25,0.25])
<tf.Tensor: id=51, shape=(), dtype=float32, numpy=1.3862944>
tf.losses.categorical_crossentropy([0,1,0,0],[0.1,0.1,0.8,0.1])
<tf.Tensor: id=70, shape=(), dtype=float32, numpy=2.3978953>
tf.losses.categorical_crossentropy([0,1,0,0],[0.01,0.97,0.01,0.01])
<tf.Tensor: id=89, shape=(), dtype=float32, numpy=0.030459179>
tf.losses.BinaryCrossentropy()([1],[0.1])
<tf.Tensor: id=128, shape=(), dtype=float32, numpy=2.3025842>
tf.losses.binary_crossentropy([1],[0.1])
<tf.Tensor: id=155, shape=(), dtype=float32, numpy=2.3025842>
一般损失函数使用交叉熵函数,而不是MSE,因为对于sigmoid激活函数来讲,如果使用MSE函数作为损失函数,会出现梯度消失的现象。但是这仅仅是从工程领域来考虑。
7 梯度下降
一阶梯度
import tensorflow as tf
# First-order gradient of y = x * w with respect to w.
w = tf.constant(1.)
x = tf.constant(2.)
with tf.GradientTape() as tape:
    # w is a plain tensor (tf.constant), so it is watched explicitly.
    tape.watch([w])
    # The forward computation is done inside the tape context.
    y = x * w
# `sources` is a list, so the result is a list with one gradient tensor.
grad1 = tape.gradient(y, [w])
grad1
[<tf.Tensor: id=6, shape=(), dtype=float32, numpy=2.0>]
grad1 = tape.gradient(y,[w])
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-4-b4d14a0cffac> in <module>
----> 1 grad1 = tape.gradient(y,[w])
F:\Anaconda3\envs\gpu\lib\site-packages\tensorflow\python\eager\backprop.py in gradient(self, target, sources, output_gradients, unconnected_gradients)
918 """
919 if self._tape is None:
--> 920 raise RuntimeError("GradientTape.gradient can only be called once on "
921 "non-persistent tapes.")
922 if self._recording:
RuntimeError: GradientTape.gradient can only be called once on non-persistent tapes.
# persistent=True allows gradient() to be called more than once on this tape;
# a non-persistent tape raises RuntimeError on the second call.
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w])
    y = x * w
grad1 = tape.gradient(y, [w])
grad1
[<tf.Tensor: id=11, shape=(), dtype=float32, numpy=2.0>]
grad1 = tape.gradient(y,[w])
二阶梯度
import tensorflow as tf
略,嵌套两层tf.GradientTape即可
8 激活函数及其梯度
sigmoid
import tensorflow as tf
# Sigmoid and its gradient at 10 evenly spaced points in [-10, 10].
a = tf.linspace(-10., 10., 10)
with tf.GradientTape() as tape:
    # `a` is a plain tensor, so it is watched explicitly.
    tape.watch(a)
    y = tf.sigmoid(a)
# `sources` is a list, so `grads` is a one-element list.
grads = tape.gradient(y, [a])
print(a)
print(y)
print(grads)
tf.Tensor(
[-10. -7.7777777 -5.5555553 -3.333333 -1.1111107 1.1111116
3.333334 5.5555563 7.7777786 10. ], shape=(10,), dtype=float32)
tf.Tensor(
[4.5388937e-05 4.1878223e-04 3.8510561e-03 3.4445226e-02 2.4766389e-01
7.5233626e-01 9.6555483e-01 9.9614894e-01 9.9958128e-01 9.9995458e-01], shape=(10,), dtype=float32)
[<tf.Tensor: id=12, shape=(10,), dtype=float32, numpy=
array([4.5386874e-05, 4.1860685e-04, 3.8362255e-03, 3.3258751e-02,
1.8632649e-01, 1.8632641e-01, 3.3258699e-02, 3.8362255e-03,
4.1854731e-04, 4.5416677e-05], dtype=float32)>]
Tanh(在RNN循环神经网络中使用的较多)
a = tf.linspace(-10.,10.,10)
y = tf.tanh(a)
print(a)
print(y)
tf.Tensor(
[-10. -7.7777777 -5.5555553 -3.333333 -1.1111107 1.1111116
3.333334 5.5555563 7.7777786 10. ], shape=(10,), dtype=float32)
tf.Tensor(
[-1. -0.99999964 -0.99997014 -0.997458 -0.8044547 0.804455
0.997458 0.99997014 0.99999964 1. ], shape=(10,), dtype=float32)
Rectified Linear Unit
a = tf.linspace(-1.,1.,10)
tf.nn.relu(a).numpy()
array([0. , 0. , 0. , 0. , 0. ,
0.11111116, 0.33333337, 0.5555556 , 0.7777778 , 1. ],
dtype=float32)
tf.nn.leaky_relu(a).numpy()
array([-0.2 , -0.15555556, -0.11111112, -0.06666666, -0.02222222,
0.11111116, 0.33333337, 0.5555556 , 0.7777778 , 1. ],
dtype=float32)
9 损失函数及其梯度
Mean Squared error(MSE)
# Gradient of an MSE loss through softmax for a small linear layer.
x = tf.random.normal([3, 4])
w = tf.random.normal([4, 3])
b = tf.random.normal([3])
y = tf.constant([2, 0, 1])
with tf.GradientTape() as tape:
    # w and b are plain tensors here, so they are watched explicitly; the
    # alternative is to wrap them beforehand, e.g. w = tf.Variable(w).
    tape.watch([w, b])
    prob = tf.nn.softmax(x @ w + b, axis=1)  # predicted class probabilities
    # One-hot encode the integer labels y (not the predictions), take the
    # squared error against the softmax probabilities, then average.
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
print(prob.numpy())
grads = tape.gradient(loss, [w, b])
print('grads[0]:\n', grads[0].numpy())
print('grads[1]:\n', grads[1].numpy())
[[0.6746528 0.0133454 0.31200182]
[0.74632466 0.15744632 0.09622896]
[0.5631675 0.4008012 0.0360313 ]]
grads[0]:
[[ 0.0165339 -0.03772166 0.02118775]
[ 0.03422298 -0.08465307 0.05043009]
[ 0.01001721 -0.0136021 0.00358488]
[ 0.05285116 0.02282622 -0.07567739]]
grads[1]:
[ 0.10942388 -0.05007509 -0.05934878]
Cross Entropy Loss:(binary,multi-class,+softmax,leave it to logistic regression part)
softmax(soft version of max)使强者愈强,弱者愈弱,类似于金字塔效应:高度比别人高一点点,得到的收入会比别人高好多倍
Crossentropy gradient
# Gradient of a categorical cross-entropy loss computed directly from logits.
x = tf.random.normal([2, 4])
w = tf.random.normal([4, 3])
b = tf.zeros([3])
y = tf.constant([2, 0])
with tf.GradientTape() as tape:
    tape.watch([w, b])
    logits = x @ w + b
    # Turning logits into probabilities first and then combining them with
    # the one-hot labels raises numerical-stability issues. Both steps are
    # integrated into categorical_crossentropy; pass from_logits=True.
    loss = tf.reduce_mean(
        tf.losses.categorical_crossentropy(
            tf.one_hot(y, depth=3), logits, from_logits=True))
grads = tape.gradient(loss, [w, b])
grads
[<tf.Tensor: id=579, shape=(4, 3), dtype=float32, numpy=
array([[ 0.59179735, -0.6549072 , 0.06310985],
[-0.51698416, 0.70245177, -0.18546759],
[ 0.38274866, -0.50988907, 0.12714042],
[ 0.3166223 , 0.03204821, -0.3486705 ]], dtype=float32)>,
<tf.Tensor: id=577, shape=(3,), dtype=float32, numpy=array([-0.31501144, 0.7013504 , -0.38633895], dtype=float32)>]
grads[1]
<tf.Tensor: id=577, shape=(3,), dtype=float32, numpy=array([-0.31501144, 0.7013504 , -0.38633895], dtype=float32)>
grads[0]
<tf.Tensor: id=579, shape=(4, 3), dtype=float32, numpy=
array([[ 0.59179735, -0.6549072 , 0.06310985],
[-0.51698416, 0.70245177, -0.18546759],
[ 0.38274866, -0.50988907, 0.12714042],
[ 0.3166223 , 0.03204821, -0.3486705 ]], dtype=float32)>
10 单输出感知机梯度
# Single-output perceptron: sigmoid(x @ w + b) trained against MSE.
x = tf.random.normal([1, 3])
w = tf.ones([3, 1])
b = tf.ones([1])
y = tf.constant([1])
with tf.GradientTape() as tape:
    # w and b are plain tensors, so they are watched explicitly.
    tape.watch([w, b])
    prob = tf.sigmoid(x @ w + b)
    loss = tf.reduce_mean(tf.losses.MSE(y, prob))
# grads[0] is d(loss)/dw, grads[1] is d(loss)/db.
grads = tape.gradient(loss, [w, b])
grads[0]
<tf.Tensor: id=869, shape=(3, 1), dtype=float32, numpy=
array([[ 0.41205588],
[-0.10961942],
[ 0.04508047]], dtype=float32)>
grads[1]
<tf.Tensor: id=867, shape=(1,), dtype=float32, numpy=array([-0.27709824], dtype=float32)>
11 多输出感知机梯度
# Multi-output perceptron: softmax(x @ w + b) trained against MSE.
x = tf.random.normal([2, 4])
w = tf.ones([4, 3])
b = tf.ones([3])
y = tf.constant([2, 0])
with tf.GradientTape() as tape:
    # w and b are plain tensors, so they are watched explicitly.
    tape.watch([w, b])
    # The output has shape [b, 3]; axis=1 is the axis that should hold the
    # class probabilities.
    prob = tf.nn.softmax(x @ w + b, axis=1)
    loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), prob))
# grads[0] is d(loss)/dw, grads[1] is d(loss)/db.
grads = tape.gradient(loss, [w, b])
grads
[<tf.Tensor: id=957, shape=(4, 3), dtype=float32, numpy=
array([[-0.033397 , 0.05529919, -0.02190218],
[-0.02506173, -0.00697992, 0.03204165],
[ 0.00255801, 0.04580436, -0.04836237],
[-0.04055597, 0.01948298, 0.021073 ]], dtype=float32)>,
<tf.Tensor: id=955, shape=(3,), dtype=float32, numpy=array([-0.03703704, 0.07407407, -0.03703704], dtype=float32)>]
12 链式法则
# Chain rule check: dy2/dw1 should equal (dy2/dy1) * (dy1/dw1).
x = tf.constant(1.)
w1 = tf.constant(2.)
b1 = tf.constant(1.)
w2 = tf.constant(2.)
b2 = tf.constant(1.)
# persistent=True because gradient() is called three times on the same tape.
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w1, b1, w2, b2])
    y1 = x * w1 + b1
    y2 = y1 * w2 + b2
# gradient() returns a list when `sources` is a list; take element 0 so that
# .numpy() is called on the tensor rather than on the list.
dy2_dy1 = tape.gradient(y2, [y1])[0]
dy1_dw1 = tape.gradient(y1, [w1])[0]
dy2_dw1 = tape.gradient(y2, [w1])[0]
print(dy2_dy1.numpy())
print(dy1_dw1.numpy())
print(dy2_dw1.numpy())
2.0
1.0
2.0
13 多层感知机梯度
公式推导过程省略。从后向前,一层一层地计算,并没有我们想象中的复杂;激活函数就那么几种,因此是可以编写程序解决的。唯一的难点可能是矩阵的运算以及算法的优化,不过这些都是做底层研发的人负责的,我们只需要明白其原理并会加以使用就好。
14 函数优化实战(Himmelblau函数优化)
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
import tensorflow as tf
def himmelblau(x):
    """Evaluate Himmelblau's function at the point ``(x[0], x[1])``.

    f(x0, x1) = (x0**2 + x1 - 11)**2 + (x0 + x1**2 - 7)**2

    Args:
        x: indexable pair whose two elements support arithmetic; the
            surrounding code passes a list of two NumPy grids and a
            two-element tensor.

    Returns:
        The function value, of whatever type the arithmetic on the
        elements produces.
    """
    return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2
# Sample the function on a regular grid over [-6, 6) x [-6, 6) with step 0.1
# and draw it as a 3-D surface.
x = np.arange(-6, 6, 0.1)
y = np.arange(-6, 6, 0.1)
print('x,y range:', x.shape, y.shape)
X, Y = np.meshgrid(x, y)
print('X,Y maps:', X.shape, Y.shape)
Z = himmelblau([X, Y])
fig = plt.figure('himmelblau')
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot creates the 3-D axes on old and new versions alike.
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z)
ax.view_init(60, -30)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
x,y range: (120,) (120,)
X,Y maps: (120, 120) (120, 120)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-w0yl8RE5-1570938316958)(output_171_1.png)]
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt
import tensorflow as tf
def himmelblau(x):
    """Evaluate Himmelblau's function at the point ``(x[0], x[1])``.

    f(x0, x1) = (x0**2 + x1 - 11)**2 + (x0 + x1**2 - 7)**2

    Args:
        x: indexable pair whose two elements support arithmetic; the
            surrounding code passes a list of two NumPy grids and a
            two-element tensor.

    Returns:
        The function value, of whatever type the arithmetic on the
        elements produces.
    """
    return (x[0] ** 2 + x[1] - 11) ** 2 + (x[0] + x[1] ** 2 - 7) ** 2
# Sample the function on a regular grid over [-6, 6) x [-6, 6) with step 0.1
# and draw it as a 3-D surface.
x = np.arange(-6, 6, 0.1)
y = np.arange(-6, 6, 0.1)
print('x,y range:', x.shape, y.shape)
X, Y = np.meshgrid(x, y)
print('X,Y maps:', X.shape, Y.shape)
Z = himmelblau([X, Y])
fig = plt.figure('himmelblau')
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot creates the 3-D axes on old and new versions alike.
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z)
ax.view_init(60, -30)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
# Plain gradient descent on himmelblau: 200 steps with learning rate 0.01.
# Starting points to try: [1., 0.], [-4, 0.], [4, 0.]
# Different initial values lead to different final results.
x = tf.constant([-4., 0.])
for step in range(200):
    with tf.GradientTape() as tape:
        # x is a plain tensor that is rebound every step, so it is watched
        # again on each new tape.
        tape.watch([x])
        y = himmelblau(x)
    grads = tape.gradient(y, [x])[0]
    x -= 0.01*grads
    if step % 20 == 0:
        # x is printed after the update, y is the value from before it.
        print('step {}: x = {}, f(x) = {}'
              .format(step, x.numpy(), y.numpy()))
x,y range: (120,) (120,)
X,Y maps: (120, 120) (120, 120)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jdcu7mRU-1570938316959)(output_172_1.png)]
step 0: x = [-2.98 -0.09999999], f(x) = 146.0
step 20: x = [-3.6890156 -3.1276684], f(x) = 6.054738998413086
step 40: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 60: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 80: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 100: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 120: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 140: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 160: x = [-3.7793102 -3.283186 ], f(x) = 0.0
step 180: x = [-3.7793102 -3.283186 ], f(x) = 0.0