矩阵向量求导之微分法
标量微分
$$df = \frac{\partial f}{\partial x}\,dx$$
向量微分
$$
df = \sum_{i=1}^{n}\frac{\partial f}{\partial x_i}\,dx_i
= \begin{bmatrix}
\frac{\partial f}{\partial x_1} & \frac{\partial f}{\partial x_2} & \frac{\partial f}{\partial x_3} & \cdots & \frac{\partial f}{\partial x_n}
\end{bmatrix}
\begin{bmatrix}
dx_1 \\ dx_2 \\ dx_3 \\ \vdots \\ dx_n
\end{bmatrix}
= \left(\frac{\partial f}{\partial \boldsymbol{x}}\right)^{T} d\boldsymbol{x}
$$
矩阵微分
$$df = \sum_{i=1}^{m}\sum_{j=1}^{n}\frac{\partial f}{\partial x_{ij}}\,dx_{ij}$$
$$
\left(\frac{\partial f}{\partial X}\right)^{T} dX =
\begin{bmatrix}
\frac{\partial f}{\partial x_{11}} & \frac{\partial f}{\partial x_{21}} & \frac{\partial f}{\partial x_{31}} & \cdots & \frac{\partial f}{\partial x_{m1}} \\
\frac{\partial f}{\partial x_{12}} & \frac{\partial f}{\partial x_{22}} & \frac{\partial f}{\partial x_{32}} & \cdots & \frac{\partial f}{\partial x_{m2}} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
\frac{\partial f}{\partial x_{1n}} & \frac{\partial f}{\partial x_{2n}} & \frac{\partial f}{\partial x_{3n}} & \cdots & \frac{\partial f}{\partial x_{mn}}
\end{bmatrix}
\begin{bmatrix}
dx_{11} & dx_{12} & dx_{13} & \cdots & dx_{1n} \\
dx_{21} & dx_{22} & dx_{23} & \cdots & dx_{2n} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
dx_{m1} & dx_{m2} & dx_{m3} & \cdots & dx_{mn}
\end{bmatrix}
$$

$$
= \begin{bmatrix}
\frac{\partial f}{\partial x_{11}}dx_{11}+\frac{\partial f}{\partial x_{21}}dx_{21}+\frac{\partial f}{\partial x_{31}}dx_{31}+\cdots+\frac{\partial f}{\partial x_{m1}}dx_{m1}
& \frac{\partial f}{\partial x_{11}}dx_{12}+\frac{\partial f}{\partial x_{21}}dx_{22}+\frac{\partial f}{\partial x_{31}}dx_{32}+\cdots+\frac{\partial f}{\partial x_{m1}}dx_{m2}
& \frac{\partial f}{\partial x_{11}}dx_{13}+\frac{\partial f}{\partial x_{21}}dx_{23}+\frac{\partial f}{\partial x_{31}}dx_{33}+\cdots+\frac{\partial f}{\partial x_{m1}}dx_{m3}
& \cdots
& \frac{\partial f}{\partial x_{11}}dx_{1n}+\frac{\partial f}{\partial x_{21}}dx_{2n}+\frac{\partial f}{\partial x_{31}}dx_{3n}+\cdots+\frac{\partial f}{\partial x_{m1}}dx_{mn} \\[2ex]
\frac{\partial f}{\partial x_{12}}dx_{11}+\frac{\partial f}{\partial x_{22}}dx_{21}+\frac{\partial f}{\partial x_{32}}dx_{31}+\cdots+\frac{\partial f}{\partial x_{m2}}dx_{m1}
& \frac{\partial f}{\partial x_{12}}dx_{12}+\frac{\partial f}{\partial x_{22}}dx_{22}+\frac{\partial f}{\partial x_{32}}dx_{32}+\cdots+\frac{\partial f}{\partial x_{m2}}dx_{m2}
& \frac{\partial f}{\partial x_{12}}dx_{13}+\frac{\partial f}{\partial x_{22}}dx_{23}+\frac{\partial f}{\partial x_{32}}dx_{33}+\cdots+\frac{\partial f}{\partial x_{m2}}dx_{m3}
& \cdots
& \frac{\partial f}{\partial x_{12}}dx_{1n}+\frac{\partial f}{\partial x_{22}}dx_{2n}+\frac{\partial f}{\partial x_{32}}dx_{3n}+\cdots+\frac{\partial f}{\partial x_{m2}}dx_{mn} \\[2ex]
\vdots & \vdots & \vdots & \ddots & \vdots \\[2ex]
\frac{\partial f}{\partial x_{1n}}dx_{11}+\frac{\partial f}{\partial x_{2n}}dx_{21}+\frac{\partial f}{\partial x_{3n}}dx_{31}+\cdots+\frac{\partial f}{\partial x_{mn}}dx_{m1}
& \frac{\partial f}{\partial x_{1n}}dx_{12}+\frac{\partial f}{\partial x_{2n}}dx_{22}+\frac{\partial f}{\partial x_{3n}}dx_{32}+\cdots+\frac{\partial f}{\partial x_{mn}}dx_{m2}
& \frac{\partial f}{\partial x_{1n}}dx_{13}+\frac{\partial f}{\partial x_{2n}}dx_{23}+\frac{\partial f}{\partial x_{3n}}dx_{33}+\cdots+\frac{\partial f}{\partial x_{mn}}dx_{m3}
& \cdots
& \frac{\partial f}{\partial x_{1n}}dx_{1n}+\frac{\partial f}{\partial x_{2n}}dx_{2n}+\frac{\partial f}{\partial x_{3n}}dx_{3n}+\cdots+\frac{\partial f}{\partial x_{mn}}dx_{mn}
\end{bmatrix}
$$
上面乘积结果的对角线元素之和（即矩阵的迹）恰好等于 $\sum_{i=1}^{m}\sum_{j=1}^{n}\frac{\partial f}{\partial x_{ij}}dx_{ij}$，所以矩阵的微分可以表示为
$$df = \mathrm{tr}\left(\left(\frac{\partial f}{\partial X}\right)^{T} dX\right)$$
向量微分也可以用矩阵微分来表示
$$df = \mathrm{tr}\left(\left(\frac{\partial f}{\partial \boldsymbol{x}}\right)^{T} d\boldsymbol{x}\right)$$
矩阵微分的性质
我们在讨论如何使用矩阵微分来求导前,先看看矩阵微分的性质
$$d(X + Y) = dX + dY, \qquad d(X - Y) = dX - dY$$
$$d(XY) = (dX)\,Y + X\,(dY)$$
（注意矩阵乘法不满足交换律，$dX$ 与 $Y$、$X$ 与 $dY$ 的左右顺序不能交换。）
$$d(X^{T}) = (dX)^{T}$$
$$d\,\mathrm{tr}(X) = \mathrm{tr}(dX)$$
$$d(X \odot Y) = X \odot dY + dX \odot Y$$
$$d\,\delta(X) = \delta'(X) \odot dX$$
（其中 $\delta$ 为逐元素作用的函数，$\delta'$ 为其逐元素导数。）
$$d(X^{-1}) = -X^{-1}\,(dX)\,X^{-1}$$