解:将概率分布代入对数似然函数,
$$
\begin{aligned}
l(\psi,\mu_0,\mu_1,\Sigma)
&=\sum_{i=1}^{m}\log p_{X|Y}(x^{(i)}\mid y^{(i)};\mu_0,\mu_1,\Sigma)+\sum_{i=1}^{m}\log p_Y(y^{(i)};\psi)\\
&=\sum_{i=1}^{m}(1-y^{(i)})\log\left[\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)\right)\right]\\
&\quad+\sum_{i=1}^{m}y^{(i)}\log\left[\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x^{(i)}-\mu_1)^T\Sigma^{-1}(x^{(i)}-\mu_1)\right)\right]\\
&\quad+\sum_{i=1}^{m}\log\left[\psi^{y^{(i)}}(1-\psi)^{1-y^{(i)}}\right]
\end{aligned}
$$
求取 $l(\psi,\mu_0,\mu_1,\Sigma)$ 的最大值,令
$$\frac{\partial}{\partial\psi}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{1}$$
$$\nabla_{\mu_0}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{2}$$
$$\nabla_{\mu_1}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{3}$$
$$\nabla_{\Sigma}l(\psi,\mu_0,\mu_1,\Sigma)=0 \tag{4}$$
对于(1)式:
$$\frac{\partial}{\partial\psi}\sum_{i=1}^{m}\left[y^{(i)}\log\psi+(1-y^{(i)})\log(1-\psi)\right]=0$$
$$\sum_{i=1}^{m}\left[\frac{y^{(i)}}{\psi}-\frac{1-y^{(i)}}{1-\psi}\right]=0$$
$$\sum_{i=1}^{m}\left[y^{(i)}(1-\psi)-(1-y^{(i)})\psi\right]=0$$
$$\sum_{i=1}^{m}y^{(i)}=m\psi$$
$$\psi=\frac{1}{m}\sum_{i=1}^{m}1\{y^{(i)}=1\}$$
对于(2)式:
$$\nabla_{\mu_0}\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)=0$$
对 $\mu_0$ 取微分:
$$\sum_{i=1}^{m}(1-y^{(i)})\,\mathrm{d}\left[(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)\right]=0$$
$$\sum_{i=1}^{m}(1-y^{(i)})\left[\mathrm{d}(x^{(i)}-\mu_0)^T\,\Sigma^{-1}(x^{(i)}-\mu_0)+(x^{(i)}-\mu_0)^T\Sigma^{-1}\,\mathrm{d}(x^{(i)}-\mu_0)\right]=0$$
由于 $\Sigma^{-1}$ 对称,方括号内两项相等,且 $\mathrm{d}(x^{(i)}-\mu_0)=-\mathrm{d}\mu_0$ 对任意 $\mathrm{d}\mu_0$ 成立,故
$$\sum_{i=1}^{m}(1-y^{(i)})\Sigma^{-1}(x^{(i)}-\mu_0)=0$$
$$\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)=0$$
$$\sum_{i=1}^{m}(1-y^{(i)})x^{(i)}=\sum_{i=1}^{m}(1-y^{(i)})\mu_0$$
$$\mu_0=\frac{\sum_{i=1}^{m}1\{y^{(i)}=0\}x^{(i)}}{\sum_{i=1}^{m}1\{y^{(i)}=0\}}$$
对于(3)式,类同(2)式:
$$\mu_1=\frac{\sum_{i=1}^{m}1\{y^{(i)}=1\}x^{(i)}}{\sum_{i=1}^{m}1\{y^{(i)}=1\}}$$
对于(4)式:
$$\nabla_{\Sigma}\left[-\frac{m}{2}\log|\Sigma|-\frac{1}{2}\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)-\frac{1}{2}\sum_{i=1}^{m}y^{(i)}(x^{(i)}-\mu_1)^T\Sigma^{-1}(x^{(i)}-\mu_1)\right]=0$$
$$\nabla_{\Sigma}(m\log|\Sigma|)+\nabla_{\Sigma}\sum_{i=1}^{m}(1-y^{(i)})(x^{(i)}-\mu_0)^T\Sigma^{-1}(x^{(i)}-\mu_0)+\nabla_{\Sigma}\sum_{i=1}^{m}y^{(i)}(x^{(i)}-\mu_1)^T\Sigma^{-1}(x^{(i)}-\mu_1)=0$$
记第 $i$ 类($i\in\{0,1\}$)的样本协方差矩阵为 $S_i=\frac{1}{m_i}\sum_{k=1}^{m}1\{y^{(k)}=i\}(x^{(k)}-\mu_i)(x^{(k)}-\mu_i)^T$,下面通过 $S_i$ 简化表达上式:
$$
\begin{aligned}
&\nabla_{\Sigma}\sum_{k=1}^{m}1\{y^{(k)}=i\}(x^{(k)}-\mu_i)^T\Sigma^{-1}(x^{(k)}-\mu_i)\\
&=\nabla_{\Sigma}\,\mathrm{tr}\left(\sum_{k=1}^{m}1\{y^{(k)}=i\}(x^{(k)}-\mu_i)^T\Sigma^{-1}(x^{(k)}-\mu_i)\right)\\
&=\nabla_{\Sigma}\,\mathrm{tr}\left(\sum_{k=1}^{m}1\{y^{(k)}=i\}(x^{(k)}-\mu_i)(x^{(k)}-\mu_i)^T\Sigma^{-1}\right)\\
&=\nabla_{\Sigma}\,\mathrm{tr}(m_iS_i\Sigma^{-1})
\end{aligned}
$$
其中 $m_i=\sum_{k=1}^{m}1\{y^{(k)}=i\}$,
$\nabla_{\Sigma}\,\mathrm{tr}(m_iS_i\Sigma^{-1})=-m_i\Sigma^{-1}S_i^T\Sigma^{-1}$(利用 $\Sigma$ 对称),
而 $\nabla_{\Sigma}(m\log|\Sigma|)=m\frac{1}{|\Sigma|}|\Sigma|\Sigma^{-1}=m\Sigma^{-1}$,
因此,(4)式可简化为
$$m\Sigma^{-1}-\sum_{i=0}^{1}m_i\Sigma^{-1}S_i^T\Sigma^{-1}=0$$
上式左右两侧同乘 $\Sigma$,得
$$\Sigma=\frac{1}{m}\sum_{i=0}^{1}m_iS_i^T$$
$$\Sigma=\frac{1}{m}\sum_{i=1}^{m}(x^{(i)}-\mu_{y^{(i)}})(x^{(i)}-\mu_{y^{(i)}})^T$$