1. Hypothesis Function
The hypothesis function is:
$$
\begin{aligned}
P(y|x;\theta)&=[h_{\theta}(x)]^{y}\,[1-h_{\theta}(x)]^{1-y}\\
&=\left(\frac{1}{1+e^{-\theta^{T}x}}\right)^{y}\left(1-\frac{1}{1+e^{-\theta^{T}x}}\right)^{1-y}
\end{aligned}
$$
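As a quick check, the compact Bernoulli form above returns $h_{\theta}(x)$ for $y=1$ and $1-h_{\theta}(x)$ for $y=0$. A minimal sketch (the values of theta and x are made up for illustration):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

theta = np.array([0.5, -1.0])      # hypothetical parameters
x = np.array([1.0, 2.0])           # hypothetical feature vector
h = sigmoid(theta @ x)             # h_theta(x) = P(y=1 | x; theta)

for y in (0, 1):
    p = h ** y * (1 - h) ** (1 - y)
    print(y, p)                    # prints 1-h for y=0 and h for y=1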
2. Maximum Likelihood Estimation
The likelihood function is:
$$
\begin{aligned}
L(\theta)&=\prod_{i=1}^{N}P(y^{(i)}|x^{(i)};\theta)\\
&=\prod_{i=1}^{N}[h_{\theta}(x^{(i)})]^{y^{(i)}}\,[1-h_{\theta}(x^{(i)})]^{1-y^{(i)}}\\
&=\prod_{i=1}^{N}\left(\frac{1}{1+e^{-\theta^{T}x^{(i)}}}\right)^{y^{(i)}}\left(1-\frac{1}{1+e^{-\theta^{T}x^{(i)}}}\right)^{1-y^{(i)}}
\end{aligned}
$$
The objective function of logistic regression is the log-likelihood function; maximum likelihood estimation then amounts to solving
$$
\max_{\theta}L(\theta)\;\Leftrightarrow\;\max_{\theta}\sum_{i=1}^{N}\left[y^{(i)}\ln h_{\theta}(x^{(i)})+(1-y^{(i)})\ln\left(1-h_{\theta}(x^{(i)})\right)\right]
$$
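In code, the raw product form of $L(\theta)$ underflows as $N$ grows, which is one reason to work with the sum of logarithms instead. A minimal NumPy sketch with fabricated data:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))                  # hypothetical design matrix
y = (rng.random(100) < 0.5).astype(float)      # hypothetical 0/1 labels
theta = rng.normal(size=3)

h = 1.0 / (1.0 + np.exp(-X @ theta))           # h_theta(x^(i)) for every sample

likelihood = np.prod(h ** y * (1 - h) ** (1 - y))   # tiny product; underflows for large N
log_likelihood = np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
print(likelihood, log_likelihood)              # np.log(likelihood) matches log_likelihood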
3. Solving by Gradient Ascent
The gradient of the log-likelihood (the third step below uses the sigmoid identity $\frac{\partial}{\partial\theta}h_{\theta}(x)=h_{\theta}(x)(1-h_{\theta}(x))\cdot\frac{\partial}{\partial\theta}\theta^{T}x$):
$$
\begin{aligned}
\frac{d\ln L(\theta)}{d\theta}&=\frac{d}{d\theta}\sum_{i=1}^{N}\left[y^{(i)}\ln h_{\theta}(x^{(i)})+(1-y^{(i)})\ln(1-h_{\theta}(x^{(i)}))\right]\\
&=\sum_{i=1}^{N}\left[y^{(i)}\cdot\frac{1}{h_{\theta}(x^{(i)})}-(1-y^{(i)})\cdot\frac{1}{1-h_{\theta}(x^{(i)})}\right]\cdot\frac{\partial}{\partial\theta}h_{\theta}(x^{(i)})\\
&=\sum_{i=1}^{N}\left[y^{(i)}\cdot\frac{1}{h_{\theta}(x^{(i)})}-(1-y^{(i)})\cdot\frac{1}{1-h_{\theta}(x^{(i)})}\right]\cdot h_{\theta}(x^{(i)})\cdot(1-h_{\theta}(x^{(i)}))\cdot\frac{\partial}{\partial\theta}\theta^{T}x^{(i)}\\
&=\sum_{i=1}^{N}\left[y^{(i)}\cdot(1-h_{\theta}(x^{(i)}))-(1-y^{(i)})\cdot h_{\theta}(x^{(i)})\right]x^{(i)}\\
&=\sum_{i=1}^{N}\left(y^{(i)}-h_{\theta}(x^{(i)})\right)x^{(i)}\qquad{\color{red}\text{error}\times\text{feature}}
\end{aligned}
$$
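The final error-times-feature form is easy to verify numerically by comparing it against a central-difference estimate of the log-likelihood; a sketch with made-up data:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))                   # hypothetical samples
y = (rng.random(20) < 0.5).astype(float)
theta = rng.normal(size=3)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def log_likelihood(t):
    h = sigmoid(X @ t)
    return np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

analytic = X.T @ (y - sigmoid(X @ theta))      # sum_i (y_i - h_i) x_i

eps = 1e-6
numeric = np.array([(log_likelihood(theta + eps * e) - log_likelihood(theta - eps * e)) / (2 * eps)
                    for e in np.eye(3)])
print(np.max(np.abs(analytic - numeric)))      # agreement to ~1e-8 confirms the derivation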
Gradient ascent update ($\alpha$ is the learning rate):
$$
\theta := \theta+\alpha\,\frac{d\ln L(\theta)}{d\theta}=\theta+\alpha\sum_{i=1}^{N}\left(y^{(i)}-h_{\theta}(x^{(i)})\right)x^{(i)}
$$
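In vectorized form this update is only a few NumPy lines, which the full implementation in the next section builds on (a sketch; the learning rate and iteration count here are arbitrary choices):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gradient_ascent(X, y, alpha=0.005, n_iters=500):
    theta = np.zeros(X.shape[1])
    for _ in range(n_iters):
        h = sigmoid(X @ theta)              # predictions for all N samples at once
        theta += alpha * (X.T @ (y - h))    # theta := theta + alpha * sum_i (y_i - h_i) x_i
    return theta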
4. Pure Python Implementation
The code is as follows:
import numpy as np
import matplotlib.pyplot as plt
import time

# Load the training and test data
def load_data():
    X_train = np.loadtxt("./Exam/train/x.txt")
    Y_train = np.loadtxt("./Exam/train/y.txt", dtype=int)
    X_test = np.loadtxt("./Exam/test/x.txt")
    Y_test = np.loadtxt("./Exam/test/y.txt", dtype=int)
    return X_train, Y_train, X_test, Y_test

# Logistic regression class
class Logistic(object):
    def __init__(self, X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train
        # M: number of features, N: number of samples
        self.M = X_train.shape[1]
        self.N = X_train.shape[0]
        self.lr = 0.005
        self.train()

    def normalization(self):
        # Z-score normalization, computed per feature (axis=0)
        mean = np.mean(self.X_train, axis=0)
        std = np.std(self.X_train, axis=0)
        self.X_train = (self.X_train - mean) / std
        # Prepend a column of ones for the intercept term
        self.X_train = np.insert(self.X_train, 0, values=1.0, axis=1)
        self.Y_train = self.Y_train.reshape(self.N, 1)
        self.M += 1

    def sigmoid(self, X):
        # h_theta(x) = 1 / (1 + exp(-theta^T x)) for all samples, shape N*1
        eta = -np.dot(X, self.theta)
        return 1.0 / (1.0 + np.exp(eta))

    def Gradient_ascent(self):
        self.theta = -np.ones((self.M, 1))
        for i in range(500):
            self.H = self.sigmoid(self.X_train)
            # theta := theta + alpha * X^T (y - h), the "error x feature" update
            self.theta += self.lr * np.dot(self.X_train.T, (self.Y_train - self.H))
            # Average negative log-likelihood, tracked for monitoring
            self.loss = -np.sum(self.Y_train * np.log(self.H)
                                + (1.0 - self.Y_train) * np.log(1.0 - self.H)) / self.N
            print("iter: %d, loss: %f" % (i, self.loss))
        print(self.theta)

    def train(self):
        self.normalization()
        self.Gradient_ascent()

if __name__ == "__main__":
    X_train, Y_train, X_test, Y_test = load_data()
    Logistic(X_train, Y_train)
The optimization trace (the loss at each iteration) is printed during training.
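To visualize the trace, one could collect the per-iteration loss into a list and plot it with matplotlib (a sketch; the self.losses attribute is an assumption and would need to be appended to inside Gradient_ascent):

import matplotlib.pyplot as plt

# Hypothetical: assumes Gradient_ascent also runs self.losses.append(self.loss)
model = Logistic(X_train, Y_train)
plt.plot(model.losses)
plt.xlabel("iteration")
plt.ylabel("average negative log-likelihood")
plt.show()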
(These are my personal notes from studying machine learning; if you spot any errors, please point them out so I can correct them.)