$$J(\theta) = -\frac{1}{m}\left[\sum_{i=1}^{m}\left(y^{(i)}\log\hat{y}_{\theta}(x^{(i)}) + \left(1-y^{(i)}\right)\log\left(1-\hat{y}_{\theta}(x^{(i)})\right)\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$
$$\hat{y}_{\theta}(X) = g(\theta^{T}X) = \mathrm{sigmoid}(\theta^{T}X)$$
Note that the regularization sum $\frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$ starts at $j = 1$: the intercept $\theta_0$ (the bias term $b$ in neural-network notation) is left unpenalized.
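Batch gradient descent minimizes $J(\theta)$ using its gradient. Differentiating the cost above gives, for $j \ge 1$ (the $j = 0$ component simply drops the penalty term):

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(\hat{y}_{\theta}(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j$$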
import numpy as np

def h(x, w):
    # Linear score theta^T x for each row of x.
    return np.dot(x, w)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def loss(x, y, w, b=0):
    # Regularized cross-entropy cost; b plays the role of lambda above,
    # and the intercept w[0] is excluded from the penalty.
    m, n = x.shape
    yhat = sigmoid(h(x, w))
    J = (-np.dot(np.log(yhat).T, y) - np.dot(np.log(1 - yhat).T, 1 - y)) / m \
        + (b / (2 * m)) * np.sum(np.square(w[1:]))
    J = J.item()
    return np.inf if np.isnan(J) else J
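As a quick sanity check (a minimal sketch; `x_demo`, `y_demo`, and `w0` are hypothetical names, not part of the original code): at $w = 0$ every prediction is $0.5$, so the unregularized cost must be $\log 2 \approx 0.693$ regardless of the labels.

# Hypothetical sanity check: at w = 0, yhat = 0.5 everywhere,
# so the cost is -log(0.5) = log(2) whatever y is.
x_demo = np.c_[np.ones((4, 1)), np.arange(4.0)[:, None]]
y_demo = np.array([[0], [1], [1], [0]])
w0 = np.zeros((2, 1))
assert abs(loss(x_demo, y_demo, w0) - np.log(2)) < 1e-12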
def bgd(x, y, alpha=0.05, epsilon=1e-8, maxloop=100000, b=0):
    # Batch gradient descent; stops when the cost change drops below epsilon.
    m, n = x.shape
    w = np.zeros((n, 1))
    costs = [loss(x, y, w, b)]
    ws = [w]
    for n_iter in range(maxloop):
        yhat = sigmoid(np.dot(x, w))
        # Gradient of J; the intercept w[0] carries no regularization term.
        w_reg = w.copy()
        w_reg[0, 0] = 0.0
        grad = np.dot(x.T, yhat - y) / m + (b / m) * w_reg
        w = w - alpha * grad
        ws.append(w)
        costs.append(loss(x, y, w, b))
        lossvar = abs(costs[-1] - costs[-2])
        if lossvar < epsilon:
            return n_iter, costs, ws
        if np.isinf(lossvar) or np.isnan(lossvar):
            raise RuntimeError('gradient descent diverged; try a smaller alpha')
    return maxloop, costs, ws
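A small smoke test (hypothetical toy data, introduced here rather than taken from the original post): on a linearly separable problem, the cost should fall monotonically with a modest step size.

# Hypothetical smoke test on a separable toy problem.
x_toy = np.c_[np.ones((6, 1)), np.array([-3., -2., -1., 1., 2., 3.])[:, None]]
y_toy = np.array([[0], [0], [0], [1], [1], [1]])
it_toy, costs_toy, _ = bgd(x_toy, y_toy, alpha=0.1, maxloop=2000)
assert all(c2 <= c1 + 1e-12 for c1, c2 in zip(costs_toy, costs_toy[1:]))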
## Data
Xlo = np.linspace(-10, 10, 10000)[:, None]
Ylo = (np.random.randn(10000) * 10)[:, None]
m, n = Xlo.shape
b = 0
x = np.c_[np.ones((m, 1)), Xlo]           # prepend the intercept column
y = (np.abs(Ylo / 10) > 1).astype(int)    # binary labels from the noise column
## Run batch gradient descent
n_iter, costs, ws = bgd(x, y)
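To inspect the run (a sketch; `pred` and `w_final` are names introduced here): the returned history gives the iteration count, the final cost, and the fitted weights, from which training accuracy can be read off.

w_final = ws[-1]
pred = (sigmoid(np.dot(x, w_final)) >= 0.5).astype(int)
print('iterations:', n_iter)
print('final cost:', costs[-1])
print('training accuracy:', (pred == y).mean())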