参考:
深入浅出–梯度下降法及其实现
算法描述
X ∈ R m × 1 \mathbf{X} \in R^{m \times 1 } X∈Rm×1 , y ∈ R m × 1 \mathbf{y} \in R^{m \times 1} y∈Rm×1,找到一个函数 f ( x ) f(x) f(x)实现 f : X → y f:\mathbf{X} \rightarrow \mathbf{y} f:X→y,其损失函数使用MSE,定义如下:
假设 f ( X ) = X θ + b f(X)=X \theta+b f(X)=Xθ+b ,其中 θ ∈ R 1 × 1 \theta \in R^{1 \times 1 } θ∈R1×1 , b ∈ R 1 × 1 b \in R^{1 \times 1} b∈R1×1
J ( θ , b ) = 1 2 m ∑ x ∈ X ( f ( x ( i ) ) − y i ) 2 = 1 2 m ( f ( X ) − y ) T ( f ( X ) − y ) = 1 2 m ( X θ + b − y ) T ( X θ + b − y ) J(\theta,b)=\frac{1}{2m}\sum_{x \in \mathbf{X}} (f(x^{(i)})-y^i)^2=\frac{1}{2m} (f(\mathbf{X})-\mathbf{y})^T(f(\mathbf{X})-\mathbf{y})=\frac{1}{2m} (\mathbf{X} \theta+b-\mathbf{y})^T(\mathbf{X} \theta+b-\mathbf{y}) J(θ,b)=2m1∑x∈X(f(x(i))−yi)2=2m1(f(X)−y)T(f(X)−y)=2m1(Xθ+b−y)T(Xθ+b−y)
使用梯度下降算法更新参数 θ , b \theta,b θ,b:
∂ J ∂ θ = ▽ θ J = 1 m X T ( X θ + b − y ) = 1 m ∑ i = 1 m x ( i ) ( x ( i ) θ + b − y ( i ) ) \frac{\partial J}{\partial \theta}=\bigtriangledown_\theta J =\frac{1}{m}\mathbf{X}^T(\mathbf{X}\theta+b-\mathbf{y})=\frac{1}{m}\sum_{i=1}^m x^{(i)}(x^{(i)}\theta+b-y^{(i)}) ∂θ∂J=▽θJ=m1XT(Xθ+b−y)=m1∑i=1mx(i)(x(i)θ+b−y(i))
▽ b J = 1 m ( X θ + b − y ) = 1 m ∑ i = 1 m ( x ( i ) θ + b − y ( i ) ) \bigtriangledown_b J=\frac{1}{m}(\mathbf{X}\theta+b-\mathbf{y})=\frac{1}{m} \sum_{i=1}^m (x^{(i)}\theta+b-y^{(i)}) ▽bJ=m1(Xθ+b−y)=m1∑i=1m(x(i)θ+b−y(i))
θ = θ − α ▽ θ J = θ − α 1 m ∑ i = 1 m x ( i ) ( x ( i ) θ + b − y ( i ) ) \theta=\theta-\alpha\bigtriangledown_\theta J=\theta-\alpha \frac{1}{m} \sum_{i=1}^m x^{(i)}(x^{(i)}\theta+b-y^{(i)}) θ=θ−α▽θJ=θ−αm1∑i=1mx(i)(x(i)θ+b−y(i))
b = b − α ▽ b J = b − α 1 m ∑ i = 1 m ( x ( i ) θ + b − y ( i ) ) b=b-\alpha\bigtriangledown_b J=b-\alpha \frac{1}{m} \sum_{i=1}^m (x^{(i)}\theta+b-y^{(i)}) b=b−α▽bJ=b−αm1∑i=1m(x(i)θ+b−y(i))
代码实现
numpy
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
m=20
alpha=0.0005
X=np.linspace(1,100,m).reshape((m,1))
y=5*X+3+np.random.randn(m,1)*10
def error_function(theta,b ,X, y):
'''Error function J definition.'''
diff = np.dot(X, theta)+b - y
return (1./2*m) * np.dot(np.transpose(diff), diff)
def gradient_function(theta,b,X, y):
'''Gradient of the function J definition.'''
diff = np.dot(X, theta)+b -y
return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)
"""
def gradient_function(theta,b,X, y):
gradient_theta=0
gradient_bias=0
for i in range(m):
gradient_theta+=X[i]*(X[i]*theta+b-y[i])
gradient_bias += (X[i] * theta + b - y[i])
return gradient_theta/m,gradient_bias/m
"""
def gradient_descent(X, y, alpha):
'''Perform gradient descent.'''
theta = np.random.random((1,1))
b=np.random.random((1,1))
gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
while not np.all(np.absolute(gradient_theta)+np.absolute(gradient_bias) <= 1e-5):
theta = theta - alpha * gradient_theta
b=b-alpha*gradient_bias
gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
print("loss:",error_function(theta,b,X, y))
return theta,b
optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])
plt.figure(figsize=(6,6))
plt.scatter(X,y,c='r',marker='o')
plt.plot(X,np.dot(X, optimal[0])+optimal[1])
plt.show()
numpy+cupy
将numpy数据转成cupy,使用GPU加速
# -*- coding:utf-8 -*-
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
m=20
alpha=0.0005
X_cpu=np.linspace(1,100,m).reshape((m,1))
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10
# 转移到GPU
X=cp.asarray(X_cpu)
y=cp.asarray(y_cpu)
def error_function(theta,b ,X, y):
'''Error function J definition.'''
diff = cp.dot(X, theta)+b - y
return (1./2*m) * cp.dot(np.transpose(diff), diff)
def gradient_function(theta,b,X, y):
'''Gradient of the function J definition.'''
# diff = np.dot(X, theta)+b -y
# return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)
diff=cp.dot(X,theta)+b-y
return (1. / m) * cp.dot(cp.transpose(X), diff), cp.mean(diff)
"""
def gradient_function(theta,b,X, y):
gradient_theta=0
gradient_bias=0
for i in range(m):
gradient_theta+=X[i]*(X[i]*theta+b-y[i])
gradient_bias += (X[i] * theta + b - y[i])
return gradient_theta/m,gradient_bias/m
"""
def gradient_descent(X, y, alpha):
'''Perform gradient descent.'''
# theta = np.random.random((1,1))
# b=np.random.random((1,1))
# gpu模式
theta=cp.random.random((1,1))
b=cp.random.random((1,1))
gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
# while not np.all(np.absolute(gradient_theta)+np.absolute(gradient_bias) <= 1e-5):
while not cp.all(cp.absolute(gradient_theta) + cp.absolute(gradient_bias) <= 1e-5):
theta = theta - alpha * gradient_theta
b=b-alpha*gradient_bias
gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
print("loss:",error_function(theta,b,X, y).get())
return theta,b
optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])
plt.figure(figsize=(6,6))
plt.scatter(X_cpu,y_cpu,c='r',marker='o')
plt.plot(cp.asnumpy(X),cp.asnumpy(cp.dot(X, optimal[0])+optimal[1])) # GPU-->CPU , cp.asnumpy(X) or X.get()
plt.show()
numpy+pytorch
将numpy数据放在pytorch中使用GPU加速
# -*- coding:utf-8 -*-
import torch
#import torch.nn as nn
#import torchvision
#import torchvision.transforms as transforms
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import numpy as np
import matplotlib.pyplot as plt
m=20
alpha=0.0005
X_cpu=np.linspace(1,100,m).reshape((m,1))
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10
# 转移到GPU
X=torch.from_numpy(X_cpu).to(device)
y=torch.from_numpy(y_cpu).to(device)
def error_function(theta,b ,X, y):
'''Error function J definition.'''
# diff = np.dot(X, theta)+b - y
# return (1./2*m) * np.dot(np.transpose(diff), diff)
diff =torch.matmul(X,theta)+b-y
return (1. / 2 * m) * torch.matmul(torch.transpose(diff,1,0), diff)
def gradient_function(theta,b,X, y):
'''Gradient of the function J definition.'''
# diff = np.dot(X, theta)+b -y
diff = torch.matmul(X, theta) + b - y
# return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)
return (1./m) * torch.matmul(torch.transpose(X, 1, 0), diff),torch.mean(diff)
"""
def gradient_function(theta,b,X, y):
gradient_theta=0
gradient_bias=0
for i in range(m):
gradient_theta+=X[i]*(X[i]*theta+b-y[i])
gradient_bias += (X[i] * theta + b - y[i])
return gradient_theta/m,gradient_bias/m
"""
def gradient_descent(X, y, alpha):
'''Perform gradient descent.'''
theta = torch.from_numpy(np.random.random((1,1))).to(device)
b=torch.from_numpy(np.random.random((1,1))).to(device)
gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
while not np.all(np.absolute(gradient_theta.to("cpu").numpy())+np.absolute(gradient_bias.to("cpu").numpy()) <= 1e-5):
theta = theta - alpha * gradient_theta
b=b-alpha*gradient_bias
gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
print("loss:",error_function(theta,b,X, y).to("cpu").numpy())
return theta,b
optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])
plt.figure(figsize=(6,6))
plt.scatter(X_cpu,y_cpu,c='r',marker='o')
plt.plot(X.to("cpu").numpy(),np.dot(X_cpu, optimal[0].to("cpu").numpy())+optimal[1].to("cpu").numpy())
plt.show()
pytorch
# -*- coding:utf-8 -*-
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import numpy as np
import matplotlib.pyplot as plt
m=20
alpha=0.00005
num_epochs=20
X_cpu=np.linspace(1,100,m).reshape((m,1))
X_cpu=X_cpu.astype(np.float32)
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10
y_cpu=y_cpu.astype(np.float32)
# 转移到GPU
X=torch.from_numpy(X_cpu).to(device)
y=torch.from_numpy(y_cpu).to(device)
# Linear regression model
# Fully connected neural network with one hidden layer
class LineNet(nn.Module):
def __init__(self, input_size, num_classes):
super(LineNet, self).__init__()
self.fc1 = nn.Linear(input_size, num_classes)
# self.relu = nn.ReLU()
# self.softplus=nn.Softplus()
# self.fc2 = nn.Linear(hidden_size, num_classes)
# self.output=nn.Softmax()
def forward(self, x):
out = self.fc1(x)
# out = self.relu(out)
# out = self.softplus(out)
# out = self.fc2(out)
# out=self.output(out)
return out
# model = nn.Linear(1, 1).to(device) # [1,1]
# or
model = LineNet(1, 1).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=alpha)
for epoch in range(num_epochs):
outputs = model(X)
loss = criterion(outputs, y)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (epoch + 1) % 5 == 0:
print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
# Save the model checkpoint
# torch.save(model.state_dict(), 'model.ckpt')
with torch.no_grad():
plt.figure(figsize=(6,6))
plt.scatter(X_cpu,y_cpu,c='r',marker='o')
# predicted = model(X).to("cpu").numpy()
# plt.plot(X.to("cpu").numpy(),predicted)
plt.plot(X_cpu, np.dot(X_cpu, model.fc1.weight.to("cpu").numpy()) + model.fc1.bias.to("cpu").numpy())
plt.show()
pytorch-2
手动更新参数
# -*- coding:utf-8 -*-
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import numpy as np
import matplotlib.pyplot as plt
m=20
alpha=0.00005
num_epochs=20
X_cpu=np.linspace(1,100,m).reshape((m,1))
X_cpu=X_cpu.astype(np.float32)
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10
y_cpu=y_cpu.astype(np.float32)
# 转移到GPU
X=torch.from_numpy(X_cpu).to(device)
y=torch.from_numpy(y_cpu).to(device)
# Linear regression model
# Fully connected neural network with one hidden layer
class LineNet(nn.Module):
def __init__(self, input_size, num_classes):
super(LineNet, self).__init__()
self.fc1 = nn.Linear(input_size, num_classes)
# self.relu = nn.ReLU()
# self.softplus=nn.Softplus()
# self.fc2 = nn.Linear(hidden_size, num_classes)
# self.output=nn.Softmax()
def forward(self, x):
out = self.fc1(x)
# out = self.relu(out)
# out = self.softplus(out)
# out = self.fc2(out)
# out=self.output(out)
return out
# model = nn.Linear(1, 1).to(device) # [1,1]
# or
model = LineNet(1, 1).to(device)
criterion = nn.MSELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=alpha)
curr_lr=alpha
# For updating learning rate
def update_lr(optimizer, lr):
for param_group in optimizer.param_groups:
param_group['lr'] = lr
for epoch in range(num_epochs):
outputs = model(X)
loss = criterion(outputs, y)
# Backward and optimize
# optimizer.zero_grad()
loss.backward()
# optimizer.step()
# 手动更新参数
model.fc1.weight.data.zero_()
model.fc1.weight.data.sub_(alpha*model.fc1.weight.grad.data)
model.fc1.bias.data.zero_()
model.fc1.bias.data.sub_(alpha * model.fc1.bias.grad.data)
if (epoch + 1) % 5 == 0:
print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))
# Save the model checkpoint
# torch.save(model.state_dict(), 'model.ckpt')
"""
# Decay learning rate
if (epoch + 1) % 20 == 0:
curr_lr /= 3
update_lr(optimizer, curr_lr)
"""
with torch.no_grad():
plt.figure(figsize=(6,6))
plt.scatter(X_cpu,y_cpu,c='r',marker='o')
# predicted = model(X).to("cpu").numpy()
# plt.plot(X.to("cpu").numpy(),predicted)
plt.plot(X_cpu, np.dot(X_cpu, model.fc1.weight.to("cpu").numpy()) + model.fc1.bias.to("cpu").numpy())
plt.show()
numpy+tensorflow
使用tf.device("/gpu:0")
设置gpu,加速
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
m=20
alpha=0.0005
with tf.device("/gpu:0"):
X=np.linspace(1,100,m).reshape((m,1))
y=5*X+3+np.random.randn(m,1)*10
def error_function(theta,b ,X, y):
with tf.device("/gpu:0"):
'''Error function J definition.'''
diff = np.dot(X, theta)+b - y
return (1./2*m) * np.dot(np.transpose(diff), diff)
def gradient_function(theta,b,X, y):
with tf.device("/gpu:0"):
'''Gradient of the function J definition.'''
diff = np.dot(X, theta)+b -y
return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)
"""
def gradient_function(theta,b,X, y):
gradient_theta=0
gradient_bias=0
for i in range(m):
gradient_theta+=X[i]*(X[i]*theta+b-y[i])
gradient_bias += (X[i] * theta + b - y[i])
return gradient_theta/m,gradient_bias/m
"""
def gradient_descent(X, y, alpha):
with tf.device("/gpu:0"):
'''Perform gradient descent.'''
theta = np.random.random((1,1))
b=np.random.random((1,1))
gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
while not np.all(np.absolute(gradient_theta)+np.absolute(gradient_bias) <= 1e-5):
theta = theta - alpha * gradient_theta
b=b-alpha*gradient_bias
gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
print("loss:",error_function(theta,b,X, y))
return theta,b
with tf.device("/gpu:0"):
optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])
plt.figure(figsize=(6,6))
plt.scatter(X,y,c='r',marker='o')
plt.plot(X,np.dot(X, optimal[0])+optimal[1])
plt.show()
tensorflow
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
m=20
alpha=0.00005
epochs=20
with tf.device("/gpu:0"):
X=np.linspace(1,100,m).reshape((m,1))
y=5*X+3+np.random.randn(m,1)*10
input_x=tf.placeholder(tf.float32,(m,1))
output_y=tf.placeholder(tf.float32,(m,1))
# pred=tf.layers.dense(input_x,1)
b = tf.Variable(tf.zeros([1,1]))
W=tf.Variable(tf.ones([1,1]))
pred=tf.matmul(input_x,W)+b
loss=tf.losses.mean_squared_error(output_y,pred)
optimizer=tf.train.GradientDescentOptimizer(alpha)
train_op=optimizer.minimize(loss,var_list=tf.trainable_variables())
# 手动更新参数
# Computing the gradient of cost with respect to W and b
grad_W, grad_b=tf.gradients(loss,[W,b])
# Gradient Step
new_W = W.assign(W - alpha * grad_W) # 相当于 W-=learning_rate*grad_W
new_b = b.assign(b - alpha * grad_b) # 相当于 b-=learning_rate*grad_b
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
# sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
gpu_options = tf.GPUOptions(allow_growth=True)
# sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
tf.global_variables_initializer().run()
for epoch in range(epochs):
# _,loss_=sess.run([train_op,loss],{input_x:X,output_y:y})
# 手动更新参数
_, _,loss_= sess.run([new_W, new_b,loss],{input_x:X,output_y:y})
print("loss:",loss_)
plt.figure(figsize=(6,6))
plt.scatter(X,y,c='r',marker='o')
plt.plot(X,pred.eval({input_x:X}))
plt.show()
julia
using LinearAlgebra # or import LinearAlgebra
using Statistics
using PyPlot
m=20;
alpha=0.0005;
epochs=20;
X=reshape(linspace(1,100,m),(m,1));
y=5 .*X .+3+randn((m,1)).*10;
function error_function(theta,b,X,y)
#diff=dot(X,theta)+b-y;
#(1.0/2*m).*dot(transpose(diff),diff);
diff=X.*theta.+b-y;
(1.0/2*m).*dot(transpose(diff),diff);
end
gradient_function(theta,b,X, y)=begin
diff=X.*theta.+b-y; #diff=dot(X,theta)+b-y;
# or
# diff=@. X*theta+b-y;
return (1.0/m) .* dot(transpose(X), diff),mean(diff)
end
gradient_descent=(X, y, alpha)->begin
theta=randn(1,1);
b=randn(1,1);
gradient_theta,gradient_bias = gradient_function(theta,b, X, y);
while !all(abs(gradient_theta)+abs(gradient_bias)<=1e-5)
theta = theta - alpha * gradient_theta;
b=b-alpha*gradient_bias;
gradient_theta, gradient_bias = gradient_function(theta,b,X, y);
print("loss:"*string(error_function(theta,b,X, y))*"\n");
end
# print("$theta,$b")
return theta,b
end
optimal = gradient_descent(X, y, alpha);
print("optimal:$optimal");
figure(figsize=(6,6));
scatter(X,y,c="r",marker="o"); # 注意必须使用双引号
plot(X,X.*optimal[1].+optimal[2]);
show();