Linear Regression in Detail


Reference:
深入浅出–梯度下降法及其实现 (a Chinese-language introduction to gradient descent and its implementation)


Algorithm Description

Given $\mathbf{X} \in R^{m \times 1}$ and $\mathbf{y} \in R^{m \times 1}$, find a function $f(x)$ that maps $f: \mathbf{X} \rightarrow \mathbf{y}$. The loss function is the mean squared error (MSE), defined below.

Assume $f(\mathbf{X}) = \mathbf{X}\theta + b$, where $\theta \in R^{1 \times 1}$ and $b \in R^{1 \times 1}$.

$$J(\theta,b)=\frac{1}{2m}\sum_{i=1}^{m}\left(f(x^{(i)})-y^{(i)}\right)^2=\frac{1}{2m}\left(f(\mathbf{X})-\mathbf{y}\right)^T\left(f(\mathbf{X})-\mathbf{y}\right)=\frac{1}{2m}\left(\mathbf{X}\theta+b-\mathbf{y}\right)^T\left(\mathbf{X}\theta+b-\mathbf{y}\right)$$

Update the parameters $\theta$ and $b$ with gradient descent:

$$\frac{\partial J}{\partial \theta}=\nabla_\theta J=\frac{1}{m}\mathbf{X}^T\left(\mathbf{X}\theta+b-\mathbf{y}\right)=\frac{1}{m}\sum_{i=1}^{m}x^{(i)}\left(x^{(i)}\theta+b-y^{(i)}\right)$$

$$\nabla_b J=\frac{1}{m}\sum_{i=1}^{m}\left(x^{(i)}\theta+b-y^{(i)}\right)$$

$$\theta = \theta-\alpha\nabla_\theta J=\theta-\alpha\frac{1}{m}\sum_{i=1}^{m}x^{(i)}\left(x^{(i)}\theta+b-y^{(i)}\right)$$

$$b = b-\alpha\nabla_b J=b-\alpha\frac{1}{m}\sum_{i=1}^{m}\left(x^{(i)}\theta+b-y^{(i)}\right)$$
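
As a quick sanity check on the gradient formulas above, the analytic gradients can be compared against central finite differences of $J$. The snippet below is a minimal numpy sketch; the data and the helper names (loss, analytic_grads) are illustrative only and are not reused in the scripts that follow.

import numpy as np

m = 20
X = np.linspace(1, 100, m).reshape((m, 1))
y = 5 * X + 3 + np.random.randn(m, 1) * 10

def loss(theta, b):
    # J(theta, b) = (1/(2m)) * ||X*theta + b - y||^2
    diff = X * theta + b - y
    return (diff.T @ diff).item() / (2 * m)

def analytic_grads(theta, b):
    # gradients derived above: (1/m) * X^T(diff) and mean(diff)
    diff = X * theta + b - y
    return (X.T @ diff).item() / m, diff.mean()

theta0, b0, eps = 0.5, 0.1, 1e-6
g_theta, g_b = analytic_grads(theta0, b0)

# central finite differences of J with respect to theta and b
num_g_theta = (loss(theta0 + eps, b0) - loss(theta0 - eps, b0)) / (2 * eps)
num_g_b = (loss(theta0, b0 + eps) - loss(theta0, b0 - eps)) / (2 * eps)

# relative differences should be tiny (roughly 1e-6 or smaller)
print(abs(g_theta - num_g_theta) / abs(g_theta), abs(g_b - num_g_b) / abs(g_b))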

Code Implementation

numpy

# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt

m=20
alpha=0.0005

X=np.linspace(1,100,m).reshape((m,1))
y=5*X+3+np.random.randn(m,1)*10


def error_function(theta,b ,X, y):
    '''Error function J definition.'''
    diff = np.dot(X, theta)+b - y
    return (1. / (2 * m)) * np.dot(np.transpose(diff), diff)


def gradient_function(theta,b,X, y):
    '''Gradient of the function J definition.'''
    diff = np.dot(X, theta)+b -y
    return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)

"""
def gradient_function(theta,b,X, y):
    gradient_theta=0
    gradient_bias=0
    for i in range(m):
        gradient_theta+=X[i]*(X[i]*theta+b-y[i])
        gradient_bias += (X[i] * theta + b - y[i])

    return gradient_theta/m,gradient_bias/m
"""


def gradient_descent(X, y, alpha):
    '''Perform gradient descent.'''
    theta = np.random.random((1,1))
    b=np.random.random((1,1))
    gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
    while not np.all(np.absolute(gradient_theta)+np.absolute(gradient_bias) <= 1e-5):
        theta = theta - alpha * gradient_theta
        b=b-alpha*gradient_bias
        gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
        print("loss:",error_function(theta,b,X, y))
    return theta,b

optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])

plt.figure(figsize=(6,6))
plt.scatter(X,y,c='r',marker='o')

plt.plot(X,np.dot(X, optimal[0])+optimal[1])

plt.show()
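
Since this is ordinary least squares, the gradient-descent result can be cross-checked against the closed-form solution. A minimal sketch, assuming X, y, m and optimal from the script above are still in scope:

# Append a column of ones so theta and the bias b are estimated jointly.
X_aug = np.hstack([X, np.ones((m, 1))])
coef, _, _, _ = np.linalg.lstsq(X_aug, y, rcond=None)

print('closed-form theta, b:', coef.ravel())   # roughly (5, 3) for this data
print('gradient-descent theta, b:', optimal[0].ravel(), optimal[1].ravel())

Because the loop above runs until the gradient magnitude drops below 1e-5, the two estimates should agree closely; the point of the gradient-descent version is that the same update rule carries over to the GPU implementations below.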

numpy+cupy

Reference: https://docs-cupy.chainer.org/en/stable/

Convert the numpy arrays to CuPy arrays so the computation runs on the GPU.

# -*- coding:utf-8 -*-

import numpy as np
import cupy as cp

import matplotlib.pyplot as plt

m=20
alpha=0.0005

X_cpu=np.linspace(1,100,m).reshape((m,1))
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10

# Move to the GPU
X=cp.asarray(X_cpu)
y=cp.asarray(y_cpu)

def error_function(theta,b ,X, y):
    '''Error function J definition.'''
    diff = cp.dot(X, theta)+b - y
    return (1. / (2 * m)) * cp.dot(cp.transpose(diff), diff)


def gradient_function(theta,b,X, y):
    '''Gradient of the function J definition.'''
    # diff = np.dot(X, theta)+b -y
    # return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)
    diff=cp.dot(X,theta)+b-y
    return (1. / m) * cp.dot(cp.transpose(X), diff), cp.mean(diff)


"""
def gradient_function(theta,b,X, y):
    gradient_theta=0
    gradient_bias=0
    for i in range(m):
        gradient_theta+=X[i]*(X[i]*theta+b-y[i])
        gradient_bias += (X[i] * theta + b - y[i])

    return gradient_theta/m,gradient_bias/m
"""


def gradient_descent(X, y, alpha):
    '''Perform gradient descent.'''
    # theta = np.random.random((1,1))
    # b=np.random.random((1,1))

    # GPU mode
    theta=cp.random.random((1,1))
    b=cp.random.random((1,1))

    gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
    # while not np.all(np.absolute(gradient_theta)+np.absolute(gradient_bias) <= 1e-5):
    while not cp.all(cp.absolute(gradient_theta) + cp.absolute(gradient_bias) <= 1e-5):
        theta = theta - alpha * gradient_theta
        b=b-alpha*gradient_bias
        gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
        print("loss:",error_function(theta,b,X, y).get())
    return theta,b

optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])

plt.figure(figsize=(6,6))
plt.scatter(X_cpu,y_cpu,c='r',marker='o')

plt.plot(cp.asnumpy(X),cp.asnumpy(cp.dot(X, optimal[0])+optimal[1])) # GPU-->CPU , cp.asnumpy(X) or X.get()

plt.show()
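
Whether the GPU actually pays off depends on the problem size: with m=20 the host-to-device transfers and kernel launches dominate, and the CPU will typically be faster. A rough timing sketch with a larger array (the array size and iteration count here are arbitrary; the synchronize call is needed because CuPy launches kernels asynchronously):

import time
import numpy as np
import cupy as cp

n = 1_000_000
a_cpu = np.random.randn(n, 1)
a_gpu = cp.asarray(a_cpu)

_ = a_gpu.T @ a_gpu                       # warm-up (kernel compilation, memory pool)
cp.cuda.Stream.null.synchronize()

t0 = time.time()
for _ in range(100):
    _ = a_cpu.T @ a_cpu                   # CPU matrix product
cpu_time = time.time() - t0

t0 = time.time()
for _ in range(100):
    _ = a_gpu.T @ a_gpu                   # GPU matrix product
cp.cuda.Stream.null.synchronize()         # wait for queued GPU work to finish
gpu_time = time.time() - t0

print('cpu: %.4f s, gpu: %.4f s' % (cpu_time, gpu_time))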

numpy+pytorch

Move the numpy data into PyTorch tensors so the computation can run on the GPU.

# -*- coding:utf-8 -*-

import torch
#import torch.nn as nn
#import torchvision
#import torchvision.transforms as transforms

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import numpy as np

import matplotlib.pyplot as plt

m=20
alpha=0.0005

X_cpu=np.linspace(1,100,m).reshape((m,1))
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10

# Move to the GPU
X=torch.from_numpy(X_cpu).to(device)
y=torch.from_numpy(y_cpu).to(device)

def error_function(theta,b ,X, y):
    '''Error function J definition.'''
    # diff = np.dot(X, theta)+b - y
    # return (1./2*m) * np.dot(np.transpose(diff), diff)
    diff =torch.matmul(X,theta)+b-y
    return (1. / (2 * m)) * torch.matmul(torch.transpose(diff, 1, 0), diff)


def gradient_function(theta,b,X, y):
    '''Gradient of the function J definition.'''
    # diff = np.dot(X, theta)+b -y
    diff = torch.matmul(X, theta) + b - y
    # return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)
    return (1./m) * torch.matmul(torch.transpose(X, 1, 0), diff),torch.mean(diff)

"""
def gradient_function(theta,b,X, y):
    gradient_theta=0
    gradient_bias=0
    for i in range(m):
        gradient_theta+=X[i]*(X[i]*theta+b-y[i])
        gradient_bias += (X[i] * theta + b - y[i])

    return gradient_theta/m,gradient_bias/m
"""



def gradient_descent(X, y, alpha):
    '''Perform gradient descent.'''
    theta = torch.from_numpy(np.random.random((1,1))).to(device)
    b=torch.from_numpy(np.random.random((1,1))).to(device)

    gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
    while not np.all(np.absolute(gradient_theta.to("cpu").numpy())+np.absolute(gradient_bias.to("cpu").numpy()) <= 1e-5):
        theta = theta - alpha * gradient_theta
        b=b-alpha*gradient_bias
        gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
        print("loss:",error_function(theta,b,X, y).to("cpu").numpy())
    return theta,b

optimal = gradient_descent(X, y, alpha)
print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])

plt.figure(figsize=(6,6))
plt.scatter(X_cpu,y_cpu,c='r',marker='o')

plt.plot(X.to("cpu").numpy(),np.dot(X_cpu, optimal[0].to("cpu").numpy())+optimal[1].to("cpu").numpy())

plt.show()

pytorch

# -*- coding:utf-8 -*-

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import numpy as np

import matplotlib.pyplot as plt

m=20
alpha=0.00005
num_epochs=20

X_cpu=np.linspace(1,100,m).reshape((m,1))
X_cpu=X_cpu.astype(np.float32)
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10
y_cpu=y_cpu.astype(np.float32)

# Move to the GPU
X=torch.from_numpy(X_cpu).to(device)
y=torch.from_numpy(y_cpu).to(device)



# Linear regression model


# A single fully connected (linear) layer; the commented lines show how a hidden layer could be added
class LineNet(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LineNet, self).__init__()
        self.fc1 = nn.Linear(input_size, num_classes)
        # self.relu = nn.ReLU()
        # self.softplus=nn.Softplus()
        # self.fc2 = nn.Linear(hidden_size, num_classes)
        # self.output=nn.Softmax()

    def forward(self, x):
        out = self.fc1(x)
        # out = self.relu(out)
        # out = self.softplus(out)
        # out = self.fc2(out)
        # out=self.output(out)
        return out


# model = nn.Linear(1, 1).to(device) # [1,1]

# or
model = LineNet(1, 1).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=alpha)

for epoch in range(num_epochs):
    outputs = model(X)
    loss = criterion(outputs, y)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 5 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

    # Save the model checkpoint
    # torch.save(model.state_dict(), 'model.ckpt')

with torch.no_grad():
    plt.figure(figsize=(6,6))
    plt.scatter(X_cpu,y_cpu,c='r',marker='o')

    # predicted = model(X).to("cpu").numpy()
    # plt.plot(X.to("cpu").numpy(),predicted)

    plt.plot(X_cpu, np.dot(X_cpu, model.fc1.weight.detach().cpu().numpy()) + model.fc1.bias.detach().cpu().numpy())

    plt.show()

pytorch-2

Update the parameters manually instead of calling optimizer.step().

# -*- coding:utf-8 -*-

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import numpy as np

import matplotlib.pyplot as plt

m=20
alpha=0.00005
num_epochs=20

X_cpu=np.linspace(1,100,m).reshape((m,1))
X_cpu=X_cpu.astype(np.float32)
y_cpu=5*X_cpu+3+np.random.randn(m,1)*10
y_cpu=y_cpu.astype(np.float32)

# Move to the GPU
X=torch.from_numpy(X_cpu).to(device)
y=torch.from_numpy(y_cpu).to(device)



# Linear regression model


# A single fully connected (linear) layer; the commented lines show how a hidden layer could be added
class LineNet(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LineNet, self).__init__()
        self.fc1 = nn.Linear(input_size, num_classes)
        # self.relu = nn.ReLU()
        # self.softplus=nn.Softplus()
        # self.fc2 = nn.Linear(hidden_size, num_classes)
        # self.output=nn.Softmax()

    def forward(self, x):
        out = self.fc1(x)
        # out = self.relu(out)
        # out = self.softplus(out)
        # out = self.fc2(out)
        # out=self.output(out)
        return out


# model = nn.Linear(1, 1).to(device) # [1,1]

# or
model = LineNet(1, 1).to(device)

criterion = nn.MSELoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=alpha)

curr_lr=alpha

# For updating learning rate
def update_lr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


for epoch in range(num_epochs):
    outputs = model(X)
    loss = criterion(outputs, y)

    # Backward and optimize
    # optimizer.zero_grad()
    loss.backward()
    # optimizer.step()

    # Manual gradient-descent step: parameter <- parameter - alpha * gradient,
    # then clear the accumulated gradients (optimizer.zero_grad() is not used here)
    model.fc1.weight.data.sub_(alpha * model.fc1.weight.grad.data)
    model.fc1.bias.data.sub_(alpha * model.fc1.bias.grad.data)

    model.fc1.weight.grad.data.zero_()
    model.fc1.bias.grad.data.zero_()

    if (epoch + 1) % 5 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

    # Save the model checkpoint
    # torch.save(model.state_dict(), 'model.ckpt')
    
    """
    # Decay learning rate
    if (epoch + 1) % 20 == 0:
        curr_lr /= 3
        update_lr(optimizer, curr_lr)
    """

with torch.no_grad():
    plt.figure(figsize=(6,6))
    plt.scatter(X_cpu,y_cpu,c='r',marker='o')

    # predicted = model(X).to("cpu").numpy()
    # plt.plot(X.to("cpu").numpy(),predicted)

    plt.plot(X_cpu, np.dot(X_cpu, model.fc1.weight.detach().cpu().numpy()) + model.fc1.bias.detach().cpu().numpy())

    plt.show()

numpy+tensorflow

Wrap the computation in tf.device("/gpu:0") to select the GPU. Note that tf.device only affects TensorFlow operations, so the plain numpy code below still runs on the CPU.

# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

m=20
alpha=0.0005

with tf.device("/gpu:0"):
    X=np.linspace(1,100,m).reshape((m,1))
    y=5*X+3+np.random.randn(m,1)*10


def error_function(theta,b ,X, y):
    with tf.device("/gpu:0"):
        '''Error function J definition.'''
        diff = np.dot(X, theta)+b - y
    return (1./(2*m)) * np.dot(np.transpose(diff), diff)


def gradient_function(theta,b,X, y):
    with tf.device("/gpu:0"):
        '''Gradient of the function J definition.'''
        diff = np.dot(X, theta)+b -y
    return (1./m) * np.dot(np.transpose(X), diff),np.mean(diff)

"""
def gradient_function(theta,b,X, y):
    gradient_theta=0
    gradient_bias=0
    for i in range(m):
        gradient_theta+=X[i]*(X[i]*theta+b-y[i])
        gradient_bias += (X[i] * theta + b - y[i])

    return gradient_theta/m,gradient_bias/m
"""


def gradient_descent(X, y, alpha):
    with tf.device("/gpu:0"):
        '''Perform gradient descent.'''
        theta = np.random.random((1,1))
        b=np.random.random((1,1))
        gradient_theta,gradient_bias = gradient_function(theta,b, X, y)
        while not np.all(np.absolute(gradient_theta)+np.absolute(gradient_bias) <= 1e-5):
            theta = theta - alpha * gradient_theta
            b=b-alpha*gradient_bias
            gradient_theta, gradient_bias = gradient_function(theta,b,X, y)
            print("loss:",error_function(theta,b,X, y))
    return theta,b

with tf.device("/gpu:0"):
    optimal = gradient_descent(X, y, alpha)

print('optimal:', optimal)
# print('error function:', error_function(optimal[0],optimal[1], X, y)[0,0])

plt.figure(figsize=(6,6))
plt.scatter(X,y,c='r',marker='o')

plt.plot(X,np.dot(X, optimal[0])+optimal[1])

plt.show()

tensorflow

# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

m=20
alpha=0.00005
epochs=20

with tf.device("/gpu:0"):
    X=np.linspace(1,100,m).reshape((m,1))
    y=5*X+3+np.random.randn(m,1)*10

input_x=tf.placeholder(tf.float32,(m,1))
output_y=tf.placeholder(tf.float32,(m,1))


# pred=tf.layers.dense(input_x,1)
b = tf.Variable(tf.zeros([1,1]))
W=tf.Variable(tf.ones([1,1]))
pred=tf.matmul(input_x,W)+b

loss=tf.losses.mean_squared_error(output_y,pred)
optimizer=tf.train.GradientDescentOptimizer(alpha)
train_op=optimizer.minimize(loss,var_list=tf.trainable_variables())

# Manually update the parameters
# Computing the gradient of cost with respect to W and b
grad_W, grad_b=tf.gradients(loss,[W,b])

# Gradient Step
new_W = W.assign(W - alpha * grad_W) # equivalent to W -= alpha * grad_W
new_b = b.assign(b - alpha * grad_b) # equivalent to b -= alpha * grad_b

# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
# sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

gpu_options = tf.GPUOptions(allow_growth=True)
# sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    tf.global_variables_initializer().run()

    for epoch in range(epochs):
        # _,loss_=sess.run([train_op,loss],{input_x:X,output_y:y})

        # Manual parameter update
        _, _,loss_= sess.run([new_W, new_b,loss],{input_x:X,output_y:y})


        print("loss:",loss_)

    plt.figure(figsize=(6,6))
    plt.scatter(X,y,c='r',marker='o')

    plt.plot(X,pred.eval({input_x:X}))

    plt.show()

julia

using LinearAlgebra # or import LinearAlgebra
using Statistics
using PyPlot

m=20;
alpha=0.0005;
epochs=20;

X=reshape(collect(range(1,stop=100,length=m)),(m,1));
y=5 .*X .+3 .+randn(m,1).*10;


function error_function(theta,b,X,y)
    diff=X.*theta.+b-y;
    return (1.0/(2*m))*dot(transpose(diff),diff);
end

gradient_function(theta,b,X, y)=begin
    diff=X.*theta.+b-y; #diff=dot(X,theta)+b-y;
    # or
    # diff=@. X*theta+b-y;
    return (1.0/m) .* dot(transpose(X), diff),mean(diff)
end 

gradient_descent=(X, y, alpha)->begin
    theta=randn(1,1);
    b=randn(1,1);
    gradient_theta,gradient_bias = gradient_function(theta,b, X, y);
    while abs(gradient_theta)+abs(gradient_bias) > 1e-5
        theta = theta .- alpha * gradient_theta;
        b = b .- alpha * gradient_bias;
        gradient_theta, gradient_bias = gradient_function(theta,b,X, y);
        println("loss:", error_function(theta,b,X, y));
    end
    # print("$theta,$b")
    return theta,b
end

optimal = gradient_descent(X, y, alpha);
print("optimal:$optimal");

figure(figsize=(6,6));
scatter(X,y,c="r",marker="o"); # note: strings must use double quotes

plot(X,X.*optimal[1].+optimal[2]);

PyPlot.show();
