1. 公式推导
原式
$$\begin{aligned} J(\theta) &= \frac{1}{2}\left( X\theta - Y \right)^T \left( X\theta - Y \right) \\ &= \frac{1}{2}\left( \theta^T X^T - Y^T \right)\left( X\theta - Y \right) \\ &= \frac{1}{2}\left( \theta^T X^T X \theta - \theta^T X^T Y - Y^T X \theta + Y^T Y \right) \end{aligned}$$
求导
求导知识
$$\frac{d\, X^T A X}{dX} = 2AX \quad \text{(当 } A \text{ 为对称矩阵时成立;一般情形为 } (A + A^T)X \text{。下文中 } A = X^TX \text{ 是对称的)}$$
$$\frac{d\, X^T A}{dX} = A$$
$$\frac{d\, AX}{dX} = A^T$$
开始求导
$$\begin{aligned} \frac{dJ(\theta)}{d\theta} &= \frac{1}{2}\left( \frac{d\,\theta^T X^T X \theta}{d\theta} - \frac{d\,\theta^T X^T Y}{d\theta} - \frac{d\,Y^T X \theta}{d\theta} + \frac{d\,Y^T Y}{d\theta} \right) \\ &= \frac{1}{2}\left( 2X^T X \theta - X^T Y - X^T Y \right) \\ &= X^T X \theta - X^T Y \end{aligned}$$
令导数为0
$$X^T X \theta - X^T Y = 0$$
$$\theta = (X^T X)^{-1} X^T Y \quad \text{(要求 } X^TX \text{ 可逆,即 } X \text{ 列满秩)}$$
2. torch实现
import torch
from torch import nn
class LinearReg(nn.Module):
    """Single-layer linear regression model: out = x @ W^T + b."""

    def __init__(self, input_dim, out_dim):
        super(LinearReg, self).__init__()
        # Kept for introspection; nn.Linear holds the actual parameters.
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.linear = nn.Linear(input_dim, out_dim, bias=True)

    def forward(self, x):
        """Apply the affine map to a [batch, input_dim] tensor."""
        return self.linear(x)
def main():
    """Fit y = 3*x_1 + 2*x_2 + 5 with a one-layer model and SGD."""
    data = torch.randn(100, 2) * 10  # [100, 2] feature matrix
    # Ground truth: y = 3*x_1 + 2*x_2 + 5
    weight = torch.tensor([[3.], [2]])  # [2, 1]
    bias = torch.tensor([[5.]])  # [1, 1]
    # Build the dataset with some noise.
    # BUG FIX: the noise must be [100, 1] to match data @ weight + bias.
    # The original randn(100, 2) broadcast y to [100, 2], so MSELoss was
    # computed against a wrongly-shaped, duplicated target.
    y = (data @ weight + bias) + torch.randn(100, 1) * 2
    model = LinearReg(input_dim=2, out_dim=1)
    loss_func = nn.MSELoss(reduction="mean")
    optimizer = torch.optim.SGD(lr=5e-3, params=model.parameters())
    epochs = 1000
    for step in range(epochs):
        pred = model(data)  # [100, 1]
        loss = loss_func(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (step + 1) % 10 == 0:
            # Print step+1 so the displayed count matches the modulo condition.
            print(f"{step + 1}/{epochs} steps, loss: {loss.item():.4f}")
    print("train finished")
    # Show the learned parameters (should approach weight=[3, 2], bias=5).
    print("训练后模型权重如下")
    print("weight", model.linear.weight)
    print("bias", model.linear.bias)


if __name__ == "__main__":
    main()
输出如下
手动求梯度实现
import torch
def diff_mse(x, y, w):
    """Analytic gradient of the half-MSE loss w.r.t. `w`.

    grad = x^T @ (x @ w - y) / batch_size
    """
    residual = x @ w - y
    return (x.t() @ residual) / x.shape[0]
def mse_loss(x, y, w):
    """Half mean-squared error: 0.5 * mean((x @ w - y)^2)."""
    err = x @ w - y
    return 0.5 * (err ** 2).mean()
def get_batch_data(x, y, batch_size, step):
    """Return the `step`-th mini-batch of (x, y), clamped at the data end."""
    n = x.shape[0]
    lo = step * batch_size
    hi = min(lo + batch_size, n)
    return x[lo:hi], y[lo:hi]
def train(epochs, batch_size, lr):
    """Train linear regression with the hand-derived MSE gradient.

    Args:
        epochs: number of passes over the dataset.
        batch_size: mini-batch size.
        lr: learning rate for plain gradient descent.
    """
    data = torch.randn(100, 2) * 2  # [100, 2]
    # Ground truth: y = 3*x_1 + 2*x_2 (+ Gaussian noise)
    weight = torch.tensor([[3.], [2]])  # [2, 1]
    y = data @ weight + torch.randn(100, 1) * 2
    param_w = torch.randn(2, 1)  # randomly initialized parameters
    # BUG FIX: use ceiling division so the final partial batch is not
    # silently dropped (floor division ignored data_len % batch_size
    # samples every epoch; get_batch_data already clamps the end index).
    steps = (data.shape[0] + batch_size - 1) // batch_size
    for epoch in range(epochs):
        for step in range(steps):
            x, lb = get_batch_data(data, y, batch_size, step)
            loss = mse_loss(x, lb, param_w)
            grad = diff_mse(x, lb, param_w)
            # Plain gradient-descent update (rebinds param_w, no autograd).
            param_w = param_w - lr * grad
            if step % 10 == 0:
                print(f"epoch:{epoch}; step:{step}; loss:{loss.item()}")
    print(f"train finished, param w: {param_w}")


if __name__ == "__main__":
    train(epochs=200, batch_size=8, lr=5e-4)