Linear Regression, Softmax and Classification Models, and Multilayer Perceptron

Linear Regression

Implementing linear regression from scratch
Import the required packages and modules
%matplotlib inline
import torch
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import random

print(torch.__version__)

1.3.1

Generate the dataset
# set input feature number 
num_inputs = 2
# set example number
num_examples = 1000

# set true weight and bias in order to generate corresponded label
true_w = [2, -3.4]
true_b = 4.2

features = torch.randn(num_examples, num_inputs,
                      dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()),
                       dtype=torch.float32)
Use a plot to visualize the generated data
plt.scatter(features[:, 1].numpy(), labels.numpy(), 1)

[Figure: scatter plot of features[:, 1] against labels]

Read the dataset
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # shuffle the indices so samples are read in random order
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])  # the last batch may contain fewer than batch_size samples
        yield  features.index_select(0, j), labels.index_select(0, j)
batch_size = 10

for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break

tensor([[-0.1904, 1.0174],
        [-1.1366, 0.8831],
        [ 0.5497, 0.0321],
        [ 0.5849, -0.7822],
        [-2.3851, 1.1355],
        [ 1.4613, -0.4030],
        [-0.5366, 1.4289],
        [-0.2079, -1.4442],
        [-0.3157, -1.3547],
        [ 0.1486, 0.8594]])
 tensor([ 0.3688, -1.0770, 5.1813, 8.0217, -4.4280, 8.5005, -1.7360, 8.6975,
         8.1616, 1.5916])

Initialize model parameters
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)

w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)

tensor([0.], requires_grad=True)

Define the model
def linreg(X, w, b):
    return torch.mm(X, w) + b
Define the loss function
def squared_loss(y_hat, y): 
    return (y_hat - y.view(y_hat.size())) ** 2 / 2
Define the optimization algorithm
def sgd(params, lr, batch_size): 
    for param in params:
        param.data -= lr * param.grad / batch_size # use .data to operate param without gradient track
Train the model
# hyperparameter initialization
lr = 0.03
num_epochs = 5

net = linreg
loss = squared_loss

# training
for epoch in range(num_epochs):  # training repeats num_epochs times
    # in each epoch, all the samples in dataset will be used once
    
    # X is the feature and y is the label of a batch sample
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()  
        # calculate the gradient of batch sample loss 
        l.backward()  
        # update the model parameters with mini-batch stochastic gradient descent
        sgd([w, b], lr, batch_size)  
        # reset parameter gradient
        w.grad.data.zero_()
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))

epoch 1, loss 0.031837
epoch 2, loss 0.000110
epoch 3, loss 0.000050
epoch 4, loss 0.000050
epoch 5, loss 0.000050

w, true_w, b, true_b

(tensor([[ 2.0003],
         [-3.4003]], requires_grad=True),
 [2, -3.4],
 tensor([4.1998], requires_grad=True),
 4.2)

Concise implementation of linear regression with PyTorch
Import the required packages and modules
import torch
from torch import nn
import numpy as np
torch.manual_seed(1)

print(torch.__version__)
torch.set_default_tensor_type('torch.FloatTensor')

1.3.1

Generate the dataset
num_inputs = 2
num_examples = 1000

true_w = [2, -3.4]
true_b = 4.2

features = torch.tensor(np.random.normal(0, 1, (num_examples, num_inputs)), dtype=torch.float)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
Read the dataset
import torch.utils.data as Data

batch_size = 10

# combine the features and labels of the dataset
dataset = Data.TensorDataset(features, labels)

# put dataset into DataLoader
data_iter = Data.DataLoader(
    dataset=dataset,            # torch TensorDataset format
    batch_size=batch_size,      # mini batch size
    shuffle=True,               # whether shuffle the data or not
    num_workers=2,              # number of worker processes used to load the data
)
for X, y in data_iter:
    print(X, '\n', y)
    break

tensor([[-0.1686, 0.4017],
        [ 0.2182, -0.3702],
        [-0.4322, -0.1381],
        [-0.2643, 0.1469],
        [-0.3553, -0.9967],
        [ 0.1340, 0.4509],
        [-0.2119, 1.7887],
        [-0.7555, -0.8219],
        [ 1.7160, 0.0423],
        [-0.4883, 0.3018]])
 tensor([ 2.4891, 5.9067, 3.8126, 3.1779, 6.8955, 2.9154, -2.3233, 5.4826,
         7.5139, 2.2167])

Define the model
class LinearNet(nn.Module):
    def __init__(self, n_feature):
        super(LinearNet, self).__init__()      # call the parent class constructor
        self.linear = nn.Linear(n_feature, 1)  # function prototype: `torch.nn.Linear(in_features, out_features, bias=True)`

    def forward(self, x):
        y = self.linear(x)
        return y
    
net = LinearNet(num_inputs)
print(net)

LinearNet(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)

# ways to init a multilayer network
# method one
net = nn.Sequential(
    nn.Linear(num_inputs, 1)
    # other layers can be added here
    )

# method two
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module ......

# method three
from collections import OrderedDict
net = nn.Sequential(OrderedDict([
          ('linear', nn.Linear(num_inputs, 1))
          # ......
        ]))

print(net)
print(net[0])

Sequential(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)
Linear(in_features=2, out_features=1, bias=True)

Initialize model parameters
from torch.nn import init

init.normal_(net[0].weight, mean=0.0, std=0.01)
init.constant_(net[0].bias, val=0.0)  # or you can use `net[0].bias.data.fill_(0)` to modify it directly

Parameter containing:
tensor([0.], requires_grad=True)

for param in net.parameters():
    print(param)

Parameter containing:
tensor([[-0.0071, 0.0026]], requires_grad=True)
Parameter containing:
tensor([0.], requires_grad=True)

Define the loss function
loss = nn.MSELoss()    # nn built-in squared loss function
                       # function prototype: `torch.nn.MSELoss(size_average=None, reduce=None, reduction='mean')`
Define the optimizer
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=0.03)   # built-in stochastic gradient descent optimizer
print(optimizer)  # function prototype: `torch.optim.SGD(params, lr=, momentum=0, dampening=0, weight_decay=0, nesterov=False)`

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.03
    momentum: 0
    nesterov: False
    weight_decay: 0
)

Train the model
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        output = net(X)
        l = loss(output, y.view(-1, 1))
        optimizer.zero_grad() # reset gradient, equal to net.zero_grad()
        l.backward()
        optimizer.step()
    print('epoch %d, loss: %f' % (epoch, l.item()))

epoch 1, loss: 0.000193
epoch 2, loss: 0.000135
epoch 3, loss: 0.000083

# result comparison
dense = net[0]
print(true_w, dense.weight.data)
print(true_b, dense.bias.data)

[2, -3.4] tensor([[ 1.9996, -3.4014]])
4.2 tensor([4.2006])

Exercises

1 Suppose you are implementing a fully connected layer whose input shape is 7×8 and output shape is 7×1, where 7 is the batch size. What are the shapes of the weight parameter w and the bias parameter b?

Answer: 8×1 and 1×1.

The shapes of the parameters are independent of the batch size, so the same model can be used with different batch sizes, as the short check below shows.
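
A quick check, as a minimal sketch using PyTorch's nn.Linear (which stores the weight transposed, as [out_features, in_features]):

import torch
from torch import nn

fc = nn.Linear(in_features=8, out_features=1)  # fully connected layer: 8 inputs -> 1 output
X = torch.randn(7, 8)                          # a batch of 7 examples

print(fc.weight.shape)  # torch.Size([1, 8]): 8 weight elements, i.e. an 8x1 matrix up to transposition
print(fc.bias.shape)    # torch.Size([1]): a single bias, independent of the batch size
print(fc(X).shape)      # torch.Size([7, 1]): the batch size 7 only appears in the output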

2 Given the following loss function:

def squared_loss(y_hat, y):
	return (y_hat - y.view(y_hat.size())) ** 2 / 2

Which of the following, used as the return value instead, would cause the model to fail to train?
A (y_hat.view(-1) - y) ** 2 / 2
B (y_hat - y.view(-1)) ** 2 / 2
C (y_hat - y.view(y_hat.shape)) ** 2 / 2
D (y_hat - y.view(-1, 1)) ** 2 / 2

Answer: B.

The point of this question is that y_hat and y have different shapes: y_hat is [n, 1] while y is [n], and y.view(y_hat.size()) reshapes y from [n] to [n, 1]. In option B, y_hat keeps shape [n, 1] while y.view(-1) has shape [n], so the subtraction broadcasts to an [n, n] matrix instead of an element-wise difference, which is why the model cannot train (see the sketch after the examples below).

(1) .view(-1) flattens [n, 1] into [n]. For example:

import torch
 
a = torch.Tensor(4,1)
print(a)
print(a.view(-1))
print(a.shape)
print(a.view(-1).shape)

tensor([[-3.6894e+19],
        [ 3.2001e+01],
        [ 8.5920e+09],
        [ 1.8654e-40]])
tensor([-3.6894e+19, 3.2001e+01, 8.5920e+09, 1.8654e-40])
torch.Size([4, 1])
torch.Size([4])

(2) .view(-1, 1) turns [n] into [n, 1]; in this question it is equivalent to y.view(y_hat.size()) and y.view(y_hat.shape).

import torch
 
a = torch.Tensor(4)
print(a)
print(a.view(-1,1))
print(a.shape)
print(a.view(-1,1).shape)

tensor([2.3694e-38, 3.2001e+01, 8.5920e+09, 1.8654e-40])
tensor([[2.3694e-38],
        [3.2001e+01],
        [8.5920e+09],
        [1.8654e-40]])
torch.Size([4])
torch.Size([4, 1])
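
To make the failure mode in option B concrete, here is a minimal sketch (with a hypothetical batch size n = 4): subtracting an [n] tensor from an [n, 1] tensor broadcasts to an [n, n] matrix, so each prediction is compared against every label instead of its own.

import torch

n = 4
y_hat = torch.randn(n, 1)  # predictions, shape [n, 1]
y = torch.randn(n)         # labels, shape [n]

print(((y_hat - y.view(-1)) ** 2 / 2).shape)     # option B: torch.Size([4, 4]) -- broadcast, breaks training
print(((y_hat.view(-1) - y) ** 2 / 2).shape)     # option A: torch.Size([4])    -- element-wise, fine
print(((y_hat - y.view(-1, 1)) ** 2 / 2).shape)  # option D: torch.Size([4, 1]) -- element-wise, fine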

3 In a linear regression model, for a batch of size 3, the predicted and true values of the labels are as shown in the following table:
[Table image: predicted vs. true label values for the three samples in the batch]
What is the average value of the loss function over this batch (rounded to three decimal places)?

Answer: 0.112.

Compute (y − ŷ)² / 2 for each sample, sum over the three samples, and then take the average.
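
Written out, with ŷᵢ the predicted value and yᵢ the true value of the i-th sample from the table:

loss = (1/3) × [ (y₁ − ŷ₁)²/2 + (y₂ − ŷ₂)²/2 + (y₃ − ŷ₃)²/2 ] ≈ 0.112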

Softmax and Classification Models

Exercises

softmax([100, 101, 102]) equals which of the following?
A softmax([10.0, 10.1, 10.2])
B softmax([-100, -101, -102])
C softmax([-2, -1, 0])
D softmax([1000, 1010, 1020])

Answer: C.

softmax exponentiates each entry, e^100, e^101, e^102, and divides each by the sum e^100 + e^101 + e^102. Adding the same constant to every input multiplies numerator and denominator by the same factor, so the result is unchanged: subtracting 102 from each entry gives softmax([-2, -1, 0]), i.e. option C. Option D stretches the gaps between the entries by a factor of 10, so its result is different.
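
A quick numerical check (a minimal sketch; the helper below subtracts the maximum before exponentiating for numerical stability, which does not change the result):

import torch

def softmax(x):
    x = torch.tensor(x, dtype=torch.float64)
    x = x - x.max()          # shifting all inputs by a constant leaves softmax unchanged
    e = torch.exp(x)
    return e / e.sum()

print(softmax([100, 101, 102]))     # tensor([0.0900, 0.2447, 0.6652], dtype=torch.float64)
print(softmax([-2, -1, 0]))         # the same three values (option C)
print(softmax([1000, 1010, 1020]))  # different: almost all of the mass is on the last entry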

Multilayer Perceptron

A multilayer perceptron (MLP) is a neural network built from fully connected layers with at least one hidden layer, where the output of each hidden layer is transformed by an activation function.
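
For the single-hidden-layer case used below, with an input batch X, hidden-layer parameters W1 and b1, output-layer parameters W2 and b2, and activation function φ (here ReLU), the computation is

H = φ(X·W1 + b1)
O = H·W2 + b2

which is exactly what the from-scratch net function below implements.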

Implementing a multilayer perceptron from scratch

We use the Fashion-MNIST dataset and classify its images with a multilayer perceptron. Images in Fashion-MNIST have shape 28×28, and there are 10 classes.

Import the required packages and modules
%matplotlib inline
import d2lzh as d2l
from mxnet import nd
from mxnet.gluon import loss as gloss
Obtain and read the data
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
Define model parameters
num_inputs, num_outputs, num_hiddens = 784, 10, 256

W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens))
b1 = nd.zeros(num_hiddens)
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens, num_outputs))
b2 = nd.zeros(num_outputs)
params = [W1, b1, W2, b2]

for param in params:
    param.attach_grad()
Define the activation function
def relu(X):
    return nd.maximum(X, 0)
Define the model
def net(X):
    X = X.reshape((-1, num_inputs))
    H = relu(nd.dot(X, W1) + b1)
    return nd.dot(H, W2) + b2
Define the loss function
loss = gloss.SoftmaxCrossEntropyLoss()
Train the model
num_epochs, lr = 5, 0.5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params, lr)

epoch 1, loss 0.7971, train acc 0.703, test acc 0.830
epoch 2, loss 0.4974, train acc 0.816, test acc 0.850
epoch 3, loss 0.4273, train acc 0.841, test acc 0.863
epoch 4, loss 0.3912, train acc 0.855, test acc 0.866
epoch 5, loss 0.3699, train acc 0.863, test acc 0.871

Concise implementation of the multilayer perceptron
Import the required packages and modules
import d2lzh as d2l
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn
Define the model
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
Read the data and train the model
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})
num_epochs = 5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None,
              None, trainer)
Exercises

1 Which of the following statements about activation functions is incorrect?
A Activation functions are introduced into multilayer perceptrons because stacking several linear layers without activation functions has the same expressive power as a single linear layer.
B tanh can be obtained from sigmoid by shifting and scaling, so there is no difference between the two.
C Compared with sigmoid and tanh, the main advantages of ReLU are that it is computationally efficient and does not suffer from the vanishing gradient problem.
D If the network output needs to lie in the range [0, 1], the sigmoid function is a reasonable choice.

Answer: B. Although tanh(x) = 2·sigmoid(2x) − 1, so tanh is indeed a shifted and scaled sigmoid, the two functions still differ: for example, their output ranges are (−1, 1) and (0, 1), so they are not interchangeable.

2 For a multilayer perceptron with a single hidden layer, whose input is a 256×256 image, with 1000 hidden units and 10 output classes, what is the total number of elements in all of the model's weight matrices Wi?

Answer: 65,546,000.

Computed as 256 × 256 × 1000 + 1000 × 10 = 65,536,000 + 10,000 = 65,546,000 (bias parameters are not weight-matrix elements and are not counted).
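
A quick sanity check, as a minimal PyTorch sketch of the same architecture that counts only the weight elements (biases are excluded):

import torch.nn as nn

# hidden layer: 256*256 -> 1000, output layer: 1000 -> 10
net = nn.Sequential(nn.Linear(256 * 256, 1000), nn.ReLU(), nn.Linear(1000, 10))
num_weight_elements = sum(p.numel() for name, p in net.named_parameters() if 'weight' in name)
print(num_weight_elements)  # 65546000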

Sources:
Boyu Learning Platform - ElitesAI, Dive into Deep Learning (PyTorch edition) - Linear Regression
https://www.boyuai.com/elites/course/cZu18YmweLv10OeV/jupyter/FUT2TsxGNn4g4JY1ayb1W

Boyu Learning Platform - ElitesAI, Dive into Deep Learning (PyTorch edition) - Softmax and Classification Models
https://www.boyuai.com/elites/course/cZu18YmweLv10OeV/jupyter/sgHW6P_Qf9EfSEMnIfBJx

Boyu Learning Platform - ElitesAI, Dive into Deep Learning (PyTorch edition) - Multilayer Perceptron
https://www.boyuai.com/elites/course/cZu18YmweLv10OeV/jupyter/U-WdzWhU6C29MaLj0udI5

Dive into Deep Learning - Chapter 3 Deep Learning Basics - 3.8 Multilayer Perceptron
http://zh.gluon.ai/chapter_deep-learning-basics/mlp.html
