1. Linear Regression; Softmax and Classification Models; Multilayer Perceptron
- Pay attention to the shapes of the data: x, y, y_hat. For example, a y_hat of shape (n, 1) and a y of shape (n,) broadcast to (n, n) in the squared loss unless y is reshaped first.
- When reading data, read it with a generator to save memory. One epoch traverses the entire dataset, vs. randomly drawing batch_size examples each time, which does not necessarily cover the whole dataset.
- Cross-entropy only cares about the predicted probability of the correct class (see the sketch below).
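A minimal sketch of that last point (the names here are illustrative, not the notes' training code): for integer labels y, the loss reads off only the correct-class entries of y_hat.
import torch

def cross_entropy(y_hat, y):
    # y_hat: (n, num_classes) predicted probabilities; y: (n,) integer class indices.
    # gather picks out y_hat[i, y[i]] for each row i; every other entry is ignored.
    return -torch.log(y_hat.gather(1, y.view(-1, 1))).mean()

y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.tensor([2, 0])
print(cross_entropy(y_hat, y))  # only y_hat[0, 2] and y_hat[1, 0] contribute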
2. Text Preprocessing; Language Models; Fundamentals of Recurrent Neural Networks
2.1 Add the special tokens pad, bos, eos, unk to the vocabulary in advance
if use_special_tokens:
    # padding, begin of sentence, end of sentence, unknown
    self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
    self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
else:
    self.unk = 0
    self.idx_to_token += ['<unk>']
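For context, the snippet above sits inside the Vocab class's __init__. A minimal sketch of the surrounding class, assuming the course's interface (the real version also counts and filters token frequencies, omitted here):
class Vocab:
    def __init__(self, tokens, use_special_tokens=False):
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += sorted(set(tokens))
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
    def __getitem__(self, token):
        # tokens never seen during construction fall back to the unk index
        return self.token_to_idx.get(token, self.unk)

vocab = Vocab(['the', 'time', 'machine'])
print(vocab['time'], vocab['tardis'])  # 3 0 -- the unseen word maps to unk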
2.2 Sampling time-series data: random sampling and consecutive sampling
1) In random sampling, each example is a sequence cut from an arbitrary position in the original sequence, so two adjacent random minibatches are not necessarily adjacent in the original sequence.
import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # Subtract 1 because for a sequence of length n, X can contain at most its first n - 1 characters
    num_examples = (len(corpus_indices) - 1) // num_steps  # floor division: number of non-overlapping examples
    example_indices = [i * num_steps for i in range(num_examples)]  # index in corpus_indices of each example's first character
    random.shuffle(example_indices)

    def _data(i):
        # return the sequence of length num_steps starting at position i
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(0, num_examples, batch_size):
        # draw batch_size random examples each time
        batch_indices = example_indices[i: i + batch_size]  # first-character indices of the examples in this batch
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
# X: tensor([[ 6, 7, 8, 9, 10, 11],
# [ 0, 1, 2, 3, 4, 5]])
# Y: tensor([[ 7, 8, 9, 10, 11, 12],
# [ 1, 2, 3, 4, 5, 6]])
# X: tensor([[12, 13, 14, 15, 16, 17],
# [18, 19, 20, 21, 22, 23]])
# Y: tensor([[13, 14, 15, 16, 17, 18],
# [19, 20, 21, 22, 23, 24]])
2) In consecutive sampling, two adjacent random minibatches are adjacent in the original sequence, so the hidden state at the end of one minibatch can be used to initialize the next.
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_len = len(corpus_indices) // batch_size * batch_size  # length of the retained sequence
    # e.g. len(corpus_indices) = 11, batch_size = 2: the retained length is 10
    corpus_indices = corpus_indices[: corpus_len]  # keep only the first corpus_len characters, i.e. [0, ..., 9]
    indices = torch.tensor(corpus_indices, device=device)
    indices = indices.view(batch_size, -1)  # reshape to (batch_size, corpus_len // batch_size)
    # tensor([[0, 1, 2, 3, 4],
    #         [5, 6, 7, 8, 9]])
    batch_num = (indices.shape[1] - 1) // num_steps
    # number of minibatches per row; subtract 1 because X can contain at most the first n - 1 characters
    for i in range(batch_num):  # slide a window of width num_steps across the columns of all rows at once
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y
new_seq = list(range(11))
for X, Y in data_iter_consecutive(new_seq, batch_size=2, num_steps=2):
    print('X: ', X, '\nY:', Y, '\n')
# X: tensor([[0, 1],
# [5, 6]])
# Y: tensor([[1, 2],
# [6, 7]])
# X: tensor([[2, 3],
# [7, 8]])
# Y: tensor([[3, 4],
# [8, 9]])
2.3.1 Index -> one-hot word vector
def one_hot(x, n_class, dtype=torch.float32):
    result = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)  # shape: (n, n_class)
    result.scatter_(1, x.long().view(-1, 1), 1)  # result[i, x[i, 0]] = 1
    return result
# ``self.long()`` is equivalent to ``self.to(torch.int64)``
# scatter_(dim, index, src) -> Tensor
'''
>>> x = torch.rand(2, 5)
>>> x
tensor([[ 0.3992,  0.2908,  0.9044,  0.4850,  0.6004],
        [ 0.5735,  0.9006,  0.6797,  0.4152,  0.1732]])
>>> torch.zeros(3, 5).scatter_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x)  # dim=0: index gives row indices
tensor([[ 0.3992,  0.9006,  0.6797,  0.4850,  0.6004],
        [ 0.0000,  0.2908,  0.0000,  0.4152,  0.0000],
        [ 0.5735,  0.0000,  0.9044,  0.0000,  0.1732]])
>>> z = torch.zeros(2, 4).scatter_(1, torch.tensor([[2], [3]]), 1.23)  # dim=1: index gives column indices
>>> z
tensor([[ 0.0000,  0.0000,  1.2300,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  1.2300]])
'''
x = torch.tensor([1, 2])
one_hot(x, 3)
# tensor([[0., 1., 0.],
# [0., 0., 1.]])
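For reference, recent PyTorch versions ship an equivalent built-in, torch.nn.functional.one_hot; it returns int64, so cast to float to match the hand-rolled version above.
import torch.nn.functional as F
F.one_hot(x, num_classes=3).float()
# tensor([[0., 1., 0.],
#         [0., 0., 1.]])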
2.3.2 A minibatch has shape (batch_size, num_steps) -- reshape -> num_steps matrices of shape (batch_size, vocab_size)
def to_onehot(X, n_class):  # X shape: [batch_size, num_steps]
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]

X = torch.arange(10).view(2, 5)  # batch_size = 2, num_steps = 5
inputs = to_onehot(X, vocab_size)  # returns a list of num_steps elements, each of shape [batch_size, vocab_size]
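A quick shape check (vocab_size is whatever the vocabulary from 2.1 yields; an arbitrary value stands in here for illustration):
vocab_size = 10  # illustrative stand-in for the real vocabulary size
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)  # 5 torch.Size([2, 10])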
2.3.3 Initialize model parameters
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# num_inputs: d
# num_hiddens: h, the number of hidden units, a hyperparameter
# num_outputs: q

def get_params():
    def _one(shape):
        param = torch.zeros(shape, device=device, dtype=torch.float32)
        nn.init.normal_(param, 0, 0.01)
        return torch.nn.Parameter(param)
    # hidden layer parameters
    W_xh = _one((num_inputs, num_hiddens))  # W must not be initialized to all zeros
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, device=device))  # b may be initialized to zeros
    # output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))
    return (W_xh, W_hh, b_h, W_hq, b_q)
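A quick sanity check on the returned shapes (with the illustrative vocab_size = 10 from above, so num_inputs = num_outputs = 10 and num_hiddens = 256):
W_xh, W_hh, b_h, W_hq, b_q = get_params()
print(W_xh.shape, W_hh.shape, W_hq.shape)
# torch.Size([10, 256]) torch.Size([256, 256]) torch.Size([256, 10])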
TBC.