#知识点:
序列数据的采样方法:(1)随机采样 (2)顺序采样
1.随机采样
import random
import torch
def seq_data_iter_random(corous, batch_size, num_steps):
    """Yield minibatches of subsequences drawn from random positions.

    A random prefix of the sequence is dropped, the remainder is cut into
    non-overlapping windows of ``num_steps`` tokens, and the window start
    positions are shuffled before being grouped into minibatches. Windows
    that share a minibatch are therefore not necessarily adjacent in the
    original sequence.

    Args:
        corous: Sliceable sequence of tokens (the demo passes a list of
            ints). The misspelled name is kept so keyword callers still work.
        batch_size: Number of subsequences in each minibatch.
        num_steps: Number of tokens in each subsequence.

    Yields:
        A pair ``(X, Y)`` of tensors built from ``batch_size`` windows of
        ``num_steps`` tokens each. ``Y`` holds the same windows shifted one
        position forward, i.e. the next-token label for every entry of ``X``.
        Windows that do not fill a complete minibatch are dropped.
    """
    # Drop a random prefix. random.randint is inclusive on both ends, so the
    # offset lies in [0, num_steps - 1].
    corous = corous[random.randint(0, num_steps - 1):]

    # The "- 1" reserves the final token: it has no successor, so it can only
    # ever serve as a label, never as an input.
    num_subseqs = (len(corous) - 1) // num_steps

    # Start index of every window, then shuffled in place (shuffle returns None).
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        """Return the window of ``num_steps`` tokens starting at ``pos``."""
        return corous[pos: pos + num_steps]

    # Number of complete minibatches that can be formed.
    num_batches = num_subseqs // batch_size
    for i in range(0, num_batches * batch_size, batch_size):
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        # The label of each token is simply the token that follows it.
        Y = [data(j + 1) for j in initial_indices_per_batch]
        # Using yield makes this a generator: minibatches are produced lazily,
        # one per iteration, instead of being materialised all at once.
        yield torch.tensor(X), torch.tensor(Y)
#测试代码:
# Demo: sample minibatches from the toy sequence 0..34 and print each pair.
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
测试输出结果:
X: tensor([[17, 18, 19, 20, 21], [27, 28, 29, 30, 31]])
Y: tensor([[18, 19, 20, 21, 22], [28, 29, 30, 31, 32]])
X: tensor([[12, 13, 14, 15, 16], [ 2, 3, 4, 5, 6]])
Y: tensor([[13, 14, 15, 16, 17], [ 3, 4, 5, 6, 7]])
X: tensor([[ 7, 8, 9, 10, 11], [22, 23, 24, 25, 26]])
Y: tensor([[ 8, 9, 10, 11, 12], [23, 24, 25, 26, 27]])
2.顺序采样
# 顺序采样
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Yield minibatches of subsequences using sequential partitioning.

    After a random starting offset, the usable tokens are laid out as
    ``batch_size`` rows and the rows are walked left to right in steps of
    ``num_steps``. Row ``r`` of one minibatch is therefore continued by row
    ``r`` of the next minibatch.

    Args:
        corpus: Sliceable sequence of tokens (the demo passes a list of ints).
        batch_size: Number of rows (subsequences) in each minibatch.
        num_steps: Number of tokens in each subsequence.

    Yields:
        A pair ``(X, Y)`` of tensors with ``batch_size`` rows and
        ``num_steps`` columns. ``Y`` holds the same positions shifted one
        token forward, i.e. the next-token label for every entry of ``X``.
        Nothing is yielded when the corpus is too short to fill one minibatch.
    """
    # random.randint is inclusive on both ends, so the offset is in
    # [0, num_steps].
    offset = random.randint(0, num_steps)

    # Largest multiple of batch_size that fits after the offset while leaving
    # one trailing token for the labels. Clamp at zero so a corpus that is too
    # short produces empty tensors rather than a negative slice length.
    num_tokens = max(0, (len(corpus) - offset - 1) // batch_size) * batch_size
    tokens_per_row = num_tokens // batch_size

    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + num_tokens + 1])

    # Use an explicit column count: reshape(batch_size, -1) cannot infer the
    # missing dimension for a 0-element tensor and would raise RuntimeError.
    Xs = Xs.reshape(batch_size, tokens_per_row)
    Ys = Ys.reshape(batch_size, tokens_per_row)

    num_batches = tokens_per_row // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y
#测试代码:
# Demo: walk the toy sequence 0..34 sequentially and print each pair.
my_seq = list(range(35))
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
测试输出结果:
X: tensor([[ 4, 5, 6, 7, 8], [19, 20, 21, 22, 23]])
Y: tensor([[ 5, 6, 7, 8, 9], [20, 21, 22, 23, 24]])
X: tensor([[ 9, 10, 11, 12, 13], [24, 25, 26, 27, 28]])
Y: tensor([[10, 11, 12, 13, 14], [25, 26, 27, 28, 29]])
X: tensor([[14, 15, 16, 17, 18], [29, 30, 31, 32, 33]])
Y: tensor([[15, 16, 17, 18, 19], [30, 31, 32, 33, 34]])