1. Sequence model
import torch
from torch import nn
from d2l import torch as d2l


def init_weights(m):
    """Initialize network weights."""
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)


def get_net():
    """A simple multilayer perceptron."""
    # nn.Sequential is PyTorch's container for chaining modules; it runs them
    # in the order they are passed to the constructor.
    net = nn.Sequential(nn.Linear(4, 10), nn.ReLU(), nn.Linear(10, 1))
    # net.apply(init_weights) applies init_weights to every submodule,
    # initializing the weights of all linear layers.
    net.apply(init_weights)
    return net


def train(net, train_iter, loss, epochs, lr):
    trainer = torch.optim.Adam(net.parameters(), lr)
    for epoch in range(epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            l = loss(net(X), y)
            l.sum().backward()
            trainer.step()
        print(f'epoch {epoch + 1}, '
              f'loss: {d2l.evaluate_loss(net, train_iter, loss):f}')


if __name__ == '__main__':
    T = 1000  # number of generated points
    time = torch.arange(1, T + 1, dtype=torch.float32)
    x = torch.sin(0.01 * time) + torch.normal(0, 0.2, (T,))
    # d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

    tau = 4  # window size: each example uses the previous tau observations
    # Create a zero tensor `features` of shape (T - tau, tau), where T is the
    # length of x. Each row of `features` is one time window of length tau.
    features = torch.zeros((T - tau, tau))
    # Fill column i of `features` with the subsequence of x from index i to
    # T - tau + i, so row t holds x[t], ..., x[t + tau - 1].
    for i in range(tau):
        features[:, i] = x[i: T - tau + i]
    labels = x[tau:].reshape(-1, 1)

    batch_size, n_train = 16, 600
    train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                                batch_size, is_train=True)
    # The `reduction` argument controls how per-sample losses are aggregated.
    # With reduction='none' no aggregation happens: for a batch of inputs the
    # loss returns a tensor with one loss value per sample. For a single
    # scalar, use reduction='mean' (average over samples) or reduction='sum'
    # (total over samples).
    loss = nn.MSELoss(reduction='none')
    net = get_net()
    train(net, train_iter, loss, 5, 0.01)

    onestep_preds = net(features)

    multistep_preds = torch.zeros(T)
    multistep_preds[: n_train + tau] = x[: n_train + tau]
    # multistep_preds[i - tau:i] takes the tau predicted values immediately
    # before time step i. reshape(1, -1) turns them into a (1, tau) tensor to
    # match the network's input of (batch size, number of features); feeding
    # this to net yields the prediction for time step i.
    for i in range(n_train + tau, T):
        multistep_preds[i] = net(multistep_preds[i - tau:i].reshape(1, -1))

    d2l.plot([time, time[tau:], time[n_train + tau:]],
             [x.detach().numpy(), onestep_preds.detach().numpy(),
              multistep_preds[n_train + tau:].detach().numpy()],
             'time', 'x',
             legend=['data', '1-step preds', 'multistep preds'],
             xlim=[1, 1000], figsize=(6, 3))

    max_steps = 64
    features1 = torch.zeros((T - tau - max_steps + 1, tau + max_steps))
    # Column i (i < tau) holds observations from x, covering time steps
    # i + 1 to i + T - tau - max_steps + 1.
    for i in range(tau):
        features1[:, i] = x[i: i + T - tau - max_steps + 1]
    # Column i (tau <= i < tau + max_steps) holds (i - tau + 1)-step-ahead
    # predictions, covering time steps i + 1 to i + T - tau - max_steps + 1.
    for i in range(tau, tau + max_steps):
        # .reshape(-1) flattens a tensor to one dimension: -1 tells PyTorch to
        # infer that dimension's size so the total number of elements stays
        # the same. It is the usual idiom when you want a 1-D view without
        # spelling out the exact length.
        features1[:, i] = net(features1[:, i - tau:i]).reshape(-1)

    steps = (1, 4, 16, 64)
    d2l.plot([time[tau + j - 1: T - max_steps + j] for j in steps],
             [features1[:, (tau + j - 1)].detach().numpy() for j in steps],
             'time', 'x', legend=[f'{j}-step preds' for j in steps],
             xlim=[5, 1000], figsize=(6, 3))
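To make the comments about the sliding window and the `reduction` argument concrete, here is a small standalone check with toy values of my own (not from the book); the windowing follows the same tau convention as above:

import torch
from torch import nn

# Toy sequence 0..9 with the same windowing as above (tau = 4).
x = torch.arange(10, dtype=torch.float32)
tau = 4
features = torch.zeros((len(x) - tau, tau))
for i in range(tau):
    features[:, i] = x[i: len(x) - tau + i]
labels = x[tau:].reshape(-1, 1)
print(features[0], labels[0])  # tensor([0., 1., 2., 3.]) tensor([4.])

# reduction='none' keeps one loss per sample; 'mean'/'sum' aggregate.
pred = torch.tensor([1., 2.])
target = torch.tensor([0., 0.])
print(nn.MSELoss(reduction='none')(pred, target))  # tensor([1., 4.])
print(nn.MSELoss(reduction='mean')(pred, target))  # tensor(2.5000)
print(nn.MSELoss(reduction='sum')(pred, target))   # tensor(5.)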
2. Language model
import random
import collections
import re
import torch
from d2l import torch as d2l


class SeqDataLoader:
    """An iterator that loads sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.corpus, self.vocab = load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)


class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        # Tokens that appear fewer than min_freq times in the corpus are
        # dropped.
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)
        # Sort tokens by frequency, from most to least frequent.
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # <unk> is the token for anything unknown.
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        # Keep tokens whose count is at least min_freq; discard the rest.
        uniq_tokens += [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in uniq_tokens]
        # Maps between tokens and integer indices.
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)  # number of unique tokens

    def __getitem__(self, tokens):
        # Token(s) -> index/indices.
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        # Index/indices -> token(s).
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]


def count_corpus(tokens):
    """Count token frequencies."""
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a single list.
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)


def load_data_time_machine(batch_size, num_steps, use_random_iter=False,
                           max_tokens=10000):
    data_iter = SeqDataLoader(batch_size, num_steps, use_random_iter,
                              max_tokens)
    return data_iter, data_iter.vocab


def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)


def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using sequential partitioning."""
    # Start splitting the sequence from a random offset.
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y


def load_corpus_time_machine(max_tokens=-1):
    """Return token indices and the vocabulary of the time machine dataset."""
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Map every line through the vocabulary into integer indices.
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab


def seq_data_iter_random(corpus, batch_size, num_steps):
    """Generate a minibatch of subsequences using random sampling."""
    # Partition the sequence starting from a random offset; the random range
    # includes num_steps - 1.
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 to account for the labels.
    num_subseqs = (len(corpus) - 1) // num_steps
    # Starting indices of subsequences of length num_steps.
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # Under random sampling, subsequences from two adjacent minibatches are
    # not necessarily adjacent in the original sequence.
    random.shuffle(initial_indices)

    def data(pos):
        # Return the subsequence of length num_steps starting at pos.
        return corpus[pos: pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # initial_indices holds the shuffled starting indices of subsequences.
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)


def read_time_machine():
    """Load the time machine dataset into a list of text lines."""
    d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                    '090b5e7e70c295757f55df93cb0a180b9691891a')
    # Download the book.
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    # Replace every non-letter character with a space.
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]


if __name__ == '__main__':
    lines = read_time_machine()
    tokens = d2l.tokenize(lines)
    # A text line is not necessarily a sentence or a paragraph, so flatten all
    # lines into a single token list.
    corpus = [token for line in tokens for token in line]
    vocab = d2l.Vocab(corpus)
    # vocab.token_freqs[:10]
    freqs = [freq for token, freq in vocab.token_freqs]
    # d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)',
    #          xscale='log', yscale='log')

    # corpus[:-1] is every element except the last; corpus[1:] is every
    # element except the first. zip(corpus[:-1], corpus[1:]) pairs adjacent
    # elements: for corpus = ['I', 'like', 'eating', 'apples'] it yields
    # [('I', 'like'), ('like', 'eating'), ('eating', 'apples')].
    bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
    bigram_vocab = d2l.Vocab(bigram_tokens)
    trigram_tokens = [triple for triple in
                      zip(corpus[:-2], corpus[1:-1], corpus[2:])]
    trigram_vocab = d2l.Vocab(trigram_tokens)
    bigram_freqs = [freq for token, freq in bigram_vocab.token_freqs]
    trigram_freqs = [freq for token, freq in trigram_vocab.token_freqs]
    d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token: x',
             ylabel='frequency: n(x)', xscale='log', yscale='log',
             legend=['unigram', 'bigram', 'trigram'])
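To sanity-check the two minibatch generators above, a quick run on a toy corpus (my own test, in the same spirit as the book's examples) shows that Y is always X shifted by one position:

my_seq = list(range(35))
# Random sampling: adjacent minibatches need not be adjacent in my_seq.
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
    break
# Sequential partitioning: subsequences in consecutive minibatches stay
# adjacent in the original sequence.
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)
    break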
Problems
Some of the code from the book no longer runs as-is; I had to write a few functions myself before it would work.
3. RNN
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
from language_model import load_data_time_machine


class RNNModelScratch:
    def __init__(self, vocab_size, num_hiddens, device, get_params,
                 init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)


def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden layer parameters
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    # Output layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradients
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params


def init_rnn_state(batch_size, num_hiddens, device):
    """Initialize and return the hidden state (as a tuple)."""
    return (torch.zeros((batch_size, num_hiddens), device=device), )


def rnn(inputs, state, params):
    """Compute the hidden state and output within one time step."""
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    # inputs has shape (num_steps, batch_size, vocab_size);
    # X has shape (batch_size, vocab_size).
    for X in inputs:
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)


def predict_ch8(prefix, num_preds, net, vocab, device):
    """Generate new characters following prefix."""
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]],
                                     device=device).reshape((1, 1))
    for y in prefix[1:]:  # warm-up period
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # predict num_preds steps
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])


def grad_clipping(net, theta):
    """Clip the gradient."""
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm


if __name__ == '__main__':
    batch_size, num_steps = 32, 35
    train_iter, vocab = load_data_time_machine(batch_size, num_steps)
    # F.one_hot converts an integer tensor into a one-hot encoded tensor.
    # F.one_hot(torch.tensor([0, 2]), len(vocab)) encodes [0, 2]; if
    # len(vocab) were 3, the result would be
    # tensor([[1, 0, 0], [0, 0, 1]]).
    # F.one_hot(torch.tensor([0, 2]), len(vocab))
    X = torch.arange(10).reshape((2, 5))
    num_hiddens = 512
    net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                          init_rnn_state, rnn)
    state = net.begin_state(X.shape[0], d2l.try_gpu())
    Y, new_state = net(X.to(d2l.try_gpu()), state)
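As a quick check of the shape conventions noted in the one-hot comment above, here is a standalone sketch (it assumes a vocabulary size of 28, which is what the char-level time machine vocabulary comes out to in the book):

import torch
from torch.nn import functional as F

vocab_size = 28  # assumed size of the char-level vocabulary
print(F.one_hot(torch.tensor([0, 2]), vocab_size).shape)  # torch.Size([2, 28])
X = torch.arange(10).reshape((2, 5))
# Transposing X puts the time dimension first, so the encoded input has
# shape (num_steps, batch_size, vocab_size) = (5, 2, 28), which is what
# rnn() iterates over one time step at a time.
print(F.one_hot(X.T, vocab_size).shape)  # torch.Size([5, 2, 28])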
The training part is still to be written when I have some free time.