#语言模型和数据集
import matplotlib.pyplot as plt
import random
import torch
from d2l import torch as d2l
# Read the time-machine text through d2l, tokenize it line by line, and
# flatten the per-line token lists into one corpus sequence before building
# the unigram vocabulary.
tokens = d2l.tokenize(d2l.read_time_machine())
corpus = []
for line in tokens:
    corpus.extend(line)
vocab = d2l.Vocab(corpus)
# Debug aid: print(vocab.token_freqs[:10])
# Bigrams: pair every token with its successor. Each (word, word) tuple is
# then treated as a single vocabulary token.
# zip() stops at the shorter input, so the full corpus can be passed as the
# first argument without trimming its last element.
bigram_tokens = list(zip(corpus, corpus[1:]))
bigram_vocab = d2l.Vocab(bigram_tokens)
# Debug aid: print(bigram_vocab.token_freqs[:10])
# Trigrams: group every token with the two tokens that follow it. Each
# three-word tuple is then treated as a single vocabulary token.
# zip() stops at the shortest input (corpus[2:]), so the leading slices do
# not need their tails trimmed.
trigram_tokens = list(zip(corpus, corpus[1:], corpus[2:]))
trigram_vocab = d2l.Vocab(trigram_tokens)
# Debug aid: print(trigram_vocab.token_freqs[:10])
# Frequency lists to plot: keep only the count from each (token, count)
# entry of the three vocabularies, in the order token_freqs stores them.
freqs = [count for _, count in vocab.token_freqs]
bigram_freqs = [count for _, count in bigram_vocab.token_freqs]
trigram_freqs = [count for _, count in trigram_vocab.token_freqs]
# Plot the unigram, bigram and trigram frequency lists on logarithmic
# x and y axes.
# NOTE(review): the tail of this call was lost in extraction (it broke off
# at `yscale=&`, an HTML-entity remnant). `yscale='log'` and the legend are
# reconstructed -- confirm against the original post.
# NOTE(review): when run as a plain script rather than in a notebook, a
# following plt.show() is presumably needed to display the figure -- confirm.
d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token:x',
         ylabel='frequency:n(x)', xscale='log', yscale='log',
         legend=['unigram', 'bigram', 'trigram'])
# Source: blog post "语言模型-pytorch" (Language models - PyTorch),
# first published 2022-03-20 13:50:02.
# Image: https://img-home.csdnimg.cn/images/20240711042549.png