Language Models - PyTorch


# Language models and the dataset

import matplotlib.pyplot as plt
import random
import torch
from d2l import torch as d2l

# Tokenize "The Time Machine" and flatten it into a single token corpus
tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
# print(vocab.token_freqs[:10])

# Bigrams
bigram_tokens = [pair for pair in zip(corpus[:-1],corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)  # each token is a pair of adjacent words
# print(bigram_vocab.token_freqs[:10])

# Trigrams
trigram_tokens = [
    triple for triple in zip(corpus[:-2],corpus[1:-1],corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
# print(trigram_vocab.token_freqs[:10])
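
# A rough sketch (my addition, not in the original post): an n-gram language
# model turns these counts into conditional probabilities, e.g.
# P(w2 | w1) ≈ count(w1, w2) / count(w1). `bigram_prob` is a hypothetical
# helper built on the two vocabs above; no smoothing is applied.
unigram_counts = dict(vocab.token_freqs)
bigram_counts = dict(bigram_vocab.token_freqs)

def bigram_prob(w1, w2):
    if w1 not in unigram_counts:
        return 0.0  # unseen context
    return bigram_counts.get((w1, w2), 0) / unigram_counts[w1]

# print(bigram_prob('the', 'time'))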

# Plot the unigram, bigram, and trigram frequencies
freqs = [freq for token,freq in vocab.token_freqs]
bigram_freqs = [freq for token,freq in bigram_vocab.token_freqs]
trigram_freqs = [freq for token,freq in trigram_vocab.token_freqs]
d2l.plot([freqs,bigram_freqs,trigram_freqs],xlabel='token:x',
         ylabel='frequency:n(x)',xscale='log',yscale='log',
         legend=['unigram','bigram','trigram'])
# d2l.plt.show()
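
# Rough sketch (assumption, not from the original post): the near-linear
# log-log curves above are consistent with Zipf's law, n(i) ∝ 1/i^alpha.
# A least-squares fit in log-log space estimates the unigram exponent.
import numpy as np

ranks = np.arange(1, len(freqs) + 1)
slope, _ = np.polyfit(np.log(ranks), np.log(np.array(freqs, dtype=float)), 1)
# print(f'estimated Zipf exponent alpha: {-slope:.2f}')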

# Generate a minibatch of subsequences using random sampling;
# num_steps (= T) is the length of each sampled subsequence used to predict the next token
def seq_data_iter_random(corpus,batch_size,num_steps):
    # Drop a random prefix so each epoch partitions the corpus differently
    corpus = corpus[random.randint(0,num_steps-1):]
    # Subtract 1 because the labels Y are shifted one position ahead of X
    num_subseqs = (len(corpus)-1)//num_steps
    # Starting indices of all subsequences, shuffled for random sampling
    initial_indices = list(range(0,num_subseqs*num_steps,num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs//batch_size
    for i in range(0,batch_size*num_batches,batch_size):
        initial_indices_per_batch = initial_indices[i:i +batch_size]  # starting index of each subsequence in this batch
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X),torch.tensor(Y)

my_seq = list(range(35))
# for X,Y in seq_data_iter_random(my_seq,batch_size=2,num_steps=5):
#     print('X: ',X,'\nY:',Y)

"""
给定X,预测Y
比如15,预测16
15、16.预测17
20、21、22,预测23
最长T=5
"""

# Sequential partitioning: subsequences stay contiguous in the corpus
def seq_data_iter_sequential(corpus,batch_size,num_steps):
    # Random starting offset so different epochs use different alignments
    offset = random.randint(0,num_steps)
    # Keep only as many tokens as fill batch_size equal-length rows
    num_tokens = ((len(corpus) - offset - 1)//batch_size)*batch_size
    Xs = torch.tensor(corpus[offset:offset+num_tokens])
    Ys = torch.tensor(corpus[offset+1:offset+num_tokens+1])
    Xs,Ys = Xs.reshape(batch_size,-1),Ys.reshape(batch_size,-1)
    num_batches = Xs.shape[1]//num_steps
    for i in range(0,num_steps*num_batches,num_steps):
        X = Xs[:,i:i+num_steps]
        Y = Ys[:,i:i+num_steps]
        yield X,Y

for X,Y in seq_data_iter_sequential(my_seq,batch_size=2,num_steps=5):
    print('X: ',X,'\nY:',Y)
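
# Note (my addition): unlike random sampling, sequential partitioning keeps
# the i-th row of consecutive minibatches contiguous in the corpus, so an
# RNN's hidden state can be carried across minibatches during training.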

# Data loader wrapping both sampling schemes
class SeqDataLoader:
    def __init__(self,batch_size,num_steps,use_random_iter,max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus,self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size,self.num_steps = batch_size,num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus,self.batch_size,self.num_steps)

# Return both the data iterator and the vocabulary of the time machine dataset
def load_data_time_machine(batch_size,num_steps,use_random_iter=False,max_tokens=10000):
    data_iter = SeqDataLoader(batch_size,num_steps,use_random_iter,max_tokens)
    return data_iter,data_iter.vocab

data_iter, vocab = load_data_time_machine(2, 5)
print(data_iter, vocab)
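
# Minimal usage sketch (my addition): the loader yields (X, Y) index tensors
# of shape (batch_size, num_steps) and can be re-iterated once per epoch.
for X, Y in data_iter:
    print('X shape:', X.shape, 'Y shape:', Y.shape)
    break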
