9.language_model

#1. Build the dictionary and the corpus
import torch
class Dictionary(object):# two-way mapping between words and integer ids
    def __init__(self):
        self.word2idx={}
        self.idx2word={}
        self.idx=0
    def add_word(self,word):
        if not word in self.word2idx:
            self.word2idx[word]=self.idx
            self.idx2word[self.idx]=word
            self.idx+=1
    def __len__(self):# lets len(dictionary) return the vocabulary size
        return len(self.word2idx)
class Corpus(object):# tokenized corpus backed by the dictionary above
    def __init__(self):
        self.dictionary=Dictionary()
    def get_data(self,path,batch_size=20):# first pass: add every word to the dictionary
        with open(path,'r') as f:
            tokens=0# total number of words plus <eos> markers in the file
            for line in f:
                words = line.split()+['<eos>']# '<eos>' marks the end of a line, like a full stop
                tokens+=len(words)
                for word in words:
                    self.dictionary.add_word(word)# build the word-to-id dictionary, including <eos>
        ids=torch.LongTensor(tokens)
        token=0
        with open(path,'r') as f:# second pass: replace every word in the file with its integer id
            for line in f:
                words=line.split()+['<eos>']
                for word in words:
                    ids[token]=self.dictionary.word2idx[word]# ids[position in file] = dictionary id
                    token+=1
        num_batch=ids.size(0)//batch_size# total token count (including <eos>) // batch_size
        ids=ids[:num_batch*batch_size]# truncate so the corpus length is a multiple of batch_size
        return ids.view(batch_size,-1)# return the whole corpus as a (batch_size, -1) tensor
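
A quick sanity check of the two passes above; the toy file name and its contents are made up purely to illustrate the shapes:

# Sanity check with a hypothetical toy file (illustration only)
with open('toy.txt','w') as f:
    f.write('the cat sat\nthe dog ran\n')
toy=Corpus()
toy_ids=toy.get_data('toy.txt',batch_size=2)
print(len(toy.dictionary))# 6 unique tokens: the, cat, sat, <eos>, dog, ran
print(toy_ids.shape)# torch.Size([2, 4]): 8 tokens reshaped into 2 rows of 4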
#2. Train the network
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_# gradient clipping
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size=128# every word is embedded into a 128-dimensional vector
seq_length=30# each training sample is a sequence of 30 words
batch_size=20
num_layers=1
hidden_size=1024
num_epochs=1
learning_rate=0.002
corpus=Corpus()
ids=corpus.get_data('data/train.txt',batch_size)# the whole corpus, arranged batch-first
vocab_size=len(corpus.dictionary)# number of distinct tokens, '<eos>' included
num_batches=ids.size(1)//seq_length# number of training steps per epoch
class RNNLM(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_size,num_layers):
        super(RNNLM,self).__init__()
        self.embed=nn.Embedding(vocab_size,embed_size)# map every word id in the dictionary to a 128-dim vector
        self.lstm=nn.LSTM(embed_size,hidden_size,num_layers,batch_first=True)
        self.linear=nn.Linear(hidden_size,vocab_size)# project each LSTM output back onto the vocabulary
    def forward(self,x,h):# h is the initial (hidden, cell) state pair
        x=self.embed(x)
        out,(h,c)=self.lstm(x,h)
        out=out.reshape(out.size(0)*out.size(1),out.size(2))# reshape to (batch_size*seq_length, hidden_size)
        out=self.linear(out)
        return out,(h,c)
model=RNNLM(vocab_size,embed_size,hidden_size,num_layers).to(device)
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)
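
Before training, a quick look at the tensor shapes flowing through the model; the dummy batch of random word ids below is made up for illustration:

with torch.no_grad():# shape sketch only, not real data
    dummy=torch.randint(0,vocab_size,(batch_size,seq_length)).to(device)
    h0=(torch.zeros(num_layers,batch_size,hidden_size).to(device),
        torch.zeros(num_layers,batch_size,hidden_size).to(device))
    logits,_=model(dummy,h0)
    print(logits.shape)# torch.Size([600, vocab_size]): one logit row per predicted word (20*30)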
def detach(states):# cut the graph between windows (truncated backprop through time)
    return [state.detach() for state in states]
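
Why the detach matters: without it, each backward() would propagate through every earlier window, so the graph and memory use would grow without bound. A tiny illustration (x, y, z are made-up names):

x=torch.ones(1,requires_grad=True)
y=x*2# y carries autograd history back to x
z=y.detach()*3# same values, but the history is cut: backward() from z never reaches x
print(y.grad_fn,z.grad_fn)# <MulBackward0 ...> None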
for epoch in range(num_epochs):
    states=(torch.zeros(num_layers,batch_size,hidden_size).to(device),torch.zeros(num_layers,batch_size,hidden_size).to(device))
    for i in range(0,ids.size(1)-seq_length,seq_length):# start column of each training window
        inputs=ids[:,i:i+seq_length].to(device)# words 0..29 of the window
        targets=ids[:,(i+1):(i+1)+seq_length].to(device)# words 1..30: the same window shifted by one
        states=detach(states)
        outputs,states=model(inputs,states)
        loss=criterion(outputs,targets.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(),0.5)# clip gradients to keep LSTM training stable
        optimizer.step()
        step=(i+1)//seq_length
        if step%100==0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'.format(epoch+1,num_epochs,step,num_batches,loss.item(),np.exp(loss.item())))
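
Perplexity is just exp of the average cross-entropy. After training, a minimal sketch of measuring it over the whole corpus; it reuses the training split because get_data above builds the dictionary as it reads, so a separate validation file would not share the same word ids:

model.eval()# inference mode for the evaluation pass
with torch.no_grad():
    total_loss,count=0.0,0
    states=(torch.zeros(num_layers,batch_size,hidden_size).to(device),
            torch.zeros(num_layers,batch_size,hidden_size).to(device))
    for i in range(0,ids.size(1)-seq_length,seq_length):
        inputs=ids[:,i:i+seq_length].to(device)
        targets=ids[:,(i+1):(i+1)+seq_length].to(device)
        outputs,states=model(inputs,states)
        total_loss+=criterion(outputs,targets.reshape(-1)).item()
        count+=1
    print('Mean perplexity: {:.2f}'.format(np.exp(total_loss/count)))
model.train()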
#3. Sample text from the trained model
num_samples = 100
with torch.no_grad():
    with open('sample.txt','w') as f:
        state=(torch.zeros(num_layers,1,hidden_size).to(device),torch.zeros(num_layers,1,hidden_size).to(device))
        prob=torch.ones(vocab_size)# uniform weights: start from a random word id
        inputs=torch.multinomial(prob,num_samples=1).unsqueeze(1).to(device)
        for i in range(num_samples):
            output,state=model(inputs,state)
            prob=output.exp()# unnormalized probabilities; torch.multinomial normalizes them itself
            word_id=torch.multinomial(prob,num_samples=1).item()
            inputs.fill_(word_id)# feed the sampled word back in as the next input
            word=corpus.dictionary.idx2word[word_id]
            word='\n' if word=='<eos>' else word+' '
            f.write(word)
            if (i+1)%100==0:
                print('Sampled [{}/{}] words, saved to {}'.format(i+1,num_samples,'sample.txt'))
torch.save(model.state_dict(),'model.ckpt')
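
To reuse the saved weights later, a minimal reload sketch; it assumes the same hyperparameters and the same corpus, since the word ids must line up with the training run:

restored=RNNLM(vocab_size,embed_size,hidden_size,num_layers).to(device)# rebuild the architecture first
restored.load_state_dict(torch.load('model.ckpt',map_location=device))
restored.eval()# inference mode for generation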