Training a character-level language model with LSTM and GRU


import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
import numpy as np

# Read the corpus: each poem spans consecutive non-empty lines,
# and poems are separated by blank lines
poetrys = []
poetry = ''
with open("poetryFromTang.txt", encoding='utf-8') as f:
    next(f)  # skip the first line of the file
    for line in f:
        if len(line) != 1:              # non-blank line: append it to the current poem
            poetry += line.strip('\n')
        else:                           # blank line: the current poem is finished
            poetrys.append(poetry)
            poetry = ''

# Build the vocabulary source: concatenate all poems into one string
all_word = ''
for poem in poetrys:
    all_word += poem

# Strip the punctuation marks
all_word = all_word.replace(',','').replace('。','')

# Count character frequencies
word_dict = {}

for word in all_word:
    if word not in word_dict:
        word_dict[word] = 1
    else:
        word_dict[word] += 1

# Sort characters by frequency, most frequent first
word_sort = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
words, _ = zip(*word_sort)

# Build the lookup dictionaries (character <-> id)
word_to_token = {word: id for id, word in enumerate(words)}
token_to_word = dict(enumerate(words))

# Convert a sequence of characters into a sequence of ids;
# unknown characters fall back to the last id in the vocabulary
def transword(char_list):
    ids = [word_to_token.get(char, len(words) - 1) for char in char_list]
    return ids
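
As a quick sanity check of the dictionaries, the sample below simply round-trips the five most frequent characters, so it works for any corpus:

# Round-trip a short sample through the dictionaries
sample = ''.join(words[:5])                      # five most frequent characters in the corpus
ids = transword(sample)
print(sample, '->', ids)                         # prints [0, 1, 2, 3, 4], since words is frequency-sorted
print(''.join(token_to_word[i] for i in ids))    # reproduces the sample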

Building the dataset

# Build the dataset: use the first few characters of each line to predict the last one.
# Sequences in a batch must have the same length, so five-character and seven-character
# lines are handled separately.
len1 = 4
len2 = 6
data = [line.replace(',', ' ').replace('。', ' ').split() for line in poetrys]

x_5 = []
x_7 = []
y_5 = []
y_7 = []
for i in data:
    for j in i:
        if len(j) == len1+1:
            x_5.append(j[:len1]) 
            y_5.append(j[-1])
        elif len(j) == len2+1:
            x_7.append(j[:len2]) 
            y_7.append(j[-1])
        else:
            pass

x_5_vec = [transword(i) for i in x_5]
x_7_vec = [transword(i) for i in x_7]
y_5_vec = [transword(i) for i in y_5]
y_7_vec = [transword(i) for i in y_7]
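
Before batching, it helps to look at one of the resulting training pairs (this assumes the corpus contains at least one five-character line):

# One 5-character training pair: four input characters and one target character, plus their id versions
print(x_5[0], '->', y_5[0])
print(x_5_vec[0], '->', y_5_vec[0])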

Defining the hyperparameters

# Hyperparameters

BATCH_SIZE = 32
learning_rate = 0.01
epoch_num = 100
embedding_size = 300
hidden_size = 256
dropout_size = 0.4
vocab_size = len(words)   # number of distinct characters, not the total character count
model_name = 'gru'
num_layers = 2

Building the data loaders

# Wrap the id tensors in Datasets that torch can consume
torch_dataset1 = Data.TensorDataset(torch.tensor(x_5_vec, dtype=torch.long), torch.tensor(y_5_vec, dtype=torch.long))
torch_dataset2 = Data.TensorDataset(torch.tensor(x_7_vec, dtype=torch.long), torch.tensor(y_7_vec, dtype=torch.long))

# Put the datasets into DataLoaders
loader1 = Data.DataLoader(
    dataset=torch_dataset1,  # torch TensorDataset format
    batch_size=BATCH_SIZE,   # mini-batch size
    shuffle=True,            # reshuffle the samples every epoch
    num_workers=2,           # load data with multiple worker processes
)

loader2 = Data.DataLoader(
    dataset=torch_dataset2,  # torch TensorDataset format
    batch_size=BATCH_SIZE,   # mini-batch size
    shuffle=True,            # reshuffle the samples every epoch
    num_workers=2,           # load data with multiple worker processes
)
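
To confirm the loaders produce what the model will expect, a single batch can be inspected (shapes assume the hyperparameters above):

# Peek at one mini-batch: inputs of shape (BATCH_SIZE, 4), targets of shape (BATCH_SIZE, 1)
sample_x, sample_y = next(iter(loader1))
print(sample_x.shape, sample_y.shape)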

Building the model

# Model: embedding -> LSTM/GRU -> linear layer over the vocabulary
class PoetryGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, model_name='lstm'):
        super(PoetryGenerator, self).__init__()
        self.model = model_name
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout_size)
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, batch_first=True, dropout=dropout_size)
        self.F = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hs=None):
        x_embedding = self.embed(x)
        batch, seq_len = x.shape
        if hs is None:
            h0 = torch.zeros(num_layers, batch, hidden_size)
            # the LSTM expects a (hidden, cell) tuple, the GRU a single tensor
            hs = (h0, torch.zeros(num_layers, batch, hidden_size)) if self.model == 'lstm' else h0
        if self.model == 'lstm':
            out, hidden = self.lstm(x_embedding, hs)
        else:
            out, hidden = self.gru(x_embedding, hs)
        # only the output of the last time step is used to predict the next character
        outputs = self.F(out[:, -1, :])

        return outputs, hidden

model = PoetryGenerator(vocab_size, embedding_size, hidden_size, model_name)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
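
A forward pass on one untrained batch is a cheap shape check; the logits should come out as (BATCH_SIZE, vocab_size):

# Sanity check: run one batch of 5-character inputs through the untrained model
sample_x, _ = next(iter(loader1))
with torch.no_grad():
    logits, _ = model(sample_x)
print(logits.shape)    # expected: torch.Size([BATCH_SIZE, vocab_size])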

Training

# Training loop: for every mini-batch, compute the loss, backpropagate, and update the weights
for epoch in range(epoch_num):
    for step, (batch_x, batch_y) in enumerate(loader1):
        optimizer.zero_grad()
        output, _ = model(batch_x)
        loss = criterion(output, batch_y.view(-1))
        loss.backward()
        optimizer.step()
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
    for step, (batch_x, batch_y) in enumerate(loader2):
        optimizer.zero_grad()
        output, _ = model(batch_x)
        loss = criterion(output, batch_y.view(-1))
        loss.backward()
        optimizer.step()
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
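
Once training finishes it is convenient to keep the weights around for later sampling; a minimal sketch, assuming the illustrative file name poetry_gru.pt:

# Save the trained weights (the file name is just an example)
torch.save(model.state_dict(), 'poetry_gru.pt')

# To reuse them later, rebuild the model and load the state dict back:
# model = PoetryGenerator(vocab_size, embedding_size, hidden_size, model_name)
# model.load_state_dict(torch.load('poetry_gru.pt'))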

Generating a poem

# Sample the next character from the top-n most probable candidates
def pick_top_n(preds, top_n=10):
    probs = torch.softmax(preds, dim=1)                 # turn the logits into probabilities
    top_pred_prob, top_pred_label = torch.topk(probs, top_n, 1)
    top_pred_prob = top_pred_prob.squeeze(0).detach().numpy().astype('float64')
    top_pred_prob /= top_pred_prob.sum()                # renormalize over the top n
    top_pred_label = top_pred_label.squeeze(0).detach().numpy()
    c = np.random.choice(top_pred_label, size=1, p=top_pred_prob)

    return c[0]
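
pick_top_n can be exercised on its own with a dummy logits tensor of the right shape:

# Example: sample one token id from random logits of shape (1, vocab_size)
dummy_logits = torch.randn(1, vocab_size)
print(pick_top_n(dummy_logits, top_n=10))   # an id drawn from the 10 highest-scoring entries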

def generate_random(max_len=20):
    """Generate a poem freely, starting from a random character."""
    poetry = []
    random_word = [np.random.randint(0, vocab_size)]
    hidden = None                                   # let the model initialise its own hidden state
    inp = torch.LongTensor(random_word).reshape(1, 1)
    for i in range(max_len):
        # forward pass, then sample the next character from the top candidates
        proba, hidden = model(inp, hidden)
        top_index = pick_top_n(proba)
        char = token_to_word[int(top_index)]

        inp = torch.LongTensor([[int(top_index)]])  # the sampled character becomes the next input
        poetry.append(char)
    return poetry

model.eval()                       # disable dropout while sampling
poetry = generate_random()
i = 0
for word in poetry:
    print(word, end='')
    i += 1
    if i % 5 == 0:                 # five characters per printed line
        print()

Sample output

悬神翻苑屋
蝶侧末无已
路杀青孤前
干干功眠寒

The final result does not look great; the dataset is probably a bit too small.

I drew on many references while implementing this, and the code is still rough, so feedback and suggestions are very welcome!
